Example #1
 /**
  * Redistributes the elements of a {@code PCollection} across {@code numReducers}
  * reducers by keying every element to a null value, grouping, and ungrouping.
  * A non-positive {@code numReducers} falls back to the planner's default.
  */
 private static <E> PCollection<E> partition(PCollection<E> collection, int numReducers) {
   PType<E> type = collection.getPType();
   PTableType<E, Void> tableType = Avros.tableOf(type, Avros.nulls());
   PTable<E, Void> table = collection.parallelDo(new AsKeyTable<E>(), tableType);
   PGroupedTable<E, Void> grouped =
       numReducers > 0 ? table.groupByKey(numReducers) : table.groupByKey();
   return grouped.ungroup().keys();
 }
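AsKeyTable is referenced above but not shown. A minimal sketch of what it could look like, under the assumption that it simply pairs each element with a null value so the collection can be grouped by key (the class body here is an assumption, not the original source):

 private static class AsKeyTable<E> extends MapFn<E, Pair<E, Void>> {
   @Override
   public Pair<E, Void> map(E input) {
     // Use the element itself as the key; the Void value carries no information.
     return Pair.of(input, null);
   }
 }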
Example #2
  @Test
  public void testUseReaderSchema() throws IOException {

    // Create a schema with only a username, so we can test reading it
    // with an enhanced record structure.
    Schema oldRecordSchema =
        SchemaBuilder.record("org.kitesdk.data.user.OldUserRecord")
            .fields()
            .requiredString("username")
            .endRecord();

    // Create input and output datasets that both use the old schema
    Dataset<Record> in =
        repo.create("ns", "in", new DatasetDescriptor.Builder().schema(oldRecordSchema).build());
    Dataset<Record> out =
        repo.create("ns", "out", new DatasetDescriptor.Builder().schema(oldRecordSchema).build());
    Record oldUser = new Record(oldRecordSchema);
    oldUser.put("username", "user");

    DatasetWriter<Record> writer = in.newWriter();

    try {

      writer.write(oldUser);

    } finally {
      writer.close();
    }

    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);

    // Read the dataset that was written with the old schema, using the
    // enhanced NewUserRecord class as the reader schema.
    PCollection<NewUserRecord> data =
        pipeline.read(CrunchDatasets.asSource(in.getUri(), NewUserRecord.class));

    PCollection<NewUserRecord> processed =
        data.parallelDo(new UserRecordIdentityFn(), Avros.records(NewUserRecord.class));

    pipeline.write(processed, CrunchDatasets.asTarget(out));

    DatasetReader<Record> reader = out.newReader();

    Assert.assertTrue("Pipeline failed.", pipeline.run().succeeded());

    try {

      // There should be exactly one record, equal to the old user generic record.
      Assert.assertEquals(oldUser, reader.next());
      Assert.assertFalse(reader.hasNext());

    } finally {
      reader.close();
    }
  }
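UserRecordIdentityFn is referenced by the test but not shown. A plausible minimal implementation is a pass-through MapFn over the enhanced record type (this sketch is an assumption, not the original test's code):

  private static class UserRecordIdentityFn extends MapFn<NewUserRecord, NewUserRecord> {
    @Override
    public NewUserRecord map(NewUserRecord input) {
      // Identity: pass each record through unchanged so reader-schema resolution is exercised.
      return input;
    }
  }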
Example #3
 /**
  * Returns a {@code PTable} that contains the unique elements of this collection mapped to a count
  * of their occurrences.
  */
 public static <S> PTable<S, Long> count(PCollection<S> collect) {
   PTypeFamily tf = collect.getTypeFamily();
   return collect
       .parallelDo(
           "Aggregate.count",
           new MapFn<S, Pair<S, Long>>() {
             public Pair<S, Long> map(S input) {
               return Pair.of(input, 1L);
             }
           },
           tf.tableOf(collect.getPType(), tf.longs()))
       .groupByKey()
       .combineValues(Aggregators.SUM_LONGS());
 }
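A minimal usage sketch calling the count method above; the enclosing class name and argument handling are illustrative, not from the original source:

  public static void main(String[] args) throws Exception {
    Pipeline pipeline = new MRPipeline(CountExample.class); // hypothetical class name
    PCollection<String> words = pipeline.readTextFile(args[0]);
    // One (element, occurrences) pair per distinct element.
    PTable<String, Long> occurrences = count(words);
    pipeline.writeTextFile(occurrences, args[1]);
    pipeline.done();
  }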
Example #4
  public int run(String[] args) throws Exception {

    Pipeline pipeline = new MRPipeline(SecondarySortingExample.class);
    // Read input
    PCollection<String> lines = pipeline.readTextFile(args[0]);
    // Split each line and count them
    PTable<String, Long> wordcount = lines.parallelDo(new Tokenizer(), Writables.strings()).count();
    // Sort the (word, count) pairs by the first column (the word) in descending order
    PCollection<Pair<String, Long>> sorted =
        Sort.sortPairs(wordcount, ColumnOrder.by(1, Sort.Order.DESCENDING));
    // Write the output to the second command-line argument (not the input path)
    sorted.write(To.textFile(args[1]));
    // Kick off execution
    PipelineResult result = pipeline.done();
    return result.succeeded() ? 0 : 1;
  }
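Tokenizer is used above but not shown. A plausible minimal implementation, mirroring the whitespace splitter in Example #8 below:

  static class Tokenizer extends DoFn<String, String> {
    @Override
    public void process(String line, Emitter<String> emitter) {
      // Emit one output element per whitespace-delimited token.
      for (String word : line.split("\\s+")) {
        emitter.emit(word);
      }
    }
  }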
Example #5
 /**
  * Returns the number of elements in the provided PCollection.
  *
  * @param collect The PCollection whose elements should be counted.
  * @param <S> The type of the PCollection.
  * @return A {@code PObject} containing the number of elements in the {@code PCollection}.
  */
 public static <S> PObject<Long> length(PCollection<S> collect) {
   PTypeFamily tf = collect.getTypeFamily();
   PTable<Integer, Long> countTable =
       collect
           .parallelDo(
               "Aggregate.count",
               new MapFn<S, Pair<Integer, Long>>() {
                 public Pair<Integer, Long> map(S input) {
                   return Pair.of(1, 1L);
                 }
               },
               tf.tableOf(tf.ints(), tf.longs()))
           .groupByKey()
           .combineValues(Aggregators.SUM_LONGS());
   PCollection<Long> count = countTable.values();
   return new FirstElementPObject<Long>(count);
 }
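A minimal usage sketch of length; the wrapper method and class name are illustrative:

  public static long countLines(String path) {
    Pipeline pipeline = new MRPipeline(LengthExample.class); // hypothetical class name
    PCollection<String> lines = pipeline.readTextFile(path);
    // getValue() triggers pipeline execution and materializes the single Long result.
    return length(lines).getValue();
  }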
Example #6
  /** Returns the smallest numerical element from the input collection. */
  public static <S> PObject<S> min(PCollection<S> collect) {
    Class<S> clazz = collect.getPType().getTypeClass();
    if (!clazz.isPrimitive() && !Comparable.class.isAssignableFrom(clazz)) {
      throw new IllegalArgumentException(
          "Can only get min for Comparable elements, not for: "
              + collect.getPType().getTypeClass());
    }
    PTypeFamily tf = collect.getTypeFamily();
    PCollection<S> minCollect =
        PTables.values(
            collect
                .parallelDo(
                    "min",
                    new DoFn<S, Pair<Boolean, S>>() {
                      private transient S min = null;

                      public void process(S input, Emitter<Pair<Boolean, S>> emitter) {
                        if (min == null || ((Comparable<S>) min).compareTo(input) > 0) {
                          min = input;
                        }
                      }

                      public void cleanup(Emitter<Pair<Boolean, S>> emitter) {
                        if (min != null) {
                          emitter.emit(Pair.of(false, min));
                        }
                      }
                    },
                    tf.tableOf(tf.booleans(), collect.getPType()))
                .groupByKey()
                .combineValues(
                    new CombineFn<Boolean, S>() {
                      public void process(
                          Pair<Boolean, Iterable<S>> input, Emitter<Pair<Boolean, S>> emitter) {
                        S min = null;
                        for (S v : input.second()) {
                          if (min == null || ((Comparable<S>) min).compareTo(v) > 0) {
                            min = v;
                          }
                        }
                        emitter.emit(Pair.of(input.first(), min));
                      }
                    }));
    return new FirstElementPObject<S>(minCollect);
  }
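A minimal usage sketch; min works for any Comparable element type, such as String:

  public static String smallestWord(PCollection<String> words) {
    // getValue() runs the pipeline and returns the single smallest element.
    return min(words).getValue();
  }

Crunch's Aggregate class also provides a symmetric max.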
Example #7
  @Override
  protected MRPipeline createPipeline() throws IOException {
    JobStepConfig stepConfig = getConfig();
    Config config = ConfigUtils.getDefaultConfig();
    ClusterSettings clusterSettings = ClusterSettings.create(config);

    String instanceDir = stepConfig.getInstanceDir();
    long generationID = stepConfig.getGenerationID();
    String prefix = Namespaces.getInstanceGenerationPrefix(instanceDir, generationID);
    String outputKey = prefix + "weighted/";
    if (!validOutputPath(outputKey)) {
      return null;
    }

    String indexKey = prefix + "sketch/" + clusterSettings.getSketchIterations();
    String inputKey = prefix + "normalized/";
    MRPipeline p = createBasicPipeline(ClosestSketchVectorFn.class);

    // First, compute the weight of each k-sketch vector (i.e., its Voronoi
    // partition), then aggregate the partial results and persist them to disk.
    // groupByKey(1) forces all partial ClosestSketchVectorData objects onto a
    // single reducer so combineValues can merge them into one aggregate.
    PCollection<ClosestSketchVectorData> weights =
        PTables.asPTable(
                inputPairs(p, inputKey, MLAvros.vector())
                    .parallelDo(
                        "computingSketchVectorWeights",
                        new ClosestSketchVectorFn<RealVector>(indexKey, clusterSettings),
                        Avros.pairs(Avros.ints(), Avros.reflects(ClosestSketchVectorData.class))))
            .groupByKey(1)
            .combineValues(new ClosestSketchVectorAggregator(clusterSettings))
            .values()
            .write(avroOutput(outputKey + "kSketchVectorWeights/"));

    // this "pipeline" takes a single ClosestSketchVectorData and returns weighted vectors
    // could be done outside MapReduce, but that would require me to materialize the
    // ClosestSketchVectorData
    weights
        .parallelDo(
            "generatingWeightedSketchVectors",
            new WeightVectorsFn(indexKey),
            KMeansTypes.FOLD_WEIGHTED_VECTOR)
        .write(avroOutput(outputKey + "weightedKSketchVectors/"));

    return p;
  }
Example #8
  public static void main(String[] args) throws Exception {

    Pipeline pipeline = new MRPipeline(WordCount.class);
    PCollection<String> lines = pipeline.readTextFile(args[0]);

    PCollection<String> words =
        lines.parallelDo(
            "my splitter",
            new DoFn<String, String>() {
              public void process(String line, Emitter<String> emitter) {
                for (String word : line.split("\\s+")) {
                  emitter.emit(word);
                }
              }
            },
            Writables.strings());

    PTable<String, Long> counts = Aggregate.count(words);

    pipeline.writeTextFile(counts, args[1]);
    pipeline.run();
  }
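A design note: this main ignores the PipelineResult returned by run(). A variant of the last line that propagates failure to the caller, mirroring Example #4 above:

    PipelineResult result = pipeline.done();
    System.exit(result.succeeded() ? 0 : 1);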