Example #1
  @Test
  public void unionWriteShouldNotThrowNPE() throws IOException {
    String outputPath1 = tmpDir.getFileName("output1");
    String outputPath2 = tmpDir.getFileName("output2");
    String outputPath3 = tmpDir.getFileName("output3");

    if (typeFamily == AvroTypeFamily.getInstance()) {
      union.write(To.avroFile(outputPath1));
      pipeline.write(union, To.avroFile(outputPath2));

      pipeline.run();

      checkFileContents(outputPath1);
      checkFileContents(outputPath2);

    } else {

      union.write(To.textFile(outputPath1));
      pipeline.write(union, To.textFile(outputPath2));
      pipeline.writeTextFile(union, outputPath3);

      pipeline.run();

      checkFileContents(outputPath1);
      checkFileContents(outputPath2);
      checkFileContents(outputPath3);
    }
  }
Example #2
  @Before
  @SuppressWarnings("unchecked")
  public void setUp() throws IOException {
    String inputFile1 = tmpDir.copyResourceFileName("set1.txt");
    String inputFile2 = tmpDir.copyResourceFileName("set2.txt");
    if (pipelineClass == null) {
      pipeline = MemPipeline.getInstance();
    } else {
      pipeline = new MRPipeline(pipelineClass, tmpDir.getDefaultConfiguration());
    }
    PCollection<String> firstCollection =
        pipeline.read(At.textFile(inputFile1, typeFamily.strings()));
    PCollection<String> secondCollection =
        pipeline.read(At.textFile(inputFile2, typeFamily.strings()));

    LOG.info(
        "Test fixture: ["
            + pipeline.getClass().getSimpleName()
            + " : "
            + typeFamily.getClass().getSimpleName()
            + "]  First: "
            + Lists.newArrayList(firstCollection.materialize().iterator())
            + ", Second: "
            + Lists.newArrayList(secondCollection.materialize().iterator()));

    union = secondCollection.union(firstCollection);
  }
Example #3
 private List<Vector> getInitialVectors(Pipeline p) {
   if (initVectorsPath != null) {
     PCollection<Vector> init = inputParams.getVectorsFromPath(p, initVectorsPath);
     return Lists.newArrayList(init.materialize());
   } else {
     throw new IllegalArgumentException("No initial vector config specified");
   }
 }
Example #4
 private static <E> PCollection<E> partition(PCollection<E> collection, int numReducers) {
   PType<E> type = collection.getPType();
   PTableType<E, Void> tableType = Avros.tableOf(type, Avros.nulls());
   // Turn each element into an (element, null) pair so it can be shuffled by key.
   PTable<E, Void> table = collection.parallelDo(new AsKeyTable<E>(), tableType);
   // Group by the element itself, optionally forcing the number of reducers
   // (and therefore the number of output partitions).
   PGroupedTable<E, Void> grouped =
       numReducers > 0 ? table.groupByKey(numReducers) : table.groupByKey();
   // Drop the null values and return the repartitioned elements.
   return grouped.ungroup().keys();
 }
Example #5
  @Test
  public void testUseReaderSchema() throws IOException {

    // Create a schema with only a username, so we can test reading it
    // with an enhanced record structure.
    Schema oldRecordSchema =
        SchemaBuilder.record("org.kitesdk.data.user.OldUserRecord")
            .fields()
            .requiredString("username")
            .endRecord();

    // create the dataset
    Dataset<Record> in =
        repo.create("ns", "in", new DatasetDescriptor.Builder().schema(oldRecordSchema).build());
    Dataset<Record> out =
        repo.create("ns", "out", new DatasetDescriptor.Builder().schema(oldRecordSchema).build());
    Record oldUser = new Record(oldRecordSchema);
    oldUser.put("username", "user");

    DatasetWriter<Record> writer = in.newWriter();

    try {

      writer.write(oldUser);

    } finally {
      writer.close();
    }

    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);

    // read data from updated dataset that has the new schema.
    // At this point, User class has the old schema
    PCollection<NewUserRecord> data =
        pipeline.read(CrunchDatasets.asSource(in.getUri(), NewUserRecord.class));

    PCollection<NewUserRecord> processed =
        data.parallelDo(new UserRecordIdentityFn(), Avros.records(NewUserRecord.class));

    pipeline.write(processed, CrunchDatasets.asTarget(out));

    DatasetReader<Record> reader = out.newReader();

    Assert.assertTrue("Pipeline failed.", pipeline.run().succeeded());

    try {

      // there should be one record that is equal to our old user generic record.
      Assert.assertEquals(oldUser, reader.next());
      Assert.assertFalse(reader.hasNext());

    } finally {
      reader.close();
    }
  }
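The reader-schema test above relies on Avro schema evolution: NewUserRecord must declare any extra fields with defaults so that records written with the old username-only schema can still be read. A hypothetical sketch of such an enhanced schema (the favoriteColor field and its default are illustrative, not taken from the test):

    Schema newRecordSchema =
        SchemaBuilder.record("org.kitesdk.data.user.NewUserRecord")
            .fields()
            .requiredString("username")
            // New field with a default, so records written with the old schema remain readable.
            .name("favoriteColor").type().stringType().stringDefault("unknown")
            .endRecord();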
Example #6
 /**
  * Returns a {@code PTable} that contains the unique elements of this collection mapped to a count
  * of their occurrences.
  */
 public static <S> PTable<S, Long> count(PCollection<S> collect) {
   PTypeFamily tf = collect.getTypeFamily();
   return collect
       .parallelDo(
           "Aggregate.count",
           new MapFn<S, Pair<S, Long>>() {
             public Pair<S, Long> map(S input) {
               return Pair.of(input, 1L);
             }
           },
           tf.tableOf(collect.getPType(), tf.longs()))
       .groupByKey()
       .combineValues(Aggregators.SUM_LONGS());
 }
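A minimal usage sketch for the count helper above (the driver class and the paths are hypothetical): it produces one (line, occurrences) pair per distinct input line.

  Pipeline pipeline = new MRPipeline(CountLinesExample.class);
  PCollection<String> lines = pipeline.readTextFile("/tmp/lines.txt");
  // Each distinct line is mapped to the number of times it occurs.
  PTable<String, Long> occurrences = Aggregate.count(lines);
  pipeline.writeTextFile(occurrences, "/tmp/line-counts");
  pipeline.run();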
Example #7
  @Override
  public int run(final String[] args) throws Exception {
    createTable();
    final Configuration config = getConf();
    final Pipeline pipeline =
        new MRPipeline(CrunchStockDateInserter.class, "PipelineWithFilterFn", config);
    PCollection<String> lines = pipeline.readTextFile(Constants.HDFS_INPUT_PATH + "/2004_2014.csv");
    PCollection<Put> resultPut = CrunchUtils.returnDates(lines);
    System.out.println("********** size ************ : " + resultPut.getSize());

    pipeline.write(resultPut, new HBaseTarget(Constants.STOCK_DATES_TABLE));
    PipelineResult result = pipeline.done();
    return result.succeeded() ? 0 : 1;
  }
Example #8
  public int run(String[] args) throws Exception {

    Pipeline pipeline = new MRPipeline(SecondarySortingExample.class);
    // Read input
    PCollection<String> lines = pipeline.readTextFile(args[0]);
    // Split each line and count them
    PTable<String, Long> wordcount = lines.parallelDo(new Tokenizer(), Writables.strings()).count();
    // Sort
    PCollection<Pair<String, Long>> sorted =
        Sort.sortPairs(wordcount, ColumnOrder.by(1, Sort.Order.DESCENDING));
    // Write the output
    sorted.write(To.textFile(args[1]));
    // Kick off execution
    PipelineResult result = pipeline.done();
    return result.succeeded() ? 0 : 1;
  }
Example #9
 /**
  * Returns the number of elements in the provided PCollection.
  *
  * @param collect The PCollection whose elements should be counted.
  * @param <S> The type of the PCollection.
  * @return A {@code PObject} containing the number of elements in the {@code PCollection}.
  */
 public static <S> PObject<Long> length(PCollection<S> collect) {
   PTypeFamily tf = collect.getTypeFamily();
   PTable<Integer, Long> countTable =
       collect
           .parallelDo(
               "Aggregate.count",
               new MapFn<S, Pair<Integer, Long>>() {
                 public Pair<Integer, Long> map(S input) {
                   return Pair.of(1, 1L);
                 }
               },
               tf.tableOf(tf.ints(), tf.longs()))
           .groupByKey()
           .combineValues(Aggregators.SUM_LONGS());
   PCollection<Long> count = countTable.values();
   return new FirstElementPObject<Long>(count);
 }
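A brief usage sketch for length, assuming it sits in the same Aggregate library as count above; the input path is hypothetical. Calling getValue() on the returned PObject forces the pipeline to run and hands back the single count.

  PCollection<String> lines = pipeline.readTextFile("/tmp/input.txt");
  PObject<Long> size = Aggregate.length(lines);
  Long numElements = size.getValue();  // triggers execution and returns the element count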
Example #10
  @Override
  protected MRPipeline createPipeline() throws IOException {
    JobStepConfig stepConfig = getConfig();
    Config config = ConfigUtils.getDefaultConfig();
    ClusterSettings clusterSettings = ClusterSettings.create(config);

    String instanceDir = stepConfig.getInstanceDir();
    long generationID = stepConfig.getGenerationID();
    String prefix = Namespaces.getInstanceGenerationPrefix(instanceDir, generationID);
    String outputKey = prefix + "weighted/";
    if (!validOutputPath(outputKey)) {
      return null;
    }

    String indexKey = prefix + "sketch/" + clusterSettings.getSketchIterations();
    String inputKey = prefix + "normalized/";
    MRPipeline p = createBasicPipeline(ClosestSketchVectorFn.class);

    // First, compute the weight of each k-sketch vector (i.e., its Voronoi partition),
    // aggregate the results together, and persist them to disk.
    PCollection<ClosestSketchVectorData> weights =
        PTables.asPTable(
                inputPairs(p, inputKey, MLAvros.vector())
                    .parallelDo(
                        "computingSketchVectorWeights",
                        new ClosestSketchVectorFn<RealVector>(indexKey, clusterSettings),
                        Avros.pairs(Avros.ints(), Avros.reflects(ClosestSketchVectorData.class))))
            .groupByKey(1)
            .combineValues(new ClosestSketchVectorAggregator(clusterSettings))
            .values()
            .write(avroOutput(outputKey + "kSketchVectorWeights/"));

    // this "pipeline" takes a single ClosestSketchVectorData and returns weighted vectors
    // could be done outside MapReduce, but that would require me to materialize the
    // ClosestSketchVectorData
    weights
        .parallelDo(
            "generatingWeightedSketchVectors",
            new WeightVectorsFn(indexKey),
            KMeansTypes.FOLD_WEIGHTED_VECTOR)
        .write(avroOutput(outputKey + "weightedKSketchVectors/"));

    return p;
  }
Example #11
  /** Returns the smallest element of the input collection, according to its natural ordering. */
  public static <S> PObject<S> min(PCollection<S> collect) {
    Class<S> clazz = collect.getPType().getTypeClass();
    if (!clazz.isPrimitive() && !Comparable.class.isAssignableFrom(clazz)) {
      throw new IllegalArgumentException(
          "Can only get min for Comparable elements, not for: "
              + collect.getPType().getTypeClass());
    }
    PTypeFamily tf = collect.getTypeFamily();
    PCollection<S> minCollect =
        PTables.values(
            collect
                .parallelDo(
                    "min",
                    new DoFn<S, Pair<Boolean, S>>() {
                      private transient S min = null;

                      public void process(S input, Emitter<Pair<Boolean, S>> emitter) {
                        if (min == null || ((Comparable<S>) min).compareTo(input) > 0) {
                          min = input;
                        }
                      }

                      public void cleanup(Emitter<Pair<Boolean, S>> emitter) {
                        if (min != null) {
                          emitter.emit(Pair.of(false, min));
                        }
                      }
                    },
                    tf.tableOf(tf.booleans(), collect.getPType()))
                .groupByKey()
                .combineValues(
                    new CombineFn<Boolean, S>() {
                      public void process(
                          Pair<Boolean, Iterable<S>> input, Emitter<Pair<Boolean, S>> emitter) {
                        S min = null;
                        for (S v : input.second()) {
                          if (min == null || ((Comparable<S>) min).compareTo(v) > 0) {
                            min = v;
                          }
                        }
                        emitter.emit(Pair.of(input.first(), min));
                      }
                    }));
    return new FirstElementPObject<S>(minCollect);
  }
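A short usage sketch for min, assuming it is exposed through Crunch's Aggregate library like the helpers above; the in-memory input values are only illustrative.

  PCollection<Integer> values = MemPipeline.typedCollectionOf(Avros.ints(), 5, 3, 9, 1);
  PObject<Integer> smallest = Aggregate.min(values);
  Integer result = smallest.getValue();  // runs the in-memory pipeline; 1 for this input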
Example #12
 /**
  * Partitions {@code collection} to be stored efficiently in {@code View}.
  *
  * <p>This restructures the parallel collection so that all of the entities that will be stored in
  * a given partition will be processed by the same writer.
  *
  * <p>If the dataset is not partitioned, then this will structure all of the entities to produce a
  * number of files equal to {@code numWriters}.
  *
  * @param collection a collection of entities
  * @param view a {@link View} of a dataset to partition the collection for
  * @param numWriters the number of writers that should be used
  * @param <E> the type of entities in the collection and underlying dataset
  * @return an equivalent collection of entities partitioned for the view
  * @see #partition(PCollection, View)
  * @since 0.16.0
  */
 public static <E> PCollection<E> partition(
     PCollection<E> collection, View<E> view, int numWriters) {
   DatasetDescriptor descriptor = view.getDataset().getDescriptor();
   if (descriptor.isPartitioned()) {
     GetStorageKey<E> getKey = new GetStorageKey<E>(view);
     PTable<GenericData.Record, E> table = collection.by(getKey, Avros.generics(getKey.schema()));
     PGroupedTable<GenericData.Record, E> grouped =
         numWriters > 0 ? table.groupByKey(numWriters) : table.groupByKey();
     return grouped.ungroup().values();
   } else {
     return partition(collection, numWriters);
   }
 }
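A usage sketch for the view-aware partitioner, assuming it is the one exposed by CrunchDatasets (as used in the tests above); the datasets, pipeline, and writer count are illustrative.

  PCollection<GenericData.Record> users = pipeline.read(CrunchDatasets.asSource(inputDataset));
  // Restructure the collection so each partition of outputDataset is written by a single
  // writer, using at most 4 writers.
  PCollection<GenericData.Record> partitioned = CrunchDatasets.partition(users, outputDataset, 4);
  pipeline.write(partitioned, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);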
Example #13
  @Test
  public void testMultipleFileReadingFromCrunch() throws IOException {
    Dataset<Record> inputDatasetA =
        repo.create("ns", "inA", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
    Dataset<Record> inputDatasetB =
        repo.create("ns", "inB", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
    Dataset<Record> outputDataset =
        repo.create("ns", "out", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

    // write two files, each of 5 records
    writeTestUsers(inputDatasetA, 5, 0);
    writeTestUsers(inputDatasetB, 5, 5);

    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
    PCollection<GenericData.Record> dataA = pipeline.read(CrunchDatasets.asSource(inputDatasetA));
    PCollection<GenericData.Record> dataB = pipeline.read(CrunchDatasets.asSource(inputDatasetB));
    pipeline.write(
        dataA.union(dataB), CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
    pipeline.run();

    checkTestUsers(outputDataset, 10);
  }
Example #14
  public static void main(String[] args) throws Exception {

    Pipeline pipeline = new MRPipeline(WordCount.class);
    PCollection<String> lines = pipeline.readTextFile(args[0]);

    PCollection<String> words =
        lines.parallelDo(
            "my splitter",
            new DoFn<String, String>() {
              public void process(String line, Emitter<String> emitter) {
                for (String word : line.split("\\s+")) {
                  emitter.emit(word);
                }
              }
            },
            Writables.strings());

    PTable<String, Long> counts = Aggregate.count(words);

    pipeline.writeTextFile(counts, args[1]);
    pipeline.run();
  }
Example #15
  @Override
  public int execute(Configuration conf) throws IOException {
    Pipeline p = pipelineParams.create(KMeansSketchCommand.class, conf);
    List<Vector> initial = null;
    if (initVectorsPath != null) {
      initial = getInitialVectors(p);
    }

    PCollection<Vector> input = inputParams.getVectors(p);
    if (initial == null || initial.isEmpty()) {
      initial = Lists.newArrayList();
      initial.add(input.materialize().iterator().next());
    }
    KMeansParallel kmp = new KMeansParallel(randomParams.getRandom(), indexBits, indexSamples);
    Crossfold cf = new Crossfold(crossFolds);

    List<List<Weighted<Vector>>> wv =
        kmp.initialization(input, numIterations, samplesPerIteration, initial, cf);
    AvroIO.write(toWeightedCenters(wv), new File(outputFile));
    p.done();

    return 0;
  }
Example #16
 @Test
 public void unionMaterializeShouldNotThrowNPE() throws Exception {
   checkMaterialized(union.materialize());
   checkMaterialized(pipeline.materialize(union));
 }
Example #17
 @Test
 public void testDifference() throws Exception {
   PCollection<String> difference = Set.difference(set1, set2);
   assertEquals(Lists.newArrayList("b", "e"), Lists.newArrayList(difference.materialize()));
 }
Example #18
 @Test
 public void testIntersection() throws Exception {
   PCollection<String> intersection = Set.intersection(set1, set2);
   assertEquals(Lists.newArrayList("a", "c"), Lists.newArrayList(intersection.materialize()));
 }
Example #19
 public void runAsCollection(Pipeline pipeline) throws IOException {
   PCollection<String> shakespeare = getPCollection(pipeline);
   String[] lines = new String[LINES_IN_SHAKES];
   lines = shakespeare.asCollection().getValue().toArray(lines);
   verifyLines(lines);
 }