Example #1
  @Test
  public void testTargetView() throws IOException {
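    // Copy a one-record input view into the matching view of the output dataset and verify the count.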
    PartitionStrategy partitionStrategy =
        new PartitionStrategy.Builder().hash("username", 2).build();

    Dataset<Record> inputDataset =
        repo.create(
            "ns",
            "in",
            new DatasetDescriptor.Builder()
                .schema(USER_SCHEMA)
                .partitionStrategy(partitionStrategy)
                .build());
    Dataset<Record> outputDataset =
        repo.create(
            "ns",
            "out",
            new DatasetDescriptor.Builder()
                .schema(USER_SCHEMA)
                .partitionStrategy(partitionStrategy)
                .build());

    writeTestUsers(inputDataset, 10);

    View<Record> inputView = inputDataset.with("username", "test-0");
    Assert.assertEquals(1, datasetSize(inputView));
    View<Record> outputView = outputDataset.with("username", "test-0");

    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
    PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputView));
    pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
    pipeline.run();

    Assert.assertEquals(1, datasetSize(outputDataset));
  }
Example #2
  @Test
  public void testPartitionedSource() throws IOException {
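    // Read a single hash partition of the input dataset and append its five records to an unpartitioned Parquet dataset.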
    PartitionStrategy partitionStrategy =
        new PartitionStrategy.Builder().hash("username", 2).build();

    Dataset<Record> inputDataset =
        repo.create(
            "ns",
            "in",
            new DatasetDescriptor.Builder()
                .schema(USER_SCHEMA)
                .partitionStrategy(partitionStrategy)
                .build());
    Dataset<Record> outputDataset =
        repo.create(
            "ns",
            "out",
            new DatasetDescriptor.Builder().schema(USER_SCHEMA).format(Formats.PARQUET).build());

    writeTestUsers(inputDataset, 10);

    PartitionKey key = new PartitionKey(0);
    Dataset<Record> inputPart0 =
        ((PartitionedDataset<Record>) inputDataset).getPartition(key, false);

    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
    PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputPart0));
    pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
    pipeline.run();

    Assert.assertEquals(5, datasetSize(outputDataset));
  }
Example #3
  @Test
  public void testSignalReadyOutputView() {
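    // Writing through a view should signal only that view ready, not the whole output dataset.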
    Assume.assumeTrue(!Hadoop.isHadoop1());
    Dataset<Record> inputDataset =
        repo.create("ns", "in", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

    Dataset<Record> outputDataset =
        repo.create("ns", "out", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

    writeTestUsers(inputDataset, 10);

    View<Record> inputView = inputDataset.with("username", "test-8", "test-9");
    View<Record> outputView = outputDataset.with("username", "test-8", "test-9");
    Assert.assertEquals(2, datasetSize(inputView));

    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
    PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputView));
    pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
    pipeline.run();

    Assert.assertEquals(2, datasetSize(outputView));

    Assert.assertFalse(
        "Output dataset should not be signaled ready", ((Signalable) outputDataset).isReady());
    Assert.assertTrue("Output view should be signaled ready", ((Signalable) outputView).isReady());
  }
Example #4
  @Test
  public void unionWriteShouldNotThrowNPE() throws IOException {
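    // Write the unioned collection to multiple targets and verify the contents of each output file.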
    String outputPath1 = tmpDir.getFileName("output1");
    String outputPath2 = tmpDir.getFileName("output2");
    String outputPath3 = tmpDir.getFileName("output3");

    if (typeFamily == AvroTypeFamily.getInstance()) {
      union.write(To.avroFile(outputPath1));
      pipeline.write(union, To.avroFile(outputPath2));

      pipeline.run();

      checkFileContents(outputPath1);
      checkFileContents(outputPath2);

    } else {

      union.write(To.textFile(outputPath1));
      pipeline.write(union, To.textFile(outputPath2));
      pipeline.writeTextFile(union, outputPath3);

      pipeline.run();

      checkFileContents(outputPath1);
      checkFileContents(outputPath2);
      checkFileContents(outputPath3);
    }
  }
Example #5
  @Before
  @SuppressWarnings("unchecked")
  public void setUp() throws IOException {
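    // Build an in-memory or MapReduce pipeline, read the two input files, and union them for the tests.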
    String inputFile1 = tmpDir.copyResourceFileName("set1.txt");
    String inputFile2 = tmpDir.copyResourceFileName("set2.txt");
    if (pipelineClass == null) {
      pipeline = MemPipeline.getInstance();
    } else {
      pipeline = new MRPipeline(pipelineClass, tmpDir.getDefaultConfiguration());
    }
    PCollection<String> firstCollection =
        pipeline.read(At.textFile(inputFile1, typeFamily.strings()));
    PCollection<String> secondCollection =
        pipeline.read(At.textFile(inputFile2, typeFamily.strings()));

    LOG.info(
        "Test fixture: ["
            + pipeline.getClass().getSimpleName()
            + " : "
            + typeFamily.getClass().getSimpleName()
            + "]  First: "
            + Lists.newArrayList(firstCollection.materialize().iterator())
            + ", Second: "
            + Lists.newArrayList(secondCollection.materialize().iterator()));

    union = secondCollection.union(firstCollection);
  }
Example #6
  @Test
  public void testUseReaderSchema() throws IOException {

    // Create a schema with only a username, so we can test reading it
    // with an enhanced record structure.
    Schema oldRecordSchema =
        SchemaBuilder.record("org.kitesdk.data.user.OldUserRecord")
            .fields()
            .requiredString("username")
            .endRecord();

    // Create the input and output datasets
    Dataset<Record> in =
        repo.create("ns", "in", new DatasetDescriptor.Builder().schema(oldRecordSchema).build());
    Dataset<Record> out =
        repo.create("ns", "out", new DatasetDescriptor.Builder().schema(oldRecordSchema).build());
    Record oldUser = new Record(oldRecordSchema);
    oldUser.put("username", "user");

    DatasetWriter<Record> writer = in.newWriter();

    try {

      writer.write(oldUser);

    } finally {
      writer.close();
    }

    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);

    // Read the data back using the enhanced NewUserRecord class;
    // the dataset itself was written with the old schema.
    PCollection<NewUserRecord> data =
        pipeline.read(CrunchDatasets.asSource(in.getUri(), NewUserRecord.class));

    PCollection<NewUserRecord> processed =
        data.parallelDo(new UserRecordIdentityFn(), Avros.records(NewUserRecord.class));

    pipeline.write(processed, CrunchDatasets.asTarget(out));

    DatasetReader<Record> reader = out.newReader();

    Assert.assertTrue("Pipeline failed.", pipeline.run().succeeded());

    try {

      // There should be exactly one record, equal to the old generic user record.
      Assert.assertEquals(oldUser, reader.next());
      Assert.assertFalse(reader.hasNext());

    } finally {
      reader.close();
    }
  }
Example #7
  @Override
  public int run(final String[] args) throws Exception {
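    // Read the input CSV, build HBase Puts for the stock dates, and write them to the stock-dates table.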
    createTable();
    final Configuration config = getConf();
    final Pipeline pipeline =
        new MRPipeline(CrunchStockDateInserter.class, "PipelineWithFilterFn", config);
    PCollection<String> lines = pipeline.readTextFile(Constants.HDFS_INPUT_PATH + "/2004_2014.csv");
    PCollection<Put> resultPut = CrunchUtils.returnDates(lines);
    System.out.println("********** size ************ : " + resultPut.getSize());

    pipeline.write(resultPut, new HBaseTarget(Constants.STOCK_DATES_TABLE));
    PipelineResult result = pipeline.done();
    return result.succeeded() ? 0 : 1;
  }
Example #8
  private void checkFileContents(String filePath) throws IOException {
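    // Read the file back (text or Avro, depending on the type family), sort the values, and compare with EXPECTED.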

    List<String> fileContentValues =
        (typeFamily != AvroTypeFamily.getInstance())
            ? Lists.newArrayList(
                pipeline.read(At.textFile(filePath, typeFamily.strings())).materialize().iterator())
            : Lists.newArrayList(
                pipeline.read(At.avroFile(filePath, Avros.strings())).materialize().iterator());

    Collections.sort(fileContentValues);

    LOG.info("Saved Union: " + fileContentValues);
    assertEquals(EXPECTED, fileContentValues);
  }
Example #9
  @Test(expected = CrunchRuntimeException.class)
  public void testWriteModeDefaultFailsWithExisting() throws IOException {
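    // The default write mode must fail because the output dataset already contains data.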
    Dataset<Record> inputDataset =
        repo.create("ns", "in", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
    Dataset<Record> outputDataset =
        repo.create("ns", "out", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

    writeTestUsers(inputDataset, 1, 0);
    writeTestUsers(outputDataset, 1, 0);

    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
    PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputDataset));
    pipeline.write(data, CrunchDatasets.asTarget((View<Record>) outputDataset));
  }
Example #10
  private void runMapsideJoin(Pipeline pipeline, boolean inMemory, boolean materialize) {
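    // Map-side join customers with orders, then join the result with a capitalized copy of the orders and verify it.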
    PTable<Integer, String> customerTable = readTable(pipeline, "customers.txt");
    PTable<Integer, String> orderTable = readTable(pipeline, "orders.txt");

    JoinStrategy<Integer, String, String> mapsideJoin =
        new MapsideJoinStrategy<Integer, String, String>(materialize);
    PTable<Integer, String> custOrders =
        mapsideJoin
            .join(customerTable, orderTable, JoinType.INNER_JOIN)
            .mapValues("concat", new ConcatValuesFn(), Writables.strings());

    PTable<Integer, String> capOrders =
        orderTable.mapValues(new CapOrdersFn(), orderTable.getValueType());
    PTable<Integer, Pair<String, String>> joined =
        mapsideJoin.join(custOrders, capOrders, JoinType.INNER_JOIN);

    List<Pair<Integer, Pair<String, String>>> expectedJoinResult = Lists.newArrayList();
    expectedJoinResult.add(Pair.of(111, Pair.of("[John Doe,Corn flakes]", "CORN FLAKES")));
    expectedJoinResult.add(Pair.of(222, Pair.of("[Jane Doe,Toilet paper]", "TOILET PAPER")));
    expectedJoinResult.add(Pair.of(222, Pair.of("[Jane Doe,Toilet paper]", "TOILET PLUNGER")));
    expectedJoinResult.add(Pair.of(222, Pair.of("[Jane Doe,Toilet plunger]", "TOILET PAPER")));
    expectedJoinResult.add(Pair.of(222, Pair.of("[Jane Doe,Toilet plunger]", "TOILET PLUNGER")));
    expectedJoinResult.add(Pair.of(333, Pair.of("[Someone Else,Toilet brush]", "TOILET BRUSH")));
    Iterable<Pair<Integer, Pair<String, String>>> iter = joined.materialize();

    PipelineResult res = pipeline.run();
    if (!inMemory) {
      assertEquals(materialize ? 2 : 1, res.getStageResults().size());
    }

    List<Pair<Integer, Pair<String, String>>> joinedResultList = Lists.newArrayList(iter);
    Collections.sort(joinedResultList);

    assertEquals(expectedJoinResult, joinedResultList);
  }
Example #11
  public int run(String[] args) throws Exception {

    Pipeline pipeline = new MRPipeline(SecondarySortingExample.class);
    // Read input
    PCollection<String> lines = pipeline.readTextFile(args[0]);
    // Split each line and count them
    PTable<String, Long> wordcount = lines.parallelDo(new Tokenizer(), Writables.strings()).count();
    // Sort
    PCollection<Pair<String, Long>> sorted =
        Sort.sortPairs(wordcount, ColumnOrder.by(1, Sort.Order.DESCENDING));
    // Write the sorted output
    sorted.write(To.textFile(args[1]));
    // Kick off execution
    PipelineResult result = pipeline.done();
    return result.succeeded() ? 0 : 1;
  }
Example #12
 public void materialize() {
   try {
     materialized = sourceTarget.read(pipeline.getConfiguration());
   } catch (IOException e) {
     LOG.error("Could not materialize: " + sourceTarget, e);
     throw new CrunchRuntimeException(e);
   }
 }
Example #13
 @Override
 public Iterator<E> iterator() {
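   // Run the pipeline and materialize the results lazily on first access.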
   if (materialized == null) {
     pipeline.run();
     materialize();
   }
   return materialized.iterator();
 }
Example #14
 @Before
 public void setUp() throws IOException {
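   // Copy the two input files and read them into the PCollections used by the set-operation tests.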
   String set1InputPath = FileHelper.createTempCopyOf("set1.txt");
   String set2InputPath = FileHelper.createTempCopyOf("set2.txt");
   pipeline = new MRPipeline(SetTest.class);
   set1 = pipeline.read(At.textFile(set1InputPath, typeFamily.strings()));
   set2 = pipeline.read(At.textFile(set2InputPath, typeFamily.strings()));
 }
Example #15
  @Test
  public void testGeneric() throws IOException {
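    // Copy all records from the input dataset to the output dataset and verify that all ten users arrive.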
    Dataset<Record> inputDataset =
        repo.create("ns", "in", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
    Dataset<Record> outputDataset =
        repo.create("ns", "out", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

    // write two files, each of 5 records
    writeTestUsers(inputDataset, 5, 0);
    writeTestUsers(inputDataset, 5, 5);

    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
    PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputDataset));
    pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
    pipeline.run();

    checkTestUsers(outputDataset, 10);
  }
Example #16
  @Test
  public void testWriteModeOverwrite() throws IOException {
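    // OVERWRITE mode should replace the output dataset's existing records with the input records.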
    Dataset<Record> inputDataset =
        repo.create("ns", "in", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
    Dataset<Record> outputDataset =
        repo.create("ns", "out", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

    writeTestUsers(inputDataset, 1, 0);
    writeTestUsers(outputDataset, 1, 1);

    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
    PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputDataset));
    pipeline.write(
        data, CrunchDatasets.asTarget((View<Record>) outputDataset), Target.WriteMode.OVERWRITE);

    pipeline.run();

    checkTestUsers(outputDataset, 1);
  }
Example #17
 private PTable<Integer, String> readTable(Pipeline pipeline, String filename) {
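   // Parse each line of the copied resource file into an (int, String) table entry.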
   try {
     return pipeline
         .readTextFile(tmpDir.copyResourceFileName(filename))
         .parallelDo(
             "asTable",
             new LineSplitter(),
             Writables.tableOf(Writables.ints(), Writables.strings()));
   } catch (IOException e) {
     throw new RuntimeException(e);
   }
 }
Example #18
  public static void main(String[] args) throws Exception {
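    // Classic word count: split each line into words, count them, and write the counts as text.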

    Pipeline pipeline = new MRPipeline(WordCount.class);
    PCollection<String> lines = pipeline.readTextFile(args[0]);

    PCollection<String> words =
        lines.parallelDo(
            "my splitter",
            new DoFn<String, String>() {
              public void process(String line, Emitter<String> emitter) {
                for (String word : line.split("\\s+")) {
                  emitter.emit(word);
                }
              }
            },
            Writables.strings());

    PTable<String, Long> counts = Aggregate.count(words);

    pipeline.writeTextFile(counts, args[1]);
    pipeline.run();
  }
Example #19
  @Override
  public int execute(Configuration conf) throws IOException {
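    // Run the KMeansParallel initialization over the input vectors and write the weighted centers to the output file.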
    Pipeline p = pipelineParams.create(KMeansSketchCommand.class, conf);
    List<Vector> initial = null;
    if (initVectorsPath != null) {
      initial = getInitialVectors(p);
    }

    PCollection<Vector> input = inputParams.getVectors(p);
    if (initial == null || initial.isEmpty()) {
      initial = Lists.newArrayList();
      initial.add(input.materialize().iterator().next());
    }
    KMeansParallel kmp = new KMeansParallel(randomParams.getRandom(), indexBits, indexSamples);
    Crossfold cf = new Crossfold(crossFolds);

    List<List<Weighted<Vector>>> wv =
        kmp.initialization(input, numIterations, samplesPerIteration, initial, cf);
    AvroIO.write(toWeightedCenters(wv), new File(outputFile));
    p.done();

    return 0;
  }
Example #20
  @Override
  public int execute(Configuration conf) throws Exception {
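    // Build a Summary of the input records, optionally using a header file to mark symbolic and ignored columns.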
    Pipeline p = pipelineParams.create(SummaryCommand.class, conf);
    PCollection<Record> records = inputParams.getRecords(p);

    Spec spec = null;
    List<Integer> symbolicColumns = Lists.newArrayList();
    List<Integer> ignoredColumns = Lists.newArrayList();
    if (headerFile != null) {
      spec = Specs.readFromHeaderFile(headerFile, ignoredColumns, symbolicColumns);
    }

    Summarizer summarizer =
        new Summarizer()
            .spec(spec)
            .defaultToSymbolic(false)
            .exceptionColumns(symbolicColumns)
            .ignoreColumns(ignoredColumns);
    Summary summary = summarizer.build(records).getValue();
    summaryParams.save(summary, summaryFile);

    p.done();
    return 0;
  }
Example #21
  @Override
  public int execute(Configuration conf) throws Exception {
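    // Take either a fixed-size reservoir sample or a probability-based sample of the records, never both.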
    Pipeline p = pipelineParams.create(SampleCommand.class, conf);
    PCollection<Record> elements = inputParams.getRecords(p);

    if (sampleSize > 0 && samplingProbability > 0.0) {
      throw new IllegalArgumentException("--size and --prob are mutually exclusive options.");
    }
    PCollection<Record> sample;
    if (sampleSize > 0) {
      sample = ReservoirSampling.sample(elements, sampleSize);
    } else if (samplingProbability > 0.0 && samplingProbability < 1.0) {
      sample = Sample.sample(elements, samplingProbability);
    } else {
      throw new IllegalArgumentException(
          String.format(
              "Invalid input args: sample size = %d, sample prob = %.4f",
              sampleSize, samplingProbability));
    }
    outputParams.write(sample, sampleFile);

    PipelineResult pr = p.done();
    return pr.succeeded() ? 0 : 1;
  }
Example #22
 private void runCheckpointPipeline(View<Record> inputView, View<Record> outputView) {
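   // Copy the input view to the output view using the CHECKPOINT write mode.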
   Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
   PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputView));
   pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.CHECKPOINT);
   pipeline.done();
 }
Example #23
 @Test
 public void unionMaterializeShouldNotThrowNPE() throws Exception {
   checkMaterialized(union.materialize());
   checkMaterialized(pipeline.materialize(union));
 }
Example #24
 @After
 public void tearDown() {
   pipeline.done();
 }
Example #25
 private PCollection<String> getPCollection(Pipeline pipeline) throws IOException {
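   // Copy the shakes.txt resource to the temp directory and read it as a PCollection of lines.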
   String shakesInputPath = tmpDir.copyResourceFileName("shakes.txt");
   PCollection<String> shakespeare = pipeline.readTextFile(shakesInputPath);
   return shakespeare;
 }