Example #1
  @Test
  public void testSignalReadyOutputView() {
    Assume.assumeTrue(!Hadoop.isHadoop1());
    Dataset<Record> inputDataset =
        repo.create("ns", "in", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

    Dataset<Record> outputDataset =
        repo.create("ns", "out", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

    writeTestUsers(inputDataset, 10);

    View<Record> inputView = inputDataset.with("username", "test-8", "test-9");
    View<Record> outputView = outputDataset.with("username", "test-8", "test-9");
    Assert.assertEquals(2, datasetSize(inputView));

    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
    PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputView));
    pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
    pipeline.run();

    Assert.assertEquals(2, datasetSize(outputView));

    Assert.assertFalse(
        "Output dataset should not be signaled ready", ((Signalable) outputDataset).isReady());
    Assert.assertTrue("Output view should be signaled ready", ((Signalable) outputView).isReady());
  }
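For context: these tests lean on a datasetSize helper that is not shown here. A minimal sketch of what it presumably does, counting the records visible through a view by draining a reader (the exact implementation is an assumption):

  // Hypothetical helper: counts records in a view by draining a reader.
  private int datasetSize(View<Record> view) {
    int size = 0;
    DatasetReader<Record> reader = view.newReader();
    try {
      while (reader.hasNext()) {
        reader.next();
        size++;
      }
    } finally {
      reader.close();
    }
    return size;
  }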
Example #2
  @Test
  public void testTargetView() throws IOException {
    PartitionStrategy partitionStrategy =
        new PartitionStrategy.Builder().hash("username", 2).build();

    Dataset<Record> inputDataset =
        repo.create(
            "ns",
            "in",
            new DatasetDescriptor.Builder()
                .schema(USER_SCHEMA)
                .partitionStrategy(partitionStrategy)
                .build());
    Dataset<Record> outputDataset =
        repo.create(
            "ns",
            "out",
            new DatasetDescriptor.Builder()
                .schema(USER_SCHEMA)
                .partitionStrategy(partitionStrategy)
                .build());

    writeTestUsers(inputDataset, 10);

    View<Record> inputView = inputDataset.with("username", "test-0");
    Assert.assertEquals(1, datasetSize(inputView));
    View<Record> outputView = outputDataset.with("username", "test-0");

    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
    PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputView));
    pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
    pipeline.run();

    Assert.assertEquals(1, datasetSize(outputDataset));
  }
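Both tests so far also call writeTestUsers, a fixture from the surrounding test class. A plausible sketch, assuming usernames of the form "test-<n>" (which the .with("username", ...) filters above depend on); the two-argument form presumably starts at index 0:

  // Hypothetical fixture: writes `count` users with usernames
  // "test-<start>" through "test-<start + count - 1>".
  private void writeTestUsers(Dataset<Record> dataset, int count, int start) {
    DatasetWriter<Record> writer = dataset.newWriter();
    try {
      for (int i = start; i < start + count; i++) {
        Record user = new Record(USER_SCHEMA);
        user.put("username", "test-" + i);
        writer.write(user);
      }
    } finally {
      writer.close();
    }
  }

  private void writeTestUsers(Dataset<Record> dataset, int count) {
    writeTestUsers(dataset, count, 0);
  }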
Example #3
  @Test
  public void testPartitionedSource() throws IOException {
    PartitionStrategy partitionStrategy =
        new PartitionStrategy.Builder().hash("username", 2).build();

    Dataset<Record> inputDataset =
        repo.create(
            "ns",
            "in",
            new DatasetDescriptor.Builder()
                .schema(USER_SCHEMA)
                .partitionStrategy(partitionStrategy)
                .build());
    Dataset<Record> outputDataset =
        repo.create(
            "ns",
            "out",
            new DatasetDescriptor.Builder().schema(USER_SCHEMA).format(Formats.PARQUET).build());

    writeTestUsers(inputDataset, 10);

    PartitionKey key = new PartitionKey(0);
    Dataset<Record> inputPart0 =
        ((PartitionedDataset<Record>) inputDataset).getPartition(key, false);

    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
    PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputPart0));
    pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
    pipeline.run();

    Assert.assertEquals(5, datasetSize(outputDataset));
  }
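Here new PartitionKey(0) addresses the first of the two buckets produced by the hash("username", 2) strategy, and getPartition(key, false) fetches it without creating it. The assertion of 5 records depends on the ten test usernames hashing evenly across the two buckets; under that same assumption, the other bucket could be checked symmetrically (hypothetical):

  // Hypothetical check: the remaining five records sit in the second bucket.
  Dataset<Record> inputPart1 =
      ((PartitionedDataset<Record>) inputDataset).getPartition(new PartitionKey(1), false);
  Assert.assertEquals(5, datasetSize(inputPart1));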
Example #4
  @Test
  public void testUseReaderSchema() throws IOException {

    // Create a schema with only a username, so we can test reading it
    // with an enhanced record structure.
    Schema oldRecordSchema =
        SchemaBuilder.record("org.kitesdk.data.user.OldUserRecord")
            .fields()
            .requiredString("username")
            .endRecord();

    // create the input and output datasets, both with the old schema
    Dataset<Record> in =
        repo.create("ns", "in", new DatasetDescriptor.Builder().schema(oldRecordSchema).build());
    Dataset<Record> out =
        repo.create("ns", "out", new DatasetDescriptor.Builder().schema(oldRecordSchema).build());
    Record oldUser = new Record(oldRecordSchema);
    oldUser.put("username", "user");

    DatasetWriter<Record> writer = in.newWriter();
    try {
      writer.write(oldUser);
    } finally {
      writer.close();
    }

    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);

    // Read the old-schema data back using the enhanced NewUserRecord class;
    // the stored records are resolved against this new reader schema.
    PCollection<NewUserRecord> data =
        pipeline.read(CrunchDatasets.asSource(in.getUri(), NewUserRecord.class));

    PCollection<NewUserRecord> processed =
        data.parallelDo(new UserRecordIdentityFn(), Avros.records(NewUserRecord.class));

    pipeline.write(processed, CrunchDatasets.asTarget(out));

    Assert.assertTrue("Pipeline failed.", pipeline.run().succeeded());

    // open the reader only after the pipeline has materialized the output
    DatasetReader<Record> reader = out.newReader();

    try {
      // there should be one record that is equal to our old user generic record
      Assert.assertEquals(oldUser, reader.next());
      Assert.assertFalse(reader.hasNext());
    } finally {
      reader.close();
    }
  }
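UserRecordIdentityFn and NewUserRecord are defined elsewhere in the test code. NewUserRecord is presumably an Avro-generated class whose schema adds a field with a default value on top of OldUserRecord, so that old records resolve cleanly against the new reader schema; the identity function is likely just a pass-through MapFn. Both shapes below are assumptions:

  // Hypothetical pass-through function, used only to run records
  // through a processing stage.
  private static class UserRecordIdentityFn extends MapFn<NewUserRecord, NewUserRecord> {
    @Override
    public NewUserRecord map(NewUserRecord input) {
      return input;
    }
  }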
Example #5
  @Test(expected = CrunchRuntimeException.class)
  public void testWriteModeDefaultFailsWithExisting() throws IOException {
    Dataset<Record> inputDataset =
        repo.create("ns", "in", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
    Dataset<Record> outputDataset =
        repo.create("ns", "out", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

    writeTestUsers(inputDataset, 1, 0);
    writeTestUsers(outputDataset, 1, 0);

    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
    PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputDataset));
    pipeline.write(data, CrunchDatasets.asTarget((View<Record>) outputDataset));
  }
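The write() call itself throws here: Crunch's default write mode refuses a target that already contains data. Passing an explicit mode, as the other tests in this file do, is the remedy:

    // Appending (or overwriting) succeeds where the default mode fails:
    pipeline.write(data, CrunchDatasets.asTarget((View<Record>) outputDataset),
        Target.WriteMode.APPEND);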
Example #6
  @Test
  public void testGeneric() throws IOException {
    Dataset<Record> inputDataset =
        repo.create("ns", "in", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
    Dataset<Record> outputDataset =
        repo.create("ns", "out", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

    // write two files, each of 5 records
    writeTestUsers(inputDataset, 5, 0);
    writeTestUsers(inputDataset, 5, 5);

    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
    PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputDataset));
    pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
    pipeline.run();

    checkTestUsers(outputDataset, 10);
  }
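checkTestUsers is the read-side counterpart of writeTestUsers. A minimal sketch under the same "test-<n>" username assumption (java.util.Set and HashSet imports assumed):

  // Hypothetical fixture: asserts the dataset holds exactly the users
  // "test-0" through "test-<count - 1>".
  private void checkTestUsers(Dataset<Record> dataset, int count) {
    Set<String> expected = new HashSet<String>();
    for (int i = 0; i < count; i++) {
      expected.add("test-" + i);
    }
    DatasetReader<Record> reader = dataset.newReader();
    try {
      while (reader.hasNext()) {
        Record actual = reader.next();
        Assert.assertTrue(expected.remove(actual.get("username").toString()));
      }
    } finally {
      reader.close();
    }
    Assert.assertTrue("Missing users: " + expected, expected.isEmpty());
  }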
Example #7
  @Test
  public void testWriteModeOverwrite() throws IOException {
    Dataset<Record> inputDataset =
        repo.create("ns", "in", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
    Dataset<Record> outputDataset =
        repo.create("ns", "out", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

    writeTestUsers(inputDataset, 1, 0);
    writeTestUsers(outputDataset, 1, 1);

    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
    PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputDataset));
    pipeline.write(
        data, CrunchDatasets.asTarget((View<Record>) outputDataset), Target.WriteMode.OVERWRITE);

    pipeline.run();

    checkTestUsers(outputDataset, 1);
  }
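Every test here uses a shared repo fixture. Something along these lines would back the tests with a throwaway local file system repository; the URI scheme and setup method are assumptions, not the project's actual configuration:

  // Hypothetical fixture: a fresh file system repository per test run.
  @Rule
  public TemporaryFolder temp = new TemporaryFolder();

  private DatasetRepository repo;

  @Before
  public void setUp() throws Exception {
    this.repo = DatasetRepositories.repositoryFor(
        "repo:file:" + temp.getRoot().getAbsolutePath());
  }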
Example #8
  private void runCheckpointPipeline(View<Record> inputView, View<Record> outputView) {
    Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
    PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputView));
    pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.CHECKPOINT);
    pipeline.done();
  }
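A call site for this helper would mirror the shape of the other tests in this file (hypothetical; the actual checkpoint tests are not shown). Crunch's CHECKPOINT mode rewrites the target only when the source data is newer than the existing checkpoint:

    // Hypothetical invocation: datasets are views, so they can be passed directly.
    Dataset<Record> in =
        repo.create("ns", "in", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
    Dataset<Record> out =
        repo.create("ns", "out", new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
    writeTestUsers(in, 10);
    runCheckpointPipeline(in, out);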