@Test
public void testSignalReadyOutputView() {
  Assume.assumeTrue(!Hadoop.isHadoop1());
  Dataset<Record> inputDataset = repo.create("ns", "in",
      new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

  writeTestUsers(inputDataset, 10);

  View<Record> inputView = inputDataset.with("username", "test-8", "test-9");
  View<Record> outputView = outputDataset.with("username", "test-8", "test-9");
  Assert.assertEquals(2, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(2, datasetSize(outputView));

  Assert.assertFalse("Output dataset should not be signaled ready",
      ((Signalable) outputDataset).isReady());
  Assert.assertTrue("Output view should be signaled ready",
      ((Signalable) outputView).isReady());
}
@Test
public void testTargetView() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder()
      .hash("username", 2).build();
  Dataset<Record> inputDataset = repo.create("ns", "in",
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .partitionStrategy(partitionStrategy)
          .build());
  Dataset<Record> outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .partitionStrategy(partitionStrategy)
          .build());

  writeTestUsers(inputDataset, 10);

  View<Record> inputView = inputDataset.with("username", "test-0");
  Assert.assertEquals(1, datasetSize(inputView));
  View<Record> outputView = outputDataset.with("username", "test-0");

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
@Test
public void testPartitionedSource() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder()
      .hash("username", 2).build();
  Dataset<Record> inputDataset = repo.create("ns", "in",
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .partitionStrategy(partitionStrategy)
          .build());
  Dataset<Record> outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .format(Formats.PARQUET)
          .build());

  writeTestUsers(inputDataset, 10);

  // read only partition 0 of the hash-partitioned input
  PartitionKey key = new PartitionKey(0);
  Dataset<Record> inputPart0 =
      ((PartitionedDataset<Record>) inputDataset).getPartition(key, false);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputPart0));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  // the two-bucket hash of the 10 test usernames places 5 records in partition 0
  Assert.assertEquals(5, datasetSize(outputDataset));
}
@Test
public void testUseReaderSchema() throws IOException {
  // Create a schema with only a username, so we can test reading it
  // with an enhanced record structure.
  Schema oldRecordSchema = SchemaBuilder.record("org.kitesdk.data.user.OldUserRecord")
      .fields()
      .requiredString("username")
      .endRecord();

  // create the datasets
  Dataset<Record> in = repo.create("ns", "in",
      new DatasetDescriptor.Builder().schema(oldRecordSchema).build());
  Dataset<Record> out = repo.create("ns", "out",
      new DatasetDescriptor.Builder().schema(oldRecordSchema).build());

  Record oldUser = new Record(oldRecordSchema);
  oldUser.put("username", "user");

  DatasetWriter<Record> writer = in.newWriter();
  try {
    writer.write(oldUser);
  } finally {
    writer.close();
  }

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);

  // read data from the updated dataset that has the new schema.
  // At this point, the User class has the old schema
  PCollection<NewUserRecord> data = pipeline.read(
      CrunchDatasets.asSource(in.getUri(), NewUserRecord.class));
  PCollection<NewUserRecord> processed = data.parallelDo(
      new UserRecordIdentityFn(), Avros.records(NewUserRecord.class));
  pipeline.write(processed, CrunchDatasets.asTarget(out));

  Assert.assertTrue("Pipeline failed.", pipeline.run().succeeded());

  // open the reader only after the pipeline has finished writing
  DatasetReader<Record> reader = out.newReader();
  try {
    // there should be one record that is equal to our old user generic record
    Assert.assertEquals(oldUser, reader.next());
    Assert.assertFalse(reader.hasNext());
  } finally {
    reader.close();
  }
}
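// Hedged sketch: UserRecordIdentityFn is referenced above but defined outside
// this excerpt. A minimal implementation consistent with its use in parallelDo
// would be a pass-through Crunch MapFn, so the pipeline exercises only Avro
// schema resolution between OldUserRecord and NewUserRecord.
private static class UserRecordIdentityFn extends MapFn<NewUserRecord, NewUserRecord> {
  @Override
  public NewUserRecord map(NewUserRecord input) {
    // identity: return each record unchanged
    return input;
  }
}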
@Test(expected = CrunchRuntimeException.class)
public void testWriteModeDefaultFailsWithExisting() throws IOException {
  Dataset<Record> inputDataset = repo.create("ns", "in",
      new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

  writeTestUsers(inputDataset, 1, 0);
  writeTestUsers(outputDataset, 1, 0);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  // the default write mode refuses to write to a non-empty target
  pipeline.write(data, CrunchDatasets.asTarget((View<Record>) outputDataset));
}
@Test
public void testGeneric() throws IOException {
  Dataset<Record> inputDataset = repo.create("ns", "in",
      new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

  // write two files, each of 5 records
  writeTestUsers(inputDataset, 5, 0);
  writeTestUsers(inputDataset, 5, 5);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkTestUsers(outputDataset, 10);
}
@Test
public void testWriteModeOverwrite() throws IOException {
  Dataset<Record> inputDataset = repo.create("ns", "in",
      new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

  writeTestUsers(inputDataset, 1, 0);
  writeTestUsers(outputDataset, 1, 1);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget((View<Record>) outputDataset),
      Target.WriteMode.OVERWRITE);
  pipeline.run();

  // OVERWRITE replaces the pre-existing record, so only the one input record remains
  checkTestUsers(outputDataset, 1);
}
private void runCheckpointPipeline(View<Record> inputView, View<Record> outputView) {
  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.CHECKPOINT);
  pipeline.done();
}
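// Hedged usage sketch (not part of the original suite): one plausible caller of
// the helper above. Dataset extends View in Kite, so datasets can be passed
// directly; the size assertion assumes CHECKPOINT behaves like a normal write
// when the target has no prior checkpoint.
@Test
public void testCheckpointPipelineSketch() throws IOException {
  Dataset<Record> inputDataset = repo.create("ns", "in",
      new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
  writeTestUsers(inputDataset, 10);

  runCheckpointPipeline(inputDataset, outputDataset);

  Assert.assertEquals(10, datasetSize(outputDataset));
}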