@Test
public void testTargetView() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder()
      .hash("username", 2).build();
  Dataset<Record> inputDataset = repo.create("ns", "in",
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .partitionStrategy(partitionStrategy)
          .build());
  Dataset<Record> outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .partitionStrategy(partitionStrategy)
          .build());

  writeTestUsers(inputDataset, 10);

  View<Record> inputView = inputDataset.with("username", "test-0");
  Assert.assertEquals(1, datasetSize(inputView));
  View<Record> outputView = outputDataset.with("username", "test-0");

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
@Test
public void testPartitionedSource() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder()
      .hash("username", 2).build();
  Dataset<Record> inputDataset = repo.create("ns", "in",
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .partitionStrategy(partitionStrategy)
          .build());
  Dataset<Record> outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .format(Formats.PARQUET)
          .build());

  writeTestUsers(inputDataset, 10);

  // Read only hash bucket 0; the fixture hashes 5 of the 10 users into it.
  PartitionKey key = new PartitionKey(0);
  Dataset<Record> inputPart0 =
      ((PartitionedDataset<Record>) inputDataset).getPartition(key, false);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputPart0));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(5, datasetSize(outputDataset));
}
@Test
public void testSignalReadyOutputView() {
  Assume.assumeTrue(!Hadoop.isHadoop1());
  Dataset<Record> inputDataset = repo.create("ns", "in",
      new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

  writeTestUsers(inputDataset, 10);

  View<Record> inputView = inputDataset.with("username", "test-8", "test-9");
  View<Record> outputView = outputDataset.with("username", "test-8", "test-9");
  Assert.assertEquals(2, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(2, datasetSize(outputView));

  Assert.assertFalse("Output dataset should not be signaled ready",
      ((Signalable) outputDataset).isReady());
  Assert.assertTrue("Output view should be signaled ready",
      ((Signalable) outputView).isReady());
}
@Test
public void unionWriteShouldNotThrowNPE() throws IOException {
  String outputPath1 = tmpDir.getFileName("output1");
  String outputPath2 = tmpDir.getFileName("output2");
  String outputPath3 = tmpDir.getFileName("output3");
  if (typeFamily == AvroTypeFamily.getInstance()) {
    union.write(To.avroFile(outputPath1));
    pipeline.write(union, To.avroFile(outputPath2));
    pipeline.run();
    checkFileContents(outputPath1);
    checkFileContents(outputPath2);
  } else {
    union.write(To.textFile(outputPath1));
    pipeline.write(union, To.textFile(outputPath2));
    pipeline.writeTextFile(union, outputPath3);
    pipeline.run();
    checkFileContents(outputPath1);
    checkFileContents(outputPath2);
    checkFileContents(outputPath3);
  }
}
@Before
@SuppressWarnings("unchecked")
public void setUp() throws IOException {
  String inputFile1 = tmpDir.copyResourceFileName("set1.txt");
  String inputFile2 = tmpDir.copyResourceFileName("set2.txt");
  if (pipelineClass == null) {
    pipeline = MemPipeline.getInstance();
  } else {
    pipeline = new MRPipeline(pipelineClass, tmpDir.getDefaultConfiguration());
  }
  PCollection<String> firstCollection =
      pipeline.read(At.textFile(inputFile1, typeFamily.strings()));
  PCollection<String> secondCollection =
      pipeline.read(At.textFile(inputFile2, typeFamily.strings()));
  LOG.info("Test fixture: [" + pipeline.getClass().getSimpleName() + " : "
      + typeFamily.getClass().getSimpleName() + "] First: "
      + Lists.newArrayList(firstCollection.materialize().iterator())
      + ", Second: " + Lists.newArrayList(secondCollection.materialize().iterator()));
  union = secondCollection.union(firstCollection);
}
@Test
public void testUseReaderSchema() throws IOException {
  // Create a schema with only a username, so we can test reading it
  // with an enhanced record structure.
  Schema oldRecordSchema = SchemaBuilder.record("org.kitesdk.data.user.OldUserRecord")
      .fields()
      .requiredString("username")
      .endRecord();

  // Create the input and output datasets with the old schema.
  Dataset<Record> in = repo.create("ns", "in",
      new DatasetDescriptor.Builder().schema(oldRecordSchema).build());
  Dataset<Record> out = repo.create("ns", "out",
      new DatasetDescriptor.Builder().schema(oldRecordSchema).build());

  Record oldUser = new Record(oldRecordSchema);
  oldUser.put("username", "user");

  DatasetWriter<Record> writer = in.newWriter();
  try {
    writer.write(oldUser);
  } finally {
    writer.close();
  }

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);

  // Read the old-schema data with NewUserRecord as the enhanced reader schema.
  PCollection<NewUserRecord> data =
      pipeline.read(CrunchDatasets.asSource(in.getUri(), NewUserRecord.class));
  PCollection<NewUserRecord> processed =
      data.parallelDo(new UserRecordIdentityFn(), Avros.records(NewUserRecord.class));
  pipeline.write(processed, CrunchDatasets.asTarget(out));

  DatasetReader<Record> reader = out.newReader();
  Assert.assertTrue("Pipeline failed.", pipeline.run().succeeded());
  try {
    // There should be exactly one record, equal to the old user generic record.
    Assert.assertEquals(oldUser, reader.next());
    Assert.assertFalse(reader.hasNext());
  } finally {
    reader.close();
  }
}
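// The UserRecordIdentityFn referenced above is not shown in this listing.
// A minimal sketch, assuming it is a plain pass-through over NewUserRecord
// (enough to exercise writer/reader schema resolution); it extends
// org.apache.crunch.MapFn.
private static class UserRecordIdentityFn extends MapFn<NewUserRecord, NewUserRecord> {
  @Override
  public NewUserRecord map(NewUserRecord input) {
    return input;
  }
}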
@Override
public int run(final String[] args) throws Exception {
  createTable();
  final Configuration config = getConf();
  final Pipeline pipeline =
      new MRPipeline(CrunchStockDateInserter.class, "PipelineWithFilterFn", config);
  PCollection<String> lines =
      pipeline.readTextFile(Constants.HDFS_INPUT_PATH + "/2004_2014.csv");
  PCollection<Put> resultPut = CrunchUtils.returnDates(lines);
  System.out.println("********** size ************ : " + resultPut.getSize());
  pipeline.write(resultPut, new HBaseTarget(Constants.STOCK_DATES_TABLE));
  PipelineResult result = pipeline.done();
  return result.succeeded() ? 0 : 1;
}
private void checkFileContents(String filePath) throws IOException {
  List<String> fileContentValues = (typeFamily != AvroTypeFamily.getInstance())
      ? Lists.newArrayList(
          pipeline.read(At.textFile(filePath, typeFamily.strings())).materialize().iterator())
      : Lists.newArrayList(
          pipeline.read(At.avroFile(filePath, Avros.strings())).materialize().iterator());

  Collections.sort(fileContentValues);
  LOG.info("Saved Union: " + fileContentValues);
  assertEquals(EXPECTED, fileContentValues);
}
@Test(expected = CrunchRuntimeException.class)
public void testWriteModeDefaultFailsWithExisting() throws IOException {
  Dataset<Record> inputDataset = repo.create("ns", "in",
      new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

  writeTestUsers(inputDataset, 1, 0);
  writeTestUsers(outputDataset, 1, 0);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputDataset));
  // The default write mode refuses a non-empty target, so this write throws.
  pipeline.write(data, CrunchDatasets.asTarget((View<Record>) outputDataset));
}
private void runMapsideJoin(Pipeline pipeline, boolean inMemory, boolean materialize) {
  PTable<Integer, String> customerTable = readTable(pipeline, "customers.txt");
  PTable<Integer, String> orderTable = readTable(pipeline, "orders.txt");

  JoinStrategy<Integer, String, String> mapsideJoin =
      new MapsideJoinStrategy<Integer, String, String>(materialize);
  PTable<Integer, String> custOrders = mapsideJoin
      .join(customerTable, orderTable, JoinType.INNER_JOIN)
      .mapValues("concat", new ConcatValuesFn(), Writables.strings());
  PTable<Integer, String> capOrders =
      orderTable.mapValues(new CapOrdersFn(), orderTable.getValueType());
  PTable<Integer, Pair<String, String>> joined =
      mapsideJoin.join(custOrders, capOrders, JoinType.INNER_JOIN);

  List<Pair<Integer, Pair<String, String>>> expectedJoinResult = Lists.newArrayList();
  expectedJoinResult.add(Pair.of(111, Pair.of("[John Doe,Corn flakes]", "CORN FLAKES")));
  expectedJoinResult.add(Pair.of(222, Pair.of("[Jane Doe,Toilet paper]", "TOILET PAPER")));
  expectedJoinResult.add(Pair.of(222, Pair.of("[Jane Doe,Toilet paper]", "TOILET PLUNGER")));
  expectedJoinResult.add(Pair.of(222, Pair.of("[Jane Doe,Toilet plunger]", "TOILET PAPER")));
  expectedJoinResult.add(Pair.of(222, Pair.of("[Jane Doe,Toilet plunger]", "TOILET PLUNGER")));
  expectedJoinResult.add(Pair.of(333, Pair.of("[Someone Else,Toilet brush]", "TOILET BRUSH")));
  Iterable<Pair<Integer, Pair<String, String>>> iter = joined.materialize();

  PipelineResult res = pipeline.run();
  if (!inMemory) {
    // Materializing the map-side table adds one extra stage.
    assertEquals(materialize ? 2 : 1, res.getStageResults().size());
  }

  List<Pair<Integer, Pair<String, String>>> joinedResultList = Lists.newArrayList(iter);
  Collections.sort(joinedResultList);
  assertEquals(expectedJoinResult, joinedResultList);
}
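// ConcatValuesFn and CapOrdersFn are referenced above but not shown here.
// Hedged sketches inferred from the expected results: the concat fn renders a
// joined pair as "[left,right]" (Crunch's Pair.toString() prints this form),
// and the cap fn upper-cases an order description. Both extend
// org.apache.crunch.MapFn.
private static class ConcatValuesFn extends MapFn<Pair<String, String>, String> {
  @Override
  public String map(Pair<String, String> v) {
    return v.toString(); // "[first,second]"
  }
}

private static class CapOrdersFn extends MapFn<String, String> {
  @Override
  public String map(String v) {
    return v.toUpperCase();
  }
}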
public int run(String[] args) throws Exception {
  Pipeline pipeline = new MRPipeline(SecondarySortingExample.class);
  // Read input
  PCollection<String> lines = pipeline.readTextFile(args[0]);
  // Split each line into words and count them
  PTable<String, Long> wordcount =
      lines.parallelDo(new Tokenizer(), Writables.strings()).count();
  // Sort
  PCollection<Pair<String, Long>> sorted =
      Sort.sortPairs(wordcount, ColumnOrder.by(1, Sort.Order.DESCENDING));
  // Write the output
  sorted.write(To.textFile(args[1]));
  // Kick off execution
  PipelineResult result = pipeline.done();
  return result.succeeded() ? 0 : 1;
}
public void materialize() {
  try {
    materialized = sourceTarget.read(pipeline.getConfiguration());
  } catch (IOException e) {
    LOG.error("Could not materialize: " + sourceTarget, e);
    throw new CrunchRuntimeException(e);
  }
}
@Override
public Iterator<E> iterator() {
  // Lazily run the pipeline and materialize the output on first access.
  if (materialized == null) {
    pipeline.run();
    materialize();
  }
  return materialized.iterator();
}
@Before
public void setUp() throws IOException {
  String set1InputPath = FileHelper.createTempCopyOf("set1.txt");
  String set2InputPath = FileHelper.createTempCopyOf("set2.txt");
  pipeline = new MRPipeline(SetTest.class);
  set1 = pipeline.read(At.textFile(set1InputPath, typeFamily.strings()));
  set2 = pipeline.read(At.textFile(set2InputPath, typeFamily.strings()));
}
@Test
public void testGeneric() throws IOException {
  Dataset<Record> inputDataset = repo.create("ns", "in",
      new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

  // write two files, each of 5 records
  writeTestUsers(inputDataset, 5, 0);
  writeTestUsers(inputDataset, 5, 5);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkTestUsers(outputDataset, 10);
}
@Test
public void testWriteModeOverwrite() throws IOException {
  Dataset<Record> inputDataset = repo.create("ns", "in",
      new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

  writeTestUsers(inputDataset, 1, 0);
  writeTestUsers(outputDataset, 1, 1);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget((View<Record>) outputDataset),
      Target.WriteMode.OVERWRITE);
  pipeline.run();

  checkTestUsers(outputDataset, 1);
}
private PTable<Integer, String> readTable(Pipeline pipeline, String filename) {
  try {
    return pipeline
        .readTextFile(tmpDir.copyResourceFileName(filename))
        .parallelDo("asTable", new LineSplitter(),
            Writables.tableOf(Writables.ints(), Writables.strings()));
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
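// LineSplitter is referenced above but not shown. A minimal sketch, assuming
// each line of customers.txt / orders.txt has an "<id><delimiter><value>"
// layout (e.g. "111|John Doe"); the '|' delimiter is an assumption, adjust it
// to match the actual fixture files.
private static class LineSplitter extends DoFn<String, Pair<Integer, String>> {
  @Override
  public void process(String input, Emitter<Pair<Integer, String>> emitter) {
    String[] fields = input.split("\\|", 2);
    emitter.emit(Pair.of(Integer.parseInt(fields[0]), fields[1]));
  }
}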
public static void main(String[] args) throws Exception {
  Pipeline pipeline = new MRPipeline(WordCount.class);
  PCollection<String> lines = pipeline.readTextFile(args[0]);
  PCollection<String> words = lines.parallelDo("my splitter", new DoFn<String, String>() {
    @Override
    public void process(String line, Emitter<String> emitter) {
      for (String word : line.split("\\s+")) {
        emitter.emit(word);
      }
    }
  }, Writables.strings());
  PTable<String, Long> counts = Aggregate.count(words);
  pipeline.writeTextFile(counts, args[1]);
  pipeline.run();
}
@Override
public int execute(Configuration conf) throws IOException {
  Pipeline p = pipelineParams.create(KMeansSketchCommand.class, conf);
  List<Vector> initial = null;
  if (initVectorsPath != null) {
    initial = getInitialVectors(p);
  }

  PCollection<Vector> input = inputParams.getVectors(p);
  if (initial == null || initial.isEmpty()) {
    // No initial centers were supplied; seed with the first input vector.
    initial = Lists.newArrayList();
    initial.add(input.materialize().iterator().next());
  }

  KMeansParallel kmp = new KMeansParallel(randomParams.getRandom(), indexBits, indexSamples);
  Crossfold cf = new Crossfold(crossFolds);
  List<List<Weighted<Vector>>> wv =
      kmp.initialization(input, numIterations, samplesPerIteration, initial, cf);
  AvroIO.write(toWeightedCenters(wv), new File(outputFile));
  p.done();
  return 0;
}
@Override
public int execute(Configuration conf) throws Exception {
  Pipeline p = pipelineParams.create(SummaryCommand.class, conf);
  PCollection<Record> records = inputParams.getRecords(p);

  Spec spec = null;
  List<Integer> symbolicColumns = Lists.newArrayList();
  List<Integer> ignoredColumns = Lists.newArrayList();
  if (headerFile != null) {
    spec = Specs.readFromHeaderFile(headerFile, ignoredColumns, symbolicColumns);
  }

  Summarizer summarizer = new Summarizer()
      .spec(spec)
      .defaultToSymbolic(false)
      .exceptionColumns(symbolicColumns)
      .ignoreColumns(ignoredColumns);
  Summary summary = summarizer.build(records).getValue();
  summaryParams.save(summary, summaryFile);

  p.done();
  return 0;
}
@Override
public int execute(Configuration conf) throws Exception {
  Pipeline p = pipelineParams.create(SampleCommand.class, conf);
  PCollection<Record> elements = inputParams.getRecords(p);

  if (sampleSize > 0 && samplingProbability > 0.0) {
    throw new IllegalArgumentException("--size and --prob are mutually exclusive options.");
  }

  PCollection<Record> sample;
  if (sampleSize > 0) {
    sample = ReservoirSampling.sample(elements, sampleSize);
  } else if (samplingProbability > 0.0 && samplingProbability < 1.0) {
    sample = Sample.sample(elements, samplingProbability);
  } else {
    throw new IllegalArgumentException(String.format(
        "Invalid input args: sample size = %d, sample prob = %.4f",
        sampleSize, samplingProbability));
  }
  outputParams.write(sample, sampleFile);

  PipelineResult pr = p.done();
  return pr.succeeded() ? 0 : 1;
}
private void runCheckpointPipeline(View<Record> inputView, View<Record> outputView) {
  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.CHECKPOINT);
  pipeline.done();
}
@Test
public void unionMaterializeShouldNotThrowNPE() throws Exception {
  checkMaterialized(union.materialize());
  checkMaterialized(pipeline.materialize(union));
}
@After
public void tearDown() {
  pipeline.done();
}
private PCollection<String> getPCollection(Pipeline pipeline) throws IOException {
  String shakesInputPath = tmpDir.copyResourceFileName("shakes.txt");
  return pipeline.readTextFile(shakesInputPath);
}