@Test
public void unionWriteShouldNotThrowNPE() throws IOException {
  String outputPath1 = tmpDir.getFileName("output1");
  String outputPath2 = tmpDir.getFileName("output2");
  String outputPath3 = tmpDir.getFileName("output3");
  if (typeFamily == AvroTypeFamily.getInstance()) {
    union.write(To.avroFile(outputPath1));
    pipeline.write(union, To.avroFile(outputPath2));
    pipeline.run();
    checkFileContents(outputPath1);
    checkFileContents(outputPath2);
  } else {
    union.write(To.textFile(outputPath1));
    pipeline.write(union, To.textFile(outputPath2));
    pipeline.writeTextFile(union, outputPath3);
    pipeline.run();
    checkFileContents(outputPath1);
    checkFileContents(outputPath2);
    checkFileContents(outputPath3);
  }
}
@Before
@SuppressWarnings("unchecked")
public void setUp() throws IOException {
  String inputFile1 = tmpDir.copyResourceFileName("set1.txt");
  String inputFile2 = tmpDir.copyResourceFileName("set2.txt");
  if (pipelineClass == null) {
    pipeline = MemPipeline.getInstance();
  } else {
    pipeline = new MRPipeline(pipelineClass, tmpDir.getDefaultConfiguration());
  }
  PCollection<String> firstCollection = pipeline.read(At.textFile(inputFile1, typeFamily.strings()));
  PCollection<String> secondCollection = pipeline.read(At.textFile(inputFile2, typeFamily.strings()));
  LOG.info("Test fixture: [" + pipeline.getClass().getSimpleName() + " : "
      + typeFamily.getClass().getSimpleName() + "] First: "
      + Lists.newArrayList(firstCollection.materialize().iterator())
      + ", Second: " + Lists.newArrayList(secondCollection.materialize().iterator()));
  union = secondCollection.union(firstCollection);
}
private List<Vector> getInitialVectors(Pipeline p) {
  if (initVectorsPath != null) {
    PCollection<Vector> init = inputParams.getVectorsFromPath(p, initVectorsPath);
    return Lists.newArrayList(init.materialize());
  } else {
    throw new IllegalArgumentException("No initial vector config specified");
  }
}
/** Redistributes {@code collection} across {@code numReducers} partitions by grouping on the elements themselves. */
private static <E> PCollection<E> partition(PCollection<E> collection, int numReducers) {
  PType<E> type = collection.getPType();
  PTableType<E, Void> tableType = Avros.tableOf(type, Avros.nulls());
  PTable<E, Void> table = collection.parallelDo(new AsKeyTable<E>(), tableType);
  PGroupedTable<E, Void> grouped =
      numReducers > 0 ? table.groupByKey(numReducers) : table.groupByKey();
  return grouped.ungroup().keys();
}
@Test
public void testUseReaderSchema() throws IOException {
  // Create a schema with only a username, so we can test reading it
  // with an enhanced record structure.
  Schema oldRecordSchema = SchemaBuilder.record("org.kitesdk.data.user.OldUserRecord")
      .fields()
      .requiredString("username")
      .endRecord();

  // create the input and output datasets
  Dataset<Record> in = repo.create("ns", "in",
      new DatasetDescriptor.Builder().schema(oldRecordSchema).build());
  Dataset<Record> out = repo.create("ns", "out",
      new DatasetDescriptor.Builder().schema(oldRecordSchema).build());

  Record oldUser = new Record(oldRecordSchema);
  oldUser.put("username", "user");

  DatasetWriter<Record> writer = in.newWriter();
  try {
    writer.write(oldUser);
  } finally {
    writer.close();
  }

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);

  // read data from the updated dataset that has the new schema;
  // at this point, the User class has the old schema
  PCollection<NewUserRecord> data =
      pipeline.read(CrunchDatasets.asSource(in.getUri(), NewUserRecord.class));
  PCollection<NewUserRecord> processed =
      data.parallelDo(new UserRecordIdentityFn(), Avros.records(NewUserRecord.class));
  pipeline.write(processed, CrunchDatasets.asTarget(out));

  DatasetReader reader = out.newReader();
  Assert.assertTrue("Pipeline failed.", pipeline.run().succeeded());
  try {
    // there should be one record that is equal to our old user generic record
    Assert.assertEquals(oldUser, reader.next());
    Assert.assertFalse(reader.hasNext());
  } finally {
    reader.close();
  }
}
/**
 * Returns a {@code PTable} that contains the unique elements of the input collection mapped to a
 * count of their occurrences.
 */
public static <S> PTable<S, Long> count(PCollection<S> collect) {
  PTypeFamily tf = collect.getTypeFamily();
  return collect
      .parallelDo("Aggregate.count", new MapFn<S, Pair<S, Long>>() {
        public Pair<S, Long> map(S input) {
          return Pair.of(input, 1L);
        }
      }, tf.tableOf(collect.getPType(), tf.longs()))
      .groupByKey()
      .combineValues(Aggregators.SUM_LONGS());
}
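// Hedged usage sketch (not from the original source): one way the count() helper above could be
// exercised with Crunch's in-memory pipeline (org.apache.crunch.impl.mem.MemPipeline). The sample
// words and the enclosing method name are illustrative assumptions.
public static void countUsageExample() {
  PCollection<String> words =
      MemPipeline.typedCollectionOf(Writables.strings(), "apple", "banana", "apple");
  PTable<String, Long> counts = Aggregate.count(words);
  // expected contents: apple -> 2, banana -> 1
  System.out.println(counts.materializeToMap());
}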
@Override
public int run(final String[] args) throws Exception {
  createTable();
  final Configuration config = getConf();
  final Pipeline pipeline =
      new MRPipeline(CrunchStockDateInserter.class, "PipelineWithFilterFn", config);
  PCollection<String> lines = pipeline.readTextFile(Constants.HDFS_INPUT_PATH + "/2004_2014.csv");
  PCollection<Put> resultPut = CrunchUtils.returnDates(lines);
  System.out.println("********** size ************ : " + resultPut.getSize());
  pipeline.write(resultPut, new HBaseTarget(Constants.STOCK_DATES_TABLE));
  PipelineResult result = pipeline.done();
  return result.succeeded() ? 0 : 1;
}
public int run(String[] args) throws Exception {
  Pipeline pipeline = new MRPipeline(SecondarySortingExample.class);
  // Read input
  PCollection<String> lines = pipeline.readTextFile(args[0]);
  // Split each line into words and count each word
  PTable<String, Long> wordcount = lines.parallelDo(new Tokenizer(), Writables.strings()).count();
  // Sort
  PCollection<Pair<String, Long>> sorted =
      Sort.sortPairs(wordcount, ColumnOrder.by(1, Sort.Order.DESCENDING));
  // Write the output to the output path (args[1]), not back to the input path
  sorted.write(To.textFile(args[1]));
  // Kick off execution
  PipelineResult result = pipeline.done();
  return result.succeeded() ? 0 : 1;
}
/**
 * Returns the number of elements in the provided PCollection.
 *
 * @param collect The PCollection whose elements should be counted.
 * @param <S> The type of the PCollection.
 * @return A {@code PObject} containing the number of elements in the {@code PCollection}.
 */
public static <S> PObject<Long> length(PCollection<S> collect) {
  PTypeFamily tf = collect.getTypeFamily();
  PTable<Integer, Long> countTable = collect
      .parallelDo("Aggregate.count", new MapFn<S, Pair<Integer, Long>>() {
        public Pair<Integer, Long> map(S input) {
          return Pair.of(1, 1L);
        }
      }, tf.tableOf(tf.ints(), tf.longs()))
      .groupByKey()
      .combineValues(Aggregators.SUM_LONGS());
  PCollection<Long> count = countTable.values();
  return new FirstElementPObject<Long>(count);
}
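// Hedged usage sketch (not from the original source): length() above returns a lazy PObject;
// calling getValue() forces evaluation and yields the element count. Assumes the helper lives in
// Crunch's Aggregate utility class, as the WordCount example below suggests.
public static void lengthUsageExample() {
  PCollection<String> items =
      MemPipeline.typedCollectionOf(Writables.strings(), "a", "b", "c");
  PObject<Long> size = Aggregate.length(items);
  System.out.println(size.getValue());  // prints 3
}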
@Override
protected MRPipeline createPipeline() throws IOException {
  JobStepConfig stepConfig = getConfig();
  Config config = ConfigUtils.getDefaultConfig();
  ClusterSettings clusterSettings = ClusterSettings.create(config);

  String instanceDir = stepConfig.getInstanceDir();
  long generationID = stepConfig.getGenerationID();
  String prefix = Namespaces.getInstanceGenerationPrefix(instanceDir, generationID);
  String outputKey = prefix + "weighted/";
  if (!validOutputPath(outputKey)) {
    return null;
  }

  String indexKey = prefix + "sketch/" + clusterSettings.getSketchIterations();
  String inputKey = prefix + "normalized/";
  MRPipeline p = createBasicPipeline(ClosestSketchVectorFn.class);

  // first I compute the weight of each k-sketch vector, i.e., Voronoi partition
  // I aggregate all together and persist on disk
  // PCollection<ClosestSketchVectorData> weights = inputPairs(p, inputKey, MLAvros.vector())
  PCollection<ClosestSketchVectorData> weights = PTables.asPTable(
          inputPairs(p, inputKey, MLAvros.vector())
              .parallelDo("computingSketchVectorWeights",
                  new ClosestSketchVectorFn<RealVector>(indexKey, clusterSettings),
                  Avros.pairs(Avros.ints(), Avros.reflects(ClosestSketchVectorData.class))))
      .groupByKey(1)
      .combineValues(new ClosestSketchVectorAggregator(clusterSettings))
      .values()
      .write(avroOutput(outputKey + "kSketchVectorWeights/"));

  // this "pipeline" takes a single ClosestSketchVectorData and returns weighted vectors
  // could be done outside MapReduce, but that would require me to materialize the
  // ClosestSketchVectorData
  weights
      .parallelDo("generatingWeightedSketchVectors", new WeightVectorsFn(indexKey),
          KMeansTypes.FOLD_WEIGHTED_VECTOR)
      .write(avroOutput(outputKey + "weightedKSketchVectors/"));

  return p;
}
/** Returns the smallest element of the input collection; elements must be {@code Comparable}. */
public static <S> PObject<S> min(PCollection<S> collect) {
  Class<S> clazz = collect.getPType().getTypeClass();
  if (!clazz.isPrimitive() && !Comparable.class.isAssignableFrom(clazz)) {
    throw new IllegalArgumentException("Can only get min for Comparable elements, not for: "
        + collect.getPType().getTypeClass());
  }
  PTypeFamily tf = collect.getTypeFamily();
  PCollection<S> minCollect = PTables.values(collect
      .parallelDo("min", new DoFn<S, Pair<Boolean, S>>() {
        private transient S min = null;

        public void process(S input, Emitter<Pair<Boolean, S>> emitter) {
          if (min == null || ((Comparable<S>) min).compareTo(input) > 0) {
            min = input;
          }
        }

        public void cleanup(Emitter<Pair<Boolean, S>> emitter) {
          if (min != null) {
            emitter.emit(Pair.of(false, min));
          }
        }
      }, tf.tableOf(tf.booleans(), collect.getPType()))
      .groupByKey()
      .combineValues(new CombineFn<Boolean, S>() {
        public void process(Pair<Boolean, Iterable<S>> input, Emitter<Pair<Boolean, S>> emitter) {
          S min = null;
          for (S v : input.second()) {
            if (min == null || ((Comparable<S>) min).compareTo(v) > 0) {
              min = v;
            }
          }
          emitter.emit(Pair.of(input.first(), min));
        }
      }));
  return new FirstElementPObject<S>(minCollect);
}
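// Hedged usage sketch (not from the original source): min() above returns a PObject that is
// evaluated lazily; getValue() materializes the smallest element. The sample values and the
// assumption that the helper lives in Crunch's Aggregate class are illustrative.
public static void minUsageExample() {
  PCollection<Integer> nums =
      MemPipeline.typedCollectionOf(Writables.ints(), 5, 2, 9, 3);
  PObject<Integer> smallest = Aggregate.min(nums);
  System.out.println(smallest.getValue());  // prints 2
}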
/**
 * Partitions {@code collection} to be stored efficiently in {@code View}.
 *
 * <p>This restructures the parallel collection so that all of the entities that will be stored in
 * a given partition will be processed by the same writer.
 *
 * <p>If the dataset is not partitioned, then this will structure all of the entities to produce a
 * number of files equal to {@code numWriters}.
 *
 * @param collection a collection of entities
 * @param view a {@link View} of a dataset to partition the collection for
 * @param numWriters the number of writers that should be used
 * @param <E> the type of entities in the collection and underlying dataset
 * @return an equivalent collection of entities partitioned for the view
 * @see #partition(PCollection, View)
 * @since 0.16.0
 */
public static <E> PCollection<E> partition(PCollection<E> collection, View<E> view, int numWriters) {
  DatasetDescriptor descriptor = view.getDataset().getDescriptor();
  if (descriptor.isPartitioned()) {
    GetStorageKey<E> getKey = new GetStorageKey<E>(view);
    PTable<GenericData.Record, E> table = collection.by(getKey, Avros.generics(getKey.schema()));
    PGroupedTable<GenericData.Record, E> grouped =
        numWriters > 0 ? table.groupByKey(numWriters) : table.groupByKey();
    return grouped.ungroup().values();
  } else {
    return partition(collection, numWriters);
  }
}
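// Hedged usage sketch (not from the original source): a typical call site for the partition()
// helper above, repartitioning a collection before writing it to a Kite dataset. The dataset URI,
// the writer count, and the enclosing method are illustrative assumptions.
public static void partitionUsageExample(Pipeline pipeline, PCollection<GenericData.Record> users) {
  View<GenericData.Record> target =
      Datasets.load("dataset:hdfs:/data/users", GenericData.Record.class);
  int numWriters = 4;  // hypothetical writer count
  PCollection<GenericData.Record> partitioned = CrunchDatasets.partition(users, target, numWriters);
  pipeline.write(partitioned, CrunchDatasets.asTarget(target));
}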
@Test
public void testMultipleFileReadingFromCrunch() throws IOException {
  Dataset<Record> inputDatasetA = repo.create("ns", "inA",
      new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
  Dataset<Record> inputDatasetB = repo.create("ns", "inB",
      new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

  // write two files, each of 5 records
  writeTestUsers(inputDatasetA, 5, 0);
  writeTestUsers(inputDatasetB, 5, 5);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> dataA = pipeline.read(CrunchDatasets.asSource(inputDatasetA));
  PCollection<GenericData.Record> dataB = pipeline.read(CrunchDatasets.asSource(inputDatasetB));
  pipeline.write(dataA.union(dataB), CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkTestUsers(outputDataset, 10);
}
public static void main(String[] args) throws Exception {
  Pipeline pipeline = new MRPipeline(WordCount.class);
  PCollection<String> lines = pipeline.readTextFile(args[0]);
  PCollection<String> words = lines.parallelDo("my splitter", new DoFn<String, String>() {
    public void process(String line, Emitter<String> emitter) {
      for (String word : line.split("\\s+")) {
        emitter.emit(word);
      }
    }
  }, Writables.strings());
  PTable<String, Long> counts = Aggregate.count(words);
  pipeline.writeTextFile(counts, args[1]);
  pipeline.run();
}
@Override
public int execute(Configuration conf) throws IOException {
  Pipeline p = pipelineParams.create(KMeansSketchCommand.class, conf);
  List<Vector> initial = null;
  if (initVectorsPath != null) {
    initial = getInitialVectors(p);
  }

  PCollection<Vector> input = inputParams.getVectors(p);
  if (initial == null || initial.isEmpty()) {
    initial = Lists.newArrayList();
    initial.add(input.materialize().iterator().next());
  }

  KMeansParallel kmp = new KMeansParallel(randomParams.getRandom(), indexBits, indexSamples);
  Crossfold cf = new Crossfold(crossFolds);
  List<List<Weighted<Vector>>> wv =
      kmp.initialization(input, numIterations, samplesPerIteration, initial, cf);
  AvroIO.write(toWeightedCenters(wv), new File(outputFile));
  p.done();

  return 0;
}
@Test
public void unionMaterializeShouldNotThrowNPE() throws Exception {
  checkMaterialized(union.materialize());
  checkMaterialized(pipeline.materialize(union));
}
@Test
public void testDifference() throws Exception {
  PCollection<String> difference = Set.difference(set1, set2);
  assertEquals(Lists.newArrayList("b", "e"), Lists.newArrayList(difference.materialize()));
}
@Test
public void testIntersection() throws Exception {
  PCollection<String> intersection = Set.intersection(set1, set2);
  assertEquals(Lists.newArrayList("a", "c"), Lists.newArrayList(intersection.materialize()));
}
public void runAsCollection(Pipeline pipeline) throws IOException {
  PCollection<String> shakespeare = getPCollection(pipeline);
  String[] lines = new String[LINES_IN_SHAKES];
  lines = shakespeare.asCollection().getValue().toArray(lines);
  verifyLines(lines);
}