@Test @SuppressWarnings("unchecked") public void testTop() { Pipeline p = TestPipeline.create(); PCollection<String> input = p.apply(Create.of(Arrays.asList(COLLECTION)).withCoder(StringUtf8Coder.of())); PCollection<List<String>> top1 = input.apply(Top.of(1, new OrderByLength())); PCollection<List<String>> top2 = input.apply(Top.<String>largest(2)); PCollection<List<String>> top3 = input.apply(Top.<String>smallest(3)); PCollection<KV<String, Integer>> inputTable = createInputTable(p); PCollection<KV<String, List<Integer>>> largestPerKey = inputTable.apply(Top.<String, Integer>largestPerKey(2)); PCollection<KV<String, List<Integer>>> smallestPerKey = inputTable.apply(Top.<String, Integer>smallestPerKey(2)); DataflowAssert.thatSingletonIterable(top1).containsInAnyOrder(Arrays.asList("bb")); DataflowAssert.thatSingletonIterable(top2).containsInAnyOrder("z", "c"); DataflowAssert.thatSingletonIterable(top3).containsInAnyOrder("a", "bb", "c"); DataflowAssert.that(largestPerKey) .containsInAnyOrder(KV.of("a", Arrays.asList(3, 2)), KV.of("b", Arrays.asList(100, 10))); DataflowAssert.that(smallestPerKey) .containsInAnyOrder(KV.of("a", Arrays.asList(1, 2)), KV.of("b", Arrays.asList(1, 10))); p.run(); }
@Test(dataProvider = "reads") public void testGATKReadCoding(final List<GATKRead> reads) { // The simplest way to figure out if a class is coded correctly is to create a PCollection // of that type and see if it matches the List version. final Pipeline p = GATKTestPipeline.create(); DataflowUtils.registerGATKCoders(p); // Need to explicitly set the coder to GATKReadCoder, otherwise Create fails to infer // a coder properly in the case where the List contains a mix of different GATKRead // implementations. final PCollection<GATKRead> dataflowReads = p.apply(Create.of(reads).withCoder(new GATKReadCoder())); DataflowAssert.that(dataflowReads).containsInAnyOrder(reads); final PCollection<GATKRead> dataflowReadsAfterTransform = dataflowReads .apply( ParDo.of( new DoFn<GATKRead, GATKRead>() { private static final long serialVersionUID = 1L; @Override public void processElement(ProcessContext c) throws Exception { c.output(c.element()); } })) .setCoder(new GATKReadCoder()); DataflowAssert.that(dataflowReadsAfterTransform).containsInAnyOrder(reads); p.run(); }
@Test @SuppressWarnings("unchecked") public void testTopEmpty() { Pipeline p = TestPipeline.create(); PCollection<String> input = p.apply(Create.of(Arrays.asList(EMPTY_COLLECTION)).withCoder(StringUtf8Coder.of())); PCollection<List<String>> top1 = input.apply(Top.of(1, new OrderByLength())); PCollection<List<String>> top2 = input.apply(Top.<String>largest(2)); PCollection<List<String>> top3 = input.apply(Top.<String>smallest(3)); PCollection<KV<String, Integer>> inputTable = createEmptyInputTable(p); PCollection<KV<String, List<Integer>>> largestPerKey = inputTable.apply(Top.<String, Integer>largestPerKey(2)); PCollection<KV<String, List<Integer>>> smallestPerKey = inputTable.apply(Top.<String, Integer>smallestPerKey(2)); DataflowAssert.thatSingletonIterable(top1).empty(); DataflowAssert.thatSingletonIterable(top2).empty(); DataflowAssert.thatSingletonIterable(top3).empty(); DataflowAssert.that(largestPerKey).empty(); DataflowAssert.that(smallestPerKey).empty(); p.run(); }
@Override public PDone apply(PCollection<Integer> input) { // Apply an operation so that this is a composite transform. input.apply(Count.<Integer>perElement()); return PDone.in(input.getPipeline()); }
@Override public PCollectionView<Iterable<T>> apply(PCollection<T> input) { return input.apply( CreatePCollectionView.<T, Iterable<T>>of( PCollectionViews.iterableView( input.getPipeline(), input.getWindowingStrategy(), input.getCoder()))); }
@Override public PCollection<T> apply(PInput input) { if (filepattern == null) { throw new IllegalStateException( "need to set the filepattern of an AvroIO.Read transform"); } if (schema == null) { throw new IllegalStateException("need to set the schema of an AvroIO.Read transform"); } if (validate) { try { checkState( !IOChannelUtils.getFactory(filepattern).match(filepattern).isEmpty(), "Unable to find any files matching %s", filepattern); } catch (IOException e) { throw new IllegalStateException(String.format("Failed to validate %s", filepattern), e); } } @SuppressWarnings("unchecked") Bounded<T> read = type == GenericRecord.class ? (Bounded<T>) com.google.cloud.dataflow.sdk.io.Read.from( AvroSource.from(filepattern).withSchema(schema)) : com.google.cloud.dataflow.sdk.io.Read.from( AvroSource.from(filepattern).withSchema(type)); PCollection<T> pcol = input.getPipeline().apply("Read", read); // Honor the default output coder that would have been used by this PTransform. pcol.setCoder(getDefaultOutputCoder()); return pcol; }
/** * Applies {@code ApproximateUnique(sampleSize)} verifying that the estimation error falls within * the maximum allowed error of {@code 2/sqrt(sampleSize)}. */ private void runApproximateUniquePipeline(int sampleSize) { Pipeline p = TestPipeline.create(); PCollection<String> collection = readPCollection(p); final PCollectionView<Long> exact = collection .apply(RemoveDuplicates.<String>create()) .apply(Combine.globally(new CountElements<String>())) .apply(View.<Long>asSingleton()); PCollection<Long> approximate = collection.apply(ApproximateUnique.<String>globally(sampleSize)); PCollection<KV<Long, Long>> approximateAndExact = approximate.apply( ParDo.of( new DoFn<Long, KV<Long, Long>>() { @Override public void processElement(ProcessContext c) { c.output(KV.of(c.element(), c.sideInput(exact))); } }) .withSideInputs(exact)); DataflowAssert.that(approximateAndExact).satisfies(new VerifyEstimatePerKeyFn(sampleSize)); p.run(); }
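// A minimal sketch of the error bound described in the Javadoc above, assuming VerifyEstimatePerKeyFn
// applies the same check: an estimate passes when its relative error does not exceed 2 / sqrt(sampleSize).
// The helper below is illustrative only and is not part of the source.
static void checkEstimateWithinBound(long exactCount, long estimatedCount, int sampleSize) {
  // Maximum relative error allowed for a sample of this size.
  double maxRelativeError = 2.0 / Math.sqrt(sampleSize);
  double relativeError = Math.abs(estimatedCount - exactCount) / (double) exactCount;
  if (relativeError > maxRelativeError) {
    throw new AssertionError(
        "estimate " + estimatedCount + " differs from exact count " + exactCount
            + " by a relative error of " + relativeError
            + ", which exceeds the allowed " + maxRelativeError);
  }
}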
@Override public PCollection<Integer> apply(PCollection<Integer> input) { // Apply an operation so that this is a composite transform. input.apply(Count.<Integer>perElement()); // Return a value unrelated to the input. return input.getPipeline().apply(Create.of(1, 2, 3, 4)); }
@Override public PCollectionTuple apply(PCollection<Integer> input) { PCollection<Integer> sum = input.apply(Sum.integersGlobally()); // Fails here when attempting to construct a tuple with an unbound object. return PCollectionTuple.of(sumTag, sum) .and( doneTag, PCollection.<Void>createPrimitiveOutputInternal( input.getPipeline(), WindowingStrategy.globalDefault(), input.isBounded())); }
@Test public void testPartiallyBoundFailure() throws IOException { Pipeline p = DataflowPipeline.create(buildPipelineOptions()); PCollection<Integer> input = p.begin().apply(Create.of(1, 2, 3)); thrown.expect(IllegalStateException.class); input.apply(new PartiallyBoundOutputCreator()); Assert.fail("Failure expected from use of partially bound output"); }
@Override public PCollectionView<T> apply(PCollection<T> input) { return input.apply( CreatePCollectionView.<T, T>of( PCollectionViews.singletonView( input.getPipeline(), input.getWindowingStrategy(), hasDefault, defaultValue, input.getCoder()))); }
@Test public void testCountConstraint() { Pipeline p = TestPipeline.create(); PCollection<String> input = p.apply(Create.of(Arrays.asList(COLLECTION)).withCoder(StringUtf8Coder.of())); expectedEx.expect(IllegalArgumentException.class); expectedEx.expectMessage(Matchers.containsString(">= 0")); input.apply(Top.of(-1, new OrderByLength())); }
@Test @Category(RunnableOnService.class) public void testApproximateUniqueWithSmallInput() { Pipeline p = TestPipeline.create(); PCollection<Integer> input = p.apply(Create.of(Arrays.asList(1, 2, 3, 3))); PCollection<Long> estimate = input.apply(ApproximateUnique.<Integer>globally(1000)); DataflowAssert.thatSingleton(estimate).isEqualTo(3L); p.run(); }
@Override protected void testProgram() throws Exception { Pipeline p = FlinkTestPipeline.create(); PCollection<TableRow> input1 = p.apply(Create.of(EVENT_ARRAY)); PCollection<TableRow> input2 = p.apply(Create.of(CC_ARRAY)); PCollection<String> output = JoinExamples.joinEvents(input1, input2); output.apply(TextIO.Write.to(resultPath)); p.run(); }
@Test public void testUnsupportedFilePattern() throws IOException { File outFolder = tmpFolder.newFolder(); // Windows doesn't like resolving paths with * in them. String filename = outFolder.toPath().resolve("output@5").toString(); Pipeline p = TestPipeline.create(); PCollection<String> input = p.apply(Create.of(Arrays.asList(LINES_ARRAY)).withCoder(StringUtf8Coder.of())); expectedException.expect(IllegalArgumentException.class); expectedException.expectMessage("Output name components are not allowed to contain"); input.apply(TextIO.Write.to(filename)); }
public static void main(String[] args) { PipelineOptionsFactory.register(KafkaStreamingWordCountOptions.class); KafkaStreamingWordCountOptions options = PipelineOptionsFactory.fromArgs(args).as(KafkaStreamingWordCountOptions.class); options.setJobName("KafkaExample - WindowSize: " + options.getWindowSize() + " seconds"); options.setStreaming(true); options.setCheckpointingInterval(1000L); options.setNumberOfExecutionRetries(5); options.setExecutionRetryDelay(3000L); options.setRunner(FlinkPipelineRunner.class); System.out.println( options.getKafkaTopic() + " " + options.getZookeeper() + " " + options.getBroker() + " " + options.getGroup()); Pipeline pipeline = Pipeline.create(options); Properties p = new Properties(); p.setProperty("zookeeper.connect", options.getZookeeper()); p.setProperty("bootstrap.servers", options.getBroker()); p.setProperty("group.id", options.getGroup()); // this is the Flink consumer that reads the input to // the program from a kafka topic. FlinkKafkaConsumer08<String> kafkaConsumer = new FlinkKafkaConsumer08<>(options.getKafkaTopic(), new SimpleStringSchema(), p); PCollection<String> words = pipeline .apply(Read.from(new UnboundedFlinkSource<>(kafkaConsumer)).named("StreamingWordCount")) .apply(ParDo.of(new ExtractWordsFn())) .apply( Window.<String>into( FixedWindows.of(Duration.standardSeconds(options.getWindowSize()))) .triggering(AfterWatermark.pastEndOfWindow()) .withAllowedLateness(Duration.ZERO) .discardingFiredPanes()); PCollection<KV<String, Long>> wordCounts = words.apply(Count.<String>perElement()); wordCounts.apply(ParDo.of(new FormatAsStringFn())).apply(TextIO.Write.to("./outputKafka.txt")); pipeline.run(); }
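// Hedged invocation sketch (not from the source): PipelineOptionsFactory derives flag names from
// the option getters used above, so a run of this example presumably takes arguments along these
// lines, with placeholder topic/host/port values:
//   --kafkaTopic=words --zookeeper=localhost:2181 --broker=localhost:9092 --group=wordcount --windowSize=10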
@Test public void testMultiGraphPipelineSerialization() throws IOException { Pipeline p = DataflowPipeline.create(buildPipelineOptions()); PCollection<Integer> input = p.begin().apply(Create.of(1, 2, 3)); input.apply(new UnrelatedOutputCreator()); input.apply(new UnboundOutputCreator()); DataflowPipelineTranslator t = DataflowPipelineTranslator.fromOptions( PipelineOptionsFactory.as(DataflowPipelineOptions.class)); // Check that translation doesn't fail. t.translate(p, Collections.<DataflowPackage>emptyList()); }
/** * Merge the statistics from each block. The resulting "collection" contains a single element * with the answer. */ private static PCollection<RecalibrationTables> aggregateStatistics( final PCollection<RecalibrationTables> tables) { return tables // aggregate .apply(Combine.globally(new RecalibrationTablesMerger())) // call finalize on the result .apply( ParDo.named("finalizeRecalTables") .of( new DoFnWLog<RecalibrationTables, RecalibrationTables>("finalizeRecalTables") { private static final long serialVersionUID = 1L; @Override public void processElement(ProcessContext c) throws Exception { RecalibrationTables tables = c.element(); if (null == tables) { // the merger may return null when there are no inputs at all. In that // case we don't want to // crash (though it's really an edge case). log.warn("No recalibration tables!"); } else { // normal case: finalize the recalibration tables BaseRecalibrationEngine.finalizeRecalibrationTables(tables); } c.output(tables); } })); }
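// A minimal, self-contained illustration (not part of the source) of the Combine.globally pattern
// used in aggregateStatistics above: many input elements collapse into a single-element
// PCollection, which can then be inspected with DataflowAssert.thatSingleton.
@Test
public void testGloballyCombinedSum() {
  Pipeline p = TestPipeline.create();
  PCollection<Integer> total = p.apply(Create.of(1, 2, 3)).apply(Sum.integersGlobally());
  DataflowAssert.thatSingleton(total).isEqualTo(6);
  p.run();
}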
@Override public PDone apply(PCollection<T> input) { if (topic == null) { throw new IllegalStateException("need to set the topic of a PubsubIO.Write transform"); } return PDone.in(input.getPipeline()); }
@Test public void testTopEmptyWithIncompatibleWindows() { Pipeline p = TestPipeline.create(); Bound<String> windowingFn = Window.<String>into(FixedWindows.of(Duration.standardDays(10L))); PCollection<String> input = p.apply(Create.timestamped(Collections.<String>emptyList(), Collections.<Long>emptyList())) .apply(windowingFn); expectedEx.expect(IllegalStateException.class); expectedEx.expectMessage("Top"); expectedEx.expectMessage("GlobalWindows"); expectedEx.expectMessage("withoutDefaults"); expectedEx.expectMessage("asSingletonView"); input.apply(Top.of(1, new OrderByLength())); }
private void runApproximateUniqueWithDuplicates( int elementCount, int uniqueCount, int sampleSize) { assert elementCount >= uniqueCount; List<Double> elements = Lists.newArrayList(); for (int i = 0; i < elementCount; i++) { elements.add(1.0 / (i % uniqueCount + 1)); } Collections.shuffle(elements); Pipeline p = TestPipeline.create(); PCollection<Double> input = p.apply(Create.of(elements)); PCollection<Long> estimate = input.apply(ApproximateUnique.<Double>globally(sampleSize)); DataflowAssert.thatSingleton(estimate).satisfies(new VerifyEstimateFn(uniqueCount, sampleSize)); p.run(); }
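// Worked example of the duplicate construction above (illustrative numbers only): with
// uniqueCount = 4, the expression 1.0 / (i % uniqueCount + 1) cycles through 1.0, 0.5, 0.333..., 0.25,
// so the generated list holds exactly uniqueCount distinct values, given the asserted
// elementCount >= uniqueCount.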
@Override public PCollection<RecalibrationTables> apply( PCollection<AddContextDataToReadOptimized.ContextShard> input) { PCollection<RecalibrationTables> oneStatPerWorker = input.apply( ParDo.named("BaseRecalibrator") .withSideInputs(headerView, refDictionary) .of(new BaseRecalibratorOptimizedFn(headerView, refDictionary, recalArgs))); return aggregateStatistics(oneStatPerWorker); }
<T> void runTestWrite(T[] elems, Coder<T> coder) throws Exception { File tmpFile = tmpFolder.newFile("file.txt"); String filename = tmpFile.getPath(); Pipeline p = TestPipeline.create(); PCollection<T> input = p.apply(Create.of(Arrays.asList(elems)).withCoder(coder)); TextIO.Write.Bound<T> write; if (coder.equals(StringUtf8Coder.of())) { TextIO.Write.Bound<String> writeStrings = TextIO.Write.to(filename).withoutSharding(); // T==String write = (TextIO.Write.Bound<T>) writeStrings; } else { write = TextIO.Write.to(filename).withCoder(coder).withoutSharding(); } input.apply(write); p.run(); List<String> actual = new ArrayList<>(); try (BufferedReader reader = new BufferedReader(new FileReader(tmpFile))) { for (; ; ) { String line = reader.readLine(); if (line == null) { break; } actual.add(line); } } String[] expected = new String[elems.length]; for (int i = 0; i < elems.length; i++) { T elem = elems[i]; byte[] encodedElem = CoderUtils.encodeToByteArray(coder, elem); String line = new String(encodedElem); expected[i] = line; } assertThat(actual, containsInAnyOrder(expected)); }
/** * Takes a few Reads and writes them to a BAM file. The Reads don't have to be sorted * initially; the output BAM file will be sorted. All the reads must fit into a single worker's memory, so this * won't work well when there are too many reads. * * @param pipeline the pipeline to add this operation to. * @param reads the reads to write (they don't need to be sorted). * @param header the header that corresponds to the reads. * @param destPath the GCS or local path to write to (must start with "gs://" if writing to GCS). * @param parquet whether to write out BAM or Parquet data (BDG AlignmentRecords); only applies * when writing to Hadoop */ public static void writeToFile( Pipeline pipeline, PCollection<GATKRead> reads, final SAMFileHeader header, final String destPath, final boolean parquet) { if (BucketUtils.isHadoopUrl(destPath) || pipeline.getRunner().getClass().equals(SparkPipelineRunner.class)) { writeToHadoop(pipeline, reads, header, destPath, parquet); } else { PCollectionView<Iterable<GATKRead>> iterableView = reads.apply(View.<GATKRead>asIterable()); PCollection<String> dummy = pipeline.apply("output file name", Create.<String>of(destPath)); dummy.apply( ParDo.named("save to BAM file") .withSideInputs(iterableView) .of(new SaveToBAMFile(header, iterableView))); } }
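// Hedged usage sketch: the wrapper below and the origin of its arguments are assumptions; only the
// writeToFile signature above comes from the source. A Hadoop destPath (or the Spark runner) would
// route the write through writeToHadoop instead of the local SaveToBAMFile branch.
public static void writeExample(
    Pipeline pipeline, PCollection<GATKRead> reads, SAMFileHeader header) {
  // Write plain BAM (parquet == false) to a local path; all reads are gathered on one worker.
  writeToFile(pipeline, reads, header, "/tmp/example-output.bam", false);
  pipeline.run();
}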
@Test public void testWriteSharded() throws IOException { File outFolder = tmpFolder.newFolder(); String filename = outFolder.toPath().resolve("output").toString(); Pipeline p = TestPipeline.create(); PCollection<String> input = p.apply(Create.of(Arrays.asList(LINES_ARRAY)).withCoder(StringUtf8Coder.of())); input.apply(TextIO.Write.to(filename).withNumShards(2).withSuffix(".txt")); p.run(); String[] files = outFolder.list(); assertThat( Arrays.asList(files), containsInAnyOrder("output-00000-of-00002.txt", "output-00001-of-00002.txt")); }
private void runApproximateUniqueWithSkewedDistributions( int elementCount, final int uniqueCount, final int sampleSize) { List<Integer> elements = Lists.newArrayList(); // Zipf distribution with approximately elementCount items. double s = 1 - 1.0 * uniqueCount / elementCount; double maxCount = Math.pow(uniqueCount, s); for (int k = 0; k < uniqueCount; k++) { int count = Math.max(1, (int) Math.round(maxCount * Math.pow(k, -s))); // Element k occurs count times. for (int c = 0; c < count; c++) { elements.add(k); } } Pipeline p = TestPipeline.create(); PCollection<Integer> input = p.apply(Create.of(elements)); PCollection<Long> estimate = input.apply(ApproximateUnique.<Integer>globally(sampleSize)); DataflowAssert.thatSingleton(estimate).satisfies(new VerifyEstimateFn(uniqueCount, sampleSize)); p.run(); }
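// Worked example of the skew above (illustrative numbers, not from the source): with
// elementCount = 1000 and uniqueCount = 100, s = 1 - 100/1000 = 0.9 and maxCount = 100^0.9,
// roughly 63, so element k = 1 is added about 63 times, k = 2 about 34 times, k = 10 about
// 8 times, and the largest k values only once each.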
@Override public PCollection<GATKRead> apply(PCollection<GATKRead> input) { return input.apply( ParDo.named("ApplyBQSR") .of( new DoFnWLog<GATKRead, GATKRead>("ApplyBQSRStub") { private static final long serialVersionUID = 1L; @Override public void processElement(ProcessContext c) throws Exception { c.output(c.element()); } }) .withSideInputs(header, recalibrationReport)); }
@Override public PCollection<T> apply(PInput input) { if (topic == null && subscription == null) { throw new IllegalStateException( "need to set either the topic or the subscription for " + "a PubsubIO.Read transform"); } if (topic != null && subscription != null) { throw new IllegalStateException( "Can't set both the topic and the subscription for a " + "PubsubIO.Read transform"); } return PCollection.<T>createPrimitiveOutputInternal( input.getPipeline(), WindowingStrategy.globalDefault(), IsBounded.UNBOUNDED) .setCoder(coder); }
private static void writeToHadoop( Pipeline pipeline, PCollection<GATKRead> reads, final SAMFileHeader header, final String destPath, final boolean parquet) { if (destPath.equals("/dev/null")) { return; } String headerString = Base64.getEncoder().encodeToString(SerializableUtils.serializeToByteArray(header)); @SuppressWarnings("unchecked") Class<? extends FileOutputFormat<NullWritable, SAMRecordWritable>> outputFormatClass = (Class<? extends FileOutputFormat<NullWritable, SAMRecordWritable>>) (Class<?>) TemplatedKeyIgnoringBAMOutputFormat.class; @SuppressWarnings("unchecked") HadoopIO.Write.Bound<NullWritable, SAMRecordWritable> write = HadoopIO.Write.to(destPath, outputFormatClass, NullWritable.class, SAMRecordWritable.class) .withConfigurationProperty( TemplatedKeyIgnoringBAMOutputFormat.SAM_HEADER_PROPERTY_NAME, headerString); PCollection<KV<NullWritable, SAMRecordWritable>> samReads = reads .apply( ParDo.of( new DoFn<GATKRead, KV<NullWritable, SAMRecordWritable>>() { private static final long serialVersionUID = 1L; @Override public void processElement(ProcessContext c) throws Exception { SAMRecord samRecord = c.element().convertToSAMRecord(header); SAMRecordWritable samRecordWritable = new SAMRecordWritable(); samRecordWritable.set(samRecord); c.output(KV.of(NullWritable.get(), samRecordWritable)); } })) .setCoder( KvCoder.of( WritableCoder.of(NullWritable.class), WritableCoder.of(SAMRecordWritable.class))); // write as a single (unsharded) file samReads.apply(write.withoutSharding()); }
/** * addQuantizationInfo takes the computed RecalibrationTables and adds the QuantizationInfo and * RequestedCovariates objects. We call this triplet "BaseRecalOutput". It contains everything we * need from phase 1 to continue on to phase 2 of BQSR. */ private static PCollection<BaseRecalOutput> addQuantizationInfo( PCollectionView<SAMFileHeader> headerView, RecalibrationArgumentCollection recalArgs, PCollection<RecalibrationTables> recal) { return recal.apply( ParDo.named("addQuantizationInfo") .withSideInputs(headerView) .of( new DoFnWLog<RecalibrationTables, BaseRecalOutput>("addQuantizationInfo") { private static final long serialVersionUID = 1L; @Override public void processElement(ProcessContext c) throws IOException { RecalibrationTables rt = c.element(); SAMFileHeader header = c.sideInput(headerView); // BaseRecalOutput ret = new BaseRecalOutput(rt, // baseRecalibratorWorker.getQuantizationInfo(rt), // baseRecalibratorWorker.getRequestedCovariates()); // Saving and loading back the report actually changes it, so we have to do the save/reload round trip here. // TODO(issue#799): Figure out what it changes, and just do that instead of // doing the whole rigamarole. File temp = IOUtils.createTempFile("temp-recalibrationtable-", ".tmp"); if (null == rt) { // special case where we have zero reads in the input. Create a valid empty // report. log.debug("Special case: zero reads in input."); BaseRecalibrationEngine recalibrationEngine = new BaseRecalibrationEngine(recalArgs, header); rt = recalibrationEngine.getRecalibrationTables(); BaseRecalibrationEngine.finalizeRecalibrationTables(rt); } try { BaseRecalibratorOptimizedFn.saveTextualReport(temp, header, rt, recalArgs); BaseRecalOutput ret = new BaseRecalOutput(temp); c.output(ret); } catch (FileNotFoundException e) { throw new GATKException("can't find my own temporary file", e); } catch (IOException e) { throw new GATKException( "unable to save temporary report to " + temp.getPath(), e); } } })); }