@Test(dataProvider = "bases") public void addContextDataTest( List<GATKRead> reads, List<Variant> variantList, List<KV<GATKRead, ReferenceBases>> kvReadRefBases, List<KV<GATKRead, ReadContextData>> kvReadContextData, List<SimpleInterval> intervals, List<KV<GATKRead, Iterable<Variant>>> kvReadiVariant) { Pipeline p = GATKTestPipeline.create(); DataflowUtils.registerGATKCoders(p); PCollection<GATKRead> pReads = DataflowTestUtils.pCollectionCreateAndVerify(p, reads, new GATKReadCoder()); PCollection<KV<GATKRead, ReferenceBases>> pReadRef = DataflowTestUtils.pCollectionCreateAndVerify( p, kvReadRefBases, KvCoder.of(new GATKReadCoder(), SerializableCoder.of(ReferenceBases.class))); PCollection<KV<GATKRead, Iterable<Variant>>> pReadVariants = p.apply( Create.of(kvReadiVariant) .withCoder(KvCoder.of(new GATKReadCoder(), IterableCoder.of(new VariantCoder())))); PCollection<KV<GATKRead, ReadContextData>> joinedResults = AddContextDataToRead.join(pReads, pReadRef, pReadVariants); PCollection<KV<GATKRead, ReadContextData>> pkvReadContextData = p.apply( Create.of(kvReadContextData) .withCoder(KvCoder.of(new GATKReadCoder(), new ReadContextDataCoder()))); DataflowTestUtils.keyReadContextDataMatcher(joinedResults, pkvReadContextData); p.run(); }
@Override
protected void testProgram() throws Exception {
  Pipeline p = FlinkTestPipeline.create();

  PCollection<TableRow> input1 = p.apply(Create.of(EVENT_ARRAY));
  PCollection<TableRow> input2 = p.apply(Create.of(CC_ARRAY));

  PCollection<String> output = JoinExamples.joinEvents(input1, input2);
  output.apply(TextIO.Write.to(resultPath));

  p.run();
}
@Test
public void testCompressedRead() throws Exception {
  String[] lines = {"Irritable eagle", "Optimistic jay", "Fanciful hawk"};
  File tmpFile = tmpFolder.newFile("test");
  String filename = tmpFile.getPath();

  List<String> expected = new ArrayList<>();
  try (PrintStream writer =
      new PrintStream(new GZIPOutputStream(new FileOutputStream(tmpFile)))) {
    for (String line : lines) {
      writer.println(line);
      expected.add(line);
    }
  }

  Pipeline p = TestPipeline.create();
  TextIO.Read.Bound<String> read =
      TextIO.Read.from(filename).withCompressionType(CompressionType.GZIP);
  PCollection<String> output = p.apply(read);

  DataflowAssert.that(output).containsInAnyOrder(expected);
  p.run();
  tmpFile.delete();
}
<T> void runTestRead(T[] expected, Coder<T> coder) throws Exception {
  File tmpFile = tmpFolder.newFile("file.txt");
  String filename = tmpFile.getPath();
  try (PrintStream writer = new PrintStream(new FileOutputStream(tmpFile))) {
    for (T elem : expected) {
      byte[] encodedElem = CoderUtils.encodeToByteArray(coder, elem);
      String line = new String(encodedElem);
      writer.println(line);
    }
  }

  Pipeline p = TestPipeline.create();

  TextIO.Read.Bound<T> read;
  if (coder.equals(StringUtf8Coder.of())) {
    TextIO.Read.Bound<String> readStrings = TextIO.Read.from(filename);
    // T==String
    read = (TextIO.Read.Bound<T>) readStrings;
  } else {
    read = TextIO.Read.from(filename).withCoder(coder);
  }

  PCollection<T> output = p.apply(read);

  DataflowAssert.that(output).containsInAnyOrder(expected);
  p.run();
}
@Test(dataProvider = "reads") public void testGATKReadCoding(final List<GATKRead> reads) { // The simplest way to figure out if a class is coded correctly is to create a PCollection // of that type and see if it matches the List version. final Pipeline p = GATKTestPipeline.create(); DataflowUtils.registerGATKCoders(p); // Need to explicitly set the coder to GATKReadCoder, otherwise Create fails to infer // a coder properly in the case where the List contains a mix of different GATKRead // implementations. final PCollection<GATKRead> dataflowReads = p.apply(Create.of(reads).withCoder(new GATKReadCoder())); DataflowAssert.that(dataflowReads).containsInAnyOrder(reads); final PCollection<GATKRead> dataflowReadsAfterTransform = dataflowReads .apply( ParDo.of( new DoFn<GATKRead, GATKRead>() { private static final long serialVersionUID = 1l; @Override public void processElement(ProcessContext c) throws Exception { c.output(c.element()); } })) .setCoder(new GATKReadCoder()); DataflowAssert.that(dataflowReadsAfterTransform).containsInAnyOrder(reads); p.run(); }
/**
 * Applies {@code ApproximateUnique(sampleSize)} verifying that the estimation error falls within
 * the maximum allowed error of {@code 2/sqrt(sampleSize)}.
 */
private static void runApproximateUniquePipeline(int sampleSize) {
  Pipeline p = TestPipeline.create();
  PCollection<String> input = p.apply(Create.of(TEST_LINES));
  PCollection<Long> approximate = input.apply(ApproximateUnique.<String>globally(sampleSize));
  final PCollectionView<Long> exact =
      input
          .apply(RemoveDuplicates.<String>create())
          .apply(Count.<String>globally())
          .apply(View.<Long>asSingleton());

  PCollection<KV<Long, Long>> approximateAndExact =
      approximate.apply(
          ParDo.of(
                  new DoFn<Long, KV<Long, Long>>() {
                    @Override
                    public void processElement(ProcessContext c) {
                      c.output(KV.of(c.element(), c.sideInput(exact)));
                    }
                  })
              .withSideInputs(exact));

  DataflowAssert.that(approximateAndExact).satisfies(new VerifyEstimatePerKeyFn(sampleSize));

  p.run();
}
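// The assertion above delegates to a verifier (VerifyEstimatePerKeyFn) whose body is not shown
// here. The class below is only an illustrative sketch, written against the Javadoc's
// 2/sqrt(sampleSize) bound, of what such a per-pair check could look like; the name
// VerifyEstimatePairsSketchFn and its exact assertions are assumptions, not the project's
// actual implementation (assumes a static import of org.junit.Assert.assertTrue).
static class VerifyEstimatePairsSketchFn
    implements SerializableFunction<Iterable<KV<Long, Long>>, Void> {
  private final int sampleSize;

  VerifyEstimatePairsSketchFn(int sampleSize) {
    this.sampleSize = sampleSize;
  }

  @Override
  public Void apply(Iterable<KV<Long, Long>> estimateAndExactPairs) {
    // Each element pairs an estimate (key) with the exact distinct count (value), as built in
    // runApproximateUniquePipeline above.
    double maxRelativeError = 2.0 / Math.sqrt(sampleSize);
    for (KV<Long, Long> pair : estimateAndExactPairs) {
      double relativeError =
          Math.abs(pair.getKey() - pair.getValue()) / (double) pair.getValue();
      assertTrue(
          "Relative error " + relativeError + " exceeds bound " + maxRelativeError,
          relativeError <= maxRelativeError);
    }
    return null;
  }
}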
@Test @SuppressWarnings("unchecked") public void testTop() { Pipeline p = TestPipeline.create(); PCollection<String> input = p.apply(Create.of(Arrays.asList(COLLECTION)).withCoder(StringUtf8Coder.of())); PCollection<List<String>> top1 = input.apply(Top.of(1, new OrderByLength())); PCollection<List<String>> top2 = input.apply(Top.<String>largest(2)); PCollection<List<String>> top3 = input.apply(Top.<String>smallest(3)); PCollection<KV<String, Integer>> inputTable = createInputTable(p); PCollection<KV<String, List<Integer>>> largestPerKey = inputTable.apply(Top.<String, Integer>largestPerKey(2)); PCollection<KV<String, List<Integer>>> smallestPerKey = inputTable.apply(Top.<String, Integer>smallestPerKey(2)); DataflowAssert.thatSingletonIterable(top1).containsInAnyOrder(Arrays.asList("bb")); DataflowAssert.thatSingletonIterable(top2).containsInAnyOrder("z", "c"); DataflowAssert.thatSingletonIterable(top3).containsInAnyOrder("a", "bb", "c"); DataflowAssert.that(largestPerKey) .containsInAnyOrder(KV.of("a", Arrays.asList(3, 2)), KV.of("b", Arrays.asList(100, 10))); DataflowAssert.that(smallestPerKey) .containsInAnyOrder(KV.of("a", Arrays.asList(1, 2)), KV.of("b", Arrays.asList(1, 10))); p.run(); }
@Override
public PCollection<KV<URI, String>> apply(PInput input) {
  Pipeline pipeline = input.getPipeline();

  // Create one TextIO.Read transform for each document
  // and add its output to a PCollectionList
  PCollectionList<KV<URI, String>> urisToLines = PCollectionList.empty(pipeline);

  // TextIO.Read supports:
  //  - file: URIs and paths locally
  //  - gs: URIs on the service
  for (final URI uri : uris) {
    String uriString;
    if (uri.getScheme().equals("file")) {
      uriString = new File(uri).getPath();
    } else {
      uriString = uri.toString();
    }

    PCollection<KV<URI, String>> oneUriToLines =
        pipeline
            .apply(TextIO.Read.from(uriString).named("TextIO.Read(" + uriString + ")"))
            .apply(WithKeys.<URI, String>of(uri).setName("WithKeys(" + uriString + ")"));

    urisToLines = urisToLines.and(oneUriToLines);
  }

  return urisToLines.apply(Flatten.<KV<URI, String>>pCollections());
}
@Test @SuppressWarnings("unchecked") public void testTopEmpty() { Pipeline p = TestPipeline.create(); PCollection<String> input = p.apply(Create.of(Arrays.asList(EMPTY_COLLECTION)).withCoder(StringUtf8Coder.of())); PCollection<List<String>> top1 = input.apply(Top.of(1, new OrderByLength())); PCollection<List<String>> top2 = input.apply(Top.<String>largest(2)); PCollection<List<String>> top3 = input.apply(Top.<String>smallest(3)); PCollection<KV<String, Integer>> inputTable = createEmptyInputTable(p); PCollection<KV<String, List<Integer>>> largestPerKey = inputTable.apply(Top.<String, Integer>largestPerKey(2)); PCollection<KV<String, List<Integer>>> smallestPerKey = inputTable.apply(Top.<String, Integer>smallestPerKey(2)); DataflowAssert.thatSingletonIterable(top1).empty(); DataflowAssert.thatSingletonIterable(top2).empty(); DataflowAssert.thatSingletonIterable(top3).empty(); DataflowAssert.that(largestPerKey).empty(); DataflowAssert.that(smallestPerKey).empty(); p.run(); }
@Test
public void testReadNamed() {
  Pipeline p = TestPipeline.create();

  {
    PCollection<String> output1 = p.apply(TextIO.Read.from("/tmp/file.txt"));
    assertEquals("TextIO.Read.out", output1.getName());
  }

  {
    PCollection<String> output2 = p.apply(TextIO.Read.named("MyRead").from("/tmp/file.txt"));
    assertEquals("MyRead.out", output2.getName());
  }

  {
    PCollection<String> output3 = p.apply(TextIO.Read.from("/tmp/file.txt").named("HerRead"));
    assertEquals("HerRead.out", output3.getName());
  }
}
@Test
public void testCountConstraint() {
  Pipeline p = TestPipeline.create();
  PCollection<String> input =
      p.apply(Create.of(Arrays.asList(COLLECTION)).withCoder(StringUtf8Coder.of()));

  expectedEx.expect(IllegalArgumentException.class);
  expectedEx.expectMessage(Matchers.containsString(">= 0"));

  input.apply(Top.of(-1, new OrderByLength()));
}
/** Recursive wildcards are not supported. This tests "**". */
@Test
public void testBadWildcardRecursive() throws Exception {
  Pipeline pipeline = TestPipeline.create();
  pipeline.apply(TextIO.Read.from("gs://bucket/foo**/baz"));

  // Check that running does fail.
  expectedException.expect(IllegalArgumentException.class);
  expectedException.expectMessage("wildcard");
  pipeline.run();
}
@Test(dataProvider = "bases") public void fullTest( List<GATKRead> reads, List<Variant> variantList, List<KV<GATKRead, ReferenceBases>> kvReadRefBases, List<KV<GATKRead, ReadContextData>> kvReadContextData, List<SimpleInterval> intervals, List<KV<GATKRead, Iterable<Variant>>> kvReadiVariant) { Pipeline p = GATKTestPipeline.create(); DataflowUtils.registerGATKCoders(p); PCollection<GATKRead> pReads = DataflowTestUtils.pCollectionCreateAndVerify(p, reads, new GATKReadCoder()); PCollection<Variant> pVariant = p.apply(Create.of(variantList)); VariantsDataflowSource mockVariantsSource = mock(VariantsDataflowSource.class); when(mockVariantsSource.getAllVariants()).thenReturn(pVariant); RefAPISource mockSource = mock(RefAPISource.class, withSettings().serializable()); for (SimpleInterval i : intervals) { when(mockSource.getReferenceBases( any(PipelineOptions.class), any(RefAPIMetadata.class), eq(i))) .thenReturn(FakeReferenceSource.bases(i)); } String referenceName = "refName"; String refId = "0xbjfjd23f"; Map<String, String> referenceNameToIdTable = Maps.newHashMap(); referenceNameToIdTable.put(referenceName, refId); RefAPIMetadata refAPIMetadata = new RefAPIMetadata(referenceName, referenceNameToIdTable); RefAPISource.setRefAPISource(mockSource); PCollection<KV<GATKRead, ReadContextData>> result = AddContextDataToRead.add(pReads, /*mockSource,*/ refAPIMetadata, mockVariantsSource); PCollection<KV<GATKRead, ReadContextData>> pkvReadContextData = p.apply( Create.of(kvReadContextData) .withCoder(KvCoder.of(new GATKReadCoder(), new ReadContextDataCoder()))); DataflowTestUtils.keyReadContextDataMatcher(result, pkvReadContextData); p.run(); }
/** Reads a large {@code PCollection<String>}. */
private PCollection<String> readPCollection(Pipeline p) {
  // TODO: Read PCollection from a set of text files.
  List<String> page = TestUtils.LINES;
  final int pages = 1000;

  ArrayList<String> file = new ArrayList<>(pages * page.size());
  for (int i = 0; i < pages; i++) {
    file.addAll(page);
  }

  assert file.size() == pages * page.size();

  PCollection<String> words = p.apply(Create.of(file));

  return words;
}
// This is a purely compile-time test. If the code compiles, then it worked.
@Test
public void testPerKeySerializabilityRequirement() {
  Pipeline p = TestPipeline.create();
  p.apply(
      "CreateCollection", Create.of(Arrays.asList(COLLECTION)).withCoder(StringUtf8Coder.of()));

  PCollection<KV<String, Integer>> inputTable = createInputTable(p);
  inputTable.apply(Top.<String, Integer, IntegerComparator>perKey(1, new IntegerComparator()));

  inputTable.apply(
      "PerKey2", Top.<String, Integer, IntegerComparator2>perKey(1, new IntegerComparator2()));
}
public static void main(String[] args) throws Exception {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline pipeline = Pipeline.create(options);
  pipeline.getCoderRegistry().registerCoder(URI.class, StringDelegateCoder.of(URI.class));

  pipeline
      .apply(new ReadDocuments(listInputDocuments(options)))
      .apply(new ComputeTfIdf())
      .apply(new WriteTfIdf(options.getOutput()));

  pipeline.run();
}
@Test
@Category(RunnableOnService.class)
public void testApproximateUniqueWithSmallInput() {
  Pipeline p = TestPipeline.create();

  PCollection<Integer> input = p.apply(Create.of(Arrays.asList(1, 2, 3, 3)));
  PCollection<Long> estimate = input.apply(ApproximateUnique.<Integer>globally(1000));

  DataflowAssert.thatSingleton(estimate).isEqualTo(3L);

  p.run();
}
/** Recursive wildcards are not supported. This tests "**". */
@Test
public void testBadWildcardRecursive() throws Exception {
  DataflowPipelineOptions options = buildPipelineOptions();
  Pipeline pipeline = DataflowPipeline.create(options);
  DataflowPipelineTranslator t = DataflowPipelineTranslator.fromOptions(options);

  pipeline.apply(TextIO.Read.from("gs://bucket/foo**/baz"));

  // Check that translation does fail.
  thrown.expect(IllegalArgumentException.class);
  thrown.expectMessage("Unsupported wildcard usage");
  t.translate(pipeline, Collections.<DataflowPackage>emptyList());
}
/** Sets up and starts the streaming pipeline. */
public static void main(String[] args) {
  PubsubFileInjectorOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(PubsubFileInjectorOptions.class);

  Pipeline pipeline = Pipeline.create(options);

  pipeline
      .apply(TextIO.Read.from(options.getInput()))
      .apply(
          IntraBundleParallelization.of(PubsubFileInjector.publish(options.getOutputTopic()))
              .withMaxParallelism(20));

  pipeline.run();
}
@Test
public void testUnsupportedFilePattern() throws IOException {
  File outFolder = tmpFolder.newFolder();
  // Windows doesn't like resolving paths with * in them.
  String filename = outFolder.toPath().resolve("output@5").toString();

  Pipeline p = TestPipeline.create();

  PCollection<String> input =
      p.apply(Create.of(Arrays.asList(LINES_ARRAY)).withCoder(StringUtf8Coder.of()));

  expectedException.expect(IllegalArgumentException.class);
  expectedException.expectMessage("Output name components are not allowed to contain");
  input.apply(TextIO.Write.to(filename));
}
/**
 * Runs the batch injector for the streaming pipeline.
 *
 * <p>The injector pipeline will read from the given text file, and inject data into the Google
 * Cloud Pub/Sub topic.
 */
public void runInjectorPipeline(String inputFile, String topic) {
  DataflowPipelineOptions copiedOptions = options.cloneAs(DataflowPipelineOptions.class);
  copiedOptions.setStreaming(false);
  copiedOptions.setNumWorkers(
      options.as(ExamplePubsubTopicOptions.class).getInjectorNumWorkers());
  copiedOptions.setJobName(options.getJobName() + "-injector");

  Pipeline injectorPipeline = Pipeline.create(copiedOptions);
  injectorPipeline
      .apply(TextIO.Read.from(inputFile))
      .apply(
          IntraBundleParallelization.of(PubsubFileInjector.publish(topic))
              .withMaxParallelism(20));

  DataflowPipelineJob injectorJob = (DataflowPipelineJob) injectorPipeline.run();
  jobsToCancel.add(injectorJob);
}
public static void main(String[] args) {
  PipelineOptionsFactory.register(KafkaStreamingWordCountOptions.class);
  KafkaStreamingWordCountOptions options =
      PipelineOptionsFactory.fromArgs(args).as(KafkaStreamingWordCountOptions.class);
  options.setJobName("KafkaExample - WindowSize: " + options.getWindowSize() + " seconds");
  options.setStreaming(true);
  options.setCheckpointingInterval(1000L);
  options.setNumberOfExecutionRetries(5);
  options.setExecutionRetryDelay(3000L);
  options.setRunner(FlinkPipelineRunner.class);

  System.out.println(
      options.getKafkaTopic() + " " + options.getZookeeper() + " " + options.getBroker() + " "
          + options.getGroup());

  Pipeline pipeline = Pipeline.create(options);

  Properties p = new Properties();
  p.setProperty("zookeeper.connect", options.getZookeeper());
  p.setProperty("bootstrap.servers", options.getBroker());
  p.setProperty("group.id", options.getGroup());

  // This is the Flink consumer that reads the program's input from a Kafka topic.
  FlinkKafkaConsumer08<String> kafkaConsumer =
      new FlinkKafkaConsumer08<>(options.getKafkaTopic(), new SimpleStringSchema(), p);

  PCollection<String> words =
      pipeline
          .apply(
              Read.from(new UnboundedFlinkSource<>(kafkaConsumer)).named("StreamingWordCount"))
          .apply(ParDo.of(new ExtractWordsFn()))
          .apply(
              Window.<String>into(
                      FixedWindows.of(Duration.standardSeconds(options.getWindowSize())))
                  .triggering(AfterWatermark.pastEndOfWindow())
                  .withAllowedLateness(Duration.ZERO)
                  .discardingFiredPanes());

  PCollection<KV<String, Long>> wordCounts = words.apply(Count.<String>perElement());

  wordCounts.apply(ParDo.of(new FormatAsStringFn())).apply(TextIO.Write.to("./outputKafka.txt"));

  pipeline.run();
}
@Test
public void testTopEmptyWithIncompatibleWindows() {
  Pipeline p = TestPipeline.create();
  Bound<String> windowingFn = Window.<String>into(FixedWindows.of(Duration.standardDays(10L)));
  PCollection<String> input =
      p.apply(Create.timestamped(Collections.<String>emptyList(), Collections.<Long>emptyList()))
          .apply(windowingFn);

  expectedEx.expect(IllegalStateException.class);
  expectedEx.expectMessage("Top");
  expectedEx.expectMessage("GlobalWindows");
  expectedEx.expectMessage("withoutDefaults");
  expectedEx.expectMessage("asSingletonView");

  input.apply(Top.of(1, new OrderByLength()));
}
private void runApproximateUniqueWithDuplicates(
    int elementCount, int uniqueCount, int sampleSize) {
  assert elementCount >= uniqueCount;
  List<Double> elements = Lists.newArrayList();
  for (int i = 0; i < elementCount; i++) {
    elements.add(1.0 / (i % uniqueCount + 1));
  }
  Collections.shuffle(elements);

  Pipeline p = TestPipeline.create();
  PCollection<Double> input = p.apply(Create.of(elements));
  PCollection<Long> estimate = input.apply(ApproximateUnique.<Double>globally(sampleSize));

  DataflowAssert.thatSingleton(estimate).satisfies(new VerifyEstimateFn(uniqueCount, sampleSize));

  p.run();
}
<T> void runTestWrite(T[] elems, Coder<T> coder) throws Exception {
  File tmpFile = tmpFolder.newFile("file.txt");
  String filename = tmpFile.getPath();

  Pipeline p = TestPipeline.create();

  PCollection<T> input = p.apply(Create.of(Arrays.asList(elems)).withCoder(coder));

  TextIO.Write.Bound<T> write;
  if (coder.equals(StringUtf8Coder.of())) {
    TextIO.Write.Bound<String> writeStrings = TextIO.Write.to(filename).withoutSharding();
    // T==String
    write = (TextIO.Write.Bound<T>) writeStrings;
  } else {
    write = TextIO.Write.to(filename).withCoder(coder).withoutSharding();
  }

  input.apply(write);

  p.run();

  List<String> actual = new ArrayList<>();
  try (BufferedReader reader = new BufferedReader(new FileReader(tmpFile))) {
    for (;;) {
      String line = reader.readLine();
      if (line == null) {
        break;
      }
      actual.add(line);
    }
  }

  String[] expected = new String[elems.length];
  for (int i = 0; i < elems.length; i++) {
    T elem = elems[i];
    byte[] encodedElem = CoderUtils.encodeToByteArray(coder, elem);
    String line = new String(encodedElem);
    expected[i] = line;
  }

  assertThat(actual, containsInAnyOrder(expected));
}
/**
 * Takes a few Reads and will write them to a BAM file. The Reads don't have to be sorted
 * initially; the BAM file will be. All the reads must fit into a single worker's memory, so this
 * won't go well if you have too many.
 *
 * @param pipeline the pipeline to add this operation to.
 * @param reads the reads to write (they don't need to be sorted).
 * @param header the header that corresponds to the reads.
 * @param destPath the GCS or local path to write to (must start with "gs://" if writing to GCS).
 * @param parquet whether to write out BAM or Parquet data (BDG AlignmentRecords); only applies
 *     when writing to Hadoop
 */
public static void writeToFile(
    Pipeline pipeline,
    PCollection<GATKRead> reads,
    final SAMFileHeader header,
    final String destPath,
    final boolean parquet) {
  if (BucketUtils.isHadoopUrl(destPath)
      || pipeline.getRunner().getClass().equals(SparkPipelineRunner.class)) {
    writeToHadoop(pipeline, reads, header, destPath, parquet);
  } else {
    PCollectionView<Iterable<GATKRead>> iterableView = reads.apply(View.<GATKRead>asIterable());

    PCollection<String> dummy = pipeline.apply("output file name", Create.<String>of(destPath));

    dummy.apply(
        ParDo.named("save to BAM file")
            .withSideInputs(iterableView)
            .of(new SaveToBAMFile(header, iterableView)));
  }
}
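// Hypothetical usage sketch for writeToFile above; the method name, variable names
// (markedReads, readsHeader), and the output path are illustrative only and not taken from
// the source.
public static void writeMarkedReadsExample(
    Pipeline pipeline, PCollection<GATKRead> markedReads, SAMFileHeader readsHeader) {
  // Local destination, so the non-Hadoop branch is taken and the parquet flag is ignored.
  writeToFile(pipeline, markedReads, readsHeader, "/tmp/marked_reads.bam", false);
}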
@Test
public void testWriteSharded() throws IOException {
  File outFolder = tmpFolder.newFolder();
  String filename = outFolder.toPath().resolve("output").toString();

  Pipeline p = TestPipeline.create();

  PCollection<String> input =
      p.apply(Create.of(Arrays.asList(LINES_ARRAY)).withCoder(StringUtf8Coder.of()));

  input.apply(TextIO.Write.to(filename).withNumShards(2).withSuffix(".txt"));

  p.run();

  String[] files = outFolder.list();

  assertThat(
      Arrays.asList(files),
      containsInAnyOrder("output-00000-of-00002.txt", "output-00001-of-00002.txt"));
}
@Test
public void testApproximateUniquePerKey() {
  List<KV<Long, Long>> elements = Lists.newArrayList();
  List<Long> keys = ImmutableList.of(20L, 50L, 100L);
  int elementCount = 1000;
  int sampleSize = 100;
  // Use the key as the number of unique values.
  for (long uniqueCount : keys) {
    for (long value = 0; value < elementCount; value++) {
      elements.add(KV.of(uniqueCount, value % uniqueCount));
    }
  }

  Pipeline p = TestPipeline.create();
  PCollection<KV<Long, Long>> input = p.apply(Create.of(elements));
  PCollection<KV<Long, Long>> counts =
      input.apply(ApproximateUnique.<Long, Long>perKey(sampleSize));

  DataflowAssert.that(counts).satisfies(new VerifyEstimatePerKeyFn(sampleSize));

  p.run();
}
private void runApproximateUniqueWithSkewedDistributions(
    int elementCount, final int uniqueCount, final int sampleSize) {
  List<Integer> elements = Lists.newArrayList();
  // Zipf distribution with approximately elementCount items.
  double s = 1 - 1.0 * uniqueCount / elementCount;
  double maxCount = Math.pow(uniqueCount, s);
  for (int k = 0; k < uniqueCount; k++) {
    int count = Math.max(1, (int) Math.round(maxCount * Math.pow(k, -s)));
    // Element k occurs count times.
    for (int c = 0; c < count; c++) {
      elements.add(k);
    }
  }

  Pipeline p = TestPipeline.create();
  PCollection<Integer> input = p.apply(Create.of(elements));
  PCollection<Long> estimate = input.apply(ApproximateUnique.<Integer>globally(sampleSize));

  DataflowAssert.thatSingleton(estimate).satisfies(new VerifyEstimateFn(uniqueCount, sampleSize));

  p.run();
}
private void applyRead(Pipeline pipeline, String path) {
  pipeline.apply("Read(" + path + ")", TextIO.Read.from(path));
}