@Test @SuppressWarnings("unchecked") public void testTopEmpty() { Pipeline p = TestPipeline.create(); PCollection<String> input = p.apply(Create.of(Arrays.asList(EMPTY_COLLECTION)).withCoder(StringUtf8Coder.of())); PCollection<List<String>> top1 = input.apply(Top.of(1, new OrderByLength())); PCollection<List<String>> top2 = input.apply(Top.<String>largest(2)); PCollection<List<String>> top3 = input.apply(Top.<String>smallest(3)); PCollection<KV<String, Integer>> inputTable = createEmptyInputTable(p); PCollection<KV<String, List<Integer>>> largestPerKey = inputTable.apply(Top.<String, Integer>largestPerKey(2)); PCollection<KV<String, List<Integer>>> smallestPerKey = inputTable.apply(Top.<String, Integer>smallestPerKey(2)); DataflowAssert.thatSingletonIterable(top1).empty(); DataflowAssert.thatSingletonIterable(top2).empty(); DataflowAssert.thatSingletonIterable(top3).empty(); DataflowAssert.that(largestPerKey).empty(); DataflowAssert.that(smallestPerKey).empty(); p.run(); }
@Override
public PCollection<KV<URI, String>> apply(PInput input) {
  Pipeline pipeline = input.getPipeline();

  // Create one TextIO.Read transform for each document
  // and add its output to a PCollectionList.
  PCollectionList<KV<URI, String>> urisToLines = PCollectionList.empty(pipeline);

  // TextIO.Read supports:
  //  - file: URIs and paths locally
  //  - gs: URIs on the service
  for (final URI uri : uris) {
    String uriString;
    if (uri.getScheme().equals("file")) {
      uriString = new File(uri).getPath();
    } else {
      uriString = uri.toString();
    }

    PCollection<KV<URI, String>> oneUriToLines = pipeline
        .apply(TextIO.Read.from(uriString).named("TextIO.Read(" + uriString + ")"))
        .apply(WithKeys.<URI, String>of(uri).setName("WithKeys(" + uriString + ")"));

    urisToLines = urisToLines.and(oneUriToLines);
  }

  return urisToLines.apply(Flatten.<KV<URI, String>>pCollections());
}
@Test(dataProvider = "reads") public void testGATKReadCoding(final List<GATKRead> reads) { // The simplest way to figure out if a class is coded correctly is to create a PCollection // of that type and see if it matches the List version. final Pipeline p = GATKTestPipeline.create(); DataflowUtils.registerGATKCoders(p); // Need to explicitly set the coder to GATKReadCoder, otherwise Create fails to infer // a coder properly in the case where the List contains a mix of different GATKRead // implementations. final PCollection<GATKRead> dataflowReads = p.apply(Create.of(reads).withCoder(new GATKReadCoder())); DataflowAssert.that(dataflowReads).containsInAnyOrder(reads); final PCollection<GATKRead> dataflowReadsAfterTransform = dataflowReads .apply( ParDo.of( new DoFn<GATKRead, GATKRead>() { private static final long serialVersionUID = 1l; @Override public void processElement(ProcessContext c) throws Exception { c.output(c.element()); } })) .setCoder(new GATKReadCoder()); DataflowAssert.that(dataflowReadsAfterTransform).containsInAnyOrder(reads); p.run(); }
@Test
public void testCompressedRead() throws Exception {
  String[] lines = {"Irritable eagle", "Optimistic jay", "Fanciful hawk"};
  File tmpFile = tmpFolder.newFile("test");
  String filename = tmpFile.getPath();

  List<String> expected = new ArrayList<>();
  try (PrintStream writer =
      new PrintStream(new GZIPOutputStream(new FileOutputStream(tmpFile)))) {
    for (String line : lines) {
      writer.println(line);
      expected.add(line);
    }
  }

  Pipeline p = TestPipeline.create();
  TextIO.Read.Bound<String> read =
      TextIO.Read.from(filename).withCompressionType(CompressionType.GZIP);
  PCollection<String> output = p.apply(read);

  DataflowAssert.that(output).containsInAnyOrder(expected);
  p.run();

  tmpFile.delete();
}
@Test
public void testSettingOfSdkPipelineOptions() throws IOException {
  DataflowPipelineOptions options = buildPipelineOptions();
  options.setRunner(DataflowPipelineRunner.class);

  Pipeline p = buildPipeline(options);
  p.traverseTopologically(new RecordingPipelineVisitor());
  Job job = DataflowPipelineTranslator.fromOptions(options)
      .translate(p, Collections.<DataflowPackage>emptyList())
      .getJob();

  // Note that the contents of this materialized map may be changed by the act of reading an
  // option, which will cause the default to get materialized whereas it would otherwise be
  // left absent. It is permissible to simply alter this test to reflect current behavior.
  assertEquals(
      ImmutableMap.of("options",
          ImmutableMap.builder()
              .put("appName", "DataflowPipelineTranslatorTest")
              .put("project", "some-project")
              .put("pathValidatorClass",
                  "com.google.cloud.dataflow.sdk.util.DataflowPathValidator")
              .put("runner", "com.google.cloud.dataflow.sdk.runners.DataflowPipelineRunner")
              .put("jobName", "some-job-name")
              .put("tempLocation", "gs://somebucket/some/path")
              .put("stagingLocation", "gs://somebucket/some/path/staging")
              .put("stableUniqueNames", "WARNING")
              .put("streaming", false)
              .put("numberOfWorkerHarnessThreads", 0)
              .build()),
      job.getEnvironment().getSdkPipelineOptions());
}
/** This tests a few corner cases that should not crash. */
@Test
public void testGoodWildcards() throws Exception {
  TestDataflowPipelineOptions options = buildTestPipelineOptions();
  options.setGcsUtil(buildMockGcsUtil());

  Pipeline pipeline = Pipeline.create(options);
  applyRead(pipeline, "gs://bucket/foo");
  applyRead(pipeline, "gs://bucket/foo/");
  applyRead(pipeline, "gs://bucket/foo/*");
  applyRead(pipeline, "gs://bucket/foo/?");
  applyRead(pipeline, "gs://bucket/foo/[0-9]");
  applyRead(pipeline, "gs://bucket/foo/*baz*");
  applyRead(pipeline, "gs://bucket/foo/*baz?");
  applyRead(pipeline, "gs://bucket/foo/[0-9]baz?");
  applyRead(pipeline, "gs://bucket/foo/baz/*");
  applyRead(pipeline, "gs://bucket/foo/baz/*wonka*");
  applyRead(pipeline, "gs://bucket/foo/*baz/wonka*");
  applyRead(pipeline, "gs://bucket/foo*/baz");
  applyRead(pipeline, "gs://bucket/foo?/baz");
  applyRead(pipeline, "gs://bucket/foo[0-9]/baz");

  // Check that running doesn't fail.
  pipeline.run();
}
/**
 * Applies {@code ApproximateUnique(sampleSize)} verifying that the estimation error falls within
 * the maximum allowed error of {@code 2/sqrt(sampleSize)}.
 */
private void runApproximateUniquePipeline(int sampleSize) {
  Pipeline p = TestPipeline.create();

  PCollection<String> collection = readPCollection(p);

  final PCollectionView<Long> exact = collection
      .apply(RemoveDuplicates.<String>create())
      .apply(Combine.globally(new CountElements<String>()))
      .apply(View.<Long>asSingleton());

  PCollection<Long> approximate =
      collection.apply(ApproximateUnique.<String>globally(sampleSize));

  PCollection<KV<Long, Long>> approximateAndExact = approximate
      .apply(ParDo.of(new DoFn<Long, KV<Long, Long>>() {
        @Override
        public void processElement(ProcessContext c) {
          c.output(KV.of(c.element(), c.sideInput(exact)));
        }
      }).withSideInputs(exact));

  DataflowAssert.that(approximateAndExact).satisfies(new VerifyEstimatePerKeyFn(sampleSize));

  p.run();
}
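// The 2/sqrt(sampleSize) bound above is what the VerifyEstimatePerKeyFn assertion enforces.
// Below is a minimal sketch of that check, assuming the error is measured as a fraction of
// the exact count; the method name and shape here are hypothetical (the real verifier is
// defined elsewhere in the suite), and assertTrue is JUnit's org.junit.Assert.assertTrue.
private static void verifyEstimate(long exact, long estimate, int sampleSize) {
  // Maximum allowed fractional error for a sample of this size.
  double maxError = 2.0 / Math.sqrt(sampleSize);
  double error = Math.abs(estimate - exact) / (double) exact;
  assertTrue(
      "Estimate " + estimate + " deviates from exact count " + exact
          + " by more than " + maxError,
      error <= maxError);
}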
@Test(dataProvider = "bases") public void addContextDataTest( List<GATKRead> reads, List<Variant> variantList, List<KV<GATKRead, ReferenceBases>> kvReadRefBases, List<KV<GATKRead, ReadContextData>> kvReadContextData, List<SimpleInterval> intervals, List<KV<GATKRead, Iterable<Variant>>> kvReadiVariant) { Pipeline p = GATKTestPipeline.create(); DataflowUtils.registerGATKCoders(p); PCollection<GATKRead> pReads = DataflowTestUtils.pCollectionCreateAndVerify(p, reads, new GATKReadCoder()); PCollection<KV<GATKRead, ReferenceBases>> pReadRef = DataflowTestUtils.pCollectionCreateAndVerify( p, kvReadRefBases, KvCoder.of(new GATKReadCoder(), SerializableCoder.of(ReferenceBases.class))); PCollection<KV<GATKRead, Iterable<Variant>>> pReadVariants = p.apply( Create.of(kvReadiVariant) .withCoder(KvCoder.of(new GATKReadCoder(), IterableCoder.of(new VariantCoder())))); PCollection<KV<GATKRead, ReadContextData>> joinedResults = AddContextDataToRead.join(pReads, pReadRef, pReadVariants); PCollection<KV<GATKRead, ReadContextData>> pkvReadContextData = p.apply( Create.of(kvReadContextData) .withCoder(KvCoder.of(new GATKReadCoder(), new ReadContextDataCoder()))); DataflowTestUtils.keyReadContextDataMatcher(joinedResults, pkvReadContextData); p.run(); }
<T> void runTestRead(T[] expected, Coder<T> coder) throws Exception {
  File tmpFile = tmpFolder.newFile("file.txt");
  String filename = tmpFile.getPath();
  try (PrintStream writer = new PrintStream(new FileOutputStream(tmpFile))) {
    for (T elem : expected) {
      byte[] encodedElem = CoderUtils.encodeToByteArray(coder, elem);
      String line = new String(encodedElem);
      writer.println(line);
    }
  }

  Pipeline p = TestPipeline.create();

  TextIO.Read.Bound<T> read;
  if (coder.equals(StringUtf8Coder.of())) {
    TextIO.Read.Bound<String> readStrings = TextIO.Read.from(filename);
    // T == String
    read = (TextIO.Read.Bound<T>) readStrings;
  } else {
    read = TextIO.Read.from(filename).withCoder(coder);
  }

  PCollection<T> output = p.apply(read);

  DataflowAssert.that(output).containsInAnyOrder(expected);
  p.run();
}
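// A sketch of how runTestRead might be driven. The test name, element values, and the
// choice of TextualIntegerCoder are illustrative assumptions, not the suite's actual data.
@Test
public void testReadStringsAndInts() throws Exception {
  runTestRead(new String[] {"a", "b", "c"}, StringUtf8Coder.of()); // T == String branch
  runTestRead(new Integer[] {1, 2, 3}, TextualIntegerCoder.of()); // withCoder(coder) branch
}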
@Test @SuppressWarnings("unchecked") public void testTop() { Pipeline p = TestPipeline.create(); PCollection<String> input = p.apply(Create.of(Arrays.asList(COLLECTION)).withCoder(StringUtf8Coder.of())); PCollection<List<String>> top1 = input.apply(Top.of(1, new OrderByLength())); PCollection<List<String>> top2 = input.apply(Top.<String>largest(2)); PCollection<List<String>> top3 = input.apply(Top.<String>smallest(3)); PCollection<KV<String, Integer>> inputTable = createInputTable(p); PCollection<KV<String, List<Integer>>> largestPerKey = inputTable.apply(Top.<String, Integer>largestPerKey(2)); PCollection<KV<String, List<Integer>>> smallestPerKey = inputTable.apply(Top.<String, Integer>smallestPerKey(2)); DataflowAssert.thatSingletonIterable(top1).containsInAnyOrder(Arrays.asList("bb")); DataflowAssert.thatSingletonIterable(top2).containsInAnyOrder("z", "c"); DataflowAssert.thatSingletonIterable(top3).containsInAnyOrder("a", "bb", "c"); DataflowAssert.that(largestPerKey) .containsInAnyOrder(KV.of("a", Arrays.asList(3, 2)), KV.of("b", Arrays.asList(100, 10))); DataflowAssert.that(smallestPerKey) .containsInAnyOrder(KV.of("a", Arrays.asList(1, 2)), KV.of("b", Arrays.asList(1, 10))); p.run(); }
@SuppressWarnings("unchecked") @Before public void setup() { MockitoAnnotations.initMocks(this); when(underTest.getOptions()).thenReturn(cbtOptions); when(underTest.getCoderRegistry()).thenReturn(registry); when(cbtOptions.as(any(Class.class))).thenReturn(cbtOptions); CloudBigtableIO.initializeForWrite(underTest); }
/** Recursive wildcards are not supported. This tests "**". */
@Test
public void testBadWildcardRecursive() throws Exception {
  Pipeline pipeline = TestPipeline.create();
  pipeline.apply(TextIO.Read.from("gs://bucket/foo**/baz"));

  // Check that running does fail.
  expectedException.expect(IllegalArgumentException.class);
  expectedException.expectMessage("wildcard");
  pipeline.run();
}
@Test
public void testPartiallyBoundFailure() throws IOException {
  Pipeline p = DataflowPipeline.create(buildPipelineOptions());

  PCollection<Integer> input = p.begin().apply(Create.of(1, 2, 3));

  thrown.expect(IllegalStateException.class);
  input.apply(new PartiallyBoundOutputCreator());

  Assert.fail("Failure expected from use of partially bound output");
}
@Test
public void testCountConstraint() {
  Pipeline p = TestPipeline.create();
  PCollection<String> input =
      p.apply(Create.of(Arrays.asList(COLLECTION)).withCoder(StringUtf8Coder.of()));

  expectedEx.expect(IllegalArgumentException.class);
  expectedEx.expectMessage(Matchers.containsString(">= 0"));

  input.apply(Top.of(-1, new OrderByLength()));
}
public static void main(String[] args) throws Exception {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline pipeline = Pipeline.create(options);
  pipeline.getCoderRegistry().registerCoder(URI.class, StringDelegateCoder.of(URI.class));

  pipeline
      .apply(new ReadDocuments(listInputDocuments(options)))
      .apply(new ComputeTfIdf())
      .apply(new WriteTfIdf(options.getOutput()));

  pipeline.run();
}
// This is a purely compile-time test. If the code compiles, then it worked.
@Test
public void testPerKeySerializabilityRequirement() {
  Pipeline p = TestPipeline.create();
  p.apply("CreateCollection",
      Create.of(Arrays.asList(COLLECTION)).withCoder(StringUtf8Coder.of()));

  PCollection<KV<String, Integer>> inputTable = createInputTable(p);
  inputTable.apply(Top.<String, Integer, IntegerComparator>perKey(1, new IntegerComparator()));
  inputTable.apply("PerKey2",
      Top.<String, Integer, IntegerComparator2>perKey(1, new IntegerComparator2()));
}
@Test
@Category(RunnableOnService.class)
public void testApproximateUniqueWithSmallInput() {
  Pipeline p = TestPipeline.create();

  PCollection<Integer> input = p.apply(Create.of(Arrays.asList(1, 2, 3, 3)));

  PCollection<Long> estimate = input.apply(ApproximateUnique.<Integer>globally(1000));

  DataflowAssert.thatSingleton(estimate).isEqualTo(3L);

  p.run();
}
/** Recursive wildcards are not supported. This tests "**". */
@Test
public void testBadWildcardRecursive() throws Exception {
  DataflowPipelineOptions options = buildPipelineOptions();
  Pipeline pipeline = DataflowPipeline.create(options);
  DataflowPipelineTranslator t = DataflowPipelineTranslator.fromOptions(options);

  pipeline.apply(TextIO.Read.from("gs://bucket/foo**/baz"));

  // Check that translation does fail.
  thrown.expect(IllegalArgumentException.class);
  thrown.expectMessage("Unsupported wildcard usage");
  t.translate(pipeline, Collections.<DataflowPackage>emptyList());
}
@Test
public void testScalingAlgorithmMissing() throws IOException {
  DataflowPipelineOptions options = buildPipelineOptions();

  Pipeline p = buildPipeline(options);
  p.traverseTopologically(new RecordingPipelineVisitor());
  Job job = DataflowPipelineTranslator.fromOptions(options)
      .translate(p, Collections.<DataflowPackage>emptyList())
      .getJob();

  assertEquals(1, job.getEnvironment().getWorkerPools().size());
  assertNull(job.getEnvironment().getWorkerPools().get(0).getAutoscalingSettings());
}
/** Sets up and starts streaming pipeline. */
public static void main(String[] args) {
  PubsubFileInjectorOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(PubsubFileInjectorOptions.class);

  Pipeline pipeline = Pipeline.create(options);

  pipeline
      .apply(TextIO.Read.from(options.getInput()))
      .apply(IntraBundleParallelization.of(PubsubFileInjector.publish(options.getOutputTopic()))
          .withMaxParallelism(20));

  pipeline.run();
}
@Override
protected void testProgram() throws Exception {
  Pipeline p = FlinkTestPipeline.create();

  PCollection<TableRow> input1 = p.apply(Create.of(EVENT_ARRAY));
  PCollection<TableRow> input2 = p.apply(Create.of(CC_ARRAY));

  PCollection<String> output = JoinExamples.joinEvents(input1, input2);

  output.apply(TextIO.Write.to(resultPath));

  p.run();
}
@Test
public void testUnsupportedFilePattern() throws IOException {
  File outFolder = tmpFolder.newFolder();
  // Windows doesn't like resolving paths with * in them.
  String filename = outFolder.toPath().resolve("output@5").toString();

  Pipeline p = TestPipeline.create();

  PCollection<String> input =
      p.apply(Create.of(Arrays.asList(LINES_ARRAY)).withCoder(StringUtf8Coder.of()));

  expectedException.expect(IllegalArgumentException.class);
  expectedException.expectMessage("Output name components are not allowed to contain");
  input.apply(TextIO.Write.to(filename));
}
/**
 * Runs the batch injector for the streaming pipeline.
 *
 * <p>The injector pipeline will read from the given text file, and inject data into the Google
 * Cloud Pub/Sub topic.
 */
public void runInjectorPipeline(String inputFile, String topic) {
  DataflowPipelineOptions copiedOptions = options.cloneAs(DataflowPipelineOptions.class);
  copiedOptions.setStreaming(false);
  copiedOptions.setNumWorkers(options.as(ExamplePubsubTopicOptions.class).getInjectorNumWorkers());
  copiedOptions.setJobName(options.getJobName() + "-injector");

  Pipeline injectorPipeline = Pipeline.create(copiedOptions);
  injectorPipeline
      .apply(TextIO.Read.from(inputFile))
      .apply(IntraBundleParallelization.of(PubsubFileInjector.publish(topic))
          .withMaxParallelism(20));

  DataflowPipelineJob injectorJob = (DataflowPipelineJob) injectorPipeline.run();
  jobsToCancel.add(injectorJob);
}
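// An illustrative call site for the helper above; both arguments are placeholders,
// not paths or topics used by the actual examples:
// runInjectorPipeline("gs://my-bucket/input.txt", "projects/my-project/topics/my-topic");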
public static void main(String[] args) {
  PipelineOptionsFactory.register(KafkaStreamingWordCountOptions.class);
  KafkaStreamingWordCountOptions options =
      PipelineOptionsFactory.fromArgs(args).as(KafkaStreamingWordCountOptions.class);
  options.setJobName("KafkaExample - WindowSize: " + options.getWindowSize() + " seconds");
  options.setStreaming(true);
  options.setCheckpointingInterval(1000L);
  options.setNumberOfExecutionRetries(5);
  options.setExecutionRetryDelay(3000L);
  options.setRunner(FlinkPipelineRunner.class);

  System.out.println(options.getKafkaTopic() + " " + options.getZookeeper() + " "
      + options.getBroker() + " " + options.getGroup());
  Pipeline pipeline = Pipeline.create(options);

  Properties p = new Properties();
  p.setProperty("zookeeper.connect", options.getZookeeper());
  p.setProperty("bootstrap.servers", options.getBroker());
  p.setProperty("group.id", options.getGroup());

  // This is the Flink consumer that reads the input to
  // the program from a Kafka topic.
  FlinkKafkaConsumer08<String> kafkaConsumer =
      new FlinkKafkaConsumer08<>(options.getKafkaTopic(), new SimpleStringSchema(), p);

  PCollection<String> words = pipeline
      .apply(Read.from(new UnboundedFlinkSource<>(kafkaConsumer)).named("StreamingWordCount"))
      .apply(ParDo.of(new ExtractWordsFn()))
      .apply(Window.<String>into(
              FixedWindows.of(Duration.standardSeconds(options.getWindowSize())))
          .triggering(AfterWatermark.pastEndOfWindow())
          .withAllowedLateness(Duration.ZERO)
          .discardingFiredPanes());

  PCollection<KV<String, Long>> wordCounts = words.apply(Count.<String>perElement());

  wordCounts
      .apply(ParDo.of(new FormatAsStringFn()))
      .apply(TextIO.Write.to("./outputKafka.txt"));

  pipeline.run();
}
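// The pipeline above references ExtractWordsFn and FormatAsStringFn without showing them.
// Below is a minimal sketch of the formatting DoFn, assuming it simply renders each
// (word, count) pair as one output line; the real class is defined alongside the example,
// so this shape is an assumption rather than the example's actual implementation.
static class FormatAsStringFn extends DoFn<KV<String, Long>, String> {
  private static final long serialVersionUID = 1L;

  @Override
  public void processElement(ProcessContext c) {
    // One line per (word, count) pair, suitable for TextIO.Write.
    c.output(c.element().getKey() + ": " + c.element().getValue());
  }
}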
@Test
public void testTopEmptyWithIncompatibleWindows() {
  Pipeline p = TestPipeline.create();
  Bound<String> windowingFn = Window.<String>into(FixedWindows.of(Duration.standardDays(10L)));
  PCollection<String> input = p
      .apply(Create.timestamped(Collections.<String>emptyList(), Collections.<Long>emptyList()))
      .apply(windowingFn);

  expectedEx.expect(IllegalStateException.class);
  expectedEx.expectMessage("Top");
  expectedEx.expectMessage("GlobalWindows");
  expectedEx.expectMessage("withoutDefaults");
  expectedEx.expectMessage("asSingletonView");

  input.apply(Top.of(1, new OrderByLength()));
}
@Test
public void testMultiGraphPipelineSerialization() throws IOException {
  Pipeline p = DataflowPipeline.create(buildPipelineOptions());

  PCollection<Integer> input = p.begin().apply(Create.of(1, 2, 3));

  input.apply(new UnrelatedOutputCreator());
  input.apply(new UnboundOutputCreator());

  DataflowPipelineTranslator t = DataflowPipelineTranslator.fromOptions(
      PipelineOptionsFactory.as(DataflowPipelineOptions.class));

  // Check that translation doesn't fail.
  t.translate(p, Collections.<DataflowPackage>emptyList());
}
@Test
public void testZoneConfig() throws IOException {
  final String testZone = "test-zone-1";

  DataflowPipelineOptions options = buildPipelineOptions();
  options.setZone(testZone);

  Pipeline p = buildPipeline(options);
  p.traverseTopologically(new RecordingPipelineVisitor());
  Job job = DataflowPipelineTranslator.fromOptions(options)
      .translate(p, Collections.<DataflowPackage>emptyList())
      .getJob();

  assertEquals(1, job.getEnvironment().getWorkerPools().size());
  assertEquals(testZone, job.getEnvironment().getWorkerPools().get(0).getZone());
}
@Test
public void testDiskSizeGbConfig() throws IOException {
  final Integer diskSizeGb = 1234;

  DataflowPipelineOptions options = buildPipelineOptions();
  options.setDiskSizeGb(diskSizeGb);

  Pipeline p = buildPipeline(options);
  p.traverseTopologically(new RecordingPipelineVisitor());
  Job job = DataflowPipelineTranslator.fromOptions(options)
      .translate(p, Collections.<DataflowPackage>emptyList())
      .getJob();

  assertEquals(1, job.getEnvironment().getWorkerPools().size());
  assertEquals(diskSizeGb, job.getEnvironment().getWorkerPools().get(0).getDiskSizeGb());
}
private void runApproximateUniqueWithDuplicates(
    int elementCount, int uniqueCount, int sampleSize) {
  assert elementCount >= uniqueCount;
  List<Double> elements = Lists.newArrayList();
  for (int i = 0; i < elementCount; i++) {
    elements.add(1.0 / (i % uniqueCount + 1));
  }
  Collections.shuffle(elements);

  Pipeline p = TestPipeline.create();
  PCollection<Double> input = p.apply(Create.of(elements));
  PCollection<Long> estimate = input.apply(ApproximateUnique.<Double>globally(sampleSize));

  DataflowAssert.thatSingleton(estimate).satisfies(new VerifyEstimateFn(uniqueCount, sampleSize));

  p.run();
}
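// An illustrative invocation of the helper above (the parameter values are an example,
// not the suite's actual data points): 1000 elements drawn from 100 distinct values,
// estimated from a sample of 100.
// runApproximateUniqueWithDuplicates(1000, 100, 100);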
@Test
public void testReadNamed() {
  Pipeline p = TestPipeline.create();

  {
    PCollection<String> output1 = p.apply(TextIO.Read.from("/tmp/file.txt"));
    assertEquals("TextIO.Read.out", output1.getName());
  }

  {
    PCollection<String> output2 = p.apply(TextIO.Read.named("MyRead").from("/tmp/file.txt"));
    assertEquals("MyRead.out", output2.getName());
  }

  {
    PCollection<String> output3 = p.apply(TextIO.Read.from("/tmp/file.txt").named("HerRead"));
    assertEquals("HerRead.out", output3.getName());
  }
}