public static void main(String[] args) {
  PipelineOptionsFactory.register(KafkaStreamingWordCountOptions.class);
  KafkaStreamingWordCountOptions options =
      PipelineOptionsFactory.fromArgs(args).as(KafkaStreamingWordCountOptions.class);
  options.setJobName("KafkaExample - WindowSize: " + options.getWindowSize() + " seconds");
  options.setStreaming(true);
  options.setCheckpointingInterval(1000L);
  options.setNumberOfExecutionRetries(5);
  options.setExecutionRetryDelay(3000L);
  options.setRunner(FlinkPipelineRunner.class);

  System.out.println(
      options.getKafkaTopic() + " " + options.getZookeeper() + " "
          + options.getBroker() + " " + options.getGroup());

  Pipeline pipeline = Pipeline.create(options);

  Properties p = new Properties();
  p.setProperty("zookeeper.connect", options.getZookeeper());
  p.setProperty("bootstrap.servers", options.getBroker());
  p.setProperty("group.id", options.getGroup());

  // This is the Flink consumer that reads the input to
  // the program from a Kafka topic.
  FlinkKafkaConsumer08<String> kafkaConsumer =
      new FlinkKafkaConsumer08<>(options.getKafkaTopic(), new SimpleStringSchema(), p);

  PCollection<String> words = pipeline
      .apply(Read.from(new UnboundedFlinkSource<>(kafkaConsumer)).named("StreamingWordCount"))
      .apply(ParDo.of(new ExtractWordsFn()))
      .apply(Window.<String>into(
              FixedWindows.of(Duration.standardSeconds(options.getWindowSize())))
          .triggering(AfterWatermark.pastEndOfWindow())
          .withAllowedLateness(Duration.ZERO)
          .discardingFiredPanes());

  PCollection<KV<String, Long>> wordCounts = words.apply(Count.<String>perElement());

  wordCounts
      .apply(ParDo.of(new FormatAsStringFn()))
      .apply(TextIO.Write.to("./outputKafka.txt"));

  pipeline.run();
}
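// The example above assumes a KafkaStreamingWordCountOptions interface that has been
// registered with PipelineOptionsFactory. A minimal sketch of such an interface is shown
// below; the defaults, descriptions, and the FlinkPipelineOptions base interface (assumed
// here to supply the setStreaming/setCheckpointingInterval/... setters used in main) are
// illustrative assumptions, not copied from the original source.
public interface KafkaStreamingWordCountOptions extends FlinkPipelineOptions {
  @Description("The Kafka topic to read from")
  @Default.String("test")
  String getKafkaTopic();
  void setKafkaTopic(String value);

  @Description("The Zookeeper quorum, as host:port")
  @Default.String("localhost:2181")
  String getZookeeper();
  void setZookeeper(String value);

  @Description("The Kafka broker list, as host:port")
  @Default.String("localhost:9092")
  String getBroker();
  void setBroker(String value);

  @Description("The Kafka consumer group id")
  @Default.String("myGroup")
  String getGroup();
  void setGroup(String value);

  @Description("Fixed window size, in seconds")
  @Default.Integer(10)
  Integer getWindowSize();
  void setWindowSize(Integer value);
}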
@Test
public void testReadEmptyCollectionSideInput() throws Exception {
  SideInputInfo sideInputInfo = createCollectionSideInputInfo(createSideInputSource());

  assertThatContains(
      SideInputUtils.readSideInput(
          PipelineOptionsFactory.create(), sideInputInfo, new BatchModeExecutionContext()));
}
@Test
public void testReadSingletonSideInput() throws Exception {
  SideInputInfo sideInputInfo = createSingletonSideInputInfo(createSideInputSource(42));

  assertEquals(
      42,
      SideInputUtils.readSideInput(
          PipelineOptionsFactory.create(), sideInputInfo, new BatchModeExecutionContext()));
}
@Test
public void testCreateNormalParDoFn() throws Exception {
  String stringState = "some state";
  long longState = 42L;

  TestDoFn fn = new TestDoFn(stringState, longState);

  String serializedFn =
      StringUtils.byteArrayToJsonString(
          SerializableUtils.serializeToByteArray(
              new DoFnInfo(fn, WindowingStrategy.globalDefault())));

  CloudObject cloudUserFn = CloudObject.forClassName("DoFn");
  addString(cloudUserFn, "serialized_fn", serializedFn);

  String tag = "output";
  MultiOutputInfo multiOutputInfo = new MultiOutputInfo();
  multiOutputInfo.setTag(tag);
  List<MultiOutputInfo> multiOutputInfos = Arrays.asList(multiOutputInfo);

  PipelineOptions options = PipelineOptionsFactory.create();
  DataflowExecutionContext context = BatchModeExecutionContext.fromOptions(options);
  CounterSet counters = new CounterSet();
  StateSampler stateSampler = new StateSampler("test", counters.getAddCounterMutator());
  ParDoFn parDoFn =
      factory.create(
          options,
          cloudUserFn,
          "name",
          "transformName",
          null,
          multiOutputInfos,
          1,
          context,
          counters.getAddCounterMutator(),
          stateSampler);

  // Test that the factory created the correct class.
  assertThat(parDoFn, instanceOf(NormalParDoFn.class));

  // Test that the DoFnInfo reflects the one passed in.
  NormalParDoFn normalParDoFn = (NormalParDoFn) parDoFn;
  DoFnInfo doFnInfo = normalParDoFn.getDoFnInfo();
  DoFn actualDoFn = doFnInfo.getDoFn();
  assertThat(actualDoFn, instanceOf(TestDoFn.class));
  assertThat(doFnInfo.getWindowingStrategy().getWindowFn(), instanceOf(GlobalWindows.class));
  assertThat(
      doFnInfo.getWindowingStrategy().getTrigger().getSpec(), instanceOf(DefaultTrigger.class));

  // Test that the deserialized user DoFn is as expected.
  TestDoFn actualTestDoFn = (TestDoFn) actualDoFn;
  assertEquals(stringState, actualTestDoFn.stringState);
  assertEquals(longState, actualTestDoFn.longState);
  assertEquals(context, normalParDoFn.getExecutionContext());
}
@Test
public void testSplitsWithSmallBlocks() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();

  // Test reading from an object file with many small random-sized blocks.
  List<Bird> expected = createRandomRecords(DEFAULT_RECORD_COUNT);
  String filename =
      generateTestFile(
          "tmp.avro",
          expected,
          SyncBehavior.SYNC_RANDOM,
          100 /* max records/block */,
          AvroCoder.of(Bird.class),
          DataFileConstants.NULL_CODEC);
  File file = new File(filename);

  // Small minimum bundle size.
  AvroSource<Bird> source =
      AvroSource.from(filename).withSchema(Bird.class).withMinBundleSize(100L);

  // Assert that the source produces the expected records.
  assertEquals(expected, SourceTestUtils.readFromSource(source, options));

  List<? extends BoundedSource<Bird>> splits;
  int nonEmptySplits;

  // Split with the minimum bundle size.
  splits = source.splitIntoBundles(100L, options);
  assertTrue(splits.size() > 2);
  SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options);
  nonEmptySplits = 0;
  for (BoundedSource<Bird> subSource : splits) {
    if (SourceTestUtils.readFromSource(subSource, options).size() > 0) {
      nonEmptySplits += 1;
    }
  }
  assertTrue(nonEmptySplits > 2);

  // Split with a larger bundle size.
  splits = source.splitIntoBundles(file.length() / 4, options);
  assertTrue(splits.size() > 2);
  SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options);
  nonEmptySplits = 0;
  for (BoundedSource<Bird> subSource : splits) {
    if (SourceTestUtils.readFromSource(subSource, options).size() > 0) {
      nonEmptySplits += 1;
    }
  }
  assertTrue(nonEmptySplits > 2);

  // Split with the file length.
  splits = source.splitIntoBundles(file.length(), options);
  assertTrue(splits.size() == 1);
  SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options);
}
public static void main(String[] args) throws Exception {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline pipeline = Pipeline.create(options);
  pipeline.getCoderRegistry().registerCoder(URI.class, StringDelegateCoder.of(URI.class));

  pipeline
      .apply(new ReadDocuments(listInputDocuments(options)))
      .apply(new ComputeTfIdf())
      .apply(new WriteTfIdf(options.getOutput()));

  pipeline.run();
}
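// The TF-IDF driver above reads its configuration from an Options interface. A minimal
// sketch is shown below; the property names follow the options.getOutput() call and the
// listInputDocuments(options) helper used in main, while the descriptions and the
// @Validation.Required constraint (enforced by withValidation()) are illustrative
// assumptions rather than a copy of the original interface.
public interface Options extends PipelineOptions {
  @Description("Directory or GCS prefix containing the documents to process")
  String getInput();
  void setInput(String value);

  @Description("Prefix of output URIs to write to")
  @Validation.Required
  String getOutput();
  void setOutput(String value);
}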
@Test
public void testCreateUnknownParDoFn() throws Exception {
  CloudObject cloudUserFn = CloudObject.forClassName("UnknownKindOfDoFn");
  try {
    CounterSet counters = new CounterSet();
    StateSampler stateSampler = new StateSampler("test", counters.getAddCounterMutator());
    factory.create(
        PipelineOptionsFactory.create(),
        cloudUserFn,
        "name",
        "transformName",
        null,
        null,
        1,
        BatchModeExecutionContext.fromOptions(PipelineOptionsFactory.create()),
        counters.getAddCounterMutator(),
        stateSampler);
    fail("should have thrown an exception");
  } catch (Exception exn) {
    assertThat(exn.toString(), Matchers.containsString("No known ParDoFnFactory"));
  }
}
/** Sets up and starts the streaming pipeline. */
public static void main(String[] args) {
  PubsubFileInjectorOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(PubsubFileInjectorOptions.class);

  Pipeline pipeline = Pipeline.create(options);

  pipeline
      .apply(TextIO.Read.from(options.getInput()))
      .apply(
          IntraBundleParallelization.of(PubsubFileInjector.publish(options.getOutputTopic()))
              .withMaxParallelism(20));

  pipeline.run();
}
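// PubsubFileInjectorOptions is assumed to look roughly like the sketch below: an input
// file pattern read with TextIO and a Cloud Pub/Sub topic to publish to. The property
// names follow the getInput()/getOutputTopic() calls above; the annotations and
// descriptions are illustrative assumptions.
public interface PubsubFileInjectorOptions extends PipelineOptions {
  @Description("Input file pattern to read from")
  @Validation.Required
  String getInput();
  void setInput(String value);

  @Description("Cloud Pub/Sub topic to publish to")
  @Validation.Required
  String getOutputTopic();
  void setOutputTopic(String value);
}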
private static DataflowPipelineOptions buildPipelineOptions() throws IOException {
  GcsUtil mockGcsUtil = mock(GcsUtil.class);
  when(mockGcsUtil.bucketExists(any(GcsPath.class))).thenReturn(true);
  when(mockGcsUtil.isGcsPatternSupported(anyString())).thenCallRealMethod();

  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  options.setGcpCredential(new TestCredential());
  options.setJobName("some-job-name");
  options.setProject("some-project");
  options.setTempLocation(GcsPath.fromComponents("somebucket", "some/path").toString());
  options.setFilesToStage(new LinkedList<String>());
  options.setDataflowClient(buildMockDataflow(new IsValidCreateRequest()));
  options.setGcsUtil(mockGcsUtil);
  return options;
}
@Test
public void testMultiGraphPipelineSerialization() throws IOException {
  Pipeline p = DataflowPipeline.create(buildPipelineOptions());

  PCollection<Integer> input = p.begin().apply(Create.of(1, 2, 3));

  input.apply(new UnrelatedOutputCreator());
  input.apply(new UnboundOutputCreator());

  DataflowPipelineTranslator t =
      DataflowPipelineTranslator.fromOptions(
          PipelineOptionsFactory.as(DataflowPipelineOptions.class));

  // Check that translation doesn't fail.
  t.translate(p, Collections.<DataflowPackage>emptyList());
}
@SuppressWarnings("rawtypes") private static ParDoFn createCombineValuesFn(String phase, Combine.KeyedCombineFn combineFn) throws Exception { // This partially mirrors the work that // com.google.cloud.dataflow.sdk.transforms.Combine.translateHelper // does, at least for the KeyedCombineFn. The phase is generated // by the back-end. CloudObject spec = CloudObject.forClassName("CombineValuesFn"); addString( spec, PropertyNames.SERIALIZED_FN, byteArrayToJsonString(serializeToByteArray(combineFn))); addString(spec, PropertyNames.PHASE, phase); return parDoFnFactory.create( PipelineOptionsFactory.create(), spec, "name", "transformName", null, // no side inputs null, // no side outputs 1, // single main output DataflowExecutionContext.withoutSideInputs(), (new CounterSet()).getAddCounterMutator(), null); }
private TestDataflowPipelineOptions buildTestPipelineOptions() {
  TestDataflowPipelineOptions options =
      PipelineOptionsFactory.as(TestDataflowPipelineOptions.class);
  options.setGcpCredential(new TestCredential());
  return options;
}