public static void main(String[] args) {
  PipelineOptionsFactory.register(KafkaStreamingWordCountOptions.class);
  KafkaStreamingWordCountOptions options =
      PipelineOptionsFactory.fromArgs(args).as(KafkaStreamingWordCountOptions.class);
  options.setJobName("KafkaExample - WindowSize: " + options.getWindowSize() + " seconds");
  options.setStreaming(true);
  options.setCheckpointingInterval(1000L);
  options.setNumberOfExecutionRetries(5);
  options.setExecutionRetryDelay(3000L);
  options.setRunner(FlinkPipelineRunner.class);

  System.out.println(options.getKafkaTopic() + " " + options.getZookeeper() + " "
      + options.getBroker() + " " + options.getGroup());

  Pipeline pipeline = Pipeline.create(options);

  Properties p = new Properties();
  p.setProperty("zookeeper.connect", options.getZookeeper());
  p.setProperty("bootstrap.servers", options.getBroker());
  p.setProperty("group.id", options.getGroup());

  // this is the Flink consumer that reads the input to
  // the program from a kafka topic.
  FlinkKafkaConsumer08<String> kafkaConsumer =
      new FlinkKafkaConsumer08<>(options.getKafkaTopic(), new SimpleStringSchema(), p);

  PCollection<String> words = pipeline
      .apply(Read.from(new UnboundedFlinkSource<>(kafkaConsumer)).named("StreamingWordCount"))
      .apply(ParDo.of(new ExtractWordsFn()))
      .apply(Window.<String>into(
              FixedWindows.of(Duration.standardSeconds(options.getWindowSize())))
          .triggering(AfterWatermark.pastEndOfWindow())
          .withAllowedLateness(Duration.ZERO)
          .discardingFiredPanes());

  PCollection<KV<String, Long>> wordCounts = words.apply(Count.<String>perElement());

  wordCounts
      .apply(ParDo.of(new FormatAsStringFn()))
      .apply(TextIO.Write.to("./outputKafka.txt"));

  pipeline.run();
}
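// The pipeline above references ExtractWordsFn and FormatAsStringFn without
// defining them. Below is a minimal sketch of both, modeled on the canonical
// WordCount DoFns of this SDK; the bodies in the original source may differ.
static class ExtractWordsFn extends DoFn<String, String> {
  @Override
  public void processElement(ProcessContext c) {
    // Split the line on runs of non-letter characters and drop empty tokens.
    for (String word : c.element().split("[^a-zA-Z']+")) {
      if (!word.isEmpty()) {
        c.output(word);
      }
    }
  }
}

static class FormatAsStringFn extends DoFn<KV<String, Long>, String> {
  @Override
  public void processElement(ProcessContext c) {
    // Render each word/count pair as "word: count" for TextIO.Write.
    c.output(c.element().getKey() + ": " + c.element().getValue());
  }
}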
/**
 * Applies {@code ApproximateUnique(sampleSize)}, verifying that the estimation error falls
 * within the maximum allowed error of {@code 2 / sqrt(sampleSize)}.
 */
private void runApproximateUniquePipeline(int sampleSize) {
  Pipeline p = TestPipeline.create();
  PCollection<String> collection = readPCollection(p);

  // Compute the exact distinct count once and make it available as a side input.
  final PCollectionView<Long> exact = collection
      .apply(RemoveDuplicates.<String>create())
      .apply(Combine.globally(new CountElements<String>()))
      .apply(View.<Long>asSingleton());

  PCollection<Long> approximate =
      collection.apply(ApproximateUnique.<String>globally(sampleSize));

  PCollection<KV<Long, Long>> approximateAndExact = approximate.apply(
      ParDo.of(new DoFn<Long, KV<Long, Long>>() {
            @Override
            public void processElement(ProcessContext c) {
              // Pair the approximate count with the exact count from the side input.
              c.output(KV.of(c.element(), c.sideInput(exact)));
            }
          })
          .withSideInputs(exact));

  DataflowAssert.that(approximateAndExact).satisfies(new VerifyEstimatePerKeyFn(sampleSize));

  p.run();
}
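// VerifyEstimatePerKeyFn is not shown above. A minimal sketch, assuming it
// enforces the 2/sqrt(sampleSize) relative-error bound stated in the Javadoc
// (Assert is org.junit.Assert; the exact original implementation may differ):
static class VerifyEstimatePerKeyFn
    implements SerializableFunction<Iterable<KV<Long, Long>>, Void> {
  private final int sampleSize;

  VerifyEstimatePerKeyFn(int sampleSize) {
    this.sampleSize = sampleSize;
  }

  @Override
  public Void apply(Iterable<KV<Long, Long>> estimateAndExactPairs) {
    double maxError = 2.0 / Math.sqrt(sampleSize);
    for (KV<Long, Long> pair : estimateAndExactPairs) {
      // Relative error of the estimate (key) against the exact count (value).
      double error = Math.abs(pair.getKey() - pair.getValue()) / (double) pair.getValue();
      Assert.assertTrue("estimation error " + error + " exceeds " + maxError,
          error <= maxError);
    }
    return null;
  }
}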
/** Example test that tests a PTransform by using an in-memory input and inspecting the output. */
@Test
@Category(RunnableOnService.class)
public void testCountWords() throws Exception {
  Pipeline p = TestPipeline.create();

  PCollection<String> input = p.apply(Create.of(WORDS).withCoder(StringUtf8Coder.of()));

  PCollection<String> output = input
      .apply(new CountWords())
      .apply(ParDo.of(new FormatAsTextFn()));

  DataflowAssert.that(output).containsInAnyOrder(COUNTS_ARRAY);
  p.run();
}
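// CountWords is the composite transform under test. A plausible sketch,
// following the canonical WordCount example (the real definition lives
// alongside the example pipeline and may differ):
public static class CountWords
    extends PTransform<PCollection<String>, PCollection<KV<String, Long>>> {
  @Override
  public PCollection<KV<String, Long>> apply(PCollection<String> lines) {
    // Tokenize each line into words, then count occurrences of each word.
    return lines
        .apply(ParDo.of(new ExtractWordsFn()))
        .apply(Count.<String>perElement());
  }
}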
@Override
public PCollection<GATKRead> apply(PCollection<GATKRead> input) {
  return input.apply(
      ParDo.named("ApplyBQSR")
          .of(new DoFnWLog<GATKRead, GATKRead>("ApplyBQSRStub") {
            private static final long serialVersionUID = 1L;

            @Override
            public void processElement(ProcessContext c) throws Exception {
              // Stub: pass each read through unchanged.
              c.output(c.element());
            }
          })
          .withSideInputs(header, recalibrationReport));
}
@Override
public PCollection<RecalibrationTables> apply(PCollection<KV<GATKRead, ReadContextData>> input) {
  return input.apply(
      ParDo.named("BaseRecalibrator")
          .of(new DoFnWLog<KV<GATKRead, ReadContextData>, RecalibrationTables>(
              "BaseRecalibratorStub") {
            private static final long serialVersionUID = 1L;

            @Override
            public void processElement(ProcessContext c) throws Exception {
              // Stub: emit tables built from an empty covariate list for every element.
              c.output(new RecalibrationTables(new StandardCovariateList(
                  new RecalibrationArgumentCollection(), Collections.emptyList())));
            }
          })
          .withSideInputs(header));
}
@Override
public PCollection<KV<ReferenceShard, Iterable<GATKRead>>> apply(PCollection<GATKRead> input) {
  PCollection<KV<ReferenceShard, GATKRead>> keyReadByReferenceShard = input.apply(
      ParDo.of(new DoFn<GATKRead, KV<ReferenceShard, GATKRead>>() {
            private static final long serialVersionUID = 1L;

            @Override
            public void processElement(ProcessContext c) throws Exception {
              // Apply our reference window function to each read before deciding which
              // reference shard it belongs to.
              ReferenceShard shard = ReferenceShard.getShardNumberFromInterval(
                  referenceWindowFunction.apply(c.element()));
              c.output(KV.of(shard, c.element()));
            }
          })
          .named("KeyReadByRefShard"));
  return keyReadByReferenceShard.apply(GroupByKey.<ReferenceShard, GATKRead>create());
}
@Override
public PCollection<T> apply(PCollection<T> in) {
  return in
      .apply(ParDo.named("CreateIndex")
          .of(new DoFn<T, KV<T, Void>>() {
            @Override
            public void processElement(ProcessContext c) {
              // Key each element by itself; the Void value carries no information.
              c.output(KV.of(c.element(), (Void) null));
            }
          }))
      // Combining per key collapses all duplicates of a key into one entry.
      .apply(Combine.<T, Void>perKey(
          new SerializableFunction<Iterable<Void>, Void>() {
            @Override
            public Void apply(Iterable<Void> iter) {
              return null; // ignore input
            }
          }))
      // Keep only the now-distinct keys.
      .apply(Keys.<T>create());
}
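// The transform above implements RemoveDuplicates-style semantics: key by
// element, combine away duplicate keys, keep the keys. A hypothetical usage
// sketch (the pipeline and element values are invented for illustration):
PCollection<String> distinct = pipeline
    .apply(Create.of("a", "b", "a", "c"))
    .apply(RemoveDuplicates.<String>create()); // yields "a", "b", "c" in some order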
@Override
public PCollection<OutputT> apply(PCollection<? extends InputT> input) {
  // Wrap the user's DoFn so that elements within a bundle are processed by
  // up to maxParallelism threads.
  return input.apply(
      ParDo.of(new MultiThreadedIntraBundleProcessingDoFn<>(doFn, maxParallelism)));
}
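// A hypothetical usage sketch: in this SDK the wrapper above is normally
// reached through IntraBundleParallelization. ExpensiveLookupFn and the
// parallelism limit are invented for illustration.
PCollection<String> results = input.apply(
    IntraBundleParallelization.of(new ExpensiveLookupFn()).withMaxParallelism(16));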