private DataflowPipeline buildPipeline(DataflowPipelineOptions options) {
  DataflowPipeline p = DataflowPipeline.create(options);

  p.apply(TextIO.Read.named("ReadMyFile").from("gs://bucket/object"))
      .apply(TextIO.Write.named("WriteMyFile").to("gs://bucket/object"));

  return p;
}
@Override
protected void testProgram() throws Exception {
  Pipeline p = FlinkTestPipeline.create();

  PCollection<TableRow> input1 = p.apply(Create.of(EVENT_ARRAY));
  PCollection<TableRow> input2 = p.apply(Create.of(CC_ARRAY));

  PCollection<String> output = JoinExamples.joinEvents(input1, input2);

  output.apply(TextIO.Write.to(resultPath));

  p.run();
}
public static void main(String[] args) {
  PipelineOptionsFactory.register(KafkaStreamingWordCountOptions.class);
  KafkaStreamingWordCountOptions options =
      PipelineOptionsFactory.fromArgs(args).as(KafkaStreamingWordCountOptions.class);
  options.setJobName("KafkaExample - WindowSize: " + options.getWindowSize() + " seconds");
  options.setStreaming(true);
  options.setCheckpointingInterval(1000L);
  options.setNumberOfExecutionRetries(5);
  options.setExecutionRetryDelay(3000L);
  options.setRunner(FlinkPipelineRunner.class);

  System.out.println(
      options.getKafkaTopic() + " " + options.getZookeeper() + " "
          + options.getBroker() + " " + options.getGroup());

  Pipeline pipeline = Pipeline.create(options);

  Properties p = new Properties();
  p.setProperty("zookeeper.connect", options.getZookeeper());
  p.setProperty("bootstrap.servers", options.getBroker());
  p.setProperty("group.id", options.getGroup());

  // This is the Flink consumer that reads the input to
  // the program from a Kafka topic.
  FlinkKafkaConsumer08<String> kafkaConsumer =
      new FlinkKafkaConsumer08<>(options.getKafkaTopic(), new SimpleStringSchema(), p);

  // Read from Kafka, split lines into words, and window the words into fixed windows
  // of windowSize seconds, emitting a single on-time pane at the watermark.
  PCollection<String> words = pipeline
      .apply(Read.from(new UnboundedFlinkSource<>(kafkaConsumer)).named("StreamingWordCount"))
      .apply(ParDo.of(new ExtractWordsFn()))
      .apply(Window.<String>into(
              FixedWindows.of(Duration.standardSeconds(options.getWindowSize())))
          .triggering(AfterWatermark.pastEndOfWindow())
          .withAllowedLateness(Duration.ZERO)
          .discardingFiredPanes());

  // Count occurrences of each word per window and write the formatted results.
  PCollection<KV<String, Long>> wordCounts = words.apply(Count.<String>perElement());

  wordCounts
      .apply(ParDo.of(new FormatAsStringFn()))
      .apply(TextIO.Write.to("./outputKafka.txt"));

  pipeline.run();
}
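// The pipeline above references ExtractWordsFn and FormatAsStringFn but does not define
// them in this snippet. The following is a minimal sketch of what such DoFns might look
// like, assuming the classic WordCount behavior of splitting lines on non-word characters
// and rendering each count as "word: count"; the example's actual implementations may differ.
static class ExtractWordsFn extends DoFn<String, String> {
  private static final long serialVersionUID = 0;

  @Override
  public void processElement(ProcessContext c) {
    // Split the line into words and emit each non-empty word as its own element.
    for (String word : c.element().split("[^a-zA-Z']+")) {
      if (!word.isEmpty()) {
        c.output(word);
      }
    }
  }
}

static class FormatAsStringFn extends DoFn<KV<String, Long>, String> {
  private static final long serialVersionUID = 0;

  @Override
  public void processElement(ProcessContext c) {
    // Render each per-window count as a single text line.
    c.output(c.element().getKey() + ": " + c.element().getValue());
  }
}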
/** Returns a Step for a DoFn by creating and translating a pipeline. */
private static Step createPredefinedStep() throws Exception {
  DataflowPipelineOptions options = buildPipelineOptions();
  DataflowPipelineTranslator translator = DataflowPipelineTranslator.fromOptions(options);
  DataflowPipeline pipeline = DataflowPipeline.create(options);
  String stepName = "DoFn1";
  pipeline
      .apply(TextIO.Read.named("ReadMyFile").from("gs://bucket/in"))
      .apply(ParDo.of(new NoOpFn()).named(stepName))
      .apply(TextIO.Write.named("WriteMyFile").to("gs://bucket/out"));
  Job job = translator.translate(pipeline, Collections.<DataflowPackage>emptyList()).getJob();

  assertEquals(3, job.getSteps().size());
  Step step = job.getSteps().get(1);
  assertEquals(stepName, getString(step.getProperties(), PropertyNames.USER_NAME));
  return step;
}
@Test
public void testPredefinedAddStep() throws Exception {
  DataflowPipelineOptions options = buildPipelineOptions();

  DataflowPipelineTranslator translator = DataflowPipelineTranslator.fromOptions(options);
  DataflowPipelineTranslator.registerTransformTranslator(
      EmbeddedTransform.class, new EmbeddedTranslator());

  // Create a predefined step using another pipeline
  Step predefinedStep = createPredefinedStep();

  // Create a pipeline that the predefined step will be embedded into
  DataflowPipeline pipeline = DataflowPipeline.create(options);
  pipeline
      .apply(TextIO.Read.named("ReadMyFile").from("gs://bucket/in"))
      .apply(ParDo.of(new NoOpFn()))
      .apply(new EmbeddedTransform(predefinedStep.clone()))
      .apply(TextIO.Write.named("WriteMyFile").to("gs://bucket/out"));
  Job job = translator.translate(pipeline, Collections.<DataflowPackage>emptyList()).getJob();

  List<Step> steps = job.getSteps();
  assertEquals(4, steps.size());

  // The input to the embedded step should match the output of the step before
  Map<String, Object> step1Out = getOutputPortReference(steps.get(1));
  Map<String, Object> step2In =
      getDictionary(steps.get(2).getProperties(), PropertyNames.PARALLEL_INPUT);
  assertEquals(step1Out, step2In);

  // The output from the embedded step should match the input of the step after
  Map<String, Object> step2Out = getOutputPortReference(steps.get(2));
  Map<String, Object> step3In =
      getDictionary(steps.get(3).getProperties(), PropertyNames.PARALLEL_INPUT);
  assertEquals(step2Out, step3In);

  // The step should not have been modified other than remapping the input
  Step predefinedStepClone = predefinedStep.clone();
  Step embeddedStepClone = steps.get(2).clone();
  predefinedStepClone.getProperties().remove(PropertyNames.PARALLEL_INPUT);
  embeddedStepClone.getProperties().remove(PropertyNames.PARALLEL_INPUT);
  assertEquals(predefinedStepClone, embeddedStepClone);
}
@Override
public PDone apply(PCollection<KV<String, KV<URI, Double>>> wordToUriAndTfIdf) {
  return wordToUriAndTfIdf
      .apply(ParDo.named("Format").of(new DoFn<KV<String, KV<URI, Double>>, String>() {
        private static final long serialVersionUID = 0;

        @Override
        public void processElement(ProcessContext c) {
          c.output(String.format("%s,\t%s,\t%f",
              c.element().getKey(),
              c.element().getValue().getKey(),
              c.element().getValue().getValue()));
        }
      }))
      .apply(TextIO.Write.to(output).withSuffix(".csv"));
}
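// For context, this apply() is the body of a PTransform from the TF-IDF example that
// formats (word, (document URI, TF-IDF score)) pairs as CSV lines and writes them out.
// A hypothetical sketch of how such a transform might be wired into a pipeline, assuming
// an upstream transform producing PCollection<KV<String, KV<URI, Double>>> (here called
// ComputeTfIdf) and an --output pipeline option:
//
//   pipeline
//       .apply(new ReadDocuments(listInputDocuments(options)))
//       .apply(new ComputeTfIdf())
//       .apply(new WriteTfIdf(options.getOutput()));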