/** This tests a few corner cases that should not crash. */
@Test
public void testGoodWildcards() throws Exception {
  TestDataflowPipelineOptions options = buildTestPipelineOptions();
  options.setGcsUtil(buildMockGcsUtil());

  Pipeline pipeline = Pipeline.create(options);

  applyRead(pipeline, "gs://bucket/foo");
  applyRead(pipeline, "gs://bucket/foo/");
  applyRead(pipeline, "gs://bucket/foo/*");
  applyRead(pipeline, "gs://bucket/foo/?");
  applyRead(pipeline, "gs://bucket/foo/[0-9]");
  applyRead(pipeline, "gs://bucket/foo/*baz*");
  applyRead(pipeline, "gs://bucket/foo/*baz?");
  applyRead(pipeline, "gs://bucket/foo/[0-9]baz?");
  applyRead(pipeline, "gs://bucket/foo/baz/*");
  applyRead(pipeline, "gs://bucket/foo/baz/*wonka*");
  applyRead(pipeline, "gs://bucket/foo/*baz/wonka*");
  applyRead(pipeline, "gs://bucket/foo*/baz");
  applyRead(pipeline, "gs://bucket/foo?/baz");
  applyRead(pipeline, "gs://bucket/foo[0-9]/baz");

  // Check that running doesn't fail.
  pipeline.run();
}
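// A minimal sketch of the applyRead helper used above (an assumption; its body
// is not part of this snippet). It applies a TextIO read for the given GCS
// path so that pipeline construction exercises the wildcard validation logic.
private void applyRead(Pipeline pipeline, String path) {
  pipeline.apply("Read(" + path + ")", TextIO.Read.from(path));
}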
public static void main(String[] args) throws Exception {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline pipeline = Pipeline.create(options);
  pipeline.getCoderRegistry().registerCoder(URI.class, StringDelegateCoder.of(URI.class));

  pipeline
      .apply(new ReadDocuments(listInputDocuments(options)))
      .apply(new ComputeTfIdf())
      .apply(new WriteTfIdf(options.getOutput()));

  pipeline.run();
}
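// A plausible shape for the Options interface consumed above (an assumption;
// only getOutput() is visible in this snippet, and getInput() is a guessed
// counterpart). The SDK derives --input/--output command-line flags from
// getters declared this way.
private interface Options extends PipelineOptions {
  @Description("Directory or GCS prefix containing the input documents")
  String getInput();
  void setInput(String value);

  @Description("Prefix of the output URI to write TF-IDF scores to")
  @Validation.Required
  String getOutput();
  void setOutput(String value);
}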
/** Sets up and starts the streaming pipeline. */
public static void main(String[] args) {
  PubsubFileInjectorOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(PubsubFileInjectorOptions.class);

  Pipeline pipeline = Pipeline.create(options);

  pipeline
      .apply(TextIO.Read.from(options.getInput()))
      .apply(
          IntraBundleParallelization.of(PubsubFileInjector.publish(options.getOutputTopic()))
              .withMaxParallelism(20));

  pipeline.run();
}
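// A sketch of what PubsubFileInjectorOptions likely declares (an assumption,
// inferred from the two getters used above); the annotations follow the SDK's
// standard PipelineOptions conventions.
public interface PubsubFileInjectorOptions extends PipelineOptions {
  @Description("The file to read from")
  @Validation.Required
  String getInput();
  void setInput(String value);

  @Description("The Cloud Pub/Sub topic to publish to")
  @Validation.Required
  String getOutputTopic();
  void setOutputTopic(String value);
}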
/**
 * Runs the batch injector for the streaming pipeline.
 *
 * <p>The injector pipeline reads the given text file and injects the data into the given
 * Google Cloud Pub/Sub topic.
 */
public void runInjectorPipeline(String inputFile, String topic) {
  DataflowPipelineOptions copiedOptions = options.cloneAs(DataflowPipelineOptions.class);
  copiedOptions.setStreaming(false);
  copiedOptions.setNumWorkers(
      options.as(ExamplePubsubTopicOptions.class).getInjectorNumWorkers());
  copiedOptions.setJobName(options.getJobName() + "-injector");
  Pipeline injectorPipeline = Pipeline.create(copiedOptions);
  injectorPipeline
      .apply(TextIO.Read.from(inputFile))
      .apply(
          IntraBundleParallelization.of(PubsubFileInjector.publish(topic))
              .withMaxParallelism(20));
  DataflowPipelineJob injectorJob = (DataflowPipelineJob) injectorPipeline.run();
  jobsToCancel.add(injectorJob);
}
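// A minimal sketch (an assumption; the cleanup path is not shown in this
// snippet) of how the jobs collected in jobsToCancel might be torn down once
// the main pipeline finishes.
private void cancelInjectorJobs() throws IOException {
  for (DataflowPipelineJob job : jobsToCancel) {
    job.cancel();  // Asks the Dataflow service to cancel the running injector job.
  }
}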
public static void main(String[] args) {
  PipelineOptionsFactory.register(KafkaStreamingWordCountOptions.class);
  KafkaStreamingWordCountOptions options =
      PipelineOptionsFactory.fromArgs(args).as(KafkaStreamingWordCountOptions.class);
  options.setJobName("KafkaExample - WindowSize: " + options.getWindowSize() + " seconds");
  options.setStreaming(true);
  options.setCheckpointingInterval(1000L);
  options.setNumberOfExecutionRetries(5);
  options.setExecutionRetryDelay(3000L);
  options.setRunner(FlinkPipelineRunner.class);

  System.out.println(
      options.getKafkaTopic() + " " + options.getZookeeper() + " "
          + options.getBroker() + " " + options.getGroup());

  Pipeline pipeline = Pipeline.create(options);

  Properties p = new Properties();
  p.setProperty("zookeeper.connect", options.getZookeeper());
  p.setProperty("bootstrap.servers", options.getBroker());
  p.setProperty("group.id", options.getGroup());

  // This is the Flink consumer that reads the input to
  // the program from a Kafka topic.
  FlinkKafkaConsumer08<String> kafkaConsumer =
      new FlinkKafkaConsumer08<>(options.getKafkaTopic(), new SimpleStringSchema(), p);

  PCollection<String> words =
      pipeline
          .apply(Read.from(new UnboundedFlinkSource<>(kafkaConsumer)).named("StreamingWordCount"))
          .apply(ParDo.of(new ExtractWordsFn()))
          .apply(
              Window.<String>into(
                      FixedWindows.of(Duration.standardSeconds(options.getWindowSize())))
                  .triggering(AfterWatermark.pastEndOfWindow())
                  .withAllowedLateness(Duration.ZERO)
                  .discardingFiredPanes());

  PCollection<KV<String, Long>> wordCounts = words.apply(Count.<String>perElement());

  wordCounts
      .apply(ParDo.of(new FormatAsStringFn()))
      .apply(TextIO.Write.to("./outputKafka.txt"));

  pipeline.run();
}
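// Plausible sketches of the two DoFns referenced above (assumed; their bodies
// are not part of this snippet). ExtractWordsFn tokenizes each line into
// words, and FormatAsStringFn renders each (word, count) pair as a line of
// text for TextIO.Write.
static class ExtractWordsFn extends DoFn<String, String> {
  @Override
  public void processElement(ProcessContext c) {
    // Split the line on any run of non-word characters and emit each word.
    for (String word : c.element().split("[^a-zA-Z']+")) {
      if (!word.isEmpty()) {
        c.output(word);
      }
    }
  }
}

static class FormatAsStringFn extends DoFn<KV<String, Long>, String> {
  @Override
  public void processElement(ProcessContext c) {
    c.output(c.element().getKey() + ": " + c.element().getValue());
  }
}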