public static void main(String[] args) {
    PipelineOptionsFactory.register(KafkaStreamingWordCountOptions.class);
    KafkaStreamingWordCountOptions options =
        PipelineOptionsFactory.fromArgs(args).as(KafkaStreamingWordCountOptions.class);
    options.setJobName("KafkaExample - WindowSize: " + options.getWindowSize() + " seconds");
    options.setStreaming(true);
    options.setCheckpointingInterval(1000L);
    options.setNumberOfExecutionRetries(5);
    options.setExecutionRetryDelay(3000L);
    options.setRunner(FlinkPipelineRunner.class);

    // Print the Kafka configuration so the job's inputs are visible at startup.
    System.out.println(
        options.getKafkaTopic()
            + " "
            + options.getZookeeper()
            + " "
            + options.getBroker()
            + " "
            + options.getGroup());
    Pipeline pipeline = Pipeline.create(options);

    // Kafka connection settings for the Flink consumer.
    Properties p = new Properties();
    p.setProperty("zookeeper.connect", options.getZookeeper());
    p.setProperty("bootstrap.servers", options.getBroker());
    p.setProperty("group.id", options.getGroup());

    // This is the Flink consumer that reads the program's input from a Kafka topic.
    FlinkKafkaConsumer08<String> kafkaConsumer =
        new FlinkKafkaConsumer08<>(options.getKafkaTopic(), new SimpleStringSchema(), p);

    PCollection<String> words =
        pipeline
            .apply(Read.from(new UnboundedFlinkSource<>(kafkaConsumer)).named("StreamingWordCount"))
            .apply(ParDo.of(new ExtractWordsFn()))
            .apply(
                Window.<String>into(
                        FixedWindows.of(Duration.standardSeconds(options.getWindowSize())))
                    .triggering(AfterWatermark.pastEndOfWindow())
                    .withAllowedLateness(Duration.ZERO)
                    .discardingFiredPanes());

    PCollection<KV<String, Long>> wordCounts = words.apply(Count.<String>perElement());

    wordCounts.apply(ParDo.of(new FormatAsStringFn())).apply(TextIO.Write.to("./outputKafka.txt"));

    pipeline.run();
  }
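
The options interface referenced above is not shown in this snippet. Below is a minimal sketch of what it might look like, assuming it extends the Flink runner's FlinkPipelineOptions (taken here to be where setCheckpointingInterval, setNumberOfExecutionRetries, and setExecutionRetryDelay come from); the @Default values are illustrative, not from the original source.

public interface KafkaStreamingWordCountOptions extends FlinkPipelineOptions {
  @Description("The Kafka topic to read from.")
  @Default.String("input-topic") // illustrative default
  String getKafkaTopic();
  void setKafkaTopic(String value);

  @Description("The Zookeeper quorum, host:port.")
  @Default.String("localhost:2181")
  String getZookeeper();
  void setZookeeper(String value);

  @Description("The Kafka broker list, host:port.")
  @Default.String("localhost:9092")
  String getBroker();
  void setBroker(String value);

  @Description("The Kafka consumer group id.")
  @Default.String("myGroup")
  String getGroup();
  void setGroup(String value);

  @Description("Fixed window size, in seconds.")
  @Default.Integer(10)
  Integer getWindowSize();
  void setWindowSize(Integer value);
}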

  /**
   * Applies {@code ApproximateUnique(sampleSize)}, verifying that the estimation error falls
   * within the maximum allowed error of {@code 2/sqrt(sampleSize)}.
   */
  private void runApproximateUniquePipeline(int sampleSize) {
    Pipeline p = TestPipeline.create();
    PCollection<String> collection = readPCollection(p);

    final PCollectionView<Long> exact =
        collection
            .apply(RemoveDuplicates.<String>create())
            .apply(Combine.globally(new CountElements<String>()))
            .apply(View.<Long>asSingleton());

    PCollection<Long> approximate =
        collection.apply(ApproximateUnique.<String>globally(sampleSize));

    PCollection<KV<Long, Long>> approximateAndExact =
        approximate.apply(
            ParDo.of(
                    new DoFn<Long, KV<Long, Long>>() {
                      @Override
                      public void processElement(ProcessContext c) {
                        c.output(KV.of(c.element(), c.sideInput(exact)));
                      }
                    })
                .withSideInputs(exact));

    DataflowAssert.that(approximateAndExact).satisfies(new VerifyEstimatePerKeyFn(sampleSize));

    p.run();
  }
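
VerifyEstimatePerKeyFn is referenced but not defined in this snippet. The following is a minimal sketch consistent with the javadoc's 2/sqrt(sampleSize) bound, treating each element as an (estimate, exact) pair; SerializableFunction is the SDK's, and assertTrue is JUnit's.

  static class VerifyEstimatePerKeyFn
      implements SerializableFunction<Iterable<KV<Long, Long>>, Void> {
    private final int sampleSize;

    VerifyEstimatePerKeyFn(int sampleSize) {
      this.sampleSize = sampleSize;
    }

    @Override
    public Void apply(Iterable<KV<Long, Long>> input) {
      double maxError = 2.0 / Math.sqrt(sampleSize);
      for (KV<Long, Long> estimateAndExact : input) {
        double estimate = estimateAndExact.getKey();  // ApproximateUnique's estimate
        double exact = estimateAndExact.getValue();   // exact count from the side input
        double error = Math.abs(estimate - exact) / exact;
        assertTrue("Estimation error " + error + " exceeds allowed " + maxError,
            error <= maxError);
      }
      return null;
    }
  }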
Example #3
  /** Example test that exercises a PTransform using an in-memory input and inspecting the output. */
  @Test
  @Category(RunnableOnService.class)
  public void testCountWords() throws Exception {
    Pipeline p = TestPipeline.create();

    PCollection<String> input = p.apply(Create.of(WORDS).withCoder(StringUtf8Coder.of()));

    PCollection<String> output =
        input.apply(new CountWords()).apply(ParDo.of(new FormatAsTextFn()));

    DataflowAssert.that(output).containsInAnyOrder(COUNTS_ARRAY);
    p.run();
  }
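
The WORDS input and COUNTS_ARRAY expectations are fixtures defined elsewhere in the test class. These are hypothetical values of the right shape (java.util.List and java.util.Arrays imports assumed), with FormatAsTextFn assumed to render each count as "word: count".

  // Hypothetical fixtures; the originals are defined elsewhere in the test class.
  static final List<String> WORDS = Arrays.asList(
      "hi there", "hi", "hi sue bob", "hi sue", "", "bob hi");
  static final String[] COUNTS_ARRAY = new String[] {"hi: 5", "there: 1", "sue: 2", "bob: 2"};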
Example #4
  @Override
  public PCollection<KV<ReferenceShard, Iterable<GATKRead>>> apply(PCollection<GATKRead> input) {
    PCollection<KV<ReferenceShard, GATKRead>> keyReadByReferenceShard =
        input.apply(
            ParDo.of(
                    new DoFn<GATKRead, KV<ReferenceShard, GATKRead>>() {
                      private static final long serialVersionUID = 1L;

                      @Override
                      public void processElement(ProcessContext c) throws Exception {
                        // Apply our reference window function to each read before deciding which
                        // reference shard it belongs to.
                        ReferenceShard shard =
                            ReferenceShard.getShardNumberFromInterval(
                                referenceWindowFunction.apply(c.element()));
                        c.output(KV.of(shard, c.element()));
                      }
                    })
                .named("KeyReadByRefShard"));
    return keyReadByReferenceShard.apply(GroupByKey.<ReferenceShard, GATKRead>create());
  }
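
The enclosing PTransform's class name is not shown in this snippet; a hypothetical call site (KeyReadsByRefShard and its constructor argument are assumed names) might be:

  // Hypothetical usage: group reads by the reference shard their window overlaps.
  PCollection<KV<ReferenceShard, Iterable<GATKRead>>> readsByShard =
      reads.apply(new KeyReadsByRefShard(referenceWindowFunction));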
  @Override
  public PCollection<OutputT> apply(PCollection<? extends InputT> input) {
    // Wrap the user's DoFn so that elements of each bundle are processed by up to
    // maxParallelism concurrent threads.
    return input.apply(
        ParDo.of(new MultiThreadedIntraBundleProcessingDoFn<>(doFn, maxParallelism)));
  }
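
This has the shape of the Dataflow SDK's IntraBundleParallelization transform, which runs up to maxParallelism elements of each bundle through the wrapped doFn concurrently. Assuming that is the transform shown, a call site might look like the following (ExpensiveDoFn is a placeholder for any DoFn<String, String>):

  // Hypothetical usage: allow up to 16 elements of a bundle to be processed at once.
  PCollection<String> results = input.apply(
      IntraBundleParallelization.of(new ExpensiveDoFn()).withMaxParallelism(16));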