public static void main(String[] args) {
    PipelineOptionsFactory.register(KafkaStreamingWordCountOptions.class);
    KafkaStreamingWordCountOptions options =
        PipelineOptionsFactory.fromArgs(args).as(KafkaStreamingWordCountOptions.class);
    options.setJobName("KafkaExample - WindowSize: " + options.getWindowSize() + " seconds");
    options.setStreaming(true);
    // Checkpoint every second; retry failed executions up to five times,
    // waiting three seconds between attempts.
    options.setCheckpointingInterval(1000L);
    options.setNumberOfExecutionRetries(5);
    options.setExecutionRetryDelay(3000L);
    options.setRunner(FlinkPipelineRunner.class);

    // Log the Kafka connection settings for debugging.
    System.out.println(
        options.getKafkaTopic() + " " + options.getZookeeper() + " "
            + options.getBroker() + " " + options.getGroup());
    Pipeline pipeline = Pipeline.create(options);

    // Kafka consumer configuration handed to the Flink Kafka source.
    Properties p = new Properties();
    p.setProperty("zookeeper.connect", options.getZookeeper());
    p.setProperty("bootstrap.servers", options.getBroker());
    p.setProperty("group.id", options.getGroup());

    // This is the Flink consumer that reads the program's input
    // from a Kafka topic.
    FlinkKafkaConsumer08<String> kafkaConsumer =
        new FlinkKafkaConsumer08<>(options.getKafkaTopic(), new SimpleStringSchema(), p);

    PCollection<String> words =
        pipeline
            .apply(Read.from(new UnboundedFlinkSource<>(kafkaConsumer)).named("StreamingWordCount"))
            .apply(ParDo.of(new ExtractWordsFn()))
            .apply(
                // Fixed windows of the configured size: fire once when the
                // watermark passes the end of each window, and drop late data.
                Window.<String>into(
                        FixedWindows.of(Duration.standardSeconds(options.getWindowSize())))
                    .triggering(AfterWatermark.pastEndOfWindow())
                    .withAllowedLateness(Duration.ZERO)
                    .discardingFiredPanes());

    PCollection<KV<String, Long>> wordCounts = words.apply(Count.<String>perElement());

    wordCounts.apply(ParDo.of(new FormatAsStringFn())).apply(TextIO.Write.to("./outputKafka.txt"));

    pipeline.run();
  }
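
  // ExtractWordsFn and FormatAsStringFn are referenced above but not shown.
  // A minimal sketch of what they might look like, following the classic
  // WordCount example (the actual implementations may differ):
  static class ExtractWordsFn extends DoFn<String, String> {
    @Override
    public void processElement(ProcessContext c) {
      // Split each line on runs of non-letter characters and emit the words.
      for (String word : c.element().split("[^a-zA-Z']+")) {
        if (!word.isEmpty()) {
          c.output(word);
        }
      }
    }
  }

  static class FormatAsStringFn extends DoFn<KV<String, Long>, String> {
    @Override
    public void processElement(ProcessContext c) {
      // Render each (word, count) pair as one line of text.
      c.output(c.element().getKey() + ": " + c.element().getValue());
    }
  }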
  /**
   * Applies {@code ApproximateUnique(sampleSize)}, verifying that the estimation error falls
   * within the maximum allowed error of {@code 2/sqrt(sampleSize)}.
   */
  private void runApproximateUniquePipeline(int sampleSize) {
    Pipeline p = TestPipeline.create();
    PCollection<String> collection = readPCollection(p);

    // Compute the exact distinct count and expose it as a singleton side input.
    final PCollectionView<Long> exact =
        collection
            .apply(RemoveDuplicates.<String>create())
            .apply(Combine.globally(new CountElements<String>()))
            .apply(View.<Long>asSingleton());

    PCollection<Long> approximate =
        collection.apply(ApproximateUnique.<String>globally(sampleSize));

    PCollection<KV<Long, Long>> approximateAndExact =
        approximate.apply(
            ParDo.of(
                    new DoFn<Long, KV<Long, Long>>() {
                      @Override
                      public void processElement(ProcessContext c) {
                        c.output(KV.of(c.element(), c.sideInput(exact)));
                      }
                    })
                .withSideInputs(exact));

    DataflowAssert.that(approximateAndExact).satisfies(new VerifyEstimatePerKeyFn(sampleSize));

    p.run();
  }
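
  // The test references VerifyEstimatePerKeyFn, which is not shown here. A
  // hypothetical sketch of the check it performs, assuming the documented
  // bound of 2/sqrt(sampleSize) is treated as a relative error:
  static class VerifyEstimateFn
      implements SerializableFunction<Iterable<KV<Long, Long>>, Void> {
    private final double maxError;

    VerifyEstimateFn(int sampleSize) {
      this.maxError = 2.0 / Math.sqrt(sampleSize);
    }

    @Override
    public Void apply(Iterable<KV<Long, Long>> estimateAndExactPairs) {
      for (KV<Long, Long> pair : estimateAndExactPairs) {
        double estimate = pair.getKey();
        double exact = pair.getValue();
        // Fail if the estimate's relative error exceeds the allowed bound.
        double error = Math.abs(estimate - exact) / exact;
        if (error > maxError) {
          throw new AssertionError(
              "Estimate " + estimate + " is off from exact count " + exact
                  + " by " + error + " (allowed: " + maxError + ")");
        }
      }
      return null;
    }
  }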
  /** Example test that tests a PTransform by using an in-memory input and inspecting the output. */
  @Test
  @Category(RunnableOnService.class)
  public void testCountWords() throws Exception {
    Pipeline p = TestPipeline.create();

    PCollection<String> input = p.apply(Create.of(WORDS).withCoder(StringUtf8Coder.of()));

    PCollection<String> output =
        input.apply(new CountWords()).apply(ParDo.of(new FormatAsTextFn()));

    DataflowAssert.that(output).containsInAnyOrder(COUNTS_ARRAY);
    p.run();
  }
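
  // WORDS and COUNTS_ARRAY are fixtures defined elsewhere in the test class.
  // Hypothetical values consistent with the assertion above:
  static final String[] WORDS = new String[] {"hi there", "hi", "hi sue bob"};
  static final String[] COUNTS_ARRAY =
      new String[] {"hi: 3", "there: 1", "sue: 1", "bob: 1"};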
    @Override
    public PCollection<GATKRead> apply(PCollection<GATKRead> input) {
      return input.apply(
          ParDo.named("ApplyBQSR")
              .of(
                  new DoFnWLog<GATKRead, GATKRead>("ApplyBQSRStub") {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public void processElement(ProcessContext c) throws Exception {
                      // Pass-through stub: emit each read unchanged.
                      c.output(c.element());
                    }
                  })
              .withSideInputs(header, recalibrationReport));
    }
    @Override
    public PCollection<RecalibrationTables> apply(
        PCollection<KV<GATKRead, ReadContextData>> input) {
      return input.apply(
          ParDo.named("BaseRecalibrator")
              .of(
                  new DoFnWLog<KV<GATKRead, ReadContextData>, RecalibrationTables>(
                      "BaseRecalibratorStub") {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public void processElement(ProcessContext c) throws Exception {
                      // Stub: emit an empty recalibration table for every input pair.
                      c.output(
                          new RecalibrationTables(
                              new StandardCovariateList(
                                  new RecalibrationArgumentCollection(), Collections.emptyList())));
                    }
                  })
              .withSideInputs(header));
    }
  @Override
  public PCollection<KV<ReferenceShard, Iterable<GATKRead>>> apply(PCollection<GATKRead> input) {
    PCollection<KV<ReferenceShard, GATKRead>> keyReadByReferenceShard =
        input.apply(
            ParDo.of(
                    new DoFn<GATKRead, KV<ReferenceShard, GATKRead>>() {
                      private static final long serialVersionUID = 1L;

                      @Override
                      public void processElement(ProcessContext c) throws Exception {
                        // Apply our reference window function to each read before deciding which
                        // reference shard it belongs to.
                        ReferenceShard shard =
                            ReferenceShard.getShardNumberFromInterval(
                                referenceWindowFunction.apply(c.element()));
                        c.output(KV.of(shard, c.element()));
                      }
                    })
                .named("KeyReadByRefShard"));
    return keyReadByReferenceShard.apply(GroupByKey.<ReferenceShard, GATKRead>create());
  }
  @Override
  public PCollection<T> apply(PCollection<T> in) {
    return in.apply(
            ParDo.named("CreateIndex")
                .of(
                    new DoFn<T, KV<T, Void>>() {
                      @Override
                      public void processElement(ProcessContext c) {
                        // Key each element to a null Void marker.
                        c.output(KV.of(c.element(), (Void) null));
                      }
                    }))
        .apply(
            // Collapse duplicate keys; the combined Void value is irrelevant.
            Combine.<T, Void>perKey(
                new SerializableFunction<Iterable<Void>, Void>() {
                  @Override
                  public Void apply(Iterable<Void> iter) {
                    return null; // ignore input
                  }
                }))
        .apply(Keys.<T>create()); // keep only the distinct elements
  }
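
  // This transform deduplicates its input: each element becomes a key paired
  // with a null Void, Combine.perKey collapses duplicate keys, and
  // Keys.create() recovers the distinct elements. A hypothetical usage
  // sketch, with "Deduplicate" standing in for this PTransform's real name:
  static PCollection<String> deduplicateExample(Pipeline p) {
    return p.apply(Create.of("a", "b", "a", "c"))
        .apply(new Deduplicate<String>()); // yields "a", "b", "c" in some order
  }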
  @Override
  public PCollection<OutputT> apply(PCollection<? extends InputT> input) {
    // Process each bundle's elements with up to maxParallelism threads.
    return input.apply(
        ParDo.of(new MultiThreadedIntraBundleProcessingDoFn<>(doFn, maxParallelism)));
  }
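
  // A hypothetical usage sketch: "ThrottledFetchFn" and "ParallelFetch" are
  // stand-ins for a user DoFn and the PTransform that wraps the apply() above.
  // Because up to maxParallelism elements of a bundle run concurrently, the
  // wrapped DoFn must be thread-safe.
  static PCollection<String> parallelFetchExample(PCollection<String> urls) {
    return urls.apply(new ParallelFetch<String, String>(new ThrottledFetchFn(), 8));
  }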