Code Example #1
  public static void main(String[] args) {
    PipelineOptionsFactory.register(KafkaStreamingWordCountOptions.class);
    KafkaStreamingWordCountOptions options =
        PipelineOptionsFactory.fromArgs(args).as(KafkaStreamingWordCountOptions.class);
    options.setJobName("KafkaExample - WindowSize: " + options.getWindowSize() + " seconds");
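    // Configure the Flink runner: streaming mode, checkpointing interval, and retry behavior.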
    options.setStreaming(true);
    options.setCheckpointingInterval(1000L);
    options.setNumberOfExecutionRetries(5);
    options.setExecutionRetryDelay(3000L);
    options.setRunner(FlinkPipelineRunner.class);

    System.out.println(
        options.getKafkaTopic()
            + " "
            + options.getZookeeper()
            + " "
            + options.getBroker()
            + " "
            + options.getGroup());
    Pipeline pipeline = Pipeline.create(options);

    Properties p = new Properties();
    p.setProperty("zookeeper.connect", options.getZookeeper());
    p.setProperty("bootstrap.servers", options.getBroker());
    p.setProperty("group.id", options.getGroup());

    // The Flink consumer that reads the program's input from a Kafka topic.
    FlinkKafkaConsumer08<String> kafkaConsumer =
        new FlinkKafkaConsumer08<>(options.getKafkaTopic(), new SimpleStringSchema(), p);

    PCollection<String> words =
        pipeline
            .apply(Read.from(new UnboundedFlinkSource<>(kafkaConsumer)).named("StreamingWordCount"))
            .apply(ParDo.of(new ExtractWordsFn()))
            .apply(
                Window.<String>into(
                        FixedWindows.of(Duration.standardSeconds(options.getWindowSize())))
                    .triggering(AfterWatermark.pastEndOfWindow())
                    .withAllowedLateness(Duration.ZERO)
                    .discardingFiredPanes());

    PCollection<KV<String, Long>> wordCounts = words.apply(Count.<String>perElement());

    wordCounts.apply(ParDo.of(new FormatAsStringFn())).apply(TextIO.Write.to("./outputKafka.txt"));

    pipeline.run();
  }
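
The main method above relies on a custom KafkaStreamingWordCountOptions interface that supplies the Kafka topic, ZooKeeper quorum, broker list, consumer group, and window size. Its declaration is not included here; the sketch below shows one plausible shape for it, assuming the standard PipelineOptions getter/setter pattern and a Flink-specific parent interface (the @Description texts and the default value are illustrative assumptions, not taken from the source).

  public interface KafkaStreamingWordCountOptions extends FlinkPipelineOptions {
    @Description("The Kafka topic to read from")
    String getKafkaTopic();
    void setKafkaTopic(String value);

    @Description("The ZooKeeper quorum, e.g. localhost:2181")
    String getZookeeper();
    void setZookeeper(String value);

    @Description("The Kafka broker list, e.g. localhost:9092")
    String getBroker();
    void setBroker(String value);

    @Description("The Kafka consumer group id")
    String getGroup();
    void setGroup(String value);

    @Description("Fixed window size, in seconds")
    @Default.Integer(10)
    Integer getWindowSize();
    void setWindowSize(Integer value);
  }
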
Code Example #2
  @Test(dataProvider = "reads")
  public void testGATKReadCoding(final List<GATKRead> reads) {
    // The simplest way to check that a type is coded correctly is to create a PCollection
    // of that type and verify that it matches the List version.
    final Pipeline p = GATKTestPipeline.create();
    DataflowUtils.registerGATKCoders(p);

    // Need to explicitly set the coder to GATKReadCoder, otherwise Create fails to infer
    // a coder properly in the case where the List contains a mix of different GATKRead
    // implementations.
    final PCollection<GATKRead> dataflowReads =
        p.apply(Create.of(reads).withCoder(new GATKReadCoder()));
    DataflowAssert.that(dataflowReads).containsInAnyOrder(reads);

    final PCollection<GATKRead> dataflowReadsAfterTransform =
        dataflowReads
            .apply(
                ParDo.of(
                    new DoFn<GATKRead, GATKRead>() {
                      private static final long serialVersionUID = 1L;

                      @Override
                      public void processElement(ProcessContext c) throws Exception {
                        c.output(c.element());
                      }
                    }))
            .setCoder(new GATKReadCoder());
    DataflowAssert.that(dataflowReadsAfterTransform).containsInAnyOrder(reads);

    p.run();
  }
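
The test above hinges on GATKReadCoder, whose implementation is not shown. For orientation, a coder in this (pre-Beam) Dataflow SDK extends CustomCoder and implements encode/decode; the toy coder below illustrates that contract with plain Strings. It is a sketch of the pattern only, not GATK's actual GATKReadCoder.

  // Illustrative only: the CustomCoder contract a coder like GATKReadCoder must satisfy.
  public static class ToyStringCoder extends CustomCoder<String> {
    private static final long serialVersionUID = 1L;

    @Override
    public void encode(String value, OutputStream outStream, Context context)
        throws IOException {
      // Length-prefix the UTF-8 bytes so decode() knows how much to read.
      byte[] bytes = value.getBytes(StandardCharsets.UTF_8);
      new DataOutputStream(outStream).writeInt(bytes.length);
      outStream.write(bytes);
    }

    @Override
    public String decode(InputStream inStream, Context context) throws IOException {
      DataInputStream dataIn = new DataInputStream(inStream);
      byte[] bytes = new byte[dataIn.readInt()];
      dataIn.readFully(bytes);
      return new String(bytes, StandardCharsets.UTF_8);
    }
  }
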
Code Example #3
  /**
   * Returns a union table for the given input PCollection, using the given union index and the
   * given unionTableEncoder.
   */
  private <V> PCollection<KV<K, RawUnionValue>> makeUnionTable(
      final int index,
      PCollection<KV<K, V>> pCollection,
      KvCoder<K, RawUnionValue> unionTableEncoder) {

    return pCollection
        .apply(ParDo.of(new ConstructUnionTableFn<K, V>(index)).named("MakeUnionTable" + index))
        .setCoder(unionTableEncoder);
  }
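
makeUnionTable above depends on ConstructUnionTableFn, which is not shown. Presumably it tags each value with the index of its source collection by wrapping it in a RawUnionValue, roughly as sketched here (an assumption, not the SDK's exact source):

  private static class ConstructUnionTableFn<K, V>
      extends DoFn<KV<K, V>, KV<K, RawUnionValue>> {
    private static final long serialVersionUID = 1L;
    private final int index;

    ConstructUnionTableFn(int index) {
      this.index = index;
    }

    @Override
    public void processElement(ProcessContext c) {
      KV<K, V> elem = c.element();
      // Wrap the value in a RawUnionValue tagged with its source collection's index.
      c.output(KV.of(elem.getKey(), new RawUnionValue(index, elem.getValue())));
    }
  }
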
  /** Returns a Step for a DoFn by creating and translating a pipeline. */
  private static Step createPredefinedStep() throws Exception {
    DataflowPipelineOptions options = buildPipelineOptions();
    DataflowPipelineTranslator translator = DataflowPipelineTranslator.fromOptions(options);
    DataflowPipeline pipeline = DataflowPipeline.create(options);
    String stepName = "DoFn1";
    pipeline
        .apply(TextIO.Read.named("ReadMyFile").from("gs://bucket/in"))
        .apply(ParDo.of(new NoOpFn()).named(stepName))
        .apply(TextIO.Write.named("WriteMyFile").to("gs://bucket/out"));
    Job job = translator.translate(pipeline, Collections.<DataflowPackage>emptyList()).getJob();

    assertEquals(3, job.getSteps().size());
    Step step = job.getSteps().get(1);
    assertEquals(stepName, getString(step.getProperties(), PropertyNames.USER_NAME));
    return step;
  }
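
createPredefinedStep (and the embedding test in Code Example #6) applies a NoOpFn, which is not defined here. Presumably it is an identity DoFn, along these lines (an assumption, not the original definition):

  private static class NoOpFn extends DoFn<String, String> {
    private static final long serialVersionUID = 1L;

    @Override
    public void processElement(ProcessContext c) {
      // Pass each element through unchanged.
      c.output(c.element());
    }
  }
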
Code Example #5
  private static void writeToHadoop(
      Pipeline pipeline,
      PCollection<GATKRead> reads,
      final SAMFileHeader header,
      final String destPath,
      final boolean parquet) {
    if (destPath.equals("/dev/null")) {
      return;
    }

    String headerString =
        Base64.getEncoder().encodeToString(SerializableUtils.serializeToByteArray(header));

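    // The double cast through Class<?> is needed because a parameterized
    // FileOutputFormat type cannot be expressed as a class literal directly.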
    @SuppressWarnings("unchecked")
    Class<? extends FileOutputFormat<NullWritable, SAMRecordWritable>> outputFormatClass =
        (Class<? extends FileOutputFormat<NullWritable, SAMRecordWritable>>)
            (Class<?>) TemplatedKeyIgnoringBAMOutputFormat.class;
    @SuppressWarnings("unchecked")
    HadoopIO.Write.Bound<NullWritable, SAMRecordWritable> write =
        HadoopIO.Write.to(destPath, outputFormatClass, NullWritable.class, SAMRecordWritable.class)
            .withConfigurationProperty(
                TemplatedKeyIgnoringBAMOutputFormat.SAM_HEADER_PROPERTY_NAME, headerString);

    PCollection<KV<NullWritable, SAMRecordWritable>> samReads =
        reads
            .apply(
                ParDo.of(
                    new DoFn<GATKRead, KV<NullWritable, SAMRecordWritable>>() {
                      private static final long serialVersionUID = 1L;

                      @Override
                      public void processElement(ProcessContext c) throws Exception {
                        SAMRecord samRecord = c.element().convertToSAMRecord(header);
                        SAMRecordWritable samRecordWritable = new SAMRecordWritable();
                        samRecordWritable.set(samRecord);
                        c.output(KV.of(NullWritable.get(), samRecordWritable));
                      }
                    }))
            .setCoder(
                KvCoder.of(
                    WritableCoder.of(NullWritable.class),
                    WritableCoder.of(SAMRecordWritable.class)));

    // write as a single (unsharded) file
    samReads.apply(write.withoutSharding());
  }
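
The SAM header is passed to the output format as a Base64-encoded configuration property. On the read side, TemplatedKeyIgnoringBAMOutputFormat would presumably invert those steps; a sketch of that decoding (an assumption, not the actual class's code):

  // Sketch: recover the SAMFileHeader from the Base64 configuration property.
  byte[] headerBytes = Base64.getDecoder().decode(headerString);
  SAMFileHeader decodedHeader =
      (SAMFileHeader)
          SerializableUtils.deserializeFromByteArray(headerBytes, "SAMFileHeader");
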
Code Example #6
  @Override
  public PCollection<KV<K, CoGbkResult>> apply(KeyedPCollectionTuple<K> input) {
    if (input.isEmpty()) {
      throw new IllegalArgumentException("must have at least one input in the KeyedPCollectionTuple");
    }

    // First build the union coder.
    // TODO: Look at better integration of union types with the
    // schema specified in the input.
    List<Coder<?>> codersList = new ArrayList<>();
    for (TaggedKeyedPCollection<K, ?> entry : input.getKeyedCollections()) {
      codersList.add(getValueCoder(entry.pCollection));
    }
    UnionCoder unionCoder = UnionCoder.of(codersList);
    Coder<K> keyCoder = input.getKeyCoder();
    KvCoder<K, RawUnionValue> kvCoder = KvCoder.of(keyCoder, unionCoder);

    PCollectionList<KV<K, RawUnionValue>> unionTables = PCollectionList.empty(input.getPipeline());

    // TODO: Use the schema to order the indices rather than depending
    // on the fact that the schema ordering is identical to the ordering from
    // input.getJoinCollections().
    int index = -1;
    for (TaggedKeyedPCollection<K, ?> entry : input.getKeyedCollections()) {
      index++;
      PCollection<KV<K, RawUnionValue>> unionTable =
          makeUnionTable(index, entry.pCollection, kvCoder);
      unionTables = unionTables.and(unionTable);
    }

    PCollection<KV<K, RawUnionValue>> flattenedTable =
        unionTables.apply(Flatten.<KV<K, RawUnionValue>>pCollections());

    PCollection<KV<K, Iterable<RawUnionValue>>> groupedTable =
        flattenedTable.apply(GroupByKey.<K, RawUnionValue>create());

    CoGbkResultSchema tupleTags = input.getCoGbkResultSchema();
    PCollection<KV<K, CoGbkResult>> result =
        groupedTable.apply(
            ParDo.of(new ConstructCoGbkResultFn<K>(tupleTags)).named("ConstructCoGbkResultFn"));
    result.setCoder(KvCoder.of(keyCoder, CoGbkResultCoder.of(tupleTags, unionCoder)));

    return result;
  }
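
For context, a caller of this transform builds a KeyedPCollectionTuple from tagged inputs and later pulls each tag's values back out of the CoGbkResult. The collections (emails, phones) and tags below are hypothetical:

  final TupleTag<String> emailTag = new TupleTag<>();
  final TupleTag<String> phoneTag = new TupleTag<>();
  PCollection<KV<String, CoGbkResult>> joined =
      KeyedPCollectionTuple.of(emailTag, emails)
          .and(phoneTag, phones)
          .apply(CoGroupByKey.<String>create());
  // In a downstream DoFn, each tag's values are retrieved from the joined result:
  //   Iterable<String> userEmails = c.element().getValue().getAll(emailTag);
  //   Iterable<String> userPhones = c.element().getValue().getAll(phoneTag);
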
  @Test
  public void testPredefinedAddStep() throws Exception {
    DataflowPipelineOptions options = buildPipelineOptions();

    DataflowPipelineTranslator translator = DataflowPipelineTranslator.fromOptions(options);
    DataflowPipelineTranslator.registerTransformTranslator(
        EmbeddedTransform.class, new EmbeddedTranslator());

    // Create a predefined step using another pipeline
    Step predefinedStep = createPredefinedStep();

    // Create a pipeline that the predefined step will be embedded into
    DataflowPipeline pipeline = DataflowPipeline.create(options);
    pipeline
        .apply(TextIO.Read.named("ReadMyFile").from("gs://bucket/in"))
        .apply(ParDo.of(new NoOpFn()))
        .apply(new EmbeddedTransform(predefinedStep.clone()))
        .apply(TextIO.Write.named("WriteMyFile").to("gs://bucket/out"));
    Job job = translator.translate(pipeline, Collections.<DataflowPackage>emptyList()).getJob();

    List<Step> steps = job.getSteps();
    assertEquals(4, steps.size());

    // The input to the embedded step should match the output of the step before
    Map<String, Object> step1Out = getOutputPortReference(steps.get(1));
    Map<String, Object> step2In =
        getDictionary(steps.get(2).getProperties(), PropertyNames.PARALLEL_INPUT);
    assertEquals(step1Out, step2In);

    // The output from the embedded step should match the input of the step after
    Map<String, Object> step2Out = getOutputPortReference(steps.get(2));
    Map<String, Object> step3In =
        getDictionary(steps.get(3).getProperties(), PropertyNames.PARALLEL_INPUT);
    assertEquals(step2Out, step3In);

    // The step should not have been modified other than remapping the input
    Step predefinedStepClone = predefinedStep.clone();
    Step embeddedStepClone = steps.get(2).clone();
    predefinedStepClone.getProperties().remove(PropertyNames.PARALLEL_INPUT);
    embeddedStepClone.getProperties().remove(PropertyNames.PARALLEL_INPUT);
    assertEquals(predefinedStepClone, embeddedStepClone);
  }
Code Example #8
  @Override
  public PCollection<KV<ReferenceShard, Iterable<GATKRead>>> apply(PCollection<GATKRead> input) {
    PCollection<KV<ReferenceShard, GATKRead>> keyReadByReferenceShard =
        input.apply(
            ParDo.of(
                    new DoFn<GATKRead, KV<ReferenceShard, GATKRead>>() {
                      private static final long serialVersionUID = 1L;

                      @Override
                      public void processElement(ProcessContext c) throws Exception {
                        // Apply our reference window function to each read before deciding which
                        // reference shard it belongs to.
                        ReferenceShard shard =
                            ReferenceShard.getShardNumberFromInterval(
                                referenceWindowFunction.apply(c.element()));
                        c.output(KV.of(shard, c.element()));
                      }
                    })
                .named("KeyReadByRefShard"));
    return keyReadByReferenceShard.apply(GroupByKey.<ReferenceShard, GATKRead>create());
  }
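
Downstream code can then process all reads that landed on the same shard together. A hypothetical consumer (shardedReads stands for the output of the transform above) that counts reads per shard:

  PCollection<KV<ReferenceShard, Integer>> readsPerShard =
      shardedReads.apply(
          ParDo.of(
              new DoFn<KV<ReferenceShard, Iterable<GATKRead>>, KV<ReferenceShard, Integer>>() {
                private static final long serialVersionUID = 1L;

                @Override
                public void processElement(ProcessContext c) {
                  int count = 0;
                  // Iterate the shard's reads; real code would do per-shard work here.
                  for (GATKRead read : c.element().getValue()) {
                    count++;
                  }
                  c.output(KV.of(c.element().getKey(), count));
                }
              }));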