public static void main(String[] args) {
  PipelineOptionsFactory.register(KafkaStreamingWordCountOptions.class);
  KafkaStreamingWordCountOptions options =
      PipelineOptionsFactory.fromArgs(args).as(KafkaStreamingWordCountOptions.class);
  options.setJobName("KafkaExample - WindowSize: " + options.getWindowSize() + " seconds");
  options.setStreaming(true);
  options.setCheckpointingInterval(1000L);
  options.setNumberOfExecutionRetries(5);
  options.setExecutionRetryDelay(3000L);
  options.setRunner(FlinkPipelineRunner.class);

  System.out.println(
      options.getKafkaTopic() + " " + options.getZookeeper() + " "
          + options.getBroker() + " " + options.getGroup());
  Pipeline pipeline = Pipeline.create(options);

  Properties p = new Properties();
  p.setProperty("zookeeper.connect", options.getZookeeper());
  p.setProperty("bootstrap.servers", options.getBroker());
  p.setProperty("group.id", options.getGroup());

  // this is the Flink consumer that reads the input to
  // the program from a kafka topic.
  FlinkKafkaConsumer08<String> kafkaConsumer =
      new FlinkKafkaConsumer08<>(options.getKafkaTopic(), new SimpleStringSchema(), p);

  PCollection<String> words = pipeline
      .apply(Read.from(new UnboundedFlinkSource<>(kafkaConsumer)).named("StreamingWordCount"))
      .apply(ParDo.of(new ExtractWordsFn()))
      .apply(Window.<String>into(
              FixedWindows.of(Duration.standardSeconds(options.getWindowSize())))
          .triggering(AfterWatermark.pastEndOfWindow())
          .withAllowedLateness(Duration.ZERO)
          .discardingFiredPanes());

  PCollection<KV<String, Long>> wordCounts = words.apply(Count.<String>perElement());

  wordCounts
      .apply(ParDo.of(new FormatAsStringFn()))
      .apply(TextIO.Write.to("./outputKafka.txt"));

  pipeline.run();
}
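// ExtractWordsFn and FormatAsStringFn are referenced above but not shown in this snippet.
// A minimal sketch of what they might look like, in the same DoFn style as the other examples
// here; the tokenization regex and output format are assumptions, not the original code.
static class ExtractWordsFn extends DoFn<String, String> {
  private static final long serialVersionUID = 1L;

  @Override
  public void processElement(ProcessContext c) throws Exception {
    // Split the line on non-word characters and emit each non-empty token.
    for (String word : c.element().split("[^a-zA-Z']+")) {
      if (!word.isEmpty()) {
        c.output(word);
      }
    }
  }
}

static class FormatAsStringFn extends DoFn<KV<String, Long>, String> {
  private static final long serialVersionUID = 1L;

  @Override
  public void processElement(ProcessContext c) throws Exception {
    // Render each (word, count) pair as one output line.
    c.output(c.element().getKey() + ": " + c.element().getValue());
  }
}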
@Test(dataProvider = "reads")
public void testGATKReadCoding(final List<GATKRead> reads) {
  // The simplest way to figure out if a class is coded correctly is to create a PCollection
  // of that type and see if it matches the List version.
  final Pipeline p = GATKTestPipeline.create();
  DataflowUtils.registerGATKCoders(p);

  // Need to explicitly set the coder to GATKReadCoder, otherwise Create fails to infer
  // a coder properly in the case where the List contains a mix of different GATKRead
  // implementations.
  final PCollection<GATKRead> dataflowReads =
      p.apply(Create.of(reads).withCoder(new GATKReadCoder()));
  DataflowAssert.that(dataflowReads).containsInAnyOrder(reads);

  final PCollection<GATKRead> dataflowReadsAfterTransform = dataflowReads
      .apply(ParDo.of(new DoFn<GATKRead, GATKRead>() {
        private static final long serialVersionUID = 1L;

        @Override
        public void processElement(ProcessContext c) throws Exception {
          c.output(c.element());
        }
      }))
      .setCoder(new GATKReadCoder());
  DataflowAssert.that(dataflowReadsAfterTransform).containsInAnyOrder(reads);

  p.run();
}
/**
 * Returns a UnionTable for the given input PCollection, using the given union index and the
 * given unionTableEncoder.
 */
private <V> PCollection<KV<K, RawUnionValue>> makeUnionTable(
    final int index,
    PCollection<KV<K, V>> pCollection,
    KvCoder<K, RawUnionValue> unionTableEncoder) {
  return pCollection
      .apply(ParDo.of(new ConstructUnionTableFn<K, V>(index)).named("MakeUnionTable" + index))
      .setCoder(unionTableEncoder);
}
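// ConstructUnionTableFn is used above but not defined in this snippet. Judging from its use, it
// keeps the key and wraps each value in a RawUnionValue tagged with that collection's union
// index. A minimal sketch under that assumption:
private static class ConstructUnionTableFn<K, V> extends DoFn<KV<K, V>, KV<K, RawUnionValue>> {
  private static final long serialVersionUID = 1L;
  private final int index;

  ConstructUnionTableFn(int index) {
    this.index = index;
  }

  @Override
  public void processElement(ProcessContext c) throws Exception {
    // Keep the key; tag the value with this input's position in the union.
    c.output(KV.of(c.element().getKey(), new RawUnionValue(index, c.element().getValue())));
  }
}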
/** Returns a Step for a DoFn by creating and translating a pipeline. */
private static Step createPredefinedStep() throws Exception {
  DataflowPipelineOptions options = buildPipelineOptions();
  DataflowPipelineTranslator translator = DataflowPipelineTranslator.fromOptions(options);
  DataflowPipeline pipeline = DataflowPipeline.create(options);
  String stepName = "DoFn1";
  pipeline
      .apply(TextIO.Read.named("ReadMyFile").from("gs://bucket/in"))
      .apply(ParDo.of(new NoOpFn()).named(stepName))
      .apply(TextIO.Write.named("WriteMyFile").to("gs://bucket/out"));
  Job job = translator.translate(pipeline, Collections.<DataflowPackage>emptyList()).getJob();

  assertEquals(3, job.getSteps().size());
  Step step = job.getSteps().get(1);
  assertEquals(stepName, getString(step.getProperties(), PropertyNames.USER_NAME));
  return step;
}
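// NoOpFn is used here and in testPredefinedAddStep below but is not defined in these snippets;
// it is presumably an identity DoFn over Strings. A minimal sketch under that assumption:
private static class NoOpFn extends DoFn<String, String> {
  private static final long serialVersionUID = 1L;

  @Override
  public void processElement(ProcessContext c) throws Exception {
    // Pass every element through unchanged; the test only inspects the translated pipeline
    // structure, not the data.
    c.output(c.element());
  }
}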
private static void writeToHadoop(
    Pipeline pipeline,
    PCollection<GATKRead> reads,
    final SAMFileHeader header,
    final String destPath,
    final boolean parquet) {
  if (destPath.equals("/dev/null")) {
    return;
  }

  String headerString =
      Base64.getEncoder().encodeToString(SerializableUtils.serializeToByteArray(header));
  @SuppressWarnings("unchecked")
  Class<? extends FileOutputFormat<NullWritable, SAMRecordWritable>> outputFormatClass =
      (Class<? extends FileOutputFormat<NullWritable, SAMRecordWritable>>)
          (Class<?>) TemplatedKeyIgnoringBAMOutputFormat.class;
  @SuppressWarnings("unchecked")
  HadoopIO.Write.Bound<NullWritable, SAMRecordWritable> write =
      HadoopIO.Write
          .to(destPath, outputFormatClass, NullWritable.class, SAMRecordWritable.class)
          .withConfigurationProperty(
              TemplatedKeyIgnoringBAMOutputFormat.SAM_HEADER_PROPERTY_NAME, headerString);

  PCollection<KV<NullWritable, SAMRecordWritable>> samReads = reads
      .apply(ParDo.of(new DoFn<GATKRead, KV<NullWritable, SAMRecordWritable>>() {
        private static final long serialVersionUID = 1L;

        @Override
        public void processElement(ProcessContext c) throws Exception {
          SAMRecord samRecord = c.element().convertToSAMRecord(header);
          SAMRecordWritable samRecordWritable = new SAMRecordWritable();
          samRecordWritable.set(samRecord);
          c.output(KV.of(NullWritable.get(), samRecordWritable));
        }
      }))
      .setCoder(KvCoder.of(
          WritableCoder.of(NullWritable.class), WritableCoder.of(SAMRecordWritable.class)));

  // write as a single (unsharded) file
  samReads.apply(write.withoutSharding());
}
@Override
public PCollection<KV<K, CoGbkResult>> apply(KeyedPCollectionTuple<K> input) {
  if (input.isEmpty()) {
    throw new IllegalArgumentException("must have at least one input to a KeyedPCollections");
  }

  // First build the union coder.
  // TODO: Look at better integration of union types with the
  // schema specified in the input.
  List<Coder<?>> codersList = new ArrayList<>();
  for (TaggedKeyedPCollection<K, ?> entry : input.getKeyedCollections()) {
    codersList.add(getValueCoder(entry.pCollection));
  }
  UnionCoder unionCoder = UnionCoder.of(codersList);
  Coder<K> keyCoder = input.getKeyCoder();
  KvCoder<K, RawUnionValue> kVCoder = KvCoder.of(keyCoder, unionCoder);

  PCollectionList<KV<K, RawUnionValue>> unionTables =
      PCollectionList.empty(input.getPipeline());

  // TODO: Use the schema to order the indices rather than depending
  // on the fact that the schema ordering is identical to the ordering from
  // input.getJoinCollections().
  int index = -1;
  for (TaggedKeyedPCollection<K, ?> entry : input.getKeyedCollections()) {
    index++;
    PCollection<KV<K, RawUnionValue>> unionTable =
        makeUnionTable(index, entry.pCollection, kVCoder);
    unionTables = unionTables.and(unionTable);
  }

  PCollection<KV<K, RawUnionValue>> flattenedTable =
      unionTables.apply(Flatten.<KV<K, RawUnionValue>>pCollections());

  PCollection<KV<K, Iterable<RawUnionValue>>> groupedTable =
      flattenedTable.apply(GroupByKey.<K, RawUnionValue>create());

  CoGbkResultSchema tupleTags = input.getCoGbkResultSchema();

  PCollection<KV<K, CoGbkResult>> result = groupedTable.apply(
      ParDo.of(new ConstructCoGbkResultFn<K>(tupleTags)).named("ConstructCoGbkResultFn"));
  result.setCoder(KvCoder.of(keyCoder, CoGbkResultCoder.of(tupleTags, unionCoder)));

  return result;
}
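// For context, this apply() is the body of the CoGroupByKey transform. A typical caller joins
// two keyed PCollections roughly like this (the pipeline, data, and names below are illustrative
// assumptions, not part of the snippet above):
Pipeline joinPipeline = Pipeline.create(PipelineOptionsFactory.create());

PCollection<KV<String, String>> emails = joinPipeline.apply(
    Create.of(KV.of("alice", "alice@example.com"))
        .withCoder(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of())));
PCollection<KV<String, String>> phones = joinPipeline.apply(
    Create.of(KV.of("alice", "555-0100"))
        .withCoder(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of())));

// Tags identify each input collection inside the joined CoGbkResult.
final TupleTag<String> emailsTag = new TupleTag<>();
final TupleTag<String> phonesTag = new TupleTag<>();

// Each output KV pairs a key with a CoGbkResult holding all values from both inputs;
// getValue().getAll(emailsTag) then iterates the e-mail values for that key.
PCollection<KV<String, CoGbkResult>> joined =
    KeyedPCollectionTuple.of(emailsTag, emails)
        .and(phonesTag, phones)
        .apply(CoGroupByKey.<String>create());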
@Test
public void testPredefinedAddStep() throws Exception {
  DataflowPipelineOptions options = buildPipelineOptions();

  DataflowPipelineTranslator translator = DataflowPipelineTranslator.fromOptions(options);
  DataflowPipelineTranslator.registerTransformTranslator(
      EmbeddedTransform.class, new EmbeddedTranslator());

  // Create a predefined step using another pipeline
  Step predefinedStep = createPredefinedStep();

  // Create a pipeline that the predefined step will be embedded into
  DataflowPipeline pipeline = DataflowPipeline.create(options);
  pipeline
      .apply(TextIO.Read.named("ReadMyFile").from("gs://bucket/in"))
      .apply(ParDo.of(new NoOpFn()))
      .apply(new EmbeddedTransform(predefinedStep.clone()))
      .apply(TextIO.Write.named("WriteMyFile").to("gs://bucket/out"));
  Job job = translator.translate(pipeline, Collections.<DataflowPackage>emptyList()).getJob();

  List<Step> steps = job.getSteps();
  assertEquals(4, steps.size());

  // The input to the embedded step should match the output of the step before
  Map<String, Object> step1Out = getOutputPortReference(steps.get(1));
  Map<String, Object> step2In =
      getDictionary(steps.get(2).getProperties(), PropertyNames.PARALLEL_INPUT);
  assertEquals(step1Out, step2In);

  // The output from the embedded step should match the input of the step after
  Map<String, Object> step2Out = getOutputPortReference(steps.get(2));
  Map<String, Object> step3In =
      getDictionary(steps.get(3).getProperties(), PropertyNames.PARALLEL_INPUT);
  assertEquals(step2Out, step3In);

  // The step should not have been modified other than remapping the input
  Step predefinedStepClone = predefinedStep.clone();
  Step embeddedStepClone = steps.get(2).clone();
  predefinedStepClone.getProperties().remove(PropertyNames.PARALLEL_INPUT);
  embeddedStepClone.getProperties().remove(PropertyNames.PARALLEL_INPUT);
  assertEquals(predefinedStepClone, embeddedStepClone);
}
@Override
public PCollection<KV<ReferenceShard, Iterable<GATKRead>>> apply(PCollection<GATKRead> input) {
  PCollection<KV<ReferenceShard, GATKRead>> keyReadByReferenceShard = input.apply(
      ParDo.of(new DoFn<GATKRead, KV<ReferenceShard, GATKRead>>() {
        private static final long serialVersionUID = 1L;

        @Override
        public void processElement(ProcessContext c) throws Exception {
          // Apply our reference window function to each read before deciding which
          // reference shard it belongs to.
          ReferenceShard shard = ReferenceShard.getShardNumberFromInterval(
              referenceWindowFunction.apply(c.element()));
          c.output(KV.of(shard, c.element()));
        }
      }).named("KeyReadByRefShard"));
  return keyReadByReferenceShard.apply(GroupByKey.<ReferenceShard, GATKRead>create());
}
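// The output of this transform pairs each ReferenceShard with every read whose windowed interval
// maps to that shard. A downstream stage might then process one shard's reads at a time, e.g.
// (illustrative sketch; "shardedReads" and the per-shard counting are assumptions, not original code):
PCollection<String> perShardCounts = shardedReads.apply(
    ParDo.of(new DoFn<KV<ReferenceShard, Iterable<GATKRead>>, String>() {
      private static final long serialVersionUID = 1L;

      @Override
      public void processElement(ProcessContext c) throws Exception {
        // Count the reads grouped onto this shard and emit a summary line.
        long n = 0;
        for (GATKRead read : c.element().getValue()) {
          n++;
        }
        c.output(c.element().getKey() + ": " + n + " reads");
      }
    }));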