@Override public PCollection<KV<K, CoGbkResult>> apply(KeyedPCollectionTuple<K> input) { if (input.isEmpty()) { throw new IllegalArgumentException("must have at least one input to a KeyedPCollections"); } // First build the union coder. // TODO: Look at better integration of union types with the // schema specified in the input. List<Coder<?>> codersList = new ArrayList<>(); for (TaggedKeyedPCollection<K, ?> entry : input.getKeyedCollections()) { codersList.add(getValueCoder(entry.pCollection)); } UnionCoder unionCoder = UnionCoder.of(codersList); Coder<K> keyCoder = input.getKeyCoder(); KvCoder<K, RawUnionValue> kVCoder = KvCoder.of(keyCoder, unionCoder); PCollectionList<KV<K, RawUnionValue>> unionTables = PCollectionList.empty(input.getPipeline()); // TODO: Use the schema to order the indices rather than depending // on the fact that the schema ordering is identical to the ordering from // input.getJoinCollections(). int index = -1; for (TaggedKeyedPCollection<K, ?> entry : input.getKeyedCollections()) { index++; PCollection<KV<K, RawUnionValue>> unionTable = makeUnionTable(index, entry.pCollection, kVCoder); unionTables = unionTables.and(unionTable); } PCollection<KV<K, RawUnionValue>> flattenedTable = unionTables.apply(Flatten.<KV<K, RawUnionValue>>pCollections()); PCollection<KV<K, Iterable<RawUnionValue>>> groupedTable = flattenedTable.apply(GroupByKey.<K, RawUnionValue>create()); CoGbkResultSchema tupleTags = input.getCoGbkResultSchema(); PCollection<KV<K, CoGbkResult>> result = groupedTable.apply( ParDo.of(new ConstructCoGbkResultFn<K>(tupleTags)).named("ConstructCoGbkResultFn")); result.setCoder(KvCoder.of(keyCoder, CoGbkResultCoder.of(tupleTags, unionCoder))); return result; }
@Override public PCollection<KV<ReferenceShard, Iterable<GATKRead>>> apply(PCollection<GATKRead> input) { PCollection<KV<ReferenceShard, GATKRead>> keyReadByReferenceShard = input.apply( ParDo.of( new DoFn<GATKRead, KV<ReferenceShard, GATKRead>>() { private static final long serialVersionUID = 1L; @Override public void processElement(ProcessContext c) throws Exception { // Apply our reference window function to each read before deciding which // reference shard it belongs to. ReferenceShard shard = ReferenceShard.getShardNumberFromInterval( referenceWindowFunction.apply(c.element())); c.output(KV.of(shard, c.element())); } }) .named("KeyReadByRefShard")); return keyReadByReferenceShard.apply(GroupByKey.<ReferenceShard, GATKRead>create()); }