@Override
  public PCollection<KV<K, CoGbkResult>> apply(KeyedPCollectionTuple<K> input) {
    if (input.isEmpty()) {
      throw new IllegalArgumentException("must have at least one input to a KeyedPCollections");
    }

    // First build the union coder.
    // TODO: Look at better integration of union types with the
    // schema specified in the input.
    List<Coder<?>> codersList = new ArrayList<>();
    for (TaggedKeyedPCollection<K, ?> entry : input.getKeyedCollections()) {
      codersList.add(getValueCoder(entry.pCollection));
    }
    UnionCoder unionCoder = UnionCoder.of(codersList);
    Coder<K> keyCoder = input.getKeyCoder();
    KvCoder<K, RawUnionValue> kVCoder = KvCoder.of(keyCoder, unionCoder);

    PCollectionList<KV<K, RawUnionValue>> unionTables = PCollectionList.empty(input.getPipeline());

    // TODO: Use the schema to order the indices rather than depending
    // on the fact that the schema ordering is identical to the ordering from
    // input.getJoinCollections().
    int index = -1;
    for (TaggedKeyedPCollection<K, ?> entry : input.getKeyedCollections()) {
      index++;
      PCollection<KV<K, RawUnionValue>> unionTable =
          makeUnionTable(index, entry.pCollection, kVCoder);
      unionTables = unionTables.and(unionTable);
    }

    PCollection<KV<K, RawUnionValue>> flattenedTable =
        unionTables.apply(Flatten.<KV<K, RawUnionValue>>pCollections());

    PCollection<KV<K, Iterable<RawUnionValue>>> groupedTable =
        flattenedTable.apply(GroupByKey.<K, RawUnionValue>create());

    CoGbkResultSchema tupleTags = input.getCoGbkResultSchema();
    PCollection<KV<K, CoGbkResult>> result =
        groupedTable.apply(
            ParDo.of(new ConstructCoGbkResultFn<K>(tupleTags)).named("ConstructCoGbkResultFn"));
    result.setCoder(KvCoder.of(keyCoder, CoGbkResultCoder.of(tupleTags, unionCoder)));

    return result;
  }
Пример #2
0
  @Override
  public PCollection<KV<ReferenceShard, Iterable<GATKRead>>> apply(PCollection<GATKRead> input) {
    PCollection<KV<ReferenceShard, GATKRead>> keyReadByReferenceShard =
        input.apply(
            ParDo.of(
                    new DoFn<GATKRead, KV<ReferenceShard, GATKRead>>() {
                      private static final long serialVersionUID = 1L;

                      @Override
                      public void processElement(ProcessContext c) throws Exception {
                        // Apply our reference window function to each read before deciding which
                        // reference shard it belongs to.
                        ReferenceShard shard =
                            ReferenceShard.getShardNumberFromInterval(
                                referenceWindowFunction.apply(c.element()));
                        c.output(KV.of(shard, c.element()));
                      }
                    })
                .named("KeyReadByRefShard"));
    return keyReadByReferenceShard.apply(GroupByKey.<ReferenceShard, GATKRead>create());
  }