@Override public PCollection<KV<URI, String>> apply(PInput input) { Pipeline pipeline = input.getPipeline(); // Create one TextIO.Read transform for each document // and add its output to a PCollectionList PCollectionList<KV<URI, String>> urisToLines = PCollectionList.empty(pipeline); // TextIO.Read supports: // - file: URIs and paths locally // - gs: URIs on the service for (final URI uri : uris) { String uriString; if (uri.getScheme().equals("file")) { uriString = new File(uri).getPath(); } else { uriString = uri.toString(); } PCollection<KV<URI, String>> oneUriToLines = pipeline .apply(TextIO.Read.from(uriString).named("TextIO.Read(" + uriString + ")")) .apply(WithKeys.<URI, String>of(uri).setName("WithKeys(" + uriString + ")")); urisToLines = urisToLines.and(oneUriToLines); } return urisToLines.apply(Flatten.<KV<URI, String>>pCollections()); }
@Override public PCollection<KV<K, CoGbkResult>> apply(KeyedPCollectionTuple<K> input) { if (input.isEmpty()) { throw new IllegalArgumentException("must have at least one input to a KeyedPCollections"); } // First build the union coder. // TODO: Look at better integration of union types with the // schema specified in the input. List<Coder<?>> codersList = new ArrayList<>(); for (TaggedKeyedPCollection<K, ?> entry : input.getKeyedCollections()) { codersList.add(getValueCoder(entry.pCollection)); } UnionCoder unionCoder = UnionCoder.of(codersList); Coder<K> keyCoder = input.getKeyCoder(); KvCoder<K, RawUnionValue> kVCoder = KvCoder.of(keyCoder, unionCoder); PCollectionList<KV<K, RawUnionValue>> unionTables = PCollectionList.empty(input.getPipeline()); // TODO: Use the schema to order the indices rather than depending // on the fact that the schema ordering is identical to the ordering from // input.getJoinCollections(). int index = -1; for (TaggedKeyedPCollection<K, ?> entry : input.getKeyedCollections()) { index++; PCollection<KV<K, RawUnionValue>> unionTable = makeUnionTable(index, entry.pCollection, kVCoder); unionTables = unionTables.and(unionTable); } PCollection<KV<K, RawUnionValue>> flattenedTable = unionTables.apply(Flatten.<KV<K, RawUnionValue>>pCollections()); PCollection<KV<K, Iterable<RawUnionValue>>> groupedTable = flattenedTable.apply(GroupByKey.<K, RawUnionValue>create()); CoGbkResultSchema tupleTags = input.getCoGbkResultSchema(); PCollection<KV<K, CoGbkResult>> result = groupedTable.apply( ParDo.of(new ConstructCoGbkResultFn<K>(tupleTags)).named("ConstructCoGbkResultFn")); result.setCoder(KvCoder.of(keyCoder, CoGbkResultCoder.of(tupleTags, unionCoder))); return result; }