/**
 * Merge the statistics from each block. The resulting "collection" contains a single element,
 * with the answer.
 */
private static PCollection<RecalibrationTables> aggregateStatistics(
    final PCollection<RecalibrationTables> tables) {
  return tables
      // aggregate
      .apply(Combine.globally(new RecalibrationTablesMerger()))
      // call finalize on the result
      .apply(
          ParDo.named("finalizeRecalTables")
              .of(
                  new DoFnWLog<RecalibrationTables, RecalibrationTables>("finalizeRecalTables") {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public void processElement(ProcessContext c) throws Exception {
                      RecalibrationTables tables = c.element();
                      if (null == tables) {
                        // The merger may return null when there are no inputs at all. In that
                        // case we don't want to crash (though it's really an edge case).
                        log.warn("No recalibration tables!");
                      } else {
                        // normal case: recalibrate
                        BaseRecalibrationEngine.finalizeRecalibrationTables(tables);
                      }
                      // A null element is passed through; downstream (addQuantizationInfo)
                      // handles it.
                      c.output(tables);
                    }
                  }));
}
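// For reference, the kind of merger passed to Combine.globally above has roughly this
// shape. This is a minimal sketch, not the actual RecalibrationTablesMerger; the in-place
// combine(other) call is a hypothetical stand-in for however the per-worker statistics are
// actually merged. Returning null for empty input is what forces the null check in the
// DoFn above.
public static class SketchTablesMerger
    implements SerializableFunction<Iterable<RecalibrationTables>, RecalibrationTables> {
  private static final long serialVersionUID = 1L;

  @Override
  public RecalibrationTables apply(Iterable<RecalibrationTables> input) {
    RecalibrationTables merged = null;
    for (RecalibrationTables t : input) {
      if (merged == null) {
        merged = t;
      } else {
        merged.combine(t); // hypothetical in-place merge of per-worker statistics
      }
    }
    return merged; // null when the input was empty
  }
}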
@Override
public PCollection<RecalibrationTables> apply(
    PCollection<AddContextDataToReadOptimized.ContextShard> input) {
  PCollection<RecalibrationTables> oneStatPerWorker =
      input.apply(
          ParDo.named("BaseRecalibrator")
              .withSideInputs(headerView, refDictionary)
              .of(new BaseRecalibratorOptimizedFn(headerView, refDictionary, recalArgs)));
  return aggregateStatistics(oneStatPerWorker);
}
@Override
public PCollection<GATKRead> apply(PCollection<GATKRead> input) {
  return input.apply(
      ParDo.named("ApplyBQSR")
          .of(
              new DoFnWLog<GATKRead, GATKRead>("ApplyBQSRStub") {
                private static final long serialVersionUID = 1L;

                @Override
                public void processElement(ProcessContext c) throws Exception {
                  // Stub: pass reads through unchanged.
                  c.output(c.element());
                }
              })
          .withSideInputs(header, recalibrationReport));
}
/**
 * addQuantizationInfo takes the computed RecalibrationTable and adds the QuantizationInfo and
 * RequestedCovariates objects. We call this triplet "BaseRecalOutput". It contains everything we
 * need from phase 1 to continue onto phase 2 of BQSR.
 */
private static PCollection<BaseRecalOutput> addQuantizationInfo(
    PCollectionView<SAMFileHeader> headerView,
    RecalibrationArgumentCollection recalArgs,
    PCollection<RecalibrationTables> recal) {
  return recal.apply(
      ParDo.named("addQuantizationInfo")
          .withSideInputs(headerView)
          .of(
              new DoFnWLog<RecalibrationTables, BaseRecalOutput>("addQuantizationInfo") {
                private static final long serialVersionUID = 1L;

                @Override
                public void processElement(ProcessContext c) throws IOException {
                  RecalibrationTables rt = c.element();
                  SAMFileHeader header = c.sideInput(headerView);
                  // BaseRecalOutput ret = new BaseRecalOutput(rt,
                  //     baseRecalibratorWorker.getQuantizationInfo(rt),
                  //     baseRecalibratorWorker.getRequestedCovariates());
                  // Saving and loading back the report actually changes it. So we have to do it.
                  // TODO(issue#799): Figure out what it changes, and just do that instead of
                  // doing the whole rigamarole.
                  File temp = IOUtils.createTempFile("temp-recalibrationtable-", ".tmp");
                  if (null == rt) {
                    // Special case where we have zero reads in the input: create a valid empty
                    // report.
                    log.debug("Special case: zero reads in input.");
                    BaseRecalibrationEngine recalibrationEngine =
                        new BaseRecalibrationEngine(recalArgs, header);
                    rt = recalibrationEngine.getRecalibrationTables();
                    BaseRecalibrationEngine.finalizeRecalibrationTables(rt);
                  }
                  try {
                    BaseRecalibratorOptimizedFn.saveTextualReport(temp, header, rt, recalArgs);
                    BaseRecalOutput ret = new BaseRecalOutput(temp);
                    c.output(ret);
                  } catch (FileNotFoundException e) {
                    throw new GATKException("can't find my own temporary file", e);
                  } catch (IOException e) {
                    throw new GATKException(
                        "unable to save temporary report to " + temp.getPath(), e);
                  }
                }
              }));
}
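// How the pieces above fit together, as a sketch (this wiring helper is an assumption, not
// part of the original class): per-worker tables are merged, quantization info is attached,
// and the single BaseRecalOutput becomes a side-input view for phase 2 of BQSR.
private static PCollectionView<BaseRecalOutput> recalOutputView(
    PCollectionView<SAMFileHeader> headerView,
    RecalibrationArgumentCollection recalArgs,
    PCollection<RecalibrationTables> perWorkerTables) {
  return addQuantizationInfo(headerView, recalArgs, aggregateStatistics(perWorkerTables))
      .apply(View.<BaseRecalOutput>asSingleton());
}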
@Override
public PCollection<RecalibrationTables> apply(
    PCollection<KV<GATKRead, ReadContextData>> input) {
  return input.apply(
      ParDo.named("BaseRecalibrator")
          .of(
              new DoFnWLog<KV<GATKRead, ReadContextData>, RecalibrationTables>(
                  "BaseRecalibratorStub") {
                private static final long serialVersionUID = 1L;

                @Override
                public void processElement(ProcessContext c) throws Exception {
                  // Stub: emit an empty set of recalibration tables for each element.
                  c.output(
                      new RecalibrationTables(
                          new StandardCovariateList(
                              new RecalibrationArgumentCollection(), Collections.emptyList())));
                }
              })
          .withSideInputs(header));
}
/**
 * Takes a few Reads and writes them to a BAM file. The Reads don't have to be sorted
 * initially; the BAM file will be. All the reads must fit into a single worker's memory, so
 * this won't go well if you have too many.
 *
 * @param pipeline the pipeline to add this operation to.
 * @param reads the reads to write (they don't need to be sorted).
 * @param header the header that corresponds to the reads.
 * @param destPath the GCS or local path to write to (must start with "gs://" if writing to GCS).
 * @param parquet whether to write out BAM or Parquet data (BDG AlignmentRecords); only applies
 *     when writing to Hadoop
 */
public static void writeToFile(
    Pipeline pipeline,
    PCollection<GATKRead> reads,
    final SAMFileHeader header,
    final String destPath,
    final boolean parquet) {
  if (BucketUtils.isHadoopUrl(destPath)
      || pipeline.getRunner().getClass().equals(SparkPipelineRunner.class)) {
    writeToHadoop(pipeline, reads, header, destPath, parquet);
  } else {
    // Materialize all the reads as a side input on a single worker, then write them in one DoFn.
    PCollectionView<Iterable<GATKRead>> iterableView =
        reads.apply(View.<GATKRead>asIterable());
    PCollection<String> dummy = pipeline.apply("output file name", Create.<String>of(destPath));
    dummy.apply(
        ParDo.named("save to BAM file")
            .withSideInputs(iterableView)
            .of(new SaveToBAMFile(header, iterableView)));
  }
}
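// A minimal usage sketch for writeToFile. The surrounding setup (the pipeline, the source of
// the reads, and the header) is assumed rather than shown, and the destination path is
// illustrative:
public static void saveSmallBam(
    Pipeline pipeline, PCollection<GATKRead> reads, SAMFileHeader header) {
  // Small outputs only: the reads are materialized on a single worker before writing.
  writeToFile(pipeline, reads, header, "gs://my-bucket/output/small.bam", false /* parquet */);
}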
@Override
public PDone apply(PCollection<KV<String, KV<URI, Double>>> wordToUriAndTfIdf) {
  return wordToUriAndTfIdf
      .apply(
          ParDo.named("Format")
              .of(
                  new DoFn<KV<String, KV<URI, Double>>, String>() {
                    private static final long serialVersionUID = 0;

                    @Override
                    public void processElement(ProcessContext c) {
                      c.output(
                          String.format(
                              "%s,\t%s,\t%f",
                              c.element().getKey(),
                              c.element().getValue().getKey(),
                              c.element().getValue().getValue()));
                    }
                  }))
      .apply(TextIO.Write.to(output).withSuffix(".csv"));
}
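// For reference, each output line follows "%s,\t%s,\t%f": the word, the document URI, then
// the TF-IDF score. An illustrative line (made-up values):
//   love,	file:///corpus/kinglear.txt,	0.048318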
@Override
public PCollection<KV<String, KV<URI, Double>>> apply(
    PCollection<KV<URI, String>> uriToContent) {

  // Compute the total number of documents, and
  // prepare this singleton PCollectionView for
  // use as a side input.
  final PCollectionView<Long> totalDocuments =
      uriToContent
          .apply(Keys.<URI>create().setName("GetURIs"))
          .apply(RemoveDuplicates.<URI>create().setName("RemoveDuplicateDocs"))
          .apply(Count.<URI>globally())
          .apply(View.<Long>asSingleton());

  // Create a collection of pairs mapping a URI to each
  // of the words in the document associated with that URI.
  PCollection<KV<URI, String>> uriToWords =
      uriToContent.apply(
          ParDo.named("SplitWords")
              .of(
                  new DoFn<KV<URI, String>, KV<URI, String>>() {
                    private static final long serialVersionUID = 0;

                    @Override
                    public void processElement(ProcessContext c) {
                      URI uri = c.element().getKey();
                      String line = c.element().getValue();
                      for (String word : line.split("\\W+")) {
                        // Log an INFO message when the word "love" is found.
                        if (word.toLowerCase().equals("love")) {
                          LOG.info("Found {}", word.toLowerCase());
                        }
                        if (!word.isEmpty()) {
                          c.output(KV.of(uri, word.toLowerCase()));
                        }
                      }
                    }
                  }));

  // Compute a mapping from each word to the total
  // number of documents in which it appears.
  PCollection<KV<String, Long>> wordToDocCount =
      uriToWords
          .apply(RemoveDuplicates.<KV<URI, String>>create().setName("RemoveDuplicateWords"))
          .apply(Values.<String>create())
          .apply(Count.<String>perElement().setName("CountDocs"));

  // Compute a mapping from each URI to the total
  // number of words in the document associated with that URI.
  PCollection<KV<URI, Long>> uriToWordTotal =
      uriToWords
          .apply(Keys.<URI>create().setName("GetURIs2"))
          .apply(Count.<URI>perElement().setName("CountWords"));

  // Count, for each (URI, word) pair, the number of
  // occurrences of that word in the document associated
  // with the URI.
  PCollection<KV<KV<URI, String>, Long>> uriAndWordToCount =
      uriToWords.apply(Count.<KV<URI, String>>perElement().setName("CountWordDocPairs"));

  // Reshape the above collection from a mapping of
  // (URI, word) pairs to counts into an isomorphic mapping
  // from URI to (word, count) pairs, to prepare for a join
  // by the URI key.
  PCollection<KV<URI, KV<String, Long>>> uriToWordAndCount =
      uriAndWordToCount.apply(
          ParDo.named("ShiftKeys")
              .of(
                  new DoFn<KV<KV<URI, String>, Long>, KV<URI, KV<String, Long>>>() {
                    private static final long serialVersionUID = 0;

                    @Override
                    public void processElement(ProcessContext c) {
                      URI uri = c.element().getKey().getKey();
                      String word = c.element().getKey().getValue();
                      Long occurrences = c.element().getValue();
                      c.output(KV.of(uri, KV.of(word, occurrences)));
                    }
                  }));

  // Prepare to join the mapping of URI to (word, count) pairs with
  // the mapping of URI to total word counts, by associating
  // each of the input PCollection<KV<URI, ...>> with
  // a tuple tag. Each input must have the same key type, URI
  // in this case. The type parameter of the tuple tag matches
  // the types of the values for each collection.
  final TupleTag<Long> wordTotalsTag = new TupleTag<Long>();
  final TupleTag<KV<String, Long>> wordCountsTag = new TupleTag<KV<String, Long>>();
  KeyedPCollectionTuple<URI> coGbkInput =
      KeyedPCollectionTuple.of(wordTotalsTag, uriToWordTotal)
          .and(wordCountsTag, uriToWordAndCount);

  // Perform a CoGroupByKey (a sort of pre-join) on the prepared
  // inputs. This yields a mapping from URI to a CoGbkResult
  // (CoGroupByKey Result). The CoGbkResult is a mapping
  // from the above tuple tags to the values in each input
  // associated with a particular URI. In this case, each
  // KV<URI, CoGbkResult> groups a URI with the total number of
  // words in that document as well as all the (word, count)
  // pairs for particular words.
  PCollection<KV<URI, CoGbkResult>> uriToWordAndCountAndTotal =
      coGbkInput.apply(CoGroupByKey.<URI>create().setName("CoGroupByURI"));

  // Compute a mapping from each word to a (URI, term frequency)
  // pair for each URI. A word's term frequency for a document
  // is simply the number of times that word occurs in the document
  // divided by the total number of words in the document.
  PCollection<KV<String, KV<URI, Double>>> wordToUriAndTf =
      uriToWordAndCountAndTotal.apply(
          ParDo.named("ComputeTermFrequencies")
              .of(
                  new DoFn<KV<URI, CoGbkResult>, KV<String, KV<URI, Double>>>() {
                    private static final long serialVersionUID = 0;

                    @Override
                    public void processElement(ProcessContext c) {
                      URI uri = c.element().getKey();
                      Long wordTotal = c.element().getValue().getOnly(wordTotalsTag);
                      for (KV<String, Long> wordAndCount :
                          c.element().getValue().getAll(wordCountsTag)) {
                        String word = wordAndCount.getKey();
                        Long wordCount = wordAndCount.getValue();
                        Double termFrequency =
                            wordCount.doubleValue() / wordTotal.doubleValue();
                        c.output(KV.of(word, KV.of(uri, termFrequency)));
                      }
                    }
                  }));

  // Compute a mapping from each word to its document frequency.
  // A word's document frequency in a corpus is the number of
  // documents in which the word appears divided by the total
  // number of documents in the corpus. Note how the total number of
  // documents is passed as a side input; the same value is
  // presented to each invocation of the DoFn.
  PCollection<KV<String, Double>> wordToDf =
      wordToDocCount.apply(
          ParDo.named("ComputeDocFrequencies")
              .withSideInputs(totalDocuments)
              .of(
                  new DoFn<KV<String, Long>, KV<String, Double>>() {
                    private static final long serialVersionUID = 0;

                    @Override
                    public void processElement(ProcessContext c) {
                      String word = c.element().getKey();
                      Long documentCount = c.element().getValue();
                      Long documentTotal = c.sideInput(totalDocuments);
                      Double documentFrequency =
                          documentCount.doubleValue() / documentTotal.doubleValue();
                      c.output(KV.of(word, documentFrequency));
                    }
                  }));

  // Join the term frequency and document frequency
  // collections, each keyed on the word.
  final TupleTag<KV<URI, Double>> tfTag = new TupleTag<KV<URI, Double>>();
  final TupleTag<Double> dfTag = new TupleTag<Double>();
  PCollection<KV<String, CoGbkResult>> wordToUriAndTfAndDf =
      KeyedPCollectionTuple.of(tfTag, wordToUriAndTf)
          .and(dfTag, wordToDf)
          .apply(CoGroupByKey.<String>create());

  // Compute a mapping from each word to a (URI, TF-IDF) score
  // for each URI. There are a variety of definitions of TF-IDF
  // ("term frequency - inverse document frequency"); here we use
  // a basic version: the term frequency multiplied by the log of
  // the inverse document frequency.
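  // Worked example with made-up numbers: a word occurring 3 times in a 100-word document
  // has tf = 3/100 = 0.03; if it appears in 2 of 10 documents, df = 2/10 = 0.2, so
  // tfIdf = tf * Math.log(1 / df) = 0.03 * ln(5) ≈ 0.048.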
  PCollection<KV<String, KV<URI, Double>>> wordToUriAndTfIdf =
      wordToUriAndTfAndDf.apply(
          ParDo.named("ComputeTfIdf")
              .of(
                  new DoFn<KV<String, CoGbkResult>, KV<String, KV<URI, Double>>>() {
                    private static final long serialVersionUID = 0;

                    @Override
                    public void processElement(ProcessContext c) {
                      String word = c.element().getKey();
                      Double df = c.element().getValue().getOnly(dfTag);
                      for (KV<URI, Double> uriAndTf :
                          c.element().getValue().getAll(tfTag)) {
                        URI uri = uriAndTf.getKey();
                        Double tf = uriAndTf.getValue();
                        Double tfIdf = tf * Math.log(1 / df);
                        c.output(KV.of(word, KV.of(uri, tfIdf)));
                      }
                    }
                  }));

  return wordToUriAndTfIdf;
}
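// A sketch of how ComputeTfIdf and WriteTfIdf might be wired into a pipeline. The Options
// interface, the ReadDocuments transform, and the listInputDocuments helper are companions
// of this example and are assumed here rather than defined above:
public static void main(String[] args) throws Exception {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline pipeline = Pipeline.create(options);

  // URI is used as a key type, so it needs a registered coder.
  pipeline.getCoderRegistry().registerCoder(URI.class, StringDelegateCoder.of(URI.class));

  pipeline
      .apply(new ReadDocuments(listInputDocuments(options)))
      .apply(new ComputeTfIdf())
      .apply(new WriteTfIdf(options.getOutput()));

  pipeline.run();
}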