/**
   * Merge the statistics from each block. The resulting "collection" contains a single element,
   * with the answer.
   */
  private static PCollection<RecalibrationTables> aggregateStatistics(
      final PCollection<RecalibrationTables> tables) {
    return tables
        // aggregate
        .apply(Combine.globally(new RecalibrationTablesMerger()))
        // call finalize on the result
        .apply(
            ParDo.named("finalizeRecalTables")
                .of(
                    new DoFnWLog<RecalibrationTables, RecalibrationTables>("finalizeRecalTables") {
                      private static final long serialVersionUID = 1L;

                      @Override
                      public void processElement(ProcessContext c) throws Exception {
                        RecalibrationTables tables = c.element();
                        if (null == tables) {
                          // The merger may return null when there are no inputs at all. In that
                          // case we don't want to crash (though it's really an edge case).
                          log.warn("No recalibration tables!");
                        } else {
                          // normal case: recalibrate
                          BaseRecalibrationEngine.finalizeRecalibrationTables(tables);
                        }
                        c.output(tables);
                      }
                    }));
  }
 @Override
 public PCollection<RecalibrationTables> apply(
     PCollection<AddContextDataToReadOptimized.ContextShard> input) {
   PCollection<RecalibrationTables> oneStatPerWorker =
       input.apply(
           ParDo.named("BaseRecalibrator")
               .withSideInputs(headerView, refDictionary)
               .of(new BaseRecalibratorOptimizedFn(headerView, refDictionary, recalArgs)));
   return aggregateStatistics(oneStatPerWorker);
 }
    @Override
    public PCollection<GATKRead> apply(PCollection<GATKRead> input) {
      return input.apply(
          ParDo.named("ApplyBQSR")
              .of(
                  new DoFnWLog<GATKRead, GATKRead>("ApplyBQSRStub") {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public void processElement(ProcessContext c) throws Exception {
                      c.output(c.element());
                    }
                  })
              .withSideInputs(header, recalibrationReport));
    }
  /**
   * addQuantizationInfo takes the computed RecalibrationTables and adds the QuantizationInfo and
   * RequestedCovariates objects. We call this triplet "BaseRecalOutput". It contains everything we
   * need from phase 1 to continue on to phase 2 of BQSR.
   */
  private static PCollection<BaseRecalOutput> addQuantizationInfo(
      PCollectionView<SAMFileHeader> headerView,
      RecalibrationArgumentCollection recalArgs,
      PCollection<RecalibrationTables> recal) {
    return recal.apply(
        ParDo.named("addQuantizationInfo")
            .withSideInputs(headerView)
            .of(
                new DoFnWLog<RecalibrationTables, BaseRecalOutput>("addQuantizationInfo") {
                  private static final long serialVersionUID = 1L;

                  @Override
                  public void processElement(ProcessContext c) throws IOException {
                    RecalibrationTables rt = c.element();
                    SAMFileHeader header = c.sideInput(headerView);
                    // BaseRecalOutput ret = new BaseRecalOutput(rt,
                    // baseRecalibratorWorker.getQuantizationInfo(rt),
                    // baseRecalibratorWorker.getRequestedCovariates());
                    // Saving and loading back the report actually changes it. So we have to do it.
                    // TODO(issue#799): Figure out what it changes, and just do that instead of
                    // doing the whole rigamarole.
                    File temp = IOUtils.createTempFile("temp-recalibrationtable-", ".tmp");
                    if (null == rt) {
                      // special case where we have zero reads in the input. Create a valid empty
                      // report.
                      log.debug("Special case: zero reads in input.");
                      BaseRecalibrationEngine recalibrationEngine =
                          new BaseRecalibrationEngine(recalArgs, header);
                      rt = recalibrationEngine.getRecalibrationTables();
                      BaseRecalibrationEngine.finalizeRecalibrationTables(rt);
                    }
                    try {
                      BaseRecalibratorOptimizedFn.saveTextualReport(temp, header, rt, recalArgs);
                      BaseRecalOutput ret = new BaseRecalOutput(temp);
                      c.output(ret);
                    } catch (FileNotFoundException e) {
                      throw new GATKException("can't find my own temporary file", e);
                    } catch (IOException e) {
                      throw new GATKException(
                          "unable to save temporary report to " + temp.getPath(), e);
                    }
                  }
                }));
  }
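A minimal sketch (assumed wiring, not code from the source) of how the phase-1 helpers above chain together: the per-worker tables produced by the BaseRecalibrator ParDo are merged by aggregateStatistics and then enriched by addQuantizationInfo. Here oneStatPerWorker, headerView, and recalArgs are assumed to be in scope as in the surrounding transform; the signatures come from the methods shown in this listing.

  // Hypothetical phase-1 wiring; all names are taken from the snippets above.
  PCollection<RecalibrationTables> merged = aggregateStatistics(oneStatPerWorker);
  PCollection<BaseRecalOutput> phase1Output = addQuantizationInfo(headerView, recalArgs, merged);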
    @Override
    public PCollection<RecalibrationTables> apply(
        PCollection<KV<GATKRead, ReadContextData>> input) {
      return input.apply(
          ParDo.named("BaseRecalibrator")
              .of(
                  new DoFnWLog<KV<GATKRead, ReadContextData>, RecalibrationTables>(
                      "BaseRecalibratorStub") {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public void processElement(ProcessContext c) throws Exception {
                      c.output(
                          new RecalibrationTables(
                              new StandardCovariateList(
                                  new RecalibrationArgumentCollection(), Collections.emptyList())));
                    }
                  })
              .withSideInputs(header));
    }
Example #6
  /**
   * Takes reads and writes them to a BAM file. The reads don't have to be sorted initially; the
   * output BAM file will be sorted. All the reads must fit into a single worker's memory, so this
   * won't work well if there are too many of them.
   *
   * @param pipeline the pipeline to add this operation to.
   * @param reads the reads to write (they don't need to be sorted).
   * @param header the header that corresponds to the reads.
   * @param destPath the GCS or local path to write to (must start with "gs://" if writing to GCS).
   * @param parquet whether to write out BAM or Parquet data (BDG AlignmentRecords); only applies
   *     when writing to Hadoop
   */
  public static void writeToFile(
      Pipeline pipeline,
      PCollection<GATKRead> reads,
      final SAMFileHeader header,
      final String destPath,
      final boolean parquet) {
    if (BucketUtils.isHadoopUrl(destPath)
        || pipeline.getRunner().getClass().equals(SparkPipelineRunner.class)) {
      writeToHadoop(pipeline, reads, header, destPath, parquet);
    } else {
      PCollectionView<Iterable<GATKRead>> iterableView = reads.apply(View.<GATKRead>asIterable());

      PCollection<String> dummy = pipeline.apply("output file name", Create.<String>of(destPath));

      dummy.apply(
          ParDo.named("save to BAM file")
              .withSideInputs(iterableView)
              .of(new SaveToBAMFile(header, iterableView)));
    }
  }
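A hedged usage sketch (hypothetical values, not from the source): calling writeToFile from within the same class, where pipeline, reads, and header are assumed to have been built earlier. A local destination path and a false Parquet flag route the call through the Dataflow branch above.

  // Hypothetical call; pipeline, reads, and header are assumed to exist already.
  writeToFile(pipeline, reads, header, "/tmp/example-output.bam", false);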
Example #7
    @Override
    public PDone apply(PCollection<KV<String, KV<URI, Double>>> wordToUriAndTfIdf) {
      return wordToUriAndTfIdf
          .apply(
              ParDo.named("Format")
                  .of(
                      new DoFn<KV<String, KV<URI, Double>>, String>() {
                        private static final long serialVersionUID = 0;

                        @Override
                        public void processElement(ProcessContext c) {
                          c.output(
                              String.format(
                                  "%s,\t%s,\t%f",
                                  c.element().getKey(),
                                  c.element().getValue().getKey(),
                                  c.element().getValue().getValue()));
                        }
                      }))
          .apply(TextIO.Write.to(output).withSuffix(".csv"));
    }
Example #8
    @Override
    public PCollection<KV<String, KV<URI, Double>>> apply(
        PCollection<KV<URI, String>> uriToContent) {

      // Compute the total number of documents, and
      // prepare this singleton PCollectionView for
      // use as a side input.
      final PCollectionView<Long> totalDocuments =
          uriToContent
              .apply(Keys.<URI>create().setName("GetURIs"))
              .apply(RemoveDuplicates.<URI>create().setName("RemoveDuplicateDocs"))
              .apply(Count.<URI>globally())
              .apply(View.<Long>asSingleton());

      // Create a collection of pairs mapping a URI to each
      // of the words in the document associated with that URI.
      PCollection<KV<URI, String>> uriToWords =
          uriToContent.apply(
              ParDo.named("SplitWords")
                  .of(
                      new DoFn<KV<URI, String>, KV<URI, String>>() {
                        private static final long serialVersionUID = 0;

                        @Override
                        public void processElement(ProcessContext c) {
                          URI uri = c.element().getKey();
                          String line = c.element().getValue();
                          for (String word : line.split("\\W+")) {
                            // Log INFO messages when the word “love” is found.
                            if (word.toLowerCase().equals("love")) {
                              LOG.info("Found {}", word.toLowerCase());
                            }

                            if (!word.isEmpty()) {
                              c.output(KV.of(uri, word.toLowerCase()));
                            }
                          }
                        }
                      }));

      // Compute a mapping from each word to the total
      // number of documents in which it appears.
      PCollection<KV<String, Long>> wordToDocCount =
          uriToWords
              .apply(RemoveDuplicates.<KV<URI, String>>create().setName("RemoveDuplicateWords"))
              .apply(Values.<String>create())
              .apply(Count.<String>perElement().setName("CountDocs"));

      // Compute a mapping from each URI to the total
      // number of words in the document associated with that URI.
      PCollection<KV<URI, Long>> uriToWordTotal =
          uriToWords
              .apply(Keys.<URI>create().setName("GetURIs2"))
              .apply(Count.<URI>perElement().setName("CountWords"));

      // Count, for each (URI, word) pair, the number of
      // occurrences of that word in the document associated
      // with the URI.
      PCollection<KV<KV<URI, String>, Long>> uriAndWordToCount =
          uriToWords.apply(Count.<KV<URI, String>>perElement().setName("CountWordDocPairs"));

      // Rekey the above collection, turning the mapping from
      // (URI, word) pairs to counts into an isomorphic mapping
      // from URI to (word, count) pairs, to prepare for a join
      // by the URI key.
      PCollection<KV<URI, KV<String, Long>>> uriToWordAndCount =
          uriAndWordToCount.apply(
              ParDo.named("ShiftKeys")
                  .of(
                      new DoFn<KV<KV<URI, String>, Long>, KV<URI, KV<String, Long>>>() {
                        private static final long serialVersionUID = 0;

                        @Override
                        public void processElement(ProcessContext c) {
                          URI uri = c.element().getKey().getKey();
                          String word = c.element().getKey().getValue();
                          Long occurrences = c.element().getValue();
                          c.output(KV.of(uri, KV.of(word, occurrences)));
                        }
                      }));

      // Prepare to join the mapping of URI to (word, count) pairs with
      // the mapping of URI to total word counts, by associating
      // each of the input PCollection<KV<URI, ...>> with
      // a tuple tag. Each input must have the same key type, URI
      // in this case. The type parameter of the tuple tag matches
      // the types of the values for each collection.
      final TupleTag<Long> wordTotalsTag = new TupleTag<Long>();
      final TupleTag<KV<String, Long>> wordCountsTag = new TupleTag<KV<String, Long>>();
      KeyedPCollectionTuple<URI> coGbkInput =
          KeyedPCollectionTuple.of(wordTotalsTag, uriToWordTotal)
              .and(wordCountsTag, uriToWordAndCount);

      // Perform a CoGroupByKey (a sort of pre-join) on the prepared
      // inputs. This yields a mapping from URI to a CoGbkResult
      // (CoGroupByKey Result). The CoGbkResult is a mapping
      // from the above tuple tags to the values in each input
      // associated with a particular URI. In this case, each
      // KV<URI, CoGbkResult> groups a URI with the total number of
      // words in that document as well as all the (word, count)
      // pairs for particular words.
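      // For illustration (hypothetical data, not from the source): if the
      // document at some URI holds 200 words in total, 3 of them "love" and
      // 12 of them "the", its CoGbkResult yields 200L via
      // getOnly(wordTotalsTag) and the pairs ("love", 3L), ("the", 12L), ...
      // via getAll(wordCountsTag).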
      PCollection<KV<URI, CoGbkResult>> uriToWordAndCountAndTotal =
          coGbkInput.apply(CoGroupByKey.<URI>create().setName("CoGroupByURI"));

      // Compute a mapping from each word to a (URI, term frequency)
      // pair for each URI. A word's term frequency for a document
      // is simply the number of times that word occurs in the document
      // divided by the total number of words in the document.
      PCollection<KV<String, KV<URI, Double>>> wordToUriAndTf =
          uriToWordAndCountAndTotal.apply(
              ParDo.named("ComputeTermFrequencies")
                  .of(
                      new DoFn<KV<URI, CoGbkResult>, KV<String, KV<URI, Double>>>() {
                        private static final long serialVersionUID = 0;

                        @Override
                        public void processElement(ProcessContext c) {
                          URI uri = c.element().getKey();
                          Long wordTotal = c.element().getValue().getOnly(wordTotalsTag);

                          for (KV<String, Long> wordAndCount :
                              c.element().getValue().getAll(wordCountsTag)) {
                            String word = wordAndCount.getKey();
                            Long wordCount = wordAndCount.getValue();
                            Double termFrequency =
                                wordCount.doubleValue() / wordTotal.doubleValue();
                            c.output(KV.of(word, KV.of(uri, termFrequency)));
                          }
                        }
                      }));

      // Compute a mapping from each word to its document frequency.
      // A word's document frequency in a corpus is the number of
      // documents in which the word appears divided by the total
      // number of documents in the corpus. Note how the total number of
      // documents is passed as a side input; the same value is
      // presented to each invocation of the DoFn.
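      // For example (hypothetical numbers, not from the source): with 20
      // documents in the corpus and a word appearing in 2 of them,
      // documentFrequency = 2 / 20 = 0.1.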
      PCollection<KV<String, Double>> wordToDf =
          wordToDocCount.apply(
              ParDo.named("ComputeDocFrequencies")
                  .withSideInputs(totalDocuments)
                  .of(
                      new DoFn<KV<String, Long>, KV<String, Double>>() {
                        private static final long serialVersionUID = 0;

                        @Override
                        public void processElement(ProcessContext c) {
                          String word = c.element().getKey();
                          Long documentCount = c.element().getValue();
                          Long documentTotal = c.sideInput(totalDocuments);
                          Double documentFrequency =
                              documentCount.doubleValue() / documentTotal.doubleValue();

                          c.output(KV.of(word, documentFrequency));
                        }
                      }));

      // Join the term frequency and document frequency
      // collections, each keyed on the word.
      final TupleTag<KV<URI, Double>> tfTag = new TupleTag<KV<URI, Double>>();
      final TupleTag<Double> dfTag = new TupleTag<Double>();
      PCollection<KV<String, CoGbkResult>> wordToUriAndTfAndDf =
          KeyedPCollectionTuple.of(tfTag, wordToUriAndTf)
              .and(dfTag, wordToDf)
              .apply(CoGroupByKey.<String>create());

      // Compute a mapping from each word to a (URI, TF-IDF) score
      // for each URI. There are a variety of definitions of TF-IDF
      // ("term frequency - inverse document frequency") score;
      // here we use a basic version that multiplies the term frequency
      // by the log of the inverse document frequency.
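      // Worked example (hypothetical numbers, not from the source): a word that
      // occurs 3 times in a 100-word document has tf = 0.03; if it appears in
      // 2 of 20 documents, df = 0.1, so its score is 0.03 * Math.log(1 / 0.1),
      // roughly 0.069.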
      PCollection<KV<String, KV<URI, Double>>> wordToUriAndTfIdf =
          wordToUriAndTfAndDf.apply(
              ParDo.named("ComputeTfIdf")
                  .of(
                      new DoFn<KV<String, CoGbkResult>, KV<String, KV<URI, Double>>>() {
                        private static final long serialVersionUID = 0;

                        @Override
                        public void processElement(ProcessContext c) {
                          String word = c.element().getKey();
                          Double df = c.element().getValue().getOnly(dfTag);

                          for (KV<URI, Double> uriAndTf : c.element().getValue().getAll(tfTag)) {
                            URI uri = uriAndTf.getKey();
                            Double tf = uriAndTf.getValue();
                            Double tfIdf = tf * Math.log(1 / df);
                            c.output(KV.of(word, KV.of(uri, tfIdf)));
                          }
                        }
                      }));

      return wordToUriAndTfIdf;
    }