Esempio n. 1
0
  @Override
  protected int doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);
    IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);

    // Setup all the inputs
    final ProgressLogger progress = new ProgressLogger(log, 10000000, "Processed", "loci");
    final ReferenceSequenceFileWalker refWalker =
        new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
    final SamReader in =
        SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);
    final SamLocusIterator iterator = getLocusIterator(in);

    final List<SamRecordFilter> filters = new ArrayList<SamRecordFilter>();
    final CountingFilter dupeFilter = new CountingDuplicateFilter();
    final CountingFilter mapqFilter = new CountingMapQFilter(MINIMUM_MAPPING_QUALITY);
    final CountingPairedFilter pairFilter = new CountingPairedFilter();
    filters.add(mapqFilter);
    filters.add(dupeFilter);
    if (!COUNT_UNPAIRED) {
      filters.add(pairFilter);
    }
    filters.add(
        new SecondaryAlignmentFilter()); // Not a counting filter because we never want to count
                                         // reads twice
    iterator.setSamFilters(filters);
    iterator.setEmitUncoveredLoci(true);
    iterator.setMappingQualityScoreCutoff(0); // Handled separately because we want to count bases
    iterator.setQualityScoreCutoff(0); // Handled separately because we want to count bases
    iterator.setIncludeNonPfReads(false);

    final int max = COVERAGE_CAP;
    final long[] HistogramArray = new long[max + 1];
    final long[] baseQHistogramArray = new long[Byte.MAX_VALUE];
    final boolean usingStopAfter = STOP_AFTER > 0;
    final long stopAfter = STOP_AFTER - 1;
    long counter = 0;

    long basesExcludedByBaseq = 0;
    long basesExcludedByOverlap = 0;
    long basesExcludedByCapping = 0;

    // Loop through all the loci
    while (iterator.hasNext()) {
      final SamLocusIterator.LocusInfo info = iterator.next();

      // Check that the reference is not N
      final ReferenceSequence ref = refWalker.get(info.getSequenceIndex());
      final byte base = ref.getBases()[info.getPosition() - 1];
      if (base == 'N') continue;

      // Figure out the coverage while not counting overlapping reads twice, and excluding various
      // things
      final HashSet<String> readNames = new HashSet<String>(info.getRecordAndPositions().size());
      int pileupSize = 0;
      for (final SamLocusIterator.RecordAndOffset recs : info.getRecordAndPositions()) {

        if (recs.getBaseQuality() < MINIMUM_BASE_QUALITY) {
          ++basesExcludedByBaseq;
          continue;
        }
        if (!readNames.add(recs.getRecord().getReadName())) {
          ++basesExcludedByOverlap;
          continue;
        }
        pileupSize++;
        if (pileupSize <= max) {
          baseQHistogramArray[recs.getRecord().getBaseQualities()[recs.getOffset()]]++;
        }
      }

      final int depth = Math.min(readNames.size(), max);
      if (depth < readNames.size()) basesExcludedByCapping += readNames.size() - max;
      HistogramArray[depth]++;

      // Record progress and perhaps stop
      progress.record(info.getSequenceName(), info.getPosition());
      if (usingStopAfter && ++counter > stopAfter) break;
    }

    // Construct and write the outputs
    final Histogram<Integer> histo = new Histogram<Integer>("coverage", "count");
    for (int i = 0; i < HistogramArray.length; ++i) {
      histo.increment(i, HistogramArray[i]);
    }

    // Construct and write the outputs
    final Histogram<Integer> baseQHisto = new Histogram<Integer>("value", "baseq_count");
    for (int i = 0; i < baseQHistogramArray.length; ++i) {
      baseQHisto.increment(i, baseQHistogramArray[i]);
    }

    final WgsMetrics metrics = generateWgsMetrics();
    metrics.GENOME_TERRITORY = (long) histo.getSumOfValues();
    metrics.MEAN_COVERAGE = histo.getMean();
    metrics.SD_COVERAGE = histo.getStandardDeviation();
    metrics.MEDIAN_COVERAGE = histo.getMedian();
    metrics.MAD_COVERAGE = histo.getMedianAbsoluteDeviation();

    final long basesExcludedByDupes = getBasesExcludedBy(dupeFilter);
    final long basesExcludedByMapq = getBasesExcludedBy(mapqFilter);
    final long basesExcludedByPairing = getBasesExcludedBy(pairFilter);
    final double total = histo.getSum();
    final double totalWithExcludes =
        total
            + basesExcludedByDupes
            + basesExcludedByMapq
            + basesExcludedByPairing
            + basesExcludedByBaseq
            + basesExcludedByOverlap
            + basesExcludedByCapping;
    metrics.PCT_EXC_DUPE = basesExcludedByDupes / totalWithExcludes;
    metrics.PCT_EXC_MAPQ = basesExcludedByMapq / totalWithExcludes;
    metrics.PCT_EXC_UNPAIRED = basesExcludedByPairing / totalWithExcludes;
    metrics.PCT_EXC_BASEQ = basesExcludedByBaseq / totalWithExcludes;
    metrics.PCT_EXC_OVERLAP = basesExcludedByOverlap / totalWithExcludes;
    metrics.PCT_EXC_CAPPED = basesExcludedByCapping / totalWithExcludes;
    metrics.PCT_EXC_TOTAL = (totalWithExcludes - total) / totalWithExcludes;

    metrics.PCT_1X =
        MathUtil.sum(HistogramArray, 1, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_5X =
        MathUtil.sum(HistogramArray, 5, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_10X =
        MathUtil.sum(HistogramArray, 10, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_15X =
        MathUtil.sum(HistogramArray, 15, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_20X =
        MathUtil.sum(HistogramArray, 20, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_25X =
        MathUtil.sum(HistogramArray, 25, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_30X =
        MathUtil.sum(HistogramArray, 30, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_40X =
        MathUtil.sum(HistogramArray, 40, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_50X =
        MathUtil.sum(HistogramArray, 50, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_60X =
        MathUtil.sum(HistogramArray, 60, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_70X =
        MathUtil.sum(HistogramArray, 70, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_80X =
        MathUtil.sum(HistogramArray, 80, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_90X =
        MathUtil.sum(HistogramArray, 90, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_100X =
        MathUtil.sum(HistogramArray, 100, HistogramArray.length)
            / (double) metrics.GENOME_TERRITORY;

    final MetricsFile<WgsMetrics, Integer> out = getMetricsFile();
    out.addMetric(metrics);
    out.addHistogram(histo);
    if (INCLUDE_BQ_HISTOGRAM) {
      out.addHistogram(baseQHisto);
    }
    out.write(OUTPUT);

    return 0;
  }