/**
 * Calculates a histogram using the estimateRoi method to estimate the effective yield
 * doing x sequencing for x=1..100.
 */
public Histogram<Double> calculateRoiHistogram() {
    if (ESTIMATED_LIBRARY_SIZE == null) {
        try {
            calculateDerivedMetrics();
            if (ESTIMATED_LIBRARY_SIZE == null) return null;
        }
        catch (IllegalStateException ise) {
            return null;
        }
    }

    long uniquePairs = READ_PAIRS_EXAMINED - READ_PAIR_DUPLICATES;
    Histogram<Double> histo = new Histogram<Double>();

    for (double x = 1; x <= 100; x += 1) {
        histo.increment(x, estimateRoi(ESTIMATED_LIBRARY_SIZE, x, READ_PAIRS_EXAMINED, uniquePairs));
    }

    return histo;
}
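// Context for the loop above: a minimal standalone sketch of the saturation model
// that an estimateRoi implementation is typically based on. The method body is not
// shown in this section, so treat the formula here as an assumption. Given an
// estimated library size L and p read pairs examined, sequencing x times as much is
// expected to observe L * (1 - exp(-x*p/L)) distinct pairs; the ROI is that yield
// relative to the unique pairs already seen. For example, with L = p = 1e6 and
// uniquePairs ~ 632,000 (the 1x expectation), x = 2 gives roughly 1.37, showing the
// diminishing returns of extra sequencing.
static double estimateRoiSketch(final double librarySize, final double x,
                                final double pairsExamined, final double uniquePairs) {
    final double expectedUniquePairsAtX =
            librarySize * (1 - Math.exp(-(x * pairsExamined) / librarySize));
    return expectedUniquePairsAtX / uniquePairs;
}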
Histogram<Integer> getMeanQualityHistogram() {
    final String label = useOriginalQualities ? "MEAN_ORIGINAL_QUALITY" : "MEAN_QUALITY";
    final Histogram<Integer> meanQualities = new Histogram<>("CYCLE", label);

    int firstReadLength = 0;

    for (int cycle = 0; cycle < firstReadTotalsByCycle.length; ++cycle) {
        if (firstReadTotalsByCycle[cycle] > 0) {
            meanQualities.increment(cycle, firstReadTotalsByCycle[cycle] / firstReadCountsByCycle[cycle]);
            firstReadLength = cycle;
        }
    }

    // Second-of-pair cycles are appended after the last populated first-of-pair cycle
    for (int i = 0; i < secondReadTotalsByCycle.length; ++i) {
        if (secondReadCountsByCycle[i] > 0) {
            final int cycle = firstReadLength + i;
            meanQualities.increment(cycle, secondReadTotalsByCycle[i] / secondReadCountsByCycle[i]);
        }
    }

    return meanQualities;
}
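// For context, a sketch of how the per-cycle accumulators read above might be
// populated. The field names mirror getMeanQualityHistogram(), but the fixed array
// length and the simplified record handling are assumptions, not the tool's actual
// implementation (a real implementation would likely also reverse cycle order for
// negative-strand reads and grow its arrays on demand). htsjdk.samtools.SAMRecord
// is assumed to be imported, as elsewhere in this section.
static final class QualityByCycleAccumulatorSketch {
    private static final int MAX_CYCLES = 500; // assumed upper bound on read length

    final double[] firstReadTotalsByCycle = new double[MAX_CYCLES];
    final long[] firstReadCountsByCycle = new long[MAX_CYCLES];
    final double[] secondReadTotalsByCycle = new double[MAX_CYCLES];
    final long[] secondReadCountsByCycle = new long[MAX_CYCLES];

    void addRecord(final SAMRecord rec) {
        final byte[] quals = rec.getBaseQualities();
        // Route to the first- or second-of-pair accumulators
        final boolean second = rec.getReadPairedFlag() && rec.getSecondOfPairFlag();
        final double[] totals = second ? secondReadTotalsByCycle : firstReadTotalsByCycle;
        final long[] counts = second ? secondReadCountsByCycle : firstReadCountsByCycle;
        for (int cycle = 0; cycle < quals.length; ++cycle) {
            totals[cycle] += quals[cycle];
            counts[cycle]++;
        }
    }
}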
@Override
protected int doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);
    IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);

    // Setup all the inputs
    final ProgressLogger progress = new ProgressLogger(log, 10000000, "Processed", "loci");
    final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
    final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);
    final SamLocusIterator iterator = getLocusIterator(in);

    final List<SamRecordFilter> filters = new ArrayList<SamRecordFilter>();
    final CountingFilter dupeFilter = new CountingDuplicateFilter();
    final CountingFilter mapqFilter = new CountingMapQFilter(MINIMUM_MAPPING_QUALITY);
    final CountingPairedFilter pairFilter = new CountingPairedFilter();
    filters.add(mapqFilter);
    filters.add(dupeFilter);
    if (!COUNT_UNPAIRED) {
        filters.add(pairFilter);
    }
    filters.add(new SecondaryAlignmentFilter()); // Not a counting filter because we never want to count reads twice
    iterator.setSamFilters(filters);
    iterator.setEmitUncoveredLoci(true);
    iterator.setMappingQualityScoreCutoff(0); // Handled separately because we want to count bases
    iterator.setQualityScoreCutoff(0);        // Handled separately because we want to count bases
    iterator.setIncludeNonPfReads(false);

    final int max = COVERAGE_CAP;
    final long[] histogramArray = new long[max + 1];
    final long[] baseQHistogramArray = new long[Byte.MAX_VALUE];
    final boolean usingStopAfter = STOP_AFTER > 0;
    final long stopAfter = STOP_AFTER - 1;
    long counter = 0;

    long basesExcludedByBaseq = 0;
    long basesExcludedByOverlap = 0;
    long basesExcludedByCapping = 0;

    // Loop through all the loci
    while (iterator.hasNext()) {
        final SamLocusIterator.LocusInfo info = iterator.next();

        // Check that the reference is not N
        final ReferenceSequence ref = refWalker.get(info.getSequenceIndex());
        final byte base = ref.getBases()[info.getPosition() - 1];
        if (base == 'N') continue;

        // Figure out the coverage while not counting overlapping reads twice,
        // and excluding various things
        final HashSet<String> readNames = new HashSet<String>(info.getRecordAndPositions().size());
        int pileupSize = 0;
        for (final SamLocusIterator.RecordAndOffset recs : info.getRecordAndPositions()) {
            if (recs.getBaseQuality() < MINIMUM_BASE_QUALITY) {
                ++basesExcludedByBaseq;
                continue;
            }
            if (!readNames.add(recs.getRecord().getReadName())) {
                ++basesExcludedByOverlap;
                continue;
            }
            pileupSize++;
            if (pileupSize <= max) {
                baseQHistogramArray[recs.getRecord().getBaseQualities()[recs.getOffset()]]++;
            }
        }

        final int depth = Math.min(readNames.size(), max);
        if (depth < readNames.size()) basesExcludedByCapping += readNames.size() - max;
        histogramArray[depth]++;

        // Record progress and perhaps stop
        progress.record(info.getSequenceName(), info.getPosition());
        if (usingStopAfter && ++counter > stopAfter) break;
    }

    // Construct the coverage histogram
    final Histogram<Integer> histo = new Histogram<Integer>("coverage", "count");
    for (int i = 0; i < histogramArray.length; ++i) {
        histo.increment(i, histogramArray[i]);
    }

    // Construct the base quality histogram
    final Histogram<Integer> baseQHisto = new Histogram<Integer>("value", "baseq_count");
    for (int i = 0; i < baseQHistogramArray.length; ++i) {
        baseQHisto.increment(i, baseQHistogramArray[i]);
    }

    final WgsMetrics metrics = generateWgsMetrics();
    metrics.GENOME_TERRITORY = (long) histo.getSumOfValues();
    metrics.MEAN_COVERAGE = histo.getMean();
    metrics.SD_COVERAGE = histo.getStandardDeviation();
    metrics.MEDIAN_COVERAGE = histo.getMedian();
    metrics.MAD_COVERAGE = histo.getMedianAbsoluteDeviation();

    final long basesExcludedByDupes = getBasesExcludedBy(dupeFilter);
    final long basesExcludedByMapq = getBasesExcludedBy(mapqFilter);
    final long basesExcludedByPairing = getBasesExcludedBy(pairFilter);
    final double total = histo.getSum();
    final double totalWithExcludes = total + basesExcludedByDupes + basesExcludedByMapq
            + basesExcludedByPairing + basesExcludedByBaseq + basesExcludedByOverlap
            + basesExcludedByCapping;

    metrics.PCT_EXC_DUPE = basesExcludedByDupes / totalWithExcludes;
    metrics.PCT_EXC_MAPQ = basesExcludedByMapq / totalWithExcludes;
    metrics.PCT_EXC_UNPAIRED = basesExcludedByPairing / totalWithExcludes;
    metrics.PCT_EXC_BASEQ = basesExcludedByBaseq / totalWithExcludes;
    metrics.PCT_EXC_OVERLAP = basesExcludedByOverlap / totalWithExcludes;
    metrics.PCT_EXC_CAPPED = basesExcludedByCapping / totalWithExcludes;
    metrics.PCT_EXC_TOTAL = (totalWithExcludes - total) / totalWithExcludes;

    metrics.PCT_1X = MathUtil.sum(histogramArray, 1, histogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_5X = MathUtil.sum(histogramArray, 5, histogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_10X = MathUtil.sum(histogramArray, 10, histogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_15X = MathUtil.sum(histogramArray, 15, histogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_20X = MathUtil.sum(histogramArray, 20, histogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_25X = MathUtil.sum(histogramArray, 25, histogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_30X = MathUtil.sum(histogramArray, 30, histogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_40X = MathUtil.sum(histogramArray, 40, histogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_50X = MathUtil.sum(histogramArray, 50, histogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_60X = MathUtil.sum(histogramArray, 60, histogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_70X = MathUtil.sum(histogramArray, 70, histogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_80X = MathUtil.sum(histogramArray, 80, histogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_90X = MathUtil.sum(histogramArray, 90, histogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_100X = MathUtil.sum(histogramArray, 100, histogramArray.length) / (double) metrics.GENOME_TERRITORY;

    final MetricsFile<WgsMetrics, Integer> out = getMetricsFile();
    out.addMetric(metrics);
    out.addHistogram(histo);
    if (INCLUDE_BQ_HISTOGRAM) {
        out.addHistogram(baseQHisto);
    }
    out.write(OUTPUT);

    return 0;
}
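// For clarity, a standalone sketch of what each PCT_nX assignment above computes:
// the fraction of genome territory covered at depth >= n, where depthHistogram[d]
// counts the loci whose capped depth is exactly d. MathUtil.sum above sums the
// equivalent index range. This helper is illustrative, not part of the tool.
static double pctAtLeastSketch(final long[] depthHistogram, final int n, final long genomeTerritory) {
    long lociAtOrAbove = 0;
    for (int d = n; d < depthHistogram.length; ++d) {
        lociAtOrAbove += depthHistogram[d];
    }
    return lociAtOrAbove / (double) genomeTerritory;
}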
/**
 * Main method for the program. Checks that all input files are present and readable and
 * that the output file can be written to. Then iterates through all the records
 * accumulating metrics. Finally writes the metrics file.
 */
protected int doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);

    final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);

    final Histogram<Integer> mismatchesHist = new Histogram<Integer>("Predicted", "Mismatches");
    final Histogram<Integer> totalHist = new Histogram<Integer>("Predicted", "Total_Bases");
    final Map<String, Histogram<Integer>> mismatchesByTypeHist = new HashMap<String, Histogram<Integer>>();
    final Map<String, Histogram<Integer>> totalByTypeHist = new HashMap<String, Histogram<Integer>>();

    // Set up the histograms: one "base>" (mismatches from base) and one ">base"
    // (mismatches to base) histogram per valid base, in both maps
    final byte[] bases = {'A', 'C', 'G', 'T'};
    for (final byte base : bases) {
        mismatchesByTypeHist.put((char) base + ">", new Histogram<Integer>("Predicted", (char) base + ">"));
        mismatchesByTypeHist.put(">" + (char) base, new Histogram<Integer>("Predicted", ">" + (char) base));
        totalByTypeHist.put((char) base + ">", new Histogram<Integer>("Predicted", (char) base + ">"));
        totalByTypeHist.put(">" + (char) base, new Histogram<Integer>("Predicted", ">" + (char) base));
    }

    for (final SAMRecord record : reader) {
        // Ignore these as we don't know the truth
        if (record.getReadUnmappedFlag() || record.isSecondaryOrSupplementary()) {
            continue;
        }
        final byte[] readBases = record.getReadBases();
        final byte[] readQualities = record.getBaseQualities();
        final byte[] refBases = SequenceUtil.makeReferenceFromAlignment(record, false);

        // We've seen stranger things
        if (readQualities.length != readBases.length) {
            throw new PicardException("Missing Qualities ("
                    + readQualities.length + "," + readBases.length + ") : "
                    + record.getSAMString());
        }
        if (refBases.length != readBases.length) {
            throw new PicardException(
                    "The read length did not match the inferred reference length, please check your MD and CIGAR.");
        }

        int cycleIndex; // zero-based
        if (record.getReadNegativeStrandFlag()) {
            cycleIndex = readBases.length - 1 + CYCLE_OFFSET;
        } else {
            cycleIndex = CYCLE_OFFSET;
        }

        for (int i = 0; i < readBases.length; i++) {
            if (-1 == CYCLE || cycleIndex == CYCLE) {
                if ('-' != refBases[i] && '0' != refBases[i]) { // not insertion and not soft-clipped
                    if (!SequenceUtil.basesEqual(readBases[i], refBases[i])) { // mismatch
                        mismatchesHist.increment((int) readQualities[i]);
                        if (SequenceUtil.isValidBase(refBases[i])) {
                            mismatchesByTypeHist.get((char) refBases[i] + ">").increment((int) readQualities[i]);
                        }
                        if (SequenceUtil.isValidBase(readBases[i])) {
                            mismatchesByTypeHist.get(">" + (char) readBases[i]).increment((int) readQualities[i]);
                        }
                    } else {
                        mismatchesHist.increment((int) readQualities[i], 0); // to make sure the bin will exist
                    }
                    totalHist.increment((int) readQualities[i]);
                    if (SequenceUtil.isValidBase(readBases[i])) {
                        totalByTypeHist.get(">" + (char) readBases[i]).increment((int) readQualities[i]);
                    }
                    if (SequenceUtil.isValidBase(refBases[i])) {
                        totalByTypeHist.get((char) refBases[i] + ">").increment((int) readQualities[i]);
                    }
                }
            }
            cycleIndex += record.getReadNegativeStrandFlag() ? -1 : 1;
        }
    }
    CloserUtil.close(reader);

    final Histogram<Integer> hist = new Histogram<Integer>("Predicted", "Observed");
    double sumOfSquaresError = 0.0;

    // compute the aggregate phred values
    for (final Integer key : mismatchesHist.keySet()) {
        final double numMismatches = mismatchesHist.get(key).getValue();
        final double numBases = totalHist.get(key).getValue();
        final double phredErr = Math.log10(numMismatches / numBases) * -10.0;
        sumOfSquaresError += (0 == numMismatches) ? 0.0 : (key - phredErr) * (key - phredErr);
        hist.increment(key, phredErr);

        // make sure the bins will exist in the by-type histograms
        for (final byte base : bases) {
            mismatchesByTypeHist.get(">" + (char) base).increment(key, 0.0);
            mismatchesByTypeHist.get((char) base + ">").increment(key, 0.0);
            totalByTypeHist.get(">" + (char) base).increment(key, 0.0);
            totalByTypeHist.get((char) base + ">").increment(key, 0.0);
        }
    }

    final QualityScoreAccuracyMetrics metrics = new QualityScoreAccuracyMetrics();
    metrics.SUM_OF_SQUARE_ERROR = sumOfSquaresError;

    final MetricsFile<QualityScoreAccuracyMetrics, Integer> out = getMetricsFile();
    out.addMetric(metrics);
    out.addHistogram(hist);
    for (final byte base : bases) {
        // >base : histograms for mismatches *to* the given base
        Histogram<Integer> m = mismatchesByTypeHist.get(">" + (char) base);
        Histogram<Integer> t = totalByTypeHist.get(">" + (char) base);
        Histogram<Integer> h = new Histogram<Integer>(m.getBinLabel(), m.getValueLabel());
        for (final Integer key : m.keySet()) {
            final double numMismatches = m.get(key).getValue();
            final double numBases = t.get(key).getValue();
            final double phredErr = Math.log10(numMismatches / numBases) * -10.0;
            h.increment(key, phredErr);
        }
        out.addHistogram(h);

        // base> : histograms for mismatches *from* the given base; mismatch and
        // total counts must be looked up under the same key
        m = mismatchesByTypeHist.get((char) base + ">");
        t = totalByTypeHist.get((char) base + ">");
        h = new Histogram<Integer>(m.getBinLabel(), m.getValueLabel());
        for (final Integer key : m.keySet()) {
            final double numMismatches = m.get(key).getValue();
            final double numBases = t.get(key).getValue();
            final double phredErr = Math.log10(numMismatches / numBases) * -10.0;
            h.increment(key, phredErr);
        }
        out.addHistogram(h);
    }
    out.addHistogram(mismatchesHist);
    out.addHistogram(totalHist);

    out.write(OUTPUT);
    return 0;
}
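// A standalone sketch of the empirical quality computation used repeatedly above:
// the observed Phred score for a predicted-quality bin is -10 * log10(mismatch rate).
// This helper is illustrative only; unlike the method above it makes the zero-count
// edge cases explicit. Example: 1 mismatch in 1000 aligned bases gives Q30.
static double observedPhredSketch(final long mismatches, final long totalBases) {
    if (totalBases == 0) return Double.NaN;               // no observations in this bin
    if (mismatches == 0) return Double.POSITIVE_INFINITY; // no errors observed
    return -10.0 * Math.log10(mismatches / (double) totalBases);
}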