public int compare(final SAMRecord rec1, final SAMRecord rec2) {
    // Unmapped reads sort after all mapped reads; two unmapped reads compare equal.
    if (rec1.getReadUnmappedFlag()) {
        if (rec2.getReadUnmappedFlag()) return 0;
        else return 1;
    } else if (rec2.getReadUnmappedFlag()) {
        return -1;
    }
    // Negate so that mapped reads are ordered from highest to lowest mapping quality.
    return -SAMUtils.compareMapqs(rec1.getMappingQuality(), rec2.getMappingQuality());
}
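// A minimal usage sketch, assuming the compare(...) above belongs to a
// Comparator<SAMRecord> implementation named MapqDescendingComparator (that class
// name is illustrative, not from the source): mapped reads come first, ordered
// from highest to lowest mapping quality, and unmapped reads sort last.
import htsjdk.samtools.SAMRecord;
import java.util.List;

final class MapqSortSketch {
    static void sortByMapqDescending(final List<SAMRecord> records) {
        records.sort(new MapqDescendingComparator());
    }
}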
public SAMRecord next() {
    SAMRecord cur = it.next();
    if (last != null) verifyRecord(last, cur);
    if (!cur.getReadUnmappedFlag()) last = cur;
    return cur;
}
/** Note: this is the only getKey function that handles unmapped reads specially! */
public static long getKey(final SAMRecord rec) {
    final int refIdx = rec.getReferenceIndex();
    final int start = rec.getAlignmentStart();

    if (!(rec.getReadUnmappedFlag() || refIdx < 0 || start < 0)) return getKey(refIdx, start);

    // Put unmapped reads at the end, but don't give them all the exact same
    // key so that they can be distributed to different reducers.
    //
    // A random number would probably be best, but to ensure that the same
    // record always gets the same key we use a fast hash instead.
    //
    // We avoid using hashCode(), because it's not guaranteed to have the
    // same value across different processes.

    int hash = 0;
    byte[] var;
    if ((var = rec.getVariableBinaryRepresentation()) != null) {
        // Undecoded BAM record: just hash its raw data.
        hash = (int) MurmurHash3.murmurhash3(var, hash);
    } else {
        // Decoded BAM record or any SAM record: hash a few representative
        // fields together.
        hash = (int) MurmurHash3.murmurhash3(rec.getReadName(), hash);
        hash = (int) MurmurHash3.murmurhash3(rec.getReadBases(), hash);
        hash = (int) MurmurHash3.murmurhash3(rec.getBaseQualities(), hash);
        hash = (int) MurmurHash3.murmurhash3(rec.getCigarString(), hash);
    }
    return getKey0(Integer.MAX_VALUE, hash);
}
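// For context, a minimal sketch of the key layout this code appears to assume.
// getKey(int, int) and getKey0(int, int) are not shown in this excerpt, so the
// packing below is an assumption rather than the verified implementation: the
// reference index occupies the high 32 bits and the alignment start (or, for
// unmapped reads, the MurmurHash3 value) the low 32 bits. With Integer.MAX_VALUE
// as the pseudo reference index, unmapped reads sort after every mapped position
// while the hash still spreads them across reducers.
static long getKey0Sketch(final int refIdxOrMax, final int startOrHash) {
    return ((long) refIdxOrMax << 32) | (startOrHash & 0xffffffffL);
}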
private boolean overlaps(SAMRecord r) {
    if (intervals == null
        || (r.getReadUnmappedFlag() && r.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START)) {
        return true;
    }
    if (r.getReadUnmappedFlag()) {
        // special case for unmapped reads with coordinate set
        for (Locatable interval : intervals) {
            if (interval.getStart() <= r.getStart() && interval.getEnd() >= r.getStart()) {
                // This follows the behavior of htsjdk's SamReader which states that
                // "an unmapped read will be returned by this call if it has a coordinate for
                // the purpose of sorting that is in the query region".
                return true;
            }
        }
    }
    final Interval interval = new Interval(r.getContig(), r.getStart(), r.getEnd());
    Collection<Interval> overlaps = overlapDetector.getOverlaps(interval);
    return !overlaps.isEmpty();
}
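// A hedged sketch of how the overlapDetector field could be initialized from the
// same interval list, assuming htsjdk's OverlapDetector; the original construction
// is not shown in this excerpt.
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.OverlapDetector;
import java.util.List;

final class OverlapDetectorSketch {
    static OverlapDetector<Interval> buildDetector(final List<Interval> intervals) {
        // create() indexes the intervals so getOverlaps()/overlapsAny() queries run
        // against an interval tree rather than a linear scan.
        return OverlapDetector.create(intervals);
    }
}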
@Override
public SAMRecordPair getNextReadPair() {
    // Buffer reads by query name and return a pair as soon as both mates have been
    // seen. This streams completed pairs as they appear (much faster than collecting
    // everything first), and completed entries are removed from the buffer so it does
    // not grow without bound.
    if (iterator.hasNext()) {
        while (iterator.hasNext()) {
            SAMRecord record = iterator.next();
            countRead(record);
            // skip if the read is unmapped, not properly paired, or its mate is unmapped
            if (record.getReadUnmappedFlag()
                || !record.getProperPairFlag()
                || record.getMateUnmappedFlag()) {
                continue;
            }
            String query = record.getReadName();
            if (readBuffer.containsKey(query)) {
                // the mate was seen earlier: complete and return the pair
                SAMRecordPair pair = readBuffer.get(query);
                pair.addPair(record);
                if (pair.bothPairsAligned() && pair.isValidPair()) {
                    // prevent a memory leak by deleting keys that are no longer needed
                    readBuffer.remove(query);
                    return pair;
                } else {
                    throw new RuntimeException(query + " is not properly mated");
                }
            } else {
                // first read of the pair: store it under its query name
                SAMRecordPair pair = new SAMRecordPair();
                pair.addPair(record);
                readBuffer.put(query, pair);
            }
        }
    } else {
        if (readBuffer.size() > 0) {
            for (String key : readBuffer.keySet()) {
                logger.info("No mate found for " + key);
            }
            throw new RuntimeException(
                "No mates found for some reads; please make sure all reads are properly paired");
        }
    }
    return null;
}
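// Minimal driver sketch for the method above. SAMRecordPair is the pairing class
// from this codebase (not htsjdk), and processPair(...) is a hypothetical
// placeholder for whatever downstream work consumes a completed pair.
void drainPairs() {
    SAMRecordPair pair;
    // getNextReadPair() returning null signals that the underlying iterator is exhausted.
    while ((pair = getNextReadPair()) != null) {
        processPair(pair);
    }
}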
private boolean isOutOfOrder(final SAMRecord last, final SAMRecord cur) {
    if (last == null || cur.getReadUnmappedFlag()) return false;
    else {
        if (last.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX
            || last.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START)
            throw new UserException.MalformedBAM(
                last,
                String.format("read %s has inconsistent mapping information.", last.format()));
        if (cur.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX
            || cur.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START)
            throw new UserException.MalformedBAM(
                cur, // report the offending record itself, not its predecessor
                String.format("read %s has inconsistent mapping information.", cur.format()));

        return (last.getReferenceIndex() > cur.getReferenceIndex())
            || (last.getReferenceIndex().equals(cur.getReferenceIndex())
                && last.getAlignmentStart() > cur.getAlignmentStart());
    }
}
//
//  Returns a downsampled set of reads for each sample.
//
private List<List<SAMRecord>> getReads(
    List<String> inputFiles, List<Feature> regions, ReAligner realigner) {
    int downsampleTarget = desiredNumberOfReads(regions);
    List<DownsampledReadList> readsList = new ArrayList<DownsampledReadList>();

    for (String input : inputFiles) {
        Set<String> readIds = new HashSet<String>();
        DownsampledReadList reads = new DownsampledReadList(downsampleTarget);
        readsList.add(reads);

        for (Feature region : regions) {
            SAMFileReader reader = new SAMFileReader(new File(input));
            reader.setValidationStringency(ValidationStringency.SILENT);

            Iterator<SAMRecord> iter;
            if (region != null) {
                iter = reader.queryOverlapping(
                    region.getSeqname(), (int) region.getStart(), (int) region.getEnd());
            } else {
                iter = reader.iterator();
            }

            while (iter.hasNext()) {
                SAMRecord read = iter.next();

                // Don't allow same read to be counted twice.
                if ((!realigner.isFiltered(read))
                    && (!read.getDuplicateReadFlag())
                    && (!read.getReadFailsVendorQualityCheckFlag())
                    && (read.getMappingQuality() >= realigner.getMinMappingQuality()
                        || read.getReadUnmappedFlag())
                    && (!readIds.contains(getIdentifier(read)))) {

                    if (read.getReadString().length() > readLength) {
                        reader.close();
                        throw new IllegalArgumentException(
                            "Maximum read length of: " + readLength
                                + " exceeded for: " + read.getSAMString());
                    }

                    readIds.add(getIdentifier(read));
                    reads.add(read);
                }
            }

            if (reads.getTotalReadCount() != reads.getReads().size()) {
                if (isDebug) {
                    System.err.println(
                        "downsampled: " + regions.get(0).getDescriptor() + ": "
                            + reads.getTotalReadCount() + " -> " + reads.getReads().size());
                }
            }

            reader.close();
        }
    }

    List<List<SAMRecord>> sampleReads = new ArrayList<List<SAMRecord>>();
    for (DownsampledReadList downsampledReads : readsList) {
        sampleReads.add(downsampledReads.getReads());
    }
    return sampleReads;
}
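// Side note, sketched under the assumption that a recent htsjdk is on the classpath:
// SAMFileReader is deprecated there, and the equivalent silent-validation open via
// SamReaderFactory would look roughly like this.
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.ValidationStringency;
import java.io.File;

final class SamReaderOpenSketch {
    static SamReader openSilently(final File input) {
        return SamReaderFactory.makeDefault()
            .validationStringency(ValidationStringency.SILENT)
            .open(input);
    }
}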
/**
 * Main method for the program. Checks that all input files are present and readable and that the
 * output file can be written to. Then iterates through all the records, accumulating metrics.
 * Finally, writes the metrics file.
 */
protected int doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);

    final SamReader reader =
        SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);

    final Histogram<Integer> mismatchesHist = new Histogram<Integer>("Predicted", "Mismatches");
    final Histogram<Integer> totalHist = new Histogram<Integer>("Predicted", "Total_Bases");
    final Map<String, Histogram<Integer>> mismatchesByTypeHist =
        new HashMap<String, Histogram<Integer>>();
    final Map<String, Histogram<Integer>> totalByTypeHist =
        new HashMap<String, Histogram<Integer>>();

    // Set up the histograms
    byte[] bases = {'A', 'C', 'G', 'T'};
    for (final byte base : bases) {
        final Histogram<Integer> h = new Histogram<Integer>("Predicted", (char) base + ">");
        mismatchesByTypeHist.put((char) base + ">", h);
        final Histogram<Integer> h2 = new Histogram<Integer>("Predicted", ">" + (char) base);
        mismatchesByTypeHist.put(">" + (char) base, h2);
    }
    for (final byte base : bases) {
        final Histogram<Integer> h = new Histogram<Integer>("Predicted", (char) base + ">");
        totalByTypeHist.put((char) base + ">", h);
        final Histogram<Integer> h2 = new Histogram<Integer>("Predicted", ">" + (char) base);
        totalByTypeHist.put(">" + (char) base, h2);
    }

    for (final SAMRecord record : reader) {
        // Ignore these as we don't know the truth
        if (record.getReadUnmappedFlag() || record.isSecondaryOrSupplementary()) {
            continue;
        }
        final byte[] readBases = record.getReadBases();
        final byte[] readQualities = record.getBaseQualities();
        final byte[] refBases = SequenceUtil.makeReferenceFromAlignment(record, false);

        // We've seen stranger things
        if (readQualities.length != readBases.length) {
            throw new PicardException(
                "Missing Qualities ("
                    + readQualities.length + "," + readBases.length + ") : "
                    + record.getSAMString());
        }
        if (refBases.length != readBases.length) {
            throw new PicardException(
                "The read length did not match the inferred reference length, please check your MD and CIGAR.");
        }

        int cycleIndex; // zero-based
        if (record.getReadNegativeStrandFlag()) {
            cycleIndex = readBases.length - 1 + CYCLE_OFFSET;
        } else {
            cycleIndex = CYCLE_OFFSET;
        }

        for (int i = 0; i < readBases.length; i++) {
            if (-1 == CYCLE || cycleIndex == CYCLE) {
                if ('-' != refBases[i] && '0' != refBases[i]) { // not insertion and not soft-clipped
                    if (!SequenceUtil.basesEqual(readBases[i], refBases[i])) { // mismatch
                        mismatchesHist.increment((int) readQualities[i]);
                        if (SequenceUtil.isValidBase(refBases[i])) {
                            mismatchesByTypeHist
                                .get((char) refBases[i] + ">")
                                .increment((int) readQualities[i]);
                        }
                        if (SequenceUtil.isValidBase(readBases[i])) {
                            mismatchesByTypeHist
                                .get(">" + (char) readBases[i])
                                .increment((int) readQualities[i]);
                        }
                    } else {
                        mismatchesHist.increment(
                            (int) readQualities[i], 0); // to make sure the bin will exist
                    }
                    totalHist.increment((int) readQualities[i]);
                    if (SequenceUtil.isValidBase(readBases[i])) {
                        totalByTypeHist.get(">" + (char) readBases[i]).increment((int) readQualities[i]);
                    }
                    if (SequenceUtil.isValidBase(refBases[i])) {
                        totalByTypeHist.get((char) refBases[i] + ">").increment((int) readQualities[i]);
                    }
                }
            }
            cycleIndex += record.getReadNegativeStrandFlag() ? -1 : 1;
        }
    }
    CloserUtil.close(reader);

    final Histogram<Integer> hist = new Histogram<Integer>("Predicted", "Observed");

    double sumOfSquaresError = 0.0;

    // compute the aggregate phred values
    for (final Integer key : mismatchesHist.keySet()) {
        final double numMismatches = mismatchesHist.get(key).getValue();
        final double numBases = totalHist.get(key).getValue();
        final double phredErr = Math.log10(numMismatches / numBases) * -10.0;
        sumOfSquaresError += (0 == numMismatches) ? 0.0 : (key - phredErr) * (key - phredErr);
        hist.increment(key, phredErr);

        // make sure the bin will exist
        for (final byte base : bases) {
            mismatchesByTypeHist.get(">" + (char) base).increment(key, 0.0);
            mismatchesByTypeHist.get((char) base + ">").increment(key, 0.0);
            totalByTypeHist.get(">" + (char) base).increment(key, 0.0);
            totalByTypeHist.get((char) base + ">").increment(key, 0.0);
        }
    }

    final QualityScoreAccuracyMetrics metrics = new QualityScoreAccuracyMetrics();
    metrics.SUM_OF_SQUARE_ERROR = sumOfSquaresError;

    final MetricsFile<QualityScoreAccuracyMetrics, Integer> out = getMetricsFile();
    out.addMetric(metrics);
    out.addHistogram(hist);

    for (final byte base : bases) {
        // >base : histograms for mismatches *to* the given base
        Histogram<Integer> m = mismatchesByTypeHist.get(">" + (char) base);
        Histogram<Integer> t = totalByTypeHist.get(">" + (char) base);
        Histogram<Integer> h = new Histogram<Integer>(m.getBinLabel(), m.getValueLabel());
        for (final Integer key : m.keySet()) {
            final double numMismatches = m.get(key).getValue();
            final double numBases = t.get(key).getValue();
            final double phredErr = Math.log10(numMismatches / numBases) * -10.0;
            h.increment(key, phredErr);
        }
        out.addHistogram(h);

        // base> : histograms for mismatches *from* the given base
        m = mismatchesByTypeHist.get((char) base + ">");
        t = totalByTypeHist.get((char) base + ">"); // denominator must match the "from" totals (was ">" + base)
        h = new Histogram<Integer>(m.getBinLabel(), m.getValueLabel());
        for (final Integer key : m.keySet()) {
            final double numMismatches = m.get(key).getValue();
            final double numBases = t.get(key).getValue();
            final double phredErr = Math.log10(numMismatches / numBases) * -10.0;
            h.increment(key, phredErr);
        }
        out.addHistogram(h);
    }
    out.addHistogram(mismatchesHist);
    out.addHistogram(totalHist);

    out.write(OUTPUT);

    return 0;
}
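// For reference, the "Observed" value computed above is the standard empirical
// Phred transform of the per-bin mismatch rate; a self-contained restatement of
// that one line:
static double observedPhred(final double mismatches, final double total) {
    // Q_observed = -10 * log10(mismatch rate)
    return -10.0 * Math.log10(mismatches / total);
}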
/**
 * Returns true if we don't think this read is eligible for the BAQ calculation. Examples include
 * non-PF reads, duplicates, or unmapped reads. Used by baqRead to determine if a read should fall
 * through the calculation.
 *
 * @param read the read to evaluate
 * @return true if the read should be excluded from the BAQ calculation
 */
public boolean excludeReadFromBAQ(SAMRecord read) {
    // Keep mapped, vendor-quality-passing, non-duplicate reads, regardless of pairing
    // or primary alignment status.
    return read.getReadUnmappedFlag()
        || read.getReadFailsVendorQualityCheckFlag()
        || read.getDuplicateReadFlag();
}
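// Hedged usage sketch: baqRead is referenced in the javadoc above but its signature
// is not shown in this excerpt, so the single-argument call below is an assumption
// for illustration only.
SAMRecord applyBaqIfEligible(final SAMRecord read) {
    // Ineligible reads fall through untouched; eligible reads get the BAQ treatment.
    return excludeReadFromBAQ(read) ? read : baqRead(read);
}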
@Override
public void execute() {
    log.info("Loading reads...");

    List<SAMRecord> reads = new ArrayList<SAMRecord>();
    for (SAMRecord sr : BAM) {
        if (!sr.getReadUnmappedFlag()) {
            reads.add(sr);
        }
    }
    log.info(" {} reads loaded", reads.size());

    Collections.shuffle(reads);
    log.info(" shuffled");

    DataTable dt = new DataTable(
        "LanderWaterman",
        "Realistic Lander-Waterman stats",
        "reads", "bp_used", "bp_total", "pct_genome");

    for (int numReads : NUM_READS) {
        log.info("Selecting {} reads...", numReads);

        // One coverage mask per contig, initially all false.
        Map<String, boolean[]> utilizationMask = new HashMap<String, boolean[]>();
        for (SAMSequenceRecord ssr : BAM.getFileHeader().getSequenceDictionary().getSequences()) {
            utilizationMask.put(ssr.getSequenceName(), new boolean[ssr.getSequenceLength()]);
        }

        // Mark the positions touched by the first numReads shuffled reads.
        for (int i = 0; i < numReads; i++) {
            SAMRecord read = reads.get(i);
            for (int j = read.getAlignmentStart(); j < read.getAlignmentEnd(); j++) {
                utilizationMask.get(read.getReferenceName())[j] = true;
            }
        }

        // Count covered vs. total positions across all contigs.
        int basesUsed = 0, basesTotal = 0;
        for (String refName : utilizationMask.keySet()) {
            for (int i = 0; i < utilizationMask.get(refName).length; i++) {
                if (utilizationMask.get(refName)[i]) {
                    basesUsed++;
                }
                basesTotal++;
            }
        }

        dt.set("lw" + numReads, "reads", numReads);
        dt.set("lw" + numReads, "bp_used", basesUsed);
        dt.set("lw" + numReads, "bp_total", basesTotal);
        dt.set("lw" + numReads, "pct_genome", (float) basesUsed / (float) basesTotal);

        log.info(
            " reads: {}, bp_used: {}, bp_total: {}, pct_genome: {}",
            numReads, basesUsed, basesTotal, (float) basesUsed / (float) basesTotal);
    }

    out.println(dt);
}
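// Companion sketch, not part of the original command: the classical Lander-Waterman
// model predicts the covered fraction of the genome as 1 - e^(-c), where
// c = numReads * readLength / genomeSize. Comparing this against the empirical
// pct_genome computed above is an assumption about intent, and the parameter names
// here are illustrative.
static double expectedPctGenome(final long numReads, final long readLength, final long genomeSize) {
    final double c = (double) numReads * (double) readLength / (double) genomeSize;
    return 1.0 - Math.exp(-c);
}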