/** * Compare two records based on their duplicate scores. If the scores are equal, we break ties * based on mapping quality (added to the mate's mapping quality if paired and mapped), then * library/read name. * * <p>If true is given to assumeMateCigar, then any score that can use the mate cigar to to * compute the mate's score will return the score computed on both ends. * * <p>We allow different scoring strategies. We return <0 if rec1 has a better strategy than rec2. */ public static int compare( final SAMRecord rec1, final SAMRecord rec2, final ScoringStrategy scoringStrategy, final boolean assumeMateCigar) { int cmp; // always prefer paired over non-paired if (rec1.getReadPairedFlag() != rec2.getReadPairedFlag()) return rec1.getReadPairedFlag() ? 1 : -1; cmp = computeDuplicateScore(rec2, scoringStrategy, assumeMateCigar) - computeDuplicateScore(rec1, scoringStrategy, assumeMateCigar); /** * Finally, use library ID and read name This is important because we cannot control the order * in which reads appear for reads that are comparable up to now (i.e. cmp == 0). We want to * deterministically choose them, and so we need this. */ if (0 == cmp) cmp = SAMUtils.getCanonicalRecordName(rec1).compareTo(SAMUtils.getCanonicalRecordName(rec2)); return cmp; }
/** * Returns the duplicate score computed from the given fragment. value should be capped by * Short.MAX_VALUE/2 since the score from two reads will be added and an overflow will be * * <p>If true is given to assumeMateCigar, then any score that can use the mate cigar to compute * the mate's score will return the score computed on both ends. */ public static short computeDuplicateScore( final SAMRecord record, final ScoringStrategy scoringStrategy, final boolean assumeMateCigar) { Short storedScore = (Short) record.getTransientAttribute(Attr.DuplicateScore); if (storedScore == null) { short score = 0; switch (scoringStrategy) { case SUM_OF_BASE_QUALITIES: // two (very) long reads worth of high-quality bases can go over Short.MAX_VALUE/2 // and risk overflow. score += (short) Math.min(getSumOfBaseQualities(record), Short.MAX_VALUE / 2); break; case TOTAL_MAPPED_REFERENCE_LENGTH: if (!record.getReadUnmappedFlag()) { // no need to remember the score since this scoring mechanism is symmetric score = (short) Math.min(record.getCigar().getReferenceLength(), Short.MAX_VALUE / 2); } if (assumeMateCigar && record.getReadPairedFlag() && !record.getMateUnmappedFlag()) { score += (short) Math.min( SAMUtils.getMateCigar(record).getReferenceLength(), Short.MAX_VALUE / 2); } break; // The RANDOM score gives the same score to both reads so that they get filtered together. // it's not critical do use the readName since the scores from both ends get added, but it // seem // to be clearer this way. case RANDOM: // start with a random number between Short.MIN_VALUE/4 and Short.MAX_VALUE/4 score += (short) (hasher.hashUnencodedChars(record.getReadName()) & 0b11_1111_1111_1111); // subtract Short.MIN_VALUE/4 from it to end up with a number between // 0 and Short.MAX_VALUE/2. This number can be then discounted in case the read is // not passing filters. 
We need to stay far from overflow so that when we add the two // scores from the two read mates we do not overflow since that could cause us to chose a // failing read-pair instead of a passing one. score -= Short.MIN_VALUE / 4; } // make sure that filter-failing records are heavily discounted. (the discount can happen // twice, once // for each mate, so need to make sure we do not subtract more than Short.MIN_VALUE overall.) score += record.getReadFailsVendorQualityCheckFlag() ? (short) (Short.MIN_VALUE / 2) : 0; storedScore = score; record.setTransientAttribute(Attr.DuplicateScore, storedScore); } return storedScore; }
/**
 * Dispatches one record (and its reference sequence) to the matching collectors.
 *
 * <p>An unpaired record goes only to the unpaired collector. A paired record goes to either the
 * first-of-pair or the second-of-pair collector, depending on its first-of-pair flag, and then
 * also to the pair collector.
 *
 * @param args holder supplying the SAM record and the reference sequence to forward
 */
public void acceptRecord(final SAMRecordAndReference args) {
  final SAMRecord read = args.getSamRecord();
  final ReferenceSequence reference = args.getReferenceSequence();

  if (!read.getReadPairedFlag()) {
    unpairedCollector.addRecord(read, reference);
    return;
  }

  // Paired: record it under its own end first, then under the combined pair category.
  if (read.getFirstOfPairFlag()) {
    firstOfPairCollector.addRecord(read, reference);
  } else {
    secondOfPairCollector.addRecord(read, reference);
  }
  pairCollector.addRecord(read, reference);
}
private static void removeMateInfo(SAMRecord rec) { if (rec.getReadPairedFlag()) { // Remove all information of its mate // flag rec.setProperPairFlag(false); // not paired any more rec.setMateUnmappedFlag(false); rec.setMateNegativeStrandFlag(false); // entries rec.setMateReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX); rec.setMateAlignmentStart(0); rec.setInferredInsertSize(0); // TODO: remove tags and values that are mate pair inclined. } }
private void collectReadData(final SAMRecord record, final ReferenceSequence ref) { metrics.TOTAL_READS++; readLengthHistogram.increment(record.getReadBases().length); if (!record.getReadFailsVendorQualityCheckFlag()) { metrics.PF_READS++; if (isNoiseRead(record)) metrics.PF_NOISE_READS++; if (record.getReadUnmappedFlag()) { // If the read is unmapped see if it's adapter sequence final byte[] readBases = record.getReadBases(); if (!(record instanceof BAMRecord)) StringUtil.toUpperCase(readBases); if (isAdapterSequence(readBases)) { this.adapterReads++; } } else if (doRefMetrics) { metrics.PF_READS_ALIGNED++; if (!record.getReadNegativeStrandFlag()) numPositiveStrand++; if (record.getReadPairedFlag() && !record.getMateUnmappedFlag()) { metrics.READS_ALIGNED_IN_PAIRS++; // Check that both ends have mapq > minimum final Integer mateMq = record.getIntegerAttribute("MQ"); if (mateMq == null || mateMq >= MAPPING_QUALITY_THRESOLD && record.getMappingQuality() >= MAPPING_QUALITY_THRESOLD) { ++this.chimerasDenominator; // With both reads mapped we can see if this pair is chimeric if (Math.abs(record.getInferredInsertSize()) > maxInsertSize || !record.getReferenceIndex().equals(record.getMateReferenceIndex())) { ++this.chimeras; } } } } } }