/** * Finds the adaptor boundary around the read and returns the first base inside the adaptor that * is closest to the read boundary. If the read is in the positive strand, this is the first base * after the end of the fragment (Picard calls it 'insert'), if the read is in the negative * strand, this is the first base before the beginning of the fragment. * * <p>There are two cases we need to treat here: * * <p>1) Our read is in the reverse strand : * * <p><----------------------| * |---------------------> * * <p>in these cases, the adaptor boundary is at the mate start (minus one) * * <p>2) Our read is in the forward strand : * * <p>|----------------------> * <----------------------| * * <p>in these cases the adaptor boundary is at the start of the read plus the inferred insert * size (plus one) * * @param read the read being tested for the adaptor boundary * @return the reference coordinate for the adaptor boundary (effectively the first base IN the * adaptor, closest to the read. NULL if the read is unmapped or the mate is mapped to another * contig. */ public static Integer getAdaptorBoundary(final SAMRecord read) { final int MAXIMUM_ADAPTOR_LENGTH = 8; final int insertSize = Math.abs( read .getInferredInsertSize()); // the inferred insert size can be negative if the mate // is mapped before the read (so we take the absolute // value) if (insertSize == 0 || read .getReadUnmappedFlag()) // no adaptors in reads with mates in another chromosome or // unmapped pairs return null; Integer adaptorBoundary; // the reference coordinate for the adaptor boundary (effectively the first // base IN the adaptor, closest to the read) if (read.getReadNegativeStrandFlag()) adaptorBoundary = read.getMateAlignmentStart() - 1; // case 1 (see header) else adaptorBoundary = read.getAlignmentStart() + insertSize + 1; // case 2 (see header) if ((adaptorBoundary < read.getAlignmentStart() - MAXIMUM_ADAPTOR_LENGTH) || (adaptorBoundary > read.getAlignmentEnd() + MAXIMUM_ADAPTOR_LENGTH)) adaptorBoundary = null; // we are being conservative by not allowing the adaptor boundary to go beyond what // we belive is the maximum size of an adaptor return adaptorBoundary; }
private boolean passesFilter( final SAMRecord record, final String sequence, final int startPos, final int endPos, final boolean contained) { if (record == null) { return false; } if (!safeEquals(record.getReferenceName(), sequence)) { return false; } final int alignmentStart = record.getAlignmentStart(); int alignmentEnd = record.getAlignmentEnd(); if (alignmentStart <= 0) { assertTrue(record.getReadUnmappedFlag()); return false; } if (alignmentEnd <= 0) { // For indexing-only records, treat as single base alignment. assertTrue(record.getReadUnmappedFlag()); alignmentEnd = alignmentStart; } if (contained) { if (startPos != 0 && alignmentStart < startPos) { return false; } if (endPos != 0 && alignmentEnd > endPos) { return false; } } else { if (startPos != 0 && alignmentEnd < startPos) { return false; } if (endPos != 0 && alignmentStart > endPos) { return false; } } return true; }
/** * Returns the duplicate score computed from the given fragment. value should be capped by * Short.MAX_VALUE/2 since the score from two reads will be added and an overflow will be * * <p>If true is given to assumeMateCigar, then any score that can use the mate cigar to compute * the mate's score will return the score computed on both ends. */ public static short computeDuplicateScore( final SAMRecord record, final ScoringStrategy scoringStrategy, final boolean assumeMateCigar) { Short storedScore = (Short) record.getTransientAttribute(Attr.DuplicateScore); if (storedScore == null) { short score = 0; switch (scoringStrategy) { case SUM_OF_BASE_QUALITIES: // two (very) long reads worth of high-quality bases can go over Short.MAX_VALUE/2 // and risk overflow. score += (short) Math.min(getSumOfBaseQualities(record), Short.MAX_VALUE / 2); break; case TOTAL_MAPPED_REFERENCE_LENGTH: if (!record.getReadUnmappedFlag()) { // no need to remember the score since this scoring mechanism is symmetric score = (short) Math.min(record.getCigar().getReferenceLength(), Short.MAX_VALUE / 2); } if (assumeMateCigar && record.getReadPairedFlag() && !record.getMateUnmappedFlag()) { score += (short) Math.min( SAMUtils.getMateCigar(record).getReferenceLength(), Short.MAX_VALUE / 2); } break; // The RANDOM score gives the same score to both reads so that they get filtered together. // it's not critical do use the readName since the scores from both ends get added, but it // seem // to be clearer this way. case RANDOM: // start with a random number between Short.MIN_VALUE/4 and Short.MAX_VALUE/4 score += (short) (hasher.hashUnencodedChars(record.getReadName()) & 0b11_1111_1111_1111); // subtract Short.MIN_VALUE/4 from it to end up with a number between // 0 and Short.MAX_VALUE/2. This number can be then discounted in case the read is // not passing filters. We need to stay far from overflow so that when we add the two // scores from the two read mates we do not overflow since that could cause us to chose a // failing read-pair instead of a passing one. score -= Short.MIN_VALUE / 4; } // make sure that filter-failing records are heavily discounted. (the discount can happen // twice, once // for each mate, so need to make sure we do not subtract more than Short.MIN_VALUE overall.) score += record.getReadFailsVendorQualityCheckFlag() ? (short) (Short.MIN_VALUE / 2) : 0; storedScore = score; record.setTransientAttribute(Attr.DuplicateScore, storedScore); } return storedScore; }
private void collectReadData(final SAMRecord record, final ReferenceSequence ref) { metrics.TOTAL_READS++; readLengthHistogram.increment(record.getReadBases().length); if (!record.getReadFailsVendorQualityCheckFlag()) { metrics.PF_READS++; if (isNoiseRead(record)) metrics.PF_NOISE_READS++; if (record.getReadUnmappedFlag()) { // If the read is unmapped see if it's adapter sequence final byte[] readBases = record.getReadBases(); if (!(record instanceof BAMRecord)) StringUtil.toUpperCase(readBases); if (isAdapterSequence(readBases)) { this.adapterReads++; } } else if (doRefMetrics) { metrics.PF_READS_ALIGNED++; if (!record.getReadNegativeStrandFlag()) numPositiveStrand++; if (record.getReadPairedFlag() && !record.getMateUnmappedFlag()) { metrics.READS_ALIGNED_IN_PAIRS++; // Check that both ends have mapq > minimum final Integer mateMq = record.getIntegerAttribute("MQ"); if (mateMq == null || mateMq >= MAPPING_QUALITY_THRESOLD && record.getMappingQuality() >= MAPPING_QUALITY_THRESOLD) { ++this.chimerasDenominator; // With both reads mapped we can see if this pair is chimeric if (Math.abs(record.getInferredInsertSize()) > maxInsertSize || !record.getReferenceIndex().equals(record.getMateReferenceIndex())) { ++this.chimeras; } } } } } }
private void collectQualityData(final SAMRecord record, final ReferenceSequence reference) { // If the read isnt an aligned PF read then look at the read for no-calls if (record.getReadUnmappedFlag() || record.getReadFailsVendorQualityCheckFlag() || !doRefMetrics) { final byte[] readBases = record.getReadBases(); for (int i = 0; i < readBases.length; i++) { if (SequenceUtil.isNoCall(readBases[i])) { badCycleHistogram.increment( CoordMath.getCycle(record.getReadNegativeStrandFlag(), readBases.length, i)); } } } else if (!record.getReadFailsVendorQualityCheckFlag()) { final boolean highQualityMapping = isHighQualityMapping(record); if (highQualityMapping) metrics.PF_HQ_ALIGNED_READS++; final byte[] readBases = record.getReadBases(); final byte[] refBases = reference.getBases(); final byte[] qualities = record.getBaseQualities(); final int refLength = refBases.length; long mismatchCount = 0; long hqMismatchCount = 0; for (final AlignmentBlock alignmentBlock : record.getAlignmentBlocks()) { final int readIndex = alignmentBlock.getReadStart() - 1; final int refIndex = alignmentBlock.getReferenceStart() - 1; final int length = alignmentBlock.getLength(); for (int i = 0; i < length && refIndex + i < refLength; ++i) { final int readBaseIndex = readIndex + i; boolean mismatch = !SequenceUtil.basesEqual(readBases[readBaseIndex], refBases[refIndex + i]); boolean bisulfiteBase = false; if (mismatch && isBisulfiteSequenced) { if ((record.getReadNegativeStrandFlag() && (refBases[refIndex + i] == 'G' || refBases[refIndex + i] == 'g') && (readBases[readBaseIndex] == 'A' || readBases[readBaseIndex] == 'a')) || ((!record.getReadNegativeStrandFlag()) && (refBases[refIndex + i] == 'C' || refBases[refIndex + i] == 'c') && (readBases[readBaseIndex] == 'T') || readBases[readBaseIndex] == 't')) { bisulfiteBase = true; mismatch = false; } } if (mismatch) mismatchCount++; metrics.PF_ALIGNED_BASES++; if (!bisulfiteBase) nonBisulfiteAlignedBases++; if (highQualityMapping) { metrics.PF_HQ_ALIGNED_BASES++; if (!bisulfiteBase) hqNonBisulfiteAlignedBases++; if (qualities[readBaseIndex] >= BASE_QUALITY_THRESHOLD) metrics.PF_HQ_ALIGNED_Q20_BASES++; if (mismatch) hqMismatchCount++; } if (mismatch || SequenceUtil.isNoCall(readBases[readBaseIndex])) { badCycleHistogram.increment( CoordMath.getCycle(record.getReadNegativeStrandFlag(), readBases.length, i)); } } } mismatchHistogram.increment(mismatchCount); hqMismatchHistogram.increment(hqMismatchCount); // Add any insertions and/or deletions to the global count for (final CigarElement elem : record.getCigar().getCigarElements()) { final CigarOperator op = elem.getOperator(); if (op == CigarOperator.INSERTION || op == CigarOperator.DELETION) ++this.indels; } } }
public void find_coverage(SAMResource sres) { int start_base = sres.region.range.start; int end_base = sres.region.range.end; int coverage_len = (end_base - start_base) + 1; int i, end, ref_i, read_i, len; int[] coverage = new int[coverage_len]; Arrays.fill(coverage, 0); WorkingFile wf = null; if (outfile != null) { try { wf = new WorkingFile(outfile); ps = wf.getPrintStream(); } catch (Exception e) { System.err.println("I/O error: " + e); // debug e.printStackTrace(); System.exit(1); } } try { // // gather coverage info: // CloseableIterator<SAMRecord> iterator = sres.get_iterator(); int read_count = 0; int ref_min = -1; int ref_max = -1; while (iterator.hasNext()) { SAMRecord sr = iterator.next(); read_count++; // System.err.println(sr.getReadName() + ": " + sr.getAlignmentStart() + "-" + // sr.getAlignmentEnd()); // debug if (sr.getReadUnmappedFlag()) continue; if (sr.getDuplicateReadFlag()) { if (verbose_mode) System.err.println( sr.getReadName() + "." + (sr.getReadNegativeStrandFlag() ? "R" : "F") + " ignoring, duplicate"); continue; } byte[] read = sr.getReadBases(); byte[] quals = sr.getBaseQualities(); for (AlignmentBlock ab : sr.getAlignmentBlocks()) { len = ab.getLength(); read_i = ab.getReadStart() - 1; ref_i = ab.getReferenceStart() - start_base; if (ref_min == -1 || ref_i < ref_min) ref_min = ref_i; for (i = read_i, end = read_i + len; i < end; i++, ref_i++) { if (ref_i >= 0 && ref_i < coverage_len) { if (quals[i] >= MIN_QUALITY) { if (verbose_mode) System.err.println( sr.getReadName() + "." + (sr.getReadNegativeStrandFlag() ? "R" : "F") + " hit at " + (ref_i + start_base) + " as=" + sr.getAlignmentStart() + " ae=" + sr.getAlignmentEnd()); coverage[ref_i]++; } else if (verbose_mode) { System.err.println( sr.getReadName() + "." + (sr.getReadNegativeStrandFlag() ? "R" : "F") + " qual_reject at " + (ref_i + start_base) + " as=" + sr.getAlignmentStart() + " ae=" + sr.getAlignmentEnd()); } } } if (ref_max == -1 || ref_i > ref_max) ref_max = ref_i; } } sres.close(); System.err.println( "records:" + read_count + " ref_min:" + (ref_min + start_base) + " ref_max:" + (ref_max + start_base)); // debug // // report coverage info: // for (i = 0; i < coverage.length; i++) { if (name != null) ps.print(name + ","); ps.println((i + start_base) + "," + coverage[i]); // debug } if (wf != null) wf.finish(); } catch (Exception e) { System.err.println("ERROR: " + e); // debug e.printStackTrace(); } }