Пример #1
0
  /**
   * Locates the adaptor base nearest to the given read and reports its reference coordinate.
   *
   * <p>For a forward-strand read this is the first base past the end of the fragment (what Picard
   * calls the 'insert'); for a reverse-strand read it is the first base before the start of the
   * fragment. The two layouts are:
   *
   * <pre>
   * reverse-strand read:  &lt;----------------------| * |---------------------&gt;
   * forward-strand read:  |----------------------&gt; * &lt;----------------------|
   * </pre>
   *
   * <ul>
   *   <li>Reverse strand: the boundary is the mate's alignment start minus one.
   *   <li>Forward strand: the boundary is the read's alignment start plus the absolute inferred
   *       insert size plus one.
   * </ul>
   *
   * <p>A candidate boundary lying more than 8 bases before the read's alignment start, or more
   * than 8 bases after its alignment end, is discarded as implausible.
   *
   * @param read the read being tested for the adaptor boundary
   * @return the reference coordinate of the first base inside the adaptor closest to the read, or
   *     {@code null} when the read is unmapped, its inferred insert size is zero (treated here as
   *     "mate on another contig or unmapped pair"), or the candidate boundary is too far from the
   *     read
   */
  public static Integer getAdaptorBoundary(final SAMRecord read) {
    final int maxAdaptorLength = 8;

    // The inferred insert size is negative when the mate maps before the read, so only its
    // magnitude is used.
    final int fragmentLength = Math.abs(read.getInferredInsertSize());

    // No adaptor can be located for reads whose mate is on another chromosome or for unmapped
    // pairs.
    if (fragmentLength == 0 || read.getReadUnmappedFlag()) {
      return null;
    }

    // NOTE(review): if the inferred insert size is an inclusive span, start + size already points
    // at the first base past the fragment; confirm whether the extra +1 is intended.
    final int candidate =
        read.getReadNegativeStrandFlag()
            ? read.getMateAlignmentStart() - 1 // reverse strand: just before the mate start
            : read.getAlignmentStart() + fragmentLength + 1; // forward strand: past the fragment

    // Be conservative: reject a boundary further from the read than the longest adaptor we
    // believe can occur.
    final boolean plausible =
        candidate >= read.getAlignmentStart() - maxAdaptorLength
            && candidate <= read.getAlignmentEnd() + maxAdaptorLength;

    return plausible ? Integer.valueOf(candidate) : null;
  }
Пример #2
0
 /**
  * Decides whether a record falls inside the queried region of a reference sequence.
  *
  * <p>A bound of {@code 0} for {@code startPos} or {@code endPos} means that side of the region is
  * open. When {@code contained} is true the whole alignment must lie within the bounds; otherwise
  * any overlap with the bounds is enough.
  *
  * @param record the record to test; {@code null} never passes
  * @param sequence the reference name the record must be on
  * @param startPos lower bound of the region, or 0 for no lower bound
  * @param endPos upper bound of the region, or 0 for no upper bound
  * @param contained true to require full containment, false to accept any overlap
  * @return true if the record passes the region filter
  */
 private boolean passesFilter(
     final SAMRecord record,
     final String sequence,
     final int startPos,
     final int endPos,
     final boolean contained) {
   if (record == null || !safeEquals(record.getReferenceName(), sequence)) {
     return false;
   }

   final int alignmentStart = record.getAlignmentStart();
   final int rawAlignmentEnd = record.getAlignmentEnd();

   if (alignmentStart <= 0) {
     // A record without a start position is expected to be flagged unmapped.
     assertTrue(record.getReadUnmappedFlag());
     return false;
   }

   final int alignmentEnd;
   if (rawAlignmentEnd <= 0) {
     // For indexing-only records, treat as single base alignment.
     assertTrue(record.getReadUnmappedFlag());
     alignmentEnd = alignmentStart;
   } else {
     alignmentEnd = rawAlignmentEnd;
   }

   // Containment compares the record's own start against the lower bound and its end against
   // the upper bound; overlap swaps the two edges.
   final int edgeVersusStart = contained ? alignmentStart : alignmentEnd;
   final int edgeVersusEnd = contained ? alignmentEnd : alignmentStart;

   final boolean violatesLowerBound = startPos != 0 && edgeVersusStart < startPos;
   final boolean violatesUpperBound = endPos != 0 && edgeVersusEnd > endPos;
   return !violatesLowerBound && !violatesUpperBound;
 }
Пример #3
0
  /**
   * Returns the duplicate score computed from the given fragment, caching it on the record.
   *
   * <p>Each strategy's contribution is capped at Short.MAX_VALUE/2, since the scores from two
   * reads will be added and an overflow would otherwise be possible.
   *
   * <p>If true is given to assumeMateCigar, then any score that can use the mate cigar to compute
   * the mate's score will return the score computed on both ends.
   *
   * <p>The score is stored as a transient attribute on the record the first time it is computed.
   * Later calls return that stored value as-is, regardless of the {@code scoringStrategy} or
   * {@code assumeMateCigar} passed to them.
   *
   * @param record the record to score
   * @param scoringStrategy how the score is derived
   * @param assumeMateCigar whether the mate's cigar may be used to include the mate's contribution
   * @return the (possibly cached) duplicate score
   */
  public static short computeDuplicateScore(
      final SAMRecord record,
      final ScoringStrategy scoringStrategy,
      final boolean assumeMateCigar) {
    Short storedScore = (Short) record.getTransientAttribute(Attr.DuplicateScore);

    if (storedScore == null) {
      short score = 0;
      switch (scoringStrategy) {
        case SUM_OF_BASE_QUALITIES:
          // two (very) long reads worth of high-quality bases can go over Short.MAX_VALUE/2
          // and risk overflow.
          score += (short) Math.min(getSumOfBaseQualities(record), Short.MAX_VALUE / 2);
          break;
        case TOTAL_MAPPED_REFERENCE_LENGTH:
          if (!record.getReadUnmappedFlag()) {
            // no need to remember the score since this scoring mechanism is symmetric
            score = (short) Math.min(record.getCigar().getReferenceLength(), Short.MAX_VALUE / 2);
          }
          if (assumeMateCigar && record.getReadPairedFlag() && !record.getMateUnmappedFlag()) {
            score +=
                (short)
                    Math.min(
                        SAMUtils.getMateCigar(record).getReferenceLength(), Short.MAX_VALUE / 2);
          }
          break;
        case RANDOM:
          // The RANDOM score gives the same score to both reads so that they get filtered
          // together. It is not critical to use the read name, since the scores from both ends
          // get added, but it seems clearer this way.
          //
          // The 14-bit mask already yields a value between 0 and Short.MAX_VALUE/2 (16383), so no
          // further shift is applied. The previous code also subtracted Short.MIN_VALUE/4, which
          // moved the range to 8192..24575; adding the scores of two mates could then exceed
          // Short.MAX_VALUE and wrap negative, letting a failing pair outrank a passing one.
          // Removing that constant offset keeps the relative order of non-overflowing scores.
          score += (short) (hasher.hashUnencodedChars(record.getReadName()) & 0b11_1111_1111_1111);
          break;
      }

      // make sure that filter-failing records are heavily discounted. (the discount can happen
      // twice, once for each mate, so need to make sure we do not subtract more than
      // Short.MIN_VALUE overall.)
      score += record.getReadFailsVendorQualityCheckFlag() ? (short) (Short.MIN_VALUE / 2) : 0;

      storedScore = score;
      record.setTransientAttribute(Attr.DuplicateScore, storedScore);
    }

    return storedScore;
  }
      /**
       * Updates the per-read counters for one record: total and PF read counts, the read-length
       * histogram, noise and adapter tallies, and (when reference metrics are enabled) the
       * aligned, paired and chimera counters.
       *
       * @param record the record to account for
       * @param ref the reference sequence for the record; not used by this method
       */
      private void collectReadData(final SAMRecord record, final ReferenceSequence ref) {
        metrics.TOTAL_READS++;
        readLengthHistogram.increment(record.getReadBases().length);

        // Everything below applies only to reads that pass the vendor quality filter.
        if (record.getReadFailsVendorQualityCheckFlag()) {
          return;
        }

        metrics.PF_READS++;
        if (isNoiseRead(record)) {
          metrics.PF_NOISE_READS++;
        }

        if (record.getReadUnmappedFlag()) {
          // An unmapped read may simply be adapter sequence.
          final byte[] bases = record.getReadBases();
          if (!(record instanceof BAMRecord)) {
            StringUtil.toUpperCase(bases);
          }
          if (isAdapterSequence(bases)) {
            this.adapterReads++;
          }
          return;
        }

        if (!doRefMetrics) {
          return;
        }

        metrics.PF_READS_ALIGNED++;
        if (!record.getReadNegativeStrandFlag()) {
          numPositiveStrand++;
        }

        final boolean mateIsMapped = record.getReadPairedFlag() && !record.getMateUnmappedFlag();
        if (!mateIsMapped) {
          return;
        }
        metrics.READS_ALIGNED_IN_PAIRS++;

        // Check that both ends have mapq > minimum.
        // NOTE(review): '&&' binds tighter than '||', so when the MQ tag is absent the read's own
        // mapping quality is not checked. The grouping below spells out that existing behaviour;
        // confirm it is the intended one.
        final Integer mateMq = record.getIntegerAttribute("MQ");
        final boolean eligibleForChimeraCheck =
            mateMq == null
                || (mateMq >= MAPPING_QUALITY_THRESOLD
                    && record.getMappingQuality() >= MAPPING_QUALITY_THRESOLD);
        if (!eligibleForChimeraCheck) {
          return;
        }
        ++this.chimerasDenominator;

        // With both reads mapped we can see if this pair is chimeric: either the insert is too
        // large or the two ends sit on different reference sequences.
        final boolean insertTooLarge = Math.abs(record.getInferredInsertSize()) > maxInsertSize;
        if (insertTooLarge
            || !record.getReferenceIndex().equals(record.getMateReferenceIndex())) {
          ++this.chimeras;
        }
      }
      /**
       * Updates base-level quality metrics for one record.
       *
       * <p>Reads that are unmapped, fail the vendor quality filter, or are processed with
       * reference metrics disabled are only scanned for no-calls, which are added to the bad-cycle
       * histogram. All other reads are compared base by base against the reference over their
       * alignment blocks, updating the aligned-base, high-quality, mismatch, bad-cycle and indel
       * counters.
       *
       * @param record the record to examine
       * @param reference the reference sequence the record is aligned to; only read in the aligned
       *     branch
       */
      private void collectQualityData(final SAMRecord record, final ReferenceSequence reference) {
        final boolean negativeStrand = record.getReadNegativeStrandFlag();

        // If the read isn't an aligned PF read then look at the read for no-calls
        if (record.getReadUnmappedFlag()
            || record.getReadFailsVendorQualityCheckFlag()
            || !doRefMetrics) {
          final byte[] readBases = record.getReadBases();
          for (int i = 0; i < readBases.length; i++) {
            if (SequenceUtil.isNoCall(readBases[i])) {
              badCycleHistogram.increment(CoordMath.getCycle(negativeStrand, readBases.length, i));
            }
          }
          return;
        }

        // From here on the read is mapped and passes the vendor filter; the guard above already
        // excluded everything else.
        final boolean highQualityMapping = isHighQualityMapping(record);
        if (highQualityMapping) {
          metrics.PF_HQ_ALIGNED_READS++;
        }

        final byte[] readBases = record.getReadBases();
        final byte[] refBases = reference.getBases();
        final byte[] qualities = record.getBaseQualities();
        final int refLength = refBases.length;
        long mismatchCount = 0;
        long hqMismatchCount = 0;

        for (final AlignmentBlock alignmentBlock : record.getAlignmentBlocks()) {
          final int readIndex = alignmentBlock.getReadStart() - 1;
          final int refIndex = alignmentBlock.getReferenceStart() - 1;
          final int length = alignmentBlock.getLength();

          // Stop at the end of the reference if the block runs past it.
          for (int i = 0; i < length && refIndex + i < refLength; ++i) {
            final int readBaseIndex = readIndex + i;
            final byte readBase = readBases[readBaseIndex];
            final byte refBase = refBases[refIndex + i];

            boolean mismatch = !SequenceUtil.basesEqual(readBase, refBase);
            boolean bisulfiteBase = false;
            if (mismatch && isBisulfiteSequenced) {
              // A ref G read as A on the negative strand, or a ref C read as T on the positive
              // strand, is counted as a bisulfite-converted base rather than a mismatch. Both
              // letter cases are accepted for every base; the lowercase 't' test used to sit
              // outside the strand/reference conjunction, so any mismatching 't' qualified.
              final boolean refGReadA =
                  (refBase == 'G' || refBase == 'g') && (readBase == 'A' || readBase == 'a');
              final boolean refCReadT =
                  (refBase == 'C' || refBase == 'c') && (readBase == 'T' || readBase == 't');
              if (negativeStrand ? refGReadA : refCReadT) {
                bisulfiteBase = true;
                mismatch = false;
              }
            }

            if (mismatch) {
              mismatchCount++;
            }

            metrics.PF_ALIGNED_BASES++;
            if (!bisulfiteBase) {
              nonBisulfiteAlignedBases++;
            }

            if (highQualityMapping) {
              metrics.PF_HQ_ALIGNED_BASES++;
              if (!bisulfiteBase) {
                hqNonBisulfiteAlignedBases++;
              }
              if (qualities[readBaseIndex] >= BASE_QUALITY_THRESHOLD) {
                metrics.PF_HQ_ALIGNED_Q20_BASES++;
              }
              if (mismatch) {
                hqMismatchCount++;
              }
            }

            if (mismatch || SequenceUtil.isNoCall(readBase)) {
              // The cycle is derived from the position within the read, not the offset within
              // the current alignment block; the two differ whenever a block does not start at
              // the first read base.
              badCycleHistogram.increment(
                  CoordMath.getCycle(negativeStrand, readBases.length, readBaseIndex));
            }
          }
        }

        mismatchHistogram.increment(mismatchCount);
        hqMismatchHistogram.increment(hqMismatchCount);

        // Add any insertions and/or deletions to the global count
        for (final CigarElement elem : record.getCigar().getCigarElements()) {
          final CigarOperator op = elem.getOperator();
          if (op == CigarOperator.INSERTION || op == CigarOperator.DELETION) {
            ++this.indels;
          }
        }
      }
Пример #6
0
  /**
   * Computes per-base coverage over the region described by {@code sres} and prints it as
   * {@code position,count} lines (prefixed with {@code name,} when a name is set).
   *
   * <p>Unmapped and duplicate reads are skipped. A base contributes to coverage only when it lies
   * inside the region and its quality is at least {@code MIN_QUALITY}. When {@code outfile} is
   * set, output goes to a working file that is finished after reporting; otherwise the existing
   * {@code ps} stream is used.
   *
   * <p>Errors while reading or reporting are printed to stderr and not rethrown; failure to open
   * the output file terminates the JVM.
   *
   * @param sres the SAM resource providing the region and the record iterator; it is closed once
   *     iteration ends, whether or not iteration succeeded
   */
  public void find_coverage(SAMResource sres) {
    int start_base = sres.region.range.start;
    int end_base = sres.region.range.end;

    int coverage_len = (end_base - start_base) + 1;

    // A freshly allocated int[] is already zero-filled.
    int[] coverage = new int[coverage_len];

    WorkingFile wf = null;
    if (outfile != null) {
      try {
        wf = new WorkingFile(outfile);
        ps = wf.getPrintStream();
      } catch (Exception e) {
        System.err.println("I/O error: " + e); // debug
        e.printStackTrace();
        System.exit(1);
      }
    }

    try {
      //
      //  gather coverage info:
      //
      CloseableIterator<SAMRecord> iterator = sres.get_iterator();
      int read_count = 0;

      // Region-relative extent of all alignment blocks seen, for the debug summary only. A flag
      // marks "nothing seen yet" because -1 is itself a valid region-relative offset. The values
      // stay at -1 when no block is seen, so the summary line is unchanged in that case.
      boolean saw_block = false;
      int ref_min = -1;
      int ref_max = -1;

      try {
        while (iterator.hasNext()) {
          SAMRecord sr = iterator.next();
          read_count++;

          if (sr.getReadUnmappedFlag()) continue;
          if (sr.getDuplicateReadFlag()) {
            if (verbose_mode)
              System.err.println(
                  sr.getReadName()
                      + "."
                      + (sr.getReadNegativeStrandFlag() ? "R" : "F")
                      + " ignoring, duplicate");
            continue;
          }

          byte[] quals = sr.getBaseQualities();

          for (AlignmentBlock ab : sr.getAlignmentBlocks()) {
            int len = ab.getLength();
            int read_i = ab.getReadStart() - 1;
            int ref_i = ab.getReferenceStart() - start_base; // offset into coverage[]

            if (!saw_block || ref_i < ref_min) ref_min = ref_i;

            for (int i = read_i, end = read_i + len; i < end; i++, ref_i++) {
              if (ref_i < 0 || ref_i >= coverage_len) continue; // outside the region

              boolean passes_quality = quals[i] >= MIN_QUALITY;
              if (verbose_mode) {
                System.err.println(
                    sr.getReadName()
                        + "."
                        + (sr.getReadNegativeStrandFlag() ? "R" : "F")
                        + (passes_quality ? " hit at " : " qual_reject at ")
                        + (ref_i + start_base)
                        + " as="
                        + sr.getAlignmentStart()
                        + " ae="
                        + sr.getAlignmentEnd());
              }
              if (passes_quality) coverage[ref_i]++;
            }

            // ref_i now points one past the last base of the block.
            if (!saw_block || ref_i > ref_max) ref_max = ref_i;
            saw_block = true;
          }
        }
      } finally {
        // Release the resource even when reading a record fails part-way through.
        sres.close();
      }

      System.err.println(
          "records:"
              + read_count
              + " ref_min:"
              + (ref_min + start_base)
              + " ref_max:"
              + (ref_max + start_base)); // debug

      //
      //  report coverage info:
      //
      for (int i = 0; i < coverage.length; i++) {
        if (name != null) ps.print(name + ",");
        ps.println((i + start_base) + "," + coverage[i]); // debug
      }
      if (wf != null) wf.finish();

    } catch (Exception e) {
      System.err.println("ERROR: " + e); // debug
      e.printStackTrace();
    }
  }