Example #1
 public static String encodeBQTag(SAMRecord read, byte[] baq) {
   // Offset to base alignment quality (BAQ), of the same length as the read sequence.
   // At the i-th read base, BAQi = Qi - (BQi - 64) where Qi is the i-th base quality.
   // so BQi = Qi - BAQi + 64
   byte[] bqTag = new byte[baq.length];
   for (int i = 0; i < bqTag.length; i++) {
     final int bq = (int) read.getBaseQualities()[i] + 64;
     final int baq_i = (int) baq[i];
     final int tag = bq - baq_i;
      // a negative tag means the BAQ value exceeded the base quality: an error in our
      // calculation of the correction factor, i.e. our problem
     if (tag < 0)
       throw new ReviewedGATKException(
           "BAQ tag calculation error.  BAQ value above base quality at " + read);
     // the original quality is too high, almost certainly due to using the wrong encoding in the
     // BAM file
     if (tag > Byte.MAX_VALUE)
       throw new UserException.MisencodedBAM(
           read,
           "we encountered an extremely high quality score ("
               + (int) read.getBaseQualities()[i]
               + ") with BAQ correction factor of "
               + baq_i);
     bqTag[i] = (byte) tag;
   }
   return new String(bqTag);
 }
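Since BQi = Qi - BAQi + 64, decoding the tag is just the inverse: BAQi = Qi - (BQi - 64). A minimal round-trip sketch over plain byte arrays (the quality and BAQ values are made up for illustration; no SAMRecord needed):

 public class BQTagRoundTrip {
   public static void main(String[] args) {
     byte[] quals = {30, 31, 29}; // hypothetical base qualities Qi
     byte[] baq = {2, 0, 5};      // hypothetical BAQ values BAQi
     byte[] bqTag = new byte[quals.length];
     for (int i = 0; i < quals.length; i++) {
       bqTag[i] = (byte) (quals[i] - baq[i] + 64); // BQi = Qi - BAQi + 64
     }
     for (int i = 0; i < quals.length; i++) {
       int recovered = quals[i] - (bqTag[i] - 64); // BAQi = Qi - (BQi - 64)
       System.out.printf("BAQ[%d] recovered = %d (expected %d)%n", i, recovered, baq[i]);
     }
   }
 }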
Example #2
  /**
   * Returns a new qual array for read that includes the BAQ adjustment. Does not support on-the-fly
   * BAQ calculation.
   *
   * @param read the SAMRecord to operate on
   * @param overwriteOriginalQuals if true, we replace the original quality scores in the read with
   *     their BAQ'd version
   * @param useRawQualsIfNoBAQTag if true and there is no BAQ annotation, we just use the raw
   *     quality scores; if false and no BAQ tag is present, an IllegalStateException is thrown
   * @return the BAQ-adjusted quality array
   */
  public static byte[] calcBAQFromTag(
      SAMRecord read, boolean overwriteOriginalQuals, boolean useRawQualsIfNoBAQTag) {
    byte[] rawQuals = read.getBaseQualities();
    byte[] newQuals = rawQuals;
    byte[] baq = getBAQTag(read);

    if (baq != null) {
      // Offset to base alignment quality (BAQ), of the same length as the read sequence.
      // At the i-th read base, BAQi = Qi - (BQi - 64) where Qi is the i-th base quality.
      newQuals = overwriteOriginalQuals ? rawQuals : new byte[rawQuals.length];
      for (int i = 0; i < rawQuals.length; i++) {
        int rawQual = (int) rawQuals[i];
        int baq_delta = (int) baq[i] - 64;
        int newval = rawQual - baq_delta;
        if (newval < 0)
          throw new UserException.MalformedBAM(
              read, "BAQ tag error: the BAQ value is larger than the base quality");
        newQuals[i] = (byte) newval;
      }
    } else if (!useRawQualsIfNoBAQTag) {
      throw new IllegalStateException(
          "Required BAQ tag to be present, but none was on read " + read.getReadName());
    }

    return newQuals;
  }
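getBAQTag is not shown in this snippet, but since encodeBQTag above returns a String, the tag presumably lives on the read as a string attribute. A plausible sketch (the BAQ_TAG constant and the string storage are assumptions inferred from encodeBQTag):

  // Plausible helper sketch: fetch the BQ string attribute from the read, if any.
  public static byte[] getBAQTag(SAMRecord read) {
    final String s = read.getStringAttribute(BAQ_TAG); // BAQ_TAG is assumed, e.g. "BQ"
    return s != null ? s.getBytes() : null;
  }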
Example #3
  /** Note: this is the only getKey function that handles unmapped reads specially! */
  public static long getKey(final SAMRecord rec) {
    final int refIdx = rec.getReferenceIndex();
    final int start = rec.getAlignmentStart();

    if (!(rec.getReadUnmappedFlag() || refIdx < 0 || start < 0)) return getKey(refIdx, start);

    // Put unmapped reads at the end, but don't give them all the exact same
    // key so that they can be distributed to different reducers.
    //
    // A random number would probably be best, but to ensure that the same
    // record always gets the same key we use a fast hash instead.
    //
    // We avoid using hashCode(), because it's not guaranteed to have the
    // same value across different processes.

    int hash = 0;
    byte[] var;
    if ((var = rec.getVariableBinaryRepresentation()) != null) {
      // Undecoded BAM record: just hash its raw data.
      hash = (int) MurmurHash3.murmurhash3(var, hash);
    } else {
      // Decoded BAM record or any SAM record: hash a few representative
      // fields together.
      hash = (int) MurmurHash3.murmurhash3(rec.getReadName(), hash);
      hash = (int) MurmurHash3.murmurhash3(rec.getReadBases(), hash);
      hash = (int) MurmurHash3.murmurhash3(rec.getBaseQualities(), hash);
      hash = (int) MurmurHash3.murmurhash3(rec.getCigarString(), hash);
    }
    return getKey0(Integer.MAX_VALUE, hash);
  }
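getKey(refIdx, start) and getKey0 are not shown; a plausible sketch, assuming the key packs the reference index into the high 32 bits and the position into the low 32 so that keys sort by (reference, position):

  // Plausible sketch (assumption): pack (refIdx, pos) into one sortable long key.
  public static long getKey(final int refIdx, final int pos) {
    return ((long) refIdx << 32) | (pos & 0xFFFFFFFFL);
  }

Under that reading, getKey0(Integer.MAX_VALUE, hash) places unmapped reads after every mapped key while still spreading them across reducers by hash.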
Example #4
  /**
   * @param read a read containing the variant
   * @return the number of hard clipped and low qual bases at the read start (where start is the
   *     leftmost end w.r.t. the reference)
   */
  public static int getNumClippedBasesAtStart(final SAMRecord read) {
    // check for hard clips (never consider these bases):
    final Cigar c = read.getCigar();
    final CigarElement first = c.getCigarElement(0);

    int numStartClippedBases = 0;
    if (first.getOperator() == CigarOperator.H) {
      numStartClippedBases = first.getLength();
    }
    final byte[] unclippedReadBases = read.getReadBases();
    final byte[] unclippedReadQuals = read.getBaseQualities();

    // Do a stricter base clipping than provided by the CIGAR string, since that one may be too
    // conservative and may leave a string of Q2 bases still hanging off the reads.
    // TODO: this code may not even get used because HaplotypeCaller already hard clips
    // low-quality tails
    for (int i = numStartClippedBases; i < unclippedReadBases.length; i++) {
      if (unclippedReadQuals[i] < PairHMMIndelErrorModel.BASE_QUAL_THRESHOLD)
        numStartClippedBases++;
      else break;
    }

    return numStartClippedBases;
  }
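For symmetry, a hypothetical end-of-read counterpart (not in the source; the name getNumClippedBasesAtEnd is an assumption) would inspect the last CIGAR element and scan backwards:

  // Hypothetical mirror of the method above: hard clips plus low-qual bases at the read end.
  public static int getNumClippedBasesAtEnd(final SAMRecord read) {
    final Cigar c = read.getCigar();
    final CigarElement last = c.getCigarElement(c.numCigarElements() - 1);

    int numEndClippedBases = 0;
    if (last.getOperator() == CigarOperator.H) {
      numEndClippedBases = last.getLength();
    }
    final byte[] quals = read.getBaseQualities();
    for (int i = quals.length - numEndClippedBases - 1; i >= 0; i--) {
      if (quals[i] < PairHMMIndelErrorModel.BASE_QUAL_THRESHOLD) numEndClippedBases++;
      else break;
    }
    return numEndClippedBases;
  }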
Example #5
  /**
   * HACK TO CREATE A GATKSAMRECORD BASED ONLY ON A SAMRECORD, FOR TESTING PURPOSES ONLY
   *
   * @param read the SAMRecord to copy
   */
  public GATKSAMRecord(final SAMRecord read) {
    super(read.getHeader());
    super.setReferenceIndex(read.getReferenceIndex());
    super.setAlignmentStart(read.getAlignmentStart());
    super.setReadName(read.getReadName());
    super.setMappingQuality(read.getMappingQuality());
    // indexing bin done below
    super.setCigar(read.getCigar());
    super.setFlags(read.getFlags());
    super.setMateReferenceIndex(read.getMateReferenceIndex());
    super.setMateAlignmentStart(read.getMateAlignmentStart());
    super.setInferredInsertSize(read.getInferredInsertSize());
    SAMReadGroupRecord samRG = read.getReadGroup();
    SAMBinaryTagAndValue samAttr = GATKBin.getReadBinaryAttributes(read);
    if (samAttr == null) {
      clearAttributes();
    } else {
      setAttributes(samAttr);
    }
    if (samRG != null) {
      GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(samRG);
      setReadGroup(rg);
    }

    super.setFileSource(read.getFileSource());
    super.setCigarString(read.getCigarString());
    super.setReadBases(read.getReadBases());
    super.setBaseQualities(read.getBaseQualities());
    // From SAMRecord constructor: Do this after the above because setCigarString will clear it.
    GATKBin.setReadIndexingBin(this, GATKBin.getReadIndexingBin(read));
  }
Example #6
  /**
   * Modifies read in place so that the base quality scores are capped by the BAQ calculation. Uses
   * the BAQ tag if one is already present and calculationType is not RECALCULATE, otherwise fires
   * up the HMM and does the BAQ on the fly, using the refReader to obtain the reference bases as
   * needed.
   *
   * @param read the read to BAQ; modified in place depending on qmode
   * @param refReader the indexed reference used by the HMM to fetch reference bases
   * @param calculationType whether BAQ is off, computed as necessary, or always recalculated
   * @param qmode what to do with the result: add the BQ tag, overwrite the quals, or leave the
   *     read unmodified
   * @return BQ qualities for use, in case qmode is DONT_MODIFY
   */
  public byte[] baqRead(
      SAMRecord read,
      IndexedFastaSequenceFile refReader,
      CalculationMode calculationType,
      QualityMode qmode) {
    if (DEBUG) System.out.printf("BAQ %s read %s%n", calculationType, read.getReadName());

    byte[] BAQQuals =
        read.getBaseQualities(); // in general we are overwriting quals, so just get a pointer to them
    if (calculationType == CalculationMode.OFF) {
      // we don't want to do anything; just fall through
    } else if (excludeReadFromBAQ(read)) {
      // just fall through
    } else {
      final boolean readHasBAQTag = hasBAQTag(read);

      if (calculationType == CalculationMode.RECALCULATE || !readHasBAQTag) {
        if (DEBUG) System.out.printf("  Calculating BAQ on the fly%n");
        BAQCalculationResult hmmResult = calcBAQFromHMM(read, refReader);
        if (hmmResult != null) {
          switch (qmode) {
            case ADD_TAG:
              addBAQTag(read, hmmResult.bq);
              break;
            case OVERWRITE_QUALS:
              System.arraycopy(hmmResult.bq, 0, read.getBaseQualities(), 0, hmmResult.bq.length);
              break;
            case DONT_MODIFY:
              BAQQuals = hmmResult.bq;
              break;
            default:
              throw new ReviewedGATKException("BUG: unexpected qmode " + qmode);
          }
        } else if (readHasBAQTag) {
          // remove the BAQ tag if it's there because we cannot trust it
          read.setAttribute(BAQ_TAG, null);
        }
      } else if (qmode == QualityMode.OVERWRITE_QUALS) {
        // only makes sense if we are overwriting quals
        if (DEBUG) System.out.printf("  Taking BAQ from tag%n");
        // this overwrites the original qualities
        calcBAQFromTag(read, true, false);
      }
    }

    return BAQQuals;
  }
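A hedged usage sketch of the three quality modes (the enclosing engine class name BAQ, its no-arg constructor, and the reference file path are assumptions; only mode names that appear above are used):

    // Hypothetical usage of baqRead (setup details are assumptions):
    IndexedFastaSequenceFile refReader =
        new IndexedFastaSequenceFile(new java.io.File("ref.fasta"));
    BAQ baqEngine = new BAQ();
    // 1) Leave the read untouched and just get the BAQ-capped quals back:
    byte[] capped =
        baqEngine.baqRead(read, refReader, CalculationMode.RECALCULATE, QualityMode.DONT_MODIFY);
    // 2) Overwrite the read's quals in place:
    baqEngine.baqRead(read, refReader, CalculationMode.RECALCULATE, QualityMode.OVERWRITE_QUALS);
    // 3) Stamp the BQ tag onto the read without touching the quals:
    baqEngine.baqRead(read, refReader, CalculationMode.RECALCULATE, QualityMode.ADD_TAG);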
Example #7
 @Test
 public void testFixBadQuals() {
   final byte[] fixedQuals = {28, 29, 31, 32, 33, 30, 31, 27, 26, 25};
   final byte[] badQuals = {59, 60, 62, 63, 64, 61, 62, 58, 57, 56};
   final ReadTransformer tr = new MisencodedBaseQualityReadTransformer();
   final SAMRecord read = createRead(badQuals);
   final SAMRecord fixedRead = tr.apply(read);
   Assert.assertEquals(fixedQuals, fixedRead.getBaseQualities());
 }
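Note that each fixed value differs from its bad counterpart by exactly 31, the gap between the Illumina-1.3 Q64 encoding and the standard Q33 encoding. A minimal sketch of a correction consistent with this test (the helper name is hypothetical):

 // Minimal sketch consistent with the test data above: shift Q64-encoded quals down by 31.
 static byte[] fixMisencodedQuals(final byte[] quals) {
   final byte[] fixed = new byte[quals.length];
   for (int i = 0; i < quals.length; i++) {
     fixed[i] = (byte) (quals[i] - 31); // 64 - 33 = 31
   }
   return fixed;
 }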
Example #8
  // we need to pad ref by at least the bandwidth / 2 on either side
  public BAQCalculationResult calcBAQFromHMM(SAMRecord read, byte[] ref, int refOffset) {
    // todo -- need to handle the case where the cigar sum of lengths doesn't cover the whole read
    Pair<Integer, Integer> queryRange = calculateQueryRange(read);
    if (queryRange == null) return null; // read has Ns, or is completely clipped away

    int queryStart = queryRange.getFirst();
    int queryEnd = queryRange.getSecond();

    BAQCalculationResult baqResult =
        calcBAQFromHMM(ref, read.getReadBases(), read.getBaseQualities(), queryStart, queryEnd);

    // cap quals
    int readI = 0, refI = 0;
    for (CigarElement elt : read.getCigar().getCigarElements()) {
      int l = elt.getLength();
      switch (elt.getOperator()) {
        case N: // cannot handle these
          return null;
        case H:
        case P: // ignore pads and hard clips
          break;
        case S:
          refI += l; // move the reference too, in addition to I
          // intentional fall-through: soft-clipped bases are then handled like insertions below
        case I:
          // todo -- is it really the case that we want to treat I and S the same?
          for (int i = readI; i < readI + l; i++) baqResult.bq[i] = baqResult.rawQuals[i];
          readI += l;
          break;
        case D:
          refI += l;
          break;
        case M:
          for (int i = readI; i < readI + l; i++) {
            int expectedPos = refI - refOffset + (i - readI);
            baqResult.bq[i] =
                capBaseByBAQ(
                    baqResult.rawQuals[i], baqResult.bq[i], baqResult.state[i], expectedPos);
          }
          readI += l;
          refI += l;
          break;
        default:
          throw new ReviewedGATKException(
              "BUG: Unexpected CIGAR element " + elt + " in read " + read.getReadName());
      }
    }
    if (readI != read.getReadLength()) { // odd cigar string
      System.arraycopy(baqResult.rawQuals, 0, baqResult.bq, 0, baqResult.bq.length);
    }

    return baqResult;
  }
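The readI/refI bookkeeping above is the standard CIGAR walk: I consumes read bases, D consumes reference bases, M consumes both, and this code deliberately treats S like both (see the fall-through). A tiny self-contained trace for a hypothetical 2S3M1I4M read (operators hard-coded; no htsjdk needed):

 public class CigarWalkTrace {
   public static void main(String[] args) {
     char[] ops = {'S', 'M', 'I', 'M'}; // hypothetical CIGAR 2S3M1I4M
     int[] lens = {2, 3, 1, 4};
     int readI = 0, refI = 0;
     for (int e = 0; e < ops.length; e++) {
       switch (ops[e]) {
         case 'S': readI += lens[e]; refI += lens[e]; break; // S moves both, as above
         case 'I': readI += lens[e]; break;
         case 'D': refI += lens[e]; break;
         case 'M': readI += lens[e]; refI += lens[e]; break;
       }
       System.out.printf("%d%c -> readI=%d refI=%d%n", lens[e], ops[e], readI, refI);
     }
     // Final readI is 10 (= read length, so the sanity check above passes); refI is 9.
   }
 }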
Example #9
  /**
   * Returns the BAQ adjusted quality score for this read at this offset. Does not support
   * on-the-fly BAQ calculation.
   *
   * @param read the SAMRecord to operate on
   * @param offset the offset to operate on
   * @param useRawQualsIfNoBAQTag if true and there is no BAQ annotation, we just use the raw
   *     quality score; if false and no BAQ tag is present, an IllegalStateException is thrown
   * @return the BAQ-adjusted quality at this offset
   */
  public static byte calcBAQFromTag(SAMRecord read, int offset, boolean useRawQualsIfNoBAQTag) {
    byte rawQual = read.getBaseQualities()[offset];
    byte newQual = rawQual;
    byte[] baq = getBAQTag(read);

    if (baq != null) {
      // Offset to base alignment quality (BAQ), of the same length as the read sequence.
      // At the i-th read base, BAQi = Qi - (BQi - 64) where Qi is the i-th base quality.
      int baq_delta = (int) baq[offset] - 64;
      int newval = rawQual - baq_delta;
      if (newval < 0)
        throw new UserException.MalformedBAM(
            read, "BAQ tag error: the BAQ value is larger than the base quality");
      newQual = (byte) newval;

    } else if (!useRawQualsIfNoBAQTag) {
      throw new IllegalStateException(
          "Required BAQ tag to be present, but none was on read " + read.getReadName());
    }

    return newQual;
  }
Example #10
  public double computeReadLikelihoodGivenHaplotype(Haplotype haplotype, SAMRecord read) {

    long numStartClippedBases = 0;
    long numEndClippedBases = 0;

    byte[] unclippedReadQuals = read.getBaseQualities();
    byte[] unclippedReadBases = read.getReadBases();

    // Do a stricter base clipping than provided by the CIGAR string, since that one may be too
    // conservative and may leave a string of Q2 bases still hanging off the reads.
    for (int i = 0; i < read.getReadLength(); i++) {
      if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD) numStartClippedBases++;
      else break;
    }
    for (int i = read.getReadLength() - 1; i >= 0; i--) {
      if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD) numEndClippedBases++;
      else break;
    }
    // System.out.format("numstart: %d numend: %d\n", numStartClippedBases, numEndClippedBases);
    if (numStartClippedBases + numEndClippedBases >= read.getReadLength()) {
      return 0; /// Double.POSITIVE_INFINITY;
    }
    byte[] readBases =
        Arrays.copyOfRange(
            unclippedReadBases,
            (int) numStartClippedBases,
            (int) (read.getReadBases().length - numEndClippedBases));

    byte[] readQuals =
        Arrays.copyOfRange(
            unclippedReadQuals,
            (int) numStartClippedBases,
            (int) (read.getReadBases().length - numEndClippedBases));

    int readLength = readBases.length;

    // initialize path metric and traceback memories for Viterbi computation
    pathMetricArray = new double[readLength + 1][PATH_METRIC_TABLE_LENGTH];
    bestStateIndexArray = new int[readLength + 1][PATH_METRIC_TABLE_LENGTH];

    for (int k = 1; k < PATH_METRIC_TABLE_LENGTH; k++) pathMetricArray[0][k] = 0;

    /*

     if (doSimpleCalculationModel) {

        // No Viterbi algorithm - assume no sequencing indel artifacts,

        // so we can collapse computations and pr(read | haplotype) is just probability of observing overlap
        // of read with haplotype.
        int haplotypeIndex = initialIndexInHaplotype;
        double c =  0.0;//deletionErrorProbabilities[1] +logOneMinusInsertionStartProbability;
        // compute likelihood of portion of base to the left of the haplotype
        for (int indR=readStartIdx-1; indR >= 0; indR--) {
            byte readBase = readBases[indR];
            byte readQual = readQuals[indR];
            if (readQual <= 2)
                continue;
            double pBaseRead = getProbabilityOfReadBaseGivenXandI((byte)0, readBase, readQual, LEFT_ALIGN_INDEX, 0);

            // pBaseRead has -10*log10(Prob(base[i]|haplotype[i])
            pRead += pBaseRead;

        }
        //System.out.format("\nSt: %d Pre-Likelihood:%f\n",readStartIdx, pRead);

        for (int indR=readStartIdx; indR < readBases.length; indR++) {
            byte readBase = readBases[indR];
            byte readQual = readQuals[indR];

            byte haplotypeBase;
            if (haplotypeIndex < RIGHT_ALIGN_INDEX)
                haplotypeBase = haplotype.getBases()[haplotypeIndex];
            else
                haplotypeBase = (byte)0; // dummy

            double pBaseRead = getProbabilityOfReadBaseGivenXandI(haplotypeBase, readBase, readQual, haplotypeIndex, 0);
            if (haplotypeBase != 0)
                pBaseRead += c;

            // pBaseRead has -10*log10(Prob(base[i]|haplotype[i])
            if (readQual > 3)
                pRead += pBaseRead;
            haplotypeIndex++;
            if (haplotypeIndex >= haplotype.getBases().length)
                haplotypeIndex = RIGHT_ALIGN_INDEX;
            //System.out.format("H:%c R:%c RQ:%d HI:%d %4.5f %4.5f\n", haplotypeBase, readBase, (int)readQual, haplotypeIndex, pBaseRead, pRead);
         }
        //System.out.format("\nSt: %d Post-Likelihood:%f\n",readStartIdx, pRead);

        if (DEBUG) {
            System.out.println(read.getReadName());
            System.out.print("Haplotype:");

            for (int k=0; k <haplotype.getBases().length; k++) {
                System.out.format("%c ", haplotype.getBases()[k]);
            }
            System.out.println();

            System.out.print("Read bases: ");
            for (int k=0; k <readBases.length; k++) {
                System.out.format("%c ", readBases[k]);
            }
            System.out.format("\nLikelihood:%f\n",pRead);

        }

        if (read.getReadName().contains("106880")) {

            System.out.println("aca");

            System.out.println("Haplotype:");

            for (int k=initialIndexInHaplotype; k <haplotype.getBases().length; k++) {
                System.out.format("%c ", haplotype.getBases()[k]);
            }
            System.out.println();

            System.out.println("Read bases: ");
            for (int k=readStartIdx; k <readBases.length; k++) {
                System.out.format("%c ", readBases[k]);
            }

        }
        return pRead;

    }
            */

    // Update path metric computations based on branch metric (Add/Compare/Select operations)
    // do forward direction first, ie from anchor to end of read
    // outer loop
    for (int indR = 0; indR < readLength; indR++) {
      byte readBase = readBases[indR];
      byte readQual = readQuals[indR];

      for (int indX = LEFT_ALIGN_INDEX; indX <= RIGHT_ALIGN_INDEX; indX++) {

        byte haplotypeBase;
        if (indX > LEFT_ALIGN_INDEX && indX < RIGHT_ALIGN_INDEX)
          haplotypeBase = haplotype.getBases()[indX - 1];
        else haplotypeBase = readBase;

        updatePathMetrics(haplotypeBase, indX, indR, readBase, readQual);
      }
    }

    // for debugging only: compute backtracking to find the optimal route through the trellis.
    // Since I'm only interested in the log-likelihood of the best state, this isn't really
    // necessary.
    double bestMetric = MathUtils.arrayMin(pathMetricArray[readLength]);

    if (DEBUG) {

      System.out.println(read.getReadName());
      System.out.print("Haplotype:");

      for (int k = 0; k < haplotype.getBases().length; k++) {
        System.out.format("%c ", haplotype.getBases()[k]);
      }
      System.out.println();

      System.out.print("Read bases: ");
      for (int k = 0; k < readBases.length; k++) {
        System.out.format("%c ", readBases[k]);
      }
      System.out.println();

      System.out.print("Read quals: ");
      for (int k = 0; k < readQuals.length; k++) {
        System.out.format("%d ", (int) readQuals[k]);
      }
      System.out.println();

      // start from last position of read, go backwards to find optimal alignment
      int[] bestIndexArray = new int[readLength];
      int bestIndex = MathUtils.minElementIndex(pathMetricArray[readLength]);
      bestIndexArray[readLength - 1] = bestIndex;

      for (int k = readLength - 2; k >= 0; k--) {
        bestIndex = bestStateIndexArray[k][bestIndex];
        bestIndexArray[k] = bestIndex;
      }

      System.out.print("Alignment: ");
      for (int k = 0; k < readBases.length; k++) {
        System.out.format("%d ", bestIndexArray[k]);
      }
      System.out.println();
    }
    // now just take optimum along all path metrics: that's the log likelihood of best alignment
    if (DEBUG) System.out.format("Likelihood: %5.4f\n", bestMetric);
    return bestMetric;
  }
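The path metrics live in -10*log10 space (see the pBaseRead comments above), so the returned bestMetric converts to a plain probability as follows (a worked illustration, not from the source):

    // Worked example: a best path metric of 30.0 in -10*log10 space is P = 10^-3.
    double bestMetric = 30.0;
    double probability = Math.pow(10.0, -bestMetric / 10.0); // 0.001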
Example #11
  /**
   * Main method for the program. Checks that all input files are present and readable and that the
   * output file can be written to. Then iterates through all the records accumulating metrics.
   * Finally writes metrics file
   */
  protected int doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);

    final SamReader reader =
        SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);

    final Histogram<Integer> mismatchesHist = new Histogram<Integer>("Predicted", "Mismatches");
    final Histogram<Integer> totalHist = new Histogram<Integer>("Predicted", "Total_Bases");
    final Map<String, Histogram> mismatchesByTypeHist = new HashMap<String, Histogram>();
    final Map<String, Histogram> totalByTypeHist = new HashMap<String, Histogram>();

    // Set up the histograms
    byte[] bases = {'A', 'C', 'G', 'T'};
    for (final byte base : bases) {
      final Histogram<Integer> h = new Histogram<Integer>("Predicted", (char) base + ">");
      mismatchesByTypeHist.put((char) base + ">", h);
      final Histogram<Integer> h2 = new Histogram<Integer>("Predicted", ">" + (char) base);
      mismatchesByTypeHist.put(">" + (char) base, h2);
    }
    for (final byte base : bases) {
      final Histogram<Integer> h = new Histogram<Integer>("Predicted", (char) base + ">");
      totalByTypeHist.put((char) base + ">", h);
      final Histogram<Integer> h2 = new Histogram<Integer>("Predicted", ">" + (char) base);
      totalByTypeHist.put(">" + (char) base, h2);
    }

    for (final SAMRecord record : reader) {
      // Ignore these as we don't know the truth
      if (record.getReadUnmappedFlag() || record.isSecondaryOrSupplementary()) {
        continue;
      }
      final byte[] readBases = record.getReadBases();
      final byte[] readQualities = record.getBaseQualities();
      final byte[] refBases = SequenceUtil.makeReferenceFromAlignment(record, false);

      // We've seen stranger things
      if (readQualities.length != readBases.length) {
        throw new PicardException(
            "Missing Qualities ("
                + readQualities.length
                + ","
                + readBases.length
                + ") : "
                + record.getSAMString());
      }

      if (refBases.length != readBases.length) {
        throw new PicardException(
            "The read length did not match the inferred reference length, please check your MD and CIGAR.");
      }

      int cycleIndex; // zero-based
      if (record.getReadNegativeStrandFlag()) {
        cycleIndex = readBases.length - 1 + CYCLE_OFFSET;
      } else {
        cycleIndex = CYCLE_OFFSET;
      }

      for (int i = 0; i < readBases.length; i++) {
        if (-1 == CYCLE || cycleIndex == CYCLE) {
          if ('-' != refBases[i] && '0' != refBases[i]) { // not insertion and not soft-clipped
            if (!SequenceUtil.basesEqual(readBases[i], refBases[i])) { // mismatch
              mismatchesHist.increment((int) readQualities[i]);
              if (SequenceUtil.isValidBase(refBases[i])) {
                mismatchesByTypeHist
                    .get((char) refBases[i] + ">")
                    .increment((int) readQualities[i]);
              }
              if (SequenceUtil.isValidBase(readBases[i])) {
                mismatchesByTypeHist
                    .get(">" + (char) readBases[i])
                    .increment((int) readQualities[i]);
              }
            } else {
              mismatchesHist.increment(
                  (int) readQualities[i], 0); // to make sure the bin will exist
            }
            totalHist.increment((int) readQualities[i]);
            if (SequenceUtil.isValidBase(readBases[i])) {
              totalByTypeHist.get(">" + (char) readBases[i]).increment((int) readQualities[i]);
            }
            if (SequenceUtil.isValidBase(refBases[i])) {
              totalByTypeHist.get((char) refBases[i] + ">").increment((int) readQualities[i]);
            }
          }
        }
        cycleIndex += record.getReadNegativeStrandFlag() ? -1 : 1;
      }
    }
    CloserUtil.close(reader);

    final Histogram<Integer> hist = new Histogram<Integer>("Predicted", "Observed");

    double sumOfSquaresError = 0.0;

    // compute the aggregate phred values
    for (final Integer key : mismatchesHist.keySet()) {
      final double numMismatches = mismatchesHist.get(key).getValue();
      final double numBases = totalHist.get(key).getValue();
      final double phredErr = Math.log10(numMismatches / numBases) * -10.0;
      sumOfSquaresError += (0 == numMismatches) ? 0.0 : (key - phredErr) * (key - phredErr);
      hist.increment(key, phredErr);

      // make sure the bin will exist
      for (final byte base : bases) {
        mismatchesByTypeHist.get(">" + (char) base).increment(key, 0.0);
        mismatchesByTypeHist.get((char) base + ">").increment(key, 0.0);
        totalByTypeHist.get(">" + (char) base).increment(key, 0.0);
        totalByTypeHist.get((char) base + ">").increment(key, 0.0);
      }
    }

    final QualityScoreAccuracyMetrics metrics = new QualityScoreAccuracyMetrics();
    metrics.SUM_OF_SQUARE_ERROR = sumOfSquaresError;

    final MetricsFile<QualityScoreAccuracyMetrics, Integer> out = getMetricsFile();
    out.addMetric(metrics);
    out.addHistogram(hist);
    for (final byte base : bases) {
      // >base : histograms for mismatches *to* the given base
      Histogram<Integer> m = mismatchesByTypeHist.get(">" + (char) base);
      Histogram<Integer> t = totalByTypeHist.get(">" + (char) base);
      Histogram<Integer> h = new Histogram<Integer>(m.getBinLabel(), m.getValueLabel());
      for (final Integer key : m.keySet()) {
        final double numMismatches = m.get(key).getValue();
        final double numBases = t.get(key).getValue();
        final double phredErr = Math.log10(numMismatches / numBases) * -10.0;
        h.increment(key, phredErr);
      }
      out.addHistogram(h);

      // base> : histograms for mismatches *from* the given base
      m = mismatchesByTypeHist.get((char) base + ">");
      t = totalByTypeHist.get((char) base + ">");
      h = new Histogram<Integer>(m.getBinLabel(), m.getValueLabel());
      for (final Integer key : m.keySet()) {
        final double numMismatches = m.get(key).getValue();
        final double numBases = t.get(key).getValue();
        final double phredErr = Math.log10(numMismatches / numBases) * -10.0;
        h.increment(key, phredErr);
      }
      out.addHistogram(h);
    }

    out.addHistogram(mismatchesHist);
    out.addHistogram(totalHist);
    out.write(OUTPUT);

    return 0;
  }
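The aggregation above turns empirical error rates into phred scale via phredErr = -10 * log10(mismatches / total). A worked check with hypothetical counts:

    // Worked example: 10 mismatches over 10,000 bases should report Q30.
    double numMismatches = 10.0;
    double numBases = 10000.0;
    double phredErr = Math.log10(numMismatches / numBases) * -10.0; // 30.0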
Example #12
  @Override
  protected void acceptRead(final SAMRecord rec, final ReferenceSequence ref) {
    // see if the whole read should be skipped
    if (recordFilter.filterOut(rec)) return;

    // check read group + library
    final String library =
        (rec.getReadGroup() == null)
            ? UNKNOWN_LIBRARY
            : getOrElse(rec.getReadGroup().getLibrary(), UNKNOWN_LIBRARY);
    if (!libraries.contains(library)) {
      // should never happen if SAM is valid
      throw new PicardException("Record contains library that is missing from header: " + library);
    }

    // set up some constants that don't change in the loop below
    final int contextFullLength = 2 * CONTEXT_SIZE + 1;
    final ArtifactCounter counter = artifactCounters.get(library);
    final byte[] readBases = rec.getReadBases();
    final byte[] readQuals;
    if (USE_OQ) {
      final byte[] tmp = rec.getOriginalBaseQualities();
      readQuals = tmp == null ? rec.getBaseQualities() : tmp;
    } else {
      readQuals = rec.getBaseQualities();
    }

    // iterate over aligned positions
    for (final AlignmentBlock block : rec.getAlignmentBlocks()) {
      for (int offset = 0; offset < block.getLength(); offset++) {
        // remember, these are 1-based!
        final int readPos = block.getReadStart() + offset;
        final int refPos = block.getReferenceStart() + offset;

        // skip low BQ sites
        final byte qual = readQuals[readPos - 1];
        if (qual < MINIMUM_QUALITY_SCORE) continue;

        // skip N bases in read
        final char readBase = Character.toUpperCase((char) readBases[readPos - 1]);
        if (readBase == 'N') continue;

        /**
         * Skip regions outside of intervals.
         *
         * <p>NB: IntervalListReferenceSequenceMask.get() has side-effects which assume that
         * successive ReferenceSequence's passed to this method will be in-order (e.g. it will break
         * if you call acceptRead() with chr1, then chr2, then chr1 again). So this only works if
         * the underlying iteration is coordinate-sorted.
         */
        if (intervalMask != null && !intervalMask.get(ref.getContigIndex(), refPos)) continue;

        // skip dbSNP sites
        if (dbSnpMask != null && dbSnpMask.isDbSnpSite(ref.getName(), refPos)) continue;

        // skip the ends of the reference
        final int contextStartIndex = refPos - CONTEXT_SIZE - 1;
        if (contextStartIndex < 0 || contextStartIndex + contextFullLength > ref.length()) continue;

        // skip contexts with N bases
        final String context = getRefContext(ref, contextStartIndex, contextFullLength);
        if (context.contains("N")) continue;

        // count the base!
        counter.countRecord(context, readBase, rec);
      }
    }
  }
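The context window is centered on refPos: contextStartIndex = refPos - CONTEXT_SIZE - 1 converts the 1-based refPos to a 0-based array index and backs up CONTEXT_SIZE bases. A worked check with hypothetical values:

    // Worked example: CONTEXT_SIZE = 1, refPos = 100 (1-based).
    int contextFullLength = 2 * 1 + 1;   // 3
    int contextStartIndex = 100 - 1 - 1; // 98 (0-based)
    // The window spans 0-based indices 98..100, i.e. 1-based positions 99..101,
    // centered on refPos = 100.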
Example #13
 public BAQCalculationResult(SAMRecord read, byte[] ref) {
   this(read.getBaseQualities(), read.getReadBases(), ref);
 }