Example #1
  // we need to pad ref by at least the bandwidth / 2 on either side
  public BAQCalculationResult calcBAQFromHMM(SAMRecord read, byte[] ref, int refOffset) {
    // todo -- need to handle the case where the cigar sum of lengths doesn't cover the whole read
    Pair<Integer, Integer> queryRange = calculateQueryRange(read);
    if (queryRange == null) return null; // read has Ns, or is completely clipped away

    int queryStart = queryRange.getFirst();
    int queryEnd = queryRange.getSecond();

    BAQCalculationResult baqResult =
        calcBAQFromHMM(ref, read.getReadBases(), read.getBaseQualities(), queryStart, queryEnd);

    // cap quals
    int readI = 0, refI = 0;
    for (CigarElement elt : read.getCigar().getCigarElements()) {
      int l = elt.getLength();
      switch (elt.getOperator()) {
        case N: // cannot handle these
          return null;
        case H:
        case P: // ignore pads and hard clips
          break;
        case S:
          refI += l; // soft clip: also advance the reference index, then fall through to I to advance the read index
          // intentional fall-through: soft-clipped bases, like inserted bases, keep their raw quals
        case I:
          // todo -- is it really the case that we want to treat I and S the same?
          for (int i = readI; i < readI + l; i++) baqResult.bq[i] = baqResult.rawQuals[i];
          readI += l;
          break;
        case D:
          refI += l;
          break;
        case M:
          for (int i = readI; i < readI + l; i++) {
            int expectedPos = refI - refOffset + (i - readI);
            baqResult.bq[i] =
                capBaseByBAQ(
                    baqResult.rawQuals[i], baqResult.bq[i], baqResult.state[i], expectedPos);
          }
          readI += l;
          refI += l;
          break;
        default:
          throw new ReviewedGATKException(
              "BUG: Unexpected CIGAR element " + elt + " in read " + read.getReadName());
      }
    }
    if (readI != read.getReadLength()) {
      // odd cigar string: fall back to the raw base qualities for the whole read
      System.arraycopy(baqResult.rawQuals, 0, baqResult.bq, 0, baqResult.bq.length);
    }

    return baqResult;
  }
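
  // A minimal usage sketch, not part of the original source: once calcBAQFromHMM succeeds, the
  // capped qualities in BAQCalculationResult.bq can be written back onto the read. Only the bq
  // field and the SAMRecord accessors already used above are assumed; the method name
  // applyBAQToRead is illustrative.
  public boolean applyBAQToRead(SAMRecord read, byte[] ref, int refOffset) {
    BAQCalculationResult baqResult = calcBAQFromHMM(read, ref, refOffset);
    if (baqResult == null) {
      return false; // read had Ns, was completely clipped, or contained an N CIGAR element
    }
    // bq holds one capped quality per read base, in read coordinates
    read.setBaseQualities(Arrays.copyOf(baqResult.bq, baqResult.bq.length));
    return true;
  }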
  public double computeReadLikelihoodGivenHaplotype(Haplotype haplotype, SAMRecord read) {

    long numStartClippedBases = 0;
    long numEndClippedBases = 0;

    byte[] unclippedReadQuals = read.getBaseQualities();
    byte[] unclippedReadBases = read.getReadBases();

    // Do stricter base clipping than the CIGAR string provides, since the CIGAR clipping may be
    // too conservative and leave a run of Q2 bases hanging off the ends of the read.
    for (int i = 0; i < read.getReadLength(); i++) {
      if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD) numStartClippedBases++;
      else break;
    }
    for (int i = read.getReadLength() - 1; i >= 0; i--) {
      if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD) numEndClippedBases++;
      else break;
    }
    // System.out.format("numstart: %d numend: %d\n", numStartClippedBases, numEndClippedBases);
    if (numStartClippedBases + numEndClippedBases >= read.getReadLength()) {
      return 0; // every base falls below BASE_QUAL_THRESHOLD (alternative: Double.POSITIVE_INFINITY)
    }
    byte[] readBases =
        Arrays.copyOfRange(
            unclippedReadBases,
            (int) numStartClippedBases,
            (int) (read.getReadBases().length - numEndClippedBases));

    byte[] readQuals =
        Arrays.copyOfRange(
            unclippedReadQuals,
            (int) numStartClippedBases,
            (int) (read.getReadBases().length - numEndClippedBases));

    int readLength = readBases.length;

    // initialize path metric and traceback memories for Viterbi computation
    pathMetricArray = new double[readLength + 1][PATH_METRIC_TABLE_LENGTH];
    bestStateIndexArray = new int[readLength + 1][PATH_METRIC_TABLE_LENGTH];

    for (int k = 1; k < PATH_METRIC_TABLE_LENGTH; k++) pathMetricArray[0][k] = 0;

    /*

     if (doSimpleCalculationModel) {

        // No Viterbi algorithm - assume no sequencing indel artifacts,

        // so we can collapse computations and pr(read | haplotype) is just probability of observing overlap
        // of read with haplotype.
        int haplotypeIndex = initialIndexInHaplotype;
        double c =  0.0;//deletionErrorProbabilities[1] +logOneMinusInsertionStartProbability;
        // compute likelihood of portion of base to the left of the haplotype
        for (int indR=readStartIdx-1; indR >= 0; indR--) {
            byte readBase = readBases[indR];
            byte readQual = readQuals[indR];
            if (readQual <= 2)
                continue;
            double pBaseRead = getProbabilityOfReadBaseGivenXandI((byte)0, readBase, readQual, LEFT_ALIGN_INDEX, 0);

            // pBaseRead has -10*log10(Prob(base[i]|haplotype[i])
            pRead += pBaseRead;

        }
        //System.out.format("\nSt: %d Pre-Likelihood:%f\n",readStartIdx, pRead);

        for (int indR=readStartIdx; indR < readBases.length; indR++) {
            byte readBase = readBases[indR];
            byte readQual = readQuals[indR];

            byte haplotypeBase;
            if (haplotypeIndex < RIGHT_ALIGN_INDEX)
                haplotypeBase = haplotype.getBases()[haplotypeIndex];
            else
                haplotypeBase = (byte)0; // dummy

            double pBaseRead = getProbabilityOfReadBaseGivenXandI(haplotypeBase, readBase, readQual, haplotypeIndex, 0);
            if (haplotypeBase != 0)
                pBaseRead += c;

            // pBaseRead has -10*log10(Prob(base[i]|haplotype[i])
            if (readQual > 3)
                pRead += pBaseRead;
            haplotypeIndex++;
            if (haplotypeIndex >= haplotype.getBases().length)
                haplotypeIndex = RIGHT_ALIGN_INDEX;
            //System.out.format("H:%c R:%c RQ:%d HI:%d %4.5f %4.5f\n", haplotypeBase, readBase, (int)readQual, haplotypeIndex, pBaseRead, pRead);
         }
        //System.out.format("\nSt: %d Post-Likelihood:%f\n",readStartIdx, pRead);

        if (DEBUG) {
            System.out.println(read.getReadName());
            System.out.print("Haplotype:");

            for (int k=0; k <haplotype.getBases().length; k++) {
                System.out.format("%c ", haplotype.getBases()[k]);
            }
            System.out.println();

            System.out.print("Read bases: ");
            for (int k=0; k <readBases.length; k++) {
                System.out.format("%c ", readBases[k]);
            }
            System.out.format("\nLikelihood:%f\n",pRead);

        }

        if (read.getReadName().contains("106880")) {

            System.out.println("aca");

            System.out.println("Haplotype:");

            for (int k=initialIndexInHaplotype; k <haplotype.getBases().length; k++) {
                System.out.format("%c ", haplotype.getBases()[k]);
            }
            System.out.println();

            System.out.println("Read bases: ");
            for (int k=readStartIdx; k <readBases.length; k++) {
                System.out.format("%c ", readBases[k]);
            }

        }
        return pRead;

    }
            */

    // Update path metric computations based on branch metric (Add/Compare/Select operations)
    // do forward direction first, ie from anchor to end of read
    // outer loop
    for (int indR = 0; indR < readLength; indR++) {
      byte readBase = readBases[indR];
      byte readQual = readQuals[indR];

      for (int indX = LEFT_ALIGN_INDEX; indX <= RIGHT_ALIGN_INDEX; indX++) {

        byte haplotypeBase;
        if (indX > LEFT_ALIGN_INDEX && indX < RIGHT_ALIGN_INDEX)
          haplotypeBase = haplotype.getBases()[indX - 1];
        else haplotypeBase = readBase;

        updatePathMetrics(haplotypeBase, indX, indR, readBase, readQual);
      }
    }

    // For debugging only: backtrack to recover the optimal route through the trellis. Since we're
    // only interested in the log-likelihood of the best state, this isn't strictly necessary.
    double bestMetric = MathUtils.arrayMin(pathMetricArray[readLength]);

    if (DEBUG) {

      System.out.println(read.getReadName());
      System.out.print("Haplotype:");

      for (int k = 0; k < haplotype.getBases().length; k++) {
        System.out.format("%c ", haplotype.getBases()[k]);
      }
      System.out.println();

      System.out.print("Read bases: ");
      for (int k = 0; k < readBases.length; k++) {
        System.out.format("%c ", readBases[k]);
      }
      System.out.println();

      System.out.print("Read quals: ");
      for (int k = 0; k < readQuals.length; k++) {
        System.out.format("%d ", (int) readQuals[k]);
      }
      System.out.println();

      // start from last position of read, go backwards to find optimal alignment
      int[] bestIndexArray = new int[readLength];
      int bestIndex = MathUtils.minElementIndex(pathMetricArray[readLength]);
      bestIndexArray[readLength - 1] = bestIndex;

      for (int k = readLength - 2; k >= 0; k--) {
        bestIndex = bestStateIndexArray[k][bestIndex];
        bestIndexArray[k] = bestIndex;
      }

      System.out.print("Alignment: ");
      for (int k = 0; k < readBases.length; k++) {
        System.out.format("%d ", bestIndexArray[k]);
      }
      System.out.println();
    }
    // now just take optimum along all path metrics: that's the log likelihood of best alignment
    if (DEBUG) System.out.format("Likelihood: %5.4f\n", bestMetric);
    return bestMetric;
  }
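
  // A minimal sketch, not part of the original source: evaluate each read against each candidate
  // haplotype and collect the Viterbi path metrics returned above into a matrix. The per-base
  // comments above suggest the metric is in -10*log10 space, so smaller values indicate better
  // alignments. The method name and the java.util.List parameters are illustrative only.
  public double[][] computeReadHaplotypeLikelihoodMatrix(
      List<SAMRecord> reads, List<Haplotype> haplotypes) {
    double[][] likelihoods = new double[reads.size()][haplotypes.size()];
    for (int readIndex = 0; readIndex < reads.size(); readIndex++) {
      for (int hapIndex = 0; hapIndex < haplotypes.size(); hapIndex++) {
        likelihoods[readIndex][hapIndex] =
            computeReadLikelihoodGivenHaplotype(haplotypes.get(hapIndex), reads.get(readIndex));
      }
    }
    return likelihoods;
  }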
Example #3
  public static boolean isEmpty(final SAMRecord read) {
    return read.getReadBases() == null || read.getReadLength() == 0;
  }
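
  // A minimal usage sketch, not part of the original source: drop empty reads before any
  // per-base computation. Only the isEmpty() check above and java.util collections are used;
  // the method name removeEmptyReads is illustrative.
  public static List<SAMRecord> removeEmptyReads(final List<SAMRecord> reads) {
    final List<SAMRecord> nonEmptyReads = new ArrayList<SAMRecord>(reads.size());
    for (final SAMRecord read : reads) {
      if (!isEmpty(read)) {
        nonEmptyReads.add(read);
      }
    }
    return nonEmptyReads;
  }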