// we need to pad ref by at least the bandwidth / 2 on either side public BAQCalculationResult calcBAQFromHMM(SAMRecord read, byte[] ref, int refOffset) { // todo -- need to handle the case where the cigar sum of lengths doesn't cover the whole read Pair<Integer, Integer> queryRange = calculateQueryRange(read); if (queryRange == null) return null; // read has Ns, or is completely clipped away int queryStart = queryRange.getFirst(); int queryEnd = queryRange.getSecond(); BAQCalculationResult baqResult = calcBAQFromHMM(ref, read.getReadBases(), read.getBaseQualities(), queryStart, queryEnd); // cap quals int readI = 0, refI = 0; for (CigarElement elt : read.getCigar().getCigarElements()) { int l = elt.getLength(); switch (elt.getOperator()) { case N: // cannot handle these return null; case H: case P: // ignore pads and hard clips break; case S: refI += l; // move the reference too, in addition to I case I: // todo -- is it really the case that we want to treat I and S the same? for (int i = readI; i < readI + l; i++) baqResult.bq[i] = baqResult.rawQuals[i]; readI += l; break; case D: refI += l; break; case M: for (int i = readI; i < readI + l; i++) { int expectedPos = refI - refOffset + (i - readI); baqResult.bq[i] = capBaseByBAQ( baqResult.rawQuals[i], baqResult.bq[i], baqResult.state[i], expectedPos); } readI += l; refI += l; break; default: throw new ReviewedGATKException( "BUG: Unexpected CIGAR element " + elt + " in read " + read.getReadName()); } } if (readI != read.getReadLength()) // odd cigar string System.arraycopy(baqResult.rawQuals, 0, baqResult.bq, 0, baqResult.bq.length); return baqResult; }
/**
 * Computes the likelihood of observing {@code read} given {@code haplotype} via a Viterbi pass
 * over an indel-error HMM trellis.
 *
 * <p>Side effects: (re)allocates and fills the instance fields {@code pathMetricArray} and
 * {@code bestStateIndexArray} — this method is therefore not safe for concurrent use on the
 * same instance.
 *
 * @param haplotype candidate haplotype the read is scored against
 * @param read the read to score; its qualities drive an extra low-quality end-clipping step
 * @return the minimum path metric over all final states (the -log-likelihood-style score of the
 *     best alignment), or 0 if the whole read is clipped away by the quality threshold
 */
public double computeReadLikelihoodGivenHaplotype(Haplotype haplotype, SAMRecord read) {

    long numStartClippedBases = 0;
    long numEndClippedBases = 0;
    byte[] unclippedReadQuals = read.getBaseQualities();
    byte[] unclippedReadBases = read.getReadBases();
    // Do a stricter base clipping than provided by CIGAR string, since this one may be too
    // conservative,
    // and may leave a string of Q2 bases still hanging off the reads.
    for (int i = 0; i < read.getReadLength(); i++) {
        if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD) numStartClippedBases++;
        else break;
    }
    for (int i = read.getReadLength() - 1; i >= 0; i--) {
        if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD) numEndClippedBases++;
        else break;
    }
    // System.out.format("numstart: %d numend: %d\n", numStartClippedBases, numEndClippedBases);
    if (numStartClippedBases + numEndClippedBases >= read.getReadLength()) {
        // NOTE(review): 0 is returned as a sentinel for "no usable bases"; the commented-out
        // alternative below suggests POSITIVE_INFINITY was once considered — confirm callers
        // treat 0 correctly given smaller metrics are better.
        return 0; /// Double.POSITIVE_INFINITY;
    }
    // Counts fit in an int here (bounded by read length); the long declarations force the casts.
    byte[] readBases = Arrays.copyOfRange(
        unclippedReadBases,
        (int) numStartClippedBases,
        (int) (read.getReadBases().length - numEndClippedBases));

    byte[] readQuals = Arrays.copyOfRange(
        unclippedReadQuals,
        (int) numStartClippedBases,
        (int) (read.getReadBases().length - numEndClippedBases));

    int readLength = readBases.length;

    // initialize path metric and traceback memories for Viterbi computation
    pathMetricArray = new double[readLength + 1][PATH_METRIC_TABLE_LENGTH];
    bestStateIndexArray = new int[readLength + 1][PATH_METRIC_TABLE_LENGTH];

    for (int k = 1; k < PATH_METRIC_TABLE_LENGTH; k++) pathMetricArray[0][k] = 0;

    /*
     Dead code retained from an earlier "simple calculation model" (no Viterbi):

     if (doSimpleCalculationModel) {

        // No Viterbi algorithm - assume no sequencing indel artifacts,

        // so we can collapse computations and pr(read | haplotype) is just probability of observing overlap
        // of read with haplotype.
        int haplotypeIndex = initialIndexInHaplotype;
        double c =  0.0;//deletionErrorProbabilities[1] +logOneMinusInsertionStartProbability;
        // compute likelihood of portion of base to the left of the haplotype
        for (int indR=readStartIdx-1; indR >= 0; indR--) {
            byte readBase = readBases[indR];
            byte readQual = readQuals[indR];
            if (readQual <= 2)
                continue;
            double pBaseRead = getProbabilityOfReadBaseGivenXandI((byte)0, readBase, readQual, LEFT_ALIGN_INDEX, 0);

            // pBaseRead has -10*log10(Prob(base[i]|haplotype[i])
            pRead += pBaseRead;
        }
        //System.out.format("\nSt: %d Pre-Likelihood:%f\n",readStartIdx, pRead);

        for (int indR=readStartIdx; indR < readBases.length; indR++) {
            byte readBase = readBases[indR];
            byte readQual = readQuals[indR];

            byte haplotypeBase;
            if (haplotypeIndex < RIGHT_ALIGN_INDEX)
                haplotypeBase = haplotype.getBases()[haplotypeIndex];
            else
                haplotypeBase = (byte)0; // dummy

            double pBaseRead = getProbabilityOfReadBaseGivenXandI(haplotypeBase, readBase, readQual, haplotypeIndex, 0);
            if (haplotypeBase != 0)
                pBaseRead += c;

            // pBaseRead has -10*log10(Prob(base[i]|haplotype[i])
            if (readQual > 3)
                pRead += pBaseRead;
            haplotypeIndex++;
            if (haplotypeIndex >= haplotype.getBases().length)
                haplotypeIndex = RIGHT_ALIGN_INDEX;
            //System.out.format("H:%c R:%c RQ:%d HI:%d %4.5f %4.5f\n", haplotypeBase, readBase, (int)readQual, haplotypeIndex, pBaseRead, pRead);
        }
        //System.out.format("\nSt: %d Post-Likelihood:%f\n",readStartIdx, pRead);

        if (DEBUG) {
            System.out.println(read.getReadName());
            System.out.print("Haplotype:");

            for (int k=0; k <haplotype.getBases().length; k++) {
                System.out.format("%c ", haplotype.getBases()[k]);
            }
            System.out.println();

            System.out.print("Read bases: ");
            for (int k=0; k <readBases.length; k++) {
                System.out.format("%c ", readBases[k]);
            }
            System.out.format("\nLikelihood:%f\n",pRead);
        }

        if (read.getReadName().contains("106880")) {
            System.out.println("aca");

            System.out.println("Haplotype:");
            for (int k=initialIndexInHaplotype; k <haplotype.getBases().length; k++) {
                System.out.format("%c ", haplotype.getBases()[k]);
            }
            System.out.println();
            System.out.println("Read bases: ");
            for (int k=readStartIdx; k <readBases.length; k++) {
                System.out.format("%c ", readBases[k]);
            }
        }
        return pRead;

    }
    */

    // Update path metric computations based on branch metric (Add/Compare/Select operations)
    // do forward direction first, ie from anchor to end of read
    // outer loop
    for (int indR = 0; indR < readLength; indR++) {
        byte readBase = readBases[indR];
        byte readQual = readQuals[indR];

        // indX is the trellis state: LEFT_ALIGN_INDEX / RIGHT_ALIGN_INDEX bracket the haplotype
        // positions; interior states map to haplotype base indX-1.
        for (int indX = LEFT_ALIGN_INDEX; indX <= RIGHT_ALIGN_INDEX; indX++) {
            byte haplotypeBase;
            if (indX > LEFT_ALIGN_INDEX && indX < RIGHT_ALIGN_INDEX)
                haplotypeBase = haplotype.getBases()[indX - 1];
            else
                haplotypeBase = readBase; // edge states: treat as a perfect match (no penalty)
            updatePathMetrics(haplotypeBase, indX, indR, readBase, readQual);
        }
    }

    // for debugging only: compute backtracking to find optimal route through trellis. Since I'm
    // only interested
    // in log-likelihood of best state, this isn't really necessary.
    double bestMetric = MathUtils.arrayMin(pathMetricArray[readLength]);

    if (DEBUG) {

        System.out.println(read.getReadName());
        System.out.print("Haplotype:");

        for (int k = 0; k < haplotype.getBases().length; k++) {
            System.out.format("%c ", haplotype.getBases()[k]);
        }
        System.out.println();

        System.out.print("Read bases: ");
        for (int k = 0; k < readBases.length; k++) {
            System.out.format("%c ", readBases[k]);
        }
        System.out.println();

        System.out.print("Read quals: ");
        for (int k = 0; k < readQuals.length; k++) {
            System.out.format("%d ", (int) readQuals[k]);
        }
        System.out.println();

        // start from last position of read, go backwards to find optimal alignment
        int[] bestIndexArray = new int[readLength];
        int bestIndex = MathUtils.minElementIndex(pathMetricArray[readLength]);
        bestIndexArray[readLength - 1] = bestIndex;

        // NOTE(review): the traceback reads bestStateIndexArray[k][bestIndex] while the forward
        // pass filled rows up to readLength — verify the row index shouldn't be k+1. Debug-only.
        for (int k = readLength - 2; k >= 0; k--) {
            bestIndex = bestStateIndexArray[k][bestIndex];
            bestIndexArray[k] = bestIndex;
        }

        System.out.print("Alignment: ");
        for (int k = 0; k < readBases.length; k++) {
            System.out.format("%d ", bestIndexArray[k]);
        }
        System.out.println();
    }
    // now just take optimum along all path metrics: that's the log likelihood of best alignment
    if (DEBUG) System.out.format("Likelihood: %5.4f\n", bestMetric);
    return bestMetric;
}
/**
 * Tells whether a read carries no usable bases.
 *
 * @param read the read to inspect
 * @return {@code true} when the read has no base array at all, or reports a length of zero
 */
public static boolean isEmpty(final SAMRecord read) {
    if (read.getReadBases() == null) {
        return true;
    }
    return read.getReadLength() == 0;
}