/** * Loop over all of the reads in this likelihood map and realign them to its most likely haplotype * * @param haplotypes the collection of haplotypes * @param paddedReferenceLoc the active region */ public void realignReadsToMostLikelyHaplotype( final Collection<Haplotype> haplotypes, final GenomeLoc paddedReferenceLoc) { // we need to remap the Alleles back to the Haplotypes; inefficient but unfortunately this is a // requirement currently final Map<Allele, Haplotype> alleleToHaplotypeMap = new HashMap<>(haplotypes.size()); Haplotype refHaplotype = null; for (final Haplotype haplotype : haplotypes) { alleleToHaplotypeMap.put(Allele.create(haplotype.getBases()), haplotype); if (refHaplotype == null && haplotype.isReference()) refHaplotype = haplotype; } final Map<GATKSAMRecord, Map<Allele, Double>> newLikelihoodReadMap = new LinkedHashMap<>(likelihoodReadMap.size()); for (final Map.Entry<GATKSAMRecord, Map<Allele, Double>> entry : likelihoodReadMap.entrySet()) { final MostLikelyAllele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue()); final GATKSAMRecord alignedToRef = AlignmentUtils.createReadAlignedToRef( entry.getKey(), alleleToHaplotypeMap.get(bestAllele.getMostLikelyAllele()), refHaplotype, paddedReferenceLoc.getStart(), bestAllele.isInformative()); newLikelihoodReadMap.put(alignedToRef, entry.getValue()); } likelihoodReadMap.clear(); likelihoodReadMap.putAll(newLikelihoodReadMap); }
public double computeReadLikelihoodGivenHaplotype(Haplotype haplotype, SAMRecord read) { long numStartClippedBases = 0; long numEndClippedBases = 0; byte[] unclippedReadQuals = read.getBaseQualities(); byte[] unclippedReadBases = read.getReadBases(); // Do a stricter base clipping than provided by CIGAR string, since this one may be too // conservative, // and may leave a string of Q2 bases still hanging off the reads. for (int i = 0; i < read.getReadLength(); i++) { if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD) numStartClippedBases++; else break; } for (int i = read.getReadLength() - 1; i >= 0; i--) { if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD) numEndClippedBases++; else break; } // System.out.format("numstart: %d numend: %d\n", numStartClippedBases, numEndClippedBases); if (numStartClippedBases + numEndClippedBases >= read.getReadLength()) { return 0; /// Double.POSITIVE_INFINITY; } byte[] readBases = Arrays.copyOfRange( unclippedReadBases, (int) numStartClippedBases, (int) (read.getReadBases().length - numEndClippedBases)); byte[] readQuals = Arrays.copyOfRange( unclippedReadQuals, (int) numStartClippedBases, (int) (read.getReadBases().length - numEndClippedBases)); int readLength = readBases.length; // initialize path metric and traceback memories for Viterbi computation pathMetricArray = new double[readLength + 1][PATH_METRIC_TABLE_LENGTH]; bestStateIndexArray = new int[readLength + 1][PATH_METRIC_TABLE_LENGTH]; for (int k = 1; k < PATH_METRIC_TABLE_LENGTH; k++) pathMetricArray[0][k] = 0; /* if (doSimpleCalculationModel) { // No Viterbi algorithm - assume no sequencing indel artifacts, // so we can collapse computations and pr(read | haplotype) is just probability of observing overlap // of read with haplotype. int haplotypeIndex = initialIndexInHaplotype; double c = 0.0;//deletionErrorProbabilities[1] +logOneMinusInsertionStartProbability; // compute likelihood of portion of base to the left of the haplotype for (int indR=readStartIdx-1; indR >= 0; indR--) { byte readBase = readBases[indR]; byte readQual = readQuals[indR]; if (readQual <= 2) continue; double pBaseRead = getProbabilityOfReadBaseGivenXandI((byte)0, readBase, readQual, LEFT_ALIGN_INDEX, 0); // pBaseRead has -10*log10(Prob(base[i]|haplotype[i]) pRead += pBaseRead; } //System.out.format("\nSt: %d Pre-Likelihood:%f\n",readStartIdx, pRead); for (int indR=readStartIdx; indR < readBases.length; indR++) { byte readBase = readBases[indR]; byte readQual = readQuals[indR]; byte haplotypeBase; if (haplotypeIndex < RIGHT_ALIGN_INDEX) haplotypeBase = haplotype.getBases()[haplotypeIndex]; else haplotypeBase = (byte)0; // dummy double pBaseRead = getProbabilityOfReadBaseGivenXandI(haplotypeBase, readBase, readQual, haplotypeIndex, 0); if (haplotypeBase != 0) pBaseRead += c; // pBaseRead has -10*log10(Prob(base[i]|haplotype[i]) if (readQual > 3) pRead += pBaseRead; haplotypeIndex++; if (haplotypeIndex >= haplotype.getBases().length) haplotypeIndex = RIGHT_ALIGN_INDEX; //System.out.format("H:%c R:%c RQ:%d HI:%d %4.5f %4.5f\n", haplotypeBase, readBase, (int)readQual, haplotypeIndex, pBaseRead, pRead); } //System.out.format("\nSt: %d Post-Likelihood:%f\n",readStartIdx, pRead); if (DEBUG) { System.out.println(read.getReadName()); System.out.print("Haplotype:"); for (int k=0; k <haplotype.getBases().length; k++) { System.out.format("%c ", haplotype.getBases()[k]); } System.out.println(); System.out.print("Read bases: "); for (int k=0; k <readBases.length; k++) { System.out.format("%c ", readBases[k]); } System.out.format("\nLikelihood:%f\n",pRead); } if (read.getReadName().contains("106880")) { System.out.println("aca"); System.out.println("Haplotype:"); for (int k=initialIndexInHaplotype; k <haplotype.getBases().length; k++) { System.out.format("%c ", haplotype.getBases()[k]); } System.out.println(); System.out.println("Read bases: "); for (int k=readStartIdx; k <readBases.length; k++) { System.out.format("%c ", readBases[k]); } } return pRead; } */ // Update path metric computations based on branch metric (Add/Compare/Select operations) // do forward direction first, ie from anchor to end of read // outer loop for (int indR = 0; indR < readLength; indR++) { byte readBase = readBases[indR]; byte readQual = readQuals[indR]; for (int indX = LEFT_ALIGN_INDEX; indX <= RIGHT_ALIGN_INDEX; indX++) { byte haplotypeBase; if (indX > LEFT_ALIGN_INDEX && indX < RIGHT_ALIGN_INDEX) haplotypeBase = haplotype.getBases()[indX - 1]; else haplotypeBase = readBase; updatePathMetrics(haplotypeBase, indX, indR, readBase, readQual); } } // for debugging only: compute backtracking to find optimal route through trellis. Since I'm // only interested // in log-likelihood of best state, this isn't really necessary. double bestMetric = MathUtils.arrayMin(pathMetricArray[readLength]); if (DEBUG) { System.out.println(read.getReadName()); System.out.print("Haplotype:"); for (int k = 0; k < haplotype.getBases().length; k++) { System.out.format("%c ", haplotype.getBases()[k]); } System.out.println(); System.out.print("Read bases: "); for (int k = 0; k < readBases.length; k++) { System.out.format("%c ", readBases[k]); } System.out.println(); System.out.print("Read quals: "); for (int k = 0; k < readQuals.length; k++) { System.out.format("%d ", (int) readQuals[k]); } System.out.println(); // start from last position of read, go backwards to find optimal alignment int[] bestIndexArray = new int[readLength]; int bestIndex = MathUtils.minElementIndex(pathMetricArray[readLength]); bestIndexArray[readLength - 1] = bestIndex; for (int k = readLength - 2; k >= 0; k--) { bestIndex = bestStateIndexArray[k][bestIndex]; bestIndexArray[k] = bestIndex; } System.out.print("Alignment: "); for (int k = 0; k < readBases.length; k++) { System.out.format("%d ", bestIndexArray[k]); } System.out.println(); } // now just take optimum along all path metrics: that's the log likelihood of best alignment if (DEBUG) System.out.format("Likelihood: %5.4f\n", bestMetric); return bestMetric; }