/**
 * Performs allele biased down-sampling on a pileup and computes the list of elements to remove
 *
 * @param reads               original list of records
 * @param numElementsToRemove the number of records to remove
 * @return the list of records TO REMOVE
 */
protected static List<GATKSAMRecord> downsampleElements(final List<GATKSAMRecord> reads,
                                                        final int numElementsToRemove) {
    // are there no elements to remove?
    if ( numElementsToRemove == 0 )
        return Collections.<GATKSAMRecord>emptyList();

    final ArrayList<GATKSAMRecord> elementsToRemove = new ArrayList<GATKSAMRecord>(numElementsToRemove);
    final int originalElementCount = reads.size();

    // should we remove all of the elements?
    if ( numElementsToRemove >= originalElementCount ) {
        elementsToRemove.addAll(reads);
        return elementsToRemove;
    }

    // create a bitset describing which elements to remove
    final BitSet itemsToRemove = new BitSet(originalElementCount);
    for ( final Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(originalElementCount, numElementsToRemove) ) {
        itemsToRemove.set(selectedIndex);
    }

    int currentBitSetIndex = 0;
    for ( final GATKSAMRecord read : reads ) {
        if ( itemsToRemove.get(currentBitSetIndex++) )
            elementsToRemove.add(read);
    }

    return elementsToRemove;
}
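// A minimal illustrative sketch of the same pattern (sample distinct indices, mark them in a
// BitSet, collect in one pass), assuming only the JDK. The shuffle-based sampler below is a
// simple stand-in for MathUtils.sampleIndicesWithoutReplacement, not the GATK implementation.
// Requires java.util.{List, ArrayList, BitSet, Collections, Random}.
static <T> List<T> pickElementsToRemove(final List<T> items, final int numToRemove, final Random rng) {
    if (numToRemove <= 0) return Collections.emptyList();
    if (numToRemove >= items.size()) return new ArrayList<T>(items);

    // sample numToRemove distinct indices without replacement
    final List<Integer> indices = new ArrayList<Integer>();
    for (int i = 0; i < items.size(); i++) indices.add(i);
    Collections.shuffle(indices, rng);

    // mark the chosen indices in a BitSet
    final BitSet toRemove = new BitSet(items.size());
    for (final int idx : indices.subList(0, numToRemove)) toRemove.set(idx);

    // a single pass over the original list preserves the input order of the removed elements
    final List<T> removed = new ArrayList<T>(numToRemove);
    for (int i = 0; i < items.size(); i++)
        if (toRemove.get(i)) removed.add(items.get(i));
    return removed;
}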
/**
 * P(somatic | D) ∝ P(somatic) * P(D | somatic)
 *               = P(somatic) * P(D | normals are ref) * P(D | tumors are non-ref)
 *
 * P(! somatic | D) ∝ P(! somatic) * P(D | ! somatic)
 *                 = P(! somatic) * ( P(D | normals are non-ref) * P(D | tumors are non-ref)   [germline]
 *                                  + P(D | normals are ref) * P(D | tumors are ref) )         [no variant at all]
 *
 * The shared normalization by P(D) cancels in the returned LOD, which is the log10 ratio of the
 * somatic and not-somatic terms.
 *
 * @param vc the variant context holding the tumor and normal genotypes
 * @return the log10 somatic LOD, capped at -10000 when the ratio is infinite
 */
private double calcLog10pSomatic(final VariantContext vc) {
    // walk over tumors
    double log10pNonRefInTumors = log10pNonRefInSamples(vc, tumorSample);
    double log10pRefInTumors = log10pRefInSamples(vc, tumorSample);

    // walk over normals
    double log10pNonRefInNormals = log10pNonRefInSamples(vc, normalSample);
    double log10pRefInNormals = log10pRefInSamples(vc, normalSample);

    // priors
    double log10pSomaticPrior = QualityUtils.qualToErrorProbLog10(somaticPriorQ);
    double log10pNotSomaticPrior = Math.log10(1 - QualityUtils.qualToErrorProb(somaticPriorQ));

    double log10pNotSomaticGermline = log10pNonRefInNormals + log10pNonRefInTumors;
    double log10pNotSomaticNoVariant = log10pRefInNormals + log10pRefInTumors;

    double log10pNotSomatic = log10pNotSomaticPrior
            + MathUtils.log10sumLog10(new double[]{log10pNotSomaticGermline, log10pNotSomaticNoVariant});

    double log10pSomatic = log10pSomaticPrior + log10pNonRefInTumors + log10pRefInNormals;

    double lod = log10pSomatic - log10pNotSomatic;
    return Double.isInfinite(lod) ? -10000 : lod;
}
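// A worked toy example of the LOD arithmetic above, assuming a somatic prior Q of 30
// (prior probability 10^(-30/10) = 1e-3) and made-up per-sample log10 probabilities.
// All numbers are hypothetical; only the arithmetic mirrors calcLog10pSomatic, and the exact
// log10-sum replaces MathUtils.log10sumLog10. Uses only java.lang.Math.
static double toySomaticLod() {
    final double log10pNonRefInTumors  = -0.1;   // tumors look non-ref
    final double log10pRefInTumors     = -1.0;
    final double log10pNonRefInNormals = -2.0;   // normals look ref
    final double log10pRefInNormals    = -0.01;

    final double somaticPrior = 1e-3;                         // Q30 prior
    final double log10pSomaticPrior    = Math.log10(somaticPrior);
    final double log10pNotSomaticPrior = Math.log10(1 - somaticPrior);

    final double germline  = log10pNonRefInNormals + log10pNonRefInTumors;
    final double noVariant = log10pRefInNormals + log10pRefInTumors;
    // log10(10^a + 10^b), the quantity MathUtils.log10sumLog10 computes
    final double log10pNotSomatic = log10pNotSomaticPrior
            + Math.log10(Math.pow(10, germline) + Math.pow(10, noVariant));

    final double log10pSomatic = log10pSomaticPrior + log10pNonRefInTumors + log10pRefInNormals;
    return log10pSomatic - log10pNotSomatic;                  // the somatic LOD (about -2.1 here)
}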
public double[] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, HashMap<Allele, Haplotype> haplotypesInVC) {
    double[][] haplotypeLikelihoodMatrix = new double[haplotypesInVC.size()][haplotypesInVC.size()];
    double[][] readLikelihoods = new double[pileup.getReads().size()][haplotypesInVC.size()];

    int i = 0;
    for (GATKSAMRecord read : pileup.getReads()) {
        // 454 reads are skipped and the read index is not advanced, so the trailing rows of
        // readLikelihoods stay at zero; in the genotype loop below those zero rows add the same
        // constant to every (i,j) entry and do not change the relative likelihoods
        if (ReadUtils.is454Read(read)) {
            continue;
        }
        // for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi))
        // = sum_j(-10*log10(Pr(R_j | Hi))), since bases are assumed to be independent
        int j = 0;
        for (Map.Entry<Allele, Haplotype> a : haplotypesInVC.entrySet()) {
            readLikelihoods[i][j] = computeReadLikelihoodGivenHaplotype(a.getValue(), read);
            if (DEBUG) {
                System.out.print(read.getReadName() + " ");
                System.out.format("%d %d S:%d US:%d E:%d UE:%d C:%s %3.4f\n",
                        i, j,
                        read.getAlignmentStart(), read.getUnclippedStart(),
                        read.getAlignmentEnd(), read.getUnclippedEnd(),
                        read.getCigarString(), readLikelihoods[i][j]);
            }
            j++;
        }
        i++;
    }

    for (i = 0; i < haplotypesInVC.size(); i++) {
        for (int j = i; j < haplotypesInVC.size(); j++) {
            // combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j]
            // L(Hi, Hj) = sum_reads ( Pr(R|Hi)/2 + Pr(R|Hj)/2 )
            // readLikelihoods[k][j] has -10*log10(Pr(R_k | H[j])), converted to log10 scale below
            double[] readLikelihood = new double[2]; // diploid sample
            for (int readIdx = 0; readIdx < pileup.getReads().size(); readIdx++) {
                readLikelihood[0] = -readLikelihoods[readIdx][i] / 10;
                readLikelihood[1] = -readLikelihoods[readIdx][j] / 10;

                // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1 + 10^x2) - log10(2)
                // First term is approximated by Jacobian log with table lookup.
                // Second term is a constant added to both likelihoods so will be ignored
                haplotypeLikelihoodMatrix[i][j] +=
                        MathUtils.approximateLog10SumLog10(readLikelihood[0], readLikelihood[1]);
            }
        }
    }

    return getHaplotypeLikelihoods(haplotypeLikelihoodMatrix);
}
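// The diploid combination above needs log10(10^x1/2 + 10^x2/2) for every read. A minimal,
// self-contained stand-in for MathUtils.approximateLog10SumLog10 is sketched below; it computes
// the sum exactly rather than via the GATK table-lookup Jacobian approximation, which is an
// implementation detail not shown in this excerpt. Uses only java.lang.Math.
static double log10SumLog10(final double x1, final double x2) {
    // work relative to the larger term so the exponentiation cannot overflow
    final double big = Math.max(x1, x2);
    final double small = Math.min(x1, x2);
    // log10(10^x1 + 10^x2) = big + log10(1 + 10^(small - big))
    return big + Math.log10(1.0 + Math.pow(10.0, small - big));
}
// Averaging the two haplotype likelihoods for one read then subtracts the constant log10(2):
//   log10(Pr(R|Hi)/2 + Pr(R|Hj)/2) = log10SumLog10(log10 Pr(R|Hi), log10 Pr(R|Hj)) - Math.log10(2)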
private double log10PLFromSamples(final VariantContext vc, final String sample, boolean calcRefP) {
    Genotype g = vc.getGenotype(sample);
    double log10pSample = -1000;
    if (!g.isNoCall()) {
        final double[] gLikelihoods = MathUtils.normalizeFromLog10(g.getLikelihoods().getAsVector());
        log10pSample = Math.log10(calcRefP ? gLikelihoods[0] : 1 - gLikelihoods[0]);
        log10pSample = Double.isInfinite(log10pSample) ? -10000 : log10pSample;
    }
    return log10pSample;
}
/**
 * Computes an allele biased version of the given pileup
 *
 * @param pileup                the original pileup
 * @param downsamplingFraction  the fraction of total reads to remove per allele
 * @return allele biased pileup
 */
public static ReadBackedPileup createAlleleBiasedBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction) {
    // special case removal of all or no reads
    if ( downsamplingFraction <= 0.0 )
        return pileup;
    if ( downsamplingFraction >= 1.0 )
        return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>());

    final PileupElementList[] alleleStratifiedElements = new PileupElementList[4];
    for ( int i = 0; i < 4; i++ )
        alleleStratifiedElements[i] = new PileupElementList();

    // start by stratifying the reads by the alleles they represent at this position
    for ( final PileupElement pe : pileup ) {
        final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase());
        if ( baseIndex != -1 )
            alleleStratifiedElements[baseIndex].add(pe);
    }

    // make a listing of allele counts and calculate the total count
    final int[] alleleCounts = calculateAlleleCounts(alleleStratifiedElements);
    final int totalAlleleCount = (int) MathUtils.sum(alleleCounts);

    // do smart down-sampling
    final int numReadsToRemove = (int) (totalAlleleCount * downsamplingFraction); // floor
    final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove);

    final HashSet<PileupElement> readsToRemove = new HashSet<PileupElement>(numReadsToRemove);
    for ( int i = 0; i < 4; i++ ) {
        final PileupElementList alleleList = alleleStratifiedElements[i];
        // if we don't need to remove any reads, then don't
        if ( alleleCounts[i] > targetAlleleCounts[i] )
            readsToRemove.addAll(downsampleElements(alleleList, alleleCounts[i], alleleCounts[i] - targetAlleleCounts[i]));
    }

    // we need to keep the reads sorted because the FragmentUtils code will expect them in
    // coordinate order and will fail otherwise
    final List<PileupElement> readsToKeep = new ArrayList<PileupElement>(totalAlleleCount - numReadsToRemove);
    for ( final PileupElement pe : pileup ) {
        if ( !readsToRemove.contains(pe) ) {
            readsToKeep.add(pe);
        }
    }

    return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>(readsToKeep));
}
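// The per-allele targets above come from runSmartDownsampling, which is not shown in this
// excerpt. As a purely hypothetical simplification (not the GATK algorithm), a proportional
// scheme would shrink each allele's count by the same fraction and leave the caller to remove
// the per-allele difference, as the loop above does. Uses only java.lang.Math.
static int[] proportionalTargets(final int[] alleleCounts, final int numReadsToRemove) {
    int total = 0;
    for (final int c : alleleCounts) total += c;

    final int[] targets = new int[alleleCounts.length];
    int removedSoFar = 0;
    for (int i = 0; i < alleleCounts.length; i++) {
        // floor of this allele's proportional share of the removals
        final int remove = (int) ((long) numReadsToRemove * alleleCounts[i] / Math.max(total, 1));
        targets[i] = alleleCounts[i] - remove;
        removedSoFar += remove;
    }

    // spread any rounding leftover across allele classes that still have reads
    int leftover = numReadsToRemove - removedSoFar;
    for (int i = 0; leftover > 0 && i < targets.length; i++) {
        final int extra = Math.min(leftover, targets[i]);
        targets[i] -= extra;
        leftover -= extra;
    }
    return targets;
}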
/**
 * Get the most likely alleles estimated across all reads in this object
 *
 * <p>Takes the most likely two alleles according to their diploid genotype likelihoods. That is,
 * for each allele i and j we compute p(D | i,j) where D is the read likelihoods. We track the
 * maximum i,j likelihood and return an object that contains the alleles i and j as well as the
 * max likelihood.
 *
 * <p>Note that the second most likely diploid genotype is not tracked, so the resulting
 * MostLikelyAllele does not carry a meaningful second-best likelihood.
 *
 * @return a MostLikelyAllele object, or null if this map is empty
 */
public MostLikelyAllele getMostLikelyDiploidAlleles() {
    if ( isEmpty() )
        return null;

    int hap1 = 0;
    int hap2 = 0;
    double maxElement = Double.NEGATIVE_INFINITY;
    for ( int iii = 0; iii < alleles.size(); iii++ ) {
        final Allele iii_allele = alleles.get(iii);
        for ( int jjj = 0; jjj <= iii; jjj++ ) {
            final Allele jjj_allele = alleles.get(jjj);

            double haplotypeLikelihood = 0.0;
            for ( final Map.Entry<GATKSAMRecord, Map<Allele, Double>> entry : likelihoodReadMap.entrySet() ) {
                // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1 + 10^x2) - log10(2)
                final double likelihood_iii = entry.getValue().get(iii_allele);
                final double likelihood_jjj = entry.getValue().get(jjj_allele);
                haplotypeLikelihood += MathUtils.approximateLog10SumLog10(likelihood_iii, likelihood_jjj)
                        + MathUtils.LOG_ONE_HALF;

                // fast exit. If this diploid pair is already worse than the max, just stop and look at the next pair
                if ( haplotypeLikelihood < maxElement )
                    break;
            }

            // keep track of the max element and associated indices
            if ( haplotypeLikelihood > maxElement ) {
                hap1 = iii;
                hap2 = jjj;
                maxElement = haplotypeLikelihood;
            }
        }
    }

    if ( maxElement == Double.NEGATIVE_INFINITY )
        throw new IllegalStateException("max likelihood is " + maxElement + " indicating something has gone wrong");

    return new MostLikelyAllele(alleles.get(hap1), alleles.get(hap2), maxElement, maxElement);
}
@Override
public CallableBaseState map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
    CalledState state;

    if ( BaseUtils.isNBase(ref.getBase()) ) {
        state = CalledState.REF_N;
    } else {
        // count up the depths of all and QC+ bases
        int rawDepth = 0, QCDepth = 0, lowMAPQDepth = 0;
        for ( PileupElement e : context.getBasePileup() ) {
            rawDepth++;

            if ( e.getMappingQual() <= maxLowMAPQ )
                lowMAPQDepth++;

            if ( e.getMappingQual() >= minMappingQuality && (e.getQual() >= minBaseQuality || e.isDeletion()) ) {
                QCDepth++;
            }
        }

        // System.out.printf("%s rawdepth = %d QCDepth = %d lowMAPQ = %d%n", context.getLocation(), rawDepth, QCDepth, lowMAPQDepth);
        if ( rawDepth == 0 ) {
            state = CalledState.NO_COVERAGE;
        } else if ( rawDepth >= minDepthLowMAPQ && MathUtils.ratio(lowMAPQDepth, rawDepth) >= maxLowMAPQFraction ) {
            state = CalledState.POOR_MAPPING_QUALITY;
        } else if ( QCDepth < minDepth ) {
            state = CalledState.LOW_COVERAGE;
        } else if ( rawDepth >= maxDepth && maxDepth != -1 ) {
            state = CalledState.EXCESSIVE_COVERAGE;
        } else {
            state = CalledState.CALLABLE;
        }
    }

    return new CallableBaseState(getToolkit().getGenomeLocParser(), context.getLocation(), state);
}
public double computeReadLikelihoodGivenHaplotype(Haplotype haplotype, SAMRecord read) {

    long numStartClippedBases = 0;
    long numEndClippedBases = 0;

    byte[] unclippedReadQuals = read.getBaseQualities();
    byte[] unclippedReadBases = read.getReadBases();

    // Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative,
    // and may leave a string of Q2 bases still hanging off the reads.
    for (int i = 0; i < read.getReadLength(); i++) {
        if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD)
            numStartClippedBases++;
        else
            break;
    }
    for (int i = read.getReadLength() - 1; i >= 0; i--) {
        if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD)
            numEndClippedBases++;
        else
            break;
    }
    // System.out.format("numstart: %d numend: %d\n", numStartClippedBases, numEndClippedBases);
    if (numStartClippedBases + numEndClippedBases >= read.getReadLength()) {
        return 0; /// Double.POSITIVE_INFINITY;
    }

    byte[] readBases = Arrays.copyOfRange(unclippedReadBases,
            (int) numStartClippedBases,
            (int) (read.getReadBases().length - numEndClippedBases));

    byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,
            (int) numStartClippedBases,
            (int) (read.getReadBases().length - numEndClippedBases));

    int readLength = readBases.length;

    // initialize path metric and traceback memories for Viterbi computation
    pathMetricArray = new double[readLength + 1][PATH_METRIC_TABLE_LENGTH];
    bestStateIndexArray = new int[readLength + 1][PATH_METRIC_TABLE_LENGTH];

    for (int k = 1; k < PATH_METRIC_TABLE_LENGTH; k++)
        pathMetricArray[0][k] = 0;

    /*
    if (doSimpleCalculationModel) {
        // No Viterbi algorithm - assume no sequencing indel artifacts,
        // so we can collapse computations and pr(read | haplotype) is just probability of observing overlap
        // of read with haplotype.
        int haplotypeIndex = initialIndexInHaplotype;

        double c = 0.0; //deletionErrorProbabilities[1] + logOneMinusInsertionStartProbability;

        // compute likelihood of portion of base to the left of the haplotype
        for (int indR = readStartIdx - 1; indR >= 0; indR--) {
            byte readBase = readBases[indR];
            byte readQual = readQuals[indR];
            if (readQual <= 2)
                continue;
            double pBaseRead = getProbabilityOfReadBaseGivenXandI((byte) 0, readBase, readQual, LEFT_ALIGN_INDEX, 0);

            // pBaseRead has -10*log10(Prob(base[i]|haplotype[i]))
            pRead += pBaseRead;
        }
        //System.out.format("\nSt: %d Pre-Likelihood:%f\n", readStartIdx, pRead);

        for (int indR = readStartIdx; indR < readBases.length; indR++) {
            byte readBase = readBases[indR];
            byte readQual = readQuals[indR];

            byte haplotypeBase;
            if (haplotypeIndex < RIGHT_ALIGN_INDEX)
                haplotypeBase = haplotype.getBases()[haplotypeIndex];
            else
                haplotypeBase = (byte) 0; // dummy

            double pBaseRead = getProbabilityOfReadBaseGivenXandI(haplotypeBase, readBase, readQual, haplotypeIndex, 0);
            if (haplotypeBase != 0)
                pBaseRead += c;

            // pBaseRead has -10*log10(Prob(base[i]|haplotype[i]))
            if (readQual > 3)
                pRead += pBaseRead;

            haplotypeIndex++;
            if (haplotypeIndex >= haplotype.getBases().length)
                haplotypeIndex = RIGHT_ALIGN_INDEX;
            //System.out.format("H:%c R:%c RQ:%d HI:%d %4.5f %4.5f\n", haplotypeBase, readBase, (int) readQual, haplotypeIndex, pBaseRead, pRead);
        }

        //System.out.format("\nSt: %d Post-Likelihood:%f\n", readStartIdx, pRead);

        if (DEBUG) {
            System.out.println(read.getReadName());
            System.out.print("Haplotype:");
            for (int k = 0; k < haplotype.getBases().length; k++) {
                System.out.format("%c ", haplotype.getBases()[k]);
            }
            System.out.println();
            System.out.print("Read bases: ");
            for (int k = 0; k < readBases.length; k++) {
                System.out.format("%c ", readBases[k]);
            }
            System.out.format("\nLikelihood:%f\n", pRead);
        }

        if (read.getReadName().contains("106880")) {
            System.out.println("aca");
            System.out.println("Haplotype:");
            for (int k = initialIndexInHaplotype; k < haplotype.getBases().length; k++) {
                System.out.format("%c ", haplotype.getBases()[k]);
            }
            System.out.println();
            System.out.println("Read bases: ");
            for (int k = readStartIdx; k < readBases.length; k++) {
                System.out.format("%c ", readBases[k]);
            }
        }
        return pRead;
    }
    */

    // Update path metric computations based on branch metric (Add/Compare/Select operations)
    // do forward direction first, ie from anchor to end of read
    // outer loop
    for (int indR = 0; indR < readLength; indR++) {
        byte readBase = readBases[indR];
        byte readQual = readQuals[indR];

        for (int indX = LEFT_ALIGN_INDEX; indX <= RIGHT_ALIGN_INDEX; indX++) {
            byte haplotypeBase;
            if (indX > LEFT_ALIGN_INDEX && indX < RIGHT_ALIGN_INDEX)
                haplotypeBase = haplotype.getBases()[indX - 1];
            else
                haplotypeBase = readBase;

            updatePathMetrics(haplotypeBase, indX, indR, readBase, readQual);
        }
    }

    // for debugging only: compute backtracking to find optimal route through trellis. Since I'm
    // only interested in log-likelihood of best state, this isn't really necessary.
    double bestMetric = MathUtils.arrayMin(pathMetricArray[readLength]);

    if (DEBUG) {
        System.out.println(read.getReadName());

        System.out.print("Haplotype:");
        for (int k = 0; k < haplotype.getBases().length; k++) {
            System.out.format("%c ", haplotype.getBases()[k]);
        }
        System.out.println();

        System.out.print("Read bases: ");
        for (int k = 0; k < readBases.length; k++) {
            System.out.format("%c ", readBases[k]);
        }
        System.out.println();

        System.out.print("Read quals: ");
        for (int k = 0; k < readQuals.length; k++) {
            System.out.format("%d ", (int) readQuals[k]);
        }
        System.out.println();

        // start from last position of read, go backwards to find optimal alignment
        int[] bestIndexArray = new int[readLength];
        int bestIndex = MathUtils.minElementIndex(pathMetricArray[readLength]);
        bestIndexArray[readLength - 1] = bestIndex;

        for (int k = readLength - 2; k >= 0; k--) {
            bestIndex = bestStateIndexArray[k][bestIndex];
            bestIndexArray[k] = bestIndex;
        }

        System.out.print("Alignment: ");
        for (int k = 0; k < readBases.length; k++) {
            System.out.format("%d ", bestIndexArray[k]);
        }
        System.out.println();
    }

    // now just take optimum along all path metrics: that's the log likelihood of best alignment
    if (DEBUG)
        System.out.format("Likelihood: %5.4f\n", bestMetric);

    return bestMetric;
}
public final List<Sample> parse(Reader reader, EnumSet<MissingPedField> missingFields, SampleDB sampleDB) {
    final List<String> lines = new XReadLines(reader).readLines();

    // What are the record offsets?
    final int familyPos = missingFields.contains(MissingPedField.NO_FAMILY_ID) ? -1 : 0;
    final int samplePos = familyPos + 1;
    final int paternalPos = missingFields.contains(MissingPedField.NO_PARENTS) ? -1 : samplePos + 1;
    final int maternalPos = missingFields.contains(MissingPedField.NO_PARENTS) ? -1 : paternalPos + 1;
    final int sexPos = missingFields.contains(MissingPedField.NO_SEX) ? -1 : Math.max(maternalPos, samplePos) + 1;
    final int phenotypePos = missingFields.contains(MissingPedField.NO_PHENOTYPE) ? -1 : Math.max(sexPos, Math.max(maternalPos, samplePos)) + 1;
    final int nExpectedFields = MathUtils.arrayMaxInt(Arrays.asList(samplePos, paternalPos, maternalPos, sexPos, phenotypePos)) + 1;

    // go through once and determine properties
    int lineNo = 1;
    boolean isQT = false;
    final List<String[]> splits = new ArrayList<String[]>(lines.size());
    for (final String line : lines) {
        if (line.startsWith(commentMarker)) continue;
        if (line.trim().equals("")) continue;

        final String[] parts = line.split("\\s+");

        if (parts.length != nExpectedFields)
            throw new UserException.MalformedFile(reader.toString(), "Bad PED line " + lineNo + ": wrong number of fields");

        if (phenotypePos != -1) {
            isQT = isQT || !CATAGORICAL_TRAIT_VALUES.contains(parts[phenotypePos]);
        }

        splits.add(parts);
        lineNo++;
    }
    logger.info("Phenotype is other? " + isQT);

    // now go through and parse each record
    lineNo = 1;
    final List<Sample> samples = new ArrayList<Sample>(splits.size());
    for (final String[] parts : splits) {
        String familyID = null, individualID, paternalID = null, maternalID = null;
        Gender sex = Gender.UNKNOWN;
        String quantitativePhenotype = Sample.UNSET_QT;
        Affection affection = Affection.UNKNOWN;

        if (familyPos != -1) familyID = maybeMissing(parts[familyPos]);
        individualID = parts[samplePos];
        if (paternalPos != -1) paternalID = maybeMissing(parts[paternalPos]);
        if (maternalPos != -1) maternalID = maybeMissing(parts[maternalPos]);

        if (sexPos != -1) {
            if (parts[sexPos].equals(SEX_MALE)) sex = Gender.MALE;
            else if (parts[sexPos].equals(SEX_FEMALE)) sex = Gender.FEMALE;
            else sex = Gender.UNKNOWN;
        }

        if (phenotypePos != -1) {
            if (isQT) {
                if (parts[phenotypePos].equals(MISSING_VALUE1))
                    affection = Affection.UNKNOWN;
                else {
                    affection = Affection.OTHER;
                    quantitativePhenotype = parts[phenotypePos];
                }
            } else {
                if (parts[phenotypePos].equals(MISSING_VALUE1)) affection = Affection.UNKNOWN;
                else if (parts[phenotypePos].equals(MISSING_VALUE2)) affection = Affection.UNKNOWN;
                else if (parts[phenotypePos].equals(PHENOTYPE_UNAFFECTED)) affection = Affection.UNAFFECTED;
                else if (parts[phenotypePos].equals(PHENOTYPE_AFFECTED)) affection = Affection.AFFECTED;
                else throw new ReviewedGATKException("Unexpected phenotype type " + parts[phenotypePos] + " at line " + lineNo);
            }
        }

        final Sample s = new Sample(individualID, sampleDB, familyID, paternalID, maternalID, sex, affection, quantitativePhenotype);
        samples.add(s);
        sampleDB.addSample(s);
        lineNo++;
    }

    for (final Sample sample : new ArrayList<Sample>(samples)) {
        Sample dad = maybeAddImplicitSample(sampleDB, sample.getPaternalID(), sample.getFamilyID(), Gender.MALE);
        if (dad != null) samples.add(dad);

        Sample mom = maybeAddImplicitSample(sampleDB, sample.getMaternalID(), sample.getFamilyID(), Gender.FEMALE);
        if (mom != null) samples.add(mom);
    }

    return samples;
}
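// The column offsets above collapse when optional PED fields are declared missing. A
// self-contained version of that offset arithmetic (plain booleans standing in for the
// MissingPedField enum checks) shows how a file with no family IDs and no parents ends up with
// sample, sex and phenotype in columns 0, 1 and 2. This is an illustrative sketch, not part of
// the parser itself. Uses only java.lang.Math.
static int[] pedColumnOffsets(final boolean noFamily, final boolean noParents,
                              final boolean noSex, final boolean noPhenotype) {
    final int familyPos = noFamily ? -1 : 0;
    final int samplePos = familyPos + 1;
    final int paternalPos = noParents ? -1 : samplePos + 1;
    final int maternalPos = noParents ? -1 : paternalPos + 1;
    final int sexPos = noSex ? -1 : Math.max(maternalPos, samplePos) + 1;
    final int phenotypePos = noPhenotype ? -1 : Math.max(sexPos, Math.max(maternalPos, samplePos)) + 1;
    return new int[]{familyPos, samplePos, paternalPos, maternalPos, sexPos, phenotypePos};
}
// Example: pedColumnOffsets(true, true, false, false) returns { -1, 0, -1, -1, 1, 2 }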