/** * Determine if the given loc overlaps any loc in the sorted set * * @param loc the location to test * @return */ public boolean overlaps(final GenomeLoc loc) { for (final GenomeLoc e : mArray) { if (e.overlapsP(loc)) { return true; } } return false; }
/** * return a deep copy of this collection. * * @return a new GenomeLocSortedSet, identical to the current GenomeLocSortedSet. */ public GenomeLocSortedSet clone() { GenomeLocSortedSet ret = new GenomeLocSortedSet(genomeLocParser); for (GenomeLoc loc : this.mArray) { // ensure a deep copy ret.mArray.add( genomeLocParser.createGenomeLoc(loc.getContig(), loc.getStart(), loc.getStop())); } return ret; }
public String toString() { StringBuilder s = new StringBuilder(); s.append("["); for (GenomeLoc e : this) { s.append(" "); s.append(e.toString()); } s.append("]"); return s.toString(); }
@Test public void deleteSomeByRegion() { GenomeLoc e = genomeLocParser.createGenomeLoc(contigOneName, 1, 100); mSortedSet.add(e); for (int x = 1; x < 50; x++) { GenomeLoc del = genomeLocParser.createGenomeLoc(contigOneName, x, x); mSortedSet = mSortedSet.subtractRegions(new GenomeLocSortedSet(genomeLocParser, del)); } assertTrue(!mSortedSet.isEmpty()); assertTrue(mSortedSet.size() == 1); GenomeLoc loc = mSortedSet.iterator().next(); assertTrue(loc.getStop() == 100); assertTrue(loc.getStart() == 50); }
@Test public void fromSequenceDictionary() { mSortedSet = GenomeLocSortedSet.createSetFromSequenceDictionary(this.header.getSequenceDictionary()); // we should have sequence assertTrue(mSortedSet.size() == GenomeLocSortedSetUnitTest.NUMBER_OF_CHROMOSOMES); int seqNumber = 0; for (GenomeLoc loc : mSortedSet) { assertTrue(loc.getStart() == 1); assertTrue(loc.getStop() == GenomeLocSortedSetUnitTest.CHROMOSOME_SIZE); assertTrue(loc.getContigIndex() == seqNumber); ++seqNumber; } assertTrue(seqNumber == GenomeLocSortedSetUnitTest.NUMBER_OF_CHROMOSOMES); }
@Test public void mergingOverlappingAbove() { GenomeLoc e = genomeLocParser.createGenomeLoc(contigOneName, 0, 50); GenomeLoc g = genomeLocParser.createGenomeLoc(contigOneName, 49, 100); assertTrue(mSortedSet.size() == 0); mSortedSet.add(g); assertTrue(mSortedSet.size() == 1); mSortedSet.addRegion(e); assertTrue(mSortedSet.size() == 1); Iterator<GenomeLoc> iter = mSortedSet.iterator(); GenomeLoc loc = iter.next(); assertEquals(loc.getStart(), 0); assertEquals(loc.getStop(), 100); assertEquals(loc.getContigIndex(), 1); }
/** * add a genomeLoc to the collection, simply inserting in order into the set * * @param e the GenomeLoc to add * @return true */ public boolean add(GenomeLoc e) { // assuming that the intervals coming arrive in order saves us a fair amount of time (and it's // most likely true) if (mArray.size() > 0 && e.isPast(mArray.get(mArray.size() - 1))) { mArray.add(e); return true; } else { int loc = Collections.binarySearch(mArray, e); if (loc >= 0) { throw new ReviewedStingException( "Genome Loc Sorted Set already contains the GenomicLoc " + e.toString()); } else { mArray.add((loc + 1) * -1, e); return true; } } }
public GenomeLocSortedSet subtractRegions(GenomeLocSortedSet toRemoveSet) { LinkedList<GenomeLoc> good = new LinkedList<GenomeLoc>(); Stack<GenomeLoc> toProcess = new Stack<GenomeLoc>(); Stack<GenomeLoc> toExclude = new Stack<GenomeLoc>(); // initialize the stacks toProcess.addAll(mArray); Collections.reverse(toProcess); toExclude.addAll(toRemoveSet.mArray); Collections.reverse(toExclude); int i = 0; while (!toProcess.empty()) { // while there's still stuff to process if (toExclude.empty()) { good.addAll(toProcess); // no more excludes, all the processing stuff is good break; } GenomeLoc p = toProcess.peek(); GenomeLoc e = toExclude.peek(); if (p.overlapsP(e)) { toProcess.pop(); for (GenomeLoc newP : p.subtract(e)) toProcess.push(newP); } else if (p.compareContigs(e) < 0) { good.add(toProcess.pop()); // p is now good } else if (p.compareContigs(e) > 0) { toExclude.pop(); // e can't effect anything } else if (p.getStop() < e.getStart()) { good.add(toProcess.pop()); // p stops before e starts, p is good } else if (e.getStop() < p.getStart()) { toExclude.pop(); // p starts after e stops, e is done } else { throw new ReviewedStingException("BUG: unexpected condition: p=" + p + ", e=" + e); } if (i++ % 10000 == 0) logger.debug("removeRegions operation: i = " + i); } return createSetFromList(genomeLocParser, good); }
/** * Return the number of bps before loc in the sorted set * * @param loc the location before which we are counting bases * @return */ public long sizeBeforeLoc(GenomeLoc loc) { long s = 0; for (GenomeLoc e : this) { if (e.isBefore(loc)) s += e.size(); else if (e.isPast(loc)) ; // don't do anything else // loc is inside of s s += loc.getStart() - e.getStart(); } return s; }
@Test public void deleteSuperRegion() { GenomeLoc e = genomeLocParser.createGenomeLoc(contigOneName, 10, 20); GenomeLoc g = genomeLocParser.createGenomeLoc(contigOneName, 70, 100); mSortedSet.add(g); mSortedSet.addRegion(e); assertTrue(mSortedSet.size() == 2); // now delete a region GenomeLoc d = genomeLocParser.createGenomeLoc(contigOneName, 15, 75); mSortedSet = mSortedSet.subtractRegions(new GenomeLocSortedSet(genomeLocParser, d)); Iterator<GenomeLoc> iter = mSortedSet.iterator(); GenomeLoc loc = iter.next(); assertTrue(loc.getStart() == 10); assertTrue(loc.getStop() == 14); assertTrue(loc.getContigIndex() == 1); loc = iter.next(); assertTrue(loc.getStart() == 76); assertTrue(loc.getStop() == 100); assertTrue(loc.getContigIndex() == 1); }
/** * Read in a list of ExactCall objects from reader, keeping only those with starts in startsToKeep * or all sites (if this is empty) * * @param reader a just-opened reader sitting at the start of the file * @param startsToKeep a list of start position of the calls to keep, or empty if all calls should * be kept * @param parser a genome loc parser to create genome locs * @return a list of ExactCall objects in reader * @throws IOException */ public static List<ExactCall> readExactLog( final BufferedReader reader, final List<Integer> startsToKeep, GenomeLocParser parser) throws IOException { if (reader == null) throw new IllegalArgumentException("reader cannot be null"); if (startsToKeep == null) throw new IllegalArgumentException("startsToKeep cannot be null"); if (parser == null) throw new IllegalArgumentException("GenomeLocParser cannot be null"); List<ExactCall> calls = new LinkedList<ExactCall>(); // skip the header line reader.readLine(); // skip the first "type" line reader.readLine(); while (true) { final VariantContextBuilder builder = new VariantContextBuilder(); final List<Allele> alleles = new ArrayList<Allele>(); final List<Genotype> genotypes = new ArrayList<Genotype>(); final double[] posteriors = new double[2]; final double[] priors = MathUtils.normalizeFromLog10(new double[] {0.5, 0.5}, true); final List<Integer> mle = new ArrayList<Integer>(); final Map<Allele, Double> log10pNonRefByAllele = new HashMap<Allele, Double>(); long runtimeNano = -1; GenomeLoc currentLoc = null; while (true) { final String line = reader.readLine(); if (line == null) return calls; final String[] parts = line.split("\t"); final GenomeLoc lineLoc = parser.parseGenomeLoc(parts[0]); final String variable = parts[1]; final String key = parts[2]; final String value = parts[3]; if (currentLoc == null) currentLoc = lineLoc; if (variable.equals("type")) { if (startsToKeep.isEmpty() || startsToKeep.contains(currentLoc.getStart())) { builder.alleles(alleles); final int stop = currentLoc.getStart() + alleles.get(0).length() - 1; builder.chr(currentLoc.getContig()).start(currentLoc.getStart()).stop(stop); builder.genotypes(genotypes); final int[] mleInts = ArrayUtils.toPrimitive(mle.toArray(new Integer[] {})); final AFCalcResult result = new AFCalcResult(mleInts, 1, alleles, posteriors, priors, log10pNonRefByAllele); calls.add(new ExactCall(builder.make(), runtimeNano, result)); } break; } else if (variable.equals("allele")) { final boolean isRef = key.equals("0"); alleles.add(Allele.create(value, isRef)); } else if (variable.equals("PL")) { final GenotypeBuilder gb = new GenotypeBuilder(key); gb.PL(GenotypeLikelihoods.fromPLField(value).getAsPLs()); genotypes.add(gb.make()); } else if (variable.equals("log10PosteriorOfAFEq0")) { posteriors[0] = Double.valueOf(value); } else if (variable.equals("log10PosteriorOfAFGt0")) { posteriors[1] = Double.valueOf(value); } else if (variable.equals("MLE")) { mle.add(Integer.valueOf(value)); } else if (variable.equals("pNonRefByAllele")) { final Allele a = Allele.create(key); log10pNonRefByAllele.put(a, Double.valueOf(value)); } else if (variable.equals("runtime.nano")) { runtimeNano = Long.valueOf(value); } else { // nothing to do } } } }
/** * Return the size, in bp, of the genomic regions by all of the regions in this set * * @return size in bp of the covered regions */ public long coveredSize() { long s = 0; for (GenomeLoc e : this) s += e.size(); return s; }
/** * Adds a GenomeLoc to the collection, merging it if it overlaps another region. If it's not * overlapping then we add it in sorted order. * * @param e the GenomeLoc to add to the collection * @return true, if the GenomeLoc could be added to the collection */ public boolean addRegion(GenomeLoc e) { if (e == null) { return false; } // have we added it to the collection? boolean haveAdded = false; /** * check if the specified element overlaps any current locations, if so we should merge the two. */ for (GenomeLoc g : mArray) { if (g.contiguousP(e)) { GenomeLoc c = g.merge(e); mArray.set(mArray.indexOf(g), c); haveAdded = true; } else if ((g.getContigIndex() == e.getContigIndex()) && (e.getStart() < g.getStart()) && !haveAdded) { mArray.add(mArray.indexOf(g), e); return true; } else if (haveAdded && ((e.getContigIndex() > e.getContigIndex()) || (g.getContigIndex() == e.getContigIndex() && e.getStart() > g.getStart()))) { return true; } } /** * we're at the end and we haven't found locations that should fall after it, so we'll put it at * the end */ if (!haveAdded) { mArray.add(e); } return true; }
@DataProvider(name = "GetOverlapping") public Object[][] makeGetOverlappingTest() throws Exception { final GenomeLocParser genomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(b37KGReference))); List<Object[]> tests = new ArrayList<Object[]>(); final GenomeLoc prev1 = genomeLocParser.createGenomeLoc("19", 1, 10); final GenomeLoc prev2 = genomeLocParser.createGenomeLoc("19", 20, 50); final GenomeLoc post1 = genomeLocParser.createGenomeLoc("21", 1, 10); final GenomeLoc post2 = genomeLocParser.createGenomeLoc("21", 20, 50); final int chr20Length = genomeLocParser.getContigs().getSequence("20").getSequenceLength(); for (final int regionStart : Arrays.asList(1, 10, chr20Length - 10, chr20Length)) { for (final int regionSize : Arrays.asList(1, 10, 100)) { final GenomeLoc region = genomeLocParser.createGenomeLocOnContig("20", regionStart, regionStart + regionSize); final GenomeLoc spanning = genomeLocParser.createGenomeLocOnContig("20", regionStart - 10, region.getStop() + 10); final GenomeLoc before_into = genomeLocParser.createGenomeLocOnContig("20", regionStart - 10, regionStart + 1); final GenomeLoc middle = genomeLocParser.createGenomeLocOnContig("20", regionStart + 1, regionStart + 2); final GenomeLoc middle_past = genomeLocParser.createGenomeLocOnContig( "20", region.getStop() - 1, region.getStop() + 10); final List<GenomeLoc> potentials = new LinkedList<GenomeLoc>(); potentials.add(region); if (spanning != null) potentials.add(spanning); if (before_into != null) potentials.add(before_into); if (middle != null) potentials.add(middle); if (middle_past != null) potentials.add(middle_past); for (final int n : Arrays.asList(1, 2, 3)) { for (final List<GenomeLoc> regions : Utils.makePermutations(potentials, n, false)) { tests.add(new Object[] {new GenomeLocSortedSet(genomeLocParser, regions), region}); tests.add( new Object[] { new GenomeLocSortedSet(genomeLocParser, Utils.append(regions, prev1)), region }); tests.add( new Object[] { new GenomeLocSortedSet(genomeLocParser, Utils.append(regions, prev1, prev2)), region }); tests.add( new Object[] { new GenomeLocSortedSet(genomeLocParser, Utils.append(regions, post1)), region }); tests.add( new Object[] { new GenomeLocSortedSet(genomeLocParser, Utils.append(regions, post1, post2)), region }); tests.add( new Object[] { new GenomeLocSortedSet(genomeLocParser, Utils.append(regions, prev1, post1)), region }); tests.add( new Object[] { new GenomeLocSortedSet( genomeLocParser, Utils.append(regions, prev1, prev2, post1, post2)), region }); } } } } return tests.toArray(new Object[][] {}); }
/** * Main entry function to calculate genotypes of a given VC with corresponding GL's * * @param tracker Tracker * @param refContext Reference context * @param rawContext Raw context * @param stratifiedContexts Stratified alignment contexts * @param vc Input VC * @param model GL calculation model * @param inheritAttributesFromInputVC Output VC will contain attributes inherited from input vc * @return VC with assigned genotypes */ public VariantCallContext calculateGenotypes( final RefMetaDataTracker tracker, final ReferenceContext refContext, final AlignmentContext rawContext, Map<String, AlignmentContext> stratifiedContexts, final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model, final boolean inheritAttributesFromInputVC, final Map<String, org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap) { boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null; // initialize the data for this thread if that hasn't been done yet if (afcm.get() == null) { afcm.set(AFCalcFactory.createAFCalc(UAC, N, logger)); } // estimate our confidence in a reference call and return if (vc.getNSamples() == 0) { if (limitedContext) return null; return (UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES ? estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), false, 1.0) : generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext)); } AFCalcResult AFresult = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model)); // is the most likely frequency conformation AC=0 for all alternate alleles? boolean bestGuessIsRef = true; // determine which alternate alleles have AF>0 final List<Allele> myAlleles = new ArrayList<Allele>(vc.getAlleles().size()); final List<Integer> alleleCountsofMLE = new ArrayList<Integer>(vc.getAlleles().size()); myAlleles.add(vc.getReference()); for (int i = 0; i < AFresult.getAllelesUsedInGenotyping().size(); i++) { final Allele alternateAllele = AFresult.getAllelesUsedInGenotyping().get(i); if (alternateAllele.isReference()) continue; // we are non-ref if the probability of being non-ref > the emit confidence. // the emit confidence is phred-scaled, say 30 => 10^-3. // the posterior AF > 0 is log10: -5 => 10^-5 // we are non-ref if 10^-5 < 10^-3 => -5 < -3 final boolean isNonRef = AFresult.isPolymorphic(alternateAllele, UAC.STANDARD_CONFIDENCE_FOR_EMITTING / -10.0); // if the most likely AC is not 0, then this is a good alternate allele to use if (isNonRef) { myAlleles.add(alternateAllele); alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele)); bestGuessIsRef = false; } // if in GENOTYPE_GIVEN_ALLELES mode, we still want to allow the use of a poor allele else if (UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) { myAlleles.add(alternateAllele); alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele)); } } final double PoFGT0 = Math.pow(10, AFresult.getLog10PosteriorOfAFGT0()); // note the math.abs is necessary because -10 * 0.0 => -0.0 which isn't nice final double phredScaledConfidence = Math.abs( !bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE .GENOTYPE_GIVEN_ALLELES ? -10 * AFresult.getLog10PosteriorOfAFEq0() : -10 * AFresult.getLog10PosteriorOfAFGT0()); // return a null call if we don't pass the confidence cutoff or the most likely allele frequency // is zero if (UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestGuessIsRef)) { // technically, at this point our confidence in a reference call isn't accurately estimated // because it didn't take into account samples with no data, so let's get a better estimate return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, PoFGT0); } // start constructing the resulting VC final GenomeLoc loc = genomeLocParser.createGenomeLoc(vc); final VariantContextBuilder builder = new VariantContextBuilder( "UG_call", loc.getContig(), loc.getStart(), loc.getStop(), myAlleles); builder.log10PError(phredScaledConfidence / -10.0); if (!passesCallThreshold(phredScaledConfidence)) builder.filters(filter); // create the genotypes final GenotypesContext genotypes = afcm.get().subsetAlleles(vc, myAlleles, true, ploidy); builder.genotypes(genotypes); // print out stats if we have a writer if (verboseWriter != null && !limitedContext) printVerboseData(refContext.getLocus().toString(), vc, PoFGT0, phredScaledConfidence, model); // *** note that calculating strand bias involves overwriting data structures, so we do that // last final HashMap<String, Object> attributes = new HashMap<String, Object>(); // inherit attributed from input vc if requested if (inheritAttributesFromInputVC) attributes.putAll(vc.getAttributes()); // if the site was downsampled, record that fact if (!limitedContext && rawContext.hasPileupBeenDownsampled()) attributes.put(VCFConstants.DOWNSAMPLED_KEY, true); if (UAC.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED) attributes.put(NUMBER_OF_DISCOVERED_ALLELES_KEY, vc.getAlternateAlleles().size()); // add the MLE AC and AF annotations if (alleleCountsofMLE.size() > 0) { attributes.put(VCFConstants.MLE_ALLELE_COUNT_KEY, alleleCountsofMLE); final int AN = builder.make().getCalledChrCount(); final ArrayList<Double> MLEfrequencies = new ArrayList<Double>(alleleCountsofMLE.size()); // the MLEAC is allowed to be larger than the AN (e.g. in the case of all PLs being 0, the GT // is ./. but the exact model may arbitrarily choose an AC>1) for (int AC : alleleCountsofMLE) MLEfrequencies.add(Math.min(1.0, (double) AC / (double) AN)); attributes.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, MLEfrequencies); } if (UAC.COMPUTE_SLOD && !limitedContext && !bestGuessIsRef) { // final boolean DEBUG_SLOD = false; // the overall lod // double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0]; double overallLog10PofF = AFresult.getLog10LikelihoodOfAFGT0(); // if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); List<Allele> allAllelesToUse = builder.make().getAlleles(); // the forward lod VariantContext vcForward = calculateLikelihoods( tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, allAllelesToUse, false, model, perReadAlleleLikelihoodMap); AFresult = afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model)); // double[] normalizedLog10Posteriors = // MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); double forwardLog10PofNull = AFresult.getLog10LikelihoodOfAFEq0(); double forwardLog10PofF = AFresult.getLog10LikelihoodOfAFGT0(); // if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", // forwardLog10PofF=" + forwardLog10PofF); // the reverse lod VariantContext vcReverse = calculateLikelihoods( tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, allAllelesToUse, false, model, perReadAlleleLikelihoodMap); AFresult = afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model)); // normalizedLog10Posteriors = // MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); double reverseLog10PofNull = AFresult.getLog10LikelihoodOfAFEq0(); double reverseLog10PofF = AFresult.getLog10LikelihoodOfAFGT0(); // if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", // reverseLog10PofF=" + reverseLog10PofF); double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF; double reverseLod = reverseLog10PofF + forwardLog10PofNull - overallLog10PofF; // if ( DEBUG_SLOD ) System.out.println("forward lod=" + forwardLod + ", reverse lod=" + // reverseLod); // strand score is max bias between forward and reverse strands double strandScore = Math.max(forwardLod, reverseLod); // rescale by a factor of 10 strandScore *= 10.0; // logger.debug(String.format("SLOD=%f", strandScore)); if (!Double.isNaN(strandScore)) attributes.put("SB", strandScore); } // finish constructing the resulting VC builder.attributes(attributes); VariantContext vcCall = builder.make(); // if we are subsetting alleles (either because there were too many or because some were not // polymorphic) // then we may need to trim the alleles (because the original VariantContext may have had to pad // at the end). if (myAlleles.size() != vc.getAlleles().size() && !limitedContext) // limitedContext callers need to handle allele trimming on their own to // keep their perReadAlleleLikelihoodMap alleles in sync vcCall = VariantContextUtils.reverseTrimAlleles(vcCall); if (annotationEngine != null && !limitedContext) { // limitedContext callers need to handle annotations on their own by // calling their own annotationEngine // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations final ReadBackedPileup pileup = rawContext.getBasePileup(); stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); vcCall = annotationEngine.annotateContext( tracker, refContext, stratifiedContexts, vcCall, perReadAlleleLikelihoodMap); } return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PoFGT0)); }