/**
 * Test the reads according to an independently derived context.
 *
 * @param view  the locus view over the data
 * @param range the list of loci covered by the view
 * @param reads the reads expected at each locus
 */
@Override
protected void testReadsInContext(
    LocusView view, List<GenomeLoc> range, List<GATKSAMRecord> reads) {
  AllLocusView allLocusView = (AllLocusView) view;

  // TODO: Should skip over loci not in the given range.
  GenomeLoc firstLoc = range.get(0);
  GenomeLoc lastLoc = range.get(range.size() - 1);
  GenomeLoc bounds =
      genomeLocParser.createGenomeLoc(
          firstLoc.getContig(), firstLoc.getStart(), lastLoc.getStop());

  for (int i = bounds.getStart(); i <= bounds.getStop(); i++) {
    GenomeLoc site = genomeLocParser.createGenomeLoc("chr1", i);
    AlignmentContext locusContext = allLocusView.next();
    Assert.assertEquals(locusContext.getLocation(), site, "Locus context location is incorrect");

    int expectedReadsAtSite = 0;
    for (GATKSAMRecord read : reads) {
      if (genomeLocParser.createGenomeLoc(read).containsP(locusContext.getLocation())) {
        Assert.assertTrue(
            locusContext.getReads().contains(read),
            "Target locus context does not contain reads");
        expectedReadsAtSite++;
      }
    }

    Assert.assertEquals(
        locusContext.getReads().size(),
        expectedReadsAtSite,
        "Found wrong number of reads at site");
  }
}
private GenomeLoc createIntervalAfter(GenomeLoc interval) {
  int contigLimit =
      getToolkit()
          .getSAMFileHeader()
          .getSequenceDictionary()
          .getSequence(interval.getContigIndex())
          .getSequenceLength();
  int start = Math.min(interval.getStop() + 1, contigLimit);
  int stop = Math.min(interval.getStop() + expandInterval, contigLimit);
  return parser.createGenomeLoc(interval.getContig(), interval.getContigIndex(), start, stop);
}
/**
 * Return a deep copy of this collection.
 *
 * @return a new GenomeLocSortedSet, identical to the current GenomeLocSortedSet.
 */
public GenomeLocSortedSet clone() {
  GenomeLocSortedSet ret = new GenomeLocSortedSet(genomeLocParser);
  for (GenomeLoc loc : this.mArray) {
    // ensure a deep copy
    ret.mArray.add(
        genomeLocParser.createGenomeLoc(loc.getContig(), loc.getStart(), loc.getStop()));
  }
  return ret;
}
public GenomeLocSortedSet subtractRegions(GenomeLocSortedSet toRemoveSet) {
  LinkedList<GenomeLoc> good = new LinkedList<GenomeLoc>();
  Stack<GenomeLoc> toProcess = new Stack<GenomeLoc>();
  Stack<GenomeLoc> toExclude = new Stack<GenomeLoc>();

  // initialize the stacks
  toProcess.addAll(mArray);
  Collections.reverse(toProcess);
  toExclude.addAll(toRemoveSet.mArray);
  Collections.reverse(toExclude);

  int i = 0;
  while (!toProcess.empty()) { // while there's still stuff to process
    if (toExclude.empty()) {
      good.addAll(toProcess); // no more excludes, all the remaining loci are good
      break;
    }

    GenomeLoc p = toProcess.peek();
    GenomeLoc e = toExclude.peek();

    if (p.overlapsP(e)) {
      toProcess.pop();
      for (GenomeLoc newP : p.subtract(e)) toProcess.push(newP);
    } else if (p.compareContigs(e) < 0) {
      good.add(toProcess.pop()); // p is now good
    } else if (p.compareContigs(e) > 0) {
      toExclude.pop(); // e can't affect anything
    } else if (p.getStop() < e.getStart()) {
      good.add(toProcess.pop()); // p stops before e starts, p is good
    } else if (e.getStop() < p.getStart()) {
      toExclude.pop(); // p starts after e stops, e is done
    } else {
      throw new ReviewedStingException("BUG: unexpected condition: p=" + p + ", e=" + e);
    }

    if (i++ % 10000 == 0) logger.debug("subtractRegions operation: i = " + i);
  }

  return createSetFromList(genomeLocParser, good);
}
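// Illustrative sketch (not part of the original source): how subtractRegions sweeps the two
// sorted sets. The fixture "parser" and the coordinates are hypothetical; the API calls mirror
// the tests below.
//
//   input:   |------ 10..20 ------|        |--- 70..100 ---|
//   remove:          |---------- 15..75 ----------|
//   kept:    |- 10..14 -|                    |- 76..100 -|
//
// GenomeLocSortedSet kept = new GenomeLocSortedSet(parser);
// kept.add(parser.createGenomeLoc("chr1", 10, 20));
// kept.add(parser.createGenomeLoc("chr1", 70, 100));
// GenomeLocSortedSet remove =
//     new GenomeLocSortedSet(parser, parser.createGenomeLoc("chr1", 15, 75));
// kept = kept.subtractRegions(remove); // yields [10,14] and [76,100]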
@Test
public void deleteSuperRegion() {
  GenomeLoc e = genomeLocParser.createGenomeLoc(contigOneName, 10, 20);
  GenomeLoc g = genomeLocParser.createGenomeLoc(contigOneName, 70, 100);
  mSortedSet.add(g);
  mSortedSet.addRegion(e);
  assertTrue(mSortedSet.size() == 2);

  // now delete a region
  GenomeLoc d = genomeLocParser.createGenomeLoc(contigOneName, 15, 75);
  mSortedSet = mSortedSet.subtractRegions(new GenomeLocSortedSet(genomeLocParser, d));
  Iterator<GenomeLoc> iter = mSortedSet.iterator();
  GenomeLoc loc = iter.next();
  assertTrue(loc.getStart() == 10);
  assertTrue(loc.getStop() == 14);
  assertTrue(loc.getContigIndex() == 1);
  loc = iter.next();
  assertTrue(loc.getStart() == 76);
  assertTrue(loc.getStop() == 100);
  assertTrue(loc.getContigIndex() == 1);
}
/**
 * Determines the position of the read relative to the interval. Note: this function uses the
 * UNCLIPPED ENDS of the read for the comparison.
 *
 * @param read the read
 * @param interval the interval
 * @return the overlap type as described by the ReadAndIntervalOverlap enum (see above)
 */
public static ReadAndIntervalOverlap getReadAndIntervalOverlapType(
    GATKSAMRecord read, GenomeLoc interval) {

  int sStart = read.getSoftStart();
  int sStop = read.getSoftEnd();
  int uStart = read.getUnclippedStart();
  int uStop = read.getUnclippedEnd();

  if (!read.getReferenceName().equals(interval.getContig()))
    return ReadAndIntervalOverlap.NO_OVERLAP_CONTIG;

  else if (uStop < interval.getStart()) return ReadAndIntervalOverlap.NO_OVERLAP_LEFT;

  else if (uStart > interval.getStop()) return ReadAndIntervalOverlap.NO_OVERLAP_RIGHT;

  else if (sStop < interval.getStart())
    return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_LEFT;

  else if (sStart > interval.getStop())
    return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_RIGHT;

  else if ((sStart >= interval.getStart()) && (sStop <= interval.getStop()))
    return ReadAndIntervalOverlap.OVERLAP_CONTAINED;

  else if ((sStart < interval.getStart()) && (sStop > interval.getStop()))
    return ReadAndIntervalOverlap.OVERLAP_LEFT_AND_RIGHT;

  else if ((sStart < interval.getStart())) return ReadAndIntervalOverlap.OVERLAP_LEFT;

  else return ReadAndIntervalOverlap.OVERLAP_RIGHT;
}
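// Worked example (not in the original source; coordinates are hypothetical). For an interval
// [100,200] on the read's contig, the checks above resolve in order:
//   unclipped end   < 100                            -> NO_OVERLAP_LEFT
//   unclipped start > 200                            -> NO_OVERLAP_RIGHT
//   soft end   < 100 (but unclipped span overlaps)   -> NO_OVERLAP_HARDCLIPPED_LEFT
//   soft start > 200 (but unclipped span overlaps)   -> NO_OVERLAP_HARDCLIPPED_RIGHT
//   soft span [120,180]                              -> OVERLAP_CONTAINED
//   soft span [ 90,210]                              -> OVERLAP_LEFT_AND_RIGHT
//   soft span [ 90,150]                              -> OVERLAP_LEFT
//   soft span [150,210]                              -> OVERLAP_RIGHT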
@Test
public void deleteSomeByRegion() {
  GenomeLoc e = genomeLocParser.createGenomeLoc(contigOneName, 1, 100);
  mSortedSet.add(e);
  for (int x = 1; x < 50; x++) {
    GenomeLoc del = genomeLocParser.createGenomeLoc(contigOneName, x, x);
    mSortedSet = mSortedSet.subtractRegions(new GenomeLocSortedSet(genomeLocParser, del));
  }
  assertTrue(!mSortedSet.isEmpty());
  assertTrue(mSortedSet.size() == 1);
  GenomeLoc loc = mSortedSet.iterator().next();
  assertTrue(loc.getStop() == 100);
  assertTrue(loc.getStart() == 50);
}
@Test
public void fromSequenceDictionary() {
  mSortedSet =
      GenomeLocSortedSet.createSetFromSequenceDictionary(this.header.getSequenceDictionary());
  // we should have one loc per sequence in the dictionary
  assertTrue(mSortedSet.size() == GenomeLocSortedSetUnitTest.NUMBER_OF_CHROMOSOMES);
  int seqNumber = 0;
  for (GenomeLoc loc : mSortedSet) {
    assertTrue(loc.getStart() == 1);
    assertTrue(loc.getStop() == GenomeLocSortedSetUnitTest.CHROMOSOME_SIZE);
    assertTrue(loc.getContigIndex() == seqNumber);
    ++seqNumber;
  }
  assertTrue(seqNumber == GenomeLocSortedSetUnitTest.NUMBER_OF_CHROMOSOMES);
}
@Test
public void mergingOverlappingAbove() {
  GenomeLoc e = genomeLocParser.createGenomeLoc(contigOneName, 0, 50);
  GenomeLoc g = genomeLocParser.createGenomeLoc(contigOneName, 49, 100);
  assertTrue(mSortedSet.size() == 0);
  mSortedSet.add(g);
  assertTrue(mSortedSet.size() == 1);
  mSortedSet.addRegion(e);
  assertTrue(mSortedSet.size() == 1);
  Iterator<GenomeLoc> iter = mSortedSet.iterator();
  GenomeLoc loc = iter.next();
  assertEquals(loc.getStart(), 0);
  assertEquals(loc.getStop(), 100);
  assertEquals(loc.getContigIndex(), 1);
}
private boolean overlapsKnownCNV(VariantContext cnv) {
  if (knownCNVs != null) {
    final GenomeLoc loc =
        getWalker().getToolkit().getGenomeLocParser().createGenomeLoc(cnv, true);
    IntervalTree<GenomeLoc> intervalTree = knownCNVs.get(loc.getContig());

    final Iterator<IntervalTree.Node<GenomeLoc>> nodeIt =
        intervalTree.overlappers(loc.getStart(), loc.getStop());
    while (nodeIt.hasNext()) {
      final double overlapP = loc.reciprocialOverlapFraction(nodeIt.next().getValue());
      if (overlapP > MIN_CNV_OVERLAP) return true;
    }
  }

  return false;
}
public final Map<String, IntervalTree<GenomeLoc>> createIntervalTreeByContig(
    final IntervalBinding<Feature> intervals) {
  final Map<String, IntervalTree<GenomeLoc>> byContig =
      new HashMap<String, IntervalTree<GenomeLoc>>();

  final List<GenomeLoc> locs = intervals.getIntervals(getToolkit());

  // set up the map from contig -> interval tree
  for (final String contig : getContigNames())
    byContig.put(contig, new IntervalTree<GenomeLoc>());

  for (final GenomeLoc loc : locs) {
    byContig.get(loc.getContig()).put(loc.getStart(), loc.getStop(), loc);
  }

  return byContig;
}
/**
 * Returns a new GenomeLoc that represents the region between the endpoints of this and that.
 * Requires that this and that GenomeLoc are both mapped.
 */
@Requires({"that != null", "isUnmapped(this) == isUnmapped(that)"})
@Ensures("result != null")
public GenomeLoc endpointSpan(GenomeLoc that) throws ReviewedStingException {
  if (GenomeLoc.isUnmapped(this) || GenomeLoc.isUnmapped(that)) {
    throw new ReviewedStingException("Cannot get endpoint span for unmapped genome locs");
  }

  if (!this.getContig().equals(that.getContig())) {
    throw new ReviewedStingException(
        "Cannot get endpoint span for genome locs on different contigs");
  }

  return new GenomeLoc(
      getContig(),
      this.contigIndex,
      Math.min(getStart(), that.getStart()),
      Math.max(getStop(), that.getStop()));
}
/**
 * Returns a new GenomeLoc that represents the entire span of this and that. Requires that this
 * and that GenomeLoc are contiguous and both mapped.
 */
@Requires({"that != null", "isUnmapped(this) == isUnmapped(that)"})
@Ensures("result != null")
public GenomeLoc merge(GenomeLoc that) throws ReviewedStingException {
  if (GenomeLoc.isUnmapped(this) || GenomeLoc.isUnmapped(that)) {
    if (!GenomeLoc.isUnmapped(this) || !GenomeLoc.isUnmapped(that))
      throw new ReviewedStingException("Tried to merge a mapped and an unmapped genome loc");
    return UNMAPPED;
  }

  if (!(this.contiguousP(that))) {
    throw new ReviewedStingException("The two genome locs need to be contiguous");
  }

  return new GenomeLoc(
      getContig(),
      this.contigIndex,
      Math.min(getStart(), that.getStart()),
      Math.max(getStop(), that.getStop()));
}
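// Illustrative sketch (not in the original source; the fixture "parser" and coordinates are
// hypothetical). merge() requires contiguity, while endpointSpan() above only requires the
// same contig:
//
// GenomeLoc a = parser.createGenomeLoc("chr1", 10, 25);
// GenomeLoc b = parser.createGenomeLoc("chr1", 20, 30);
// GenomeLoc c = parser.createGenomeLoc("chr1", 50, 60);
// a.merge(b);        // -> [chr1 10-30]: overlapping locs are contiguous
// a.endpointSpan(c); // -> [chr1 10-60]: same contig, a gap is allowed
// a.merge(c);        // throws ReviewedStingException: a and c are not contiguous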
@Requires("that != null") @Ensures("result != null") public GenomeLoc intersect(GenomeLoc that) throws ReviewedStingException { if (GenomeLoc.isUnmapped(this) || GenomeLoc.isUnmapped(that)) { if (!GenomeLoc.isUnmapped(this) || !GenomeLoc.isUnmapped(that)) throw new ReviewedStingException("Tried to intersect a mapped and an unmapped genome loc"); return UNMAPPED; } if (!(this.overlapsP(that))) { throw new ReviewedStingException( "GenomeLoc::intersect(): The two genome loc's need to overlap"); } return new GenomeLoc( getContig(), this.contigIndex, Math.max(getStart(), that.getStart()), Math.min(getStop(), that.getStop())); }
private T callWalkerMapOnActiveRegions(
    final ActiveRegionWalker<M, T> walker, T sum, final int minStart, final String currentContig) {
  // Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can
  // unload those regions and process them
  // TODO can implement parallel traversal here
  while (workQueue.peek() != null) {
    final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc();
    if (extendedLoc.getStop() < minStart
        || (currentContig != null && !extendedLoc.getContig().equals(currentContig))) {
      final ActiveRegion activeRegion = workQueue.remove();
      sum = processActiveRegion(activeRegion, myReads, workQueue, sum, walker);
    } else {
      break;
    }
  }

  return sum;
}
public Iterable<Shard> createShardsOverIntervals(
    final SAMDataSource readsDataSource,
    final GenomeLocSortedSet intervals,
    final int maxShardSize) {
  List<Shard> shards = new ArrayList<Shard>();

  for (GenomeLoc interval : intervals) {
    while (interval.size() > maxShardSize) {
      shards.add(
          new LocusShard(
              intervals.getGenomeLocParser(),
              readsDataSource,
              Collections.singletonList(
                  intervals
                      .getGenomeLocParser()
                      .createGenomeLoc(
                          interval.getContig(),
                          interval.getStart(),
                          interval.getStart() + maxShardSize - 1)),
              null));
      interval =
          intervals
              .getGenomeLocParser()
              .createGenomeLoc(
                  interval.getContig(), interval.getStart() + maxShardSize, interval.getStop());
    }
    shards.add(
        new LocusShard(
            intervals.getGenomeLocParser(),
            readsDataSource,
            Collections.singletonList(interval),
            null));
  }

  return shards;
}
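// Worked example (not in the original source; coordinates are hypothetical). For an interval
// [chr1 1-250] with maxShardSize = 100, the loop above emits:
//   shard 1: [chr1   1-100]  (full-size shard, then the interval is trimmed)
//   shard 2: [chr1 101-200]  (full-size shard)
//   shard 3: [chr1 201-250]  (the remainder, emitted after the while loop)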
@Requires("that != null") public final boolean containsP(GenomeLoc that) { return onSameContig(that) && getStart() <= that.getStart() && getStop() >= that.getStop(); }
@DataProvider(name = "GetOverlapping") public Object[][] makeGetOverlappingTest() throws Exception { final GenomeLocParser genomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(b37KGReference))); List<Object[]> tests = new ArrayList<Object[]>(); final GenomeLoc prev1 = genomeLocParser.createGenomeLoc("19", 1, 10); final GenomeLoc prev2 = genomeLocParser.createGenomeLoc("19", 20, 50); final GenomeLoc post1 = genomeLocParser.createGenomeLoc("21", 1, 10); final GenomeLoc post2 = genomeLocParser.createGenomeLoc("21", 20, 50); final int chr20Length = genomeLocParser.getContigs().getSequence("20").getSequenceLength(); for (final int regionStart : Arrays.asList(1, 10, chr20Length - 10, chr20Length)) { for (final int regionSize : Arrays.asList(1, 10, 100)) { final GenomeLoc region = genomeLocParser.createGenomeLocOnContig("20", regionStart, regionStart + regionSize); final GenomeLoc spanning = genomeLocParser.createGenomeLocOnContig("20", regionStart - 10, region.getStop() + 10); final GenomeLoc before_into = genomeLocParser.createGenomeLocOnContig("20", regionStart - 10, regionStart + 1); final GenomeLoc middle = genomeLocParser.createGenomeLocOnContig("20", regionStart + 1, regionStart + 2); final GenomeLoc middle_past = genomeLocParser.createGenomeLocOnContig( "20", region.getStop() - 1, region.getStop() + 10); final List<GenomeLoc> potentials = new LinkedList<GenomeLoc>(); potentials.add(region); if (spanning != null) potentials.add(spanning); if (before_into != null) potentials.add(before_into); if (middle != null) potentials.add(middle); if (middle_past != null) potentials.add(middle_past); for (final int n : Arrays.asList(1, 2, 3)) { for (final List<GenomeLoc> regions : Utils.makePermutations(potentials, n, false)) { tests.add(new Object[] {new GenomeLocSortedSet(genomeLocParser, regions), region}); tests.add( new Object[] { new GenomeLocSortedSet(genomeLocParser, Utils.append(regions, prev1)), region }); tests.add( new Object[] { new GenomeLocSortedSet(genomeLocParser, Utils.append(regions, prev1, prev2)), region }); tests.add( new Object[] { new GenomeLocSortedSet(genomeLocParser, Utils.append(regions, post1)), region }); tests.add( new Object[] { new GenomeLocSortedSet(genomeLocParser, Utils.append(regions, post1, post2)), region }); tests.add( new Object[] { new GenomeLocSortedSet(genomeLocParser, Utils.append(regions, prev1, post1)), region }); tests.add( new Object[] { new GenomeLocSortedSet( genomeLocParser, Utils.append(regions, prev1, prev2, post1, post2)), region }); } } } } return tests.toArray(new Object[][] {}); }
@Override
public T traverse(
    final ActiveRegionWalker<M, T> walker, final LocusShardDataProvider dataProvider, T sum) {
  logger.debug(String.format("TraverseActiveRegion.traverse: Shard is %s", dataProvider));

  final LocusView locusView = getLocusView(walker, dataProvider);
  final GenomeLocSortedSet initialIntervals = engine.getIntervals();

  final LocusReferenceView referenceView = new LocusReferenceView(walker, dataProvider);
  final int activeRegionExtension =
      walker.getClass().getAnnotation(ActiveRegionExtension.class).extension();
  final int maxRegionSize =
      walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion();

  if (locusView.hasNext()) {
    // trivial optimization to avoid unnecessary processing when there's nothing here at all
    int minStart = Integer.MAX_VALUE;
    ActivityProfile profile =
        new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions());

    ReferenceOrderedView referenceOrderedDataView =
        getReferenceOrderedView(walker, dataProvider, locusView);

    // We keep processing while the next reference location is within the interval
    GenomeLoc prevLoc = null;
    while (locusView.hasNext()) {
      final AlignmentContext locus = locusView.next();
      GenomeLoc location = locus.getLocation();

      if (prevLoc != null) {
        // fill in the active / inactive labels from the stop of the previous location to the
        // start of this location
        // TODO refactor to separate function
        for (int iii = prevLoc.getStop() + 1; iii < location.getStart(); iii++) {
          final GenomeLoc fakeLoc =
              engine.getGenomeLocParser().createGenomeLoc(prevLoc.getContig(), iii, iii);
          if (initialIntervals == null || initialIntervals.overlaps(fakeLoc)) {
            profile.add(
                fakeLoc,
                new ActivityProfileResult(
                    walker.hasPresetActiveRegions()
                            && walker.presetActiveRegions.overlaps(fakeLoc)
                        ? 1.0
                        : 0.0));
          }
        }
      }

      dataProvider.getShard().getReadMetrics().incrementNumIterations();

      // create reference context. Note that if we have a pileup of "extended events", the
      // context will hold the (longest) stretch of deleted reference bases (if deletions are
      // present in the pileup).
      final ReferenceContext refContext = referenceView.getReferenceContext(location);

      // Iterate forward to get all reference ordered data covering this location
      final RefMetaDataTracker tracker =
          referenceOrderedDataView.getReferenceOrderedDataAtLocus(
              locus.getLocation(), refContext);

      // Call the walker's isActive function for this locus and add the result to the list to
      // be integrated later
      if (initialIntervals == null || initialIntervals.overlaps(location)) {
        profile.add(location, walkerActiveProb(walker, tracker, refContext, locus, location));
      }

      // Grab all the previously unseen reads from this pileup and add them to the massive read
      // list
      for (final PileupElement p : locus.getBasePileup()) {
        final GATKSAMRecord read = p.getRead();
        if (!myReads.contains(read)) {
          myReads.add(read);
        }

        // If this is the last pileup for this shard calculate the minimum alignment start so
        // that we know which active regions in the work queue are now safe to process
        minStart = Math.min(minStart, read.getAlignmentStart());
      }

      prevLoc = location;

      printProgress(locus.getLocation());
    }

    updateCumulativeMetrics(dataProvider.getShard());

    // Take the individual isActive calls and integrate them into contiguous active regions and
    // add these blocks of work to the work queue
    // band-pass filter the list of isActive probabilities and turn into active regions
    final ActivityProfile bandPassFiltered = profile.bandPassFilter();
    final List<ActiveRegion> activeRegions =
        bandPassFiltered.createActiveRegions(activeRegionExtension, maxRegionSize);

    // add active regions to queue of regions to process
    // first check if can merge active regions over shard boundaries
    if (!activeRegions.isEmpty()) {
      if (!workQueue.isEmpty()) {
        final ActiveRegion last = workQueue.getLast();
        final ActiveRegion first = activeRegions.get(0);
        if (last.isActive == first.isActive
            && last.getLocation().contiguousP(first.getLocation())
            && last.getLocation().size() + first.getLocation().size() <= maxRegionSize) {
          workQueue.removeLast();
          activeRegions.remove(first);
          workQueue.add(
              new ActiveRegion(
                  last.getLocation().union(first.getLocation()),
                  first.isActive,
                  this.engine.getGenomeLocParser(),
                  activeRegionExtension));
        }
      }
      workQueue.addAll(activeRegions);
    }

    logger.debug(
        "Integrated "
            + profile.size()
            + " isActive calls into "
            + activeRegions.size()
            + " regions.");

    // now go and process all of the active regions
    sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig());
  }

  return sum;
}
public Datum map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
  GenomeLoc cur = context.getLocation();

  if (verbose && showSkipped) {
    for (long i = context.getSkippedBases(); i >= 0; i--) {
      SAMSequenceDictionary dictionary =
          getToolkit().getReferenceDataSource().getReference().getSequenceDictionary();
      SAMSequenceRecord contig = dictionary.getSequence(cur.getContig());
      if (cur.getStop() < contig.getSequenceLength())
        cur = getToolkit().getGenomeLocParser().incPos(cur, 1);
      else
        cur =
            getToolkit()
                .getGenomeLocParser()
                .createGenomeLoc(
                    dictionary.getSequence(contig.getSequenceIndex() + 1).getSequenceName(),
                    1,
                    1);
      out.printf("%s: skipped%n", cur);
    }
  }

  long nRodsHere = 0;
  long nTotalBases = 0;

  if (ref == null) {
    // we're getting the last skipped update
    if (verbose)
      out.printf(
          "Last position was %s: skipping %d bases%n",
          context.getLocation(), context.getSkippedBases());
    nRodsHere = -1; // don't update this
    nTotalBases = context.getSkippedBases();
  } else {
    Collection<RODRecordList> rods = new LinkedList<RODRecordList>();
    for (RODRecordList rod : tracker.getBoundRodTracks()) {
      // System.out.printf("Considering rod %s%n", rod);
      if (rod.getLocation().getStart() == context.getLocation().getStart()
          && !rod.getName().equals("interval")) {
        // only consider the first element
        // System.out.printf("adding it%n");
        rods.add(rod);
      }
    }

    nRodsHere = rods.size();

    if (nRodsHere > 0) {
      if (verbose) {
        List<String> names = new ArrayList<String>();
        for (RODRecordList rod : rods) {
          names.add(rod.getName());
        }

        // System.out.printf("context is %s", context.getSkippedBases());
        out.printf(
            "At %s: found %d rod(s) [%s] after skipping %d bases%n",
            context.getLocation(), nRodsHere, Utils.join(",", names), context.getSkippedBases());
      }
    }

    nTotalBases = context.getSkippedBases() + 1;
  }

  return new Datum(nRodsHere, context.getSkippedBases(), nTotalBases);
}
/**
 * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority
 * order, if provided. If genotypeMergeOptions == UNIQUIFY, the priority order is ignored and
 * names are created by concatenating the VC name with the sample name.
 *
 * @param genomeLocParser loc parser
 * @param unsortedVCs collection of unsorted VCs
 * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs
 * @param filteredRecordMergeType merge type for filtered records
 * @param genotypeMergeOptions merge option for genotypes
 * @param annotateOrigin should we annotate the set it came from?
 * @param printMessages should we print messages?
 * @param setKey the key name of the set
 * @param filteredAreUncalled are filtered records uncalled?
 * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count?
 * @return new VariantContext representing the merge of unsortedVCs
 */
public static VariantContext simpleMerge(
    final GenomeLocParser genomeLocParser,
    final Collection<VariantContext> unsortedVCs,
    final List<String> priorityListOfVCs,
    final FilteredRecordMergeType filteredRecordMergeType,
    final GenotypeMergeType genotypeMergeOptions,
    final boolean annotateOrigin,
    final boolean printMessages,
    final String setKey,
    final boolean filteredAreUncalled,
    final boolean mergeInfoWithMaxAC) {
  if (unsortedVCs == null || unsortedVCs.size() == 0) return null;

  if (annotateOrigin && priorityListOfVCs == null)
    throw new IllegalArgumentException(
        "Cannot merge calls and annotate their origins without a complete priority list of VariantContexts");

  if (genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE)
    verifyUniqueSampleNames(unsortedVCs);

  List<VariantContext> prepaddedVCs =
      sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions);
  // Make sure all variant contexts are padded with reference base in case of indels if necessary
  List<VariantContext> VCs = new ArrayList<VariantContext>();

  for (VariantContext vc : prepaddedVCs) {
    // also a reasonable place to remove filtered calls, if needed
    if (!filteredAreUncalled || vc.isNotFiltered())
      VCs.add(createVariantContextWithPaddedAlleles(vc, false));
  }

  if (VCs.size() == 0) // everything is filtered out and we're filteredAreUncalled
    return null;

  // establish the baseline info from the first VC
  final VariantContext first = VCs.get(0);
  final String name = first.getSource();
  final Allele refAllele = determineReferenceAllele(VCs);

  final Set<Allele> alleles = new LinkedHashSet<Allele>();
  final Set<String> filters = new TreeSet<String>();
  final Map<String, Object> attributes = new TreeMap<String, Object>();
  final Set<String> inconsistentAttributes = new HashSet<String>();
  // contains the set of sources we found in our set of VCs that are variant
  final Set<String> variantSources = new HashSet<String>();
  final Set<String> rsIDs = new LinkedHashSet<String>(1); // most of the time there's one id

  GenomeLoc loc = getLocation(genomeLocParser, first);
  int depth = 0;
  int maxAC = -1;
  final Map<String, Object> attributesWithMaxAC = new TreeMap<String, Object>();
  double log10PError = 1;
  VariantContext vcWithMaxAC = null;
  GenotypesContext genotypes = GenotypesContext.create();

  // counting the number of filtered and variant VCs
  int nFiltered = 0;

  boolean remapped = false;

  // cycle through and add info from the other VCs, making sure the loc/reference matches
  for (VariantContext vc : VCs) {
    if (loc.getStart() != vc.getStart()) // || !first.getReference().equals(vc.getReference()) )
      throw new ReviewedStingException(
          "BUG: attempting to merge VariantContexts with different start sites: first="
              + first.toString()
              + " second="
              + vc.toString());

    if (getLocation(genomeLocParser, vc).size() > loc.size())
      loc = getLocation(genomeLocParser, vc); // get the longest location

    nFiltered += vc.isFiltered() ? 1 : 0;
    if (vc.isVariant()) variantSources.add(vc.getSource());

    AlleleMapper alleleMapping = resolveIncompatibleAlleles(refAllele, vc, alleles);
    remapped = remapped || alleleMapping.needsRemapping();

    alleles.addAll(alleleMapping.values());

    mergeGenotypes(
        genotypes, vc, alleleMapping, genotypeMergeOptions == GenotypeMergeType.UNIQUIFY);

    log10PError = Math.min(log10PError, vc.isVariant() ? vc.getLog10PError() : 1);

    filters.addAll(vc.getFilters());

    //
    // add attributes
    //
    // special case DP (add it up) and ID (just preserve it)
    //
    if (vc.hasAttribute(VCFConstants.DEPTH_KEY))
      depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0);
    if (vc.hasID()) rsIDs.add(vc.getID());
    if (mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) {
      String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, null);
      // let's see if the string contains a "," separator
      if (rawAlleleCounts.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)) {
        List<String> alleleCountArray =
            Arrays.asList(
                rawAlleleCounts
                    .substring(1, rawAlleleCounts.length() - 1)
                    .split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR));
        for (String alleleCount : alleleCountArray) {
          final int ac = Integer.valueOf(alleleCount.trim());
          if (ac > maxAC) {
            maxAC = ac;
            vcWithMaxAC = vc;
          }
        }
      } else {
        final int ac = Integer.valueOf(rawAlleleCounts);
        if (ac > maxAC) {
          maxAC = ac;
          vcWithMaxAC = vc;
        }
      }
    }

    for (Map.Entry<String, Object> p : vc.getAttributes().entrySet()) {
      String key = p.getKey();
      // if we don't like the key already, don't go anywhere
      if (!inconsistentAttributes.contains(key)) {
        boolean alreadyFound = attributes.containsKey(key);
        Object boundValue = attributes.get(key);
        boolean boundIsMissingValue =
            alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4);

        if (alreadyFound && !boundValue.equals(p.getValue()) && !boundIsMissingValue) {
          // we found the value but we're inconsistent, put it in the exclude list
          // System.out.printf("Inconsistent INFO values: %s => %s and %s%n", key, boundValue,
          // p.getValue());
          inconsistentAttributes.add(key);
          attributes.remove(key);
        } else if (!alreadyFound || boundIsMissingValue) { // no value
          // if ( vc != first ) System.out.printf("Adding key %s => %s%n", p.getKey(),
          // p.getValue());
          attributes.put(key, p.getValue());
        }
      }
    }
  }

  // if we have more alternate alleles in the merged VC than in one or more of the
  // original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well
  // as allele-dependent attributes like AC,AF
  for (VariantContext vc : VCs) {
    if (vc.alleles.size() == 1) continue;
    if (hasPLIncompatibleAlleles(alleles, vc.alleles)) {
      if (!genotypes.isEmpty())
        logger.warn(
            String.format(
                "Stripping PLs at %s due to incompatible alleles merged=%s vs. single=%s",
                genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles));
      genotypes = stripPLs(genotypes);
      // this will remove stale AC,AF attributes from vc
      calculateChromosomeCounts(vc, attributes, true);
      break;
    }
  }

  // take the VC with the maxAC and pull the attributes into a modifiable map
  if (mergeInfoWithMaxAC && vcWithMaxAC != null) {
    attributesWithMaxAC.putAll(vcWithMaxAC.getAttributes());
  }

  // if at least one record was unfiltered and we want a union, clear all of the filters
  if ((filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED
          && nFiltered != VCs.size())
      || filteredRecordMergeType == FilteredRecordMergeType.KEEP_UNCONDITIONAL)
    filters.clear();

  if (annotateOrigin) { // we care about where the call came from
    String setValue;
    if (nFiltered == 0 && variantSources.size() == priorityListOfVCs.size())
      // nothing was filtered out and every input was variant
      setValue = MERGE_INTERSECTION;
    else if (nFiltered == VCs.size()) // everything was filtered out
      setValue = MERGE_FILTER_IN_ALL;
    else if (variantSources.isEmpty()) // everyone was reference
      setValue = MERGE_REF_IN_ALL;
    else {
      LinkedHashSet<String> s = new LinkedHashSet<String>();
      for (VariantContext vc : VCs)
        if (vc.isVariant())
          s.add(vc.isFiltered() ? MERGE_FILTER_PREFIX + vc.getSource() : vc.getSource());
      setValue = Utils.join("-", s);
    }

    if (setKey != null) {
      attributes.put(setKey, setValue);
      if (mergeInfoWithMaxAC && vcWithMaxAC != null) {
        attributesWithMaxAC.put(setKey, vcWithMaxAC.getSource());
      }
    }
  }

  if (depth > 0) attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth));

  final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs);

  final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID);
  builder.loc(loc.getContig(), loc.getStart(), loc.getStop());
  builder.alleles(alleles);
  builder.genotypes(genotypes);
  builder.log10PError(log10PError);
  builder.filters(filters).attributes(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes);

  // Trim the padded bases of all alleles if necessary
  VariantContext merged = createVariantContextWithTrimmedAlleles(builder.make());

  if (printMessages && remapped) System.out.printf("Remapped => %s%n", merged);

  return merged;
}
@Requires("that != null") public boolean endsAt(GenomeLoc that) { return (this.compareContigs(that) == 0) && (this.getStop() == that.getStop()); }
@Requires({"locFirst != null", "locSecond != null", "locSecond.isPast(locFirst)"}) @Ensures("result >= 0") private static int distanceFirstStopToSecondStart(GenomeLoc locFirst, GenomeLoc locSecond) { return locSecond.getStart() - locFirst.getStop(); }
/**
 * Tests whether this genome loc is completely past (after) 'that'.
 *
 * @param that the location to test against.
 * @return true if this location starts after 'that' ends; false if this is completely before
 *     or overlaps 'that'.
 */
@Requires("that != null")
public final boolean isPast(GenomeLoc that) {
  int comparison = this.compareContigs(that);
  return (comparison == 1 || (comparison == 0 && this.getStart() > that.getStop()));
}
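// Worked example (not in the original source; coordinates are hypothetical). For
// a = [chr1 50-60]:
//   a.isPast([chr1 10-49]) == true   (same contig, a starts after the other stops)
//   a.isPast([chr1 10-50]) == false  (they share position 50)
//   a.isPast([chr2  1-10]) == false  (chr1 sorts before chr2, so compareContigs != 1)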
/**
 * Main entry function to calculate genotypes of a given VC with corresponding GLs.
 *
 * @param tracker Tracker
 * @param refContext Reference context
 * @param rawContext Raw context
 * @param stratifiedContexts Stratified alignment contexts
 * @param vc Input VC
 * @param model GL calculation model
 * @param inheritAttributesFromInputVC Output VC will contain attributes inherited from input vc
 * @param perReadAlleleLikelihoodMap per-sample map of per-read allele likelihoods
 * @return VC with assigned genotypes
 */
public VariantCallContext calculateGenotypes(
    final RefMetaDataTracker tracker,
    final ReferenceContext refContext,
    final AlignmentContext rawContext,
    Map<String, AlignmentContext> stratifiedContexts,
    final VariantContext vc,
    final GenotypeLikelihoodsCalculationModel.Model model,
    final boolean inheritAttributesFromInputVC,
    final Map<String, org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap>
        perReadAlleleLikelihoodMap) {

  boolean limitedContext =
      tracker == null || refContext == null || rawContext == null || stratifiedContexts == null;

  // initialize the data for this thread if that hasn't been done yet
  if (afcm.get() == null) {
    afcm.set(AFCalcFactory.createAFCalc(UAC, N, logger));
  }

  // estimate our confidence in a reference call and return
  if (vc.getNSamples() == 0) {
    if (limitedContext) return null;
    return (UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES
        ? estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), false, 1.0)
        : generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext));
  }

  AFCalcResult AFresult = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model));

  // is the most likely frequency conformation AC=0 for all alternate alleles?
  boolean bestGuessIsRef = true;

  // determine which alternate alleles have AF>0
  final List<Allele> myAlleles = new ArrayList<Allele>(vc.getAlleles().size());
  final List<Integer> alleleCountsofMLE = new ArrayList<Integer>(vc.getAlleles().size());
  myAlleles.add(vc.getReference());
  for (int i = 0; i < AFresult.getAllelesUsedInGenotyping().size(); i++) {
    final Allele alternateAllele = AFresult.getAllelesUsedInGenotyping().get(i);
    if (alternateAllele.isReference()) continue;

    // we are non-ref if the probability of being non-ref > the emit confidence.
    // the emit confidence is phred-scaled, say 30 => 10^-3.
    // the posterior AF > 0 is log10: -5 => 10^-5
    // we are non-ref if 10^-5 < 10^-3 => -5 < -3
    final boolean isNonRef =
        AFresult.isPolymorphic(alternateAllele, UAC.STANDARD_CONFIDENCE_FOR_EMITTING / -10.0);

    // if the most likely AC is not 0, then this is a good alternate allele to use
    if (isNonRef) {
      myAlleles.add(alternateAllele);
      alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele));
      bestGuessIsRef = false;
    }
    // if in GENOTYPE_GIVEN_ALLELES mode, we still want to allow the use of a poor allele
    else if (UAC.GenotypingMode
        == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) {
      myAlleles.add(alternateAllele);
      alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele));
    }
  }

  final double PoFGT0 = Math.pow(10, AFresult.getLog10PosteriorOfAFGT0());

  // note the Math.abs is necessary because -10 * 0.0 => -0.0 which isn't nice
  final double phredScaledConfidence =
      Math.abs(
          !bestGuessIsRef
                  || UAC.GenotypingMode
                      == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE
                          .GENOTYPE_GIVEN_ALLELES
              ? -10 * AFresult.getLog10PosteriorOfAFEq0()
              : -10 * AFresult.getLog10PosteriorOfAFGT0());

  // return a null call if we don't pass the confidence cutoff or the most likely allele
  // frequency is zero
  if (UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES
      && !passesEmitThreshold(phredScaledConfidence, bestGuessIsRef)) {
    // technically, at this point our confidence in a reference call isn't accurately estimated
    // because it didn't take into account samples with no data, so let's get a better estimate
    return limitedContext
        ? null
        : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, PoFGT0);
  }

  // start constructing the resulting VC
  final GenomeLoc loc = genomeLocParser.createGenomeLoc(vc);
  final VariantContextBuilder builder =
      new VariantContextBuilder(
          "UG_call", loc.getContig(), loc.getStart(), loc.getStop(), myAlleles);
  builder.log10PError(phredScaledConfidence / -10.0);
  if (!passesCallThreshold(phredScaledConfidence)) builder.filters(filter);

  // create the genotypes
  final GenotypesContext genotypes = afcm.get().subsetAlleles(vc, myAlleles, true, ploidy);
  builder.genotypes(genotypes);

  // print out stats if we have a writer
  if (verboseWriter != null && !limitedContext)
    printVerboseData(refContext.getLocus().toString(), vc, PoFGT0, phredScaledConfidence, model);

  // *** note that calculating strand bias involves overwriting data structures, so we do that
  // last
  final HashMap<String, Object> attributes = new HashMap<String, Object>();

  // inherit attributes from input vc if requested
  if (inheritAttributesFromInputVC) attributes.putAll(vc.getAttributes());

  // if the site was downsampled, record that fact
  if (!limitedContext && rawContext.hasPileupBeenDownsampled())
    attributes.put(VCFConstants.DOWNSAMPLED_KEY, true);

  if (UAC.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED)
    attributes.put(NUMBER_OF_DISCOVERED_ALLELES_KEY, vc.getAlternateAlleles().size());

  // add the MLE AC and AF annotations
  if (alleleCountsofMLE.size() > 0) {
    attributes.put(VCFConstants.MLE_ALLELE_COUNT_KEY, alleleCountsofMLE);
    final int AN = builder.make().getCalledChrCount();
    final ArrayList<Double> MLEfrequencies = new ArrayList<Double>(alleleCountsofMLE.size());
    // the MLEAC is allowed to be larger than the AN (e.g. in the case of all PLs being 0, the
    // GT is ./. but the exact model may arbitrarily choose an AC>1)
    for (int AC : alleleCountsofMLE)
      MLEfrequencies.add(Math.min(1.0, (double) AC / (double) AN));
    attributes.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, MLEfrequencies);
  }

  if (UAC.COMPUTE_SLOD && !limitedContext && !bestGuessIsRef) {
    // final boolean DEBUG_SLOD = false;

    // the overall lod
    // double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0];
    double overallLog10PofF = AFresult.getLog10LikelihoodOfAFGT0();
    // if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF);

    List<Allele> allAllelesToUse = builder.make().getAlleles();

    // the forward lod
    VariantContext vcForward =
        calculateLikelihoods(
            tracker,
            refContext,
            stratifiedContexts,
            AlignmentContextUtils.ReadOrientation.FORWARD,
            allAllelesToUse,
            false,
            model,
            perReadAlleleLikelihoodMap);
    AFresult = afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model));
    // double[] normalizedLog10Posteriors =
    // MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
    double forwardLog10PofNull = AFresult.getLog10LikelihoodOfAFEq0();
    double forwardLog10PofF = AFresult.getLog10LikelihoodOfAFGT0();
    // if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ",
    // forwardLog10PofF=" + forwardLog10PofF);

    // the reverse lod
    VariantContext vcReverse =
        calculateLikelihoods(
            tracker,
            refContext,
            stratifiedContexts,
            AlignmentContextUtils.ReadOrientation.REVERSE,
            allAllelesToUse,
            false,
            model,
            perReadAlleleLikelihoodMap);
    AFresult = afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model));
    // normalizedLog10Posteriors =
    // MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
    double reverseLog10PofNull = AFresult.getLog10LikelihoodOfAFEq0();
    double reverseLog10PofF = AFresult.getLog10LikelihoodOfAFGT0();
    // if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ",
    // reverseLog10PofF=" + reverseLog10PofF);

    double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF;
    double reverseLod = reverseLog10PofF + forwardLog10PofNull - overallLog10PofF;
    // if ( DEBUG_SLOD ) System.out.println("forward lod=" + forwardLod + ", reverse lod=" +
    // reverseLod);

    // strand score is max bias between forward and reverse strands
    double strandScore = Math.max(forwardLod, reverseLod);
    // rescale by a factor of 10
    strandScore *= 10.0;
    // logger.debug(String.format("SLOD=%f", strandScore));

    if (!Double.isNaN(strandScore)) attributes.put("SB", strandScore);
  }

  // finish constructing the resulting VC
  builder.attributes(attributes);
  VariantContext vcCall = builder.make();

  // if we are subsetting alleles (either because there were too many or because some were not
  // polymorphic) then we may need to trim the alleles (because the original VariantContext may
  // have had to pad at the end)
  if (myAlleles.size() != vc.getAlleles().size() && !limitedContext)
    // limitedContext callers need to handle allele trimming on their own to keep their
    // perReadAlleleLikelihoodMap alleles in sync
    vcCall = VariantContextUtils.reverseTrimAlleles(vcCall);

  if (annotationEngine != null && !limitedContext) {
    // limitedContext callers need to handle annotations on their own by calling their own
    // annotationEngine
    // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations
    final ReadBackedPileup pileup = rawContext.getBasePileup();
    stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup);

    vcCall =
        annotationEngine.annotateContext(
            tracker, refContext, stratifiedContexts, vcCall, perReadAlleleLikelihoodMap);
  }

  return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PoFGT0));
}
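// Worked example (not in the original source; numbers are hypothetical). The QUAL emitted by
// the method above is the phred-scaled posterior of the rejected hypothesis:
//   posterior log10 P(AF == 0) = -5.0  ->  phredScaledConfidence = -10 * -5.0 = 50.0
//   with STANDARD_CONFIDENCE_FOR_EMITTING = 30, the emit threshold in log10 space is
//   30 / -10.0 = -3.0, so -5.0 < -3.0 and the site is emitted as variant.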
public Allele getLikelihoods(
    RefMetaDataTracker tracker,
    ReferenceContext ref,
    Map<String, AlignmentContext> contexts,
    AlignmentContextUtils.ReadOrientation contextType,
    GenotypePriors priors,
    Map<String, MultiallelicGenotypeLikelihoods> GLs,
    Allele alternateAlleleToUse,
    boolean useBAQedPileup) {

  if (tracker == null) return null;

  GenomeLoc loc = ref.getLocus();
  Allele refAllele, altAllele;
  VariantContext vc = null;

  if (!ref.getLocus().equals(lastSiteVisited)) {
    // starting a new site: clear allele list
    alleleList.clear();
    lastSiteVisited = ref.getLocus();
    indelLikelihoodMap.set(new HashMap<PileupElement, LinkedHashMap<Allele, Double>>());
    haplotypeMap.clear();

    if (getAlleleListFromVCF) {
      for (final VariantContext vc_input : tracker.getValues(UAC.alleles, loc)) {
        if (vc_input != null
            && allowableTypes.contains(vc_input.getType())
            && ref.getLocus().getStart() == vc_input.getStart()) {
          vc = vc_input;
          break;
        }
      }
      // ignore places where we don't have a variant
      if (vc == null) return null;

      alleleList.clear();
      if (ignoreSNPAllelesWhenGenotypingIndels) {
        // if there's an allele that has the same length as the reference (i.e. a SNP or MNP),
        // ignore it and don't genotype it
        for (Allele a : vc.getAlleles())
          if (a.isNonReference() && a.getBases().length == vc.getReference().getBases().length)
            continue;
          else alleleList.add(a);
      } else {
        for (Allele a : vc.getAlleles()) alleleList.add(a);
      }
    } else {
      alleleList = computeConsensusAlleles(ref, contexts, contextType);
      if (alleleList.isEmpty()) return null;
    }
  }

  // protect against having an indel too close to the edge of a contig
  if (loc.getStart() <= HAPLOTYPE_SIZE) return null;

  // check if there is enough reference window to create haplotypes (can be an issue at end of
  // contigs)
  if (ref.getWindow().getStop() < loc.getStop() + HAPLOTYPE_SIZE) return null;

  if (!(priors instanceof DiploidIndelGenotypePriors))
    throw new StingException("Only diploid-based Indel priors are supported in the DINDEL GL model");

  if (alleleList.isEmpty()) return null;

  refAllele = alleleList.get(0);
  altAllele = alleleList.get(1);

  // look for the alt allele that has the biggest length difference to the ref allele
  int maxLenDiff = 0;
  for (Allele a : alleleList) {
    if (a.isNonReference()) {
      int lenDiff = Math.abs(a.getBaseString().length() - refAllele.getBaseString().length());
      if (lenDiff > maxLenDiff) {
        maxLenDiff = lenDiff;
        altAllele = a;
      }
    }
  }

  final int eventLength = altAllele.getBaseString().length() - refAllele.getBaseString().length();
  final int hsize = (int) ref.getWindow().size() - Math.abs(eventLength) - 1;
  final int numPrefBases = ref.getLocus().getStart() - ref.getWindow().getStart() + 1;

  haplotypeMap =
      Haplotype.makeHaplotypeListFromAlleles(alleleList, loc.getStart(), ref, hsize, numPrefBases);

  // For each sample, get genotype likelihoods based on the pileup.
  // Compute prior likelihoods on haplotypes, and initialize the haplotype likelihood matrix
  // with them.

  // initialize the GenotypeLikelihoods
  GLs.clear();

  for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) {
    AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);

    ReadBackedPileup pileup = null;
    if (context.hasExtendedEventPileup()) pileup = context.getExtendedEventPileup();
    else if (context.hasBasePileup()) pileup = context.getBasePileup();

    if (pileup != null) {
      final double[] genotypeLikelihoods =
          pairModel.computeReadHaplotypeLikelihoods(
              pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap());

      GLs.put(
          sample.getKey(),
          new MultiallelicGenotypeLikelihoods(
              sample.getKey(), alleleList, genotypeLikelihoods, getFilteredDepth(pileup)));

      if (DEBUG) {
        System.out.format("Sample:%s Alleles:%s GL:", sample.getKey(), alleleList.toString());
        for (int k = 0; k < genotypeLikelihoods.length; k++)
          System.out.format("%1.4f ", genotypeLikelihoods[k]);
        System.out.println();
      }
    }
  }

  return refAllele;
}
@Requires("that != null") public final List<GenomeLoc> subtract(final GenomeLoc that) { if (GenomeLoc.isUnmapped(this) || GenomeLoc.isUnmapped(that)) { if (!GenomeLoc.isUnmapped(this) || !GenomeLoc.isUnmapped(that)) throw new ReviewedStingException("Tried to intersect a mapped and an unmapped genome loc"); return Arrays.asList(UNMAPPED); } if (!(this.overlapsP(that))) { throw new ReviewedStingException("GenomeLoc::minus(): The two genome loc's need to overlap"); } if (equals(that)) { return Collections.emptyList(); } else if (containsP(that)) { List<GenomeLoc> l = new ArrayList<GenomeLoc>(2); /** * we have to create two new region, one for the before part, one for the after The old * region: |----------------- old region (g) -------------| |----- to delete (e) ------| * * <p>product (two new regions): |------| + |--------| */ int afterStop = this.getStop(), afterStart = that.getStop() + 1; int beforeStop = that.getStart() - 1, beforeStart = this.getStart(); if (afterStop - afterStart >= 0) { GenomeLoc after = new GenomeLoc(this.getContig(), getContigIndex(), afterStart, afterStop); l.add(after); } if (beforeStop - beforeStart >= 0) { GenomeLoc before = new GenomeLoc(this.getContig(), getContigIndex(), beforeStart, beforeStop); l.add(before); } return l; } else if (that.containsP(this)) { /** * e completely contains g, delete g, but keep looking, there may be more regions i.e.: * |--------------------- e --------------------| |--- g ---| |---- others ----| */ return Collections.emptyList(); // don't need to do anything } else { /** * otherwise e overlaps some part of g * * <p>figure out which region occurs first on the genome. I.e., is it: |------------- g * ----------| |------------- e ----------| * * <p>or: |------------- g ----------| |------------ e -----------| */ GenomeLoc n; if (that.getStart() < this.getStart()) { n = new GenomeLoc(this.getContig(), getContigIndex(), that.getStop() + 1, this.getStop()); } else { n = new GenomeLoc(this.getContig(), getContigIndex(), this.getStart(), that.getStart() - 1); } // replace g with the new region return Arrays.asList(n); } }