/** * Test the reads according to an independently derived context. * * @param view * @param range * @param reads */ @Override protected void testReadsInContext( LocusView view, List<GenomeLoc> range, List<GATKSAMRecord> reads) { AllLocusView allLocusView = (AllLocusView) view; // TODO: Should skip over loci not in the given range. GenomeLoc firstLoc = range.get(0); GenomeLoc lastLoc = range.get(range.size() - 1); GenomeLoc bounds = genomeLocParser.createGenomeLoc( firstLoc.getContig(), firstLoc.getStart(), lastLoc.getStop()); for (int i = bounds.getStart(); i <= bounds.getStop(); i++) { GenomeLoc site = genomeLocParser.createGenomeLoc("chr1", i); AlignmentContext locusContext = allLocusView.next(); Assert.assertEquals(locusContext.getLocation(), site, "Locus context location is incorrect"); int expectedReadsAtSite = 0; for (GATKSAMRecord read : reads) { if (genomeLocParser.createGenomeLoc(read).containsP(locusContext.getLocation())) { Assert.assertTrue( locusContext.getReads().contains(read), "Target locus context does not contain reads"); expectedReadsAtSite++; } } Assert.assertEquals( locusContext.getReads().size(), expectedReadsAtSite, "Found wrong number of reads at site"); } }
/**
 * Adds a GenomeLoc to the collection, merging it with an existing region if the two are
 * contiguous. If it doesn't touch any existing region, it is inserted in sorted order.
 *
 * @param e the GenomeLoc to add to the collection
 * @return true if the GenomeLoc could be added to the collection
 */
public boolean addRegion(GenomeLoc e) {
  if (e == null) {
    return false;
  }
  // have we added it to the collection?
  boolean haveAdded = false;

  // check if the specified element is contiguous with any current location; if so, merge the two
  for (GenomeLoc g : mArray) {
    if (g.contiguousP(e)) {
      GenomeLoc c = g.merge(e);
      mArray.set(mArray.indexOf(g), c);
      haveAdded = true;
    } else if ((g.getContigIndex() == e.getContigIndex())
        && (e.getStart() < g.getStart())
        && !haveAdded) {
      mArray.add(mArray.indexOf(g), e);
      return true;
    } else if (haveAdded
        && ((g.getContigIndex() > e.getContigIndex())
            || (g.getContigIndex() == e.getContigIndex() && e.getStart() > g.getStart()))) {
      // e has already been merged in and we've moved past it, so we're done
      return true;
    }
  }

  // we're at the end and we haven't found a location that should fall after it,
  // so put it at the end
  if (!haveAdded) {
    mArray.add(e);
  }
  return true;
}
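// Usage sketch (illustrative, not part of the original source): how addRegion merges
// contiguous regions versus inserting disjoint ones. Assumes the genomeLocParser and
// contigOneName fixtures used by the unit tests elsewhere in this collection.
GenomeLocSortedSet set = new GenomeLocSortedSet(genomeLocParser);
set.addRegion(genomeLocParser.createGenomeLoc(contigOneName, 1, 50));
set.addRegion(genomeLocParser.createGenomeLoc(contigOneName, 45, 100)); // contiguous with 1-50, merged into 1-100
set.addRegion(genomeLocParser.createGenomeLoc(contigOneName, 200, 300)); // disjoint, appended in sorted order
// the set now holds two regions: 1-100 and 200-300 on contigOne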
/**
 * Return the number of base pairs before loc in the sorted set.
 *
 * @param loc the location before which we are counting bases
 * @return the number of base pairs in the set that fall before loc
 */
public long sizeBeforeLoc(GenomeLoc loc) {
  long s = 0;
  for (GenomeLoc e : this) {
    if (e.isBefore(loc)) {
      s += e.size(); // e lies entirely before loc
    } else if (!e.isPast(loc)) {
      s += loc.getStart() - e.getStart(); // loc is inside of e, count only the leading bases
    }
    // regions past loc contribute nothing
  }
  return s;
}
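// Worked example (illustrative, assuming the genomeLocParser/contigOneName test fixtures):
// with regions 1-10 and 20-30 in the set, sizeBeforeLoc at position 25 counts the full 10
// bases of the first region plus the 5 bases (20..24) of the second, i.e. 15.
GenomeLocSortedSet counted = new GenomeLocSortedSet(genomeLocParser);
counted.add(genomeLocParser.createGenomeLoc(contigOneName, 1, 10));
counted.add(genomeLocParser.createGenomeLoc(contigOneName, 20, 30));
long basesBefore = counted.sizeBeforeLoc(genomeLocParser.createGenomeLoc(contigOneName, 25)); // == 15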
/** * return a deep copy of this collection. * * @return a new GenomeLocSortedSet, identical to the current GenomeLocSortedSet. */ public GenomeLocSortedSet clone() { GenomeLocSortedSet ret = new GenomeLocSortedSet(genomeLocParser); for (GenomeLoc loc : this.mArray) { // ensure a deep copy ret.mArray.add( genomeLocParser.createGenomeLoc(loc.getContig(), loc.getStart(), loc.getStop())); } return ret; }
public GenomeLocSortedSet subtractRegions(GenomeLocSortedSet toRemoveSet) {
  LinkedList<GenomeLoc> good = new LinkedList<GenomeLoc>();
  Stack<GenomeLoc> toProcess = new Stack<GenomeLoc>();
  Stack<GenomeLoc> toExclude = new Stack<GenomeLoc>();

  // initialize the stacks
  toProcess.addAll(mArray);
  Collections.reverse(toProcess);
  toExclude.addAll(toRemoveSet.mArray);
  Collections.reverse(toExclude);

  int i = 0;
  while (!toProcess.empty()) { // while there's still stuff to process
    if (toExclude.empty()) {
      good.addAll(toProcess); // no more excludes, all of the remaining loci are good
      break;
    }

    GenomeLoc p = toProcess.peek();
    GenomeLoc e = toExclude.peek();

    if (p.overlapsP(e)) {
      toProcess.pop();
      for (GenomeLoc newP : p.subtract(e)) toProcess.push(newP);
    } else if (p.compareContigs(e) < 0) {
      good.add(toProcess.pop()); // p is now good
    } else if (p.compareContigs(e) > 0) {
      toExclude.pop(); // e can't affect anything
    } else if (p.getStop() < e.getStart()) {
      good.add(toProcess.pop()); // p stops before e starts, p is good
    } else if (e.getStop() < p.getStart()) {
      toExclude.pop(); // p starts after e stops, e is done
    } else {
      throw new ReviewedStingException("BUG: unexpected condition: p=" + p + ", e=" + e);
    }

    if (i++ % 10000 == 0) logger.debug("subtractRegions operation: i = " + i);
  }

  return createSetFromList(genomeLocParser, good);
}
@Test public void deleteSuperRegion() { GenomeLoc e = genomeLocParser.createGenomeLoc(contigOneName, 10, 20); GenomeLoc g = genomeLocParser.createGenomeLoc(contigOneName, 70, 100); mSortedSet.add(g); mSortedSet.addRegion(e); assertTrue(mSortedSet.size() == 2); // now delete a region GenomeLoc d = genomeLocParser.createGenomeLoc(contigOneName, 15, 75); mSortedSet = mSortedSet.subtractRegions(new GenomeLocSortedSet(genomeLocParser, d)); Iterator<GenomeLoc> iter = mSortedSet.iterator(); GenomeLoc loc = iter.next(); assertTrue(loc.getStart() == 10); assertTrue(loc.getStop() == 14); assertTrue(loc.getContigIndex() == 1); loc = iter.next(); assertTrue(loc.getStart() == 76); assertTrue(loc.getStop() == 100); assertTrue(loc.getContigIndex() == 1); }
/**
 * Determines the position of the read relative to the interval. Note: this function uses the
 * UNCLIPPED ENDS of the read for the comparison.
 *
 * @param read the read
 * @param interval the interval
 * @return the overlap type as described by the ReadAndIntervalOverlap enum (see above)
 */
public static ReadAndIntervalOverlap getReadAndIntervalOverlapType(
    GATKSAMRecord read, GenomeLoc interval) {
  int sStart = read.getSoftStart();
  int sStop = read.getSoftEnd();
  int uStart = read.getUnclippedStart();
  int uStop = read.getUnclippedEnd();

  if (!read.getReferenceName().equals(interval.getContig()))
    return ReadAndIntervalOverlap.NO_OVERLAP_CONTIG;
  else if (uStop < interval.getStart()) return ReadAndIntervalOverlap.NO_OVERLAP_LEFT;
  else if (uStart > interval.getStop()) return ReadAndIntervalOverlap.NO_OVERLAP_RIGHT;
  else if (sStop < interval.getStart()) return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_LEFT;
  else if (sStart > interval.getStop()) return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_RIGHT;
  else if ((sStart >= interval.getStart()) && (sStop <= interval.getStop()))
    return ReadAndIntervalOverlap.OVERLAP_CONTAINED;
  else if ((sStart < interval.getStart()) && (sStop > interval.getStop()))
    return ReadAndIntervalOverlap.OVERLAP_LEFT_AND_RIGHT;
  else if ((sStart < interval.getStart())) return ReadAndIntervalOverlap.OVERLAP_LEFT;
  else return ReadAndIntervalOverlap.OVERLAP_RIGHT;
}
@Test public void deleteSomeByRegion() { GenomeLoc e = genomeLocParser.createGenomeLoc(contigOneName, 1, 100); mSortedSet.add(e); for (int x = 1; x < 50; x++) { GenomeLoc del = genomeLocParser.createGenomeLoc(contigOneName, x, x); mSortedSet = mSortedSet.subtractRegions(new GenomeLocSortedSet(genomeLocParser, del)); } assertTrue(!mSortedSet.isEmpty()); assertTrue(mSortedSet.size() == 1); GenomeLoc loc = mSortedSet.iterator().next(); assertTrue(loc.getStop() == 100); assertTrue(loc.getStart() == 50); }
public Iterable<Shard> createShardsOverIntervals( final SAMDataSource readsDataSource, final GenomeLocSortedSet intervals, final int maxShardSize) { List<Shard> shards = new ArrayList<Shard>(); for (GenomeLoc interval : intervals) { while (interval.size() > maxShardSize) { shards.add( new LocusShard( intervals.getGenomeLocParser(), readsDataSource, Collections.singletonList( intervals .getGenomeLocParser() .createGenomeLoc( interval.getContig(), interval.getStart(), interval.getStart() + maxShardSize - 1)), null)); interval = intervals .getGenomeLocParser() .createGenomeLoc( interval.getContig(), interval.getStart() + maxShardSize, interval.getStop()); } shards.add( new LocusShard( intervals.getGenomeLocParser(), readsDataSource, Collections.singletonList(interval), null)); } return shards; }
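// Illustrative expectation (assumption, not asserted anywhere in this source): with
// maxShardSize = 100, a single interval spanning 1-250 would be cut into three LocusShards
// covering 1-100, 101-200 and 201-250.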
@Test public void fromSequenceDictionary() { mSortedSet = GenomeLocSortedSet.createSetFromSequenceDictionary(this.header.getSequenceDictionary()); // we should have sequence assertTrue(mSortedSet.size() == GenomeLocSortedSetUnitTest.NUMBER_OF_CHROMOSOMES); int seqNumber = 0; for (GenomeLoc loc : mSortedSet) { assertTrue(loc.getStart() == 1); assertTrue(loc.getStop() == GenomeLocSortedSetUnitTest.CHROMOSOME_SIZE); assertTrue(loc.getContigIndex() == seqNumber); ++seqNumber; } assertTrue(seqNumber == GenomeLocSortedSetUnitTest.NUMBER_OF_CHROMOSOMES); }
@Test public void mergingOverlappingAbove() { GenomeLoc e = genomeLocParser.createGenomeLoc(contigOneName, 0, 50); GenomeLoc g = genomeLocParser.createGenomeLoc(contigOneName, 49, 100); assertTrue(mSortedSet.size() == 0); mSortedSet.add(g); assertTrue(mSortedSet.size() == 1); mSortedSet.addRegion(e); assertTrue(mSortedSet.size() == 1); Iterator<GenomeLoc> iter = mSortedSet.iterator(); GenomeLoc loc = iter.next(); assertEquals(loc.getStart(), 0); assertEquals(loc.getStop(), 100); assertEquals(loc.getContigIndex(), 1); }
private boolean overlapsKnownCNV(VariantContext cnv) { if (knownCNVs != null) { final GenomeLoc loc = getWalker().getToolkit().getGenomeLocParser().createGenomeLoc(cnv, true); IntervalTree<GenomeLoc> intervalTree = knownCNVs.get(loc.getContig()); final Iterator<IntervalTree.Node<GenomeLoc>> nodeIt = intervalTree.overlappers(loc.getStart(), loc.getStop()); while (nodeIt.hasNext()) { final double overlapP = loc.reciprocialOverlapFraction(nodeIt.next().getValue()); if (overlapP > MIN_CNV_OVERLAP) return true; } } return false; }
/**
 * Utility routine that prints out process information (including timing) every N records or
 * every M seconds, for N and M set in global variables.
 *
 * <p>Synchronized to ensure that even with multiple threads calling notifyOfProgress we still
 * get one clean stream of meter logs.
 *
 * <p>Note that this method doesn't actually print progress (unless a must-print has been
 * requested); it just registers the progress itself. A separate printing daemon periodically
 * polls the meter to print out progress.
 *
 * @param loc current location, can be null if you are at the end of the processing unit
 * @param nTotalRecordsProcessed the total number of records we've processed
 */
public synchronized void notifyOfProgress(
    final GenomeLoc loc, final long nTotalRecordsProcessed) {
  if (nTotalRecordsProcessed < 0)
    throw new IllegalArgumentException("nTotalRecordsProcessed must be >= 0");

  // weird comparison to ensure that loc == null (in unmapped reads) is kept before
  // maxGenomeLoc == null (on startup)
  this.maxGenomeLoc = loc == null ? loc : (maxGenomeLoc == null ? loc : loc.max(maxGenomeLoc));
  this.nTotalRecordsProcessed = Math.max(this.nTotalRecordsProcessed, nTotalRecordsProcessed);

  // a pretty name for our position
  this.positionMessage =
      maxGenomeLoc == null
          ? "unmapped reads"
          : String.format("%s:%d", maxGenomeLoc.getContig(), maxGenomeLoc.getStart());
}
public final Map<String, IntervalTree<GenomeLoc>> createIntervalTreeByContig( final IntervalBinding<Feature> intervals) { final Map<String, IntervalTree<GenomeLoc>> byContig = new HashMap<String, IntervalTree<GenomeLoc>>(); final List<GenomeLoc> locs = intervals.getIntervals(getToolkit()); // set up the map from contig -> interval tree for (final String contig : getContigNames()) byContig.put(contig, new IntervalTree<GenomeLoc>()); for (final GenomeLoc loc : locs) { byContig.get(loc.getContig()).put(loc.getStart(), loc.getStop(), loc); } return byContig; }
/**
 * Returns a new GenomeLoc that represents the region between the endpoints of this and that.
 * Requires that this and that GenomeLoc are both mapped.
 */
@Requires({"that != null", "isUnmapped(this) == isUnmapped(that)"})
@Ensures("result != null")
public GenomeLoc endpointSpan(GenomeLoc that) throws ReviewedStingException {
  if (GenomeLoc.isUnmapped(this) || GenomeLoc.isUnmapped(that)) {
    throw new ReviewedStingException("Cannot get endpoint span for unmapped genome locs");
  }

  if (!this.getContig().equals(that.getContig())) {
    throw new ReviewedStingException(
        "Cannot get endpoint span for genome locs on different contigs");
  }

  return new GenomeLoc(
      getContig(),
      this.contigIndex,
      Math.min(getStart(), that.getStart()),
      Math.max(getStop(), that.getStop()));
}
/**
 * Returns a new GenomeLoc that represents the entire span of this and that. Requires that this
 * and that GenomeLoc are contiguous and both mapped.
 */
@Requires({"that != null", "isUnmapped(this) == isUnmapped(that)"})
@Ensures("result != null")
public GenomeLoc merge(GenomeLoc that) throws ReviewedStingException {
  if (GenomeLoc.isUnmapped(this) || GenomeLoc.isUnmapped(that)) {
    if (!GenomeLoc.isUnmapped(this) || !GenomeLoc.isUnmapped(that))
      throw new ReviewedStingException("Tried to merge a mapped and an unmapped genome loc");
    return UNMAPPED;
  }

  if (!(this.contiguousP(that))) {
    throw new ReviewedStingException("The two genome locs need to be contiguous");
  }

  return new GenomeLoc(
      getContig(),
      this.contigIndex,
      Math.min(getStart(), that.getStart()),
      Math.max(getStop(), that.getStop()));
}
@Requires("that != null") @Ensures("result != null") public GenomeLoc intersect(GenomeLoc that) throws ReviewedStingException { if (GenomeLoc.isUnmapped(this) || GenomeLoc.isUnmapped(that)) { if (!GenomeLoc.isUnmapped(this) || !GenomeLoc.isUnmapped(that)) throw new ReviewedStingException("Tried to intersect a mapped and an unmapped genome loc"); return UNMAPPED; } if (!(this.overlapsP(that))) { throw new ReviewedStingException( "GenomeLoc::intersect(): The two genome loc's need to overlap"); } return new GenomeLoc( getContig(), this.contigIndex, Math.max(getStart(), that.getStart()), Math.min(getStop(), that.getStop())); }
@Requires("that != null") @Ensures("result == 0 || result == 1 || result == -1") public int compareTo(GenomeLoc that) { int result = 0; if (this == that) { result = 0; } else if (GenomeLoc.isUnmapped(this)) result = 1; else if (GenomeLoc.isUnmapped(that)) result = -1; else { final int cmpContig = compareContigs(that); if (cmpContig != 0) { result = cmpContig; } else { if (this.getStart() < that.getStart()) result = -1; if (this.getStart() > that.getStart()) result = 1; } } return result; }
private ArrayList<Allele> computeConsensusAlleles( ReferenceContext ref, Map<String, AlignmentContext> contexts, AlignmentContextUtils.ReadOrientation contextType) { Allele refAllele = null, altAllele = null; GenomeLoc loc = ref.getLocus(); ArrayList<Allele> aList = new ArrayList<Allele>(); HashMap<String, Integer> consensusIndelStrings = new HashMap<String, Integer>(); int insCount = 0, delCount = 0; // quick check of total number of indels in pileup for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) { AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup(); insCount += indelPileup.getNumberOfInsertions(); delCount += indelPileup.getNumberOfDeletions(); } if (insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping) return aList; for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) { // todo -- warning, can be duplicating expensive partition here AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup(); for (ExtendedEventPileupElement p : indelPileup.toExtendedIterable()) { // SAMRecord read = p.getRead(); GATKSAMRecord read = ReadUtils.hardClipAdaptorSequence(p.getRead()); if (read == null) continue; if (ReadUtils.is454Read(read)) { continue; } /* if (DEBUG && p.isIndel()) { System.out.format("Read: %s, cigar: %s, aln start: %d, aln end: %d, p.len:%d, Type:%s, EventBases:%s\n", read.getReadName(),read.getCigar().toString(),read.getAlignmentStart(),read.getAlignmentEnd(), p.getEventLength(),p.getType().toString(), p.getEventBases()); } */ String indelString = p.getEventBases(); if (p.isInsertion()) { boolean foundKey = false; if (read.getAlignmentEnd() == loc.getStart()) { // first corner condition: a read has an insertion at the end, and we're right at the // insertion. // In this case, the read could have any of the inserted bases and we need to build a // consensus for (String s : consensusIndelStrings.keySet()) { int cnt = consensusIndelStrings.get(s); if (s.startsWith(indelString)) { // case 1: current insertion is prefix of indel in hash map consensusIndelStrings.put(s, cnt + 1); foundKey = true; break; } else if (indelString.startsWith(s)) { // case 2: indel stored in hash table is prefix of current insertion // In this case, new bases are new key. consensusIndelStrings.remove(s); consensusIndelStrings.put(indelString, cnt + 1); foundKey = true; break; } } if (!foundKey) // none of the above: event bases not supported by previous table, so add new key consensusIndelStrings.put(indelString, 1); } else if (read.getAlignmentStart() == loc.getStart() + 1) { // opposite corner condition: read will start at current locus with an insertion for (String s : consensusIndelStrings.keySet()) { int cnt = consensusIndelStrings.get(s); if (s.endsWith(indelString)) { // case 1: current insertion is suffix of indel in hash map consensusIndelStrings.put(s, cnt + 1); foundKey = true; break; } else if (indelString.endsWith(s)) { // case 2: indel stored in hash table is suffix of current insertion // In this case, new bases are new key. 
consensusIndelStrings.remove(s); consensusIndelStrings.put(indelString, cnt + 1); foundKey = true; break; } } if (!foundKey) // none of the above: event bases not supported by previous table, so add new key consensusIndelStrings.put(indelString, 1); } else { // normal case: insertion somewhere in the middle of a read: add count to hash map int cnt = consensusIndelStrings.containsKey(indelString) ? consensusIndelStrings.get(indelString) : 0; consensusIndelStrings.put(indelString, cnt + 1); } } else if (p.isDeletion()) { indelString = String.format("D%d", p.getEventLength()); int cnt = consensusIndelStrings.containsKey(indelString) ? consensusIndelStrings.get(indelString) : 0; consensusIndelStrings.put(indelString, cnt + 1); } } /* if (DEBUG) { int icount = indelPileup.getNumberOfInsertions(); int dcount = indelPileup.getNumberOfDeletions(); if (icount + dcount > 0) { List<Pair<String,Integer>> eventStrings = indelPileup.getEventStringsWithCounts(ref.getBases()); System.out.format("#ins: %d, #del:%d\n", insCount, delCount); for (int i=0 ; i < eventStrings.size() ; i++ ) { System.out.format("%s:%d,",eventStrings.get(i).first,eventStrings.get(i).second); // int k=0; } System.out.println(); } } */ } int maxAlleleCnt = 0; String bestAltAllele = ""; for (String s : consensusIndelStrings.keySet()) { int curCnt = consensusIndelStrings.get(s); if (curCnt > maxAlleleCnt) { maxAlleleCnt = curCnt; bestAltAllele = s; } // if (DEBUG) // System.out.format("Key:%s, number: %d\n",s,consensusIndelStrings.get(s) ); } // gdebug- if (maxAlleleCnt < minIndelCountForGenotyping) return aList; if (bestAltAllele.startsWith("D")) { // get deletion length int dLen = Integer.valueOf(bestAltAllele.substring(1)); // get ref bases of accurate deletion int startIdxInReference = (int) (1 + loc.getStart() - ref.getWindow().getStart()); // System.out.println(new String(ref.getBases())); byte[] refBases = Arrays.copyOfRange(ref.getBases(), startIdxInReference, startIdxInReference + dLen); if (Allele.acceptableAlleleBases(refBases)) { refAllele = Allele.create(refBases, true); altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false); } } else { // insertion case if (Allele.acceptableAlleleBases(bestAltAllele)) { refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true); altAllele = Allele.create(bestAltAllele, false); } } if (refAllele != null && altAllele != null) { aList.add(0, refAllele); aList.add(1, altAllele); } return aList; }
/** * Read in a list of ExactCall objects from reader, keeping only those with starts in startsToKeep * or all sites (if this is empty) * * @param reader a just-opened reader sitting at the start of the file * @param startsToKeep a list of start position of the calls to keep, or empty if all calls should * be kept * @param parser a genome loc parser to create genome locs * @return a list of ExactCall objects in reader * @throws IOException */ public static List<ExactCall> readExactLog( final BufferedReader reader, final List<Integer> startsToKeep, GenomeLocParser parser) throws IOException { if (reader == null) throw new IllegalArgumentException("reader cannot be null"); if (startsToKeep == null) throw new IllegalArgumentException("startsToKeep cannot be null"); if (parser == null) throw new IllegalArgumentException("GenomeLocParser cannot be null"); List<ExactCall> calls = new LinkedList<ExactCall>(); // skip the header line reader.readLine(); // skip the first "type" line reader.readLine(); while (true) { final VariantContextBuilder builder = new VariantContextBuilder(); final List<Allele> alleles = new ArrayList<Allele>(); final List<Genotype> genotypes = new ArrayList<Genotype>(); final double[] posteriors = new double[2]; final double[] priors = MathUtils.normalizeFromLog10(new double[] {0.5, 0.5}, true); final List<Integer> mle = new ArrayList<Integer>(); final Map<Allele, Double> log10pNonRefByAllele = new HashMap<Allele, Double>(); long runtimeNano = -1; GenomeLoc currentLoc = null; while (true) { final String line = reader.readLine(); if (line == null) return calls; final String[] parts = line.split("\t"); final GenomeLoc lineLoc = parser.parseGenomeLoc(parts[0]); final String variable = parts[1]; final String key = parts[2]; final String value = parts[3]; if (currentLoc == null) currentLoc = lineLoc; if (variable.equals("type")) { if (startsToKeep.isEmpty() || startsToKeep.contains(currentLoc.getStart())) { builder.alleles(alleles); final int stop = currentLoc.getStart() + alleles.get(0).length() - 1; builder.chr(currentLoc.getContig()).start(currentLoc.getStart()).stop(stop); builder.genotypes(genotypes); final int[] mleInts = ArrayUtils.toPrimitive(mle.toArray(new Integer[] {})); final AFCalcResult result = new AFCalcResult(mleInts, 1, alleles, posteriors, priors, log10pNonRefByAllele); calls.add(new ExactCall(builder.make(), runtimeNano, result)); } break; } else if (variable.equals("allele")) { final boolean isRef = key.equals("0"); alleles.add(Allele.create(value, isRef)); } else if (variable.equals("PL")) { final GenotypeBuilder gb = new GenotypeBuilder(key); gb.PL(GenotypeLikelihoods.fromPLField(value).getAsPLs()); genotypes.add(gb.make()); } else if (variable.equals("log10PosteriorOfAFEq0")) { posteriors[0] = Double.valueOf(value); } else if (variable.equals("log10PosteriorOfAFGt0")) { posteriors[1] = Double.valueOf(value); } else if (variable.equals("MLE")) { mle.add(Integer.valueOf(value)); } else if (variable.equals("pNonRefByAllele")) { final Allele a = Allele.create(key); log10pNonRefByAllele.put(a, Double.valueOf(value)); } else if (variable.equals("runtime.nano")) { runtimeNano = Long.valueOf(value); } else { // nothing to do } } } }
public Allele getLikelihoods( RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> contexts, AlignmentContextUtils.ReadOrientation contextType, GenotypePriors priors, Map<String, MultiallelicGenotypeLikelihoods> GLs, Allele alternateAlleleToUse, boolean useBAQedPileup) { if (tracker == null) return null; GenomeLoc loc = ref.getLocus(); Allele refAllele, altAllele; VariantContext vc = null; if (!ref.getLocus().equals(lastSiteVisited)) { // starting a new site: clear allele list alleleList.clear(); lastSiteVisited = ref.getLocus(); indelLikelihoodMap.set(new HashMap<PileupElement, LinkedHashMap<Allele, Double>>()); haplotypeMap.clear(); if (getAlleleListFromVCF) { for (final VariantContext vc_input : tracker.getValues(UAC.alleles, loc)) { if (vc_input != null && allowableTypes.contains(vc_input.getType()) && ref.getLocus().getStart() == vc_input.getStart()) { vc = vc_input; break; } } // ignore places where we don't have a variant if (vc == null) return null; alleleList.clear(); if (ignoreSNPAllelesWhenGenotypingIndels) { // if there's an allele that has same length as the reference (i.e. a SNP or MNP), ignore // it and don't genotype it for (Allele a : vc.getAlleles()) if (a.isNonReference() && a.getBases().length == vc.getReference().getBases().length) continue; else alleleList.add(a); } else { for (Allele a : vc.getAlleles()) alleleList.add(a); } } else { alleleList = computeConsensusAlleles(ref, contexts, contextType); if (alleleList.isEmpty()) return null; } } // protect against having an indel too close to the edge of a contig if (loc.getStart() <= HAPLOTYPE_SIZE) return null; // check if there is enough reference window to create haplotypes (can be an issue at end of // contigs) if (ref.getWindow().getStop() < loc.getStop() + HAPLOTYPE_SIZE) return null; if (!(priors instanceof DiploidIndelGenotypePriors)) throw new StingException( "Only diploid-based Indel priors are supported in the DINDEL GL model"); if (alleleList.isEmpty()) return null; refAllele = alleleList.get(0); altAllele = alleleList.get(1); // look for alt allele that has biggest length distance to ref allele int maxLenDiff = 0; for (Allele a : alleleList) { if (a.isNonReference()) { int lenDiff = Math.abs(a.getBaseString().length() - refAllele.getBaseString().length()); if (lenDiff > maxLenDiff) { maxLenDiff = lenDiff; altAllele = a; } } } final int eventLength = altAllele.getBaseString().length() - refAllele.getBaseString().length(); final int hsize = (int) ref.getWindow().size() - Math.abs(eventLength) - 1; final int numPrefBases = ref.getLocus().getStart() - ref.getWindow().getStart() + 1; haplotypeMap = Haplotype.makeHaplotypeListFromAlleles( alleleList, loc.getStart(), ref, hsize, numPrefBases); // For each sample, get genotype likelihoods based on pileup // compute prior likelihoods on haplotypes, and initialize haplotype likelihood matrix with // them. 
// initialize the GenotypeLikelihoods GLs.clear(); for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) { AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); ReadBackedPileup pileup = null; if (context.hasExtendedEventPileup()) pileup = context.getExtendedEventPileup(); else if (context.hasBasePileup()) pileup = context.getBasePileup(); if (pileup != null) { final double[] genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap()); GLs.put( sample.getKey(), new MultiallelicGenotypeLikelihoods( sample.getKey(), alleleList, genotypeLikelihoods, getFilteredDepth(pileup))); if (DEBUG) { System.out.format("Sample:%s Alleles:%s GL:", sample.getKey(), alleleList.toString()); for (int k = 0; k < genotypeLikelihoods.length; k++) System.out.format("%1.4f ", genotypeLikelihoods[k]); System.out.println(); } } } return refAllele; }
@Requires("that != null") public final List<GenomeLoc> subtract(final GenomeLoc that) { if (GenomeLoc.isUnmapped(this) || GenomeLoc.isUnmapped(that)) { if (!GenomeLoc.isUnmapped(this) || !GenomeLoc.isUnmapped(that)) throw new ReviewedStingException("Tried to intersect a mapped and an unmapped genome loc"); return Arrays.asList(UNMAPPED); } if (!(this.overlapsP(that))) { throw new ReviewedStingException("GenomeLoc::minus(): The two genome loc's need to overlap"); } if (equals(that)) { return Collections.emptyList(); } else if (containsP(that)) { List<GenomeLoc> l = new ArrayList<GenomeLoc>(2); /** * we have to create two new region, one for the before part, one for the after The old * region: |----------------- old region (g) -------------| |----- to delete (e) ------| * * <p>product (two new regions): |------| + |--------| */ int afterStop = this.getStop(), afterStart = that.getStop() + 1; int beforeStop = that.getStart() - 1, beforeStart = this.getStart(); if (afterStop - afterStart >= 0) { GenomeLoc after = new GenomeLoc(this.getContig(), getContigIndex(), afterStart, afterStop); l.add(after); } if (beforeStop - beforeStart >= 0) { GenomeLoc before = new GenomeLoc(this.getContig(), getContigIndex(), beforeStart, beforeStop); l.add(before); } return l; } else if (that.containsP(this)) { /** * e completely contains g, delete g, but keep looking, there may be more regions i.e.: * |--------------------- e --------------------| |--- g ---| |---- others ----| */ return Collections.emptyList(); // don't need to do anything } else { /** * otherwise e overlaps some part of g * * <p>figure out which region occurs first on the genome. I.e., is it: |------------- g * ----------| |------------- e ----------| * * <p>or: |------------- g ----------| |------------ e -----------| */ GenomeLoc n; if (that.getStart() < this.getStart()) { n = new GenomeLoc(this.getContig(), getContigIndex(), that.getStop() + 1, this.getStop()); } else { n = new GenomeLoc(this.getContig(), getContigIndex(), this.getStart(), that.getStart() - 1); } // replace g with the new region return Arrays.asList(n); } }
@Override public T traverse( final ActiveRegionWalker<M, T> walker, final LocusShardDataProvider dataProvider, T sum) { logger.debug(String.format("TraverseActiveRegion.traverse: Shard is %s", dataProvider)); final LocusView locusView = getLocusView(walker, dataProvider); final GenomeLocSortedSet initialIntervals = engine.getIntervals(); final LocusReferenceView referenceView = new LocusReferenceView(walker, dataProvider); final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension(); final int maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion(); if (locusView .hasNext()) { // trivial optimization to avoid unnecessary processing when there's nothing // here at all int minStart = Integer.MAX_VALUE; ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions()); ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); // We keep processing while the next reference location is within the interval GenomeLoc prevLoc = null; while (locusView.hasNext()) { final AlignmentContext locus = locusView.next(); GenomeLoc location = locus.getLocation(); if (prevLoc != null) { // fill in the active / inactive labels from the stop of the previous location to the // start of this location // TODO refactor to separate function for (int iii = prevLoc.getStop() + 1; iii < location.getStart(); iii++) { final GenomeLoc fakeLoc = engine.getGenomeLocParser().createGenomeLoc(prevLoc.getContig(), iii, iii); if (initialIntervals == null || initialIntervals.overlaps(fakeLoc)) { profile.add( fakeLoc, new ActivityProfileResult( walker.hasPresetActiveRegions() && walker.presetActiveRegions.overlaps(fakeLoc) ? 1.0 : 0.0)); } } } dataProvider.getShard().getReadMetrics().incrementNumIterations(); // create reference context. Note that if we have a pileup of "extended events", the context // will // hold the (longest) stretch of deleted reference bases (if deletions are present in the // pileup). 
final ReferenceContext refContext = referenceView.getReferenceContext(location); // Iterate forward to get all reference ordered data covering this location final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus( locus.getLocation(), refContext); // Call the walkers isActive function for this locus and add them to the list to be // integrated later if (initialIntervals == null || initialIntervals.overlaps(location)) { profile.add(location, walkerActiveProb(walker, tracker, refContext, locus, location)); } // Grab all the previously unseen reads from this pileup and add them to the massive read // list for (final PileupElement p : locus.getBasePileup()) { final GATKSAMRecord read = p.getRead(); if (!myReads.contains(read)) { myReads.add(read); } // If this is the last pileup for this shard calculate the minimum alignment start so that // we know // which active regions in the work queue are now safe to process minStart = Math.min(minStart, read.getAlignmentStart()); } prevLoc = location; printProgress(locus.getLocation()); } updateCumulativeMetrics(dataProvider.getShard()); // Take the individual isActive calls and integrate them into contiguous active regions and // add these blocks of work to the work queue // band-pass filter the list of isActive probabilities and turn into active regions final ActivityProfile bandPassFiltered = profile.bandPassFilter(); final List<ActiveRegion> activeRegions = bandPassFiltered.createActiveRegions(activeRegionExtension, maxRegionSize); // add active regions to queue of regions to process // first check if can merge active regions over shard boundaries if (!activeRegions.isEmpty()) { if (!workQueue.isEmpty()) { final ActiveRegion last = workQueue.getLast(); final ActiveRegion first = activeRegions.get(0); if (last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize) { workQueue.removeLast(); activeRegions.remove(first); workQueue.add( new ActiveRegion( last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension)); } } workQueue.addAll(activeRegions); } logger.debug( "Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions."); // now go and process all of the active regions sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig()); } return sum; }
@Requires("that != null") @Ensures("result >= 0") public final int distance(final GenomeLoc that) { if (this.onSameContig(that)) return Math.abs(this.getStart() - that.getStart()); else return Integer.MAX_VALUE; }
@Requires("that != null") public final boolean containsP(GenomeLoc that) { return onSameContig(that) && getStart() <= that.getStart() && getStop() >= that.getStop(); }
public void writeBeagleOutput( VariantContext preferredVC, VariantContext otherVC, boolean isValidationSite, double prior) { GenomeLoc currentLoc = VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), preferredVC); StringBuffer beagleOut = new StringBuffer(); String marker = String.format("%s:%d ", currentLoc.getContig(), currentLoc.getStart()); beagleOut.append(marker); if (markers != null) markers.append(marker).append("\t").append(Integer.toString(markerCounter++)).append("\t"); for (Allele allele : preferredVC.getAlleles()) { String bglPrintString; if (allele.isNoCall() || allele.isNull()) bglPrintString = "-"; else bglPrintString = allele.getBaseString(); // get rid of * in case of reference allele beagleOut.append(String.format("%s ", bglPrintString)); if (markers != null) markers.append(bglPrintString).append("\t"); } if (markers != null) markers.append("\n"); GenotypesContext preferredGenotypes = preferredVC.getGenotypes(); GenotypesContext otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null; for (String sample : samples) { boolean isMaleOnChrX = CHECK_IS_MALE_ON_CHR_X && getSample(sample).getGender() == Gender.MALE; Genotype genotype; boolean isValidation; // use sample as key into genotypes structure if (preferredGenotypes.containsSample(sample)) { genotype = preferredGenotypes.get(sample); isValidation = isValidationSite; } else if (otherGenotypes != null && otherGenotypes.containsSample(sample)) { genotype = otherGenotypes.get(sample); isValidation = !isValidationSite; } else { // there is magically no genotype for this sample. throw new StingException( "Sample " + sample + " arose with no genotype in variant or validation VCF. This should never happen."); } /* * Use likelihoods if: is validation, prior is negative; or: is not validation, has genotype key */ double[] log10Likelihoods = null; if ((isValidation && prior < 0.0) || genotype.hasLikelihoods()) { log10Likelihoods = genotype.getLikelihoods().getAsVector(); // see if we need to randomly mask out genotype in this position. if (GenomeAnalysisEngine.getRandomGenerator().nextDouble() <= insertedNoCallRate) { // we are masking out this genotype log10Likelihoods = isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS; } if (isMaleOnChrX) { log10Likelihoods[1] = -255; // todo -- warning this is dangerous for multi-allele case } } /** otherwise, use the prior uniformly */ else if (!isValidation && genotype.isCalled() && !genotype.hasLikelihoods()) { // hack to deal with input VCFs with no genotype likelihoods. Just assume the called // genotype // is confident. This is useful for Hapmap and 1KG release VCFs. double AA = (1.0 - prior) / 2.0; double AB = (1.0 - prior) / 2.0; double BB = (1.0 - prior) / 2.0; if (genotype.isHomRef()) { AA = prior; } else if (genotype.isHet()) { AB = prior; } else if (genotype.isHomVar()) { BB = prior; } log10Likelihoods = MathUtils.toLog10(new double[] {AA, isMaleOnChrX ? 0.0 : AB, BB}); } else { log10Likelihoods = isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS; } writeSampleLikelihoods(beagleOut, preferredVC, log10Likelihoods); } beagleWriter.println(beagleOut.toString()); }
/**
 * Tests whether this genome loc starts before that one.
 *
 * @param that the other GenomeLoc to test
 * @return true if this loc is on an earlier contig, or on the same contig with an earlier
 *     start, than that loc
 */
@Requires("that != null")
public final boolean startsBefore(final GenomeLoc that) {
  int comparison = this.compareContigs(that);
  return (comparison == -1 || (comparison == 0 && this.getStart() < that.getStart()));
}
@Requires({"locFirst != null", "locSecond != null", "locSecond.isPast(locFirst)"}) @Ensures("result >= 0") private static int distanceFirstStopToSecondStart(GenomeLoc locFirst, GenomeLoc locSecond) { return locSecond.getStart() - locFirst.getStop(); }
/** * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority * order, if provided. If uniqifySamples is true, the priority order is ignored and names are * created by concatenating the VC name with the sample name * * @param genomeLocParser loc parser * @param unsortedVCs collection of unsorted VCs * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs * @param filteredRecordMergeType merge type for filtered records * @param genotypeMergeOptions merge option for genotypes * @param annotateOrigin should we annotate the set it came from? * @param printMessages should we print messages? * @param setKey the key name of the set * @param filteredAreUncalled are filtered records uncalled? * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? * @return new VariantContext representing the merge of unsortedVCs */ public static VariantContext simpleMerge( final GenomeLocParser genomeLocParser, final Collection<VariantContext> unsortedVCs, final List<String> priorityListOfVCs, final FilteredRecordMergeType filteredRecordMergeType, final GenotypeMergeType genotypeMergeOptions, final boolean annotateOrigin, final boolean printMessages, final String setKey, final boolean filteredAreUncalled, final boolean mergeInfoWithMaxAC) { if (unsortedVCs == null || unsortedVCs.size() == 0) return null; if (annotateOrigin && priorityListOfVCs == null) throw new IllegalArgumentException( "Cannot merge calls and annotate their origins without a complete priority list of VariantContexts"); if (genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE) verifyUniqueSampleNames(unsortedVCs); List<VariantContext> prepaddedVCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions); // Make sure all variant contexts are padded with reference base in case of indels if necessary List<VariantContext> VCs = new ArrayList<VariantContext>(); for (VariantContext vc : prepaddedVCs) { // also a reasonable place to remove filtered calls, if needed if (!filteredAreUncalled || vc.isNotFiltered()) VCs.add(createVariantContextWithPaddedAlleles(vc, false)); } if (VCs.size() == 0) // everything is filtered out and we're filteredAreUncalled return null; // establish the baseline info from the first VC final VariantContext first = VCs.get(0); final String name = first.getSource(); final Allele refAllele = determineReferenceAllele(VCs); final Set<Allele> alleles = new LinkedHashSet<Allele>(); final Set<String> filters = new TreeSet<String>(); final Map<String, Object> attributes = new TreeMap<String, Object>(); final Set<String> inconsistentAttributes = new HashSet<String>(); final Set<String> variantSources = new HashSet< String>(); // contains the set of sources we found in our set of VCs that are variant final Set<String> rsIDs = new LinkedHashSet<String>(1); // most of the time there's one id GenomeLoc loc = getLocation(genomeLocParser, first); int depth = 0; int maxAC = -1; final Map<String, Object> attributesWithMaxAC = new TreeMap<String, Object>(); double log10PError = 1; VariantContext vcWithMaxAC = null; GenotypesContext genotypes = GenotypesContext.create(); // counting the number of filtered and variant VCs int nFiltered = 0; boolean remapped = false; // cycle through and add info from the other VCs, making sure the loc/reference matches for (VariantContext vc : VCs) { if (loc.getStart() != vc.getStart()) // || !first.getReference().equals(vc.getReference()) ) throw new ReviewedStingException( 
"BUG: attempting to merge VariantContexts with different start sites: first=" + first.toString() + " second=" + vc.toString()); if (getLocation(genomeLocParser, vc).size() > loc.size()) loc = getLocation(genomeLocParser, vc); // get the longest location nFiltered += vc.isFiltered() ? 1 : 0; if (vc.isVariant()) variantSources.add(vc.getSource()); AlleleMapper alleleMapping = resolveIncompatibleAlleles(refAllele, vc, alleles); remapped = remapped || alleleMapping.needsRemapping(); alleles.addAll(alleleMapping.values()); mergeGenotypes( genotypes, vc, alleleMapping, genotypeMergeOptions == GenotypeMergeType.UNIQUIFY); log10PError = Math.min(log10PError, vc.isVariant() ? vc.getLog10PError() : 1); filters.addAll(vc.getFilters()); // // add attributes // // special case DP (add it up) and ID (just preserve it) // if (vc.hasAttribute(VCFConstants.DEPTH_KEY)) depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0); if (vc.hasID()) rsIDs.add(vc.getID()); if (mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) { String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, null); // lets see if the string contains a , separator if (rawAlleleCounts.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)) { List<String> alleleCountArray = Arrays.asList( rawAlleleCounts .substring(1, rawAlleleCounts.length() - 1) .split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)); for (String alleleCount : alleleCountArray) { final int ac = Integer.valueOf(alleleCount.trim()); if (ac > maxAC) { maxAC = ac; vcWithMaxAC = vc; } } } else { final int ac = Integer.valueOf(rawAlleleCounts); if (ac > maxAC) { maxAC = ac; vcWithMaxAC = vc; } } } for (Map.Entry<String, Object> p : vc.getAttributes().entrySet()) { String key = p.getKey(); // if we don't like the key already, don't go anywhere if (!inconsistentAttributes.contains(key)) { boolean alreadyFound = attributes.containsKey(key); Object boundValue = attributes.get(key); boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4); if (alreadyFound && !boundValue.equals(p.getValue()) && !boundIsMissingValue) { // we found the value but we're inconsistent, put it in the exclude list // System.out.printf("Inconsistent INFO values: %s => %s and %s%n", key, boundValue, // p.getValue()); inconsistentAttributes.add(key); attributes.remove(key); } else if (!alreadyFound || boundIsMissingValue) { // no value // if ( vc != first ) System.out.printf("Adding key %s => %s%n", p.getKey(), // p.getValue()); attributes.put(key, p.getValue()); } } } } // if we have more alternate alleles in the merged VC than in one or more of the // original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well // as allele-dependent attributes like AC,AF for (VariantContext vc : VCs) { if (vc.alleles.size() == 1) continue; if (hasPLIncompatibleAlleles(alleles, vc.alleles)) { if (!genotypes.isEmpty()) logger.warn( String.format( "Stripping PLs at %s due incompatible alleles merged=%s vs. 
single=%s", genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles)); genotypes = stripPLs(genotypes); // this will remove stale AC,AF attributed from vc calculateChromosomeCounts(vc, attributes, true); break; } } // take the VC with the maxAC and pull the attributes into a modifiable map if (mergeInfoWithMaxAC && vcWithMaxAC != null) { attributesWithMaxAC.putAll(vcWithMaxAC.getAttributes()); } // if at least one record was unfiltered and we want a union, clear all of the filters if ((filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size()) || filteredRecordMergeType == FilteredRecordMergeType.KEEP_UNCONDITIONAL) filters.clear(); if (annotateOrigin) { // we care about where the call came from String setValue; if (nFiltered == 0 && variantSources.size() == priorityListOfVCs.size()) // nothing was unfiltered setValue = MERGE_INTERSECTION; else if (nFiltered == VCs.size()) // everything was filtered out setValue = MERGE_FILTER_IN_ALL; else if (variantSources.isEmpty()) // everyone was reference setValue = MERGE_REF_IN_ALL; else { LinkedHashSet<String> s = new LinkedHashSet<String>(); for (VariantContext vc : VCs) if (vc.isVariant()) s.add(vc.isFiltered() ? MERGE_FILTER_PREFIX + vc.getSource() : vc.getSource()); setValue = Utils.join("-", s); } if (setKey != null) { attributes.put(setKey, setValue); if (mergeInfoWithMaxAC && vcWithMaxAC != null) { attributesWithMaxAC.put(setKey, vcWithMaxAC.getSource()); } } } if (depth > 0) attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth)); final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs); final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID); builder.loc(loc.getContig(), loc.getStart(), loc.getStop()); builder.alleles(alleles); builder.genotypes(genotypes); builder.log10PError(log10PError); builder.filters(filters).attributes(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes); // Trim the padded bases of all alleles if necessary VariantContext merged = createVariantContextWithTrimmedAlleles(builder.make()); if (printMessages && remapped) System.out.printf("Remapped => %s%n", merged); return merged; }
private GenomeLoc createIntervalBefore(GenomeLoc interval) { int start = Math.max(interval.getStart() - expandInterval, 0); int stop = Math.max(interval.getStart() - 1, 0); return parser.createGenomeLoc(interval.getContig(), interval.getContigIndex(), start, stop); }