public Map<String, Object> annotate(
    RefMetaDataTracker tracker,
    AnnotatorCompatibleWalker walker,
    ReferenceContext ref,
    Map<String, AlignmentContext> stratifiedContexts,
    VariantContext vc) {
  if (stratifiedContexts.size() == 0) return null;

  double mq0 = 0;
  double mq10 = 0;
  double total = 0;
  for (Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet()) {
    if (!sample.getValue().hasBasePileup()) continue;
    for (PileupElement p : sample.getValue().getBasePileup()) {
      if (p.getMappingQual() == 0) {
        mq0 += 1;
      }
      if (p.getMappingQual() <= 10) {
        mq10 += 1;
      }
      total += 1;
    }
  }

  Map<String, Object> map = new HashMap<String, Object>();
  map.put(
      getKeyNames().get(0),
      String.format("%.04f,%.04f,%.00f", mq0 / total, mq10 / total, total));
  return map;
}
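A minimal, self-contained sketch (plain Java, no GATK types) of the value this annotation emits: the MQ0 fraction, the MQ<=10 fraction (which, as in the loop above, also counts the MQ0 reads), and the total depth, using the same format string. The class and method names here are illustrative only.

public class MappingQualityFractionSketch {
  // Same arithmetic and format string as the annotation above, applied to a plain array of
  // per-read mapping qualities.
  static String formatFractions(int[] mappingQuals) {
    double mq0 = 0, mq10 = 0, total = 0;
    for (int mq : mappingQuals) {
      if (mq == 0) mq0++;
      if (mq <= 10) mq10++; // note: MQ0 reads are counted here as well
      total++;
    }
    return String.format("%.04f,%.04f,%.00f", mq0 / total, mq10 / total, total);
  }

  public static void main(String[] args) {
    // 10 reads: two at MQ0, one at MQ7, seven at MQ60 -> prints "0.2000,0.3000,10"
    System.out.println(formatFractions(new int[] {0, 0, 7, 60, 60, 60, 60, 60, 60, 60}));
  }
}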
// Overload function in GenotypeLikelihoodsCalculationModel so that, for an indel case, we
// consider a deletion as part of the pileup, so that per-sample DP will include deletions
// covering the event.
protected int getFilteredDepth(ReadBackedPileup pileup) {
  int count = 0;
  for (PileupElement p : pileup) {
    if (p.isDeletion() || BaseUtils.isRegularBase(p.getBase())) count++;
  }
  return count;
}
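A minimal sketch of the same depth rule on a toy pileup, with no GATK types: spanning deletions and regular bases (A/C/G/T) count toward depth, anything else (for example an 'N' call) does not. Representing a deletion as a null entry is a convention of this sketch, not GATK's representation.

public class FilteredDepthSketch {
  // Mirrors the rule above: count an element if it is a deletion or a regular base (A/C/G/T).
  static int filteredDepth(Character[] pileup) { // null stands in for a spanning deletion
    int count = 0;
    for (Character base : pileup) {
      boolean isDeletion = (base == null);
      boolean isRegularBase = !isDeletion && "ACGT".indexOf(Character.toUpperCase(base)) >= 0;
      if (isDeletion || isRegularBase) count++;
    }
    return count;
  }

  public static void main(String[] args) {
    // A, deletion, N, C -> depth 3 (the N is excluded)
    System.out.println(filteredDepth(new Character[] {'A', null, 'N', 'C'}));
  }
}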
@Test(dataProvider = "LIBSTest")
public void testLIBS(LIBSTest params) {
  final int locus = 44367788;

  SAMRecord read =
      ArtificialSAMUtils.createArtificialRead(header, "read", 0, locus, params.readLength);
  read.setReadBases(Utils.dupBytes((byte) 'A', params.readLength));
  read.setBaseQualities(Utils.dupBytes((byte) '@', params.readLength));
  read.setCigarString(params.cigar);

  // create the iterator by state with the fake reads and fake records
  li = makeLTBS(Arrays.asList(read), createTestReadProperties());
  final LIBS_position tester = new LIBS_position(read);

  while (li.hasNext()) {
    AlignmentContext alignmentContext = li.next();
    ReadBackedPileup p = alignmentContext.getBasePileup();
    Assert.assertTrue(p.getNumberOfElements() == 1);
    PileupElement pe = p.iterator().next();

    tester.stepForwardOnGenome();

    Assert.assertEquals(pe.isBeforeDeletedBase(), tester.isBeforeDeletedBase);
    Assert.assertEquals(pe.isBeforeDeletionStart(), tester.isBeforeDeletionStart);
    Assert.assertEquals(pe.isAfterDeletedBase(), tester.isAfterDeletedBase);
    Assert.assertEquals(pe.isAfterDeletionEnd(), tester.isAfterDeletionEnd);
    Assert.assertEquals(pe.isBeforeInsertion(), tester.isBeforeInsertion);
    Assert.assertEquals(pe.isAfterInsertion(), tester.isAfterInsertion);
    Assert.assertEquals(pe.isNextToSoftClip(), tester.isNextToSoftClip);
    Assert.assertEquals(pe.getOffset(), tester.getCurrentReadOffset());
  }
}
private Haplotype getHaplotypeFromRead(
    final PileupElement p, final int contextSize, final int locus) {
  final GATKSAMRecord read = p.getRead();
  int readOffsetFromPileup = p.getOffset();

  final byte[] haplotypeBases = new byte[contextSize];
  Arrays.fill(haplotypeBases, (byte) REGEXP_WILDCARD);
  final double[] baseQualities = new double[contextSize];
  Arrays.fill(baseQualities, 0.0);

  byte[] readBases = read.getReadBases();
  // Adjust the read bases based on the Cigar string
  readBases = AlignmentUtils.readToAlignmentByteArray(read.getCigar(), readBases);
  byte[] readQuals = read.getBaseQualities();
  // Shift the location of the qual scores based on the Cigar string
  readQuals = AlignmentUtils.readToAlignmentByteArray(read.getCigar(), readQuals);

  readOffsetFromPileup =
      AlignmentUtils.calcAlignmentByteArrayOffset(
          read.getCigar(), p, read.getAlignmentStart(), locus);
  final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1) / 2;

  for (int i = 0; i < contextSize; i++) {
    final int baseOffset = i + baseOffsetStart;
    if (baseOffset < 0) {
      continue;
    }
    if (baseOffset >= readBases.length) {
      break;
    }
    if (readQuals[baseOffset] == PileupElement.DELETION_BASE) {
      readQuals[baseOffset] = PileupElement.DELETION_QUAL;
    }
    if (!BaseUtils.isRegularBase(readBases[baseOffset])) {
      // N's shouldn't be treated as distinct bases
      readBases[baseOffset] = (byte) REGEXP_WILDCARD;
      readQuals[baseOffset] = (byte) 0;
    }
    readQuals[baseOffset] = (byte) Math.min((int) readQuals[baseOffset], p.getMappingQual());
    if (((int) readQuals[baseOffset]) < 5) {
      // quals less than 5 are used as codes and don't have actual probabilistic meaning behind them
      readQuals[baseOffset] = (byte) 0;
    }
    haplotypeBases[i] = readBases[baseOffset];
    baseQualities[i] = (double) readQuals[baseOffset];
  }

  return new Haplotype(haplotypeBases, baseQualities);
}
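The window arithmetic above (start = offset - (contextSize - 1) / 2, with positions that fall off either end of the read left as the wildcard) is easy to misread; here is a small self-contained sketch of just that indexing, with the CIGAR handling omitted and all names illustrative.

public class ContextWindowSketch {
  // Extract a context window of the given size centered on 'offset', padding positions that
  // fall outside the read with the wildcard byte, as in the loop above.
  static byte[] contextWindow(byte[] readBases, int offset, int contextSize, byte wildcard) {
    byte[] window = new byte[contextSize];
    java.util.Arrays.fill(window, wildcard);
    int start = offset - (contextSize - 1) / 2;
    for (int i = 0; i < contextSize; i++) {
      int baseOffset = start + i;
      if (baseOffset < 0) continue;              // window begins before the read starts
      if (baseOffset >= readBases.length) break; // window runs off the end of the read
      window[i] = readBases[baseOffset];
    }
    return window;
  }

  public static void main(String[] args) {
    // Window of size 5 centered on offset 1 of "ACGTACGT" -> prints ".ACGT"
    System.out.println(new String(contextWindow("ACGTACGT".getBytes(), 1, 5, (byte) '.')));
  }
}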
@Test
public void testIndelsInRegularPileup() {
  final byte[] bases = new byte[] {'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A'};
  final byte[] indelBases =
      new byte[] {'A', 'A', 'A', 'A', 'C', 'T', 'A', 'A', 'A', 'A', 'A', 'A'};

  // create a test version of the Reads object
  ReadProperties readAttributes = createTestReadProperties();

  SAMRecord before = ArtificialSAMUtils.createArtificialRead(header, "before", 0, 1, 10);
  before.setReadBases(bases);
  before.setBaseQualities(new byte[] {20, 20, 20, 20, 20, 20, 20, 20, 20, 20});
  before.setCigarString("10M");

  SAMRecord during = ArtificialSAMUtils.createArtificialRead(header, "during", 0, 2, 10);
  during.setReadBases(indelBases);
  during.setBaseQualities(new byte[] {20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20});
  during.setCigarString("4M2I6M");

  SAMRecord after = ArtificialSAMUtils.createArtificialRead(header, "after", 0, 3, 10);
  after.setReadBases(bases);
  after.setBaseQualities(new byte[] {20, 20, 20, 20, 20, 20, 20, 20, 20, 20});
  after.setCigarString("10M");

  List<SAMRecord> reads = Arrays.asList(before, during, after);

  // create the iterator by state with the fake reads and fake records
  li = makeLTBS(reads, readAttributes);

  boolean foundIndel = false;
  while (li.hasNext()) {
    AlignmentContext context = li.next();
    ReadBackedPileup pileup = context.getBasePileup().getBaseFilteredPileup(10);
    for (PileupElement p : pileup) {
      if (p.isBeforeInsertion()) {
        foundIndel = true;
        Assert.assertEquals(p.getEventLength(), 2, "Wrong event length");
        Assert.assertEquals(p.getEventBases(), "CT", "Inserted bases are incorrect");
        break;
      }
    }
  }

  Assert.assertTrue(foundIndel, "Indel in pileup not found");
}
private Map<String, AlignmentContext> getFilteredAndStratifiedContexts(
    UnifiedArgumentCollection UAC,
    ReferenceContext refContext,
    AlignmentContext rawContext,
    final GenotypeLikelihoodsCalculationModel.Model model) {

  if (!BaseUtils.isRegularBase(refContext.getBase())) return null;

  Map<String, AlignmentContext> stratifiedContexts = null;

  if (model.name().contains("INDEL")) {
    final ReadBackedPileup pileup =
        rawContext.getBasePileup().getMappingFilteredPileup(UAC.MIN_BASE_QUALTY_SCORE);
    // don't call when there is no coverage
    if (pileup.getNumberOfElements() == 0 && UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES)
      return null;

    // stratify the AlignmentContext and cut by sample
    stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup);

  } else if (model.name().contains("SNP")) {

    // stratify the AlignmentContext and cut by sample
    stratifiedContexts =
        AlignmentContextUtils.splitContextBySampleName(rawContext.getBasePileup());

    if (!(UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES
        && UAC.GenotypingMode
            != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES)) {
      int numDeletions = 0;
      for (final PileupElement p : rawContext.getBasePileup()) {
        if (p.isDeletion()) numDeletions++;
      }
      if (((double) numDeletions) / ((double) rawContext.getBasePileup().getNumberOfElements())
          > UAC.MAX_DELETION_FRACTION) {
        return null;
      }
    }
  }

  return stratifiedContexts;
}
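A minimal sketch of the deletion filter applied in the SNP branch above, expressed on plain counts; maxDeletionFraction plays the role of UAC.MAX_DELETION_FRACTION, and the names are illustrative.

public class DeletionFractionSketch {
  // A site is skipped when the fraction of pileup elements that are spanning deletions
  // exceeds the configured maximum, matching the check above.
  static boolean exceedsDeletionFraction(int numDeletions, int pileupSize, double maxDeletionFraction) {
    if (pileupSize == 0) return false;
    return ((double) numDeletions) / ((double) pileupSize) > maxDeletionFraction;
  }

  public static void main(String[] args) {
    System.out.println(exceedsDeletionFraction(3, 10, 0.05)); // true  -> site is skipped
    System.out.println(exceedsDeletionFraction(0, 10, 0.05)); // false -> site is genotyped
  }
}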
private static String createVerboseOutput(final ReadBackedPileup pileup) {
  final StringBuilder sb = new StringBuilder();
  boolean isFirst = true;

  sb.append(pileup.getNumberOfDeletions());
  sb.append(" ");

  for (PileupElement p : pileup) {
    if (isFirst) isFirst = false;
    else sb.append(",");
    sb.append(p.getRead().getReadName());
    sb.append(verboseDelimiter);
    sb.append(p.getOffset());
    sb.append(verboseDelimiter);
    sb.append(p.getRead().getReadLength());
    sb.append(verboseDelimiter);
    sb.append(p.getRead().getMappingQuality());
  }
  return sb.toString();
}
private Map<String, Object> annotateSNP(AlignmentContext stratifiedContext, VariantContext vc) {
  if (!stratifiedContext.hasBasePileup()) return null;

  HashMap<Byte, Integer> alleleCounts = new HashMap<Byte, Integer>();
  for (Allele allele : vc.getAlternateAlleles()) alleleCounts.put(allele.getBases()[0], 0);

  ReadBackedPileup pileup = stratifiedContext.getBasePileup();
  int totalDepth = pileup.size();

  Map<String, Object> map = new HashMap<String, Object>();
  map.put(getKeyNames().get(0), totalDepth); // put total depth in right away

  if (totalDepth == 0) return map; // done, cannot compute FA at 0 coverage!!

  int mq0 = 0; // number of "ref" reads that are actually mq0
  for (PileupElement p : pileup) {
    if (p.getMappingQual() == 0) {
      mq0++;
      continue;
    }
    if (alleleCounts.containsKey(p.getBase())) // non-mq0 read and it's an alt
      alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase()) + 1);
  }

  if (mq0 == totalDepth) return map; // if all reads are mq0, there is nothing left to do

  // we need to add counts in the correct order
  String[] fracs = new String[alleleCounts.size()];
  for (int i = 0; i < vc.getAlternateAlleles().size(); i++) {
    fracs[i] =
        String.format(
            "%.3f",
            ((float) alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]))
                / (totalDepth - mq0));
  }

  map.put(getKeyNames().get(1), fracs);
  return map;
}
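A minimal sketch of the allele-fraction arithmetic above on plain arrays: MQ0 reads count toward the reported depth but are excluded from both the per-allele counts and the denominator. Names and inputs are illustrative.

public class AlleleFractionSketch {
  static String[] altAlleleFractions(byte[] pileupBases, int[] mappingQuals, byte[] altAlleles) {
    java.util.Map<Byte, Integer> counts = new java.util.HashMap<Byte, Integer>();
    for (byte alt : altAlleles) counts.put(alt, 0);
    int totalDepth = pileupBases.length;
    int mq0 = 0;
    for (int i = 0; i < totalDepth; i++) {
      if (mappingQuals[i] == 0) { mq0++; continue; } // MQ0 reads never count as allele support
      byte b = pileupBases[i];
      if (counts.containsKey(b)) counts.put(b, counts.get(b) + 1);
    }
    String[] fracs = new String[altAlleles.length];
    for (int i = 0; i < altAlleles.length; i++)
      fracs[i] = String.format("%.3f", ((float) counts.get(altAlleles[i])) / (totalDepth - mq0));
    return fracs;
  }

  public static void main(String[] args) {
    // pileup A,A,C,C,C,G with the second A at MQ0 and alts {C,G} -> [0.600, 0.200]
    String[] fracs = altAlleleFractions(
        new byte[] {'A', 'A', 'C', 'C', 'C', 'G'},
        new int[] {60, 0, 60, 60, 60, 60},
        new byte[] {'C', 'G'});
    System.out.println(java.util.Arrays.toString(fracs));
  }
}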
/**
 * Test to make sure that reads supporting only an indel (example cigar string: 76I) are
 * represented properly
 */
@Test
public void testWholeIndelReadRepresentedTest() {
  final int firstLocus = 44367788, secondLocus = firstLocus + 1;

  SAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, secondLocus, 1);
  read1.setReadBases(Utils.dupBytes((byte) 'A', 1));
  read1.setBaseQualities(Utils.dupBytes((byte) '@', 1));
  read1.setCigarString("1I");
  List<SAMRecord> reads = Arrays.asList(read1);

  // create the iterator by state with the fake reads and fake records
  li = makeLTBS(reads, createTestReadProperties());

  while (li.hasNext()) {
    AlignmentContext alignmentContext = li.next();
    ReadBackedPileup p = alignmentContext.getBasePileup();
    Assert.assertTrue(p.getNumberOfElements() == 1);
    PileupElement pe = p.iterator().next();
    Assert.assertTrue(pe.isBeforeInsertion());
    Assert.assertFalse(pe.isAfterInsertion());
    Assert.assertEquals(pe.getEventBases(), "A");
  }

  SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header, "read2", 0, secondLocus, 10);
  read2.setReadBases(Utils.dupBytes((byte) 'A', 10));
  read2.setBaseQualities(Utils.dupBytes((byte) '@', 10));
  read2.setCigarString("10I");
  reads = Arrays.asList(read2);

  // create the iterator by state with the fake reads and fake records
  li = makeLTBS(reads, createTestReadProperties());

  while (li.hasNext()) {
    AlignmentContext alignmentContext = li.next();
    ReadBackedPileup p = alignmentContext.getBasePileup();
    Assert.assertTrue(p.getNumberOfElements() == 1);
    PileupElement pe = p.iterator().next();
    Assert.assertTrue(pe.isBeforeInsertion());
    Assert.assertFalse(pe.isAfterInsertion());
    Assert.assertEquals(pe.getEventBases(), "AAAAAAAAAA");
  }
}
@Override
protected Double getElementForPileupElement(final PileupElement p) {
  return (double) p.getRead().getMappingQuality();
}
@Override
public T traverse(
    final ActiveRegionWalker<M, T> walker, final LocusShardDataProvider dataProvider, T sum) {
  logger.debug(String.format("TraverseActiveRegion.traverse: Shard is %s", dataProvider));

  final LocusView locusView = getLocusView(walker, dataProvider);
  final GenomeLocSortedSet initialIntervals = engine.getIntervals();

  final LocusReferenceView referenceView = new LocusReferenceView(walker, dataProvider);
  final int activeRegionExtension =
      walker.getClass().getAnnotation(ActiveRegionExtension.class).extension();
  final int maxRegionSize =
      walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion();

  if (locusView.hasNext()) {
    // trivial optimization to avoid unnecessary processing when there's nothing here at all
    int minStart = Integer.MAX_VALUE;
    ActivityProfile profile =
        new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions());

    ReferenceOrderedView referenceOrderedDataView =
        getReferenceOrderedView(walker, dataProvider, locusView);

    // We keep processing while the next reference location is within the interval
    GenomeLoc prevLoc = null;
    while (locusView.hasNext()) {
      final AlignmentContext locus = locusView.next();
      GenomeLoc location = locus.getLocation();

      if (prevLoc != null) {
        // fill in the active / inactive labels from the stop of the previous location to the
        // start of this location
        // TODO refactor to separate function
        for (int iii = prevLoc.getStop() + 1; iii < location.getStart(); iii++) {
          final GenomeLoc fakeLoc =
              engine.getGenomeLocParser().createGenomeLoc(prevLoc.getContig(), iii, iii);
          if (initialIntervals == null || initialIntervals.overlaps(fakeLoc)) {
            profile.add(
                fakeLoc,
                new ActivityProfileResult(
                    walker.hasPresetActiveRegions()
                            && walker.presetActiveRegions.overlaps(fakeLoc)
                        ? 1.0
                        : 0.0));
          }
        }
      }

      dataProvider.getShard().getReadMetrics().incrementNumIterations();

      // create reference context. Note that if we have a pileup of "extended events", the
      // context will hold the (longest) stretch of deleted reference bases (if deletions are
      // present in the pileup).
      final ReferenceContext refContext = referenceView.getReferenceContext(location);

      // Iterate forward to get all reference ordered data covering this location
      final RefMetaDataTracker tracker =
          referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);

      // Call the walker's isActive function for this locus and add the result to the list to be
      // integrated later
      if (initialIntervals == null || initialIntervals.overlaps(location)) {
        profile.add(location, walkerActiveProb(walker, tracker, refContext, locus, location));
      }

      // Grab all the previously unseen reads from this pileup and add them to the massive read
      // list
      for (final PileupElement p : locus.getBasePileup()) {
        final GATKSAMRecord read = p.getRead();
        if (!myReads.contains(read)) {
          myReads.add(read);
        }

        // If this is the last pileup for this shard, calculate the minimum alignment start so
        // that we know which active regions in the work queue are now safe to process
        minStart = Math.min(minStart, read.getAlignmentStart());
      }

      prevLoc = location;

      printProgress(locus.getLocation());
    }

    updateCumulativeMetrics(dataProvider.getShard());

    // Take the individual isActive calls and integrate them into contiguous active regions and
    // add these blocks of work to the work queue:
    // band-pass filter the list of isActive probabilities and turn them into active regions
    final ActivityProfile bandPassFiltered = profile.bandPassFilter();
    final List<ActiveRegion> activeRegions =
        bandPassFiltered.createActiveRegions(activeRegionExtension, maxRegionSize);

    // add active regions to the queue of regions to process;
    // first check whether we can merge active regions over shard boundaries
    if (!activeRegions.isEmpty()) {
      if (!workQueue.isEmpty()) {
        final ActiveRegion last = workQueue.getLast();
        final ActiveRegion first = activeRegions.get(0);
        if (last.isActive == first.isActive
            && last.getLocation().contiguousP(first.getLocation())
            && last.getLocation().size() + first.getLocation().size() <= maxRegionSize) {
          workQueue.removeLast();
          activeRegions.remove(first);
          workQueue.add(
              new ActiveRegion(
                  last.getLocation().union(first.getLocation()),
                  first.isActive,
                  this.engine.getGenomeLocParser(),
                  activeRegionExtension));
        }
      }
      workQueue.addAll(activeRegions);
    }

    logger.debug(
        "Integrated "
            + profile.size()
            + " isActive calls into "
            + activeRegions.size()
            + " regions.");

    // now go and process all of the active regions
    sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig());
  }

  return sum;
}
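The shard-boundary merge near the end is the subtle step. Below is a minimal sketch on plain [start, stop] intervals, assuming "contiguous" means the first new region starts immediately after the last queued region ends; the GATK contiguousP/union/size calls are replaced by simple integer arithmetic, so this illustrates the merge condition rather than the actual ActiveRegion API.

public class RegionMergeSketch {
  // Merge the last queued region with the first newly created one only when both share the
  // same active state, are contiguous, and the combined size stays within maxRegionSize;
  // otherwise keep them separate, as in the work-queue logic above.
  static int[] maybeMerge(int[] last, boolean lastActive,
                          int[] first, boolean firstActive, int maxRegionSize) {
    int lastSize = last[1] - last[0] + 1;
    int firstSize = first[1] - first[0] + 1;
    boolean contiguous = last[1] + 1 == first[0];
    if (lastActive == firstActive && contiguous && lastSize + firstSize <= maxRegionSize)
      return new int[] {last[0], first[1]}; // single merged region spanning both
    return null;                            // no merge; keep the two regions separate
  }

  public static void main(String[] args) {
    int[] merged = maybeMerge(new int[] {100, 149}, true, new int[] {150, 179}, true, 100);
    System.out.println(java.util.Arrays.toString(merged)); // [100, 179]
  }
}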
private double scoreReadAgainstHaplotype(
    final PileupElement p, final int contextSize, final Haplotype haplotype, final int locus) {
  double expected = 0.0;
  double mismatches = 0.0;

  // What's the expected mismatch rate under the model that this read is actually sampled from
  // this haplotype? Let's assume the consensus base c is a random choice of one of A, C, G, or T,
  // and that the observed base is actually from a c with an error rate e. Since e is the rate at
  // which we'd see a miscalled c, the expected mismatch rate is really e. So the expected number
  // of mismatches is just sum_i e_i for i from 1..n for n sites.
  //
  // Now, what's the probabilistic sum of mismatches? Suppose that the base b is equal to c. Well,
  // it could actually be a miscall in a matching direction, which would happen at an e / 3 rate.
  // If b != c, then the chance that it is actually a mismatch is 1 - e, since any of the other 3
  // options would be a mismatch. So the probability-weighted mismatch rate is
  // sum_i ( matched ? e_i / 3 : 1 - e_i ) for i = 1 ... n
  final byte[] haplotypeBases = haplotype.getBases();
  final GATKSAMRecord read = p.getRead();
  byte[] readBases = read.getReadBases();
  // Adjust the read bases based on the Cigar string
  readBases = AlignmentUtils.readToAlignmentByteArray(p.getRead().getCigar(), readBases);
  byte[] readQuals = read.getBaseQualities();
  // Shift the location of the qual scores based on the Cigar string
  readQuals = AlignmentUtils.readToAlignmentByteArray(p.getRead().getCigar(), readQuals);
  int readOffsetFromPileup = p.getOffset();
  readOffsetFromPileup =
      AlignmentUtils.calcAlignmentByteArrayOffset(
          p.getRead().getCigar(), p, read.getAlignmentStart(), locus);
  final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1) / 2;

  for (int i = 0; i < contextSize; i++) {
    final int baseOffset = i + baseOffsetStart;
    if (baseOffset < 0) {
      continue;
    }
    if (baseOffset >= readBases.length) {
      break;
    }

    final byte haplotypeBase = haplotypeBases[i];
    final byte readBase = readBases[baseOffset];

    final boolean matched =
        (readBase == haplotypeBase || haplotypeBase == (byte) REGEXP_WILDCARD);
    byte qual = readQuals[baseOffset];
    if (qual == PileupElement.DELETION_BASE) {
      // calcAlignmentByteArrayOffset fills the readQuals array with DELETION_BASE at deletions
      qual = PileupElement.DELETION_QUAL;
    }
    qual = (byte) Math.min((int) qual, p.getMappingQual());
    if (((int) qual) >= 5) {
      // quals less than 5 are used as codes and don't have actual probabilistic meaning behind them
      final double e = QualityUtils.qualToErrorProb(qual);
      expected += e;
      mismatches += matched ? e : 1.0 - e / 3.0;
    }

    // a more sophisticated calculation would include the reference quality, but it's nice to
    // actually penalize the mismatching of poorly determined regions of the consensus
  }

  return mismatches - expected;
}
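A minimal sketch of the scoring arithmetic on plain arrays (no GATK types): per usable position, e = 10^(-qual/10), the expected mismatch count accumulates e, and the probability-weighted count accumulates e for a match and 1 - e/3 for a mismatch, following the code above (the comment's expression differs slightly); quals below 5 are skipped. All names are illustrative.

public class HaplotypeScoreSketch {
  static double score(byte[] readBases, byte[] haplotypeBases, byte[] quals, byte wildcard) {
    double expected = 0.0, mismatches = 0.0;
    int n = Math.min(readBases.length, haplotypeBases.length);
    for (int i = 0; i < n; i++) {
      if (quals[i] < 5) continue; // quals below 5 are treated as codes, not probabilities
      boolean matched = readBases[i] == haplotypeBases[i] || haplotypeBases[i] == wildcard;
      double e = Math.pow(10.0, -quals[i] / 10.0); // Phred qual -> error probability
      expected += e;
      mismatches += matched ? e : 1.0 - e / 3.0;
    }
    return mismatches - expected; // 0 when the read matches the haplotype everywhere
  }

  public static void main(String[] args) {
    byte[] hap = "ACGTACGT".getBytes();
    byte[] match = "ACGTACGT".getBytes();
    byte[] oneMismatch = "ACGTACGA".getBytes();
    byte[] q20 = new byte[8];
    java.util.Arrays.fill(q20, (byte) 20);
    System.out.println(score(match, hap, q20, (byte) '.'));       // 0.0
    System.out.println(score(oneMismatch, hap, q20, (byte) '.')); // ~0.987
  }
}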