private static String createVerboseOutput(final ReadBackedPileup pileup) {
    final StringBuilder sb = new StringBuilder();
    boolean isFirst = true;

    sb.append(pileup.getNumberOfDeletions());
    sb.append(" ");

    for (PileupElement p : pileup) {
        if (isFirst)
            isFirst = false;
        else
            sb.append(",");
        sb.append(p.getRead().getReadName());
        sb.append(verboseDelimiter);
        sb.append(p.getOffset());
        sb.append(verboseDelimiter);
        sb.append(p.getRead().getReadLength());
        sb.append(verboseDelimiter);
        sb.append(p.getRead().getMappingQuality());
    }

    return sb.toString();
}
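// Editor's sketch of the verbose string built above (illustrative values only; assumes
// verboseDelimiter is "@"): a pileup with 2 deletions and two reads might render as
//     2 readA@10@100@60,readB@42@100@37
// i.e. the deletion count, a space, then one readName@offset@readLength@mappingQuality
// token per pileup element, comma-separated.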
private Haplotype getHaplotypeFromRead(final PileupElement p, final int contextSize, final int locus) {
    final GATKSAMRecord read = p.getRead();
    int readOffsetFromPileup = p.getOffset();

    final byte[] haplotypeBases = new byte[contextSize];
    Arrays.fill(haplotypeBases, (byte) REGEXP_WILDCARD);
    final double[] baseQualities = new double[contextSize];
    Arrays.fill(baseQualities, 0.0);

    byte[] readBases = read.getReadBases();
    readBases = AlignmentUtils.readToAlignmentByteArray(read.getCigar(), readBases); // Adjust the read bases based on the Cigar string
    byte[] readQuals = read.getBaseQualities();
    readQuals = AlignmentUtils.readToAlignmentByteArray(read.getCigar(), readQuals); // Shift the location of the qual scores based on the Cigar string

    readOffsetFromPileup = AlignmentUtils.calcAlignmentByteArrayOffset(read.getCigar(), p, read.getAlignmentStart(), locus);
    final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1) / 2;

    for (int i = 0; i < contextSize; i++) {
        final int baseOffset = i + baseOffsetStart;
        if (baseOffset < 0) {
            continue;
        }
        if (baseOffset >= readBases.length) {
            break;
        }

        if (readQuals[baseOffset] == PileupElement.DELETION_BASE) {
            readQuals[baseOffset] = PileupElement.DELETION_QUAL;
        }
        if (!BaseUtils.isRegularBase(readBases[baseOffset])) {
            // N's shouldn't be treated as distinct bases
            readBases[baseOffset] = (byte) REGEXP_WILDCARD;
            readQuals[baseOffset] = (byte) 0;
        }

        readQuals[baseOffset] = (byte) Math.min((int) readQuals[baseOffset], p.getMappingQual());
        // quals less than 5 are used as codes and don't have actual probabilistic meaning behind them
        if (((int) readQuals[baseOffset]) < 5) {
            readQuals[baseOffset] = (byte) 0;
        }

        haplotypeBases[i] = readBases[baseOffset];
        baseQualities[i] = (double) readQuals[baseOffset];
    }

    return new Haplotype(haplotypeBases, baseQualities);
}
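// A worked example of the windowing arithmetic above (editor's illustration): with
// contextSize = 5 and an alignment-array offset of 10, baseOffsetStart = 10 - (5 - 1) / 2 = 8,
// so the loop copies alignment positions 8..12 into the haplotype, centering the window on the
// pileup base. Offsets that fall off either end of the read are simply skipped, leaving the
// REGEXP_WILDCARD / 0-qual placeholders from the Arrays.fill calls in place.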
@Override
protected Double getElementForPileupElement(final PileupElement p) {
    return (double) p.getRead().getMappingQuality();
}
@Override
public T traverse(final ActiveRegionWalker<M, T> walker, final LocusShardDataProvider dataProvider, T sum) {
    logger.debug(String.format("TraverseActiveRegion.traverse: Shard is %s", dataProvider));

    final LocusView locusView = getLocusView(walker, dataProvider);
    final GenomeLocSortedSet initialIntervals = engine.getIntervals();
    final LocusReferenceView referenceView = new LocusReferenceView(walker, dataProvider);
    final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension();
    final int maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion();

    if (locusView.hasNext()) { // trivial optimization to avoid unnecessary processing when there's nothing here at all
        int minStart = Integer.MAX_VALUE;
        ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions());

        ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView);

        // We keep processing while the next reference location is within the interval
        GenomeLoc prevLoc = null;
        while (locusView.hasNext()) {
            final AlignmentContext locus = locusView.next();
            GenomeLoc location = locus.getLocation();

            if (prevLoc != null) {
                // fill in the active / inactive labels from the stop of the previous location to the start of this location
                // TODO refactor to separate function
                for (int iii = prevLoc.getStop() + 1; iii < location.getStart(); iii++) {
                    final GenomeLoc fakeLoc = engine.getGenomeLocParser().createGenomeLoc(prevLoc.getContig(), iii, iii);
                    if (initialIntervals == null || initialIntervals.overlaps(fakeLoc)) {
                        profile.add(fakeLoc, new ActivityProfileResult(
                                walker.hasPresetActiveRegions() && walker.presetActiveRegions.overlaps(fakeLoc) ? 1.0 : 0.0));
                    }
                }
            }

            dataProvider.getShard().getReadMetrics().incrementNumIterations();

            // create reference context. Note that if we have a pileup of "extended events", the context will
            // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
            final ReferenceContext refContext = referenceView.getReferenceContext(location);

            // Iterate forward to get all reference ordered data covering this location
            final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);

            // Call the walker's isActive function for this locus and add the result to the list to be integrated later
            if (initialIntervals == null || initialIntervals.overlaps(location)) {
                profile.add(location, walkerActiveProb(walker, tracker, refContext, locus, location));
            }

            // Grab all the previously unseen reads from this pileup and add them to the massive read list
            for (final PileupElement p : locus.getBasePileup()) {
                final GATKSAMRecord read = p.getRead();
                if (!myReads.contains(read)) {
                    myReads.add(read);
                }

                // If this is the last pileup for this shard, calculate the minimum alignment start so that
                // we know which active regions in the work queue are now safe to process
                minStart = Math.min(minStart, read.getAlignmentStart());
            }

            prevLoc = location;
            printProgress(locus.getLocation());
        }

        updateCumulativeMetrics(dataProvider.getShard());

        // Take the individual isActive calls and integrate them into contiguous active regions and
        // add these blocks of work to the work queue
        // band-pass filter the list of isActive probabilities and turn into active regions
        final ActivityProfile bandPassFiltered = profile.bandPassFilter();
        final List<ActiveRegion> activeRegions = bandPassFiltered.createActiveRegions(activeRegionExtension, maxRegionSize);

        // add active regions to queue of regions to process
        // first check whether we can merge active regions across shard boundaries
        if (!activeRegions.isEmpty()) {
            if (!workQueue.isEmpty()) {
                final ActiveRegion last = workQueue.getLast();
                final ActiveRegion first = activeRegions.get(0);
                if (last.isActive == first.isActive
                        && last.getLocation().contiguousP(first.getLocation())
                        && last.getLocation().size() + first.getLocation().size() <= maxRegionSize) {
                    workQueue.removeLast();
                    activeRegions.remove(first);
                    workQueue.add(new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive,
                            this.engine.getGenomeLocParser(), activeRegionExtension));
                }
            }
            workQueue.addAll(activeRegions);
        }

        logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions.");

        // now go and process all of the active regions
        sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig());
    }

    return sum;
}
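// Editor's sketch of the shard-boundary merge above (hypothetical coordinates): if the last
// queued region is an active chr1:900-1000 (size 101) and the first newly created region is an
// active chr1:1001-1100 (size 100), the two locations are contiguous and their combined size
// (201) would pass the check provided maxRegionSize allows it, so both are replaced by a single
// active region spanning chr1:900-1100 via union(). Regions whose isActive flags differ are
// never merged.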
private double scoreReadAgainstHaplotype(final PileupElement p, final int contextSize, final Haplotype haplotype, final int locus) {
    double expected = 0.0;
    double mismatches = 0.0;

    // What's the expected mismatch rate under the model that this read is actually sampled from
    // this haplotype? Let's assume the consensus base c is a random choice of one of A, C, G, or T,
    // and that the observed base is actually from a c with an error rate e. Since e is the rate at
    // which we'd see a miscalled c, the expected mismatch rate is really e. So the expected number
    // of mismatches is just sum_i e_i for i from 1..n for n sites
    //
    // Now, what's the probabilistic sum of mismatches? Suppose that the base b is equal to c. Well,
    // it could actually be a miscall in a matching direction, which would happen at an e / 3 rate.
    // If b != c, then the chance that it is actually a mismatch is 1 - e, since any of the other 3
    // options would be a mismatch. So the probability-weighted mismatch rate is
    // sum_i ( matched ? e_i / 3 : 1 - e_i ) for i = 1 ... n
    final byte[] haplotypeBases = haplotype.getBases();
    final GATKSAMRecord read = p.getRead();
    byte[] readBases = read.getReadBases();
    readBases = AlignmentUtils.readToAlignmentByteArray(p.getRead().getCigar(), readBases); // Adjust the read bases based on the Cigar string
    byte[] readQuals = read.getBaseQualities();
    readQuals = AlignmentUtils.readToAlignmentByteArray(p.getRead().getCigar(), readQuals); // Shift the location of the qual scores based on the Cigar string
    int readOffsetFromPileup = p.getOffset();
    readOffsetFromPileup = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p, read.getAlignmentStart(), locus);
    final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1) / 2;

    for (int i = 0; i < contextSize; i++) {
        final int baseOffset = i + baseOffsetStart;
        if (baseOffset < 0) {
            continue;
        }
        if (baseOffset >= readBases.length) {
            break;
        }

        final byte haplotypeBase = haplotypeBases[i];
        final byte readBase = readBases[baseOffset];

        final boolean matched = (readBase == haplotypeBase || haplotypeBase == (byte) REGEXP_WILDCARD);
        byte qual = readQuals[baseOffset];
        if (qual == PileupElement.DELETION_BASE) {
            qual = PileupElement.DELETION_QUAL; // calcAlignmentByteArrayOffset fills the readQuals array with DELETION_BASE at deletions
        }
        qual = (byte) Math.min((int) qual, p.getMappingQual());
        if (((int) qual) >= 5) { // quals less than 5 are used as codes and don't have actual probabilistic meaning behind them
            final double e = QualityUtils.qualToErrorProb(qual);
            expected += e;
            mismatches += matched ? e : 1.0 - e / 3.0;
        }

        // a more sophisticated calculation would include the reference quality, but it's nice to
        // actually penalize the mismatching of poorly determined regions of the consensus
    }

    return mismatches - expected;
}
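// A worked instance of the scoring above (editor's illustration, following the code as written):
// with QualityUtils.qualToErrorProb(20) == 0.01, a matched base adds 0.01 to both expected and
// mismatches (net contribution 0), while a mismatched base adds 0.01 to expected and
// 1 - 0.01 / 3, about 0.9967, to mismatches (net contribution about 0.9867). The return value
// therefore approximates the number of mismatches in excess of what the base qualities predict,
// so reads truly drawn from the haplotype score near 0.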