/**
 * Hard clips every soft-clipped base in the read.
 *
 * @return a new read without the soft-clipped bases
 */
private GATKSAMRecord hardClipSoftClippedBases() {
    if (read.isEmpty())
        return read;

    int readIndex = 0;
    int cutLeft = -1;           // first position to hard clip (inclusive)
    int cutRight = -1;          // first position to hard clip (inclusive)
    boolean rightTail = false;  // trigger to stop clipping the left tail and start cutting the right tail

    for (CigarElement cigarElement : read.getCigar().getCigarElements()) {
        if (cigarElement.getOperator() == CigarOperator.SOFT_CLIP) {
            if (rightTail) {
                cutRight = readIndex;
            } else {
                cutLeft = readIndex + cigarElement.getLength() - 1;
            }
        } else if (cigarElement.getOperator() != CigarOperator.HARD_CLIP)
            rightTail = true;

        if (cigarElement.getOperator().consumesReadBases())
            readIndex += cigarElement.getLength();
    }

    // It is extremely important that we cut the end first, otherwise the read coordinates change.
    if (cutRight >= 0)
        this.addOp(new ClippingOp(cutRight, read.getReadLength() - 1));
    if (cutLeft >= 0)
        this.addOp(new ClippingOp(0, cutLeft));

    return clipRead(ClippingRepresentation.HARDCLIP_BASES);
}
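/*
 * Illustrative sketch (hypothetical, not part of the GATK source): how the loop above derives the
 * cut indices for a read with CIGAR 2S5M3S. The char[]/int[] CIGAR representation and the printout
 * are assumptions for the example; only the index arithmetic mirrors hardClipSoftClippedBases().
 */
class SoftClipCutIndexSketch {
    public static void main(String[] args) {
        char[] ops = {'S', 'M', 'S'};   // 2S5M3S, a 10-base read
        int[] lens = {2, 5, 3};

        int readIndex = 0, cutLeft = -1, cutRight = -1;
        boolean rightTail = false;
        for (int i = 0; i < ops.length; i++) {
            if (ops[i] == 'S') {
                if (rightTail) cutRight = readIndex;
                else cutLeft = readIndex + lens[i] - 1;
            } else if (ops[i] != 'H') {
                rightTail = true;
            }
            // M, I, S, =, X consume read bases; D, N, H, P do not
            if (ops[i] != 'H' && ops[i] != 'D' && ops[i] != 'N' && ops[i] != 'P')
                readIndex += lens[i];
        }
        // cutLeft == 1 and cutRight == 7: the clipper stacks (7, 9) first, then (0, 1)
        System.out.println("cutLeft=" + cutLeft + " cutRight=" + cutRight);
    }
}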
@Override
public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
    final String readGroupId = read.getReadGroup().getReadGroupId();
    for (int i = 0; i < read.getReadLength(); i++) {
        comparable[i] = readGroupId;
    }
}
private static void runTest(final GATKSAMReadGroupRecord rg, final String expected, final ReadGroupCovariate covariate) {
    GATKSAMRecord read = ReadUtils.createRandomRead(10);
    read.setReadGroup(rg);
    ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1);
    covariate.recordValues(read, readCovariates);
    verifyCovariateArray(readCovariates.getMismatchesKeySet(), expected, covariate);
}
public static List<GATKSAMRecord> hardClipToRegion(final List<GATKSAMRecord> reads, final int refStart, final int refStop) {
    final List<GATKSAMRecord> returnList = new ArrayList<GATKSAMRecord>(reads.size());
    for (final GATKSAMRecord read : reads) {
        final GATKSAMRecord clippedRead = hardClipToRegion(read, refStart, refStop);
        if (!clippedRead.isEmpty()) {
            returnList.add(clippedRead);
        }
    }
    return returnList;
}
/**
 * Hard clips a read using read coordinates.
 *
 * @param start the first base to clip (inclusive)
 * @param stop  the last base to clip (inclusive)
 * @return a new read, without the clipped bases
 */
@Requires({"start >= 0 && stop <= read.getReadLength() - 1",   // start and stop have to be within the read
           "start == 0 || stop == read.getReadLength() - 1"})  // cannot clip the middle of the read
private GATKSAMRecord hardClipByReadCoordinates(int start, int stop) {
    if (read.isEmpty() || (start == 0 && stop == read.getReadLength() - 1))
        return GATKSAMRecord.emptyRead(read);

    this.addOp(new ClippingOp(start, stop));
    return clipRead(ClippingRepresentation.HARDCLIP_BASES);
}
/**
 * Hard clips any leading insertions in the read. Only looks at the beginning of the read, not the end.
 *
 * @return a new read without leading insertions
 */
private GATKSAMRecord hardClipLeadingInsertions() {
    if (read.isEmpty())
        return read;

    for (CigarElement cigarElement : read.getCigar().getCigarElements()) {
        if (cigarElement.getOperator() != CigarOperator.HARD_CLIP
                && cigarElement.getOperator() != CigarOperator.SOFT_CLIP
                && cigarElement.getOperator() != CigarOperator.INSERTION)
            break;
        else if (cigarElement.getOperator() == CigarOperator.INSERTION)
            this.addOp(new ClippingOp(0, cigarElement.getLength() - 1));
    }
    return clipRead(ClippingRepresentation.HARDCLIP_BASES);
}
/**
 * Hard clips the read to the variant region (from refStart to refStop).
 *
 * @param read     the read to be clipped
 * @param refStart the beginning of the variant region (inclusive)
 * @param refStop  the end of the variant region (inclusive)
 * @return the read hard clipped to the variant region
 */
public static GATKSAMRecord hardClipToRegion(final GATKSAMRecord read, final int refStart, final int refStop) {
    final int start = read.getAlignmentStart();
    final int stop = read.getAlignmentEnd();

    // check if the read overlaps the region at all
    if (start <= refStop && stop >= refStart) {
        if (start < refStart && stop > refStop)
            return hardClipBothEndsByReferenceCoordinates(read, refStart - 1, refStop + 1);
        else if (start < refStart)
            return hardClipByReferenceCoordinatesLeftTail(read, refStart - 1);
        else if (stop > refStop)
            return hardClipByReferenceCoordinatesRightTail(read, refStop + 1);
        return read;
    } else
        return GATKSAMRecord.emptyRead(read);
}
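/*
 * Illustrative sketch (hypothetical, not GATK code): the overlap decision made by hardClipToRegion().
 * For a read aligned to [start, stop] and a region [refStart, refStop], it reports which clipping
 * call the method above would dispatch to.
 */
class ClipToRegionDecisionSketch {
    static String decide(int start, int stop, int refStart, int refStop) {
        if (start > refStop || stop < refStart) return "empty read (no overlap)";
        if (start < refStart && stop > refStop) return "clip both tails at " + (refStart - 1) + " and " + (refStop + 1);
        if (start < refStart)                   return "clip left tail at " + (refStart - 1);
        if (stop > refStop)                     return "clip right tail at " + (refStop + 1);
        return "read untouched (fully inside region)";
    }

    public static void main(String[] args) {
        System.out.println(decide(100, 200, 150, 160)); // read spans the region: clip both tails
        System.out.println(decide(100, 155, 150, 160)); // read hangs off the left edge only
        System.out.println(decide(151, 159, 150, 160)); // read fully contained: untouched
    }
}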
/**
 * Hard clips both tails of a read. The left tail goes from the beginning of the read to the 'left'
 * coordinate (inclusive); the right tail goes from the 'right' coordinate (inclusive) to the end of the read.
 *
 * @param left  the coordinate of the last base to be clipped in the left tail (inclusive)
 * @param right the coordinate of the first base to be clipped in the right tail (inclusive)
 * @return a new read, without the clipped bases
 */
@Requires({"left <= right",                      // tails cannot overlap
           "left >= read.getAlignmentStart()",   // coordinate has to be within the mapped read
           "right <= read.getAlignmentEnd()"})   // coordinate has to be within the mapped read
private GATKSAMRecord hardClipBothEndsByReferenceCoordinates(int left, int right) {
    if (read.isEmpty() || left == right)
        return GATKSAMRecord.emptyRead(read);

    GATKSAMRecord leftTailRead = hardClipByReferenceCoordinates(right, -1);

    // After clipping one tail, the consequent hard clipping of adjacent deletions may leave the
    // left cut index outside the read. In that case, clip the read entirely.
    if (left > leftTailRead.getAlignmentEnd())
        return GATKSAMRecord.emptyRead(read);

    ReadClipper clipper = new ReadClipper(leftTailRead);
    return clipper.hardClipByReferenceCoordinatesLeftTail(left);
}
/**
 * Checks if a read contains adaptor sequence. If it does, hard clips it out.
 *
 * <p>Note: to see how a read is checked for adaptor sequence, see ReadUtils.getAdaptorBoundary().
 *
 * @return a new read without adaptor sequence
 */
private GATKSAMRecord hardClipAdaptorSequence() {
    final int adaptorBoundary = ReadUtils.getAdaptorBoundary(read);

    if (adaptorBoundary == ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY || !ReadUtils.isInsideRead(read, adaptorBoundary))
        return read;

    return read.getReadNegativeStrandFlag()
            ? hardClipByReferenceCoordinatesLeftTail(adaptorBoundary)
            : hardClipByReferenceCoordinatesRightTail(adaptorBoundary);
}
/**
 * Clips a read according to the stacked ops and the chosen algorithm.
 *
 * @param algorithm the clipping mode to apply for the stacked operations
 * @return the read with the clipping applied
 */
public GATKSAMRecord clipRead(ClippingRepresentation algorithm) {
    if (ops == null)
        return getRead();

    GATKSAMRecord clippedRead = read;
    for (ClippingOp op : getOps()) {
        final int readLength = clippedRead.getReadLength();

        // check if the clipped read can still be clipped in the requested range
        if (op.start < readLength) {
            ClippingOp fixedOperation = op;
            if (op.stop >= readLength)
                fixedOperation = new ClippingOp(op.start, readLength - 1);

            clippedRead = fixedOperation.apply(algorithm, clippedRead);
        }
    }
    wasClipped = true;
    ops.clear();
    if (clippedRead.isEmpty())
        return GATKSAMRecord.emptyRead(clippedRead);
    return clippedRead;
}
// copied from LocusViewTemplate
protected GATKSAMRecord buildSAMRecord(final String readName, final String contig, final int alignmentStart) {
    GATKSAMRecord record = new GATKSAMRecord(header);
    record.setReadName(readName);
    record.setReferenceIndex(dictionary.getSequenceIndex(contig));
    record.setAlignmentStart(alignmentStart);
    record.setCigarString("1M");
    record.setReadString("A");
    record.setBaseQualityString("A");
    record.setReadGroup(readGroup);
    return record;
}
/**
 * Clips any contiguous tail (left, right or both) with base qualities lower than lowQual, using the
 * desired algorithm.
 *
 * <p>This function looks for low quality tails and clips them away. A low quality tail ends at the
 * first base whose quality is greater than lowQual.
 *
 * @param algorithm the algorithm to use (HardClip, SoftClip, Write N's, ...)
 * @param lowQual   every base with quality lower than or equal to this value in the tail of the read
 *                  will be clipped
 * @return a new read without low quality tails
 */
private GATKSAMRecord clipLowQualEnds(ClippingRepresentation algorithm, byte lowQual) {
    if (read.isEmpty())
        return read;

    final byte[] quals = read.getBaseQualities();
    final int readLength = read.getReadLength();
    int leftClipIndex = 0;
    int rightClipIndex = readLength - 1;

    // check how far we can clip both sides
    while (rightClipIndex >= 0 && quals[rightClipIndex] <= lowQual)
        rightClipIndex--;
    while (leftClipIndex < readLength && quals[leftClipIndex] <= lowQual)
        leftClipIndex++;

    // if the entire read should be clipped, return an empty read
    if (leftClipIndex > rightClipIndex)
        return GATKSAMRecord.emptyRead(read);

    if (rightClipIndex < readLength - 1) {
        this.addOp(new ClippingOp(rightClipIndex + 1, readLength - 1));
    }
    if (leftClipIndex > 0) {
        this.addOp(new ClippingOp(0, leftClipIndex - 1));
    }
    return this.clipRead(algorithm);
}
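/*
 * Illustrative sketch (hypothetical, not GATK code): how clipLowQualEnds() finds the clip boundaries
 * on a quality array. With lowQual = 2, the low quality bases on each end are clipped and the middle
 * [2, 7] survives.
 */
class LowQualEndsSketch {
    public static void main(String[] args) {
        byte[] quals = {2, 1, 30, 25, 40, 33, 28, 30, 2, 2};
        byte lowQual = 2;

        int left = 0, right = quals.length - 1;
        while (right >= 0 && quals[right] <= lowQual) right--;        // right ends up at 7
        while (left < quals.length && quals[left] <= lowQual) left++; // left ends up at 2

        if (left > right) {
            System.out.println("entire read clipped");
        } else {
            // ops stacked by the real method: (right + 1, length - 1) then (0, left - 1)
            System.out.println("keep [" + left + ", " + right + "], clip (8, 9) and (0, 1)");
        }
    }
}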
private Haplotype getHaplotypeFromRead(final PileupElement p, final int contextSize, final int locus) {
    final GATKSAMRecord read = p.getRead();
    int readOffsetFromPileup = p.getOffset();

    final byte[] haplotypeBases = new byte[contextSize];
    Arrays.fill(haplotypeBases, (byte) REGEXP_WILDCARD);
    final double[] baseQualities = new double[contextSize];
    Arrays.fill(baseQualities, 0.0);

    byte[] readBases = read.getReadBases();
    readBases = AlignmentUtils.readToAlignmentByteArray(read.getCigar(), readBases);   // adjust the read bases based on the Cigar string
    byte[] readQuals = read.getBaseQualities();
    readQuals = AlignmentUtils.readToAlignmentByteArray(read.getCigar(), readQuals);   // shift the location of the qual scores based on the Cigar string

    readOffsetFromPileup = AlignmentUtils.calcAlignmentByteArrayOffset(read.getCigar(), p, read.getAlignmentStart(), locus);
    final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1) / 2;

    for (int i = 0; i < contextSize; i++) {
        final int baseOffset = i + baseOffsetStart;
        if (baseOffset < 0) {
            continue;
        }
        if (baseOffset >= readBases.length) {
            break;
        }
        if (readQuals[baseOffset] == PileupElement.DELETION_BASE) {
            readQuals[baseOffset] = PileupElement.DELETION_QUAL;
        }
        if (!BaseUtils.isRegularBase(readBases[baseOffset])) {
            // N's shouldn't be treated as distinct bases
            readBases[baseOffset] = (byte) REGEXP_WILDCARD;
            readQuals[baseOffset] = (byte) 0;
        }
        readQuals[baseOffset] = (byte) Math.min((int) readQuals[baseOffset], p.getMappingQual());
        if (((int) readQuals[baseOffset]) < 5) {
            // quals less than 5 are used as codes and don't have actual probabilistic meaning behind them
            readQuals[baseOffset] = (byte) 0;
        }
        haplotypeBases[i] = readBases[baseOffset];
        baseQualities[i] = (double) readQuals[baseOffset];
    }
    return new Haplotype(haplotypeBases, baseQualities);
}
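/*
 * Illustrative sketch (hypothetical, not GATK code): the window centering used by
 * getHaplotypeFromRead(). For contextSize = 5 and an alignment offset of 10, the context covers
 * alignment-array positions 8..12, so the pileup base sits in the middle of the haplotype.
 */
class ContextWindowSketch {
    public static void main(String[] args) {
        int contextSize = 5;
        int readOffsetFromPileup = 10;
        int baseOffsetStart = readOffsetFromPileup - (contextSize - 1) / 2;  // 10 - 2 = 8
        for (int i = 0; i < contextSize; i++)
            System.out.println("haplotype[" + i + "] <- alignment position " + (i + baseOffsetStart));
    }
}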
/**
 * Generic functionality to hard clip a read, used internally by hardClipByReferenceCoordinatesLeftTail
 * and hardClipByReferenceCoordinatesRightTail. Should not be used directly.
 *
 * <p>Note: it REQUIRES you to give the directionality of your hard clip (i.e. whether you're clipping
 * the left or right tail) by specifying either refStart < 0 or refStop < 0.
 *
 * @param refStart first base to clip (inclusive)
 * @param refStop  last base to clip (inclusive)
 * @return a new read, without the clipped bases
 */
@Requires({"!read.getReadUnmappedFlag()",   // can't handle unmapped reads, as we're using reference coordinates to clip
           "refStart < 0 || refStop < 0"})
protected GATKSAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) {
    if (read.isEmpty())
        return read;

    int start;
    int stop;

    // determine the read coordinates to start and stop hard clipping
    if (refStart < 0) {
        if (refStop < 0)
            throw new ReviewedStingException("Only one of refStart or refStop must be < 0, not both (" + refStart + ", " + refStop + ")");
        start = 0;
        stop = ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop, ReadUtils.ClippingTail.LEFT_TAIL);
    } else {
        if (refStop >= 0)
            throw new ReviewedStingException("Either refStart or refStop must be < 0 (" + refStart + ", " + refStop + ")");
        start = ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart, ReadUtils.ClippingTail.RIGHT_TAIL);
        stop = read.getReadLength() - 1;
    }

    if (start < 0 || stop > read.getReadLength() - 1)
        throw new ReviewedStingException("Trying to clip before the start or after the end of a read");

    if (start > stop)
        throw new ReviewedStingException(String.format("START (%d) > (%d) STOP -- this should never happen, please check read: %s (CIGAR: %s)", start, stop, read, read.getCigarString()));

    if (start > 0 && stop < read.getReadLength() - 1)
        throw new ReviewedStingException(String.format("Trying to clip the middle of the read: start %d, stop %d, cigar: %s", start, stop, read.getCigarString()));

    this.addOp(new ClippingOp(start, stop));
    GATKSAMRecord clippedRead = clipRead(ClippingRepresentation.HARDCLIP_BASES);
    this.ops = null;
    return clippedRead;
}
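/*
 * Illustrative sketch (hypothetical, not GATK code): the sign convention expected by
 * hardClipByReferenceCoordinates(). Exactly one of refStart/refStop must be negative, and the sign
 * tells the method which tail is being clipped.
 */
class ClipDirectionSketch {
    static String direction(int refStart, int refStop) {
        if (refStart < 0 && refStop < 0) throw new IllegalArgumentException("only one coordinate may be < 0");
        if (refStart >= 0 && refStop >= 0) throw new IllegalArgumentException("one coordinate must be < 0");
        return refStart < 0
                ? "left tail: clip read start through reference position " + refStop
                : "right tail: clip reference position " + refStart + " through read end";
    }

    public static void main(String[] args) {
        System.out.println(direction(-1, 1050));  // left-tail clip, as done by hardClipByReferenceCoordinatesLeftTail
        System.out.println(direction(2000, -1));  // right-tail clip, as done by hardClipByReferenceCoordinatesRightTail
    }
}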
/**
 * Turns soft-clipped bases into matches.
 *
 * @return a new read with every soft clip turned into a match
 */
private GATKSAMRecord revertSoftClippedBases() {
    if (read.isEmpty())
        return read;

    this.addOp(new ClippingOp(0, 0));
    return this.clipRead(ClippingRepresentation.REVERT_SOFTCLIPPED_BASES);
}
private ArrayList<Allele> computeConsensusAlleles(ReferenceContext ref,
                                                  Map<String, AlignmentContext> contexts,
                                                  AlignmentContextUtils.ReadOrientation contextType) {
    Allele refAllele = null, altAllele = null;
    GenomeLoc loc = ref.getLocus();
    ArrayList<Allele> aList = new ArrayList<Allele>();

    HashMap<String, Integer> consensusIndelStrings = new HashMap<String, Integer>();

    int insCount = 0, delCount = 0;
    // quick check of total number of indels in pileup
    for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) {
        AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
        final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup();
        insCount += indelPileup.getNumberOfInsertions();
        delCount += indelPileup.getNumberOfDeletions();
    }

    if (insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping)
        return aList;

    for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) {
        // todo -- warning, can be duplicating expensive partition here
        AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
        final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup();

        for (ExtendedEventPileupElement p : indelPileup.toExtendedIterable()) {
            // SAMRecord read = p.getRead();
            GATKSAMRecord read = ReadUtils.hardClipAdaptorSequence(p.getRead());
            if (read == null)
                continue;
            if (ReadUtils.is454Read(read)) {
                continue;
            }

            /*
            if (DEBUG && p.isIndel()) {
                System.out.format("Read: %s, cigar: %s, aln start: %d, aln end: %d, p.len:%d, Type:%s, EventBases:%s\n",
                        read.getReadName(), read.getCigar().toString(), read.getAlignmentStart(), read.getAlignmentEnd(),
                        p.getEventLength(), p.getType().toString(), p.getEventBases());
            }
            */

            String indelString = p.getEventBases();
            if (p.isInsertion()) {
                boolean foundKey = false;
                if (read.getAlignmentEnd() == loc.getStart()) {
                    // first corner condition: a read has an insertion at the end, and we're right at the insertion.
                    // In this case, the read could have any of the inserted bases and we need to build a consensus.
                    for (String s : consensusIndelStrings.keySet()) {
                        int cnt = consensusIndelStrings.get(s);
                        if (s.startsWith(indelString)) {
                            // case 1: current insertion is a prefix of an indel already in the hash map
                            consensusIndelStrings.put(s, cnt + 1);
                            foundKey = true;
                            break;
                        } else if (indelString.startsWith(s)) {
                            // case 2: indel stored in the hash table is a prefix of the current insertion;
                            // the new bases become the new key
                            consensusIndelStrings.remove(s);
                            consensusIndelStrings.put(indelString, cnt + 1);
                            foundKey = true;
                            break;
                        }
                    }
                    if (!foundKey)
                        // none of the above: event bases not supported by the previous table, so add a new key
                        consensusIndelStrings.put(indelString, 1);
                } else if (read.getAlignmentStart() == loc.getStart() + 1) {
                    // opposite corner condition: read starts at the current locus with an insertion
                    for (String s : consensusIndelStrings.keySet()) {
                        int cnt = consensusIndelStrings.get(s);
                        if (s.endsWith(indelString)) {
                            // case 1: current insertion is a suffix of an indel already in the hash map
                            consensusIndelStrings.put(s, cnt + 1);
                            foundKey = true;
                            break;
                        } else if (indelString.endsWith(s)) {
                            // case 2: indel stored in the hash table is a suffix of the current insertion;
                            // the new bases become the new key
                            consensusIndelStrings.remove(s);
                            consensusIndelStrings.put(indelString, cnt + 1);
                            foundKey = true;
                            break;
                        }
                    }
                    if (!foundKey)
                        // none of the above: event bases not supported by the previous table, so add a new key
                        consensusIndelStrings.put(indelString, 1);
                } else {
                    // normal case: insertion somewhere in the middle of a read: add count to hash map
                    int cnt = consensusIndelStrings.containsKey(indelString) ? consensusIndelStrings.get(indelString) : 0;
                    consensusIndelStrings.put(indelString, cnt + 1);
                }
            } else if (p.isDeletion()) {
                indelString = String.format("D%d", p.getEventLength());
                int cnt = consensusIndelStrings.containsKey(indelString) ? consensusIndelStrings.get(indelString) : 0;
                consensusIndelStrings.put(indelString, cnt + 1);
            }
        }

        /*
        if (DEBUG) {
            int icount = indelPileup.getNumberOfInsertions();
            int dcount = indelPileup.getNumberOfDeletions();
            if (icount + dcount > 0) {
                List<Pair<String, Integer>> eventStrings = indelPileup.getEventStringsWithCounts(ref.getBases());
                System.out.format("#ins: %d, #del:%d\n", insCount, delCount);
                for (int i = 0; i < eventStrings.size(); i++) {
                    System.out.format("%s:%d,", eventStrings.get(i).first, eventStrings.get(i).second);
                }
                System.out.println();
            }
        }
        */
    }

    int maxAlleleCnt = 0;
    String bestAltAllele = "";
    for (String s : consensusIndelStrings.keySet()) {
        int curCnt = consensusIndelStrings.get(s);
        if (curCnt > maxAlleleCnt) {
            maxAlleleCnt = curCnt;
            bestAltAllele = s;
        }
        // if (DEBUG)
        //     System.out.format("Key:%s, number: %d\n", s, consensusIndelStrings.get(s));
    }

    if (maxAlleleCnt < minIndelCountForGenotyping)
        return aList;

    if (bestAltAllele.startsWith("D")) {
        // get deletion length
        int dLen = Integer.valueOf(bestAltAllele.substring(1));
        // get the reference bases of the deletion
        int startIdxInReference = (int) (1 + loc.getStart() - ref.getWindow().getStart());
        byte[] refBases = Arrays.copyOfRange(ref.getBases(), startIdxInReference, startIdxInReference + dLen);

        if (Allele.acceptableAlleleBases(refBases)) {
            refAllele = Allele.create(refBases, true);
            altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false);
        }
    } else {
        // insertion case
        if (Allele.acceptableAlleleBases(bestAltAllele)) {
            refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true);
            altAllele = Allele.create(bestAltAllele, false);
        }
    }
    if (refAllele != null && altAllele != null) {
        aList.add(0, refAllele);
        aList.add(1, altAllele);
    }
    return aList;
}
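/*
 * Illustrative sketch (hypothetical, not GATK code): the prefix-merging rule used for insertions
 * observed at the corner condition above. When a read ends in the middle of an insertion, its partial
 * event string is merged with an existing consensus key that extends it (or that it extends), rather
 * than creating a competing key.
 */
class ConsensusInsertionSketch {
    static void addInsertion(java.util.HashMap<String, Integer> consensus, String indelString) {
        for (String s : new java.util.ArrayList<String>(consensus.keySet())) {  // copy keys so removal is safe
            int cnt = consensus.get(s);
            if (s.startsWith(indelString)) {            // new event is a prefix of an existing key
                consensus.put(s, cnt + 1);
                return;
            } else if (indelString.startsWith(s)) {     // existing key is a prefix of the new event
                consensus.remove(s);
                consensus.put(indelString, cnt + 1);
                return;
            }
        }
        consensus.put(indelString, 1);                  // unrelated event: new key
    }

    public static void main(String[] args) {
        java.util.HashMap<String, Integer> consensus = new java.util.HashMap<String, Integer>();
        addInsertion(consensus, "AC");     // {AC=1}
        addInsertion(consensus, "ACGT");   // AC is a prefix of ACGT -> {ACGT=2}
        addInsertion(consensus, "A");      // A is a prefix of ACGT  -> {ACGT=3}
        System.out.println(consensus);
    }
}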
@Test(enabled = false)
public void testCovariateGeneration() {
    final String RGID = "id";
    final int length = 10;

    final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
    GATKSAMRecord read = ReadUtils.createRandomRead(length, false);
    GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(RGID);
    rg.setPlatform("illumina");
    read.setReadGroup(rg);

    final byte[] mQuals = read.getBaseQualities(EventType.BASE_SUBSTITUTION);
    final byte[] iQuals = read.getBaseQualities(EventType.BASE_INSERTION);
    final byte[] dQuals = read.getBaseQualities(EventType.BASE_DELETION);

    ReadGroupCovariate rgCov = new ReadGroupCovariate();
    QualityScoreCovariate qsCov = new QualityScoreCovariate();
    ContextCovariate coCov = new ContextCovariate();
    CycleCovariate cyCov = new CycleCovariate();

    rgCov.initialize(RAC);
    qsCov.initialize(RAC);
    coCov.initialize(RAC);
    cyCov.initialize(RAC);

    Covariate[] requestedCovariates = new Covariate[4];
    requestedCovariates[0] = rgCov;
    requestedCovariates[1] = qsCov;
    requestedCovariates[2] = coCov;
    requestedCovariates[3] = cyCov;

    ReadCovariates rc = RecalDataManager.computeCovariates(read, requestedCovariates);

    // check that the length is correct
    Assert.assertEquals(rc.getMismatchesKeySet().length, length);
    Assert.assertEquals(rc.getInsertionsKeySet().length, length);
    Assert.assertEquals(rc.getDeletionsKeySet().length, length);

    for (int i = 0; i < length; i++) {
        // check that the read group is always the same
        Assert.assertEquals(rgCov.formatKey(rc.getMismatchesKeySet(i)[0]), RGID);
        Assert.assertEquals(rgCov.formatKey(rc.getInsertionsKeySet(i)[0]), RGID);
        Assert.assertEquals(rgCov.formatKey(rc.getDeletionsKeySet(i)[0]), RGID);

        // check the quality score
        Assert.assertEquals(qsCov.formatKey(rc.getMismatchesKeySet(i)[1]), "" + mQuals[i]);
        Assert.assertEquals(qsCov.formatKey(rc.getInsertionsKeySet(i)[1]), "" + iQuals[i]);
        Assert.assertEquals(qsCov.formatKey(rc.getDeletionsKeySet(i)[1]), "" + dQuals[i]);

        // check the context
        Assert.assertEquals(coCov.formatKey(rc.getMismatchesKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.MISMATCHES_CONTEXT_SIZE));
        Assert.assertEquals(coCov.formatKey(rc.getInsertionsKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.INDELS_CONTEXT_SIZE));
        Assert.assertEquals(coCov.formatKey(rc.getDeletionsKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.INDELS_CONTEXT_SIZE));

        // check the cycle
        Assert.assertEquals(cyCov.formatKey(rc.getMismatchesKeySet(i)[3]), "" + (i + 1));
        Assert.assertEquals(cyCov.formatKey(rc.getInsertionsKeySet(i)[3]), "" + (i + 1));
        Assert.assertEquals(cyCov.formatKey(rc.getDeletionsKeySet(i)[3]), "" + (i + 1));
    }
}
@Override
protected Double getElementForRead(final GATKSAMRecord read, final int refLoc) {
    return (double) read.getMappingQuality();
}
@Override
public T traverse(final ActiveRegionWalker<M, T> walker, final LocusShardDataProvider dataProvider, T sum) {
    logger.debug(String.format("TraverseActiveRegion.traverse: Shard is %s", dataProvider));

    final LocusView locusView = getLocusView(walker, dataProvider);
    final GenomeLocSortedSet initialIntervals = engine.getIntervals();

    final LocusReferenceView referenceView = new LocusReferenceView(walker, dataProvider);
    final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension();
    final int maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion();

    if (locusView.hasNext()) { // trivial optimization to avoid unnecessary processing when there's nothing here at all
        int minStart = Integer.MAX_VALUE;
        ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions());

        ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView);

        // We keep processing while the next reference location is within the interval
        GenomeLoc prevLoc = null;
        while (locusView.hasNext()) {
            final AlignmentContext locus = locusView.next();
            GenomeLoc location = locus.getLocation();

            if (prevLoc != null) {
                // fill in the active / inactive labels from the stop of the previous location to the start of this location
                // TODO refactor to separate function
                for (int iii = prevLoc.getStop() + 1; iii < location.getStart(); iii++) {
                    final GenomeLoc fakeLoc = engine.getGenomeLocParser().createGenomeLoc(prevLoc.getContig(), iii, iii);
                    if (initialIntervals == null || initialIntervals.overlaps(fakeLoc)) {
                        profile.add(fakeLoc, new ActivityProfileResult(
                                walker.hasPresetActiveRegions() && walker.presetActiveRegions.overlaps(fakeLoc) ? 1.0 : 0.0));
                    }
                }
            }

            dataProvider.getShard().getReadMetrics().incrementNumIterations();

            // Create the reference context. Note that if we have a pileup of "extended events", the context
            // will hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
            final ReferenceContext refContext = referenceView.getReferenceContext(location);

            // Iterate forward to get all reference ordered data covering this location
            final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);

            // Call the walker's isActive function for this locus and add the result to the list to be integrated later
            if (initialIntervals == null || initialIntervals.overlaps(location)) {
                profile.add(location, walkerActiveProb(walker, tracker, refContext, locus, location));
            }

            // Grab all the previously unseen reads from this pileup and add them to the massive read list
            for (final PileupElement p : locus.getBasePileup()) {
                final GATKSAMRecord read = p.getRead();
                if (!myReads.contains(read)) {
                    myReads.add(read);
                }

                // If this is the last pileup for this shard, calculate the minimum alignment start so that
                // we know which active regions in the work queue are now safe to process
                minStart = Math.min(minStart, read.getAlignmentStart());
            }

            prevLoc = location;
            printProgress(locus.getLocation());
        }

        updateCumulativeMetrics(dataProvider.getShard());

        // Take the individual isActive calls and integrate them into contiguous active regions and
        // add these blocks of work to the work queue:
        // band-pass filter the list of isActive probabilities and turn them into active regions
        final ActivityProfile bandPassFiltered = profile.bandPassFilter();
        final List<ActiveRegion> activeRegions = bandPassFiltered.createActiveRegions(activeRegionExtension, maxRegionSize);

        // add active regions to the queue of regions to process;
        // first check if we can merge active regions over shard boundaries
        if (!activeRegions.isEmpty()) {
            if (!workQueue.isEmpty()) {
                final ActiveRegion last = workQueue.getLast();
                final ActiveRegion first = activeRegions.get(0);
                if (last.isActive == first.isActive
                        && last.getLocation().contiguousP(first.getLocation())
                        && last.getLocation().size() + first.getLocation().size() <= maxRegionSize) {
                    workQueue.removeLast();
                    activeRegions.remove(first);
                    workQueue.add(new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive,
                            this.engine.getGenomeLocParser(), activeRegionExtension));
                }
            }
            workQueue.addAll(activeRegions);
        }

        logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions.");

        // now go and process all of the active regions
        sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig());
    }

    return sum;
}
private double scoreReadAgainstHaplotype(final PileupElement p, final int contextSize, final Haplotype haplotype, final int locus) {
    double expected = 0.0;
    double mismatches = 0.0;

    // What's the expected mismatch rate under the model that this read is actually sampled from
    // this haplotype? Let's assume the consensus base c is a random choice of one of A, C, G, or T,
    // and that the observed base is actually from a c with an error rate e. Since e is the rate at
    // which we'd see a miscalled c, the expected mismatch rate is really e. So the expected number
    // of mismatches is just sum_i e_i for i from 1..n for n sites.
    //
    // Now, what's the probabilistic sum of mismatches? Suppose that the base b is equal to c. Well,
    // it could actually be a miscall in a matching direction, which would happen at an e / 3 rate.
    // If b != c, then the chance that it is actually a mismatch is 1 - e, since any of the other 3
    // options would be a mismatch. So the probability-weighted mismatch rate is
    // sum_i (matched ? e_i / 3 : 1 - e_i) for i = 1 ... n
    final byte[] haplotypeBases = haplotype.getBases();
    final GATKSAMRecord read = p.getRead();

    byte[] readBases = read.getReadBases();
    readBases = AlignmentUtils.readToAlignmentByteArray(p.getRead().getCigar(), readBases);   // adjust the read bases based on the Cigar string
    byte[] readQuals = read.getBaseQualities();
    readQuals = AlignmentUtils.readToAlignmentByteArray(p.getRead().getCigar(), readQuals);   // shift the location of the qual scores based on the Cigar string

    int readOffsetFromPileup = p.getOffset();
    readOffsetFromPileup = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p, read.getAlignmentStart(), locus);
    final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1) / 2;

    for (int i = 0; i < contextSize; i++) {
        final int baseOffset = i + baseOffsetStart;
        if (baseOffset < 0) {
            continue;
        }
        if (baseOffset >= readBases.length) {
            break;
        }

        final byte haplotypeBase = haplotypeBases[i];
        final byte readBase = readBases[baseOffset];

        final boolean matched = (readBase == haplotypeBase || haplotypeBase == (byte) REGEXP_WILDCARD);
        byte qual = readQuals[baseOffset];
        if (qual == PileupElement.DELETION_BASE) {
            qual = PileupElement.DELETION_QUAL; // calcAlignmentByteArrayOffset fills the readQuals array with DELETION_BASE at deletions
        }
        qual = (byte) Math.min((int) qual, p.getMappingQual());
        if (((int) qual) >= 5) { // quals less than 5 are used as codes and don't have actual probabilistic meaning behind them
            final double e = QualityUtils.qualToErrorProb(qual);
            expected += e;
            mismatches += matched ? e : 1.0 - e / 3.0;
        }

        // a more sophisticated calculation would include the reference quality, but it's nice to
        // actually penalize the mismatching of poorly determined regions of the consensus
    }

    return mismatches - expected;
}
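/*
 * Illustrative sketch (hypothetical, not GATK code) of the score computed above for a 3-base context.
 * It assumes QualityUtils.qualToErrorProb(q) is the standard Phred conversion 10^(-q/10) and mirrors
 * the same (matched ? e : 1 - e/3) weighting and the final (mismatches - expected) score.
 */
class HaplotypeScoreSketch {
    static double qualToErrorProb(int qual) {
        return Math.pow(10.0, -qual / 10.0);
    }

    public static void main(String[] args) {
        byte[] haplotype = {'A', 'C', 'G'};
        byte[] readBases = {'A', 'T', 'G'};   // one mismatch at position 1
        int[] quals = {30, 30, 20};

        double expected = 0.0, mismatches = 0.0;
        for (int i = 0; i < haplotype.length; i++) {
            final double e = qualToErrorProb(quals[i]);
            final boolean matched = readBases[i] == haplotype[i];
            expected += e;                                  // expected mismatches if the read came from this haplotype
            mismatches += matched ? e : 1.0 - e / 3.0;      // probability-weighted observed mismatches
        }
        // a score > 0 means more mismatches than expected under the haplotype model
        System.out.println("score = " + (mismatches - expected));
    }
}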
/**
 * Hard clips away soft-clipped bases that are below the given quality threshold.
 *
 * @param read    the read
 * @param minQual the minimum base quality score to revert the base (inclusive)
 * @return a new read without low quality soft-clipped bases
 */
public static GATKSAMRecord hardClipLowQualitySoftClips(GATKSAMRecord read, byte minQual) {
    int nLeadingSoftClips = read.getAlignmentStart() - read.getSoftStart();
    if (read.isEmpty() || nLeadingSoftClips > read.getReadLength())
        return GATKSAMRecord.emptyRead(read);

    byte[] quals = read.getBaseQualities(EventType.BASE_SUBSTITUTION);
    int left = -1;

    if (nLeadingSoftClips > 0) {
        for (int i = nLeadingSoftClips - 1; i >= 0; i--) {
            if (quals[i] >= minQual)
                left = i;
            else
                break;
        }
    }

    int right = -1;
    int nTailingSoftClips = read.getSoftEnd() - read.getAlignmentEnd();
    if (nTailingSoftClips > 0) {
        for (int i = read.getReadLength() - nTailingSoftClips; i < read.getReadLength(); i++) {
            if (quals[i] >= minQual)
                right = i;
            else
                break;
        }
    }

    GATKSAMRecord clippedRead = read;

    // first hard clip the low quality soft clips on the right tail: only clip if there are soft clipped
    // bases (right >= 0) and the high quality soft clips do not extend to the last base (right + 1 < readLength)
    if (right >= 0 && right + 1 < clippedRead.getReadLength())
        clippedRead = hardClipByReadCoordinates(clippedRead, right + 1, clippedRead.getReadLength() - 1);

    // then hard clip the low quality soft clips on the left tail: only clip if there are soft clipped
    // bases (left >= 0) and the clip range is non-trivial (left - 1 > 0)
    if (left >= 0 && left - 1 > 0)
        clippedRead = hardClipByReadCoordinates(clippedRead, 0, left - 1);

    return clippedRead;
}
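/*
 * Illustrative sketch (hypothetical, not GATK code): how hardClipLowQualitySoftClips() picks the
 * survivors among soft-clipped bases. A 10-base read with 3 leading and 2 trailing soft clips and
 * minQual = 20 keeps the high quality soft clips adjacent to the aligned block and clips the rest.
 */
class LowQualSoftClipSketch {
    public static void main(String[] args) {
        byte[] quals = {5, 10, 30, 32, 35, 40, 38, 33, 28, 4};  // indices 0-2 and 8-9 are soft clipped
        int nLeadingSoftClips = 3, nTailingSoftClips = 2, minQual = 20;

        int left = -1;
        for (int i = nLeadingSoftClips - 1; i >= 0; i--) {       // walk left from the alignment start
            if (quals[i] >= minQual) left = i; else break;
        }

        int right = -1;
        for (int i = quals.length - nTailingSoftClips; i < quals.length; i++) {  // walk right from the alignment end
            if (quals[i] >= minQual) right = i; else break;
        }

        // left == 2, right == 8: the real method would clip (9, 9) on the right and (0, 1) on the left
        System.out.println("left=" + left + " right=" + right);
    }
}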