private Haplotype getHaplotypeFromRead(final PileupElement p, final int contextSize, final int locus) {
    final GATKSAMRecord read = p.getRead();
    int readOffsetFromPileup = p.getOffset();

    final byte[] haplotypeBases = new byte[contextSize];
    Arrays.fill(haplotypeBases, (byte) REGEXP_WILDCARD);
    final double[] baseQualities = new double[contextSize];
    Arrays.fill(baseQualities, 0.0);

    byte[] readBases = read.getReadBases();
    readBases = AlignmentUtils.readToAlignmentByteArray(read.getCigar(), readBases); // Adjust the read bases based on the Cigar string
    byte[] readQuals = read.getBaseQualities();
    readQuals = AlignmentUtils.readToAlignmentByteArray(read.getCigar(), readQuals); // Shift the location of the qual scores based on the Cigar string

    readOffsetFromPileup = AlignmentUtils.calcAlignmentByteArrayOffset(read.getCigar(), p, read.getAlignmentStart(), locus);
    final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1) / 2;

    for (int i = 0; i < contextSize; i++) {
        final int baseOffset = i + baseOffsetStart;
        if (baseOffset < 0) {
            continue;
        }
        if (baseOffset >= readBases.length) {
            break;
        }

        if (readQuals[baseOffset] == PileupElement.DELETION_BASE) {
            readQuals[baseOffset] = PileupElement.DELETION_QUAL;
        }
        if (!BaseUtils.isRegularBase(readBases[baseOffset])) {
            // N's shouldn't be treated as distinct bases
            readBases[baseOffset] = (byte) REGEXP_WILDCARD;
            readQuals[baseOffset] = (byte) 0;
        }

        readQuals[baseOffset] = (byte) Math.min((int) readQuals[baseOffset], p.getMappingQual());
        if (((int) readQuals[baseOffset]) < 5) {
            readQuals[baseOffset] = (byte) 0; // quals less than 5 are used as codes and don't have actual probabilistic meaning behind them
        }

        haplotypeBases[i] = readBases[baseOffset];
        baseQualities[i] = (double) readQuals[baseOffset];
    }

    return new Haplotype(haplotypeBases, baseQualities);
}
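// A minimal, hypothetical sketch (not part of the class above; the method name and plain byte[]
// input are assumptions) isolating the windowing step used by getHaplotypeFromRead: the output
// array is pre-filled with the wildcard byte, and only offsets that fall inside the aligned read
// are copied, so positions the read does not cover keep their wildcard.
private static byte[] extractCenteredWindow(final byte[] alignedBases, final int centerOffset,
                                            final int contextSize, final byte wildcard) {
    final byte[] window = new byte[contextSize];
    Arrays.fill(window, wildcard); // uncovered positions stay as wildcards
    final int start = centerOffset - (contextSize - 1) / 2; // same centering as getHaplotypeFromRead
    for (int i = 0; i < contextSize; i++) {
        final int offset = start + i;
        if (offset < 0)
            continue; // window hangs off the left edge of the read
        if (offset >= alignedBases.length)
            break;    // window hangs off the right edge of the read
        window[i] = alignedBases[offset];
    }
    return window;
}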
/**
 * Hard clips away soft clipped bases that are below the given quality threshold
 *
 * @param read    the read
 * @param minQual the minimum base quality score to revert the base (inclusive)
 * @return a new read without low quality soft clipped bases
 */
public static GATKSAMRecord hardClipLowQualitySoftClips(GATKSAMRecord read, byte minQual) {
    int nLeadingSoftClips = read.getAlignmentStart() - read.getSoftStart();
    if (read.isEmpty() || nLeadingSoftClips > read.getReadLength())
        return GATKSAMRecord.emptyRead(read);

    byte[] quals = read.getBaseQualities(EventType.BASE_SUBSTITUTION);
    int left = -1;

    if (nLeadingSoftClips > 0) {
        for (int i = nLeadingSoftClips - 1; i >= 0; i--) {
            if (quals[i] >= minQual)
                left = i;
            else
                break;
        }
    }

    int right = -1;
    int nTailingSoftClips = read.getSoftEnd() - read.getAlignmentEnd();
    if (nTailingSoftClips > 0) {
        for (int i = read.getReadLength() - nTailingSoftClips; i < read.getReadLength(); i++) {
            if (quals[i] >= minQual)
                right = i;
            else
                break;
        }
    }

    GATKSAMRecord clippedRead = read;

    // only clip if there are soft clipped bases (right >= 0) and the first high quality soft clip is not the last base (right + 1 < readLength)
    if (right >= 0 && right + 1 < clippedRead.getReadLength())
        clippedRead = hardClipByReadCoordinates(clippedRead, right + 1, clippedRead.getReadLength() - 1); // first we hard clip the low quality soft clips on the right tail

    // only clip if there are soft clipped bases (left >= 0) and the first high quality soft clip is not the first base (left - 1 > 0)
    if (left >= 0 && left - 1 > 0)
        clippedRead = hardClipByReadCoordinates(clippedRead, 0, left - 1); // then we hard clip the low quality soft clips on the left tail

    return clippedRead;
}
/**
 * Clips any contiguous tail (left, right or both) with base quality lower than lowQual using the
 * desired algorithm.
 *
 * <p>This function will look for low quality tails and hard clip them away. A low quality tail
 * ends when a base has base quality greater than lowQual.
 *
 * @param algorithm the algorithm to use (HardClip, SoftClip, Write N's, ...)
 * @param lowQual   every base quality lower than or equal to this in the tail of the read will be hard clipped
 * @return a new read without low quality tails
 */
private GATKSAMRecord clipLowQualEnds(ClippingRepresentation algorithm, byte lowQual) {
    if (read.isEmpty())
        return read;

    final byte[] quals = read.getBaseQualities();
    final int readLength = read.getReadLength();
    int leftClipIndex = 0;
    int rightClipIndex = readLength - 1;

    // check how far we can clip both sides
    while (rightClipIndex >= 0 && quals[rightClipIndex] <= lowQual)
        rightClipIndex--;
    while (leftClipIndex < readLength && quals[leftClipIndex] <= lowQual)
        leftClipIndex++;

    // if the entire read should be clipped, then return an empty read.
    if (leftClipIndex > rightClipIndex)
        return GATKSAMRecord.emptyRead(read);

    if (rightClipIndex < readLength - 1) {
        this.addOp(new ClippingOp(rightClipIndex + 1, readLength - 1));
    }
    if (leftClipIndex > 0) {
        this.addOp(new ClippingOp(0, leftClipIndex - 1));
    }
    return this.clipRead(algorithm);
}
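// A small, hypothetical sketch (not GATK API; the helper name is an assumption) isolating the
// boundary scan used by clipLowQualEnds: starting from both ends, walk inward while the base
// quality is <= lowQual; the two surviving indices delimit the bases to keep. Returns
// {leftClipIndex, rightClipIndex}; a left index greater than the right index means the whole
// read would be clipped.
private static int[] findLowQualTailBoundaries(final byte[] quals, final byte lowQual) {
    int leftClipIndex = 0;
    int rightClipIndex = quals.length - 1;
    while (rightClipIndex >= 0 && quals[rightClipIndex] <= lowQual)
        rightClipIndex--; // trim the right tail
    while (leftClipIndex < quals.length && quals[leftClipIndex] <= lowQual)
        leftClipIndex++;  // trim the left tail
    return new int[] { leftClipIndex, rightClipIndex };
}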
@Test(enabled = false)
public void testCovariateGeneration() {
    final String RGID = "id";
    final int length = 10;
    final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();

    GATKSAMRecord read = ReadUtils.createRandomRead(length, false);
    GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(RGID);
    rg.setPlatform("illumina");
    read.setReadGroup(rg);

    final byte[] mQuals = read.getBaseQualities(EventType.BASE_SUBSTITUTION);
    final byte[] iQuals = read.getBaseQualities(EventType.BASE_INSERTION);
    final byte[] dQuals = read.getBaseQualities(EventType.BASE_DELETION);

    ReadGroupCovariate rgCov = new ReadGroupCovariate();
    QualityScoreCovariate qsCov = new QualityScoreCovariate();
    ContextCovariate coCov = new ContextCovariate();
    CycleCovariate cyCov = new CycleCovariate();

    rgCov.initialize(RAC);
    qsCov.initialize(RAC);
    coCov.initialize(RAC);
    cyCov.initialize(RAC);

    Covariate[] requestedCovariates = new Covariate[4];
    requestedCovariates[0] = rgCov;
    requestedCovariates[1] = qsCov;
    requestedCovariates[2] = coCov;
    requestedCovariates[3] = cyCov;

    ReadCovariates rc = RecalDataManager.computeCovariates(read, requestedCovariates);

    // check that the length is correct
    Assert.assertEquals(rc.getMismatchesKeySet().length, length);
    Assert.assertEquals(rc.getInsertionsKeySet().length, length);
    Assert.assertEquals(rc.getDeletionsKeySet().length, length);

    for (int i = 0; i < length; i++) {
        // check that read group is always the same
        Assert.assertEquals(rgCov.formatKey(rc.getMismatchesKeySet(i)[0]), RGID);
        Assert.assertEquals(rgCov.formatKey(rc.getInsertionsKeySet(i)[0]), RGID);
        Assert.assertEquals(rgCov.formatKey(rc.getDeletionsKeySet(i)[0]), RGID);

        // check quality score
        Assert.assertEquals(qsCov.formatKey(rc.getMismatchesKeySet(i)[1]), "" + mQuals[i]);
        Assert.assertEquals(qsCov.formatKey(rc.getInsertionsKeySet(i)[1]), "" + iQuals[i]);
        Assert.assertEquals(qsCov.formatKey(rc.getDeletionsKeySet(i)[1]), "" + dQuals[i]);

        // check context
        Assert.assertEquals(coCov.formatKey(rc.getMismatchesKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.MISMATCHES_CONTEXT_SIZE));
        Assert.assertEquals(coCov.formatKey(rc.getInsertionsKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.INDELS_CONTEXT_SIZE));
        Assert.assertEquals(coCov.formatKey(rc.getDeletionsKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.INDELS_CONTEXT_SIZE));

        // check cycle
        Assert.assertEquals(cyCov.formatKey(rc.getMismatchesKeySet(i)[3]), "" + (i + 1));
        Assert.assertEquals(cyCov.formatKey(rc.getInsertionsKeySet(i)[3]), "" + (i + 1));
        Assert.assertEquals(cyCov.formatKey(rc.getDeletionsKeySet(i)[3]), "" + (i + 1));
    }
}
private double scoreReadAgainstHaplotype(final PileupElement p, final int contextSize, final Haplotype haplotype, final int locus) {
    double expected = 0.0;
    double mismatches = 0.0;

    // What's the expected mismatch rate under the model that this read is actually sampled from
    // this haplotype? Let's assume the consensus base c is a random choice of one of A, C, G, or T,
    // and that the observed base is actually from a c with an error rate e. Since e is the rate at
    // which we'd see a miscalled c, the expected mismatch rate is really e. So the expected number
    // of mismatches is just sum_i e_i for i from 1..n for n sites
    //
    // Now, what's the probabilistic sum of mismatches? Suppose that the base b is equal to c. Well,
    // it could actually be a miscall in a matching direction, which would happen at an e / 3 rate.
    // If b != c, then the chance that it is actually a mismatch is 1 - e, since any of the other 3
    // options would be a mismatch. So the probability-weighted mismatch rate is
    // sum_i ( matched ? e_i / 3 : 1 - e_i ) for i = 1 ... n
    final byte[] haplotypeBases = haplotype.getBases();
    final GATKSAMRecord read = p.getRead();
    byte[] readBases = read.getReadBases();
    readBases = AlignmentUtils.readToAlignmentByteArray(p.getRead().getCigar(), readBases); // Adjust the read bases based on the Cigar string
    byte[] readQuals = read.getBaseQualities();
    readQuals = AlignmentUtils.readToAlignmentByteArray(p.getRead().getCigar(), readQuals); // Shift the location of the qual scores based on the Cigar string

    int readOffsetFromPileup = p.getOffset();
    readOffsetFromPileup = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p, read.getAlignmentStart(), locus);
    final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1) / 2;

    for (int i = 0; i < contextSize; i++) {
        final int baseOffset = i + baseOffsetStart;
        if (baseOffset < 0) {
            continue;
        }
        if (baseOffset >= readBases.length) {
            break;
        }

        final byte haplotypeBase = haplotypeBases[i];
        final byte readBase = readBases[baseOffset];

        final boolean matched = (readBase == haplotypeBase || haplotypeBase == (byte) REGEXP_WILDCARD);
        byte qual = readQuals[baseOffset];
        if (qual == PileupElement.DELETION_BASE) {
            qual = PileupElement.DELETION_QUAL; // calcAlignmentByteArrayOffset fills the readQuals array with DELETION_BASE at deletions
        }
        qual = (byte) Math.min((int) qual, p.getMappingQual());

        if (((int) qual) >= 5) { // quals less than 5 are used as codes and don't have actual probabilistic meaning behind them
            final double e = QualityUtils.qualToErrorProb(qual);
            expected += e;
            mismatches += matched ? e : 1.0 - e / 3.0;
        }

        // a more sophisticated calculation would include the reference quality, but it's nice to
        // actually penalize the mismatching of poorly determined regions of the consensus
    }

    return mismatches - expected;
}
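// A standalone, hypothetical sketch (the helper name and plain byte[] inputs are assumptions) of
// the score computed above: accumulate the expected error rate sum_i e_i, where e = 10^(-Q/10) is
// the phred-scaled error probability, and the probability-weighted observed mismatches, then
// return observed - expected. Wildcard haplotype positions count as matches, and the per-site
// weighting mirrors the matched/mismatched branch in scoreReadAgainstHaplotype.
private static double mismatchScore(final byte[] haplotypeBases, final byte[] readBases,
                                    final byte[] readQuals, final byte wildcard) {
    double expected = 0.0;
    double mismatches = 0.0;
    final int n = Math.min(haplotypeBases.length, readBases.length);
    for (int i = 0; i < n; i++) {
        final boolean matched = readBases[i] == haplotypeBases[i] || haplotypeBases[i] == wildcard;
        final double e = Math.pow(10.0, -readQuals[i] / 10.0); // phred qual -> error probability
        expected += e;
        mismatches += matched ? e : 1.0 - e / 3.0; // same weighting as the method above
    }
    return mismatches - expected;
}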