/**
 * Checks if a read contains adaptor sequences. If it does, hard clips them out.
 *
 * <p>Note: To see how a read is checked for adaptor sequence see ReadUtils.getAdaptorBoundary()
 *
 * @return a new read without adaptor sequence
 */
private GATKSAMRecord hardClipAdaptorSequence() {
  final int adaptorBoundary = ReadUtils.getAdaptorBoundary(read);

  // Nothing to clip when the boundary cannot be computed or lies outside the read.
  if (adaptorBoundary == ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY
      || !ReadUtils.isInsideRead(read, adaptorBoundary)) {
    return read;
  }

  // Negative-strand reads carry adaptor sequence on the left tail; positive-strand on the right.
  if (read.getReadNegativeStrandFlag()) {
    return hardClipByReferenceCoordinatesLeftTail(adaptorBoundary);
  }
  return hardClipByReferenceCoordinatesRightTail(adaptorBoundary);
}
/** * Generic functionality to hard clip a read, used internally by * hardClipByReferenceCoordinatesLeftTail and hardClipByReferenceCoordinatesRightTail. Should not * be used directly. * * <p>Note, it REQUIRES you to give the directionality of your hard clip (i.e. whether you're * clipping the left of right tail) by specifying either refStart < 0 or refStop < 0. * * @param refStart first base to clip (inclusive) * @param refStop last base to clip (inclusive) * @return a new read, without the clipped bases */ @Requires({ "!read.getReadUnmappedFlag()", "refStart < 0 || refStop < 0" }) // can't handle unmapped reads, as we're using reference coordinates to clip protected GATKSAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) { if (read.isEmpty()) return read; int start; int stop; // Determine the read coordinate to start and stop hard clipping if (refStart < 0) { if (refStop < 0) throw new ReviewedStingException( "Only one of refStart or refStop must be < 0, not both (" + refStart + ", " + refStop + ")"); start = 0; stop = ReadUtils.getReadCoordinateForReferenceCoordinate( read, refStop, ReadUtils.ClippingTail.LEFT_TAIL); } else { if (refStop >= 0) throw new ReviewedStingException( "Either refStart or refStop must be < 0 (" + refStart + ", " + refStop + ")"); start = ReadUtils.getReadCoordinateForReferenceCoordinate( read, refStart, ReadUtils.ClippingTail.RIGHT_TAIL); stop = read.getReadLength() - 1; } if (start < 0 || stop > read.getReadLength() - 1) throw new ReviewedStingException( "Trying to clip before the start or after the end of a read"); if (start > stop) throw new ReviewedStingException( String.format( "START (%d) > (%d) STOP -- this should never happen, please check read: %s (CIGAR: %s)", start, stop, read, read.getCigarString())); if (start > 0 && stop < read.getReadLength() - 1) throw new ReviewedStingException( String.format( "Trying to clip the middle of the read: start %d, stop %d, cigar: %s", start, stop, read.getCigarString())); 
this.addOp(new ClippingOp(start, stop)); GATKSAMRecord clippedRead = clipRead(ClippingRepresentation.HARDCLIP_BASES); this.ops = null; return clippedRead; }
@Test(enabled = true)
public void testGetMaxReadLength() {
  // For every (min, max) pair, a list containing one read of each length in [min, max]
  // must report max as its maximum read length.
  for (final int minLength : Arrays.asList(5, 30, 50)) {
    for (final int maxLength : Arrays.asList(50, 75, 100)) {
      final List<GATKSAMRecord> readsOfEachLength = new ArrayList<GATKSAMRecord>();
      for (int currentLength = minLength; currentLength <= maxLength; currentLength++) {
        readsOfEachLength.add(ReadUtils.createRandomRead(currentLength));
      }
      Assert.assertEquals(
          ReadUtils.getMaxReadLength(readsOfEachLength), maxLength, "max length does not match");
    }
  }

  // Edge case: an empty collection must report a maximum length of zero rather than fail.
  final List<GATKSAMRecord> noReads = new LinkedList<GATKSAMRecord>();
  Assert.assertEquals(
      ReadUtils.getMaxReadLength(noReads), 0, "Empty list should have max length of zero");
}
/**
 * Builds a random 10bp read tagged with the given read group, records the covariate's values
 * over it, and verifies the mismatch key set against the expected string.
 */
private static void runTest(
    final GATKSAMReadGroupRecord rg, final String expected, final ReadGroupCovariate covariate) {
  final GATKSAMRecord syntheticRead = ReadUtils.createRandomRead(10);
  syntheticRead.setReadGroup(rg);

  final ReadCovariates recordedKeys = new ReadCovariates(syntheticRead.getReadLength(), 1);
  covariate.recordValues(syntheticRead, recordedKeys);

  verifyCovariateArray(recordedKeys.getMismatchesKeySet(), expected, covariate);
}
@Test(enabled = true)
public void testGetBasesReverseComplement() {
  final Random random = GenomeAnalysisEngine.getRandomGenerator();

  // Round-trip property: complementing each char of the reverse complement, written in
  // reverse order, must reconstruct the original read bases exactly.
  for (int iteration = 0; iteration < 1000; iteration++) {
    final int readLength = random.nextInt(1000);
    final GATKSAMRecord read = GATKSAMRecord.createRandomRead(readLength);
    final byte[] originalBases = read.getReadBases();

    final String revComp = ReadUtils.getBasesReverseComplement(read);
    final byte[] roundTripped = new byte[readLength];
    for (int i = 0; i < readLength; i++) {
      roundTripped[readLength - 1 - i] = BaseUtils.getComplement((byte) revComp.charAt(i));
    }

    Assert.assertEquals(roundTripped, originalBases);
  }
}
@Test(enabled = true)
public void testReadWithNsRefAfterDeletion() throws FileNotFoundException {
  final IndexedFastaSequenceFile seq =
      new CachingIndexedFastaSequenceFile(new File(b37KGReference));
  final SAMFileHeader header =
      ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary());

  // Build a 76bp all-'A' read whose cigar places a 1-base deletion immediately after a
  // 414-base reference skip (3M 414N 1D 73M).
  final int readLength = 76;
  final GATKSAMRecord read =
      ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 8975, readLength);
  read.setReadBases(Utils.dupBytes((byte) 'A', readLength));
  read.setBaseQualities(Utils.dupBytes((byte) 30, readLength));
  read.setCigarString("3M414N1D73M");

  // Reference position 9393 falls in the N/D region; the left-tail lookup must resolve to
  // read coordinate 3 (the first base of the 73M block).
  final int readCoordinate =
      ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(
          read, 9393, ReadUtils.ClippingTail.LEFT_TAIL);
  Assert.assertEquals(readCoordinate, 3);
}
/**
 * Builds a consensus ref/alt allele pair for an indel locus from the extended-event pileups of
 * the given sample contexts.
 *
 * <p>Counts the occurrences of each distinct indel event string across all samples (insertions
 * keyed by their inserted bases, deletions keyed as "D&lt;length&gt;"), picks the most frequent
 * one, and converts it into a (ref, alt) allele pair.
 *
 * @param ref the reference context at the locus (supplies locus and reference window bases)
 * @param contexts per-sample alignment contexts to stratify and scan for indel events
 * @param contextType read orientation used when stratifying each sample's context
 * @return a list containing [refAllele, altAllele] for the winning indel, or an empty list when
 *     the total indel count or the winning count is below minIndelCountForGenotyping, or when
 *     the winning allele's bases are not acceptable
 */
private ArrayList<Allele> computeConsensusAlleles(
    ReferenceContext ref,
    Map<String, AlignmentContext> contexts,
    AlignmentContextUtils.ReadOrientation contextType) {
  Allele refAllele = null, altAllele = null;
  GenomeLoc loc = ref.getLocus();
  ArrayList<Allele> aList = new ArrayList<Allele>();

  // Map from indel event string -> number of supporting reads across all samples.
  HashMap<String, Integer> consensusIndelStrings = new HashMap<String, Integer>();

  int insCount = 0, delCount = 0;
  // quick check of total number of indels in pileup
  for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) {
    AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
    final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup();
    insCount += indelPileup.getNumberOfInsertions();
    delCount += indelPileup.getNumberOfDeletions();
  }

  // Bail out early when neither event type reaches the genotyping threshold.
  if (insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping)
    return aList;

  for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) {
    // todo -- warning, can be duplicating expensive partition here
    AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);

    final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup();

    for (ExtendedEventPileupElement p : indelPileup.toExtendedIterable()) {
      // SAMRecord read = p.getRead();
      GATKSAMRecord read = ReadUtils.hardClipAdaptorSequence(p.getRead());
      if (read == null) continue;
      // 454 reads are excluded from indel consensus building.
      if (ReadUtils.is454Read(read)) {
        continue;
      }

      /* if (DEBUG && p.isIndel()) { System.out.format("Read: %s, cigar: %s, aln start: %d, aln end: %d, p.len:%d, Type:%s, EventBases:%s\n", read.getReadName(),read.getCigar().toString(),read.getAlignmentStart(),read.getAlignmentEnd(), p.getEventLength(),p.getType().toString(), p.getEventBases()); } */

      String indelString = p.getEventBases();
      if (p.isInsertion()) {
        boolean foundKey = false;
        if (read.getAlignmentEnd() == loc.getStart()) {
          // first corner condition: a read has an insertion at the end, and we're right at the
          // insertion.
          // In this case, the read could have any of the inserted bases and we need to build a
          // consensus
          // NOTE(review): the two corner-condition loops below structurally modify the map
          // (remove + put) while iterating its keySet; this avoids a
          // ConcurrentModificationException only because of the immediate break — fragile,
          // handle with care if restructuring.
          for (String s : consensusIndelStrings.keySet()) {
            int cnt = consensusIndelStrings.get(s);
            if (s.startsWith(indelString)) {
              // case 1: current insertion is prefix of indel in hash map
              consensusIndelStrings.put(s, cnt + 1);
              foundKey = true;
              break;
            } else if (indelString.startsWith(s)) {
              // case 2: indel stored in hash table is prefix of current insertion
              // In this case, new bases are new key.
              consensusIndelStrings.remove(s);
              consensusIndelStrings.put(indelString, cnt + 1);
              foundKey = true;
              break;
            }
          }
          if (!foundKey)
            // none of the above: event bases not supported by previous table, so add new key
            consensusIndelStrings.put(indelString, 1);
        } else if (read.getAlignmentStart() == loc.getStart() + 1) {
          // opposite corner condition: read will start at current locus with an insertion
          for (String s : consensusIndelStrings.keySet()) {
            int cnt = consensusIndelStrings.get(s);
            if (s.endsWith(indelString)) {
              // case 1: current insertion is suffix of indel in hash map
              consensusIndelStrings.put(s, cnt + 1);
              foundKey = true;
              break;
            } else if (indelString.endsWith(s)) {
              // case 2: indel stored in hash table is suffix of current insertion
              // In this case, new bases are new key.
              consensusIndelStrings.remove(s);
              consensusIndelStrings.put(indelString, cnt + 1);
              foundKey = true;
              break;
            }
          }
          if (!foundKey)
            // none of the above: event bases not supported by previous table, so add new key
            consensusIndelStrings.put(indelString, 1);
        } else {
          // normal case: insertion somewhere in the middle of a read: add count to hash map
          int cnt =
              consensusIndelStrings.containsKey(indelString)
                  ? consensusIndelStrings.get(indelString)
                  : 0;
          consensusIndelStrings.put(indelString, cnt + 1);
        }
      } else if (p.isDeletion()) {
        // Deletions are keyed by length only, e.g. "D5".
        indelString = String.format("D%d", p.getEventLength());
        int cnt =
            consensusIndelStrings.containsKey(indelString)
                ? consensusIndelStrings.get(indelString)
                : 0;
        consensusIndelStrings.put(indelString, cnt + 1);
      }
    }

    /* if (DEBUG) { int icount = indelPileup.getNumberOfInsertions(); int dcount = indelPileup.getNumberOfDeletions(); if (icount + dcount > 0) { List<Pair<String,Integer>> eventStrings = indelPileup.getEventStringsWithCounts(ref.getBases()); System.out.format("#ins: %d, #del:%d\n", insCount, delCount); for (int i=0 ; i < eventStrings.size() ; i++ ) { System.out.format("%s:%d,",eventStrings.get(i).first,eventStrings.get(i).second); // int k=0; } System.out.println(); } } */
  }

  // Pick the most frequently supported event string as the candidate alt allele.
  int maxAlleleCnt = 0;
  String bestAltAllele = "";
  for (String s : consensusIndelStrings.keySet()) {
    int curCnt = consensusIndelStrings.get(s);
    if (curCnt > maxAlleleCnt) {
      maxAlleleCnt = curCnt;
      bestAltAllele = s;
    }
    // if (DEBUG)
    //     System.out.format("Key:%s, number: %d\n",s,consensusIndelStrings.get(s) );
  }
  // gdebug-
  if (maxAlleleCnt < minIndelCountForGenotyping) return aList;

  if (bestAltAllele.startsWith("D")) {
    // get deletion length
    int dLen = Integer.valueOf(bestAltAllele.substring(1));
    // get ref bases of accurate deletion
    int startIdxInReference = (int) (1 + loc.getStart() - ref.getWindow().getStart());
    // System.out.println(new String(ref.getBases()));
    byte[] refBases =
        Arrays.copyOfRange(ref.getBases(), startIdxInReference, startIdxInReference + dLen);

    if (Allele.acceptableAlleleBases(refBases)) {
      refAllele = Allele.create(refBases, true);
      altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false);
    }
  } else {
    // insertion case
    if (Allele.acceptableAlleleBases(bestAltAllele)) {
      refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true);
      altAllele = Allele.create(bestAltAllele, false);
    }
  }
  // Only emit a result when both alleles were successfully constructed.
  if (refAllele != null && altAllele != null) {
    aList.add(0, refAllele);
    aList.add(1, altAllele);
  }
  return aList;
}
/**
 * Verifies ReadUtils.hasWellDefinedFragmentSize against each case supplied by the
 * HasWellDefinedFragmentSizeData provider.
 *
 * <p>Fix: this method was declared {@code private}. TestNG only invokes public test methods, so
 * the annotated test was silently never run; it is now {@code public} so it actually executes.
 *
 * @param name descriptive name of the test case (provider-supplied, used for reporting)
 * @param read the read under test
 * @param expected the expected result of hasWellDefinedFragmentSize for this read
 */
@Test(dataProvider = "HasWellDefinedFragmentSizeData")
public void testHasWellDefinedFragmentSize(
    final String name, final GATKSAMRecord read, final boolean expected) {
  Assert.assertEquals(ReadUtils.hasWellDefinedFragmentSize(read), expected);
}
@Test(enabled = false) public void testCovariateGeneration() { final String RGID = "id"; final int length = 10; final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); GATKSAMRecord read = ReadUtils.createRandomRead(length, false); GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(RGID); rg.setPlatform("illumina"); read.setReadGroup(rg); final byte[] mQuals = read.getBaseQualities(EventType.BASE_SUBSTITUTION); final byte[] iQuals = read.getBaseQualities(EventType.BASE_INSERTION); final byte[] dQuals = read.getBaseQualities(EventType.BASE_DELETION); ReadGroupCovariate rgCov = new ReadGroupCovariate(); QualityScoreCovariate qsCov = new QualityScoreCovariate(); ContextCovariate coCov = new ContextCovariate(); CycleCovariate cyCov = new CycleCovariate(); rgCov.initialize(RAC); qsCov.initialize(RAC); coCov.initialize(RAC); cyCov.initialize(RAC); Covariate[] requestedCovariates = new Covariate[4]; requestedCovariates[0] = rgCov; requestedCovariates[1] = qsCov; requestedCovariates[2] = coCov; requestedCovariates[3] = cyCov; ReadCovariates rc = RecalDataManager.computeCovariates(read, requestedCovariates); // check that the length is correct Assert.assertEquals(rc.getMismatchesKeySet().length, length); Assert.assertEquals(rc.getInsertionsKeySet().length, length); Assert.assertEquals(rc.getDeletionsKeySet().length, length); for (int i = 0; i < length; i++) { // check that read group is always the same Assert.assertEquals(rgCov.formatKey(rc.getMismatchesKeySet(i)[0]), RGID); Assert.assertEquals(rgCov.formatKey(rc.getInsertionsKeySet(i)[0]), RGID); Assert.assertEquals(rgCov.formatKey(rc.getDeletionsKeySet(i)[0]), RGID); // check quality score Assert.assertEquals(qsCov.formatKey(rc.getMismatchesKeySet(i)[1]), "" + mQuals[i]); Assert.assertEquals(qsCov.formatKey(rc.getInsertionsKeySet(i)[1]), "" + iQuals[i]); Assert.assertEquals(qsCov.formatKey(rc.getDeletionsKeySet(i)[1]), "" + dQuals[i]); // check context Assert.assertEquals( 
coCov.formatKey(rc.getMismatchesKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.MISMATCHES_CONTEXT_SIZE)); Assert.assertEquals( coCov.formatKey(rc.getInsertionsKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.INDELS_CONTEXT_SIZE)); Assert.assertEquals( coCov.formatKey(rc.getDeletionsKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.INDELS_CONTEXT_SIZE)); // check cycle Assert.assertEquals(cyCov.formatKey(rc.getMismatchesKeySet(i)[3]), "" + (i + 1)); Assert.assertEquals(cyCov.formatKey(rc.getInsertionsKeySet(i)[3]), "" + (i + 1)); Assert.assertEquals(cyCov.formatKey(rc.getDeletionsKeySet(i)[3]), "" + (i + 1)); } }
/**
 * Creates an artificial read of the given length with random qualities and random bases,
 * aligned under a fully-matching cigar (e.g. "10M").
 *
 * @param length the number of bases in the read
 * @param allowNs whether 'N' may appear among the random bases
 * @return a new artificial read
 */
public static GATKSAMRecord createRandomRead(int length, boolean allowNs) {
  final byte[] randomQuals = ReadUtils.createRandomReadQuals(length);
  final byte[] randomBases = ReadUtils.createRandomReadBases(length, allowNs);
  return ArtificialSAMUtils.createArtificialRead(
      randomBases, randomQuals, randomBases.length + "M");
}