/** * Returns the coverage distribution of a single read within the desired region. * * <p>Note: This function counts DELETIONS as coverage (since the main purpose is to downsample * reads for variant regions, and deletions count as variants) * * @param read the read to get the coverage distribution of * @param startLocation the first reference coordinate of the region (inclusive) * @param stopLocation the last reference coordinate of the region (inclusive) * @return an array with the coverage of each position from startLocation to stopLocation */ public static int[] getCoverageDistributionOfRead( GATKSAMRecord read, int startLocation, int stopLocation) { int[] coverage = new int[stopLocation - startLocation + 1]; int refLocation = read.getSoftStart(); for (CigarElement cigarElement : read.getCigar().getCigarElements()) { switch (cigarElement.getOperator()) { case S: case M: case EQ: case N: case X: case D: for (int i = 0; i < cigarElement.getLength(); i++) { if (refLocation >= startLocation && refLocation <= stopLocation) { int baseCount = read.isReducedRead() ? read.getReducedCount(refLocation - read.getSoftStart()) : 1; coverage[refLocation - startLocation] += baseCount; // this may be a reduced read, so add the proper number of bases } refLocation++; } break; case P: case I: case H: break; } if (refLocation > stopLocation) break; } return coverage; }
/**
 * Maps a reference coordinate to the matching read coordinate.
 *
 * <p>This is a convenience overload: it delegates to the (start, cigar, refCoord, flag) overload
 * using the read's soft start and cigar, passing {@code false} for the final flag.
 *
 * <p>WARNING: if the requested reference coordinate falls inside a deletion in the read, the
 * result holds the last read base before the deletion. The returned pair is
 * (readCoord, fallsInsideDeletion), so callers can decide which read coordinate to use when
 * faced with a deletion.
 *
 * <p>SUGGESTION: use getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int, ClippingTail)
 * to get a result pre-processed for normal clipping needs, or use this method and tailor the
 * behavior yourself.
 *
 * @param read the read whose soft start and cigar define the mapping
 * @param refCoord the reference coordinate to translate
 * @return the read coordinate corresponding to the requested reference coordinate, paired with
 *     whether that coordinate falls inside a deletion (see warning)
 */
@Requires({"refCoord >= read.getSoftStart()", "refCoord <= read.getSoftEnd()"})
@Ensures({"result.getFirst() >= 0", "result.getFirst() < read.getReadLength()"})
public static Pair<Integer, Boolean> getReadCoordinateForReferenceCoordinate(
    GATKSAMRecord read, int refCoord) {
  final int softStart = read.getSoftStart();
  return getReadCoordinateForReferenceCoordinate(softStart, read.getCigar(), refCoord, false);
}
public static int getMeanRepresentativeReadCount(GATKSAMRecord read) { if (!read.isReducedRead()) return 1; // compute mean representative read counts final byte[] counts = read.getReducedReadCounts(); return (int) Math.round((double) MathUtils.sum(counts) / counts.length); }
/**
 * Tells whether a base lies inside the adaptor of a read.
 *
 * <p>Two cases are handled:
 *
 * <ol>
 *   <li>Read on the negative strand: the adaptor boundary is on the left tail, so the base is
 *       inside the adaptor when {@code basePos <= boundary}.
 *   <li>Read on the positive strand: the adaptor boundary is on the right tail, so the base is
 *       inside the adaptor when {@code basePos >= boundary}.
 * </ol>
 *
 * <p>The method answers {@code false} whenever {@code getAdaptorBoundary} yields {@code null} or
 * the inferred insert size is larger than {@code DEFAULT_ADAPTOR_SIZE} (a suspiciously big
 * insert size, probably due to mismapping or a bigger event).
 *
 * <p>NOTE(review): only a {@code null} boundary is treated as "cannot compute"; if
 * {@code getAdaptorBoundary} reports failure through a sentinel value instead, that value is not
 * detected here - confirm against its contract.
 *
 * @param read the read to test
 * @param basePos base position in REFERENCE coordinates (not read coordinates)
 * @return whether or not the base is in the adaptor
 */
public static boolean isBaseInsideAdaptor(final GATKSAMRecord read, long basePos) {
  final Integer adaptorBoundary = getAdaptorBoundary(read);

  if (adaptorBoundary == null) {
    return false;
  }
  if (read.getInferredInsertSize() > DEFAULT_ADAPTOR_SIZE) {
    return false;
  }

  if (read.getReadNegativeStrandFlag()) {
    return basePos <= adaptorBoundary;
  }
  return basePos >= adaptorBoundary;
}
/**
 * Maps a reference coordinate to a read coordinate suitable for clipping the given tail.
 *
 * <p>This overload delegates to the (start, cigar, refCoord, tail, flag) overload using the
 * read's soft start and cigar, passing {@code false} for the final flag. Compared with
 * getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int), the result is pre-processed for
 * two corner cases:
 *
 * <ol>
 *   <li>When clipping the right tail (end of the read) and the coordinate falls inside a
 *       deletion, the base after the deletion is returned. When clipping the left tail
 *       (beginning of the read) nothing changes, because the previous base is already the
 *       default answer.
 *   <li>When clipping the left tail and the read starts with an insertion, requesting the first
 *       read-based coordinate skips the leading insertion (it shares its reference coordinate
 *       with the following base).
 * </ol>
 *
 * @param read the read whose soft start and cigar define the mapping
 * @param refCoord the reference coordinate to translate
 * @param tail which tail of the read is being clipped
 * @return the read coordinate corresponding to the requested reference coordinate for clipping.
 */
@Requires({
  "refCoord >= read.getUnclippedStart()",
  "refCoord <= read.getUnclippedEnd() || (read.getUnclippedEnd() < read.getUnclippedStart())"
})
@Ensures({"result >= 0", "result < read.getReadLength()"})
public static int getReadCoordinateForReferenceCoordinate(
    GATKSAMRecord read, int refCoord, ClippingTail tail) {
  final int softStart = read.getSoftStart();
  return getReadCoordinateForReferenceCoordinate(
      softStart, read.getCigar(), refCoord, tail, false);
}
/**
 * Builds an 8-base artificial read ("ACGTACGT", every quality 30, cigar "8M") flagged as a
 * properly paired read with the requested mate start and inferred insert size.
 *
 * @param fragmentSize value stored as the read's inferred insert size
 * @param mateStart value stored as the mate's alignment start
 * @return the configured artificial read
 */
private GATKSAMRecord makeRead(final int fragmentSize, final int mateStart) {
  final byte[] bases = {'A', 'C', 'G', 'T', 'A', 'C', 'G', 'T'};
  final byte[] quals = new byte[bases.length];
  Arrays.fill(quals, (byte) 30);

  final GATKSAMRecord artificialRead =
      ArtificialSAMUtils.createArtificialRead(bases, quals, "8M");
  artificialRead.setProperPairFlag(true);
  artificialRead.setReadPairedFlag(true);
  artificialRead.setMateAlignmentStart(mateStart);
  artificialRead.setInferredInsertSize(fragmentSize);
  return artificialRead;
}
/**
 * Round-trip check for ReadUtils.getBasesReverseComplement: for 1000 random reads of random
 * length (0 to 999), complementing the returned string base by base while reversing it again
 * must give back the original read bases.
 */
@Test(enabled = true)
public void testGetBasesReverseComplement() {
  final Random random = GenomeAnalysisEngine.getRandomGenerator();

  for (int iteration = 0; iteration < 1000; iteration++) {
    final int length = random.nextInt(1000);
    final GATKSAMRecord read = GATKSAMRecord.createRandomRead(length);
    final byte[] original = read.getReadBases();
    final byte[] reconverted = new byte[length];
    final String revComp = ReadUtils.getBasesReverseComplement(read);

    // undo the transformation: complement each base and write it at the mirrored index
    for (int source = 0, target = length - 1; source < length; source++, target--) {
      reconverted[target] = BaseUtils.getComplement((byte) revComp.charAt(source));
    }

    Assert.assertEquals(reconverted, original);
  }
}
/**
 * Creates a map from each cigar operator present in the read to the read coordinates where it
 * occurs.
 *
 * <p>For an element whose operator consumes read bases, every read coordinate covered by the
 * element is recorded. For any other element, the current read coordinate (the index of the
 * next read base at that point) is recorded once.
 *
 * <p>Example: for the cigar {@code 2S3M1D2M} the result is S -> 0, 1; M -> 2, 3, 4, 5, 6;
 * D -> 5.
 *
 * @param read the read
 * @return a map with the properties described above. See example
 */
public static Map<CigarOperator, ArrayList<Integer>> getCigarOperatorForAllBases(
    GATKSAMRecord read) {
  final Map<CigarOperator, ArrayList<Integer>> events =
      new HashMap<CigarOperator, ArrayList<Integer>>();

  int position = 0; // read coordinate of the next read base
  for (final CigarElement cigarElement : read.getCigar().getCigarElements()) {
    final CigarOperator op = cigarElement.getOperator();

    ArrayList<Integer> list = events.get(op);
    if (list == null) {
      list = new ArrayList<Integer>();
      events.put(op, list);
    }

    if (op.consumesReadBases()) {
      // The counter must start at 0 for every element: starting it at the running read
      // position made every element after the first record too few (or no) coordinates.
      final int length = cigarElement.getLength();
      for (int i = 0; i < length; i++) {
        list.add(position++);
      }
    } else {
      // the element consumes no read bases, so it is anchored at the current position
      list.add(position);
    }
  }

  return events;
}
/**
 * Checks getReadCoordinateForReferenceCoordinateUpToEndOfRead on a 76-base read starting at
 * 8975 with cigar "3M414N1D73M": reference coordinate 9393 with LEFT_TAIL clipping must map to
 * read coordinate 3.
 */
@Test(enabled = true)
public void testReadWithNsRefAfterDeletion() throws FileNotFoundException {
  final File referenceFile = new File(b37KGReference);
  final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(referenceFile);
  final SAMFileHeader header =
      ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary());

  final int readLength = 76;
  final GATKSAMRecord read =
      ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 8975, readLength);
  read.setReadBases(Utils.dupBytes((byte) 'A', readLength));
  read.setBaseQualities(Utils.dupBytes((byte) 30, readLength));
  read.setCigarString("3M414N1D73M");

  final int readCoordinate =
      ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(
          read, 9393, ReadUtils.ClippingTail.LEFT_TAIL);

  Assert.assertEquals(readCoordinate, 3);
}
/**
 * Calculates the reference coordinate for a read coordinate.
 *
 * <p>Starting at the read's alignment start, the cigar is walked element by element. Only
 * elements whose operator consumes reference bases move the location forward and reduce the
 * remaining offset; all other elements are stepped over without changing either value.
 *
 * @param read the read
 * @param offset the base in the read (coordinate in the read)
 * @return the reference coordinate correspondent to this base
 * @throws ReviewedStingException if the offset is greater than the read length, or if the cigar
 *     is exhausted while part of the offset is still unconsumed
 */
public static long getReferenceCoordinateForReadCoordinate(GATKSAMRecord read, int offset) {
  if (offset > read.getReadLength()) {
    throw new ReviewedStingException(
        String.format(OFFSET_OUT_OF_BOUNDS_EXCEPTION, offset, read.getReadLength()));
  }

  long location = read.getAlignmentStart();
  int remaining = offset;

  for (final CigarElement element : read.getCigar().getCigarElements()) {
    if (remaining <= 0) {
      break;
    }
    if (element.getOperator().consumesReferenceBases()) {
      // advance by the whole element, or by what is left of the offset if that is smaller
      final int step = Math.min(element.getLength(), remaining);
      location += step;
      remaining -= step;
    }
  }

  // the loop ran out of cigar elements before the offset was used up
  if (remaining > 0) {
    throw new ReviewedStingException(OFFSET_NOT_ZERO_EXCEPTION);
  }

  return location;
}
/**
 * Classifies the position of a read relative to an interval.
 *
 * <p>The checks are applied in this order:
 *
 * <ol>
 *   <li>different contig: NO_OVERLAP_CONTIG
 *   <li>UNCLIPPED end before the interval / UNCLIPPED start after the interval:
 *       NO_OVERLAP_LEFT / NO_OVERLAP_RIGHT
 *   <li>soft end before the interval / soft start after the interval:
 *       NO_OVERLAP_HARDCLIPPED_LEFT / NO_OVERLAP_HARDCLIPPED_RIGHT
 *   <li>otherwise the soft start and soft end decide between OVERLAP_CONTAINED,
 *       OVERLAP_LEFT_AND_RIGHT, OVERLAP_LEFT and OVERLAP_RIGHT
 * </ol>
 *
 * @param read the read
 * @param interval the interval
 * @return the overlap type as described by ReadAndIntervalOverlap enum (see above)
 */
public static ReadAndIntervalOverlap getReadAndIntervalOverlapType(
    GATKSAMRecord read, GenomeLoc interval) {
  final int softStart = read.getSoftStart();
  final int softStop = read.getSoftEnd();
  final int unclippedStart = read.getUnclippedStart();
  final int unclippedStop = read.getUnclippedEnd();

  if (!read.getReferenceName().equals(interval.getContig())) {
    return ReadAndIntervalOverlap.NO_OVERLAP_CONTIG;
  }

  // even the unclipped read does not reach the interval
  if (unclippedStop < interval.getStart()) {
    return ReadAndIntervalOverlap.NO_OVERLAP_LEFT;
  }
  if (unclippedStart > interval.getStop()) {
    return ReadAndIntervalOverlap.NO_OVERLAP_RIGHT;
  }

  // the unclipped read reaches the interval, but the soft start/end span does not
  if (softStop < interval.getStart()) {
    return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_LEFT;
  }
  if (softStart > interval.getStop()) {
    return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_RIGHT;
  }

  // genuine overlap: find out on which sides the read sticks out of the interval
  final boolean hangsOverLeft = softStart < interval.getStart();
  final boolean hangsOverRight = softStop > interval.getStop();

  if (!hangsOverLeft && !hangsOverRight) {
    return ReadAndIntervalOverlap.OVERLAP_CONTAINED;
  }
  if (hangsOverLeft && hangsOverRight) {
    return ReadAndIntervalOverlap.OVERLAP_LEFT_AND_RIGHT;
  }
  return hangsOverLeft
      ? ReadAndIntervalOverlap.OVERLAP_LEFT
      : ReadAndIntervalOverlap.OVERLAP_RIGHT;
}
// copied from LocusViewTemplate protected GATKSAMRecord buildSAMRecord( final String readName, final String contig, final int alignmentStart) { GATKSAMRecord record = new GATKSAMRecord(header); record.setReadName(readName); record.setReferenceIndex(dictionary.getSequenceIndex(contig)); record.setAlignmentStart(alignmentStart); record.setCigarString("1M"); record.setReadString("A"); record.setBaseQualityString("A"); record.setReadGroup(readGroup); return record; }
@DataProvider(name = "HasWellDefinedFragmentSizeData") public Object[][] makeHasWellDefinedFragmentSizeData() throws Exception { final List<Object[]> tests = new LinkedList<Object[]>(); // setup a basic read that will work final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(); final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 10, 10); read.setReadPairedFlag(true); read.setProperPairFlag(true); read.setReadUnmappedFlag(false); read.setMateUnmappedFlag(false); read.setAlignmentStart(100); read.setCigarString("50M"); read.setMateAlignmentStart(130); read.setInferredInsertSize(80); read.setFirstOfPairFlag(true); read.setReadNegativeStrandFlag(false); read.setMateNegativeStrandFlag(true); tests.add(new Object[] {"basic case", read.clone(), true}); { final GATKSAMRecord bad1 = (GATKSAMRecord) read.clone(); bad1.setReadPairedFlag(false); tests.add(new Object[] {"not paired", bad1, false}); } { final GATKSAMRecord bad = (GATKSAMRecord) read.clone(); bad.setProperPairFlag(false); // we currently don't require the proper pair flag to be set tests.add(new Object[] {"not proper pair", bad, true}); // tests.add( new Object[]{ "not proper pair", bad, false }); } { final GATKSAMRecord bad = (GATKSAMRecord) read.clone(); bad.setReadUnmappedFlag(true); tests.add(new Object[] {"read is unmapped", bad, false}); } { final GATKSAMRecord bad = (GATKSAMRecord) read.clone(); bad.setMateUnmappedFlag(true); tests.add(new Object[] {"mate is unmapped", bad, false}); } { final GATKSAMRecord bad = (GATKSAMRecord) read.clone(); bad.setMateNegativeStrandFlag(false); tests.add(new Object[] {"read and mate both on positive strand", bad, false}); } { final GATKSAMRecord bad = (GATKSAMRecord) read.clone(); bad.setReadNegativeStrandFlag(true); tests.add(new Object[] {"read and mate both on negative strand", bad, false}); } { final GATKSAMRecord bad = (GATKSAMRecord) read.clone(); bad.setInferredInsertSize(0); tests.add(new Object[] {"insert 
size is 0", bad, false}); } { final GATKSAMRecord bad = (GATKSAMRecord) read.clone(); bad.setAlignmentStart(1000); tests.add(new Object[] {"positve read starts after mate end", bad, false}); } { final GATKSAMRecord bad = (GATKSAMRecord) read.clone(); bad.setReadNegativeStrandFlag(true); bad.setMateNegativeStrandFlag(false); bad.setMateAlignmentStart(1000); tests.add(new Object[] {"negative strand read ends before mate starts", bad, false}); } return tests.toArray(new Object[][] {}); }
/**
 * Is this read all insertion?
 *
 * <p>Every cigar element is inspected; the first element that is not an insertion settles the
 * answer. A cigar without any elements yields {@code true}.
 *
 * @param read the read whose cigar is inspected
 * @return {@code false} as soon as a non-insertion element is found, {@code true} otherwise
 */
public static boolean readIsEntirelyInsertion(GATKSAMRecord read) {
  final Iterator<CigarElement> elements = read.getCigar().getCigarElements().iterator();
  while (elements.hasNext()) {
    final boolean isInsertion = elements.next().getOperator() == CigarOperator.INSERTION;
    if (!isInsertion) {
      return false;
    }
  }
  return true;
}
/**
 * Is a base inside a read?
 *
 * <p>The base is inside when its reference coordinate lies between the read's alignment start
 * and alignment end, both inclusive.
 *
 * @param read the read to evaluate
 * @param referenceCoordinate the reference coordinate of the base to test
 * @return true if it is inside the read, false otherwise.
 */
public static boolean isInsideRead(final GATKSAMRecord read, final int referenceCoordinate) {
  if (referenceCoordinate < read.getAlignmentStart()) {
    return false;
  }
  return referenceCoordinate <= read.getAlignmentEnd();
}
@Test(dataProvider = "AdaptorGetter") public void testGetAdaptorBoundary(final GetAdaptorFunc get) { final int fragmentSize = 10; final int mateStart = 1000; final int BEFORE = mateStart - 2; final int AFTER = mateStart + 2; int myStart, boundary; GATKSAMRecord read; // Test case 1: positive strand, first read read = makeRead(fragmentSize, mateStart); myStart = BEFORE; read.setAlignmentStart(myStart); read.setReadNegativeStrandFlag(false); read.setMateNegativeStrandFlag(true); boundary = get.getAdaptor(read); Assert.assertEquals(boundary, myStart + fragmentSize + 1); // Test case 2: positive strand, second read read = makeRead(fragmentSize, mateStart); myStart = AFTER; read.setAlignmentStart(myStart); read.setReadNegativeStrandFlag(false); read.setMateNegativeStrandFlag(true); boundary = get.getAdaptor(read); Assert.assertEquals(boundary, myStart + fragmentSize + 1); // Test case 3: negative strand, second read read = makeRead(fragmentSize, mateStart); myStart = AFTER; read.setAlignmentStart(myStart); read.setReadNegativeStrandFlag(true); read.setMateNegativeStrandFlag(false); boundary = get.getAdaptor(read); Assert.assertEquals(boundary, mateStart - 1); // Test case 4: negative strand, first read read = makeRead(fragmentSize, mateStart); myStart = BEFORE; read.setAlignmentStart(myStart); read.setReadNegativeStrandFlag(true); read.setMateNegativeStrandFlag(false); boundary = get.getAdaptor(read); Assert.assertEquals(boundary, mateStart - 1); // Test case 5: mate is mapped to another chromosome (test both strands) read = makeRead(fragmentSize, mateStart); read.setInferredInsertSize(0); read.setReadNegativeStrandFlag(true); read.setMateNegativeStrandFlag(false); boundary = get.getAdaptor(read); Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); read.setReadNegativeStrandFlag(false); read.setMateNegativeStrandFlag(true); boundary = get.getAdaptor(read); Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); 
read.setInferredInsertSize(10); // Test case 6: read is unmapped read = makeRead(fragmentSize, mateStart); read.setReadUnmappedFlag(true); boundary = get.getAdaptor(read); Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); read.setReadUnmappedFlag(false); // Test case 7: reads don't overlap and look like this: // <--------| // |------> // first read: read = makeRead(fragmentSize, mateStart); myStart = 980; read.setAlignmentStart(myStart); read.setInferredInsertSize(20); read.setReadNegativeStrandFlag(true); boundary = get.getAdaptor(read); Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); // second read: read = makeRead(fragmentSize, mateStart); myStart = 1000; read.setAlignmentStart(myStart); read.setInferredInsertSize(20); read.setMateAlignmentStart(980); read.setReadNegativeStrandFlag(false); boundary = get.getAdaptor(read); Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); // Test case 8: read doesn't have proper pair flag set read = makeRead(fragmentSize, mateStart); read.setReadPairedFlag(true); read.setProperPairFlag(false); Assert.assertEquals(get.getAdaptor(read), ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); // Test case 9: read and mate have same negative flag setting for (final boolean negFlag : Arrays.asList(true, false)) { read = makeRead(fragmentSize, mateStart); read.setAlignmentStart(BEFORE); read.setReadPairedFlag(true); read.setProperPairFlag(true); read.setReadNegativeStrandFlag(negFlag); read.setMateNegativeStrandFlag(!negFlag); Assert.assertTrue( get.getAdaptor(read) != ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY, "Get adaptor should have succeeded"); read = makeRead(fragmentSize, mateStart); read.setAlignmentStart(BEFORE); read.setReadPairedFlag(true); read.setProperPairFlag(true); read.setReadNegativeStrandFlag(negFlag); read.setMateNegativeStrandFlag(negFlag); Assert.assertEquals( get.getAdaptor(read), ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY, "Get adaptor should have 
failed for reads with bad alignment orientation"); } }
/**
 * Checks whether a read starts with an insertion.
 *
 * <p>Convenience overload that forwards the read's cigar to the cigar-based
 * {@code readStartsWithInsertion}; per that method's contract, leading hard and soft clips are
 * looked past if there are any.
 *
 * @param read the read whose cigar is examined
 * @return A pair with the answer (true/false) and the element or null if it doesn't exist
 */
public static Pair<Boolean, CigarElement> readStartsWithInsertion(GATKSAMRecord read) {
  final Pair<Boolean, CigarElement> answer = readStartsWithInsertion(read.getCigar());
  return answer;
}