@Test public void testCountsFromLocusTraversal() { final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); engine.setGenomeLocParser(genomeLocParser); final Collection<SAMReaderID> samFiles = new ArrayList<>(); final SAMReaderID readerID = new SAMReaderID(testBAM, new Tags()); samFiles.add(readerID); final SAMDataSource dataSource = new SAMDataSource( samFiles, new ThreadAllocation(), null, genomeLocParser, false, SAMFileReader.ValidationStringency.STRICT, null, null, new ValidationExclusion(), new ArrayList<ReadFilter>(), new ArrayList<ReadTransformer>(), false, (byte) 30, false, true); engine.setReadsDataSource(dataSource); final Set<String> samples = SampleUtils.getSAMFileSamples(dataSource.getHeader()); final TraverseLociNano traverseLociNano = new TraverseLociNano(1); final DummyLocusWalker walker = new DummyLocusWalker(); traverseLociNano.initialize(engine, walker, null); for (final Shard shard : dataSource.createShardIteratorOverAllReads(new LocusShardBalancer())) { final WindowMaker windowMaker = new WindowMaker( shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs(), samples); for (WindowMaker.WindowMakerIterator window : windowMaker) { final LocusShardDataProvider dataProvider = new LocusShardDataProvider( shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList<ReferenceOrderedDataSource>()); traverseLociNano.traverse(walker, dataProvider, 0); dataProvider.close(); } windowMaker.close(); } // dataSource.close(); Assert.assertEquals( engine.getCumulativeMetrics().getNumReadsSeen(), contigs.size() * numReadsPerContig); Assert.assertEquals( engine.getCumulativeMetrics().getNumIterations(), contigs.size() * numReadsPerContig); }
/**
 * Runs a {@link TraverseReadsNano} traversal with a {@link DummyReadWalker} over every shard of
 * the test BAM while an {@link EveryTenthReadFilter} is installed on the data source, then
 * asserts that the per-filter count recorded under the filter's simple class name equals {@code
 * contigs.size() * numReadsPerContig / 10}.
 */
@Test
public void testFilteredCounts() {
  final GenomeAnalysisEngine gatkEngine = new GenomeAnalysisEngine();
  gatkEngine.setGenomeLocParser(genomeLocParser);

  // A single reader over the test BAM, with no tags attached.
  final Collection<SAMReaderID> bamReaders = new ArrayList<>();
  bamReaders.add(new SAMReaderID(testBAM, new Tags()));

  // The only read filter in play is the one whose counter is checked at the end.
  final List<ReadFilter> activeFilters = new ArrayList<>();
  activeFilters.add(new EveryTenthReadFilter());

  final SAMDataSource readsSource =
      new SAMDataSource(
          bamReaders,
          new ThreadAllocation(),
          null,
          genomeLocParser,
          false,
          SAMFileReader.ValidationStringency.STRICT,
          null,
          null,
          new ValidationExclusion(),
          activeFilters,
          new ArrayList<ReadTransformer>(),
          false,
          (byte) 30,
          false,
          true);
  gatkEngine.setReadsDataSource(readsSource);

  final TraverseReadsNano readTraversal = new TraverseReadsNano(1);
  final DummyReadWalker readWalker = new DummyReadWalker();
  readTraversal.initialize(gatkEngine, readWalker, null);

  // Each shard gets its own data provider, which is closed as soon as it has been traversed.
  for (final Shard currentShard :
      readsSource.createShardIteratorOverAllReads(new ReadShardBalancer())) {
    final ReadShardDataProvider provider =
        new ReadShardDataProvider(
            currentShard,
            gatkEngine.getGenomeLocParser(),
            readsSource.seek(currentShard),
            reference,
            new ArrayList<ReferenceOrderedDataSource>());
    readTraversal.traverse(readWalker, provider, 0);
    provider.close();
  }

  final String filterKey = EveryTenthReadFilter.class.getSimpleName();
  Assert.assertEquals(
      (long) gatkEngine.getCumulativeMetrics().getCountsByFilter().get(filterKey),
      contigs.size() * numReadsPerContig / 10);
}
/** * Makes association maps for the reads and loci coverage as described below : * * <p>- First: locusToReadMap -- a HashMap that describes for each locus, which reads contribute * to its coverage. Note: Locus is in reference coordinates. Example: Locus => {read1, read2, ..., * readN} * * <p>- Second: readToLocusMap -- a HashMap that describes for each read what loci it contributes * to the coverage. Note: Locus is a boolean array, indexed from 0 (= startLocation) to N (= * stopLocation), with value==true meaning it contributes to the coverage. Example: Read => {true, * true, false, ... false} * * @param readList the list of reads to generate the association mappings * @param startLocation the first reference coordinate of the region (inclusive) * @param stopLocation the last reference coordinate of the region (inclusive) * @return the two hashmaps described above */ public static Pair<HashMap<Integer, HashSet<GATKSAMRecord>>, HashMap<GATKSAMRecord, Boolean[]>> getBothReadToLociMappings(List<GATKSAMRecord> readList, int startLocation, int stopLocation) { int arraySize = stopLocation - startLocation + 1; HashMap<Integer, HashSet<GATKSAMRecord>> locusToReadMap = new HashMap<Integer, HashSet<GATKSAMRecord>>(2 * (stopLocation - startLocation + 1), 0.5f); HashMap<GATKSAMRecord, Boolean[]> readToLocusMap = new HashMap<GATKSAMRecord, Boolean[]>(2 * readList.size(), 0.5f); for (int i = startLocation; i <= stopLocation; i++) locusToReadMap.put( i, new HashSet<GATKSAMRecord>()); // Initialize the locusToRead map with empty lists for (GATKSAMRecord read : readList) { readToLocusMap.put( read, new Boolean[arraySize]); // Initialize the readToLocus map with empty arrays int[] readCoverage = getCoverageDistributionOfRead(read, startLocation, stopLocation); for (int i = 0; i < readCoverage.length; i++) { int refLocation = i + startLocation; if (readCoverage[i] > 0) { // Update the hash for this locus HashSet<GATKSAMRecord> readSet = locusToReadMap.get(refLocation); 
readSet.add(read); // Add this locus to the read hash readToLocusMap.get(read)[refLocation - startLocation] = true; } else // Update the boolean array with a 'no coverage' from this read to this locus readToLocusMap.get(read)[refLocation - startLocation] = false; } } return new Pair<HashMap<Integer, HashSet<GATKSAMRecord>>, HashMap<GATKSAMRecord, Boolean[]>>( locusToReadMap, readToLocusMap); }
public static void align( Graph graph, SAMRecord rec, Node recNode, ReferenceSequence sequence, SAMProgramRecord programRecord, int offset, AlleleCoverageCutoffs alleleCoverageCutoffs, boolean correctBases, boolean useSequenceQualities, int MAXIMUM_TOTAL_COVERAGE, int MAX_HEAP_SIZE) throws Exception { int i; AlignHeapNode curAlignHeapNode = null; AlignHeapNode nextAlignHeapNode = null; AlignHeapNode bestAlignHeapNode = null; AlignHeap heap = null; String read = null; // could be cs String readBases = null; // always nt String qualities = null; // could be cq SRMAUtil.Space space = SRMAUtil.Space.NTSPACE; ListIterator<NodeRecord> iter = null; AlignHeapNodeComparator comp = null; int alignmentStart = -1; int numStartNodesAdded = 0; boolean strand = rec.getReadNegativeStrandFlag(); // false -> forward, true -> reverse String softClipStartBases = null; String softClipStartQualities = null; String softClipEndBases = null; String softClipEndQualities = null; // Debugging stuff String readName = rec.getReadName(); assert SRMAUtil.Space.COLORSPACE != space; // Get space read = (String) rec.getAttribute("CS"); if (null == read) { // Use base space space = SRMAUtil.Space.NTSPACE; } else { // assumes CS and CQ are always in sequencing order space = SRMAUtil.Space.COLORSPACE; } // Get read and qualities if (space == SRMAUtil.Space.NTSPACE) { byte tmpRead[] = rec.getReadString().getBytes(); byte tmpQualities[] = rec.getBaseQualityString().getBytes(); // Reverse once if (strand) { // reverse SAMRecordUtil.reverseArray(tmpRead); SAMRecordUtil.reverseArray(tmpQualities); } read = new String(tmpRead); readBases = new String(tmpRead); qualities = new String(tmpQualities); // Reverse again if (strand) { // reverse SAMRecordUtil.reverseArray(tmpRead); SAMRecordUtil.reverseArray(tmpQualities); } } else { byte tmpRead[] = rec.getReadString().getBytes(); // Reverse once if (strand) { // reverse SAMRecordUtil.reverseArray(tmpRead); } readBases = new String(tmpRead); // Reverse again if 
(strand) { // reverse SAMRecordUtil.reverseArray(tmpRead); } read = SRMAUtil.normalizeColorSpaceRead(read); qualities = (String) rec.getAttribute("CQ"); // Some aligners include a quality value for the adapter. A quality value // IMHO should not be given for an unobserved (assumed) peice of data. Trim // the first quality in this case if (qualities.length() == 1 + read.length()) { // trim the first quality qualities = qualities.substring(1); } } // Reverse back if (readBases.length() <= 0) { throw new Exception("Error. The current alignment has no bases."); } if (read.length() <= 0) { throw new Exception("Error. The current alignment has no bases."); } if (qualities.length() <= 0) { throw new Exception("Error. The current alignment has no qualities."); } if (readBases.length() != read.length()) { if (space == SRMAUtil.Space.COLORSPACE) { throw new Exception( "Error. The current alignment's read bases length does not match the length of the colors in the CS tag [" + rec.getReadName() + "]."); } else { throw new Exception("Error. Internal error: readBases.length() != read.length()"); } } // Deal with soft-clipping // - save the soft clipped sequence for latter { List<CigarElement> cigarElements = null; cigarElements = rec.getCigar().getCigarElements(); CigarElement e1 = cigarElements.get(0); // first CigarElement e2 = cigarElements.get(cigarElements.size() - 1); // last // Soft-clipped if (CigarOperator.S == e1.getOperator()) { if (space == SRMAUtil.Space.COLORSPACE) { throw new Exception( "Error. 
Soft clipping with color-space data not currently supported."); } int l = e1.getLength(); if (strand) { // reverse softClipStartBases = readBases.substring(readBases.length() - l); softClipStartQualities = qualities.substring(qualities.length() - l); readBases = readBases.substring(0, readBases.length() - l); read = read.substring(0, read.length() - l); qualities = qualities.substring(0, qualities.length() - l); } else { softClipStartBases = readBases.substring(0, l - 1); softClipStartQualities = qualities.substring(0, l - 1); readBases = readBases.substring(l); read = read.substring(l); qualities = qualities.substring(l); } } if (CigarOperator.S == e2.getOperator()) { if (space == SRMAUtil.Space.COLORSPACE) { throw new Exception( "Error. Soft clipping with color-space data not currently supported."); } int l = e2.getLength(); if (strand) { // reverse softClipEndBases = readBases.substring(0, l - 1); softClipEndQualities = qualities.substring(0, l - 1); readBases = readBases.substring(l); read = read.substring(l); qualities = qualities.substring(l); } else { softClipEndBases = readBases.substring(readBases.length() - l); softClipEndQualities = qualities.substring(qualities.length() - l); readBases = readBases.substring(0, readBases.length() - l); read = read.substring(0, read.length() - l); qualities = qualities.substring(0, qualities.length() - l); } } } // Remove mate pair information Align.removeMateInfo(rec); comp = new AlignHeapNodeComparator( (strand) ? 
AlignHeap.HeapType.MAXHEAP : AlignHeap.HeapType.MINHEAP); // Bound by original alignment if possible bestAlignHeapNode = Align.boundWithOriginalAlignment( rec, graph, recNode, comp, strand, read, qualities, readBases, space, sequence, alleleCoverageCutoffs, useSequenceQualities, MAXIMUM_TOTAL_COVERAGE, MAX_HEAP_SIZE); /* System.err.println("readName="+rec.getReadName()); if(null != bestAlignHeapNode) { System.err.println("\nFOUND BEST:" + rec.toString()); } else { System.err.println("\nNOT FOUND (BEST): " + rec.toString()); } Align.updateSAM(rec, programRecord, bestAlignHeapNode, space, read, qualities, softClipStartBases, softClipStartQualities, softClipEndBases, softClipEndQualities, strand, correctBases); return; */ heap = new AlignHeap((strand) ? AlignHeap.HeapType.MAXHEAP : AlignHeap.HeapType.MINHEAP); // Add start nodes if (strand) { // reverse alignmentStart = rec.getAlignmentEnd(); for (i = alignmentStart + offset; alignmentStart - offset <= i; i--) { int position = graph.getPriorityQueueIndexAtPositionOrBefore(i); PriorityQueue<Node> startNodeQueue = graph.getPriorityQueue(position); if (0 != position && null != startNodeQueue) { Iterator<Node> startNodeQueueIter = startNodeQueue.iterator(); while (startNodeQueueIter.hasNext()) { Node startNode = startNodeQueueIter.next(); int f = passFilters(graph, startNode, alleleCoverageCutoffs, MAXIMUM_TOTAL_COVERAGE); if (0 == f) { heap.add( new AlignHeapNode( null, startNode, startNode.coverage, read.charAt(0), qualities.charAt(0), useSequenceQualities, space)); } else if (f < 0) { return; } if (startNode.position < i) { i = startNode.position; } numStartNodesAdded++; } } } } else { alignmentStart = rec.getAlignmentStart(); for (i = alignmentStart - offset; i <= alignmentStart + offset; i++) { int position = graph.getPriorityQueueIndexAtPositionOrGreater(i); PriorityQueue<Node> startNodeQueue = graph.getPriorityQueue(position); if (0 != position && null != startNodeQueue) { Iterator<Node> startNodeQueueIter = 
startNodeQueue.iterator(); while (startNodeQueueIter.hasNext()) { Node startNode = startNodeQueueIter.next(); int f = passFilters(graph, startNode, alleleCoverageCutoffs, MAXIMUM_TOTAL_COVERAGE); if (0 == f) { heap.add( new AlignHeapNode( null, startNode, startNode.coverage, read.charAt(0), qualities.charAt(0), useSequenceQualities, space)); } else if (f < 0) { return; } if (i < startNode.position) { i = startNode.position; } numStartNodesAdded++; } } } } if (numStartNodesAdded == 0) { throw new Exception("Did not add any start nodes!"); } // Get first node off the heap curAlignHeapNode = heap.poll(); while (null != curAlignHeapNode) { if (MAX_HEAP_SIZE <= heap.size()) { // too many to consider return; } // System.err.println("strand:" + strand + "\tsize:" + heap.size() + "\talignmentStart:" + // alignmentStart + "\toffset:" + offset + "\treadOffset:" + curAlignHeapNode.readOffset); // System.err.print("size:" + heap.size() + ":" + curAlignHeapNode.readOffset + ":" + // curAlignHeapNode.score + ":" + curAlignHeapNode.alleleCoverageSum + ":" + // curAlignHeapNode.startPosition + "\t"); // curAlignHeapNode.node.print(System.err); // System.err.print("\rposition:" + curAlignHeapNode.node.position + "\treadOffset:" + // curAlignHeapNode.readOffset); // Remove all non-insertions with the same contig/pos/read-offset/type/base and lower score nextAlignHeapNode = heap.peek(); while (Node.INSERTION != curAlignHeapNode.node.type && null != nextAlignHeapNode && 0 == comp.compare(curAlignHeapNode, nextAlignHeapNode)) { if (curAlignHeapNode.score < nextAlignHeapNode.score || (curAlignHeapNode.score == nextAlignHeapNode.score && curAlignHeapNode.alleleCoverageSum < nextAlignHeapNode.alleleCoverageSum)) { // Update current node curAlignHeapNode = heap.poll(); } else { // Ignore next node heap.poll(); } nextAlignHeapNode = heap.peek(); } nextAlignHeapNode = null; // Check if the alignment is complete if (curAlignHeapNode.readOffset == read.length() - 1) { // All read bases 
examined, store if has the best alignment. // System.err.print(curAlignHeapNode.alleleCoverageSum + ":" + curAlignHeapNode.score + // ":"); // System.err.print(curAlignHeapNode.startPosition + ":"); // curAlignHeapNode.node.print(System.err); if (null == bestAlignHeapNode || bestAlignHeapNode.score < curAlignHeapNode.score || (bestAlignHeapNode.score == curAlignHeapNode.score && bestAlignHeapNode.alleleCoverageSum < curAlignHeapNode.alleleCoverageSum)) { bestAlignHeapNode = curAlignHeapNode; } } else if (null != bestAlignHeapNode && curAlignHeapNode.score < bestAlignHeapNode.score) { // ignore, under the assumption that scores can only become more negative. } else { if (strand) { // reverse // Go to all the "prev" nodes iter = curAlignHeapNode.node.prev.listIterator(); } else { // forward // Go to all "next" nodes iter = curAlignHeapNode.node.next.listIterator(); } while (iter.hasNext()) { NodeRecord next = iter.next(); int f = passFilters( graph, next.node, next.coverage, alleleCoverageCutoffs, MAXIMUM_TOTAL_COVERAGE); if (0 == f) { heap.add( new AlignHeapNode( curAlignHeapNode, next.node, next.coverage, read.charAt(curAlignHeapNode.readOffset + 1), qualities.charAt(curAlignHeapNode.readOffset + 1), useSequenceQualities, space)); } else if (f < 0) { return; } } iter = null; } // Get next node curAlignHeapNode = heap.poll(); } // Recover alignment Align.updateSAM( rec, sequence, programRecord, bestAlignHeapNode, space, read, qualities, softClipStartBases, softClipStartQualities, softClipEndBases, softClipEndQualities, strand, correctBases); }