/** * HACK TO CREATE GATKSAMRECORD BASED ONLY A SAMRECORD FOR TESTING PURPOSES ONLY * * @param read */ public GATKSAMRecord(final SAMRecord read) { super(read.getHeader()); super.setReferenceIndex(read.getReferenceIndex()); super.setAlignmentStart(read.getAlignmentStart()); super.setReadName(read.getReadName()); super.setMappingQuality(read.getMappingQuality()); // indexing bin done below super.setCigar(read.getCigar()); super.setFlags(read.getFlags()); super.setMateReferenceIndex(read.getMateReferenceIndex()); super.setMateAlignmentStart(read.getMateAlignmentStart()); super.setInferredInsertSize(read.getInferredInsertSize()); SAMReadGroupRecord samRG = read.getReadGroup(); SAMBinaryTagAndValue samAttr = GATKBin.getReadBinaryAttributes(read); if (samAttr == null) { clearAttributes(); } else { setAttributes(samAttr); } if (samRG != null) { GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(samRG); setReadGroup(rg); } super.setFileSource(read.getFileSource()); super.setReadName(read.getReadName()); super.setCigarString(read.getCigarString()); super.setReadBases(read.getReadBases()); super.setBaseQualities(read.getBaseQualities()); // From SAMRecord constructor: Do this after the above because setCigarString will clear it. GATKBin.setReadIndexingBin(this, GATKBin.getReadIndexingBin(read)); }
/** Return the sort key used for the given sort order. Useful in error messages. */ public String getSortKey(final SAMRecord rec) { switch (sortOrder) { case coordinate: return rec.getReferenceName() + ":" + rec.getAlignmentStart(); case queryname: return rec.getReadName(); case unsorted: default: return null; } }
/** * Returns the duplicate score computed from the given fragment. value should be capped by * Short.MAX_VALUE/2 since the score from two reads will be added and an overflow will be * * <p>If true is given to assumeMateCigar, then any score that can use the mate cigar to compute * the mate's score will return the score computed on both ends. */ public static short computeDuplicateScore( final SAMRecord record, final ScoringStrategy scoringStrategy, final boolean assumeMateCigar) { Short storedScore = (Short) record.getTransientAttribute(Attr.DuplicateScore); if (storedScore == null) { short score = 0; switch (scoringStrategy) { case SUM_OF_BASE_QUALITIES: // two (very) long reads worth of high-quality bases can go over Short.MAX_VALUE/2 // and risk overflow. score += (short) Math.min(getSumOfBaseQualities(record), Short.MAX_VALUE / 2); break; case TOTAL_MAPPED_REFERENCE_LENGTH: if (!record.getReadUnmappedFlag()) { // no need to remember the score since this scoring mechanism is symmetric score = (short) Math.min(record.getCigar().getReferenceLength(), Short.MAX_VALUE / 2); } if (assumeMateCigar && record.getReadPairedFlag() && !record.getMateUnmappedFlag()) { score += (short) Math.min( SAMUtils.getMateCigar(record).getReferenceLength(), Short.MAX_VALUE / 2); } break; // The RANDOM score gives the same score to both reads so that they get filtered together. // it's not critical do use the readName since the scores from both ends get added, but it // seem // to be clearer this way. case RANDOM: // start with a random number between Short.MIN_VALUE/4 and Short.MAX_VALUE/4 score += (short) (hasher.hashUnencodedChars(record.getReadName()) & 0b11_1111_1111_1111); // subtract Short.MIN_VALUE/4 from it to end up with a number between // 0 and Short.MAX_VALUE/2. This number can be then discounted in case the read is // not passing filters. We need to stay far from overflow so that when we add the two // scores from the two read mates we do not overflow since that could cause us to chose a // failing read-pair instead of a passing one. score -= Short.MIN_VALUE / 4; } // make sure that filter-failing records are heavily discounted. (the discount can happen // twice, once // for each mate, so need to make sure we do not subtract more than Short.MIN_VALUE overall.) score += record.getReadFailsVendorQualityCheckFlag() ? (short) (Short.MIN_VALUE / 2) : 0; storedScore = score; record.setTransientAttribute(Attr.DuplicateScore, storedScore); } return storedScore; }
private int countAlignmentsInWindow( int reference, int window, SAMFileReader reader, int expectedCount) { final int SIXTEEN_K = 1 << 14; // 1 << LinearIndex.BAM_LIDX_SHIFT final int start = window >> 14; // window * SIXTEEN_K; final int stop = ((window + 1) >> 14) - 1; // (window + 1 * SIXTEEN_K) - 1; final String chr = reader.getFileHeader().getSequence(reference).getSequenceName(); // get records for the entire linear index window SAMRecordIterator iter = reader.queryOverlapping(chr, start, stop); SAMRecord rec; int count = 0; while (iter.hasNext()) { rec = iter.next(); count++; if (expectedCount == -1) System.err.println(rec.getReadName()); } iter.close(); return count; }
private int runQueryTest( final URL bamURL, final String sequence, final int startPos, final int endPos, final boolean contained) { verbose("Testing query " + sequence + ":" + startPos + "-" + endPos + " ..."); final SamReader reader1 = SamReaderFactory.makeDefault() .disable(SamReaderFactory.Option.EAGERLY_DECODE) .open(SamInputResource.of(bamURL).index(BAM_INDEX_FILE)); final SamReader reader2 = SamReaderFactory.makeDefault() .disable(SamReaderFactory.Option.EAGERLY_DECODE) .open(SamInputResource.of(bamURL).index(BAM_INDEX_FILE)); final Iterator<SAMRecord> iter1 = reader1.query(sequence, startPos, endPos, contained); final Iterator<SAMRecord> iter2 = reader2.iterator(); // Compare ordered iterators. // Confirm that iter1 is a subset of iter2 that properly filters. SAMRecord record1 = null; SAMRecord record2 = null; int count1 = 0; int count2 = 0; int beforeCount = 0; int afterCount = 0; while (true) { if (record1 == null && iter1.hasNext()) { record1 = iter1.next(); count1++; } if (record2 == null && iter2.hasNext()) { record2 = iter2.next(); count2++; } // System.out.println("Iteration:"); // System.out.println(" Record1 = " + ((record1 == null) ? "null" : record1.format())); // System.out.println(" Record2 = " + ((record2 == null) ? "null" : record2.format())); if (record1 == null && record2 == null) { break; } if (record1 == null) { checkPassesFilter(false, record2, sequence, startPos, endPos, contained); record2 = null; afterCount++; continue; } assertNotNull(record2); final int ordering = compareCoordinates(record1, record2); if (ordering > 0) { checkPassesFilter(false, record2, sequence, startPos, endPos, contained); record2 = null; beforeCount++; continue; } assertTrue(ordering == 0); checkPassesFilter(true, record1, sequence, startPos, endPos, contained); checkPassesFilter(true, record2, sequence, startPos, endPos, contained); assertEquals(record1.getReadName(), record2.getReadName()); assertEquals(record1.getReadString(), record2.getReadString()); record1 = null; record2 = null; } CloserUtil.close(reader1); CloserUtil.close(reader2); verbose("Checked " + count1 + " records against " + count2 + " records."); verbose("Found " + (count2 - beforeCount - afterCount) + " records matching."); verbose("Found " + beforeCount + " records before."); verbose("Found " + afterCount + " records after."); return count1; }
public void find_coverage(SAMResource sres) { int start_base = sres.region.range.start; int end_base = sres.region.range.end; int coverage_len = (end_base - start_base) + 1; int i, end, ref_i, read_i, len; int[] coverage = new int[coverage_len]; Arrays.fill(coverage, 0); WorkingFile wf = null; if (outfile != null) { try { wf = new WorkingFile(outfile); ps = wf.getPrintStream(); } catch (Exception e) { System.err.println("I/O error: " + e); // debug e.printStackTrace(); System.exit(1); } } try { // // gather coverage info: // CloseableIterator<SAMRecord> iterator = sres.get_iterator(); int read_count = 0; int ref_min = -1; int ref_max = -1; while (iterator.hasNext()) { SAMRecord sr = iterator.next(); read_count++; // System.err.println(sr.getReadName() + ": " + sr.getAlignmentStart() + "-" + // sr.getAlignmentEnd()); // debug if (sr.getReadUnmappedFlag()) continue; if (sr.getDuplicateReadFlag()) { if (verbose_mode) System.err.println( sr.getReadName() + "." + (sr.getReadNegativeStrandFlag() ? "R" : "F") + " ignoring, duplicate"); continue; } byte[] read = sr.getReadBases(); byte[] quals = sr.getBaseQualities(); for (AlignmentBlock ab : sr.getAlignmentBlocks()) { len = ab.getLength(); read_i = ab.getReadStart() - 1; ref_i = ab.getReferenceStart() - start_base; if (ref_min == -1 || ref_i < ref_min) ref_min = ref_i; for (i = read_i, end = read_i + len; i < end; i++, ref_i++) { if (ref_i >= 0 && ref_i < coverage_len) { if (quals[i] >= MIN_QUALITY) { if (verbose_mode) System.err.println( sr.getReadName() + "." + (sr.getReadNegativeStrandFlag() ? "R" : "F") + " hit at " + (ref_i + start_base) + " as=" + sr.getAlignmentStart() + " ae=" + sr.getAlignmentEnd()); coverage[ref_i]++; } else if (verbose_mode) { System.err.println( sr.getReadName() + "." + (sr.getReadNegativeStrandFlag() ? "R" : "F") + " qual_reject at " + (ref_i + start_base) + " as=" + sr.getAlignmentStart() + " ae=" + sr.getAlignmentEnd()); } } } if (ref_max == -1 || ref_i > ref_max) ref_max = ref_i; } } sres.close(); System.err.println( "records:" + read_count + " ref_min:" + (ref_min + start_base) + " ref_max:" + (ref_max + start_base)); // debug // // report coverage info: // for (i = 0; i < coverage.length; i++) { if (name != null) ps.print(name + ","); ps.println((i + start_base) + "," + coverage[i]); // debug } if (wf != null) wf.finish(); } catch (Exception e) { System.err.println("ERROR: " + e); // debug e.printStackTrace(); } }
private static void updateSAM( SAMRecord rec, ReferenceSequence sequence, SAMProgramRecord programRecord, AlignHeapNode bestAlignHeapNode, SRMAUtil.Space space, String read, String qualities, String softClipStartBases, String softClipStartQualities, String softClipEndBases, String softClipEndQualities, boolean strand, boolean correctBases) throws Exception { AlignHeapNode curAlignHeapNode = null; AlignHeapNode prevAlignHeapNode = null; int alignmentStart = 0; int readIndex = -1; byte readBases[] = null; byte baseQualities[] = null; byte colorErrors[] = null; int i; int numEdits = 0; List<String> optFieldTags = new LinkedList<String>(); List<Object> optFieldValues = new LinkedList<Object>(); Object attr; // Debugging stuff String readName = rec.getReadName(); if (null == bestAlignHeapNode) { // Do not modify the alignment return; } // To generate a new CIGAR List<CigarElement> cigarElements = null; CigarOperator prevCigarOperator = null, curCigarOperator = null; int prevCigarOperatorLength = 0; // TODO // setInferredInsertSize (invalidates paired end reads) // setMappingQuality (?) // setFlag // update base qualities for color space reads // clear attributes, but save some Align.clearAttributes(rec, optFieldTags, optFieldValues); readBases = new byte[read.length()]; baseQualities = new byte[qualities.length()]; for (i = 0; i < qualities.length(); i++) { // Must subtract 33 for PHRED scaling baseQualities[i] = (byte) (qualities.charAt(i) - 33); } if (strand) { readIndex = 0; } else { readIndex = read.length() - 1; } cigarElements = new LinkedList<CigarElement>(); if (strand) { // reverse strand is the current position alignmentStart = bestAlignHeapNode.node.position; } else { alignmentStart = bestAlignHeapNode.startPosition; } assert null != bestAlignHeapNode; curAlignHeapNode = bestAlignHeapNode; while (null != curAlignHeapNode) { // Get the current cigar operator if (null != prevAlignHeapNode && CigarOperator.DELETION != prevCigarOperator && 1 < Math.abs(curAlignHeapNode.node.position - prevAlignHeapNode.node.position)) { curCigarOperator = CigarOperator.DELETION; } else { switch (curAlignHeapNode.node.type) { case Node.MISMATCH: // Fall through case Node.MATCH: curCigarOperator = CigarOperator.MATCH_OR_MISMATCH; break; case Node.INSERTION: // System.out.println("INS"); curCigarOperator = CigarOperator.INSERTION; break; default: throw new Exception("Unknown node type"); } if (space == SRMAUtil.Space.COLORSPACE || correctBases) { readBases[readIndex] = (byte) curAlignHeapNode.node.base; if (strand) { readIndex++; } else { readIndex--; } // count the number of mismatches switch (curAlignHeapNode.node.type) { case Node.MISMATCH: case Node.INSERTION: numEdits++; break; default: break; } } else { // count the number of mismatches switch (curAlignHeapNode.node.type) { case Node.MATCH: if (read.charAt(curAlignHeapNode.readOffset) != curAlignHeapNode.node.base) { numEdits++; } break; case Node.MISMATCH: // Fall through if (read.charAt(curAlignHeapNode.readOffset) != sequence.getBases()[curAlignHeapNode.node.position - 1]) { numEdits++; } break; case Node.INSERTION: numEdits++; break; default: break; } } } if (prevCigarOperator != curCigarOperator) { // different cigar operator // add the previous cigar operator if (null != prevCigarOperator) { if (strand) { // reverse // append cigarElements.add(new CigarElement(prevCigarOperatorLength, prevCigarOperator)); } else { // prepend cigarElements.add(0, new CigarElement(prevCigarOperatorLength, prevCigarOperator)); } } // update prevCigarOperator prevCigarOperator = curCigarOperator; if (curCigarOperator == CigarOperator.DELETION) { // length of deletion prevCigarOperatorLength = Math.abs(curAlignHeapNode.node.position - prevAlignHeapNode.node.position) - 1; numEdits += prevCigarOperatorLength; // deletions } else { prevCigarOperatorLength = 1; } } else { // same cigar operator prevCigarOperatorLength++; } // Update if (CigarOperator.DELETION != curCigarOperator) { prevAlignHeapNode = curAlignHeapNode; curAlignHeapNode = curAlignHeapNode.prev; } } if (0 < prevCigarOperatorLength) { if (null == prevCigarOperator || CigarOperator.DELETION == prevCigarOperator) { throw new Exception("Ended with a null cigar operator or a deletion cigar operator"); } if (strand) { // reverse // append cigarElements.add(new CigarElement(prevCigarOperatorLength, prevCigarOperator)); } else { // prepend cigarElements.add(0, new CigarElement(prevCigarOperatorLength, prevCigarOperator)); } } if (space == SRMAUtil.Space.COLORSPACE) { // color space, read bases already inferred // Get color error string colorErrors = new byte[read.length()]; char prevBase = SRMAUtil.COLORSPACE_ADAPTOR; if (strand) { // reverse for (i = 0; i < read.length(); i++) { char nextBase = SRMAUtil.colorSpaceNextBase(prevBase, read.charAt(i)); if (nextBase == SRMAUtil.getCompliment((char) readBases[read.length() - i - 1])) { colorErrors[i] = (byte) Alignment.GAP; } else { colorErrors[i] = (byte) read.charAt(i); } if (0 < i) { // qualities are assumed to be always in the same direction as the color errors baseQualities[read.length() - i] = getColorQuality( colorErrors[i - 1], colorErrors[i], (byte) (qualities.charAt(i - 1) - 33), (byte) (qualities.charAt(i) - 33)); } prevBase = SRMAUtil.getCompliment((char) readBases[read.length() - i - 1]); } // last color baseQualities[0] = (byte) (qualities.charAt(read.length() - 1) - 33); } else { for (i = 0; i < read.length(); i++) { char nextBase = SRMAUtil.colorSpaceNextBase(prevBase, read.charAt(i)); if (nextBase == readBases[i]) { colorErrors[i] = (byte) Alignment.GAP; } else { colorErrors[i] = (byte) read.charAt(i); } if (0 < i) { baseQualities[i - 1] = getColorQuality( colorErrors[i - 1], colorErrors[i], (byte) (qualities.charAt(i - 1) - 33), (byte) (qualities.charAt(i) - 33)); } prevBase = (char) readBases[i]; } // last color baseQualities[read.length() - 1] = (byte) (qualities.charAt(read.length() - 1) - 33); } } else if (correctBases) { // bases were corrected if (strand) { for (i = 0; i < read.length(); i++) { if (readBases[i] == (byte) read.charAt(read.length() - i - 1)) { baseQualities[i] = (byte) (qualities.charAt(read.length() - i - 1) - 33); } else { // TODO: how much to down-weight ? baseQualities[i] = (byte) (SRMAUtil.QUAL2CHAR( SRMAUtil.CHAR2QUAL(qualities.charAt(read.length() - i - 1)) - CORRECT_BASE_QUALITY_PENALTY) - 33); if (baseQualities[i] <= 0) { baseQualities[i] = 1; } } } } else { for (i = 0; i < read.length(); i++) { if (readBases[i] == (byte) read.charAt(i)) { baseQualities[i] = (byte) (qualities.charAt(i) - 33); } else { // TODO: how much to down-weight ? baseQualities[i] = (byte) (SRMAUtil.QUAL2CHAR( SRMAUtil.CHAR2QUAL(qualities.charAt(i)) - CORRECT_BASE_QUALITY_PENALTY) - 33); if (baseQualities[i] <= 0) { baseQualities[i] = 1; } } } } rec.setAttribute("XO", read); rec.setAttribute("XQ", qualities); } else { // bases not corrected readBases = new byte[read.length()]; baseQualities = new byte[qualities.length()]; // qualities.length() == read.length() if (strand) { // reverse for (i = 0; i < read.length(); i++) { readBases[i] = (byte) read.charAt(read.length() - i - 1); baseQualities[i] = (byte) (qualities.charAt(read.length() - i - 1) - 33); } } else { for (i = 0; i < read.length(); i++) { readBases[i] = (byte) read.charAt(i); baseQualities[i] = (byte) (qualities.charAt(i) - 33); } } } // Add in soft-clipping if (null != softClipStartBases) { // prepend cigarElements.add(0, new CigarElement(softClipStartBases.length(), CigarOperator.S)); byte tmpBases[] = new byte[readBases.length + softClipStartBases.length()]; System.arraycopy(readBases, 0, tmpBases, softClipStartBases.length(), readBases.length); readBases = tmpBases; for (i = 0; i < softClipStartBases.length(); i++) { readBases[i] = (byte) softClipStartBases.charAt(i); } byte tmpQualities[] = new byte[baseQualities.length + softClipStartQualities.length()]; System.arraycopy( baseQualities, 0, tmpQualities, softClipStartQualities.length(), baseQualities.length); baseQualities = tmpQualities; for (i = 0; i < softClipStartQualities.length(); i++) { baseQualities[i] = (byte) softClipStartQualities.charAt(i); } } if (null != softClipEndBases) { // append cigarElements.add(new CigarElement(softClipEndBases.length(), CigarOperator.S)); byte tmpBases[] = new byte[readBases.length + softClipEndBases.length()]; System.arraycopy(readBases, 0, tmpBases, 0, readBases.length); for (i = 0; i < softClipEndBases.length(); i++) { tmpBases[i + readBases.length] = (byte) softClipEndBases.charAt(i); } readBases = tmpBases; byte tmpQualities[] = new byte[baseQualities.length + softClipEndQualities.length()]; System.arraycopy(baseQualities, 0, tmpQualities, 0, baseQualities.length); for (i = 0; i < softClipEndQualities.length(); i++) { tmpQualities[i + baseQualities.length] = (byte) softClipEndQualities.charAt(i); } baseQualities = tmpQualities; } // Update SAM record rec.setCigar(new Cigar(cigarElements)); rec.setAlignmentStart(alignmentStart); rec.setReadBases(readBases); rec.setBaseQualities(baseQualities); // Reset saved attributes Align.resetAttributes(rec, optFieldTags, optFieldValues); // Set new attributes if (space == SRMAUtil.Space.COLORSPACE) { // set the XE attribute for colorError string rec.setAttribute("XE", new String(colorErrors)); } rec.setAttribute("AS", bestAlignHeapNode.score); rec.setAttribute("XC", bestAlignHeapNode.alleleCoverageSum); rec.setAttribute("PG", programRecord.getId()); rec.setAttribute("NM", numEdits); }
public static void align( Graph graph, SAMRecord rec, Node recNode, ReferenceSequence sequence, SAMProgramRecord programRecord, int offset, AlleleCoverageCutoffs alleleCoverageCutoffs, boolean correctBases, boolean useSequenceQualities, int MAXIMUM_TOTAL_COVERAGE, int MAX_HEAP_SIZE) throws Exception { int i; AlignHeapNode curAlignHeapNode = null; AlignHeapNode nextAlignHeapNode = null; AlignHeapNode bestAlignHeapNode = null; AlignHeap heap = null; String read = null; // could be cs String readBases = null; // always nt String qualities = null; // could be cq SRMAUtil.Space space = SRMAUtil.Space.NTSPACE; ListIterator<NodeRecord> iter = null; AlignHeapNodeComparator comp = null; int alignmentStart = -1; int numStartNodesAdded = 0; boolean strand = rec.getReadNegativeStrandFlag(); // false -> forward, true -> reverse String softClipStartBases = null; String softClipStartQualities = null; String softClipEndBases = null; String softClipEndQualities = null; // Debugging stuff String readName = rec.getReadName(); assert SRMAUtil.Space.COLORSPACE != space; // Get space read = (String) rec.getAttribute("CS"); if (null == read) { // Use base space space = SRMAUtil.Space.NTSPACE; } else { // assumes CS and CQ are always in sequencing order space = SRMAUtil.Space.COLORSPACE; } // Get read and qualities if (space == SRMAUtil.Space.NTSPACE) { byte tmpRead[] = rec.getReadString().getBytes(); byte tmpQualities[] = rec.getBaseQualityString().getBytes(); // Reverse once if (strand) { // reverse SAMRecordUtil.reverseArray(tmpRead); SAMRecordUtil.reverseArray(tmpQualities); } read = new String(tmpRead); readBases = new String(tmpRead); qualities = new String(tmpQualities); // Reverse again if (strand) { // reverse SAMRecordUtil.reverseArray(tmpRead); SAMRecordUtil.reverseArray(tmpQualities); } } else { byte tmpRead[] = rec.getReadString().getBytes(); // Reverse once if (strand) { // reverse SAMRecordUtil.reverseArray(tmpRead); } readBases = new String(tmpRead); // Reverse again if (strand) { // reverse SAMRecordUtil.reverseArray(tmpRead); } read = SRMAUtil.normalizeColorSpaceRead(read); qualities = (String) rec.getAttribute("CQ"); // Some aligners include a quality value for the adapter. A quality value // IMHO should not be given for an unobserved (assumed) peice of data. Trim // the first quality in this case if (qualities.length() == 1 + read.length()) { // trim the first quality qualities = qualities.substring(1); } } // Reverse back if (readBases.length() <= 0) { throw new Exception("Error. The current alignment has no bases."); } if (read.length() <= 0) { throw new Exception("Error. The current alignment has no bases."); } if (qualities.length() <= 0) { throw new Exception("Error. The current alignment has no qualities."); } if (readBases.length() != read.length()) { if (space == SRMAUtil.Space.COLORSPACE) { throw new Exception( "Error. The current alignment's read bases length does not match the length of the colors in the CS tag [" + rec.getReadName() + "]."); } else { throw new Exception("Error. Internal error: readBases.length() != read.length()"); } } // Deal with soft-clipping // - save the soft clipped sequence for latter { List<CigarElement> cigarElements = null; cigarElements = rec.getCigar().getCigarElements(); CigarElement e1 = cigarElements.get(0); // first CigarElement e2 = cigarElements.get(cigarElements.size() - 1); // last // Soft-clipped if (CigarOperator.S == e1.getOperator()) { if (space == SRMAUtil.Space.COLORSPACE) { throw new Exception( "Error. Soft clipping with color-space data not currently supported."); } int l = e1.getLength(); if (strand) { // reverse softClipStartBases = readBases.substring(readBases.length() - l); softClipStartQualities = qualities.substring(qualities.length() - l); readBases = readBases.substring(0, readBases.length() - l); read = read.substring(0, read.length() - l); qualities = qualities.substring(0, qualities.length() - l); } else { softClipStartBases = readBases.substring(0, l - 1); softClipStartQualities = qualities.substring(0, l - 1); readBases = readBases.substring(l); read = read.substring(l); qualities = qualities.substring(l); } } if (CigarOperator.S == e2.getOperator()) { if (space == SRMAUtil.Space.COLORSPACE) { throw new Exception( "Error. Soft clipping with color-space data not currently supported."); } int l = e2.getLength(); if (strand) { // reverse softClipEndBases = readBases.substring(0, l - 1); softClipEndQualities = qualities.substring(0, l - 1); readBases = readBases.substring(l); read = read.substring(l); qualities = qualities.substring(l); } else { softClipEndBases = readBases.substring(readBases.length() - l); softClipEndQualities = qualities.substring(qualities.length() - l); readBases = readBases.substring(0, readBases.length() - l); read = read.substring(0, read.length() - l); qualities = qualities.substring(0, qualities.length() - l); } } } // Remove mate pair information Align.removeMateInfo(rec); comp = new AlignHeapNodeComparator( (strand) ? AlignHeap.HeapType.MAXHEAP : AlignHeap.HeapType.MINHEAP); // Bound by original alignment if possible bestAlignHeapNode = Align.boundWithOriginalAlignment( rec, graph, recNode, comp, strand, read, qualities, readBases, space, sequence, alleleCoverageCutoffs, useSequenceQualities, MAXIMUM_TOTAL_COVERAGE, MAX_HEAP_SIZE); /* System.err.println("readName="+rec.getReadName()); if(null != bestAlignHeapNode) { System.err.println("\nFOUND BEST:" + rec.toString()); } else { System.err.println("\nNOT FOUND (BEST): " + rec.toString()); } Align.updateSAM(rec, programRecord, bestAlignHeapNode, space, read, qualities, softClipStartBases, softClipStartQualities, softClipEndBases, softClipEndQualities, strand, correctBases); return; */ heap = new AlignHeap((strand) ? AlignHeap.HeapType.MAXHEAP : AlignHeap.HeapType.MINHEAP); // Add start nodes if (strand) { // reverse alignmentStart = rec.getAlignmentEnd(); for (i = alignmentStart + offset; alignmentStart - offset <= i; i--) { int position = graph.getPriorityQueueIndexAtPositionOrBefore(i); PriorityQueue<Node> startNodeQueue = graph.getPriorityQueue(position); if (0 != position && null != startNodeQueue) { Iterator<Node> startNodeQueueIter = startNodeQueue.iterator(); while (startNodeQueueIter.hasNext()) { Node startNode = startNodeQueueIter.next(); int f = passFilters(graph, startNode, alleleCoverageCutoffs, MAXIMUM_TOTAL_COVERAGE); if (0 == f) { heap.add( new AlignHeapNode( null, startNode, startNode.coverage, read.charAt(0), qualities.charAt(0), useSequenceQualities, space)); } else if (f < 0) { return; } if (startNode.position < i) { i = startNode.position; } numStartNodesAdded++; } } } } else { alignmentStart = rec.getAlignmentStart(); for (i = alignmentStart - offset; i <= alignmentStart + offset; i++) { int position = graph.getPriorityQueueIndexAtPositionOrGreater(i); PriorityQueue<Node> startNodeQueue = graph.getPriorityQueue(position); if (0 != position && null != startNodeQueue) { Iterator<Node> startNodeQueueIter = startNodeQueue.iterator(); while (startNodeQueueIter.hasNext()) { Node startNode = startNodeQueueIter.next(); int f = passFilters(graph, startNode, alleleCoverageCutoffs, MAXIMUM_TOTAL_COVERAGE); if (0 == f) { heap.add( new AlignHeapNode( null, startNode, startNode.coverage, read.charAt(0), qualities.charAt(0), useSequenceQualities, space)); } else if (f < 0) { return; } if (i < startNode.position) { i = startNode.position; } numStartNodesAdded++; } } } } if (numStartNodesAdded == 0) { throw new Exception("Did not add any start nodes!"); } // Get first node off the heap curAlignHeapNode = heap.poll(); while (null != curAlignHeapNode) { if (MAX_HEAP_SIZE <= heap.size()) { // too many to consider return; } // System.err.println("strand:" + strand + "\tsize:" + heap.size() + "\talignmentStart:" + // alignmentStart + "\toffset:" + offset + "\treadOffset:" + curAlignHeapNode.readOffset); // System.err.print("size:" + heap.size() + ":" + curAlignHeapNode.readOffset + ":" + // curAlignHeapNode.score + ":" + curAlignHeapNode.alleleCoverageSum + ":" + // curAlignHeapNode.startPosition + "\t"); // curAlignHeapNode.node.print(System.err); // System.err.print("\rposition:" + curAlignHeapNode.node.position + "\treadOffset:" + // curAlignHeapNode.readOffset); // Remove all non-insertions with the same contig/pos/read-offset/type/base and lower score nextAlignHeapNode = heap.peek(); while (Node.INSERTION != curAlignHeapNode.node.type && null != nextAlignHeapNode && 0 == comp.compare(curAlignHeapNode, nextAlignHeapNode)) { if (curAlignHeapNode.score < nextAlignHeapNode.score || (curAlignHeapNode.score == nextAlignHeapNode.score && curAlignHeapNode.alleleCoverageSum < nextAlignHeapNode.alleleCoverageSum)) { // Update current node curAlignHeapNode = heap.poll(); } else { // Ignore next node heap.poll(); } nextAlignHeapNode = heap.peek(); } nextAlignHeapNode = null; // Check if the alignment is complete if (curAlignHeapNode.readOffset == read.length() - 1) { // All read bases examined, store if has the best alignment. // System.err.print(curAlignHeapNode.alleleCoverageSum + ":" + curAlignHeapNode.score + // ":"); // System.err.print(curAlignHeapNode.startPosition + ":"); // curAlignHeapNode.node.print(System.err); if (null == bestAlignHeapNode || bestAlignHeapNode.score < curAlignHeapNode.score || (bestAlignHeapNode.score == curAlignHeapNode.score && bestAlignHeapNode.alleleCoverageSum < curAlignHeapNode.alleleCoverageSum)) { bestAlignHeapNode = curAlignHeapNode; } } else if (null != bestAlignHeapNode && curAlignHeapNode.score < bestAlignHeapNode.score) { // ignore, under the assumption that scores can only become more negative. } else { if (strand) { // reverse // Go to all the "prev" nodes iter = curAlignHeapNode.node.prev.listIterator(); } else { // forward // Go to all "next" nodes iter = curAlignHeapNode.node.next.listIterator(); } while (iter.hasNext()) { NodeRecord next = iter.next(); int f = passFilters( graph, next.node, next.coverage, alleleCoverageCutoffs, MAXIMUM_TOTAL_COVERAGE); if (0 == f) { heap.add( new AlignHeapNode( curAlignHeapNode, next.node, next.coverage, read.charAt(curAlignHeapNode.readOffset + 1), qualities.charAt(curAlignHeapNode.readOffset + 1), useSequenceQualities, space)); } else if (f < 0) { return; } } iter = null; } // Get next node curAlignHeapNode = heap.poll(); } // Recover alignment Align.updateSAM( rec, sequence, programRecord, bestAlignHeapNode, space, read, qualities, softClipStartBases, softClipStartQualities, softClipEndBases, softClipEndQualities, strand, correctBases); }
@Override public void execute() { log.info("Initializing kmer code map..."); Map<Character, Integer> kmerCodeIndices = new HashMap<Character, Integer>(); kmerCodeIndices.put('0', 1); kmerCodeIndices.put('A', 3); kmerCodeIndices.put('B', 4); kmerCodeIndices.put('C', 5); kmerCodeIndices.put('_', 6); kmerCodeIndices.put('.', 7); kmerCodeIndices.put('1', 9); Map<Character, String> kmerCodeNames = new LinkedHashMap<Character, String>(); kmerCodeNames.put('0', "ref0"); kmerCodeNames.put('A', "repetitive"); kmerCodeNames.put('B', "both"); kmerCodeNames.put('C', "lowcoverage"); kmerCodeNames.put('_', "lowconfidence"); kmerCodeNames.put('.', "novel"); kmerCodeNames.put('1', "ref1"); if (KMER_CODE_NAMES != null) { for (Character c : kmerCodeNames.keySet()) { String cStr = String.valueOf(c); if (KMER_CODE_NAMES.containsKey(cStr)) { kmerCodeNames.put(c, KMER_CODE_NAMES.get(cStr)); } } } for (Character c : kmerCodeNames.keySet()) { log.info(" {} {}: {}", c, kmerCodeIndices.get(c), kmerCodeNames.get(c)); } log.info("Loading annotated contigs..."); Map<String, Map<String, String>> annotatedContigs = new HashMap<String, Map<String, String>>(); int kmerSize = 0; if (ANN.length() > 0) { TableReader tr = new TableReader(ANN); for (Map<String, String> te : tr) { String contigName = te.get("contigName"); if (kmerSize == 0) { kmerSize = te.get("seq").length() - te.get("kmerOrigin").length() + 1; } annotatedContigs.put(contigName, te); String[] ref0ToCanonicalExact = (te.get("ref0ToCanonicalExact").equals("NA") || te.get("ref0ToCanonicalExact").equals("*:0-0") ? "NA:0-0" : te.get("ref0ToCanonicalExact")) .split("[:-]"); String[] ref1ToCanonicalExact = (te.get("ref1ToCanonicalExact").equals("NA") || te.get("ref1ToCanonicalExact").equals("*:0-0") ? "NA:0-0" : te.get("ref1ToCanonicalExact")) .split("[:-]"); cout.println( te.get("sampleName") + "_" + te.get("accession") + "_" + contigName + " " + ref0ToCanonicalExact[0] + " " + ref0ToCanonicalExact[1] + " " + ref0ToCanonicalExact[2] + " radius1=0.8r"); cout.println( te.get("sampleName") + "_" + te.get("accession") + "_" + contigName + " " + ref1ToCanonicalExact[0] + " " + ref1ToCanonicalExact[1] + " " + ref1ToCanonicalExact[2] + " radius2=0.6r"); } } log.info(" contigs: {}", annotatedContigs.size()); log.info(" kmer size: {}", kmerSize); log.info("Computing kmer inheritance information..."); SAMFileHeader sfh = CONTIGS.getFileHeader(); for (Character c : kmerCodeNames.keySet()) { SAMReadGroupRecord rgr = new SAMReadGroupRecord(kmerCodeNames.get(c)); rgr.setSample(kmerCodeNames.get(c)); sfh.addReadGroup(rgr); } SAMFileWriterFactory sfwf = new SAMFileWriterFactory(); sfwf.setCreateIndex(true); SAMFileWriter sfw = sfwf.makeBAMWriter(sfh, false, bout); TableWriter tw = new TableWriter(sout); Set<IGVEntry> igvEntries = new TreeSet<IGVEntry>(); int numContigs = 0; for (SAMRecord contig : CONTIGS) { if (CONTIG_NAMES == null || CONTIG_NAMES.isEmpty() || CONTIG_NAMES.contains(contig.getReadName())) { Map<String, String> te = annotatedContigs.get(contig.getReadName()); if (annotatedContigs.containsKey(contig.getReadName())) { String seq = contig.getReadString(); // log.debug(" te: {}", te); String annSeq = te.get("seq"); String kmerOrigin = te.get("kmerOrigin"); Map<CortexKmer, Character> kmerCodes = new HashMap<CortexKmer, Character>(); for (int i = 0; i < kmerOrigin.length(); i++) { CortexKmer kmer = new CortexKmer(annSeq.substring(i, i + kmerSize)); Character code = kmerOrigin.charAt(i); kmerCodes.put(kmer, code); } Map<Character, Integer> kmerStats = new HashMap<Character, Integer>(); for (Character c : kmerCodeNames.keySet()) { kmerStats.put(c, 0); } boolean changed = false; // We want to be able to examine soft-clipped regions as well. List<CigarElement> ces = new ArrayList<CigarElement>(); for (CigarElement ce : contig.getCigar().getCigarElements()) { if (ce.getOperator().equals(CigarOperator.S)) { ces.add(new CigarElement(ce.getLength(), CigarOperator.M)); changed = true; } else { ces.add(ce); } } if (changed) { CigarElement firstCe = contig.getCigar().getCigarElements().get(0); if (firstCe.getOperator().equals(CigarOperator.S)) { contig.setAlignmentStart(contig.getAlignmentStart() - firstCe.getLength()); } contig.setCigar(new Cigar(ces)); } for (AlignmentBlock ab : contig.getAlignmentBlocks()) { for (int i = ab.getReadStart() - 1; i < ab.getReadStart() + ab.getLength(); i++) { if (i + kmerSize < seq.length()) { CortexKmer kmer = new CortexKmer(seq.substring(i, i + kmerSize)); SAMRecord skmer = new SAMRecord(CONTIGS.getFileHeader()); skmer.setReadBases(seq.substring(i, i + kmerSize).getBytes()); List<CigarElement> cigarElements = new ArrayList<CigarElement>(); cigarElements.add(new CigarElement(kmerSize, CigarOperator.M)); Cigar cigar = new Cigar(cigarElements); skmer.setReadName(contig.getReadName() + "." + kmer.getKmerAsString()); skmer.setReferenceName(contig.getReferenceName()); skmer.setCigar(cigar); skmer.setReadPairedFlag(false); skmer.setDuplicateReadFlag(false); skmer.setMateNegativeStrandFlag(false); skmer.setAlignmentStart(ab.getReferenceStart() - ab.getReadStart() + 1 + i); skmer.setAttribute("RG", "none"); skmer.setMappingQuality(0); Character c = kmerCodes.get(kmer); String codeName = kmerCodeNames.get(c); String parentReadGroupId = null; String sampleReadGroupId = null; for (SAMReadGroupRecord rgr : sfh.getReadGroups()) { if (rgr.getSample().equals(codeName)) { parentReadGroupId = rgr.getReadGroupId(); } if (rgr.getSample().equals(contig.getReadGroup().getSample())) { sampleReadGroupId = rgr.getReadGroupId(); } } skmer.setAttribute( "RG", parentReadGroupId != null ? parentReadGroupId : sampleReadGroupId); skmer.setMappingQuality(99); sfw.addAlignment(skmer); kmerStats.put(c, kmerStats.get(c) + 1); IGVEntry igvEntry = new IGVEntry(); igvEntry.chromosome = contig.getReferenceName(); igvEntry.start = ab.getReferenceStart() - ab.getReadStart() + i; igvEntry.parentageName = kmerCodeNames.get(c); igvEntry.parentage = kmerCodeIndices.get(c); igvEntries.add(igvEntry); } } } if (!contig.isSecondaryOrSupplementary()) { beout.println( contig.getReferenceName() + "\t" + contig.getAlignmentStart() + "\t" + contig.getAlignmentEnd() + "\t" + contig.getReadName() + "." + contig.getReadGroup().getSample()); if (annotatedContigs.size() > 10 && numContigs % (annotatedContigs.size() / 10) == 0) { log.info(" processed {}/{} contigs", numContigs, annotatedContigs.size()); } numContigs++; } Map<String, String> stats = new LinkedHashMap<String, String>(); stats.put("contigName", contig.getReadName()); stats.put("sampleName", contig.getReadGroup().getSample()); for (Character c : kmerCodeNames.keySet()) { stats.put(kmerCodeNames.get(c), String.valueOf(kmerStats.get(c))); } tw.addEntry(stats); } } } log.info("Writing kmer inheritance information..."); out.printf("%s\t%s\t%s\t%s\t%s\n", "Chromosome", "Start", "End", "Feature", "Parentage"); for (IGVEntry igvEntry : igvEntries) { out.printf( "%s\t%d\t%d\t%s\t%d\n", igvEntry.chromosome, igvEntry.start, igvEntry.start + 1, igvEntry.parentageName, igvEntry.parentage); } sfw.close(); }