/** * Creates a map with each event in the read (cigar operator) and the read coordinate where it * happened. * * <p>Example: D -> 2, 34, 75 I -> 55 S -> 0, 101 H -> 101 * * @param read the read * @return a map with the properties described above. See example */ public static Map<CigarOperator, ArrayList<Integer>> getCigarOperatorForAllBases( GATKSAMRecord read) { Map<CigarOperator, ArrayList<Integer>> events = new HashMap<CigarOperator, ArrayList<Integer>>(); int position = 0; for (CigarElement cigarElement : read.getCigar().getCigarElements()) { CigarOperator op = cigarElement.getOperator(); if (op.consumesReadBases()) { ArrayList<Integer> list = events.get(op); if (list == null) { list = new ArrayList<Integer>(); events.put(op, list); } for (int i = position; i < cigarElement.getLength(); i++) list.add(position++); } else { ArrayList<Integer> list = events.get(op); if (list == null) { list = new ArrayList<Integer>(); events.put(op, list); } list.add(position); } } return events; }
/** * Returns the coverage distribution of a single read within the desired region. * * <p>Note: This function counts DELETIONS as coverage (since the main purpose is to downsample * reads for variant regions, and deletions count as variants) * * @param read the read to get the coverage distribution of * @param startLocation the first reference coordinate of the region (inclusive) * @param stopLocation the last reference coordinate of the region (inclusive) * @return an array with the coverage of each position from startLocation to stopLocation */ public static int[] getCoverageDistributionOfRead( GATKSAMRecord read, int startLocation, int stopLocation) { int[] coverage = new int[stopLocation - startLocation + 1]; int refLocation = read.getSoftStart(); for (CigarElement cigarElement : read.getCigar().getCigarElements()) { switch (cigarElement.getOperator()) { case S: case M: case EQ: case N: case X: case D: for (int i = 0; i < cigarElement.getLength(); i++) { if (refLocation >= startLocation && refLocation <= stopLocation) { int baseCount = read.isReducedRead() ? read.getReducedCount(refLocation - read.getSoftStart()) : 1; coverage[refLocation - startLocation] += baseCount; // this may be a reduced read, so add the proper number of bases } refLocation++; } break; case P: case I: case H: break; } if (refLocation > stopLocation) break; } return coverage; }
public static Pair<Boolean, CigarElement> readStartsWithInsertion(final Cigar cigar) { for (CigarElement cigarElement : cigar.getCigarElements()) { if (cigarElement.getOperator() == CigarOperator.INSERTION) return new Pair<Boolean, CigarElement>(true, cigarElement); else if (cigarElement.getOperator() != CigarOperator.HARD_CLIP && cigarElement.getOperator() != CigarOperator.SOFT_CLIP) break; } return new Pair<Boolean, CigarElement>(false, null); }
/** * Hard clips any leading insertions in the read. Only looks at the beginning of the read, not the * end. * * @return a new read without leading insertions */ private GATKSAMRecord hardClipLeadingInsertions() { if (read.isEmpty()) return read; for (CigarElement cigarElement : read.getCigar().getCigarElements()) { if (cigarElement.getOperator() != CigarOperator.HARD_CLIP && cigarElement.getOperator() != CigarOperator.SOFT_CLIP && cigarElement.getOperator() != CigarOperator.INSERTION) break; else if (cigarElement.getOperator() == CigarOperator.INSERTION) this.addOp(new ClippingOp(0, cigarElement.getLength() - 1)); } return clipRead(ClippingRepresentation.HARDCLIP_BASES); }
/** * Calculates the reference coordinate for a read coordinate * * @param read the read * @param offset the base in the read (coordinate in the read) * @return the reference coordinate correspondent to this base */ public static long getReferenceCoordinateForReadCoordinate(GATKSAMRecord read, int offset) { if (offset > read.getReadLength()) throw new ReviewedStingException( String.format(OFFSET_OUT_OF_BOUNDS_EXCEPTION, offset, read.getReadLength())); long location = read.getAlignmentStart(); Iterator<CigarElement> cigarElementIterator = read.getCigar().getCigarElements().iterator(); while (offset > 0 && cigarElementIterator.hasNext()) { CigarElement cigarElement = cigarElementIterator.next(); long move = 0; if (cigarElement.getOperator().consumesReferenceBases()) move = (long) Math.min(cigarElement.getLength(), offset); location += move; offset -= move; } if (offset > 0 && !cigarElementIterator.hasNext()) throw new ReviewedStingException(OFFSET_NOT_ZERO_EXCEPTION); return location; }
/** * Will hard clip every soft clipped bases in the read. * * @return a new read without the soft clipped bases */ private GATKSAMRecord hardClipSoftClippedBases() { if (read.isEmpty()) return read; int readIndex = 0; int cutLeft = -1; // first position to hard clip (inclusive) int cutRight = -1; // first position to hard clip (inclusive) boolean rightTail = false; // trigger to stop clipping the left tail and start cutting the right tail for (CigarElement cigarElement : read.getCigar().getCigarElements()) { if (cigarElement.getOperator() == CigarOperator.SOFT_CLIP) { if (rightTail) { cutRight = readIndex; } else { cutLeft = readIndex + cigarElement.getLength() - 1; } } else if (cigarElement.getOperator() != CigarOperator.HARD_CLIP) rightTail = true; if (cigarElement.getOperator().consumesReadBases()) readIndex += cigarElement.getLength(); } // It is extremely important that we cut the end first otherwise the read coordinates change. if (cutRight >= 0) this.addOp(new ClippingOp(cutRight, read.getReadLength() - 1)); if (cutLeft >= 0) this.addOp(new ClippingOp(0, cutLeft)); return clipRead(ClippingRepresentation.HARDCLIP_BASES); }
/** Steps forward on the genome. Returns false when done reading the read, true otherwise. */ public boolean stepForwardOnGenome() { if (currentOperatorIndex == numOperators) return false; CigarElement curElement = read.getCigar().getCigarElement(currentOperatorIndex); if (currentPositionOnOperator >= curElement.getLength()) { if (++currentOperatorIndex == numOperators) return false; curElement = read.getCigar().getCigarElement(currentOperatorIndex); currentPositionOnOperator = 0; } switch (curElement.getOperator()) { case I: // insertion w.r.t. the reference if (!sawMop) break; case S: // soft clip currentReadOffset += curElement.getLength(); case H: // hard clip case P: // padding currentOperatorIndex++; return stepForwardOnGenome(); case D: // deletion w.r.t. the reference case N: // reference skip (looks and gets processed just like a "deletion", just different // logical meaning) currentPositionOnOperator++; break; case M: case EQ: case X: sawMop = true; currentReadOffset++; currentPositionOnOperator++; break; default: throw new IllegalStateException("No support for cigar op: " + curElement.getOperator()); } final boolean isFirstOp = currentOperatorIndex == 0; final boolean isLastOp = currentOperatorIndex == numOperators - 1; final boolean isFirstBaseOfOp = currentPositionOnOperator == 1; final boolean isLastBaseOfOp = currentPositionOnOperator == curElement.getLength(); isBeforeDeletionStart = isBeforeOp( read.getCigar(), currentOperatorIndex, CigarOperator.D, isLastOp, isLastBaseOfOp); isBeforeDeletedBase = isBeforeDeletionStart || (!isLastBaseOfOp && curElement.getOperator() == CigarOperator.D); isAfterDeletionEnd = isAfterOp( read.getCigar(), currentOperatorIndex, CigarOperator.D, isFirstOp, isFirstBaseOfOp); isAfterDeletedBase = isAfterDeletionEnd || (!isFirstBaseOfOp && curElement.getOperator() == CigarOperator.D); isBeforeInsertion = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isLastOp, isLastBaseOfOp) || (!sawMop && curElement.getOperator() == CigarOperator.I); isAfterInsertion = isAfterOp( read.getCigar(), currentOperatorIndex, CigarOperator.I, isFirstOp, isFirstBaseOfOp); isNextToSoftClip = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isLastOp, isLastBaseOfOp) || isAfterOp( read.getCigar(), currentOperatorIndex, CigarOperator.S, isFirstOp, isFirstBaseOfOp); return true; }
@Override public int doWork(String[] args) { File refFile = null; com.github.lindenb.jvarkit.util.cli.GetOpt getopt = new com.github.lindenb.jvarkit.util.cli.GetOpt(); int c; while ((c = getopt.getopt(args, "hvL:r:")) != -1) { switch (c) { case 'h': printUsage(); return 0; case 'v': System.out.println(getVersion()); return 0; case 'L': getLogger().setLevel(java.util.logging.Level.parse(getopt.getOptArg())); break; case 'r': refFile = new File(getopt.getOptArg()); break; case ':': System.err.println("Missing argument for option -" + getopt.getOptOpt()); return -1; default: System.err.println("Unknown option -" + getopt.getOptOpt()); return -1; } } if (refFile == null) { error("Undefined REF file"); return -1; } File bamFile = null; if (getopt.getOptInd() + 1 != args.length) { info("reading from stdin."); } else { bamFile = new File(args[getopt.getOptInd()]); } IndexedFastaSequenceFile indexedFastaSequenceFile = null; SAMFileReader samFileReader = null; try { GenomicSequence genomicSequence = null; indexedFastaSequenceFile = new IndexedFastaSequenceFile(refFile); SAMFileReader.setDefaultValidationStringency(ValidationStringency.SILENT); samFileReader = null; if (bamFile == null) { samFileReader = new SAMFileReader(System.in); } else { samFileReader = new SAMFileReader(bamFile); } XMLOutputFactory xmlfactory = XMLOutputFactory.newInstance(); XMLStreamWriter w = xmlfactory.createXMLStreamWriter(System.out, "UTF-8"); w.writeStartDocument("UTF-8", "1.0"); w.writeStartElement("sam"); w.writeComment(getProgramCommandLine()); w.writeAttribute("ref", (bamFile == null ? "stdin" : bamFile.getPath())); w.writeAttribute("bam", args[1]); SAMRecordIterator iter = samFileReader.iterator(); while (iter.hasNext()) { SAMRecord rec = iter.next(); final byte readbases[] = rec.getReadBases(); w.writeStartElement("read"); w.writeStartElement("name"); w.writeCharacters(rec.getReadName()); w.writeEndElement(); w.writeStartElement("sequence"); w.writeCharacters(new String(readbases)); w.writeEndElement(); w.writeStartElement("flags"); w.writeAttribute("paired", String.valueOf(rec.getReadPairedFlag())); w.writeAttribute( "failsVendorQual", String.valueOf(rec.getReadFailsVendorQualityCheckFlag())); w.writeAttribute("mapped", String.valueOf(!rec.getReadUnmappedFlag())); w.writeAttribute("strand", (rec.getReadNegativeStrandFlag() ? "-" : "+")); if (rec.getReadPairedFlag()) { w.writeAttribute("mate-mapped", String.valueOf(!rec.getMateUnmappedFlag())); w.writeAttribute("mate-strand", (rec.getMateNegativeStrandFlag() ? "-" : "+")); w.writeAttribute("proper-pair", String.valueOf(rec.getProperPairFlag())); } w.writeCharacters(String.valueOf(rec.getFlags())); w.writeEndElement(); if (!rec.getReadUnmappedFlag()) { w.writeStartElement("qual"); w.writeCharacters(String.valueOf(rec.getMappingQuality())); w.writeEndElement(); w.writeStartElement("chrom"); w.writeAttribute("index", String.valueOf(rec.getReferenceIndex())); w.writeCharacters(rec.getReferenceName()); w.writeEndElement(); w.writeStartElement("pos"); w.writeCharacters(String.valueOf(rec.getAlignmentStart())); w.writeEndElement(); w.writeStartElement("cigar"); w.writeCharacters(rec.getCigarString()); w.writeEndElement(); } if (!rec.getMateUnmappedFlag()) { w.writeStartElement("mate-chrom"); w.writeAttribute("index", String.valueOf(rec.getMateReferenceIndex())); w.writeCharacters(rec.getMateReferenceName()); w.writeEndElement(); w.writeStartElement("mate-pos"); w.writeCharacters(String.valueOf(rec.getMateAlignmentStart())); w.writeEndElement(); } if (!rec.getReadUnmappedFlag()) { if (genomicSequence == null || genomicSequence.getChrom().equals(rec.getReferenceName())) { genomicSequence = new GenomicSequence(indexedFastaSequenceFile, rec.getReferenceName()); } w.writeStartElement("align"); int readIndex = 0; int refIndex = rec.getAlignmentStart(); for (final CigarElement e : rec.getCigar().getCigarElements()) { switch (e.getOperator()) { case H: break; // ignore hard clips case P: break; // ignore pads case I: // cont. case S: { final int length = e.getLength(); for (int i = 0; i < length; ++i) { w.writeEmptyElement(e.getOperator().name()); w.writeAttribute("read-index", String.valueOf(readIndex + 1)); if (readIndex >= 0 && readIndex < readbases.length) { w.writeAttribute("read-base", String.valueOf((char) (readbases[readIndex]))); } readIndex++; } break; } case N: // cont. -- reference skip case D: { final int length = e.getLength(); for (int i = 0; i < length; ++i) { w.writeEmptyElement(e.getOperator().name()); w.writeAttribute("ref-index", String.valueOf(refIndex)); if (refIndex >= 1 && refIndex <= genomicSequence.length()) { w.writeAttribute( "ref-base", String.valueOf(genomicSequence.charAt(refIndex - 1))); } refIndex++; } break; } case M: case EQ: case X: { final int length = e.getLength(); for (int i = 0; i < length; ++i) { w.writeEmptyElement(e.getOperator().name()); char baseRead = '\0'; if (readIndex >= 0 && readIndex < readbases.length) { baseRead = (char) (rec.getReadBases()[readIndex]); w.writeAttribute("read-index", String.valueOf(readIndex + 1)); w.writeAttribute("read-base", String.valueOf(baseRead)); } w.writeAttribute("ref-index", String.valueOf(refIndex)); if (refIndex >= 1 && refIndex <= genomicSequence.length()) { char baseRef = genomicSequence.charAt(refIndex - 1); w.writeAttribute("ref-base", String.valueOf(baseRef)); if (Character.toUpperCase(baseRef) != Character.toUpperCase(baseRead)) { w.writeAttribute("mismatch", "true"); } } refIndex++; readIndex++; } break; } default: throw new IllegalStateException( "Case statement didn't deal with cigar op: " + e.getOperator()); } } } w.writeEndElement(); w.writeEndElement(); iter.close(); w.writeEndElement(); } w.writeEndElement(); w.writeEndDocument(); w.flush(); w.close(); } catch (Exception err) { error(err); return -1; } finally { CloserUtil.close(samFileReader); CloserUtil.close(indexedFastaSequenceFile); } return 0; }
private void collectQualityData(final SAMRecord record, final ReferenceSequence reference) { // If the read isnt an aligned PF read then look at the read for no-calls if (record.getReadUnmappedFlag() || record.getReadFailsVendorQualityCheckFlag() || !doRefMetrics) { final byte[] readBases = record.getReadBases(); for (int i = 0; i < readBases.length; i++) { if (SequenceUtil.isNoCall(readBases[i])) { badCycleHistogram.increment( CoordMath.getCycle(record.getReadNegativeStrandFlag(), readBases.length, i)); } } } else if (!record.getReadFailsVendorQualityCheckFlag()) { final boolean highQualityMapping = isHighQualityMapping(record); if (highQualityMapping) metrics.PF_HQ_ALIGNED_READS++; final byte[] readBases = record.getReadBases(); final byte[] refBases = reference.getBases(); final byte[] qualities = record.getBaseQualities(); final int refLength = refBases.length; long mismatchCount = 0; long hqMismatchCount = 0; for (final AlignmentBlock alignmentBlock : record.getAlignmentBlocks()) { final int readIndex = alignmentBlock.getReadStart() - 1; final int refIndex = alignmentBlock.getReferenceStart() - 1; final int length = alignmentBlock.getLength(); for (int i = 0; i < length && refIndex + i < refLength; ++i) { final int readBaseIndex = readIndex + i; boolean mismatch = !SequenceUtil.basesEqual(readBases[readBaseIndex], refBases[refIndex + i]); boolean bisulfiteBase = false; if (mismatch && isBisulfiteSequenced) { if ((record.getReadNegativeStrandFlag() && (refBases[refIndex + i] == 'G' || refBases[refIndex + i] == 'g') && (readBases[readBaseIndex] == 'A' || readBases[readBaseIndex] == 'a')) || ((!record.getReadNegativeStrandFlag()) && (refBases[refIndex + i] == 'C' || refBases[refIndex + i] == 'c') && (readBases[readBaseIndex] == 'T') || readBases[readBaseIndex] == 't')) { bisulfiteBase = true; mismatch = false; } } if (mismatch) mismatchCount++; metrics.PF_ALIGNED_BASES++; if (!bisulfiteBase) nonBisulfiteAlignedBases++; if (highQualityMapping) { metrics.PF_HQ_ALIGNED_BASES++; if (!bisulfiteBase) hqNonBisulfiteAlignedBases++; if (qualities[readBaseIndex] >= BASE_QUALITY_THRESHOLD) metrics.PF_HQ_ALIGNED_Q20_BASES++; if (mismatch) hqMismatchCount++; } if (mismatch || SequenceUtil.isNoCall(readBases[readBaseIndex])) { badCycleHistogram.increment( CoordMath.getCycle(record.getReadNegativeStrandFlag(), readBases.length, i)); } } } mismatchHistogram.increment(mismatchCount); hqMismatchHistogram.increment(hqMismatchCount); // Add any insertions and/or deletions to the global count for (final CigarElement elem : record.getCigar().getCigarElements()) { final CigarOperator op = elem.getOperator(); if (op == CigarOperator.INSERTION || op == CigarOperator.DELETION) ++this.indels; } } }
/** * Is this read all insertion? * * @param read * @return whether or not the only element in the cigar string is an Insertion */ public static boolean readIsEntirelyInsertion(GATKSAMRecord read) { for (CigarElement cigarElement : read.getCigar().getCigarElements()) { if (cigarElement.getOperator() != CigarOperator.INSERTION) return false; } return true; }
public static Pair<Integer, Boolean> getReadCoordinateForReferenceCoordinate( final int alignmentStart, final Cigar cigar, final int refCoord, final boolean allowGoalNotReached) { int readBases = 0; int refBases = 0; boolean fallsInsideDeletion = false; int goal = refCoord - alignmentStart; // The goal is to move this many reference bases if (goal < 0) { if (allowGoalNotReached) { return new Pair<Integer, Boolean>(CLIPPING_GOAL_NOT_REACHED, false); } else { throw new ReviewedStingException( "Somehow the requested coordinate is not covered by the read. Too many deletions?"); } } boolean goalReached = refBases == goal; Iterator<CigarElement> cigarElementIterator = cigar.getCigarElements().iterator(); while (!goalReached && cigarElementIterator.hasNext()) { CigarElement cigarElement = cigarElementIterator.next(); int shift = 0; if (cigarElement.getOperator().consumesReferenceBases() || cigarElement.getOperator() == CigarOperator.SOFT_CLIP) { if (refBases + cigarElement.getLength() < goal) shift = cigarElement.getLength(); else shift = goal - refBases; refBases += shift; } goalReached = refBases == goal; if (!goalReached && cigarElement.getOperator().consumesReadBases()) readBases += cigarElement.getLength(); if (goalReached) { // Is this base's reference position within this cigar element? Or did we use it all? boolean endsWithinCigar = shift < cigarElement.getLength(); // If it isn't, we need to check the next one. There should *ALWAYS* be a next one // since we checked if the goal coordinate is within the read length, so this is just a // sanity check. if (!endsWithinCigar && !cigarElementIterator.hasNext()) { if (allowGoalNotReached) { return new Pair<Integer, Boolean>(CLIPPING_GOAL_NOT_REACHED, false); } else { throw new ReviewedStingException( "Reference coordinate corresponds to a non-existent base in the read. This should never happen -- call Mauricio"); } } CigarElement nextCigarElement; // if we end inside the current cigar element, we just have to check if it is a deletion if (endsWithinCigar) fallsInsideDeletion = cigarElement.getOperator() == CigarOperator.DELETION; // if we end outside the current cigar element, we need to check if the next element is an // insertion or deletion. else { nextCigarElement = cigarElementIterator.next(); // if it's an insertion, we need to clip the whole insertion before looking at the next // element if (nextCigarElement.getOperator() == CigarOperator.INSERTION) { readBases += nextCigarElement.getLength(); if (!cigarElementIterator.hasNext()) { if (allowGoalNotReached) { return new Pair<Integer, Boolean>(CLIPPING_GOAL_NOT_REACHED, false); } else { throw new ReviewedStingException( "Reference coordinate corresponds to a non-existent base in the read. This should never happen -- call Mauricio"); } } nextCigarElement = cigarElementIterator.next(); } // if it's a deletion, we will pass the information on to be handled downstream. fallsInsideDeletion = nextCigarElement.getOperator() == CigarOperator.DELETION; } // If we reached our goal outside a deletion, add the shift if (!fallsInsideDeletion && cigarElement.getOperator().consumesReadBases()) readBases += shift; // If we reached our goal inside a deletion, but the deletion is the next cigar element then // we need // to add the shift of the current cigar element but go back to it's last element to return // the last // base before the deletion (see warning in function contracts) else if (fallsInsideDeletion && !endsWithinCigar) readBases += shift - 1; // If we reached our goal inside a deletion then we must backtrack to the last base before // the deletion else if (fallsInsideDeletion && endsWithinCigar) readBases--; } } if (!goalReached) { if (allowGoalNotReached) { return new Pair<Integer, Boolean>(CLIPPING_GOAL_NOT_REACHED, false); } else { throw new ReviewedStingException( "Somehow the requested coordinate is not covered by the read. Alignment " + alignmentStart + " | " + cigar); } } return new Pair<Integer, Boolean>(readBases, fallsInsideDeletion); }
/** * If a read ends in INSERTION, returns the last element length. * * <p>Warning: If the read has Hard or Soft clips after the insertion this function will return 0. * * @param read * @return the length of the last insertion, or 0 if there is none (see warning). */ public static final int getLastInsertionOffset(SAMRecord read) { CigarElement e = read.getCigar().getCigarElement(read.getCigarLength() - 1); if (e.getOperator() == CigarOperator.I) return e.getLength(); else return 0; }
public static void align( Graph graph, SAMRecord rec, Node recNode, ReferenceSequence sequence, SAMProgramRecord programRecord, int offset, AlleleCoverageCutoffs alleleCoverageCutoffs, boolean correctBases, boolean useSequenceQualities, int MAXIMUM_TOTAL_COVERAGE, int MAX_HEAP_SIZE) throws Exception { int i; AlignHeapNode curAlignHeapNode = null; AlignHeapNode nextAlignHeapNode = null; AlignHeapNode bestAlignHeapNode = null; AlignHeap heap = null; String read = null; // could be cs String readBases = null; // always nt String qualities = null; // could be cq SRMAUtil.Space space = SRMAUtil.Space.NTSPACE; ListIterator<NodeRecord> iter = null; AlignHeapNodeComparator comp = null; int alignmentStart = -1; int numStartNodesAdded = 0; boolean strand = rec.getReadNegativeStrandFlag(); // false -> forward, true -> reverse String softClipStartBases = null; String softClipStartQualities = null; String softClipEndBases = null; String softClipEndQualities = null; // Debugging stuff String readName = rec.getReadName(); assert SRMAUtil.Space.COLORSPACE != space; // Get space read = (String) rec.getAttribute("CS"); if (null == read) { // Use base space space = SRMAUtil.Space.NTSPACE; } else { // assumes CS and CQ are always in sequencing order space = SRMAUtil.Space.COLORSPACE; } // Get read and qualities if (space == SRMAUtil.Space.NTSPACE) { byte tmpRead[] = rec.getReadString().getBytes(); byte tmpQualities[] = rec.getBaseQualityString().getBytes(); // Reverse once if (strand) { // reverse SAMRecordUtil.reverseArray(tmpRead); SAMRecordUtil.reverseArray(tmpQualities); } read = new String(tmpRead); readBases = new String(tmpRead); qualities = new String(tmpQualities); // Reverse again if (strand) { // reverse SAMRecordUtil.reverseArray(tmpRead); SAMRecordUtil.reverseArray(tmpQualities); } } else { byte tmpRead[] = rec.getReadString().getBytes(); // Reverse once if (strand) { // reverse SAMRecordUtil.reverseArray(tmpRead); } readBases = new String(tmpRead); // Reverse again if (strand) { // reverse SAMRecordUtil.reverseArray(tmpRead); } read = SRMAUtil.normalizeColorSpaceRead(read); qualities = (String) rec.getAttribute("CQ"); // Some aligners include a quality value for the adapter. A quality value // IMHO should not be given for an unobserved (assumed) peice of data. Trim // the first quality in this case if (qualities.length() == 1 + read.length()) { // trim the first quality qualities = qualities.substring(1); } } // Reverse back if (readBases.length() <= 0) { throw new Exception("Error. The current alignment has no bases."); } if (read.length() <= 0) { throw new Exception("Error. The current alignment has no bases."); } if (qualities.length() <= 0) { throw new Exception("Error. The current alignment has no qualities."); } if (readBases.length() != read.length()) { if (space == SRMAUtil.Space.COLORSPACE) { throw new Exception( "Error. The current alignment's read bases length does not match the length of the colors in the CS tag [" + rec.getReadName() + "]."); } else { throw new Exception("Error. Internal error: readBases.length() != read.length()"); } } // Deal with soft-clipping // - save the soft clipped sequence for latter { List<CigarElement> cigarElements = null; cigarElements = rec.getCigar().getCigarElements(); CigarElement e1 = cigarElements.get(0); // first CigarElement e2 = cigarElements.get(cigarElements.size() - 1); // last // Soft-clipped if (CigarOperator.S == e1.getOperator()) { if (space == SRMAUtil.Space.COLORSPACE) { throw new Exception( "Error. Soft clipping with color-space data not currently supported."); } int l = e1.getLength(); if (strand) { // reverse softClipStartBases = readBases.substring(readBases.length() - l); softClipStartQualities = qualities.substring(qualities.length() - l); readBases = readBases.substring(0, readBases.length() - l); read = read.substring(0, read.length() - l); qualities = qualities.substring(0, qualities.length() - l); } else { softClipStartBases = readBases.substring(0, l - 1); softClipStartQualities = qualities.substring(0, l - 1); readBases = readBases.substring(l); read = read.substring(l); qualities = qualities.substring(l); } } if (CigarOperator.S == e2.getOperator()) { if (space == SRMAUtil.Space.COLORSPACE) { throw new Exception( "Error. Soft clipping with color-space data not currently supported."); } int l = e2.getLength(); if (strand) { // reverse softClipEndBases = readBases.substring(0, l - 1); softClipEndQualities = qualities.substring(0, l - 1); readBases = readBases.substring(l); read = read.substring(l); qualities = qualities.substring(l); } else { softClipEndBases = readBases.substring(readBases.length() - l); softClipEndQualities = qualities.substring(qualities.length() - l); readBases = readBases.substring(0, readBases.length() - l); read = read.substring(0, read.length() - l); qualities = qualities.substring(0, qualities.length() - l); } } } // Remove mate pair information Align.removeMateInfo(rec); comp = new AlignHeapNodeComparator( (strand) ? AlignHeap.HeapType.MAXHEAP : AlignHeap.HeapType.MINHEAP); // Bound by original alignment if possible bestAlignHeapNode = Align.boundWithOriginalAlignment( rec, graph, recNode, comp, strand, read, qualities, readBases, space, sequence, alleleCoverageCutoffs, useSequenceQualities, MAXIMUM_TOTAL_COVERAGE, MAX_HEAP_SIZE); /* System.err.println("readName="+rec.getReadName()); if(null != bestAlignHeapNode) { System.err.println("\nFOUND BEST:" + rec.toString()); } else { System.err.println("\nNOT FOUND (BEST): " + rec.toString()); } Align.updateSAM(rec, programRecord, bestAlignHeapNode, space, read, qualities, softClipStartBases, softClipStartQualities, softClipEndBases, softClipEndQualities, strand, correctBases); return; */ heap = new AlignHeap((strand) ? AlignHeap.HeapType.MAXHEAP : AlignHeap.HeapType.MINHEAP); // Add start nodes if (strand) { // reverse alignmentStart = rec.getAlignmentEnd(); for (i = alignmentStart + offset; alignmentStart - offset <= i; i--) { int position = graph.getPriorityQueueIndexAtPositionOrBefore(i); PriorityQueue<Node> startNodeQueue = graph.getPriorityQueue(position); if (0 != position && null != startNodeQueue) { Iterator<Node> startNodeQueueIter = startNodeQueue.iterator(); while (startNodeQueueIter.hasNext()) { Node startNode = startNodeQueueIter.next(); int f = passFilters(graph, startNode, alleleCoverageCutoffs, MAXIMUM_TOTAL_COVERAGE); if (0 == f) { heap.add( new AlignHeapNode( null, startNode, startNode.coverage, read.charAt(0), qualities.charAt(0), useSequenceQualities, space)); } else if (f < 0) { return; } if (startNode.position < i) { i = startNode.position; } numStartNodesAdded++; } } } } else { alignmentStart = rec.getAlignmentStart(); for (i = alignmentStart - offset; i <= alignmentStart + offset; i++) { int position = graph.getPriorityQueueIndexAtPositionOrGreater(i); PriorityQueue<Node> startNodeQueue = graph.getPriorityQueue(position); if (0 != position && null != startNodeQueue) { Iterator<Node> startNodeQueueIter = startNodeQueue.iterator(); while (startNodeQueueIter.hasNext()) { Node startNode = startNodeQueueIter.next(); int f = passFilters(graph, startNode, alleleCoverageCutoffs, MAXIMUM_TOTAL_COVERAGE); if (0 == f) { heap.add( new AlignHeapNode( null, startNode, startNode.coverage, read.charAt(0), qualities.charAt(0), useSequenceQualities, space)); } else if (f < 0) { return; } if (i < startNode.position) { i = startNode.position; } numStartNodesAdded++; } } } } if (numStartNodesAdded == 0) { throw new Exception("Did not add any start nodes!"); } // Get first node off the heap curAlignHeapNode = heap.poll(); while (null != curAlignHeapNode) { if (MAX_HEAP_SIZE <= heap.size()) { // too many to consider return; } // System.err.println("strand:" + strand + "\tsize:" + heap.size() + "\talignmentStart:" + // alignmentStart + "\toffset:" + offset + "\treadOffset:" + curAlignHeapNode.readOffset); // System.err.print("size:" + heap.size() + ":" + curAlignHeapNode.readOffset + ":" + // curAlignHeapNode.score + ":" + curAlignHeapNode.alleleCoverageSum + ":" + // curAlignHeapNode.startPosition + "\t"); // curAlignHeapNode.node.print(System.err); // System.err.print("\rposition:" + curAlignHeapNode.node.position + "\treadOffset:" + // curAlignHeapNode.readOffset); // Remove all non-insertions with the same contig/pos/read-offset/type/base and lower score nextAlignHeapNode = heap.peek(); while (Node.INSERTION != curAlignHeapNode.node.type && null != nextAlignHeapNode && 0 == comp.compare(curAlignHeapNode, nextAlignHeapNode)) { if (curAlignHeapNode.score < nextAlignHeapNode.score || (curAlignHeapNode.score == nextAlignHeapNode.score && curAlignHeapNode.alleleCoverageSum < nextAlignHeapNode.alleleCoverageSum)) { // Update current node curAlignHeapNode = heap.poll(); } else { // Ignore next node heap.poll(); } nextAlignHeapNode = heap.peek(); } nextAlignHeapNode = null; // Check if the alignment is complete if (curAlignHeapNode.readOffset == read.length() - 1) { // All read bases examined, store if has the best alignment. // System.err.print(curAlignHeapNode.alleleCoverageSum + ":" + curAlignHeapNode.score + // ":"); // System.err.print(curAlignHeapNode.startPosition + ":"); // curAlignHeapNode.node.print(System.err); if (null == bestAlignHeapNode || bestAlignHeapNode.score < curAlignHeapNode.score || (bestAlignHeapNode.score == curAlignHeapNode.score && bestAlignHeapNode.alleleCoverageSum < curAlignHeapNode.alleleCoverageSum)) { bestAlignHeapNode = curAlignHeapNode; } } else if (null != bestAlignHeapNode && curAlignHeapNode.score < bestAlignHeapNode.score) { // ignore, under the assumption that scores can only become more negative. } else { if (strand) { // reverse // Go to all the "prev" nodes iter = curAlignHeapNode.node.prev.listIterator(); } else { // forward // Go to all "next" nodes iter = curAlignHeapNode.node.next.listIterator(); } while (iter.hasNext()) { NodeRecord next = iter.next(); int f = passFilters( graph, next.node, next.coverage, alleleCoverageCutoffs, MAXIMUM_TOTAL_COVERAGE); if (0 == f) { heap.add( new AlignHeapNode( curAlignHeapNode, next.node, next.coverage, read.charAt(curAlignHeapNode.readOffset + 1), qualities.charAt(curAlignHeapNode.readOffset + 1), useSequenceQualities, space)); } else if (f < 0) { return; } } iter = null; } // Get next node curAlignHeapNode = heap.poll(); } // Recover alignment Align.updateSAM( rec, sequence, programRecord, bestAlignHeapNode, space, read, qualities, softClipStartBases, softClipStartQualities, softClipEndBases, softClipEndQualities, strand, correctBases); }