private GATKSAMRecord revertSoftClippedBases(GATKSAMRecord read) { GATKSAMRecord unclipped = (GATKSAMRecord) read.clone(); Cigar unclippedCigar = new Cigar(); int matchesCount = 0; for (CigarElement element : read.getCigar().getCigarElements()) { if (element.getOperator() == CigarOperator.SOFT_CLIP || element.getOperator() == CigarOperator.MATCH_OR_MISMATCH) matchesCount += element.getLength(); else if (matchesCount > 0) { unclippedCigar.add(new CigarElement(matchesCount, CigarOperator.MATCH_OR_MISMATCH)); matchesCount = 0; unclippedCigar.add(element); } else unclippedCigar.add(element); } if (matchesCount > 0) unclippedCigar.add(new CigarElement(matchesCount, CigarOperator.MATCH_OR_MISMATCH)); unclipped.setCigar(unclippedCigar); final int newStart = read.getAlignmentStart() + calculateAlignmentStartShift(read.getCigar(), unclippedCigar); unclipped.setAlignmentStart(newStart); if (newStart <= 0) { // if the start of the unclipped read occurs before the contig, // we must hard clip away the bases since we cannot represent reads with // negative or 0 alignment start values in the SAMRecord (e.g., 0 means unaligned) return hardClip(unclipped, 0, -newStart); } else { return unclipped; } }
/** * @param read a read containing the variant * @return the number of hard clipped and low qual bases at the read start (where start is the * leftmost end w.r.t. the reference) */ public static int getNumClippedBasesAtStart(final SAMRecord read) { // check for hard clips (never consider these bases): final Cigar c = read.getCigar(); final CigarElement first = c.getCigarElement(0); int numStartClippedBases = 0; if (first.getOperator() == CigarOperator.H) { numStartClippedBases = first.getLength(); } final byte[] unclippedReadBases = read.getReadBases(); final byte[] unclippedReadQuals = read.getBaseQualities(); // Do a stricter base clipping than provided by CIGAR string, since this one may be too // conservative, // and may leave a string of Q2 bases still hanging off the reads. // TODO: this code may not even get used because HaplotypeCaller already hard clips low quality // tails for (int i = numStartClippedBases; i < unclippedReadBases.length; i++) { if (unclippedReadQuals[i] < PairHMMIndelErrorModel.BASE_QUAL_THRESHOLD) numStartClippedBases++; else break; } return numStartClippedBases; }
/** * Checks if a hard clipped cigar left a read starting or ending with deletions or gap (N) and * cleans it up accordingly. * * @param cigar the original cigar * @return an object with the shifts (see CigarShift class) */ private CigarShift cleanHardClippedCigar(final Cigar cigar) { final Cigar cleanCigar = new Cigar(); int shiftFromStart = 0; int shiftFromEnd = 0; Stack<CigarElement> cigarStack = new Stack<CigarElement>(); final Stack<CigarElement> inverseCigarStack = new Stack<CigarElement>(); for (final CigarElement cigarElement : cigar.getCigarElements()) cigarStack.push(cigarElement); for (int i = 1; i <= 2; i++) { int shift = 0; int totalHardClip = 0; boolean readHasStarted = false; boolean addedHardClips = false; while (!cigarStack.empty()) { CigarElement cigarElement = cigarStack.pop(); if (!readHasStarted && cigarElement.getOperator() != CigarOperator.DELETION && cigarElement.getOperator() != CigarOperator.SKIPPED_REGION && cigarElement.getOperator() != CigarOperator.HARD_CLIP) readHasStarted = true; else if (!readHasStarted && cigarElement.getOperator() == CigarOperator.HARD_CLIP) totalHardClip += cigarElement.getLength(); else if (!readHasStarted && cigarElement.getOperator() == CigarOperator.DELETION) totalHardClip += cigarElement.getLength(); else if (!readHasStarted && cigarElement.getOperator() == CigarOperator.SKIPPED_REGION) totalHardClip += cigarElement.getLength(); if (readHasStarted) { if (i == 1) { if (!addedHardClips) { if (totalHardClip > 0) inverseCigarStack.push(new CigarElement(totalHardClip, CigarOperator.HARD_CLIP)); addedHardClips = true; } inverseCigarStack.push(cigarElement); } else { if (!addedHardClips) { if (totalHardClip > 0) cleanCigar.add(new CigarElement(totalHardClip, CigarOperator.HARD_CLIP)); addedHardClips = true; } cleanCigar.add(cigarElement); } } } // first pass (i=1) is from end to start of the cigar elements if (i == 1) { shiftFromEnd = shift; cigarStack = inverseCigarStack; } // second pass (i=2) is from start to 
end with the end already cleaned else { shiftFromStart = shift; } } return new CigarShift(cleanCigar, shiftFromStart, shiftFromEnd); }
/** * Decode a single line in a SAM text file. * * @param line line to decode. * @return A SAMReadFeature modeling that line. */ @Override public SAMReadFeature decode(String line) { // we may be asked to process a header line; ignore it if (line.startsWith("@")) return null; String[] tokens = new String[expectedTokenCount]; // split the line int count = ParsingUtils.splitWhitespace(line, tokens); // check to see if we've parsed the string into the right number of tokens (expectedTokenCount) if (count != expectedTokenCount) throw new CodecLineParsingException( "the SAM read line didn't have the expected number of tokens " + "(expected = " + expectedTokenCount + ", saw = " + count + " on " + "line = " + line + ")"); final String readName = tokens[0]; final int flags = Integer.parseInt(tokens[1]); final String contigName = tokens[2]; final int alignmentStart = Integer.parseInt(tokens[3]); final int mapQ = Integer.parseInt(tokens[4]); final String cigarString = tokens[5]; final String mateContigName = tokens[6]; final int mateAlignmentStart = Integer.parseInt(tokens[7]); final int inferredInsertSize = Integer.parseInt(tokens[8]); final byte[] bases = StringUtil.stringToBytes(tokens[9]); final byte[] qualities = StringUtil.stringToBytes(tokens[10]); // Infer the alignment end. Cigar cigar = TextCigarCodec.decode(cigarString); int alignmentEnd = alignmentStart + cigar.getReferenceLength() - 1; // Remove printable character conversion from the qualities. for (byte quality : qualities) quality -= 33; return new SAMReadFeature( readName, flags, contigName, alignmentStart, alignmentEnd, mapQ, cigarString, mateContigName, mateAlignmentStart, inferredInsertSize, bases, qualities); }
public static int getReadCoordinateForReferenceCoordinate( final int alignmentStart, final Cigar cigar, final int refCoord, final ClippingTail tail, final boolean allowGoalNotReached) { final Pair<Integer, Boolean> result = getReadCoordinateForReferenceCoordinate( alignmentStart, cigar, refCoord, allowGoalNotReached); int readCoord = result.getLeft(); // Corner case one: clipping the right tail and falls on deletion, move to the next // read coordinate. It is not a problem for the left tail because the default answer // from getReadCoordinateForReferenceCoordinate is to give the previous read coordinate. if (result.getRight() && tail == ClippingTail.RIGHT_TAIL) { readCoord++; } // clipping the left tail and first base is insertion, go to the next read coordinate // with the same reference coordinate. Advance to the next cigar element, or to the // end of the read if there is no next element. final CigarElement firstElementIsInsertion = readStartsWithInsertion(cigar); if (readCoord == 0 && tail == ClippingTail.LEFT_TAIL && firstElementIsInsertion != null) { readCoord = Math.min(firstElementIsInsertion.getLength(), cigar.getReadLength() - 1); } return readCoord; }
/** * Calculates the reference coordinate for the end of the read taking into account soft clips but * not hard clips. * * <p>Note: getUnclippedEnd() adds soft and hard clips, this function only adds soft clips. * * @param read the read * @param cigar the read's cigar * <p>Note: this overload of the function takes the cigar as input for speed because getCigar * is an expensive operation. Most callers should use the overload that does not take the * cigar. * @return the unclipped end of the read taking soft clips (but not hard clips) into account */ public static int getSoftEnd(final GATKRead read, final Cigar cigar) { Utils.nonNull(read, "read"); Utils.nonNull(cigar, "cigar"); boolean foundAlignedBase = false; int softEnd = read.getEnd(); final List<CigarElement> cigs = cigar.getCigarElements(); for (int i = cigs.size() - 1; i >= 0; --i) { final CigarElement cig = cigs.get(i); final CigarOperator op = cig.getOperator(); if (op == CigarOperator .SOFT_CLIP) { // assumes the soft clip that we found is at the end of the aligned read softEnd += cig.getLength(); } else if (op != CigarOperator.HARD_CLIP) { foundAlignedBase = true; break; } } if (!foundAlignedBase) { // for example 64H14S, the soft end is actually the same as the // alignment end softEnd = read.getEnd(); } return softEnd; }
/**
 * Computes how many reference bases of the old cigar are covered by the read bases that the new
 * cigar places before its first reference-consuming element.
 *
 * @param __cigar the new cigar; the read bases of its elements that do not consume reference
 *     bases are counted until the first element that does
 * @param __oldCigar the old cigar, walked until that many read bases have been accounted for
 * @return the number of reference bases of the old cigar covered by that walk
 */
private int getNewAlignmentStartOffset(final Cigar __cigar, final Cigar __oldCigar) {
  // Read bases that sit ahead of the first reference-consuming element of the new cigar.
  int leadingReadBases = 0;
  for (final CigarElement element : __cigar.getCigarElements()) {
    final CigarOperator op = element.getOperator();
    if (op.consumesReferenceBases()) {
      break;
    }
    if (op.consumesReadBases()) {
      leadingReadBases += element.getLength();
    }
  }

  int referenceOffset = 0;
  int readBasesSeen = 0;
  for (final CigarElement element : __oldCigar.getCigarElements()) {
    final CigarOperator op = element.getOperator();
    int readSpan = op.consumesReadBases() ? element.getLength() : 0;
    int refSpan = element.getLength();

    // This element would overshoot the target: only use the part that is still needed.
    final boolean truncated = readBasesSeen + readSpan > leadingReadBases;
    if (truncated) {
      readSpan = leadingReadBases - readBasesSeen;
      refSpan = readSpan;
    }
    if (!op.consumesReferenceBases()) {
      refSpan = 0;
    }

    readBasesSeen += readSpan;
    referenceOffset += refSpan;
    if (truncated || readBasesSeen > leadingReadBases) {
      break;
    }
  }
  return referenceOffset;
}
/**
 * Looks for an insertion at the very beginning of a read.
 *
 * @param cigarForRead the CIGAR to evaluate
 * @param ignoreSoftClipOps whether S operators ahead of the insertion should be skipped when
 *     deciding if an I operator is at the beginning. H operators are always skipped.
 * @return the element if it's a leading insertion or null otherwise
 */
public static CigarElement readStartsWithInsertion(
    final Cigar cigarForRead, final boolean ignoreSoftClipOps) {
  for (final CigarElement candidate : cigarForRead.getCigarElements()) {
    final CigarOperator op = candidate.getOperator();
    if (op == CigarOperator.INSERTION) {
      return candidate;
    }
    // Only clips may precede a leading insertion; any other operator settles the answer.
    final boolean skippable =
        op == CigarOperator.HARD_CLIP || (ignoreSoftClipOps && op == CigarOperator.SOFT_CLIP);
    if (!skippable) {
      return null;
    }
  }
  return null;
}
/**
 * Computes the offset of the first "real" position in the cigar on the genome.
 *
 * <p>That is the combined length of the leading run of H elements and the run of S elements that
 * directly follows it.
 *
 * @param cigar A non-null cigar
 * @return the offset (from 0) of the first on-genome base
 */
private int calcHardSoftOffset(final Cigar cigar) {
  final List<CigarElement> elements = cigar.getCigarElements();
  final int count = elements.size();
  int offset = 0;
  int idx = 0;

  // Leading hard clips first ...
  for (; idx < count && elements.get(idx).getOperator() == CigarOperator.HARD_CLIP; idx++) {
    offset += elements.get(idx).getLength();
  }
  // ... then the soft clips that come right after them.
  for (; idx < count && elements.get(idx).getOperator() == CigarOperator.SOFT_CLIP; idx++) {
    offset += elements.get(idx).getLength();
  }
  return offset;
}
/**
 * Computes the reference coordinate of the read's beginning, extended by leading soft clips but
 * not by hard clips.
 *
 * <p>Note: getUnclippedStart() adds soft and hard clips, this function only adds soft clips.
 *
 * <p>Note: this overload takes the cigar as input for speed because getCigar is an expensive
 * operation. Most callers should use the overload that does not take the cigar.
 *
 * @param read the read
 * @param cigar the read's cigar
 * @return the unclipped start of the read taking soft clips (but not hard clips) into account
 */
public static int getSoftStart(final GATKRead read, final Cigar cigar) {
  Utils.nonNull(read, "read");
  Utils.nonNull(cigar, "cigar");

  int softStart = read.getStart();
  for (final CigarElement element : cigar.getCigarElements()) {
    final CigarOperator op = element.getOperator();
    if (op == CigarOperator.HARD_CLIP) {
      // hard clips do not move the soft start; keep scanning
      continue;
    }
    if (op != CigarOperator.SOFT_CLIP) {
      // first element that is not a clip: done
      break;
    }
    softStart -= element.getLength();
  }
  return softStart;
}
/**
 * Builds the cigar that results from hard clipping read positions {@code start} to {@code stop}
 * (both inclusive, counted in bases that consume the read).
 *
 * <p>Two modes are handled. When {@code start == 0} the clip is taken from the beginning of the
 * cigar; otherwise the clip is treated as running from {@code start} through the end of the
 * cigar. In both modes, hard clips already present on the clipped side are merged into the new
 * hard clip element, and the value returned by {@code calculateHardClippingAlignmentShift} for
 * the clipped elements is added to the length of that hard clip element. The result is finally
 * passed through {@code cleanHardClippedCigar}.
 *
 * @param cigar the cigar to clip; must not be empty (the first element is read unconditionally)
 * @param start first read position to clip
 * @param stop last read position to clip
 * @return the {@code CigarShift} produced by {@code cleanHardClippedCigar} for the new cigar
 * @throws ReviewedGATKException if {@code start == 0} and the cigar consists only of hard clips
 */
@Requires({"!cigar.isEmpty()"})
private CigarShift hardClipCigar(Cigar cigar, int start, int stop) {
  Cigar newCigar = new Cigar();
  // number of read bases walked so far
  int index = 0;
  // length of the hard clip to emit; pre-existing hard clips get added to it below
  int totalHardClipCount = stop - start + 1;
  int alignmentShift = 0; // caused by hard clipping deletions

  // hard clip the beginning of the cigar string
  if (start == 0) {
    Iterator<CigarElement> cigarElementIterator = cigar.getCigarElements().iterator();
    CigarElement cigarElement = cigarElementIterator.next();
    // Skip all leading hard clips
    while (cigarElement.getOperator() == CigarOperator.HARD_CLIP) {
      totalHardClipCount += cigarElement.getLength();
      if (cigarElementIterator.hasNext()) cigarElement = cigarElementIterator.next();
      else
        throw new ReviewedGATKException(
            "Read is entirely hardclipped, shouldn't be trying to clip it's cigar string");
    }
    // keep clipping until we hit stop
    while (index <= stop) {
      // read bases covered by this element (0 for elements that do not consume read bases)
      int shift = 0;
      if (cigarElement.getOperator().consumesReadBases()) shift = cigarElement.getLength();

      // we're still clipping or just finished perfectly
      if (index + shift == stop + 1) {
        alignmentShift +=
            calculateHardClippingAlignmentShift(cigarElement, cigarElement.getLength());
        newCigar.add(
            new CigarElement(totalHardClipCount + alignmentShift, CigarOperator.HARD_CLIP));
      }
      // element goes beyond what we need to clip
      else if (index + shift > stop + 1) {
        int elementLengthAfterChopping = cigarElement.getLength() - (stop - index + 1);
        alignmentShift += calculateHardClippingAlignmentShift(cigarElement, stop - index + 1);
        newCigar.add(
            new CigarElement(totalHardClipCount + alignmentShift, CigarOperator.HARD_CLIP));
        // keep the part of the element that lies beyond the clip
        newCigar.add(new CigarElement(elementLengthAfterChopping, cigarElement.getOperator()));
      }
      index += shift;
      // NOTE(review): this runs for every visited element, including the one that was just
      // handled by one of the two branches above -- confirm that is intended.
      alignmentShift += calculateHardClippingAlignmentShift(cigarElement, shift);

      if (index <= stop && cigarElementIterator.hasNext())
        cigarElement = cigarElementIterator.next();
      else break;
    }

    // add the remaining cigar elements
    while (cigarElementIterator.hasNext()) {
      cigarElement = cigarElementIterator.next();
      newCigar.add(new CigarElement(cigarElement.getLength(), cigarElement.getOperator()));
    }
  }

  // hard clip the end of the cigar string
  else {
    Iterator<CigarElement> cigarElementIterator = cigar.getCigarElements().iterator();
    CigarElement cigarElement = cigarElementIterator.next();

    // Keep marching on until we find the start
    while (index < start) {
      // read bases covered by this element (0 for elements that do not consume read bases)
      int shift = 0;
      if (cigarElement.getOperator().consumesReadBases()) shift = cigarElement.getLength();

      // we haven't gotten to the start yet, keep everything as is.
      if (index + shift < start)
        newCigar.add(new CigarElement(cigarElement.getLength(), cigarElement.getOperator()));

      // element goes beyond our clip starting position
      else {
        int elementLengthAfterChopping = start - index;
        alignmentShift +=
            calculateHardClippingAlignmentShift(
                cigarElement, cigarElement.getLength() - (start - index));

        // if this last element is a HARD CLIP operator, just merge it with our hard clip operator
        // to be added later
        if (cigarElement.getOperator() == CigarOperator.HARD_CLIP)
          totalHardClipCount += elementLengthAfterChopping;
        // otherwise, maintain what's left of this last operator
        else
          newCigar.add(new CigarElement(elementLengthAfterChopping, cigarElement.getOperator()));
      }
      index += shift;
      if (index < start && cigarElementIterator.hasNext())
        cigarElement = cigarElementIterator.next();
      else break;
    }

    // check if we are hard clipping indels
    while (cigarElementIterator.hasNext()) {
      cigarElement = cigarElementIterator.next();
      alignmentShift +=
          calculateHardClippingAlignmentShift(cigarElement, cigarElement.getLength());

      // if the read had a HardClip operator in the end, combine it with the Hard Clip we are
      // adding
      if (cigarElement.getOperator() == CigarOperator.HARD_CLIP)
        totalHardClipCount += cigarElement.getLength();
    }
    // everything from start onwards collapses into one trailing hard clip element
    newCigar.add(new CigarElement(totalHardClipCount + alignmentShift, CigarOperator.HARD_CLIP));
  }
  return cleanHardClippedCigar(newCigar);
}
/** Given a cigar string, soft clip up to startClipEnd and soft clip starting at endClipBegin */ private Cigar softClip(final Cigar __cigar, final int __startClipEnd, final int __endClipBegin) { if (__endClipBegin <= __startClipEnd) { // whole thing should be soft clipped int cigarLength = 0; for (CigarElement e : __cigar.getCigarElements()) { cigarLength += e.getLength(); } Cigar newCigar = new Cigar(); newCigar.add(new CigarElement(cigarLength, CigarOperator.SOFT_CLIP)); assert newCigar.isValid(null, -1) == null; return newCigar; } int curLength = 0; Vector<CigarElement> newElements = new Vector<CigarElement>(); for (CigarElement curElem : __cigar.getCigarElements()) { if (!curElem.getOperator().consumesReadBases()) { if (curElem.getOperator() == CigarOperator.HARD_CLIP || curLength > __startClipEnd && curLength < __endClipBegin) { newElements.add(new CigarElement(curElem.getLength(), curElem.getOperator())); } continue; } int s = curLength; int e = curLength + curElem.getLength(); if (e <= __startClipEnd || s >= __endClipBegin) { // must turn this entire thing into a clip newElements.add(new CigarElement(curElem.getLength(), CigarOperator.SOFT_CLIP)); } else if (s >= __startClipEnd && e <= __endClipBegin) { // same thing newElements.add(new CigarElement(curElem.getLength(), curElem.getOperator())); } else { // we are clipping in the middle of this guy CigarElement newStart = null; CigarElement newMid = null; CigarElement newEnd = null; int midLength = curElem.getLength(); if (s < __startClipEnd) { newStart = new CigarElement(__startClipEnd - s, CigarOperator.SOFT_CLIP); midLength -= newStart.getLength(); } if (e > __endClipBegin) { newEnd = new CigarElement(e - __endClipBegin, CigarOperator.SOFT_CLIP); midLength -= newEnd.getLength(); } assert midLength >= 0; if (midLength > 0) { newMid = new CigarElement(midLength, curElem.getOperator()); } if (newStart != null) { newElements.add(newStart); } if (newMid != null) { newElements.add(newMid); } if (newEnd != null) 
{ newElements.add(newEnd); } } curLength += curElem.getLength(); } Vector<CigarElement> finalNewElements = new Vector<CigarElement>(); CigarElement lastElement = null; for (CigarElement elem : newElements) { if (lastElement == null || lastElement.getOperator() != elem.getOperator()) { if (lastElement != null) { finalNewElements.add(lastElement); } lastElement = elem; } else { lastElement = new CigarElement(lastElement.getLength() + elem.getLength(), lastElement.getOperator()); } } if (lastElement != null) { finalNewElements.add(lastElement); } Cigar newCigar = new Cigar(finalNewElements); assert newCigar.isValid(null, -1) == null; return newCigar; }
@Override public String process(File page, Map<String, String> query) { loadContigs(); if (query.get("contigName").matches("^[ACGT]+$")) { contigs.put("manual", query.get("contigName")); query.put("contigName", "manual"); } else if (query.get("contigName").matches("^Pf3D7.+$")) { String[] pieces = query.get("contigName").split("[:-]"); int start = Integer.valueOf(pieces[1].replaceAll(",", "")); int end = Integer.valueOf(pieces[2].replaceAll(",", "")); ReferenceSequence rseq = REF.getSubsequenceAt(pieces[0], start, end); contigs.put("manual", new String(rseq.getBases())); query.put("contigName", "manual"); } if (query.containsKey("contigName") && contigs.containsKey(query.get("contigName")) && graphs.containsKey(query.get("graphName"))) { boolean showLinks = query.get("showLinks").equals("links_on"); String contig = contigs.get(query.get("contigName")); String originalContig = contigs.get(query.get("contigName")); String refFormattedString = ""; String kmerOrigin = ""; if (metrics.containsKey(query.get("contigName"))) { String[] loc = metrics.get(query.get("contigName")).get("canonicalLocus").split("[:-]"); if (!loc[0].equals("*")) { boolean isRc = metrics.get(query.get("contigName")).get("isRcCanonical").equals("1"); if (isRc) { contig = SequenceUtils.reverseComplement(contig); originalContig = SequenceUtils.reverseComplement(originalContig); } int locStart = Integer.valueOf(loc[1]); int locEnd = Integer.valueOf(loc[2]); Cigar cigar = cigarStringToCigar(metrics.get(query.get("contigName")).get("cigarCanonical")); if (cigar.getCigarElement(0).getOperator().equals(CigarOperator.S)) { locStart -= cigar.getCigarElement(0).getLength(); } if (cigar .getCigarElement(cigar.getCigarElements().size() - 1) .getOperator() .equals(CigarOperator.S)) { locEnd += cigar.getCigarElement(cigar.getCigarElements().size() - 1).getLength(); } String ref = new String(REF.getSubsequenceAt(loc[0], locStart, locEnd).getBases()); StringBuilder refFormatted = new StringBuilder(); int pos = 0; 
for (CigarElement ce : cigar.getCigarElements()) { CigarOperator co = ce.getOperator(); switch (co) { case S: refFormatted.append(ref.substring(pos, pos + ce.getLength()).toLowerCase()); break; case M: refFormatted.append(ref.substring(pos, pos + ce.getLength())); break; case I: refFormatted.append(StringUtils.repeat("-", ce.getLength())); break; } if (ce.getOperator().consumesReferenceBases()) { pos += ce.getLength(); } } refFormattedString = refFormatted.toString(); kmerOrigin = metrics.get(query.get("contigName")).get("kmerOrigin"); } } CortexGraph cg = graphs.get(query.get("graphName")); String sampleName = cg.getColor(0).getSampleName(); Set<CortexLinksMap> links = new HashSet<CortexLinksMap>(); if (LINKS != null && !LINKS.isEmpty()) { for (CortexLinksMap link : LINKS) { if (sampleName.equals(link.getCortexLinks().getColor(0).getSampleName())) { links.add(link); } } } Set<String> contigKmers = new HashSet<String>(); for (int i = 0; i <= contig.length() - cg.getKmerSize(); i++) { String curKmer = contig.substring(i, i + cg.getKmerSize()); contigKmers.add(curKmer); } StringBuilder firstFlank = new StringBuilder(); String firstKmer = contig.substring(0, cg.getKmerSize()); Set<String> pks = CortexUtils.getPrevKmers(cg, firstKmer, 0); Set<String> usedPrevKmers = new HashSet<String>(); usedPrevKmers.add(firstKmer); while (pks.size() == 1 && usedPrevKmers.size() <= 100) { String kmer = pks.iterator().next(); firstFlank.insert(0, kmer.charAt(0)); if (usedPrevKmers.contains(kmer)) { break; } usedPrevKmers.add(kmer); pks = CortexUtils.getPrevKmers(cg, kmer, 0); } StringBuilder lastFlank = new StringBuilder(); String lastKmer = contig.substring(contig.length() - cg.getKmerSize(), contig.length()); Set<String> nks = CortexUtils.getNextKmers(cg, lastKmer, 0); Set<String> usedNextKmers = new HashSet<String>(); usedNextKmers.add(lastKmer); while (nks.size() == 1 && usedNextKmers.size() <= 100) { String kmer = nks.iterator().next(); lastFlank.append(kmer.charAt(kmer.length() 
- 1)); if (usedNextKmers.contains(kmer)) { break; } usedNextKmers.add(kmer); nks = CortexUtils.getNextKmers(cg, kmer, 0); } contig = firstFlank.toString() + contig + lastFlank.toString(); DirectedGraph<CtxVertex, MultiEdge> g = new DefaultDirectedGraph<CtxVertex, MultiEdge>(MultiEdge.class); for (int i = 0; i <= contig.length() - cg.getKmerSize(); i++) { String curKmer = contig.substring(i, i + cg.getKmerSize()); CortexKmer ck = new CortexKmer(curKmer); CtxVertex curVer = new CtxVertex( curKmer, i, contigKmers.contains(curKmer) ? VertexType.CONTIG : VertexType.CLIPPED, cg.findRecord(ck)); g.addVertex(curVer); String expectedPrevKmer = (i > 0) ? contig.substring(i - 1, i - 1 + cg.getKmerSize()) : ""; String expectedNextKmer = (i < contig.length() - cg.getKmerSize()) ? contig.substring(i + 1, i + 1 + cg.getKmerSize()) : ""; Set<String> prevKmers = CortexUtils.getPrevKmers(cg, curKmer, 0); for (String prevKmer : prevKmers) { if (!expectedPrevKmer.equals(prevKmer)) { CortexKmer pk = new CortexKmer(prevKmer); CtxVertex prevVer = new CtxVertex(prevKmer, i - 1, VertexType.IN, cg.findRecord(pk)); MultiEdge me = g.containsEdge(prevVer, curVer) ? g.getEdge(prevVer, curVer) : new MultiEdge(); me.addGraphName(cg.getCortexFile().getName()); g.addVertex(prevVer); g.addEdge(prevVer, curVer, me); } } Set<String> nextKmers = CortexUtils.getNextKmers(cg, curKmer, 0); for (String nextKmer : nextKmers) { if (!expectedNextKmer.equals(nextKmer)) { CortexKmer nk = new CortexKmer(nextKmer); CtxVertex nextVer = new CtxVertex(nextKmer, i + 1, VertexType.OUT, cg.findRecord(nk)); MultiEdge me = g.containsEdge(curVer, nextVer) ? 
g.getEdge(curVer, nextVer) : new MultiEdge(); me.addGraphName(cg.getCortexFile().getName()); g.addVertex(nextVer); g.addEdge(curVer, nextVer, me); } } } Set<Map<String, Object>> verticesWithLinks = new HashSet<Map<String, Object>>(); DataFrame<String, String, Integer> hv = new DataFrame<String, String, Integer>(0); for (int q = 0; q <= contig.length() - cg.getKmerSize(); q++) { // String sk = cv.getBinaryKmer(); String sk = contig.substring(q, q + cg.getKmerSize()); CortexKmer ck = new CortexKmer(sk); for (CortexLinksMap link : links) { if (link.containsKey(ck)) { CortexLinksRecord clr = link.get(ck); Map<String, Integer> lc = (!showLinks) ? new HashMap<String, Integer>() : CortexUtils.getKmersAndCoverageInLink(cg, sk, clr); Map<String, Object> entry = new HashMap<String, Object>(); entry.put("kmer", sk); entry.put("lc", lc); verticesWithLinks.add(entry); if (showLinks) { for (CortexJunctionsRecord cjr : clr.getJunctions()) { List<String> lk = CortexUtils.getKmersInLink(cg, sk, cjr); for (int i = 0; i < lk.size(); i++) { String kili = lk.get(i); for (int j = 0; j < lk.size(); j++) { String kilj = lk.get(j); if (i != j) { hv.set(kili, kilj, hv.get(kili, kilj) + cjr.getCoverage(0)); } } } } } } } } /* int hvMax = 0; Map<String, Integer> hvlin = new HashMap<String, Integer>(); if (showLinks) { for (String kili : hv.getRowNames()) { for (String kilj : hv.getColNames()) { int cov = hv.get(kili, kilj); String id = kili + "_" + kilj; hvlin.put(id, cov); if (cov > hvMax) { hvMax = cov; } } } } */ JSONObject jo = new JSONObject(); jo.put("contig", contig); jo.put("originalContig", originalContig); jo.put("ref", refFormattedString); jo.put("kmerOrigin", kmerOrigin); jo.put("kmerSize", cg.getKmerSize()); jo.put("clipStart", firstFlank.length()); jo.put("clipEnd", contig.length() - lastFlank.length()); List<Map<String, Object>> va = new ArrayList<Map<String, Object>>(); for (CtxVertex v : g.vertexSet()) { Map<String, Object> vm = new HashMap<String, Object>(); vm.put("base", 
v.getBase()); vm.put("kmer", v.getKmer()); vm.put("pos", v.getPos()); vm.put("type", v.getVertexType().name()); vm.put("missing", v.isMissingFromGraph()); vm.put("cov", v.getCoverage()); va.add(vm); } jo.put("vertices", va); jo.put("verticesWithLinks", verticesWithLinks); // jo.put("hvlin", hvlin); // jo.put("hvmax", hvMax); return jo.toString(); } return null; }
/**
 * Walks the cigar to find the read coordinate that corresponds to a reference coordinate.
 *
 * <p>The walk advances until {@code refCoord - alignmentStart} reference bases have been covered.
 * Soft clips are counted towards that distance alongside the reference-consuming operators. When
 * the distance is 0 the loop is not entered and {@code (0, false)} is returned.
 *
 * @param alignmentStart reference coordinate the walk starts from
 * @param cigar the cigar to walk
 * @param refCoord the reference coordinate to locate
 * @param allowGoalNotReached when true, a coordinate that cannot be located yields {@code
 *     CLIPPING_GOAL_NOT_REACHED} instead of an exception
 * @return a pair of the read coordinate (or {@code CLIPPING_GOAL_NOT_REACHED}) and a flag telling
 *     whether the coordinate falls inside, or right before, a deletion or skipped region
 * @throws GATKException if the coordinate cannot be located and {@code allowGoalNotReached} is
 *     false
 */
private static Pair<Integer, Boolean> getReadCoordinateForReferenceCoordinate(
    final int alignmentStart,
    final Cigar cigar,
    final int refCoord,
    final boolean allowGoalNotReached) {
  int readBases = 0;
  int refBases = 0;
  boolean fallsInsideDeletionOrSkippedRegion = false;
  boolean endJustBeforeDeletionOrSkippedRegion = false;
  boolean fallsInsideOrJustBeforeDeletionOrSkippedRegion = false;

  final int goal = refCoord - alignmentStart; // The goal is to move this many reference bases
  if (goal < 0) {
    // the requested coordinate lies before the alignment start
    if (allowGoalNotReached) {
      return new MutablePair<>(CLIPPING_GOAL_NOT_REACHED, false);
    } else {
      throw new GATKException(
          "Somehow the requested coordinate is not covered by the read. Too many deletions?");
    }
  }

  boolean goalReached = refBases == goal;
  final Iterator<CigarElement> cigarElementIterator = cigar.getCigarElements().iterator();
  while (!goalReached && cigarElementIterator.hasNext()) {
    final CigarElement cigarElement = cigarElementIterator.next();
    // how many bases of this element are used to approach the goal
    int shift = 0;

    if (cigarElement.getOperator().consumesReferenceBases()
        || cigarElement.getOperator() == CigarOperator.SOFT_CLIP) {
      if (refBases + cigarElement.getLength() < goal) {
        shift = cigarElement.getLength();
      } else {
        shift = goal - refBases;
      }
      refBases += shift;
    }
    goalReached = refBases == goal;

    // goal not reached yet: the whole element counts towards the read coordinate
    if (!goalReached && cigarElement.getOperator().consumesReadBases()) {
      readBases += cigarElement.getLength();
    }

    if (goalReached) {
      // Is this base's reference position within this cigar element? Or did we use it all?
      final boolean endsWithinCigar = shift < cigarElement.getLength();

      // If it isn't, we need to check the next one. There should *ALWAYS* be a next one
      // since we checked if the goal coordinate is within the read length, so this is just a
      // sanity check.
      if (!endsWithinCigar && !cigarElementIterator.hasNext()) {
        if (allowGoalNotReached) {
          return new MutablePair<>(CLIPPING_GOAL_NOT_REACHED, false);
        } else {
          throw new GATKException(
              String.format(
                  "Reference coordinate corresponds to a non-existent base in the read. This should never happen -- check read with alignment start: %s and cigar: %s",
                  alignmentStart, cigar));
        }
      }

      CigarElement nextCigarElement = null;

      // if we end inside the current cigar element, we just have to check if it is a deletion (or
      // skipped region)
      if (endsWithinCigar) {
        fallsInsideDeletionOrSkippedRegion =
            (cigarElement.getOperator() == CigarOperator.DELETION
                || cigarElement.getOperator() == CigarOperator.SKIPPED_REGION);
      }

      // if we end outside the current cigar element, we need to check if the next element is an
      // insertion, deletion or skipped region.
      else {
        nextCigarElement = cigarElementIterator.next();

        // if it's an insertion, we need to clip the whole insertion before looking at the next
        // element
        if (nextCigarElement.getOperator() == CigarOperator.INSERTION) {
          readBases += nextCigarElement.getLength();
          if (!cigarElementIterator.hasNext()) {
            if (allowGoalNotReached) {
              return new MutablePair<>(CLIPPING_GOAL_NOT_REACHED, false);
            } else {
              throw new GATKException(
                  String.format(
                      "Reference coordinate corresponds to a non-existent base in the read. This should never happen -- check read with alignment start: %s and cigar: %s",
                      alignmentStart, cigar));
            }
          }

          nextCigarElement = cigarElementIterator.next();
        }

        // if it's a deletion (or skipped region), we will pass the information on to be handled
        // downstream.
        endJustBeforeDeletionOrSkippedRegion =
            (nextCigarElement.getOperator() == CigarOperator.DELETION
                || nextCigarElement.getOperator() == CigarOperator.SKIPPED_REGION);
      }

      fallsInsideOrJustBeforeDeletionOrSkippedRegion =
          endJustBeforeDeletionOrSkippedRegion || fallsInsideDeletionOrSkippedRegion;

      // If we reached our goal outside a deletion (or skipped region), add the shift
      if (!fallsInsideOrJustBeforeDeletionOrSkippedRegion
          && cigarElement.getOperator().consumesReadBases()) {
        readBases += shift;
      }

      // If we reached our goal just before a deletion (or skipped region) we need
      // to add the shift of the current cigar element but go back to its last element to return
      // the last base before the deletion (or skipped region) (see warning in function contracts)
      else if (endJustBeforeDeletionOrSkippedRegion
          && cigarElement.getOperator().consumesReadBases()) {
        readBases += shift - 1;
      }

      // If we reached our goal inside a deletion (or skipped region), or just between a
      // deletion and a skipped region,
      // then we must backtrack to the last base before the deletion (or skipped region)
      else if (fallsInsideDeletionOrSkippedRegion
          || (endJustBeforeDeletionOrSkippedRegion
              && nextCigarElement.getOperator().equals(CigarOperator.N))
          || (endJustBeforeDeletionOrSkippedRegion
              && nextCigarElement.getOperator().equals(CigarOperator.D))) {
        readBases--;
      }
    }
  }

  // the cigar ran out before enough reference bases were covered
  if (!goalReached) {
    if (allowGoalNotReached) {
      return new MutablePair<>(CLIPPING_GOAL_NOT_REACHED, false);
    } else {
      throw new GATKException(
          "Somehow the requested coordinate is not covered by the read. Alignment "
              + alignmentStart
              + " | "
              + cigar);
    }
  }

  return Pair.of(readBases, fallsInsideOrJustBeforeDeletionOrSkippedRegion);
}
public void printAlignment(final byte[] ref, final byte[] read, final int width) { final StringBuilder bread = new StringBuilder(); final StringBuilder bref = new StringBuilder(); final StringBuilder match = new StringBuilder(); int i = 0; int j = 0; final int offset = getAlignmentStart2wrt1(); Cigar cigar = getCigar(); if (overhangStrategy != OverhangStrategy.SOFTCLIP) { // we need to go through all the hassle below only if we do not do softclipping; // otherwise offset is never negative if (offset < 0) { for (; j < (-offset); j++) { bread.append((char) read[j]); bref.append(' '); match.append(' '); } // at negative offsets, our cigar's first element carries overhanging bases // that we have just printed above. Tweak the first element to // exclude those bases. Here we create a new list of cigar elements, so the original // list/original cigar are unchanged (they are unmodifiable anyway!) final List<CigarElement> tweaked = new ArrayList<>(); tweaked.addAll(cigar.getCigarElements()); tweaked.set( 0, new CigarElement( cigar.getCigarElement(0).getLength() + offset, cigar.getCigarElement(0).getOperator())); cigar = new Cigar(tweaked); } } if (offset > 0) { // note: the way this implementation works, cigar will ever start from S *only* if // read starts before the ref, i.e. offset = 0 for (; i < getAlignmentStart2wrt1(); i++) { bref.append((char) ref[i]); bread.append(' '); match.append(' '); } } for (final CigarElement e : cigar.getCigarElements()) { switch (e.getOperator()) { case M: for (int z = 0; z < e.getLength(); z++, i++, j++) { bref.append((i < ref.length) ? (char) ref[i] : ' '); bread.append((j < read.length) ? (char) read[j] : ' '); match.append( (i < ref.length && j < read.length) ? (ref[i] == read[j] ? '.' 
: '*') : ' '); } break; case I: for (int z = 0; z < e.getLength(); z++, j++) { bref.append('-'); bread.append((char) read[j]); match.append('I'); } break; case S: for (int z = 0; z < e.getLength(); z++, j++) { bref.append(' '); bread.append((char) read[j]); match.append('S'); } break; case D: for (int z = 0; z < e.getLength(); z++, i++) { bref.append((char) ref[i]); bread.append('-'); match.append('D'); } break; default: throw new GATKException("Unexpected Cigar element:" + e.getOperator()); } } for (; i < ref.length; i++) bref.append((char) ref[i]); for (; j < read.length; j++) bread.append((char) read[j]); int pos = 0; final int maxlength = Math.max(match.length(), Math.max(bread.length(), bref.length())); while (pos < maxlength) { print_cautiously(match, pos, width); print_cautiously(bread, pos, width); print_cautiously(bref, pos, width); System.out.println(); pos += width; } }