Ejemplo n.º 1
0
  /**
   * Returns a copy of the read whose cigar treats every soft-clipped stretch as aligned bases.
   *
   * <p>Consecutive SOFT_CLIP and MATCH_OR_MISMATCH elements are collapsed into a single
   * MATCH_OR_MISMATCH element; every other element is carried over unchanged. The alignment start
   * of the copy is moved by the amount reported by {@code calculateAlignmentStartShift}.
   *
   * @param read the read to revert; it is cloned, not modified
   * @return the reverted copy, hard clipped at its start when the new alignment start would be 0
   *     or negative
   */
  private GATKSAMRecord revertSoftClippedBases(GATKSAMRecord read) {
    GATKSAMRecord reverted = (GATKSAMRecord) read.clone();

    final Cigar mergedCigar = new Cigar();
    int pendingAligned = 0; // length of the current run of S/M elements not yet emitted
    for (final CigarElement current : read.getCigar().getCigarElements()) {
      final CigarOperator op = current.getOperator();
      final boolean countsAsAligned =
          op == CigarOperator.SOFT_CLIP || op == CigarOperator.MATCH_OR_MISMATCH;
      if (countsAsAligned) {
        pendingAligned += current.getLength();
        continue;
      }

      // Any other operator ends the run: flush it first, then keep the element as is.
      if (pendingAligned > 0) {
        mergedCigar.add(new CigarElement(pendingAligned, CigarOperator.MATCH_OR_MISMATCH));
        pendingAligned = 0;
      }
      mergedCigar.add(current);
    }
    if (pendingAligned > 0) {
      mergedCigar.add(new CigarElement(pendingAligned, CigarOperator.MATCH_OR_MISMATCH));
    }

    reverted.setCigar(mergedCigar);
    final int shiftedStart =
        read.getAlignmentStart() + calculateAlignmentStartShift(read.getCigar(), mergedCigar);
    reverted.setAlignmentStart(shiftedStart);

    if (shiftedStart > 0) {
      return reverted;
    }
    // The reverted read would begin before the contig. A SAMRecord cannot carry a negative or 0
    // alignment start (0 means unaligned), so the overhanging bases are hard clipped away.
    return hardClip(reverted, 0, -shiftedStart);
  }
Ejemplo n.º 2
0
  /**
   * Counts the bases to disregard at the start of a read: the leading hard clip (if the first
   * cigar element is one) plus the run of low-quality bases that follows.
   *
   * @param read a read containing the variant
   * @return the number of hard clipped and low qual bases at the read start (where start is the
   *     leftmost end w.r.t. the reference)
   */
  public static int getNumClippedBasesAtStart(final SAMRecord read) {
    // Hard clips are never considered, so they seed the count.
    final CigarElement leading = read.getCigar().getCigarElement(0);
    final int hardClipped = leading.getOperator() == CigarOperator.H ? leading.getLength() : 0;

    final byte[] bases = read.getReadBases();
    final byte[] quals = read.getBaseQualities();

    // Clip more strictly than the CIGAR string does: it may be too conservative and leave a
    // string of Q2 bases hanging off the read.
    // TODO: this code may not even get used because HaplotypeCaller already hard clips low quality
    // tails
    int cursor = hardClipped;
    while (cursor < bases.length && quals[cursor] < PairHMMIndelErrorModel.BASE_QUAL_THRESHOLD) {
      cursor++;
    }

    // The scan index and the running count advance in lock-step, so the index is the answer.
    return cursor;
  }
Ejemplo n.º 3
0
  /**
   * Checks if a hard clipped cigar left a read starting or ending with deletions or gap (N) and
   * cleans it up accordingly.
   *
   * <p>The cigar is walked twice: first from its last element to its first, then from first to
   * last. In each pass, every HARD_CLIP, DELETION and SKIPPED_REGION element met before the first
   * element of any other kind is folded into a single HARD_CLIP element whose length is the sum of
   * their lengths.
   *
   * <p>Note: the local {@code shift} is initialised to 0 and never assigned again, so both shift
   * values handed to the returned {@code CigarShift} are always 0 in this implementation.
   *
   * @param cigar the original cigar
   * @return an object with the shifts (see CigarShift class)
   */
  private CigarShift cleanHardClippedCigar(final Cigar cigar) {
    final Cigar cleanCigar = new Cigar();
    int shiftFromStart = 0;
    int shiftFromEnd = 0;
    // Popping this stack yields the cigar elements from last to first.
    Stack<CigarElement> cigarStack = new Stack<CigarElement>();
    // Filled during the first pass; popping it yields the elements from first to last again.
    final Stack<CigarElement> inverseCigarStack = new Stack<CigarElement>();

    for (final CigarElement cigarElement : cigar.getCigarElements()) cigarStack.push(cigarElement);

    for (int i = 1; i <= 2; i++) {
      int shift = 0;
      int totalHardClip = 0; // summed length of the H/D/N elements seen before the read starts
      boolean readHasStarted = false;
      boolean addedHardClips = false;

      while (!cigarStack.empty()) {
        CigarElement cigarElement = cigarStack.pop();

        // The first element that is not D, N or H marks the start of the read for this pass;
        // until then every D, N and H length is accumulated into totalHardClip.
        if (!readHasStarted
            && cigarElement.getOperator() != CigarOperator.DELETION
            && cigarElement.getOperator() != CigarOperator.SKIPPED_REGION
            && cigarElement.getOperator() != CigarOperator.HARD_CLIP) readHasStarted = true;
        else if (!readHasStarted && cigarElement.getOperator() == CigarOperator.HARD_CLIP)
          totalHardClip += cigarElement.getLength();
        else if (!readHasStarted && cigarElement.getOperator() == CigarOperator.DELETION)
          totalHardClip += cigarElement.getLength();
        else if (!readHasStarted && cigarElement.getOperator() == CigarOperator.SKIPPED_REGION)
          totalHardClip += cigarElement.getLength();

        if (readHasStarted) {
          if (i == 1) {
            // First pass: emit the merged hard clip once, then stack the elements for pass two.
            if (!addedHardClips) {
              if (totalHardClip > 0)
                inverseCigarStack.push(new CigarElement(totalHardClip, CigarOperator.HARD_CLIP));
              addedHardClips = true;
            }
            inverseCigarStack.push(cigarElement);
          } else {
            // Second pass: emit the merged hard clip once, then build the final cigar in order.
            if (!addedHardClips) {
              if (totalHardClip > 0)
                cleanCigar.add(new CigarElement(totalHardClip, CigarOperator.HARD_CLIP));
              addedHardClips = true;
            }
            cleanCigar.add(cigarElement);
          }
        }
      }
      // first pass  (i=1) is from end to start of the cigar elements
      if (i == 1) {
        shiftFromEnd = shift;
        cigarStack = inverseCigarStack;
      }
      // second pass (i=2) is from start to end with the end already cleaned
      else {
        shiftFromStart = shift;
      }
    }
    return new CigarShift(cleanCigar, shiftFromStart, shiftFromEnd);
  }
Ejemplo n.º 4
0
  /**
   * Decode a single line in a SAM text file.
   *
   * <p>The line is split on whitespace into exactly {@code expectedTokenCount} tokens, which are
   * read positionally as: read name, flags, contig, alignment start, mapping quality, cigar, mate
   * contig, mate alignment start, inferred insert size, bases and qualities. The alignment end is
   * derived from the alignment start and the reference length of the cigar.
   *
   * @param line line to decode.
   * @return A SAMReadFeature modeling that line, or {@code null} when the line is a header line
   *     (starts with {@code @}).
   * @throws CodecLineParsingException if the line does not split into the expected number of
   *     tokens
   * @throws NumberFormatException if one of the numeric columns is not a valid integer
   */
  @Override
  public SAMReadFeature decode(String line) {
    // we may be asked to process a header line; ignore it
    if (line.startsWith("@")) return null;

    String[] tokens = new String[expectedTokenCount];

    // split the line
    int count = ParsingUtils.splitWhitespace(line, tokens);

    // check to see if we've parsed the string into the right number of tokens (expectedTokenCount)
    if (count != expectedTokenCount)
      throw new CodecLineParsingException(
          "the SAM read line didn't have the expected number of tokens "
              + "(expected = "
              + expectedTokenCount
              + ", saw = "
              + count
              + " on "
              + "line = "
              + line
              + ")");

    final String readName = tokens[0];
    final int flags = Integer.parseInt(tokens[1]);
    final String contigName = tokens[2];
    final int alignmentStart = Integer.parseInt(tokens[3]);
    final int mapQ = Integer.parseInt(tokens[4]);
    final String cigarString = tokens[5];
    final String mateContigName = tokens[6];
    final int mateAlignmentStart = Integer.parseInt(tokens[7]);
    final int inferredInsertSize = Integer.parseInt(tokens[8]);
    final byte[] bases = StringUtil.stringToBytes(tokens[9]);
    final byte[] qualities = StringUtil.stringToBytes(tokens[10]);

    // Infer the alignment end.
    final Cigar cigar = TextCigarCodec.decode(cigarString);
    final int alignmentEnd = alignmentStart + cigar.getReferenceLength() - 1;

    // Remove printable character conversion from the qualities. The array must be written through
    // its index: an enhanced-for variable is only a copy of each element, so subtracting from it
    // would leave the array untouched.
    // NOTE(review): a QUAL column of "*" (qualities not stored) is offset like any other
    // character here — confirm whether callers need it handled separately.
    final int printableOffset = 33;
    for (int i = 0; i < qualities.length; i++) {
      qualities[i] -= printableOffset;
    }

    return new SAMReadFeature(
        readName,
        flags,
        contigName,
        alignmentStart,
        alignmentEnd,
        mapQ,
        cigarString,
        mateContigName,
        mateAlignmentStart,
        inferredInsertSize,
        bases,
        qualities);
  }
Ejemplo n.º 5
0
  /**
   * Maps a reference coordinate to a read coordinate, adjusted for the tail being clipped.
   *
   * <p>Delegates to the four-argument overload and then applies two corrections: one for a right
   * tail clip that lands on a deletion, and one for a left tail clip on a read that starts with an
   * insertion.
   *
   * @param alignmentStart alignment start of the read
   * @param cigar the read's cigar
   * @param refCoord the reference coordinate to look up
   * @param tail which tail of the read is being clipped
   * @param allowGoalNotReached forwarded unchanged to the four-argument overload
   * @return the read coordinate to clip at
   */
  public static int getReadCoordinateForReferenceCoordinate(
      final int alignmentStart,
      final Cigar cigar,
      final int refCoord,
      final ClippingTail tail,
      final boolean allowGoalNotReached) {
    final Pair<Integer, Boolean> lookup =
        getReadCoordinateForReferenceCoordinate(
            alignmentStart, cigar, refCoord, allowGoalNotReached);
    int coordinate = lookup.getLeft();
    final boolean landedOnDeletion = lookup.getRight();

    // Right tail on a deletion: step to the next read coordinate. The left tail needs no
    // adjustment because the lookup already answers with the previous read coordinate.
    if (tail == ClippingTail.RIGHT_TAIL && landedOnDeletion) {
      coordinate += 1;
    }

    // Left tail on a read whose first base is an insertion: jump past the insertion to the next
    // cigar element, or to the last base of the read when nothing follows it.
    final CigarElement leadingInsertion = readStartsWithInsertion(cigar);
    final boolean clippingLeftTail = tail == ClippingTail.LEFT_TAIL;
    if (leadingInsertion != null && clippingLeftTail && coordinate == 0) {
      final int insertionLength = leadingInsertion.getLength();
      final int lastReadIndex = cigar.getReadLength() - 1;
      coordinate = Math.min(insertionLength, lastReadIndex);
    }

    return coordinate;
  }
Ejemplo n.º 6
0
  /**
   * Computes the reference coordinate at which the read ends once trailing soft clips are added
   * back; hard clips are skipped and contribute nothing.
   *
   * <p>Note: getUnclippedEnd() adds soft and hard clips, this function only adds soft clips.
   *
   * @param read the read
   * @param cigar the read's cigar
   *     <p>Note: this overload of the function takes the cigar as input for speed because getCigar
   *     is an expensive operation. Most callers should use the overload that does not take the
   *     cigar.
   * @return the unclipped end of the read taking soft clips (but not hard clips) into account
   */
  public static int getSoftEnd(final GATKRead read, final Cigar cigar) {
    Utils.nonNull(read, "read");
    Utils.nonNull(cigar, "cigar");

    int extendedEnd = read.getEnd();
    final List<CigarElement> elements = cigar.getCigarElements();

    // Walk the cigar backwards from its last element.
    for (int remaining = elements.size(); remaining > 0; remaining--) {
      final CigarElement element = elements.get(remaining - 1);
      final CigarOperator operator = element.getOperator();

      if (operator == CigarOperator.HARD_CLIP) {
        continue;
      }
      if (operator != CigarOperator.SOFT_CLIP) {
        // Reached an aligned element, so the soft clips counted so far were trailing ones.
        return extendedEnd;
      }
      // assumes the soft clip that we found is at the end of the aligned read
      extendedEnd += element.getLength();
    }

    // No aligned element anywhere (for example 64H14S): the soft end is the alignment end.
    return read.getEnd();
  }
Ejemplo n.º 7
0
  /**
   * Computes how many reference bases of the old cigar are spanned by the read bases that lead the
   * new cigar without consuming reference.
   *
   * <p>Step one counts, in {@code __cigar}, the read bases of every element that precedes the
   * first reference-consuming element (elements consuming neither read nor reference bases are
   * passed over without being counted). Step two walks {@code __oldCigar} until that many read
   * bases have been covered, summing the lengths of the reference-consuming elements on the way.
   *
   * @param __cigar the new cigar, whose leading non-reference read bases are counted
   * @param __oldCigar the old cigar, used to translate that read-base count into reference bases
   * @return the number of reference bases accumulated while walking {@code __oldCigar}
   */
  private int getNewAlignmentStartOffset(final Cigar __cigar, final Cigar __oldCigar) {
    // Step one: read bases at the front of the new cigar that sit before any reference base.
    int num = 0;
    for (CigarElement e : __cigar.getCigarElements()) {
      if (!e.getOperator().consumesReferenceBases()) {
        if (e.getOperator().consumesReadBases()) {
          num += e.getLength();
        }
      } else {
        break;
      }
    }

    // Step two: reference bases of the old cigar covered by those first `num` read bases.
    int oldNum = 0;
    int curReadCounter = 0;

    for (CigarElement e : __oldCigar.getCigarElements()) {
      int curRefLength = e.getLength();
      int curReadLength = e.getLength();
      if (!e.getOperator().consumesReadBases()) {
        curReadLength = 0;
      }

      // The element would overshoot the target: only take the part that reaches `num`.
      boolean truncated = false;
      if (curReadCounter + curReadLength > num) {
        curReadLength = num - curReadCounter;
        curRefLength = num - curReadCounter;
        truncated = true;
      }

      // Elements that do not consume reference contribute nothing to the offset; this is applied
      // after the truncation above so it also zeroes a truncated length.
      if (!e.getOperator().consumesReferenceBases()) {
        curRefLength = 0;
      }

      curReadCounter += curReadLength;
      oldNum += curRefLength;

      // Stop once the target has been passed or an element had to be cut short. When the target
      // is hit exactly, the walk continues until the next element that consumes read bases.
      if (curReadCounter > num || truncated) {
        break;
      }
    }

    return oldNum;
  }
Ejemplo n.º 8
0
 /**
  * Looks for an insertion at the very beginning of a cigar.
  *
  * <p>Leading H elements are always skipped, and leading S elements are skipped as well when
  * {@code ignoreSoftClipOps} is set. The scan ends at the first element that is neither skipped
  * nor an insertion.
  *
  * @param cigarForRead the CIGAR to evaluate
  * @param ignoreSoftClipOps should we ignore S operators when evaluating whether an I operator is
  *     at the beginning? Note that H operators are always ignored.
  * @return the element if it's a leading insertion or null otherwise
  */
 public static CigarElement readStartsWithInsertion(
     final Cigar cigarForRead, final boolean ignoreSoftClipOps) {
   for (final CigarElement candidate : cigarForRead.getCigarElements()) {
     final CigarOperator operator = candidate.getOperator();
     if (operator == CigarOperator.INSERTION) {
       return candidate;
     }

     final boolean skippable =
         operator == CigarOperator.HARD_CLIP
             || (ignoreSoftClipOps && operator == CigarOperator.SOFT_CLIP);
     if (!skippable) {
       // Something other than a clip comes first, so the read does not start with an insertion.
       return null;
     }
   }
   return null;
 }
Ejemplo n.º 9
0
  /**
   * Compute the offset of the first "real" position in the cigar on the genome
   *
   * <p>The offset is the combined length of the leading run of hard clips and of the run of soft
   * clips that immediately follows it.
   *
   * @param cigar A non-null cigar
   * @return the offset (from 0) of the first on-genome base
   */
  private int calcHardSoftOffset(final Cigar cigar) {
    final List<CigarElement> ops = cigar.getCigarElements();

    int offset = 0;
    int cursor = 0;

    // Leading run of Hs ...
    for (;
        cursor < ops.size() && ops.get(cursor).getOperator() == CigarOperator.HARD_CLIP;
        cursor++) {
      offset += ops.get(cursor).getLength();
    }
    // ... followed by a run of Ss, picking up where the hard clips stopped.
    for (;
        cursor < ops.size() && ops.get(cursor).getOperator() == CigarOperator.SOFT_CLIP;
        cursor++) {
      offset += ops.get(cursor).getLength();
    }

    return offset;
  }
Ejemplo n.º 10
0
  /**
   * Computes the reference coordinate at which the read starts once leading soft clips are added
   * back; hard clips are skipped and contribute nothing.
   *
   * <p>Note: getUnclippedStart() adds soft and hard clips, this function only adds soft clips.
   *
   * @param read the read
   * @param cigar the read's cigar
   *     <p>Note: this overload of the function takes the cigar as input for speed because getCigar
   *     is an expensive operation. Most callers should use the overload that does not take the
   *     cigar.
   * @return the unclipped start of the read taking soft clips (but not hard clips) into account
   */
  public static int getSoftStart(final GATKRead read, final Cigar cigar) {
    Utils.nonNull(read, "read");
    Utils.nonNull(cigar, "cigar");

    final int alignedStart = read.getStart();
    int leadingSoftClips = 0;
    for (final CigarElement element : cigar.getCigarElements()) {
      final CigarOperator operator = element.getOperator();
      if (operator == CigarOperator.HARD_CLIP) {
        continue;
      }
      if (operator != CigarOperator.SOFT_CLIP) {
        // First element that is not a clip: nothing beyond this point is a leading clip.
        break;
      }
      leadingSoftClips += element.getLength();
    }
    return alignedStart - leadingSoftClips;
  }
Ejemplo n.º 11
0
  /**
   * Builds the cigar that results from hard clipping the read bases {@code start..stop}.
   *
   * <p>When {@code start} is 0 the beginning of the cigar is clipped up to and including read
   * index {@code stop}. Otherwise the cigar is kept up to read index {@code start} and everything
   * after it is replaced by a single trailing hard clip. Existing hard clips adjacent to the
   * clipped region are merged into the new hard clip element, and the alignment shift reported by
   * {@code calculateHardClippingAlignmentShift} is added to its length. The result is finally
   * passed through {@code cleanHardClippedCigar}.
   *
   * @param cigar the cigar to clip; must not be empty
   * @param start first read index to clip; 0 selects clipping at the beginning
   * @param stop last read index to clip
   * @return the value returned by {@code cleanHardClippedCigar} for the newly built cigar
   * @throws ReviewedGATKException if clipping from the start and the cigar holds only hard clips
   */
  @Requires({"!cigar.isEmpty()"})
  private CigarShift hardClipCigar(Cigar cigar, int start, int stop) {
    Cigar newCigar = new Cigar();
    int index = 0; // number of read bases walked over so far
    int totalHardClipCount = stop - start + 1;
    int alignmentShift = 0; // caused by hard clipping deletions

    // hard clip the beginning of the cigar string
    if (start == 0) {
      Iterator<CigarElement> cigarElementIterator = cigar.getCigarElements().iterator();
      CigarElement cigarElement = cigarElementIterator.next();
      // Skip all leading hard clips
      while (cigarElement.getOperator() == CigarOperator.HARD_CLIP) {
        totalHardClipCount += cigarElement.getLength();
        if (cigarElementIterator.hasNext()) cigarElement = cigarElementIterator.next();
        else
          throw new ReviewedGATKException(
              "Read is entirely hardclipped, shouldn't be trying to clip it's cigar string");
      }
      // keep clipping until we hit stop
      while (index <= stop) {
        // shift is the number of read bases the current element accounts for (0 if none)
        int shift = 0;
        if (cigarElement.getOperator().consumesReadBases()) shift = cigarElement.getLength();

        // we're still clipping or just finished perfectly
        if (index + shift == stop + 1) {
          alignmentShift +=
              calculateHardClippingAlignmentShift(cigarElement, cigarElement.getLength());
          newCigar.add(
              new CigarElement(totalHardClipCount + alignmentShift, CigarOperator.HARD_CLIP));
        }
        // element goes beyond what we need to clip
        else if (index + shift > stop + 1) {
          int elementLengthAfterChopping = cigarElement.getLength() - (stop - index + 1);
          alignmentShift += calculateHardClippingAlignmentShift(cigarElement, stop - index + 1);
          newCigar.add(
              new CigarElement(totalHardClipCount + alignmentShift, CigarOperator.HARD_CLIP));
          newCigar.add(new CigarElement(elementLengthAfterChopping, cigarElement.getOperator()));
        }
        index += shift;
        alignmentShift += calculateHardClippingAlignmentShift(cigarElement, shift);

        if (index <= stop && cigarElementIterator.hasNext())
          cigarElement = cigarElementIterator.next();
        else break;
      }

      // add the remaining cigar elements
      while (cigarElementIterator.hasNext()) {
        cigarElement = cigarElementIterator.next();
        newCigar.add(new CigarElement(cigarElement.getLength(), cigarElement.getOperator()));
      }
    }

    // hard clip the end of the cigar string
    else {
      Iterator<CigarElement> cigarElementIterator = cigar.getCigarElements().iterator();
      CigarElement cigarElement = cigarElementIterator.next();

      // Keep marching on until we find the start
      while (index < start) {
        // shift is the number of read bases the current element accounts for (0 if none)
        int shift = 0;
        if (cigarElement.getOperator().consumesReadBases()) shift = cigarElement.getLength();

        // we haven't gotten to the start yet, keep everything as is.
        if (index + shift < start)
          newCigar.add(new CigarElement(cigarElement.getLength(), cigarElement.getOperator()));

        // element goes beyond our clip starting position
        else {
          int elementLengthAfterChopping = start - index;
          alignmentShift +=
              calculateHardClippingAlignmentShift(
                  cigarElement, cigarElement.getLength() - (start - index));

          // if this last element is a HARD CLIP operator, just merge it with our hard clip operator
          // to be added later
          if (cigarElement.getOperator() == CigarOperator.HARD_CLIP)
            totalHardClipCount += elementLengthAfterChopping;
          // otherwise, maintain what's left of this last operator
          else
            newCigar.add(new CigarElement(elementLengthAfterChopping, cigarElement.getOperator()));
        }
        index += shift;
        if (index < start && cigarElementIterator.hasNext())
          cigarElement = cigarElementIterator.next();
        else break;
      }

      // check if we are hard clipping indels
      while (cigarElementIterator.hasNext()) {
        cigarElement = cigarElementIterator.next();
        alignmentShift +=
            calculateHardClippingAlignmentShift(cigarElement, cigarElement.getLength());

        // if the read had a HardClip operator in the end, combine it with the Hard Clip we are
        // adding
        if (cigarElement.getOperator() == CigarOperator.HARD_CLIP)
          totalHardClipCount += cigarElement.getLength();
      }
      // everything from the clip start onwards collapses into one trailing hard clip
      newCigar.add(new CigarElement(totalHardClipCount + alignmentShift, CigarOperator.HARD_CLIP));
    }
    return cleanHardClippedCigar(newCigar);
  }
Ejemplo n.º 12
0
  /**
   * Given a cigar string, soft clip up to startClipEnd and soft clip starting at endClipBegin.
   *
   * <p>Both bounds are offsets counted in read bases. Read bases at offsets in {@code
   * [__startClipEnd, __endClipBegin)} keep their original operator; read bases outside that range
   * become soft clips. An element straddling a bound is split. Adjacent elements that end up with
   * the same operator are merged before the new cigar is built.
   *
   * @param __cigar the cigar to clip
   * @param __startClipEnd read offset at which the leading soft clip ends (exclusive)
   * @param __endClipBegin read offset at which the trailing soft clip begins (inclusive)
   * @return a new cigar with the requested soft clips applied
   */
  private Cigar softClip(final Cigar __cigar, final int __startClipEnd, final int __endClipBegin) {
    if (__endClipBegin <= __startClipEnd) {
      // whole thing should be soft clipped
      // NOTE(review): this sums the length of every element, including ones that do not consume
      // read bases — confirm that is intended.
      int cigarLength = 0;
      for (CigarElement e : __cigar.getCigarElements()) {
        cigarLength += e.getLength();
      }

      Cigar newCigar = new Cigar();
      newCigar.add(new CigarElement(cigarLength, CigarOperator.SOFT_CLIP));
      assert newCigar.isValid(null, -1) == null;
      return newCigar;
    }

    int curLength = 0; // read bases consumed by the elements processed so far
    Vector<CigarElement> newElements = new Vector<CigarElement>();
    for (CigarElement curElem : __cigar.getCigarElements()) {
      if (!curElem.getOperator().consumesReadBases()) {
        // Elements without read bases: hard clips are always kept, anything else only when it
        // lies strictly inside the retained region; otherwise the element is dropped.
        if (curElem.getOperator() == CigarOperator.HARD_CLIP
            || curLength > __startClipEnd && curLength < __endClipBegin) {
          newElements.add(new CigarElement(curElem.getLength(), curElem.getOperator()));
        }
        continue;
      }

      // [s, e) is the range of read offsets covered by this element
      int s = curLength;
      int e = curLength + curElem.getLength();
      if (e <= __startClipEnd || s >= __endClipBegin) {
        // must turn this entire thing into a clip
        newElements.add(new CigarElement(curElem.getLength(), CigarOperator.SOFT_CLIP));
      } else if (s >= __startClipEnd && e <= __endClipBegin) {
        // same thing
        newElements.add(new CigarElement(curElem.getLength(), curElem.getOperator()));
      } else {
        // we are clipping in the middle of this guy
        CigarElement newStart = null;
        CigarElement newMid = null;
        CigarElement newEnd = null;

        int midLength = curElem.getLength();
        if (s < __startClipEnd) {
          // the part before the leading bound becomes a soft clip
          newStart = new CigarElement(__startClipEnd - s, CigarOperator.SOFT_CLIP);
          midLength -= newStart.getLength();
        }

        if (e > __endClipBegin) {
          // the part from the trailing bound onwards becomes a soft clip
          newEnd = new CigarElement(e - __endClipBegin, CigarOperator.SOFT_CLIP);
          midLength -= newEnd.getLength();
        }
        assert midLength >= 0;
        if (midLength > 0) {
          newMid = new CigarElement(midLength, curElem.getOperator());
        }
        if (newStart != null) {
          newElements.add(newStart);
        }
        if (newMid != null) {
          newElements.add(newMid);
        }
        if (newEnd != null) {
          newElements.add(newEnd);
        }
      }
      curLength += curElem.getLength();
    }

    // Merge neighbouring elements that share an operator into a single element.
    Vector<CigarElement> finalNewElements = new Vector<CigarElement>();
    CigarElement lastElement = null;
    for (CigarElement elem : newElements) {
      if (lastElement == null || lastElement.getOperator() != elem.getOperator()) {
        if (lastElement != null) {
          finalNewElements.add(lastElement);
        }
        lastElement = elem;
      } else {
        lastElement =
            new CigarElement(lastElement.getLength() + elem.getLength(), lastElement.getOperator());
      }
    }
    if (lastElement != null) {
      finalNewElements.add(lastElement);
    }

    Cigar newCigar = new Cigar(finalNewElements);
    assert newCigar.isValid(null, -1) == null;
    return newCigar;
  }
Ejemplo n.º 13
0
    /**
     * Builds the JSON description of a contig and its neighbourhood in a Cortex graph.
     *
     * <p>The {@code contigName} query value may be a literal ACGT sequence, a {@code Pf3D7...}
     * locus (contig, start and end separated by {@code :} or {@code -}), or the name of a loaded
     * contig; the first two forms are stored under the name {@code "manual"}. The contig is then
     * extended on both sides along unambiguous graph paths, a vertex graph is built over its
     * kmers, link information is collected, and everything is serialised into a JSON object.
     *
     * <p>NOTE(review): {@code query.get("contigName")} and {@code query.get("showLinks")} are
     * dereferenced without a null check, so a query missing either key fails with a
     * NullPointerException — the later {@code containsKey("contigName")} test comes too late to
     * prevent the first case.
     *
     * @param page not used by this method
     * @param query the request parameters; reads {@code contigName}, {@code graphName} and {@code
     *     showLinks}, and may overwrite {@code contigName}
     * @return the JSON text, or {@code null} when the contig or the graph is not known
     */
    @Override
    public String process(File page, Map<String, String> query) {
      loadContigs();

      // Accept a raw sequence or a reference locus in place of a contig name.
      if (query.get("contigName").matches("^[ACGT]+$")) {
        contigs.put("manual", query.get("contigName"));
        query.put("contigName", "manual");
      } else if (query.get("contigName").matches("^Pf3D7.+$")) {
        String[] pieces = query.get("contigName").split("[:-]");

        // coordinates may be written with thousands separators
        int start = Integer.valueOf(pieces[1].replaceAll(",", ""));
        int end = Integer.valueOf(pieces[2].replaceAll(",", ""));

        ReferenceSequence rseq = REF.getSubsequenceAt(pieces[0], start, end);
        contigs.put("manual", new String(rseq.getBases()));
        query.put("contigName", "manual");
      }

      if (query.containsKey("contigName")
          && contigs.containsKey(query.get("contigName"))
          && graphs.containsKey(query.get("graphName"))) {
        boolean showLinks = query.get("showLinks").equals("links_on");

        String contig = contigs.get(query.get("contigName"));
        String originalContig = contigs.get(query.get("contigName"));
        String refFormattedString = "";
        String kmerOrigin = "";

        // When alignment metrics exist for the contig, render the reference it aligned to.
        if (metrics.containsKey(query.get("contigName"))) {
          String[] loc = metrics.get(query.get("contigName")).get("canonicalLocus").split("[:-]");
          if (!loc[0].equals("*")) {
            boolean isRc = metrics.get(query.get("contigName")).get("isRcCanonical").equals("1");

            if (isRc) {
              contig = SequenceUtils.reverseComplement(contig);
              originalContig = SequenceUtils.reverseComplement(originalContig);
            }

            int locStart = Integer.valueOf(loc[1]);
            int locEnd = Integer.valueOf(loc[2]);

            // Widen the locus by the soft-clipped lengths at either end of the alignment.
            Cigar cigar =
                cigarStringToCigar(metrics.get(query.get("contigName")).get("cigarCanonical"));
            if (cigar.getCigarElement(0).getOperator().equals(CigarOperator.S)) {
              locStart -= cigar.getCigarElement(0).getLength();
            }

            if (cigar
                .getCigarElement(cigar.getCigarElements().size() - 1)
                .getOperator()
                .equals(CigarOperator.S)) {
              locEnd += cigar.getCigarElement(cigar.getCigarElements().size() - 1).getLength();
            }

            String ref = new String(REF.getSubsequenceAt(loc[0], locStart, locEnd).getBases());

            // Soft clips are shown in lower case, matches as is, insertions as dashes. Other
            // operators append nothing, although pos still advances over any reference bases
            // they consume.
            StringBuilder refFormatted = new StringBuilder();
            int pos = 0;
            for (CigarElement ce : cigar.getCigarElements()) {
              CigarOperator co = ce.getOperator();
              switch (co) {
                case S:
                  refFormatted.append(ref.substring(pos, pos + ce.getLength()).toLowerCase());
                  break;
                case M:
                  refFormatted.append(ref.substring(pos, pos + ce.getLength()));
                  break;
                case I:
                  refFormatted.append(StringUtils.repeat("-", ce.getLength()));
                  break;
              }

              if (ce.getOperator().consumesReferenceBases()) {
                pos += ce.getLength();
              }
            }

            refFormattedString = refFormatted.toString();

            kmerOrigin = metrics.get(query.get("contigName")).get("kmerOrigin");
          }
        }

        CortexGraph cg = graphs.get(query.get("graphName"));

        // Keep only the link maps whose first colour has the same sample name as the graph's.
        String sampleName = cg.getColor(0).getSampleName();
        Set<CortexLinksMap> links = new HashSet<CortexLinksMap>();
        if (LINKS != null && !LINKS.isEmpty()) {
          for (CortexLinksMap link : LINKS) {
            if (sampleName.equals(link.getCortexLinks().getColor(0).getSampleName())) {
              links.add(link);
            }
          }
        }

        // Kmers of the contig itself, recorded before the flanks are attached.
        Set<String> contigKmers = new HashSet<String>();
        for (int i = 0; i <= contig.length() - cg.getKmerSize(); i++) {
          String curKmer = contig.substring(i, i + cg.getKmerSize());

          contigKmers.add(curKmer);
        }

        // Walk backwards from the first kmer while there is exactly one predecessor, stopping at
        // a kmer already visited or once more than 100 kmers have been recorded.
        StringBuilder firstFlank = new StringBuilder();
        String firstKmer = contig.substring(0, cg.getKmerSize());
        Set<String> pks = CortexUtils.getPrevKmers(cg, firstKmer, 0);
        Set<String> usedPrevKmers = new HashSet<String>();
        usedPrevKmers.add(firstKmer);
        while (pks.size() == 1 && usedPrevKmers.size() <= 100) {
          String kmer = pks.iterator().next();
          firstFlank.insert(0, kmer.charAt(0));

          if (usedPrevKmers.contains(kmer)) {
            break;
          }
          usedPrevKmers.add(kmer);

          pks = CortexUtils.getPrevKmers(cg, kmer, 0);
        }

        // Same walk forwards from the last kmer.
        StringBuilder lastFlank = new StringBuilder();
        String lastKmer = contig.substring(contig.length() - cg.getKmerSize(), contig.length());
        Set<String> nks = CortexUtils.getNextKmers(cg, lastKmer, 0);
        Set<String> usedNextKmers = new HashSet<String>();
        usedNextKmers.add(lastKmer);
        while (nks.size() == 1 && usedNextKmers.size() <= 100) {
          String kmer = nks.iterator().next();
          lastFlank.append(kmer.charAt(kmer.length() - 1));

          if (usedNextKmers.contains(kmer)) {
            break;
          }
          usedNextKmers.add(kmer);

          nks = CortexUtils.getNextKmers(cg, kmer, 0);
        }

        contig = firstFlank.toString() + contig + lastFlank.toString();

        // One vertex per kmer of the extended contig, plus IN/OUT vertices for every graph
        // neighbour that is not the adjacent kmer of the contig.
        DirectedGraph<CtxVertex, MultiEdge> g =
            new DefaultDirectedGraph<CtxVertex, MultiEdge>(MultiEdge.class);
        for (int i = 0; i <= contig.length() - cg.getKmerSize(); i++) {
          String curKmer = contig.substring(i, i + cg.getKmerSize());
          CortexKmer ck = new CortexKmer(curKmer);
          CtxVertex curVer =
              new CtxVertex(
                  curKmer,
                  i,
                  contigKmers.contains(curKmer) ? VertexType.CONTIG : VertexType.CLIPPED,
                  cg.findRecord(ck));

          g.addVertex(curVer);

          String expectedPrevKmer =
              (i > 0) ? contig.substring(i - 1, i - 1 + cg.getKmerSize()) : "";
          String expectedNextKmer =
              (i < contig.length() - cg.getKmerSize())
                  ? contig.substring(i + 1, i + 1 + cg.getKmerSize())
                  : "";

          Set<String> prevKmers = CortexUtils.getPrevKmers(cg, curKmer, 0);
          for (String prevKmer : prevKmers) {
            if (!expectedPrevKmer.equals(prevKmer)) {
              CortexKmer pk = new CortexKmer(prevKmer);
              CtxVertex prevVer = new CtxVertex(prevKmer, i - 1, VertexType.IN, cg.findRecord(pk));

              MultiEdge me =
                  g.containsEdge(prevVer, curVer) ? g.getEdge(prevVer, curVer) : new MultiEdge();
              me.addGraphName(cg.getCortexFile().getName());

              g.addVertex(prevVer);
              g.addEdge(prevVer, curVer, me);
            }
          }

          Set<String> nextKmers = CortexUtils.getNextKmers(cg, curKmer, 0);
          for (String nextKmer : nextKmers) {
            if (!expectedNextKmer.equals(nextKmer)) {
              CortexKmer nk = new CortexKmer(nextKmer);
              CtxVertex nextVer = new CtxVertex(nextKmer, i + 1, VertexType.OUT, cg.findRecord(nk));

              MultiEdge me =
                  g.containsEdge(curVer, nextVer) ? g.getEdge(curVer, nextVer) : new MultiEdge();
              me.addGraphName(cg.getCortexFile().getName());

              g.addVertex(nextVer);
              g.addEdge(curVer, nextVer, me);
            }
          }
        }

        // Record every contig kmer that has a link record; with links on, also accumulate
        // junction coverage for every ordered pair of distinct positions in each link.
        Set<Map<String, Object>> verticesWithLinks = new HashSet<Map<String, Object>>();
        DataFrame<String, String, Integer> hv = new DataFrame<String, String, Integer>(0);

        for (int q = 0; q <= contig.length() - cg.getKmerSize(); q++) {
          // String sk = cv.getBinaryKmer();
          String sk = contig.substring(q, q + cg.getKmerSize());
          CortexKmer ck = new CortexKmer(sk);

          for (CortexLinksMap link : links) {
            if (link.containsKey(ck)) {
              CortexLinksRecord clr = link.get(ck);
              Map<String, Integer> lc =
                  (!showLinks)
                      ? new HashMap<String, Integer>()
                      : CortexUtils.getKmersAndCoverageInLink(cg, sk, clr);

              Map<String, Object> entry = new HashMap<String, Object>();
              entry.put("kmer", sk);
              entry.put("lc", lc);

              verticesWithLinks.add(entry);

              if (showLinks) {
                for (CortexJunctionsRecord cjr : clr.getJunctions()) {
                  List<String> lk = CortexUtils.getKmersInLink(cg, sk, cjr);

                  for (int i = 0; i < lk.size(); i++) {
                    String kili = lk.get(i);

                    for (int j = 0; j < lk.size(); j++) {
                      String kilj = lk.get(j);

                      if (i != j) {
                        hv.set(kili, kilj, hv.get(kili, kilj) + cjr.getCoverage(0));
                      }
                    }
                  }
                }
              }
            }
          }
        }

        /*
        int hvMax = 0;
        Map<String, Integer> hvlin = new HashMap<String, Integer>();
        if (showLinks) {
            for (String kili : hv.getRowNames()) {
                for (String kilj : hv.getColNames()) {
                    int cov = hv.get(kili, kilj);

                    String id = kili + "_" + kilj;
                    hvlin.put(id, cov);

                    if (cov > hvMax) {
                        hvMax = cov;
                    }
                }
            }
        }
        */

        // Assemble the response. clipStart and clipEnd are the offsets, within the extended
        // contig, between which the contig (without the added flanks) lies.
        JSONObject jo = new JSONObject();
        jo.put("contig", contig);
        jo.put("originalContig", originalContig);
        jo.put("ref", refFormattedString);
        jo.put("kmerOrigin", kmerOrigin);
        jo.put("kmerSize", cg.getKmerSize());
        jo.put("clipStart", firstFlank.length());
        jo.put("clipEnd", contig.length() - lastFlank.length());

        List<Map<String, Object>> va = new ArrayList<Map<String, Object>>();
        for (CtxVertex v : g.vertexSet()) {
          Map<String, Object> vm = new HashMap<String, Object>();
          vm.put("base", v.getBase());
          vm.put("kmer", v.getKmer());
          vm.put("pos", v.getPos());
          vm.put("type", v.getVertexType().name());
          vm.put("missing", v.isMissingFromGraph());
          vm.put("cov", v.getCoverage());

          va.add(vm);
        }

        jo.put("vertices", va);
        jo.put("verticesWithLinks", verticesWithLinks);
        // jo.put("hvlin", hvlin);
        // jo.put("hvmax", hvMax);

        return jo.toString();
      }

      // unknown contig or graph
      return null;
    }
Ejemplo n.º 14
0
  /**
   * Translates a reference coordinate into a coordinate within the read by walking the cigar.
   *
   * <p>The walk advances until {@code refCoord - alignmentStart} bases have been consumed,
   * counting operators that consume reference bases as well as soft clips, and accumulates the
   * number of read bases passed on the way. A {@code refCoord} equal to {@code alignmentStart}
   * therefore maps to read coordinate 0.
   *
   * <p>WARNING: when the requested coordinate falls inside a deletion or skipped region, or on the
   * last base of an element that is directly followed by one (optionally with an insertion in
   * between), the returned read coordinate is backed off by one so that it refers to the last read
   * base before that deletion/skipped region, and the boolean of the returned pair is {@code
   * true}.
   *
   * @param alignmentStart the reference position the walk starts from; since soft clips are
   *     counted, this is presumably the soft (unclipped) start -- TODO confirm against callers
   * @param cigar the cigar of the read
   * @param refCoord the reference coordinate to translate
   * @param allowGoalNotReached if true, a coordinate that cannot be reached yields {@code
   *     (CLIPPING_GOAL_NOT_REACHED, false)} instead of an exception
   * @return a pair of the read coordinate and a flag telling whether the reference coordinate
   *     falls inside, or just before, a deletion or skipped region
   * @throws GATKException if the coordinate cannot be reached and {@code allowGoalNotReached} is
   *     false
   */
  private static Pair<Integer, Boolean> getReadCoordinateForReferenceCoordinate(
      final int alignmentStart,
      final Cigar cigar,
      final int refCoord,
      final boolean allowGoalNotReached) {
    // running totals of read bases and of reference(+soft clip) bases walked so far
    int readBases = 0;
    int refBases = 0;
    boolean fallsInsideDeletionOrSkippedRegion = false;
    boolean endJustBeforeDeletionOrSkippedRegion = false;
    boolean fallsInsideOrJustBeforeDeletionOrSkippedRegion = false;

    final int goal = refCoord - alignmentStart; // The goal is to move this many reference bases
    if (goal < 0) {
      // the requested coordinate lies before the start of the walk
      if (allowGoalNotReached) {
        return new MutablePair<>(CLIPPING_GOAL_NOT_REACHED, false);
      } else {
        throw new GATKException(
            "Somehow the requested coordinate is not covered by the read. Too many deletions?");
      }
    }
    // a goal of 0 is reached immediately: the loop below is skipped and coordinate 0 is returned
    boolean goalReached = refBases == goal;

    final Iterator<CigarElement> cigarElementIterator = cigar.getCigarElements().iterator();
    while (!goalReached && cigarElementIterator.hasNext()) {
      final CigarElement cigarElement = cigarElementIterator.next();
      // number of bases of the current element that count toward the goal
      int shift = 0;

      // Soft clips are counted toward the goal in addition to the operators that consume
      // reference bases.
      if (cigarElement.getOperator().consumesReferenceBases()
          || cigarElement.getOperator() == CigarOperator.SOFT_CLIP) {
        // Take the whole element only if that still leaves us strictly short of the goal;
        // otherwise take exactly what is missing (which may be the full element length).
        if (refBases + cigarElement.getLength() < goal) {
          shift = cigarElement.getLength();
        } else {
          shift = goal - refBases;
        }

        refBases += shift;
      }
      goalReached = refBases == goal;

      // Goal not reached yet: the element is passed entirely, so count all of its read bases.
      if (!goalReached && cigarElement.getOperator().consumesReadBases()) {
        readBases += cigarElement.getLength();
      }

      if (goalReached) {
        // Is this base's reference position within this cigar element? Or did we use it all?
        final boolean endsWithinCigar = shift < cigarElement.getLength();

        // If it isn't, we need to check the next one. There is expected to *ALWAYS* be a next one
        // (the original author notes that the goal coordinate was checked to be within the read
        // length -- that check is not part of this method), so this is just a sanity check.
        if (!endsWithinCigar && !cigarElementIterator.hasNext()) {
          if (allowGoalNotReached) {
            return new MutablePair<>(CLIPPING_GOAL_NOT_REACHED, false);
          } else {
            throw new GATKException(
                String.format(
                    "Reference coordinate corresponds to a non-existent base in the read. This should never happen -- check read with alignment start: %s  and cigar: %s",
                    alignmentStart, cigar));
          }
        }

        // only assigned when the goal is reached exactly at the end of the current element
        CigarElement nextCigarElement = null;

        // if we end inside the current cigar element, we just have to check if it is a deletion (or
        // skipped region)
        if (endsWithinCigar) {
          fallsInsideDeletionOrSkippedRegion =
              (cigarElement.getOperator() == CigarOperator.DELETION
                  || cigarElement.getOperator() == CigarOperator.SKIPPED_REGION);
        } // if we end exactly at the end of the current cigar element, we need to check if the
          // next element is an insertion, deletion or skipped region.
        else {
          nextCigarElement = cigarElementIterator.next();

          // if it's an insertion, we need to clip the whole insertion before looking at the next
          // element
          if (nextCigarElement.getOperator() == CigarOperator.INSERTION) {
            readBases += nextCigarElement.getLength();
            // an insertion as the last element leaves no base for the coordinate to map to
            if (!cigarElementIterator.hasNext()) {
              if (allowGoalNotReached) {
                return new MutablePair<>(CLIPPING_GOAL_NOT_REACHED, false);
              } else {
                throw new GATKException(
                    String.format(
                        "Reference coordinate corresponds to a non-existent base in the read. This should never happen -- check read with alignment start: %s  and cigar: %s",
                        alignmentStart, cigar));
              }
            }

            nextCigarElement = cigarElementIterator.next();
          }

          // if it's a deletion (or skipped region), we will pass the information on to be handled
          // downstream.
          endJustBeforeDeletionOrSkippedRegion =
              (nextCigarElement.getOperator() == CigarOperator.DELETION
                  || nextCigarElement.getOperator() == CigarOperator.SKIPPED_REGION);
        }

        fallsInsideOrJustBeforeDeletionOrSkippedRegion =
            endJustBeforeDeletionOrSkippedRegion || fallsInsideDeletionOrSkippedRegion;

        // If we reached our goal outside a deletion (or skipped region), add the shift
        if (!fallsInsideOrJustBeforeDeletionOrSkippedRegion
            && cigarElement.getOperator().consumesReadBases()) {
          readBases += shift;
        } // If we reached our goal just before a deletion (or skipped region) we need
        // to add the shift of the current cigar element but step back to its last base, so that
        // we return the last base before the deletion (or skipped region) (see the warning in
        // the method contract)
        else if (endJustBeforeDeletionOrSkippedRegion
            && cigarElement.getOperator().consumesReadBases()) {
          readBases += shift - 1;
        } // If we reached our goal inside a deletion (or skipped region), or just between a
          // deletion and a skipped region,
        // then we must backtrack to the last base before the deletion (or skipped region)
        else if (fallsInsideDeletionOrSkippedRegion
            || (endJustBeforeDeletionOrSkippedRegion
                && nextCigarElement.getOperator().equals(CigarOperator.N))
            || (endJustBeforeDeletionOrSkippedRegion
                && nextCigarElement.getOperator().equals(CigarOperator.D))) {
          readBases--;
        }
      }
    }

    // the cigar ran out before enough bases were walked
    if (!goalReached) {
      if (allowGoalNotReached) {
        return new MutablePair<>(CLIPPING_GOAL_NOT_REACHED, false);
      } else {
        throw new GATKException(
            "Somehow the requested coordinate is not covered by the read. Alignment "
                + alignmentStart
                + " | "
                + cigar);
      }
    }

    return Pair.of(readBases, fallsInsideOrJustBeforeDeletionOrSkippedRegion);
  }
Ejemplo n.º 15
0
  /**
   * Writes a three-row textual rendering of this alignment to standard output.
   *
   * <p>Three rows are built in parallel: a marker row ({@code '.'} match, {@code '*'} mismatch,
   * {@code 'I'} insertion, {@code 'S'} soft clip, {@code 'D'} deletion), the read row and the
   * reference row, with {@code '-'} standing in for the side that has no base. The rows are then
   * handed to {@code print_cautiously} in that order, starting at successive multiples of {@code
   * width}, with an empty line after each group.
   *
   * @param ref the reference bases
   * @param read the read bases
   * @param width the step between successive output groups, passed on to {@code print_cautiously}
   * @throws GATKException if the cigar contains an operator other than M, I, S or D
   */
  public void printAlignment(final byte[] ref, final byte[] read, final int width) {
    final StringBuilder readRow = new StringBuilder();
    final StringBuilder refRow = new StringBuilder();
    final StringBuilder markerRow = new StringBuilder();

    int refPos = 0;
    int readPos = 0;

    final int offset = getAlignmentStart2wrt1();

    Cigar cigar = getCigar();

    // Only when soft clipping is not in use can the offset be negative, i.e. the read overhangs
    // the start of the reference.
    if (overhangStrategy != OverhangStrategy.SOFTCLIP && offset < 0) {
      final int overhang = -offset;
      while (readPos < overhang) {
        readRow.append((char) read[readPos]);
        refRow.append(' ');
        markerRow.append(' ');
        readPos++;
      }

      // The first cigar element still accounts for the overhanging bases emitted above, so
      // shorten it by that amount. This works on a copy of the element list; the original cigar
      // is left as it was.
      final List<CigarElement> trimmed = new ArrayList<>(cigar.getCigarElements());
      final CigarElement head = cigar.getCigarElement(0);
      trimmed.set(0, new CigarElement(head.getLength() + offset, head.getOperator()));
      cigar = new Cigar(trimmed);
    }

    // Positive offset: the read starts inside the reference, so emit the leading reference bases
    // on their own. (Original author's note: with this implementation the cigar starts with S
    // only if the read starts before the ref, i.e. offset = 0.)
    if (offset > 0) {
      while (refPos < getAlignmentStart2wrt1()) {
        refRow.append((char) ref[refPos]);
        readRow.append(' ');
        markerRow.append(' ');
        refPos++;
      }
    }

    for (final CigarElement element : cigar.getCigarElements()) {
      final int length = element.getLength();
      switch (element.getOperator()) {
        case M:
          // both sides advance; positions past the end of either array are rendered as blanks
          for (int k = 0; k < length; k++) {
            final boolean haveRef = refPos < ref.length;
            final boolean haveRead = readPos < read.length;
            refRow.append(haveRef ? (char) ref[refPos] : ' ');
            readRow.append(haveRead ? (char) read[readPos] : ' ');
            if (haveRef && haveRead) {
              markerRow.append(ref[refPos] == read[readPos] ? '.' : '*');
            } else {
              markerRow.append(' ');
            }
            refPos++;
            readPos++;
          }
          break;
        case I:
          // only the read advances
          for (int k = 0; k < length; k++) {
            refRow.append('-');
            readRow.append((char) read[readPos]);
            markerRow.append('I');
            readPos++;
          }
          break;
        case S:
          // only the read advances
          for (int k = 0; k < length; k++) {
            refRow.append(' ');
            readRow.append((char) read[readPos]);
            markerRow.append('S');
            readPos++;
          }
          break;
        case D:
          // only the reference advances
          for (int k = 0; k < length; k++) {
            refRow.append((char) ref[refPos]);
            readRow.append('-');
            markerRow.append('D');
            refPos++;
          }
          break;
        default:
          throw new GATKException("Unexpected Cigar element:" + element.getOperator());
      }
    }

    // whatever is left over on either side goes out unaligned
    while (refPos < ref.length) {
      refRow.append((char) ref[refPos]);
      refPos++;
    }
    while (readPos < read.length) {
      readRow.append((char) read[readPos]);
      readPos++;
    }

    final int longest =
        Math.max(markerRow.length(), Math.max(readRow.length(), refRow.length()));
    for (int pos = 0; pos < longest; pos += width) {
      print_cautiously(markerRow, pos, width);
      print_cautiously(readRow, pos, width);
      print_cautiously(refRow, pos, width);
      System.out.println();
    }
  }