Exemple #1
0
  /**
   * Calculates the reference coordinate for a read coordinate
   *
   * @param read the read
   * @param offset the base in the read (coordinate in the read)
   * @return the reference coordinate correspondent to this base
   */
  public static long getReferenceCoordinateForReadCoordinate(GATKSAMRecord read, int offset) {
    if (offset > read.getReadLength())
      throw new ReviewedStingException(
          String.format(OFFSET_OUT_OF_BOUNDS_EXCEPTION, offset, read.getReadLength()));

    long location = read.getAlignmentStart();
    Iterator<CigarElement> cigarElementIterator = read.getCigar().getCigarElements().iterator();
    while (offset > 0 && cigarElementIterator.hasNext()) {
      CigarElement cigarElement = cigarElementIterator.next();
      long move = 0;
      if (cigarElement.getOperator().consumesReferenceBases())
        move = (long) Math.min(cigarElement.getLength(), offset);
      location += move;
      offset -= move;
    }
    if (offset > 0 && !cigarElementIterator.hasNext())
      throw new ReviewedStingException(OFFSET_NOT_ZERO_EXCEPTION);

    return location;
  }
Exemple #2
0
  public static Pair<Integer, Boolean> getReadCoordinateForReferenceCoordinate(
      final int alignmentStart,
      final Cigar cigar,
      final int refCoord,
      final boolean allowGoalNotReached) {
    int readBases = 0;
    int refBases = 0;
    boolean fallsInsideDeletion = false;

    int goal = refCoord - alignmentStart; // The goal is to move this many reference bases
    if (goal < 0) {
      if (allowGoalNotReached) {
        return new Pair<Integer, Boolean>(CLIPPING_GOAL_NOT_REACHED, false);
      } else {
        throw new ReviewedStingException(
            "Somehow the requested coordinate is not covered by the read. Too many deletions?");
      }
    }
    boolean goalReached = refBases == goal;

    Iterator<CigarElement> cigarElementIterator = cigar.getCigarElements().iterator();
    while (!goalReached && cigarElementIterator.hasNext()) {
      CigarElement cigarElement = cigarElementIterator.next();
      int shift = 0;

      if (cigarElement.getOperator().consumesReferenceBases()
          || cigarElement.getOperator() == CigarOperator.SOFT_CLIP) {
        if (refBases + cigarElement.getLength() < goal) shift = cigarElement.getLength();
        else shift = goal - refBases;

        refBases += shift;
      }
      goalReached = refBases == goal;

      if (!goalReached && cigarElement.getOperator().consumesReadBases())
        readBases += cigarElement.getLength();

      if (goalReached) {
        // Is this base's reference position within this cigar element? Or did we use it all?
        boolean endsWithinCigar = shift < cigarElement.getLength();

        // If it isn't, we need to check the next one. There should *ALWAYS* be a next one
        // since we checked if the goal coordinate is within the read length, so this is just a
        // sanity check.
        if (!endsWithinCigar && !cigarElementIterator.hasNext()) {
          if (allowGoalNotReached) {
            return new Pair<Integer, Boolean>(CLIPPING_GOAL_NOT_REACHED, false);
          } else {
            throw new ReviewedStingException(
                "Reference coordinate corresponds to a non-existent base in the read. This should never happen -- call Mauricio");
          }
        }

        CigarElement nextCigarElement;

        // if we end inside the current cigar element, we just have to check if it is a deletion
        if (endsWithinCigar)
          fallsInsideDeletion = cigarElement.getOperator() == CigarOperator.DELETION;

        // if we end outside the current cigar element, we need to check if the next element is an
        // insertion or deletion.
        else {
          nextCigarElement = cigarElementIterator.next();

          // if it's an insertion, we need to clip the whole insertion before looking at the next
          // element
          if (nextCigarElement.getOperator() == CigarOperator.INSERTION) {
            readBases += nextCigarElement.getLength();
            if (!cigarElementIterator.hasNext()) {
              if (allowGoalNotReached) {
                return new Pair<Integer, Boolean>(CLIPPING_GOAL_NOT_REACHED, false);
              } else {
                throw new ReviewedStingException(
                    "Reference coordinate corresponds to a non-existent base in the read. This should never happen -- call Mauricio");
              }
            }

            nextCigarElement = cigarElementIterator.next();
          }

          // if it's a deletion, we will pass the information on to be handled downstream.
          fallsInsideDeletion = nextCigarElement.getOperator() == CigarOperator.DELETION;
        }

        // If we reached our goal outside a deletion, add the shift
        if (!fallsInsideDeletion && cigarElement.getOperator().consumesReadBases())
          readBases += shift;

        // If we reached our goal inside a deletion, but the deletion is the next cigar element then
        // we need
        // to add the shift of the current cigar element but go back to it's last element to return
        // the last
        // base before the deletion (see warning in function contracts)
        else if (fallsInsideDeletion && !endsWithinCigar) readBases += shift - 1;

        // If we reached our goal inside a deletion then we must backtrack to the last base before
        // the deletion
        else if (fallsInsideDeletion && endsWithinCigar) readBases--;
      }
    }

    if (!goalReached) {
      if (allowGoalNotReached) {
        return new Pair<Integer, Boolean>(CLIPPING_GOAL_NOT_REACHED, false);
      } else {
        throw new ReviewedStingException(
            "Somehow the requested coordinate is not covered by the read. Alignment "
                + alignmentStart
                + " | "
                + cigar);
      }
    }

    return new Pair<Integer, Boolean>(readBases, fallsInsideDeletion);
  }
Exemple #3
0
  public static void align(
      Graph graph,
      SAMRecord rec,
      Node recNode,
      ReferenceSequence sequence,
      SAMProgramRecord programRecord,
      int offset,
      AlleleCoverageCutoffs alleleCoverageCutoffs,
      boolean correctBases,
      boolean useSequenceQualities,
      int MAXIMUM_TOTAL_COVERAGE,
      int MAX_HEAP_SIZE)
      throws Exception {

    int i;
    AlignHeapNode curAlignHeapNode = null;
    AlignHeapNode nextAlignHeapNode = null;
    AlignHeapNode bestAlignHeapNode = null;
    AlignHeap heap = null;
    String read = null; // could be cs
    String readBases = null; // always nt
    String qualities = null; // could be cq
    SRMAUtil.Space space = SRMAUtil.Space.NTSPACE;
    ListIterator<NodeRecord> iter = null;
    AlignHeapNodeComparator comp = null;
    int alignmentStart = -1;
    int numStartNodesAdded = 0;
    boolean strand = rec.getReadNegativeStrandFlag(); // false -> forward, true -> reverse
    String softClipStartBases = null;
    String softClipStartQualities = null;
    String softClipEndBases = null;
    String softClipEndQualities = null;

    // Debugging stuff
    String readName = rec.getReadName();

    assert SRMAUtil.Space.COLORSPACE != space;

    // Get space
    read = (String) rec.getAttribute("CS");
    if (null == read) {
      // Use base space
      space = SRMAUtil.Space.NTSPACE;
    } else {
      // assumes CS and CQ are always in sequencing order
      space = SRMAUtil.Space.COLORSPACE;
    }

    // Get read and qualities
    if (space == SRMAUtil.Space.NTSPACE) {
      byte tmpRead[] = rec.getReadString().getBytes();
      byte tmpQualities[] = rec.getBaseQualityString().getBytes();
      // Reverse once
      if (strand) { // reverse
        SAMRecordUtil.reverseArray(tmpRead);
        SAMRecordUtil.reverseArray(tmpQualities);
      }
      read = new String(tmpRead);
      readBases = new String(tmpRead);
      qualities = new String(tmpQualities);
      // Reverse again
      if (strand) { // reverse
        SAMRecordUtil.reverseArray(tmpRead);
        SAMRecordUtil.reverseArray(tmpQualities);
      }
    } else {
      byte tmpRead[] = rec.getReadString().getBytes();
      // Reverse once
      if (strand) { // reverse
        SAMRecordUtil.reverseArray(tmpRead);
      }
      readBases = new String(tmpRead);
      // Reverse again
      if (strand) { // reverse
        SAMRecordUtil.reverseArray(tmpRead);
      }
      read = SRMAUtil.normalizeColorSpaceRead(read);
      qualities = (String) rec.getAttribute("CQ");
      // Some aligners include a quality value for the adapter.  A quality value
      // IMHO should not be given for an unobserved (assumed) peice of data.  Trim
      // the first quality in this case
      if (qualities.length() == 1 + read.length()) { // trim the first quality
        qualities = qualities.substring(1);
      }
    }
    // Reverse back
    if (readBases.length() <= 0) {
      throw new Exception("Error.  The current alignment has no bases.");
    }
    if (read.length() <= 0) {
      throw new Exception("Error.  The current alignment has no bases.");
    }
    if (qualities.length() <= 0) {
      throw new Exception("Error.  The current alignment has no qualities.");
    }
    if (readBases.length() != read.length()) {
      if (space == SRMAUtil.Space.COLORSPACE) {
        throw new Exception(
            "Error.  The current alignment's read bases length does not match the length of the colors in the CS tag ["
                + rec.getReadName()
                + "].");
      } else {
        throw new Exception("Error.  Internal error: readBases.length() != read.length()");
      }
    }

    // Deal with soft-clipping
    // - save the soft clipped sequence for latter
    {
      List<CigarElement> cigarElements = null;

      cigarElements = rec.getCigar().getCigarElements();
      CigarElement e1 = cigarElements.get(0); // first
      CigarElement e2 = cigarElements.get(cigarElements.size() - 1); // last

      // Soft-clipped
      if (CigarOperator.S == e1.getOperator()) {
        if (space == SRMAUtil.Space.COLORSPACE) {
          throw new Exception(
              "Error.  Soft clipping with color-space data not currently supported.");
        }
        int l = e1.getLength();
        if (strand) { // reverse
          softClipStartBases = readBases.substring(readBases.length() - l);
          softClipStartQualities = qualities.substring(qualities.length() - l);
          readBases = readBases.substring(0, readBases.length() - l);
          read = read.substring(0, read.length() - l);
          qualities = qualities.substring(0, qualities.length() - l);
        } else {
          softClipStartBases = readBases.substring(0, l - 1);
          softClipStartQualities = qualities.substring(0, l - 1);
          readBases = readBases.substring(l);
          read = read.substring(l);
          qualities = qualities.substring(l);
        }
      }
      if (CigarOperator.S == e2.getOperator()) {
        if (space == SRMAUtil.Space.COLORSPACE) {
          throw new Exception(
              "Error.  Soft clipping with color-space data not currently supported.");
        }
        int l = e2.getLength();
        if (strand) { // reverse
          softClipEndBases = readBases.substring(0, l - 1);
          softClipEndQualities = qualities.substring(0, l - 1);
          readBases = readBases.substring(l);
          read = read.substring(l);
          qualities = qualities.substring(l);
        } else {
          softClipEndBases = readBases.substring(readBases.length() - l);
          softClipEndQualities = qualities.substring(qualities.length() - l);
          readBases = readBases.substring(0, readBases.length() - l);
          read = read.substring(0, read.length() - l);
          qualities = qualities.substring(0, qualities.length() - l);
        }
      }
    }

    // Remove mate pair information
    Align.removeMateInfo(rec);

    comp =
        new AlignHeapNodeComparator(
            (strand) ? AlignHeap.HeapType.MAXHEAP : AlignHeap.HeapType.MINHEAP);

    // Bound by original alignment if possible
    bestAlignHeapNode =
        Align.boundWithOriginalAlignment(
            rec,
            graph,
            recNode,
            comp,
            strand,
            read,
            qualities,
            readBases,
            space,
            sequence,
            alleleCoverageCutoffs,
            useSequenceQualities,
            MAXIMUM_TOTAL_COVERAGE,
            MAX_HEAP_SIZE);

    /*
    System.err.println("readName="+rec.getReadName());
    if(null != bestAlignHeapNode) {
    System.err.println("\nFOUND BEST:" + rec.toString());
    }
    else {
    System.err.println("\nNOT FOUND (BEST): " + rec.toString());
    }
    Align.updateSAM(rec, programRecord, bestAlignHeapNode, space, read, qualities, softClipStartBases, softClipStartQualities, softClipEndBases, softClipEndQualities, strand, correctBases);
    return;
    */

    heap = new AlignHeap((strand) ? AlignHeap.HeapType.MAXHEAP : AlignHeap.HeapType.MINHEAP);

    // Add start nodes
    if (strand) { // reverse
      alignmentStart = rec.getAlignmentEnd();
      for (i = alignmentStart + offset; alignmentStart - offset <= i; i--) {
        int position = graph.getPriorityQueueIndexAtPositionOrBefore(i);
        PriorityQueue<Node> startNodeQueue = graph.getPriorityQueue(position);
        if (0 != position && null != startNodeQueue) {
          Iterator<Node> startNodeQueueIter = startNodeQueue.iterator();
          while (startNodeQueueIter.hasNext()) {
            Node startNode = startNodeQueueIter.next();
            int f = passFilters(graph, startNode, alleleCoverageCutoffs, MAXIMUM_TOTAL_COVERAGE);
            if (0 == f) {
              heap.add(
                  new AlignHeapNode(
                      null,
                      startNode,
                      startNode.coverage,
                      read.charAt(0),
                      qualities.charAt(0),
                      useSequenceQualities,
                      space));
            } else if (f < 0) {
              return;
            }
            if (startNode.position < i) {
              i = startNode.position;
            }
            numStartNodesAdded++;
          }
        }
      }
    } else {
      alignmentStart = rec.getAlignmentStart();
      for (i = alignmentStart - offset; i <= alignmentStart + offset; i++) {
        int position = graph.getPriorityQueueIndexAtPositionOrGreater(i);
        PriorityQueue<Node> startNodeQueue = graph.getPriorityQueue(position);
        if (0 != position && null != startNodeQueue) {
          Iterator<Node> startNodeQueueIter = startNodeQueue.iterator();
          while (startNodeQueueIter.hasNext()) {
            Node startNode = startNodeQueueIter.next();
            int f = passFilters(graph, startNode, alleleCoverageCutoffs, MAXIMUM_TOTAL_COVERAGE);
            if (0 == f) {
              heap.add(
                  new AlignHeapNode(
                      null,
                      startNode,
                      startNode.coverage,
                      read.charAt(0),
                      qualities.charAt(0),
                      useSequenceQualities,
                      space));
            } else if (f < 0) {
              return;
            }
            if (i < startNode.position) {
              i = startNode.position;
            }
            numStartNodesAdded++;
          }
        }
      }
    }
    if (numStartNodesAdded == 0) {
      throw new Exception("Did not add any start nodes!");
    }

    // Get first node off the heap
    curAlignHeapNode = heap.poll();

    while (null != curAlignHeapNode) {

      if (MAX_HEAP_SIZE <= heap.size()) {
        // too many to consider
        return;
      }

      // System.err.println("strand:" + strand + "\tsize:" + heap.size() + "\talignmentStart:" +
      // alignmentStart + "\toffset:" + offset + "\treadOffset:" + curAlignHeapNode.readOffset);
      // System.err.print("size:" + heap.size() + ":" + curAlignHeapNode.readOffset + ":" +
      // curAlignHeapNode.score + ":" + curAlignHeapNode.alleleCoverageSum + ":" +
      // curAlignHeapNode.startPosition + "\t");
      // curAlignHeapNode.node.print(System.err);
      // System.err.print("\rposition:" + curAlignHeapNode.node.position + "\treadOffset:" +
      // curAlignHeapNode.readOffset);

      // Remove all non-insertions with the same contig/pos/read-offset/type/base and lower score
      nextAlignHeapNode = heap.peek();
      while (Node.INSERTION != curAlignHeapNode.node.type
          && null != nextAlignHeapNode
          && 0 == comp.compare(curAlignHeapNode, nextAlignHeapNode)) {
        if (curAlignHeapNode.score < nextAlignHeapNode.score
            || (curAlignHeapNode.score == nextAlignHeapNode.score
                && curAlignHeapNode.alleleCoverageSum < nextAlignHeapNode.alleleCoverageSum)) {
          // Update current node
          curAlignHeapNode = heap.poll();
        } else {
          // Ignore next node
          heap.poll();
        }
        nextAlignHeapNode = heap.peek();
      }
      nextAlignHeapNode = null;

      // Check if the alignment is complete
      if (curAlignHeapNode.readOffset == read.length() - 1) {
        // All read bases examined, store if has the best alignment.

        // System.err.print(curAlignHeapNode.alleleCoverageSum + ":" + curAlignHeapNode.score +
        // ":");
        // System.err.print(curAlignHeapNode.startPosition + ":");
        // curAlignHeapNode.node.print(System.err);

        if (null == bestAlignHeapNode
            || bestAlignHeapNode.score < curAlignHeapNode.score
            || (bestAlignHeapNode.score == curAlignHeapNode.score
                && bestAlignHeapNode.alleleCoverageSum < curAlignHeapNode.alleleCoverageSum)) {
          bestAlignHeapNode = curAlignHeapNode;
        }
      } else if (null != bestAlignHeapNode && curAlignHeapNode.score < bestAlignHeapNode.score) {
        // ignore, under the assumption that scores can only become more negative.
      } else {
        if (strand) { // reverse
          // Go to all the "prev" nodes
          iter = curAlignHeapNode.node.prev.listIterator();
        } else { // forward
          // Go to all "next" nodes
          iter = curAlignHeapNode.node.next.listIterator();
        }
        while (iter.hasNext()) {
          NodeRecord next = iter.next();
          int f =
              passFilters(
                  graph, next.node, next.coverage, alleleCoverageCutoffs, MAXIMUM_TOTAL_COVERAGE);
          if (0 == f) {
            heap.add(
                new AlignHeapNode(
                    curAlignHeapNode,
                    next.node,
                    next.coverage,
                    read.charAt(curAlignHeapNode.readOffset + 1),
                    qualities.charAt(curAlignHeapNode.readOffset + 1),
                    useSequenceQualities,
                    space));
          } else if (f < 0) {
            return;
          }
        }
        iter = null;
      }
      // Get next node
      curAlignHeapNode = heap.poll();
    }

    // Recover alignment
    Align.updateSAM(
        rec,
        sequence,
        programRecord,
        bestAlignHeapNode,
        space,
        read,
        qualities,
        softClipStartBases,
        softClipStartQualities,
        softClipEndBases,
        softClipEndQualities,
        strand,
        correctBases);
  }