Beispiel #1
0
  /**
   * Creates a map with each event in the read (cigar operator) and the read coordinate where it
   * happened.
   *
   * <p>Example: D -> 2, 34, 75 I -> 55 S -> 0, 101 H -> 101
   *
   * @param read the read
   * @return a map with the properties described above. See example
   */
  public static Map<CigarOperator, ArrayList<Integer>> getCigarOperatorForAllBases(
      GATKSAMRecord read) {
    Map<CigarOperator, ArrayList<Integer>> events =
        new HashMap<CigarOperator, ArrayList<Integer>>();

    int position = 0;
    for (CigarElement cigarElement : read.getCigar().getCigarElements()) {
      CigarOperator op = cigarElement.getOperator();
      if (op.consumesReadBases()) {
        ArrayList<Integer> list = events.get(op);
        if (list == null) {
          list = new ArrayList<Integer>();
          events.put(op, list);
        }
        for (int i = position; i < cigarElement.getLength(); i++) list.add(position++);
      } else {
        ArrayList<Integer> list = events.get(op);
        if (list == null) {
          list = new ArrayList<Integer>();
          events.put(op, list);
        }
        list.add(position);
      }
    }
    return events;
  }
Beispiel #2
0
  /**
   * Returns the coverage distribution of a single read within the desired region.
   *
   * <p>Note: This function counts DELETIONS as coverage (since the main purpose is to downsample
   * reads for variant regions, and deletions count as variants)
   *
   * @param read the read to get the coverage distribution of
   * @param startLocation the first reference coordinate of the region (inclusive)
   * @param stopLocation the last reference coordinate of the region (inclusive)
   * @return an array with the coverage of each position from startLocation to stopLocation
   */
  public static int[] getCoverageDistributionOfRead(
      GATKSAMRecord read, int startLocation, int stopLocation) {
    int[] coverage = new int[stopLocation - startLocation + 1];
    int refLocation = read.getSoftStart();
    for (CigarElement cigarElement : read.getCigar().getCigarElements()) {
      switch (cigarElement.getOperator()) {
        case S:
        case M:
        case EQ:
        case N:
        case X:
        case D:
          for (int i = 0; i < cigarElement.getLength(); i++) {
            if (refLocation >= startLocation && refLocation <= stopLocation) {
              int baseCount =
                  read.isReducedRead()
                      ? read.getReducedCount(refLocation - read.getSoftStart())
                      : 1;
              coverage[refLocation - startLocation] +=
                  baseCount; // this may be a reduced read, so add the proper number of bases
            }
            refLocation++;
          }
          break;

        case P:
        case I:
        case H:
          break;
      }

      if (refLocation > stopLocation) break;
    }
    return coverage;
  }
Beispiel #3
0
 public static Pair<Boolean, CigarElement> readStartsWithInsertion(final Cigar cigar) {
   for (CigarElement cigarElement : cigar.getCigarElements()) {
     if (cigarElement.getOperator() == CigarOperator.INSERTION)
       return new Pair<Boolean, CigarElement>(true, cigarElement);
     else if (cigarElement.getOperator() != CigarOperator.HARD_CLIP
         && cigarElement.getOperator() != CigarOperator.SOFT_CLIP) break;
   }
   return new Pair<Boolean, CigarElement>(false, null);
 }
Beispiel #4
0
  /**
   * Hard clips any leading insertions in the read. Only looks at the beginning of the read, not the
   * end.
   *
   * @return a new read without leading insertions
   */
  private GATKSAMRecord hardClipLeadingInsertions() {
    if (read.isEmpty()) return read;

    for (CigarElement cigarElement : read.getCigar().getCigarElements()) {
      if (cigarElement.getOperator() != CigarOperator.HARD_CLIP
          && cigarElement.getOperator() != CigarOperator.SOFT_CLIP
          && cigarElement.getOperator() != CigarOperator.INSERTION) break;
      else if (cigarElement.getOperator() == CigarOperator.INSERTION)
        this.addOp(new ClippingOp(0, cigarElement.getLength() - 1));
    }
    return clipRead(ClippingRepresentation.HARDCLIP_BASES);
  }
Beispiel #5
0
  /**
   * Calculates the reference coordinate for a read coordinate
   *
   * @param read the read
   * @param offset the base in the read (coordinate in the read)
   * @return the reference coordinate correspondent to this base
   */
  public static long getReferenceCoordinateForReadCoordinate(GATKSAMRecord read, int offset) {
    if (offset > read.getReadLength())
      throw new ReviewedStingException(
          String.format(OFFSET_OUT_OF_BOUNDS_EXCEPTION, offset, read.getReadLength()));

    long location = read.getAlignmentStart();
    Iterator<CigarElement> cigarElementIterator = read.getCigar().getCigarElements().iterator();
    while (offset > 0 && cigarElementIterator.hasNext()) {
      CigarElement cigarElement = cigarElementIterator.next();
      long move = 0;
      if (cigarElement.getOperator().consumesReferenceBases())
        move = (long) Math.min(cigarElement.getLength(), offset);
      location += move;
      offset -= move;
    }
    if (offset > 0 && !cigarElementIterator.hasNext())
      throw new ReviewedStingException(OFFSET_NOT_ZERO_EXCEPTION);

    return location;
  }
Beispiel #6
0
  /**
   * Will hard clip every soft clipped bases in the read.
   *
   * @return a new read without the soft clipped bases
   */
  private GATKSAMRecord hardClipSoftClippedBases() {
    if (read.isEmpty()) return read;

    int readIndex = 0;
    int cutLeft = -1; // first position to hard clip (inclusive)
    int cutRight = -1; // first position to hard clip (inclusive)
    boolean rightTail =
        false; // trigger to stop clipping the left tail and start cutting the right tail

    for (CigarElement cigarElement : read.getCigar().getCigarElements()) {
      if (cigarElement.getOperator() == CigarOperator.SOFT_CLIP) {
        if (rightTail) {
          cutRight = readIndex;
        } else {
          cutLeft = readIndex + cigarElement.getLength() - 1;
        }
      } else if (cigarElement.getOperator() != CigarOperator.HARD_CLIP) rightTail = true;

      if (cigarElement.getOperator().consumesReadBases()) readIndex += cigarElement.getLength();
    }

    // It is extremely important that we cut the end first otherwise the read coordinates change.
    if (cutRight >= 0) this.addOp(new ClippingOp(cutRight, read.getReadLength() - 1));
    if (cutLeft >= 0) this.addOp(new ClippingOp(0, cutLeft));

    return clipRead(ClippingRepresentation.HARDCLIP_BASES);
  }
  /** Steps forward on the genome. Returns false when done reading the read, true otherwise. */
  public boolean stepForwardOnGenome() {
    if (currentOperatorIndex == numOperators) return false;

    CigarElement curElement = read.getCigar().getCigarElement(currentOperatorIndex);
    if (currentPositionOnOperator >= curElement.getLength()) {
      if (++currentOperatorIndex == numOperators) return false;

      curElement = read.getCigar().getCigarElement(currentOperatorIndex);
      currentPositionOnOperator = 0;
    }

    switch (curElement.getOperator()) {
      case I: // insertion w.r.t. the reference
        if (!sawMop) break;
      case S: // soft clip
        currentReadOffset += curElement.getLength();
      case H: // hard clip
      case P: // padding
        currentOperatorIndex++;
        return stepForwardOnGenome();

      case D: // deletion w.r.t. the reference
      case N: // reference skip (looks and gets processed just like a "deletion", just different
        // logical meaning)
        currentPositionOnOperator++;
        break;

      case M:
      case EQ:
      case X:
        sawMop = true;
        currentReadOffset++;
        currentPositionOnOperator++;
        break;
      default:
        throw new IllegalStateException("No support for cigar op: " + curElement.getOperator());
    }

    final boolean isFirstOp = currentOperatorIndex == 0;
    final boolean isLastOp = currentOperatorIndex == numOperators - 1;
    final boolean isFirstBaseOfOp = currentPositionOnOperator == 1;
    final boolean isLastBaseOfOp = currentPositionOnOperator == curElement.getLength();

    isBeforeDeletionStart =
        isBeforeOp(
            read.getCigar(), currentOperatorIndex, CigarOperator.D, isLastOp, isLastBaseOfOp);
    isBeforeDeletedBase =
        isBeforeDeletionStart || (!isLastBaseOfOp && curElement.getOperator() == CigarOperator.D);
    isAfterDeletionEnd =
        isAfterOp(
            read.getCigar(), currentOperatorIndex, CigarOperator.D, isFirstOp, isFirstBaseOfOp);
    isAfterDeletedBase =
        isAfterDeletionEnd || (!isFirstBaseOfOp && curElement.getOperator() == CigarOperator.D);
    isBeforeInsertion =
        isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isLastOp, isLastBaseOfOp)
            || (!sawMop && curElement.getOperator() == CigarOperator.I);
    isAfterInsertion =
        isAfterOp(
            read.getCigar(), currentOperatorIndex, CigarOperator.I, isFirstOp, isFirstBaseOfOp);
    isNextToSoftClip =
        isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isLastOp, isLastBaseOfOp)
            || isAfterOp(
                read.getCigar(), currentOperatorIndex, CigarOperator.S, isFirstOp, isFirstBaseOfOp);

    return true;
  }
Beispiel #8
0
  @Override
  public int doWork(String[] args) {
    File refFile = null;
    com.github.lindenb.jvarkit.util.cli.GetOpt getopt =
        new com.github.lindenb.jvarkit.util.cli.GetOpt();
    int c;
    while ((c = getopt.getopt(args, "hvL:r:")) != -1) {
      switch (c) {
        case 'h':
          printUsage();
          return 0;
        case 'v':
          System.out.println(getVersion());
          return 0;
        case 'L':
          getLogger().setLevel(java.util.logging.Level.parse(getopt.getOptArg()));
          break;
        case 'r':
          refFile = new File(getopt.getOptArg());
          break;
        case ':':
          System.err.println("Missing argument for option -" + getopt.getOptOpt());
          return -1;
        default:
          System.err.println("Unknown option -" + getopt.getOptOpt());
          return -1;
      }
    }

    if (refFile == null) {
      error("Undefined REF file");
      return -1;
    }
    File bamFile = null;
    if (getopt.getOptInd() + 1 != args.length) {
      info("reading from stdin.");
    } else {
      bamFile = new File(args[getopt.getOptInd()]);
    }

    IndexedFastaSequenceFile indexedFastaSequenceFile = null;
    SAMFileReader samFileReader = null;

    try {
      GenomicSequence genomicSequence = null;
      indexedFastaSequenceFile = new IndexedFastaSequenceFile(refFile);
      SAMFileReader.setDefaultValidationStringency(ValidationStringency.SILENT);
      samFileReader = null;
      if (bamFile == null) {
        samFileReader = new SAMFileReader(System.in);
      } else {
        samFileReader = new SAMFileReader(bamFile);
      }
      XMLOutputFactory xmlfactory = XMLOutputFactory.newInstance();
      XMLStreamWriter w = xmlfactory.createXMLStreamWriter(System.out, "UTF-8");
      w.writeStartDocument("UTF-8", "1.0");
      w.writeStartElement("sam");
      w.writeComment(getProgramCommandLine());
      w.writeAttribute("ref", (bamFile == null ? "stdin" : bamFile.getPath()));
      w.writeAttribute("bam", args[1]);

      SAMRecordIterator iter = samFileReader.iterator();
      while (iter.hasNext()) {
        SAMRecord rec = iter.next();

        final byte readbases[] = rec.getReadBases();
        w.writeStartElement("read");

        w.writeStartElement("name");
        w.writeCharacters(rec.getReadName());
        w.writeEndElement();
        w.writeStartElement("sequence");
        w.writeCharacters(new String(readbases));
        w.writeEndElement();
        w.writeStartElement("flags");
        w.writeAttribute("paired", String.valueOf(rec.getReadPairedFlag()));
        w.writeAttribute(
            "failsVendorQual", String.valueOf(rec.getReadFailsVendorQualityCheckFlag()));
        w.writeAttribute("mapped", String.valueOf(!rec.getReadUnmappedFlag()));
        w.writeAttribute("strand", (rec.getReadNegativeStrandFlag() ? "-" : "+"));

        if (rec.getReadPairedFlag()) {
          w.writeAttribute("mate-mapped", String.valueOf(!rec.getMateUnmappedFlag()));
          w.writeAttribute("mate-strand", (rec.getMateNegativeStrandFlag() ? "-" : "+"));
          w.writeAttribute("proper-pair", String.valueOf(rec.getProperPairFlag()));
        }

        w.writeCharacters(String.valueOf(rec.getFlags()));
        w.writeEndElement();
        if (!rec.getReadUnmappedFlag()) {
          w.writeStartElement("qual");
          w.writeCharacters(String.valueOf(rec.getMappingQuality()));
          w.writeEndElement();

          w.writeStartElement("chrom");
          w.writeAttribute("index", String.valueOf(rec.getReferenceIndex()));
          w.writeCharacters(rec.getReferenceName());
          w.writeEndElement();
          w.writeStartElement("pos");
          w.writeCharacters(String.valueOf(rec.getAlignmentStart()));
          w.writeEndElement();
          w.writeStartElement("cigar");
          w.writeCharacters(rec.getCigarString());
          w.writeEndElement();
        }

        if (!rec.getMateUnmappedFlag()) {
          w.writeStartElement("mate-chrom");
          w.writeAttribute("index", String.valueOf(rec.getMateReferenceIndex()));
          w.writeCharacters(rec.getMateReferenceName());
          w.writeEndElement();
          w.writeStartElement("mate-pos");
          w.writeCharacters(String.valueOf(rec.getMateAlignmentStart()));
          w.writeEndElement();
        }

        if (!rec.getReadUnmappedFlag()) {
          if (genomicSequence == null
              || genomicSequence.getChrom().equals(rec.getReferenceName())) {
            genomicSequence = new GenomicSequence(indexedFastaSequenceFile, rec.getReferenceName());
          }

          w.writeStartElement("align");

          int readIndex = 0;
          int refIndex = rec.getAlignmentStart();

          for (final CigarElement e : rec.getCigar().getCigarElements()) {
            switch (e.getOperator()) {
              case H:
                break; // ignore hard clips
              case P:
                break; // ignore pads
              case I: // cont.
              case S:
                {
                  final int length = e.getLength();
                  for (int i = 0; i < length; ++i) {
                    w.writeEmptyElement(e.getOperator().name());
                    w.writeAttribute("read-index", String.valueOf(readIndex + 1));
                    if (readIndex >= 0 && readIndex < readbases.length) {
                      w.writeAttribute("read-base", String.valueOf((char) (readbases[readIndex])));
                    }
                    readIndex++;
                  }
                  break;
                }
              case N: // cont. -- reference skip
              case D:
                {
                  final int length = e.getLength();
                  for (int i = 0; i < length; ++i) {
                    w.writeEmptyElement(e.getOperator().name());
                    w.writeAttribute("ref-index", String.valueOf(refIndex));
                    if (refIndex >= 1 && refIndex <= genomicSequence.length()) {
                      w.writeAttribute(
                          "ref-base", String.valueOf(genomicSequence.charAt(refIndex - 1)));
                    }
                    refIndex++;
                  }
                  break;
                }
              case M:
              case EQ:
              case X:
                {
                  final int length = e.getLength();
                  for (int i = 0; i < length; ++i) {
                    w.writeEmptyElement(e.getOperator().name());
                    char baseRead = '\0';
                    if (readIndex >= 0 && readIndex < readbases.length) {
                      baseRead = (char) (rec.getReadBases()[readIndex]);
                      w.writeAttribute("read-index", String.valueOf(readIndex + 1));
                      w.writeAttribute("read-base", String.valueOf(baseRead));
                    }
                    w.writeAttribute("ref-index", String.valueOf(refIndex));
                    if (refIndex >= 1 && refIndex <= genomicSequence.length()) {
                      char baseRef = genomicSequence.charAt(refIndex - 1);
                      w.writeAttribute("ref-base", String.valueOf(baseRef));
                      if (Character.toUpperCase(baseRef) != Character.toUpperCase(baseRead)) {
                        w.writeAttribute("mismatch", "true");
                      }
                    }

                    refIndex++;
                    readIndex++;
                  }
                  break;
                }

              default:
                throw new IllegalStateException(
                    "Case statement didn't deal with cigar op: " + e.getOperator());
            }
          }
        }

        w.writeEndElement();

        w.writeEndElement();

        iter.close();
        w.writeEndElement();
      }
      w.writeEndElement();
      w.writeEndDocument();
      w.flush();
      w.close();
    } catch (Exception err) {
      error(err);
      return -1;
    } finally {
      CloserUtil.close(samFileReader);
      CloserUtil.close(indexedFastaSequenceFile);
    }
    return 0;
  }
      private void collectQualityData(final SAMRecord record, final ReferenceSequence reference) {
        // If the read isnt an aligned PF read then look at the read for no-calls
        if (record.getReadUnmappedFlag()
            || record.getReadFailsVendorQualityCheckFlag()
            || !doRefMetrics) {
          final byte[] readBases = record.getReadBases();
          for (int i = 0; i < readBases.length; i++) {
            if (SequenceUtil.isNoCall(readBases[i])) {
              badCycleHistogram.increment(
                  CoordMath.getCycle(record.getReadNegativeStrandFlag(), readBases.length, i));
            }
          }
        } else if (!record.getReadFailsVendorQualityCheckFlag()) {
          final boolean highQualityMapping = isHighQualityMapping(record);
          if (highQualityMapping) metrics.PF_HQ_ALIGNED_READS++;

          final byte[] readBases = record.getReadBases();
          final byte[] refBases = reference.getBases();
          final byte[] qualities = record.getBaseQualities();
          final int refLength = refBases.length;
          long mismatchCount = 0;
          long hqMismatchCount = 0;

          for (final AlignmentBlock alignmentBlock : record.getAlignmentBlocks()) {
            final int readIndex = alignmentBlock.getReadStart() - 1;
            final int refIndex = alignmentBlock.getReferenceStart() - 1;
            final int length = alignmentBlock.getLength();

            for (int i = 0; i < length && refIndex + i < refLength; ++i) {
              final int readBaseIndex = readIndex + i;
              boolean mismatch =
                  !SequenceUtil.basesEqual(readBases[readBaseIndex], refBases[refIndex + i]);
              boolean bisulfiteBase = false;
              if (mismatch && isBisulfiteSequenced) {
                if ((record.getReadNegativeStrandFlag()
                        && (refBases[refIndex + i] == 'G' || refBases[refIndex + i] == 'g')
                        && (readBases[readBaseIndex] == 'A' || readBases[readBaseIndex] == 'a'))
                    || ((!record.getReadNegativeStrandFlag())
                            && (refBases[refIndex + i] == 'C' || refBases[refIndex + i] == 'c')
                            && (readBases[readBaseIndex] == 'T')
                        || readBases[readBaseIndex] == 't')) {

                  bisulfiteBase = true;
                  mismatch = false;
                }
              }

              if (mismatch) mismatchCount++;

              metrics.PF_ALIGNED_BASES++;
              if (!bisulfiteBase) nonBisulfiteAlignedBases++;

              if (highQualityMapping) {
                metrics.PF_HQ_ALIGNED_BASES++;
                if (!bisulfiteBase) hqNonBisulfiteAlignedBases++;
                if (qualities[readBaseIndex] >= BASE_QUALITY_THRESHOLD)
                  metrics.PF_HQ_ALIGNED_Q20_BASES++;
                if (mismatch) hqMismatchCount++;
              }

              if (mismatch || SequenceUtil.isNoCall(readBases[readBaseIndex])) {
                badCycleHistogram.increment(
                    CoordMath.getCycle(record.getReadNegativeStrandFlag(), readBases.length, i));
              }
            }
          }

          mismatchHistogram.increment(mismatchCount);
          hqMismatchHistogram.increment(hqMismatchCount);

          // Add any insertions and/or deletions to the global count
          for (final CigarElement elem : record.getCigar().getCigarElements()) {
            final CigarOperator op = elem.getOperator();
            if (op == CigarOperator.INSERTION || op == CigarOperator.DELETION) ++this.indels;
          }
        }
      }
Beispiel #10
0
 /**
  * Is this read all insertion?
  *
  * @param read
  * @return whether or not the only element in the cigar string is an Insertion
  */
 public static boolean readIsEntirelyInsertion(GATKSAMRecord read) {
   for (CigarElement cigarElement : read.getCigar().getCigarElements()) {
     if (cigarElement.getOperator() != CigarOperator.INSERTION) return false;
   }
   return true;
 }
Beispiel #11
0
  public static Pair<Integer, Boolean> getReadCoordinateForReferenceCoordinate(
      final int alignmentStart,
      final Cigar cigar,
      final int refCoord,
      final boolean allowGoalNotReached) {
    int readBases = 0;
    int refBases = 0;
    boolean fallsInsideDeletion = false;

    int goal = refCoord - alignmentStart; // The goal is to move this many reference bases
    if (goal < 0) {
      if (allowGoalNotReached) {
        return new Pair<Integer, Boolean>(CLIPPING_GOAL_NOT_REACHED, false);
      } else {
        throw new ReviewedStingException(
            "Somehow the requested coordinate is not covered by the read. Too many deletions?");
      }
    }
    boolean goalReached = refBases == goal;

    Iterator<CigarElement> cigarElementIterator = cigar.getCigarElements().iterator();
    while (!goalReached && cigarElementIterator.hasNext()) {
      CigarElement cigarElement = cigarElementIterator.next();
      int shift = 0;

      if (cigarElement.getOperator().consumesReferenceBases()
          || cigarElement.getOperator() == CigarOperator.SOFT_CLIP) {
        if (refBases + cigarElement.getLength() < goal) shift = cigarElement.getLength();
        else shift = goal - refBases;

        refBases += shift;
      }
      goalReached = refBases == goal;

      if (!goalReached && cigarElement.getOperator().consumesReadBases())
        readBases += cigarElement.getLength();

      if (goalReached) {
        // Is this base's reference position within this cigar element? Or did we use it all?
        boolean endsWithinCigar = shift < cigarElement.getLength();

        // If it isn't, we need to check the next one. There should *ALWAYS* be a next one
        // since we checked if the goal coordinate is within the read length, so this is just a
        // sanity check.
        if (!endsWithinCigar && !cigarElementIterator.hasNext()) {
          if (allowGoalNotReached) {
            return new Pair<Integer, Boolean>(CLIPPING_GOAL_NOT_REACHED, false);
          } else {
            throw new ReviewedStingException(
                "Reference coordinate corresponds to a non-existent base in the read. This should never happen -- call Mauricio");
          }
        }

        CigarElement nextCigarElement;

        // if we end inside the current cigar element, we just have to check if it is a deletion
        if (endsWithinCigar)
          fallsInsideDeletion = cigarElement.getOperator() == CigarOperator.DELETION;

        // if we end outside the current cigar element, we need to check if the next element is an
        // insertion or deletion.
        else {
          nextCigarElement = cigarElementIterator.next();

          // if it's an insertion, we need to clip the whole insertion before looking at the next
          // element
          if (nextCigarElement.getOperator() == CigarOperator.INSERTION) {
            readBases += nextCigarElement.getLength();
            if (!cigarElementIterator.hasNext()) {
              if (allowGoalNotReached) {
                return new Pair<Integer, Boolean>(CLIPPING_GOAL_NOT_REACHED, false);
              } else {
                throw new ReviewedStingException(
                    "Reference coordinate corresponds to a non-existent base in the read. This should never happen -- call Mauricio");
              }
            }

            nextCigarElement = cigarElementIterator.next();
          }

          // if it's a deletion, we will pass the information on to be handled downstream.
          fallsInsideDeletion = nextCigarElement.getOperator() == CigarOperator.DELETION;
        }

        // If we reached our goal outside a deletion, add the shift
        if (!fallsInsideDeletion && cigarElement.getOperator().consumesReadBases())
          readBases += shift;

        // If we reached our goal inside a deletion, but the deletion is the next cigar element then
        // we need
        // to add the shift of the current cigar element but go back to it's last element to return
        // the last
        // base before the deletion (see warning in function contracts)
        else if (fallsInsideDeletion && !endsWithinCigar) readBases += shift - 1;

        // If we reached our goal inside a deletion then we must backtrack to the last base before
        // the deletion
        else if (fallsInsideDeletion && endsWithinCigar) readBases--;
      }
    }

    if (!goalReached) {
      if (allowGoalNotReached) {
        return new Pair<Integer, Boolean>(CLIPPING_GOAL_NOT_REACHED, false);
      } else {
        throw new ReviewedStingException(
            "Somehow the requested coordinate is not covered by the read. Alignment "
                + alignmentStart
                + " | "
                + cigar);
      }
    }

    return new Pair<Integer, Boolean>(readBases, fallsInsideDeletion);
  }
Beispiel #12
0
 /**
  * If a read ends in INSERTION, returns the last element length.
  *
  * <p>Warning: If the read has Hard or Soft clips after the insertion this function will return 0.
  *
  * @param read
  * @return the length of the last insertion, or 0 if there is none (see warning).
  */
 public static final int getLastInsertionOffset(SAMRecord read) {
   CigarElement e = read.getCigar().getCigarElement(read.getCigarLength() - 1);
   if (e.getOperator() == CigarOperator.I) return e.getLength();
   else return 0;
 }
Beispiel #13
0
  public static void align(
      Graph graph,
      SAMRecord rec,
      Node recNode,
      ReferenceSequence sequence,
      SAMProgramRecord programRecord,
      int offset,
      AlleleCoverageCutoffs alleleCoverageCutoffs,
      boolean correctBases,
      boolean useSequenceQualities,
      int MAXIMUM_TOTAL_COVERAGE,
      int MAX_HEAP_SIZE)
      throws Exception {

    int i;
    AlignHeapNode curAlignHeapNode = null;
    AlignHeapNode nextAlignHeapNode = null;
    AlignHeapNode bestAlignHeapNode = null;
    AlignHeap heap = null;
    String read = null; // could be cs
    String readBases = null; // always nt
    String qualities = null; // could be cq
    SRMAUtil.Space space = SRMAUtil.Space.NTSPACE;
    ListIterator<NodeRecord> iter = null;
    AlignHeapNodeComparator comp = null;
    int alignmentStart = -1;
    int numStartNodesAdded = 0;
    boolean strand = rec.getReadNegativeStrandFlag(); // false -> forward, true -> reverse
    String softClipStartBases = null;
    String softClipStartQualities = null;
    String softClipEndBases = null;
    String softClipEndQualities = null;

    // Debugging stuff
    String readName = rec.getReadName();

    assert SRMAUtil.Space.COLORSPACE != space;

    // Get space
    read = (String) rec.getAttribute("CS");
    if (null == read) {
      // Use base space
      space = SRMAUtil.Space.NTSPACE;
    } else {
      // assumes CS and CQ are always in sequencing order
      space = SRMAUtil.Space.COLORSPACE;
    }

    // Get read and qualities
    if (space == SRMAUtil.Space.NTSPACE) {
      byte tmpRead[] = rec.getReadString().getBytes();
      byte tmpQualities[] = rec.getBaseQualityString().getBytes();
      // Reverse once
      if (strand) { // reverse
        SAMRecordUtil.reverseArray(tmpRead);
        SAMRecordUtil.reverseArray(tmpQualities);
      }
      read = new String(tmpRead);
      readBases = new String(tmpRead);
      qualities = new String(tmpQualities);
      // Reverse again
      if (strand) { // reverse
        SAMRecordUtil.reverseArray(tmpRead);
        SAMRecordUtil.reverseArray(tmpQualities);
      }
    } else {
      byte tmpRead[] = rec.getReadString().getBytes();
      // Reverse once
      if (strand) { // reverse
        SAMRecordUtil.reverseArray(tmpRead);
      }
      readBases = new String(tmpRead);
      // Reverse again
      if (strand) { // reverse
        SAMRecordUtil.reverseArray(tmpRead);
      }
      read = SRMAUtil.normalizeColorSpaceRead(read);
      qualities = (String) rec.getAttribute("CQ");
      // Some aligners include a quality value for the adapter.  A quality value
      // IMHO should not be given for an unobserved (assumed) peice of data.  Trim
      // the first quality in this case
      if (qualities.length() == 1 + read.length()) { // trim the first quality
        qualities = qualities.substring(1);
      }
    }
    // Reverse back
    if (readBases.length() <= 0) {
      throw new Exception("Error.  The current alignment has no bases.");
    }
    if (read.length() <= 0) {
      throw new Exception("Error.  The current alignment has no bases.");
    }
    if (qualities.length() <= 0) {
      throw new Exception("Error.  The current alignment has no qualities.");
    }
    if (readBases.length() != read.length()) {
      if (space == SRMAUtil.Space.COLORSPACE) {
        throw new Exception(
            "Error.  The current alignment's read bases length does not match the length of the colors in the CS tag ["
                + rec.getReadName()
                + "].");
      } else {
        throw new Exception("Error.  Internal error: readBases.length() != read.length()");
      }
    }

    // Deal with soft-clipping
    // - save the soft clipped sequence for latter
    {
      List<CigarElement> cigarElements = null;

      cigarElements = rec.getCigar().getCigarElements();
      CigarElement e1 = cigarElements.get(0); // first
      CigarElement e2 = cigarElements.get(cigarElements.size() - 1); // last

      // Soft-clipped
      if (CigarOperator.S == e1.getOperator()) {
        if (space == SRMAUtil.Space.COLORSPACE) {
          throw new Exception(
              "Error.  Soft clipping with color-space data not currently supported.");
        }
        int l = e1.getLength();
        if (strand) { // reverse
          softClipStartBases = readBases.substring(readBases.length() - l);
          softClipStartQualities = qualities.substring(qualities.length() - l);
          readBases = readBases.substring(0, readBases.length() - l);
          read = read.substring(0, read.length() - l);
          qualities = qualities.substring(0, qualities.length() - l);
        } else {
          softClipStartBases = readBases.substring(0, l - 1);
          softClipStartQualities = qualities.substring(0, l - 1);
          readBases = readBases.substring(l);
          read = read.substring(l);
          qualities = qualities.substring(l);
        }
      }
      if (CigarOperator.S == e2.getOperator()) {
        if (space == SRMAUtil.Space.COLORSPACE) {
          throw new Exception(
              "Error.  Soft clipping with color-space data not currently supported.");
        }
        int l = e2.getLength();
        if (strand) { // reverse
          softClipEndBases = readBases.substring(0, l - 1);
          softClipEndQualities = qualities.substring(0, l - 1);
          readBases = readBases.substring(l);
          read = read.substring(l);
          qualities = qualities.substring(l);
        } else {
          softClipEndBases = readBases.substring(readBases.length() - l);
          softClipEndQualities = qualities.substring(qualities.length() - l);
          readBases = readBases.substring(0, readBases.length() - l);
          read = read.substring(0, read.length() - l);
          qualities = qualities.substring(0, qualities.length() - l);
        }
      }
    }

    // Remove mate pair information
    Align.removeMateInfo(rec);

    comp =
        new AlignHeapNodeComparator(
            (strand) ? AlignHeap.HeapType.MAXHEAP : AlignHeap.HeapType.MINHEAP);

    // Bound by original alignment if possible
    bestAlignHeapNode =
        Align.boundWithOriginalAlignment(
            rec,
            graph,
            recNode,
            comp,
            strand,
            read,
            qualities,
            readBases,
            space,
            sequence,
            alleleCoverageCutoffs,
            useSequenceQualities,
            MAXIMUM_TOTAL_COVERAGE,
            MAX_HEAP_SIZE);

    /*
    System.err.println("readName="+rec.getReadName());
    if(null != bestAlignHeapNode) {
    System.err.println("\nFOUND BEST:" + rec.toString());
    }
    else {
    System.err.println("\nNOT FOUND (BEST): " + rec.toString());
    }
    Align.updateSAM(rec, programRecord, bestAlignHeapNode, space, read, qualities, softClipStartBases, softClipStartQualities, softClipEndBases, softClipEndQualities, strand, correctBases);
    return;
    */

    heap = new AlignHeap((strand) ? AlignHeap.HeapType.MAXHEAP : AlignHeap.HeapType.MINHEAP);

    // Add start nodes
    if (strand) { // reverse
      alignmentStart = rec.getAlignmentEnd();
      for (i = alignmentStart + offset; alignmentStart - offset <= i; i--) {
        int position = graph.getPriorityQueueIndexAtPositionOrBefore(i);
        PriorityQueue<Node> startNodeQueue = graph.getPriorityQueue(position);
        if (0 != position && null != startNodeQueue) {
          Iterator<Node> startNodeQueueIter = startNodeQueue.iterator();
          while (startNodeQueueIter.hasNext()) {
            Node startNode = startNodeQueueIter.next();
            int f = passFilters(graph, startNode, alleleCoverageCutoffs, MAXIMUM_TOTAL_COVERAGE);
            if (0 == f) {
              heap.add(
                  new AlignHeapNode(
                      null,
                      startNode,
                      startNode.coverage,
                      read.charAt(0),
                      qualities.charAt(0),
                      useSequenceQualities,
                      space));
            } else if (f < 0) {
              return;
            }
            if (startNode.position < i) {
              i = startNode.position;
            }
            numStartNodesAdded++;
          }
        }
      }
    } else {
      alignmentStart = rec.getAlignmentStart();
      for (i = alignmentStart - offset; i <= alignmentStart + offset; i++) {
        int position = graph.getPriorityQueueIndexAtPositionOrGreater(i);
        PriorityQueue<Node> startNodeQueue = graph.getPriorityQueue(position);
        if (0 != position && null != startNodeQueue) {
          Iterator<Node> startNodeQueueIter = startNodeQueue.iterator();
          while (startNodeQueueIter.hasNext()) {
            Node startNode = startNodeQueueIter.next();
            int f = passFilters(graph, startNode, alleleCoverageCutoffs, MAXIMUM_TOTAL_COVERAGE);
            if (0 == f) {
              heap.add(
                  new AlignHeapNode(
                      null,
                      startNode,
                      startNode.coverage,
                      read.charAt(0),
                      qualities.charAt(0),
                      useSequenceQualities,
                      space));
            } else if (f < 0) {
              return;
            }
            if (i < startNode.position) {
              i = startNode.position;
            }
            numStartNodesAdded++;
          }
        }
      }
    }
    if (numStartNodesAdded == 0) {
      throw new Exception("Did not add any start nodes!");
    }

    // Get first node off the heap
    curAlignHeapNode = heap.poll();

    while (null != curAlignHeapNode) {

      if (MAX_HEAP_SIZE <= heap.size()) {
        // too many to consider
        return;
      }

      // System.err.println("strand:" + strand + "\tsize:" + heap.size() + "\talignmentStart:" +
      // alignmentStart + "\toffset:" + offset + "\treadOffset:" + curAlignHeapNode.readOffset);
      // System.err.print("size:" + heap.size() + ":" + curAlignHeapNode.readOffset + ":" +
      // curAlignHeapNode.score + ":" + curAlignHeapNode.alleleCoverageSum + ":" +
      // curAlignHeapNode.startPosition + "\t");
      // curAlignHeapNode.node.print(System.err);
      // System.err.print("\rposition:" + curAlignHeapNode.node.position + "\treadOffset:" +
      // curAlignHeapNode.readOffset);

      // Remove all non-insertions with the same contig/pos/read-offset/type/base and lower score
      nextAlignHeapNode = heap.peek();
      while (Node.INSERTION != curAlignHeapNode.node.type
          && null != nextAlignHeapNode
          && 0 == comp.compare(curAlignHeapNode, nextAlignHeapNode)) {
        if (curAlignHeapNode.score < nextAlignHeapNode.score
            || (curAlignHeapNode.score == nextAlignHeapNode.score
                && curAlignHeapNode.alleleCoverageSum < nextAlignHeapNode.alleleCoverageSum)) {
          // Update current node
          curAlignHeapNode = heap.poll();
        } else {
          // Ignore next node
          heap.poll();
        }
        nextAlignHeapNode = heap.peek();
      }
      nextAlignHeapNode = null;

      // Check if the alignment is complete
      if (curAlignHeapNode.readOffset == read.length() - 1) {
        // All read bases examined, store if has the best alignment.

        // System.err.print(curAlignHeapNode.alleleCoverageSum + ":" + curAlignHeapNode.score +
        // ":");
        // System.err.print(curAlignHeapNode.startPosition + ":");
        // curAlignHeapNode.node.print(System.err);

        if (null == bestAlignHeapNode
            || bestAlignHeapNode.score < curAlignHeapNode.score
            || (bestAlignHeapNode.score == curAlignHeapNode.score
                && bestAlignHeapNode.alleleCoverageSum < curAlignHeapNode.alleleCoverageSum)) {
          bestAlignHeapNode = curAlignHeapNode;
        }
      } else if (null != bestAlignHeapNode && curAlignHeapNode.score < bestAlignHeapNode.score) {
        // ignore, under the assumption that scores can only become more negative.
      } else {
        if (strand) { // reverse
          // Go to all the "prev" nodes
          iter = curAlignHeapNode.node.prev.listIterator();
        } else { // forward
          // Go to all "next" nodes
          iter = curAlignHeapNode.node.next.listIterator();
        }
        while (iter.hasNext()) {
          NodeRecord next = iter.next();
          int f =
              passFilters(
                  graph, next.node, next.coverage, alleleCoverageCutoffs, MAXIMUM_TOTAL_COVERAGE);
          if (0 == f) {
            heap.add(
                new AlignHeapNode(
                    curAlignHeapNode,
                    next.node,
                    next.coverage,
                    read.charAt(curAlignHeapNode.readOffset + 1),
                    qualities.charAt(curAlignHeapNode.readOffset + 1),
                    useSequenceQualities,
                    space));
          } else if (f < 0) {
            return;
          }
        }
        iter = null;
      }
      // Get next node
      curAlignHeapNode = heap.poll();
    }

    // Recover alignment
    Align.updateSAM(
        rec,
        sequence,
        programRecord,
        bestAlignHeapNode,
        space,
        read,
        qualities,
        softClipStartBases,
        softClipStartQualities,
        softClipEndBases,
        softClipEndQualities,
        strand,
        correctBases);
  }