Exemplo n.º 1
0
  /**
   * Computes the reference coordinate of the first adaptor base adjacent to the given read.
   *
   * <p>For a read on the forward strand this is the first base after the end of the fragment
   * (Picard calls the fragment the 'insert'); for a read on the reverse strand it is the first
   * base before the beginning of the fragment.
   *
   * <p>Two layouts are handled:
   *
   * <pre>{@code
   * 1) read on the reverse strand:   <----------------------| * |--------------------->
   *    boundary = mate alignment start - 1
   *
   * 2) read on the forward strand:   |----------------------> * <----------------------|
   *    boundary = read alignment start + abs(inferred insert size) + 1
   * }</pre>
   *
   * <p>As a conservative measure, a boundary lying more than 8 bases before the read's alignment
   * start, or more than 8 bases after its alignment end, is discarded.
   *
   * @param read the read being tested for the adaptor boundary
   * @return the reference coordinate of the first base inside the adaptor, closest to the read;
   *     {@code null} if the inferred insert size is zero (treated here as "mate on another contig
   *     or unmapped pair"), if the read is unmapped, or if the computed boundary falls outside the
   *     tolerated window around the read
   */
  public static Integer getAdaptorBoundary(final SAMRecord read) {
    final int maxAdaptorLength = 8;

    // The inferred insert size is negative when the mate maps before the read, so only its
    // magnitude matters here.
    final int fragmentLength = Math.abs(read.getInferredInsertSize());

    // No adaptor can be located for mates on another chromosome or for unmapped pairs.
    final boolean noInsert = fragmentLength == 0;
    if (noInsert || read.getReadUnmappedFlag()) {
      return null;
    }

    final int boundary;
    if (read.getReadNegativeStrandFlag()) {
      // Layout 1: the adaptor starts right before the mate.
      boundary = read.getMateAlignmentStart() - 1;
    } else {
      // Layout 2: the adaptor starts right after the fragment.
      boundary = read.getAlignmentStart() + fragmentLength + 1;
    }

    // Be conservative: reject boundaries farther from the read than the assumed maximum adaptor
    // size.
    if (boundary < read.getAlignmentStart() - maxAdaptorLength) {
      return null;
    }
    if (boundary > read.getAlignmentEnd() + maxAdaptorLength) {
      return null;
    }

    return boundary;
  }
      /**
       * Folds one record into the running read/alignment counters.
       *
       * <p>Every record bumps the total-read count and the read-length histogram. Records that
       * pass the vendor quality check are additionally counted as PF reads (and as PF noise reads
       * when {@code isNoiseRead} says so). Unmapped PF reads are tested for adapter sequence;
       * mapped PF reads are only processed further when {@code doRefMetrics} is set, in which case
       * aligned, positive-strand, aligned-in-pairs and chimera counters are updated.
       *
       * @param record the record to account for
       * @param ref the reference sequence (not used by this method)
       */
      private void collectReadData(final SAMRecord record, final ReferenceSequence ref) {
        metrics.TOTAL_READS++;
        readLengthHistogram.increment(record.getReadBases().length);

        // Nothing below applies to reads that failed the vendor quality check.
        if (record.getReadFailsVendorQualityCheckFlag()) {
          return;
        }

        metrics.PF_READS++;
        if (isNoiseRead(record)) {
          metrics.PF_NOISE_READS++;
        }

        if (record.getReadUnmappedFlag()) {
          // Unmapped read: the only remaining question is whether it is adapter sequence.
          final byte[] bases = record.getReadBases();
          if (!(record instanceof BAMRecord)) {
            StringUtil.toUpperCase(bases);
          }
          if (isAdapterSequence(bases)) {
            this.adapterReads++;
          }
          return;
        }

        if (!doRefMetrics) {
          return;
        }

        metrics.PF_READS_ALIGNED++;
        if (!record.getReadNegativeStrandFlag()) {
          numPositiveStrand++;
        }

        final boolean bothEndsMapped = record.getReadPairedFlag() && !record.getMateUnmappedFlag();
        if (!bothEndsMapped) {
          return;
        }
        metrics.READS_ALIGNED_IN_PAIRS++;

        // Mapping-quality gate for the chimera statistics. '&&' binds tighter than '||', so a
        // record without an "MQ" attribute passes regardless of its own mapping quality.
        // NOTE(review): confirm that skipping the read's own MAPQ check is intended in that case.
        final Integer mateMq = record.getIntegerAttribute("MQ");
        final boolean passesMapqGate =
            mateMq == null
                || (mateMq >= MAPPING_QUALITY_THRESOLD
                    && record.getMappingQuality() >= MAPPING_QUALITY_THRESOLD);
        if (!passesMapqGate) {
          return;
        }
        this.chimerasDenominator++;

        // A pair is counted as chimeric when the insert is too large or the ends are on
        // different references.
        final boolean insertTooLarge = Math.abs(record.getInferredInsertSize()) > maxInsertSize;
        if (insertTooLarge
            || !record.getReferenceIndex().equals(record.getMateReferenceIndex())) {
          this.chimeras++;
        }
      }
Exemplo n.º 3
0
Arquivo: Align.java Projeto: nh13/SRMA
  /**
   * Rewrites a SAM record in place from the alignment path that ends at {@code bestAlignHeapNode}.
   *
   * <p>The path is walked through the nodes' {@code prev} links. While walking, a new CIGAR is
   * built, the edit count is accumulated and (for color space or when {@code correctBases} is set)
   * the read bases are re-derived from the graph nodes. Afterwards base qualities are recomputed,
   * optional soft-clipped bases/qualities are re-attached, and the record's CIGAR, alignment
   * start, bases, qualities and the attributes AS, XC, PG, NM (plus XE for color space and XO/XQ
   * when bases were corrected) are set.
   *
   * <p>If {@code bestAlignHeapNode} is {@code null} the record is left untouched.
   *
   * @param rec the record to update in place
   * @param sequence the reference sequence; its bases are consulted when counting mismatches
   * @param programRecord program record whose id is written to the PG attribute
   * @param bestAlignHeapNode last node of the chosen alignment path, or {@code null} for none
   * @param space whether the read is in color space or not
   * @param read the read string (without soft-clipped parts — presumably; confirm with caller)
   * @param qualities the quality string for {@code read}; decoded here by subtracting 33
   * @param softClipStartBases soft-clipped bases to prepend, or {@code null}
   * @param softClipStartQualities qualities for {@code softClipStartBases}
   * @param softClipEndBases soft-clipped bases to append, or {@code null}
   * @param softClipEndQualities qualities for {@code softClipEndBases}
   * @param strand strand flag; the inline comments in this method treat {@code true} as the
   *     reverse strand
   * @param correctBases whether bases should be replaced by the bases of the aligned nodes
   * @throws Exception on an unknown node type, or if the path ends with no CIGAR operator or with
   *     a deletion
   */
  private static void updateSAM(
      SAMRecord rec,
      ReferenceSequence sequence,
      SAMProgramRecord programRecord,
      AlignHeapNode bestAlignHeapNode,
      SRMAUtil.Space space,
      String read,
      String qualities,
      String softClipStartBases,
      String softClipStartQualities,
      String softClipEndBases,
      String softClipEndQualities,
      boolean strand,
      boolean correctBases)
      throws Exception {
    AlignHeapNode curAlignHeapNode = null;
    AlignHeapNode prevAlignHeapNode = null;

    int alignmentStart = 0;
    int readIndex = -1;
    byte readBases[] = null;
    byte baseQualities[] = null;
    byte colorErrors[] = null;
    int i;
    int numEdits = 0;
    List<String> optFieldTags = new LinkedList<String>();
    List<Object> optFieldValues = new LinkedList<Object>();
    // NOTE(review): 'attr' is never used in this method.
    Object attr;

    // Debugging stuff
    // NOTE(review): 'readName' is never read afterwards in this method.
    String readName = rec.getReadName();

    if (null == bestAlignHeapNode) {
      // Do not modify the alignment
      return;
    }

    // To generate a new CIGAR
    List<CigarElement> cigarElements = null;
    CigarOperator prevCigarOperator = null, curCigarOperator = null;
    int prevCigarOperatorLength = 0;

    // TODO
    // setInferredInsertSize (invalidates paired end reads)
    // setMappingQuality (?)
    // setFlag
    // update base qualities for color space reads

    // clear attributes, but save some
    Align.clearAttributes(rec, optFieldTags, optFieldValues);

    readBases = new byte[read.length()];
    baseQualities = new byte[qualities.length()];
    for (i = 0; i < qualities.length(); i++) {
      // Must subtract 33 for PHRED scaling
      baseQualities[i] = (byte) (qualities.charAt(i) - 33);
    }

    // The path is walked from the best node back through 'prev', so readBases is filled from
    // index 0 upwards when 'strand' is set and from the last index downwards otherwise.
    if (strand) {
      readIndex = 0;
    } else {
      readIndex = read.length() - 1;
    }
    cigarElements = new LinkedList<CigarElement>();
    if (strand) { // reverse strand is the current position
      alignmentStart = bestAlignHeapNode.node.position;
    } else {
      alignmentStart = bestAlignHeapNode.startPosition;
    }

    assert null != bestAlignHeapNode;
    curAlignHeapNode = bestAlignHeapNode;

    // Walk the alignment path. Each iteration either emits one deletion run (without consuming
    // a node) or consumes exactly one node.
    while (null != curAlignHeapNode) {
      // Get the current cigar operator
      // A jump of more than one reference position between consecutive nodes is a deletion,
      // unless the previous iteration already accounted for it.
      if (null != prevAlignHeapNode
          && CigarOperator.DELETION != prevCigarOperator
          && 1 < Math.abs(curAlignHeapNode.node.position - prevAlignHeapNode.node.position)) {
        curCigarOperator = CigarOperator.DELETION;
      } else {
        switch (curAlignHeapNode.node.type) {
          case Node.MISMATCH: // Fall through
          case Node.MATCH:
            curCigarOperator = CigarOperator.MATCH_OR_MISMATCH;
            break;
          case Node.INSERTION:
            // System.out.println("INS");
            curCigarOperator = CigarOperator.INSERTION;
            break;
          default:
            throw new Exception("Unknown node type");
        }
        if (space == SRMAUtil.Space.COLORSPACE || correctBases) {
          // Take the base from the graph node rather than from the original read
          readBases[readIndex] = (byte) curAlignHeapNode.node.base;
          if (strand) {
            readIndex++;
          } else {
            readIndex--;
          }
          // count the number of mismatches
          switch (curAlignHeapNode.node.type) {
            case Node.MISMATCH:
            case Node.INSERTION:
              numEdits++;
              break;
            default:
              break;
          }
        } else {
          // count the number of mismatches
          switch (curAlignHeapNode.node.type) {
            case Node.MATCH:
              // compare the original read base against the node's base
              if (read.charAt(curAlignHeapNode.readOffset) != curAlignHeapNode.node.base) {
                numEdits++;
              }
              break;
            case Node.MISMATCH: // no fall through here: this case has its own break
              // compare the original read base against the reference base (position is 1-based
              // relative to the bases array, hence the -1)
              if (read.charAt(curAlignHeapNode.readOffset)
                  != sequence.getBases()[curAlignHeapNode.node.position - 1]) {
                numEdits++;
              }
              break;
            case Node.INSERTION:
              numEdits++;
              break;
            default:
              break;
          }
        }
      }
      if (prevCigarOperator != curCigarOperator) {
        // different cigar operator

        // add the previous cigar operator
        if (null != prevCigarOperator) {
          if (strand) { // reverse
            // append
            cigarElements.add(new CigarElement(prevCigarOperatorLength, prevCigarOperator));
          } else {
            // prepend
            cigarElements.add(0, new CigarElement(prevCigarOperatorLength, prevCigarOperator));
          }
        }

        // update prevCigarOperator
        prevCigarOperator = curCigarOperator;
        if (curCigarOperator == CigarOperator.DELETION) {
          // length of deletion
          prevCigarOperatorLength =
              Math.abs(curAlignHeapNode.node.position - prevAlignHeapNode.node.position) - 1;
          numEdits += prevCigarOperatorLength; // deletions
        } else {
          prevCigarOperatorLength = 1;
        }
      } else {
        // same cigar operator
        prevCigarOperatorLength++;
      }

      // Update
      // A deletion iteration does not consume the current node: the same node is visited again
      // on the next iteration, where it is classified by its own type.
      if (CigarOperator.DELETION != curCigarOperator) {
        prevAlignHeapNode = curAlignHeapNode;
        curAlignHeapNode = curAlignHeapNode.prev;
      }
    }
    // Flush the last pending cigar run
    if (0 < prevCigarOperatorLength) {
      if (null == prevCigarOperator || CigarOperator.DELETION == prevCigarOperator) {
        throw new Exception("Ended with a null cigar operator or a deletion cigar operator");
      }
      if (strand) { // reverse
        // append
        cigarElements.add(new CigarElement(prevCigarOperatorLength, prevCigarOperator));
      } else {
        // prepend
        cigarElements.add(0, new CigarElement(prevCigarOperatorLength, prevCigarOperator));
      }
    }

    if (space == SRMAUtil.Space.COLORSPACE) { // color space, read bases already inferred
      // Get color error string
      colorErrors = new byte[read.length()];
      char prevBase = SRMAUtil.COLORSPACE_ADAPTOR;
      if (strand) { // reverse
        // readBases is indexed from the opposite end here and complemented before comparison
        for (i = 0; i < read.length(); i++) {
          char nextBase = SRMAUtil.colorSpaceNextBase(prevBase, read.charAt(i));
          if (nextBase == SRMAUtil.getCompliment((char) readBases[read.length() - i - 1])) {
            colorErrors[i] = (byte) Alignment.GAP;
          } else {
            colorErrors[i] = (byte) read.charAt(i);
          }
          if (0 < i) {
            // qualities are assumed to be always in the same direction as the color errors
            baseQualities[read.length() - i] =
                getColorQuality(
                    colorErrors[i - 1],
                    colorErrors[i],
                    (byte) (qualities.charAt(i - 1) - 33),
                    (byte) (qualities.charAt(i) - 33));
          }
          prevBase = SRMAUtil.getCompliment((char) readBases[read.length() - i - 1]);
        }
        // last color
        baseQualities[0] = (byte) (qualities.charAt(read.length() - 1) - 33);
      } else {
        for (i = 0; i < read.length(); i++) {
          char nextBase = SRMAUtil.colorSpaceNextBase(prevBase, read.charAt(i));
          if (nextBase == readBases[i]) {
            colorErrors[i] = (byte) Alignment.GAP;
          } else {
            colorErrors[i] = (byte) read.charAt(i);
          }
          if (0 < i) {
            // quality of base i-1 is derived from the two adjacent colors
            baseQualities[i - 1] =
                getColorQuality(
                    colorErrors[i - 1],
                    colorErrors[i],
                    (byte) (qualities.charAt(i - 1) - 33),
                    (byte) (qualities.charAt(i) - 33));
          }
          prevBase = (char) readBases[i];
        }
        // last color
        baseQualities[read.length() - 1] = (byte) (qualities.charAt(read.length() - 1) - 33);
      }
    } else if (correctBases) { // bases were corrected
      // Unchanged bases keep their quality; changed bases get a penalized quality, floored at 1
      if (strand) {
        for (i = 0; i < read.length(); i++) {
          if (readBases[i] == (byte) read.charAt(read.length() - i - 1)) {
            baseQualities[i] = (byte) (qualities.charAt(read.length() - i - 1) - 33);
          } else {
            // TODO: how much to down-weight ?
            baseQualities[i] =
                (byte)
                    (SRMAUtil.QUAL2CHAR(
                            SRMAUtil.CHAR2QUAL(qualities.charAt(read.length() - i - 1))
                                - CORRECT_BASE_QUALITY_PENALTY)
                        - 33);
            if (baseQualities[i] <= 0) {
              baseQualities[i] = 1;
            }
          }
        }
      } else {
        for (i = 0; i < read.length(); i++) {
          if (readBases[i] == (byte) read.charAt(i)) {
            baseQualities[i] = (byte) (qualities.charAt(i) - 33);
          } else {
            // TODO: how much to down-weight ?
            baseQualities[i] =
                (byte)
                    (SRMAUtil.QUAL2CHAR(
                            SRMAUtil.CHAR2QUAL(qualities.charAt(i)) - CORRECT_BASE_QUALITY_PENALTY)
                        - 33);
            if (baseQualities[i] <= 0) {
              baseQualities[i] = 1;
            }
          }
        }
      }
      // keep the original read and qualities on the record
      rec.setAttribute("XO", read);
      rec.setAttribute("XQ", qualities);
    } else { // bases not corrected
      // Copy the original bases/qualities, reversed when 'strand' is set
      readBases = new byte[read.length()];
      baseQualities = new byte[qualities.length()]; // qualities.length() == read.length()
      if (strand) { // reverse
        for (i = 0; i < read.length(); i++) {
          readBases[i] = (byte) read.charAt(read.length() - i - 1);
          baseQualities[i] = (byte) (qualities.charAt(read.length() - i - 1) - 33);
        }
      } else {
        for (i = 0; i < read.length(); i++) {
          readBases[i] = (byte) read.charAt(i);
          baseQualities[i] = (byte) (qualities.charAt(i) - 33);
        }
      }
    }

    // Add in soft-clipping
    // NOTE(review): soft-clip quality characters are stored as-is below, while every other
    // quality in this method has 33 subtracted — confirm what encoding the caller passes in.
    if (null != softClipStartBases) { // prepend
      cigarElements.add(0, new CigarElement(softClipStartBases.length(), CigarOperator.S));

      byte tmpBases[] = new byte[readBases.length + softClipStartBases.length()];
      System.arraycopy(readBases, 0, tmpBases, softClipStartBases.length(), readBases.length);
      readBases = tmpBases;
      for (i = 0; i < softClipStartBases.length(); i++) {
        readBases[i] = (byte) softClipStartBases.charAt(i);
      }

      byte tmpQualities[] = new byte[baseQualities.length + softClipStartQualities.length()];
      System.arraycopy(
          baseQualities, 0, tmpQualities, softClipStartQualities.length(), baseQualities.length);
      baseQualities = tmpQualities;
      for (i = 0; i < softClipStartQualities.length(); i++) {
        baseQualities[i] = (byte) softClipStartQualities.charAt(i);
      }
    }
    if (null != softClipEndBases) { // append
      cigarElements.add(new CigarElement(softClipEndBases.length(), CigarOperator.S));

      byte tmpBases[] = new byte[readBases.length + softClipEndBases.length()];
      System.arraycopy(readBases, 0, tmpBases, 0, readBases.length);
      for (i = 0; i < softClipEndBases.length(); i++) {
        tmpBases[i + readBases.length] = (byte) softClipEndBases.charAt(i);
      }
      readBases = tmpBases;

      byte tmpQualities[] = new byte[baseQualities.length + softClipEndQualities.length()];
      System.arraycopy(baseQualities, 0, tmpQualities, 0, baseQualities.length);
      for (i = 0; i < softClipEndQualities.length(); i++) {
        tmpQualities[i + baseQualities.length] = (byte) softClipEndQualities.charAt(i);
      }
      baseQualities = tmpQualities;
    }

    // Update SAM record
    rec.setCigar(new Cigar(cigarElements));
    rec.setAlignmentStart(alignmentStart);
    rec.setReadBases(readBases);
    rec.setBaseQualities(baseQualities);
    // Reset saved attributes
    Align.resetAttributes(rec, optFieldTags, optFieldValues);
    // Set new attributes
    if (space == SRMAUtil.Space.COLORSPACE) {
      // set the XE attribute for colorError string
      // NOTE(review): new String(byte[]) decodes with the platform default charset.
      rec.setAttribute("XE", new String(colorErrors));
    }
    rec.setAttribute("AS", bestAlignHeapNode.score);
    rec.setAttribute("XC", bestAlignHeapNode.alleleCoverageSum);
    rec.setAttribute("PG", programRecord.getId());
    rec.setAttribute("NM", numEdits);
  }