Example #1
0
  /**
   * Finds the adaptor boundary around the read and returns the first base inside the adaptor that
   * is closest to the read boundary. If the read is in the positive strand, this is the first base
   * after the end of the fragment (Picard calls it 'insert'), if the read is in the negative
   * strand, this is the first base before the beginning of the fragment.
   *
   * <p>There are two cases we need to treat here:
   *
   * <p>1) Our read is in the reverse strand :
   *
   * <p><----------------------| * |--------------------->
   *
   * <p>in these cases, the adaptor boundary is at the mate start (minus one)
   *
   * <p>2) Our read is in the forward strand :
   *
   * <p>|----------------------> * <----------------------|
   *
   * <p>in these cases the adaptor boundary is at the start of the read plus the inferred insert
   * size (plus one)
   *
   * @param read the read being tested for the adaptor boundary
   * @return the reference coordinate for the adaptor boundary (effectively the first base IN the
   *     adaptor, closest to the read. NULL if the read is unmapped or the mate is mapped to another
   *     contig.
   */
  public static Integer getAdaptorBoundary(final SAMRecord read) {
    final int MAXIMUM_ADAPTOR_LENGTH = 8;
    final int insertSize =
        Math.abs(
            read
                .getInferredInsertSize()); // the inferred insert size can be negative if the mate
                                           // is mapped before the read (so we take the absolute
                                           // value)

    if (insertSize == 0
        || read
            .getReadUnmappedFlag()) // no adaptors in reads with mates in another chromosome or
                                    // unmapped pairs
    return null;

    Integer
        adaptorBoundary; // the reference coordinate for the adaptor boundary (effectively the first
                         // base IN the adaptor, closest to the read)
    if (read.getReadNegativeStrandFlag())
      adaptorBoundary = read.getMateAlignmentStart() - 1; // case 1 (see header)
    else adaptorBoundary = read.getAlignmentStart() + insertSize + 1; // case 2 (see header)

    if ((adaptorBoundary < read.getAlignmentStart() - MAXIMUM_ADAPTOR_LENGTH)
        || (adaptorBoundary > read.getAlignmentEnd() + MAXIMUM_ADAPTOR_LENGTH))
      adaptorBoundary =
          null; // we are being conservative by not allowing the adaptor boundary to go beyond what
                // we belive is the maximum size of an adaptor

    return adaptorBoundary;
  }
      private void collectReadData(final SAMRecord record, final ReferenceSequence ref) {
        metrics.TOTAL_READS++;
        readLengthHistogram.increment(record.getReadBases().length);

        if (!record.getReadFailsVendorQualityCheckFlag()) {
          metrics.PF_READS++;
          if (isNoiseRead(record)) metrics.PF_NOISE_READS++;

          if (record.getReadUnmappedFlag()) {
            // If the read is unmapped see if it's adapter sequence
            final byte[] readBases = record.getReadBases();
            if (!(record instanceof BAMRecord)) StringUtil.toUpperCase(readBases);

            if (isAdapterSequence(readBases)) {
              this.adapterReads++;
            }
          } else if (doRefMetrics) {
            metrics.PF_READS_ALIGNED++;
            if (!record.getReadNegativeStrandFlag()) numPositiveStrand++;

            if (record.getReadPairedFlag() && !record.getMateUnmappedFlag()) {
              metrics.READS_ALIGNED_IN_PAIRS++;

              // Check that both ends have mapq > minimum
              final Integer mateMq = record.getIntegerAttribute("MQ");
              if (mateMq == null
                  || mateMq >= MAPPING_QUALITY_THRESOLD
                      && record.getMappingQuality() >= MAPPING_QUALITY_THRESOLD) {
                ++this.chimerasDenominator;

                // With both reads mapped we can see if this pair is chimeric
                if (Math.abs(record.getInferredInsertSize()) > maxInsertSize
                    || !record.getReferenceIndex().equals(record.getMateReferenceIndex())) {
                  ++this.chimeras;
                }
              }
            }
          }
        }
      }
Example #3
0
 public static Strand valueOfSAMRecord(SAMRecord sr) {
   return sr.getReadNegativeStrandFlag() ? STRAND_NEGATIVE : STRAND_POSITIVE;
 }
Example #4
0
  @Override
  public int doWork(String[] args) {
    File refFile = null;
    com.github.lindenb.jvarkit.util.cli.GetOpt getopt =
        new com.github.lindenb.jvarkit.util.cli.GetOpt();
    int c;
    while ((c = getopt.getopt(args, "hvL:r:")) != -1) {
      switch (c) {
        case 'h':
          printUsage();
          return 0;
        case 'v':
          System.out.println(getVersion());
          return 0;
        case 'L':
          getLogger().setLevel(java.util.logging.Level.parse(getopt.getOptArg()));
          break;
        case 'r':
          refFile = new File(getopt.getOptArg());
          break;
        case ':':
          System.err.println("Missing argument for option -" + getopt.getOptOpt());
          return -1;
        default:
          System.err.println("Unknown option -" + getopt.getOptOpt());
          return -1;
      }
    }

    if (refFile == null) {
      error("Undefined REF file");
      return -1;
    }
    File bamFile = null;
    if (getopt.getOptInd() + 1 != args.length) {
      info("reading from stdin.");
    } else {
      bamFile = new File(args[getopt.getOptInd()]);
    }

    IndexedFastaSequenceFile indexedFastaSequenceFile = null;
    SAMFileReader samFileReader = null;

    try {
      GenomicSequence genomicSequence = null;
      indexedFastaSequenceFile = new IndexedFastaSequenceFile(refFile);
      SAMFileReader.setDefaultValidationStringency(ValidationStringency.SILENT);
      samFileReader = null;
      if (bamFile == null) {
        samFileReader = new SAMFileReader(System.in);
      } else {
        samFileReader = new SAMFileReader(bamFile);
      }
      XMLOutputFactory xmlfactory = XMLOutputFactory.newInstance();
      XMLStreamWriter w = xmlfactory.createXMLStreamWriter(System.out, "UTF-8");
      w.writeStartDocument("UTF-8", "1.0");
      w.writeStartElement("sam");
      w.writeComment(getProgramCommandLine());
      w.writeAttribute("ref", (bamFile == null ? "stdin" : bamFile.getPath()));
      w.writeAttribute("bam", args[1]);

      SAMRecordIterator iter = samFileReader.iterator();
      while (iter.hasNext()) {
        SAMRecord rec = iter.next();

        final byte readbases[] = rec.getReadBases();
        w.writeStartElement("read");

        w.writeStartElement("name");
        w.writeCharacters(rec.getReadName());
        w.writeEndElement();
        w.writeStartElement("sequence");
        w.writeCharacters(new String(readbases));
        w.writeEndElement();
        w.writeStartElement("flags");
        w.writeAttribute("paired", String.valueOf(rec.getReadPairedFlag()));
        w.writeAttribute(
            "failsVendorQual", String.valueOf(rec.getReadFailsVendorQualityCheckFlag()));
        w.writeAttribute("mapped", String.valueOf(!rec.getReadUnmappedFlag()));
        w.writeAttribute("strand", (rec.getReadNegativeStrandFlag() ? "-" : "+"));

        if (rec.getReadPairedFlag()) {
          w.writeAttribute("mate-mapped", String.valueOf(!rec.getMateUnmappedFlag()));
          w.writeAttribute("mate-strand", (rec.getMateNegativeStrandFlag() ? "-" : "+"));
          w.writeAttribute("proper-pair", String.valueOf(rec.getProperPairFlag()));
        }

        w.writeCharacters(String.valueOf(rec.getFlags()));
        w.writeEndElement();
        if (!rec.getReadUnmappedFlag()) {
          w.writeStartElement("qual");
          w.writeCharacters(String.valueOf(rec.getMappingQuality()));
          w.writeEndElement();

          w.writeStartElement("chrom");
          w.writeAttribute("index", String.valueOf(rec.getReferenceIndex()));
          w.writeCharacters(rec.getReferenceName());
          w.writeEndElement();
          w.writeStartElement("pos");
          w.writeCharacters(String.valueOf(rec.getAlignmentStart()));
          w.writeEndElement();
          w.writeStartElement("cigar");
          w.writeCharacters(rec.getCigarString());
          w.writeEndElement();
        }

        if (!rec.getMateUnmappedFlag()) {
          w.writeStartElement("mate-chrom");
          w.writeAttribute("index", String.valueOf(rec.getMateReferenceIndex()));
          w.writeCharacters(rec.getMateReferenceName());
          w.writeEndElement();
          w.writeStartElement("mate-pos");
          w.writeCharacters(String.valueOf(rec.getMateAlignmentStart()));
          w.writeEndElement();
        }

        if (!rec.getReadUnmappedFlag()) {
          if (genomicSequence == null
              || genomicSequence.getChrom().equals(rec.getReferenceName())) {
            genomicSequence = new GenomicSequence(indexedFastaSequenceFile, rec.getReferenceName());
          }

          w.writeStartElement("align");

          int readIndex = 0;
          int refIndex = rec.getAlignmentStart();

          for (final CigarElement e : rec.getCigar().getCigarElements()) {
            switch (e.getOperator()) {
              case H:
                break; // ignore hard clips
              case P:
                break; // ignore pads
              case I: // cont.
              case S:
                {
                  final int length = e.getLength();
                  for (int i = 0; i < length; ++i) {
                    w.writeEmptyElement(e.getOperator().name());
                    w.writeAttribute("read-index", String.valueOf(readIndex + 1));
                    if (readIndex >= 0 && readIndex < readbases.length) {
                      w.writeAttribute("read-base", String.valueOf((char) (readbases[readIndex])));
                    }
                    readIndex++;
                  }
                  break;
                }
              case N: // cont. -- reference skip
              case D:
                {
                  final int length = e.getLength();
                  for (int i = 0; i < length; ++i) {
                    w.writeEmptyElement(e.getOperator().name());
                    w.writeAttribute("ref-index", String.valueOf(refIndex));
                    if (refIndex >= 1 && refIndex <= genomicSequence.length()) {
                      w.writeAttribute(
                          "ref-base", String.valueOf(genomicSequence.charAt(refIndex - 1)));
                    }
                    refIndex++;
                  }
                  break;
                }
              case M:
              case EQ:
              case X:
                {
                  final int length = e.getLength();
                  for (int i = 0; i < length; ++i) {
                    w.writeEmptyElement(e.getOperator().name());
                    char baseRead = '\0';
                    if (readIndex >= 0 && readIndex < readbases.length) {
                      baseRead = (char) (rec.getReadBases()[readIndex]);
                      w.writeAttribute("read-index", String.valueOf(readIndex + 1));
                      w.writeAttribute("read-base", String.valueOf(baseRead));
                    }
                    w.writeAttribute("ref-index", String.valueOf(refIndex));
                    if (refIndex >= 1 && refIndex <= genomicSequence.length()) {
                      char baseRef = genomicSequence.charAt(refIndex - 1);
                      w.writeAttribute("ref-base", String.valueOf(baseRef));
                      if (Character.toUpperCase(baseRef) != Character.toUpperCase(baseRead)) {
                        w.writeAttribute("mismatch", "true");
                      }
                    }

                    refIndex++;
                    readIndex++;
                  }
                  break;
                }

              default:
                throw new IllegalStateException(
                    "Case statement didn't deal with cigar op: " + e.getOperator());
            }
          }
        }

        w.writeEndElement();

        w.writeEndElement();

        iter.close();
        w.writeEndElement();
      }
      w.writeEndElement();
      w.writeEndDocument();
      w.flush();
      w.close();
    } catch (Exception err) {
      error(err);
      return -1;
    } finally {
      CloserUtil.close(samFileReader);
      CloserUtil.close(indexedFastaSequenceFile);
    }
    return 0;
  }
      private void collectQualityData(final SAMRecord record, final ReferenceSequence reference) {
        // If the read isnt an aligned PF read then look at the read for no-calls
        if (record.getReadUnmappedFlag()
            || record.getReadFailsVendorQualityCheckFlag()
            || !doRefMetrics) {
          final byte[] readBases = record.getReadBases();
          for (int i = 0; i < readBases.length; i++) {
            if (SequenceUtil.isNoCall(readBases[i])) {
              badCycleHistogram.increment(
                  CoordMath.getCycle(record.getReadNegativeStrandFlag(), readBases.length, i));
            }
          }
        } else if (!record.getReadFailsVendorQualityCheckFlag()) {
          final boolean highQualityMapping = isHighQualityMapping(record);
          if (highQualityMapping) metrics.PF_HQ_ALIGNED_READS++;

          final byte[] readBases = record.getReadBases();
          final byte[] refBases = reference.getBases();
          final byte[] qualities = record.getBaseQualities();
          final int refLength = refBases.length;
          long mismatchCount = 0;
          long hqMismatchCount = 0;

          for (final AlignmentBlock alignmentBlock : record.getAlignmentBlocks()) {
            final int readIndex = alignmentBlock.getReadStart() - 1;
            final int refIndex = alignmentBlock.getReferenceStart() - 1;
            final int length = alignmentBlock.getLength();

            for (int i = 0; i < length && refIndex + i < refLength; ++i) {
              final int readBaseIndex = readIndex + i;
              boolean mismatch =
                  !SequenceUtil.basesEqual(readBases[readBaseIndex], refBases[refIndex + i]);
              boolean bisulfiteBase = false;
              if (mismatch && isBisulfiteSequenced) {
                if ((record.getReadNegativeStrandFlag()
                        && (refBases[refIndex + i] == 'G' || refBases[refIndex + i] == 'g')
                        && (readBases[readBaseIndex] == 'A' || readBases[readBaseIndex] == 'a'))
                    || ((!record.getReadNegativeStrandFlag())
                            && (refBases[refIndex + i] == 'C' || refBases[refIndex + i] == 'c')
                            && (readBases[readBaseIndex] == 'T')
                        || readBases[readBaseIndex] == 't')) {

                  bisulfiteBase = true;
                  mismatch = false;
                }
              }

              if (mismatch) mismatchCount++;

              metrics.PF_ALIGNED_BASES++;
              if (!bisulfiteBase) nonBisulfiteAlignedBases++;

              if (highQualityMapping) {
                metrics.PF_HQ_ALIGNED_BASES++;
                if (!bisulfiteBase) hqNonBisulfiteAlignedBases++;
                if (qualities[readBaseIndex] >= BASE_QUALITY_THRESHOLD)
                  metrics.PF_HQ_ALIGNED_Q20_BASES++;
                if (mismatch) hqMismatchCount++;
              }

              if (mismatch || SequenceUtil.isNoCall(readBases[readBaseIndex])) {
                badCycleHistogram.increment(
                    CoordMath.getCycle(record.getReadNegativeStrandFlag(), readBases.length, i));
              }
            }
          }

          mismatchHistogram.increment(mismatchCount);
          hqMismatchHistogram.increment(hqMismatchCount);

          // Add any insertions and/or deletions to the global count
          for (final CigarElement elem : record.getCigar().getCigarElements()) {
            final CigarOperator op = elem.getOperator();
            if (op == CigarOperator.INSERTION || op == CigarOperator.DELETION) ++this.indels;
          }
        }
      }
Example #6
0
  public void find_coverage(SAMResource sres) {
    int start_base = sres.region.range.start;
    int end_base = sres.region.range.end;

    int coverage_len = (end_base - start_base) + 1;
    int i, end, ref_i, read_i, len;

    int[] coverage = new int[coverage_len];
    Arrays.fill(coverage, 0);

    WorkingFile wf = null;
    if (outfile != null) {
      try {
        wf = new WorkingFile(outfile);
        ps = wf.getPrintStream();
      } catch (Exception e) {
        System.err.println("I/O error: " + e); // debug
        e.printStackTrace();
        System.exit(1);
      }
    }

    try {
      //
      //  gather coverage info:
      //
      CloseableIterator<SAMRecord> iterator = sres.get_iterator();
      int read_count = 0;
      int ref_min = -1;
      int ref_max = -1;

      while (iterator.hasNext()) {
        SAMRecord sr = iterator.next();
        read_count++;

        //	System.err.println(sr.getReadName() + ": " + sr.getAlignmentStart() + "-" +
        // sr.getAlignmentEnd());  // debug

        if (sr.getReadUnmappedFlag()) continue;
        if (sr.getDuplicateReadFlag()) {
          if (verbose_mode)
            System.err.println(
                sr.getReadName()
                    + "."
                    + (sr.getReadNegativeStrandFlag() ? "R" : "F")
                    + " ignoring, duplicate");
          continue;
        }

        byte[] read = sr.getReadBases();
        byte[] quals = sr.getBaseQualities();

        for (AlignmentBlock ab : sr.getAlignmentBlocks()) {
          len = ab.getLength();
          read_i = ab.getReadStart() - 1;
          ref_i = ab.getReferenceStart() - start_base;

          if (ref_min == -1 || ref_i < ref_min) ref_min = ref_i;

          for (i = read_i, end = read_i + len; i < end; i++, ref_i++) {
            if (ref_i >= 0 && ref_i < coverage_len) {
              if (quals[i] >= MIN_QUALITY) {
                if (verbose_mode)
                  System.err.println(
                      sr.getReadName()
                          + "."
                          + (sr.getReadNegativeStrandFlag() ? "R" : "F")
                          + " hit at "
                          + (ref_i + start_base)
                          + " as="
                          + sr.getAlignmentStart()
                          + " ae="
                          + sr.getAlignmentEnd());
                coverage[ref_i]++;
              } else if (verbose_mode) {
                System.err.println(
                    sr.getReadName()
                        + "."
                        + (sr.getReadNegativeStrandFlag() ? "R" : "F")
                        + " qual_reject at "
                        + (ref_i + start_base)
                        + " as="
                        + sr.getAlignmentStart()
                        + " ae="
                        + sr.getAlignmentEnd());
              }
            }
          }
          if (ref_max == -1 || ref_i > ref_max) ref_max = ref_i;
        }
      }
      sres.close();
      System.err.println(
          "records:"
              + read_count
              + " ref_min:"
              + (ref_min + start_base)
              + " ref_max:"
              + (ref_max + start_base)); // debug

      //
      //  report coverage info:
      //
      for (i = 0; i < coverage.length; i++) {
        if (name != null) ps.print(name + ",");
        ps.println((i + start_base) + "," + coverage[i]); // debug
      }
      if (wf != null) wf.finish();

    } catch (Exception e) {
      System.err.println("ERROR: " + e); // debug
      e.printStackTrace();
    }
  }
Example #7
0
File: Align.java Project: nh13/SRMA
  public static void align(
      Graph graph,
      SAMRecord rec,
      Node recNode,
      ReferenceSequence sequence,
      SAMProgramRecord programRecord,
      int offset,
      AlleleCoverageCutoffs alleleCoverageCutoffs,
      boolean correctBases,
      boolean useSequenceQualities,
      int MAXIMUM_TOTAL_COVERAGE,
      int MAX_HEAP_SIZE)
      throws Exception {

    int i;
    AlignHeapNode curAlignHeapNode = null;
    AlignHeapNode nextAlignHeapNode = null;
    AlignHeapNode bestAlignHeapNode = null;
    AlignHeap heap = null;
    String read = null; // could be cs
    String readBases = null; // always nt
    String qualities = null; // could be cq
    SRMAUtil.Space space = SRMAUtil.Space.NTSPACE;
    ListIterator<NodeRecord> iter = null;
    AlignHeapNodeComparator comp = null;
    int alignmentStart = -1;
    int numStartNodesAdded = 0;
    boolean strand = rec.getReadNegativeStrandFlag(); // false -> forward, true -> reverse
    String softClipStartBases = null;
    String softClipStartQualities = null;
    String softClipEndBases = null;
    String softClipEndQualities = null;

    // Debugging stuff
    String readName = rec.getReadName();

    assert SRMAUtil.Space.COLORSPACE != space;

    // Get space
    read = (String) rec.getAttribute("CS");
    if (null == read) {
      // Use base space
      space = SRMAUtil.Space.NTSPACE;
    } else {
      // assumes CS and CQ are always in sequencing order
      space = SRMAUtil.Space.COLORSPACE;
    }

    // Get read and qualities
    if (space == SRMAUtil.Space.NTSPACE) {
      byte tmpRead[] = rec.getReadString().getBytes();
      byte tmpQualities[] = rec.getBaseQualityString().getBytes();
      // Reverse once
      if (strand) { // reverse
        SAMRecordUtil.reverseArray(tmpRead);
        SAMRecordUtil.reverseArray(tmpQualities);
      }
      read = new String(tmpRead);
      readBases = new String(tmpRead);
      qualities = new String(tmpQualities);
      // Reverse again
      if (strand) { // reverse
        SAMRecordUtil.reverseArray(tmpRead);
        SAMRecordUtil.reverseArray(tmpQualities);
      }
    } else {
      byte tmpRead[] = rec.getReadString().getBytes();
      // Reverse once
      if (strand) { // reverse
        SAMRecordUtil.reverseArray(tmpRead);
      }
      readBases = new String(tmpRead);
      // Reverse again
      if (strand) { // reverse
        SAMRecordUtil.reverseArray(tmpRead);
      }
      read = SRMAUtil.normalizeColorSpaceRead(read);
      qualities = (String) rec.getAttribute("CQ");
      // Some aligners include a quality value for the adapter.  A quality value
      // IMHO should not be given for an unobserved (assumed) peice of data.  Trim
      // the first quality in this case
      if (qualities.length() == 1 + read.length()) { // trim the first quality
        qualities = qualities.substring(1);
      }
    }
    // Reverse back
    if (readBases.length() <= 0) {
      throw new Exception("Error.  The current alignment has no bases.");
    }
    if (read.length() <= 0) {
      throw new Exception("Error.  The current alignment has no bases.");
    }
    if (qualities.length() <= 0) {
      throw new Exception("Error.  The current alignment has no qualities.");
    }
    if (readBases.length() != read.length()) {
      if (space == SRMAUtil.Space.COLORSPACE) {
        throw new Exception(
            "Error.  The current alignment's read bases length does not match the length of the colors in the CS tag ["
                + rec.getReadName()
                + "].");
      } else {
        throw new Exception("Error.  Internal error: readBases.length() != read.length()");
      }
    }

    // Deal with soft-clipping
    // - save the soft clipped sequence for latter
    {
      List<CigarElement> cigarElements = null;

      cigarElements = rec.getCigar().getCigarElements();
      CigarElement e1 = cigarElements.get(0); // first
      CigarElement e2 = cigarElements.get(cigarElements.size() - 1); // last

      // Soft-clipped
      if (CigarOperator.S == e1.getOperator()) {
        if (space == SRMAUtil.Space.COLORSPACE) {
          throw new Exception(
              "Error.  Soft clipping with color-space data not currently supported.");
        }
        int l = e1.getLength();
        if (strand) { // reverse
          softClipStartBases = readBases.substring(readBases.length() - l);
          softClipStartQualities = qualities.substring(qualities.length() - l);
          readBases = readBases.substring(0, readBases.length() - l);
          read = read.substring(0, read.length() - l);
          qualities = qualities.substring(0, qualities.length() - l);
        } else {
          softClipStartBases = readBases.substring(0, l - 1);
          softClipStartQualities = qualities.substring(0, l - 1);
          readBases = readBases.substring(l);
          read = read.substring(l);
          qualities = qualities.substring(l);
        }
      }
      if (CigarOperator.S == e2.getOperator()) {
        if (space == SRMAUtil.Space.COLORSPACE) {
          throw new Exception(
              "Error.  Soft clipping with color-space data not currently supported.");
        }
        int l = e2.getLength();
        if (strand) { // reverse
          softClipEndBases = readBases.substring(0, l - 1);
          softClipEndQualities = qualities.substring(0, l - 1);
          readBases = readBases.substring(l);
          read = read.substring(l);
          qualities = qualities.substring(l);
        } else {
          softClipEndBases = readBases.substring(readBases.length() - l);
          softClipEndQualities = qualities.substring(qualities.length() - l);
          readBases = readBases.substring(0, readBases.length() - l);
          read = read.substring(0, read.length() - l);
          qualities = qualities.substring(0, qualities.length() - l);
        }
      }
    }

    // Remove mate pair information
    Align.removeMateInfo(rec);

    comp =
        new AlignHeapNodeComparator(
            (strand) ? AlignHeap.HeapType.MAXHEAP : AlignHeap.HeapType.MINHEAP);

    // Bound by original alignment if possible
    bestAlignHeapNode =
        Align.boundWithOriginalAlignment(
            rec,
            graph,
            recNode,
            comp,
            strand,
            read,
            qualities,
            readBases,
            space,
            sequence,
            alleleCoverageCutoffs,
            useSequenceQualities,
            MAXIMUM_TOTAL_COVERAGE,
            MAX_HEAP_SIZE);

    /*
    System.err.println("readName="+rec.getReadName());
    if(null != bestAlignHeapNode) {
    System.err.println("\nFOUND BEST:" + rec.toString());
    }
    else {
    System.err.println("\nNOT FOUND (BEST): " + rec.toString());
    }
    Align.updateSAM(rec, programRecord, bestAlignHeapNode, space, read, qualities, softClipStartBases, softClipStartQualities, softClipEndBases, softClipEndQualities, strand, correctBases);
    return;
    */

    heap = new AlignHeap((strand) ? AlignHeap.HeapType.MAXHEAP : AlignHeap.HeapType.MINHEAP);

    // Add start nodes
    if (strand) { // reverse
      alignmentStart = rec.getAlignmentEnd();
      for (i = alignmentStart + offset; alignmentStart - offset <= i; i--) {
        int position = graph.getPriorityQueueIndexAtPositionOrBefore(i);
        PriorityQueue<Node> startNodeQueue = graph.getPriorityQueue(position);
        if (0 != position && null != startNodeQueue) {
          Iterator<Node> startNodeQueueIter = startNodeQueue.iterator();
          while (startNodeQueueIter.hasNext()) {
            Node startNode = startNodeQueueIter.next();
            int f = passFilters(graph, startNode, alleleCoverageCutoffs, MAXIMUM_TOTAL_COVERAGE);
            if (0 == f) {
              heap.add(
                  new AlignHeapNode(
                      null,
                      startNode,
                      startNode.coverage,
                      read.charAt(0),
                      qualities.charAt(0),
                      useSequenceQualities,
                      space));
            } else if (f < 0) {
              return;
            }
            if (startNode.position < i) {
              i = startNode.position;
            }
            numStartNodesAdded++;
          }
        }
      }
    } else {
      alignmentStart = rec.getAlignmentStart();
      for (i = alignmentStart - offset; i <= alignmentStart + offset; i++) {
        int position = graph.getPriorityQueueIndexAtPositionOrGreater(i);
        PriorityQueue<Node> startNodeQueue = graph.getPriorityQueue(position);
        if (0 != position && null != startNodeQueue) {
          Iterator<Node> startNodeQueueIter = startNodeQueue.iterator();
          while (startNodeQueueIter.hasNext()) {
            Node startNode = startNodeQueueIter.next();
            int f = passFilters(graph, startNode, alleleCoverageCutoffs, MAXIMUM_TOTAL_COVERAGE);
            if (0 == f) {
              heap.add(
                  new AlignHeapNode(
                      null,
                      startNode,
                      startNode.coverage,
                      read.charAt(0),
                      qualities.charAt(0),
                      useSequenceQualities,
                      space));
            } else if (f < 0) {
              return;
            }
            if (i < startNode.position) {
              i = startNode.position;
            }
            numStartNodesAdded++;
          }
        }
      }
    }
    if (numStartNodesAdded == 0) {
      throw new Exception("Did not add any start nodes!");
    }

    // Get first node off the heap
    curAlignHeapNode = heap.poll();

    while (null != curAlignHeapNode) {

      if (MAX_HEAP_SIZE <= heap.size()) {
        // too many to consider
        return;
      }

      // System.err.println("strand:" + strand + "\tsize:" + heap.size() + "\talignmentStart:" +
      // alignmentStart + "\toffset:" + offset + "\treadOffset:" + curAlignHeapNode.readOffset);
      // System.err.print("size:" + heap.size() + ":" + curAlignHeapNode.readOffset + ":" +
      // curAlignHeapNode.score + ":" + curAlignHeapNode.alleleCoverageSum + ":" +
      // curAlignHeapNode.startPosition + "\t");
      // curAlignHeapNode.node.print(System.err);
      // System.err.print("\rposition:" + curAlignHeapNode.node.position + "\treadOffset:" +
      // curAlignHeapNode.readOffset);

      // Remove all non-insertions with the same contig/pos/read-offset/type/base and lower score
      nextAlignHeapNode = heap.peek();
      while (Node.INSERTION != curAlignHeapNode.node.type
          && null != nextAlignHeapNode
          && 0 == comp.compare(curAlignHeapNode, nextAlignHeapNode)) {
        if (curAlignHeapNode.score < nextAlignHeapNode.score
            || (curAlignHeapNode.score == nextAlignHeapNode.score
                && curAlignHeapNode.alleleCoverageSum < nextAlignHeapNode.alleleCoverageSum)) {
          // Update current node
          curAlignHeapNode = heap.poll();
        } else {
          // Ignore next node
          heap.poll();
        }
        nextAlignHeapNode = heap.peek();
      }
      nextAlignHeapNode = null;

      // Check if the alignment is complete
      if (curAlignHeapNode.readOffset == read.length() - 1) {
        // All read bases examined, store if has the best alignment.

        // System.err.print(curAlignHeapNode.alleleCoverageSum + ":" + curAlignHeapNode.score +
        // ":");
        // System.err.print(curAlignHeapNode.startPosition + ":");
        // curAlignHeapNode.node.print(System.err);

        if (null == bestAlignHeapNode
            || bestAlignHeapNode.score < curAlignHeapNode.score
            || (bestAlignHeapNode.score == curAlignHeapNode.score
                && bestAlignHeapNode.alleleCoverageSum < curAlignHeapNode.alleleCoverageSum)) {
          bestAlignHeapNode = curAlignHeapNode;
        }
      } else if (null != bestAlignHeapNode && curAlignHeapNode.score < bestAlignHeapNode.score) {
        // ignore, under the assumption that scores can only become more negative.
      } else {
        if (strand) { // reverse
          // Go to all the "prev" nodes
          iter = curAlignHeapNode.node.prev.listIterator();
        } else { // forward
          // Go to all "next" nodes
          iter = curAlignHeapNode.node.next.listIterator();
        }
        while (iter.hasNext()) {
          NodeRecord next = iter.next();
          int f =
              passFilters(
                  graph, next.node, next.coverage, alleleCoverageCutoffs, MAXIMUM_TOTAL_COVERAGE);
          if (0 == f) {
            heap.add(
                new AlignHeapNode(
                    curAlignHeapNode,
                    next.node,
                    next.coverage,
                    read.charAt(curAlignHeapNode.readOffset + 1),
                    qualities.charAt(curAlignHeapNode.readOffset + 1),
                    useSequenceQualities,
                    space));
          } else if (f < 0) {
            return;
          }
        }
        iter = null;
      }
      // Get next node
      curAlignHeapNode = heap.poll();
    }

    // Recover alignment
    Align.updateSAM(
        rec,
        sequence,
        programRecord,
        bestAlignHeapNode,
        space,
        read,
        qualities,
        softClipStartBases,
        softClipStartQualities,
        softClipEndBases,
        softClipEndQualities,
        strand,
        correctBases);
  }
Example #8
0
  @Override
  public int doWork(String[] args) {
    boolean repair_missing_read = false;
    SortingCollectionFactory<MappedFastq> sortingFactory =
        new SortingCollectionFactory<MappedFastq>();
    File forwardFile = null;
    File reverseFile = null;
    com.github.lindenb.jvarkit.util.cli.GetOpt opt =
        new com.github.lindenb.jvarkit.util.cli.GetOpt();
    int c;

    sortingFactory.setComponentType(MappedFastq.class);
    sortingFactory.setCodec(new MappedFastqCodec());
    sortingFactory.setComparator(new MappedFastqComparator());

    while ((c = opt.getopt(args, super.getGetOptDefault() + "F:R:N:r")) != -1) {
      switch (c) {
        case 'F':
          forwardFile = new File(opt.getOptArg());
          break;
        case 'R':
          reverseFile = new File(opt.getOptArg());
          break;
        case 't':
          addTmpDirectory(new File(opt.getOptArg()));
          break;
        case 'N':
          sortingFactory.setMaxRecordsInRAM(Math.max(Integer.parseInt(opt.getOptArg()), 100));
          break;
        case 'r':
          repair_missing_read = true;
          break;
        case ':':
          System.err.println("Missing argument for option -" + opt.getOptOpt());
          return -1;
        default:
          {
            switch (handleOtherOptions(c, opt, args)) {
              case EXIT_FAILURE:
                return -1;
              case EXIT_SUCCESS:
                return 0;
              default:
                break;
            }
          }
      }
    }
    SAMFileReader sfr = null;
    SortingCollection<MappedFastq> fastqCollection = null;
    try {
      sortingFactory.setTmpDirs(this.getTmpDirectories());
      fastqCollection = sortingFactory.make();
      fastqCollection.setDestructiveIteration(true);
      boolean found_single = false;
      boolean found_paired = false;
      long non_primary_alignmaned_flag = 0L;

      if (opt.getOptInd() == args.length) {
        info("Reading from stdin");
        sfr = new SAMFileReader(System.in);
      } else if (opt.getOptInd() + 1 == args.length) {
        String filename = args[opt.getOptInd()];
        sfr = new SAMFileReader(new File(filename));
      } else {
        error(getMessageBundle("illegal.number.of.arguments"));
        return -1;
      }
      sfr.setValidationStringency(ValidationStringency.LENIENT);
      SAMRecordIterator iter = sfr.iterator();
      SAMSequenceDictionaryProgress progress =
          new SAMSequenceDictionaryProgress(sfr.getFileHeader().getSequenceDictionary());
      while (iter.hasNext()) {
        SAMRecord rec = iter.next();
        progress.watch(rec);

        if (rec.isSecondaryOrSupplementary()) {
          if (non_primary_alignmaned_flag == 0) {
            warning("SKIPPING NON-PRIMARY " + (non_primary_alignmaned_flag + 1) + " ALIGNMENTS");
          }
          non_primary_alignmaned_flag++;
          continue;
        }

        MappedFastq m = new MappedFastq();
        m.name = rec.getReadName();
        if (m.name == null) m.name = "";
        m.hash = m.name.hashCode();
        m.seq = rec.getReadString();

        if (m.seq.equals(SAMRecord.NULL_SEQUENCE_STRING)) m.seq = "";
        m.qual = rec.getBaseQualityString();
        if (m.qual.equals(SAMRecord.NULL_QUALS_STRING)) m.qual = "";
        if (!rec.getReadUnmappedFlag() && rec.getReadNegativeStrandFlag()) {
          m.seq = AcidNucleics.reverseComplement(m.seq);
          m.qual = new StringBuilder(m.qual).reverse().toString();
        }
        if (m.seq.length() != m.qual.length()) {
          error("length(seq)!=length(qual) in " + m.name);
          continue;
        }
        if (m.seq.isEmpty() && m.qual.isEmpty()) {
          m.seq = "N";
          m.qual = "#";
        }

        if (rec.getReadPairedFlag()) {
          found_paired = true;
          if (found_single) {
            sfr.close();
            throw new PicardException("input is a mix of paired/singled reads");
          }
          m.side = (byte) (rec.getSecondOfPairFlag() ? 2 : 1);
        } else {
          found_single = true;
          if (found_paired) {
            sfr.close();
            throw new PicardException("input is a mix of paired/singled reads");
          }
          m.side = (byte) 0;
        }
        fastqCollection.add(m);
      }
      iter.close();
      CloserUtil.close(iter);
      CloserUtil.close(sfr);
      progress.finish();

      fastqCollection.doneAdding();
      info("Done reading.");

      if (found_paired) {
        FastqWriter fqw1 = null;
        FastqWriter fqw2 = null;
        if (forwardFile != null) {
          info("Writing to " + forwardFile);
          fqw1 = new BasicFastqWriter(forwardFile);
        } else {
          info("Writing to stdout");
          fqw1 = new BasicFastqWriter(new PrintStream(System.out));
        }
        if (reverseFile != null) {
          info("Writing to " + reverseFile);
          fqw2 = new BasicFastqWriter(reverseFile);
        } else {
          info("Writing to interlaced stdout");
          fqw2 = fqw1;
        }
        List<MappedFastq> row = new ArrayList<MappedFastq>();
        CloseableIterator<MappedFastq> r = fastqCollection.iterator();
        for (; ; ) {
          MappedFastq curr = null;
          if (r.hasNext()) curr = r.next();
          if (curr == null || (!row.isEmpty() && !row.get(0).name.equals(curr.name))) {
            if (!row.isEmpty()) {
              if (row.size() > 2) {
                warning("WTF :" + row);
              }
              boolean found_F = false;
              boolean found_R = false;
              for (MappedFastq m : row) {
                switch ((int) m.side) {
                  case 1:
                    if (found_F)
                      throw new PicardException("two forward reads found for " + row.get(0).name);
                    found_F = true;
                    echo(fqw1, m);
                    break;
                  case 2:
                    if (found_R)
                      throw new PicardException("two reverse reads found for " + row.get(0).name);
                    found_R = true;
                    echo(fqw2, m);
                    break;
                  default:
                    throw new IllegalStateException("uh???");
                }
              }
              if (!found_F) {
                if (repair_missing_read) {
                  warning("forward not found for " + row.get(0));
                  MappedFastq pad = new MappedFastq();
                  pad.side = (byte) 1;
                  pad.name = row.get(0).name;
                  pad.seq = "N";
                  pad.qual = "#";
                  echo(fqw1, pad);
                } else {
                  throw new PicardException("forward not found for " + row);
                }
              }
              if (!found_R) {
                if (repair_missing_read) {
                  warning("reverse not found for " + row.get(0));
                  MappedFastq pad = new MappedFastq();
                  pad.side = (byte) 2;
                  pad.name = row.get(0).name;
                  pad.seq = "N";
                  pad.qual = "#";
                  echo(fqw2, pad);
                } else {
                  throw new PicardException("reverse not found for " + row);
                }
              }
            }
            if (curr == null) break;
            row.clear();
          }
          row.add(curr);
        }
        r.close();
        fqw1.close();
        fqw2.close();
      } else if (found_single) {
        FastqWriter fqw1 = null;
        if (forwardFile != null) {
          info("Writing to " + forwardFile);
          fqw1 = new BasicFastqWriter(forwardFile);
        } else {
          info("Writing to stdout");
          fqw1 = new BasicFastqWriter(new PrintStream(System.out));
        }

        CloseableIterator<MappedFastq> r = fastqCollection.iterator();
        while (r.hasNext()) {
          echo(fqw1, r.next());
        }
        r.close();
        fqw1.close();
      }
      return 0;
    } catch (Exception err) {
      error(err);
      return -1;
    } finally {
      if (fastqCollection != null) fastqCollection.cleanup();
    }
  }