public int compare(final SAMRecord rec1, final SAMRecord rec2) {
   // Unmapped reads sort after mapped reads; two unmapped reads tie.
   if (rec1.getReadUnmappedFlag()) {
     if (rec2.getReadUnmappedFlag()) return 0;
     else return 1;
   } else if (rec2.getReadUnmappedFlag()) {
     return -1;
   }
   // Negated so that higher mapping qualities sort first.
   return -SAMUtils.compareMapqs(rec1.getMappingQuality(), rec2.getMappingQuality());
 }
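A minimal usage sketch, assuming the compare method above sits in a Comparator<SAMRecord> implementation; the class name MapqComparator and the main-class wrapper are illustrative, not from the source:

import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import java.io.File;
import java.util.ArrayList;
import java.util.List;

public class SortByMapq {
  public static void main(String[] args) {
    List<SAMRecord> reads = new ArrayList<>();
    try (SamReader reader = SamReaderFactory.makeDefault().open(new File(args[0]))) {
      for (SAMRecord rec : reader) reads.add(rec);
    } catch (java.io.IOException e) {
      throw new RuntimeException(e);
    }
    // MapqComparator is a hypothetical wrapper around the compare() above.
    reads.sort(new MapqComparator());
    // The best-mapped read is now first; unmapped reads are last.
  }
}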
Example #2
  public SAMRecord next() {
    SAMRecord cur = it.next();
    // Verify coordinate order against the previous mapped record.
    if (last != null) verifyRecord(last, cur);
    // Track the last mapped record; unmapped reads don't advance the check.
    if (!cur.getReadUnmappedFlag()) last = cur;
    return cur;
  }
Example #3
  /** Note: this is the only getKey function that handles unmapped reads specially! */
  public static long getKey(final SAMRecord rec) {
    final int refIdx = rec.getReferenceIndex();
    final int start = rec.getAlignmentStart();

    if (!(rec.getReadUnmappedFlag() || refIdx < 0 || start < 0)) return getKey(refIdx, start);

    // Put unmapped reads at the end, but don't give them all the exact same
    // key so that they can be distributed to different reducers.
    //
    // A random number would probably be best, but to ensure that the same
    // record always gets the same key we use a fast hash instead.
    //
    // We avoid using hashCode(), because it's not guaranteed to have the
    // same value across different processes.

    int hash = 0;
    byte[] var;
    if ((var = rec.getVariableBinaryRepresentation()) != null) {
      // Undecoded BAM record: just hash its raw data.
      hash = (int) MurmurHash3.murmurhash3(var, hash);
    } else {
      // Decoded BAM record or any SAM record: hash a few representative
      // fields together.
      hash = (int) MurmurHash3.murmurhash3(rec.getReadName(), hash);
      hash = (int) MurmurHash3.murmurhash3(rec.getReadBases(), hash);
      hash = (int) MurmurHash3.murmurhash3(rec.getBaseQualities(), hash);
      hash = (int) MurmurHash3.murmurhash3(rec.getCigarString(), hash);
    }
    return getKey0(Integer.MAX_VALUE, hash);
  }
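A sketch of the two-argument overloads the method above relies on, assuming the usual Hadoop-BAM key layout (an assumption, since those overloads are not shown here): the reference index fills the high 32 bits and the 0-based coordinate the low 32 bits, so plain long ordering sorts by (reference, position), and getKey0(Integer.MAX_VALUE, hash) lands after every mapped key.

  public static long getKey(final int refIdx, final int alignmentStart) {
    return getKey0(refIdx, alignmentStart - 1); // SAM coordinates are 1-based
  }

  public static long getKey0(final int refIdx0, final int pos0) {
    return ((long) refIdx0 << 32) | (pos0 & 0xffffffffL);
  }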
Example #4
 private boolean overlaps(SAMRecord r) {
   if (intervals == null
       || (r.getReadUnmappedFlag() && r.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START)) {
     return true;
   }
   if (r.getReadUnmappedFlag()) { // special case for unmapped reads with coordinate set
     for (Locatable interval : intervals) {
       if (interval.getStart() <= r.getStart() && interval.getEnd() >= r.getStart()) {
         // This follows the behavior of htsjdk's SamReader which states that
         // "an unmapped read will be returned by this call if it has a coordinate for
         // the purpose of sorting that is in the query region".
         return true;
       }
     }
     // No query interval contains the unmapped read's assigned coordinate.
     return false;
   }
   final Interval interval = new Interval(r.getContig(), r.getStart(), r.getEnd());
   Collection<Interval> overlaps = overlapDetector.getOverlaps(interval);
   return !overlaps.isEmpty();
 }
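A minimal sketch of how the intervals and overlapDetector fields used above could be initialized; the surrounding class is not shown in the source, so the setup below is an assumption using htsjdk's Interval and OverlapDetector:

import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.OverlapDetector;
import java.util.Arrays;
import java.util.List;

// Illustrative query regions only; real ones would come from user input.
List<Interval> intervals = Arrays.asList(
    new Interval("chr1", 1000, 2000),
    new Interval("chr2", 500, 800));
OverlapDetector<Interval> overlapDetector = OverlapDetector.create(intervals);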
Example #5
  @Override
  public SAMRecordPair getNextReadPair() {
    // Buffer reads by query name: the first mate creates an entry and the
    // second completes it. Completed pairs are returned as soon as they are
    // seen, and their entry is removed so the buffer does not leak memory.

    if (iterator.hasNext()) {
      while (iterator.hasNext()) {
        SAMRecord record = iterator.next();
        countRead(record);
        // Skip reads that are unmapped, not properly paired, or whose mate is unmapped.
        if (record.getReadUnmappedFlag()
            || !record.getProperPairFlag()
            || record.getMateUnmappedFlag()) {
          continue;
        }

        String query = record.getReadName();

        // check if read mate has been read already
        if (readBuffer.containsKey(query)) {
          // if it has then return the pair
          SAMRecordPair pair = readBuffer.get(query);
          pair.addPair(record);
          if (pair.bothPairsAligned() && pair.isValidPair()) {
            // prevent memory leak by deleting keys that are no longer needed
            readBuffer.remove(query);
            return pair;
          } else {
            throw new RuntimeException(query + " is not properly mated");
          }
        } else {
          // otherwise create an entry and store it by its query name
          SAMRecordPair pair = new SAMRecordPair();
          pair.addPair(record);
          readBuffer.put(query, pair);
        }
      }
    } else {
      if (readBuffer.size() > 0) {
        for (String key : readBuffer.keySet()) {
          logger.info("No mate found for " + key);
        }
        throw new RuntimeException(
            "No mates found for some reads; please make sure all reads are properly paired");
      }
    }

    return null;
  }
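A minimal usage sketch; pairReader stands for whatever class implements getNextReadPair() above, and getFirstPair()/getSecondPair() are assumed accessor names on SAMRecordPair, not confirmed by the source:

  SAMRecordPair pair;
  while ((pair = pairReader.getNextReadPair()) != null) { // null once input is exhausted
    SAMRecord first = pair.getFirstPair();   // assumed accessor
    SAMRecord second = pair.getSecondPair(); // assumed accessor
    // ... process the completed mate pair ...
  }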
Example #6
  private boolean isOutOfOrder(final SAMRecord last, final SAMRecord cur) {
    if (last == null || cur.getReadUnmappedFlag()) return false;
    else {
      if (last.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX
          || last.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START)
        throw new UserException.MalformedBAM(
            last, String.format("read %s has inconsistent mapping information.", last.format()));
      if (cur.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX
          || cur.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START)
        throw new UserException.MalformedBAM(
            cur, String.format("read %s has inconsistent mapping information.", cur.format()));

      return (last.getReferenceIndex() > cur.getReferenceIndex())
          || (last.getReferenceIndex().equals(cur.getReferenceIndex())
              && last.getAlignmentStart() > cur.getAlignmentStart());
    }
  }
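A plausible verifyRecord implementation for the iterator in Example #2 can be built directly on isOutOfOrder; this pairing is an assumption, not shown in the source:

  private void verifyRecord(final SAMRecord last, final SAMRecord cur) {
    // Hypothetical glue: fail fast when coordinate order is violated.
    if (isOutOfOrder(last, cur)) {
      throw new UserException.MalformedBAM(
          cur,
          String.format(
              "reads out of coordinate order: %s followed by %s",
              last.getReadName(), cur.getReadName()));
    }
  }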
Example #7
  /** Returns a downsampled set of reads for each sample. */
  private List<List<SAMRecord>> getReads(
      List<String> inputFiles, List<Feature> regions, ReAligner realigner) {

    int downsampleTarget = desiredNumberOfReads(regions);
    List<DownsampledReadList> readsList = new ArrayList<DownsampledReadList>();

    for (String input : inputFiles) {
      Set<String> readIds = new HashSet<String>();
      DownsampledReadList reads = new DownsampledReadList(downsampleTarget);
      readsList.add(reads);

      for (Feature region : regions) {
        SAMFileReader reader = new SAMFileReader(new File(input));
        reader.setValidationStringency(ValidationStringency.SILENT);

        Iterator<SAMRecord> iter;
        if (region != null) {
          iter =
              reader.queryOverlapping(
                  region.getSeqname(), (int) region.getStart(), (int) region.getEnd());
        } else {
          iter = reader.iterator();
        }

        while (iter.hasNext()) {

          SAMRecord read = iter.next();

          // Don't allow same read to be counted twice.
          if ((!realigner.isFiltered(read))
              && (!read.getDuplicateReadFlag())
              && (!read.getReadFailsVendorQualityCheckFlag())
              && (read.getMappingQuality() >= realigner.getMinMappingQuality()
                  || read.getReadUnmappedFlag())
              && (!readIds.contains(getIdentifier(read)))) {

            if (read.getReadString().length() > readLength) {
              reader.close();
              throw new IllegalArgumentException(
                  "Maximum read length of: "
                      + readLength
                      + " exceeded for: "
                      + read.getSAMString());
            }

            readIds.add(getIdentifier(read));

            reads.add(read);
          }
        }

        if (isDebug && reads.getTotalReadCount() != reads.getReads().size()) {
          System.err.println(
              "downsampled: "
                  + regions.get(0).getDescriptor()
                  + ": "
                  + reads.getTotalReadCount()
                  + " -> "
                  + reads.getReads().size());
        }

        reader.close();
      }
    }

    List<List<SAMRecord>> sampleReads = new ArrayList<List<SAMRecord>>();

    for (DownsampledReadList downsampledReads : readsList) {
      sampleReads.add(downsampledReads.getReads());
    }

    return sampleReads;
  }
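The DownsampledReadList class used above is not shown in the source; a minimal sketch of what it likely does is classic reservoir sampling, so at most `target` reads are kept while every read seen has an equal chance of surviving. Treat this as an illustration of the technique only:

import htsjdk.samtools.SAMRecord;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;

class DownsampledReadList {
  private final int target;
  private final List<SAMRecord> reads = new ArrayList<>();
  private final Random random = new Random();
  private int totalReadCount = 0;

  DownsampledReadList(int target) { this.target = target; }

  void add(SAMRecord read) {
    totalReadCount++;
    if (reads.size() < target) {
      reads.add(read); // fill the reservoir first
    } else {
      int idx = random.nextInt(totalReadCount); // uniform over all reads seen
      if (idx < target) reads.set(idx, read);   // keep with probability target/total
    }
  }

  List<SAMRecord> getReads() { return reads; }
  int getTotalReadCount() { return totalReadCount; }
}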
Example #8
  /**
   * Main method for the program. Checks that all input files are present and readable and that the
   * output file can be written to, then iterates through all the records, accumulating metrics.
   * Finally, writes the metrics file.
   */
  protected int doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);

    final SamReader reader =
        SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);

    final Histogram<Integer> mismatchesHist = new Histogram<Integer>("Predicted", "Mismatches");
    final Histogram<Integer> totalHist = new Histogram<Integer>("Predicted", "Total_Bases");
    final Map<String, Histogram<Integer>> mismatchesByTypeHist =
        new HashMap<String, Histogram<Integer>>();
    final Map<String, Histogram<Integer>> totalByTypeHist =
        new HashMap<String, Histogram<Integer>>();

    // Set up the histograms
    byte[] bases = {'A', 'C', 'G', 'T'};
    for (final byte base : bases) {
      mismatchesByTypeHist.put(
          (char) base + ">", new Histogram<Integer>("Predicted", (char) base + ">"));
      mismatchesByTypeHist.put(
          ">" + (char) base, new Histogram<Integer>("Predicted", ">" + (char) base));
      totalByTypeHist.put(
          (char) base + ">", new Histogram<Integer>("Predicted", (char) base + ">"));
      totalByTypeHist.put(
          ">" + (char) base, new Histogram<Integer>("Predicted", ">" + (char) base));
    }

    for (final SAMRecord record : reader) {
      // Ignore these as we don't know the truth
      if (record.getReadUnmappedFlag() || record.isSecondaryOrSupplementary()) {
        continue;
      }
      final byte[] readBases = record.getReadBases();
      final byte[] readQualities = record.getBaseQualities();
      final byte[] refBases = SequenceUtil.makeReferenceFromAlignment(record, false);

      // We've seen stranger things
      if (readQualities.length != readBases.length) {
        throw new PicardException(
            "Missing Qualities ("
                + readQualities.length
                + ","
                + readBases.length
                + ") : "
                + record.getSAMString());
      }

      if (refBases.length != readBases.length) {
        throw new PicardException(
            "The read length did not match the inferred reference length, please check your MD and CIGAR.");
      }

      int cycleIndex; // zero-based
      if (record.getReadNegativeStrandFlag()) {
        cycleIndex = readBases.length - 1 + CYCLE_OFFSET;
      } else {
        cycleIndex = CYCLE_OFFSET;
      }

      for (int i = 0; i < readBases.length; i++) {
        if (-1 == CYCLE || cycleIndex == CYCLE) {
          if ('-' != refBases[i] && '0' != refBases[i]) { // not insertion and not soft-clipped
            if (!SequenceUtil.basesEqual(readBases[i], refBases[i])) { // mismatch
              mismatchesHist.increment((int) readQualities[i]);
              if (SequenceUtil.isValidBase(refBases[i])) {
                mismatchesByTypeHist
                    .get((char) refBases[i] + ">")
                    .increment((int) readQualities[i]);
              }
              if (SequenceUtil.isValidBase(readBases[i])) {
                mismatchesByTypeHist
                    .get(">" + (char) readBases[i])
                    .increment((int) readQualities[i]);
              }
            } else {
              mismatchesHist.increment(
                  (int) readQualities[i], 0); // to make sure the bin will exist
            }
            totalHist.increment((int) readQualities[i]);
            if (SequenceUtil.isValidBase(readBases[i])) {
              totalByTypeHist.get(">" + (char) readBases[i]).increment((int) readQualities[i]);
            }
            if (SequenceUtil.isValidBase(refBases[i])) {
              totalByTypeHist.get((char) refBases[i] + ">").increment((int) readQualities[i]);
            }
          }
        }
        cycleIndex += record.getReadNegativeStrandFlag() ? -1 : 1;
      }
    }
    CloserUtil.close(reader);

    final Histogram<Integer> hist = new Histogram<Integer>("Predicted", "Observed");

    double sumOfSquaresError = 0.0;

    // compute the aggregate phred values
    for (final Integer key : mismatchesHist.keySet()) {
      final double numMismatches = mismatchesHist.get(key).getValue();
      final double numBases = totalHist.get(key).getValue();
      final double phredErr = Math.log10(numMismatches / numBases) * -10.0;
      sumOfSquaresError += (0 == numMismatches) ? 0.0 : (key - phredErr) * (key - phredErr);
      hist.increment(key, phredErr);

      // make sure the bin will exist
      for (final byte base : bases) {
        mismatchesByTypeHist.get(">" + (char) base).increment(key, 0.0);
        mismatchesByTypeHist.get((char) base + ">").increment(key, 0.0);
        totalByTypeHist.get(">" + (char) base).increment(key, 0.0);
        totalByTypeHist.get((char) base + ">").increment(key, 0.0);
      }
    }

    final QualityScoreAccuracyMetrics metrics = new QualityScoreAccuracyMetrics();
    metrics.SUM_OF_SQUARE_ERROR = sumOfSquaresError;

    final MetricsFile<QualityScoreAccuracyMetrics, Integer> out = getMetricsFile();
    out.addMetric(metrics);
    out.addHistogram(hist);
    for (final byte base : bases) {
      // >base : histograms for mismatches *to* the given base
      Histogram<Integer> m = mismatchesByTypeHist.get(">" + (char) base);
      Histogram<Integer> t = totalByTypeHist.get(">" + (char) base);
      Histogram<Integer> h = new Histogram<Integer>(m.getBinLabel(), m.getValueLabel());
      for (final Integer key : m.keySet()) {
        final double numMismatches = m.get(key).getValue();
        final double numBases = t.get(key).getValue();
        final double phredErr = Math.log10(numMismatches / numBases) * -10.0;
        h.increment(key, phredErr);
      }
      out.addHistogram(h);

      // base> : histograms for mismatches *from* the given base
      m = mismatchesByTypeHist.get((char) base + ">");
      t = totalByTypeHist.get((char) base + ">");
      h = new Histogram<Integer>(m.getBinLabel(), m.getValueLabel());
      for (final Integer key : m.keySet()) {
        final double numMismatches = m.get(key).getValue();
        final double numBases = t.get(key).getValue();
        final double phredErr = Math.log10(numMismatches / numBases) * -10.0;
        h.increment(key, phredErr);
      }
      out.addHistogram(h);
    }

    out.addHistogram(mismatchesHist);
    out.addHistogram(totalHist);
    out.write(OUTPUT);

    return 0;
  }
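The empirical quality computed in the loops above, written out as a standalone helper (a hypothetical name, but the formula matches doWork):

  // Phred scale: Q = -10 * log10(error rate).
  static double observedPhred(final double numMismatches, final double numBases) {
    return -10.0 * Math.log10(numMismatches / numBases);
  }
  // Usage: observedPhred(1, 1000) == 30.0 (1 mismatch in 1000 bases -> Q30).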
Example #9
 /**
  * Returns true if we don't think this read is eligible for the BAQ calculation. Examples include
  * non-PF reads, duplicates, and unmapped reads. Used by baqRead to determine if a read should fall
  * through the calculation.
  *
  * @param read the read to check
  * @return true if the read should be excluded from the BAQ calculation
  */
 public boolean excludeReadFromBAQ(SAMRecord read) {
   // Keep mapped reads regardless of pairing or primary-alignment status.
   return read.getReadUnmappedFlag()
       || read.getReadFailsVendorQualityCheckFlag()
       || read.getDuplicateReadFlag();
 }
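A minimal sketch of the guard described in the javadoc: baqRead is assumed here to return the read unchanged when it is excluded, and the method body is illustrative only.

  public SAMRecord baqRead(final SAMRecord read) {
    if (excludeReadFromBAQ(read)) {
      return read; // non-PF, duplicate, or unmapped: fall through unchanged
    }
    // ... compute and apply BAQ-adjusted base qualities for eligible reads ...
    return read;
  }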
Example #10
  @Override
  public void execute() {
    log.info("Loading reads...");

    List<SAMRecord> reads = new ArrayList<SAMRecord>();

    for (SAMRecord sr : BAM) {
      if (!sr.getReadUnmappedFlag()) {
        reads.add(sr);
      }
    }

    log.info("  {} reads loaded", reads.size());

    Collections.shuffle(reads);

    log.info("  shuffled");

    DataTable dt =
        new DataTable(
            "LanderWaterman",
            "Realistic Lander-Waterman stats",
            "reads",
            "bp_used",
            "bp_total",
            "pct_genome");

    for (int numReads : NUM_READS) {
      log.info("Selecting {} reads...", numReads);

      Map<String, boolean[]> utilizationMask = new HashMap<String, boolean[]>();

      for (SAMSequenceRecord ssr : BAM.getFileHeader().getSequenceDictionary().getSequences()) {
        utilizationMask.put(ssr.getSequenceName(), new boolean[ssr.getSequenceLength()]);
      }

      for (int i = 0; i < numReads; i++) {
        SAMRecord read = reads.get(i);

        // Alignment coordinates are 1-based inclusive; shift to 0-based array indices.
        for (int j = read.getAlignmentStart() - 1; j < read.getAlignmentEnd(); j++) {
          utilizationMask.get(read.getReferenceName())[j] = true;
        }
      }

      int basesUsed = 0, basesTotal = 0;

      for (final boolean[] mask : utilizationMask.values()) {
        for (final boolean covered : mask) {
          if (covered) {
            basesUsed++;
          }
          basesTotal++;
        }
      }

      dt.set("lw" + numReads, "reads", numReads);
      dt.set("lw" + numReads, "bp_used", basesUsed);
      dt.set("lw" + numReads, "bp_total", basesTotal);
      dt.set("lw" + numReads, "pct_genome", (float) basesUsed / (float) basesTotal);

      log.info(
          "  reads: {}, bp_used: {}, bp_total: {}, pct_genome: {}",
          numReads,
          basesUsed,
          basesTotal,
          (float) basesUsed / (float) basesTotal);
    }

    out.println(dt);
  }
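For comparison with the empirical pct_genome above: under the classic Lander-Waterman model, the expected fraction of the genome covered at mean coverage c = numReads * readLength / genomeSize is 1 - e^(-c). A small helper with illustrative names, not from the source:

  static double expectedCoveredFraction(
      final long numReads, final long readLength, final long genomeSize) {
    final double c = (double) numReads * readLength / genomeSize; // mean coverage
    return 1.0 - Math.exp(-c); // expected covered fraction
  }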