Esempio n. 1
0
  /**
   * Computes the mismatch and indel context keys for each base of {@code read} and stores them in
   * {@code values}.
   *
   * <p>The read's bases are snapshotted on entry and written back on exit, because the context is
   * computed on a copy of the read whose low quality ends are overwritten with Ns.
   *
   * @param read the read to compute context covariates for
   * @param values destination for the per-offset covariate keys
   */
  @Override
  public void recordValues(final SAMRecord read, final ReadCovariates values) {
    // Snapshot the bases up front so they can be restored once the keys have been recorded.
    final byte[] savedBases = read.getReadBases().clone();
    final byte[] contextBases = getStrandedBytes(read, lowQualTail);
    final int contextLength = contextBases.length;

    final List<Integer> mismatchContextKeys =
        contextWith(contextBases, mismatchesContextSize, mismatchesKeyMask);
    final List<Integer> indelContextKeys =
        contextWith(contextBases, indelsContextSize, indelsKeyMask);

    // When the stranded bases do not cover the whole read, stale data from a previous read could
    // survive in the ReadCovariates, so every offset is zeroed first. Don't bother zeroing when
    // the loop below is going to overwrite the whole array anyway.
    if (contextLength != savedBases.length) {
      for (int offset = 0; offset < savedBases.length; offset++) {
        values.addCovariate(0, 0, 0, offset);
      }
    }

    final boolean onNegativeStrand = read.getReadNegativeStrandFlag();
    for (int strandedIndex = 0; strandedIndex < contextLength; strandedIndex++) {
      final int offsetInRead = getStrandedOffset(onNegativeStrand, strandedIndex, contextLength);
      // The indel key is recorded twice: addCovariate takes it for both of its indel slots.
      final int indelContextKey = indelContextKeys.get(strandedIndex);
      values.addCovariate(
          mismatchContextKeys.get(strandedIndex), indelContextKey, indelContextKey, offsetInRead);
    }

    // Restore the bases exactly as they were on entry.
    read.setReadBases(savedBases);
  }
Esempio n. 2
0
  /**
   * Returns the bases of a read after its low quality ends have been masked with N, oriented
   * according to the read's strand.
   *
   * <p>The read is passed through {@code ReadClipper.clipLowQualEnds} with the {@code WRITE_NS}
   * representation. When the read is flagged as negative strand, the masked bases are
   * reverse-complemented before being returned.
   *
   * @param read the read
   * @param lowQTail every base quality lower than or equal to this in the tail of the read will be
   *     replaced with N.
   * @return the masked bases, reverse-complemented for negative-strand reads.
   */
  public static byte[] getStrandedBytes(final SAMRecord read, final byte lowQTail) {
    // Masking the low quality tail with Ns keeps those bases out of the context.
    final byte[] maskedBases =
        ReadClipper.clipLowQualEnds(read, lowQTail, ClippingRepresentation.WRITE_NS)
            .getReadBases();

    return read.getReadNegativeStrandFlag()
        ? BaseUtils.simpleReverseComplement(maskedBases)
        : maskedBases;
  }
Esempio n. 3
0
  /**
   * Loads the reads for the given regions, decides whether the regions are worth assembling and,
   * if so, runs the assembler over the configured kmer sizes.
   *
   * <p>Side effects visible in this method: {@code this.kmers} is replaced when it is empty or
   * its first entry is below {@code KmerSizeEvaluator.MIN_KMER}; SV candidate positions and
   * regions are accumulated in {@code svCandidates} / {@code svCandidateRegions} when
   * {@code shouldSearchForSv} is set; {@code isCycleExceedingThresholdDetected} is raised when a
   * repeat is reported at a large kmer.
   *
   * @param inputFiles input files handed to {@code getReads}
   * @param output prefix of the per-kmer output file name ({@code output + "_k" + kmer})
   * @param tempDir not used by this method
   * @param regions regions to load reads for; must be non-empty since the first region is used for
   *     thresholds and debug output
   * @param prefix prefix handed to {@code assemble}
   * @param checkForDupes not used by this method
   * @param realigner used to build the kmer array and to load the reads
   * @param c2r reference comparison helper; {@code null} marks the unaligned region, which is
   *     always assembled
   * @return the value returned by the last {@code assemble} call, or an empty string when
   *     assembly was skipped
   * @throws RuntimeException wrapping any exception raised while loading or assembling reads
   */
  public String assembleContigs(
      List<String> inputFiles,
      String output,
      String tempDir,
      List<Feature> regions,
      String prefix,
      boolean checkForDupes,
      ReAligner realigner,
      CompareToReference2 c2r) {

    if ((kmers.length == 0) || (kmers[0] < KmerSizeEvaluator.MIN_KMER)) {
      KmerSizeEvaluator kmerEval = new KmerSizeEvaluator();
      int kmer = kmerEval.identifyMinKmer(readLength, c2r, regions);
      this.kmers = realigner.toKmerArray(kmer, readLength);
    }

    String contigs = "";

    long start = System.currentTimeMillis();

    int readCount = 0;

    int minReadCount = Integer.MAX_VALUE;

    // if c2r is null, this is the unaligned region.
    boolean isAssemblyCandidate = (c2r == null);

    try {

      List<List<SAMRecord>> readsList = getReads(inputFiles, regions, realigner);

      for (List<SAMRecord> reads : readsList) {
        int candidateReadCount = 0;
        for (SAMRecord read : reads) {
          if (!isAssemblyCandidate && isAssemblyTriggerCandidate(read, c2r)) {
            candidateReadCount++;
          }

          if (shouldSearchForSv && isSvCandidate(read)) {
            svCandidates.add(
                new Position(read.getMateReferenceName(), read.getMateAlignmentStart()));
          }
        }

        if (candidateReadCount > minCandidateCount(reads.size(), regions.get(0))) {
          isAssemblyCandidate = true;
        }

        minReadCount = Math.min(minReadCount, reads.size());
        readCount += reads.size();
      }

      // Encoded reads are materialised once; the same string is reused for every kmer below.
      String encodedReads = "";

      if (isAssemblyCandidate) {
        StringBuilder readBuffer = new StringBuilder();

        int downsampleTarget = desiredNumberOfReads(regions);

        char sampleId = 1;

        for (List<SAMRecord> reads : readsList) {
          // Default to always keep (nextDouble() is always below 1.1)
          double keepProbability = 1.1;

          if (reads.size() > downsampleTarget) {
            keepProbability = (double) downsampleTarget / (double) reads.size();
          }

          // Fixed seed keeps the downsampling reproducible from run to run.
          Random random = new Random(1);

          for (SAMRecord read : reads) {
            if (random.nextDouble() < keepProbability) {
              appendEncodedRead(readBuffer, sampleId, read);
            }
          }

          // Make this set of reads eligible for GC
          reads.clear();
          sampleId += 1;
        }

        encodedReads = readBuffer.toString();
      }

      readsList.clear();

      if (isAssemblyCandidate) {
        for (int kmer : kmers) {

          String outputFile = output + "_k" + kmer;

          contigs =
              assemble(
                  encodedReads,
                  outputFile,
                  prefix,
                  truncateOnRepeat ? 1 : 0,
                  maxContigs,
                  maxPathsFromRoot,
                  readLength,
                  kmer,
                  minKmerFrequency,
                  minBaseQuality,
                  minEdgeRatio,
                  isDebug ? 1 : 0,
                  maxNodes);

          // Anything other than the repeat marker ends the search; otherwise try the next kmer.
          if (!contigs.equals("<REPEAT>")) {
            break;
          }

          if (kmer >= readLength / 2 || kmer >= CYCLE_KMER_LENGTH_THRESHOLD) {
            isCycleExceedingThresholdDetected = true;
          }
        }
      }

    } catch (Exception e) {
      e.printStackTrace();
      throw new RuntimeException(e);
    }

    if (this.shouldSearchForSv) {

      Collections.sort(this.svCandidates);
      Position last = null;
      String currentFeatureChr = null;
      int currentFeatureStart = -1;
      int currentFeatureStop = -1;
      int currentFeatureCount = 0;

      // TODO: Calc this dynamically
      int windowSize = 500;

      for (Position pos : this.svCandidates) {
        if ((last != null)
            && pos.getChromosome().equals(last.getChromosome())
            && Math.abs(pos.getPosition() - last.getPosition()) < windowSize) {

          // This position is close to the previous one: open a feature or extend the open one.
          if (currentFeatureChr == null) {
            currentFeatureChr = pos.getChromosome();
            currentFeatureStart = last.getPosition();
            currentFeatureStop = pos.getPosition() + readLength;
            currentFeatureCount = 1;
          } else {
            currentFeatureStop = pos.getPosition() + readLength;
            currentFeatureCount++;
          }
        } else {
          if (currentFeatureChr != null) {
            // Gap reached: emit the open feature if it has enough support, then reset.
            addSvCandidateRegionIfSupported(
                currentFeatureChr,
                currentFeatureStart,
                currentFeatureStop,
                currentFeatureCount,
                minReadCount);
            currentFeatureChr = null;
            currentFeatureStart = -1;
            currentFeatureStop = -1;
            currentFeatureCount = 0;
          } else {
            currentFeatureChr = pos.getChromosome();
            currentFeatureStart = pos.getPosition();
            currentFeatureStop = pos.getPosition() + readLength;
            currentFeatureCount = 1;
          }
        }
        last = pos;
      }

      // Don't forget last SV candidate region
      addSvCandidateRegionIfSupported(
          currentFeatureChr,
          currentFeatureStart,
          currentFeatureStop,
          currentFeatureCount,
          minReadCount);
    }

    long end = System.currentTimeMillis();

    if (isDebug) {
      // Report the first configured kmer, or readLength + 1 when none is configured.
      int kmer = readLength + 1;
      if (kmers.length > 0) {
        kmer = kmers[0];
      }

      System.err.println(
          "Elapsed_msecs_in_NativeAssembler\tRegion:\t"
              + regions.get(0).getDescriptor()
              + "\tLength:\t"
              + regions.get(0).getLength()
              + "\tReadCount:\t"
              + readCount
              + "\tElapsed\t"
              + (end - start)
              + "\tAssembled\t"
              + isAssemblyCandidate
              + "\t"
              + kmer);
    }

    return contigs;
  }

  /**
   * Appends one read to the assembler input: sample id, strand flag ("1" for negative strand),
   * bases padded with 'N' and qualities padded with '!' up to {@code readLength}.
   */
  private void appendEncodedRead(StringBuilder buffer, char sampleId, SAMRecord read) {
    String bases = read.getReadString();
    // Non-positive when the read is at least readLength long, in which case nothing is padded.
    int padLength = readLength - bases.length();

    buffer.append(sampleId);
    buffer.append(read.getReadNegativeStrandFlag() ? "1" : "0");

    buffer.append(bases);
    for (int i = 0; i < padLength; i++) {
      buffer.append('N');
    }

    buffer.append(read.getBaseQualityString());
    for (int i = 0; i < padLength; i++) {
      buffer.append('!');
    }
  }

  /**
   * Registers an SV candidate region, widened by {@code readLength} on both sides, when its
   * supporting count exceeds the minimum derived from {@code minReadCount}.
   */
  private void addSvCandidateRegionIfSupported(
      String chromosome, int featureStart, int featureStop, int featureCount, int minReadCount) {
    if (featureCount > (minReadCount / MAX_READ_LENGTHS_PER_REGION) * minReadCandidateFraction) {
      Feature region =
          new Feature(chromosome, featureStart - readLength, featureStop + readLength);
      this.svCandidateRegions.add(new BreakpointCandidate(region, featureCount));
    }
  }
Esempio n. 4
0
  /**
   * Assembles contigs from a single list of reads, trying each configured kmer in turn until the
   * assembler returns something other than the repeat marker.
   *
   * <p>Every read is encoded with sample id 1. The output name is derived from the reference name
   * and alignment start of the first read and the alignment end (or start, when the end is not
   * positive) of the last read.
   *
   * @param reads the reads to assemble; must be non-empty, since the first and last reads are
   *     used to name the region
   * @return the value returned by the last {@code assemble} call, or an empty string when no kmer
   *     is configured
   */
  public String simpleAssemble(List<SAMRecord> reads) {

    StringBuilder readBuffer = new StringBuilder();

    for (SAMRecord read : reads) {
      String bases = read.getReadString();
      // Non-positive when the read is at least readLength long, in which case nothing is padded.
      int padLength = readLength - bases.length();

      readBuffer.append((char) 1);
      readBuffer.append(read.getReadNegativeStrandFlag() ? "1" : "0");

      // Bases are padded with 'N' and qualities with '!' up to readLength.
      readBuffer.append(bases);
      for (int i = 0; i < padLength; i++) {
        readBuffer.append('N');
      }

      readBuffer.append(read.getBaseQualityString());
      for (int i = 0; i < padLength; i++) {
        readBuffer.append('!');
      }
    }

    // Materialise the encoded reads once; the same string is reused for every kmer.
    String encodedReads = readBuffer.toString();

    SAMRecord firstRead = reads.get(0);
    SAMRecord lastRead = reads.get(reads.size() - 1);
    int regionStart = firstRead.getAlignmentStart();
    int regionEnd =
        lastRead.getAlignmentEnd() > 0 ? lastRead.getAlignmentEnd() : lastRead.getAlignmentStart();

    String output = "region_" + firstRead.getReferenceName() + "_" + regionStart + "_" + regionEnd;
    String contigs = "";

    for (int kmer : kmers) {

      String outputFile = output + "_k" + kmer;

      contigs =
          assemble(
              encodedReads,
              outputFile,
              output,
              1, // truncate_on_repeat
              maxContigs,
              maxPathsFromRoot,
              readLength,
              kmer,
              minKmerFrequency,
              minBaseQuality,
              minEdgeRatio,
              isDebug ? 1 : 0,
              maxNodes);

      // Only the repeat marker triggers a retry with the next kmer.
      if (!contigs.equals("<REPEAT>")) {
        break;
      }
    }

    return contigs;
  }
  /**
   * Main method for the program. Checks that the input file is readable and that the output file
   * can be written to, then iterates through all the records accumulating mismatch and total base
   * counts per reported base quality, overall and per base type. Finally derives the observed
   * phred-scaled error per predicted quality and writes the metrics file.
   *
   * <p>Unmapped, secondary and supplementary records are skipped. Positions whose inferred
   * reference base is '-' or '0' are not counted. When {@code CYCLE} is not -1, only bases at that
   * cycle are counted.
   *
   * @return 0 on success
   * @throws PicardException when a record's qualities or inferred reference bases differ in
   *     length from its read bases
   */
  protected int doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);

    final Histogram<Integer> mismatchesHist = new Histogram<Integer>("Predicted", "Mismatches");
    final Histogram<Integer> totalHist = new Histogram<Integer>("Predicted", "Total_Bases");
    final Map<String, Histogram<Integer>> mismatchesByTypeHist =
        new HashMap<String, Histogram<Integer>>();
    final Map<String, Histogram<Integer>> totalByTypeHist =
        new HashMap<String, Histogram<Integer>>();

    // Set up the histograms: "X>" is keyed by reference base, ">X" by read base.
    final byte[] bases = {'A', 'C', 'G', 'T'};
    for (final byte base : bases) {
      final String fromKey = fromBaseKey(base);
      final String toKey = toBaseKey(base);
      mismatchesByTypeHist.put(fromKey, new Histogram<Integer>("Predicted", fromKey));
      mismatchesByTypeHist.put(toKey, new Histogram<Integer>("Predicted", toKey));
      totalByTypeHist.put(fromKey, new Histogram<Integer>("Predicted", fromKey));
      totalByTypeHist.put(toKey, new Histogram<Integer>("Predicted", toKey));
    }

    final SamReader reader =
        SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);
    try {
      for (final SAMRecord record : reader) {
        // Ignore these as we don't know the truth
        if (record.getReadUnmappedFlag() || record.isSecondaryOrSupplementary()) {
          continue;
        }
        final byte[] readBases = record.getReadBases();
        final byte[] readQualities = record.getBaseQualities();
        final byte[] refBases = SequenceUtil.makeReferenceFromAlignment(record, false);

        // We've seen stranger things
        if (readQualities.length != readBases.length) {
          throw new PicardException(
              "Missing Qualities ("
                  + readQualities.length
                  + ","
                  + readBases.length
                  + ") : "
                  + record.getSAMString());
        }

        if (refBases.length != readBases.length) {
          throw new PicardException(
              "The read length did not match the inferred reference length, please check your MD and CIGAR.");
        }

        // Negative-strand reads are walked from their last cycle down to the first.
        final boolean negativeStrand = record.getReadNegativeStrandFlag();
        final int cycleStep = negativeStrand ? -1 : 1;
        int cycleIndex =
            negativeStrand ? readBases.length - 1 + CYCLE_OFFSET : CYCLE_OFFSET; // zero-based

        for (int i = 0; i < readBases.length; i++, cycleIndex += cycleStep) {
          if (-1 != CYCLE && cycleIndex != CYCLE) {
            continue;
          }
          if ('-' == refBases[i] || '0' == refBases[i]) { // insertion or soft-clipped
            continue;
          }

          final int quality = readQualities[i];
          final boolean validRefBase = SequenceUtil.isValidBase(refBases[i]);
          final boolean validReadBase = SequenceUtil.isValidBase(readBases[i]);

          if (!SequenceUtil.basesEqual(readBases[i], refBases[i])) { // mismatch
            mismatchesHist.increment(quality);
            if (validRefBase) {
              mismatchesByTypeHist.get(fromBaseKey(refBases[i])).increment(quality);
            }
            if (validReadBase) {
              mismatchesByTypeHist.get(toBaseKey(readBases[i])).increment(quality);
            }
          } else {
            mismatchesHist.increment(quality, 0); // to make sure the bin will exist
          }

          totalHist.increment(quality);
          if (validReadBase) {
            totalByTypeHist.get(toBaseKey(readBases[i])).increment(quality);
          }
          if (validRefBase) {
            totalByTypeHist.get(fromBaseKey(refBases[i])).increment(quality);
          }
        }
      }
    } finally {
      // Close the reader even when a record fails validation part-way through the file.
      CloserUtil.close(reader);
    }

    final Histogram<Integer> hist = new Histogram<Integer>("Predicted", "Observed");

    double sumOfSquaresError = 0.0;

    // compute the aggregate phred values
    for (final Integer key : mismatchesHist.keySet()) {
      final double numMismatches = mismatchesHist.get(key).getValue();
      final double numBases = totalHist.get(key).getValue();
      final double phredErr = Math.log10(numMismatches / numBases) * -10.0;
      // Bins without mismatches have an infinite phred value and are left out of the sum.
      sumOfSquaresError += (0 == numMismatches) ? 0.0 : (key - phredErr) * (key - phredErr);
      hist.increment(key, phredErr);

      // make sure the bin will exist
      for (final byte base : bases) {
        mismatchesByTypeHist.get(toBaseKey(base)).increment(key, 0.0);
        mismatchesByTypeHist.get(fromBaseKey(base)).increment(key, 0.0);
        totalByTypeHist.get(toBaseKey(base)).increment(key, 0.0);
        totalByTypeHist.get(fromBaseKey(base)).increment(key, 0.0);
      }
    }

    final QualityScoreAccuracyMetrics metrics = new QualityScoreAccuracyMetrics();
    metrics.SUM_OF_SQUARE_ERROR = sumOfSquaresError;

    final MetricsFile<QualityScoreAccuracyMetrics, Integer> out = getMetricsFile();
    out.addMetric(metrics);
    out.addHistogram(hist);
    for (final byte base : bases) {
      // >base : histograms for mismatches *to* the given base
      final String toKey = toBaseKey(base);
      out.addHistogram(
          toPhredHistogram(mismatchesByTypeHist.get(toKey), totalByTypeHist.get(toKey)));

      // base> : histograms for mismatches *from* the given base, normalised by the totals
      // *from* that same base
      final String fromKey = fromBaseKey(base);
      out.addHistogram(
          toPhredHistogram(mismatchesByTypeHist.get(fromKey), totalByTypeHist.get(fromKey)));
    }

    out.addHistogram(mismatchesHist);
    out.addHistogram(totalHist);
    out.write(OUTPUT);

    return 0;
  }

  /**
   * Builds the "X>" histogram key for a reference base. The base is upper-cased because only
   * upper-case keys are registered; NOTE(review): this matters only if
   * {@code SequenceUtil.isValidBase} accepts lower-case bases - confirm against htsjdk.
   */
  private static String fromBaseKey(final byte base) {
    return Character.toUpperCase((char) base) + ">";
  }

  /** Builds the ">X" histogram key for a read base, upper-cased like {@code fromBaseKey}. */
  private static String toBaseKey(final byte base) {
    return ">" + Character.toUpperCase((char) base);
  }

  /**
   * Converts per-quality mismatch and total counts into a histogram of phred-scaled observed
   * error, reusing the labels of the mismatch histogram.
   */
  private static Histogram<Integer> toPhredHistogram(
      final Histogram<Integer> mismatches, final Histogram<Integer> totals) {
    final Histogram<Integer> phred =
        new Histogram<Integer>(mismatches.getBinLabel(), mismatches.getValueLabel());
    for (final Integer key : mismatches.keySet()) {
      final double numMismatches = mismatches.get(key).getValue();
      final double numBases = totals.get(key).getValue();
      phred.increment(key, Math.log10(numMismatches / numBases) * -10.0);
    }
    return phred;
  }