@Override public void recordValues(final SAMRecord read, final ReadCovariates values) { // store the original bases and then write Ns over low quality ones final byte[] originalBases = Arrays.copyOf(read.getReadBases(), read.getReadBases().length); final byte[] strandedBases = getStrandedBytes(read, lowQualTail); final List<Integer> mismatchKeys = contextWith(strandedBases, mismatchesContextSize, mismatchesKeyMask); final List<Integer> indelKeys = contextWith(strandedBases, indelsContextSize, indelsKeyMask); final int readLength = strandedBases.length; // this is necessary to ensure that we don't keep historical data in the ReadCovariates values // since the context covariate may not span the entire set of values in read covariates // due to the clipping of the low quality bases if (readLength != originalBases.length) { // don't both zeroing out if we are going to overwrite the whole array for (int i = 0; i < originalBases.length; i++) // this base has been clipped off, so zero out the covariate values here values.addCovariate(0, 0, 0, i); } final boolean negativeStrand = read.getReadNegativeStrandFlag(); for (int i = 0; i < readLength; i++) { final int readOffset = getStrandedOffset(negativeStrand, i, readLength); final int indelKey = indelKeys.get(i); values.addCovariate(mismatchKeys.get(i), indelKey, indelKey, readOffset); } // put the original bases back in read.setReadBases(originalBases); }
/** * Given a read, clips low quality ends (by overwriting with N) and returns the underlying bases, * after reverse-complementing for negative-strand reads. * * @param read the read * @param lowQTail every base quality lower than or equal to this in the tail of the read will be * replaced with N. * @return bases of the read. */ public static byte[] getStrandedBytes(final SAMRecord read, final byte lowQTail) { // Write N's over the low quality tail of the reads to avoid adding them into the context final SAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, lowQTail, ClippingRepresentation.WRITE_NS); final byte[] bases = clippedRead.getReadBases(); if (read.getReadNegativeStrandFlag()) { return BaseUtils.simpleReverseComplement(bases); } else { return bases; } }
public String assembleContigs( List<String> inputFiles, String output, String tempDir, List<Feature> regions, String prefix, boolean checkForDupes, ReAligner realigner, CompareToReference2 c2r) { if ((kmers.length == 0) || (kmers[0] < KmerSizeEvaluator.MIN_KMER)) { KmerSizeEvaluator kmerEval = new KmerSizeEvaluator(); int kmer = kmerEval.identifyMinKmer(readLength, c2r, regions); this.kmers = realigner.toKmerArray(kmer, readLength); } String contigs = ""; long start = System.currentTimeMillis(); int readCount = 0; int minReadCount = Integer.MAX_VALUE; // if c2r is null, this is the unaligned region. boolean isAssemblyCandidate = c2r == null ? true : false; try { List<List<SAMRecord>> readsList = getReads(inputFiles, regions, realigner); for (List<SAMRecord> reads : readsList) { int candidateReadCount = 0; for (SAMRecord read : reads) { if (!isAssemblyCandidate && isAssemblyTriggerCandidate(read, c2r)) { candidateReadCount++; } if (shouldSearchForSv && isSvCandidate(read)) { svCandidates.add( new Position(read.getMateReferenceName(), read.getMateAlignmentStart())); } } if (candidateReadCount > minCandidateCount(reads.size(), regions.get(0))) { isAssemblyCandidate = true; } if (reads.size() < minReadCount) { minReadCount = reads.size(); } readCount += reads.size(); } StringBuffer readBuffer = new StringBuffer(); if (isAssemblyCandidate) { int downsampleTarget = desiredNumberOfReads(regions); char sampleId = 1; for (List<SAMRecord> reads : readsList) { // Default to always keep double keepProbability = 1.1; if (reads.size() > downsampleTarget) { keepProbability = (double) downsampleTarget / (double) reads.size(); } Random random = new Random(1); for (SAMRecord read : reads) { if (random.nextDouble() < keepProbability) { readBuffer.append(sampleId); readBuffer.append(read.getReadNegativeStrandFlag() ? "1" : "0"); if (read.getReadString().length() == readLength) { readBuffer.append(read.getReadString()); readBuffer.append(read.getBaseQualityString()); } else { StringBuffer basePadding = new StringBuffer(); StringBuffer qualPadding = new StringBuffer(); for (int i = 0; i < readLength - read.getReadString().length(); i++) { basePadding.append('N'); qualPadding.append('!'); } readBuffer.append(read.getReadString() + basePadding.toString()); readBuffer.append(read.getBaseQualityString() + qualPadding.toString()); } } } // Make this set of reads eligible for GC reads.clear(); sampleId += 1; } } readsList.clear(); if (isAssemblyCandidate) { for (int kmer : kmers) { String outputFile = output + "_k" + kmer; contigs = assemble( readBuffer.toString(), outputFile, prefix, truncateOnRepeat ? 1 : 0, maxContigs, maxPathsFromRoot, readLength, kmer, minKmerFrequency, minBaseQuality, minEdgeRatio, isDebug ? 1 : 0, maxNodes); if (!contigs.equals("<REPEAT>")) { break; } else { if (kmer >= readLength / 2 || kmer >= CYCLE_KMER_LENGTH_THRESHOLD) { isCycleExceedingThresholdDetected = true; } } } } else { // System.out.println("Skipping assembly for: " + prefix); } } catch (Exception e) { e.printStackTrace(); throw new RuntimeException(e); } if (this.shouldSearchForSv) { Collections.sort(this.svCandidates); Position last = null; String currentFeatureChr = null; int currentFeatureStart = -1; int currentFeatureStop = -1; int currentFeatureCount = 0; // TODO: Calc this dynamically int windowSize = 500; for (Position pos : this.svCandidates) { if ((last != null) && pos.getChromosome().equals(last.getChromosome()) && Math.abs(pos.getPosition() - last.getPosition()) < windowSize) { if (currentFeatureChr == null) { currentFeatureChr = pos.getChromosome(); currentFeatureStart = last.getPosition(); currentFeatureStop = pos.getPosition() + readLength; currentFeatureCount = 1; } else { currentFeatureStop = pos.getPosition() + readLength; currentFeatureCount++; } } else { if (currentFeatureChr != null) { if (currentFeatureCount > (minReadCount / MAX_READ_LENGTHS_PER_REGION) * minReadCandidateFraction) { Feature region = new Feature( currentFeatureChr, currentFeatureStart - readLength, currentFeatureStop + readLength); BreakpointCandidate candidate = new BreakpointCandidate(region, currentFeatureCount); this.svCandidateRegions.add(candidate); } currentFeatureChr = null; currentFeatureStart = -1; currentFeatureStop = -1; currentFeatureCount = 0; } else { currentFeatureChr = pos.getChromosome(); currentFeatureStart = pos.getPosition(); currentFeatureStop = pos.getPosition() + readLength; currentFeatureCount = 1; } } last = pos; } // Don't forget last SV candidate region if (currentFeatureCount > (minReadCount / MAX_READ_LENGTHS_PER_REGION) * minReadCandidateFraction) { Feature region = new Feature( currentFeatureChr, currentFeatureStart - readLength, currentFeatureStop + readLength); BreakpointCandidate candidate = new BreakpointCandidate(region, currentFeatureCount); this.svCandidateRegions.add(candidate); } } long end = System.currentTimeMillis(); int kmer = readLength + 1; if (kmers.length > 0) { kmer = kmers[0]; } if (isDebug) { System.err.println( "Elapsed_msecs_in_NativeAssembler\tRegion:\t" + regions.get(0).getDescriptor() + "\tLength:\t" + regions.get(0).getLength() + "\tReadCount:\t" + readCount + "\tElapsed\t" + (end - start) + "\tAssembled\t" + isAssemblyCandidate + "\t" + kmer); } return contigs; }
public String simpleAssemble(List<SAMRecord> reads) { StringBuffer readBuffer = new StringBuffer(); for (SAMRecord read : reads) { readBuffer.append((char) 1); readBuffer.append(read.getReadNegativeStrandFlag() ? "1" : "0"); if (read.getReadString().length() == readLength) { readBuffer.append(read.getReadString()); readBuffer.append(read.getBaseQualityString()); } else { StringBuffer basePadding = new StringBuffer(); StringBuffer qualPadding = new StringBuffer(); for (int i = 0; i < readLength - read.getReadString().length(); i++) { basePadding.append('N'); qualPadding.append('!'); } readBuffer.append(read.getReadString() + basePadding.toString()); readBuffer.append(read.getBaseQualityString() + qualPadding.toString()); } } SAMRecord lastRead = reads.get(reads.size() - 1); int regionStart = reads.get(0).getAlignmentStart(); int regionEnd = lastRead.getAlignmentEnd() > 0 ? lastRead.getAlignmentEnd() : lastRead.getAlignmentStart(); String output = "region_" + reads.get(0).getReferenceName() + "_" + regionStart + "_" + regionEnd; String contigs = ""; // Make this set of reads eligible for GC // reads.clear(); for (int kmer : kmers) { String outputFile = output + "_k" + kmer; contigs = assemble( readBuffer.toString(), outputFile, output, 1, // truncate_on_repeat maxContigs, maxPathsFromRoot, readLength, kmer, minKmerFrequency, minBaseQuality, minEdgeRatio, isDebug ? 1 : 0, maxNodes); if (!contigs.equals("<REPEAT>")) { break; } } return contigs; }
/** * Main method for the program. Checks that all input files are present and readable and that the * output file can be written to. Then iterates through all the records accumulating metrics. * Finally writes metrics file */ protected int doWork() { IOUtil.assertFileIsReadable(INPUT); IOUtil.assertFileIsWritable(OUTPUT); final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT); final Histogram<Integer> mismatchesHist = new Histogram<Integer>("Predicted", "Mismatches"); final Histogram<Integer> totalHist = new Histogram<Integer>("Predicted", "Total_Bases"); final Map<String, Histogram> mismatchesByTypeHist = new HashMap<String, Histogram>(); final Map<String, Histogram> totalByTypeHist = new HashMap<String, Histogram>(); // Set up the histograms byte[] bases = {'A', 'C', 'G', 'T'}; for (final byte base : bases) { final Histogram<Integer> h = new Histogram<Integer>("Predicted", (char) base + ">"); mismatchesByTypeHist.put((char) base + ">", h); final Histogram<Integer> h2 = new Histogram<Integer>("Predicted", ">" + (char) base); mismatchesByTypeHist.put(">" + (char) base, h2); } for (final byte base : bases) { final Histogram<Integer> h = new Histogram<Integer>("Predicted", (char) base + ">"); totalByTypeHist.put((char) base + ">", h); final Histogram<Integer> h2 = new Histogram<Integer>("Predicted", ">" + (char) base); totalByTypeHist.put(">" + (char) base, h2); } for (final SAMRecord record : reader) { // Ignore these as we don't know the truth if (record.getReadUnmappedFlag() || record.isSecondaryOrSupplementary()) { continue; } final byte[] readBases = record.getReadBases(); final byte[] readQualities = record.getBaseQualities(); final byte[] refBases = SequenceUtil.makeReferenceFromAlignment(record, false); // We've seen stranger things if (readQualities.length != readBases.length) { throw new PicardException( "Missing Qualities (" + readQualities.length + "," + readBases.length + ") : " + record.getSAMString()); } if (refBases.length != readBases.length) { throw new PicardException( "The read length did not match the inferred reference length, please check your MD and CIGAR."); } int cycleIndex; // zero-based if (record.getReadNegativeStrandFlag()) { cycleIndex = readBases.length - 1 + CYCLE_OFFSET; } else { cycleIndex = CYCLE_OFFSET; } for (int i = 0; i < readBases.length; i++) { if (-1 == CYCLE || cycleIndex == CYCLE) { if ('-' != refBases[i] && '0' != refBases[i]) { // not insertion and not soft-clipped if (!SequenceUtil.basesEqual(readBases[i], refBases[i])) { // mismatch mismatchesHist.increment((int) readQualities[i]); if (SequenceUtil.isValidBase(refBases[i])) { mismatchesByTypeHist .get((char) refBases[i] + ">") .increment((int) readQualities[i]); } if (SequenceUtil.isValidBase(readBases[i])) { mismatchesByTypeHist .get(">" + (char) readBases[i]) .increment((int) readQualities[i]); } } else { mismatchesHist.increment( (int) readQualities[i], 0); // to make sure the bin will exist } totalHist.increment((int) readQualities[i]); if (SequenceUtil.isValidBase(readBases[i])) { totalByTypeHist.get(">" + (char) readBases[i]).increment((int) readQualities[i]); } if (SequenceUtil.isValidBase(refBases[i])) { totalByTypeHist.get((char) refBases[i] + ">").increment((int) readQualities[i]); } } } cycleIndex += record.getReadNegativeStrandFlag() ? -1 : 1; } } CloserUtil.close(reader); final Histogram<Integer> hist = new Histogram<Integer>("Predicted", "Observed"); double sumOfSquaresError = 0.0; // compute the aggregate phred values for (final Integer key : mismatchesHist.keySet()) { final double numMismatches = mismatchesHist.get(key).getValue(); final double numBases = totalHist.get(key).getValue(); final double phredErr = Math.log10(numMismatches / numBases) * -10.0; sumOfSquaresError += (0 == numMismatches) ? 0.0 : (key - phredErr) * (key - phredErr); hist.increment(key, phredErr); // make sure the bin will exist for (final byte base : bases) { mismatchesByTypeHist.get(">" + (char) base).increment(key, 0.0); mismatchesByTypeHist.get((char) base + ">").increment(key, 0.0); totalByTypeHist.get(">" + (char) base).increment(key, 0.0); totalByTypeHist.get((char) base + ">").increment(key, 0.0); } } final QualityScoreAccuracyMetrics metrics = new QualityScoreAccuracyMetrics(); metrics.SUM_OF_SQUARE_ERROR = sumOfSquaresError; final MetricsFile<QualityScoreAccuracyMetrics, Integer> out = getMetricsFile(); out.addMetric(metrics); out.addHistogram(hist); for (final byte base : bases) { // >base : histograms for mismatches *to* the given base Histogram<Integer> m = mismatchesByTypeHist.get(">" + (char) base); Histogram<Integer> t = totalByTypeHist.get(">" + (char) base); Histogram<Integer> h = new Histogram<Integer>(m.getBinLabel(), m.getValueLabel()); for (final Integer key : m.keySet()) { final double numMismatches = m.get(key).getValue(); final double numBases = t.get(key).getValue(); final double phredErr = Math.log10(numMismatches / numBases) * -10.0; h.increment(key, phredErr); } out.addHistogram(h); // base> : histograms for mismatches *from* the given base m = mismatchesByTypeHist.get((char) base + ">"); t = totalByTypeHist.get(">" + (char) base); h = new Histogram<Integer>(m.getBinLabel(), m.getValueLabel()); for (final Integer key : m.keySet()) { final double numMismatches = m.get(key).getValue(); final double numBases = t.get(key).getValue(); final double phredErr = Math.log10(numMismatches / numBases) * -10.0; h.increment(key, phredErr); } out.addHistogram(h); } out.addHistogram(mismatchesHist); out.addHistogram(totalHist); out.write(OUTPUT); return 0; }