private boolean hasLowQualityBase(SAMRecord read) { // TODO: Don't hardcode phred33 for (int i = 0; i < read.getBaseQualityString().length(); i++) { if ((read.getBaseQualityString().charAt(i) - '!') < 20) { return true; } } return false; }
public String assembleContigs( List<String> inputFiles, String output, String tempDir, List<Feature> regions, String prefix, boolean checkForDupes, ReAligner realigner, CompareToReference2 c2r) { if ((kmers.length == 0) || (kmers[0] < KmerSizeEvaluator.MIN_KMER)) { KmerSizeEvaluator kmerEval = new KmerSizeEvaluator(); int kmer = kmerEval.identifyMinKmer(readLength, c2r, regions); this.kmers = realigner.toKmerArray(kmer, readLength); } String contigs = ""; long start = System.currentTimeMillis(); int readCount = 0; int minReadCount = Integer.MAX_VALUE; // if c2r is null, this is the unaligned region. boolean isAssemblyCandidate = c2r == null ? true : false; try { List<List<SAMRecord>> readsList = getReads(inputFiles, regions, realigner); for (List<SAMRecord> reads : readsList) { int candidateReadCount = 0; for (SAMRecord read : reads) { if (!isAssemblyCandidate && isAssemblyTriggerCandidate(read, c2r)) { candidateReadCount++; } if (shouldSearchForSv && isSvCandidate(read)) { svCandidates.add( new Position(read.getMateReferenceName(), read.getMateAlignmentStart())); } } if (candidateReadCount > minCandidateCount(reads.size(), regions.get(0))) { isAssemblyCandidate = true; } if (reads.size() < minReadCount) { minReadCount = reads.size(); } readCount += reads.size(); } StringBuffer readBuffer = new StringBuffer(); if (isAssemblyCandidate) { int downsampleTarget = desiredNumberOfReads(regions); char sampleId = 1; for (List<SAMRecord> reads : readsList) { // Default to always keep double keepProbability = 1.1; if (reads.size() > downsampleTarget) { keepProbability = (double) downsampleTarget / (double) reads.size(); } Random random = new Random(1); for (SAMRecord read : reads) { if (random.nextDouble() < keepProbability) { readBuffer.append(sampleId); readBuffer.append(read.getReadNegativeStrandFlag() ? "1" : "0"); if (read.getReadString().length() == readLength) { readBuffer.append(read.getReadString()); readBuffer.append(read.getBaseQualityString()); } else { StringBuffer basePadding = new StringBuffer(); StringBuffer qualPadding = new StringBuffer(); for (int i = 0; i < readLength - read.getReadString().length(); i++) { basePadding.append('N'); qualPadding.append('!'); } readBuffer.append(read.getReadString() + basePadding.toString()); readBuffer.append(read.getBaseQualityString() + qualPadding.toString()); } } } // Make this set of reads eligible for GC reads.clear(); sampleId += 1; } } readsList.clear(); if (isAssemblyCandidate) { for (int kmer : kmers) { String outputFile = output + "_k" + kmer; contigs = assemble( readBuffer.toString(), outputFile, prefix, truncateOnRepeat ? 1 : 0, maxContigs, maxPathsFromRoot, readLength, kmer, minKmerFrequency, minBaseQuality, minEdgeRatio, isDebug ? 1 : 0, maxNodes); if (!contigs.equals("<REPEAT>")) { break; } else { if (kmer >= readLength / 2 || kmer >= CYCLE_KMER_LENGTH_THRESHOLD) { isCycleExceedingThresholdDetected = true; } } } } else { // System.out.println("Skipping assembly for: " + prefix); } } catch (Exception e) { e.printStackTrace(); throw new RuntimeException(e); } if (this.shouldSearchForSv) { Collections.sort(this.svCandidates); Position last = null; String currentFeatureChr = null; int currentFeatureStart = -1; int currentFeatureStop = -1; int currentFeatureCount = 0; // TODO: Calc this dynamically int windowSize = 500; for (Position pos : this.svCandidates) { if ((last != null) && pos.getChromosome().equals(last.getChromosome()) && Math.abs(pos.getPosition() - last.getPosition()) < windowSize) { if (currentFeatureChr == null) { currentFeatureChr = pos.getChromosome(); currentFeatureStart = last.getPosition(); currentFeatureStop = pos.getPosition() + readLength; currentFeatureCount = 1; } else { currentFeatureStop = pos.getPosition() + readLength; currentFeatureCount++; } } else { if (currentFeatureChr != null) { if (currentFeatureCount > (minReadCount / MAX_READ_LENGTHS_PER_REGION) * minReadCandidateFraction) { Feature region = new Feature( currentFeatureChr, currentFeatureStart - readLength, currentFeatureStop + readLength); BreakpointCandidate candidate = new BreakpointCandidate(region, currentFeatureCount); this.svCandidateRegions.add(candidate); } currentFeatureChr = null; currentFeatureStart = -1; currentFeatureStop = -1; currentFeatureCount = 0; } else { currentFeatureChr = pos.getChromosome(); currentFeatureStart = pos.getPosition(); currentFeatureStop = pos.getPosition() + readLength; currentFeatureCount = 1; } } last = pos; } // Don't forget last SV candidate region if (currentFeatureCount > (minReadCount / MAX_READ_LENGTHS_PER_REGION) * minReadCandidateFraction) { Feature region = new Feature( currentFeatureChr, currentFeatureStart - readLength, currentFeatureStop + readLength); BreakpointCandidate candidate = new BreakpointCandidate(region, currentFeatureCount); this.svCandidateRegions.add(candidate); } } long end = System.currentTimeMillis(); int kmer = readLength + 1; if (kmers.length > 0) { kmer = kmers[0]; } if (isDebug) { System.err.println( "Elapsed_msecs_in_NativeAssembler\tRegion:\t" + regions.get(0).getDescriptor() + "\tLength:\t" + regions.get(0).getLength() + "\tReadCount:\t" + readCount + "\tElapsed\t" + (end - start) + "\tAssembled\t" + isAssemblyCandidate + "\t" + kmer); } return contigs; }
public String simpleAssemble(List<SAMRecord> reads) { StringBuffer readBuffer = new StringBuffer(); for (SAMRecord read : reads) { readBuffer.append((char) 1); readBuffer.append(read.getReadNegativeStrandFlag() ? "1" : "0"); if (read.getReadString().length() == readLength) { readBuffer.append(read.getReadString()); readBuffer.append(read.getBaseQualityString()); } else { StringBuffer basePadding = new StringBuffer(); StringBuffer qualPadding = new StringBuffer(); for (int i = 0; i < readLength - read.getReadString().length(); i++) { basePadding.append('N'); qualPadding.append('!'); } readBuffer.append(read.getReadString() + basePadding.toString()); readBuffer.append(read.getBaseQualityString() + qualPadding.toString()); } } SAMRecord lastRead = reads.get(reads.size() - 1); int regionStart = reads.get(0).getAlignmentStart(); int regionEnd = lastRead.getAlignmentEnd() > 0 ? lastRead.getAlignmentEnd() : lastRead.getAlignmentStart(); String output = "region_" + reads.get(0).getReferenceName() + "_" + regionStart + "_" + regionEnd; String contigs = ""; // Make this set of reads eligible for GC // reads.clear(); for (int kmer : kmers) { String outputFile = output + "_k" + kmer; contigs = assemble( readBuffer.toString(), outputFile, output, 1, // truncate_on_repeat maxContigs, maxPathsFromRoot, readLength, kmer, minKmerFrequency, minBaseQuality, minEdgeRatio, isDebug ? 1 : 0, maxNodes); if (!contigs.equals("<REPEAT>")) { break; } } return contigs; }