/**
 * Assembles contigs for the given regions from the reads in {@code inputFiles}.
 *
 * <p>High-level flow: (1) pick kmer sizes if not already set, (2) scan reads to decide whether
 * this region triggers assembly and to collect SV candidate positions, (3) if triggered,
 * downsample reads into a packed text buffer and run the assembler over increasing kmer sizes
 * until it succeeds without hitting a repeat, (4) cluster SV candidate positions into
 * breakpoint candidate regions.
 *
 * @param inputFiles one input alignment file per sample
 * @param output base path for per-kmer assembler output files
 * @param tempDir unused here — NOTE(review): confirm whether callers still need this parameter
 * @param regions regions to assemble; regions.get(0) is used for descriptors/thresholds
 * @param prefix identifier passed through to the assembler
 * @param checkForDupes unused here — NOTE(review): confirm whether callers still need this parameter
 * @param realigner used for read filtering config and kmer array construction
 * @param c2r reference comparator; null indicates the unaligned region (assembly always runs)
 * @return assembled contigs as a string; empty string if assembly was not triggered
 * @throws RuntimeException wrapping any exception raised during read loading or assembly
 */
public String assembleContigs(
    List<String> inputFiles,
    String output,
    String tempDir,
    List<Feature> regions,
    String prefix,
    boolean checkForDupes,
    ReAligner realigner,
    CompareToReference2 c2r) {
  // Lazily initialize kmer sizes if unset or below the evaluator's minimum.
  if ((kmers.length == 0) || (kmers[0] < KmerSizeEvaluator.MIN_KMER)) {
    KmerSizeEvaluator kmerEval = new KmerSizeEvaluator();
    int kmer = kmerEval.identifyMinKmer(readLength, c2r, regions);
    this.kmers = realigner.toKmerArray(kmer, readLength);
  }
  String contigs = "";
  long start = System.currentTimeMillis();
  int readCount = 0;
  // Smallest per-sample read count seen; feeds the SV candidate-region threshold below.
  int minReadCount = Integer.MAX_VALUE;
  // if c2r is null, this is the unaligned region.
  boolean isAssemblyCandidate = c2r == null ? true : false;
  try {
    // One read list per input sample (already deduped/downsampled by getReads).
    List<List<SAMRecord>> readsList = getReads(inputFiles, regions, realigner);
    // Pass 1: count assembly-trigger reads and collect SV candidate mate positions.
    for (List<SAMRecord> reads : readsList) {
      int candidateReadCount = 0;
      for (SAMRecord read : reads) {
        // Stop testing reads once assembly is already triggered.
        if (!isAssemblyCandidate && isAssemblyTriggerCandidate(read, c2r)) {
          candidateReadCount++;
        }
        if (shouldSearchForSv && isSvCandidate(read)) {
          // Record the mate's position; clusters of these become SV candidate regions below.
          svCandidates.add(
              new Position(read.getMateReferenceName(), read.getMateAlignmentStart()));
        }
      }
      if (candidateReadCount > minCandidateCount(reads.size(), regions.get(0))) {
        isAssemblyCandidate = true;
      }
      if (reads.size() < minReadCount) {
        minReadCount = reads.size();
      }
      readCount += reads.size();
    }
    StringBuffer readBuffer = new StringBuffer();
    if (isAssemblyCandidate) {
      // Pass 2: encode reads into a packed buffer for the assembler, downsampling per sample.
      int downsampleTarget = desiredNumberOfReads(regions);
      // Sample index encoded as a raw char value starting at 1, prepended to each record.
      char sampleId = 1;
      for (List<SAMRecord> reads : readsList) {
        // Default to always keep
        double keepProbability = 1.1;
        if (reads.size() > downsampleTarget) {
          keepProbability = (double) downsampleTarget / (double) reads.size();
        }
        // Fixed seed: downsampling is deterministic across runs.
        Random random = new Random(1);
        for (SAMRecord read : reads) {
          if (random.nextDouble() < keepProbability) {
            // Record layout: sampleId char, strand flag ("1"/"0"), then bases and quals,
            // each padded to exactly readLength ('N' bases, '!' = minimum phred qual).
            readBuffer.append(sampleId);
            readBuffer.append(read.getReadNegativeStrandFlag() ? "1" : "0");
            if (read.getReadString().length() == readLength) {
              readBuffer.append(read.getReadString());
              readBuffer.append(read.getBaseQualityString());
            } else {
              StringBuffer basePadding = new StringBuffer();
              StringBuffer qualPadding = new StringBuffer();
              for (int i = 0; i < readLength - read.getReadString().length(); i++) {
                basePadding.append('N');
                qualPadding.append('!');
              }
              readBuffer.append(read.getReadString() + basePadding.toString());
              readBuffer.append(read.getBaseQualityString() + qualPadding.toString());
            }
          }
        }
        // Make this set of reads eligible for GC
        reads.clear();
        sampleId += 1;
      }
    }
    readsList.clear();
    if (isAssemblyCandidate) {
      // Try successive kmer sizes; stop at the first assembly that does not hit a repeat.
      for (int kmer : kmers) {
        String outputFile = output + "_k" + kmer;
        contigs = assemble(
            readBuffer.toString(),
            outputFile,
            prefix,
            truncateOnRepeat ? 1 : 0,
            maxContigs,
            maxPathsFromRoot,
            readLength,
            kmer,
            minKmerFrequency,
            minBaseQuality,
            minEdgeRatio,
            isDebug ? 1 : 0,
            maxNodes);
        if (!contigs.equals("<REPEAT>")) {
          break;
        } else {
          // Repeat detected even at a large kmer — flag it so callers can react.
          if (kmer >= readLength / 2 || kmer >= CYCLE_KMER_LENGTH_THRESHOLD) {
            isCycleExceedingThresholdDetected = true;
          }
        }
      }
    } else {
      // System.out.println("Skipping assembly for: " + prefix);
    }
  } catch (Exception e) {
    e.printStackTrace();
    throw new RuntimeException(e);
  }
  if (this.shouldSearchForSv) {
    // Cluster sorted SV candidate positions: consecutive positions on the same chromosome
    // within windowSize of each other are merged into one candidate feature.
    Collections.sort(this.svCandidates);
    Position last = null;
    String currentFeatureChr = null;
    int currentFeatureStart = -1;
    int currentFeatureStop = -1;
    int currentFeatureCount = 0;
    // TODO: Calc this dynamically
    int windowSize = 500;
    for (Position pos : this.svCandidates) {
      if ((last != null)
          && pos.getChromosome().equals(last.getChromosome())
          && Math.abs(pos.getPosition() - last.getPosition()) < windowSize) {
        if (currentFeatureChr == null) {
          // Start a new cluster anchored at the previous position.
          currentFeatureChr = pos.getChromosome();
          currentFeatureStart = last.getPosition();
          currentFeatureStop = pos.getPosition() + readLength;
          currentFeatureCount = 1;
        } else {
          // Extend the open cluster.
          currentFeatureStop = pos.getPosition() + readLength;
          currentFeatureCount++;
        }
      } else {
        if (currentFeatureChr != null) {
          // Cluster ended: emit it if it has enough support.
          // NOTE(review): minReadCount / MAX_READ_LENGTHS_PER_REGION is integer division,
          // applied before the fractional multiplier — confirm this is intended.
          if (currentFeatureCount
              > (minReadCount / MAX_READ_LENGTHS_PER_REGION) * minReadCandidateFraction) {
            Feature region = new Feature(
                currentFeatureChr,
                currentFeatureStart - readLength,
                currentFeatureStop + readLength);
            BreakpointCandidate candidate = new BreakpointCandidate(region, currentFeatureCount);
            this.svCandidateRegions.add(candidate);
          }
          currentFeatureChr = null;
          currentFeatureStart = -1;
          currentFeatureStop = -1;
          currentFeatureCount = 0;
        } else {
          // No open cluster: tentatively start one at this position.
          currentFeatureChr = pos.getChromosome();
          currentFeatureStart = pos.getPosition();
          currentFeatureStop = pos.getPosition() + readLength;
          currentFeatureCount = 1;
        }
      }
      last = pos;
    }
    // Don't forget last SV candidate region
    if (currentFeatureCount
        > (minReadCount / MAX_READ_LENGTHS_PER_REGION) * minReadCandidateFraction) {
      Feature region = new Feature(
          currentFeatureChr,
          currentFeatureStart - readLength,
          currentFeatureStop + readLength);
      BreakpointCandidate candidate = new BreakpointCandidate(region, currentFeatureCount);
      this.svCandidateRegions.add(candidate);
    }
  }
  long end = System.currentTimeMillis();
  // Report the first (smallest) kmer actually used, or readLength + 1 if none configured.
  int kmer = readLength + 1;
  if (kmers.length > 0) {
    kmer = kmers[0];
  }
  if (isDebug) {
    System.err.println(
        "Elapsed_msecs_in_NativeAssembler\tRegion:\t"
            + regions.get(0).getDescriptor()
            + "\tLength:\t"
            + regions.get(0).getLength()
            + "\tReadCount:\t"
            + readCount
            + "\tElapsed\t"
            + (end - start)
            + "\tAssembled\t"
            + isAssemblyCandidate
            + "\t"
            + kmer);
  }
  return contigs;
}
// // Returns a downsampled set of reads for each sample. // private List<List<SAMRecord>> getReads( List<String> inputFiles, List<Feature> regions, ReAligner realigner) { int downsampleTarget = desiredNumberOfReads(regions); List<DownsampledReadList> readsList = new ArrayList<DownsampledReadList>(); for (String input : inputFiles) { Set<String> readIds = new HashSet<String>(); DownsampledReadList reads = new DownsampledReadList(downsampleTarget); readsList.add(reads); for (Feature region : regions) { SAMFileReader reader = new SAMFileReader(new File(input)); reader.setValidationStringency(ValidationStringency.SILENT); Iterator<SAMRecord> iter; if (region != null) { iter = reader.queryOverlapping( region.getSeqname(), (int) region.getStart(), (int) region.getEnd()); } else { iter = reader.iterator(); } while (iter.hasNext()) { SAMRecord read = iter.next(); // Don't allow same read to be counted twice. if ((!realigner.isFiltered(read)) && (!read.getDuplicateReadFlag()) && (!read.getReadFailsVendorQualityCheckFlag()) && (read.getMappingQuality() >= realigner.getMinMappingQuality() || read.getReadUnmappedFlag()) && (!readIds.contains(getIdentifier(read)))) { if (read.getReadString().length() > readLength) { reader.close(); throw new IllegalArgumentException( "Maximum read length of: " + readLength + " exceeded for: " + read.getSAMString()); } readIds.add(getIdentifier(read)); reads.add(read); } } if (reads.getTotalReadCount() != reads.getReads().size()) { if (isDebug) { System.err.println( "downsampled: " + regions.get(0).getDescriptor() + ": " + reads.getTotalReadCount() + " -> " + reads.getReads().size()); } } reader.close(); } } List<List<SAMRecord>> sampleReads = new ArrayList<List<SAMRecord>>(); for (DownsampledReadList downsampledReads : readsList) { sampleReads.add(downsampledReads.getReads()); } return sampleReads; }