@Test(dataProvider = "loadReadsADAM", groups = "spark") public void readsSinkADAMTest(String inputBam, String outputDirectoryName) throws IOException { // Since the test requires that we not create the actual output directory in advance, // we instead create its parent directory and mark it for deletion on exit. This protects // us from naming collisions across multiple instances of the test suite. final File outputParentDirectory = createTempDir(outputDirectoryName + "_parent"); final File outputDirectory = new File(outputParentDirectory, outputDirectoryName); JavaSparkContext ctx = SparkContextFactory.getTestSparkContext(); ReadsSparkSource readSource = new ReadsSparkSource(ctx); JavaRDD<GATKRead> rddParallelReads = readSource.getParallelReads(inputBam, null); SAMFileHeader header = ReadsSparkSource.getHeader(ctx, inputBam, null); ReadsSparkSink.writeReads( ctx, outputDirectory.getAbsolutePath(), rddParallelReads, header, ReadsWriteFormat.ADAM); JavaRDD<GATKRead> rddParallelReads2 = readSource.getADAMReads(outputDirectory.getAbsolutePath(), null, header); Assert.assertEquals(rddParallelReads.count(), rddParallelReads2.count()); // Test the round trip List<GATKRead> samList = rddParallelReads.collect(); List<GATKRead> adamList = rddParallelReads2.collect(); Comparator<GATKRead> comparator = new ReadCoordinateComparator(header); samList.sort(comparator); adamList.sort(comparator); for (int i = 0; i < samList.size(); i++) { SAMRecord expected = samList.get(i).convertToSAMRecord(header); SAMRecord observed = adamList.get(i).convertToSAMRecord(header); // manually test equality of some fields, as there are issues with roundtrip BAM -> ADAM -> // BAM // see https://github.com/bigdatagenomics/adam/issues/823 Assert.assertEquals(observed.getReadName(), expected.getReadName(), "readname"); Assert.assertEquals( observed.getAlignmentStart(), expected.getAlignmentStart(), "getAlignmentStart"); Assert.assertEquals( observed.getAlignmentEnd(), expected.getAlignmentEnd(), 
"getAlignmentEnd"); Assert.assertEquals(observed.getFlags(), expected.getFlags(), "getFlags"); Assert.assertEquals( observed.getMappingQuality(), expected.getMappingQuality(), "getMappingQuality"); Assert.assertEquals( observed.getMateAlignmentStart(), expected.getMateAlignmentStart(), "getMateAlignmentStart"); Assert.assertEquals(observed.getCigar(), expected.getCigar(), "getCigar"); } }
/** * HACK TO CREATE GATKSAMRECORD BASED ONLY A SAMRECORD FOR TESTING PURPOSES ONLY * * @param read */ public GATKSAMRecord(final SAMRecord read) { super(read.getHeader()); super.setReferenceIndex(read.getReferenceIndex()); super.setAlignmentStart(read.getAlignmentStart()); super.setReadName(read.getReadName()); super.setMappingQuality(read.getMappingQuality()); // indexing bin done below super.setCigar(read.getCigar()); super.setFlags(read.getFlags()); super.setMateReferenceIndex(read.getMateReferenceIndex()); super.setMateAlignmentStart(read.getMateAlignmentStart()); super.setInferredInsertSize(read.getInferredInsertSize()); SAMReadGroupRecord samRG = read.getReadGroup(); SAMBinaryTagAndValue samAttr = GATKBin.getReadBinaryAttributes(read); if (samAttr == null) { clearAttributes(); } else { setAttributes(samAttr); } if (samRG != null) { GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(samRG); setReadGroup(rg); } super.setFileSource(read.getFileSource()); super.setReadName(read.getReadName()); super.setCigarString(read.getCigarString()); super.setReadBases(read.getReadBases()); super.setBaseQualities(read.getBaseQualities()); // From SAMRecord constructor: Do this after the above because setCigarString will clear it. GATKBin.setReadIndexingBin(this, GATKBin.getReadIndexingBin(read)); }
/**
 * Decides whether a read's mate placement marks it as a structural-variant candidate.
 *
 * <p>A read qualifies when it is paired, not flagged as a proper pair, its mate is mapped, and
 * either the mate is on a different reference sequence or the two alignment starts are more
 * than {@code CombineChimera3.MAX_GAP_LENGTH} apart.
 *
 * @param read the read to inspect
 * @return {@code true} if the read is an SV candidate, {@code false} otherwise (including for
 *     unpaired reads)
 */
private boolean isSvCandidate(SAMRecord read) {
    // The pair-related getters below are only valid on paired reads; SAMRecord rejects them
    // with an IllegalStateException otherwise. An unpaired read has no mate to point at, so
    // it can never be a candidate.
    if (!read.getReadPairedFlag()) {
        return false;
    }

    if (read.getProperPairFlag() || read.getMateUnmappedFlag()) {
        return false;
    }

    // Mate on another reference sequence.
    if (!read.getReferenceName().equals(read.getMateReferenceName())) {
        return true;
    }

    // Same reference sequence: candidate only when the mates are unusually far apart.
    int mateDistance = Math.abs(read.getAlignmentStart() - read.getMateAlignmentStart());
    return mateDistance > CombineChimera3.MAX_GAP_LENGTH;
}
public String assembleContigs( List<String> inputFiles, String output, String tempDir, List<Feature> regions, String prefix, boolean checkForDupes, ReAligner realigner, CompareToReference2 c2r) { if ((kmers.length == 0) || (kmers[0] < KmerSizeEvaluator.MIN_KMER)) { KmerSizeEvaluator kmerEval = new KmerSizeEvaluator(); int kmer = kmerEval.identifyMinKmer(readLength, c2r, regions); this.kmers = realigner.toKmerArray(kmer, readLength); } String contigs = ""; long start = System.currentTimeMillis(); int readCount = 0; int minReadCount = Integer.MAX_VALUE; // if c2r is null, this is the unaligned region. boolean isAssemblyCandidate = c2r == null ? true : false; try { List<List<SAMRecord>> readsList = getReads(inputFiles, regions, realigner); for (List<SAMRecord> reads : readsList) { int candidateReadCount = 0; for (SAMRecord read : reads) { if (!isAssemblyCandidate && isAssemblyTriggerCandidate(read, c2r)) { candidateReadCount++; } if (shouldSearchForSv && isSvCandidate(read)) { svCandidates.add( new Position(read.getMateReferenceName(), read.getMateAlignmentStart())); } } if (candidateReadCount > minCandidateCount(reads.size(), regions.get(0))) { isAssemblyCandidate = true; } if (reads.size() < minReadCount) { minReadCount = reads.size(); } readCount += reads.size(); } StringBuffer readBuffer = new StringBuffer(); if (isAssemblyCandidate) { int downsampleTarget = desiredNumberOfReads(regions); char sampleId = 1; for (List<SAMRecord> reads : readsList) { // Default to always keep double keepProbability = 1.1; if (reads.size() > downsampleTarget) { keepProbability = (double) downsampleTarget / (double) reads.size(); } Random random = new Random(1); for (SAMRecord read : reads) { if (random.nextDouble() < keepProbability) { readBuffer.append(sampleId); readBuffer.append(read.getReadNegativeStrandFlag() ? 
"1" : "0"); if (read.getReadString().length() == readLength) { readBuffer.append(read.getReadString()); readBuffer.append(read.getBaseQualityString()); } else { StringBuffer basePadding = new StringBuffer(); StringBuffer qualPadding = new StringBuffer(); for (int i = 0; i < readLength - read.getReadString().length(); i++) { basePadding.append('N'); qualPadding.append('!'); } readBuffer.append(read.getReadString() + basePadding.toString()); readBuffer.append(read.getBaseQualityString() + qualPadding.toString()); } } } // Make this set of reads eligible for GC reads.clear(); sampleId += 1; } } readsList.clear(); if (isAssemblyCandidate) { for (int kmer : kmers) { String outputFile = output + "_k" + kmer; contigs = assemble( readBuffer.toString(), outputFile, prefix, truncateOnRepeat ? 1 : 0, maxContigs, maxPathsFromRoot, readLength, kmer, minKmerFrequency, minBaseQuality, minEdgeRatio, isDebug ? 1 : 0, maxNodes); if (!contigs.equals("<REPEAT>")) { break; } else { if (kmer >= readLength / 2 || kmer >= CYCLE_KMER_LENGTH_THRESHOLD) { isCycleExceedingThresholdDetected = true; } } } } else { // System.out.println("Skipping assembly for: " + prefix); } } catch (Exception e) { e.printStackTrace(); throw new RuntimeException(e); } if (this.shouldSearchForSv) { Collections.sort(this.svCandidates); Position last = null; String currentFeatureChr = null; int currentFeatureStart = -1; int currentFeatureStop = -1; int currentFeatureCount = 0; // TODO: Calc this dynamically int windowSize = 500; for (Position pos : this.svCandidates) { if ((last != null) && pos.getChromosome().equals(last.getChromosome()) && Math.abs(pos.getPosition() - last.getPosition()) < windowSize) { if (currentFeatureChr == null) { currentFeatureChr = pos.getChromosome(); currentFeatureStart = last.getPosition(); currentFeatureStop = pos.getPosition() + readLength; currentFeatureCount = 1; } else { currentFeatureStop = pos.getPosition() + readLength; currentFeatureCount++; } } else { if (currentFeatureChr != 
null) { if (currentFeatureCount > (minReadCount / MAX_READ_LENGTHS_PER_REGION) * minReadCandidateFraction) { Feature region = new Feature( currentFeatureChr, currentFeatureStart - readLength, currentFeatureStop + readLength); BreakpointCandidate candidate = new BreakpointCandidate(region, currentFeatureCount); this.svCandidateRegions.add(candidate); } currentFeatureChr = null; currentFeatureStart = -1; currentFeatureStop = -1; currentFeatureCount = 0; } else { currentFeatureChr = pos.getChromosome(); currentFeatureStart = pos.getPosition(); currentFeatureStop = pos.getPosition() + readLength; currentFeatureCount = 1; } } last = pos; } // Don't forget last SV candidate region if (currentFeatureCount > (minReadCount / MAX_READ_LENGTHS_PER_REGION) * minReadCandidateFraction) { Feature region = new Feature( currentFeatureChr, currentFeatureStart - readLength, currentFeatureStop + readLength); BreakpointCandidate candidate = new BreakpointCandidate(region, currentFeatureCount); this.svCandidateRegions.add(candidate); } } long end = System.currentTimeMillis(); int kmer = readLength + 1; if (kmers.length > 0) { kmer = kmers[0]; } if (isDebug) { System.err.println( "Elapsed_msecs_in_NativeAssembler\tRegion:\t" + regions.get(0).getDescriptor() + "\tLength:\t" + regions.get(0).getLength() + "\tReadCount:\t" + readCount + "\tElapsed\t" + (end - start) + "\tAssembled\t" + isAssemblyCandidate + "\t" + kmer); } return contigs; }