@Test(dataProvider = "loadReadsADAM", groups = "spark") public void readsSinkADAMTest(String inputBam, String outputDirectoryName) throws IOException { // Since the test requires that we not create the actual output directory in advance, // we instead create its parent directory and mark it for deletion on exit. This protects // us from naming collisions across multiple instances of the test suite. final File outputParentDirectory = createTempDir(outputDirectoryName + "_parent"); final File outputDirectory = new File(outputParentDirectory, outputDirectoryName); JavaSparkContext ctx = SparkContextFactory.getTestSparkContext(); ReadsSparkSource readSource = new ReadsSparkSource(ctx); JavaRDD<GATKRead> rddParallelReads = readSource.getParallelReads(inputBam, null); SAMFileHeader header = ReadsSparkSource.getHeader(ctx, inputBam, null); ReadsSparkSink.writeReads( ctx, outputDirectory.getAbsolutePath(), rddParallelReads, header, ReadsWriteFormat.ADAM); JavaRDD<GATKRead> rddParallelReads2 = readSource.getADAMReads(outputDirectory.getAbsolutePath(), null, header); Assert.assertEquals(rddParallelReads.count(), rddParallelReads2.count()); // Test the round trip List<GATKRead> samList = rddParallelReads.collect(); List<GATKRead> adamList = rddParallelReads2.collect(); Comparator<GATKRead> comparator = new ReadCoordinateComparator(header); samList.sort(comparator); adamList.sort(comparator); for (int i = 0; i < samList.size(); i++) { SAMRecord expected = samList.get(i).convertToSAMRecord(header); SAMRecord observed = adamList.get(i).convertToSAMRecord(header); // manually test equality of some fields, as there are issues with roundtrip BAM -> ADAM -> // BAM // see https://github.com/bigdatagenomics/adam/issues/823 Assert.assertEquals(observed.getReadName(), expected.getReadName(), "readname"); Assert.assertEquals( observed.getAlignmentStart(), expected.getAlignmentStart(), "getAlignmentStart"); Assert.assertEquals( observed.getAlignmentEnd(), expected.getAlignmentEnd(), 
"getAlignmentEnd"); Assert.assertEquals(observed.getFlags(), expected.getFlags(), "getFlags"); Assert.assertEquals( observed.getMappingQuality(), expected.getMappingQuality(), "getMappingQuality"); Assert.assertEquals( observed.getMateAlignmentStart(), expected.getMateAlignmentStart(), "getMateAlignmentStart"); Assert.assertEquals(observed.getCigar(), expected.getCigar(), "getCigar"); } }
/** Note: this is the only getKey function that handles unmapped reads specially! */ public static long getKey(final SAMRecord rec) { final int refIdx = rec.getReferenceIndex(); final int start = rec.getAlignmentStart(); if (!(rec.getReadUnmappedFlag() || refIdx < 0 || start < 0)) return getKey(refIdx, start); // Put unmapped reads at the end, but don't give them all the exact same // key so that they can be distributed to different reducers. // // A random number would probably be best, but to ensure that the same // record always gets the same key we use a fast hash instead. // // We avoid using hashCode(), because it's not guaranteed to have the // same value across different processes. int hash = 0; byte[] var; if ((var = rec.getVariableBinaryRepresentation()) != null) { // Undecoded BAM record: just hash its raw data. hash = (int) MurmurHash3.murmurhash3(var, hash); } else { // Decoded BAM record or any SAM record: hash a few representative // fields together. hash = (int) MurmurHash3.murmurhash3(rec.getReadName(), hash); hash = (int) MurmurHash3.murmurhash3(rec.getReadBases(), hash); hash = (int) MurmurHash3.murmurhash3(rec.getBaseQualities(), hash); hash = (int) MurmurHash3.murmurhash3(rec.getCigarString(), hash); } return getKey0(Integer.MAX_VALUE, hash); }
/**
 * HACK: creates a GATKSAMRecord based only on a SAMRecord. FOR TESTING PURPOSES ONLY.
 *
 * <p>Copies the header, alignment fields, mate fields, attributes, read group, file source,
 * bases and qualities from {@code read} field by field. The order of the calls matters: the
 * indexing bin is copied last because {@code setCigarString} clears it.
 *
 * @param read the record whose contents are copied into the new instance
 */
public GATKSAMRecord(final SAMRecord read) {
  super(read.getHeader());
  super.setReferenceIndex(read.getReferenceIndex());
  super.setAlignmentStart(read.getAlignmentStart());
  super.setReadName(read.getReadName());
  super.setMappingQuality(read.getMappingQuality());
  // indexing bin done below
  super.setCigar(read.getCigar());
  super.setFlags(read.getFlags());
  super.setMateReferenceIndex(read.getMateReferenceIndex());
  super.setMateAlignmentStart(read.getMateAlignmentStart());
  super.setInferredInsertSize(read.getInferredInsertSize());

  // Attributes: mirror the source exactly, including the "no attributes" case.
  SAMReadGroupRecord samRG = read.getReadGroup();
  SAMBinaryTagAndValue samAttr = GATKBin.getReadBinaryAttributes(read);
  if (samAttr == null) {
    clearAttributes();
  } else {
    setAttributes(samAttr);
  }

  // Read group: wrapped in the GATK-specific type; left untouched when the source has none.
  if (samRG != null) {
    GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(samRG);
    setReadGroup(rg);
  }

  super.setFileSource(read.getFileSource());
  // NOTE(review): the read name was already set above with the same value; this second call
  // looks redundant -- confirm before removing.
  super.setReadName(read.getReadName());
  super.setCigarString(read.getCigarString());
  super.setReadBases(read.getReadBases());
  super.setBaseQualities(read.getBaseQualities());
  // From SAMRecord constructor: Do this after the above because setCigarString will clear it.
  GATKBin.setReadIndexingBin(this, GATKBin.getReadIndexingBin(read));
}
/**
 * Tells whether {@code cur} sorts before {@code last} in coordinate order.
 *
 * <p>Returns {@code false} when there is no previous read or when {@code cur} is flagged
 * unmapped. Otherwise both reads must carry a reference index and an alignment start; a read
 * missing either is reported as malformed.
 *
 * @param last the previously seen read, or {@code null} if there is none
 * @param cur the read being examined
 * @return {@code true} if {@code last} lies on a later reference than {@code cur}, or on the
 *     same reference at a later alignment start
 * @throws UserException.MalformedBAM if either read lacks a reference index or alignment start
 */
private boolean isOutOfOrder(final SAMRecord last, final SAMRecord cur) {
  if (last == null || cur.getReadUnmappedFlag()) {
    return false;
  }

  if (last.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX
      || last.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) {
    throw new UserException.MalformedBAM(
        last, String.format("read %s has inconsistent mapping information.", last.format()));
  }

  if (cur.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX
      || cur.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) {
    // The offending read here is cur, so cur (not last) is the record handed to the
    // exception, matching the read named in the message.
    throw new UserException.MalformedBAM(
        cur, String.format("read %s has inconsistent mapping information.", cur.format()));
  }

  final int lastReference = last.getReferenceIndex();
  final int curReference = cur.getReferenceIndex();
  if (lastReference != curReference) {
    return lastReference > curReference;
  }
  return last.getAlignmentStart() > cur.getAlignmentStart();
}
/**
 * Decides whether a read is a structural-variant candidate.
 *
 * <p>A read qualifies when it is not a proper pair, its mate is not flagged unmapped, and either
 * the mate is on a different reference or the two alignment starts are more than {@code
 * CombineChimera3.MAX_GAP_LENGTH} apart.
 *
 * @param read the read to classify
 * @return {@code true} if the read meets the conditions above
 */
private boolean isSvCandidate(SAMRecord read) {
  if (read.getProperPairFlag() || read.getMateUnmappedFlag()) {
    return false;
  }

  final boolean sameReference = read.getReferenceName().equals(read.getMateReferenceName());
  if (!sameReference) {
    return true;
  }

  final int startDistance = Math.abs(read.getAlignmentStart() - read.getMateAlignmentStart());
  return startDistance > CombineChimera3.MAX_GAP_LENGTH;
}
/**
 * Counts a processed read and logs progress every {@code logSkipSize} reads.
 *
 * <p>A {@code null} record is ignored and does not advance the counter.
 *
 * @param record the read that was just processed, may be {@code null}
 */
private void countRead(final SAMRecord record) {
  if (record == null) {
    return;
  }

  readCount++;
  if (readCount % logSkipSize != 0) {
    return;
  }

  final String lastReadLocation =
      record.getReferenceName()
          + ":"
          + record.getAlignmentStart()
          + "-"
          + record.getAlignmentEnd();
  logger.info("Single reads processed: " + readCount);
  logger.info("Last read: " + lastReadLocation);
}
private boolean overlaps(SAMRecord r) { if (intervals == null || (r.getReadUnmappedFlag() && r.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START)) { return true; } if (r.getReadUnmappedFlag()) { // special case for unmapped reads with coordinate set for (Locatable interval : intervals) { if (interval.getStart() <= r.getStart() && interval.getEnd() >= r.getStart()) { // This follows the behavior of htsjdk's SamReader which states that // "an unmapped read will be returned by this call if it has a coordinate for // the purpose of sorting that is in the query region". return true; } } } final Interval interval = new Interval(r.getContig(), r.getStart(), r.getEnd()); Collection<Interval> overlaps = overlapDetector.getOverlaps(interval); return !overlaps.isEmpty(); }
public BAQCalculationResult calcBAQFromHMM(SAMRecord read, IndexedFastaSequenceFile refReader) { // start is alignment start - band width / 2 - size of first I element, if there is one. Stop // is similar int offset = getBandWidth() / 2; long readStart = includeClippedBases ? read.getUnclippedStart() : read.getAlignmentStart(); long start = Math.max(readStart - offset - ReadUtils.getFirstInsertionOffset(read), 0); long stop = (includeClippedBases ? read.getUnclippedEnd() : read.getAlignmentEnd()) + offset + ReadUtils.getLastInsertionOffset(read); if (stop > refReader .getSequenceDictionary() .getSequence(read.getReferenceName()) .getSequenceLength()) { return null; } else { // now that we have the start and stop, get the reference sequence covering it ReferenceSequence refSeq = refReader.getSubsequenceAt(read.getReferenceName(), start, stop); return calcBAQFromHMM(read, refSeq.getBases(), (int) (start - readStart)); } }
public String simpleAssemble(List<SAMRecord> reads) { StringBuffer readBuffer = new StringBuffer(); for (SAMRecord read : reads) { readBuffer.append((char) 1); readBuffer.append(read.getReadNegativeStrandFlag() ? "1" : "0"); if (read.getReadString().length() == readLength) { readBuffer.append(read.getReadString()); readBuffer.append(read.getBaseQualityString()); } else { StringBuffer basePadding = new StringBuffer(); StringBuffer qualPadding = new StringBuffer(); for (int i = 0; i < readLength - read.getReadString().length(); i++) { basePadding.append('N'); qualPadding.append('!'); } readBuffer.append(read.getReadString() + basePadding.toString()); readBuffer.append(read.getBaseQualityString() + qualPadding.toString()); } } SAMRecord lastRead = reads.get(reads.size() - 1); int regionStart = reads.get(0).getAlignmentStart(); int regionEnd = lastRead.getAlignmentEnd() > 0 ? lastRead.getAlignmentEnd() : lastRead.getAlignmentStart(); String output = "region_" + reads.get(0).getReferenceName() + "_" + regionStart + "_" + regionEnd; String contigs = ""; // Make this set of reads eligible for GC // reads.clear(); for (int kmer : kmers) { String outputFile = output + "_k" + kmer; contigs = assemble( readBuffer.toString(), outputFile, output, 1, // truncate_on_repeat maxContigs, maxPathsFromRoot, readLength, kmer, minKmerFrequency, minBaseQuality, minEdgeRatio, isDebug ? 1 : 0, maxNodes); if (!contigs.equals("<REPEAT>")) { break; } } return contigs; }
/**
 * Estimates genome coverage as a function of the number of reads used.
 *
 * <p>All mapped reads are loaded from {@code BAM} and shuffled. For every value in {@code
 * NUM_READS}, the first that many shuffled reads are painted onto a per-sequence boolean mask,
 * and the number of covered bases, the total number of bases and their ratio are stored in a
 * table that is printed to {@code out} at the end.
 */
@Override
public void execute() {
  log.info("Loading reads...");
  List<SAMRecord> reads = new ArrayList<SAMRecord>();
  for (SAMRecord sr : BAM) {
    if (!sr.getReadUnmappedFlag()) {
      reads.add(sr);
    }
  }
  log.info(" {} reads loaded", reads.size());

  Collections.shuffle(reads);
  log.info(" shuffled");

  DataTable dt =
      new DataTable(
          "LanderWaterman",
          "Realistic Lander-Waterman stats",
          "reads",
          "bp_used",
          "bp_total",
          "pct_genome");

  for (int numReads : NUM_READS) {
    log.info("Selecting {} reads...", numReads);

    // One mask per reference sequence; index k of a mask stands for 1-based position k + 1.
    Map<String, boolean[]> utilizationMask = new HashMap<String, boolean[]>();
    for (SAMSequenceRecord ssr : BAM.getFileHeader().getSequenceDictionary().getSequences()) {
      utilizationMask.put(ssr.getSequenceName(), new boolean[ssr.getSequenceLength()]);
    }

    for (int i = 0; i < numReads; i++) {
      SAMRecord read = reads.get(i);
      boolean[] mask = utilizationMask.get(read.getReferenceName());

      // Alignment start and end are 1-based and inclusive, the mask is 0-based. The covered
      // indices are therefore [start - 1, end - 1]. Using the raw coordinates as indices
      // would shift every read by one and drop one base from each.
      int firstIndex = Math.max(read.getAlignmentStart() - 1, 0);
      int endExclusive = read.getAlignmentEnd();
      for (int j = firstIndex; j < endExclusive; j++) {
        mask[j] = true;
      }
    }

    int basesUsed = 0, basesTotal = 0;
    for (boolean[] mask : utilizationMask.values()) {
      for (boolean covered : mask) {
        if (covered) {
          basesUsed++;
        }
        basesTotal++;
      }
    }

    String rowName = "lw" + numReads;
    float pctGenome = (float) basesUsed / (float) basesTotal;

    dt.set(rowName, "reads", numReads);
    dt.set(rowName, "bp_used", basesUsed);
    dt.set(rowName, "bp_total", basesTotal);
    dt.set(rowName, "pct_genome", pctGenome);

    log.info(
        " reads: {}, bp_used: {}, bp_total: {}, pct_genome: {}",
        numReads,
        basesUsed,
        basesTotal,
        pctGenome);
  }

  out.println(dt);
}
/**
 * Annotates every kmer of the selected contigs with its origin code and writes the results in
 * several formats.
 *
 * <p>Steps, in order:
 *
 * <ol>
 *   <li>Build the kmer code tables (code character to numeric index and to display name),
 *       optionally overriding names from {@code KMER_CODE_NAMES}.
 *   <li>Load the annotation table {@code ANN}, index it by contig name, derive the kmer size
 *       from the first row, and print two lines per row to {@code cout}.
 *   <li>Add one read group per code name to the header of {@code CONTIGS}.
 *   <li>For each selected, annotated contig: convert soft clips to matches, emit one
 *       single-kmer record per position to the BAM writer, collect an {@code IGVEntry} per
 *       kmer, print a line to {@code beout} for primary alignments, and add a per-contig
 *       statistics row to the table writer.
 *   <li>Print all collected {@code IGVEntry} values to {@code out} and close the BAM writer.
 * </ol>
 */
@Override
public void execute() {
  log.info("Initializing kmer code map...");

  // Code character -> numeric value reported in the "Parentage" column at the end.
  Map<Character, Integer> kmerCodeIndices = new HashMap<Character, Integer>();
  kmerCodeIndices.put('0', 1);
  kmerCodeIndices.put('A', 3);
  kmerCodeIndices.put('B', 4);
  kmerCodeIndices.put('C', 5);
  kmerCodeIndices.put('_', 6);
  kmerCodeIndices.put('.', 7);
  kmerCodeIndices.put('1', 9);

  // Code character -> display name. A LinkedHashMap, so iteration follows insertion order.
  Map<Character, String> kmerCodeNames = new LinkedHashMap<Character, String>();
  kmerCodeNames.put('0', "ref0");
  kmerCodeNames.put('A', "repetitive");
  kmerCodeNames.put('B', "both");
  kmerCodeNames.put('C', "lowcoverage");
  kmerCodeNames.put('_', "lowconfidence");
  kmerCodeNames.put('.', "novel");
  kmerCodeNames.put('1', "ref1");

  // Optional overrides of the default names. Only existing keys are replaced, so the key set
  // is not changed while it is being iterated.
  if (KMER_CODE_NAMES != null) {
    for (Character c : kmerCodeNames.keySet()) {
      String cStr = String.valueOf(c);
      if (KMER_CODE_NAMES.containsKey(cStr)) {
        kmerCodeNames.put(c, KMER_CODE_NAMES.get(cStr));
      }
    }
  }
  for (Character c : kmerCodeNames.keySet()) {
    log.info(" {} {}: {}", c, kmerCodeIndices.get(c), kmerCodeNames.get(c));
  }

  log.info("Loading annotated contigs...");
  // Annotation rows keyed by contig name.
  Map<String, Map<String, String>> annotatedContigs = new HashMap<String, Map<String, String>>();
  int kmerSize = 0;
  if (ANN.length() > 0) {
    TableReader tr = new TableReader(ANN);
    for (Map<String, String> te : tr) {
      String contigName = te.get("contigName");
      // The kmer size is derived once, from the first row: a sequence of length L holds
      // L - k + 1 kmers, and "kmerOrigin" has one character per kmer.
      if (kmerSize == 0) {
        kmerSize = te.get("seq").length() - te.get("kmerOrigin").length() + 1;
      }
      annotatedContigs.put(contigName, te);

      // Split "name:start-end" into three parts; "NA" and "*:0-0" are replaced by "NA:0-0"
      // so that the split always yields the three elements indexed below.
      String[] ref0ToCanonicalExact =
          (te.get("ref0ToCanonicalExact").equals("NA")
                      || te.get("ref0ToCanonicalExact").equals("*:0-0")
                  ? "NA:0-0"
                  : te.get("ref0ToCanonicalExact"))
              .split("[:-]");
      String[] ref1ToCanonicalExact =
          (te.get("ref1ToCanonicalExact").equals("NA")
                      || te.get("ref1ToCanonicalExact").equals("*:0-0")
                  ? "NA:0-0"
                  : te.get("ref1ToCanonicalExact"))
              .split("[:-]");

      // One line per reference mapping; the two lines differ only in the coordinates used and
      // in the trailing radius token.
      cout.println(
          te.get("sampleName")
              + "_"
              + te.get("accession")
              + "_"
              + contigName
              + " "
              + ref0ToCanonicalExact[0]
              + " "
              + ref0ToCanonicalExact[1]
              + " "
              + ref0ToCanonicalExact[2]
              + " radius1=0.8r");
      cout.println(
          te.get("sampleName")
              + "_"
              + te.get("accession")
              + "_"
              + contigName
              + " "
              + ref1ToCanonicalExact[0]
              + " "
              + ref1ToCanonicalExact[1]
              + " "
              + ref1ToCanonicalExact[2]
              + " radius2=0.6r");
    }
  }
  log.info(" contigs: {}", annotatedContigs.size());
  log.info(" kmer size: {}", kmerSize);

  log.info("Computing kmer inheritance information...");
  // The header obtained from CONTIGS gets one extra read group per code name; both the read
  // group id and its sample are set to the code's display name.
  SAMFileHeader sfh = CONTIGS.getFileHeader();
  for (Character c : kmerCodeNames.keySet()) {
    SAMReadGroupRecord rgr = new SAMReadGroupRecord(kmerCodeNames.get(c));
    rgr.setSample(kmerCodeNames.get(c));
    sfh.addReadGroup(rgr);
  }

  SAMFileWriterFactory sfwf = new SAMFileWriterFactory();
  sfwf.setCreateIndex(true);
  SAMFileWriter sfw = sfwf.makeBAMWriter(sfh, false, bout);
  // NOTE(review): tw is never closed or flushed in this method -- confirm whether TableWriter
  // requires that.
  TableWriter tw = new TableWriter(sout);
  // TreeSet: entries are de-duplicated and ordered by IGVEntry's own ordering.
  Set<IGVEntry> igvEntries = new TreeSet<IGVEntry>();
  int numContigs = 0;

  for (SAMRecord contig : CONTIGS) {
    // An absent or empty CONTIG_NAMES selects every contig.
    if (CONTIG_NAMES == null
        || CONTIG_NAMES.isEmpty()
        || CONTIG_NAMES.contains(contig.getReadName())) {
      Map<String, String> te = annotatedContigs.get(contig.getReadName());
      // Contigs without an annotation row are skipped entirely.
      if (annotatedContigs.containsKey(contig.getReadName())) {
        String seq = contig.getReadString();
        // log.debug(" te: {}", te);
        String annSeq = te.get("seq");
        String kmerOrigin = te.get("kmerOrigin");

        // Kmer -> origin code, taken position by position from the annotated sequence.
        Map<CortexKmer, Character> kmerCodes = new HashMap<CortexKmer, Character>();
        for (int i = 0; i < kmerOrigin.length(); i++) {
          CortexKmer kmer = new CortexKmer(annSeq.substring(i, i + kmerSize));
          Character code = kmerOrigin.charAt(i);
          kmerCodes.put(kmer, code);
        }

        // Per-contig counters, one per known code, all starting at zero.
        Map<Character, Integer> kmerStats = new HashMap<Character, Integer>();
        for (Character c : kmerCodeNames.keySet()) {
          kmerStats.put(c, 0);
        }

        boolean changed = false;
        // We want to be able to examine soft-clipped regions as well.
        // Every S element is replaced by an M element of the same length.
        List<CigarElement> ces = new ArrayList<CigarElement>();
        for (CigarElement ce : contig.getCigar().getCigarElements()) {
          if (ce.getOperator().equals(CigarOperator.S)) {
            ces.add(new CigarElement(ce.getLength(), CigarOperator.M));
            changed = true;
          } else {
            ces.add(ce);
          }
        }
        if (changed) {
          // A leading soft clip now counts as aligned, so the alignment start moves left by
          // its length. This must read the ORIGINAL cigar, hence it precedes setCigar.
          CigarElement firstCe = contig.getCigar().getCigarElements().get(0);
          if (firstCe.getOperator().equals(CigarOperator.S)) {
            contig.setAlignmentStart(contig.getAlignmentStart() - firstCe.getLength());
          }
          contig.setCigar(new Cigar(ces));
        }

        for (AlignmentBlock ab : contig.getAlignmentBlocks()) {
          // i is a 0-based offset into seq, starting at the block's first read position.
          // NOTE(review): the upper bound getReadStart() + getLength() makes this loop run
          // getLength() + 1 times -- confirm that the extra iteration is intended.
          for (int i = ab.getReadStart() - 1; i < ab.getReadStart() + ab.getLength(); i++) {
            // Positions too close to the end of the sequence to hold a kmer are skipped.
            if (i + kmerSize < seq.length()) {
              CortexKmer kmer = new CortexKmer(seq.substring(i, i + kmerSize));

              // One synthetic record per kmer: kmerSize matched bases, same reference as the
              // contig, named "<contig>.<kmer>".
              SAMRecord skmer = new SAMRecord(CONTIGS.getFileHeader());
              skmer.setReadBases(seq.substring(i, i + kmerSize).getBytes());
              List<CigarElement> cigarElements = new ArrayList<CigarElement>();
              cigarElements.add(new CigarElement(kmerSize, CigarOperator.M));
              Cigar cigar = new Cigar(cigarElements);
              skmer.setReadName(contig.getReadName() + "." + kmer.getKmerAsString());
              skmer.setReferenceName(contig.getReferenceName());
              skmer.setCigar(cigar);
              skmer.setReadPairedFlag(false);
              skmer.setDuplicateReadFlag(false);
              skmer.setMateNegativeStrandFlag(false);
              skmer.setAlignmentStart(ab.getReferenceStart() - ab.getReadStart() + 1 + i);
              // Placeholder values; both the RG attribute and the mapping quality are
              // overwritten further down before the record is written.
              skmer.setAttribute("RG", "none");
              skmer.setMappingQuality(0);

              // c is null when the kmer does not occur in the annotated sequence.
              Character c = kmerCodes.get(kmer);
              String codeName = kmerCodeNames.get(c);

              // Pick the read group whose sample equals the code name; fall back to the read
              // group whose sample equals the contig's own sample.
              String parentReadGroupId = null;
              String sampleReadGroupId = null;
              for (SAMReadGroupRecord rgr : sfh.getReadGroups()) {
                if (rgr.getSample().equals(codeName)) {
                  parentReadGroupId = rgr.getReadGroupId();
                }
                if (rgr.getSample().equals(contig.getReadGroup().getSample())) {
                  sampleReadGroupId = rgr.getReadGroupId();
                }
              }
              skmer.setAttribute(
                  "RG", parentReadGroupId != null ? parentReadGroupId : sampleReadGroupId);
              skmer.setMappingQuality(99);
              sfw.addAlignment(skmer);

              // NOTE(review): when c has no entry in kmerStats (for instance c == null),
              // kmerStats.get(c) is null and the "+ 1" unboxing throws a
              // NullPointerException -- confirm that every kmer is guaranteed a known code.
              kmerStats.put(c, kmerStats.get(c) + 1);

              IGVEntry igvEntry = new IGVEntry();
              igvEntry.chromosome = contig.getReferenceName();
              // One less than the alignment start given to skmer above.
              igvEntry.start = ab.getReferenceStart() - ab.getReadStart() + i;
              igvEntry.parentageName = kmerCodeNames.get(c);
              igvEntry.parentage = kmerCodeIndices.get(c);
              igvEntries.add(igvEntry);
            }
          }
        }

        // Only primary alignments get a line in beout and advance the progress counter.
        if (!contig.isSecondaryOrSupplementary()) {
          beout.println(
              contig.getReferenceName()
                  + "\t"
                  + contig.getAlignmentStart()
                  + "\t"
                  + contig.getAlignmentEnd()
                  + "\t"
                  + contig.getReadName()
                  + "."
                  + contig.getReadGroup().getSample());
          // Progress is logged at every tenth of the annotated contig count; the size check
          // also keeps the modulus from being zero.
          if (annotatedContigs.size() > 10 && numContigs % (annotatedContigs.size() / 10) == 0) {
            log.info(" processed {}/{} contigs", numContigs, annotatedContigs.size());
          }
          numContigs++;
        }

        // One statistics row per processed alignment: contig, sample, then one count per
        // code in the order of kmerCodeNames.
        Map<String, String> stats = new LinkedHashMap<String, String>();
        stats.put("contigName", contig.getReadName());
        stats.put("sampleName", contig.getReadGroup().getSample());
        for (Character c : kmerCodeNames.keySet()) {
          stats.put(kmerCodeNames.get(c), String.valueOf(kmerStats.get(c)));
        }
        tw.addEntry(stats);
      }
    }
  }

  log.info("Writing kmer inheritance information...");
  // Tab-separated table with a header row; each entry spans [start, start + 1].
  out.printf("%s\t%s\t%s\t%s\t%s\n", "Chromosome", "Start", "End", "Feature", "Parentage");
  for (IGVEntry igvEntry : igvEntries) {
    out.printf(
        "%s\t%d\t%d\t%s\t%d\n",
        igvEntry.chromosome,
        igvEntry.start,
        igvEntry.start + 1,
        igvEntry.parentageName,
        igvEntry.parentage);
  }
  sfw.close();
}