@Test(dataProvider = "loadReadsADAM", groups = "spark") public void readsSinkADAMTest(String inputBam, String outputDirectoryName) throws IOException { // Since the test requires that we not create the actual output directory in advance, // we instead create its parent directory and mark it for deletion on exit. This protects // us from naming collisions across multiple instances of the test suite. final File outputParentDirectory = createTempDir(outputDirectoryName + "_parent"); final File outputDirectory = new File(outputParentDirectory, outputDirectoryName); JavaSparkContext ctx = SparkContextFactory.getTestSparkContext(); ReadsSparkSource readSource = new ReadsSparkSource(ctx); JavaRDD<GATKRead> rddParallelReads = readSource.getParallelReads(inputBam, null); SAMFileHeader header = ReadsSparkSource.getHeader(ctx, inputBam, null); ReadsSparkSink.writeReads( ctx, outputDirectory.getAbsolutePath(), rddParallelReads, header, ReadsWriteFormat.ADAM); JavaRDD<GATKRead> rddParallelReads2 = readSource.getADAMReads(outputDirectory.getAbsolutePath(), null, header); Assert.assertEquals(rddParallelReads.count(), rddParallelReads2.count()); // Test the round trip List<GATKRead> samList = rddParallelReads.collect(); List<GATKRead> adamList = rddParallelReads2.collect(); Comparator<GATKRead> comparator = new ReadCoordinateComparator(header); samList.sort(comparator); adamList.sort(comparator); for (int i = 0; i < samList.size(); i++) { SAMRecord expected = samList.get(i).convertToSAMRecord(header); SAMRecord observed = adamList.get(i).convertToSAMRecord(header); // manually test equality of some fields, as there are issues with roundtrip BAM -> ADAM -> // BAM // see https://github.com/bigdatagenomics/adam/issues/823 Assert.assertEquals(observed.getReadName(), expected.getReadName(), "readname"); Assert.assertEquals( observed.getAlignmentStart(), expected.getAlignmentStart(), "getAlignmentStart"); Assert.assertEquals( observed.getAlignmentEnd(), expected.getAlignmentEnd(), "getAlignmentEnd"); Assert.assertEquals(observed.getFlags(), expected.getFlags(), "getFlags"); Assert.assertEquals( observed.getMappingQuality(), expected.getMappingQuality(), "getMappingQuality"); Assert.assertEquals( observed.getMateAlignmentStart(), expected.getMateAlignmentStart(), "getMateAlignmentStart"); Assert.assertEquals(observed.getCigar(), expected.getCigar(), "getCigar"); } }
/**
 * @param read a read containing the variant
 * @return the number of hard clipped and low qual bases at the read start (where start is the
 *     leftmost end w.r.t. the reference)
 */
public static int getNumClippedBasesAtStart(final SAMRecord read) {
  // Check for hard clips (never consider these bases):
  final Cigar c = read.getCigar();
  final CigarElement first = c.getCigarElement(0);

  int numStartClippedBases = 0;
  if (first.getOperator() == CigarOperator.H) {
    numStartClippedBases = first.getLength();
  }

  final byte[] unclippedReadBases = read.getReadBases();
  final byte[] unclippedReadQuals = read.getBaseQualities();

  // Do a stricter base clipping than provided by the CIGAR string, since that one may be too
  // conservative and may leave a string of Q2 bases still hanging off the reads.
  // TODO: this code may not even get used because HaplotypeCaller already hard clips
  // low-quality tails
  for (int i = numStartClippedBases; i < unclippedReadBases.length; i++) {
    if (unclippedReadQuals[i] < PairHMMIndelErrorModel.BASE_QUAL_THRESHOLD) {
      numStartClippedBases++;
    } else {
      break;
    }
  }

  return numStartClippedBases;
}
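// A minimal illustration of the counting logic (a sketch, not part of the original code).
// It assumes an htsjdk SAMFileHeader named `header` is in scope; the read and its qualities
// are hypothetical, with Q2 below PairHMMIndelErrorModel.BASE_QUAL_THRESHOLD and Q40 above it.
final SAMRecord read = new SAMRecord(header);
read.setReadName("example");
read.setReadString("ACGTACGT");
read.setCigarString("3S5M"); // no hard clips, so the low-quality scan starts at index 0
read.setBaseQualityString("###IIIII"); // '#' encodes Q2, 'I' encodes Q40
// The first three bases are Q2, so the scan counts all three and stops at the first
// Q40 base: getNumClippedBasesAtStart(read) returns 3.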
/**
 * HACK TO CREATE A GATKSAMRECORD BASED ONLY ON A SAMRECORD, FOR TESTING PURPOSES ONLY
 *
 * @param read the SAMRecord to copy
 */
public GATKSAMRecord(final SAMRecord read) {
  super(read.getHeader());
  super.setReferenceIndex(read.getReferenceIndex());
  super.setAlignmentStart(read.getAlignmentStart());
  super.setReadName(read.getReadName());
  super.setMappingQuality(read.getMappingQuality());
  // indexing bin done below
  super.setCigar(read.getCigar());
  super.setFlags(read.getFlags());
  super.setMateReferenceIndex(read.getMateReferenceIndex());
  super.setMateAlignmentStart(read.getMateAlignmentStart());
  super.setInferredInsertSize(read.getInferredInsertSize());

  SAMReadGroupRecord samRG = read.getReadGroup();
  SAMBinaryTagAndValue samAttr = GATKBin.getReadBinaryAttributes(read);
  if (samAttr == null) {
    clearAttributes();
  } else {
    setAttributes(samAttr);
  }
  if (samRG != null) {
    GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(samRG);
    setReadGroup(rg);
  }

  super.setFileSource(read.getFileSource());
  super.setReadName(read.getReadName());
  super.setCigarString(read.getCigarString());
  super.setReadBases(read.getReadBases());
  super.setBaseQualities(read.getBaseQualities());
  // From the SAMRecord constructor: do this AFTER the above, because setCigarString will clear it.
  GATKBin.setReadIndexingBin(this, GATKBin.getReadIndexingBin(read));
}
// We need to pad ref by at least bandwidth / 2 on either side.
public BAQCalculationResult calcBAQFromHMM(SAMRecord read, byte[] ref, int refOffset) {
  // TODO -- need to handle the case where the cigar sum of lengths doesn't cover the whole read
  Pair<Integer, Integer> queryRange = calculateQueryRange(read);
  if (queryRange == null) return null; // read has Ns, or is completely clipped away

  int queryStart = queryRange.getFirst();
  int queryEnd = queryRange.getSecond();

  BAQCalculationResult baqResult =
      calcBAQFromHMM(ref, read.getReadBases(), read.getBaseQualities(), queryStart, queryEnd);

  // Cap quals.
  int readI = 0, refI = 0;
  for (CigarElement elt : read.getCigar().getCigarElements()) {
    int l = elt.getLength();
    switch (elt.getOperator()) {
      case N: // cannot handle these
        return null;
      case H:
      case P: // ignore pads and hard clips
        break;
      case S:
        refI += l; // move the reference too; deliberately falls through to the I case
      case I: // TODO -- is it really the case that we want to treat I and S the same?
        for (int i = readI; i < readI + l; i++) {
          baqResult.bq[i] = baqResult.rawQuals[i];
        }
        readI += l;
        break;
      case D:
        refI += l;
        break;
      case M:
        for (int i = readI; i < readI + l; i++) {
          int expectedPos = refI - refOffset + (i - readI);
          baqResult.bq[i] =
              capBaseByBAQ(baqResult.rawQuals[i], baqResult.bq[i], baqResult.state[i], expectedPos);
        }
        readI += l;
        refI += l;
        break;
      default:
        throw new ReviewedGATKException(
            "BUG: Unexpected CIGAR element " + elt + " in read " + read.getReadName());
    }
  }

  if (readI != read.getReadLength()) { // odd cigar string
    System.arraycopy(baqResult.rawQuals, 0, baqResult.bq, 0, baqResult.bq.length);
  }

  return baqResult;
}
/**
 * Determine the appropriate start and stop offsets in the read for the bases, given the cigar
 * string.
 *
 * @param read the read whose query range should be computed
 * @return the (start, stop) offsets of the read bases to use, or null if the read cannot be
 *     handled (contains N operators) or is completely clipped away
 */
private final Pair<Integer, Integer> calculateQueryRange(SAMRecord read) {
  int queryStart = -1, queryStop = -1;
  int readI = 0;

  // Iterate over the cigar elements to determine the start and stop of the read bases for the
  // BAQ calculation.
  for (CigarElement elt : read.getCigar().getCigarElements()) {
    switch (elt.getOperator()) {
      case N:
        return null; // cannot handle these
      case H:
      case P:
      case D:
        break; // ignore pads, hard clips, and deletions
      case I:
      case S:
      case M:
      case EQ:
      case X:
        int prev = readI;
        readI += elt.getLength();
        if (includeClippedBases || elt.getOperator() != CigarOperator.S) {
          if (queryStart == -1) queryStart = prev;
          queryStop = readI;
        }
        // In the else case we aren't including soft-clipped bases, so we don't update
        // queryStart or queryStop.
        break;
      default:
        throw new ReviewedGATKException(
            "BUG: Unexpected CIGAR element " + elt + " in read " + read.getReadName());
    }
  }

  if (queryStop == queryStart) {
    // This read is completely clipped away, and yet is present in the file for some reason.
    // Usually they are flagged as non-PF, but it's possible to push them through the BAM.
    // System.err.printf("WARNING -- read is completely clipped away: " + read.format());
    return null;
  }

  return new Pair<Integer, Integer>(queryStart, queryStop);
}
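// A worked example of the range computation (illustrative; the CIGAR is hypothetical and
// includeClippedBases is assumed false). readI advances through read-consuming elements:
//   CIGAR 5S10M3I2D5M4S
//   5S  -> readI 0..5   (soft clip excluded; range untouched)
//   10M -> queryStart = 5, queryStop = 15
//   3I  -> queryStop = 18
//   2D  -> ignored (consumes no read bases)
//   5M  -> queryStop = 23
//   4S  -> excluded
// The method returns Pair(5, 23): the BAQ calculation covers read offsets [5, 23).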
public boolean filterOut(final SAMRecord read) {
  int alignedLength = 0;
  int softClipBlocks = 0;
  int minSoftClipBlocks = doNotRequireSoftclipsOnBothEnds ? 1 : 2;
  CigarOperator lastOperator = null;

  for (final CigarElement element : read.getCigar().getCigarElements()) {
    if (element.getOperator() == CigarOperator.S) {
      // Treat consecutive S blocks as a single one.
      if (lastOperator != CigarOperator.S) {
        softClipBlocks += 1;
      }
    } else if (element.getOperator().consumesReadBases()) {
      // M, I, X, and EQ (S was already accounted for above).
      alignedLength += element.getLength();
    }
    lastOperator = element.getOperator();
  }

  return (alignedLength < tooShort && softClipBlocks >= minSoftClipBlocks);
}
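// A sketch of how this filter behaves, with hypothetical threshold values (tooShort = 30,
// doNotRequireSoftclipsOnBothEnds = false, so minSoftClipBlocks = 2):
//   20S25M30S -> alignedLength = 25 < 30, two soft-clip blocks -> filterOut returns true
//   20S50M30S -> alignedLength = 50, not too short             -> filterOut returns false
//   25M30S    -> only one soft-clip block                      -> filterOut returns false
// With doNotRequireSoftclipsOnBothEnds = true, one soft-clip block suffices, so 25M30S
// would also be filtered out.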
/**
 * Finds a virtual BAM record position in the physical position range [beg,end). Returns end if
 * no BAM record was found.
 */
public long guessNextBAMRecordStart(long beg, long end) throws IOException {
  // Buffer what we need to go through.
  byte[] arr = new byte[MAX_BYTES_READ];

  this.inFile.seek(beg);
  int totalRead = 0;
  for (int left = Math.min((int) (end - beg), arr.length); left > 0; ) {
    final int r = inFile.read(arr, totalRead, left);
    if (r < 0) break;
    totalRead += r;
    left -= r;
  }
  arr = Arrays.copyOf(arr, totalRead);

  this.in = new SeekableArrayStream(arr);

  this.bgzf = new BlockCompressedInputStream(this.in);
  this.bgzf.setCheckCrcs(true);

  this.bamCodec.setInputStream(bgzf);

  final int firstBGZFEnd = Math.min((int) (end - beg), 0xffff);

  // cp: Compressed Position, indexes the entire BGZF input.
  for (int cp = 0; ; ++cp) {
    final PosSize psz = guessNextBGZFPos(cp, firstBGZFEnd);
    if (psz == null) return end;

    final int cp0 = cp = psz.pos;
    final long cp0Virt = (long) cp0 << 16;
    try {
      bgzf.seek(cp0Virt);

      // This has to catch Throwable, because it's possible to get an
      // OutOfMemoryError due to an overly large size.
    } catch (Throwable e) {
      // Guessed BGZF position incorrectly: try the next guess.
      continue;
    }

    // up: Uncompressed Position, indexes the data inside the BGZF block.
    for (int up = 0; ; ++up) {
      final int up0 = up = guessNextBAMPos(cp0Virt, up, psz.size);

      if (up0 < 0) {
        // No BAM records found in the BGZF block: try the next BGZF block.
        break;
      }

      // Verify that we can actually decode BLOCKS_NEEDED_FOR_GUESS worth
      // of records starting at (cp0,up0).
      bgzf.seek(cp0Virt | up0);
      boolean decodedAny = false;
      try {
        byte b = 0;
        int prevCP = cp0;
        while (b < BLOCKS_NEEDED_FOR_GUESS) {
          SAMRecord record = bamCodec.decode();
          if (record == null) {
            break;
          }
          record.getCigar(); // force decoding of the CIGAR
          decodedAny = true;

          final int cp2 = (int) (bgzf.getFilePointer() >>> 16);
          if (cp2 != prevCP) {
            // The compressed position changed, so we must be in a new block.
            assert cp2 > prevCP;
            prevCP = cp2;
            ++b;
          }
        }

        // Running out of records to verify is fine as long as we verified at least
        // something. It should only happen if we couldn't fill the array.
        if (b < BLOCKS_NEEDED_FOR_GUESS) {
          assert arr.length < MAX_BYTES_READ;
          if (!decodedAny) continue;
        }
      } catch (SAMFormatException e) {
        continue;
      } catch (FileTruncatedException e) {
        continue;
      } catch (OutOfMemoryError e) {
        continue;
      } catch (IllegalArgumentException e) {
        continue;
      } catch (RuntimeIOException e) {
        continue;
      } catch (RuntimeEOFException e) {
        // This can happen legitimately if the [beg,end) range is too small to
        // accommodate BLOCKS_NEEDED_FOR_GUESS and we get cut off in the middle of a
        // record. In that case, our stream should have hit EOF as well. If we've then
        // verified at least something, go ahead with it and hope for the best.
        if (!decodedAny && this.in.eof()) continue;
      }

      // + binds tighter than <<, so this is ((beg + cp0) << 16) | up0.
      return beg + cp0 << 16 | up0;
    }
  }
}
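// For reference, BGZF virtual file offsets pack the compressed block offset into the upper
// 48 bits and the within-block offset into the lower 16 bits, which is why the method returns
// ((beg + cp0) << 16) | up0. A minimal decoding sketch (the `guesser` variable is hypothetical):
long voffset = guesser.guessNextBAMRecordStart(beg, end); // virtual offset, or end if none found
long coffset = voffset >>> 16; // physical file offset of the BGZF block
int uoffset = (int) (voffset & 0xffff); // offset within the uncompressed block data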
@Override
public void execute() {
  log.info("Initializing kmer code map...");
  Map<Character, Integer> kmerCodeIndices = new HashMap<Character, Integer>();
  kmerCodeIndices.put('0', 1);
  kmerCodeIndices.put('A', 3);
  kmerCodeIndices.put('B', 4);
  kmerCodeIndices.put('C', 5);
  kmerCodeIndices.put('_', 6);
  kmerCodeIndices.put('.', 7);
  kmerCodeIndices.put('1', 9);

  Map<Character, String> kmerCodeNames = new LinkedHashMap<Character, String>();
  kmerCodeNames.put('0', "ref0");
  kmerCodeNames.put('A', "repetitive");
  kmerCodeNames.put('B', "both");
  kmerCodeNames.put('C', "lowcoverage");
  kmerCodeNames.put('_', "lowconfidence");
  kmerCodeNames.put('.', "novel");
  kmerCodeNames.put('1', "ref1");

  if (KMER_CODE_NAMES != null) {
    for (Character c : kmerCodeNames.keySet()) {
      String cStr = String.valueOf(c);
      if (KMER_CODE_NAMES.containsKey(cStr)) {
        kmerCodeNames.put(c, KMER_CODE_NAMES.get(cStr));
      }
    }
  }

  for (Character c : kmerCodeNames.keySet()) {
    log.info("  {} {}: {}", c, kmerCodeIndices.get(c), kmerCodeNames.get(c));
  }

  log.info("Loading annotated contigs...");
  Map<String, Map<String, String>> annotatedContigs = new HashMap<String, Map<String, String>>();
  int kmerSize = 0;

  if (ANN.length() > 0) {
    TableReader tr = new TableReader(ANN);
    for (Map<String, String> te : tr) {
      String contigName = te.get("contigName");

      if (kmerSize == 0) {
        kmerSize = te.get("seq").length() - te.get("kmerOrigin").length() + 1;
      }

      annotatedContigs.put(contigName, te);

      String[] ref0ToCanonicalExact =
          (te.get("ref0ToCanonicalExact").equals("NA")
                      || te.get("ref0ToCanonicalExact").equals("*:0-0")
                  ? "NA:0-0"
                  : te.get("ref0ToCanonicalExact"))
              .split("[:-]");
      String[] ref1ToCanonicalExact =
          (te.get("ref1ToCanonicalExact").equals("NA")
                      || te.get("ref1ToCanonicalExact").equals("*:0-0")
                  ? "NA:0-0"
                  : te.get("ref1ToCanonicalExact"))
              .split("[:-]");

      cout.println(
          te.get("sampleName") + "_" + te.get("accession") + "_" + contigName
              + " " + ref0ToCanonicalExact[0]
              + " " + ref0ToCanonicalExact[1]
              + " " + ref0ToCanonicalExact[2]
              + " radius1=0.8r");
      cout.println(
          te.get("sampleName") + "_" + te.get("accession") + "_" + contigName
              + " " + ref1ToCanonicalExact[0]
              + " " + ref1ToCanonicalExact[1]
              + " " + ref1ToCanonicalExact[2]
              + " radius2=0.6r");
    }
  }

  log.info("  contigs: {}", annotatedContigs.size());
  log.info("  kmer size: {}", kmerSize);

  log.info("Computing kmer inheritance information...");

  SAMFileHeader sfh = CONTIGS.getFileHeader();
  for (Character c : kmerCodeNames.keySet()) {
    SAMReadGroupRecord rgr = new SAMReadGroupRecord(kmerCodeNames.get(c));
    rgr.setSample(kmerCodeNames.get(c));
    sfh.addReadGroup(rgr);
  }

  SAMFileWriterFactory sfwf = new SAMFileWriterFactory();
  sfwf.setCreateIndex(true);
  SAMFileWriter sfw = sfwf.makeBAMWriter(sfh, false, bout);

  TableWriter tw = new TableWriter(sout);

  Set<IGVEntry> igvEntries = new TreeSet<IGVEntry>();
  int numContigs = 0;
  for (SAMRecord contig : CONTIGS) {
    if (CONTIG_NAMES == null
        || CONTIG_NAMES.isEmpty()
        || CONTIG_NAMES.contains(contig.getReadName())) {
      Map<String, String> te = annotatedContigs.get(contig.getReadName());

      if (annotatedContigs.containsKey(contig.getReadName())) {
        String seq = contig.getReadString();

        // log.debug("  te: {}", te);

        String annSeq = te.get("seq");
        String kmerOrigin = te.get("kmerOrigin");

        Map<CortexKmer, Character> kmerCodes = new HashMap<CortexKmer, Character>();
        for (int i = 0; i < kmerOrigin.length(); i++) {
          CortexKmer kmer = new CortexKmer(annSeq.substring(i, i + kmerSize));
          Character code = kmerOrigin.charAt(i);
          kmerCodes.put(kmer, code);
        }

        Map<Character, Integer> kmerStats = new HashMap<Character, Integer>();
        for (Character c : kmerCodeNames.keySet()) {
          kmerStats.put(c, 0);
        }

        boolean changed = false;

        // We want to be able to examine soft-clipped regions as well.
        List<CigarElement> ces = new ArrayList<CigarElement>();
        for (CigarElement ce : contig.getCigar().getCigarElements()) {
          if (ce.getOperator().equals(CigarOperator.S)) {
            ces.add(new CigarElement(ce.getLength(), CigarOperator.M));
            changed = true;
          } else {
            ces.add(ce);
          }
        }

        if (changed) {
          CigarElement firstCe = contig.getCigar().getCigarElements().get(0);
          if (firstCe.getOperator().equals(CigarOperator.S)) {
            contig.setAlignmentStart(contig.getAlignmentStart() - firstCe.getLength());
          }
          contig.setCigar(new Cigar(ces));
        }

        for (AlignmentBlock ab : contig.getAlignmentBlocks()) {
          for (int i = ab.getReadStart() - 1; i < ab.getReadStart() + ab.getLength(); i++) {
            if (i + kmerSize < seq.length()) {
              CortexKmer kmer = new CortexKmer(seq.substring(i, i + kmerSize));

              SAMRecord skmer = new SAMRecord(CONTIGS.getFileHeader());
              skmer.setReadBases(seq.substring(i, i + kmerSize).getBytes());

              List<CigarElement> cigarElements = new ArrayList<CigarElement>();
              cigarElements.add(new CigarElement(kmerSize, CigarOperator.M));
              Cigar cigar = new Cigar(cigarElements);

              skmer.setReadName(contig.getReadName() + "." + kmer.getKmerAsString());
              skmer.setReferenceName(contig.getReferenceName());
              skmer.setCigar(cigar);
              skmer.setReadPairedFlag(false);
              skmer.setDuplicateReadFlag(false);
              skmer.setMateNegativeStrandFlag(false);
              skmer.setAlignmentStart(ab.getReferenceStart() - ab.getReadStart() + 1 + i);
              skmer.setAttribute("RG", "none");
              skmer.setMappingQuality(0);

              Character c = kmerCodes.get(kmer);
              String codeName = kmerCodeNames.get(c);

              String parentReadGroupId = null;
              String sampleReadGroupId = null;
              for (SAMReadGroupRecord rgr : sfh.getReadGroups()) {
                if (rgr.getSample().equals(codeName)) {
                  parentReadGroupId = rgr.getReadGroupId();
                }
                if (rgr.getSample().equals(contig.getReadGroup().getSample())) {
                  sampleReadGroupId = rgr.getReadGroupId();
                }
              }

              skmer.setAttribute(
                  "RG", parentReadGroupId != null ? parentReadGroupId : sampleReadGroupId);
              skmer.setMappingQuality(99);

              sfw.addAlignment(skmer);

              kmerStats.put(c, kmerStats.get(c) + 1);

              IGVEntry igvEntry = new IGVEntry();
              igvEntry.chromosome = contig.getReferenceName();
              igvEntry.start = ab.getReferenceStart() - ab.getReadStart() + i;
              igvEntry.parentageName = kmerCodeNames.get(c);
              igvEntry.parentage = kmerCodeIndices.get(c);
              igvEntries.add(igvEntry);
            }
          }
        }

        if (!contig.isSecondaryOrSupplementary()) {
          beout.println(
              contig.getReferenceName()
                  + "\t" + contig.getAlignmentStart()
                  + "\t" + contig.getAlignmentEnd()
                  + "\t" + contig.getReadName() + "." + contig.getReadGroup().getSample());

          if (annotatedContigs.size() > 10 && numContigs % (annotatedContigs.size() / 10) == 0) {
            log.info("  processed {}/{} contigs", numContigs, annotatedContigs.size());
          }
          numContigs++;
        }

        Map<String, String> stats = new LinkedHashMap<String, String>();
        stats.put("contigName", contig.getReadName());
        stats.put("sampleName", contig.getReadGroup().getSample());
        for (Character c : kmerCodeNames.keySet()) {
          stats.put(kmerCodeNames.get(c), String.valueOf(kmerStats.get(c)));
        }
        tw.addEntry(stats);
      }
    }
  }

  log.info("Writing kmer inheritance information...");
  out.printf("%s\t%s\t%s\t%s\t%s\n", "Chromosome", "Start", "End", "Feature", "Parentage");
  for (IGVEntry igvEntry : igvEntries) {
    out.printf(
        "%s\t%d\t%d\t%s\t%d\n",
        igvEntry.chromosome,
        igvEntry.start,
        igvEntry.start + 1,
        igvEntry.parentageName,
        igvEntry.parentage);
  }

  sfw.close();
}