@Test(dataProvider = "loadReadsBAM", groups = "spark") public void readsSinkTest(String inputBam, String outputFileName, String outputFileExtension) throws IOException { final File outputFile = createTempFile(outputFileName, outputFileExtension); JavaSparkContext ctx = SparkContextFactory.getTestSparkContext(); ReadsSparkSource readSource = new ReadsSparkSource(ctx); JavaRDD<GATKRead> rddParallelReads = readSource.getParallelReads(inputBam, null); SAMFileHeader header = ReadsSparkSource.getHeader(ctx, inputBam, null); ReadsSparkSink.writeReads( ctx, outputFile.getAbsolutePath(), rddParallelReads, header, ReadsWriteFormat.SINGLE); JavaRDD<GATKRead> rddParallelReads2 = readSource.getParallelReads(outputFile.getAbsolutePath(), null); final List<GATKRead> writtenReads = rddParallelReads2.collect(); final SAMRecordCoordinateComparator comparator = new SAMRecordCoordinateComparator(); // Assert that the reads are sorted. final int size = writtenReads.size(); for (int i = 0; i < size - 1; ++i) { final SAMRecord smaller = writtenReads.get(i).convertToSAMRecord(header); final SAMRecord larger = writtenReads.get(i + 1).convertToSAMRecord(header); final int compare = comparator.compare(smaller, larger); Assert.assertTrue( compare < 0, "Reads are out of order (compare=" + compare + "): " + smaller.getSAMString() + " and " + larger.getSAMString()); } Assert.assertEquals(rddParallelReads.count(), rddParallelReads2.count()); }
//
//  Returns a downsampled set of reads for each sample.
//
private List<List<SAMRecord>> getReads(List<String> inputFiles, List<Feature> regions, ReAligner realigner) {
    int downsampleTarget = desiredNumberOfReads(regions);
    List<DownsampledReadList> readsList = new ArrayList<DownsampledReadList>();

    for (String input : inputFiles) {
        Set<String> readIds = new HashSet<String>();
        DownsampledReadList reads = new DownsampledReadList(downsampleTarget);
        readsList.add(reads);

        for (Feature region : regions) {
            SAMFileReader reader = new SAMFileReader(new File(input));
            reader.setValidationStringency(ValidationStringency.SILENT);

            Iterator<SAMRecord> iter;
            if (region != null) {
                iter = reader.queryOverlapping(region.getSeqname(), (int) region.getStart(), (int) region.getEnd());
            } else {
                iter = reader.iterator();
            }

            while (iter.hasNext()) {
                SAMRecord read = iter.next();

                // Don't allow same read to be counted twice.
                if (!realigner.isFiltered(read)
                        && !read.getDuplicateReadFlag()
                        && !read.getReadFailsVendorQualityCheckFlag()
                        && (read.getMappingQuality() >= realigner.getMinMappingQuality() || read.getReadUnmappedFlag())
                        && !readIds.contains(getIdentifier(read))) {

                    if (read.getReadString().length() > readLength) {
                        reader.close();
                        throw new IllegalArgumentException("Maximum read length of: " + readLength
                                + " exceeded for: " + read.getSAMString());
                    }

                    readIds.add(getIdentifier(read));
                    reads.add(read);
                }
            }

            if (isDebug && reads.getTotalReadCount() != reads.getReads().size()) {
                System.err.println("downsampled: " + regions.get(0).getDescriptor() + ": "
                        + reads.getTotalReadCount() + " -> " + reads.getReads().size());
            }

            reader.close();
        }
    }

    List<List<SAMRecord>> sampleReads = new ArrayList<List<SAMRecord>>();
    for (DownsampledReadList downsampledReads : readsList) {
        sampleReads.add(downsampledReads.getReads());
    }

    return sampleReads;
}
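// DownsampledReadList itself is not shown in this excerpt. Its observable
// behavior above (add(), getTotalReadCount(), getReads(), a fixed target size)
// is consistent with reservoir sampling; the following is a minimal sketch of
// such a class under that assumption (Algorithm R). The real implementation
// may differ, and the class name here is hypothetical.
class ReservoirReadList {
    private final int maxReads;                             // downsample target
    private final List<SAMRecord> reads;                    // retained sample
    private final java.util.Random random = new java.util.Random();
    private int totalReadCount = 0;                         // all reads ever offered

    ReservoirReadList(int maxReads) {
        this.maxReads = maxReads;
        this.reads = new ArrayList<SAMRecord>(maxReads);
    }

    // Algorithm R: after n calls, each offered read has probability
    // min(1, maxReads/n) of being in the retained sample.
    void add(SAMRecord read) {
        totalReadCount++;
        if (reads.size() < maxReads) {
            reads.add(read);
        } else {
            int slot = random.nextInt(totalReadCount);
            if (slot < maxReads) {
                reads.set(slot, read);
            }
        }
    }

    int getTotalReadCount() { return totalReadCount; }
    List<SAMRecord> getReads() { return reads; }
}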
/**
 * Main method for the program. Checks that all input files are present and readable and that
 * the output file can be written to. Then iterates through all the records accumulating
 * metrics. Finally, writes the metrics file.
 */
protected int doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);

    final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);

    final Histogram<Integer> mismatchesHist = new Histogram<Integer>("Predicted", "Mismatches");
    final Histogram<Integer> totalHist = new Histogram<Integer>("Predicted", "Total_Bases");
    final Map<String, Histogram<Integer>> mismatchesByTypeHist = new HashMap<String, Histogram<Integer>>();
    final Map<String, Histogram<Integer>> totalByTypeHist = new HashMap<String, Histogram<Integer>>();

    // Set up a pair of histograms per base: "X>" tracks positions whose reference base is X
    // (mismatches *from* X), ">X" tracks positions whose read base is X (mismatches *to* X).
    final byte[] bases = {'A', 'C', 'G', 'T'};
    for (final byte base : bases) {
        mismatchesByTypeHist.put((char) base + ">", new Histogram<Integer>("Predicted", (char) base + ">"));
        mismatchesByTypeHist.put(">" + (char) base, new Histogram<Integer>("Predicted", ">" + (char) base));
        totalByTypeHist.put((char) base + ">", new Histogram<Integer>("Predicted", (char) base + ">"));
        totalByTypeHist.put(">" + (char) base, new Histogram<Integer>("Predicted", ">" + (char) base));
    }

    for (final SAMRecord record : reader) {
        // Ignore these as we don't know the truth.
        if (record.getReadUnmappedFlag() || record.isSecondaryOrSupplementary()) {
            continue;
        }
        final byte[] readBases = record.getReadBases();
        final byte[] readQualities = record.getBaseQualities();
        final byte[] refBases = SequenceUtil.makeReferenceFromAlignment(record, false);

        // We've seen stranger things.
        if (readQualities.length != readBases.length) {
            throw new PicardException("Missing Qualities ("
                    + readQualities.length + "," + readBases.length + ") : " + record.getSAMString());
        }
        if (refBases.length != readBases.length) {
            throw new PicardException(
                    "The read length did not match the inferred reference length, please check your MD and CIGAR.");
        }

        int cycleIndex; // zero-based
        if (record.getReadNegativeStrandFlag()) {
            cycleIndex = readBases.length - 1 + CYCLE_OFFSET;
        } else {
            cycleIndex = CYCLE_OFFSET;
        }

        for (int i = 0; i < readBases.length; i++) {
            if (-1 == CYCLE || cycleIndex == CYCLE) {
                if ('-' != refBases[i] && '0' != refBases[i]) { // not insertion and not soft-clipped
                    if (!SequenceUtil.basesEqual(readBases[i], refBases[i])) { // mismatch
                        mismatchesHist.increment((int) readQualities[i]);
                        if (SequenceUtil.isValidBase(refBases[i])) {
                            mismatchesByTypeHist.get((char) refBases[i] + ">").increment((int) readQualities[i]);
                        }
                        if (SequenceUtil.isValidBase(readBases[i])) {
                            mismatchesByTypeHist.get(">" + (char) readBases[i]).increment((int) readQualities[i]);
                        }
                    } else {
                        mismatchesHist.increment((int) readQualities[i], 0); // to make sure the bin will exist
                    }
                    totalHist.increment((int) readQualities[i]);
                    if (SequenceUtil.isValidBase(readBases[i])) {
                        totalByTypeHist.get(">" + (char) readBases[i]).increment((int) readQualities[i]);
                    }
                    if (SequenceUtil.isValidBase(refBases[i])) {
                        totalByTypeHist.get((char) refBases[i] + ">").increment((int) readQualities[i]);
                    }
                }
            }
            // Walk machine cycles backwards for negative-strand reads.
            cycleIndex += record.getReadNegativeStrandFlag() ? -1 : 1;
        }
    }
    CloserUtil.close(reader);

    final Histogram<Integer> hist = new Histogram<Integer>("Predicted", "Observed");
    double sumOfSquaresError = 0.0;

    // Compute the aggregate phred values.
    for (final Integer key : mismatchesHist.keySet()) {
        final double numMismatches = mismatchesHist.get(key).getValue();
        final double numBases = totalHist.get(key).getValue();
        final double phredErr = Math.log10(numMismatches / numBases) * -10.0;
        sumOfSquaresError += (0 == numMismatches) ? 0.0 : (key - phredErr) * (key - phredErr);
        hist.increment(key, phredErr);

        // Make sure the per-type bins will exist.
        for (final byte base : bases) {
            mismatchesByTypeHist.get(">" + (char) base).increment(key, 0.0);
            mismatchesByTypeHist.get((char) base + ">").increment(key, 0.0);
            totalByTypeHist.get(">" + (char) base).increment(key, 0.0);
            totalByTypeHist.get((char) base + ">").increment(key, 0.0);
        }
    }

    final QualityScoreAccuracyMetrics metrics = new QualityScoreAccuracyMetrics();
    metrics.SUM_OF_SQUARE_ERROR = sumOfSquaresError;

    final MetricsFile<QualityScoreAccuracyMetrics, Integer> out = getMetricsFile();
    out.addMetric(metrics);
    out.addHistogram(hist);
    for (final byte base : bases) {
        // >base : histograms for mismatches *to* the given base
        Histogram<Integer> m = mismatchesByTypeHist.get(">" + (char) base);
        Histogram<Integer> t = totalByTypeHist.get(">" + (char) base);
        Histogram<Integer> h = new Histogram<Integer>(m.getBinLabel(), m.getValueLabel());
        for (final Integer key : m.keySet()) {
            final double numMismatches = m.get(key).getValue();
            final double numBases = t.get(key).getValue();
            final double phredErr = Math.log10(numMismatches / numBases) * -10.0;
            h.increment(key, phredErr);
        }
        out.addHistogram(h);

        // base> : histograms for mismatches *from* the given base
        m = mismatchesByTypeHist.get((char) base + ">");
        // Was totalByTypeHist.get(">" + base): mismatches *from* a base must be
        // divided by the totals *from* that base, not the totals *to* it.
        t = totalByTypeHist.get((char) base + ">");
        h = new Histogram<Integer>(m.getBinLabel(), m.getValueLabel());
        for (final Integer key : m.keySet()) {
            final double numMismatches = m.get(key).getValue();
            final double numBases = t.get(key).getValue();
            final double phredErr = Math.log10(numMismatches / numBases) * -10.0;
            h.increment(key, phredErr);
        }
        out.addHistogram(h);
    }

    out.addHistogram(mismatchesHist);
    out.addHistogram(totalHist);
    out.write(OUTPUT);

    return 0;
}
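// Worked example of the phred conversion used in doWork() above (illustration
// only, not part of the tool). An observed error rate of 1 mismatch in 1000
// bases converts to -10 * log10(1/1000) = Q30, so a bin whose *predicted*
// quality is 30 contributes zero squared error.
public class PhredExample {
    public static void main(String[] args) {
        final int predicted = 30;         // predicted quality bin (hypothetical)
        final double numMismatches = 1.0;
        final double numBases = 1000.0;
        final double observed = Math.log10(numMismatches / numBases) * -10.0;        // 30.0
        final double squaredError = (predicted - observed) * (predicted - observed); // 0.0
        System.out.println("observed=" + observed + " squaredError=" + squaredError);
    }
}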