/** * Outputs all intervals that are behind the current reference locus * * @param refLocus the current reference locus * @param refBase the reference allele */ private void outputFinishedIntervals(final GenomeLoc refLocus, final byte refBase) { // output any intervals that were finished final List<GenomeLoc> toRemove = new LinkedList<>(); for (GenomeLoc key : intervalMap.keySet()) { if (key.isBefore(refLocus)) { final IntervalStratification intervalStats = intervalMap.get(key); outputStatsToVCF(intervalStats, Allele.create(refBase, true)); if (hasMissingLoci(intervalStats)) { outputMissingInterval(intervalStats); } toRemove.add(key); } } for (GenomeLoc key : toRemove) { intervalMap.remove(key); } }
/** * Analyze coverage distribution and validate read mates per interval and per sample * * <p>This tool is useful for diagnosing regions with bad coverage, mapping, or read mate pairs. It * analyzes each sample independently and aggregates results over intervals of interest. * Low-coverage regions can be identified by using e.g. FindCoveredIntervals with the -uncovered * argument. * * <h3>Input</h3> * * <ul> * <li>A reference file * <li>one or more input BAMs * <li>One or more intervals * </ul> * * <h3>Output</h3> * * <p>A modified VCF detailing each interval by sample and information for each interval according * to the thresholds used. Interval information includes GC Content, average interval depth, * callable status among others. If you use the --missing option, you can get as a second output a * intervals file with the loci that have missing data. This file can then be used as input to * QualifyMissingIntervals for full qualification and interpretation of why the data is missing. * * <h3>Usage example</h3> * * <pre> * java -jar GenomeAnalysisTK.jar * -T DiagnoseTargets \ * -R reference.fasta \ * -I sample1.bam \ * -I sample2.bam \ * -I sample3.bam \ * -L intervals.interval_list \ * -o output.vcf * </pre> * * @author Mauricio Carneiro, Roger Zurawicki * @since 5/8/12 */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class}) @By(value = DataSource.READS) @PartitionBy(PartitionType.INTERVAL) @Downsample(by = DownsampleType.NONE) public class DiagnoseTargets extends LocusWalker<Long, Long> { @Output(doc = "File to which interval statistics should be written") private VariantContextWriter vcfWriter = null; @ArgumentCollection private ThresHolder thresholds = new ThresHolder(); private Map<GenomeLoc, IntervalStratification> intervalMap = null; // maps each interval => statistics private PeekableIterator<GenomeLoc> intervalListIterator; // an iterator to go over all the intervals provided as we traverse the // genome private Set<String> samples = null; // all the samples being processed private static final Allele SYMBOLIC_ALLELE = Allele.create("<DT>", false); // avoid creating the symbolic allele multiple times private static final Allele UNCOVERED_ALLELE = Allele.create( "A", true); // avoid creating the 'fake' ref allele for uncovered intervals multiple times private static final int INITIAL_HASH_SIZE = 50; // enough room for potential overlapping intervals plus recently finished intervals @Override public void initialize() { super.initialize(); if (getToolkit().getIntervals() == null || getToolkit().getIntervals().isEmpty()) throw new UserException( "This tool only works if you provide one or more intervals (use the -L argument). If you want to run whole genome, use -T DepthOfCoverage instead."); intervalMap = new LinkedHashMap<>(INITIAL_HASH_SIZE); intervalListIterator = new PeekableIterator<>(getToolkit().getIntervals().iterator()); // get all of the unique sample names for the VCF Header samples = ReadUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); vcfWriter.writeHeader(new VCFHeader(getHeaderInfo(), samples)); // pre load all the statistics classes because it is costly to operate on the JVM and we only // want to do it once. loadAllPlugins(thresholds); } @Override public Long map( final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { GenomeLoc refLocus = ref.getLocus(); // process and remove any intervals in the map that are don't overlap the current locus anymore // and add all new intervals that may overlap this reference locus addNewOverlappingIntervals(refLocus); outputFinishedIntervals(refLocus, ref.getBase()); // at this point, all intervals in intervalMap overlap with this locus, so update all of them for (IntervalStratification intervalStratification : intervalMap.values()) intervalStratification.addLocus(context, ref); return 1L; } @Override public Long reduceInit() { return 0L; } /** * Not sure what we are going to do here * * @param value result of the map. * @param sum accumulator for the reduce. * @return a long */ @Override public Long reduce(Long value, Long sum) { return sum + value; } /** * Process all remaining intervals * * @param result number of loci processed by the walker */ @Override public void onTraversalDone(final Long result) { for (GenomeLoc interval : intervalMap.keySet()) outputStatsToVCF(intervalMap.get(interval), UNCOVERED_ALLELE); GenomeLoc interval = intervalListIterator.peek(); while (interval != null) { outputStatsToVCF(createIntervalStatistic(interval), UNCOVERED_ALLELE); intervalListIterator.next(); interval = intervalListIterator.peek(); } if (thresholds.missingTargets != null) { thresholds.missingTargets.close(); } } /** * Outputs all intervals that are behind the current reference locus * * @param refLocus the current reference locus * @param refBase the reference allele */ private void outputFinishedIntervals(final GenomeLoc refLocus, final byte refBase) { // output any intervals that were finished final List<GenomeLoc> toRemove = new LinkedList<>(); for (GenomeLoc key : intervalMap.keySet()) { if (key.isBefore(refLocus)) { final IntervalStratification intervalStats = intervalMap.get(key); outputStatsToVCF(intervalStats, Allele.create(refBase, true)); if (hasMissingLoci(intervalStats)) { outputMissingInterval(intervalStats); } toRemove.add(key); } } for (GenomeLoc key : toRemove) { intervalMap.remove(key); } } /** * Adds all intervals that overlap the current reference locus to the intervalMap * * @param refLocus the current reference locus */ private void addNewOverlappingIntervals(final GenomeLoc refLocus) { GenomeLoc interval = intervalListIterator.peek(); while (interval != null && !interval.isPast(refLocus)) { intervalMap.put(interval, createIntervalStatistic(interval)); intervalListIterator.next(); interval = intervalListIterator.peek(); } } /** * Takes the interval, finds it in the stash, prints it to the VCF * * @param stats The statistics of the interval * @param refAllele the reference allele */ private void outputStatsToVCF(final IntervalStratification stats, final Allele refAllele) { GenomeLoc interval = stats.getInterval(); final List<Allele> alleles = new ArrayList<>(); final Map<String, Object> attributes = new HashMap<>(); final ArrayList<Genotype> genotypes = new ArrayList<>(); for (String sample : samples) { final GenotypeBuilder gb = new GenotypeBuilder(sample); SampleStratification sampleStat = stats.getSampleStatistics(sample); gb.attribute( GATKVCFConstants.AVG_INTERVAL_DP_BY_SAMPLE_KEY, sampleStat.averageCoverage(interval.size())); gb.attribute(GATKVCFConstants.LOW_COVERAGE_LOCI, sampleStat.getNLowCoveredLoci()); gb.attribute(GATKVCFConstants.ZERO_COVERAGE_LOCI, sampleStat.getNUncoveredLoci()); gb.filters(statusToStrings(stats.getSampleStatistics(sample).callableStatuses(), false)); genotypes.add(gb.make()); } alleles.add(refAllele); alleles.add(SYMBOLIC_ALLELE); VariantContextBuilder vcb = new VariantContextBuilder( "DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStop(), alleles); vcb = vcb.log10PError(VariantContext.NO_LOG10_PERROR); vcb.filters(new LinkedHashSet<>(statusToStrings(stats.callableStatuses(), true))); attributes.put(VCFConstants.END_KEY, interval.getStop()); attributes.put(GATKVCFConstants.AVG_INTERVAL_DP_KEY, stats.averageCoverage(interval.size())); attributes.put(GATKVCFConstants.INTERVAL_GC_CONTENT_KEY, stats.gcContent()); vcb = vcb.attributes(attributes); vcb = vcb.genotypes(genotypes); vcfWriter.add(vcb.make()); } private boolean hasMissingStatuses(AbstractStratification stats) { return !stats.callableStatuses().isEmpty(); } private boolean hasMissingLoci(final IntervalStratification stats) { return thresholds.missingTargets != null && hasMissingStatuses(stats); } private void outputMissingInterval(final IntervalStratification stats) { final GenomeLoc interval = stats.getInterval(); final boolean missing[] = new boolean[interval.size()]; Arrays.fill(missing, true); for (AbstractStratification sample : stats.getElements()) { if (hasMissingStatuses(sample)) { int pos = 0; for (AbstractStratification locus : sample.getElements()) { if (locus.callableStatuses().isEmpty()) { missing[pos] = false; } pos++; } } } int start = -1; boolean insideMissing = false; for (int i = 0; i < missing.length; i++) { if (missing[i] && !insideMissing) { start = interval.getStart() + i; insideMissing = true; } else if (!missing[i] && insideMissing) { final int stop = interval.getStart() + i - 1; outputMissingInterval(interval.getContig(), start, stop); insideMissing = false; } } if (insideMissing) { outputMissingInterval(interval.getContig(), start, interval.getStop()); } } private void outputMissingInterval(final String contig, final int start, final int stop) { final PrintStream out = thresholds.missingTargets; out.println(String.format("%s:%d-%d", contig, start, stop)); } /** * Function that process a set of statuses into strings * * @param statuses the set of statuses to be converted * @return a matching set of strings */ private List<String> statusToStrings( Iterable<CallableStatus> statuses, final boolean isInfoField) { List<String> output = new LinkedList<>(); for (CallableStatus status : statuses) if (isInfoField || status != CallableStatus.PASS) output.add(status.name()); return output; } private IntervalStratification createIntervalStatistic(GenomeLoc interval) { return new IntervalStratification(samples, interval, thresholds); } protected static void loadAllPlugins(final ThresHolder thresholds) { for (Class<?> stat : new PluginManager<LocusMetric>(LocusMetric.class).getPlugins()) { try { final LocusMetric stats = (LocusMetric) stat.newInstance(); stats.initialize(thresholds); thresholds.locusMetricList.add(stats); } catch (Exception e) { throw new DynamicClassResolutionException(stat, e); } } for (Class<?> stat : new PluginManager<SampleMetric>(SampleMetric.class).getPlugins()) { try { final SampleMetric stats = (SampleMetric) stat.newInstance(); stats.initialize(thresholds); thresholds.sampleMetricList.add(stats); } catch (Exception e) { throw new DynamicClassResolutionException(stat, e); } } for (Class<?> stat : new PluginManager<IntervalMetric>(IntervalMetric.class).getPlugins()) { try { final IntervalMetric stats = (IntervalMetric) stat.newInstance(); stats.initialize(thresholds); thresholds.intervalMetricList.add(stats); } catch (Exception e) { throw new DynamicClassResolutionException(stat, e); } } } /** * Gets the header lines for the VCF writer * * @return A set of VCF header lines */ private static Set<VCFHeaderLine> getHeaderInfo() { Set<VCFHeaderLine> headerLines = new HashSet<>(); // INFO fields for overall data headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY)); headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.AVG_INTERVAL_DP_KEY)); headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.INTERVAL_GC_CONTENT_KEY)); headerLines.add( new VCFInfoHeaderLine( "Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode")); // FORMAT fields for each genotype headerLines.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY)); headerLines.add( GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.AVG_INTERVAL_DP_BY_SAMPLE_KEY)); headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.LOW_COVERAGE_LOCI)); headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.ZERO_COVERAGE_LOCI)); // FILTER fields for (CallableStatus stat : CallableStatus.values()) headerLines.add(new VCFFilterHeaderLine(stat.name(), stat.description)); return headerLines; } }