Exemplo n.º 1
0
 /**
  * Flushes every tracked interval for which {@code isBefore(refLocus)} holds.
  *
  * <p>Each such interval is written to the VCF, additionally written to the missing-targets
  * output when {@code hasMissingLoci} says so, and finally dropped from {@code intervalMap}.
  *
  * <p>NOTE(review): {@code refBase} is supplied by the caller; whether it is the base at the
  * start of the interval being written is not established here -- confirm.
  *
  * @param refLocus the current reference locus
  * @param refBase the reference base used to build the REF allele of every record emitted here
  */
 private void outputFinishedIntervals(final GenomeLoc refLocus, final byte refBase) {
   // Finished keys are collected first and removed afterwards, so the map is never modified
   // while it is being iterated.
   final List<GenomeLoc> finished = new LinkedList<>();
   for (final Map.Entry<GenomeLoc, IntervalStratification> tracked : intervalMap.entrySet()) {
     final GenomeLoc interval = tracked.getKey();
     if (!interval.isBefore(refLocus)) {
       continue; // not behind the current locus yet, keep tracking it
     }
     final IntervalStratification finishedStats = tracked.getValue();
     outputStatsToVCF(finishedStats, Allele.create(refBase, true));
     if (hasMissingLoci(finishedStats)) {
       outputMissingInterval(finishedStats);
     }
     finished.add(interval);
   }
   for (final GenomeLoc interval : finished) {
     intervalMap.remove(interval);
   }
 }
Exemplo n.º 2
0
/**
 * Analyze coverage distribution and validate read mates per interval and per sample
 *
 * <p>This tool is useful for diagnosing regions with bad coverage, mapping, or read mate pairs. It
 * analyzes each sample independently and aggregates results over intervals of interest.
 * Low-coverage regions can be identified by using e.g. FindCoveredIntervals with the -uncovered
 * argument.
 *
 * <h3>Input</h3>
 *
 * <ul>
 *   <li>A reference file
 *   <li>One or more input BAMs
 *   <li>One or more intervals
 * </ul>
 *
 * <h3>Output</h3>
 *
 * <p>A modified VCF detailing each interval by sample and information for each interval according
 * to the thresholds used. Interval information includes GC Content, average interval depth,
 * callable status among others. If you use the --missing option, you can get as a second output an
 * intervals file with the loci that have missing data. This file can then be used as input to
 * QualifyMissingIntervals for full qualification and interpretation of why the data is missing.
 *
 * <h3>Usage example</h3>
 *
 * <pre>
 *    java -jar GenomeAnalysisTK.jar \
 *              -T DiagnoseTargets \
 *              -R reference.fasta \
 *              -I sample1.bam \
 *              -I sample2.bam \
 *              -I sample3.bam \
 *              -L intervals.interval_list \
 *              -o output.vcf
 *  </pre>
 *
 * @author Mauricio Carneiro, Roger Zurawicki
 * @since 5/8/12
 */
@DocumentedGATKFeature(
    groupName = HelpConstants.DOCS_CAT_QC,
    extraDocs = {CommandLineGATK.class})
@By(value = DataSource.READS)
@PartitionBy(PartitionType.INTERVAL)
@Downsample(by = DownsampleType.NONE)
public class DiagnoseTargets extends LocusWalker<Long, Long> {

  // Writer for the output VCF: receives the header in initialize() and one record per interval
  // from outputStatsToVCF().
  @Output(doc = "File to which interval statistics should be written")
  private VariantContextWriter vcfWriter = null;

  // Argument collection shared with the metrics; this class also reads its metric plugin lists
  // and its optional missingTargets stream.
  @ArgumentCollection private ThresHolder thresholds = new ThresHolder();

  // Created in initialize() as a LinkedHashMap; entries are added by addNewOverlappingIntervals()
  // and removed by outputFinishedIntervals().
  private Map<GenomeLoc, IntervalStratification> intervalMap =
      null; // maps each interval => statistics
  // Created in initialize() from the toolkit's intervals; peek()/next() are used to consume it.
  private PeekableIterator<GenomeLoc>
      intervalListIterator; // an iterator to go over all the intervals provided as we traverse the
                            // genome
  // Filled in initialize() from the SAM file header.
  private Set<String> samples = null; // all the samples being processed
  // ALT allele placed on every record written by outputStatsToVCF().
  private static final Allele SYMBOLIC_ALLELE =
      Allele.create("<DT>", false); // avoid creating the symbolic allele multiple times
  // REF allele used for the intervals written in onTraversalDone().
  private static final Allele UNCOVERED_ALLELE =
      Allele.create(
          "A", true); // avoid creating the 'fake' ref allele for uncovered intervals multiple times
  // Initial capacity passed to the LinkedHashMap backing intervalMap.
  private static final int INITIAL_HASH_SIZE =
      50; // enough room for potential overlapping intervals plus recently finished intervals

  /**
   * Prepares the walker: checks that intervals were supplied, creates the interval bookkeeping
   * structures, writes the VCF header and loads the metric plugins.
   *
   * @throws UserException if the toolkit reports no intervals
   */
  @Override
  public void initialize() {
    super.initialize();

    // this walker is driven by intervals, so refuse to start without any
    final boolean noIntervals =
        getToolkit().getIntervals() == null || getToolkit().getIntervals().isEmpty();
    if (noIntervals) {
      throw new UserException(
          "This tool only works if you provide one or more intervals (use the -L argument). If you want to run whole genome, use -T DepthOfCoverage instead.");
    }

    intervalMap = new LinkedHashMap<>(INITIAL_HASH_SIZE);
    intervalListIterator = new PeekableIterator<>(getToolkit().getIntervals().iterator());

    // the VCF header lists the sample names taken from the SAM file header
    samples = ReadUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
    final VCFHeader header = new VCFHeader(getHeaderInfo(), samples);
    vcfWriter.writeHeader(header);

    // instantiate every metric plugin once, up front
    loadAllPlugins(thresholds);
  }

  /**
   * Handles one reference locus: registers pending intervals, flushes the intervals that are
   * already behind the locus, and feeds the locus to every interval still being tracked.
   *
   * @param tracker the reference meta-data tracker (not used by this method)
   * @param ref the reference context of the current locus
   * @param context the alignment context of the current locus
   * @return always 1, so that the reduce step counts processed loci
   */
  @Override
  public Long map(
      final RefMetaDataTracker tracker,
      final ReferenceContext ref,
      final AlignmentContext context) {
    final GenomeLoc currentLocus = ref.getLocus();

    // first pull in every pending interval that is not past this locus, then write out and
    // drop the intervals that no longer overlap it
    addNewOverlappingIntervals(currentLocus);
    outputFinishedIntervals(currentLocus, ref.getBase());

    // whatever is left in intervalMap overlaps this locus, so every entry receives it
    for (final IntervalStratification tracked : intervalMap.values()) {
      tracked.addLocus(context, ref);
    }

    return 1L;
  }

  /**
   * Supplies the starting value of the reduce accumulator.
   *
   * @return zero
   */
  @Override
  public Long reduceInit() {
    final long initialCount = 0L;
    return initialCount;
  }

  /**
   * Adds the result of one {@code map} call to the running total.
   *
   * @param value result of the map.
   * @param sum accumulator for the reduce.
   * @return the accumulator increased by {@code value}
   */
  @Override
  public Long reduce(Long value, Long sum) {
    final long total = sum + value;
    return total;
  }

  /**
   * Writes every interval that was not flushed during the traversal and closes the
   * missing-targets stream.
   *
   * <p>Two groups remain at this point: intervals still held in {@code intervalMap} and intervals
   * never pulled from {@code intervalListIterator}. Both are written to the VCF with {@code
   * UNCOVERED_ALLELE} as the REF allele. They also go through the same missing-loci reporting as
   * the intervals flushed by {@code outputFinishedIntervals}, so that trailing intervals are not
   * left out of the missing-targets output.
   *
   * @param result number of loci processed by the walker
   */
  @Override
  public void onTraversalDone(final Long result) {
    // intervals that were still being tracked when the traversal ended
    for (final IntervalStratification pending : intervalMap.values()) {
      outputTrailingInterval(pending);
    }

    // intervals that were never reached get newly created statistics
    GenomeLoc interval = intervalListIterator.peek();
    while (interval != null) {
      outputTrailingInterval(createIntervalStatistic(interval));
      intervalListIterator.next();
      interval = intervalListIterator.peek();
    }

    if (thresholds.missingTargets != null) {
      thresholds.missingTargets.close();
    }
  }

  /** Writes one end-of-traversal interval to the VCF and, when applicable, to the missing output. */
  private void outputTrailingInterval(final IntervalStratification stats) {
    outputStatsToVCF(stats, UNCOVERED_ALLELE);
    if (hasMissingLoci(stats)) {
      outputMissingInterval(stats);
    }
  }

  /**
   * Flushes every tracked interval for which {@code isBefore(refLocus)} holds.
   *
   * <p>Each such interval is written to the VCF, additionally written to the missing-targets
   * output when {@code hasMissingLoci} says so, and finally dropped from {@code intervalMap}.
   *
   * <p>NOTE(review): {@code refBase} is supplied by the caller; whether it is the base at the
   * start of the interval being written is not established here -- confirm.
   *
   * @param refLocus the current reference locus
   * @param refBase the reference base used to build the REF allele of every record emitted here
   */
  private void outputFinishedIntervals(final GenomeLoc refLocus, final byte refBase) {
    // Finished keys are collected first and removed afterwards, so the map is never modified
    // while it is being iterated.
    final List<GenomeLoc> finished = new LinkedList<>();
    for (final Map.Entry<GenomeLoc, IntervalStratification> tracked : intervalMap.entrySet()) {
      final GenomeLoc interval = tracked.getKey();
      if (!interval.isBefore(refLocus)) {
        continue; // not behind the current locus yet, keep tracking it
      }
      final IntervalStratification finishedStats = tracked.getValue();
      outputStatsToVCF(finishedStats, Allele.create(refBase, true));
      if (hasMissingLoci(finishedStats)) {
        outputMissingInterval(finishedStats);
      }
      finished.add(interval);
    }
    for (final GenomeLoc interval : finished) {
      intervalMap.remove(interval);
    }
  }

  /**
   * Moves intervals from {@code intervalListIterator} into {@code intervalMap}, each paired with
   * newly created statistics, until the next pending interval is past the reference locus or no
   * interval is left.
   *
   * @param refLocus the current reference locus
   */
  private void addNewOverlappingIntervals(final GenomeLoc refLocus) {
    for (GenomeLoc pending = intervalListIterator.peek();
        pending != null && !pending.isPast(refLocus);
        pending = intervalListIterator.peek()) {
      intervalMap.put(pending, createIntervalStatistic(pending));
      intervalListIterator.next(); // consume the interval that was just registered
    }
  }

  /**
   * Writes one VCF record describing the interval held by {@code stats}.
   *
   * <p>The record spans the interval, uses {@code refAllele} as REF and the symbolic allele as
   * ALT, carries one genotype per sample and the interval-level INFO attributes and filters.
   *
   * @param stats The statistics of the interval
   * @param refAllele the reference allele
   */
  private void outputStatsToVCF(final IntervalStratification stats, final Allele refAllele) {
    final GenomeLoc interval = stats.getInterval();
    final int intervalSize = interval.size();

    // one genotype per sample, with the per-sample coverage summary and filters
    final List<Genotype> genotypes = new ArrayList<>();
    for (final String sampleName : samples) {
      final GenotypeBuilder genotype = new GenotypeBuilder(sampleName);
      final SampleStratification perSample = stats.getSampleStatistics(sampleName);

      genotype.attribute(
          GATKVCFConstants.AVG_INTERVAL_DP_BY_SAMPLE_KEY, perSample.averageCoverage(intervalSize));
      genotype.attribute(GATKVCFConstants.LOW_COVERAGE_LOCI, perSample.getNLowCoveredLoci());
      genotype.attribute(GATKVCFConstants.ZERO_COVERAGE_LOCI, perSample.getNUncoveredLoci());
      // PASS is left out of the per-sample filters
      genotype.filters(statusToStrings(perSample.callableStatuses(), false));

      genotypes.add(genotype.make());
    }

    final List<Allele> alleles = new ArrayList<>();
    alleles.add(refAllele);
    alleles.add(SYMBOLIC_ALLELE);

    VariantContextBuilder record =
        new VariantContextBuilder(
            "DiagnoseTargets",
            interval.getContig(),
            interval.getStart(),
            interval.getStop(),
            alleles);
    record = record.log10PError(VariantContext.NO_LOG10_PERROR);

    // interval-level filters keep every status, PASS included
    final Set<String> intervalFilters =
        new LinkedHashSet<>(statusToStrings(stats.callableStatuses(), true));
    record.filters(intervalFilters);

    final Map<String, Object> info = new HashMap<>();
    info.put(VCFConstants.END_KEY, interval.getStop());
    info.put(GATKVCFConstants.AVG_INTERVAL_DP_KEY, stats.averageCoverage(intervalSize));
    info.put(GATKVCFConstants.INTERVAL_GC_CONTENT_KEY, stats.gcContent());
    record = record.attributes(info);

    record = record.genotypes(genotypes);

    vcfWriter.add(record.make());
  }

  /**
   * Tells whether the given stratification reports at least one callable status.
   *
   * @param stats the stratification to inspect
   * @return {@code true} when {@code stats.callableStatuses()} is not empty
   */
  private boolean hasMissingStatuses(AbstractStratification stats) {
    final boolean noStatuses = stats.callableStatuses().isEmpty();
    return !noStatuses;
  }

  /**
   * Decides whether an interval has to be reported in the missing-targets output.
   *
   * @param stats the statistics of the interval
   * @return {@code true} when a missing-targets stream is configured and the interval reports at
   *     least one callable status
   */
  private boolean hasMissingLoci(final IntervalStratification stats) {
    if (thresholds.missingTargets == null) {
      return false; // no missing-targets stream was configured
    }
    return hasMissingStatuses(stats);
  }

  /**
   * Writes the missing stretches of an interval to the missing-targets output.
   *
   * <p>Every position starts out flagged as missing. For each sample that reports at least one
   * callable status, the positions whose locus has no callable status are un-flagged. Each
   * maximal run of positions that remain flagged is then written as one {@code contig:start-stop}
   * line.
   *
   * <p>NOTE(review): assumes {@code sample.getElements()} yields the loci in positional order
   * starting at the interval start, with no more loci than {@code interval.size()} -- confirm.
   *
   * @param stats the statistics of the interval
   */
  private void outputMissingInterval(final IntervalStratification stats) {
    final GenomeLoc interval = stats.getInterval();

    final boolean[] flagged = new boolean[interval.size()];
    Arrays.fill(flagged, true);
    for (final AbstractStratification sample : stats.getElements()) {
      if (!hasMissingStatuses(sample)) {
        continue; // samples without any status never un-flag a position
      }
      int offset = 0;
      for (final AbstractStratification locus : sample.getElements()) {
        if (locus.callableStatuses().isEmpty()) {
          flagged[offset] = false;
        }
        offset++;
      }
    }

    // index of the first position of the run currently open, -1 when no run is open
    int runStart = -1;
    for (int i = 0; i < flagged.length; i++) {
      if (flagged[i]) {
        if (runStart < 0) {
          runStart = i;
        }
      } else if (runStart >= 0) {
        // the run ended on the previous position
        outputMissingInterval(
            interval.getContig(), interval.getStart() + runStart, interval.getStart() + i - 1);
        runStart = -1;
      }
    }
    if (runStart >= 0) {
      // the last run reaches the end of the interval
      outputMissingInterval(
          interval.getContig(), interval.getStart() + runStart, interval.getStop());
    }
  }

  /**
   * Writes one line of the form {@code contig:start-stop} to the missing-targets stream.
   *
   * @param contig the contig name
   * @param start the first position of the missing stretch
   * @param stop the last position of the missing stretch
   */
  private void outputMissingInterval(final String contig, final int start, final int stop) {
    final PrintStream out = thresholds.missingTargets;
    // Plain concatenation rather than String.format("%d"): the formatter localizes digits with the
    // default locale, which could make the coordinates unreadable for whatever parses this file.
    out.println(contig + ":" + start + "-" + stop);
  }

  /**
   * Converts callable statuses to their enum names, preserving iteration order.
   *
   * @param statuses the set of statuses to be converted
   * @param isInfoField when {@code false}, {@code CallableStatus.PASS} is left out of the result;
   *     when {@code true}, every status is kept
   * @return the names of the retained statuses
   */
  private List<String> statusToStrings(
      Iterable<CallableStatus> statuses, final boolean isInfoField) {
    final List<String> names = new LinkedList<>();

    for (final CallableStatus status : statuses) {
      final boolean skipped = !isInfoField && status == CallableStatus.PASS;
      if (!skipped) {
        names.add(status.name());
      }
    }

    return names;
  }

  /**
   * Creates the statistics object for one interval, bound to the processed samples and the
   * configured thresholds.
   *
   * @param interval the interval to create statistics for
   * @return a new {@code IntervalStratification} for {@code interval}
   */
  private IntervalStratification createIntervalStatistic(GenomeLoc interval) {
    final IntervalStratification created =
        new IntervalStratification(samples, interval, thresholds);
    return created;
  }

  /**
   * Instantiates every locus, sample and interval metric plugin, initializes it with the given
   * thresholds and appends it to the matching list of {@code thresholds}.
   *
   * <p>NOTE(review): {@code Class.newInstance()} is deprecated since Java 9. Its documented
   * replacement, {@code getDeclaredConstructor().newInstance()}, wraps constructor failures in
   * {@code InvocationTargetException}; check how {@code DynamicClassResolutionException} reports
   * that before switching.
   *
   * @param thresholds the thresholds that receive the instantiated metrics
   * @throws DynamicClassResolutionException if a plugin cannot be instantiated or initialized
   */
  protected static void loadAllPlugins(final ThresHolder thresholds) {
    // per-locus metrics
    for (Class<?> pluginClass : new PluginManager<LocusMetric>(LocusMetric.class).getPlugins()) {
      try {
        final Object instance = pluginClass.newInstance();
        final LocusMetric metric = (LocusMetric) instance;
        metric.initialize(thresholds);
        thresholds.locusMetricList.add(metric);
      } catch (Exception e) {
        throw new DynamicClassResolutionException(pluginClass, e);
      }
    }

    // per-sample metrics
    for (Class<?> pluginClass : new PluginManager<SampleMetric>(SampleMetric.class).getPlugins()) {
      try {
        final Object instance = pluginClass.newInstance();
        final SampleMetric metric = (SampleMetric) instance;
        metric.initialize(thresholds);
        thresholds.sampleMetricList.add(metric);
      } catch (Exception e) {
        throw new DynamicClassResolutionException(pluginClass, e);
      }
    }

    // per-interval metrics
    for (Class<?> pluginClass :
        new PluginManager<IntervalMetric>(IntervalMetric.class).getPlugins()) {
      try {
        final Object instance = pluginClass.newInstance();
        final IntervalMetric metric = (IntervalMetric) instance;
        metric.initialize(thresholds);
        thresholds.intervalMetricList.add(metric);
      } catch (Exception e) {
        throw new DynamicClassResolutionException(pluginClass, e);
      }
    }
  }

  /**
   * Assembles the INFO, FORMAT and FILTER header lines of the output VCF.
   *
   * <p>NOTE(review): the {@code "Diagnose Targets"} INFO ID contains a space -- confirm this is
   * intended.
   *
   * @return A set of VCF header lines
   */
  private static Set<VCFHeaderLine> getHeaderInfo() {
    final Set<VCFHeaderLine> lines = new HashSet<>();

    // INFO: interval-level values
    lines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY));
    lines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.AVG_INTERVAL_DP_KEY));
    lines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.INTERVAL_GC_CONTENT_KEY));
    final VCFInfoHeaderLine modeFlag =
        new VCFInfoHeaderLine(
            "Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode");
    lines.add(modeFlag);

    // FORMAT: per-sample values
    lines.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY));
    lines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.AVG_INTERVAL_DP_BY_SAMPLE_KEY));
    lines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.LOW_COVERAGE_LOCI));
    lines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.ZERO_COVERAGE_LOCI));

    // FILTER: one line per callable status
    for (final CallableStatus status : CallableStatus.values()) {
      lines.add(new VCFFilterHeaderLine(status.name(), status.description));
    }

    return lines;
  }
}