/**
 * General-purpose tool for variant evaluation (% in dbSNP, genotype concordance, Ti/Tv ratios, and
 * a lot more)
 *
 * <p>Given a variant callset, it is common to calculate various quality control metrics. These
 * metrics include the number of raw or filtered SNP counts; ratio of transition mutations to
 * transversions; concordance of a particular sample's calls to a genotyping chip; number of
 * singletons per sample; etc. Furthermore, it is often useful to stratify these metrics by various
 * criteria like functional class (missense, nonsense, silent), whether the site is CpG site, the
 * amino acid degeneracy of the site, etc. VariantEval facilitates these calculations in two ways:
 * by providing several built-in evaluation and stratification modules, and by providing a framework
 * that permits the easy development of new evaluation and stratification modules.
 *
 * <h2>Input</h2>
 *
 * <p>One or more variant sets to evaluate plus any number of comparison sets.
 *
 * <h2>Output</h2>
 *
 * <p>Evaluation tables detailing the results of the eval modules which were applied. For example:
 *
 * <pre>
 * output.eval.gatkreport:
 * ##:GATKReport.v0.1 CountVariants : Counts different classes of variants in the sample
 * CountVariants  CompRod   CpG      EvalRod  JexlExpression  Novelty  nProcessedLoci  nCalledLoci  nRefLoci  nVariantLoci  variantRate ...
 * CountVariants  dbsnp     CpG      eval     none            all      65900028        135770       0         135770        0.00206024  ...
 * CountVariants  dbsnp     CpG      eval     none            known    65900028        47068        0         47068         0.00071423  ...
 * CountVariants  dbsnp     CpG      eval     none            novel    65900028        88702        0         88702         0.00134601  ...
 * CountVariants  dbsnp     all      eval     none            all      65900028        330818       0         330818        0.00502000  ...
 * CountVariants  dbsnp     all      eval     none            known    65900028        120685       0         120685        0.00183133  ...
 * CountVariants  dbsnp     all      eval     none            novel    65900028        210133       0         210133        0.00318866  ...
 * CountVariants  dbsnp     non_CpG  eval     none            all      65900028        195048       0         195048        0.00295976  ...
 * CountVariants  dbsnp     non_CpG  eval     none            known    65900028        73617        0         73617         0.00111710  ...
 * CountVariants  dbsnp     non_CpG  eval     none            novel    65900028        121431       0         121431        0.00184265  ...
 * ...
 * </pre>
 *
 * <h2>Examples</h2>
 *
 * <pre>
 * java -Xmx2g -jar GenomeAnalysisTK.jar \
 *   -R ref.fasta \
 *   -T VariantEval \
 *   -o output.eval.gatkreport \
 *   --eval:set1 set1.vcf \
 *   --eval:set2 set2.vcf \
 *   [--comp comp.vcf]
 * </pre>
 */
@Reference(window = @Window(start = -50, stop = 50))
public class VariantEvalWalker extends RodWalker<Integer, Integer>
    implements TreeReducible<Integer> {

  @Output protected PrintStream out;

  /** The variant file(s) to evaluate. */
  @Input(fullName = "eval", shortName = "eval", doc = "Input evaluation file(s)", required = true)
  public List<RodBinding<VariantContext>> evals;

  /** The variant file(s) to compare against. */
  @Input(fullName = "comp", shortName = "comp", doc = "Input comparison file(s)", required = false)
  public List<RodBinding<VariantContext>> compsProvided = Collections.emptyList();

  private List<RodBinding<VariantContext>> comps = new ArrayList<RodBinding<VariantContext>>();

  /**
   * dbSNP comparison VCF. By default, the dbSNP file is used to specify the set of "known"
   * variants. Other sets can be specified with the -knownName (--known_names) argument.
   */
  @ArgumentCollection protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();

  // Help arguments
  @Argument(
      fullName = "list",
      shortName = "ls",
      doc = "List the available eval modules and exit",
      required = false)
  protected Boolean LIST = false;

  // Partitioning the data arguments
  @Argument(
      shortName = "select",
      doc = "One or more stratifications to use when evaluating the data",
      required = false)
  protected ArrayList<String> SELECT_EXPS = new ArrayList<String>();

  @Argument(
      shortName = "selectName",
      doc = "Names to use for the list of stratifications (must be a 1-to-1 mapping)",
      required = false)
  protected ArrayList<String> SELECT_NAMES = new ArrayList<String>();

  @Argument(
      fullName = "sample",
      shortName = "sn",
      doc =
          "Derive eval and comp contexts using only these sample genotypes, when genotypes are available in the original context",
      required = false)
  protected Set<String> SAMPLE_EXPRESSIONS;

  /** List of rod tracks to be used for specifying "known" variants other than dbSNP. */
  @Argument(
      shortName = "knownName",
      doc =
          "Name of ROD bindings containing variant sites that should be treated as known when splitting eval rods into known and novel subsets",
      required = false)
  protected HashSet<String> KNOWN_NAMES = new HashSet<String>();

  List<RodBinding<VariantContext>> knowns = new ArrayList<RodBinding<VariantContext>>();

  // Stratification arguments
  @Argument(
      fullName = "stratificationModule",
      shortName = "ST",
      doc =
          "One or more specific stratification modules to apply to the eval track(s) (in addition to the standard stratifications, unless -noS is specified)",
      required = false)
  protected String[] STRATIFICATIONS_TO_USE = {};

  @Argument(
      fullName = "doNotUseAllStandardStratifications",
      shortName = "noST",
      doc =
          "Do not use the standard stratification modules by default (instead, only those that are specified with the -S option)",
      required = false)
  protected Boolean NO_STANDARD_STRATIFICATIONS = false;

  /** See the -list argument to view available modules. */
  @Argument(
      fullName = "evalModule",
      shortName = "EV",
      doc =
          "One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noEV is specified)",
      required = false)
  protected String[] MODULES_TO_USE = {};

  @Argument(
      fullName = "doNotUseAllStandardModules",
      shortName = "noEV",
      doc =
          "Do not use the standard modules by default (instead, only those that are specified with the -EV option)",
      required = false)
  protected Boolean NO_STANDARD_MODULES = false;

  // Other arguments
  @Argument(
      fullName = "numSamples",
      shortName = "ns",
      doc = "Number of samples (used if no samples are available in the VCF file",
      required = false)
  protected Integer NUM_SAMPLES = 0;

  @Argument(
      fullName = "minPhaseQuality",
      shortName = "mpq",
      doc = "Minimum phasing quality",
      required = false)
  protected double MIN_PHASE_QUALITY = 10.0;

  @Argument(
      shortName = "mvq",
      fullName = "mendelianViolationQualThreshold",
      doc =
          "Minimum genotype QUAL score for each trio member required to accept a site as a violation. Default is 50.",
      required = false)
  protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 50;

  @Argument(
      fullName = "ancestralAlignments",
      shortName = "aa",
      doc = "Fasta file with ancestral alleles",
      required = false)
  private File ancestralAlignmentsFile = null;

  @Argument(
      fullName = "requireStrictAlleleMatch",
      shortName = "strict",
      doc =
          "If provided only comp and eval tracks with exactly matching reference and alternate alleles will be counted as overlapping",
      required = false)
  private boolean requireStrictAlleleMatch = false;

  /**
   * If true, VariantEval will treat -eval 1 -eval 2 as separate tracks from the same underlying
   * variant set, and evaluate the union of the results. Useful when you want to do -eval chr1.vcf
   * -eval chr2.vcf etc.
   */
  @Argument(
      fullName = "mergeEvals",
      shortName = "mergeEvals",
      doc = "If provided, all -eval tracks will be merged into a single eval track",
      required = false)
  public boolean mergeEvals = false;

  /** File containing tribble-readable features for the IntervalStratificiation */
  @Input(
      fullName = "stratIntervals",
      shortName = "stratIntervals",
      doc = "File containing tribble-readable features for the IntervalStratificiation",
      required = false)
  public IntervalBinding<Feature> intervalsFile = null;

  /**
   * File containing tribble-readable features containing known CNVs. For use with VariantSummary
   * table.
   */
  @Input(
      fullName = "knownCNVs",
      shortName = "knownCNVs",
      doc =
          "File containing tribble-readable features describing a known list of copy number variants",
      required = false)
  public IntervalBinding<Feature> knownCNVsFile = null;

  Map<String, IntervalTree<GenomeLoc>> knownCNVsByContig = Collections.emptyMap();

  // Variables
  private Set<SortableJexlVCMatchExp> jexlExpressions = new TreeSet<SortableJexlVCMatchExp>();

  private Set<String> sampleNamesForEvaluation = new TreeSet<String>();
  private Set<String> sampleNamesForStratification = new TreeSet<String>();
  private int numSamples = 0;

  // The list of stratifiers and evaluators to use
  private TreeSet<VariantStratifier> stratificationObjects = null;

  // The set of all possible evaluation contexts
  private HashMap<StateKey, NewEvaluationContext> evaluationContexts = null;

  // important stratifications
  private boolean byFilterIsEnabled = false;
  private boolean perSampleIsEnabled = false;

  // Output report
  private GATKReport report = null;

  // Public constants
  private static String ALL_SAMPLE_NAME = "all";

  // Utility class
  private final VariantEvalUtils variantEvalUtils = new VariantEvalUtils(this);

  // Ancestral alignments
  private IndexedFastaSequenceFile ancestralAlignments = null;

  /** Initialize the stratifications, evaluations, evaluation contexts, and reporting object */
  public void initialize() {
    // Just list the modules, and exit quickly.
    if (LIST) {
      variantEvalUtils.listModulesAndExit();
    }

    // maintain the full list of comps
    comps.addAll(compsProvided);
    if (dbsnp.dbsnp.isBound()) {
      comps.add(dbsnp.dbsnp);
      knowns.add(dbsnp.dbsnp);
    }

    // Add a dummy comp track if none exists
    if (comps.size() == 0)
      comps.add(
          new RodBinding<VariantContext>(VariantContext.class, "none", "UNBOUND", "", new Tags()));

    // Set up set of additional knowns
    for (RodBinding<VariantContext> compRod : comps) {
      if (KNOWN_NAMES.contains(compRod.getName())) knowns.add(compRod);
    }

    // Now that we have all the rods categorized, determine the sample list from the eval rods.
    Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), evals);
    Set<String> vcfSamples =
        SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);

    // Load the sample list
    sampleNamesForEvaluation.addAll(
        SampleUtils.getSamplesFromCommandLineInput(vcfSamples, SAMPLE_EXPRESSIONS));
    numSamples = NUM_SAMPLES > 0 ? NUM_SAMPLES : sampleNamesForEvaluation.size();

    if (Arrays.asList(STRATIFICATIONS_TO_USE).contains("Sample")) {
      sampleNamesForStratification.addAll(sampleNamesForEvaluation);
    }
    sampleNamesForStratification.add(ALL_SAMPLE_NAME);

    // Initialize select expressions
    for (VariantContextUtils.JexlVCMatchExp jexl :
        VariantContextUtils.initializeMatchExps(SELECT_NAMES, SELECT_EXPS)) {
      SortableJexlVCMatchExp sjexl = new SortableJexlVCMatchExp(jexl.name, jexl.exp);
      jexlExpressions.add(sjexl);
    }

    // Initialize the set of stratifications and evaluations to use
    stratificationObjects =
        variantEvalUtils.initializeStratificationObjects(
            this, NO_STANDARD_STRATIFICATIONS, STRATIFICATIONS_TO_USE);
    Set<Class<? extends VariantEvaluator>> evaluationObjects =
        variantEvalUtils.initializeEvaluationObjects(NO_STANDARD_MODULES, MODULES_TO_USE);
    for (VariantStratifier vs : getStratificationObjects()) {
      if (vs.getName().equals("Filter")) byFilterIsEnabled = true;
      else if (vs.getName().equals("Sample")) perSampleIsEnabled = true;
    }

    if (intervalsFile != null) {
      boolean fail = true;
      for (final VariantStratifier vs : stratificationObjects) {
        if (vs.getClass().equals(IntervalStratification.class)) fail = false;
      }
      if (fail)
        throw new UserException.BadArgumentValue(
            "ST", "stratIntervals argument provided but -ST IntervalStratification not provided");
    }

    // Initialize the evaluation contexts
    evaluationContexts =
        variantEvalUtils.initializeEvaluationContexts(
            stratificationObjects, evaluationObjects, null, null);

    // Initialize report table
    report = variantEvalUtils.initializeGATKReport(stratificationObjects, evaluationObjects);

    // Load ancestral alignments
    if (ancestralAlignmentsFile != null) {
      try {
        ancestralAlignments = new IndexedFastaSequenceFile(ancestralAlignmentsFile);
      } catch (FileNotFoundException e) {
        throw new ReviewedStingException(
            String.format(
                "The ancestral alignments file, '%s', could not be found",
                ancestralAlignmentsFile.getAbsolutePath()));
      }
    }

    // initialize CNVs
    if (knownCNVsFile != null) {
      knownCNVsByContig = createIntervalTreeByContig(knownCNVsFile);
    }
  }

  public final Map<String, IntervalTree<GenomeLoc>> createIntervalTreeByContig(
      final IntervalBinding<Feature> intervals) {
    final Map<String, IntervalTree<GenomeLoc>> byContig =
        new HashMap<String, IntervalTree<GenomeLoc>>();

    final List<GenomeLoc> locs = intervals.getIntervals(getToolkit());

    // set up the map from contig -> interval tree
    for (final String contig : getContigNames())
      byContig.put(contig, new IntervalTree<GenomeLoc>());

    for (final GenomeLoc loc : locs) {
      byContig.get(loc.getContig()).put(loc.getStart(), loc.getStop(), loc);
    }

    return byContig;
  }

  /** Collect relevant information from each variant in the supplied VCFs */
  @Override
  public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
    for (NewEvaluationContext nec : evaluationContexts.values()) {
      synchronized (nec) {
        nec.update0(tracker, ref, context);
      }
    }

    if (tracker != null) {
      String aastr =
          (ancestralAlignments == null)
              ? null
              : new String(
                  ancestralAlignments
                      .getSubsequenceAt(
                          ref.getLocus().getContig(),
                          ref.getLocus().getStart(),
                          ref.getLocus().getStop())
                      .getBases());

      //      --------- track ---------           sample  - VariantContexts -
      HashMap<RodBinding<VariantContext>, HashMap<String, Collection<VariantContext>>> evalVCs =
          variantEvalUtils.bindVariantContexts(
              tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled, mergeEvals);
      HashMap<RodBinding<VariantContext>, HashMap<String, Collection<VariantContext>>> compVCs =
          variantEvalUtils.bindVariantContexts(
              tracker, ref, comps, byFilterIsEnabled, false, false, false);

      // for each eval track
      for (final RodBinding<VariantContext> evalRod : evals) {
        final Map<String, Collection<VariantContext>> emptyEvalMap = Collections.emptyMap();
        final Map<String, Collection<VariantContext>> evalSet =
            evalVCs.containsKey(evalRod) ? evalVCs.get(evalRod) : emptyEvalMap;

        // for each sample stratifier
        for (final String sampleName : sampleNamesForStratification) {
          Collection<VariantContext> evalSetBySample = evalSet.get(sampleName);
          if (evalSetBySample == null) {
            evalSetBySample = new HashSet<VariantContext>(1);
            evalSetBySample.add(null);
          }

          // for each eval in the track
          for (VariantContext eval : evalSetBySample) {
            // deal with ancestral alleles if requested
            if (eval != null && aastr != null) {
              eval = new VariantContextBuilder(eval).attribute("ANCESTRALALLELE", aastr).make();
            }

            // for each comp track
            for (final RodBinding<VariantContext> compRod : comps) {
              // no sample stratification for comps
              final HashMap<String, Collection<VariantContext>> compSetHash = compVCs.get(compRod);
              final Collection<VariantContext> compSet =
                  (compSetHash == null || compSetHash.size() == 0)
                      ? Collections.<VariantContext>emptyList()
                      : compVCs.get(compRod).values().iterator().next();

              // find the comp
              final VariantContext comp = findMatchingComp(eval, compSet);

              HashMap<VariantStratifier, List<String>> stateMap =
                  new HashMap<VariantStratifier, List<String>>();
              for (VariantStratifier vs : stratificationObjects) {
                List<String> states =
                    vs.getRelevantStates(
                        ref, tracker, comp, compRod.getName(), eval, evalRod.getName(), sampleName);
                stateMap.put(vs, states);
              }

              ArrayList<StateKey> stateKeys = new ArrayList<StateKey>();
              variantEvalUtils.initializeStateKeys(stateMap, null, null, stateKeys);

              HashSet<StateKey> stateKeysHash = new HashSet<StateKey>(stateKeys);

              for (StateKey stateKey : stateKeysHash) {
                NewEvaluationContext nec = evaluationContexts.get(stateKey);

                // eval against the comp
                synchronized (nec) {
                  nec.apply(tracker, ref, context, comp, eval);
                }

                // eval=null against all comps of different type that aren't bound to another eval
                for (VariantContext otherComp : compSet) {
                  if (otherComp != comp && !compHasMatchingEval(otherComp, evalSetBySample)) {
                    synchronized (nec) {
                      nec.apply(tracker, ref, context, otherComp, null);
                    }
                  }
                }
              }
            }
          }
        }

        if (mergeEvals) break; // stop processing the eval tracks
      }
    }

    return null;
  }

  @Requires({"comp != null", "evals != null"})
  private boolean compHasMatchingEval(
      final VariantContext comp, final Collection<VariantContext> evals) {
    // find all of the matching comps
    for (final VariantContext eval : evals) {
      if (eval != null
          && doEvalAndCompMatch(comp, eval, requireStrictAlleleMatch) != EvalCompMatchType.NO_MATCH)
        return true;
    }

    // nothing matched
    return false;
  }

  private enum EvalCompMatchType {
    NO_MATCH,
    STRICT,
    LENIENT
  }

  @Requires({"eval != null", "comp != null"})
  private EvalCompMatchType doEvalAndCompMatch(
      final VariantContext eval, final VariantContext comp, boolean requireStrictAlleleMatch) {
    // find all of the matching comps
    if (comp.getType() != eval.getType()) return EvalCompMatchType.NO_MATCH;

    // find the comp which matches both the reference allele and alternate allele from eval
    final Allele altEval =
        eval.getAlternateAlleles().size() == 0 ? null : eval.getAlternateAllele(0);
    final Allele altComp =
        comp.getAlternateAlleles().size() == 0 ? null : comp.getAlternateAllele(0);
    if ((altEval == null && altComp == null)
        || (altEval != null
            && altEval.equals(altComp)
            && eval.getReference().equals(comp.getReference()))) return EvalCompMatchType.STRICT;
    else return requireStrictAlleleMatch ? EvalCompMatchType.NO_MATCH : EvalCompMatchType.LENIENT;
  }

  private VariantContext findMatchingComp(
      final VariantContext eval, final Collection<VariantContext> comps) {
    // if no comps, return null
    if (comps == null || comps.isEmpty()) return null;

    // if no eval, return any comp
    if (eval == null) return comps.iterator().next();

    // find all of the matching comps
    VariantContext lenientMatch = null;
    for (final VariantContext comp : comps) {
      switch (doEvalAndCompMatch(comp, eval, requireStrictAlleleMatch)) {
        case STRICT:
          return comp;
        case LENIENT:
          if (lenientMatch == null) lenientMatch = comp;
          break;
        case NO_MATCH:;
      }
    }

    // nothing matched, just return lenientMatch, which might be null
    return lenientMatch;
  }

  public Integer treeReduce(Integer lhs, Integer rhs) {
    return null;
  }

  @Override
  public Integer reduceInit() {
    return null;
  }

  @Override
  public Integer reduce(Integer value, Integer sum) {
    return null;
  }

  /**
   * Output the finalized report
   *
   * @param result an integer that doesn't get used for anything
   */
  public void onTraversalDone(Integer result) {
    logger.info("Finalizing variant report");

    for (StateKey stateKey : evaluationContexts.keySet()) {
      NewEvaluationContext nec = evaluationContexts.get(stateKey);

      for (VariantEvaluator ve : nec.getEvaluationClassList().values()) {
        ve.finalizeEvaluation();

        AnalysisModuleScanner scanner = new AnalysisModuleScanner(ve);
        Map<Field, DataPoint> datamap = scanner.getData();

        for (Field field : datamap.keySet()) {
          try {
            field.setAccessible(true);

            if (field.get(ve) instanceof TableType) {
              TableType t = (TableType) field.get(ve);

              String subTableName = ve.getClass().getSimpleName() + "." + field.getName();
              final DataPoint dataPointAnn = datamap.get(field);

              GATKReportTable table;
              if (!report.hasTable(subTableName)) {
                report.addTable(subTableName, dataPointAnn.description());
                table = report.getTable(subTableName);

                table.addPrimaryKey("entry", false);
                table.addColumn(subTableName, subTableName);

                for (VariantStratifier vs : stratificationObjects) {
                  table.addColumn(vs.getName(), "unknown");
                }

                table.addColumn("row", "unknown");

                for (Object o : t.getColumnKeys()) {
                  String c;

                  if (o instanceof String) {
                    c = (String) o;
                  } else {
                    c = o.toString();
                  }

                  table.addColumn(c, 0.0);
                }
              } else {
                table = report.getTable(subTableName);
              }

              for (int row = 0; row < t.getRowKeys().length; row++) {
                String r = (String) t.getRowKeys()[row];

                for (VariantStratifier vs : stratificationObjects) {
                  final String columnName = vs.getName();
                  table.set(stateKey.toString() + r, columnName, stateKey.get(columnName));
                }

                for (int col = 0; col < t.getColumnKeys().length; col++) {
                  String c;
                  if (t.getColumnKeys()[col] instanceof String) {
                    c = (String) t.getColumnKeys()[col];
                  } else {
                    c = t.getColumnKeys()[col].toString();
                  }

                  String newStateKey = stateKey.toString() + r;
                  table.set(newStateKey, c, t.getCell(row, col));

                  table.set(newStateKey, "row", r);
                }
              }
            } else {
              GATKReportTable table = report.getTable(ve.getClass().getSimpleName());

              for (VariantStratifier vs : stratificationObjects) {
                String columnName = vs.getName();

                table.set(stateKey.toString(), columnName, stateKey.get(vs.getName()));
              }

              table.set(stateKey.toString(), field.getName(), field.get(ve));
            }
          } catch (IllegalAccessException e) {
            throw new StingException("IllegalAccessException: " + e);
          }
        }
      }
    }

    report.print(out);
  }

  // Accessors
  public Logger getLogger() {
    return logger;
  }

  public int getNumSamples() {
    return numSamples;
  }

  public double getMinPhaseQuality() {
    return MIN_PHASE_QUALITY;
  }

  public double getMendelianViolationQualThreshold() {
    return MENDELIAN_VIOLATION_QUAL_THRESHOLD;
  }

  public TreeSet<VariantStratifier> getStratificationObjects() {
    return stratificationObjects;
  }

  public static String getAllSampleName() {
    return ALL_SAMPLE_NAME;
  }

  public List<RodBinding<VariantContext>> getKnowns() {
    return knowns;
  }

  public List<RodBinding<VariantContext>> getEvals() {
    return evals;
  }

  public Set<String> getSampleNamesForEvaluation() {
    return sampleNamesForEvaluation;
  }

  public Set<String> getSampleNamesForStratification() {
    return sampleNamesForStratification;
  }

  public List<RodBinding<VariantContext>> getComps() {
    return comps;
  }

  public Set<SortableJexlVCMatchExp> getJexlExpressions() {
    return jexlExpressions;
  }

  public Set<String> getContigNames() {
    final TreeSet<String> contigs = new TreeSet<String>();
    for (final SAMSequenceRecord r :
        getToolkit()
            .getReferenceDataSource()
            .getReference()
            .getSequenceDictionary()
            .getSequences()) {
      contigs.add(r.getSequenceName());
    }
    return contigs;
  }

  public GenomeLocParser getGenomeLocParser() {
    return getToolkit().getGenomeLocParser();
  }

  public GenomeAnalysisEngine getToolkit() {
    return super.getToolkit();
  }
}
Пример #2
0
  /** Collect relevant information from each variant in the supplied VCFs */
  @Override
  public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
    // we track the processed bp and expose this for modules instead of wasting CPU power on
    // calculating
    // the same thing over and over in evals that want the processed bp
    synchronized (this) {
      nProcessedLoci += context.getSkippedBases() + (ref == null ? 0 : 1);
    }

    if (tracker != null) {
      String aastr =
          (ancestralAlignments == null)
              ? null
              : new String(
                  ancestralAlignments
                      .getSubsequenceAt(
                          ref.getLocus().getContig(),
                          ref.getLocus().getStart(),
                          ref.getLocus().getStop())
                      .getBases());

      //            // update the dynamic stratifications
      //            for (final VariantContext vc : tracker.getValues(evals, ref.getLocus())) {
      //                // don't worry -- DynamicStratification only work with one eval object
      //                for ( final DynamicStratification ds :  dynamicStratifications ) {
      //                    ds.update(vc);
      //                }
      //            }

      //      --------- track ---------           sample  - VariantContexts -
      HashMap<RodBinding<VariantContext>, HashMap<String, Collection<VariantContext>>> evalVCs =
          variantEvalUtils.bindVariantContexts(
              tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled, mergeEvals);
      HashMap<RodBinding<VariantContext>, HashMap<String, Collection<VariantContext>>> compVCs =
          variantEvalUtils.bindVariantContexts(
              tracker, ref, comps, byFilterIsEnabled, false, false, false);

      // for each eval track
      for (final RodBinding<VariantContext> evalRod : evals) {
        final Map<String, Collection<VariantContext>> emptyEvalMap = Collections.emptyMap();
        final Map<String, Collection<VariantContext>> evalSet =
            evalVCs.containsKey(evalRod) ? evalVCs.get(evalRod) : emptyEvalMap;

        // for each sample stratifier
        for (final String sampleName : sampleNamesForStratification) {
          Collection<VariantContext> evalSetBySample = evalSet.get(sampleName);
          if (evalSetBySample == null) {
            evalSetBySample = new HashSet<VariantContext>(1);
            evalSetBySample.add(null);
          }

          // for each eval in the track
          for (VariantContext eval : evalSetBySample) {
            // deal with ancestral alleles if requested
            if (eval != null && aastr != null) {
              eval = new VariantContextBuilder(eval).attribute("ANCESTRALALLELE", aastr).make();
            }

            // for each comp track
            for (final RodBinding<VariantContext> compRod : comps) {
              // no sample stratification for comps
              final HashMap<String, Collection<VariantContext>> compSetHash = compVCs.get(compRod);
              final Collection<VariantContext> compSet =
                  (compSetHash == null || compSetHash.size() == 0)
                      ? Collections.<VariantContext>emptyList()
                      : compVCs.get(compRod).values().iterator().next();

              // find the comp
              final VariantContext comp = findMatchingComp(eval, compSet);

              for (EvaluationContext nec :
                  getEvaluationContexts(
                      tracker, ref, eval, evalRod.getName(), comp, compRod.getName(), sampleName)) {

                // eval against the comp
                synchronized (nec) {
                  nec.apply(tracker, ref, context, comp, eval);
                }

                // eval=null against all comps of different type that aren't bound to another eval
                for (VariantContext otherComp : compSet) {
                  if (otherComp != comp && !compHasMatchingEval(otherComp, evalSetBySample)) {
                    synchronized (nec) {
                      nec.apply(tracker, ref, context, otherComp, null);
                    }
                  }
                }
              }
            }
          }
        }

        if (mergeEvals) break; // stop processing the eval tracks
      }
    }

    return null;
  }
  /** Collect relevant information from each variant in the supplied VCFs */
  @Override
  public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
    for (NewEvaluationContext nec : evaluationContexts.values()) {
      synchronized (nec) {
        nec.update0(tracker, ref, context);
      }
    }

    if (tracker != null) {
      String aastr =
          (ancestralAlignments == null)
              ? null
              : new String(
                  ancestralAlignments
                      .getSubsequenceAt(
                          ref.getLocus().getContig(),
                          ref.getLocus().getStart(),
                          ref.getLocus().getStop())
                      .getBases());

      //      --------- track ---------           sample  - VariantContexts -
      HashMap<RodBinding<VariantContext>, HashMap<String, Collection<VariantContext>>> evalVCs =
          variantEvalUtils.bindVariantContexts(
              tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled, mergeEvals);
      HashMap<RodBinding<VariantContext>, HashMap<String, Collection<VariantContext>>> compVCs =
          variantEvalUtils.bindVariantContexts(
              tracker, ref, comps, byFilterIsEnabled, false, false, false);

      // for each eval track
      for (final RodBinding<VariantContext> evalRod : evals) {
        final Map<String, Collection<VariantContext>> emptyEvalMap = Collections.emptyMap();
        final Map<String, Collection<VariantContext>> evalSet =
            evalVCs.containsKey(evalRod) ? evalVCs.get(evalRod) : emptyEvalMap;

        // for each sample stratifier
        for (final String sampleName : sampleNamesForStratification) {
          Collection<VariantContext> evalSetBySample = evalSet.get(sampleName);
          if (evalSetBySample == null) {
            evalSetBySample = new HashSet<VariantContext>(1);
            evalSetBySample.add(null);
          }

          // for each eval in the track
          for (VariantContext eval : evalSetBySample) {
            // deal with ancestral alleles if requested
            if (eval != null && aastr != null) {
              eval = new VariantContextBuilder(eval).attribute("ANCESTRALALLELE", aastr).make();
            }

            // for each comp track
            for (final RodBinding<VariantContext> compRod : comps) {
              // no sample stratification for comps
              final HashMap<String, Collection<VariantContext>> compSetHash = compVCs.get(compRod);
              final Collection<VariantContext> compSet =
                  (compSetHash == null || compSetHash.size() == 0)
                      ? Collections.<VariantContext>emptyList()
                      : compVCs.get(compRod).values().iterator().next();

              // find the comp
              final VariantContext comp = findMatchingComp(eval, compSet);

              HashMap<VariantStratifier, List<String>> stateMap =
                  new HashMap<VariantStratifier, List<String>>();
              for (VariantStratifier vs : stratificationObjects) {
                List<String> states =
                    vs.getRelevantStates(
                        ref, tracker, comp, compRod.getName(), eval, evalRod.getName(), sampleName);
                stateMap.put(vs, states);
              }

              ArrayList<StateKey> stateKeys = new ArrayList<StateKey>();
              variantEvalUtils.initializeStateKeys(stateMap, null, null, stateKeys);

              HashSet<StateKey> stateKeysHash = new HashSet<StateKey>(stateKeys);

              for (StateKey stateKey : stateKeysHash) {
                NewEvaluationContext nec = evaluationContexts.get(stateKey);

                // eval against the comp
                synchronized (nec) {
                  nec.apply(tracker, ref, context, comp, eval);
                }

                // eval=null against all comps of different type that aren't bound to another eval
                for (VariantContext otherComp : compSet) {
                  if (otherComp != comp && !compHasMatchingEval(otherComp, evalSetBySample)) {
                    synchronized (nec) {
                      nec.apply(tracker, ref, context, otherComp, null);
                    }
                  }
                }
              }
            }
          }
        }

        if (mergeEvals) break; // stop processing the eval tracks
      }
    }

    return null;
  }
Пример #4
0
/**
 * General-purpose tool for variant evaluation (% in dbSNP, genotype concordance, Ti/Tv ratios, and
 * a lot more)
 *
 * <p>Given a variant callset, it is common to calculate various quality control metrics. These
 * metrics include the number of raw or filtered SNP counts; ratio of transition mutations to
 * transversions; concordance of a particular sample's calls to a genotyping chip; number of
 * singletons per sample; etc. Furthermore, it is often useful to stratify these metrics by various
 * criteria like functional class (missense, nonsense, silent), whether the site is CpG site, the
 * amino acid degeneracy of the site, etc. VariantEval facilitates these calculations in two ways:
 * by providing several built-in evaluation and stratification modules, and by providing a framework
 * that permits the easy development of new evaluation and stratification modules.
 *
 * <h2>Input</h2>
 *
 * <p>One or more variant sets to evaluate plus any number of comparison sets.
 *
 * <h2>Output</h2>
 *
 * <p>Evaluation tables detailing the results of the eval modules which were applied. For example:
 *
 * <pre>
 * output.eval.gatkreport:
 * ##:GATKReport.v0.1 CountVariants : Counts different classes of variants in the sample
 * CountVariants  CompRod   CpG      EvalRod  JexlExpression  Novelty  nProcessedLoci  nCalledLoci  nRefLoci  nVariantLoci  variantRate ...
 * CountVariants  dbsnp     CpG      eval     none            all      65900028        135770       0         135770        0.00206024  ...
 * CountVariants  dbsnp     CpG      eval     none            known    65900028        47068        0         47068         0.00071423  ...
 * CountVariants  dbsnp     CpG      eval     none            novel    65900028        88702        0         88702         0.00134601  ...
 * CountVariants  dbsnp     all      eval     none            all      65900028        330818       0         330818        0.00502000  ...
 * CountVariants  dbsnp     all      eval     none            known    65900028        120685       0         120685        0.00183133  ...
 * CountVariants  dbsnp     all      eval     none            novel    65900028        210133       0         210133        0.00318866  ...
 * CountVariants  dbsnp     non_CpG  eval     none            all      65900028        195048       0         195048        0.00295976  ...
 * CountVariants  dbsnp     non_CpG  eval     none            known    65900028        73617        0         73617         0.00111710  ...
 * CountVariants  dbsnp     non_CpG  eval     none            novel    65900028        121431       0         121431        0.00184265  ...
 * ...
 * </pre>
 *
 * <h2>Examples</h2>
 *
 * <pre>
 * java -Xmx2g -jar GenomeAnalysisTK.jar \
 *   -R ref.fasta \
 *   -T VariantEval \
 *   -o output.eval.gatkreport \
 *   --eval:set1 set1.vcf \
 *   --eval:set2 set2.vcf \
 *   [--comp comp.vcf]
 * </pre>
 */
@DocumentedGATKFeature(
    groupName = HelpConstants.DOCS_CAT_VARMANIP,
    extraDocs = {CommandLineGATK.class})
@Reference(window = @Window(start = -50, stop = 50))
@PartitionBy(PartitionType.NONE)
public class VariantEval extends RodWalker<Integer, Integer> implements TreeReducible<Integer> {
  public static final String IS_SINGLETON_KEY = "ISSINGLETON";

  @Output protected PrintStream out;

  /** The variant file(s) to evaluate. */
  @Input(fullName = "eval", shortName = "eval", doc = "Input evaluation file(s)", required = true)
  public List<RodBinding<VariantContext>> evals;

  /** The variant file(s) to compare against. */
  @Input(fullName = "comp", shortName = "comp", doc = "Input comparison file(s)", required = false)
  public List<RodBinding<VariantContext>> compsProvided = Collections.emptyList();

  private List<RodBinding<VariantContext>> comps = new ArrayList<RodBinding<VariantContext>>();

  /**
   * dbSNP comparison VCF. By default, the dbSNP file is used to specify the set of "known"
   * variants. Other sets can be specified with the -knownName (--known_names) argument.
   */
  @ArgumentCollection protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();

  /**
   * Some analyses want to count overlap not with dbSNP (which is in general very open) but actually
   * want to itemize their overlap specifically with a set of gold standard sites such as HapMap,
   * OMNI, or the gold standard indels. This argument provides a mechanism for communicating which
   * file to use
   */
  @Input(
      fullName = "goldStandard",
      shortName = "gold",
      doc =
          "Evaluations that count calls at sites of true variation (e.g., indel calls) will use this argument as their gold standard for comparison",
      required = false)
  public RodBinding<VariantContext> goldStandard = null;

  /** Note that the --list argument requires a fully resolved and correct command-line to work. */
  @Argument(
      fullName = "list",
      shortName = "ls",
      doc = "List the available eval modules and exit",
      required = false)
  protected Boolean LIST = false;

  // Partitioning the data arguments
  @Argument(
      shortName = "select",
      doc = "One or more stratifications to use when evaluating the data",
      required = false)
  protected ArrayList<String> SELECT_EXPS = new ArrayList<String>();

  @Argument(
      shortName = "selectName",
      doc = "Names to use for the list of stratifications (must be a 1-to-1 mapping)",
      required = false)
  protected ArrayList<String> SELECT_NAMES = new ArrayList<String>();

  @Argument(
      fullName = "sample",
      shortName = "sn",
      doc =
          "Derive eval and comp contexts using only these sample genotypes, when genotypes are available in the original context",
      required = false)
  protected Set<String> SAMPLE_EXPRESSIONS;

  /** List of rod tracks to be used for specifying "known" variants other than dbSNP. */
  @Argument(
      shortName = "knownName",
      doc =
          "Name of ROD bindings containing variant sites that should be treated as known when splitting eval rods into known and novel subsets",
      required = false)
  protected HashSet<String> KNOWN_NAMES = new HashSet<String>();

  List<RodBinding<VariantContext>> knowns = new ArrayList<RodBinding<VariantContext>>();

  // Stratification arguments
  @Argument(
      fullName = "stratificationModule",
      shortName = "ST",
      doc =
          "One or more specific stratification modules to apply to the eval track(s) (in addition to the standard stratifications, unless -noS is specified)",
      required = false)
  protected String[] STRATIFICATIONS_TO_USE = {};

  @Argument(
      fullName = "doNotUseAllStandardStratifications",
      shortName = "noST",
      doc =
          "Do not use the standard stratification modules by default (instead, only those that are specified with the -S option)",
      required = false)
  protected Boolean NO_STANDARD_STRATIFICATIONS = false;

  /** See the -list argument to view available modules. */
  @Argument(
      fullName = "evalModule",
      shortName = "EV",
      doc =
          "One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noEV is specified)",
      required = false)
  protected String[] MODULES_TO_USE = {};

  @Argument(
      fullName = "doNotUseAllStandardModules",
      shortName = "noEV",
      doc =
          "Do not use the standard modules by default (instead, only those that are specified with the -EV option)",
      required = false)
  protected Boolean NO_STANDARD_MODULES = false;

  @Argument(
      fullName = "minPhaseQuality",
      shortName = "mpq",
      doc = "Minimum phasing quality",
      required = false)
  protected double MIN_PHASE_QUALITY = 10.0;

  @Argument(
      shortName = "mvq",
      fullName = "mendelianViolationQualThreshold",
      doc =
          "Minimum genotype QUAL score for each trio member required to accept a site as a violation. Default is 50.",
      required = false)
  protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 50;

  @Argument(
      shortName = "ploidy",
      fullName = "samplePloidy",
      doc = "Per-sample ploidy (number of chromosomes per sample)",
      required = false)
  protected int ploidy = GATKVariantContextUtils.DEFAULT_PLOIDY;

  @Argument(
      fullName = "ancestralAlignments",
      shortName = "aa",
      doc = "Fasta file with ancestral alleles",
      required = false)
  private File ancestralAlignmentsFile = null;

  @Argument(
      fullName = "requireStrictAlleleMatch",
      shortName = "strict",
      doc =
          "If provided only comp and eval tracks with exactly matching reference and alternate alleles will be counted as overlapping",
      required = false)
  private boolean requireStrictAlleleMatch = false;

  @Argument(
      fullName = "keepAC0",
      shortName = "keepAC0",
      doc =
          "If provided, modules that track polymorphic sites will not require that a site have AC > 0 when the input eval has genotypes",
      required = false)
  private boolean keepSitesWithAC0 = false;

  @Hidden
  @Argument(
      fullName = "numSamples",
      shortName = "numSamples",
      doc =
          "If provided, modules that track polymorphic sites will not require that a site have AC > 0 when the input eval has genotypes",
      required = false)
  private int numSamplesFromArgument = 0;

  /**
   * If true, VariantEval will treat -eval 1 -eval 2 as separate tracks from the same underlying
   * variant set, and evaluate the union of the results. Useful when you want to do -eval chr1.vcf
   * -eval chr2.vcf etc.
   */
  @Argument(
      fullName = "mergeEvals",
      shortName = "mergeEvals",
      doc = "If provided, all -eval tracks will be merged into a single eval track",
      required = false)
  public boolean mergeEvals = false;

  /** File containing tribble-readable features for the IntervalStratificiation */
  @Input(
      fullName = "stratIntervals",
      shortName = "stratIntervals",
      doc = "File containing tribble-readable features for the IntervalStratificiation",
      required = false)
  public IntervalBinding<Feature> intervalsFile = null;

  /**
   * File containing tribble-readable features containing known CNVs. For use with VariantSummary
   * table.
   */
  @Input(
      fullName = "knownCNVs",
      shortName = "knownCNVs",
      doc =
          "File containing tribble-readable features describing a known list of copy number variants",
      required = false)
  public IntervalBinding<Feature> knownCNVsFile = null;

  Map<String, IntervalTree<GenomeLoc>> knownCNVsByContig = Collections.emptyMap();

  // Variables
  private Set<SortableJexlVCMatchExp> jexlExpressions = new TreeSet<SortableJexlVCMatchExp>();

  private boolean isSubsettingSamples;
  private Set<String> sampleNamesForEvaluation = new LinkedHashSet<String>();
  private Set<String> sampleNamesForStratification = new LinkedHashSet<String>();

  // important stratifications
  private boolean byFilterIsEnabled = false;
  private boolean perSampleIsEnabled = false;

  // Public constants
  private static String ALL_SAMPLE_NAME = "all";

  // the number of processed bp for this walker
  long nProcessedLoci = 0;

  // Utility class
  private final VariantEvalUtils variantEvalUtils = new VariantEvalUtils(this);

  // Ancestral alignments
  private IndexedFastaSequenceFile ancestralAlignments = null;

  // The set of all possible evaluation contexts
  StratificationManager<VariantStratifier, EvaluationContext> stratManager;
  // Set<DynamicStratification> dynamicStratifications = Collections.emptySet();

  /** Initialize the stratifications, evaluations, evaluation contexts, and reporting object */
  public void initialize() {
    // Just list the modules, and exit quickly.
    if (LIST) {
      variantEvalUtils.listModulesAndExit();
    }

    // maintain the full list of comps
    comps.addAll(compsProvided);
    if (dbsnp.dbsnp.isBound()) {
      comps.add(dbsnp.dbsnp);
      knowns.add(dbsnp.dbsnp);
    }

    // Add a dummy comp track if none exists
    if (comps.size() == 0)
      comps.add(
          new RodBinding<VariantContext>(VariantContext.class, "none", "UNBOUND", "", new Tags()));

    // Set up set of additional knowns
    for (RodBinding<VariantContext> compRod : comps) {
      if (KNOWN_NAMES.contains(compRod.getName())) knowns.add(compRod);
    }

    // Now that we have all the rods categorized, determine the sample list from the eval rods.
    Map<String, VCFHeader> vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), evals);
    Set<String> vcfSamples =
        SampleUtils.getSampleList(
            vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);

    // Load the sample list, using an intermediate tree set to sort the samples
    final Set<String> allSampleNames = SampleUtils.getSamplesFromCommandLineInput(vcfSamples);
    sampleNamesForEvaluation.addAll(
        new TreeSet<String>(
            SampleUtils.getSamplesFromCommandLineInput(vcfSamples, SAMPLE_EXPRESSIONS)));
    isSubsettingSamples = !sampleNamesForEvaluation.containsAll(allSampleNames);

    if (Arrays.asList(STRATIFICATIONS_TO_USE).contains("Sample")) {
      sampleNamesForStratification.addAll(sampleNamesForEvaluation);
    }
    sampleNamesForStratification.add(ALL_SAMPLE_NAME);

    // Initialize select expressions
    for (VariantContextUtils.JexlVCMatchExp jexl :
        VariantContextUtils.initializeMatchExps(SELECT_NAMES, SELECT_EXPS)) {
      SortableJexlVCMatchExp sjexl = new SortableJexlVCMatchExp(jexl.name, jexl.exp);
      jexlExpressions.add(sjexl);
    }

    // Initialize the set of stratifications and evaluations to use
    // The list of stratifiers and evaluators to use
    final List<VariantStratifier> stratificationObjects =
        variantEvalUtils.initializeStratificationObjects(
            NO_STANDARD_STRATIFICATIONS, STRATIFICATIONS_TO_USE);
    final Set<Class<? extends VariantEvaluator>> evaluationClasses =
        variantEvalUtils.initializeEvaluationObjects(NO_STANDARD_MODULES, MODULES_TO_USE);

    checkForIncompatibleEvaluatorsAndStratifiers(stratificationObjects, evaluationClasses);

    for (VariantStratifier vs : stratificationObjects) {
      if (vs.getName().equals("Filter")) byFilterIsEnabled = true;
      else if (vs.getName().equals("Sample")) perSampleIsEnabled = true;
    }

    if (intervalsFile != null) {
      boolean fail = true;
      for (final VariantStratifier vs : stratificationObjects) {
        if (vs.getClass().equals(IntervalStratification.class)) fail = false;
      }
      if (fail)
        throw new UserException.BadArgumentValue(
            "ST", "stratIntervals argument provided but -ST IntervalStratification not provided");
    }

    // Initialize the evaluation contexts
    createStratificationStates(stratificationObjects, evaluationClasses);

    // Load ancestral alignments
    if (ancestralAlignmentsFile != null) {
      try {
        ancestralAlignments = new IndexedFastaSequenceFile(ancestralAlignmentsFile);
      } catch (FileNotFoundException e) {
        throw new ReviewedStingException(
            String.format(
                "The ancestral alignments file, '%s', could not be found",
                ancestralAlignmentsFile.getAbsolutePath()));
      }
    }

    // initialize CNVs
    if (knownCNVsFile != null) {
      knownCNVsByContig = createIntervalTreeByContig(knownCNVsFile);
    }
  }

  final void checkForIncompatibleEvaluatorsAndStratifiers(
      final List<VariantStratifier> stratificationObjects,
      Set<Class<? extends VariantEvaluator>> evaluationClasses) {
    for (final VariantStratifier vs : stratificationObjects) {
      for (Class<? extends VariantEvaluator> ec : evaluationClasses)
        if (vs.getIncompatibleEvaluators().contains(ec))
          throw new UserException.BadArgumentValue(
              "ST and ET",
              "The selected stratification "
                  + vs.getName()
                  + " and evaluator "
                  + ec.getSimpleName()
                  + " are incompatible due to combinatorial memory requirements."
                  + " Please disable one");
    }
  }

  final void createStratificationStates(
      final List<VariantStratifier> stratificationObjects,
      final Set<Class<? extends VariantEvaluator>> evaluationObjects) {
    final List<VariantStratifier> strats = new ArrayList<VariantStratifier>(stratificationObjects);
    stratManager = new StratificationManager<VariantStratifier, EvaluationContext>(strats);

    logger.info("Creating " + stratManager.size() + " combinatorial stratification states");
    for (int i = 0; i < stratManager.size(); i++) {
      EvaluationContext ec = new EvaluationContext(this, evaluationObjects);
      stratManager.set(i, ec);
    }
  }

  public final Map<String, IntervalTree<GenomeLoc>> createIntervalTreeByContig(
      final IntervalBinding<Feature> intervals) {
    final Map<String, IntervalTree<GenomeLoc>> byContig =
        new HashMap<String, IntervalTree<GenomeLoc>>();

    final List<GenomeLoc> locs = intervals.getIntervals(getToolkit());

    // set up the map from contig -> interval tree
    for (final String contig : getContigNames())
      byContig.put(contig, new IntervalTree<GenomeLoc>());

    for (final GenomeLoc loc : locs) {
      byContig.get(loc.getContig()).put(loc.getStart(), loc.getStop(), loc);
    }

    return byContig;
  }

  /** Collect relevant information from each variant in the supplied VCFs */
  @Override
  public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
    // we track the processed bp and expose this for modules instead of wasting CPU power on
    // calculating
    // the same thing over and over in evals that want the processed bp
    synchronized (this) {
      nProcessedLoci += context.getSkippedBases() + (ref == null ? 0 : 1);
    }

    if (tracker != null) {
      String aastr =
          (ancestralAlignments == null)
              ? null
              : new String(
                  ancestralAlignments
                      .getSubsequenceAt(
                          ref.getLocus().getContig(),
                          ref.getLocus().getStart(),
                          ref.getLocus().getStop())
                      .getBases());

      //            // update the dynamic stratifications
      //            for (final VariantContext vc : tracker.getValues(evals, ref.getLocus())) {
      //                // don't worry -- DynamicStratification only work with one eval object
      //                for ( final DynamicStratification ds :  dynamicStratifications ) {
      //                    ds.update(vc);
      //                }
      //            }

      //      --------- track ---------           sample  - VariantContexts -
      HashMap<RodBinding<VariantContext>, HashMap<String, Collection<VariantContext>>> evalVCs =
          variantEvalUtils.bindVariantContexts(
              tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled, mergeEvals);
      HashMap<RodBinding<VariantContext>, HashMap<String, Collection<VariantContext>>> compVCs =
          variantEvalUtils.bindVariantContexts(
              tracker, ref, comps, byFilterIsEnabled, false, false, false);

      // for each eval track
      for (final RodBinding<VariantContext> evalRod : evals) {
        final Map<String, Collection<VariantContext>> emptyEvalMap = Collections.emptyMap();
        final Map<String, Collection<VariantContext>> evalSet =
            evalVCs.containsKey(evalRod) ? evalVCs.get(evalRod) : emptyEvalMap;

        // for each sample stratifier
        for (final String sampleName : sampleNamesForStratification) {
          Collection<VariantContext> evalSetBySample = evalSet.get(sampleName);
          if (evalSetBySample == null) {
            evalSetBySample = new HashSet<VariantContext>(1);
            evalSetBySample.add(null);
          }

          // for each eval in the track
          for (VariantContext eval : evalSetBySample) {
            // deal with ancestral alleles if requested
            if (eval != null && aastr != null) {
              eval = new VariantContextBuilder(eval).attribute("ANCESTRALALLELE", aastr).make();
            }

            // for each comp track
            for (final RodBinding<VariantContext> compRod : comps) {
              // no sample stratification for comps
              final HashMap<String, Collection<VariantContext>> compSetHash = compVCs.get(compRod);
              final Collection<VariantContext> compSet =
                  (compSetHash == null || compSetHash.size() == 0)
                      ? Collections.<VariantContext>emptyList()
                      : compVCs.get(compRod).values().iterator().next();

              // find the comp
              final VariantContext comp = findMatchingComp(eval, compSet);

              for (EvaluationContext nec :
                  getEvaluationContexts(
                      tracker, ref, eval, evalRod.getName(), comp, compRod.getName(), sampleName)) {

                // eval against the comp
                synchronized (nec) {
                  nec.apply(tracker, ref, context, comp, eval);
                }

                // eval=null against all comps of different type that aren't bound to another eval
                for (VariantContext otherComp : compSet) {
                  if (otherComp != comp && !compHasMatchingEval(otherComp, evalSetBySample)) {
                    synchronized (nec) {
                      nec.apply(tracker, ref, context, otherComp, null);
                    }
                  }
                }
              }
            }
          }
        }

        if (mergeEvals) break; // stop processing the eval tracks
      }
    }

    return null;
  }

  /**
   * Given specific eval and comp VCs and the sample name, return an iterable over all of the
   * applicable state keys.
   *
   * <p>this code isn't structured yet for efficiency. Here we currently are doing the following
   * inefficient algorithm:
   *
   * <p>for each strat: get list of relevant states that eval and comp according to strat add this
   * list of states to a list of list states
   *
   * <p>then
   *
   * <p>ask the strat manager to look up all of the keys associated with the combinations of these
   * states. For example, suppose we have a single variant S. We have active strats EvalRod,
   * CompRod, and Novelty. We produce a list that looks like:
   *
   * <p>L = [[Eval], [Comp], [All, Novel]]
   *
   * <p>We then go through the strat manager tree to produce the keys associated with these states:
   *
   * <p>K = [0, 1] where EVAL x COMP x ALL = 0 and EVAL x COMP x NOVEL = 1
   *
   * <p>It's clear that a better
   *
   * <p>TODO -- create an inline version that doesn't create the intermediate list of list
   *
   * @param tracker
   * @param ref
   * @param eval
   * @param evalName
   * @param comp
   * @param compName
   * @param sampleName
   * @return
   */
  protected Collection<EvaluationContext> getEvaluationContexts(
      final RefMetaDataTracker tracker,
      final ReferenceContext ref,
      final VariantContext eval,
      final String evalName,
      final VariantContext comp,
      final String compName,
      final String sampleName) {
    final List<List<Object>> states = new LinkedList<List<Object>>();
    for (final VariantStratifier vs : stratManager.getStratifiers()) {
      states.add(vs.getRelevantStates(ref, tracker, comp, compName, eval, evalName, sampleName));
    }
    return stratManager.values(states);
  }

  @Requires({"comp != null", "evals != null"})
  private boolean compHasMatchingEval(
      final VariantContext comp, final Collection<VariantContext> evals) {
    // find all of the matching comps
    for (final VariantContext eval : evals) {
      if (eval != null
          && doEvalAndCompMatch(comp, eval, requireStrictAlleleMatch) != EvalCompMatchType.NO_MATCH)
        return true;
    }

    // nothing matched
    return false;
  }

  private enum EvalCompMatchType {
    NO_MATCH,
    STRICT,
    LENIENT
  }

  @Requires({"eval != null", "comp != null"})
  private EvalCompMatchType doEvalAndCompMatch(
      final VariantContext eval, final VariantContext comp, boolean requireStrictAlleleMatch) {
    if (comp.getType() == VariantContext.Type.NO_VARIATION
        || eval.getType() == VariantContext.Type.NO_VARIATION)
      // if either of these are NO_VARIATION they are LENIENT matches
      return EvalCompMatchType.LENIENT;

    if (comp.getType() != eval.getType()) return EvalCompMatchType.NO_MATCH;

    // find the comp which matches both the reference allele and alternate allele from eval
    final Allele altEval =
        eval.getAlternateAlleles().size() == 0 ? null : eval.getAlternateAllele(0);
    final Allele altComp =
        comp.getAlternateAlleles().size() == 0 ? null : comp.getAlternateAllele(0);
    if ((altEval == null && altComp == null)
        || (altEval != null
            && altEval.equals(altComp)
            && eval.getReference().equals(comp.getReference()))) return EvalCompMatchType.STRICT;
    else return requireStrictAlleleMatch ? EvalCompMatchType.NO_MATCH : EvalCompMatchType.LENIENT;
  }

  private VariantContext findMatchingComp(
      final VariantContext eval, final Collection<VariantContext> comps) {
    // if no comps, return null
    if (comps == null || comps.isEmpty()) return null;

    // if no eval, return any comp
    if (eval == null) return comps.iterator().next();

    // find all of the matching comps
    VariantContext lenientMatch = null;
    for (final VariantContext comp : comps) {
      switch (doEvalAndCompMatch(comp, eval, requireStrictAlleleMatch)) {
        case STRICT:
          return comp;
        case LENIENT:
          if (lenientMatch == null) lenientMatch = comp;
          break;
        case NO_MATCH:
          // do nothing
      }
    }

    // nothing matched, just return lenientMatch, which might be null
    return lenientMatch;
  }

  public Integer treeReduce(Integer lhs, Integer rhs) {
    return null;
  }

  @Override
  public Integer reduceInit() {
    return null;
  }

  @Override
  public Integer reduce(Integer value, Integer sum) {
    return null;
  }

  /**
   * Output the finalized report
   *
   * @param result an integer that doesn't get used for anything
   */
  public void onTraversalDone(Integer result) {
    logger.info("Finalizing variant report");

    // go through the evaluations and finalize them
    for (final EvaluationContext nec : stratManager.values())
      for (final VariantEvaluator ve : nec.getVariantEvaluators()) ve.finalizeEvaluation();

    VariantEvalReportWriter.writeReport(
        out,
        stratManager,
        stratManager.getStratifiers(),
        stratManager.get(0).getVariantEvaluators());
  }

  // Accessors
  public Logger getLogger() {
    return logger;
  }

  public double getMinPhaseQuality() {
    return MIN_PHASE_QUALITY;
  }

  public int getSamplePloidy() {
    return ploidy;
  }

  public double getMendelianViolationQualThreshold() {
    return MENDELIAN_VIOLATION_QUAL_THRESHOLD;
  }

  public static String getAllSampleName() {
    return ALL_SAMPLE_NAME;
  }

  public List<RodBinding<VariantContext>> getKnowns() {
    return knowns;
  }

  public List<RodBinding<VariantContext>> getEvals() {
    return evals;
  }

  public boolean isSubsettingToSpecificSamples() {
    return isSubsettingSamples;
  }

  public Set<String> getSampleNamesForEvaluation() {
    return sampleNamesForEvaluation;
  }

  public int getNumberOfSamplesForEvaluation() {
    if (sampleNamesForEvaluation != null && !sampleNamesForEvaluation.isEmpty())
      return sampleNamesForEvaluation.size();
    else {
      return numSamplesFromArgument;
    }
  }

  public Set<String> getSampleNamesForStratification() {
    return sampleNamesForStratification;
  }

  public List<RodBinding<VariantContext>> getComps() {
    return comps;
  }

  public Set<SortableJexlVCMatchExp> getJexlExpressions() {
    return jexlExpressions;
  }

  public long getnProcessedLoci() {
    return nProcessedLoci;
  }

  public Set<String> getContigNames() {
    final TreeSet<String> contigs = new TreeSet<String>();
    for (final SAMSequenceRecord r :
        getToolkit()
            .getReferenceDataSource()
            .getReference()
            .getSequenceDictionary()
            .getSequences()) {
      contigs.add(r.getSequenceName());
    }
    return contigs;
  }

  /**
   * getToolkit is protected, so we have to pseudo-overload it here so eval / strats can get the
   * toolkit
   *
   * @return
   */
  public GenomeAnalysisEngine getToolkit() {
    return super.getToolkit();
  }

  public boolean ignoreAC0Sites() {
    return !keepSitesWithAC0;
  }
}