/**
 * Created by IntelliJ IDEA. User: rpoplin Date: Nov 27, 2009
 *
 * <p>A collection of the arguments that are used for BQSR. Used to be common to both
 * CovariateCounterWalker and TableRecalibrationWalker. This set of arguments will also be passed to
 * the constructor of every Covariate when it is instantiated.
 */
public class RecalibrationArgumentCollection implements Cloneable {

  /**
   * This algorithm treats every reference mismatch as an indication of error. However, real genetic
   * variation is expected to mismatch the reference, so it is critical that a database of known
   * polymorphic sites is given to the tool in order to skip over those sites. This tool accepts any
   * number of RodBindings (VCF, Bed, etc.) for use as this database. For users wishing to exclude
   * an interval list of known variation simply use -XL my.interval.list to skip over processing
   * those sites. Please note however that the statistics reported by the tool will not accurately
   * reflect those sites skipped by the -XL argument.
   */
  @Input(
      fullName = "knownSites",
      shortName = "knownSites",
      doc = "A database of known polymorphic sites to skip over in the recalibration algorithm",
      required = false)
  public List<RodBinding<Feature>> knownSites = Collections.emptyList();

  /**
   * After the header, data records occur one per line until the end of the file. The first several
   * items on a line are the values of the individual covariates and will change depending on which
   * covariates were specified at runtime. The last three items are the data- that is, number of
   * observations for this combination of covariates, number of reference mismatches, and the raw
   * empirical quality score calculated by phred-scaling the mismatch rate. Use '/dev/stdout' to
   * print to standard out.
   */
  @Gather(BQSRGatherer.class)
  @Output(doc = "The output recalibration table file to create", required = true)
  public File RECAL_TABLE_FILE = null;

  public PrintStream RECAL_TABLE;

  /** Note that the --list argument requires a fully resolved and correct command-line to work. */
  @Argument(
      fullName = "list",
      shortName = "ls",
      doc = "List the available covariates and exit",
      required = false)
  public boolean LIST_ONLY = false;

  /**
   * Note that the ReadGroup and QualityScore covariates are required and do not need to be
   * specified. Also, unless --no_standard_covs is specified, the Cycle and Context covariates are
   * standard and are included by default. Use the --list argument to see the available covariates.
   */
  @Argument(
      fullName = "covariate",
      shortName = "cov",
      doc =
          "One or more covariates to be used in the recalibration. Can be specified multiple times",
      required = false)
  public String[] COVARIATES = null;

  /**
   * The Cycle and Context covariates are standard and are included by default unless this argument
   * is provided. Note that the ReadGroup and QualityScore covariates are required and cannot be
   * excluded.
   */
  @Argument(
      fullName = "no_standard_covs",
      shortName = "noStandard",
      doc =
          "Do not use the standard set of covariates, but rather just the ones listed using the -cov argument",
      required = false)
  public boolean DO_NOT_USE_STANDARD_COVARIATES = false;

  /**
   * This calculation is critically dependent on being able to skip over known polymorphic sites.
   * Please be sure that you know what you are doing if you use this option.
   */
  @Advanced
  @Argument(
      fullName = "run_without_dbsnp_potentially_ruining_quality",
      shortName = "run_without_dbsnp_potentially_ruining_quality",
      required = false,
      doc =
          "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
  public boolean RUN_WITHOUT_DBSNP = false;

  /**
   * BaseRecalibrator accepts a --solid_recal_mode <MODE> flag which governs how the recalibrator
   * handles the reads which have had the reference inserted because of color space inconsistencies.
   */
  @Argument(
      fullName = "solid_recal_mode",
      shortName = "sMode",
      required = false,
      doc =
          "How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS")
  public RecalUtils.SOLID_RECAL_MODE SOLID_RECAL_MODE = RecalUtils.SOLID_RECAL_MODE.SET_Q_ZERO;

  /**
   * BaseRecalibrator accepts a --solid_nocall_strategy <MODE> flag which governs how the
   * recalibrator handles no calls in the color space tag. Unfortunately because of the reference
   * inserted bases mentioned above, reads with no calls in their color space tag can not be
   * recalibrated.
   */
  @Argument(
      fullName = "solid_nocall_strategy",
      shortName = "solid_nocall_strategy",
      doc =
          "Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ",
      required = false)
  public RecalUtils.SOLID_NOCALL_STRATEGY SOLID_NOCALL_STRATEGY =
      RecalUtils.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION;

  /**
   * The context covariate will use a context of this size to calculate its covariate value for base
   * mismatches. Must be between 1 and 13 (inclusive). Note that higher values will increase runtime
   * and required java heap size.
   */
  @Argument(
      fullName = "mismatches_context_size",
      shortName = "mcs",
      doc = "Size of the k-mer context to be used for base mismatches",
      required = false)
  public int MISMATCHES_CONTEXT_SIZE = 2;

  /**
   * The context covariate will use a context of this size to calculate its covariate value for base
   * insertions and deletions. Must be between 1 and 13 (inclusive). Note that higher values will
   * increase runtime and required java heap size.
   */
  @Argument(
      fullName = "indels_context_size",
      shortName = "ics",
      doc = "Size of the k-mer context to be used for base insertions and deletions",
      required = false)
  public int INDELS_CONTEXT_SIZE = 3;

  /**
   * The cycle covariate will generate an error if it encounters a cycle greater than this value.
   * This argument is ignored if the Cycle covariate is not used.
   */
  @Argument(
      fullName = "maximum_cycle_value",
      shortName = "maxCycle",
      doc = "The maximum cycle value permitted for the Cycle covariate",
      required = false)
  public int MAXIMUM_CYCLE_VALUE = 500;

  /**
   * A default base quality to use as a prior (reported quality) in the mismatch covariate model.
   * This value will replace all base qualities in the read with this default value. A negative
   * value turns it off. [default is off]
   */
  @Argument(
      fullName = "mismatches_default_quality",
      shortName = "mdq",
      doc = "default quality for the base mismatches covariate",
      required = false)
  public byte MISMATCHES_DEFAULT_QUALITY = -1;

  /**
   * A default base quality to use as a prior (reported quality) in the insertion covariate model.
   * This parameter is used for all reads without insertion quality scores for each base. [default
   * is on]
   */
  @Argument(
      fullName = "insertions_default_quality",
      shortName = "idq",
      doc = "default quality for the base insertions covariate",
      required = false)
  public byte INSERTIONS_DEFAULT_QUALITY = 45;

  /**
   * A default base quality to use as a prior (reported quality) in the deletion covariate model.
   * This value will replace all base qualities in the read with this default value. A negative
   * value turns it off. [default is on]
   */
  @Argument(
      fullName = "deletions_default_quality",
      shortName = "ddq",
      doc = "default quality for the base deletions covariate",
      required = false)
  public byte DELETIONS_DEFAULT_QUALITY = 45;

  /**
   * Reads with low quality bases on either tail (beginning or end) will not be considered in the
   * context. This parameter defines the quality below which (inclusive) a tail is considered low
   * quality.
   */
  @Argument(
      fullName = "low_quality_tail",
      shortName = "lqt",
      doc = "minimum quality for the bases in the tail of the reads to be considered",
      required = false)
  public byte LOW_QUAL_TAIL = 2;

  /**
   * BQSR generates a quantization table for quick quantization later by subsequent tools. BQSR does
   * not quantize the base qualities, this is done by the engine with the -qq or -BQSR options. This
   * parameter tells BQSR the number of levels of quantization to use to build the quantization
   * table.
   */
  @Argument(
      fullName = "quantizing_levels",
      shortName = "ql",
      required = false,
      doc = "number of distinct quality scores in the quantized output")
  public int QUANTIZING_LEVELS = 16;

  /** The tag name for the binary tag covariate (if using it) */
  @Argument(
      fullName = "binary_tag_name",
      shortName = "bintag",
      required = false,
      doc = "the binary tag covariate name if using it")
  public String BINARY_TAG_NAME = null;

  /**
   * Whether GATK report tables should have rows in sorted order, starting from the leftmost
   * column.
   */
  @Argument(
      fullName = "sort_by_all_columns",
      shortName = "sortAllCols",
      doc = "Sort the rows in the tables of reports",
      required = false)
  public Boolean SORT_BY_ALL_COLUMNS = false;

  /////////////////////////////
  // Debugging-only Arguments
  /////////////////////////////

  @Hidden
  @Argument(
      fullName = "default_platform",
      shortName = "dP",
      required = false,
      doc =
          "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.")
  public String DEFAULT_PLATFORM = null;

  @Hidden
  @Argument(
      fullName = "force_platform",
      shortName = "fP",
      required = false,
      doc =
          "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
  public String FORCE_PLATFORM = null;

  @Hidden
  @Argument(
      fullName = "force_readgroup",
      shortName = "fRG",
      required = false,
      doc = "If provided, the read group of EVERY read will be forced to be the provided String.")
  public String FORCE_READGROUP = null;

  @Hidden
  @Output(
      fullName = "recal_table_update_log",
      shortName = "recal_table_update_log",
      required = false,
      doc =
          "If provided, log all updates to the recalibration tables to the given file. For debugging/testing purposes only",
      defaultToStdout = false)
  public PrintStream RECAL_TABLE_UPDATE_LOG = null;

  /**
   * The repeat covariate will use a context of this size to calculate its covariate value for base
   * insertions and deletions.
   */
  @Hidden
  @Argument(
      fullName = "max_str_unit_length",
      shortName = "maxstr",
      doc = "Max size of the k-mer context to be used for repeat covariates",
      required = false)
  public int MAX_STR_UNIT_LENGTH = 8;

  @Hidden
  @Argument(
      fullName = "max_repeat_length",
      shortName = "maxrep",
      doc = "Max number of repetitions to be used for repeat covariates",
      required = false)
  public int MAX_REPEAT_LENGTH = 20;

  public File existingRecalibrationReport = null;

  /**
   * Builds the "Arguments" report table holding the values of this argument collection.
   *
   * <p>The table has an "Argument" column and a value column, and receives one row per reported
   * argument, always in the same order. A missing recalibration report or binary tag name is
   * recorded as the literal text {@code "null"}.
   *
   * @param covariateNames the value stored in the {@code covariate} row.
   * @return the freshly created and populated table.
   */
  public GATKReportTable generateReportTable(final String covariateNames) {
    final String tableName = "Arguments";
    final String tableDescription = "Recalibration argument collection values used in this run";
    final String valueColumn = RecalUtils.ARGUMENT_VALUE_COLUMN_NAME;

    // Row sorting is only requested explicitly when the user asked for fully sorted reports.
    final GATKReportTable table =
        SORT_BY_ALL_COLUMNS
            ? new GATKReportTable(
                tableName, tableDescription, 2, GATKReportTable.TableSortingWay.SORT_BY_COLUMN)
            : new GATKReportTable(tableName, tableDescription, 2);

    table.addColumn("Argument");
    table.addColumn(valueColumn);

    table.addRowID("covariate", true);
    table.set("covariate", valueColumn, covariateNames);

    table.addRowID("no_standard_covs", true);
    table.set("no_standard_covs", valueColumn, DO_NOT_USE_STANDARD_COVARIATES);

    table.addRowID("run_without_dbsnp", true);
    table.set("run_without_dbsnp", valueColumn, RUN_WITHOUT_DBSNP);

    table.addRowID("solid_recal_mode", true);
    table.set("solid_recal_mode", valueColumn, SOLID_RECAL_MODE);

    table.addRowID("solid_nocall_strategy", true);
    table.set("solid_nocall_strategy", valueColumn, SOLID_NOCALL_STRATEGY);

    table.addRowID("mismatches_context_size", true);
    table.set("mismatches_context_size", valueColumn, MISMATCHES_CONTEXT_SIZE);

    table.addRowID("indels_context_size", true);
    table.set("indels_context_size", valueColumn, INDELS_CONTEXT_SIZE);

    table.addRowID("mismatches_default_quality", true);
    table.set("mismatches_default_quality", valueColumn, MISMATCHES_DEFAULT_QUALITY);

    table.addRowID("deletions_default_quality", true);
    table.set("deletions_default_quality", valueColumn, DELETIONS_DEFAULT_QUALITY);

    table.addRowID("insertions_default_quality", true);
    table.set("insertions_default_quality", valueColumn, INSERTIONS_DEFAULT_QUALITY);

    table.addRowID("maximum_cycle_value", true);
    table.set("maximum_cycle_value", valueColumn, MAXIMUM_CYCLE_VALUE);

    table.addRowID("low_quality_tail", true);
    table.set("low_quality_tail", valueColumn, LOW_QUAL_TAIL);

    table.addRowID("default_platform", true);
    table.set("default_platform", valueColumn, DEFAULT_PLATFORM);

    table.addRowID("force_platform", true);
    table.set("force_platform", valueColumn, FORCE_PLATFORM);

    table.addRowID("quantizing_levels", true);
    table.set("quantizing_levels", valueColumn, QUANTIZING_LEVELS);

    // These two values are written as the text "null" when absent, not as a null reference.
    final String reportPath;
    if (existingRecalibrationReport == null) {
      reportPath = "null";
    } else {
      reportPath = existingRecalibrationReport.getAbsolutePath();
    }
    table.addRowID("recalibration_report", true);
    table.set("recalibration_report", valueColumn, reportPath);

    final String binaryTag;
    if (BINARY_TAG_NAME == null) {
      binaryTag = "null";
    } else {
      binaryTag = BINARY_TAG_NAME;
    }
    table.addRowID("binary_tag_name", true);
    table.set("binary_tag_name", valueColumn, binaryTag);

    return table;
  }

  /**
   * Returns a map with the arguments that differ between this and another {@link
   * RecalibrationArgumentCollection} instance.
   *
   * <p>The key is the name of that argument in the report file. The value is a message that
   * explains the difference to the end user. Entries appear in the order in which the arguments
   * are compared.
   *
   * <p>Thus, an empty map indicates that there are no differences between both argument
   * collections that are relevant to report comparison.
   *
   * <p>This method should not throw any exception.
   *
   * @param other the argument-collection to compare against.
   * @param thisRole the name used to refer to this RAC report that makes sense to the end user.
   * @param otherRole the name used to refer to the other RAC report that makes sense to the end
   *     user.
   * @return never <code>null</code>, but a zero-size collection if there are no differences.
   */
  @Requires(
      "other != null && thisRole != null && otherRole != null && !thisRole.equalsIgnoreCase(otherRole)")
  Map<String, ? extends CharSequence> compareReportArguments(
      final RecalibrationArgumentCollection other, final String thisRole, final String otherRole) {
    // Sized for the maximum number of entries: one per compared argument.
    final Map<String, String> result = new LinkedHashMap<>(16);
    compareRequestedCovariates(result, other, thisRole, otherRole);
    compareSimpleReportArgument(
        result,
        "no_standard_covs",
        DO_NOT_USE_STANDARD_COVARIATES,
        other.DO_NOT_USE_STANDARD_COVARIATES,
        thisRole,
        otherRole);
    compareSimpleReportArgument(
        result,
        "run_without_dbsnp",
        RUN_WITHOUT_DBSNP,
        other.RUN_WITHOUT_DBSNP,
        thisRole,
        otherRole);
    compareSimpleReportArgument(
        result, "solid_recal_mode", SOLID_RECAL_MODE, other.SOLID_RECAL_MODE, thisRole, otherRole);
    compareSimpleReportArgument(
        result,
        "solid_nocall_strategy",
        SOLID_NOCALL_STRATEGY,
        other.SOLID_NOCALL_STRATEGY,
        thisRole,
        otherRole);
    compareSimpleReportArgument(
        result,
        "mismatches_context_size",
        MISMATCHES_CONTEXT_SIZE,
        other.MISMATCHES_CONTEXT_SIZE,
        thisRole,
        otherRole);
    // The indel context size is written to the report's argument table just like the mismatch
    // one, so a disagreement on it must be reported too.
    compareSimpleReportArgument(
        result,
        "indels_context_size",
        INDELS_CONTEXT_SIZE,
        other.INDELS_CONTEXT_SIZE,
        thisRole,
        otherRole);
    compareSimpleReportArgument(
        result,
        "mismatches_default_quality",
        MISMATCHES_DEFAULT_QUALITY,
        other.MISMATCHES_DEFAULT_QUALITY,
        thisRole,
        otherRole);
    compareSimpleReportArgument(
        result,
        "deletions_default_quality",
        DELETIONS_DEFAULT_QUALITY,
        other.DELETIONS_DEFAULT_QUALITY,
        thisRole,
        otherRole);
    compareSimpleReportArgument(
        result,
        "insertions_default_quality",
        INSERTIONS_DEFAULT_QUALITY,
        other.INSERTIONS_DEFAULT_QUALITY,
        thisRole,
        otherRole);
    compareSimpleReportArgument(
        result,
        "maximum_cycle_value",
        MAXIMUM_CYCLE_VALUE,
        other.MAXIMUM_CYCLE_VALUE,
        thisRole,
        otherRole);
    compareSimpleReportArgument(
        result, "low_quality_tail", LOW_QUAL_TAIL, other.LOW_QUAL_TAIL, thisRole, otherRole);
    compareSimpleReportArgument(
        result, "default_platform", DEFAULT_PLATFORM, other.DEFAULT_PLATFORM, thisRole, otherRole);
    compareSimpleReportArgument(
        result, "force_platform", FORCE_PLATFORM, other.FORCE_PLATFORM, thisRole, otherRole);
    compareSimpleReportArgument(
        result,
        "quantizing_levels",
        QUANTIZING_LEVELS,
        other.QUANTIZING_LEVELS,
        thisRole,
        otherRole);
    compareSimpleReportArgument(
        result, "binary_tag_name", BINARY_TAG_NAME, other.BINARY_TAG_NAME, thisRole, otherRole);
    return result;
  }

  /**
   * Compares the covariate report lists.
   *
   * <p>The comparison is set-based: order and duplicates in the requested covariate arrays are
   * ignored. A <code>null</code> covariate array is treated as an empty one, and two collections
   * requesting exactly the same set of covariates (including none at all) are considered equal.
   *
   * <p>When a difference is found, an explanatory message is stored in <code>diffs</code> under the
   * key <code>"covariate"</code>.
   *
   * @param diffs map where to annotate the difference.
   * @param other the argument collection to compare against.
   * @param thisRole the name for this argument collection that makes sense to the user.
   * @param otherRole the name for the other argument collection that makes sense to the end user.
   * @return <code>true</code> if a difference was found.
   */
  @Requires("diffs != null && other != null && thisRole != null && otherRole != null")
  private boolean compareRequestedCovariates(
      final Map<String, String> diffs,
      final RecalibrationArgumentCollection other,
      final String thisRole,
      final String otherRole) {

    // COVARIATES is null unless covariates were requested explicitly; dereferencing it directly
    // would break the "does not throw" contract of the report comparison.
    final String[] thisCovariates = this.COVARIATES == null ? new String[0] : this.COVARIATES;
    final String[] otherCovariates = other.COVARIATES == null ? new String[0] : other.COVARIATES;

    final Set<String> beforeNames = new HashSet<>(thisCovariates.length);
    final Set<String> afterNames = new HashSet<>(otherCovariates.length);
    Utils.addAll(beforeNames, thisCovariates);
    Utils.addAll(afterNames, otherCovariates);

    // Identical requests (even two empty ones) are not a difference.
    if (beforeNames.equals(afterNames)) {
      return false;
    }

    final Set<String> intersect = new HashSet<>(beforeNames);
    intersect.retainAll(afterNames);

    final String diffMessage;
    if (intersect.isEmpty()) {
      // In practice this is not possible due to required covariates but...
      diffMessage =
          String.format(
              "There are no common covariates between '%s' and '%s'"
                  + " recalibrator reports. Covariates in '%s': {%s}. Covariates in '%s': {%s}.",
              thisRole,
              otherRole,
              thisRole,
              Utils.join(", ", thisCovariates),
              otherRole,
              Utils.join(", ", otherCovariates));
    } else {
      // Only the covariates exclusive to each side are worth listing.
      beforeNames.removeAll(intersect);
      afterNames.removeAll(intersect);
      diffMessage =
          String.format(
              "There are differences in the set of covariates requested in the"
                  + " '%s' and '%s' recalibrator reports. "
                  + " Exclusive to '%s': {%s}. Exclusive to '%s': {%s}.",
              thisRole,
              otherRole,
              thisRole,
              Utils.join(", ", beforeNames),
              otherRole,
              Utils.join(", ", afterNames));
    }
    diffs.put("covariate", diffMessage);
    return true;
  }

  /**
   * Records, when present, the difference in a single simple-valued report argument between this
   * and another {@link RecalibrationArgumentCollection} instance.
   *
   * <p>Two values are considered the same when both are <code>null</code> or when <code>thisValue
   * </code> is non-null and {@code thisValue.equals(otherValue)} holds. Otherwise an entry is added
   * to <code>diffs</code>: its key is the name of the argument in the report file and its value is
   * a message that explains the difference to the end user. A <code>null</code> value is rendered
   * as an empty string in that message.
   *
   * <p>This method should not throw any exception.
   *
   * @param diffs where to annotate the differences.
   * @param name the name of the report argument to compare.
   * @param thisValue this argument collection value for that argument.
   * @param otherValue the other collection value for that argument.
   * @param thisRole the name used to refer to this RAC report that makes sense to the end user.
   * @param otherRole the name used to refer to the other RAC report that makes sense to the end
   *     user.
   * @param <T> the argument Object value type.
   * @return <code>true</code> if a difference has been spotted, thus <code>diffs</code> has been
   *     modified.
   */
  private <T> boolean compareSimpleReportArgument(
      final Map<String, String> diffs,
      final String name,
      final T thisValue,
      final T otherValue,
      final String thisRole,
      final String otherRole) {
    final boolean sameValue =
        thisValue == null ? otherValue == null : thisValue.equals(otherValue);
    if (sameValue) {
      return false;
    }

    // Missing values are shown as an empty pair of braces.
    final Object shownThis = thisValue == null ? "" : thisValue;
    final Object shownOther = otherValue == null ? "" : otherValue;
    final String message =
        String.format(
            "differences between '%s' {%s} and '%s' {%s}.",
            thisRole, shownThis, otherRole, shownOther);
    diffs.put(name, message);
    return true;
  }

  /**
   * Produces a shallow copy of this argument collection: the copy is obtained from {@link
   * Object#clone()}, so reference-typed fields point to the same objects as in the original.
   *
   * @return never <code>null</code>.
   */
  @Override
  public RecalibrationArgumentCollection clone() {
    final Object copy;
    try {
      copy = super.clone();
    } catch (CloneNotSupportedException e) {
      // Cannot happen as long as this class implements Cloneable.
      final String className = this.getClass().getName();
      throw new StingException(
          "Unreachable code clone not supported thrown when the class "
              + className
              + " is cloneable ",
          e);
    }
    return (RecalibrationArgumentCollection) copy;
  }
}
/**
 * General-purpose tool for variant evaluation (% in dbSNP, genotype concordance, Ti/Tv ratios, and
 * a lot more)
 *
 * <p>Given a variant callset, it is common to calculate various quality control metrics. These
 * metrics include the number of raw or filtered SNP counts; ratio of transition mutations to
 * transversions; concordance of a particular sample's calls to a genotyping chip; number of
 * singletons per sample; etc. Furthermore, it is often useful to stratify these metrics by various
 * criteria like functional class (missense, nonsense, silent), whether the site is CpG site, the
 * amino acid degeneracy of the site, etc. VariantEval facilitates these calculations in two ways:
 * by providing several built-in evaluation and stratification modules, and by providing a framework
 * that permits the easy development of new evaluation and stratification modules.
 *
 * <h2>Input</h2>
 *
 * <p>One or more variant sets to evaluate plus any number of comparison sets.
 *
 * <h2>Output</h2>
 *
 * <p>Evaluation tables detailing the results of the eval modules which were applied. For example:
 *
 * <pre>
 * output.eval.gatkreport:
 * ##:GATKReport.v0.1 CountVariants : Counts different classes of variants in the sample
 * CountVariants  CompRod   CpG      EvalRod  JexlExpression  Novelty  nProcessedLoci  nCalledLoci  nRefLoci  nVariantLoci  variantRate ...
 * CountVariants  dbsnp     CpG      eval     none            all      65900028        135770       0         135770        0.00206024  ...
 * CountVariants  dbsnp     CpG      eval     none            known    65900028        47068        0         47068         0.00071423  ...
 * CountVariants  dbsnp     CpG      eval     none            novel    65900028        88702        0         88702         0.00134601  ...
 * CountVariants  dbsnp     all      eval     none            all      65900028        330818       0         330818        0.00502000  ...
 * CountVariants  dbsnp     all      eval     none            known    65900028        120685       0         120685        0.00183133  ...
 * CountVariants  dbsnp     all      eval     none            novel    65900028        210133       0         210133        0.00318866  ...
 * CountVariants  dbsnp     non_CpG  eval     none            all      65900028        195048       0         195048        0.00295976  ...
 * CountVariants  dbsnp     non_CpG  eval     none            known    65900028        73617        0         73617         0.00111710  ...
 * CountVariants  dbsnp     non_CpG  eval     none            novel    65900028        121431       0         121431        0.00184265  ...
 * ...
 * </pre>
 *
 * <h2>Examples</h2>
 *
 * <pre>
 * java -Xmx2g -jar GenomeAnalysisTK.jar \
 *   -R ref.fasta \
 *   -T VariantEval \
 *   -o output.eval.gatkreport \
 *   --eval:set1 set1.vcf \
 *   --eval:set2 set2.vcf \
 *   [--comp comp.vcf]
 * </pre>
 */
@Reference(window = @Window(start = -50, stop = 50))
public class VariantEvalWalker extends RodWalker<Integer, Integer>
    implements TreeReducible<Integer> {

  @Output protected PrintStream out;

  /** The variant file(s) to evaluate. */
  @Input(fullName = "eval", shortName = "eval", doc = "Input evaluation file(s)", required = true)
  public List<RodBinding<VariantContext>> evals;

  /** The variant file(s) to compare against. */
  @Input(fullName = "comp", shortName = "comp", doc = "Input comparison file(s)", required = false)
  public List<RodBinding<VariantContext>> compsProvided = Collections.emptyList();

  private List<RodBinding<VariantContext>> comps = new ArrayList<RodBinding<VariantContext>>();

  /**
   * dbSNP comparison VCF. By default, the dbSNP file is used to specify the set of "known"
   * variants. Other sets can be specified with the -knownName (--known_names) argument.
   */
  @ArgumentCollection protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();

  // Help arguments
  @Argument(
      fullName = "list",
      shortName = "ls",
      doc = "List the available eval modules and exit",
      required = false)
  protected Boolean LIST = false;

  // Partitioning the data arguments
  @Argument(
      shortName = "select",
      doc = "One or more stratifications to use when evaluating the data",
      required = false)
  protected ArrayList<String> SELECT_EXPS = new ArrayList<String>();

  @Argument(
      shortName = "selectName",
      doc = "Names to use for the list of stratifications (must be a 1-to-1 mapping)",
      required = false)
  protected ArrayList<String> SELECT_NAMES = new ArrayList<String>();

  @Argument(
      fullName = "sample",
      shortName = "sn",
      doc =
          "Derive eval and comp contexts using only these sample genotypes, when genotypes are available in the original context",
      required = false)
  protected Set<String> SAMPLE_EXPRESSIONS;

  /** List of rod tracks to be used for specifying "known" variants other than dbSNP. */
  @Argument(
      shortName = "knownName",
      doc =
          "Name of ROD bindings containing variant sites that should be treated as known when splitting eval rods into known and novel subsets",
      required = false)
  protected HashSet<String> KNOWN_NAMES = new HashSet<String>();

  List<RodBinding<VariantContext>> knowns = new ArrayList<RodBinding<VariantContext>>();

  // Stratification arguments
  /**
   * Names of the stratification modules requested on the command line. They are handed to the
   * stratification set-up together with the -noST flag.
   */
  @Argument(
      fullName = "stratificationModule",
      shortName = "ST",
      doc =
          "One or more specific stratification modules to apply to the eval track(s) (in addition to the standard stratifications, unless -noST is specified)",
      required = false)
  protected String[] STRATIFICATIONS_TO_USE = {};

  /**
   * When set, the standard stratification modules are not requested by default; this flag is
   * passed to the stratification set-up alongside the -ST module names.
   */
  @Argument(
      fullName = "doNotUseAllStandardStratifications",
      shortName = "noST",
      doc =
          "Do not use the standard stratification modules by default (instead, only those that are specified with the -ST option)",
      required = false)
  protected Boolean NO_STANDARD_STRATIFICATIONS = false;

  /** See the -list argument to view available modules. */
  @Argument(
      fullName = "evalModule",
      shortName = "EV",
      doc =
          "One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noEV is specified)",
      required = false)
  protected String[] MODULES_TO_USE = {};

  @Argument(
      fullName = "doNotUseAllStandardModules",
      shortName = "noEV",
      doc =
          "Do not use the standard modules by default (instead, only those that are specified with the -EV option)",
      required = false)
  protected Boolean NO_STANDARD_MODULES = false;

  // Other arguments
  /**
   * Explicit sample count. When positive it overrides the number of samples derived from the eval
   * tracks; otherwise the size of the evaluation sample set is used.
   */
  @Argument(
      fullName = "numSamples",
      shortName = "ns",
      doc = "Number of samples (used if no samples are available in the VCF file)",
      required = false)
  protected Integer NUM_SAMPLES = 0;

  @Argument(
      fullName = "minPhaseQuality",
      shortName = "mpq",
      doc = "Minimum phasing quality",
      required = false)
  protected double MIN_PHASE_QUALITY = 10.0;

  @Argument(
      shortName = "mvq",
      fullName = "mendelianViolationQualThreshold",
      doc =
          "Minimum genotype QUAL score for each trio member required to accept a site as a violation. Default is 50.",
      required = false)
  protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 50;

  @Argument(
      fullName = "ancestralAlignments",
      shortName = "aa",
      doc = "Fasta file with ancestral alleles",
      required = false)
  private File ancestralAlignmentsFile = null;

  @Argument(
      fullName = "requireStrictAlleleMatch",
      shortName = "strict",
      doc =
          "If provided only comp and eval tracks with exactly matching reference and alternate alleles will be counted as overlapping",
      required = false)
  private boolean requireStrictAlleleMatch = false;

  /**
   * If true, VariantEval will treat -eval 1 -eval 2 as separate tracks from the same underlying
   * variant set, and evaluate the union of the results. Useful when you want to do -eval chr1.vcf
   * -eval chr2.vcf etc.
   */
  @Argument(
      fullName = "mergeEvals",
      shortName = "mergeEvals",
      doc = "If provided, all -eval tracks will be merged into a single eval track",
      required = false)
  public boolean mergeEvals = false;

  /**
   * File containing tribble-readable features for the IntervalStratification. Supplying it
   * without the IntervalStratification stratifier being active is rejected during initialization.
   */
  @Input(
      fullName = "stratIntervals",
      shortName = "stratIntervals",
      doc = "File containing tribble-readable features for the IntervalStratification",
      required = false)
  public IntervalBinding<Feature> intervalsFile = null;

  /**
   * File containing tribble-readable features containing known CNVs. For use with VariantSummary
   * table.
   */
  @Input(
      fullName = "knownCNVs",
      shortName = "knownCNVs",
      doc =
          "File containing tribble-readable features describing a known list of copy number variants",
      required = false)
  public IntervalBinding<Feature> knownCNVsFile = null;

  Map<String, IntervalTree<GenomeLoc>> knownCNVsByContig = Collections.emptyMap();

  // Variables
  private Set<SortableJexlVCMatchExp> jexlExpressions = new TreeSet<SortableJexlVCMatchExp>();

  private Set<String> sampleNamesForEvaluation = new TreeSet<String>();
  private Set<String> sampleNamesForStratification = new TreeSet<String>();
  private int numSamples = 0;

  // The list of stratifiers and evaluators to use
  private TreeSet<VariantStratifier> stratificationObjects = null;

  // The set of all possible evaluation contexts
  private HashMap<StateKey, NewEvaluationContext> evaluationContexts = null;

  // important stratifications
  private boolean byFilterIsEnabled = false;
  private boolean perSampleIsEnabled = false;

  // Output report
  private GATKReport report = null;

  // Sample name that is always added to the stratification sample names; exposed to other
  // classes through getAllSampleName()
  private static final String ALL_SAMPLE_NAME = "all";

  // Utility class
  private final VariantEvalUtils variantEvalUtils = new VariantEvalUtils(this);

  // Ancestral alignments
  private IndexedFastaSequenceFile ancestralAlignments = null;

  /**
   * Initialize the stratifications, evaluations, evaluation contexts, and reporting object.
   *
   * <p>Steps, in order: optionally list the modules and exit; assemble the comp and known track
   * lists (adding dbSNP when it is bound and a placeholder comp when none was supplied); derive
   * the sample names from the eval VCF headers; compile the select expressions; create the
   * stratifiers, evaluators, evaluation contexts and report; and finally load the optional
   * ancestral alignments and known CNV intervals.
   *
   * @throws UserException.BadArgumentValue if stratIntervals was supplied while no
   *     IntervalStratification stratifier is active
   * @throws ReviewedStingException if the ancestral alignments file cannot be found
   */
  public void initialize() {
    // Just list the modules, and exit quickly.
    if (LIST) {
      variantEvalUtils.listModulesAndExit();
    }

    // maintain the full list of comps
    comps.addAll(compsProvided);
    if (dbsnp.dbsnp.isBound()) {
      comps.add(dbsnp.dbsnp);
      knowns.add(dbsnp.dbsnp);
    }

    // Add a dummy comp track if none exists
    if (comps.isEmpty()) {
      comps.add(
          new RodBinding<VariantContext>(VariantContext.class, "none", "UNBOUND", "", new Tags()));
    }

    // Set up set of additional knowns
    for (RodBinding<VariantContext> compRod : comps) {
      if (KNOWN_NAMES.contains(compRod.getName())) {
        knowns.add(compRod);
      }
    }

    // Now that we have all the rods categorized, determine the sample list from the eval rods.
    Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), evals);
    Set<String> vcfSamples =
        SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);

    // Load the sample list
    sampleNamesForEvaluation.addAll(
        SampleUtils.getSamplesFromCommandLineInput(vcfSamples, SAMPLE_EXPRESSIONS));
    // an explicit -ns value wins over the number of samples found in the eval tracks
    numSamples = NUM_SAMPLES > 0 ? NUM_SAMPLES : sampleNamesForEvaluation.size();

    // per-sample names are only stratified on when the Sample stratifier was requested by name
    if (Arrays.asList(STRATIFICATIONS_TO_USE).contains("Sample")) {
      sampleNamesForStratification.addAll(sampleNamesForEvaluation);
    }
    sampleNamesForStratification.add(ALL_SAMPLE_NAME);

    // Initialize select expressions
    for (VariantContextUtils.JexlVCMatchExp jexl :
        VariantContextUtils.initializeMatchExps(SELECT_NAMES, SELECT_EXPS)) {
      SortableJexlVCMatchExp sjexl = new SortableJexlVCMatchExp(jexl.name, jexl.exp);
      jexlExpressions.add(sjexl);
    }

    // Initialize the set of stratifications and evaluations to use
    stratificationObjects =
        variantEvalUtils.initializeStratificationObjects(
            this, NO_STANDARD_STRATIFICATIONS, STRATIFICATIONS_TO_USE);
    Set<Class<? extends VariantEvaluator>> evaluationObjects =
        variantEvalUtils.initializeEvaluationObjects(NO_STANDARD_MODULES, MODULES_TO_USE);
    for (VariantStratifier vs : getStratificationObjects()) {
      if (vs.getName().equals("Filter")) {
        byFilterIsEnabled = true;
      } else if (vs.getName().equals("Sample")) {
        perSampleIsEnabled = true;
      }
    }

    // stratIntervals is only meaningful when the IntervalStratification stratifier is active
    if (intervalsFile != null) {
      boolean intervalStratificationEnabled = false;
      for (final VariantStratifier vs : stratificationObjects) {
        if (vs.getClass().equals(IntervalStratification.class)) {
          intervalStratificationEnabled = true;
          break;
        }
      }
      if (!intervalStratificationEnabled) {
        throw new UserException.BadArgumentValue(
            "ST", "stratIntervals argument provided but -ST IntervalStratification not provided");
      }
    }

    // Initialize the evaluation contexts
    evaluationContexts =
        variantEvalUtils.initializeEvaluationContexts(
            stratificationObjects, evaluationObjects, null, null);

    // Initialize report table
    report = variantEvalUtils.initializeGATKReport(stratificationObjects, evaluationObjects);

    // Load ancestral alignments
    if (ancestralAlignmentsFile != null) {
      try {
        ancestralAlignments = new IndexedFastaSequenceFile(ancestralAlignmentsFile);
      } catch (FileNotFoundException e) {
        // keep the original exception as the cause so the underlying failure is not lost
        throw new ReviewedStingException(
            String.format(
                "The ancestral alignments file, '%s', could not be found",
                ancestralAlignmentsFile.getAbsolutePath()),
            e);
      }
    }

    // initialize CNVs
    if (knownCNVsFile != null) {
      knownCNVsByContig = createIntervalTreeByContig(knownCNVsFile);
    }
  }

  /**
   * Builds one interval tree per reference contig and files every interval of the binding into
   * the tree of its contig.
   *
   * @param intervals the interval binding whose intervals are resolved through the toolkit
   * @return a map from contig name to the tree holding that contig's intervals; every contig
   *     returned by {@link #getContigNames()} has an entry, possibly an empty tree
   */
  public final Map<String, IntervalTree<GenomeLoc>> createIntervalTreeByContig(
      final IntervalBinding<Feature> intervals) {
    final List<GenomeLoc> boundIntervals = intervals.getIntervals(getToolkit());

    // start with an empty tree for each contig of the reference
    final Map<String, IntervalTree<GenomeLoc>> treesByContig =
        new HashMap<String, IntervalTree<GenomeLoc>>();
    for (final String contigName : getContigNames()) {
      treesByContig.put(contigName, new IntervalTree<GenomeLoc>());
    }

    // then drop each interval into the tree of the contig it lies on
    for (final GenomeLoc interval : boundIntervals) {
      final IntervalTree<GenomeLoc> contigTree = treesByContig.get(interval.getContig());
      contigTree.put(interval.getStart(), interval.getStop(), interval);
    }

    return treesByContig;
  }

  /**
   * Collect relevant information from each variant in the supplied VCFs.
   *
   * <p>Every evaluation context is first handed the locus through {@code update0}. When a tracker
   * is available, the eval and comp variants at the locus are bound and, for every combination of
   * eval track, stratification sample name, eval variant and comp track, the matching comp is
   * looked up and the (comp, eval) pair is applied to each evaluation context selected by the
   * stratifiers.
   *
   * @param tracker the reference-ordered data at this locus; when null only the {@code update0}
   *     pass runs
   * @param ref the reference context for this locus
   * @param context the alignment context for this locus
   * @return always null
   */
  @Override
  public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
    // let every evaluation context see this locus, holding its monitor while it updates
    for (NewEvaluationContext nec : evaluationContexts.values()) {
      synchronized (nec) {
        nec.update0(tracker, ref, context);
      }
    }

    if (tracker != null) {
      // bases of the ancestral alignment over the reference locus, or null when no ancestral
      // alignments were loaded
      String aastr =
          (ancestralAlignments == null)
              ? null
              : new String(
                  ancestralAlignments
                      .getSubsequenceAt(
                          ref.getLocus().getContig(),
                          ref.getLocus().getStart(),
                          ref.getLocus().getStop())
                      .getBases());

      //      --------- track ---------           sample  - VariantContexts -
      HashMap<RodBinding<VariantContext>, HashMap<String, Collection<VariantContext>>> evalVCs =
          variantEvalUtils.bindVariantContexts(
              tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled, mergeEvals);
      HashMap<RodBinding<VariantContext>, HashMap<String, Collection<VariantContext>>> compVCs =
          variantEvalUtils.bindVariantContexts(
              tracker, ref, comps, byFilterIsEnabled, false, false, false);

      // for each eval track
      for (final RodBinding<VariantContext> evalRod : evals) {
        // fall back to an empty map when nothing was bound for this eval track
        final Map<String, Collection<VariantContext>> emptyEvalMap = Collections.emptyMap();
        final Map<String, Collection<VariantContext>> evalSet =
            evalVCs.containsKey(evalRod) ? evalVCs.get(evalRod) : emptyEvalMap;

        // for each sample stratifier
        for (final String sampleName : sampleNamesForStratification) {
          Collection<VariantContext> evalSetBySample = evalSet.get(sampleName);
          if (evalSetBySample == null) {
            // no eval for this sample here: use a single null eval so the loops below still run
            // once
            evalSetBySample = new HashSet<VariantContext>(1);
            evalSetBySample.add(null);
          }

          // for each eval in the track
          for (VariantContext eval : evalSetBySample) {
            // deal with ancestral alleles if requested
            if (eval != null && aastr != null) {
              eval = new VariantContextBuilder(eval).attribute("ANCESTRALALLELE", aastr).make();
            }

            // for each comp track
            for (final RodBinding<VariantContext> compRod : comps) {
              // no sample stratification for comps
              // (the first bound collection is used, or an empty list when nothing is bound)
              final HashMap<String, Collection<VariantContext>> compSetHash = compVCs.get(compRod);
              final Collection<VariantContext> compSet =
                  (compSetHash == null || compSetHash.size() == 0)
                      ? Collections.<VariantContext>emptyList()
                      : compVCs.get(compRod).values().iterator().next();

              // find the comp
              final VariantContext comp = findMatchingComp(eval, compSet);

              // ask each stratifier which of its states apply to this eval/comp pair
              HashMap<VariantStratifier, List<String>> stateMap =
                  new HashMap<VariantStratifier, List<String>>();
              for (VariantStratifier vs : stratificationObjects) {
                List<String> states =
                    vs.getRelevantStates(
                        ref, tracker, comp, compRod.getName(), eval, evalRod.getName(), sampleName);
                stateMap.put(vs, states);
              }

              // build the state keys for those states and de-duplicate them
              ArrayList<StateKey> stateKeys = new ArrayList<StateKey>();
              variantEvalUtils.initializeStateKeys(stateMap, null, null, stateKeys);

              HashSet<StateKey> stateKeysHash = new HashSet<StateKey>(stateKeys);

              for (StateKey stateKey : stateKeysHash) {
                NewEvaluationContext nec = evaluationContexts.get(stateKey);

                // eval against the comp
                synchronized (nec) {
                  nec.apply(tracker, ref, context, comp, eval);
                }

                // eval=null against all comps of different type that aren't bound to another eval
                for (VariantContext otherComp : compSet) {
                  if (otherComp != comp && !compHasMatchingEval(otherComp, evalSetBySample)) {
                    synchronized (nec) {
                      nec.apply(tracker, ref, context, otherComp, null);
                    }
                  }
                }
              }
            }
          }
        }

        if (mergeEvals) break; // stop processing the eval tracks
      }
    }

    return null;
  }

  /**
   * Tells whether at least one non-null eval in {@code evals} matches {@code comp}, either
   * strictly or leniently, as decided by {@code doEvalAndCompMatch} under the current
   * requireStrictAlleleMatch setting.
   *
   * @param comp the comp variant to look for a partner for
   * @param evals the candidate eval variants; null elements are skipped
   * @return true if some eval matches the comp, false otherwise
   */
  @Requires({"comp != null", "evals != null"})
  private boolean compHasMatchingEval(
      final VariantContext comp, final Collection<VariantContext> evals) {
    for (final VariantContext eval : evals) {
      // arguments follow the callee's declared order: eval first, then comp
      if (eval != null
          && doEvalAndCompMatch(eval, comp, requireStrictAlleleMatch)
              != EvalCompMatchType.NO_MATCH) {
        return true;
      }
    }

    // nothing matched
    return false;
  }

  /** Outcome of comparing an eval variant with a comp variant in doEvalAndCompMatch. */
  private enum EvalCompMatchType {
    // the variant types differ, or the alleles differ while a strict allele match is required
    NO_MATCH,
    // neither variant has an alternate allele, or first alternate and reference alleles are equal
    STRICT,
    // same variant type without a strict allele match, when strict matching is not required
    LENIENT
  }

  /**
   * Classifies how well an eval variant and a comp variant agree.
   *
   * <p>Variants of different type never match. Variants of the same type match strictly when
   * neither has an alternate allele, or when their first alternate alleles and their reference
   * alleles are equal. Any other same-type pair is a lenient match, unless strict matching is
   * required, in which case it is no match.
   *
   * @param eval the eval variant
   * @param comp the comp variant
   * @param requireStrictAlleleMatch whether non-strict matches should be reported as no match
   * @return the match classification
   */
  @Requires({"eval != null", "comp != null"})
  private EvalCompMatchType doEvalAndCompMatch(
      final VariantContext eval, final VariantContext comp, boolean requireStrictAlleleMatch) {
    // variants of different type can never be paired
    if (comp.getType() != eval.getType()) {
      return EvalCompMatchType.NO_MATCH;
    }

    // only the first alternate allele (if any) of each variant takes part in the comparison
    final Allele evalAlt = eval.getAlternateAlleles().isEmpty() ? null : eval.getAlternateAllele(0);
    final Allele compAlt = comp.getAlternateAlleles().isEmpty() ? null : comp.getAlternateAllele(0);

    final boolean allelesAgree;
    if (evalAlt == null) {
      allelesAgree = compAlt == null;
    } else {
      allelesAgree = evalAlt.equals(compAlt) && eval.getReference().equals(comp.getReference());
    }

    if (allelesAgree) {
      return EvalCompMatchType.STRICT;
    }
    if (requireStrictAlleleMatch) {
      return EvalCompMatchType.NO_MATCH;
    }
    return EvalCompMatchType.LENIENT;
  }

  /**
   * Picks the comp variant that best matches {@code eval}.
   *
   * @param eval the eval variant, or null when there is no eval at this site
   * @param comps the candidate comp variants; may be null or empty
   * @return null when there are no comps; the first comp when {@code eval} is null; otherwise the
   *     first strictly matching comp, else the first leniently matching comp, else null
   */
  private VariantContext findMatchingComp(
      final VariantContext eval, final Collection<VariantContext> comps) {
    // if no comps, return null
    if (comps == null || comps.isEmpty()) {
      return null;
    }

    // if no eval, return any comp
    if (eval == null) {
      return comps.iterator().next();
    }

    // a strict match wins immediately; otherwise remember the first lenient match
    VariantContext lenientMatch = null;
    for (final VariantContext comp : comps) {
      // arguments follow the callee's declared order: eval first, then comp
      switch (doEvalAndCompMatch(eval, comp, requireStrictAlleleMatch)) {
        case STRICT:
          return comp;
        case LENIENT:
          if (lenientMatch == null) {
            lenientMatch = comp;
          }
          break;
        case NO_MATCH:
          break;
      }
    }

    // no strict match: return lenientMatch, which might be null
    return lenientMatch;
  }

  /**
   * Combines two partial reduce results. Both arguments are ignored.
   *
   * @param lhs unused
   * @param rhs unused
   * @return always null
   */
  public Integer treeReduce(Integer lhs, Integer rhs) {
    return null;
  }

  /**
   * Supplies the initial reduce value.
   *
   * @return always null
   */
  @Override
  public Integer reduceInit() {
    return null;
  }

  /**
   * Folds a map result into the running reduce value. Both arguments are ignored.
   *
   * @param value unused
   * @param sum unused
   * @return always null
   */
  @Override
  public Integer reduce(Integer value, Integer sum) {
    return null;
  }

  /**
   * Output the finalized report.
   *
   * <p>For every evaluation context, each evaluator is finalized and its data-point fields (as
   * reported by {@code AnalysisModuleScanner}) are read reflectively. A field holding a
   * {@code TableType} is written to its own sub-table named
   * {@code <EvaluatorSimpleName>.<fieldName>}, one report row per table row; any other field is
   * written as a single cell of the evaluator's table. The report is then printed to {@code out}.
   *
   * @param result an integer that doesn't get used for anything
   * @throws StingException if a data-point field cannot be read reflectively
   */
  public void onTraversalDone(Integer result) {
    logger.info("Finalizing variant report");

    for (Map.Entry<StateKey, NewEvaluationContext> contextEntry : evaluationContexts.entrySet()) {
      final StateKey stateKey = contextEntry.getKey();
      final NewEvaluationContext nec = contextEntry.getValue();
      final String stateKeyName = stateKey.toString();

      for (VariantEvaluator ve : nec.getEvaluationClassList().values()) {
        ve.finalizeEvaluation();

        AnalysisModuleScanner scanner = new AnalysisModuleScanner(ve);
        Map<Field, DataPoint> datamap = scanner.getData();

        for (Map.Entry<Field, DataPoint> dataEntry : datamap.entrySet()) {
          final Field field = dataEntry.getKey();
          try {
            field.setAccessible(true);
            // read the field once and reuse the value in whichever branch applies
            final Object value = field.get(ve);

            if (value instanceof TableType) {
              final TableType t = (TableType) value;
              final Object[] rowKeys = t.getRowKeys();
              final Object[] columnKeys = t.getColumnKeys();

              final String subTableName = ve.getClass().getSimpleName() + "." + field.getName();

              GATKReportTable table;
              if (!report.hasTable(subTableName)) {
                // first time this sub-table is seen: create it and declare all of its columns
                report.addTable(subTableName, dataEntry.getValue().description());
                table = report.getTable(subTableName);

                table.addPrimaryKey("entry", false);
                table.addColumn(subTableName, subTableName);

                for (VariantStratifier vs : stratificationObjects) {
                  table.addColumn(vs.getName(), "unknown");
                }

                table.addColumn("row", "unknown");

                for (Object columnKey : columnKeys) {
                  table.addColumn(columnKey.toString(), 0.0);
                }
              } else {
                table = report.getTable(subTableName);
              }

              for (int row = 0; row < rowKeys.length; row++) {
                final String r = (String) rowKeys[row];
                // the report row is identified by the state key followed by the table row key
                final String rowStateKey = stateKeyName + r;

                for (VariantStratifier vs : stratificationObjects) {
                  final String columnName = vs.getName();
                  table.set(rowStateKey, columnName, stateKey.get(columnName));
                }

                // the row label does not depend on the column, so it is written once per row
                table.set(rowStateKey, "row", r);

                for (int col = 0; col < columnKeys.length; col++) {
                  table.set(rowStateKey, columnKeys[col].toString(), t.getCell(row, col));
                }
              }
            } else {
              GATKReportTable table = report.getTable(ve.getClass().getSimpleName());

              for (VariantStratifier vs : stratificationObjects) {
                final String columnName = vs.getName();
                table.set(stateKeyName, columnName, stateKey.get(columnName));
              }

              table.set(stateKeyName, field.getName(), value);
            }
          } catch (IllegalAccessException e) {
            // keep the original exception as the cause so its stack trace is not lost
            throw new StingException("IllegalAccessException: " + e, e);
          }
        }
      }
    }

    report.print(out);
  }

  // Accessors
  /** Returns the logger used by this walker. */
  public Logger getLogger() {
    return logger;
  }

  /**
   * Returns the sample count set during initialization: the numSamples argument when it is
   * positive, otherwise the number of sample names selected for evaluation.
   */
  public int getNumSamples() {
    return numSamples;
  }

  /** Returns the value of the minPhaseQuality argument. */
  public double getMinPhaseQuality() {
    return MIN_PHASE_QUALITY;
  }

  /** Returns the value of the mendelianViolationQualThreshold argument. */
  public double getMendelianViolationQualThreshold() {
    return MENDELIAN_VIOLATION_QUAL_THRESHOLD;
  }

  /** Returns the stratifier set created during initialization (null before that); not a copy. */
  public TreeSet<VariantStratifier> getStratificationObjects() {
    return stratificationObjects;
  }

  /** Returns the sample name that is always part of the stratification sample names. */
  public static String getAllSampleName() {
    return ALL_SAMPLE_NAME;
  }

  /** Returns the internal list of tracks treated as known variants; not a copy. */
  public List<RodBinding<VariantContext>> getKnowns() {
    return knowns;
  }

  /** Returns the eval track bindings exactly as supplied through the eval argument. */
  public List<RodBinding<VariantContext>> getEvals() {
    return evals;
  }

  /** Returns the internal set of sample names selected for evaluation; not a copy. */
  public Set<String> getSampleNamesForEvaluation() {
    return sampleNamesForEvaluation;
  }

  /** Returns the internal set of sample names used for stratification; not a copy. */
  public Set<String> getSampleNamesForStratification() {
    return sampleNamesForStratification;
  }

  /** Returns the internal list of comp tracks assembled during initialization; not a copy. */
  public List<RodBinding<VariantContext>> getComps() {
    return comps;
  }

  /** Returns the internal set of select expressions built during initialization; not a copy. */
  public Set<SortableJexlVCMatchExp> getJexlExpressions() {
    return jexlExpressions;
  }

  /**
   * Collects the names of all sequences in the reference's sequence dictionary.
   *
   * @return a new sorted set of the sequence names
   */
  public Set<String> getContigNames() {
    final GenomeAnalysisEngine engine = getToolkit();
    final TreeSet<String> sequenceNames = new TreeSet<String>();

    for (final SAMSequenceRecord sequence :
        engine.getReferenceDataSource().getReference().getSequenceDictionary().getSequences()) {
      sequenceNames.add(sequence.getSequenceName());
    }

    return sequenceNames;
  }

  /** Returns the genome location parser of the toolkit this walker runs under. */
  public GenomeLocParser getGenomeLocParser() {
    return getToolkit().getGenomeLocParser();
  }

  /** Returns the toolkit from the superclass; redeclared here as a public method. */
  public GenomeAnalysisEngine getToolkit() {
    return super.getToolkit();
  }
}
Exemplo n.º 3
0
  /**
   * Collect relevant information from each variant in the supplied VCFs.
   *
   * <p>The processed-loci counter is advanced first. When a tracker is available, the eval and
   * comp variants at the locus are bound and, for every combination of eval track,
   * stratification sample name, eval variant and comp track, the matching comp is looked up and
   * the (comp, eval) pair is applied to each evaluation context returned by
   * {@code getEvaluationContexts}.
   *
   * @param tracker the reference-ordered data at this locus; when null only the counter is
   *     updated
   * @param ref the reference context for this locus; a null ref adds no locus of its own to the
   *     counter
   * @param context the alignment context for this locus
   * @return always null
   */
  @Override
  public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
    // we track the processed bp and expose this for modules instead of wasting CPU power on
    // calculating the same thing over and over in evals that want the processed bp
    synchronized (this) {
      nProcessedLoci += context.getSkippedBases() + (ref == null ? 0 : 1);
    }

    if (tracker != null) {
      // bases of the ancestral alignment over the reference locus, or null when no ancestral
      // alignments were loaded
      String aastr =
          (ancestralAlignments == null)
              ? null
              : new String(
                  ancestralAlignments
                      .getSubsequenceAt(
                          ref.getLocus().getContig(),
                          ref.getLocus().getStart(),
                          ref.getLocus().getStop())
                      .getBases());

      //            // update the dynamic stratifications
      //            for (final VariantContext vc : tracker.getValues(evals, ref.getLocus())) {
      //                // don't worry -- DynamicStratification only work with one eval object
      //                for ( final DynamicStratification ds :  dynamicStratifications ) {
      //                    ds.update(vc);
      //                }
      //            }

      //      --------- track ---------           sample  - VariantContexts -
      HashMap<RodBinding<VariantContext>, HashMap<String, Collection<VariantContext>>> evalVCs =
          variantEvalUtils.bindVariantContexts(
              tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled, mergeEvals);
      HashMap<RodBinding<VariantContext>, HashMap<String, Collection<VariantContext>>> compVCs =
          variantEvalUtils.bindVariantContexts(
              tracker, ref, comps, byFilterIsEnabled, false, false, false);

      // for each eval track
      for (final RodBinding<VariantContext> evalRod : evals) {
        // fall back to an empty map when nothing was bound for this eval track
        final Map<String, Collection<VariantContext>> emptyEvalMap = Collections.emptyMap();
        final Map<String, Collection<VariantContext>> evalSet =
            evalVCs.containsKey(evalRod) ? evalVCs.get(evalRod) : emptyEvalMap;

        // for each sample stratifier
        for (final String sampleName : sampleNamesForStratification) {
          Collection<VariantContext> evalSetBySample = evalSet.get(sampleName);
          if (evalSetBySample == null) {
            // no eval for this sample here: use a single null eval so the loops below still run
            // once
            evalSetBySample = new HashSet<VariantContext>(1);
            evalSetBySample.add(null);
          }

          // for each eval in the track
          for (VariantContext eval : evalSetBySample) {
            // deal with ancestral alleles if requested
            if (eval != null && aastr != null) {
              eval = new VariantContextBuilder(eval).attribute("ANCESTRALALLELE", aastr).make();
            }

            // for each comp track
            for (final RodBinding<VariantContext> compRod : comps) {
              // no sample stratification for comps
              // (the first bound collection is used, or an empty list when nothing is bound)
              final HashMap<String, Collection<VariantContext>> compSetHash = compVCs.get(compRod);
              final Collection<VariantContext> compSet =
                  (compSetHash == null || compSetHash.size() == 0)
                      ? Collections.<VariantContext>emptyList()
                      : compVCs.get(compRod).values().iterator().next();

              // find the comp
              final VariantContext comp = findMatchingComp(eval, compSet);

              // visit every evaluation context selected for this eval/comp/sample combination
              for (EvaluationContext nec :
                  getEvaluationContexts(
                      tracker, ref, eval, evalRod.getName(), comp, compRod.getName(), sampleName)) {

                // eval against the comp
                synchronized (nec) {
                  nec.apply(tracker, ref, context, comp, eval);
                }

                // eval=null against all comps of different type that aren't bound to another eval
                for (VariantContext otherComp : compSet) {
                  if (otherComp != comp && !compHasMatchingEval(otherComp, evalSetBySample)) {
                    synchronized (nec) {
                      nec.apply(tracker, ref, context, otherComp, null);
                    }
                  }
                }
              }
            }
          }
        }

        if (mergeEvals) break; // stop processing the eval tracks
      }
    }

    return null;
  }
  /**
   * Collect relevant information from each variant in the supplied VCFs.
   *
   * <p>Every evaluation context is first handed the locus through {@code update0}. When a tracker
   * is available, the eval and comp variants at the locus are bound and, for every combination of
   * eval track, stratification sample name, eval variant and comp track, the matching comp is
   * looked up and the (comp, eval) pair is applied to each evaluation context selected by the
   * stratifiers.
   *
   * @param tracker the reference-ordered data at this locus; when null only the {@code update0}
   *     pass runs
   * @param ref the reference context for this locus
   * @param context the alignment context for this locus
   * @return always null
   */
  @Override
  public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
    // let every evaluation context see this locus, holding its monitor while it updates
    for (NewEvaluationContext nec : evaluationContexts.values()) {
      synchronized (nec) {
        nec.update0(tracker, ref, context);
      }
    }

    if (tracker != null) {
      // bases of the ancestral alignment over the reference locus, or null when no ancestral
      // alignments were loaded
      String aastr =
          (ancestralAlignments == null)
              ? null
              : new String(
                  ancestralAlignments
                      .getSubsequenceAt(
                          ref.getLocus().getContig(),
                          ref.getLocus().getStart(),
                          ref.getLocus().getStop())
                      .getBases());

      //      --------- track ---------           sample  - VariantContexts -
      HashMap<RodBinding<VariantContext>, HashMap<String, Collection<VariantContext>>> evalVCs =
          variantEvalUtils.bindVariantContexts(
              tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled, mergeEvals);
      HashMap<RodBinding<VariantContext>, HashMap<String, Collection<VariantContext>>> compVCs =
          variantEvalUtils.bindVariantContexts(
              tracker, ref, comps, byFilterIsEnabled, false, false, false);

      // for each eval track
      for (final RodBinding<VariantContext> evalRod : evals) {
        // fall back to an empty map when nothing was bound for this eval track
        final Map<String, Collection<VariantContext>> emptyEvalMap = Collections.emptyMap();
        final Map<String, Collection<VariantContext>> evalSet =
            evalVCs.containsKey(evalRod) ? evalVCs.get(evalRod) : emptyEvalMap;

        // for each sample stratifier
        for (final String sampleName : sampleNamesForStratification) {
          Collection<VariantContext> evalSetBySample = evalSet.get(sampleName);
          if (evalSetBySample == null) {
            // no eval for this sample here: use a single null eval so the loops below still run
            // once
            evalSetBySample = new HashSet<VariantContext>(1);
            evalSetBySample.add(null);
          }

          // for each eval in the track
          for (VariantContext eval : evalSetBySample) {
            // deal with ancestral alleles if requested
            if (eval != null && aastr != null) {
              eval = new VariantContextBuilder(eval).attribute("ANCESTRALALLELE", aastr).make();
            }

            // for each comp track
            for (final RodBinding<VariantContext> compRod : comps) {
              // no sample stratification for comps
              // (the first bound collection is used, or an empty list when nothing is bound)
              final HashMap<String, Collection<VariantContext>> compSetHash = compVCs.get(compRod);
              final Collection<VariantContext> compSet =
                  (compSetHash == null || compSetHash.size() == 0)
                      ? Collections.<VariantContext>emptyList()
                      : compVCs.get(compRod).values().iterator().next();

              // find the comp
              final VariantContext comp = findMatchingComp(eval, compSet);

              // ask each stratifier which of its states apply to this eval/comp pair
              HashMap<VariantStratifier, List<String>> stateMap =
                  new HashMap<VariantStratifier, List<String>>();
              for (VariantStratifier vs : stratificationObjects) {
                List<String> states =
                    vs.getRelevantStates(
                        ref, tracker, comp, compRod.getName(), eval, evalRod.getName(), sampleName);
                stateMap.put(vs, states);
              }

              // build the state keys for those states and de-duplicate them
              ArrayList<StateKey> stateKeys = new ArrayList<StateKey>();
              variantEvalUtils.initializeStateKeys(stateMap, null, null, stateKeys);

              HashSet<StateKey> stateKeysHash = new HashSet<StateKey>(stateKeys);

              for (StateKey stateKey : stateKeysHash) {
                NewEvaluationContext nec = evaluationContexts.get(stateKey);

                // eval against the comp
                synchronized (nec) {
                  nec.apply(tracker, ref, context, comp, eval);
                }

                // eval=null against all comps of different type that aren't bound to another eval
                for (VariantContext otherComp : compSet) {
                  if (otherComp != comp && !compHasMatchingEval(otherComp, evalSetBySample)) {
                    synchronized (nec) {
                      nec.apply(tracker, ref, context, otherComp, null);
                    }
                  }
                }
              }
            }
          }
        }

        if (mergeEvals) break; // stop processing the eval tracks
      }
    }

    return null;
  }
Exemplo n.º 5
0
/**
 * General-purpose tool for variant evaluation (% in dbSNP, genotype concordance, Ti/Tv ratios, and
 * a lot more)
 *
 * <p>Given a variant callset, it is common to calculate various quality control metrics. These
 * metrics include the number of raw or filtered SNP counts; ratio of transition mutations to
 * transversions; concordance of a particular sample's calls to a genotyping chip; number of
 * singletons per sample; etc. Furthermore, it is often useful to stratify these metrics by various
 * criteria like functional class (missense, nonsense, silent), whether the site is CpG site, the
 * amino acid degeneracy of the site, etc. VariantEval facilitates these calculations in two ways:
 * by providing several built-in evaluation and stratification modules, and by providing a framework
 * that permits the easy development of new evaluation and stratification modules.
 *
 * <h2>Input</h2>
 *
 * <p>One or more variant sets to evaluate plus any number of comparison sets.
 *
 * <h2>Output</h2>
 *
 * <p>Evaluation tables detailing the results of the eval modules which were applied. For example:
 *
 * <pre>
 * output.eval.gatkreport:
 * ##:GATKReport.v0.1 CountVariants : Counts different classes of variants in the sample
 * CountVariants  CompRod   CpG      EvalRod  JexlExpression  Novelty  nProcessedLoci  nCalledLoci  nRefLoci  nVariantLoci  variantRate ...
 * CountVariants  dbsnp     CpG      eval     none            all      65900028        135770       0         135770        0.00206024  ...
 * CountVariants  dbsnp     CpG      eval     none            known    65900028        47068        0         47068         0.00071423  ...
 * CountVariants  dbsnp     CpG      eval     none            novel    65900028        88702        0         88702         0.00134601  ...
 * CountVariants  dbsnp     all      eval     none            all      65900028        330818       0         330818        0.00502000  ...
 * CountVariants  dbsnp     all      eval     none            known    65900028        120685       0         120685        0.00183133  ...
 * CountVariants  dbsnp     all      eval     none            novel    65900028        210133       0         210133        0.00318866  ...
 * CountVariants  dbsnp     non_CpG  eval     none            all      65900028        195048       0         195048        0.00295976  ...
 * CountVariants  dbsnp     non_CpG  eval     none            known    65900028        73617        0         73617         0.00111710  ...
 * CountVariants  dbsnp     non_CpG  eval     none            novel    65900028        121431       0         121431        0.00184265  ...
 * ...
 * </pre>
 *
 * <h2>Examples</h2>
 *
 * <pre>
 * java -Xmx2g -jar GenomeAnalysisTK.jar \
 *   -R ref.fasta \
 *   -T VariantEval \
 *   -o output.eval.gatkreport \
 *   --eval:set1 set1.vcf \
 *   --eval:set2 set2.vcf \
 *   [--comp comp.vcf]
 * </pre>
 */
@DocumentedGATKFeature(
    groupName = HelpConstants.DOCS_CAT_VARMANIP,
    extraDocs = {CommandLineGATK.class})
@Reference(window = @Window(start = -50, stop = 50))
@PartitionBy(PartitionType.NONE)
public class VariantEval extends RodWalker<Integer, Integer> implements TreeReducible<Integer> {
  // Attribute key constant. It is not referenced in this part of the file; presumably evaluators
  // or stratifiers use it to tag singleton variants (TODO confirm against its users).
  public static final String IS_SINGLETON_KEY = "ISSINGLETON";

  // Destination of the final report; handed to VariantEvalReportWriter in onTraversalDone().
  @Output protected PrintStream out;

  /** The variant file(s) to evaluate. */
  @Input(fullName = "eval", shortName = "eval", doc = "Input evaluation file(s)", required = true)
  public List<RodBinding<VariantContext>> evals;

  /** The variant file(s) to compare against. */
  @Input(fullName = "comp", shortName = "comp", doc = "Input comparison file(s)", required = false)
  public List<RodBinding<VariantContext>> compsProvided = Collections.emptyList();

  // The comparison tracks actually used. Filled in initialize() from compsProvided, plus the dbSNP
  // track when it is bound, plus a dummy "none" track when no other comp track is available.
  private List<RodBinding<VariantContext>> comps = new ArrayList<RodBinding<VariantContext>>();

  /**
   * dbSNP comparison VCF. By default, the dbSNP file is used to specify the set of "known"
   * variants. Other sets can be specified with the -knownName (--known_names) argument.
   */
  @ArgumentCollection protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();

  /**
   * Some analyses want to count overlap not with dbSNP (which is in general very open) but
   * specifically with a set of gold standard sites such as HapMap, OMNI, or the gold standard
   * indels. This argument provides a mechanism for communicating which file to use.
   */
  @Input(
      fullName = "goldStandard",
      shortName = "gold",
      doc =
          "Evaluations that count calls at sites of true variation (e.g., indel calls) will use this argument as their gold standard for comparison",
      required = false)
  public RodBinding<VariantContext> goldStandard = null;

  /**
   * When set, initialize() asks the utility class to list the available modules and exit before
   * doing any other set-up. Note that the --list argument requires a fully resolved and correct
   * command-line to work.
   */
  @Argument(
      fullName = "list",
      shortName = "ls",
      doc = "List the available eval modules and exit",
      required = false)
  protected Boolean LIST = false;

  // Partitioning the data arguments

  // Select expressions; paired with SELECT_NAMES and compiled into JEXL match expressions in
  // initialize() via VariantContextUtils.initializeMatchExps(SELECT_NAMES, SELECT_EXPS).
  @Argument(
      shortName = "select",
      doc = "One or more stratifications to use when evaluating the data",
      required = false)
  protected ArrayList<String> SELECT_EXPS = new ArrayList<String>();

  // Display names for the select expressions above, matched one-to-one with SELECT_EXPS.
  @Argument(
      shortName = "selectName",
      doc = "Names to use for the list of stratifications (must be a 1-to-1 mapping)",
      required = false)
  protected ArrayList<String> SELECT_NAMES = new ArrayList<String>();

  // Sample expressions used in initialize() to pick the samples to evaluate. There is no
  // initializer, so this is null unless the argument parser assigns it.
  @Argument(
      fullName = "sample",
      shortName = "sn",
      doc =
          "Derive eval and comp contexts using only these sample genotypes, when genotypes are available in the original context",
      required = false)
  protected Set<String> SAMPLE_EXPRESSIONS;

  /** List of rod tracks to be used for specifying "known" variants other than dbSNP. */
  @Argument(
      shortName = "knownName",
      doc =
          "Name of ROD bindings containing variant sites that should be treated as known when splitting eval rods into known and novel subsets",
      required = false)
  protected HashSet<String> KNOWN_NAMES = new HashSet<String>();

  // Tracks treated as "known": the dbSNP track when bound, plus every comp track whose name is
  // listed in KNOWN_NAMES. Populated in initialize().
  List<RodBinding<VariantContext>> knowns = new ArrayList<RodBinding<VariantContext>>();

  // Stratification arguments

  // Names of extra stratification modules; passed to
  // VariantEvalUtils.initializeStratificationObjects in initialize().
  @Argument(
      fullName = "stratificationModule",
      shortName = "ST",
      doc =
          "One or more specific stratification modules to apply to the eval track(s) (in addition to the standard stratifications, unless -noST is specified)",
      required = false)
  protected String[] STRATIFICATIONS_TO_USE = {};

  // When true, only the stratifications named with -ST are requested from VariantEvalUtils.
  @Argument(
      fullName = "doNotUseAllStandardStratifications",
      shortName = "noST",
      doc =
          "Do not use the standard stratification modules by default (instead, only those that are specified with the -ST option)",
      required = false)
  protected Boolean NO_STANDARD_STRATIFICATIONS = false;

  /** See the --list argument to view available modules. */
  @Argument(
      fullName = "evalModule",
      shortName = "EV",
      doc =
          "One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noEV is specified)",
      required = false)
  protected String[] MODULES_TO_USE = {};

  // When true, only the eval modules named with -EV are requested from VariantEvalUtils.
  @Argument(
      fullName = "doNotUseAllStandardModules",
      shortName = "noEV",
      doc =
          "Do not use the standard modules by default (instead, only those that are specified with the -EV option)",
      required = false)
  protected Boolean NO_STANDARD_MODULES = false;

  // Exposed to modules through getMinPhaseQuality().
  @Argument(
      fullName = "minPhaseQuality",
      shortName = "mpq",
      doc = "Minimum phasing quality",
      required = false)
  protected double MIN_PHASE_QUALITY = 10.0;

  // Exposed to modules through getMendelianViolationQualThreshold().
  @Argument(
      shortName = "mvq",
      fullName = "mendelianViolationQualThreshold",
      doc =
          "Minimum genotype QUAL score for each trio member required to accept a site as a violation. Default is 50.",
      required = false)
  protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 50;

  // Exposed to modules through getSamplePloidy().
  @Argument(
      shortName = "ploidy",
      fullName = "samplePloidy",
      doc = "Per-sample ploidy (number of chromosomes per sample)",
      required = false)
  protected int ploidy = GATKVariantContextUtils.DEFAULT_PLOIDY;

  // Optional FASTA; when set, initialize() opens it as an IndexedFastaSequenceFile and map()
  // annotates each eval record with the bases found at the current locus.
  @Argument(
      fullName = "ancestralAlignments",
      shortName = "aa",
      doc = "Fasta file with ancestral alleles",
      required = false)
  private File ancestralAlignmentsFile = null;

  // Forwarded to doEvalAndCompMatch(): when true, a same-type pair with different alleles is
  // reported as NO_MATCH instead of LENIENT.
  @Argument(
      fullName = "requireStrictAlleleMatch",
      shortName = "strict",
      doc =
          "If provided only comp and eval tracks with exactly matching reference and alternate alleles will be counted as overlapping",
      required = false)
  private boolean requireStrictAlleleMatch = false;

  // Exposed, negated, through ignoreAC0Sites().
  @Argument(
      fullName = "keepAC0",
      shortName = "keepAC0",
      doc =
          "If provided, modules that track polymorphic sites will not require that a site have AC > 0 when the input eval has genotypes",
      required = false)
  private boolean keepSitesWithAC0 = false;

  // Fallback sample count: getNumberOfSamplesForEvaluation() returns this value only when no
  // sample names were resolved for evaluation.
  @Hidden
  @Argument(
      fullName = "numSamples",
      shortName = "numSamples",
      doc = "Number of samples (used if no samples are specified in the VCF file)",
      required = false)
  private int numSamplesFromArgument = 0;

  /**
   * If true, VariantEval will treat -eval 1 -eval 2 as separate tracks from the same underlying
   * variant set, and evaluate the union of the results. Useful when you want to do -eval chr1.vcf
   * -eval chr2.vcf etc.
   */
  @Argument(
      fullName = "mergeEvals",
      shortName = "mergeEvals",
      doc = "If provided, all -eval tracks will be merged into a single eval track",
      required = false)
  public boolean mergeEvals = false;

  /**
   * File containing tribble-readable features for the IntervalStratification. When this is
   * provided, initialize() requires that the IntervalStratification stratifier is enabled.
   */
  @Input(
      fullName = "stratIntervals",
      shortName = "stratIntervals",
      doc = "File containing tribble-readable features for the IntervalStratification",
      required = false)
  public IntervalBinding<Feature> intervalsFile = null;

  /**
   * File containing tribble-readable features containing known CNVs. For use with VariantSummary
   * table.
   */
  @Input(
      fullName = "knownCNVs",
      shortName = "knownCNVs",
      doc =
          "File containing tribble-readable features describing a known list of copy number variants",
      required = false)
  public IntervalBinding<Feature> knownCNVsFile = null;

  // Known CNV intervals indexed by contig name; stays an empty map unless knownCNVsFile is given,
  // in which case initialize() builds it with createIntervalTreeByContig().
  Map<String, IntervalTree<GenomeLoc>> knownCNVsByContig = Collections.emptyMap();

  // Variables

  // Select expressions compiled in initialize(); held in a TreeSet, so iteration is sorted.
  private Set<SortableJexlVCMatchExp> jexlExpressions = new TreeSet<SortableJexlVCMatchExp>();

  // True when the samples chosen for evaluation do not cover every sample in the eval rods.
  private boolean isSubsettingSamples;
  // Samples whose genotypes are evaluated (filled, sorted, in initialize()).
  private Set<String> sampleNamesForEvaluation = new LinkedHashSet<String>();
  // Sample names iterated over in map(); always contains ALL_SAMPLE_NAME, and the individual
  // samples as well when the "Sample" stratification was requested.
  private Set<String> sampleNamesForStratification = new LinkedHashSet<String>();

  // important stratifications
  private boolean byFilterIsEnabled = false;
  private boolean perSampleIsEnabled = false;

  // Name of the pseudo-sample added to the stratification sample names in initialize(). It is
  // private and exposed through getAllSampleName(); nothing reassigns it, so it is a constant.
  private static final String ALL_SAMPLE_NAME = "all";

  // the number of processed bp for this walker
  long nProcessedLoci = 0;

  // Utility class
  private final VariantEvalUtils variantEvalUtils = new VariantEvalUtils(this);

  // Ancestral alignments
  private IndexedFastaSequenceFile ancestralAlignments = null;

  // The set of all possible evaluation contexts
  StratificationManager<VariantStratifier, EvaluationContext> stratManager;
  // Set<DynamicStratification> dynamicStratifications = Collections.emptySet();

  /**
   * Initialize the stratifications, evaluations, evaluation contexts, and reporting object.
   *
   * <p>In order, this handles the --list request, assembles the comp and known track lists,
   * resolves the sample names to evaluate, compiles the select expressions, instantiates the
   * stratifiers and evaluator classes (rejecting incompatible combinations), creates one evaluation
   * context per stratification state, and finally loads the optional ancestral alignments and
   * known CNV intervals.
   *
   * @throws UserException.BadArgumentValue if stratIntervals was supplied but the
   *     IntervalStratification stratifier is not among the active stratifiers
   * @throws ReviewedStingException if the ancestral alignments file cannot be found; the original
   *     FileNotFoundException is attached as the cause
   */
  public void initialize() {
    // Just list the modules, and exit quickly.
    if (LIST) {
      variantEvalUtils.listModulesAndExit();
    }

    // maintain the full list of comps
    comps.addAll(compsProvided);
    if (dbsnp.dbsnp.isBound()) {
      comps.add(dbsnp.dbsnp);
      knowns.add(dbsnp.dbsnp);
    }

    // Add a dummy comp track if none exists
    if (comps.isEmpty()) {
      comps.add(
          new RodBinding<VariantContext>(VariantContext.class, "none", "UNBOUND", "", new Tags()));
    }

    // Set up set of additional knowns
    for (RodBinding<VariantContext> compRod : comps) {
      if (KNOWN_NAMES.contains(compRod.getName())) {
        knowns.add(compRod);
      }
    }

    // Now that we have all the rods categorized, determine the sample list from the eval rods.
    Map<String, VCFHeader> vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), evals);
    Set<String> vcfSamples =
        SampleUtils.getSampleList(
            vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);

    // Load the sample list, using an intermediate tree set to sort the samples
    final Set<String> allSampleNames = SampleUtils.getSamplesFromCommandLineInput(vcfSamples);
    sampleNamesForEvaluation.addAll(
        new TreeSet<String>(
            SampleUtils.getSamplesFromCommandLineInput(vcfSamples, SAMPLE_EXPRESSIONS)));
    isSubsettingSamples = !sampleNamesForEvaluation.containsAll(allSampleNames);

    // Individual samples are only stratified on when the Sample stratification was requested;
    // the "all" pseudo-sample is always present.
    if (Arrays.asList(STRATIFICATIONS_TO_USE).contains("Sample")) {
      sampleNamesForStratification.addAll(sampleNamesForEvaluation);
    }
    sampleNamesForStratification.add(ALL_SAMPLE_NAME);

    // Initialize select expressions
    for (VariantContextUtils.JexlVCMatchExp jexl :
        VariantContextUtils.initializeMatchExps(SELECT_NAMES, SELECT_EXPS)) {
      SortableJexlVCMatchExp sjexl = new SortableJexlVCMatchExp(jexl.name, jexl.exp);
      jexlExpressions.add(sjexl);
    }

    // Initialize the set of stratifications and evaluations to use
    // The list of stratifiers and evaluators to use
    final List<VariantStratifier> stratificationObjects =
        variantEvalUtils.initializeStratificationObjects(
            NO_STANDARD_STRATIFICATIONS, STRATIFICATIONS_TO_USE);
    final Set<Class<? extends VariantEvaluator>> evaluationClasses =
        variantEvalUtils.initializeEvaluationObjects(NO_STANDARD_MODULES, MODULES_TO_USE);

    checkForIncompatibleEvaluatorsAndStratifiers(stratificationObjects, evaluationClasses);

    // Remember which of the stratifications that change how variant contexts are bound are on
    for (VariantStratifier vs : stratificationObjects) {
      if (vs.getName().equals("Filter")) {
        byFilterIsEnabled = true;
      } else if (vs.getName().equals("Sample")) {
        perSampleIsEnabled = true;
      }
    }

    // stratIntervals is only meaningful together with the IntervalStratification stratifier
    if (intervalsFile != null) {
      boolean intervalStratificationActive = false;
      for (final VariantStratifier vs : stratificationObjects) {
        if (vs.getClass().equals(IntervalStratification.class)) {
          intervalStratificationActive = true;
          break;
        }
      }
      if (!intervalStratificationActive) {
        throw new UserException.BadArgumentValue(
            "ST", "stratIntervals argument provided but -ST IntervalStratification not provided");
      }
    }

    // Initialize the evaluation contexts
    createStratificationStates(stratificationObjects, evaluationClasses);

    // Load ancestral alignments
    if (ancestralAlignmentsFile != null) {
      try {
        ancestralAlignments = new IndexedFastaSequenceFile(ancestralAlignmentsFile);
      } catch (FileNotFoundException e) {
        // keep the underlying exception as the cause so the stack trace is not lost
        throw new ReviewedStingException(
            String.format(
                "The ancestral alignments file, '%s', could not be found",
                ancestralAlignmentsFile.getAbsolutePath()),
            e);
      }
    }

    // initialize CNVs
    if (knownCNVsFile != null) {
      knownCNVsByContig = createIntervalTreeByContig(knownCNVsFile);
    }
  }

  /**
   * Reject any combination of a stratifier and an evaluator class that the stratifier declares
   * incompatible.
   *
   * @param stratificationObjects the stratifiers that will be used
   * @param evaluationClasses the evaluator classes that will be used
   * @throws UserException.BadArgumentValue on the first stratifier whose
   *     getIncompatibleEvaluators() contains one of the evaluator classes
   */
  final void checkForIncompatibleEvaluatorsAndStratifiers(
      final List<VariantStratifier> stratificationObjects,
      Set<Class<? extends VariantEvaluator>> evaluationClasses) {
    for (final VariantStratifier vs : stratificationObjects) {
      for (Class<? extends VariantEvaluator> ec : evaluationClasses) {
        if (vs.getIncompatibleEvaluators().contains(ec)) {
          // stratifiers are chosen with -ST and evaluators with -EV; name exactly those arguments
          throw new UserException.BadArgumentValue(
              "ST and EV",
              "The selected stratification "
                  + vs.getName()
                  + " and evaluator "
                  + ec.getSimpleName()
                  + " are incompatible due to combinatorial memory requirements."
                  + " Please disable one");
        }
      }
    }
  }

  /**
   * Build the stratification manager from a copy of the given stratifiers and attach a new
   * EvaluationContext to each of its states.
   *
   * @param stratificationObjects the stratifiers defining the state space
   * @param evaluationObjects the evaluator classes handed to every EvaluationContext
   */
  final void createStratificationStates(
      final List<VariantStratifier> stratificationObjects,
      final Set<Class<? extends VariantEvaluator>> evaluationObjects) {
    // the manager gets its own list, not the caller's
    stratManager =
        new StratificationManager<VariantStratifier, EvaluationContext>(
            new ArrayList<VariantStratifier>(stratificationObjects));

    logger.info("Creating " + stratManager.size() + " combinatorial stratification states");

    int stateIndex = 0;
    while (stateIndex < stratManager.size()) {
      stratManager.set(stateIndex, new EvaluationContext(this, evaluationObjects));
      stateIndex++;
    }
  }

  /**
   * Index the intervals of a binding by contig.
   *
   * <p>Every contig name returned by getContigNames() gets an (initially empty) interval tree;
   * each interval is then stored in the tree of its own contig under its start and stop.
   *
   * @param intervals the binding whose intervals are resolved through the toolkit
   * @return a map from contig name to the interval tree holding that contig's intervals
   */
  public final Map<String, IntervalTree<GenomeLoc>> createIntervalTreeByContig(
      final IntervalBinding<Feature> intervals) {
    final Map<String, IntervalTree<GenomeLoc>> treesByContig =
        new HashMap<String, IntervalTree<GenomeLoc>>();

    final List<GenomeLoc> intervalLocs = intervals.getIntervals(getToolkit());

    // one tree per known contig, so contigs without any interval still map to an empty tree
    for (final String contigName : getContigNames()) {
      treesByContig.put(contigName, new IntervalTree<GenomeLoc>());
    }

    for (final GenomeLoc interval : intervalLocs) {
      final IntervalTree<GenomeLoc> contigTree = treesByContig.get(interval.getContig());
      contigTree.put(interval.getStart(), interval.getStop(), interval);
    }

    return treesByContig;
  }

  /**
   * Collect relevant information from each variant in the supplied VCFs.
   *
   * <p>For the current locus this first adds to the processed-loci counter, then, when a tracker
   * is available, walks eval track x stratification sample x eval record x comp track. For each
   * combination it picks the comp record matching the eval record, looks up the evaluation
   * contexts selected by the stratifiers, and applies the (comp, eval) pair to each of them. Comp
   * records that are not the chosen comp and match no eval record are applied with a null eval.
   *
   * @param tracker the reference-ordered data at this locus; nothing is evaluated when it is null
   * @param ref the reference context at this locus
   * @param context the alignment context; only its skipped-base count is read here, and it is
   *     passed through to the evaluation contexts
   * @return always null; results accumulate in the evaluation contexts held by stratManager
   */
  @Override
  public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
    // we track the processed bp and expose this for modules instead of wasting CPU power on
    // calculating
    // the same thing over and over in evals that want the processed bp
    synchronized (this) {
      nProcessedLoci += context.getSkippedBases() + (ref == null ? 0 : 1);
    }

    if (tracker != null) {
      // Bases of the ancestral sequence at this locus, or null when no ancestral file was loaded.
      // The byte[] -> String conversion uses the platform default charset.
      // NOTE(review): ref is dereferenced here without the null check applied above — confirm
      // that a non-null tracker implies a non-null ref.
      String aastr =
          (ancestralAlignments == null)
              ? null
              : new String(
                  ancestralAlignments
                      .getSubsequenceAt(
                          ref.getLocus().getContig(),
                          ref.getLocus().getStart(),
                          ref.getLocus().getStop())
                      .getBases());

      //            // update the dynamic stratifications
      //            for (final VariantContext vc : tracker.getValues(evals, ref.getLocus())) {
      //                // don't worry -- DynamicStratification only work with one eval object
      //                for ( final DynamicStratification ds :  dynamicStratifications ) {
      //                    ds.update(vc);
      //                }
      //            }

      //      --------- track ---------           sample  - VariantContexts -
      HashMap<RodBinding<VariantContext>, HashMap<String, Collection<VariantContext>>> evalVCs =
          variantEvalUtils.bindVariantContexts(
              tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled, mergeEvals);
      HashMap<RodBinding<VariantContext>, HashMap<String, Collection<VariantContext>>> compVCs =
          variantEvalUtils.bindVariantContexts(
              tracker, ref, comps, byFilterIsEnabled, false, false, false);

      // for each eval track
      for (final RodBinding<VariantContext> evalRod : evals) {
        // a track with nothing bound at this locus is treated as an empty sample map
        final Map<String, Collection<VariantContext>> emptyEvalMap = Collections.emptyMap();
        final Map<String, Collection<VariantContext>> evalSet =
            evalVCs.containsKey(evalRod) ? evalVCs.get(evalRod) : emptyEvalMap;

        // for each sample stratifier
        for (final String sampleName : sampleNamesForStratification) {
          Collection<VariantContext> evalSetBySample = evalSet.get(sampleName);
          if (evalSetBySample == null) {
            // no eval record for this sample: iterate once with a null eval so that the comp
            // records at this locus are still applied
            evalSetBySample = new HashSet<VariantContext>(1);
            evalSetBySample.add(null);
          }

          // for each eval in the track
          for (VariantContext eval : evalSetBySample) {
            // deal with ancestral alleles if requested
            if (eval != null && aastr != null) {
              eval = new VariantContextBuilder(eval).attribute("ANCESTRALALLELE", aastr).make();
            }

            // for each comp track
            for (final RodBinding<VariantContext> compRod : comps) {
              // no sample stratification for comps
              // (only the first collection in the comp track's map is used)
              final HashMap<String, Collection<VariantContext>> compSetHash = compVCs.get(compRod);
              final Collection<VariantContext> compSet =
                  (compSetHash == null || compSetHash.size() == 0)
                      ? Collections.<VariantContext>emptyList()
                      : compVCs.get(compRod).values().iterator().next();

              // find the comp
              final VariantContext comp = findMatchingComp(eval, compSet);

              for (EvaluationContext nec :
                  getEvaluationContexts(
                      tracker, ref, eval, evalRod.getName(), comp, compRod.getName(), sampleName)) {

                // eval against the comp
                // (each update is made while holding the context's own monitor)
                synchronized (nec) {
                  nec.apply(tracker, ref, context, comp, eval);
                }

                // eval=null against all comps of different type that aren't bound to another eval
                for (VariantContext otherComp : compSet) {
                  if (otherComp != comp && !compHasMatchingEval(otherComp, evalSetBySample)) {
                    synchronized (nec) {
                      nec.apply(tracker, ref, context, otherComp, null);
                    }
                  }
                }
              }
            }
          }
        }

        if (mergeEvals) break; // stop processing the eval tracks
      }
    }

    return null;
  }

  /**
   * Given specific eval and comp VCs and the sample name, return the evaluation contexts of all
   * applicable stratification states.
   *
   * <p>The current algorithm is simple rather than efficient. Each active stratifier is asked for
   * the states relevant to this eval/comp pair, and the per-stratifier answers are collected into
   * a list of lists. The strat manager is then asked for the values associated with every
   * combination of those states.
   *
   * <p>For example, suppose we have a single variant S and the active strats are EvalRod, CompRod,
   * and Novelty. The list of lists looks like:
   *
   * <p>L = [[Eval], [Comp], [All, Novel]]
   *
   * <p>and walking the strat manager tree yields the keys for these states:
   *
   * <p>K = [0, 1] where EVAL x COMP x ALL = 0 and EVAL x COMP x NOVEL = 1
   *
   * <p>TODO -- create an inline version that doesn't create the intermediate list of list
   *
   * @param tracker the reference-ordered data at the current locus, passed to each stratifier
   * @param ref the reference context at the current locus, passed to each stratifier
   * @param eval the eval variant context (map() may pass null)
   * @param evalName the name of the eval track
   * @param comp the comp variant context (may be null when no comp was found)
   * @param compName the name of the comp track
   * @param sampleName the sample name currently being stratified on
   * @return the evaluation contexts the strat manager associates with the relevant states
   */
  protected Collection<EvaluationContext> getEvaluationContexts(
      final RefMetaDataTracker tracker,
      final ReferenceContext ref,
      final VariantContext eval,
      final String evalName,
      final VariantContext comp,
      final String compName,
      final String sampleName) {
    // one entry per stratifier, in the strat manager's stratifier order
    final List<List<Object>> statesPerStratifier = new LinkedList<List<Object>>();
    for (final VariantStratifier stratifier : stratManager.getStratifiers()) {
      final List<Object> relevantStates =
          stratifier.getRelevantStates(ref, tracker, comp, compName, eval, evalName, sampleName);
      statesPerStratifier.add(relevantStates);
    }
    return stratManager.values(statesPerStratifier);
  }

  /**
   * Tell whether a comp record matches at least one of the given eval records.
   *
   * @param comp the comp record to test
   * @param evals the eval records at the locus; null entries are skipped
   * @return true if doEvalAndCompMatch reports anything other than NO_MATCH for some non-null eval
   */
  @Requires({"comp != null", "evals != null"})
  private boolean compHasMatchingEval(
      final VariantContext comp, final Collection<VariantContext> evals) {
    for (final VariantContext eval : evals) {
      if (eval == null) {
        continue;
      }
      // arguments follow the declared (eval, comp) parameter order of doEvalAndCompMatch
      if (doEvalAndCompMatch(eval, comp, requireStrictAlleleMatch) != EvalCompMatchType.NO_MATCH) {
        return true;
      }
    }

    // nothing matched
    return false;
  }

  /** Outcome of comparing one eval record with one comp record in doEvalAndCompMatch(). */
  private enum EvalCompMatchType {
    // the variant types differ, or the alleles differ while strict matching is required
    NO_MATCH,
    // same type and either neither record has an alternate allele, or the first alternate
    // alleles and the reference alleles are equal
    STRICT,
    // either record is NO_VARIATION, or same type with different alleles while strict matching
    // is not required
    LENIENT
  }

  /**
   * Classify how well an eval record and a comp record agree.
   *
   * <p>Only the first alternate allele of each record takes part in the comparison.
   *
   * @param eval the eval record
   * @param comp the comp record
   * @param requireStrictAlleleMatch when true, same-type records with different alleles are
   *     NO_MATCH rather than LENIENT
   * @return LENIENT if either record is NO_VARIATION; NO_MATCH if the types differ; STRICT if both
   *     records lack an alternate allele or share first alternate and reference alleles;
   *     otherwise NO_MATCH or LENIENT depending on requireStrictAlleleMatch
   */
  @Requires({"eval != null", "comp != null"})
  private EvalCompMatchType doEvalAndCompMatch(
      final VariantContext eval, final VariantContext comp, boolean requireStrictAlleleMatch) {
    // if either record is NO_VARIATION the pair is a LENIENT match regardless of alleles
    final boolean eitherIsNoVariation =
        comp.getType() == VariantContext.Type.NO_VARIATION
            || eval.getType() == VariantContext.Type.NO_VARIATION;
    if (eitherIsNoVariation) {
      return EvalCompMatchType.LENIENT;
    }

    if (comp.getType() != eval.getType()) {
      return EvalCompMatchType.NO_MATCH;
    }

    // compare the reference allele and the first alternate allele of both records
    final Allele firstEvalAlt =
        eval.getAlternateAlleles().isEmpty() ? null : eval.getAlternateAllele(0);
    final Allele firstCompAlt =
        comp.getAlternateAlleles().isEmpty() ? null : comp.getAlternateAllele(0);

    final boolean allelesAgree;
    if (firstEvalAlt == null) {
      allelesAgree = firstCompAlt == null;
    } else {
      allelesAgree =
          firstEvalAlt.equals(firstCompAlt) && eval.getReference().equals(comp.getReference());
    }

    if (allelesAgree) {
      return EvalCompMatchType.STRICT;
    }
    return requireStrictAlleleMatch ? EvalCompMatchType.NO_MATCH : EvalCompMatchType.LENIENT;
  }

  /**
   * Pick the comp record that best matches an eval record.
   *
   * @param eval the eval record, or null when there is no eval at this locus
   * @param comps the candidate comp records; may be null or empty
   * @return null if there are no comps; the first comp if eval is null; otherwise the first STRICT
   *     match, else the first LENIENT match, else null
   */
  private VariantContext findMatchingComp(
      final VariantContext eval, final Collection<VariantContext> comps) {
    // if no comps, return null
    if (comps == null || comps.isEmpty()) {
      return null;
    }

    // if no eval, return any comp
    if (eval == null) {
      return comps.iterator().next();
    }

    // a strict match wins immediately; otherwise remember the first lenient one
    VariantContext lenientMatch = null;
    for (final VariantContext comp : comps) {
      // arguments follow the declared (eval, comp) parameter order of doEvalAndCompMatch
      switch (doEvalAndCompMatch(eval, comp, requireStrictAlleleMatch)) {
        case STRICT:
          return comp;
        case LENIENT:
          if (lenientMatch == null) {
            lenientMatch = comp;
          }
          break;
        case NO_MATCH:
        default:
          // do nothing
          break;
      }
    }

    // nothing matched strictly, just return lenientMatch, which might be null
    return lenientMatch;
  }

  /**
   * Combine two partial reduce results. This walker does not use the reduce values, so the
   * arguments are ignored.
   *
   * @return always null
   */
  public Integer treeReduce(Integer lhs, Integer rhs) {
    return null;
  }

  /**
   * Initial reduce value.
   *
   * @return always null
   */
  @Override
  public Integer reduceInit() {
    return null;
  }

  /**
   * Fold one map result into the running reduce value; both arguments are ignored.
   *
   * @return always null
   */
  @Override
  public Integer reduce(Integer value, Integer sum) {
    return null;
  }

  /**
   * Finalize every evaluator and write the report to the output stream.
   *
   * <p>The evaluators of the first stratification state are handed to the report writer alongside
   * the strat manager and its stratifiers.
   *
   * @param result an integer that doesn't get used for anything
   */
  public void onTraversalDone(Integer result) {
    logger.info("Finalizing variant report");

    // give each evaluator in each evaluation context the chance to finish its computation
    for (final EvaluationContext evaluationContext : stratManager.values()) {
      for (final VariantEvaluator evaluator : evaluationContext.getVariantEvaluators()) {
        evaluator.finalizeEvaluation();
      }
    }

    VariantEvalReportWriter.writeReport(
        out,
        stratManager,
        stratManager.getStratifiers(),
        stratManager.get(0).getVariantEvaluators());
  }

  // Accessors

  /** @return the logger used by this walker */
  public Logger getLogger() {
    return logger;
  }

  /** @return the value of the minPhaseQuality argument */
  public double getMinPhaseQuality() {
    return MIN_PHASE_QUALITY;
  }

  /** @return the value of the samplePloidy argument */
  public int getSamplePloidy() {
    return ploidy;
  }

  /** @return the value of the mendelianViolationQualThreshold argument */
  public double getMendelianViolationQualThreshold() {
    return MENDELIAN_VIOLATION_QUAL_THRESHOLD;
  }

  /** @return the name of the pseudo-sample that is always among the stratification samples */
  public static String getAllSampleName() {
    return ALL_SAMPLE_NAME;
  }

  /** @return the tracks treated as "known" (the internal list itself, not a copy) */
  public List<RodBinding<VariantContext>> getKnowns() {
    return knowns;
  }

  /** @return the eval tracks given on the command line (the internal list itself, not a copy) */
  public List<RodBinding<VariantContext>> getEvals() {
    return evals;
  }

  /** @return true if the samples chosen for evaluation do not include every eval-rod sample */
  public boolean isSubsettingToSpecificSamples() {
    return isSubsettingSamples;
  }

  /** @return the names of the samples being evaluated (the internal set itself, not a copy) */
  public Set<String> getSampleNamesForEvaluation() {
    return sampleNamesForEvaluation;
  }

  /**
   * Number of samples being evaluated.
   *
   * @return the number of sample names selected for evaluation, or the value of the numSamples
   *     argument when no sample names are available
   */
  public int getNumberOfSamplesForEvaluation() {
    final boolean haveSampleNames =
        sampleNamesForEvaluation != null && !sampleNamesForEvaluation.isEmpty();
    return haveSampleNames ? sampleNamesForEvaluation.size() : numSamplesFromArgument;
  }

  /** @return the sample names map() stratifies on (the internal set itself, not a copy) */
  public Set<String> getSampleNamesForStratification() {
    return sampleNamesForStratification;
  }

  /** @return the comparison tracks in use (the internal list itself, not a copy) */
  public List<RodBinding<VariantContext>> getComps() {
    return comps;
  }

  /** @return the compiled select expressions (the internal set itself, not a copy) */
  public Set<SortableJexlVCMatchExp> getJexlExpressions() {
    return jexlExpressions;
  }

  /**
   * @return the processed-loci counter maintained by map(). NOTE(review): map() updates the
   *     counter inside synchronized (this) while this read is unsynchronized — confirm that is
   *     acceptable for multi-threaded runs.
   */
  public long getnProcessedLoci() {
    return nProcessedLoci;
  }

  /**
   * Collect the names of all sequences in the reference's sequence dictionary.
   *
   * @return a newly created sorted set of contig names
   */
  public Set<String> getContigNames() {
    final TreeSet<String> sortedNames = new TreeSet<String>();
    for (final SAMSequenceRecord sequence :
        getToolkit()
            .getReferenceDataSource()
            .getReference()
            .getSequenceDictionary()
            .getSequences()) {
      final String sequenceName = sequence.getSequenceName();
      sortedNames.add(sequenceName);
    }
    return sortedNames;
  }

  /**
   * getToolkit is protected, so we have to pseudo-overload it here so eval / strats can get the
   * toolkit.
   *
   * @return the engine returned by the superclass's getToolkit()
   */
  public GenomeAnalysisEngine getToolkit() {
    return super.getToolkit();
  }

  /**
   * @return true unless the keepAC0 argument was given, i.e. the negation of keepSitesWithAC0
   */
  public boolean ignoreAC0Sites() {
    return !keepSitesWithAC0;
  }
}