/** * Created by IntelliJ IDEA. User: rpoplin Date: Nov 27, 2009 * * <p>A collection of the arguments that are used for BQSR. Used to be common to both * CovariateCounterWalker and TableRecalibrationWalker. This set of arguments will also be passed to * the constructor of every Covariate when it is instantiated. */ public class RecalibrationArgumentCollection implements Cloneable { /** * This algorithm treats every reference mismatch as an indication of error. However, real genetic * variation is expected to mismatch the reference, so it is critical that a database of known * polymorphic sites is given to the tool in order to skip over those sites. This tool accepts any * number of RodBindings (VCF, Bed, etc.) for use as this database. For users wishing to exclude * an interval list of known variation simply use -XL my.interval.list to skip over processing * those sites. Please note however that the statistics reported by the tool will not accurately * reflected those sites skipped by the -XL argument. */ @Input( fullName = "knownSites", shortName = "knownSites", doc = "A database of known polymorphic sites to skip over in the recalibration algorithm", required = false) public List<RodBinding<Feature>> knownSites = Collections.emptyList(); /** * After the header, data records occur one per line until the end of the file. The first several * items on a line are the values of the individual covariates and will change depending on which * covariates were specified at runtime. The last three items are the data- that is, number of * observations for this combination of covariates, number of reference mismatches, and the raw * empirical quality score calculated by phred-scaling the mismatch rate. Use '/dev/stdout' to * print to standard out. */ @Gather(BQSRGatherer.class) @Output(doc = "The output recalibration table file to create", required = true) public File RECAL_TABLE_FILE = null; public PrintStream RECAL_TABLE; /** Note that the --list argument requires a fully resolved and correct command-line to work. */ @Argument( fullName = "list", shortName = "ls", doc = "List the available covariates and exit", required = false) public boolean LIST_ONLY = false; /** * Note that the ReadGroup and QualityScore covariates are required and do not need to be * specified. Also, unless --no_standard_covs is specified, the Cycle and Context covariates are * standard and are included by default. Use the --list argument to see the available covariates. */ @Argument( fullName = "covariate", shortName = "cov", doc = "One or more covariates to be used in the recalibration. Can be specified multiple times", required = false) public String[] COVARIATES = null; /* * The Cycle and Context covariates are standard and are included by default unless this argument is provided. * Note that the ReadGroup and QualityScore covariates are required and cannot be excluded. */ @Argument( fullName = "no_standard_covs", shortName = "noStandard", doc = "Do not use the standard set of covariates, but rather just the ones listed using the -cov argument", required = false) public boolean DO_NOT_USE_STANDARD_COVARIATES = false; /** * This calculation is critically dependent on being able to skip over known polymorphic sites. * Please be sure that you know what you are doing if you use this option. */ @Advanced @Argument( fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.") public boolean RUN_WITHOUT_DBSNP = false; /** * BaseRecalibrator accepts a --solid_recal_mode <MODE> flag which governs how the recalibrator * handles the reads which have had the reference inserted because of color space inconsistencies. */ @Argument( fullName = "solid_recal_mode", shortName = "sMode", required = false, doc = "How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS") public RecalUtils.SOLID_RECAL_MODE SOLID_RECAL_MODE = RecalUtils.SOLID_RECAL_MODE.SET_Q_ZERO; /** * BaseRecalibrator accepts a --solid_nocall_strategy <MODE> flag which governs how the * recalibrator handles no calls in the color space tag. Unfortunately because of the reference * inserted bases mentioned above, reads with no calls in their color space tag can not be * recalibrated. */ @Argument( fullName = "solid_nocall_strategy", shortName = "solid_nocall_strategy", doc = "Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ", required = false) public RecalUtils.SOLID_NOCALL_STRATEGY SOLID_NOCALL_STRATEGY = RecalUtils.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION; /** * The context covariate will use a context of this size to calculate its covariate value for base * mismatches. Must be between 1 and 13 (inclusive). Note that higher values will increase runtime * and required java heap size. */ @Argument( fullName = "mismatches_context_size", shortName = "mcs", doc = "Size of the k-mer context to be used for base mismatches", required = false) public int MISMATCHES_CONTEXT_SIZE = 2; /** * The context covariate will use a context of this size to calculate its covariate value for base * insertions and deletions. Must be between 1 and 13 (inclusive). Note that higher values will * increase runtime and required java heap size. */ @Argument( fullName = "indels_context_size", shortName = "ics", doc = "Size of the k-mer context to be used for base insertions and deletions", required = false) public int INDELS_CONTEXT_SIZE = 3; /** * The cycle covariate will generate an error if it encounters a cycle greater than this value. * This argument is ignored if the Cycle covariate is not used. */ @Argument( fullName = "maximum_cycle_value", shortName = "maxCycle", doc = "The maximum cycle value permitted for the Cycle covariate", required = false) public int MAXIMUM_CYCLE_VALUE = 500; /** * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. * This value will replace all base qualities in the read for this default value. Negative value * turns it off. [default is off] */ @Argument( fullName = "mismatches_default_quality", shortName = "mdq", doc = "default quality for the base mismatches covariate", required = false) public byte MISMATCHES_DEFAULT_QUALITY = -1; /** * A default base qualities to use as a prior (reported quality) in the insertion covariate model. * This parameter is used for all reads without insertion quality scores for each base. [default * is on] */ @Argument( fullName = "insertions_default_quality", shortName = "idq", doc = "default quality for the base insertions covariate", required = false) public byte INSERTIONS_DEFAULT_QUALITY = 45; /** * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. * This value will replace all base qualities in the read for this default value. Negative value * turns it off. [default is on] */ @Argument( fullName = "deletions_default_quality", shortName = "ddq", doc = "default quality for the base deletions covariate", required = false) public byte DELETIONS_DEFAULT_QUALITY = 45; /** * Reads with low quality bases on either tail (beginning or end) will not be considered in the * context. This parameter defines the quality below which (inclusive) a tail is considered low * quality */ @Argument( fullName = "low_quality_tail", shortName = "lqt", doc = "minimum quality for the bases in the tail of the reads to be considered", required = false) public byte LOW_QUAL_TAIL = 2; /** * BQSR generates a quantization table for quick quantization later by subsequent tools. BQSR does * not quantize the base qualities, this is done by the engine with the -qq or -BQSR options. This * parameter tells BQSR the number of levels of quantization to use to build the quantization * table. */ @Argument( fullName = "quantizing_levels", shortName = "ql", required = false, doc = "number of distinct quality scores in the quantized output") public int QUANTIZING_LEVELS = 16; /** The tag name for the binary tag covariate (if using it) */ @Argument( fullName = "binary_tag_name", shortName = "bintag", required = false, doc = "the binary tag covariate name if using it") public String BINARY_TAG_NAME = null; /* * whether GATK report tables should have rows in sorted order, starting from leftmost column */ @Argument( fullName = "sort_by_all_columns", shortName = "sortAllCols", doc = "Sort the rows in the tables of reports", required = false) public Boolean SORT_BY_ALL_COLUMNS = false; ///////////////////////////// // Debugging-only Arguments ///////////////////////////// @Hidden @Argument( fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.") public String DEFAULT_PLATFORM = null; @Hidden @Argument( fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.") public String FORCE_PLATFORM = null; @Hidden @Argument( fullName = "force_readgroup", shortName = "fRG", required = false, doc = "If provided, the read group of EVERY read will be forced to be the provided String.") public String FORCE_READGROUP = null; @Hidden @Output( fullName = "recal_table_update_log", shortName = "recal_table_update_log", required = false, doc = "If provided, log all updates to the recalibration tables to the given file. For debugging/testing purposes only", defaultToStdout = false) public PrintStream RECAL_TABLE_UPDATE_LOG = null; /** * The repeat covariate will use a context of this size to calculate it's covariate value for base * insertions and deletions */ @Hidden @Argument( fullName = "max_str_unit_length", shortName = "maxstr", doc = "Max size of the k-mer context to be used for repeat covariates", required = false) public int MAX_STR_UNIT_LENGTH = 8; @Hidden @Argument( fullName = "max_repeat_length", shortName = "maxrep", doc = "Max number of repetitions to be used for repeat covariates", required = false) public int MAX_REPEAT_LENGTH = 20; public File existingRecalibrationReport = null; public GATKReportTable generateReportTable(final String covariateNames) { GATKReportTable argumentsTable; if (SORT_BY_ALL_COLUMNS) { argumentsTable = new GATKReportTable( "Arguments", "Recalibration argument collection values used in this run", 2, GATKReportTable.TableSortingWay.SORT_BY_COLUMN); } else { argumentsTable = new GATKReportTable( "Arguments", "Recalibration argument collection values used in this run", 2); } argumentsTable.addColumn("Argument"); argumentsTable.addColumn(RecalUtils.ARGUMENT_VALUE_COLUMN_NAME); argumentsTable.addRowID("covariate", true); argumentsTable.set("covariate", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, covariateNames); argumentsTable.addRowID("no_standard_covs", true); argumentsTable.set( "no_standard_covs", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, DO_NOT_USE_STANDARD_COVARIATES); argumentsTable.addRowID("run_without_dbsnp", true); argumentsTable.set( "run_without_dbsnp", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, RUN_WITHOUT_DBSNP); argumentsTable.addRowID("solid_recal_mode", true); argumentsTable.set("solid_recal_mode", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, SOLID_RECAL_MODE); argumentsTable.addRowID("solid_nocall_strategy", true); argumentsTable.set( "solid_nocall_strategy", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, SOLID_NOCALL_STRATEGY); argumentsTable.addRowID("mismatches_context_size", true); argumentsTable.set( "mismatches_context_size", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_CONTEXT_SIZE); argumentsTable.addRowID("indels_context_size", true); argumentsTable.set( "indels_context_size", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, INDELS_CONTEXT_SIZE); argumentsTable.addRowID("mismatches_default_quality", true); argumentsTable.set( "mismatches_default_quality", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_DEFAULT_QUALITY); argumentsTable.addRowID("deletions_default_quality", true); argumentsTable.set( "deletions_default_quality", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, DELETIONS_DEFAULT_QUALITY); argumentsTable.addRowID("insertions_default_quality", true); argumentsTable.set( "insertions_default_quality", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, INSERTIONS_DEFAULT_QUALITY); argumentsTable.addRowID("maximum_cycle_value", true); argumentsTable.set( "maximum_cycle_value", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, MAXIMUM_CYCLE_VALUE); argumentsTable.addRowID("low_quality_tail", true); argumentsTable.set("low_quality_tail", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, LOW_QUAL_TAIL); argumentsTable.addRowID("default_platform", true); argumentsTable.set("default_platform", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, DEFAULT_PLATFORM); argumentsTable.addRowID("force_platform", true); argumentsTable.set("force_platform", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, FORCE_PLATFORM); argumentsTable.addRowID("quantizing_levels", true); argumentsTable.set( "quantizing_levels", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, QUANTIZING_LEVELS); argumentsTable.addRowID("recalibration_report", true); argumentsTable.set( "recalibration_report", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, existingRecalibrationReport == null ? "null" : existingRecalibrationReport.getAbsolutePath()); argumentsTable.addRowID("binary_tag_name", true); argumentsTable.set( "binary_tag_name", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, BINARY_TAG_NAME == null ? "null" : BINARY_TAG_NAME); return argumentsTable; } /** * Returns a map with the arguments that differ between this an another {@link * RecalibrationArgumentCollection} instance. * * <p>The key is the name of that argument in the report file. The value is a message that * explains the difference to the end user. * * <p>Thus, a empty map indicates that there is no differences between both argument collection * that is relevant to report comparison. * * <p>This method should not throw any exception. * * @param other the argument-collection to compare against. * @param thisRole the name used to refer to this RAC report that makes sense to the end user. * @param otherRole the name used to refer to the other RAC report that makes sense to the end * user. * @return never <code>null</code>, but a zero-size collection if there are no differences. */ @Requires( "other != null && thisRole != null && otherRole != null && !thisRole.equalsIgnoreCase(otherRole)") Map<String, ? extends CharSequence> compareReportArguments( final RecalibrationArgumentCollection other, final String thisRole, final String otherRole) { final Map<String, String> result = new LinkedHashMap<>(15); compareRequestedCovariates(result, other, thisRole, otherRole); compareSimpleReportArgument( result, "no_standard_covs", DO_NOT_USE_STANDARD_COVARIATES, other.DO_NOT_USE_STANDARD_COVARIATES, thisRole, otherRole); compareSimpleReportArgument( result, "run_without_dbsnp", RUN_WITHOUT_DBSNP, other.RUN_WITHOUT_DBSNP, thisRole, otherRole); compareSimpleReportArgument( result, "solid_recal_mode", SOLID_RECAL_MODE, other.SOLID_RECAL_MODE, thisRole, otherRole); compareSimpleReportArgument( result, "solid_nocall_strategy", SOLID_NOCALL_STRATEGY, other.SOLID_NOCALL_STRATEGY, thisRole, otherRole); compareSimpleReportArgument( result, "mismatches_context_size", MISMATCHES_CONTEXT_SIZE, other.MISMATCHES_CONTEXT_SIZE, thisRole, otherRole); compareSimpleReportArgument( result, "mismatches_default_quality", MISMATCHES_DEFAULT_QUALITY, other.MISMATCHES_DEFAULT_QUALITY, thisRole, otherRole); compareSimpleReportArgument( result, "deletions_default_quality", DELETIONS_DEFAULT_QUALITY, other.DELETIONS_DEFAULT_QUALITY, thisRole, otherRole); compareSimpleReportArgument( result, "insertions_default_quality", INSERTIONS_DEFAULT_QUALITY, other.INSERTIONS_DEFAULT_QUALITY, thisRole, otherRole); compareSimpleReportArgument( result, "maximum_cycle_value", MAXIMUM_CYCLE_VALUE, other.MAXIMUM_CYCLE_VALUE, thisRole, otherRole); compareSimpleReportArgument( result, "low_quality_tail", LOW_QUAL_TAIL, other.LOW_QUAL_TAIL, thisRole, otherRole); compareSimpleReportArgument( result, "default_platform", DEFAULT_PLATFORM, other.DEFAULT_PLATFORM, thisRole, otherRole); compareSimpleReportArgument( result, "force_platform", FORCE_PLATFORM, other.FORCE_PLATFORM, thisRole, otherRole); compareSimpleReportArgument( result, "quantizing_levels", QUANTIZING_LEVELS, other.QUANTIZING_LEVELS, thisRole, otherRole); compareSimpleReportArgument( result, "binary_tag_name", BINARY_TAG_NAME, other.BINARY_TAG_NAME, thisRole, otherRole); return result; } /** * Compares the covariate report lists. * * @param diffs map where to annotate the difference. * @param other the argument collection to compare against. * @param thisRole the name for this argument collection that makes sense to the user. * @param otherRole the name for the other argument collection that makes sense to the end user. * @return <code>true</code> if a difference was found. */ @Requires("diffs != null && other != null && thisRole != null && otherRole != null") private boolean compareRequestedCovariates( final Map<String, String> diffs, final RecalibrationArgumentCollection other, final String thisRole, final String otherRole) { final Set<String> beforeNames = new HashSet<>(this.COVARIATES.length); final Set<String> afterNames = new HashSet<>(other.COVARIATES.length); Utils.addAll(beforeNames, this.COVARIATES); Utils.addAll(afterNames, other.COVARIATES); final Set<String> intersect = new HashSet<>(Math.min(beforeNames.size(), afterNames.size())); intersect.addAll(beforeNames); intersect.retainAll(afterNames); String diffMessage = null; if (intersect.size() == 0) { // In practice this is not possible due to required covariates but... diffMessage = String.format( "There are no common covariates between '%s' and '%s'" + " recalibrator reports. Covariates in '%s': {%s}. Covariates in '%s': {%s}.", thisRole, otherRole, thisRole, Utils.join(", ", this.COVARIATES), otherRole, Utils.join(",", other.COVARIATES)); } else if (intersect.size() != beforeNames.size() || intersect.size() != afterNames.size()) { beforeNames.removeAll(intersect); afterNames.removeAll(intersect); diffMessage = String.format( "There are differences in the set of covariates requested in the" + " '%s' and '%s' recalibrator reports. " + " Exclusive to '%s': {%s}. Exclusive to '%s': {%s}.", thisRole, otherRole, thisRole, Utils.join(", ", beforeNames), otherRole, Utils.join(", ", afterNames)); } if (diffMessage != null) { diffs.put("covariate", diffMessage); return true; } else { return false; } } /** * Annotates a map with any difference encountered in a simple value report argument that differs * between this an another {@link RecalibrationArgumentCollection} instance. * * <p>The key of the new entry would be the name of that argument in the report file. The value is * a message that explains the difference to the end user. * * <p> * * <p>This method should not return any exception. * * @param diffs where to annotate the differences. * @param name the name of the report argument to compare. * @param thisValue this argument collection value for that argument. * @param otherValue the other collection value for that argument. * @param thisRole the name used to refer to this RAC report that makes sense to the end user. * @param otherRole the name used to refer to the other RAC report that makes sense to the end * user. * @type T the argument Object value type. * @return <code>true</code> if a difference has been spotted, thus <code>diff</code> has been * modified. */ private <T> boolean compareSimpleReportArgument( final Map<String, String> diffs, final String name, final T thisValue, final T otherValue, final String thisRole, final String otherRole) { if (thisValue == null && otherValue == null) { return false; } else if (thisValue != null && thisValue.equals(otherValue)) { return false; } else { diffs.put( name, String.format( "differences between '%s' {%s} and '%s' {%s}.", thisRole, thisValue == null ? "" : thisValue, otherRole, otherValue == null ? "" : otherValue)); return true; } } /** * Create a shallow copy of this argument collection. * * @return never <code>null</code>. */ @Override public RecalibrationArgumentCollection clone() { try { return (RecalibrationArgumentCollection) super.clone(); } catch (CloneNotSupportedException e) { throw new StingException( "Unreachable code clone not supported thrown when the class " + this.getClass().getName() + " is cloneable ", e); } } }
/** * General-purpose tool for variant evaluation (% in dbSNP, genotype concordance, Ti/Tv ratios, and * a lot more) * * <p>Given a variant callset, it is common to calculate various quality control metrics. These * metrics include the number of raw or filtered SNP counts; ratio of transition mutations to * transversions; concordance of a particular sample's calls to a genotyping chip; number of * singletons per sample; etc. Furthermore, it is often useful to stratify these metrics by various * criteria like functional class (missense, nonsense, silent), whether the site is CpG site, the * amino acid degeneracy of the site, etc. VariantEval facilitates these calculations in two ways: * by providing several built-in evaluation and stratification modules, and by providing a framework * that permits the easy development of new evaluation and stratification modules. * * <h2>Input</h2> * * <p>One or more variant sets to evaluate plus any number of comparison sets. * * <h2>Output</h2> * * <p>Evaluation tables detailing the results of the eval modules which were applied. For example: * * <pre> * output.eval.gatkreport: * ##:GATKReport.v0.1 CountVariants : Counts different classes of variants in the sample * CountVariants CompRod CpG EvalRod JexlExpression Novelty nProcessedLoci nCalledLoci nRefLoci nVariantLoci variantRate ... * CountVariants dbsnp CpG eval none all 65900028 135770 0 135770 0.00206024 ... * CountVariants dbsnp CpG eval none known 65900028 47068 0 47068 0.00071423 ... * CountVariants dbsnp CpG eval none novel 65900028 88702 0 88702 0.00134601 ... * CountVariants dbsnp all eval none all 65900028 330818 0 330818 0.00502000 ... * CountVariants dbsnp all eval none known 65900028 120685 0 120685 0.00183133 ... * CountVariants dbsnp all eval none novel 65900028 210133 0 210133 0.00318866 ... * CountVariants dbsnp non_CpG eval none all 65900028 195048 0 195048 0.00295976 ... * CountVariants dbsnp non_CpG eval none known 65900028 73617 0 73617 0.00111710 ... * CountVariants dbsnp non_CpG eval none novel 65900028 121431 0 121431 0.00184265 ... * ... * </pre> * * <h2>Examples</h2> * * <pre> * java -Xmx2g -jar GenomeAnalysisTK.jar \ * -R ref.fasta \ * -T VariantEval \ * -o output.eval.gatkreport \ * --eval:set1 set1.vcf \ * --eval:set2 set2.vcf \ * [--comp comp.vcf] * </pre> */ @Reference(window = @Window(start = -50, stop = 50)) public class VariantEvalWalker extends RodWalker<Integer, Integer> implements TreeReducible<Integer> { @Output protected PrintStream out; /** The variant file(s) to evaluate. */ @Input(fullName = "eval", shortName = "eval", doc = "Input evaluation file(s)", required = true) public List<RodBinding<VariantContext>> evals; /** The variant file(s) to compare against. */ @Input(fullName = "comp", shortName = "comp", doc = "Input comparison file(s)", required = false) public List<RodBinding<VariantContext>> compsProvided = Collections.emptyList(); private List<RodBinding<VariantContext>> comps = new ArrayList<RodBinding<VariantContext>>(); /** * dbSNP comparison VCF. By default, the dbSNP file is used to specify the set of "known" * variants. Other sets can be specified with the -knownName (--known_names) argument. */ @ArgumentCollection protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); // Help arguments @Argument( fullName = "list", shortName = "ls", doc = "List the available eval modules and exit", required = false) protected Boolean LIST = false; // Partitioning the data arguments @Argument( shortName = "select", doc = "One or more stratifications to use when evaluating the data", required = false) protected ArrayList<String> SELECT_EXPS = new ArrayList<String>(); @Argument( shortName = "selectName", doc = "Names to use for the list of stratifications (must be a 1-to-1 mapping)", required = false) protected ArrayList<String> SELECT_NAMES = new ArrayList<String>(); @Argument( fullName = "sample", shortName = "sn", doc = "Derive eval and comp contexts using only these sample genotypes, when genotypes are available in the original context", required = false) protected Set<String> SAMPLE_EXPRESSIONS; /** List of rod tracks to be used for specifying "known" variants other than dbSNP. */ @Argument( shortName = "knownName", doc = "Name of ROD bindings containing variant sites that should be treated as known when splitting eval rods into known and novel subsets", required = false) protected HashSet<String> KNOWN_NAMES = new HashSet<String>(); List<RodBinding<VariantContext>> knowns = new ArrayList<RodBinding<VariantContext>>(); // Stratification arguments @Argument( fullName = "stratificationModule", shortName = "ST", doc = "One or more specific stratification modules to apply to the eval track(s) (in addition to the standard stratifications, unless -noS is specified)", required = false) protected String[] STRATIFICATIONS_TO_USE = {}; @Argument( fullName = "doNotUseAllStandardStratifications", shortName = "noST", doc = "Do not use the standard stratification modules by default (instead, only those that are specified with the -S option)", required = false) protected Boolean NO_STANDARD_STRATIFICATIONS = false; /** See the -list argument to view available modules. */ @Argument( fullName = "evalModule", shortName = "EV", doc = "One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noEV is specified)", required = false) protected String[] MODULES_TO_USE = {}; @Argument( fullName = "doNotUseAllStandardModules", shortName = "noEV", doc = "Do not use the standard modules by default (instead, only those that are specified with the -EV option)", required = false) protected Boolean NO_STANDARD_MODULES = false; // Other arguments @Argument( fullName = "numSamples", shortName = "ns", doc = "Number of samples (used if no samples are available in the VCF file", required = false) protected Integer NUM_SAMPLES = 0; @Argument( fullName = "minPhaseQuality", shortName = "mpq", doc = "Minimum phasing quality", required = false) protected double MIN_PHASE_QUALITY = 10.0; @Argument( shortName = "mvq", fullName = "mendelianViolationQualThreshold", doc = "Minimum genotype QUAL score for each trio member required to accept a site as a violation. Default is 50.", required = false) protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 50; @Argument( fullName = "ancestralAlignments", shortName = "aa", doc = "Fasta file with ancestral alleles", required = false) private File ancestralAlignmentsFile = null; @Argument( fullName = "requireStrictAlleleMatch", shortName = "strict", doc = "If provided only comp and eval tracks with exactly matching reference and alternate alleles will be counted as overlapping", required = false) private boolean requireStrictAlleleMatch = false; /** * If true, VariantEval will treat -eval 1 -eval 2 as separate tracks from the same underlying * variant set, and evaluate the union of the results. Useful when you want to do -eval chr1.vcf * -eval chr2.vcf etc. */ @Argument( fullName = "mergeEvals", shortName = "mergeEvals", doc = "If provided, all -eval tracks will be merged into a single eval track", required = false) public boolean mergeEvals = false; /** File containing tribble-readable features for the IntervalStratificiation */ @Input( fullName = "stratIntervals", shortName = "stratIntervals", doc = "File containing tribble-readable features for the IntervalStratificiation", required = false) public IntervalBinding<Feature> intervalsFile = null; /** * File containing tribble-readable features containing known CNVs. For use with VariantSummary * table. */ @Input( fullName = "knownCNVs", shortName = "knownCNVs", doc = "File containing tribble-readable features describing a known list of copy number variants", required = false) public IntervalBinding<Feature> knownCNVsFile = null; Map<String, IntervalTree<GenomeLoc>> knownCNVsByContig = Collections.emptyMap(); // Variables private Set<SortableJexlVCMatchExp> jexlExpressions = new TreeSet<SortableJexlVCMatchExp>(); private Set<String> sampleNamesForEvaluation = new TreeSet<String>(); private Set<String> sampleNamesForStratification = new TreeSet<String>(); private int numSamples = 0; // The list of stratifiers and evaluators to use private TreeSet<VariantStratifier> stratificationObjects = null; // The set of all possible evaluation contexts private HashMap<StateKey, NewEvaluationContext> evaluationContexts = null; // important stratifications private boolean byFilterIsEnabled = false; private boolean perSampleIsEnabled = false; // Output report private GATKReport report = null; // Public constants private static String ALL_SAMPLE_NAME = "all"; // Utility class private final VariantEvalUtils variantEvalUtils = new VariantEvalUtils(this); // Ancestral alignments private IndexedFastaSequenceFile ancestralAlignments = null; /** Initialize the stratifications, evaluations, evaluation contexts, and reporting object */ public void initialize() { // Just list the modules, and exit quickly. if (LIST) { variantEvalUtils.listModulesAndExit(); } // maintain the full list of comps comps.addAll(compsProvided); if (dbsnp.dbsnp.isBound()) { comps.add(dbsnp.dbsnp); knowns.add(dbsnp.dbsnp); } // Add a dummy comp track if none exists if (comps.size() == 0) comps.add( new RodBinding<VariantContext>(VariantContext.class, "none", "UNBOUND", "", new Tags())); // Set up set of additional knowns for (RodBinding<VariantContext> compRod : comps) { if (KNOWN_NAMES.contains(compRod.getName())) knowns.add(compRod); } // Now that we have all the rods categorized, determine the sample list from the eval rods. Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), evals); Set<String> vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); // Load the sample list sampleNamesForEvaluation.addAll( SampleUtils.getSamplesFromCommandLineInput(vcfSamples, SAMPLE_EXPRESSIONS)); numSamples = NUM_SAMPLES > 0 ? NUM_SAMPLES : sampleNamesForEvaluation.size(); if (Arrays.asList(STRATIFICATIONS_TO_USE).contains("Sample")) { sampleNamesForStratification.addAll(sampleNamesForEvaluation); } sampleNamesForStratification.add(ALL_SAMPLE_NAME); // Initialize select expressions for (VariantContextUtils.JexlVCMatchExp jexl : VariantContextUtils.initializeMatchExps(SELECT_NAMES, SELECT_EXPS)) { SortableJexlVCMatchExp sjexl = new SortableJexlVCMatchExp(jexl.name, jexl.exp); jexlExpressions.add(sjexl); } // Initialize the set of stratifications and evaluations to use stratificationObjects = variantEvalUtils.initializeStratificationObjects( this, NO_STANDARD_STRATIFICATIONS, STRATIFICATIONS_TO_USE); Set<Class<? extends VariantEvaluator>> evaluationObjects = variantEvalUtils.initializeEvaluationObjects(NO_STANDARD_MODULES, MODULES_TO_USE); for (VariantStratifier vs : getStratificationObjects()) { if (vs.getName().equals("Filter")) byFilterIsEnabled = true; else if (vs.getName().equals("Sample")) perSampleIsEnabled = true; } if (intervalsFile != null) { boolean fail = true; for (final VariantStratifier vs : stratificationObjects) { if (vs.getClass().equals(IntervalStratification.class)) fail = false; } if (fail) throw new UserException.BadArgumentValue( "ST", "stratIntervals argument provided but -ST IntervalStratification not provided"); } // Initialize the evaluation contexts evaluationContexts = variantEvalUtils.initializeEvaluationContexts( stratificationObjects, evaluationObjects, null, null); // Initialize report table report = variantEvalUtils.initializeGATKReport(stratificationObjects, evaluationObjects); // Load ancestral alignments if (ancestralAlignmentsFile != null) { try { ancestralAlignments = new IndexedFastaSequenceFile(ancestralAlignmentsFile); } catch (FileNotFoundException e) { throw new ReviewedStingException( String.format( "The ancestral alignments file, '%s', could not be found", ancestralAlignmentsFile.getAbsolutePath())); } } // initialize CNVs if (knownCNVsFile != null) { knownCNVsByContig = createIntervalTreeByContig(knownCNVsFile); } } public final Map<String, IntervalTree<GenomeLoc>> createIntervalTreeByContig( final IntervalBinding<Feature> intervals) { final Map<String, IntervalTree<GenomeLoc>> byContig = new HashMap<String, IntervalTree<GenomeLoc>>(); final List<GenomeLoc> locs = intervals.getIntervals(getToolkit()); // set up the map from contig -> interval tree for (final String contig : getContigNames()) byContig.put(contig, new IntervalTree<GenomeLoc>()); for (final GenomeLoc loc : locs) { byContig.get(loc.getContig()).put(loc.getStart(), loc.getStop(), loc); } return byContig; } /** Collect relevant information from each variant in the supplied VCFs */ @Override public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { for (NewEvaluationContext nec : evaluationContexts.values()) { synchronized (nec) { nec.update0(tracker, ref, context); } } if (tracker != null) { String aastr = (ancestralAlignments == null) ? null : new String( ancestralAlignments .getSubsequenceAt( ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStop()) .getBases()); // --------- track --------- sample - VariantContexts - HashMap<RodBinding<VariantContext>, HashMap<String, Collection<VariantContext>>> evalVCs = variantEvalUtils.bindVariantContexts( tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled, mergeEvals); HashMap<RodBinding<VariantContext>, HashMap<String, Collection<VariantContext>>> compVCs = variantEvalUtils.bindVariantContexts( tracker, ref, comps, byFilterIsEnabled, false, false, false); // for each eval track for (final RodBinding<VariantContext> evalRod : evals) { final Map<String, Collection<VariantContext>> emptyEvalMap = Collections.emptyMap(); final Map<String, Collection<VariantContext>> evalSet = evalVCs.containsKey(evalRod) ? evalVCs.get(evalRod) : emptyEvalMap; // for each sample stratifier for (final String sampleName : sampleNamesForStratification) { Collection<VariantContext> evalSetBySample = evalSet.get(sampleName); if (evalSetBySample == null) { evalSetBySample = new HashSet<VariantContext>(1); evalSetBySample.add(null); } // for each eval in the track for (VariantContext eval : evalSetBySample) { // deal with ancestral alleles if requested if (eval != null && aastr != null) { eval = new VariantContextBuilder(eval).attribute("ANCESTRALALLELE", aastr).make(); } // for each comp track for (final RodBinding<VariantContext> compRod : comps) { // no sample stratification for comps final HashMap<String, Collection<VariantContext>> compSetHash = compVCs.get(compRod); final Collection<VariantContext> compSet = (compSetHash == null || compSetHash.size() == 0) ? Collections.<VariantContext>emptyList() : compVCs.get(compRod).values().iterator().next(); // find the comp final VariantContext comp = findMatchingComp(eval, compSet); HashMap<VariantStratifier, List<String>> stateMap = new HashMap<VariantStratifier, List<String>>(); for (VariantStratifier vs : stratificationObjects) { List<String> states = vs.getRelevantStates( ref, tracker, comp, compRod.getName(), eval, evalRod.getName(), sampleName); stateMap.put(vs, states); } ArrayList<StateKey> stateKeys = new ArrayList<StateKey>(); variantEvalUtils.initializeStateKeys(stateMap, null, null, stateKeys); HashSet<StateKey> stateKeysHash = new HashSet<StateKey>(stateKeys); for (StateKey stateKey : stateKeysHash) { NewEvaluationContext nec = evaluationContexts.get(stateKey); // eval against the comp synchronized (nec) { nec.apply(tracker, ref, context, comp, eval); } // eval=null against all comps of different type that aren't bound to another eval for (VariantContext otherComp : compSet) { if (otherComp != comp && !compHasMatchingEval(otherComp, evalSetBySample)) { synchronized (nec) { nec.apply(tracker, ref, context, otherComp, null); } } } } } } } if (mergeEvals) break; // stop processing the eval tracks } } return null; } @Requires({"comp != null", "evals != null"}) private boolean compHasMatchingEval( final VariantContext comp, final Collection<VariantContext> evals) { // find all of the matching comps for (final VariantContext eval : evals) { if (eval != null && doEvalAndCompMatch(comp, eval, requireStrictAlleleMatch) != EvalCompMatchType.NO_MATCH) return true; } // nothing matched return false; } private enum EvalCompMatchType { NO_MATCH, STRICT, LENIENT } @Requires({"eval != null", "comp != null"}) private EvalCompMatchType doEvalAndCompMatch( final VariantContext eval, final VariantContext comp, boolean requireStrictAlleleMatch) { // find all of the matching comps if (comp.getType() != eval.getType()) return EvalCompMatchType.NO_MATCH; // find the comp which matches both the reference allele and alternate allele from eval final Allele altEval = eval.getAlternateAlleles().size() == 0 ? null : eval.getAlternateAllele(0); final Allele altComp = comp.getAlternateAlleles().size() == 0 ? null : comp.getAlternateAllele(0); if ((altEval == null && altComp == null) || (altEval != null && altEval.equals(altComp) && eval.getReference().equals(comp.getReference()))) return EvalCompMatchType.STRICT; else return requireStrictAlleleMatch ? EvalCompMatchType.NO_MATCH : EvalCompMatchType.LENIENT; } private VariantContext findMatchingComp( final VariantContext eval, final Collection<VariantContext> comps) { // if no comps, return null if (comps == null || comps.isEmpty()) return null; // if no eval, return any comp if (eval == null) return comps.iterator().next(); // find all of the matching comps VariantContext lenientMatch = null; for (final VariantContext comp : comps) { switch (doEvalAndCompMatch(comp, eval, requireStrictAlleleMatch)) { case STRICT: return comp; case LENIENT: if (lenientMatch == null) lenientMatch = comp; break; case NO_MATCH:; } } // nothing matched, just return lenientMatch, which might be null return lenientMatch; } public Integer treeReduce(Integer lhs, Integer rhs) { return null; } @Override public Integer reduceInit() { return null; } @Override public Integer reduce(Integer value, Integer sum) { return null; } /** * Output the finalized report * * @param result an integer that doesn't get used for anything */ public void onTraversalDone(Integer result) { logger.info("Finalizing variant report"); for (StateKey stateKey : evaluationContexts.keySet()) { NewEvaluationContext nec = evaluationContexts.get(stateKey); for (VariantEvaluator ve : nec.getEvaluationClassList().values()) { ve.finalizeEvaluation(); AnalysisModuleScanner scanner = new AnalysisModuleScanner(ve); Map<Field, DataPoint> datamap = scanner.getData(); for (Field field : datamap.keySet()) { try { field.setAccessible(true); if (field.get(ve) instanceof TableType) { TableType t = (TableType) field.get(ve); String subTableName = ve.getClass().getSimpleName() + "." + field.getName(); final DataPoint dataPointAnn = datamap.get(field); GATKReportTable table; if (!report.hasTable(subTableName)) { report.addTable(subTableName, dataPointAnn.description()); table = report.getTable(subTableName); table.addPrimaryKey("entry", false); table.addColumn(subTableName, subTableName); for (VariantStratifier vs : stratificationObjects) { table.addColumn(vs.getName(), "unknown"); } table.addColumn("row", "unknown"); for (Object o : t.getColumnKeys()) { String c; if (o instanceof String) { c = (String) o; } else { c = o.toString(); } table.addColumn(c, 0.0); } } else { table = report.getTable(subTableName); } for (int row = 0; row < t.getRowKeys().length; row++) { String r = (String) t.getRowKeys()[row]; for (VariantStratifier vs : stratificationObjects) { final String columnName = vs.getName(); table.set(stateKey.toString() + r, columnName, stateKey.get(columnName)); } for (int col = 0; col < t.getColumnKeys().length; col++) { String c; if (t.getColumnKeys()[col] instanceof String) { c = (String) t.getColumnKeys()[col]; } else { c = t.getColumnKeys()[col].toString(); } String newStateKey = stateKey.toString() + r; table.set(newStateKey, c, t.getCell(row, col)); table.set(newStateKey, "row", r); } } } else { GATKReportTable table = report.getTable(ve.getClass().getSimpleName()); for (VariantStratifier vs : stratificationObjects) { String columnName = vs.getName(); table.set(stateKey.toString(), columnName, stateKey.get(vs.getName())); } table.set(stateKey.toString(), field.getName(), field.get(ve)); } } catch (IllegalAccessException e) { throw new StingException("IllegalAccessException: " + e); } } } } report.print(out); } // Accessors public Logger getLogger() { return logger; } public int getNumSamples() { return numSamples; } public double getMinPhaseQuality() { return MIN_PHASE_QUALITY; } public double getMendelianViolationQualThreshold() { return MENDELIAN_VIOLATION_QUAL_THRESHOLD; } public TreeSet<VariantStratifier> getStratificationObjects() { return stratificationObjects; } public static String getAllSampleName() { return ALL_SAMPLE_NAME; } public List<RodBinding<VariantContext>> getKnowns() { return knowns; } public List<RodBinding<VariantContext>> getEvals() { return evals; } public Set<String> getSampleNamesForEvaluation() { return sampleNamesForEvaluation; } public Set<String> getSampleNamesForStratification() { return sampleNamesForStratification; } public List<RodBinding<VariantContext>> getComps() { return comps; } public Set<SortableJexlVCMatchExp> getJexlExpressions() { return jexlExpressions; } public Set<String> getContigNames() { final TreeSet<String> contigs = new TreeSet<String>(); for (final SAMSequenceRecord r : getToolkit() .getReferenceDataSource() .getReference() .getSequenceDictionary() .getSequences()) { contigs.add(r.getSequenceName()); } return contigs; } public GenomeLocParser getGenomeLocParser() { return getToolkit().getGenomeLocParser(); } public GenomeAnalysisEngine getToolkit() { return super.getToolkit(); } }
/** Collect relevant information from each variant in the supplied VCFs */ @Override public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { // we track the processed bp and expose this for modules instead of wasting CPU power on // calculating // the same thing over and over in evals that want the processed bp synchronized (this) { nProcessedLoci += context.getSkippedBases() + (ref == null ? 0 : 1); } if (tracker != null) { String aastr = (ancestralAlignments == null) ? null : new String( ancestralAlignments .getSubsequenceAt( ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStop()) .getBases()); // // update the dynamic stratifications // for (final VariantContext vc : tracker.getValues(evals, ref.getLocus())) { // // don't worry -- DynamicStratification only work with one eval object // for ( final DynamicStratification ds : dynamicStratifications ) { // ds.update(vc); // } // } // --------- track --------- sample - VariantContexts - HashMap<RodBinding<VariantContext>, HashMap<String, Collection<VariantContext>>> evalVCs = variantEvalUtils.bindVariantContexts( tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled, mergeEvals); HashMap<RodBinding<VariantContext>, HashMap<String, Collection<VariantContext>>> compVCs = variantEvalUtils.bindVariantContexts( tracker, ref, comps, byFilterIsEnabled, false, false, false); // for each eval track for (final RodBinding<VariantContext> evalRod : evals) { final Map<String, Collection<VariantContext>> emptyEvalMap = Collections.emptyMap(); final Map<String, Collection<VariantContext>> evalSet = evalVCs.containsKey(evalRod) ? evalVCs.get(evalRod) : emptyEvalMap; // for each sample stratifier for (final String sampleName : sampleNamesForStratification) { Collection<VariantContext> evalSetBySample = evalSet.get(sampleName); if (evalSetBySample == null) { evalSetBySample = new HashSet<VariantContext>(1); evalSetBySample.add(null); } // for each eval in the track for (VariantContext eval : evalSetBySample) { // deal with ancestral alleles if requested if (eval != null && aastr != null) { eval = new VariantContextBuilder(eval).attribute("ANCESTRALALLELE", aastr).make(); } // for each comp track for (final RodBinding<VariantContext> compRod : comps) { // no sample stratification for comps final HashMap<String, Collection<VariantContext>> compSetHash = compVCs.get(compRod); final Collection<VariantContext> compSet = (compSetHash == null || compSetHash.size() == 0) ? Collections.<VariantContext>emptyList() : compVCs.get(compRod).values().iterator().next(); // find the comp final VariantContext comp = findMatchingComp(eval, compSet); for (EvaluationContext nec : getEvaluationContexts( tracker, ref, eval, evalRod.getName(), comp, compRod.getName(), sampleName)) { // eval against the comp synchronized (nec) { nec.apply(tracker, ref, context, comp, eval); } // eval=null against all comps of different type that aren't bound to another eval for (VariantContext otherComp : compSet) { if (otherComp != comp && !compHasMatchingEval(otherComp, evalSetBySample)) { synchronized (nec) { nec.apply(tracker, ref, context, otherComp, null); } } } } } } } if (mergeEvals) break; // stop processing the eval tracks } } return null; }
/** Collect relevant information from each variant in the supplied VCFs */ @Override public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { for (NewEvaluationContext nec : evaluationContexts.values()) { synchronized (nec) { nec.update0(tracker, ref, context); } } if (tracker != null) { String aastr = (ancestralAlignments == null) ? null : new String( ancestralAlignments .getSubsequenceAt( ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStop()) .getBases()); // --------- track --------- sample - VariantContexts - HashMap<RodBinding<VariantContext>, HashMap<String, Collection<VariantContext>>> evalVCs = variantEvalUtils.bindVariantContexts( tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled, mergeEvals); HashMap<RodBinding<VariantContext>, HashMap<String, Collection<VariantContext>>> compVCs = variantEvalUtils.bindVariantContexts( tracker, ref, comps, byFilterIsEnabled, false, false, false); // for each eval track for (final RodBinding<VariantContext> evalRod : evals) { final Map<String, Collection<VariantContext>> emptyEvalMap = Collections.emptyMap(); final Map<String, Collection<VariantContext>> evalSet = evalVCs.containsKey(evalRod) ? evalVCs.get(evalRod) : emptyEvalMap; // for each sample stratifier for (final String sampleName : sampleNamesForStratification) { Collection<VariantContext> evalSetBySample = evalSet.get(sampleName); if (evalSetBySample == null) { evalSetBySample = new HashSet<VariantContext>(1); evalSetBySample.add(null); } // for each eval in the track for (VariantContext eval : evalSetBySample) { // deal with ancestral alleles if requested if (eval != null && aastr != null) { eval = new VariantContextBuilder(eval).attribute("ANCESTRALALLELE", aastr).make(); } // for each comp track for (final RodBinding<VariantContext> compRod : comps) { // no sample stratification for comps final HashMap<String, Collection<VariantContext>> compSetHash = compVCs.get(compRod); final Collection<VariantContext> compSet = (compSetHash == null || compSetHash.size() == 0) ? Collections.<VariantContext>emptyList() : compVCs.get(compRod).values().iterator().next(); // find the comp final VariantContext comp = findMatchingComp(eval, compSet); HashMap<VariantStratifier, List<String>> stateMap = new HashMap<VariantStratifier, List<String>>(); for (VariantStratifier vs : stratificationObjects) { List<String> states = vs.getRelevantStates( ref, tracker, comp, compRod.getName(), eval, evalRod.getName(), sampleName); stateMap.put(vs, states); } ArrayList<StateKey> stateKeys = new ArrayList<StateKey>(); variantEvalUtils.initializeStateKeys(stateMap, null, null, stateKeys); HashSet<StateKey> stateKeysHash = new HashSet<StateKey>(stateKeys); for (StateKey stateKey : stateKeysHash) { NewEvaluationContext nec = evaluationContexts.get(stateKey); // eval against the comp synchronized (nec) { nec.apply(tracker, ref, context, comp, eval); } // eval=null against all comps of different type that aren't bound to another eval for (VariantContext otherComp : compSet) { if (otherComp != comp && !compHasMatchingEval(otherComp, evalSetBySample)) { synchronized (nec) { nec.apply(tracker, ref, context, otherComp, null); } } } } } } } if (mergeEvals) break; // stop processing the eval tracks } } return null; }
/** * General-purpose tool for variant evaluation (% in dbSNP, genotype concordance, Ti/Tv ratios, and * a lot more) * * <p>Given a variant callset, it is common to calculate various quality control metrics. These * metrics include the number of raw or filtered SNP counts; ratio of transition mutations to * transversions; concordance of a particular sample's calls to a genotyping chip; number of * singletons per sample; etc. Furthermore, it is often useful to stratify these metrics by various * criteria like functional class (missense, nonsense, silent), whether the site is CpG site, the * amino acid degeneracy of the site, etc. VariantEval facilitates these calculations in two ways: * by providing several built-in evaluation and stratification modules, and by providing a framework * that permits the easy development of new evaluation and stratification modules. * * <h2>Input</h2> * * <p>One or more variant sets to evaluate plus any number of comparison sets. * * <h2>Output</h2> * * <p>Evaluation tables detailing the results of the eval modules which were applied. For example: * * <pre> * output.eval.gatkreport: * ##:GATKReport.v0.1 CountVariants : Counts different classes of variants in the sample * CountVariants CompRod CpG EvalRod JexlExpression Novelty nProcessedLoci nCalledLoci nRefLoci nVariantLoci variantRate ... * CountVariants dbsnp CpG eval none all 65900028 135770 0 135770 0.00206024 ... * CountVariants dbsnp CpG eval none known 65900028 47068 0 47068 0.00071423 ... * CountVariants dbsnp CpG eval none novel 65900028 88702 0 88702 0.00134601 ... * CountVariants dbsnp all eval none all 65900028 330818 0 330818 0.00502000 ... * CountVariants dbsnp all eval none known 65900028 120685 0 120685 0.00183133 ... * CountVariants dbsnp all eval none novel 65900028 210133 0 210133 0.00318866 ... * CountVariants dbsnp non_CpG eval none all 65900028 195048 0 195048 0.00295976 ... * CountVariants dbsnp non_CpG eval none known 65900028 73617 0 73617 0.00111710 ... * CountVariants dbsnp non_CpG eval none novel 65900028 121431 0 121431 0.00184265 ... * ... * </pre> * * <h2>Examples</h2> * * <pre> * java -Xmx2g -jar GenomeAnalysisTK.jar \ * -R ref.fasta \ * -T VariantEval \ * -o output.eval.gatkreport \ * --eval:set1 set1.vcf \ * --eval:set2 set2.vcf \ * [--comp comp.vcf] * </pre> */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class}) @Reference(window = @Window(start = -50, stop = 50)) @PartitionBy(PartitionType.NONE) public class VariantEval extends RodWalker<Integer, Integer> implements TreeReducible<Integer> { public static final String IS_SINGLETON_KEY = "ISSINGLETON"; @Output protected PrintStream out; /** The variant file(s) to evaluate. */ @Input(fullName = "eval", shortName = "eval", doc = "Input evaluation file(s)", required = true) public List<RodBinding<VariantContext>> evals; /** The variant file(s) to compare against. */ @Input(fullName = "comp", shortName = "comp", doc = "Input comparison file(s)", required = false) public List<RodBinding<VariantContext>> compsProvided = Collections.emptyList(); private List<RodBinding<VariantContext>> comps = new ArrayList<RodBinding<VariantContext>>(); /** * dbSNP comparison VCF. By default, the dbSNP file is used to specify the set of "known" * variants. Other sets can be specified with the -knownName (--known_names) argument. */ @ArgumentCollection protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); /** * Some analyses want to count overlap not with dbSNP (which is in general very open) but actually * want to itemize their overlap specifically with a set of gold standard sites such as HapMap, * OMNI, or the gold standard indels. This argument provides a mechanism for communicating which * file to use */ @Input( fullName = "goldStandard", shortName = "gold", doc = "Evaluations that count calls at sites of true variation (e.g., indel calls) will use this argument as their gold standard for comparison", required = false) public RodBinding<VariantContext> goldStandard = null; /** Note that the --list argument requires a fully resolved and correct command-line to work. */ @Argument( fullName = "list", shortName = "ls", doc = "List the available eval modules and exit", required = false) protected Boolean LIST = false; // Partitioning the data arguments @Argument( shortName = "select", doc = "One or more stratifications to use when evaluating the data", required = false) protected ArrayList<String> SELECT_EXPS = new ArrayList<String>(); @Argument( shortName = "selectName", doc = "Names to use for the list of stratifications (must be a 1-to-1 mapping)", required = false) protected ArrayList<String> SELECT_NAMES = new ArrayList<String>(); @Argument( fullName = "sample", shortName = "sn", doc = "Derive eval and comp contexts using only these sample genotypes, when genotypes are available in the original context", required = false) protected Set<String> SAMPLE_EXPRESSIONS; /** List of rod tracks to be used for specifying "known" variants other than dbSNP. */ @Argument( shortName = "knownName", doc = "Name of ROD bindings containing variant sites that should be treated as known when splitting eval rods into known and novel subsets", required = false) protected HashSet<String> KNOWN_NAMES = new HashSet<String>(); List<RodBinding<VariantContext>> knowns = new ArrayList<RodBinding<VariantContext>>(); // Stratification arguments @Argument( fullName = "stratificationModule", shortName = "ST", doc = "One or more specific stratification modules to apply to the eval track(s) (in addition to the standard stratifications, unless -noS is specified)", required = false) protected String[] STRATIFICATIONS_TO_USE = {}; @Argument( fullName = "doNotUseAllStandardStratifications", shortName = "noST", doc = "Do not use the standard stratification modules by default (instead, only those that are specified with the -S option)", required = false) protected Boolean NO_STANDARD_STRATIFICATIONS = false; /** See the -list argument to view available modules. */ @Argument( fullName = "evalModule", shortName = "EV", doc = "One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noEV is specified)", required = false) protected String[] MODULES_TO_USE = {}; @Argument( fullName = "doNotUseAllStandardModules", shortName = "noEV", doc = "Do not use the standard modules by default (instead, only those that are specified with the -EV option)", required = false) protected Boolean NO_STANDARD_MODULES = false; @Argument( fullName = "minPhaseQuality", shortName = "mpq", doc = "Minimum phasing quality", required = false) protected double MIN_PHASE_QUALITY = 10.0; @Argument( shortName = "mvq", fullName = "mendelianViolationQualThreshold", doc = "Minimum genotype QUAL score for each trio member required to accept a site as a violation. Default is 50.", required = false) protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 50; @Argument( shortName = "ploidy", fullName = "samplePloidy", doc = "Per-sample ploidy (number of chromosomes per sample)", required = false) protected int ploidy = GATKVariantContextUtils.DEFAULT_PLOIDY; @Argument( fullName = "ancestralAlignments", shortName = "aa", doc = "Fasta file with ancestral alleles", required = false) private File ancestralAlignmentsFile = null; @Argument( fullName = "requireStrictAlleleMatch", shortName = "strict", doc = "If provided only comp and eval tracks with exactly matching reference and alternate alleles will be counted as overlapping", required = false) private boolean requireStrictAlleleMatch = false; @Argument( fullName = "keepAC0", shortName = "keepAC0", doc = "If provided, modules that track polymorphic sites will not require that a site have AC > 0 when the input eval has genotypes", required = false) private boolean keepSitesWithAC0 = false; @Hidden @Argument( fullName = "numSamples", shortName = "numSamples", doc = "If provided, modules that track polymorphic sites will not require that a site have AC > 0 when the input eval has genotypes", required = false) private int numSamplesFromArgument = 0; /** * If true, VariantEval will treat -eval 1 -eval 2 as separate tracks from the same underlying * variant set, and evaluate the union of the results. Useful when you want to do -eval chr1.vcf * -eval chr2.vcf etc. */ @Argument( fullName = "mergeEvals", shortName = "mergeEvals", doc = "If provided, all -eval tracks will be merged into a single eval track", required = false) public boolean mergeEvals = false; /** File containing tribble-readable features for the IntervalStratificiation */ @Input( fullName = "stratIntervals", shortName = "stratIntervals", doc = "File containing tribble-readable features for the IntervalStratificiation", required = false) public IntervalBinding<Feature> intervalsFile = null; /** * File containing tribble-readable features containing known CNVs. For use with VariantSummary * table. */ @Input( fullName = "knownCNVs", shortName = "knownCNVs", doc = "File containing tribble-readable features describing a known list of copy number variants", required = false) public IntervalBinding<Feature> knownCNVsFile = null; Map<String, IntervalTree<GenomeLoc>> knownCNVsByContig = Collections.emptyMap(); // Variables private Set<SortableJexlVCMatchExp> jexlExpressions = new TreeSet<SortableJexlVCMatchExp>(); private boolean isSubsettingSamples; private Set<String> sampleNamesForEvaluation = new LinkedHashSet<String>(); private Set<String> sampleNamesForStratification = new LinkedHashSet<String>(); // important stratifications private boolean byFilterIsEnabled = false; private boolean perSampleIsEnabled = false; // Public constants private static String ALL_SAMPLE_NAME = "all"; // the number of processed bp for this walker long nProcessedLoci = 0; // Utility class private final VariantEvalUtils variantEvalUtils = new VariantEvalUtils(this); // Ancestral alignments private IndexedFastaSequenceFile ancestralAlignments = null; // The set of all possible evaluation contexts StratificationManager<VariantStratifier, EvaluationContext> stratManager; // Set<DynamicStratification> dynamicStratifications = Collections.emptySet(); /** Initialize the stratifications, evaluations, evaluation contexts, and reporting object */ public void initialize() { // Just list the modules, and exit quickly. if (LIST) { variantEvalUtils.listModulesAndExit(); } // maintain the full list of comps comps.addAll(compsProvided); if (dbsnp.dbsnp.isBound()) { comps.add(dbsnp.dbsnp); knowns.add(dbsnp.dbsnp); } // Add a dummy comp track if none exists if (comps.size() == 0) comps.add( new RodBinding<VariantContext>(VariantContext.class, "none", "UNBOUND", "", new Tags())); // Set up set of additional knowns for (RodBinding<VariantContext> compRod : comps) { if (KNOWN_NAMES.contains(compRod.getName())) knowns.add(compRod); } // Now that we have all the rods categorized, determine the sample list from the eval rods. Map<String, VCFHeader> vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), evals); Set<String> vcfSamples = SampleUtils.getSampleList( vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); // Load the sample list, using an intermediate tree set to sort the samples final Set<String> allSampleNames = SampleUtils.getSamplesFromCommandLineInput(vcfSamples); sampleNamesForEvaluation.addAll( new TreeSet<String>( SampleUtils.getSamplesFromCommandLineInput(vcfSamples, SAMPLE_EXPRESSIONS))); isSubsettingSamples = !sampleNamesForEvaluation.containsAll(allSampleNames); if (Arrays.asList(STRATIFICATIONS_TO_USE).contains("Sample")) { sampleNamesForStratification.addAll(sampleNamesForEvaluation); } sampleNamesForStratification.add(ALL_SAMPLE_NAME); // Initialize select expressions for (VariantContextUtils.JexlVCMatchExp jexl : VariantContextUtils.initializeMatchExps(SELECT_NAMES, SELECT_EXPS)) { SortableJexlVCMatchExp sjexl = new SortableJexlVCMatchExp(jexl.name, jexl.exp); jexlExpressions.add(sjexl); } // Initialize the set of stratifications and evaluations to use // The list of stratifiers and evaluators to use final List<VariantStratifier> stratificationObjects = variantEvalUtils.initializeStratificationObjects( NO_STANDARD_STRATIFICATIONS, STRATIFICATIONS_TO_USE); final Set<Class<? extends VariantEvaluator>> evaluationClasses = variantEvalUtils.initializeEvaluationObjects(NO_STANDARD_MODULES, MODULES_TO_USE); checkForIncompatibleEvaluatorsAndStratifiers(stratificationObjects, evaluationClasses); for (VariantStratifier vs : stratificationObjects) { if (vs.getName().equals("Filter")) byFilterIsEnabled = true; else if (vs.getName().equals("Sample")) perSampleIsEnabled = true; } if (intervalsFile != null) { boolean fail = true; for (final VariantStratifier vs : stratificationObjects) { if (vs.getClass().equals(IntervalStratification.class)) fail = false; } if (fail) throw new UserException.BadArgumentValue( "ST", "stratIntervals argument provided but -ST IntervalStratification not provided"); } // Initialize the evaluation contexts createStratificationStates(stratificationObjects, evaluationClasses); // Load ancestral alignments if (ancestralAlignmentsFile != null) { try { ancestralAlignments = new IndexedFastaSequenceFile(ancestralAlignmentsFile); } catch (FileNotFoundException e) { throw new ReviewedStingException( String.format( "The ancestral alignments file, '%s', could not be found", ancestralAlignmentsFile.getAbsolutePath())); } } // initialize CNVs if (knownCNVsFile != null) { knownCNVsByContig = createIntervalTreeByContig(knownCNVsFile); } } final void checkForIncompatibleEvaluatorsAndStratifiers( final List<VariantStratifier> stratificationObjects, Set<Class<? extends VariantEvaluator>> evaluationClasses) { for (final VariantStratifier vs : stratificationObjects) { for (Class<? extends VariantEvaluator> ec : evaluationClasses) if (vs.getIncompatibleEvaluators().contains(ec)) throw new UserException.BadArgumentValue( "ST and ET", "The selected stratification " + vs.getName() + " and evaluator " + ec.getSimpleName() + " are incompatible due to combinatorial memory requirements." + " Please disable one"); } } final void createStratificationStates( final List<VariantStratifier> stratificationObjects, final Set<Class<? extends VariantEvaluator>> evaluationObjects) { final List<VariantStratifier> strats = new ArrayList<VariantStratifier>(stratificationObjects); stratManager = new StratificationManager<VariantStratifier, EvaluationContext>(strats); logger.info("Creating " + stratManager.size() + " combinatorial stratification states"); for (int i = 0; i < stratManager.size(); i++) { EvaluationContext ec = new EvaluationContext(this, evaluationObjects); stratManager.set(i, ec); } } public final Map<String, IntervalTree<GenomeLoc>> createIntervalTreeByContig( final IntervalBinding<Feature> intervals) { final Map<String, IntervalTree<GenomeLoc>> byContig = new HashMap<String, IntervalTree<GenomeLoc>>(); final List<GenomeLoc> locs = intervals.getIntervals(getToolkit()); // set up the map from contig -> interval tree for (final String contig : getContigNames()) byContig.put(contig, new IntervalTree<GenomeLoc>()); for (final GenomeLoc loc : locs) { byContig.get(loc.getContig()).put(loc.getStart(), loc.getStop(), loc); } return byContig; } /** Collect relevant information from each variant in the supplied VCFs */ @Override public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { // we track the processed bp and expose this for modules instead of wasting CPU power on // calculating // the same thing over and over in evals that want the processed bp synchronized (this) { nProcessedLoci += context.getSkippedBases() + (ref == null ? 0 : 1); } if (tracker != null) { String aastr = (ancestralAlignments == null) ? null : new String( ancestralAlignments .getSubsequenceAt( ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStop()) .getBases()); // // update the dynamic stratifications // for (final VariantContext vc : tracker.getValues(evals, ref.getLocus())) { // // don't worry -- DynamicStratification only work with one eval object // for ( final DynamicStratification ds : dynamicStratifications ) { // ds.update(vc); // } // } // --------- track --------- sample - VariantContexts - HashMap<RodBinding<VariantContext>, HashMap<String, Collection<VariantContext>>> evalVCs = variantEvalUtils.bindVariantContexts( tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled, mergeEvals); HashMap<RodBinding<VariantContext>, HashMap<String, Collection<VariantContext>>> compVCs = variantEvalUtils.bindVariantContexts( tracker, ref, comps, byFilterIsEnabled, false, false, false); // for each eval track for (final RodBinding<VariantContext> evalRod : evals) { final Map<String, Collection<VariantContext>> emptyEvalMap = Collections.emptyMap(); final Map<String, Collection<VariantContext>> evalSet = evalVCs.containsKey(evalRod) ? evalVCs.get(evalRod) : emptyEvalMap; // for each sample stratifier for (final String sampleName : sampleNamesForStratification) { Collection<VariantContext> evalSetBySample = evalSet.get(sampleName); if (evalSetBySample == null) { evalSetBySample = new HashSet<VariantContext>(1); evalSetBySample.add(null); } // for each eval in the track for (VariantContext eval : evalSetBySample) { // deal with ancestral alleles if requested if (eval != null && aastr != null) { eval = new VariantContextBuilder(eval).attribute("ANCESTRALALLELE", aastr).make(); } // for each comp track for (final RodBinding<VariantContext> compRod : comps) { // no sample stratification for comps final HashMap<String, Collection<VariantContext>> compSetHash = compVCs.get(compRod); final Collection<VariantContext> compSet = (compSetHash == null || compSetHash.size() == 0) ? Collections.<VariantContext>emptyList() : compVCs.get(compRod).values().iterator().next(); // find the comp final VariantContext comp = findMatchingComp(eval, compSet); for (EvaluationContext nec : getEvaluationContexts( tracker, ref, eval, evalRod.getName(), comp, compRod.getName(), sampleName)) { // eval against the comp synchronized (nec) { nec.apply(tracker, ref, context, comp, eval); } // eval=null against all comps of different type that aren't bound to another eval for (VariantContext otherComp : compSet) { if (otherComp != comp && !compHasMatchingEval(otherComp, evalSetBySample)) { synchronized (nec) { nec.apply(tracker, ref, context, otherComp, null); } } } } } } } if (mergeEvals) break; // stop processing the eval tracks } } return null; } /** * Given specific eval and comp VCs and the sample name, return an iterable over all of the * applicable state keys. * * <p>this code isn't structured yet for efficiency. Here we currently are doing the following * inefficient algorithm: * * <p>for each strat: get list of relevant states that eval and comp according to strat add this * list of states to a list of list states * * <p>then * * <p>ask the strat manager to look up all of the keys associated with the combinations of these * states. For example, suppose we have a single variant S. We have active strats EvalRod, * CompRod, and Novelty. We produce a list that looks like: * * <p>L = [[Eval], [Comp], [All, Novel]] * * <p>We then go through the strat manager tree to produce the keys associated with these states: * * <p>K = [0, 1] where EVAL x COMP x ALL = 0 and EVAL x COMP x NOVEL = 1 * * <p>It's clear that a better * * <p>TODO -- create an inline version that doesn't create the intermediate list of list * * @param tracker * @param ref * @param eval * @param evalName * @param comp * @param compName * @param sampleName * @return */ protected Collection<EvaluationContext> getEvaluationContexts( final RefMetaDataTracker tracker, final ReferenceContext ref, final VariantContext eval, final String evalName, final VariantContext comp, final String compName, final String sampleName) { final List<List<Object>> states = new LinkedList<List<Object>>(); for (final VariantStratifier vs : stratManager.getStratifiers()) { states.add(vs.getRelevantStates(ref, tracker, comp, compName, eval, evalName, sampleName)); } return stratManager.values(states); } @Requires({"comp != null", "evals != null"}) private boolean compHasMatchingEval( final VariantContext comp, final Collection<VariantContext> evals) { // find all of the matching comps for (final VariantContext eval : evals) { if (eval != null && doEvalAndCompMatch(comp, eval, requireStrictAlleleMatch) != EvalCompMatchType.NO_MATCH) return true; } // nothing matched return false; } private enum EvalCompMatchType { NO_MATCH, STRICT, LENIENT } @Requires({"eval != null", "comp != null"}) private EvalCompMatchType doEvalAndCompMatch( final VariantContext eval, final VariantContext comp, boolean requireStrictAlleleMatch) { if (comp.getType() == VariantContext.Type.NO_VARIATION || eval.getType() == VariantContext.Type.NO_VARIATION) // if either of these are NO_VARIATION they are LENIENT matches return EvalCompMatchType.LENIENT; if (comp.getType() != eval.getType()) return EvalCompMatchType.NO_MATCH; // find the comp which matches both the reference allele and alternate allele from eval final Allele altEval = eval.getAlternateAlleles().size() == 0 ? null : eval.getAlternateAllele(0); final Allele altComp = comp.getAlternateAlleles().size() == 0 ? null : comp.getAlternateAllele(0); if ((altEval == null && altComp == null) || (altEval != null && altEval.equals(altComp) && eval.getReference().equals(comp.getReference()))) return EvalCompMatchType.STRICT; else return requireStrictAlleleMatch ? EvalCompMatchType.NO_MATCH : EvalCompMatchType.LENIENT; } private VariantContext findMatchingComp( final VariantContext eval, final Collection<VariantContext> comps) { // if no comps, return null if (comps == null || comps.isEmpty()) return null; // if no eval, return any comp if (eval == null) return comps.iterator().next(); // find all of the matching comps VariantContext lenientMatch = null; for (final VariantContext comp : comps) { switch (doEvalAndCompMatch(comp, eval, requireStrictAlleleMatch)) { case STRICT: return comp; case LENIENT: if (lenientMatch == null) lenientMatch = comp; break; case NO_MATCH: // do nothing } } // nothing matched, just return lenientMatch, which might be null return lenientMatch; } public Integer treeReduce(Integer lhs, Integer rhs) { return null; } @Override public Integer reduceInit() { return null; } @Override public Integer reduce(Integer value, Integer sum) { return null; } /** * Output the finalized report * * @param result an integer that doesn't get used for anything */ public void onTraversalDone(Integer result) { logger.info("Finalizing variant report"); // go through the evaluations and finalize them for (final EvaluationContext nec : stratManager.values()) for (final VariantEvaluator ve : nec.getVariantEvaluators()) ve.finalizeEvaluation(); VariantEvalReportWriter.writeReport( out, stratManager, stratManager.getStratifiers(), stratManager.get(0).getVariantEvaluators()); } // Accessors public Logger getLogger() { return logger; } public double getMinPhaseQuality() { return MIN_PHASE_QUALITY; } public int getSamplePloidy() { return ploidy; } public double getMendelianViolationQualThreshold() { return MENDELIAN_VIOLATION_QUAL_THRESHOLD; } public static String getAllSampleName() { return ALL_SAMPLE_NAME; } public List<RodBinding<VariantContext>> getKnowns() { return knowns; } public List<RodBinding<VariantContext>> getEvals() { return evals; } public boolean isSubsettingToSpecificSamples() { return isSubsettingSamples; } public Set<String> getSampleNamesForEvaluation() { return sampleNamesForEvaluation; } public int getNumberOfSamplesForEvaluation() { if (sampleNamesForEvaluation != null && !sampleNamesForEvaluation.isEmpty()) return sampleNamesForEvaluation.size(); else { return numSamplesFromArgument; } } public Set<String> getSampleNamesForStratification() { return sampleNamesForStratification; } public List<RodBinding<VariantContext>> getComps() { return comps; } public Set<SortableJexlVCMatchExp> getJexlExpressions() { return jexlExpressions; } public long getnProcessedLoci() { return nProcessedLoci; } public Set<String> getContigNames() { final TreeSet<String> contigs = new TreeSet<String>(); for (final SAMSequenceRecord r : getToolkit() .getReferenceDataSource() .getReference() .getSequenceDictionary() .getSequences()) { contigs.add(r.getSequenceName()); } return contigs; } /** * getToolkit is protected, so we have to pseudo-overload it here so eval / strats can get the * toolkit * * @return */ public GenomeAnalysisEngine getToolkit() { return super.getToolkit(); } public boolean ignoreAC0Sites() { return !keepSitesWithAC0; } }