private boolean isConcordant(VariantContext vc, Collection<VariantContext> compVCs) { if (vc == null || compVCs == null || compVCs.isEmpty()) return false; // if we're not looking for specific samples then the fact that we have both VCs is enough to // call it concordant. if (NO_SAMPLES_SPECIFIED) return true; // make a list of all samples contained in this variant VC that are being tracked by the user // command line arguments. Set<String> variantSamples = vc.getSampleNames(); variantSamples.retainAll(samples); // check if we can find all samples from the variant rod in the comp rod. for (String sample : variantSamples) { boolean foundSample = false; for (VariantContext compVC : compVCs) { Genotype varG = vc.getGenotype(sample); Genotype compG = compVC.getGenotype(sample); if (haveSameGenotypes(varG, compG)) { foundSample = true; break; } } // if at least one sample doesn't have the same genotype, we don't have concordance if (!foundSample) { return false; } } return true; }
public int getNumberOfSamplesForEvaluation() { if (sampleNamesForEvaluation != null && !sampleNamesForEvaluation.isEmpty()) return sampleNamesForEvaluation.size(); else { return numSamplesFromArgument; } }
private void initializeVcfWriter() { final List<String> inputNames = Arrays.asList(validation.getName()); // setup the header fields Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>(); hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames)); hInfo.add( new VCFFilterHeaderLine( "bootstrap", "This site used for genotype bootstrapping with ProduceBeagleInputWalker")); bootstrapVCFOutput.writeHeader( new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames))); }
/** * Helper method to subset a VC record, modifying some metadata stored in the INFO field (i.e. AN, * AC, AF). * * @param vc the VariantContext record to subset * @param samples the samples to extract * @return the subsetted VariantContext */ private VariantContext subsetRecord(VariantContext vc, Set<String> samples) { if (samples == null || samples.isEmpty()) return vc; ArrayList<Genotype> genotypes = new ArrayList<Genotype>(); for (Map.Entry<String, Genotype> genotypePair : vc.getGenotypes().entrySet()) { if (samples.contains(genotypePair.getKey())) genotypes.add(genotypePair.getValue()); } VariantContext sub = vc.subContextFromGenotypes(genotypes, vc.getAlleles()); // if we have fewer alternate alleles in the selected VC than in the original VC, we need to // strip out the GL/PLs (because they are no longer accurate) if (vc.getAlleles().size() != sub.getAlleles().size()) sub = VariantContext.modifyGenotypes(sub, VariantContextUtils.stripPLs(vc.getGenotypes())); HashMap<String, Object> attributes = new HashMap<String, Object>(sub.getAttributes()); int depth = 0; for (String sample : sub.getSampleNames()) { Genotype g = sub.getGenotype(sample); if (g.isNotFiltered() && g.isCalled()) { String dp = (String) g.getAttribute("DP"); if (dp != null && !dp.equals(VCFConstants.MISSING_DEPTH_v3) && !dp.equals(VCFConstants.MISSING_VALUE_v4)) { depth += Integer.valueOf(dp); } } } if (KEEP_ORIGINAL_CHR_COUNTS) { if (attributes.containsKey(VCFConstants.ALLELE_COUNT_KEY)) attributes.put("AC_Orig", attributes.get(VCFConstants.ALLELE_COUNT_KEY)); if (attributes.containsKey(VCFConstants.ALLELE_FREQUENCY_KEY)) attributes.put("AF_Orig", attributes.get(VCFConstants.ALLELE_FREQUENCY_KEY)); if (attributes.containsKey(VCFConstants.ALLELE_NUMBER_KEY)) attributes.put("AN_Orig", attributes.get(VCFConstants.ALLELE_NUMBER_KEY)); } VariantContextUtils.calculateChromosomeCounts(sub, attributes, false); attributes.put("DP", depth); sub = VariantContext.modifyAttributes(sub, attributes); return sub; }
/** * Compares the covariate report lists. * * @param diffs map where to annotate the difference. * @param other the argument collection to compare against. * @param thisRole the name for this argument collection that makes sense to the user. * @param otherRole the name for the other argument collection that makes sense to the end user. * @return <code>true</code> if a difference was found. */ @Requires("diffs != null && other != null && thisRole != null && otherRole != null") private boolean compareRequestedCovariates( final Map<String, String> diffs, final RecalibrationArgumentCollection other, final String thisRole, final String otherRole) { final Set<String> beforeNames = new HashSet<>(this.COVARIATES.length); final Set<String> afterNames = new HashSet<>(other.COVARIATES.length); Utils.addAll(beforeNames, this.COVARIATES); Utils.addAll(afterNames, other.COVARIATES); final Set<String> intersect = new HashSet<>(Math.min(beforeNames.size(), afterNames.size())); intersect.addAll(beforeNames); intersect.retainAll(afterNames); String diffMessage = null; if (intersect.size() == 0) { // In practice this is not possible due to required covariates but... diffMessage = String.format( "There are no common covariates between '%s' and '%s'" + " recalibrator reports. Covariates in '%s': {%s}. Covariates in '%s': {%s}.", thisRole, otherRole, thisRole, Utils.join(", ", this.COVARIATES), otherRole, Utils.join(",", other.COVARIATES)); } else if (intersect.size() != beforeNames.size() || intersect.size() != afterNames.size()) { beforeNames.removeAll(intersect); afterNames.removeAll(intersect); diffMessage = String.format( "There are differences in the set of covariates requested in the" + " '%s' and '%s' recalibrator reports. " + " Exclusive to '%s': {%s}. Exclusive to '%s': {%s}.", thisRole, otherRole, thisRole, Utils.join(", ", beforeNames), otherRole, Utils.join(", ", afterNames)); } if (diffMessage != null) { diffs.put("covariate", diffMessage); return true; } else { return false; } }
/** Initialize the stratifications, evaluations, evaluation contexts, and reporting object */ public void initialize() { // Just list the modules, and exit quickly. if (LIST) { variantEvalUtils.listModulesAndExit(); } // maintain the full list of comps comps.addAll(compsProvided); if (dbsnp.dbsnp.isBound()) { comps.add(dbsnp.dbsnp); knowns.add(dbsnp.dbsnp); } // Add a dummy comp track if none exists if (comps.size() == 0) comps.add( new RodBinding<VariantContext>(VariantContext.class, "none", "UNBOUND", "", new Tags())); // Set up set of additional knowns for (RodBinding<VariantContext> compRod : comps) { if (KNOWN_NAMES.contains(compRod.getName())) knowns.add(compRod); } // Now that we have all the rods categorized, determine the sample list from the eval rods. Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), evals); Set<String> vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); // Load the sample list sampleNamesForEvaluation.addAll( SampleUtils.getSamplesFromCommandLineInput(vcfSamples, SAMPLE_EXPRESSIONS)); numSamples = NUM_SAMPLES > 0 ? NUM_SAMPLES : sampleNamesForEvaluation.size(); if (Arrays.asList(STRATIFICATIONS_TO_USE).contains("Sample")) { sampleNamesForStratification.addAll(sampleNamesForEvaluation); } sampleNamesForStratification.add(ALL_SAMPLE_NAME); // Initialize select expressions for (VariantContextUtils.JexlVCMatchExp jexl : VariantContextUtils.initializeMatchExps(SELECT_NAMES, SELECT_EXPS)) { SortableJexlVCMatchExp sjexl = new SortableJexlVCMatchExp(jexl.name, jexl.exp); jexlExpressions.add(sjexl); } // Initialize the set of stratifications and evaluations to use stratificationObjects = variantEvalUtils.initializeStratificationObjects( this, NO_STANDARD_STRATIFICATIONS, STRATIFICATIONS_TO_USE); Set<Class<? extends VariantEvaluator>> evaluationObjects = variantEvalUtils.initializeEvaluationObjects(NO_STANDARD_MODULES, MODULES_TO_USE); for (VariantStratifier vs : getStratificationObjects()) { if (vs.getName().equals("Filter")) byFilterIsEnabled = true; else if (vs.getName().equals("Sample")) perSampleIsEnabled = true; } if (intervalsFile != null) { boolean fail = true; for (final VariantStratifier vs : stratificationObjects) { if (vs.getClass().equals(IntervalStratification.class)) fail = false; } if (fail) throw new UserException.BadArgumentValue( "ST", "stratIntervals argument provided but -ST IntervalStratification not provided"); } // Initialize the evaluation contexts evaluationContexts = variantEvalUtils.initializeEvaluationContexts( stratificationObjects, evaluationObjects, null, null); // Initialize report table report = variantEvalUtils.initializeGATKReport(stratificationObjects, evaluationObjects); // Load ancestral alignments if (ancestralAlignmentsFile != null) { try { ancestralAlignments = new IndexedFastaSequenceFile(ancestralAlignmentsFile); } catch (FileNotFoundException e) { throw new ReviewedStingException( String.format( "The ancestral alignments file, '%s', could not be found", ancestralAlignmentsFile.getAbsolutePath())); } } // initialize CNVs if (knownCNVsFile != null) { knownCNVsByContig = createIntervalTreeByContig(knownCNVsFile); } }
/** Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher */ public void initialize() { // Get list of samples to include in the output List<String> rodNames = Arrays.asList(variantCollection.variants.getName()); Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); TreeSet<String> vcfSamples = new TreeSet<String>( SampleUtils.getSampleList( vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)); Collection<String> samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles); Collection<String> samplesFromExpressions = SampleUtils.matchSamplesExpressions(vcfSamples, sampleExpressions); // first, add any requested samples samples.addAll(samplesFromFile); samples.addAll(samplesFromExpressions); samples.addAll(sampleNames); // if none were requested, we want all of them if (samples.isEmpty()) { samples.addAll(vcfSamples); NO_SAMPLES_SPECIFIED = true; } // now, exclude any requested samples Collection<String> XLsamplesFromFile = SampleUtils.getSamplesFromFiles(XLsampleFiles); samples.removeAll(XLsamplesFromFile); samples.removeAll(XLsampleNames); if (samples.size() == 0 && !NO_SAMPLES_SPECIFIED) throw new UserException( "All samples requested to be included were also requested to be excluded."); for (String sample : samples) logger.info("Including sample '" + sample + "'"); // if user specified types to include, add these, otherwise, add all possible variant context // types to list of vc types to include if (TYPES_TO_INCLUDE.isEmpty()) { for (VariantContext.Type t : VariantContext.Type.values()) selectedTypes.add(t); } else { for (VariantContext.Type t : TYPES_TO_INCLUDE) selectedTypes.add(t); } // Initialize VCF header Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger); headerLines.add(new VCFHeaderLine("source", "SelectVariants")); if (KEEP_ORIGINAL_CHR_COUNTS) { headerLines.add( new VCFFormatHeaderLine("AC_Orig", 1, VCFHeaderLineType.Integer, "Original AC")); headerLines.add( new VCFFormatHeaderLine("AF_Orig", 1, VCFHeaderLineType.Float, "Original AF")); headerLines.add( new VCFFormatHeaderLine("AN_Orig", 1, VCFHeaderLineType.Integer, "Original AN")); } vcfWriter.writeHeader(new VCFHeader(headerLines, samples)); for (int i = 0; i < SELECT_EXPRESSIONS.size(); i++) { // It's not necessary that the user supply select names for the JEXL expressions, since those // expressions will only be needed for omitting records. Make up the select names here. selectNames.add(String.format("select-%d", i)); } jexls = VariantContextUtils.initializeMatchExps(selectNames, SELECT_EXPRESSIONS); // Look at the parameters to decide which analysis to perform DISCORDANCE_ONLY = discordanceTrack.isBound(); if (DISCORDANCE_ONLY) logger.info( "Selecting only variants discordant with the track: " + discordanceTrack.getName()); CONCORDANCE_ONLY = concordanceTrack.isBound(); if (CONCORDANCE_ONLY) logger.info( "Selecting only variants concordant with the track: " + concordanceTrack.getName()); if (MENDELIAN_VIOLATIONS) { if (FAMILY_STRUCTURE_FILE != null) { try { for (final String line : new XReadLines(FAMILY_STRUCTURE_FILE)) { MendelianViolation mv = new MendelianViolation(line, MENDELIAN_VIOLATION_QUAL_THRESHOLD); if (samples.contains(mv.getSampleChild()) && samples.contains(mv.getSampleDad()) && samples.contains(mv.getSampleMom())) mvSet.add(mv); } } catch (FileNotFoundException e) { throw new UserException.CouldNotReadInputFile(FAMILY_STRUCTURE_FILE, e); } if (outMVFile != null) try { outMVFileStream = new PrintStream(outMVFile); } catch (FileNotFoundException e) { throw new UserException.CouldNotCreateOutputFile( outMVFile, "Can't open output file", e); } } else mvSet.add(new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD)); } else if (!FAMILY_STRUCTURE.isEmpty()) { mvSet.add(new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD)); MENDELIAN_VIOLATIONS = true; } SELECT_RANDOM_NUMBER = numRandom > 0; if (SELECT_RANDOM_NUMBER) { logger.info("Selecting " + numRandom + " variants at random from the variant track"); variantArray = new RandomVariantStructure[numRandom]; } SELECT_RANDOM_FRACTION = fractionRandom > 0; if (SELECT_RANDOM_FRACTION) logger.info( "Selecting approximately " + 100.0 * fractionRandom + "% of the variants at random from the variant track"); }