/** * Takes the interval, finds it in the stash, prints it to the VCF * * @param stats The statistics of the interval * @param refAllele the reference allele */ private void outputStatsToVCF(final IntervalStratification stats, final Allele refAllele) { GenomeLoc interval = stats.getInterval(); final List<Allele> alleles = new ArrayList<>(); final Map<String, Object> attributes = new HashMap<>(); final ArrayList<Genotype> genotypes = new ArrayList<>(); for (String sample : samples) { final GenotypeBuilder gb = new GenotypeBuilder(sample); SampleStratification sampleStat = stats.getSampleStatistics(sample); gb.attribute( GATKVCFConstants.AVG_INTERVAL_DP_BY_SAMPLE_KEY, sampleStat.averageCoverage(interval.size())); gb.attribute(GATKVCFConstants.LOW_COVERAGE_LOCI, sampleStat.getNLowCoveredLoci()); gb.attribute(GATKVCFConstants.ZERO_COVERAGE_LOCI, sampleStat.getNUncoveredLoci()); gb.filters(statusToStrings(stats.getSampleStatistics(sample).callableStatuses(), false)); genotypes.add(gb.make()); } alleles.add(refAllele); alleles.add(SYMBOLIC_ALLELE); VariantContextBuilder vcb = new VariantContextBuilder( "DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStop(), alleles); vcb = vcb.log10PError(VariantContext.NO_LOG10_PERROR); vcb.filters(new LinkedHashSet<>(statusToStrings(stats.callableStatuses(), true))); attributes.put(VCFConstants.END_KEY, interval.getStop()); attributes.put(GATKVCFConstants.AVG_INTERVAL_DP_KEY, stats.averageCoverage(interval.size())); attributes.put(GATKVCFConstants.INTERVAL_GC_CONTENT_KEY, stats.gcContent()); vcb = vcb.attributes(attributes); vcb = vcb.genotypes(genotypes); vcfWriter.add(vcb.make()); }
/** * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority * order, if provided. If uniqifySamples is true, the priority order is ignored and names are * created by concatenating the VC name with the sample name * * @param genomeLocParser loc parser * @param unsortedVCs collection of unsorted VCs * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs * @param filteredRecordMergeType merge type for filtered records * @param genotypeMergeOptions merge option for genotypes * @param annotateOrigin should we annotate the set it came from? * @param printMessages should we print messages? * @param setKey the key name of the set * @param filteredAreUncalled are filtered records uncalled? * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? * @return new VariantContext representing the merge of unsortedVCs */ public static VariantContext simpleMerge( final GenomeLocParser genomeLocParser, final Collection<VariantContext> unsortedVCs, final List<String> priorityListOfVCs, final FilteredRecordMergeType filteredRecordMergeType, final GenotypeMergeType genotypeMergeOptions, final boolean annotateOrigin, final boolean printMessages, final String setKey, final boolean filteredAreUncalled, final boolean mergeInfoWithMaxAC) { if (unsortedVCs == null || unsortedVCs.size() == 0) return null; if (annotateOrigin && priorityListOfVCs == null) throw new IllegalArgumentException( "Cannot merge calls and annotate their origins without a complete priority list of VariantContexts"); if (genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE) verifyUniqueSampleNames(unsortedVCs); List<VariantContext> prepaddedVCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions); // Make sure all variant contexts are padded with reference base in case of indels if necessary List<VariantContext> VCs = new ArrayList<VariantContext>(); for (VariantContext vc : prepaddedVCs) { // also a reasonable place to remove filtered calls, if needed if (!filteredAreUncalled || vc.isNotFiltered()) VCs.add(createVariantContextWithPaddedAlleles(vc, false)); } if (VCs.size() == 0) // everything is filtered out and we're filteredAreUncalled return null; // establish the baseline info from the first VC final VariantContext first = VCs.get(0); final String name = first.getSource(); final Allele refAllele = determineReferenceAllele(VCs); final Set<Allele> alleles = new LinkedHashSet<Allele>(); final Set<String> filters = new TreeSet<String>(); final Map<String, Object> attributes = new TreeMap<String, Object>(); final Set<String> inconsistentAttributes = new HashSet<String>(); final Set<String> variantSources = new HashSet< String>(); // contains the set of sources we found in our set of VCs that are variant final Set<String> rsIDs = new LinkedHashSet<String>(1); // most of the time there's one id GenomeLoc loc = getLocation(genomeLocParser, first); int depth = 0; int maxAC = -1; final Map<String, Object> attributesWithMaxAC = new TreeMap<String, Object>(); double log10PError = 1; VariantContext vcWithMaxAC = null; GenotypesContext genotypes = GenotypesContext.create(); // counting the number of filtered and variant VCs int nFiltered = 0; boolean remapped = false; // cycle through and add info from the other VCs, making sure the loc/reference matches for (VariantContext vc : VCs) { if (loc.getStart() != vc.getStart()) // || !first.getReference().equals(vc.getReference()) ) throw new ReviewedStingException( "BUG: attempting to merge VariantContexts with different start sites: first=" + first.toString() + " second=" + vc.toString()); if (getLocation(genomeLocParser, vc).size() > loc.size()) loc = getLocation(genomeLocParser, vc); // get the longest location nFiltered += vc.isFiltered() ? 1 : 0; if (vc.isVariant()) variantSources.add(vc.getSource()); AlleleMapper alleleMapping = resolveIncompatibleAlleles(refAllele, vc, alleles); remapped = remapped || alleleMapping.needsRemapping(); alleles.addAll(alleleMapping.values()); mergeGenotypes( genotypes, vc, alleleMapping, genotypeMergeOptions == GenotypeMergeType.UNIQUIFY); log10PError = Math.min(log10PError, vc.isVariant() ? vc.getLog10PError() : 1); filters.addAll(vc.getFilters()); // // add attributes // // special case DP (add it up) and ID (just preserve it) // if (vc.hasAttribute(VCFConstants.DEPTH_KEY)) depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0); if (vc.hasID()) rsIDs.add(vc.getID()); if (mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) { String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, null); // lets see if the string contains a , separator if (rawAlleleCounts.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)) { List<String> alleleCountArray = Arrays.asList( rawAlleleCounts .substring(1, rawAlleleCounts.length() - 1) .split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)); for (String alleleCount : alleleCountArray) { final int ac = Integer.valueOf(alleleCount.trim()); if (ac > maxAC) { maxAC = ac; vcWithMaxAC = vc; } } } else { final int ac = Integer.valueOf(rawAlleleCounts); if (ac > maxAC) { maxAC = ac; vcWithMaxAC = vc; } } } for (Map.Entry<String, Object> p : vc.getAttributes().entrySet()) { String key = p.getKey(); // if we don't like the key already, don't go anywhere if (!inconsistentAttributes.contains(key)) { boolean alreadyFound = attributes.containsKey(key); Object boundValue = attributes.get(key); boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4); if (alreadyFound && !boundValue.equals(p.getValue()) && !boundIsMissingValue) { // we found the value but we're inconsistent, put it in the exclude list // System.out.printf("Inconsistent INFO values: %s => %s and %s%n", key, boundValue, // p.getValue()); inconsistentAttributes.add(key); attributes.remove(key); } else if (!alreadyFound || boundIsMissingValue) { // no value // if ( vc != first ) System.out.printf("Adding key %s => %s%n", p.getKey(), // p.getValue()); attributes.put(key, p.getValue()); } } } } // if we have more alternate alleles in the merged VC than in one or more of the // original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well // as allele-dependent attributes like AC,AF for (VariantContext vc : VCs) { if (vc.alleles.size() == 1) continue; if (hasPLIncompatibleAlleles(alleles, vc.alleles)) { if (!genotypes.isEmpty()) logger.warn( String.format( "Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s", genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles)); genotypes = stripPLs(genotypes); // this will remove stale AC,AF attributed from vc calculateChromosomeCounts(vc, attributes, true); break; } } // take the VC with the maxAC and pull the attributes into a modifiable map if (mergeInfoWithMaxAC && vcWithMaxAC != null) { attributesWithMaxAC.putAll(vcWithMaxAC.getAttributes()); } // if at least one record was unfiltered and we want a union, clear all of the filters if ((filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size()) || filteredRecordMergeType == FilteredRecordMergeType.KEEP_UNCONDITIONAL) filters.clear(); if (annotateOrigin) { // we care about where the call came from String setValue; if (nFiltered == 0 && variantSources.size() == priorityListOfVCs.size()) // nothing was unfiltered setValue = MERGE_INTERSECTION; else if (nFiltered == VCs.size()) // everything was filtered out setValue = MERGE_FILTER_IN_ALL; else if (variantSources.isEmpty()) // everyone was reference setValue = MERGE_REF_IN_ALL; else { LinkedHashSet<String> s = new LinkedHashSet<String>(); for (VariantContext vc : VCs) if (vc.isVariant()) s.add(vc.isFiltered() ? MERGE_FILTER_PREFIX + vc.getSource() : vc.getSource()); setValue = Utils.join("-", s); } if (setKey != null) { attributes.put(setKey, setValue); if (mergeInfoWithMaxAC && vcWithMaxAC != null) { attributesWithMaxAC.put(setKey, vcWithMaxAC.getSource()); } } } if (depth > 0) attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth)); final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs); final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID); builder.loc(loc.getContig(), loc.getStart(), loc.getStop()); builder.alleles(alleles); builder.genotypes(genotypes); builder.log10PError(log10PError); builder.filters(filters).attributes(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes); // Trim the padded bases of all alleles if necessary VariantContext merged = createVariantContextWithTrimmedAlleles(builder.make()); if (printMessages && remapped) System.out.printf("Remapped => %s%n", merged); return merged; }
/** * Main entry function to calculate genotypes of a given VC with corresponding GL's * * @param tracker Tracker * @param refContext Reference context * @param rawContext Raw context * @param stratifiedContexts Stratified alignment contexts * @param vc Input VC * @param model GL calculation model * @param inheritAttributesFromInputVC Output VC will contain attributes inherited from input vc * @return VC with assigned genotypes */ public VariantCallContext calculateGenotypes( final RefMetaDataTracker tracker, final ReferenceContext refContext, final AlignmentContext rawContext, Map<String, AlignmentContext> stratifiedContexts, final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model, final boolean inheritAttributesFromInputVC, final Map<String, org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap) { boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null; // initialize the data for this thread if that hasn't been done yet if (afcm.get() == null) { afcm.set(AFCalcFactory.createAFCalc(UAC, N, logger)); } // estimate our confidence in a reference call and return if (vc.getNSamples() == 0) { if (limitedContext) return null; return (UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES ? estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), false, 1.0) : generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext)); } AFCalcResult AFresult = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model)); // is the most likely frequency conformation AC=0 for all alternate alleles? boolean bestGuessIsRef = true; // determine which alternate alleles have AF>0 final List<Allele> myAlleles = new ArrayList<Allele>(vc.getAlleles().size()); final List<Integer> alleleCountsofMLE = new ArrayList<Integer>(vc.getAlleles().size()); myAlleles.add(vc.getReference()); for (int i = 0; i < AFresult.getAllelesUsedInGenotyping().size(); i++) { final Allele alternateAllele = AFresult.getAllelesUsedInGenotyping().get(i); if (alternateAllele.isReference()) continue; // we are non-ref if the probability of being non-ref > the emit confidence. // the emit confidence is phred-scaled, say 30 => 10^-3. // the posterior AF > 0 is log10: -5 => 10^-5 // we are non-ref if 10^-5 < 10^-3 => -5 < -3 final boolean isNonRef = AFresult.isPolymorphic(alternateAllele, UAC.STANDARD_CONFIDENCE_FOR_EMITTING / -10.0); // if the most likely AC is not 0, then this is a good alternate allele to use if (isNonRef) { myAlleles.add(alternateAllele); alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele)); bestGuessIsRef = false; } // if in GENOTYPE_GIVEN_ALLELES mode, we still want to allow the use of a poor allele else if (UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) { myAlleles.add(alternateAllele); alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele)); } } final double PoFGT0 = Math.pow(10, AFresult.getLog10PosteriorOfAFGT0()); // note the math.abs is necessary because -10 * 0.0 => -0.0 which isn't nice final double phredScaledConfidence = Math.abs( !bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE .GENOTYPE_GIVEN_ALLELES ? -10 * AFresult.getLog10PosteriorOfAFEq0() : -10 * AFresult.getLog10PosteriorOfAFGT0()); // return a null call if we don't pass the confidence cutoff or the most likely allele frequency // is zero if (UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestGuessIsRef)) { // technically, at this point our confidence in a reference call isn't accurately estimated // because it didn't take into account samples with no data, so let's get a better estimate return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, PoFGT0); } // start constructing the resulting VC final GenomeLoc loc = genomeLocParser.createGenomeLoc(vc); final VariantContextBuilder builder = new VariantContextBuilder( "UG_call", loc.getContig(), loc.getStart(), loc.getStop(), myAlleles); builder.log10PError(phredScaledConfidence / -10.0); if (!passesCallThreshold(phredScaledConfidence)) builder.filters(filter); // create the genotypes final GenotypesContext genotypes = afcm.get().subsetAlleles(vc, myAlleles, true, ploidy); builder.genotypes(genotypes); // print out stats if we have a writer if (verboseWriter != null && !limitedContext) printVerboseData(refContext.getLocus().toString(), vc, PoFGT0, phredScaledConfidence, model); // *** note that calculating strand bias involves overwriting data structures, so we do that // last final HashMap<String, Object> attributes = new HashMap<String, Object>(); // inherit attributed from input vc if requested if (inheritAttributesFromInputVC) attributes.putAll(vc.getAttributes()); // if the site was downsampled, record that fact if (!limitedContext && rawContext.hasPileupBeenDownsampled()) attributes.put(VCFConstants.DOWNSAMPLED_KEY, true); if (UAC.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED) attributes.put(NUMBER_OF_DISCOVERED_ALLELES_KEY, vc.getAlternateAlleles().size()); // add the MLE AC and AF annotations if (alleleCountsofMLE.size() > 0) { attributes.put(VCFConstants.MLE_ALLELE_COUNT_KEY, alleleCountsofMLE); final int AN = builder.make().getCalledChrCount(); final ArrayList<Double> MLEfrequencies = new ArrayList<Double>(alleleCountsofMLE.size()); // the MLEAC is allowed to be larger than the AN (e.g. in the case of all PLs being 0, the GT // is ./. but the exact model may arbitrarily choose an AC>1) for (int AC : alleleCountsofMLE) MLEfrequencies.add(Math.min(1.0, (double) AC / (double) AN)); attributes.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, MLEfrequencies); } if (UAC.COMPUTE_SLOD && !limitedContext && !bestGuessIsRef) { // final boolean DEBUG_SLOD = false; // the overall lod // double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0]; double overallLog10PofF = AFresult.getLog10LikelihoodOfAFGT0(); // if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); List<Allele> allAllelesToUse = builder.make().getAlleles(); // the forward lod VariantContext vcForward = calculateLikelihoods( tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, allAllelesToUse, false, model, perReadAlleleLikelihoodMap); AFresult = afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model)); // double[] normalizedLog10Posteriors = // MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); double forwardLog10PofNull = AFresult.getLog10LikelihoodOfAFEq0(); double forwardLog10PofF = AFresult.getLog10LikelihoodOfAFGT0(); // if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", // forwardLog10PofF=" + forwardLog10PofF); // the reverse lod VariantContext vcReverse = calculateLikelihoods( tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, allAllelesToUse, false, model, perReadAlleleLikelihoodMap); AFresult = afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model)); // normalizedLog10Posteriors = // MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); double reverseLog10PofNull = AFresult.getLog10LikelihoodOfAFEq0(); double reverseLog10PofF = AFresult.getLog10LikelihoodOfAFGT0(); // if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", // reverseLog10PofF=" + reverseLog10PofF); double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF; double reverseLod = reverseLog10PofF + forwardLog10PofNull - overallLog10PofF; // if ( DEBUG_SLOD ) System.out.println("forward lod=" + forwardLod + ", reverse lod=" + // reverseLod); // strand score is max bias between forward and reverse strands double strandScore = Math.max(forwardLod, reverseLod); // rescale by a factor of 10 strandScore *= 10.0; // logger.debug(String.format("SLOD=%f", strandScore)); if (!Double.isNaN(strandScore)) attributes.put("SB", strandScore); } // finish constructing the resulting VC builder.attributes(attributes); VariantContext vcCall = builder.make(); // if we are subsetting alleles (either because there were too many or because some were not // polymorphic) // then we may need to trim the alleles (because the original VariantContext may have had to pad // at the end). if (myAlleles.size() != vc.getAlleles().size() && !limitedContext) // limitedContext callers need to handle allele trimming on their own to // keep their perReadAlleleLikelihoodMap alleles in sync vcCall = VariantContextUtils.reverseTrimAlleles(vcCall); if (annotationEngine != null && !limitedContext) { // limitedContext callers need to handle annotations on their own by // calling their own annotationEngine // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations final ReadBackedPileup pileup = rawContext.getBasePileup(); stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); vcCall = annotationEngine.annotateContext( tracker, refContext, stratifiedContexts, vcCall, perReadAlleleLikelihoodMap); } return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PoFGT0)); }