/**
 * Returns a context identical to the provided vc, but with the REF and ALT alleles reverse complemented.
 *
 * @param vc variant context
 * @return new vc
 */
public static VariantContext reverseComplement(VariantContext vc) {
    // create a mapping from original allele to reverse complemented allele
    HashMap<Allele, Allele> alleleMap = new HashMap<Allele, Allele>(vc.getAlleles().size());
    for (Allele originalAllele : vc.getAlleles()) {
        Allele newAllele;
        if (originalAllele.isNoCall() || originalAllele.isNull())
            newAllele = originalAllele;
        else
            newAllele = Allele.create(BaseUtils.simpleReverseComplement(originalAllele.getBases()), originalAllele.isReference());
        alleleMap.put(originalAllele, newAllele);
    }

    // create new Genotype objects
    GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples());
    for (final Genotype genotype : vc.getGenotypes()) {
        List<Allele> newAlleles = new ArrayList<Allele>();
        for (Allele allele : genotype.getAlleles()) {
            Allele newAllele = alleleMap.get(allele);
            if (newAllele == null)
                newAllele = Allele.NO_CALL;
            newAlleles.add(newAllele);
        }
        newGenotypes.add(Genotype.modifyAlleles(genotype, newAlleles));
    }

    return new VariantContextBuilder(vc).alleles(alleleMap.values()).genotypes(newGenotypes).make();
}
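// A minimal usage sketch for reverseComplement (hypothetical coordinates and
// alleles; assumes the Allele.create(String, boolean) overload and the
// VariantContextBuilder(source, contig, start, stop, alleles) constructor used
// elsewhere in this codebase):
List<Allele> snpAlleles = Arrays.asList(
        Allele.create("C", true),    // REF
        Allele.create("G", false));  // ALT
VariantContext snp = new VariantContextBuilder("example", "chr1", 1000, 1000, snpAlleles).make();

VariantContext rc = reverseComplement(snp);
// rc now has REF=G and ALT=C, and every genotype's alleles have been remapped
// through the same original-to-reverse-complemented mapping.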
/**
 * Helper method to subset a VC record, modifying some metadata stored in the INFO field
 * (i.e. AN, AC, AF).
 *
 * @param vc      the VariantContext record to subset
 * @param samples the samples to extract
 * @return the subsetted VariantContext
 */
private VariantContext subsetRecord(VariantContext vc, Set<String> samples) {
    if (samples == null || samples.isEmpty())
        return vc;

    ArrayList<Genotype> genotypes = new ArrayList<Genotype>();
    for (Map.Entry<String, Genotype> genotypePair : vc.getGenotypes().entrySet()) {
        if (samples.contains(genotypePair.getKey()))
            genotypes.add(genotypePair.getValue());
    }

    VariantContext sub = vc.subContextFromGenotypes(genotypes, vc.getAlleles());

    // if we have fewer alternate alleles in the selected VC than in the original VC, we need to
    // strip out the GL/PLs (because they are no longer accurate)
    if (vc.getAlleles().size() != sub.getAlleles().size())
        sub = VariantContext.modifyGenotypes(sub, VariantContextUtils.stripPLs(sub.getGenotypes()));

    HashMap<String, Object> attributes = new HashMap<String, Object>(sub.getAttributes());

    int depth = 0;
    for (String sample : sub.getSampleNames()) {
        Genotype g = sub.getGenotype(sample);
        if (g.isNotFiltered() && g.isCalled()) {
            String dp = (String) g.getAttribute("DP");
            if (dp != null && !dp.equals(VCFConstants.MISSING_DEPTH_v3) && !dp.equals(VCFConstants.MISSING_VALUE_v4)) {
                depth += Integer.valueOf(dp);
            }
        }
    }

    if (KEEP_ORIGINAL_CHR_COUNTS) {
        if (attributes.containsKey(VCFConstants.ALLELE_COUNT_KEY))
            attributes.put("AC_Orig", attributes.get(VCFConstants.ALLELE_COUNT_KEY));
        if (attributes.containsKey(VCFConstants.ALLELE_FREQUENCY_KEY))
            attributes.put("AF_Orig", attributes.get(VCFConstants.ALLELE_FREQUENCY_KEY));
        if (attributes.containsKey(VCFConstants.ALLELE_NUMBER_KEY))
            attributes.put("AN_Orig", attributes.get(VCFConstants.ALLELE_NUMBER_KEY));
    }

    VariantContextUtils.calculateChromosomeCounts(sub, attributes, false);
    attributes.put("DP", depth);

    sub = VariantContext.modifyAttributes(sub, attributes);

    return sub;
}
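// A hypothetical invocation of subsetRecord (sample names invented for
// illustration):
Set<String> samplesToKeep = new HashSet<String>(Arrays.asList("NA12878", "NA12891"));
VariantContext sub = subsetRecord(vc, samplesToKeep);
// AC/AF/AN in the INFO field are recomputed for the two remaining samples;
// with KEEP_ORIGINAL_CHR_COUNTS set, the pre-subset values survive as
// AC_Orig/AF_Orig/AN_Orig, and DP is re-aggregated from the called,
// unfiltered genotypes that remain.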
@Test(dataProvider = "mergeFiltered") public void testMergeFiltered(MergeFilteredTest cfg) { final List<String> priority = vcs2priority(cfg.inputs); final VariantContext merged = VariantContextUtils.simpleMerge( genomeLocParser, cfg.inputs, priority, cfg.type, VariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false); // test alleles are equal Assert.assertEquals(merged.getAlleles(), cfg.expected.getAlleles()); // test set field Assert.assertEquals(merged.getAttribute("set"), cfg.setExpected); // test filter field Assert.assertEquals(merged.getFilters(), cfg.expected.getFilters()); }
@Test(dataProvider = "mergeAlleles") public void testMergeAlleles(MergeAllelesTest cfg) { final List<VariantContext> inputs = new ArrayList<VariantContext>(); int i = 0; for (final List<Allele> alleles : cfg.inputs) { final String name = "vcf" + ++i; inputs.add(makeVC(name, alleles)); } final List<String> priority = vcs2priority(inputs); final VariantContext merged = VariantContextUtils.simpleMerge( genomeLocParser, inputs, priority, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, "set", false, false); Assert.assertEquals(merged.getAlleles(), cfg.expected); }
protected void printVerboseData(String pos, VariantContext vc, double PofF, double phredScaledConfidence,
                                final GenotypeLikelihoodsCalculationModel.Model model) {
    Allele refAllele = null, altAllele = null;
    for (Allele allele : vc.getAlleles()) {
        if (allele.isReference())
            refAllele = allele;
        else
            altAllele = allele;
    }

    for (int i = 0; i <= N; i++) {
        StringBuilder AFline = new StringBuilder("AFINFO\t");
        AFline.append(pos);
        AFline.append("\t");
        AFline.append(refAllele);
        AFline.append("\t");
        if (altAllele != null)
            AFline.append(altAllele);
        else
            AFline.append("N/A");
        AFline.append("\t");
        AFline.append(i + "/" + N + "\t");
        AFline.append(String.format("%.2f\t", ((float) i) / N));
        AFline.append(String.format("%.8f\t", getAlleleFrequencyPriors(model)[i]));
        verboseWriter.println(AFline.toString());
    }

    verboseWriter.println("P(f>0) = " + PofF);
    verboseWriter.println("Qscore = " + phredScaledConfidence);
    verboseWriter.println();
}
protected final void printCallInfo(final VariantContext vc,
                                   final double[] log10AlleleFrequencyPriors,
                                   final long runtimeNano,
                                   final AFCalcResult result) {
    printCallElement(vc, "type", "ignore", vc.getType());

    int allelei = 0;
    for (final Allele a : vc.getAlleles())
        printCallElement(vc, "allele", allelei++, a.getDisplayString());

    for (final Genotype g : vc.getGenotypes())
        printCallElement(vc, "PL", g.getSampleName(), g.getLikelihoodsString());

    for (int priorI = 0; priorI < log10AlleleFrequencyPriors.length; priorI++)
        printCallElement(vc, "priorI", priorI, log10AlleleFrequencyPriors[priorI]);

    printCallElement(vc, "runtime.nano", "ignore", runtimeNano);
    printCallElement(vc, "log10PosteriorOfAFEq0", "ignore", result.getLog10PosteriorOfAFEq0());
    printCallElement(vc, "log10PosteriorOfAFGt0", "ignore", result.getLog10PosteriorOfAFGT0());

    for (final Allele allele : result.getAllelesUsedInGenotyping()) {
        if (allele.isNonReference()) {
            printCallElement(vc, "MLE", allele, result.getAlleleCountAtMLE(allele));
            printCallElement(vc, "pNonRefByAllele", allele, result.getLog10PosteriorOfAFGt0ForAllele(allele));
        }
    }

    callReport.flush();
}
@Override
public String toString() {
    return String.format("ExactCall %s:%d alleles=%s nSamples=%s orig.pNonRef=%.2f orig.runtime=%s",
            vc.getChr(), vc.getStart(), vc.getAlleles(), vc.getNSamples(),
            originalCall.getLog10PosteriorOfAFGT0(),
            new AutoFormattingTime(runtime / 1e9).toString());
}
/**
 * Copy constructor
 *
 * @param other the VariantContext to copy
 */
protected VariantContext(VariantContext other) {
    this(other.getSource(), other.getID(), other.getChr(), other.getStart(), other.getEnd(),
            other.getAlleles(), other.getGenotypes(), other.getLog10PError(),
            other.getFiltersMaybeNull(), other.getAttributes(),
            other.REFERENCE_BASE_FOR_INDEL, NO_VALIDATION);
}
private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, ReferenceContext ref,
                                                Map<String, AlignmentContext> stratifiedContexts,
                                                AlignmentContext rawContext) {
    VariantContext vc;
    if (UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) {
        VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(
                tracker, ref, rawContext.getLocation(), false, logger, UAC.alleles);
        if (vcInput == null)
            return null;
        vc = new VariantContextBuilder("UG_call", ref.getLocus().getContig(),
                vcInput.getStart(), vcInput.getEnd(), vcInput.getAlleles()).make();
    } else {
        // deal with bad/non-standard reference bases
        if (!Allele.acceptableAlleleBases(new byte[]{ref.getBase()}))
            return null;

        Set<Allele> alleles = new HashSet<Allele>();
        alleles.add(Allele.create(ref.getBase(), true));
        vc = new VariantContextBuilder("UG_call", ref.getLocus().getContig(),
                ref.getLocus().getStart(), ref.getLocus().getStart(), alleles).make();
    }

    if (annotationEngine != null) {
        // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations
        final ReadBackedPileup pileup = rawContext.getBasePileup();
        stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup);
        vc = annotationEngine.annotateContext(tracker, ref, stratifiedContexts, vc);
    }

    return new VariantCallContext(vc, false);
}
private static AlleleMapper resolveIncompatibleAlleles(Allele refAllele, VariantContext vc, Set<Allele> allAlleles) {
    if (refAllele.equals(vc.getReference()))
        return new AlleleMapper(vc);
    else {
        // we really need to do some work.  The refAllele is the longest reference allele seen at
        // this start site.  So imagine it is:
        //
        //   refAllele: ACGTGA
        //   myRef:     ACGT
        //   myAlt:     -
        //
        // We need to remap all of the alleles in vc to include the extra GA so that
        // myRef => refAllele and myAlt => GA

        Allele myRef = vc.getReference();
        if (refAllele.length() <= myRef.length())
            throw new ReviewedStingException("BUG: myRef=" + myRef + " is at least as long as refAllele=" + refAllele);
        byte[] extraBases = Arrays.copyOfRange(refAllele.getBases(), myRef.length(), refAllele.length());

        Map<Allele, Allele> map = new HashMap<Allele, Allele>();
        for (Allele a : vc.getAlleles()) {
            if (a.isReference())
                map.put(a, refAllele);
            else {
                Allele extended = Allele.extend(a, extraBases);
                for (Allele b : allAlleles)
                    if (extended.equals(b))
                        extended = b;
                map.put(a, extended);
            }
        }

        return new AlleleMapper(map);
    }
}
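// A worked sketch of the remapping described above (hypothetical alleles;
// Allele.extend is the same call the method makes):
Allele longestRef = Allele.create("ACGTGA", true);  // longest REF at this start site
Allele myRef = Allele.create("ACGT", true);         // this vc's shorter REF
Allele myAlt = Allele.create("AC", false);          // a hypothetical ALT in this vc

// the bases the shorter alleles are missing: "GA"
byte[] extraBases = Arrays.copyOfRange(longestRef.getBases(), myRef.length(), longestRef.length());

// myRef maps directly to longestRef; every ALT gets the extra bases appended:
Allele extendedAlt = Allele.extend(myAlt, extraBases);  // ACGA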
@Test(dataProvider = "mergeGenotypes") public void testMergeGenotypes(MergeGenotypesTest cfg) { final VariantContext merged = VariantContextUtils.simpleMerge( genomeLocParser, cfg.inputs, cfg.priority, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false); // test alleles are equal Assert.assertEquals(merged.getAlleles(), cfg.expected.getAlleles()); // test genotypes assertGenotypesAreMostlyEqual(merged.getGenotypes(), cfg.expected.getGenotypes()); }
public void writeBeagleOutput(VariantContext preferredVC, VariantContext otherVC, boolean isValidationSite, double prior) {
    GenomeLoc currentLoc = VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), preferredVC);
    StringBuffer beagleOut = new StringBuffer();
    String marker = String.format("%s:%d ", currentLoc.getContig(), currentLoc.getStart());
    beagleOut.append(marker);
    if (markers != null)
        markers.append(marker).append("\t").append(Integer.toString(markerCounter++)).append("\t");

    for (Allele allele : preferredVC.getAlleles()) {
        String bglPrintString;
        if (allele.isNoCall() || allele.isNull())
            bglPrintString = "-";
        else
            bglPrintString = allele.getBaseString();  // get rid of * in case of reference allele

        beagleOut.append(String.format("%s ", bglPrintString));
        if (markers != null)
            markers.append(bglPrintString).append("\t");
    }
    if (markers != null)
        markers.append("\n");

    GenotypesContext preferredGenotypes = preferredVC.getGenotypes();
    GenotypesContext otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null;
    for (String sample : samples) {
        boolean isMaleOnChrX = CHECK_IS_MALE_ON_CHR_X && getSample(sample).getGender() == Gender.MALE;

        Genotype genotype;
        boolean isValidation;
        // use sample as key into genotypes structure
        if (preferredGenotypes.containsSample(sample)) {
            genotype = preferredGenotypes.get(sample);
            isValidation = isValidationSite;
        } else if (otherGenotypes != null && otherGenotypes.containsSample(sample)) {
            genotype = otherGenotypes.get(sample);
            isValidation = !isValidationSite;
        } else {
            // there is magically no genotype for this sample.
            throw new StingException("Sample " + sample + " arose with no genotype in variant or validation VCF. This should never happen.");
        }

        // use the genotype's likelihoods if this is a validation site with a negative prior,
        // or if the genotype carries its own likelihoods
        double[] log10Likelihoods = null;
        if ((isValidation && prior < 0.0) || genotype.hasLikelihoods()) {
            log10Likelihoods = genotype.getLikelihoods().getAsVector();

            // see if we need to randomly mask out the genotype at this position
            if (GenomeAnalysisEngine.getRandomGenerator().nextDouble() <= insertedNoCallRate) {
                // we are masking out this genotype
                log10Likelihoods = isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS;
            }

            if (isMaleOnChrX) {
                log10Likelihoods[1] = -255;  // todo -- warning this is dangerous for multi-allele case
            }
        }
        // otherwise, distribute the prior over the genotype states
        else if (!isValidation && genotype.isCalled() && !genotype.hasLikelihoods()) {
            // hack to deal with input VCFs with no genotype likelihoods.  Just assume the called
            // genotype is confident.  This is useful for Hapmap and 1KG release VCFs.
            double AA = (1.0 - prior) / 2.0;
            double AB = (1.0 - prior) / 2.0;
            double BB = (1.0 - prior) / 2.0;

            if (genotype.isHomRef()) {
                AA = prior;
            } else if (genotype.isHet()) {
                AB = prior;
            } else if (genotype.isHomVar()) {
                BB = prior;
            }

            log10Likelihoods = MathUtils.toLog10(new double[]{AA, isMaleOnChrX ? 0.0 : AB, BB});
        } else {
            log10Likelihoods = isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS;
        }

        writeSampleLikelihoods(beagleOut, preferredVC, log10Likelihoods);
    }

    beagleWriter.println(beagleOut.toString());
}
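// The prior-based branch above gives probability mass `prior` to the called
// genotype and splits the remainder over the other two states before taking
// log10. A standalone sketch of that arithmetic (values hypothetical):
double prior = 0.9;  // confidence assigned to the called genotype
// for a het call:
double AA = (1.0 - prior) / 2.0;  // 0.05
double AB = prior;                // 0.90
double BB = (1.0 - prior) / 2.0;  // 0.05
double[] log10Likelihoods = MathUtils.toLog10(new double[]{AA, AB, BB});
// => roughly {-1.30, -0.05, -1.30}, in the usual AA, AB, BB ordering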
/**
 * Main entry function to calculate genotypes of a given VC with corresponding GLs
 *
 * @param tracker                      Tracker
 * @param refContext                   Reference context
 * @param rawContext                   Raw context
 * @param stratifiedContexts           Stratified alignment contexts
 * @param vc                           Input VC
 * @param model                        GL calculation model
 * @param inheritAttributesFromInputVC Output VC will contain attributes inherited from input vc
 * @return VC with assigned genotypes
 */
public VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker,
                                             final ReferenceContext refContext,
                                             final AlignmentContext rawContext,
                                             Map<String, AlignmentContext> stratifiedContexts,
                                             final VariantContext vc,
                                             final GenotypeLikelihoodsCalculationModel.Model model,
                                             final boolean inheritAttributesFromInputVC,
                                             final Map<String, org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap) {

    boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null;

    // initialize the data for this thread if that hasn't been done yet
    if (afcm.get() == null) {
        afcm.set(AFCalcFactory.createAFCalc(UAC, N, logger));
    }

    // estimate our confidence in a reference call and return
    if (vc.getNSamples() == 0) {
        if (limitedContext)
            return null;
        return (UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES
                ? estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), false, 1.0)
                : generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext));
    }

    AFCalcResult AFresult = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model));

    // is the most likely frequency conformation AC=0 for all alternate alleles?
    boolean bestGuessIsRef = true;

    // determine which alternate alleles have AF>0
    final List<Allele> myAlleles = new ArrayList<Allele>(vc.getAlleles().size());
    final List<Integer> alleleCountsofMLE = new ArrayList<Integer>(vc.getAlleles().size());
    myAlleles.add(vc.getReference());
    for (int i = 0; i < AFresult.getAllelesUsedInGenotyping().size(); i++) {
        final Allele alternateAllele = AFresult.getAllelesUsedInGenotyping().get(i);
        if (alternateAllele.isReference())
            continue;

        // we are non-ref if the probability of being non-ref > the emit confidence.
        // the emit confidence is phred-scaled, say 30 => 10^-3.
        // the posterior AF > 0 is log10: -5 => 10^-5
        // we are non-ref if 10^-5 < 10^-3 => -5 < -3
        final boolean isNonRef = AFresult.isPolymorphic(alternateAllele, UAC.STANDARD_CONFIDENCE_FOR_EMITTING / -10.0);

        // if the most likely AC is not 0, then this is a good alternate allele to use
        if (isNonRef) {
            myAlleles.add(alternateAllele);
            alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele));
            bestGuessIsRef = false;
        }
        // if in GENOTYPE_GIVEN_ALLELES mode, we still want to allow the use of a poor allele
        else if (UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) {
            myAlleles.add(alternateAllele);
            alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele));
        }
    }

    final double PoFGT0 = Math.pow(10, AFresult.getLog10PosteriorOfAFGT0());

    // note the Math.abs is necessary because -10 * 0.0 => -0.0 which isn't nice
    final double phredScaledConfidence = Math.abs(
            !bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES
                    ? -10 * AFresult.getLog10PosteriorOfAFEq0()
                    : -10 * AFresult.getLog10PosteriorOfAFGT0());

    // return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero
    if (UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestGuessIsRef)) {
        // technically, at this point our confidence in a reference call isn't accurately estimated
        // because it didn't take into account samples with no data, so let's get a better estimate
        return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, PoFGT0);
    }

    // start constructing the resulting VC
    final GenomeLoc loc = genomeLocParser.createGenomeLoc(vc);
    final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), myAlleles);
    builder.log10PError(phredScaledConfidence / -10.0);
    if (!passesCallThreshold(phredScaledConfidence))
        builder.filters(filter);

    // create the genotypes
    final GenotypesContext genotypes = afcm.get().subsetAlleles(vc, myAlleles, true, ploidy);
    builder.genotypes(genotypes);

    // print out stats if we have a writer
    if (verboseWriter != null && !limitedContext)
        printVerboseData(refContext.getLocus().toString(), vc, PoFGT0, phredScaledConfidence, model);

    // *** note that calculating strand bias involves overwriting data structures, so we do that last
    final HashMap<String, Object> attributes = new HashMap<String, Object>();

    // inherit attributes from input vc if requested
    if (inheritAttributesFromInputVC)
        attributes.putAll(vc.getAttributes());

    // if the site was downsampled, record that fact
    if (!limitedContext && rawContext.hasPileupBeenDownsampled())
        attributes.put(VCFConstants.DOWNSAMPLED_KEY, true);

    if (UAC.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED)
        attributes.put(NUMBER_OF_DISCOVERED_ALLELES_KEY, vc.getAlternateAlleles().size());

    // add the MLE AC and AF annotations
    if (alleleCountsofMLE.size() > 0) {
        attributes.put(VCFConstants.MLE_ALLELE_COUNT_KEY, alleleCountsofMLE);
        final int AN = builder.make().getCalledChrCount();
        final ArrayList<Double> MLEfrequencies = new ArrayList<Double>(alleleCountsofMLE.size());
        // the MLEAC is allowed to be larger than the AN (e.g. in the case of all PLs being 0,
        // the GT is ./. but the exact model may arbitrarily choose an AC>1)
        for (int AC : alleleCountsofMLE)
            MLEfrequencies.add(Math.min(1.0, (double) AC / (double) AN));
        attributes.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, MLEfrequencies);
    }

    if (UAC.COMPUTE_SLOD && !limitedContext && !bestGuessIsRef) {
        // the overall lod
        double overallLog10PofF = AFresult.getLog10LikelihoodOfAFGT0();

        List<Allele> allAllelesToUse = builder.make().getAlleles();

        // the forward lod
        VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts,
                AlignmentContextUtils.ReadOrientation.FORWARD, allAllelesToUse, false, model, perReadAlleleLikelihoodMap);
        AFresult = afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model));
        double forwardLog10PofNull = AFresult.getLog10LikelihoodOfAFEq0();
        double forwardLog10PofF = AFresult.getLog10LikelihoodOfAFGT0();

        // the reverse lod
        VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts,
                AlignmentContextUtils.ReadOrientation.REVERSE, allAllelesToUse, false, model, perReadAlleleLikelihoodMap);
        AFresult = afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model));
        double reverseLog10PofNull = AFresult.getLog10LikelihoodOfAFEq0();
        double reverseLog10PofF = AFresult.getLog10LikelihoodOfAFGT0();

        double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF;
        double reverseLod = reverseLog10PofF + forwardLog10PofNull - overallLog10PofF;

        // strand score is max bias between forward and reverse strands
        double strandScore = Math.max(forwardLod, reverseLod);

        // rescale by a factor of 10
        strandScore *= 10.0;

        if (!Double.isNaN(strandScore))
            attributes.put("SB", strandScore);
    }

    // finish constructing the resulting VC
    builder.attributes(attributes);
    VariantContext vcCall = builder.make();

    // if we are subsetting alleles (either because there were too many or because some were not
    // polymorphic) then we may need to trim the alleles (because the original VariantContext may
    // have had to pad at the end)
    if (myAlleles.size() != vc.getAlleles().size() && !limitedContext)
        // limitedContext callers need to handle allele trimming on their own to keep their
        // perReadAlleleLikelihoodMap alleles in sync
        vcCall = VariantContextUtils.reverseTrimAlleles(vcCall);

    if (annotationEngine != null && !limitedContext) {
        // limitedContext callers need to handle annotations on their own by calling their own
        // annotationEngine
        // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations
        final ReadBackedPileup pileup = rawContext.getBasePileup();
        stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup);
        vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall, perReadAlleleLikelihoodMap);
    }

    return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PoFGT0));
}
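// A worked sketch of the phred-scale emit decision referenced in the comments
// above (values hypothetical):
double emitConfidence = 30.0;                    // UAC.STANDARD_CONFIDENCE_FOR_EMITTING
double log10Threshold = emitConfidence / -10.0;  // -3.0, i.e. a posterior of 10^-3

double log10PosteriorOfAFEq0 = -5.0;             // suppose P(AF == 0) = 10^-5

// polymorphic because -5 < -3; the phred-scaled confidence is
// -10 * log10 P(AF == 0) = 50 (Math.abs guards against -0.0):
boolean isNonRef = log10PosteriorOfAFEq0 < log10Threshold;
double phredScaledConfidence = Math.abs(-10 * log10PosteriorOfAFEq0);  // 50.0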
public Collection<Allele> values() {
    return map != null ? map.values() : vc.getAlleles();
}
public static VariantContext createVariantContextWithTrimmedAlleles(VariantContext inputVC) {
    // see if we need to trim common reference base from all alleles
    boolean trimVC;

    // We need to trim common reference base from all alleles in all genotypes if a ref base is
    // common to all alleles
    Allele refAllele = inputVC.getReference();
    if (!inputVC.isVariant())
        trimVC = false;
    else if (refAllele.isNull())
        trimVC = false;
    else {
        trimVC = (AbstractVCFCodec.computeForwardClipping(new ArrayList<Allele>(inputVC.getAlternateAlleles()),
                inputVC.getReference().getDisplayString()) > 0);
    }

    // nothing to do if we don't need to trim bases
    if (trimVC) {
        List<Allele> alleles = new ArrayList<Allele>();
        GenotypesContext genotypes = GenotypesContext.create();

        // set the reference base for indels in the attributes
        Map<String, Object> attributes = new TreeMap<String, Object>(inputVC.getAttributes());

        Map<Allele, Allele> originalToTrimmedAlleleMap = new HashMap<Allele, Allele>();

        for (Allele a : inputVC.getAlleles()) {
            if (a.isSymbolic()) {
                alleles.add(a);
                originalToTrimmedAlleleMap.put(a, a);
            } else {
                // get bases for current allele and create a new one with trimmed bases
                byte[] newBases = Arrays.copyOfRange(a.getBases(), 1, a.length());
                Allele trimmedAllele = Allele.create(newBases, a.isReference());
                alleles.add(trimmedAllele);
                originalToTrimmedAlleleMap.put(a, trimmedAllele);
            }
        }

        // detect case where we're trimming bases but resulting vc doesn't have any null allele.
        // In that case, we keep the original representation.
        // Example: mixed records such as {TA*, TGA, TG}
        boolean hasNullAlleles = false;

        for (Allele a : originalToTrimmedAlleleMap.values()) {
            if (a.isNull())
                hasNullAlleles = true;
            if (a.isReference())
                refAllele = a;
        }

        if (!hasNullAlleles)
            return inputVC;

        // now we can recreate new genotypes with trimmed alleles
        for (final Genotype genotype : inputVC.getGenotypes()) {
            List<Allele> originalAlleles = genotype.getAlleles();
            List<Allele> trimmedAlleles = new ArrayList<Allele>();
            for (Allele a : originalAlleles) {
                if (a.isCalled())
                    trimmedAlleles.add(originalToTrimmedAlleleMap.get(a));
                else
                    trimmedAlleles.add(Allele.NO_CALL);
            }
            genotypes.add(Genotype.modifyAlleles(genotype, trimmedAlleles));
        }

        final VariantContextBuilder builder = new VariantContextBuilder(inputVC);
        return builder.alleles(alleles)
                .genotypes(genotypes)
                .attributes(attributes)
                .referenceBaseForIndel(Byte.valueOf(inputVC.getReference().getBases()[0]))
                .make();
    }

    return inputVC;
}
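// For illustration, a hypothetical record this method would trim (old-style
// null-allele representation, as used throughout this codebase):
//
//   REF=TA, ALT=T  -- a 1bp deletion written with a shared leading base.
//
// computeForwardClipping sees the common leading T, so each allele loses its
// first base: REF becomes A, ALT becomes the null allele, and the stripped T
// is preserved via referenceBaseForIndel.
List<Allele> paddedAlleles = Arrays.asList(Allele.create("TA", true), Allele.create("T", false));
VariantContext padded = new VariantContextBuilder("example", "chr1", 1000, 1001, paddedAlleles).make();
VariantContext trimmed = createVariantContextWithTrimmedAlleles(padded);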
public static VariantContext createVariantContextWithPaddedAlleles(VariantContext inputVC, boolean refBaseShouldBeAppliedToEndOfAlleles) {
    // see if we need to pad common reference base from all alleles
    boolean padVC;

    // We need to pad a VC with a common base if the length of the reference allele is less than
    // the length of the VariantContext.  This happens because the position of e.g. an indel is
    // always one before the actual event (as per VCF convention).
    long locLength = (inputVC.getEnd() - inputVC.getStart()) + 1;
    if (inputVC.hasSymbolicAlleles())
        padVC = true;
    else if (inputVC.getReference().length() == locLength)
        padVC = false;
    else if (inputVC.getReference().length() == locLength - 1)
        padVC = true;
    else
        throw new IllegalArgumentException("Badly formed variant context at location "
                + String.valueOf(inputVC.getStart()) + " in contig " + inputVC.getChr()
                + ". Reference length must be at most one base shorter than location size");

    // nothing to do if we don't need to pad bases
    if (padVC) {
        if (!inputVC.hasReferenceBaseForIndel())
            throw new ReviewedStingException("Badly formed variant context at location " + inputVC.getChr()
                    + ":" + inputVC.getStart() + "; no padded reference base is available.");

        Byte refByte = inputVC.getReferenceBaseForIndel();

        List<Allele> alleles = new ArrayList<Allele>();

        for (Allele a : inputVC.getAlleles()) {
            // get bases for current allele and create a new one with padded bases
            if (a.isSymbolic()) {
                alleles.add(a);
            } else {
                String newBases;
                if (refBaseShouldBeAppliedToEndOfAlleles)
                    newBases = a.getBaseString() + new String(new byte[]{refByte});
                else
                    newBases = new String(new byte[]{refByte}) + a.getBaseString();
                alleles.add(Allele.create(newBases, a.isReference()));
            }
        }

        // now we can recreate new genotypes with padded alleles
        GenotypesContext genotypes = GenotypesContext.create(inputVC.getNSamples());
        for (final Genotype g : inputVC.getGenotypes()) {
            List<Allele> inAlleles = g.getAlleles();
            List<Allele> newGenotypeAlleles = new ArrayList<Allele>(g.getAlleles().size());
            for (Allele a : inAlleles) {
                if (a.isCalled()) {
                    if (a.isSymbolic()) {
                        newGenotypeAlleles.add(a);
                    } else {
                        String newBases;
                        if (refBaseShouldBeAppliedToEndOfAlleles)
                            newBases = a.getBaseString() + new String(new byte[]{refByte});
                        else
                            newBases = new String(new byte[]{refByte}) + a.getBaseString();
                        newGenotypeAlleles.add(Allele.create(newBases, a.isReference()));
                    }
                } else {
                    // add no-call allele
                    newGenotypeAlleles.add(Allele.NO_CALL);
                }
            }
            genotypes.add(new Genotype(g.getSampleName(), newGenotypeAlleles, g.getLog10PError(),
                    g.getFilters(), g.getAttributes(), g.isPhased()));
        }

        return new VariantContextBuilder(inputVC).alleles(alleles).genotypes(genotypes).make();
    } else
        return inputVC;
}
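// A hypothetical inverse of the trimming example above: re-padding an
// insertion stored null-allele style (assumes the old "-" spelling for the
// null allele and the builder's referenceBaseForIndel, both used elsewhere in
// this file):
List<Allele> strippedAlleles = Arrays.asList(Allele.create("-", true), Allele.create("GA", false));
VariantContext stripped = new VariantContextBuilder("example", "chr1", 1000, 1000, strippedAlleles)
        .referenceBaseForIndel((byte) 'T')
        .make();

// with refBaseShouldBeAppliedToEndOfAlleles == false the base is prepended,
// yielding REF=T and ALT=TGA:
VariantContext repadded = createVariantContextWithPaddedAlleles(stripped, false);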
public Allele getLikelihoods(RefMetaDataTracker tracker,
                             ReferenceContext ref,
                             Map<String, AlignmentContext> contexts,
                             AlignmentContextUtils.ReadOrientation contextType,
                             GenotypePriors priors,
                             Map<String, MultiallelicGenotypeLikelihoods> GLs,
                             Allele alternateAlleleToUse,
                             boolean useBAQedPileup) {

    if (tracker == null)
        return null;

    GenomeLoc loc = ref.getLocus();
    Allele refAllele, altAllele;
    VariantContext vc = null;

    if (!ref.getLocus().equals(lastSiteVisited)) {
        // starting a new site: clear allele list
        alleleList.clear();
        lastSiteVisited = ref.getLocus();
        indelLikelihoodMap.set(new HashMap<PileupElement, LinkedHashMap<Allele, Double>>());
        haplotypeMap.clear();

        if (getAlleleListFromVCF) {
            for (final VariantContext vc_input : tracker.getValues(UAC.alleles, loc)) {
                if (vc_input != null
                        && allowableTypes.contains(vc_input.getType())
                        && ref.getLocus().getStart() == vc_input.getStart()) {
                    vc = vc_input;
                    break;
                }
            }
            // ignore places where we don't have a variant
            if (vc == null)
                return null;

            alleleList.clear();
            if (ignoreSNPAllelesWhenGenotypingIndels) {
                // if there's an allele that has same length as the reference (i.e. a SNP or MNP),
                // ignore it and don't genotype it
                for (Allele a : vc.getAlleles()) {
                    if (a.isNonReference() && a.getBases().length == vc.getReference().getBases().length)
                        continue;
                    alleleList.add(a);
                }
            } else {
                for (Allele a : vc.getAlleles())
                    alleleList.add(a);
            }
        } else {
            alleleList = computeConsensusAlleles(ref, contexts, contextType);
            if (alleleList.isEmpty())
                return null;
        }
    }

    // protect against having an indel too close to the edge of a contig
    if (loc.getStart() <= HAPLOTYPE_SIZE)
        return null;

    // check if there is enough reference window to create haplotypes (can be an issue at end of contigs)
    if (ref.getWindow().getStop() < loc.getStop() + HAPLOTYPE_SIZE)
        return null;

    if (!(priors instanceof DiploidIndelGenotypePriors))
        throw new StingException("Only diploid-based Indel priors are supported in the DINDEL GL model");

    if (alleleList.isEmpty())
        return null;

    refAllele = alleleList.get(0);
    altAllele = alleleList.get(1);

    // look for the alt allele with the biggest length difference to the ref allele
    int maxLenDiff = 0;
    for (Allele a : alleleList) {
        if (a.isNonReference()) {
            int lenDiff = Math.abs(a.getBaseString().length() - refAllele.getBaseString().length());
            if (lenDiff > maxLenDiff) {
                maxLenDiff = lenDiff;
                altAllele = a;
            }
        }
    }

    final int eventLength = altAllele.getBaseString().length() - refAllele.getBaseString().length();
    final int hsize = (int) ref.getWindow().size() - Math.abs(eventLength) - 1;
    final int numPrefBases = ref.getLocus().getStart() - ref.getWindow().getStart() + 1;

    haplotypeMap = Haplotype.makeHaplotypeListFromAlleles(alleleList, loc.getStart(), ref, hsize, numPrefBases);

    // For each sample, get genotype likelihoods based on pileup.
    // Compute prior likelihoods on haplotypes, and initialize the haplotype likelihood matrix with them.

    // initialize the GenotypeLikelihoods
    GLs.clear();

    for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) {
        AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);

        ReadBackedPileup pileup = null;
        if (context.hasExtendedEventPileup())
            pileup = context.getExtendedEventPileup();
        else if (context.hasBasePileup())
            pileup = context.getBasePileup();

        if (pileup != null) {
            final double[] genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods(
                    pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap());

            GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(
                    sample.getKey(), alleleList, genotypeLikelihoods, getFilteredDepth(pileup)));

            if (DEBUG) {
                System.out.format("Sample:%s Alleles:%s GL:", sample.getKey(), alleleList.toString());
                for (int k = 0; k < genotypeLikelihoods.length; k++)
                    System.out.format("%1.4f ", genotypeLikelihoods[k]);
                System.out.println();
            }
        }
    }

    return refAllele;
}
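// A standalone illustration of the alt-allele selection step above
// (hypothetical alleles): the model genotypes against the alternate whose
// length differs most from the reference.
List<Allele> exampleAlleles = Arrays.asList(
        Allele.create("A", true),       // ref
        Allele.create("AT", false),     // 1bp insertion, length diff 1
        Allele.create("ATTTT", false)); // 4bp insertion, length diff 4

Allele exampleRef = exampleAlleles.get(0);
Allele chosenAlt = exampleAlleles.get(1);
int maxDiff = 0;
for (Allele a : exampleAlleles) {
    if (a.isNonReference()) {
        int lenDiff = Math.abs(a.getBaseString().length() - exampleRef.getBaseString().length());
        if (lenDiff > maxDiff) {
            maxDiff = lenDiff;
            chosenAlt = a;  // ends up as ATTTT
        }
    }
}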
/**
 * add a record to the file
 *
 * @param vc      the Variant Context object
 * @param refBase the ref base used for indels
 * @param refBaseShouldBeAppliedToEndOfAlleles *** THIS SHOULD BE FALSE EXCEPT FOR AN INDEL AT
 *                THE EXTREME BEGINNING OF A CONTIG (WHERE THERE IS NO PREVIOUS BASE, SO WE USE
 *                THE BASE AFTER THE EVENT INSTEAD)
 */
public void add(VariantContext vc, byte refBase, boolean refBaseShouldBeAppliedToEndOfAlleles) {
    if (mHeader == null)
        throw new IllegalStateException("The VCF Header must be written before records can be added: " + locationString());

    if (doNotWriteGenotypes)
        vc = VariantContext.modifyGenotypes(vc, null);

    try {
        vc = VariantContext.createVariantContextWithPaddedAlleles(vc, refBase, refBaseShouldBeAppliedToEndOfAlleles);

        // if we are doing on-the-fly indexing, add the record ***before*** we write any bytes
        if (indexer != null)
            indexer.addFeature(vc, positionalStream.getPosition());

        Map<Allele, String> alleleMap = new HashMap<Allele, String>(vc.getAlleles().size());
        alleleMap.put(Allele.NO_CALL, VCFConstants.EMPTY_ALLELE);  // convenience for lookup

        // CHROM
        mWriter.write(vc.getChr());
        mWriter.write(VCFConstants.FIELD_SEPARATOR);

        // POS
        mWriter.write(String.valueOf(vc.getStart()));
        mWriter.write(VCFConstants.FIELD_SEPARATOR);

        // ID
        String ID = vc.hasID() ? vc.getID() : VCFConstants.EMPTY_ID_FIELD;
        mWriter.write(ID);
        mWriter.write(VCFConstants.FIELD_SEPARATOR);

        // REF
        alleleMap.put(vc.getReference(), "0");
        String refString = vc.getReference().getDisplayString();
        mWriter.write(refString);
        mWriter.write(VCFConstants.FIELD_SEPARATOR);

        // ALT
        if (vc.isVariant()) {
            Allele altAllele = vc.getAlternateAllele(0);
            alleleMap.put(altAllele, "1");
            String alt = altAllele.getDisplayString();
            mWriter.write(alt);

            for (int i = 1; i < vc.getAlternateAlleles().size(); i++) {
                altAllele = vc.getAlternateAllele(i);
                alleleMap.put(altAllele, String.valueOf(i + 1));
                alt = altAllele.getDisplayString();
                mWriter.write(",");
                mWriter.write(alt);
            }
        } else {
            mWriter.write(VCFConstants.EMPTY_ALTERNATE_ALLELE_FIELD);
        }
        mWriter.write(VCFConstants.FIELD_SEPARATOR);

        // QUAL
        if (!vc.hasNegLog10PError())
            mWriter.write(VCFConstants.MISSING_VALUE_v4);
        else
            mWriter.write(getQualValue(vc.getPhredScaledQual()));
        mWriter.write(VCFConstants.FIELD_SEPARATOR);

        // FILTER
        String filters = vc.isFiltered()
                ? ParsingUtils.join(";", ParsingUtils.sortList(vc.getFilters()))
                : (filtersWereAppliedToContext || vc.filtersWereApplied()
                        ? VCFConstants.PASSES_FILTERS_v4
                        : VCFConstants.UNFILTERED);
        mWriter.write(filters);
        mWriter.write(VCFConstants.FIELD_SEPARATOR);

        // INFO
        Map<String, String> infoFields = new TreeMap<String, String>();
        for (Map.Entry<String, Object> field : vc.getAttributes().entrySet()) {
            String key = field.getKey();
            if (key.equals(VariantContext.ID_KEY)
                    || key.equals(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY)
                    || key.equals(VariantContext.UNPARSED_GENOTYPE_MAP_KEY)
                    || key.equals(VariantContext.UNPARSED_GENOTYPE_PARSER_KEY))
                continue;

            String outputValue = formatVCFField(field.getValue());
            if (outputValue != null)
                infoFields.put(key, outputValue);
        }
        writeInfoString(infoFields);

        // FORMAT
        if (vc.hasAttribute(VariantContext.UNPARSED_GENOTYPE_MAP_KEY)) {
            mWriter.write(VCFConstants.FIELD_SEPARATOR);
            mWriter.write(vc.getAttributeAsString(VariantContext.UNPARSED_GENOTYPE_MAP_KEY, ""));
        } else {
            List<String> genotypeAttributeKeys = new ArrayList<String>();
            if (vc.hasGenotypes()) {
                genotypeAttributeKeys.addAll(calcVCFGenotypeKeys(vc));
            } else if (mHeader.hasGenotypingData()) {
                // this needs to be done in case all samples are no-calls
                genotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY);
            }

            if (genotypeAttributeKeys.size() > 0) {
                String genotypeFormatString = ParsingUtils.join(VCFConstants.GENOTYPE_FIELD_SEPARATOR, genotypeAttributeKeys);
                mWriter.write(VCFConstants.FIELD_SEPARATOR);
                mWriter.write(genotypeFormatString);

                addGenotypeData(vc, alleleMap, genotypeAttributeKeys);
            }
        }

        mWriter.write("\n");
        mWriter.flush();  // necessary so that writing to an output stream will work
    } catch (IOException e) {
        throw new RuntimeException("Unable to write the VCF object to " + locationString(), e);
    }
}
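// A hypothetical call sequence for this writer (construction elided; the
// writeHeader name is assumed from the "header must be written" check above):
writer.writeHeader(header);

// an indel mid-contig: pass the base *preceding* the event and leave the
// end-of-alleles flag false, per the javadoc above
writer.add(vc, (byte) 'T', false);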