@Test(dataProvider = "RepeatDetectorTest") public void testRepeatDetectorTest(RepeatDetectorTest cfg) { // test alleles are equal Assert.assertEquals( VariantContextUtils.isTandemRepeat(cfg.vc, cfg.ref.getBytes()), cfg.isTrueRepeat); }
@Test(dataProvider = "mergeAlleles") public void testMergeAlleles(MergeAllelesTest cfg) { final List<VariantContext> inputs = new ArrayList<VariantContext>(); int i = 0; for (final List<Allele> alleles : cfg.inputs) { final String name = "vcf" + ++i; inputs.add(makeVC(name, alleles)); } final List<String> priority = vcs2priority(inputs); final VariantContext merged = VariantContextUtils.simpleMerge( genomeLocParser, inputs, priority, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, "set", false, false); Assert.assertEquals(merged.getAlleles(), cfg.expected); }
@Test public void testAnnotationSet() { for (final boolean annotate : Arrays.asList(true, false)) { for (final String set : Arrays.asList("set", "combine", "x")) { final List<String> priority = Arrays.asList("1", "2"); VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS); VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS); final VariantContext merged = VariantContextUtils.simpleMerge( genomeLocParser, Arrays.asList(vc1, vc2), priority, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, annotate, false, set, false, false); if (annotate) Assert.assertEquals(merged.getAttribute(set), VariantContextUtils.MERGE_INTERSECTION); else Assert.assertFalse(merged.hasAttribute(set)); } } }
@Test(dataProvider = "mergeFiltered") public void testMergeFiltered(MergeFilteredTest cfg) { final List<String> priority = vcs2priority(cfg.inputs); final VariantContext merged = VariantContextUtils.simpleMerge( genomeLocParser, cfg.inputs, priority, cfg.type, VariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false); // test alleles are equal Assert.assertEquals(merged.getAlleles(), cfg.expected.getAlleles()); // test set field Assert.assertEquals(merged.getAttribute("set"), cfg.setExpected); // test filter field Assert.assertEquals(merged.getFilters(), cfg.expected.getFilters()); }
static boolean mergeIntoMNPvalidationCheck( GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2) { GenomeLoc loc1 = VariantContextUtils.getLocation(genomeLocParser, vc1); GenomeLoc loc2 = VariantContextUtils.getLocation(genomeLocParser, vc2); if (!loc1.onSameContig(loc2)) throw new ReviewedStingException("Can only merge vc1, vc2 if on the same chromosome"); if (!loc1.isBefore(loc2)) throw new ReviewedStingException("Can only merge if vc1 is BEFORE vc2"); if (vc1.isFiltered() || vc2.isFiltered()) return false; if (!vc1.getSampleNames() .equals(vc2.getSampleNames())) // vc1, vc2 refer to different sample sets return false; if (!allGenotypesAreUnfilteredAndCalled(vc1) || !allGenotypesAreUnfilteredAndCalled(vc2)) return false; return true; }
@Test(expectedExceptions = UserException.class) public void testMergeGenotypesRequireUnique() { final VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)); final VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)); final VariantContext merged = VariantContextUtils.simpleMerge( genomeLocParser, Arrays.asList(vc1, vc2), null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE, false, false, "set", false, false); }
@Test(dataProvider = "mergeGenotypes") public void testMergeGenotypes(MergeGenotypesTest cfg) { final VariantContext merged = VariantContextUtils.simpleMerge( genomeLocParser, cfg.inputs, cfg.priority, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false); // test alleles are equal Assert.assertEquals(merged.getAlleles(), cfg.expected.getAlleles()); // test genotypes assertGenotypesAreMostlyEqual(merged.getGenotypes(), cfg.expected.getGenotypes()); }
@Test public void testMergeGenotypesUniquify() { final VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)); final VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)); final VariantContext merged = VariantContextUtils.simpleMerge( genomeLocParser, Arrays.asList(vc1, vc2), null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.UNIQUIFY, false, false, "set", false, false); // test genotypes Assert.assertEquals( merged.getSampleNames(), new HashSet<String>(Arrays.asList("s1.1", "s1.2"))); }
/** * Method for creating JexlVCMatchExp from input walker arguments names and exps. These two arrays * contain the name associated with each JEXL expression. initializeMatchExps will parse each * expression and return a list of JexlVCMatchExp, in order, that correspond to the names and * exps. These are suitable input to match() below. * * @param names names * @param exps expressions * @return list of matches */ public static List<JexlVCMatchExp> initializeMatchExps(String[] names, String[] exps) { if (names == null || exps == null) throw new ReviewedStingException( "BUG: neither names nor exps can be null: names " + Arrays.toString(names) + " exps=" + Arrays.toString(exps)); if (names.length != exps.length) throw new UserException( "Inconsistent number of provided filter names and expressions: names=" + Arrays.toString(names) + " exps=" + Arrays.toString(exps)); Map<String, String> map = new HashMap<String, String>(); for (int i = 0; i < names.length; i++) { map.put(names[i], exps[i]); } return VariantContextUtils.initializeMatchExps(map); }
@Test(dataProvider = "simplemergersiddata") public void testRSIDMerge(SimpleMergeRSIDTest cfg) { VariantContext snpVC1 = makeVC("snpvc1", Arrays.asList(Aref, T)); final List<VariantContext> inputs = new ArrayList<VariantContext>(); for (final String id : cfg.inputs) { inputs.add(new VariantContextBuilder(snpVC1).id(id).make()); } final VariantContext merged = VariantContextUtils.simpleMerge( genomeLocParser, inputs, null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.UNSORTED, false, false, "set", false, false); Assert.assertEquals(merged.getID(), cfg.expected); }
public void writeBeagleOutput( VariantContext preferredVC, VariantContext otherVC, boolean isValidationSite, double prior) { GenomeLoc currentLoc = VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), preferredVC); StringBuffer beagleOut = new StringBuffer(); String marker = String.format("%s:%d ", currentLoc.getContig(), currentLoc.getStart()); beagleOut.append(marker); if (markers != null) markers.append(marker).append("\t").append(Integer.toString(markerCounter++)).append("\t"); for (Allele allele : preferredVC.getAlleles()) { String bglPrintString; if (allele.isNoCall() || allele.isNull()) bglPrintString = "-"; else bglPrintString = allele.getBaseString(); // get rid of * in case of reference allele beagleOut.append(String.format("%s ", bglPrintString)); if (markers != null) markers.append(bglPrintString).append("\t"); } if (markers != null) markers.append("\n"); GenotypesContext preferredGenotypes = preferredVC.getGenotypes(); GenotypesContext otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null; for (String sample : samples) { boolean isMaleOnChrX = CHECK_IS_MALE_ON_CHR_X && getSample(sample).getGender() == Gender.MALE; Genotype genotype; boolean isValidation; // use sample as key into genotypes structure if (preferredGenotypes.containsSample(sample)) { genotype = preferredGenotypes.get(sample); isValidation = isValidationSite; } else if (otherGenotypes != null && otherGenotypes.containsSample(sample)) { genotype = otherGenotypes.get(sample); isValidation = !isValidationSite; } else { // there is magically no genotype for this sample. throw new StingException( "Sample " + sample + " arose with no genotype in variant or validation VCF. This should never happen."); } /* * Use likelihoods if: is validation, prior is negative; or: is not validation, has genotype key */ double[] log10Likelihoods = null; if ((isValidation && prior < 0.0) || genotype.hasLikelihoods()) { log10Likelihoods = genotype.getLikelihoods().getAsVector(); // see if we need to randomly mask out genotype in this position. if (GenomeAnalysisEngine.getRandomGenerator().nextDouble() <= insertedNoCallRate) { // we are masking out this genotype log10Likelihoods = isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS; } if (isMaleOnChrX) { log10Likelihoods[1] = -255; // todo -- warning this is dangerous for multi-allele case } } /** otherwise, use the prior uniformly */ else if (!isValidation && genotype.isCalled() && !genotype.hasLikelihoods()) { // hack to deal with input VCFs with no genotype likelihoods. Just assume the called // genotype // is confident. This is useful for Hapmap and 1KG release VCFs. double AA = (1.0 - prior) / 2.0; double AB = (1.0 - prior) / 2.0; double BB = (1.0 - prior) / 2.0; if (genotype.isHomRef()) { AA = prior; } else if (genotype.isHet()) { AB = prior; } else if (genotype.isHomVar()) { BB = prior; } log10Likelihoods = MathUtils.toLog10(new double[] {AA, isMaleOnChrX ? 0.0 : AB, BB}); } else { log10Likelihoods = isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS; } writeSampleLikelihoods(beagleOut, preferredVC, log10Likelihoods); } beagleWriter.println(beagleOut.toString()); }
@Test(dataProvider = "ReverseClippingPositionTestProvider") public void testReverseClippingPositionTestProvider(ReverseClippingPositionTestProvider cfg) { int result = VariantContextUtils.computeReverseClipping(cfg.alleles, cfg.ref.getBytes(), 0, false); Assert.assertEquals(result, cfg.expectedClip); }
static VariantContext reallyMergeIntoMNP( VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile) { int startInter = vc1.getEnd() + 1; int endInter = vc2.getStart() - 1; byte[] intermediateBases = null; if (startInter <= endInter) { intermediateBases = referenceFile.getSubsequenceAt(vc1.getChr(), startInter, endInter).getBases(); StringUtil.toUpperCase(intermediateBases); } MergedAllelesData mergeData = new MergedAllelesData( intermediateBases, vc1, vc2); // ensures that the reference allele is added GenotypesContext mergedGenotypes = GenotypesContext.create(); for (final Genotype gt1 : vc1.getGenotypes()) { Genotype gt2 = vc2.getGenotype(gt1.getSampleName()); List<Allele> site1Alleles = gt1.getAlleles(); List<Allele> site2Alleles = gt2.getAlleles(); List<Allele> mergedAllelesForSample = new LinkedList<Allele>(); /* NOTE: Since merged alleles are added to mergedAllelesForSample in the SAME order as in the input VC records, we preserve phase information (if any) relative to whatever precedes vc1: */ Iterator<Allele> all2It = site2Alleles.iterator(); for (Allele all1 : site1Alleles) { Allele all2 = all2It.next(); // this is OK, since allSamplesAreMergeable() Allele mergedAllele = mergeData.ensureMergedAllele(all1, all2); mergedAllelesForSample.add(mergedAllele); } double mergedGQ = Math.max(gt1.getLog10PError(), gt2.getLog10PError()); Set<String> mergedGtFilters = new HashSet< String>(); // Since gt1 and gt2 were unfiltered, the Genotype remains unfiltered Map<String, Object> mergedGtAttribs = new HashMap<String, Object>(); PhaseAndQuality phaseQual = calcPhaseForMergedGenotypes(gt1, gt2); if (phaseQual.PQ != null) mergedGtAttribs.put(ReadBackedPhasingWalker.PQ_KEY, phaseQual.PQ); Genotype mergedGt = new Genotype( gt1.getSampleName(), mergedAllelesForSample, mergedGQ, mergedGtFilters, mergedGtAttribs, phaseQual.isPhased); mergedGenotypes.add(mergedGt); } String mergedName = mergeVariantContextNames(vc1.getSource(), vc2.getSource()); double mergedLog10PError = Math.min(vc1.getLog10PError(), vc2.getLog10PError()); Set<String> mergedFilters = new HashSet< String>(); // Since vc1 and vc2 were unfiltered, the merged record remains unfiltered Map<String, Object> mergedAttribs = mergeVariantContextAttributes(vc1, vc2); // ids List<String> mergedIDs = new ArrayList<String>(); if (vc1.hasID()) mergedIDs.add(vc1.getID()); if (vc2.hasID()) mergedIDs.add(vc2.getID()); String mergedID = mergedIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(VCFConstants.ID_FIELD_SEPARATOR, mergedIDs); VariantContextBuilder mergedBuilder = new VariantContextBuilder( mergedName, vc1.getChr(), vc1.getStart(), vc2.getEnd(), mergeData.getAllMergedAlleles()) .id(mergedID) .genotypes(mergedGenotypes) .log10PError(mergedLog10PError) .filters(mergedFilters) .attributes(mergedAttribs); VariantContextUtils.calculateChromosomeCounts(mergedBuilder, true); return mergedBuilder.make(); }
/** * Main entry function to calculate genotypes of a given VC with corresponding GL's * * @param tracker Tracker * @param refContext Reference context * @param rawContext Raw context * @param stratifiedContexts Stratified alignment contexts * @param vc Input VC * @param model GL calculation model * @param inheritAttributesFromInputVC Output VC will contain attributes inherited from input vc * @return VC with assigned genotypes */ public VariantCallContext calculateGenotypes( final RefMetaDataTracker tracker, final ReferenceContext refContext, final AlignmentContext rawContext, Map<String, AlignmentContext> stratifiedContexts, final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model, final boolean inheritAttributesFromInputVC, final Map<String, org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap) { boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null; // initialize the data for this thread if that hasn't been done yet if (afcm.get() == null) { afcm.set(AFCalcFactory.createAFCalc(UAC, N, logger)); } // estimate our confidence in a reference call and return if (vc.getNSamples() == 0) { if (limitedContext) return null; return (UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES ? estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), false, 1.0) : generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext)); } AFCalcResult AFresult = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model)); // is the most likely frequency conformation AC=0 for all alternate alleles? boolean bestGuessIsRef = true; // determine which alternate alleles have AF>0 final List<Allele> myAlleles = new ArrayList<Allele>(vc.getAlleles().size()); final List<Integer> alleleCountsofMLE = new ArrayList<Integer>(vc.getAlleles().size()); myAlleles.add(vc.getReference()); for (int i = 0; i < AFresult.getAllelesUsedInGenotyping().size(); i++) { final Allele alternateAllele = AFresult.getAllelesUsedInGenotyping().get(i); if (alternateAllele.isReference()) continue; // we are non-ref if the probability of being non-ref > the emit confidence. // the emit confidence is phred-scaled, say 30 => 10^-3. // the posterior AF > 0 is log10: -5 => 10^-5 // we are non-ref if 10^-5 < 10^-3 => -5 < -3 final boolean isNonRef = AFresult.isPolymorphic(alternateAllele, UAC.STANDARD_CONFIDENCE_FOR_EMITTING / -10.0); // if the most likely AC is not 0, then this is a good alternate allele to use if (isNonRef) { myAlleles.add(alternateAllele); alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele)); bestGuessIsRef = false; } // if in GENOTYPE_GIVEN_ALLELES mode, we still want to allow the use of a poor allele else if (UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) { myAlleles.add(alternateAllele); alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele)); } } final double PoFGT0 = Math.pow(10, AFresult.getLog10PosteriorOfAFGT0()); // note the math.abs is necessary because -10 * 0.0 => -0.0 which isn't nice final double phredScaledConfidence = Math.abs( !bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE .GENOTYPE_GIVEN_ALLELES ? -10 * AFresult.getLog10PosteriorOfAFEq0() : -10 * AFresult.getLog10PosteriorOfAFGT0()); // return a null call if we don't pass the confidence cutoff or the most likely allele frequency // is zero if (UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestGuessIsRef)) { // technically, at this point our confidence in a reference call isn't accurately estimated // because it didn't take into account samples with no data, so let's get a better estimate return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, PoFGT0); } // start constructing the resulting VC final GenomeLoc loc = genomeLocParser.createGenomeLoc(vc); final VariantContextBuilder builder = new VariantContextBuilder( "UG_call", loc.getContig(), loc.getStart(), loc.getStop(), myAlleles); builder.log10PError(phredScaledConfidence / -10.0); if (!passesCallThreshold(phredScaledConfidence)) builder.filters(filter); // create the genotypes final GenotypesContext genotypes = afcm.get().subsetAlleles(vc, myAlleles, true, ploidy); builder.genotypes(genotypes); // print out stats if we have a writer if (verboseWriter != null && !limitedContext) printVerboseData(refContext.getLocus().toString(), vc, PoFGT0, phredScaledConfidence, model); // *** note that calculating strand bias involves overwriting data structures, so we do that // last final HashMap<String, Object> attributes = new HashMap<String, Object>(); // inherit attributed from input vc if requested if (inheritAttributesFromInputVC) attributes.putAll(vc.getAttributes()); // if the site was downsampled, record that fact if (!limitedContext && rawContext.hasPileupBeenDownsampled()) attributes.put(VCFConstants.DOWNSAMPLED_KEY, true); if (UAC.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED) attributes.put(NUMBER_OF_DISCOVERED_ALLELES_KEY, vc.getAlternateAlleles().size()); // add the MLE AC and AF annotations if (alleleCountsofMLE.size() > 0) { attributes.put(VCFConstants.MLE_ALLELE_COUNT_KEY, alleleCountsofMLE); final int AN = builder.make().getCalledChrCount(); final ArrayList<Double> MLEfrequencies = new ArrayList<Double>(alleleCountsofMLE.size()); // the MLEAC is allowed to be larger than the AN (e.g. in the case of all PLs being 0, the GT // is ./. but the exact model may arbitrarily choose an AC>1) for (int AC : alleleCountsofMLE) MLEfrequencies.add(Math.min(1.0, (double) AC / (double) AN)); attributes.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, MLEfrequencies); } if (UAC.COMPUTE_SLOD && !limitedContext && !bestGuessIsRef) { // final boolean DEBUG_SLOD = false; // the overall lod // double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0]; double overallLog10PofF = AFresult.getLog10LikelihoodOfAFGT0(); // if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); List<Allele> allAllelesToUse = builder.make().getAlleles(); // the forward lod VariantContext vcForward = calculateLikelihoods( tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, allAllelesToUse, false, model, perReadAlleleLikelihoodMap); AFresult = afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model)); // double[] normalizedLog10Posteriors = // MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); double forwardLog10PofNull = AFresult.getLog10LikelihoodOfAFEq0(); double forwardLog10PofF = AFresult.getLog10LikelihoodOfAFGT0(); // if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", // forwardLog10PofF=" + forwardLog10PofF); // the reverse lod VariantContext vcReverse = calculateLikelihoods( tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, allAllelesToUse, false, model, perReadAlleleLikelihoodMap); AFresult = afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model)); // normalizedLog10Posteriors = // MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); double reverseLog10PofNull = AFresult.getLog10LikelihoodOfAFEq0(); double reverseLog10PofF = AFresult.getLog10LikelihoodOfAFGT0(); // if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", // reverseLog10PofF=" + reverseLog10PofF); double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF; double reverseLod = reverseLog10PofF + forwardLog10PofNull - overallLog10PofF; // if ( DEBUG_SLOD ) System.out.println("forward lod=" + forwardLod + ", reverse lod=" + // reverseLod); // strand score is max bias between forward and reverse strands double strandScore = Math.max(forwardLod, reverseLod); // rescale by a factor of 10 strandScore *= 10.0; // logger.debug(String.format("SLOD=%f", strandScore)); if (!Double.isNaN(strandScore)) attributes.put("SB", strandScore); } // finish constructing the resulting VC builder.attributes(attributes); VariantContext vcCall = builder.make(); // if we are subsetting alleles (either because there were too many or because some were not // polymorphic) // then we may need to trim the alleles (because the original VariantContext may have had to pad // at the end). if (myAlleles.size() != vc.getAlleles().size() && !limitedContext) // limitedContext callers need to handle allele trimming on their own to // keep their perReadAlleleLikelihoodMap alleles in sync vcCall = VariantContextUtils.reverseTrimAlleles(vcCall); if (annotationEngine != null && !limitedContext) { // limitedContext callers need to handle annotations on their own by // calling their own annotationEngine // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations final ReadBackedPileup pileup = rawContext.getBasePileup(); stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); vcCall = annotationEngine.annotateContext( tracker, refContext, stratifiedContexts, vcCall, perReadAlleleLikelihoodMap); } return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PoFGT0)); }