public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref,
                                    Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
    if (mendelianViolation == null) {
        if (checkAndSetSamples(((Walker) walker).getSampleDB())) {
            mendelianViolation = new MendelianViolation(((VariantAnnotator) walker).minGenotypeQualityP);
        } else {
            throw new UserException("Mendelian violation annotation can only be used from the Variant Annotator, "
                    + "and must be provided a valid PED file (-ped) from the command line containing only 1 trio.");
        }
    }

    Map<String, Object> toRet = new HashMap<String, Object>(1);
    boolean hasAppropriateGenotypes = vc.hasGenotype(motherId) && vc.getGenotype(motherId).hasLikelihoods() &&
            vc.hasGenotype(fatherId) && vc.getGenotype(fatherId).hasLikelihoods() &&
            vc.hasGenotype(childId) && vc.getGenotype(childId).hasLikelihoods();
    if (hasAppropriateGenotypes)
        toRet.put("MVLR", mendelianViolation.violationLikelihoodRatio(vc, motherId, fatherId, childId));

    return toRet;
}
private boolean isConcordant(VariantContext vc, Collection<VariantContext> compVCs) {
    if (vc == null || compVCs == null || compVCs.isEmpty())
        return false;

    // if we're not looking for specific samples then the fact that we have both VCs is enough to call it concordant.
    if (NO_SAMPLES_SPECIFIED)
        return true;

    // make a list of all samples contained in this variant VC that are being tracked by the user command line arguments.
    Set<String> variantSamples = vc.getSampleNames();
    variantSamples.retainAll(samples);

    // check if we can find all samples from the variant rod in the comp rod.
    for (String sample : variantSamples) {
        boolean foundSample = false;
        for (VariantContext compVC : compVCs) {
            Genotype varG = vc.getGenotype(sample);
            Genotype compG = compVC.getGenotype(sample);
            if (haveSameGenotypes(varG, compG)) {
                foundSample = true;
                break;
            }
        }
        // if at least one sample doesn't have the same genotype, we don't have concordance
        if (!foundSample)
            return false;
    }

    return true;
}
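/*
 * Illustrative sketch only (not part of the walker): the same "every tracked sample must match in at
 * least one comp record" rule as isConcordant() above, restated over plain java.util maps of
 * sample name -> genotype string. The helper name and the string-keyed genotypes are hypothetical
 * stand-ins for VariantContext/Genotype, chosen to show the control flow in isolation.
 */
private static boolean allTrackedSamplesMatchSketch(Map<String, String> variantGenotypes,
                                                    Collection<Map<String, String>> compGenotypes,
                                                    Set<String> trackedSamples) {
    // restrict to the samples the user asked about, as retainAll(samples) does above
    Set<String> samplesToCheck = new HashSet<String>(variantGenotypes.keySet());
    samplesToCheck.retainAll(trackedSamples);

    for (String sample : samplesToCheck) {
        boolean foundSample = false;
        for (Map<String, String> comp : compGenotypes) {
            // a sample is "found" once any comp record carries the same genotype for it
            if (variantGenotypes.get(sample).equals(comp.get(sample))) {
                foundSample = true;
                break;
            }
        }
        if (!foundSample)
            return false; // one mismatching sample is enough to break concordance
    }
    return true;
}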
/**
 * Checks if vc has a variant call, for at least one of the selected samples, that is absent from
 * the comparison track.
 *
 * @param vc      the variant rod VariantContext. Here, the variant is the dataset you're looking for
 *                discordances to (e.g. HapMap)
 * @param compVCs the comparison VariantContexts from the discordance track
 * @return true if at least one selected sample is discordant between vc and compVCs, false otherwise
 */
private boolean isDiscordant(VariantContext vc, Collection<VariantContext> compVCs) {
    if (vc == null)
        return false;

    // if we're not looking at specific samples then the absence of a compVC means discordance
    if (NO_SAMPLES_SPECIFIED)
        return (compVCs == null || compVCs.isEmpty());

    // check if we find it in the variant rod
    Map<String, Genotype> genotypes = vc.getGenotypes(samples);
    for (Genotype g : genotypes.values()) {
        if (sampleHasVariant(g)) {
            // There is a variant called (or filtered, if the exclude-filtered option is not set) that is
            // not HomRef for at least one of the samples.
            if (compVCs == null)
                return true;
            // Look for this sample in all VCs of the comp ROD track.
            boolean foundVariant = false;
            for (VariantContext compVC : compVCs) {
                if (sampleHasVariant(compVC.getGenotype(g.getSampleName()))) {
                    foundVariant = true;
                    break;
                }
            }
            // if this sample's variant was not found in any VC of the comp ROD, we have discordance
            if (!foundVariant)
                return true;
        }
    }

    return false; // we only get here if every sample with a variant in vc also has one in the comp rod.
}
static boolean allSamplesAreMergeable(VariantContext vc1, VariantContext vc2) {
    // Check that each sample's genotype in vc2 is uniquely appendable onto its genotype in vc1:
    for (final Genotype gt1 : vc1.getGenotypes()) {
        Genotype gt2 = vc2.getGenotype(gt1.getSampleName());

        if (!alleleSegregationIsKnown(gt1, gt2)) // can merge if: phased, or if either is a hom
            return false;
    }

    return true;
}
/**
 * Helper method to subset a VC record, modifying some metadata stored in the INFO field (i.e. AN, AC, AF).
 *
 * @param vc      the VariantContext record to subset
 * @param samples the samples to extract
 * @return the subsetted VariantContext
 */
private VariantContext subsetRecord(VariantContext vc, Set<String> samples) {
    if (samples == null || samples.isEmpty())
        return vc;

    ArrayList<Genotype> genotypes = new ArrayList<Genotype>();
    for (Map.Entry<String, Genotype> genotypePair : vc.getGenotypes().entrySet()) {
        if (samples.contains(genotypePair.getKey()))
            genotypes.add(genotypePair.getValue());
    }

    VariantContext sub = vc.subContextFromGenotypes(genotypes, vc.getAlleles());

    // if we have fewer alternate alleles in the selected VC than in the original VC, we need to
    // strip out the GL/PLs (because they are no longer accurate)
    if (vc.getAlleles().size() != sub.getAlleles().size())
        sub = VariantContext.modifyGenotypes(sub, VariantContextUtils.stripPLs(vc.getGenotypes()));

    HashMap<String, Object> attributes = new HashMap<String, Object>(sub.getAttributes());

    int depth = 0;
    for (String sample : sub.getSampleNames()) {
        Genotype g = sub.getGenotype(sample);

        if (g.isNotFiltered() && g.isCalled()) {
            String dp = (String) g.getAttribute("DP");
            if (dp != null && !dp.equals(VCFConstants.MISSING_DEPTH_v3) && !dp.equals(VCFConstants.MISSING_VALUE_v4)) {
                depth += Integer.valueOf(dp);
            }
        }
    }

    if (KEEP_ORIGINAL_CHR_COUNTS) {
        if (attributes.containsKey(VCFConstants.ALLELE_COUNT_KEY))
            attributes.put("AC_Orig", attributes.get(VCFConstants.ALLELE_COUNT_KEY));
        if (attributes.containsKey(VCFConstants.ALLELE_FREQUENCY_KEY))
            attributes.put("AF_Orig", attributes.get(VCFConstants.ALLELE_FREQUENCY_KEY));
        if (attributes.containsKey(VCFConstants.ALLELE_NUMBER_KEY))
            attributes.put("AN_Orig", attributes.get(VCFConstants.ALLELE_NUMBER_KEY));
    }

    VariantContextUtils.calculateChromosomeCounts(sub, attributes, false);
    attributes.put("DP", depth);

    sub = VariantContext.modifyAttributes(sub, attributes);

    return sub;
}
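/*
 * Illustrative sketch only: the site-level DP recomputation performed inside subsetRecord() above,
 * restated over plain strings. Per-sample DP values arrive as strings from the genotype attributes,
 * so missing markers have to be skipped before summing. The method name and the List<String> input
 * are hypothetical, and the "." / "-1" literals are assumed stand-ins for
 * VCFConstants.MISSING_VALUE_v4 / MISSING_DEPTH_v3 used above.
 */
private static int sumPerSampleDepthsSketch(List<String> perSampleDP) {
    int depth = 0;
    for (String dp : perSampleDP) {
        if (dp == null || dp.equals(".") || dp.equals("-1"))
            continue; // uncalled or missing depth contributes nothing to the site DP
        depth += Integer.valueOf(dp);
    }
    return depth;
}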
static boolean someSampleHasDoubleNonReferenceAllele(VariantContext vc1, VariantContext vc2) {
    for (final Genotype gt1 : vc1.getGenotypes()) {
        Genotype gt2 = vc2.getGenotype(gt1.getSampleName());

        List<Allele> site1Alleles = gt1.getAlleles();
        List<Allele> site2Alleles = gt2.getAlleles();

        Iterator<Allele> all2It = site2Alleles.iterator();
        for (Allele all1 : site1Alleles) {
            Allele all2 = all2It.next(); // this is OK, since allSamplesAreMergeable()

            if (all1.isNonReference() && all2.isNonReference()) // corresponding alleles are alternate
                return true;
        }
    }

    return false;
}
static boolean doubleAllelesSegregatePerfectlyAmongSamples(VariantContext vc1, VariantContext vc2) {
    // Check that the alleles at vc1 and at vc2 always segregate together in all samples (including reference):
    Map<Allele, Allele> allele1ToAllele2 = new HashMap<Allele, Allele>();
    Map<Allele, Allele> allele2ToAllele1 = new HashMap<Allele, Allele>();

    // Note the segregation of the alleles for the reference genome:
    allele1ToAllele2.put(vc1.getReference(), vc2.getReference());
    allele2ToAllele1.put(vc2.getReference(), vc1.getReference());

    // Note the segregation of the alleles for each sample (and check that it is consistent with the
    // reference and all previous samples).
    for (final Genotype gt1 : vc1.getGenotypes()) {
        Genotype gt2 = vc2.getGenotype(gt1.getSampleName());

        List<Allele> site1Alleles = gt1.getAlleles();
        List<Allele> site2Alleles = gt2.getAlleles();

        Iterator<Allele> all2It = site2Alleles.iterator();
        for (Allele all1 : site1Alleles) {
            Allele all2 = all2It.next();

            Allele all1To2 = allele1ToAllele2.get(all1);
            if (all1To2 == null)
                allele1ToAllele2.put(all1, all2);
            else if (!all1To2.equals(all2)) // all1 segregates with two different alleles at site 2
                return false;

            Allele all2To1 = allele2ToAllele1.get(all2);
            if (all2To1 == null)
                allele2ToAllele1.put(all2, all1);
            else if (!all2To1.equals(all1)) // all2 segregates with two different alleles at site 1
                return false;
        }
    }

    return true;
}
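/*
 * Illustrative sketch only: the bidirectional-map check used by
 * doubleAllelesSegregatePerfectlyAmongSamples() above, restated over plain string pairs. Two sites
 * "segregate perfectly" when the observed pairing is a bijection: each left value always appears
 * with the same right value, and vice versa. The method name and the List<String[]> input are
 * hypothetical simplifications of the per-sample allele iteration.
 */
private static boolean pairingIsBijectiveSketch(List<String[]> observedPairs) {
    Map<String, String> leftToRight = new HashMap<String, String>();
    Map<String, String> rightToLeft = new HashMap<String, String>();

    for (String[] pair : observedPairs) {
        String left = pair[0];
        String right = pair[1];

        String mappedRight = leftToRight.get(left);
        if (mappedRight == null)
            leftToRight.put(left, right);
        else if (!mappedRight.equals(right))
            return false; // left value seen with two different right values

        String mappedLeft = rightToLeft.get(right);
        if (mappedLeft == null)
            rightToLeft.put(right, left);
        else if (!mappedLeft.equals(left))
            return false; // right value seen with two different left values
    }
    return true;
}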
/**
 * add the genotype data
 *
 * @param vc                 the variant context
 * @param alleleMap          alleles for this context
 * @param genotypeFormatKeys genotype formatting string
 * @throws IOException for writer
 */
private void addGenotypeData(VariantContext vc, Map<Allele, String> alleleMap, List<String> genotypeFormatKeys)
        throws IOException {
    for (String sample : mHeader.getGenotypeSamples()) {
        mWriter.write(VCFConstants.FIELD_SEPARATOR);

        Genotype g = vc.getGenotype(sample);
        if (g == null) {
            // TODO -- The VariantContext needs to know what the general ploidy is of the samples
            // TODO -- We shouldn't be assuming diploid genotypes here!
            mWriter.write(VCFConstants.EMPTY_GENOTYPE);
            continue;
        }

        List<String> attrs = new ArrayList<String>(genotypeFormatKeys.size());
        for (String key : genotypeFormatKeys) {
            if (key.equals(VCFConstants.GENOTYPE_KEY)) {
                if (!g.isAvailable()) {
                    throw new ReviewedStingException("GTs cannot be missing for some samples if they are available for others in the record");
                }

                writeAllele(g.getAllele(0), alleleMap);
                for (int i = 1; i < g.getPloidy(); i++) {
                    mWriter.write(g.isPhased() ? VCFConstants.PHASED : VCFConstants.UNPHASED);
                    writeAllele(g.getAllele(i), alleleMap);
                }

                continue;
            }

            Object val = g.hasAttribute(key) ? g.getAttribute(key) : VCFConstants.MISSING_VALUE_v4;

            // some exceptions
            if (key.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) {
                if (Math.abs(g.getNegLog10PError() - Genotype.NO_NEG_LOG_10PERROR) < 1e-6)
                    val = VCFConstants.MISSING_VALUE_v4;
                else {
                    val = getQualValue(Math.min(g.getPhredScaledQual(), VCFConstants.MAX_GENOTYPE_QUAL));
                }
            } else if (key.equals(VCFConstants.GENOTYPE_FILTER_KEY)) {
                val = g.isFiltered() ? ParsingUtils.join(";", ParsingUtils.sortList(g.getFilters()))
                        : (g.filtersWereApplied() ? VCFConstants.PASSES_FILTERS_v4 : VCFConstants.UNFILTERED);
            }

            VCFFormatHeaderLine metaData = mHeader.getFormatHeaderLine(key);
            if (metaData != null) {
                int numInFormatField = metaData.getCount(vc.getAlternateAlleles().size());
                if (numInFormatField > 1 && val.equals(VCFConstants.MISSING_VALUE_v4)) {
                    // If we have a missing field but multiple values are expected, we need to construct a
                    // new string with all fields. For example, if Number=2, the string has to be ".,."
                    StringBuilder sb = new StringBuilder(VCFConstants.MISSING_VALUE_v4);
                    for (int i = 1; i < numInFormatField; i++) {
                        sb.append(",");
                        sb.append(VCFConstants.MISSING_VALUE_v4);
                    }
                    val = sb.toString();
                }
            }

            // assume that if key is absent, then the given string encoding suffices
            String outputValue = formatVCFField(val);
            if (outputValue != null)
                attrs.add(outputValue);
        }

        // strip off trailing missing values
        for (int i = attrs.size() - 1; i >= 0; i--) {
            if (isMissingValue(attrs.get(i)))
                attrs.remove(i);
            else
                break;
        }

        for (int i = 0; i < attrs.size(); i++) {
            if (i > 0 || genotypeFormatKeys.contains(VCFConstants.GENOTYPE_KEY))
                mWriter.write(VCFConstants.GENOTYPE_FIELD_SEPARATOR);
            mWriter.write(attrs.get(i));
        }
    }
}
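/*
 * Illustrative sketch only: the two FORMAT-field conventions applied inside addGenotypeData() above,
 * restated on plain strings. A missing field whose header declares Number=N is written as N
 * comma-separated missing markers ("." becomes ".,." for N=2), and trailing missing entries are
 * dropped from the end of the per-sample attribute list before writing. Both helper names are
 * hypothetical, and "." stands in for VCFConstants.MISSING_VALUE_v4.
 */
private static String padMissingFieldSketch(int expectedCount) {
    StringBuilder sb = new StringBuilder(".");
    for (int i = 1; i < expectedCount; i++)
        sb.append(",").append(".");
    return sb.toString(); // e.g. expectedCount=3 -> ".,.,."
}

private static void stripTrailingMissingSketch(List<String> attrs) {
    // walk backwards and remove missing values until the first non-missing entry is hit
    for (int i = attrs.size() - 1; i >= 0; i--) {
        if (attrs.get(i).equals("."))
            attrs.remove(i);
        else
            break;
    }
}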
/**
 * Subset VC record if necessary and emit the modified record (provided it satisfies criteria for printing)
 *
 * @param tracker the ROD tracker
 * @param ref     reference information
 * @param context alignment info
 * @return 1 if the record was printed to the output file, 0 otherwise
 */
@Override
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
    if (tracker == null)
        return 0;

    Collection<VariantContext> vcs = tracker.getValues(variantCollection.variants, context.getLocation());
    if (vcs == null || vcs.size() == 0) {
        return 0;
    }

    for (VariantContext vc : vcs) {
        if (MENDELIAN_VIOLATIONS) {
            boolean foundMV = false;
            for (MendelianViolation mv : mvSet) {
                if (mv.isViolation(vc)) {
                    foundMV = true;
                    // System.out.println(vc.toString());
                    if (outMVFile != null)
                        outMVFileStream.format(
                                "MV@%s:%d. REF=%s, ALT=%s, AC=%d, momID=%s, dadID=%s, childID=%s, momG=%s, momGL=%s, dadG=%s, dadGL=%s, childG=%s childGL=%s\n",
                                vc.getChr(), vc.getStart(),
                                vc.getReference().getDisplayString(),
                                vc.getAlternateAllele(0).getDisplayString(),
                                vc.getChromosomeCount(vc.getAlternateAllele(0)),
                                mv.getSampleMom(), mv.getSampleDad(), mv.getSampleChild(),
                                vc.getGenotype(mv.getSampleMom()).toBriefString(),
                                vc.getGenotype(mv.getSampleMom()).getLikelihoods().getAsString(),
                                vc.getGenotype(mv.getSampleDad()).toBriefString(),
                                vc.getGenotype(mv.getSampleDad()).getLikelihoods().getAsString(),
                                vc.getGenotype(mv.getSampleChild()).toBriefString(),
                                vc.getGenotype(mv.getSampleChild()).getLikelihoods().getAsString());
                }
            }

            if (!foundMV)
                break;
        }

        if (DISCORDANCE_ONLY) {
            Collection<VariantContext> compVCs = tracker.getValues(discordanceTrack, context.getLocation());
            if (!isDiscordant(vc, compVCs))
                return 0;
        }
        if (CONCORDANCE_ONLY) {
            Collection<VariantContext> compVCs = tracker.getValues(concordanceTrack, context.getLocation());
            if (!isConcordant(vc, compVCs))
                return 0;
        }

        if (alleleRestriction.equals(NumberAlleleRestriction.BIALLELIC) && !vc.isBiallelic())
            continue;

        if (alleleRestriction.equals(NumberAlleleRestriction.MULTIALLELIC) && vc.isBiallelic())
            continue;

        if (!selectedTypes.contains(vc.getType()))
            continue;

        VariantContext sub = subsetRecord(vc, samples);
        if ((sub.isPolymorphic() || !EXCLUDE_NON_VARIANTS) && (!sub.isFiltered() || !EXCLUDE_FILTERED)) {
            for (VariantContextUtils.JexlVCMatchExp jexl : jexls) {
                if (!VariantContextUtils.match(sub, jexl)) {
                    return 0;
                }
            }
            if (SELECT_RANDOM_NUMBER) {
                randomlyAddVariant(++variantNumber, sub, ref.getBase());
            } else if (!SELECT_RANDOM_FRACTION
                    || (GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) {
                vcfWriter.add(sub);
            }
        }
    }

    return 1;
}
static VariantContext reallyMergeIntoMNP(VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile) {
    int startInter = vc1.getEnd() + 1;
    int endInter = vc2.getStart() - 1;
    byte[] intermediateBases = null;
    if (startInter <= endInter) {
        intermediateBases = referenceFile.getSubsequenceAt(vc1.getChr(), startInter, endInter).getBases();
        StringUtil.toUpperCase(intermediateBases);
    }
    MergedAllelesData mergeData = new MergedAllelesData(intermediateBases, vc1, vc2); // ensures that the reference allele is added

    GenotypesContext mergedGenotypes = GenotypesContext.create();
    for (final Genotype gt1 : vc1.getGenotypes()) {
        Genotype gt2 = vc2.getGenotype(gt1.getSampleName());

        List<Allele> site1Alleles = gt1.getAlleles();
        List<Allele> site2Alleles = gt2.getAlleles();

        List<Allele> mergedAllelesForSample = new LinkedList<Allele>();

        /* NOTE: Since merged alleles are added to mergedAllelesForSample in the SAME order as in the input VC records,
           we preserve phase information (if any) relative to whatever precedes vc1:
         */
        Iterator<Allele> all2It = site2Alleles.iterator();
        for (Allele all1 : site1Alleles) {
            Allele all2 = all2It.next(); // this is OK, since allSamplesAreMergeable()

            Allele mergedAllele = mergeData.ensureMergedAllele(all1, all2);
            mergedAllelesForSample.add(mergedAllele);
        }

        double mergedGQ = Math.max(gt1.getLog10PError(), gt2.getLog10PError());
        Set<String> mergedGtFilters = new HashSet<String>(); // Since gt1 and gt2 were unfiltered, the Genotype remains unfiltered

        Map<String, Object> mergedGtAttribs = new HashMap<String, Object>();
        PhaseAndQuality phaseQual = calcPhaseForMergedGenotypes(gt1, gt2);
        if (phaseQual.PQ != null)
            mergedGtAttribs.put(ReadBackedPhasingWalker.PQ_KEY, phaseQual.PQ);

        Genotype mergedGt = new Genotype(gt1.getSampleName(), mergedAllelesForSample, mergedGQ,
                mergedGtFilters, mergedGtAttribs, phaseQual.isPhased);
        mergedGenotypes.add(mergedGt);
    }

    String mergedName = mergeVariantContextNames(vc1.getSource(), vc2.getSource());
    double mergedLog10PError = Math.min(vc1.getLog10PError(), vc2.getLog10PError());
    Set<String> mergedFilters = new HashSet<String>(); // Since vc1 and vc2 were unfiltered, the merged record remains unfiltered
    Map<String, Object> mergedAttribs = mergeVariantContextAttributes(vc1, vc2);

    // ids
    List<String> mergedIDs = new ArrayList<String>();
    if (vc1.hasID()) mergedIDs.add(vc1.getID());
    if (vc2.hasID()) mergedIDs.add(vc2.getID());
    String mergedID = mergedIDs.isEmpty()
            ? VCFConstants.EMPTY_ID_FIELD
            : Utils.join(VCFConstants.ID_FIELD_SEPARATOR, mergedIDs);

    VariantContextBuilder mergedBuilder =
            new VariantContextBuilder(mergedName, vc1.getChr(), vc1.getStart(), vc2.getEnd(), mergeData.getAllMergedAlleles())
                    .id(mergedID)
                    .genotypes(mergedGenotypes)
                    .log10PError(mergedLog10PError)
                    .filters(mergedFilters)
                    .attributes(mergedAttribs);
    VariantContextUtils.calculateChromosomeCounts(mergedBuilder, true);
    return mergedBuilder.make();
}
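/*
 * Illustrative sketch only: the order-preserving, pairwise allele merge that reallyMergeIntoMNP()
 * above delegates to MergedAllelesData.ensureMergedAllele(). The assumption shown here is that each
 * merged haplotype is allele-at-site-1 + intervening reference bases + allele-at-site-2, and that
 * the i-th allele of one genotype is paired with the i-th allele of the other so phase relative to
 * upstream records is preserved. The method name and the plain-string representation are hypothetical.
 */
private static List<String> mergeSampleAllelesSketch(List<String> site1Alleles,
                                                     List<String> site2Alleles,
                                                     String interveningRefBases) {
    List<String> merged = new ArrayList<String>();
    Iterator<String> it2 = site2Alleles.iterator();
    for (String all1 : site1Alleles) {
        String all2 = it2.next(); // safe when both genotypes have the same ploidy
        merged.add(all1 + interveningRefBases + all2);
    }
    return merged;
}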