@Test
public void testAnnotationSet() {
  for (final boolean annotate : Arrays.asList(true, false)) {
    for (final String set : Arrays.asList("set", "combine", "x")) {
      final List<String> priority = Arrays.asList("1", "2");

      VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS);
      VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS);

      final VariantContext merged =
          VariantContextUtils.simpleMerge(
              genomeLocParser,
              Arrays.asList(vc1, vc2),
              priority,
              VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
              VariantContextUtils.GenotypeMergeType.PRIORITIZE,
              annotate,
              false,
              set,
              false,
              false);

      if (annotate)
        Assert.assertEquals(merged.getAttribute(set), VariantContextUtils.MERGE_INTERSECTION);
      else Assert.assertFalse(merged.hasAttribute(set));
    }
  }
}
public void update2(
    VariantContext eval,
    VariantContext comp,
    RefMetaDataTracker tracker,
    ReferenceContext ref,
    AlignmentContext context) {
  if (eval == null || (getWalker().ignoreAC0Sites() && eval.isMonomorphicInSamples())) return;

  final Type type = getType(eval);
  if (type == null) return;

  TypeSampleMap titvTable = null;

  // update DP, if possible
  if (eval.hasAttribute(VCFConstants.DEPTH_KEY)) depthPerSample.inc(type, ALL);

  // update counts
  allVariantCounts.inc(type, ALL);

  // type specific calculations
  if (type == Type.SNP && eval.isBiallelic()) {
    titvTable =
        VariantContextUtils.isTransition(eval) ? transitionsPerSample : transversionsPerSample;
    titvTable.inc(type, ALL);
  }

  // novelty calculation
  if (comp != null || (type == Type.CNV && overlapsKnownCNV(eval)))
    knownVariantCounts.inc(type, ALL);

  // per sample metrics
  for (final Genotype g : eval.getGenotypes()) {
    if (!g.isNoCall() && !g.isHomRef()) {
      countsPerSample.inc(type, g.getSampleName());

      // update transition / transversion ratio
      if (titvTable != null) titvTable.inc(type, g.getSampleName());

      if (g.hasDP()) depthPerSample.inc(type, g.getSampleName());
    }
  }
}
//
// helper routines
//
private SiteStatus calcSiteStatus(VariantContext vc) {
  if (vc == null) return SiteStatus.NO_CALL;
  if (vc.isFiltered()) return SiteStatus.FILTERED;
  if (vc.isMonomorphicInSamples()) return SiteStatus.MONO;
  if (vc.hasGenotypes())
    return SiteStatus.POLY; // must be polymorphic if isMonomorphicInSamples was false and there are genotypes

  if (vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) {
    int ac = 0;
    if (vc.getNAlleles() > 2) {
      return SiteStatus.POLY;
    } else ac = vc.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY, 0);
    return ac > 0 ? SiteStatus.POLY : SiteStatus.MONO;
  } else {
    return TREAT_ALL_SITES_IN_EVAL_VCF_AS_CALLED
        ? SiteStatus.POLY
        : SiteStatus.NO_CALL; // we can't figure out what to do
  }
}
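// Worked classification examples for calcSiteStatus (illustrative only; these are
// the outcomes implied by the decision order above, not assertions from the test suite):
//
//   vc == null                                    -> NO_CALL
//   vc.isFiltered()                               -> FILTERED
//   all genotypes hom-ref                         -> MONO
//   genotypes present, at least one non-ref call  -> POLY
//   no genotypes, biallelic, INFO AC=0            -> MONO
//   no genotypes, biallelic, INFO AC>0            -> POLY
//   no genotypes, more than 2 alleles             -> POLY
//   no genotypes, no AC attribute                 -> POLY if TREAT_ALL_SITES_IN_EVAL_VCF_AS_CALLED,
//                                                    otherwise NO_CALL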
public String update1(
    VariantContext vc1, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
  nCalledLoci++;

  // Note from Eric:
  // This is really not correct. What we really want here is a polymorphic vs. monomorphic count
  // (i.e. on the Genotypes). So in order to maintain consistency with the previous implementation
  // (and the intention of the original author), I've added in a proxy check for monomorphic
  // status here.
  // Protect against the case when the vc has only no-calls too - can happen if we stratify by
  // sample and the sample has a single no-call.
  if (vc1.isMonomorphicInSamples()) {
    nRefLoci++;
  } else {
    switch (vc1.getType()) {
      case NO_VARIATION:
        // shouldn't get here
        break;
      case SNP:
        nVariantLoci++;
        nSNPs++;
        if (vc1.getAttributeAsBoolean("ISSINGLETON", false)) nSingletons++;
        break;
      case MNP:
        nVariantLoci++;
        nMNPs++;
        if (vc1.getAttributeAsBoolean("ISSINGLETON", false)) nSingletons++;
        break;
      case INDEL:
        nVariantLoci++;
        if (vc1.isSimpleInsertion()) nInsertions++;
        else if (vc1.isSimpleDeletion()) nDeletions++;
        else nComplex++;
        break;
      case MIXED:
        nVariantLoci++;
        nMixed++;
        break;
      case SYMBOLIC:
        nSymbolic++;
        break;
      default:
        throw new ReviewedStingException("Unexpected VariantContext type " + vc1.getType());
    }
  }

  String refStr = vc1.getReference().getBaseString().toUpperCase();

  String aaStr =
      vc1.hasAttribute("ANCESTRALALLELE")
          ? vc1.getAttributeAsString("ANCESTRALALLELE", null).toUpperCase()
          : null;
  // if (aaStr.equals(".")) {
  //   aaStr = refStr;
  // }

  // ref  aa  alt  class
  // A    C   A    der homozygote
  // A    C   C    anc homozygote
  // A    A   A    ref homozygote
  // A    A   C
  // A    C   A
  // A    C   C
  for (final Genotype g : vc1.getGenotypes()) {
    final String altStr =
        vc1.getAlternateAlleles().size() > 0
            ? vc1.getAlternateAllele(0).getBaseString().toUpperCase()
            : null;

    switch (g.getType()) {
      case NO_CALL:
        nNoCalls++;
        break;
      case HOM_REF:
        nHomRef++;
        if (aaStr != null && altStr != null && !refStr.equalsIgnoreCase(aaStr)) {
          nHomDerived++;
        }
        break;
      case HET:
        nHets++;
        break;
      case HOM_VAR:
        nHomVar++;
        if (aaStr != null && altStr != null && !altStr.equalsIgnoreCase(aaStr)) {
          nHomDerived++;
        }
        break;
      case MIXED:
        break;
      default:
        throw new ReviewedStingException("BUG: Unexpected genotype type: " + g);
    }
  }

  return null; // we don't capture any interesting sites
}
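// Worked example of the derived-homozygote rule above (illustrative): at a site with
// ref=A and ANCESTRALALLELE=C, a HOM_REF sample (A/A) counts toward nHomDerived because
// the reference A differs from the ancestral C, while a HOM_VAR sample (C/C) carries the
// ancestral allele and is not counted. Conversely, with ANCESTRALALLELE=A, only HOM_VAR
// samples (C/C, where alt C differs from ancestral A) add to nHomDerived.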
/**
 * add a record to the file
 *
 * @param vc the Variant Context object
 * @param refBase the ref base used for indels
 * @param refBaseShouldBeAppliedToEndOfAlleles *** THIS SHOULD BE FALSE EXCEPT FOR AN INDEL AT THE
 *     EXTREME BEGINNING OF A CONTIG (WHERE THERE IS NO PREVIOUS BASE, SO WE USE THE BASE AFTER
 *     THE EVENT INSTEAD)
 */
public void add(VariantContext vc, byte refBase, boolean refBaseShouldBeAppliedToEndOfAlleles) {
  if (mHeader == null)
    throw new IllegalStateException(
        "The VCF Header must be written before records can be added: " + locationString());

  if (doNotWriteGenotypes) vc = VariantContext.modifyGenotypes(vc, null);

  try {
    vc =
        VariantContext.createVariantContextWithPaddedAlleles(
            vc, refBase, refBaseShouldBeAppliedToEndOfAlleles);

    // if we are doing on-the-fly indexing, add the record ***before*** we write any bytes
    if (indexer != null) indexer.addFeature(vc, positionalStream.getPosition());

    Map<Allele, String> alleleMap = new HashMap<Allele, String>(vc.getAlleles().size());
    alleleMap.put(Allele.NO_CALL, VCFConstants.EMPTY_ALLELE); // convenience for lookup

    // CHROM
    mWriter.write(vc.getChr());
    mWriter.write(VCFConstants.FIELD_SEPARATOR);

    // POS
    mWriter.write(String.valueOf(vc.getStart()));
    mWriter.write(VCFConstants.FIELD_SEPARATOR);

    // ID
    String ID = vc.hasID() ? vc.getID() : VCFConstants.EMPTY_ID_FIELD;
    mWriter.write(ID);
    mWriter.write(VCFConstants.FIELD_SEPARATOR);

    // REF
    alleleMap.put(vc.getReference(), "0");
    String refString = vc.getReference().getDisplayString();
    mWriter.write(refString);
    mWriter.write(VCFConstants.FIELD_SEPARATOR);

    // ALT
    if (vc.isVariant()) {
      Allele altAllele = vc.getAlternateAllele(0);
      alleleMap.put(altAllele, "1");
      String alt = altAllele.getDisplayString();
      mWriter.write(alt);

      for (int i = 1; i < vc.getAlternateAlleles().size(); i++) {
        altAllele = vc.getAlternateAllele(i);
        alleleMap.put(altAllele, String.valueOf(i + 1));
        alt = altAllele.getDisplayString();
        mWriter.write(",");
        mWriter.write(alt);
      }
    } else {
      mWriter.write(VCFConstants.EMPTY_ALTERNATE_ALLELE_FIELD);
    }
    mWriter.write(VCFConstants.FIELD_SEPARATOR);

    // QUAL
    if (!vc.hasNegLog10PError()) mWriter.write(VCFConstants.MISSING_VALUE_v4);
    else mWriter.write(getQualValue(vc.getPhredScaledQual()));
    mWriter.write(VCFConstants.FIELD_SEPARATOR);

    // FILTER
    String filters =
        vc.isFiltered()
            ? ParsingUtils.join(";", ParsingUtils.sortList(vc.getFilters()))
            : (filtersWereAppliedToContext || vc.filtersWereApplied()
                ? VCFConstants.PASSES_FILTERS_v4
                : VCFConstants.UNFILTERED);
    mWriter.write(filters);
    mWriter.write(VCFConstants.FIELD_SEPARATOR);

    // INFO
    Map<String, String> infoFields = new TreeMap<String, String>();
    for (Map.Entry<String, Object> field : vc.getAttributes().entrySet()) {
      String key = field.getKey();
      if (key.equals(VariantContext.ID_KEY)
          || key.equals(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY)
          || key.equals(VariantContext.UNPARSED_GENOTYPE_MAP_KEY)
          || key.equals(VariantContext.UNPARSED_GENOTYPE_PARSER_KEY)) continue;

      String outputValue = formatVCFField(field.getValue());
      if (outputValue != null) infoFields.put(key, outputValue);
    }
    writeInfoString(infoFields);

    // FORMAT
    if (vc.hasAttribute(VariantContext.UNPARSED_GENOTYPE_MAP_KEY)) {
      mWriter.write(VCFConstants.FIELD_SEPARATOR);
      mWriter.write(vc.getAttributeAsString(VariantContext.UNPARSED_GENOTYPE_MAP_KEY, ""));
    } else {
      List<String> genotypeAttributeKeys = new ArrayList<String>();
      if (vc.hasGenotypes()) {
        genotypeAttributeKeys.addAll(calcVCFGenotypeKeys(vc));
      } else if (mHeader.hasGenotypingData()) {
        // this needs to be done in case all samples are no-calls
        genotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY);
      }

      if (genotypeAttributeKeys.size() > 0) {
        String genotypeFormatString =
            ParsingUtils.join(VCFConstants.GENOTYPE_FIELD_SEPARATOR, genotypeAttributeKeys);
        mWriter.write(VCFConstants.FIELD_SEPARATOR);
        mWriter.write(genotypeFormatString);

        addGenotypeData(vc, alleleMap, genotypeAttributeKeys);
      }
    }

    mWriter.write("\n");
    mWriter.flush(); // necessary so that writing to an output stream will work
  } catch (IOException e) {
    // preserve the underlying cause when rethrowing
    throw new RuntimeException("Unable to write the VCF object to " + locationString(), e);
  }
}
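// A minimal usage sketch for the writing path above (assumptions: the enclosing writer
// class is StandardVCFWriter and it exposes writeHeader(VCFHeader) alongside the
// three-argument add(...) shown here; neither is confirmed by this excerpt). The ordering
// constraint is real, though: the header must be written before any record, or add(...)
// throws IllegalStateException.
public static void writeAllSketch(
    StandardVCFWriter writer, VCFHeader header, Iterable<VariantContext> records, byte refBase) {
  writer.writeHeader(header); // must happen before the first add(...)
  for (VariantContext vc : records) {
    // refBaseShouldBeAppliedToEndOfAlleles=false is the normal case; pass true only for
    // an indel at the extreme beginning of a contig (see the Javadoc above)
    writer.add(vc, refBase, false);
  }
}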
/**
 * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority
 * order, if provided. If the genotype merge option is UNIQUIFY, the priority order is ignored and
 * names are created by concatenating the VC name with the sample name.
 *
 * @param genomeLocParser loc parser
 * @param unsortedVCs collection of unsorted VCs
 * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs
 * @param filteredRecordMergeType merge type for filtered records
 * @param genotypeMergeOptions merge option for genotypes
 * @param annotateOrigin should we annotate the set it came from?
 * @param printMessages should we print messages?
 * @param setKey the key name of the set
 * @param filteredAreUncalled are filtered records uncalled?
 * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count?
 * @return new VariantContext representing the merge of unsortedVCs
 */
public static VariantContext simpleMerge(
    final GenomeLocParser genomeLocParser,
    final Collection<VariantContext> unsortedVCs,
    final List<String> priorityListOfVCs,
    final FilteredRecordMergeType filteredRecordMergeType,
    final GenotypeMergeType genotypeMergeOptions,
    final boolean annotateOrigin,
    final boolean printMessages,
    final String setKey,
    final boolean filteredAreUncalled,
    final boolean mergeInfoWithMaxAC) {
  if (unsortedVCs == null || unsortedVCs.size() == 0) return null;

  if (annotateOrigin && priorityListOfVCs == null)
    throw new IllegalArgumentException(
        "Cannot merge calls and annotate their origins without a complete priority list of VariantContexts");

  if (genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE)
    verifyUniqueSampleNames(unsortedVCs);

  List<VariantContext> prepaddedVCs =
      sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions);
  // Make sure all variant contexts are padded with the reference base, if necessary (for indels)
  List<VariantContext> VCs = new ArrayList<VariantContext>();

  for (VariantContext vc : prepaddedVCs) {
    // also a reasonable place to remove filtered calls, if needed
    if (!filteredAreUncalled || vc.isNotFiltered())
      VCs.add(createVariantContextWithPaddedAlleles(vc, false));
  }
  if (VCs.size() == 0) // everything is filtered out and we're filteredAreUncalled
    return null;

  // establish the baseline info from the first VC
  final VariantContext first = VCs.get(0);
  final String name = first.getSource();
  final Allele refAllele = determineReferenceAllele(VCs);

  final Set<Allele> alleles = new LinkedHashSet<Allele>();
  final Set<String> filters = new TreeSet<String>();
  final Map<String, Object> attributes = new TreeMap<String, Object>();
  final Set<String> inconsistentAttributes = new HashSet<String>();
  // contains the set of sources we found in our set of VCs that are variant
  final Set<String> variantSources = new HashSet<String>();
  final Set<String> rsIDs = new LinkedHashSet<String>(1); // most of the time there's one id

  GenomeLoc loc = getLocation(genomeLocParser, first);
  int depth = 0;
  int maxAC = -1;
  final Map<String, Object> attributesWithMaxAC = new TreeMap<String, Object>();
  double log10PError = 1;
  VariantContext vcWithMaxAC = null;
  GenotypesContext genotypes = GenotypesContext.create();

  // counting the number of filtered and variant VCs
  int nFiltered = 0;

  boolean remapped = false;

  // cycle through and add info from the other VCs, making sure the loc/reference matches
  for (VariantContext vc : VCs) {
    if (loc.getStart() != vc.getStart()) // || !first.getReference().equals(vc.getReference()) )
      throw new ReviewedStingException(
          "BUG: attempting to merge VariantContexts with different start sites: first="
              + first.toString()
              + " second="
              + vc.toString());

    if (getLocation(genomeLocParser, vc).size() > loc.size())
      loc = getLocation(genomeLocParser, vc); // get the longest location

    nFiltered += vc.isFiltered() ? 1 : 0;
    if (vc.isVariant()) variantSources.add(vc.getSource());

    AlleleMapper alleleMapping = resolveIncompatibleAlleles(refAllele, vc, alleles);
    remapped = remapped || alleleMapping.needsRemapping();

    alleles.addAll(alleleMapping.values());

    mergeGenotypes(
        genotypes, vc, alleleMapping, genotypeMergeOptions == GenotypeMergeType.UNIQUIFY);

    log10PError = Math.min(log10PError, vc.isVariant() ? vc.getLog10PError() : 1);

    filters.addAll(vc.getFilters());

    //
    // add attributes
    //
    // special case DP (add it up) and ID (just preserve it)
    //
    if (vc.hasAttribute(VCFConstants.DEPTH_KEY))
      depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0);
    if (vc.hasID()) rsIDs.add(vc.getID());
    if (mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) {
      String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, null);
      // let's see if the string contains a "," separator
      if (rawAlleleCounts.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)) {
        List<String> alleleCountArray =
            Arrays.asList(
                rawAlleleCounts
                    .substring(1, rawAlleleCounts.length() - 1)
                    .split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR));
        for (String alleleCount : alleleCountArray) {
          final int ac = Integer.valueOf(alleleCount.trim());
          if (ac > maxAC) {
            maxAC = ac;
            vcWithMaxAC = vc;
          }
        }
      } else {
        final int ac = Integer.valueOf(rawAlleleCounts);
        if (ac > maxAC) {
          maxAC = ac;
          vcWithMaxAC = vc;
        }
      }
    }

    for (Map.Entry<String, Object> p : vc.getAttributes().entrySet()) {
      String key = p.getKey();
      // if we don't like the key already, don't go anywhere
      if (!inconsistentAttributes.contains(key)) {
        boolean alreadyFound = attributes.containsKey(key);
        Object boundValue = attributes.get(key);
        boolean boundIsMissingValue =
            alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4);

        if (alreadyFound && !boundValue.equals(p.getValue()) && !boundIsMissingValue) {
          // we found the value but we're inconsistent, put it in the exclude list
          // System.out.printf("Inconsistent INFO values: %s => %s and %s%n", key, boundValue,
          //     p.getValue());
          inconsistentAttributes.add(key);
          attributes.remove(key);
        } else if (!alreadyFound || boundIsMissingValue) { // no value
          // if ( vc != first ) System.out.printf("Adding key %s => %s%n", p.getKey(),
          //     p.getValue());
          attributes.put(key, p.getValue());
        }
      }
    }
  }

  // if we have more alternate alleles in the merged VC than in one or more of the
  // original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well
  // as allele-dependent attributes like AC, AF
  for (VariantContext vc : VCs) {
    if (vc.alleles.size() == 1) continue;
    if (hasPLIncompatibleAlleles(alleles, vc.alleles)) {
      if (!genotypes.isEmpty())
        logger.warn(
            String.format(
                "Stripping PLs at %s due to incompatible alleles merged=%s vs. single=%s",
                genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles));
      genotypes = stripPLs(genotypes);
      // this will remove stale AC,AF attributes from vc
      calculateChromosomeCounts(vc, attributes, true);
      break;
    }
  }

  // take the VC with the maxAC and pull the attributes into a modifiable map
  if (mergeInfoWithMaxAC && vcWithMaxAC != null) {
    attributesWithMaxAC.putAll(vcWithMaxAC.getAttributes());
  }

  // if at least one record was unfiltered and we want a union, clear all of the filters
  if ((filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED
          && nFiltered != VCs.size())
      || filteredRecordMergeType == FilteredRecordMergeType.KEEP_UNCONDITIONAL) filters.clear();

  if (annotateOrigin) { // we care about where the call came from
    String setValue;
    if (nFiltered == 0 && variantSources.size() == priorityListOfVCs.size())
      // nothing was filtered out and every callset was variant
      setValue = MERGE_INTERSECTION;
    else if (nFiltered == VCs.size()) // everything was filtered out
      setValue = MERGE_FILTER_IN_ALL;
    else if (variantSources.isEmpty()) // everyone was reference
      setValue = MERGE_REF_IN_ALL;
    else {
      LinkedHashSet<String> s = new LinkedHashSet<String>();
      for (VariantContext vc : VCs)
        if (vc.isVariant())
          s.add(vc.isFiltered() ? MERGE_FILTER_PREFIX + vc.getSource() : vc.getSource());
      setValue = Utils.join("-", s);
    }

    if (setKey != null) {
      attributes.put(setKey, setValue);
      if (mergeInfoWithMaxAC && vcWithMaxAC != null) {
        attributesWithMaxAC.put(setKey, vcWithMaxAC.getSource());
      }
    }
  }

  if (depth > 0) attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth));

  final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs);

  final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID);
  builder.loc(loc.getContig(), loc.getStart(), loc.getStop());
  builder.alleles(alleles);
  builder.genotypes(genotypes);
  builder.log10PError(log10PError);
  builder.filters(filters).attributes(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes);

  // Trim the padded bases of all alleles if necessary
  VariantContext merged = createVariantContextWithTrimmedAlleles(builder.make());
  if (printMessages && remapped) System.out.printf("Remapped => %s%n", merged);

  return merged;
}