public static VariantContext createVariantContextWithTrimmedAlleles(VariantContext inputVC) { // see if we need to trim common reference base from all alleles boolean trimVC; // We need to trim common reference base from all alleles in all genotypes if a ref base is // common to all alleles Allele refAllele = inputVC.getReference(); if (!inputVC.isVariant()) trimVC = false; else if (refAllele.isNull()) trimVC = false; else { trimVC = (AbstractVCFCodec.computeForwardClipping( new ArrayList<Allele>(inputVC.getAlternateAlleles()), inputVC.getReference().getDisplayString()) > 0); } // nothing to do if we don't need to trim bases if (trimVC) { List<Allele> alleles = new ArrayList<Allele>(); GenotypesContext genotypes = GenotypesContext.create(); // set the reference base for indels in the attributes Map<String, Object> attributes = new TreeMap<String, Object>(inputVC.getAttributes()); Map<Allele, Allele> originalToTrimmedAlleleMap = new HashMap<Allele, Allele>(); for (Allele a : inputVC.getAlleles()) { if (a.isSymbolic()) { alleles.add(a); originalToTrimmedAlleleMap.put(a, a); } else { // get bases for current allele and create a new one with trimmed bases byte[] newBases = Arrays.copyOfRange(a.getBases(), 1, a.length()); Allele trimmedAllele = Allele.create(newBases, a.isReference()); alleles.add(trimmedAllele); originalToTrimmedAlleleMap.put(a, trimmedAllele); } } // detect case where we're trimming bases but resulting vc doesn't have any null allele. In // that case, we keep original representation // example: mixed records such as {TA*,TGA,TG} boolean hasNullAlleles = false; for (Allele a : originalToTrimmedAlleleMap.values()) { if (a.isNull()) hasNullAlleles = true; if (a.isReference()) refAllele = a; } if (!hasNullAlleles) return inputVC; // now we can recreate new genotypes with trimmed alleles for (final Genotype genotype : inputVC.getGenotypes()) { List<Allele> originalAlleles = genotype.getAlleles(); List<Allele> trimmedAlleles = new ArrayList<Allele>(); for (Allele a : originalAlleles) { if (a.isCalled()) trimmedAlleles.add(originalToTrimmedAlleleMap.get(a)); else trimmedAlleles.add(Allele.NO_CALL); } genotypes.add(Genotype.modifyAlleles(genotype, trimmedAlleles)); } final VariantContextBuilder builder = new VariantContextBuilder(inputVC); return builder .alleles(alleles) .genotypes(genotypes) .attributes(attributes) .referenceBaseForIndel(new Byte(inputVC.getReference().getBases()[0])) .make(); } return inputVC; }
/** * Read in a list of ExactCall objects from reader, keeping only those with starts in startsToKeep * or all sites (if this is empty) * * @param reader a just-opened reader sitting at the start of the file * @param startsToKeep a list of start position of the calls to keep, or empty if all calls should * be kept * @param parser a genome loc parser to create genome locs * @return a list of ExactCall objects in reader * @throws IOException */ public static List<ExactCall> readExactLog( final BufferedReader reader, final List<Integer> startsToKeep, GenomeLocParser parser) throws IOException { if (reader == null) throw new IllegalArgumentException("reader cannot be null"); if (startsToKeep == null) throw new IllegalArgumentException("startsToKeep cannot be null"); if (parser == null) throw new IllegalArgumentException("GenomeLocParser cannot be null"); List<ExactCall> calls = new LinkedList<ExactCall>(); // skip the header line reader.readLine(); // skip the first "type" line reader.readLine(); while (true) { final VariantContextBuilder builder = new VariantContextBuilder(); final List<Allele> alleles = new ArrayList<Allele>(); final List<Genotype> genotypes = new ArrayList<Genotype>(); final double[] posteriors = new double[2]; final double[] priors = MathUtils.normalizeFromLog10(new double[] {0.5, 0.5}, true); final List<Integer> mle = new ArrayList<Integer>(); final Map<Allele, Double> log10pNonRefByAllele = new HashMap<Allele, Double>(); long runtimeNano = -1; GenomeLoc currentLoc = null; while (true) { final String line = reader.readLine(); if (line == null) return calls; final String[] parts = line.split("\t"); final GenomeLoc lineLoc = parser.parseGenomeLoc(parts[0]); final String variable = parts[1]; final String key = parts[2]; final String value = parts[3]; if (currentLoc == null) currentLoc = lineLoc; if (variable.equals("type")) { if (startsToKeep.isEmpty() || startsToKeep.contains(currentLoc.getStart())) { builder.alleles(alleles); final int stop = currentLoc.getStart() + alleles.get(0).length() - 1; builder.chr(currentLoc.getContig()).start(currentLoc.getStart()).stop(stop); builder.genotypes(genotypes); final int[] mleInts = ArrayUtils.toPrimitive(mle.toArray(new Integer[] {})); final AFCalcResult result = new AFCalcResult(mleInts, 1, alleles, posteriors, priors, log10pNonRefByAllele); calls.add(new ExactCall(builder.make(), runtimeNano, result)); } break; } else if (variable.equals("allele")) { final boolean isRef = key.equals("0"); alleles.add(Allele.create(value, isRef)); } else if (variable.equals("PL")) { final GenotypeBuilder gb = new GenotypeBuilder(key); gb.PL(GenotypeLikelihoods.fromPLField(value).getAsPLs()); genotypes.add(gb.make()); } else if (variable.equals("log10PosteriorOfAFEq0")) { posteriors[0] = Double.valueOf(value); } else if (variable.equals("log10PosteriorOfAFGt0")) { posteriors[1] = Double.valueOf(value); } else if (variable.equals("MLE")) { mle.add(Integer.valueOf(value)); } else if (variable.equals("pNonRefByAllele")) { final Allele a = Allele.create(key); log10pNonRefByAllele.put(a, Double.valueOf(value)); } else if (variable.equals("runtime.nano")) { runtimeNano = Long.valueOf(value); } else { // nothing to do } } } }
/** * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority * order, if provided. If uniqifySamples is true, the priority order is ignored and names are * created by concatenating the VC name with the sample name * * @param genomeLocParser loc parser * @param unsortedVCs collection of unsorted VCs * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs * @param filteredRecordMergeType merge type for filtered records * @param genotypeMergeOptions merge option for genotypes * @param annotateOrigin should we annotate the set it came from? * @param printMessages should we print messages? * @param setKey the key name of the set * @param filteredAreUncalled are filtered records uncalled? * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? * @return new VariantContext representing the merge of unsortedVCs */ public static VariantContext simpleMerge( final GenomeLocParser genomeLocParser, final Collection<VariantContext> unsortedVCs, final List<String> priorityListOfVCs, final FilteredRecordMergeType filteredRecordMergeType, final GenotypeMergeType genotypeMergeOptions, final boolean annotateOrigin, final boolean printMessages, final String setKey, final boolean filteredAreUncalled, final boolean mergeInfoWithMaxAC) { if (unsortedVCs == null || unsortedVCs.size() == 0) return null; if (annotateOrigin && priorityListOfVCs == null) throw new IllegalArgumentException( "Cannot merge calls and annotate their origins without a complete priority list of VariantContexts"); if (genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE) verifyUniqueSampleNames(unsortedVCs); List<VariantContext> prepaddedVCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions); // Make sure all variant contexts are padded with reference base in case of indels if necessary List<VariantContext> VCs = new ArrayList<VariantContext>(); for (VariantContext vc : prepaddedVCs) { // also a reasonable place to remove filtered calls, if needed if (!filteredAreUncalled || vc.isNotFiltered()) VCs.add(createVariantContextWithPaddedAlleles(vc, false)); } if (VCs.size() == 0) // everything is filtered out and we're filteredAreUncalled return null; // establish the baseline info from the first VC final VariantContext first = VCs.get(0); final String name = first.getSource(); final Allele refAllele = determineReferenceAllele(VCs); final Set<Allele> alleles = new LinkedHashSet<Allele>(); final Set<String> filters = new TreeSet<String>(); final Map<String, Object> attributes = new TreeMap<String, Object>(); final Set<String> inconsistentAttributes = new HashSet<String>(); final Set<String> variantSources = new HashSet< String>(); // contains the set of sources we found in our set of VCs that are variant final Set<String> rsIDs = new LinkedHashSet<String>(1); // most of the time there's one id GenomeLoc loc = getLocation(genomeLocParser, first); int depth = 0; int maxAC = -1; final Map<String, Object> attributesWithMaxAC = new TreeMap<String, Object>(); double log10PError = 1; VariantContext vcWithMaxAC = null; GenotypesContext genotypes = GenotypesContext.create(); // counting the number of filtered and variant VCs int nFiltered = 0; boolean remapped = false; // cycle through and add info from the other VCs, making sure the loc/reference matches for (VariantContext vc : VCs) { if (loc.getStart() != vc.getStart()) // || !first.getReference().equals(vc.getReference()) ) throw new ReviewedStingException( "BUG: attempting to merge VariantContexts with different start sites: first=" + first.toString() + " second=" + vc.toString()); if (getLocation(genomeLocParser, vc).size() > loc.size()) loc = getLocation(genomeLocParser, vc); // get the longest location nFiltered += vc.isFiltered() ? 1 : 0; if (vc.isVariant()) variantSources.add(vc.getSource()); AlleleMapper alleleMapping = resolveIncompatibleAlleles(refAllele, vc, alleles); remapped = remapped || alleleMapping.needsRemapping(); alleles.addAll(alleleMapping.values()); mergeGenotypes( genotypes, vc, alleleMapping, genotypeMergeOptions == GenotypeMergeType.UNIQUIFY); log10PError = Math.min(log10PError, vc.isVariant() ? vc.getLog10PError() : 1); filters.addAll(vc.getFilters()); // // add attributes // // special case DP (add it up) and ID (just preserve it) // if (vc.hasAttribute(VCFConstants.DEPTH_KEY)) depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0); if (vc.hasID()) rsIDs.add(vc.getID()); if (mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) { String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, null); // lets see if the string contains a , separator if (rawAlleleCounts.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)) { List<String> alleleCountArray = Arrays.asList( rawAlleleCounts .substring(1, rawAlleleCounts.length() - 1) .split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)); for (String alleleCount : alleleCountArray) { final int ac = Integer.valueOf(alleleCount.trim()); if (ac > maxAC) { maxAC = ac; vcWithMaxAC = vc; } } } else { final int ac = Integer.valueOf(rawAlleleCounts); if (ac > maxAC) { maxAC = ac; vcWithMaxAC = vc; } } } for (Map.Entry<String, Object> p : vc.getAttributes().entrySet()) { String key = p.getKey(); // if we don't like the key already, don't go anywhere if (!inconsistentAttributes.contains(key)) { boolean alreadyFound = attributes.containsKey(key); Object boundValue = attributes.get(key); boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4); if (alreadyFound && !boundValue.equals(p.getValue()) && !boundIsMissingValue) { // we found the value but we're inconsistent, put it in the exclude list // System.out.printf("Inconsistent INFO values: %s => %s and %s%n", key, boundValue, // p.getValue()); inconsistentAttributes.add(key); attributes.remove(key); } else if (!alreadyFound || boundIsMissingValue) { // no value // if ( vc != first ) System.out.printf("Adding key %s => %s%n", p.getKey(), // p.getValue()); attributes.put(key, p.getValue()); } } } } // if we have more alternate alleles in the merged VC than in one or more of the // original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well // as allele-dependent attributes like AC,AF for (VariantContext vc : VCs) { if (vc.alleles.size() == 1) continue; if (hasPLIncompatibleAlleles(alleles, vc.alleles)) { if (!genotypes.isEmpty()) logger.warn( String.format( "Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s", genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles)); genotypes = stripPLs(genotypes); // this will remove stale AC,AF attributed from vc calculateChromosomeCounts(vc, attributes, true); break; } } // take the VC with the maxAC and pull the attributes into a modifiable map if (mergeInfoWithMaxAC && vcWithMaxAC != null) { attributesWithMaxAC.putAll(vcWithMaxAC.getAttributes()); } // if at least one record was unfiltered and we want a union, clear all of the filters if ((filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size()) || filteredRecordMergeType == FilteredRecordMergeType.KEEP_UNCONDITIONAL) filters.clear(); if (annotateOrigin) { // we care about where the call came from String setValue; if (nFiltered == 0 && variantSources.size() == priorityListOfVCs.size()) // nothing was unfiltered setValue = MERGE_INTERSECTION; else if (nFiltered == VCs.size()) // everything was filtered out setValue = MERGE_FILTER_IN_ALL; else if (variantSources.isEmpty()) // everyone was reference setValue = MERGE_REF_IN_ALL; else { LinkedHashSet<String> s = new LinkedHashSet<String>(); for (VariantContext vc : VCs) if (vc.isVariant()) s.add(vc.isFiltered() ? MERGE_FILTER_PREFIX + vc.getSource() : vc.getSource()); setValue = Utils.join("-", s); } if (setKey != null) { attributes.put(setKey, setValue); if (mergeInfoWithMaxAC && vcWithMaxAC != null) { attributesWithMaxAC.put(setKey, vcWithMaxAC.getSource()); } } } if (depth > 0) attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth)); final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs); final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID); builder.loc(loc.getContig(), loc.getStart(), loc.getStop()); builder.alleles(alleles); builder.genotypes(genotypes); builder.log10PError(log10PError); builder.filters(filters).attributes(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes); // Trim the padded bases of all alleles if necessary VariantContext merged = createVariantContextWithTrimmedAlleles(builder.make()); if (printMessages && remapped) System.out.printf("Remapped => %s%n", merged); return merged; }