/** Create a list of variants from this VcfEntry */ public List<Variant> variants() { if (variants != null) return variants; // Create list of variants variants = new LinkedList<>(); // Create one Variant for each ALT Chromosome chr = (Chromosome) parent; if (!isVariant()) { // Not a variant? List<Variant> vars = variants(chr, start, ref, null, id); String alt = "."; // Add original 'ALT' field as genotype for (Variant variant : vars) variant.setGenotype(alt); variants.addAll(vars); } else { // At least one variant for (String alt : alts) { if (!isVariant(alt)) alt = null; List<Variant> vars = variants(chr, start, ref, alt, id); variants.addAll(vars); } } return variants; }
/** Annotate a VCF entry */ public boolean annotate(VcfEntry vcfEntry) throws IOException { boolean annotated = false; Set<String> idSet = new HashSet<>(); Map<String, String> infos = new HashMap<>(); boolean exists = false; // --- // Find all matching database entries // Note that QueryResult.variantVcfEntry can be 'null' // --- List<QueryResult> queryResults = new LinkedList<>(); Set<VcfEntry> uniqueVcfEntries = new HashSet<>(); for (Variant var : vcfEntry.variants()) { // Skip huge structural variants if (var.isStructuralHuge()) continue; // Query database Collection<VariantVcfEntry> results = query(var); // Make sure we add all found VcfEntries for (VariantVcfEntry dbEntry : results) uniqueVcfEntries.add(dbEntry.getVcfEntry()); // Add query and result QueryResult qr = new QueryResult(var, results); queryResults.add(qr); if (debug) Gpr.debug("Adding QueryResult: " + qr); } // Try to find INFO fields that we might have not seen before if (useAllInfoFields) { for (VcfEntry ve : uniqueVcfEntries) discoverInfoFields(ve); } // Add INFO fields using 'REF' data findDbInfoRef(infos, uniqueVcfEntries); // --- // Annotate all fields // --- for (QueryResult qr : queryResults) { if (debug) Gpr.debug("Processing QueryResult: " + qr); if (useId) findDbId(idSet, qr); if (existsInfoField != null) exists |= findDbExists(qr); if (useInfoFields) findDbInfo(infos, qr); } // Annotate input vcfEntry annotated |= annotateIds(vcfEntry, idSet); annotated |= annotateInfo(vcfEntry, infos); if (exists) annotateExists(vcfEntry); return annotated; }
/** Does database entry 'dbVcfEntry' match 'variant'? */ protected boolean match(Variant var, VariantVcfEntry dbEntry) { // Do coordinates match? if (var.getChromosomeName().equals(dbEntry.getChromosomeName()) // && var.getStart() == dbEntry.getStart() // && var.getEnd() == dbEntry.getEnd() // ) { if (useRefAlt) { // Compare Ref & Alt if (var.getReference().equalsIgnoreCase(dbEntry.getReference()) // && var.getAlt().equalsIgnoreCase(dbEntry.getAlt()) // ) return true; } else { // No need to use Ref & Alt, it's a match return true; } } return false; }
/** Query VCF entries intersecting 'marker' at node 'idx' */ protected void queryIntersects(Interval queryMarker, int idx, Markers results) { if (intersectFilePosStart[idx] == null) return; if (debug) Gpr.debug("queryIntersects\tidx: " + idx); // Read entries from disk List<VcfEntry> vcfEntries = readEntries(idx); // Find matching entries for (VcfEntry ve : vcfEntries) { // If any variant within the vcfEntry intersects the query // marker, we store this VCF entry as a result for (Variant var : ve.variants()) { if (var.intersects(queryMarker)) { if (debug) Gpr.debug("\tAdding matching result: " + ve); results.add(ve); break; // Store this entry only once } } // Past query's end coordinate? We don't need to look any further if (queryMarker.getEnd() < ve.getStart()) return; } }
/** Create a variant */ List<Variant> variants(Chromosome chromo, int start, String reference, String alt, String id) { List<Variant> list = null; if (alt != null) alt = alt.toUpperCase(); if (alt == null || alt.isEmpty() || alt.equals(reference)) { // Non-variant list = Variant.factory(chromo, start, reference, null, id, false); } else if (alt.charAt(0) == '<') { // Structural variants if (alt.startsWith("<DEL")) { // Case: Deletion // 2 321682 . T <DEL> 6 PASS // IMPRECISE;SVTYPE=DEL;END=321887;SVLEN=-105;CIPOS=-56,20;CIEND=-10,62 String ch = ref; int startNew = start; if (end > start) { startNew = start + reference.length(); int size = end - startNew + 1; char change[] = new char[size]; for (int i = 0; i < change.length; i++) change[i] = reference.length() > i ? reference.charAt(i) : 'N'; ch = new String(change); } list = Variant.factory(chromo, startNew, ch, "", id, false); } else if (alt.startsWith("<INV")) { // Inversion int startNew = start + reference.length(); Variant var = new Variant(chromo, startNew, end, id); var.setVariantType(VariantType.INV); list = new LinkedList<>(); list.add(var); } else if (alt.startsWith("<DUP")) { // Duplication int startNew = start + reference.length(); Variant var = new Variant(chromo, startNew, end, id); var.setVariantType(VariantType.DUP); list = new LinkedList<>(); list.add(var); } } else if ((alt.indexOf('[') >= 0) || (alt.indexOf(']') >= 0)) { // Translocations // Parse ALT string boolean left = alt.indexOf(']') >= 0; String sep = (left ? "\\]" : "\\["); String tpos[] = alt.split(sep); String pos = tpos[1]; boolean before = (alt.indexOf(']') == 0) || (alt.indexOf('[') == 0); String altBases = (before ? tpos[2] : tpos[0]); // Parse 'chr:start' String posSplit[] = pos.split(":"); String trChrName = posSplit[0]; Chromosome trChr = chromo.getGenome().getOrCreateChromosome(trChrName); int trStart = Gpr.parseIntSafe(posSplit[1]) - 1; VariantBnd var = new VariantBnd(chromo, start, ref, altBases, trChr, trStart, left, before); list = new LinkedList<>(); list.add(var); } else if (reference.length() == alt.length()) { // Case: SNP, MNP if (reference.length() == 1) { // SNPs // 20 3 . C G . PASS DP=100 list = Variant.factory(chromo, start, reference, alt, id, true); } else { // MNPs // 20 3 . TC AT . PASS DP=100 // Sometimes the first bases are the same and we can trim them int startDiff = Integer.MAX_VALUE; for (int i = 0; i < reference.length(); i++) if (reference.charAt(i) != alt.charAt(i)) startDiff = Math.min(startDiff, i); // MNPs // Sometimes the last bases are the same and we can trim them int endDiff = 0; for (int i = reference.length() - 1; i >= 0; i--) if (reference.charAt(i) != alt.charAt(i)) endDiff = Math.max(endDiff, i); String newRef = reference.substring(startDiff, endDiff + 1); String newAlt = alt.substring(startDiff, endDiff + 1); list = Variant.factory(chromo, start + startDiff, newRef, newAlt, id, true); } } else { // Short Insertions, Deletions or Mixed Variants (substitutions) VcfRefAltAlign align = new VcfRefAltAlign(alt, reference); align.align(); int startDiff = align.getOffset(); switch (align.getVariantType()) { case DEL: // Case: Deletion // 20 2 . TC T . PASS DP=100 // 20 2 . AGAC AAC . PASS DP=100 String ref = ""; String ch = align.getAlignment(); if (!ch.startsWith("-")) throw new RuntimeException( "Deletion '" + ch + "' does not start with '-'. This should never happen!"); list = Variant.factory(chromo, start + startDiff, ref, ch, id, true); break; case INS: // Case: Insertion of A { tC ; tCA } tC is the reference allele // 20 2 . TC TCA . PASS DP=100 ch = align.getAlignment(); ref = ""; if (!ch.startsWith("+")) throw new RuntimeException( "Insertion '" + ch + "' does not start with '+'. This should never happen!"); list = Variant.factory(chromo, start + startDiff, ref, ch, id, true); break; case MIXED: // Case: Mixed variant (substitution) reference = reference.substring(startDiff); alt = alt.substring(startDiff); list = Variant.factory(chromo, start + startDiff, reference, alt, id, true); break; default: // Other change type? throw new RuntimeException( "Unsupported VCF change type '" + align.getVariantType() + "'\n\tRef: " + reference + "'\n\tAlt: '" + alt + "'\n\tVcfEntry: " + this); } } // --- // Add original 'ALT' field as genotype // --- if (list == null) list = new LinkedList<>(); for (Variant variant : list) variant.setGenotype(alt); return list; }