void checkVarTypeField(String file, String varTypeExpected) { SnpSiftCmdVarType varType = new SnpSiftCmdVarType(null); VcfFileIterator vcf = new VcfFileIterator(file); for (VcfEntry ve : vcf) { // Annotate varType.annotate(ve); // Check that all variants are the ones expected String varTypeAnnotated = ve.getInfo(SnpSiftCmdVarType.VARTYPE); Assert.assertEquals(varTypeExpected, varTypeAnnotated); } }
/** Add ID information. Make sure we are no repeating IDs */ protected boolean annotateIds(VcfEntry vcfEntry, Set<String> idSet) { if (idSet.isEmpty()) return false; // Add IDs, make sure we are no repeating them // Get unique IDs (the ones not already present in vcf.id) boolean annotated = false; String id = uniqueIds(idSet, vcfEntry.getId()); if (!id.isEmpty()) { // Skip if no new ids found annotated = true; // Add ID if (!vcfEntry.getId().isEmpty()) id = vcfEntry.getId() + ";" + id; vcfEntry.setId(id); } return annotated; }
/**
 * Find matching entries in the database.
 *
 * @param vcfEntry entry whose variants are looked up one by one
 * @return the concatenated results of {@code find(Variant)} for every variant of the entry
 */
public List<VcfEntry> find(VcfEntry vcfEntry) {
    List<VcfEntry> matches = new LinkedList<>();
    for (Variant variant : vcfEntry.variants()) {
        matches.addAll(find(variant));
    }
    return matches;
}
/** Check info field Note: We report the first error we find */ String checkInfo(String infoName) { if (infoName.isEmpty()) return ""; VcfHeaderInfo vcfInfo = getVcfInfo(infoName); if (vcfInfo == null) return "Cannot find header for INFO field '" + infoName + "'"; // Split INFO value and match it to allele String valsStr = getInfo(infoName); if (valsStr == null) return ""; // INFO field not present, nothing to do // Check values String values[] = valsStr.split(","); for (String val : values) if (!VcfEntry.isValidInfoValue(val)) return "INFO filed '" + infoName + "' has an invalid value '" + val + "' (no spaces, tabs, '=' or ';' are allowed)"; // Check number of INFO elements if (vcfInfo.isNumberNumber() && vcfInfo.getNumber() != values.length) { VcfInfoType type = vcfInfo.getVcfInfoType(); if (type == VcfInfoType.Flag && values.length == 1) ; // OK, flags must have one or zero values else return "INFO filed '" + infoName + "' has 'Number=" + vcfInfo.getNumber() + "' in header, but it contains '" + values.length + "' elements."; } if (vcfInfo.isNumberAllAlleles() && values.length != (alts.length + 1)) return "INFO filed '" + infoName + "' has 'Number=R' in header, but it contains '" + values.length + "' elements when there are '" + alts.length + "' alleles (it should have '" + (alts.length + 1) + "' elements)."; if (vcfInfo.isNumberAllAlleles() && values.length != alts.length) return "INFO filed '" + infoName + "' has 'Number=A' in header, but it contains '" + values.length + "' elements when there are '" + alts.length + "' alleles."; return ""; }
/** Annotate a VCF entry */ public boolean annotate(VcfEntry vcfEntry) throws IOException { boolean annotated = false; Set<String> idSet = new HashSet<>(); Map<String, String> infos = new HashMap<>(); boolean exists = false; // --- // Find all matching database entries // Note that QueryResult.variantVcfEntry can be 'null' // --- List<QueryResult> queryResults = new LinkedList<>(); Set<VcfEntry> uniqueVcfEntries = new HashSet<>(); for (Variant var : vcfEntry.variants()) { // Skip huge structural variants if (var.isStructuralHuge()) continue; // Query database Collection<VariantVcfEntry> results = query(var); // Make sure we add all found VcfEntries for (VariantVcfEntry dbEntry : results) uniqueVcfEntries.add(dbEntry.getVcfEntry()); // Add query and result QueryResult qr = new QueryResult(var, results); queryResults.add(qr); if (debug) Gpr.debug("Adding QueryResult: " + qr); } // Try to find INFO fields that we might have not seen before if (useAllInfoFields) { for (VcfEntry ve : uniqueVcfEntries) discoverInfoFields(ve); } // Add INFO fields using 'REF' data findDbInfoRef(infos, uniqueVcfEntries); // --- // Annotate all fields // --- for (QueryResult qr : queryResults) { if (debug) Gpr.debug("Processing QueryResult: " + qr); if (useId) findDbId(idSet, qr); if (existsInfoField != null) exists |= findDbExists(qr); if (useInfoFields) findDbInfo(infos, qr); } // Annotate input vcfEntry annotated |= annotateIds(vcfEntry, idSet); annotated |= annotateInfo(vcfEntry, infos); if (exists) annotateExists(vcfEntry); return annotated; }
/** * Check that all variants in a file belong to a given type * * @param file * @param varTypeExpected */ void checkAllVarType(String file, String varTypeExpected) { SnpSiftCmdVarType varType = new SnpSiftCmdVarType(null); VcfFileIterator vcf = new VcfFileIterator(file); for (VcfEntry ve : vcf) { // Annotate varType.annotate(ve); // Check that all variants are the ones expected if (verbose) System.out.println( ve // + "\n\tvarTypeExpected: " + varTypeExpected // + "\n\tINFO flag : " + ve.getInfoFlag(varTypeExpected) // ); if (!ve.getInfoFlag(varTypeExpected)) System.err.println("Eror in file '" + file + "':\n" + ve); Assert.assertEquals(true, ve.getInfoFlag(varTypeExpected)); } }
/** Add INFO fields. */ protected boolean annotateInfo(VcfEntry vcfEntry, Map<String, String> info) { if (info == null || info.isEmpty()) return false; // Sort keys alphabetically ArrayList<String> keys = new ArrayList<String>(); keys.addAll(info.keySet()); Collections.sort(keys); // Add keys sorted alphabetically for (String key : keys) { String value = info.get(key); // Skip empty fields? if (!annotateEmpty && VcfEntry.isEmpty(value)) continue; // Add INFO entry if (prependInfoFieldName != null) key = prependInfoName(key); vcfEntry.addInfo(key, value); } return true; }
/** Fill values for INFO fields requiring 'REF' value */ protected void findDbInfoRef(Map<String, String> info, Set<VcfEntry> uniqueVcfEntries) { if (!useInfoFields || !hasVcfInfoPerAlleleRef) return; // Nothing to do for (String infoFieldName : infoFields) { // Does this field require 'REF' annotation? if (!isVcfInfoPerAlleleRef(infoFieldName)) continue; // Try to find 'REF' information in any entry String val = null; for (VcfEntry dbVcfEntry : uniqueVcfEntries) { val = dbVcfEntry.getInfo(infoFieldName, dbVcfEntry.getRef()); if (VcfEntry.isEmpty(val)) val = null; // Only add non-empty else break; // We need only one value } // Nothing found? Use 'MISSING' value if (val == null) val = VcfFileIterator.MISSING; // Store value info.put(infoFieldName, val); } }
/** Query VCF entries intersecting 'marker' at node 'idx' */ protected void queryIntersects(Interval queryMarker, int idx, Markers results) { if (intersectFilePosStart[idx] == null) return; if (debug) Gpr.debug("queryIntersects\tidx: " + idx); // Read entries from disk List<VcfEntry> vcfEntries = readEntries(idx); // Find matching entries for (VcfEntry ve : vcfEntries) { // If any variant within the vcfEntry intersects the query // marker, we store this VCF entry as a result for (Variant var : ve.variants()) { if (var.intersects(queryMarker)) { if (debug) Gpr.debug("\tAdding matching result: " + ve); results.add(ve); break; // Store this entry only once } } // Past query's end coordinate? We don't need to look any further if (queryMarker.getEnd() < ve.getStart()) return; } }
/**
 * Find all non-empty INFO fields 'infoFieldName' in results.
 *
 * @param infoFieldName name of the INFO field to collect
 * @param qr query result whose entries are inspected
 * @return the non-empty values joined by ',' in result order, or {@code null} if there are none
 */
protected String findDbInfo(String infoFieldName, QueryResult qr) {
    if (debug) Gpr.debug("Finding DB data for INFO field: " + infoFieldName);

    StringBuilder joined = new StringBuilder();
    for (VariantVcfEntry result : qr.results) {
        // Null results are skipped
        if (result == null) {
            continue;
        }

        String value = result.getVcfEntry().getInfo(infoFieldName);
        if (VcfEntry.isEmpty(value)) {
            continue;
        }

        if (debug) Gpr.debug("\tFound: " + value);
        if (joined.length() > 0) {
            joined.append(',');
        }
        joined.append(value);
    }

    if (joined.length() > 0) {
        return joined.toString();
    }
    return null;
}
/** Find the first non-empty INFO field 'infoFieldName' in results Note: ALT must match */ protected String findDbInfoAlt(String infoFieldName, QueryResult qr) { for (VariantVcfEntry varVe : qr.results) { if (varVe != null) { // IMPORTANT: When a variant is parse, the original 'ALT' entry is stored in // the 'Variant.genotype' whereas 'variant.alt' contains // a 'minimal ALT'. E.g. if we have // vcfEntry.ref = 'AC' // vcfEntry.alt = 'A' // Then // variant.ref = 'C' // variant.alt = '' // variant.genotype = 'A' <-- This is the 'original' ALT field from // vcfEntry // That's why we use 'var.getGenotype()' in the following 'getInfo()' method. String vcfAlt = qr.variant.getGenotype(); String val = varVe.getVcfEntry().getInfo(infoFieldName, vcfAlt); if (!VcfEntry.isEmpty(val)) return val; } } return VcfFileIterator.MISSING; }
/** * If 'ALL' info fields are being used, we can try to discover new fields that have not already * been added to the annotation list (e.g. implicit fields not mentioned in the VCF header) */ protected void discoverInfoFields(VcfEntry dbVcfEntry) { // Make sure all fields are added for (String info : dbVcfEntry.getInfoKeys()) if (!info.isEmpty()) infoFields.add(info); }
/**
 * Add 'exists' flag to INFO fields.
 *
 * Adds an INFO entry keyed by {@code existsInfoField} with a {@code null} value
 * (presumably a null value is how a value-less flag is represented -- confirm
 * against {@code VcfEntry.addInfo}).
 *
 * @param vcfEntry entry that receives the flag
 */
protected void annotateExists(VcfEntry vcfEntry) {
    vcfEntry.addInfo(existsInfoField, null);
}