List<VcfEntry> readEntries(int idx) { // Cached? if (cachedLeafNodeIdx == idx) return cachedLeafNode; List<VcfEntry> vcfEntries = intersect[idx]; if (vcfEntries != null) return vcfEntries; try { // There might be several non-contiguous file regions int len = intersectFilePosStart[idx].length; // Read each file region vcfEntries = new ArrayList<VcfEntry>(); Set<VcfEntry> added = new HashSet<>(); for (int i = 0; i < len; i++) { if (debug) Gpr.debug( "\tintersect[" + idx + "][" + i + "]:\t[" + intersectFilePosStart[idx][i] + " , " + intersectFilePosEnd[idx][i] + " ]"); long startPos = intersectFilePosStart[idx][i]; long endPos = intersectFilePosEnd[idx][i]; // No cache? Read from file vcf.seek(startPos); // Read entries from file for (VcfEntry ve : vcf) { if (added.add(ve)) { // Make sure we add entries only once vcfEntries.add(ve); if (debug) Gpr.debug("\tParsing VcfEntry [" + vcf.getFilePointer() + "]: " + ve); } // Finished reading? if (vcf.getFilePointer() >= endPos) break; } } // Cache data if (isLeaf(idx)) { cachedLeafNodeIdx = idx; cachedLeafNode = vcfEntries; } else if (intersect[idx] == null) { // Cache non-leaf nodes, which have very few intersect entries intersect[idx] = vcfEntries; } return vcfEntries; } catch (IOException e) { throw new RuntimeException(e); } }
/** * Is this variant a duplication * * <p>Reference: http://www.hgvs.org/mutnomen/disc.html#dupins ...the description "dup" (see * Standards) may by definition only be used when the additional copy is directly 3'-flanking of * the original copy (tandem duplication) */ protected boolean isDuplication() { // --- // Simple duplications can be obtained by looking into AA.Ref / AA.Alt // --- String aaRef = variantEffect.getAaRef().toUpperCase(); String aaAlt = variantEffect.getAaAlt().toUpperCase(); // Compare to ALT sequence String dupAaRef = aaRef + aaRef; if (debug) Gpr.debug("AA.Ref: '" + aaRef + "'\tAA.Alt: '" + aaAlt); if (aaAlt.equals(dupAaRef)) return true; // --- // Duplications need to look into the protein sequence // --- // Extract sequence from genomic coordinates before variant String protein = tr.protein(); if (protein == null) return false; // Cannot calculate duplication // Calculate net amino acid change aaAlt = variantEffect.getAaNetChange(); // Get previous AA sequence int aaEnd = variantEffect.getCodonNum(); int aaStart = aaEnd - aaAlt.length(); if (aaStart < 0 || aaEnd > protein.length()) return false; aaRef = protein.substring(aaStart, aaEnd); // Compare to ALT sequence boolean dup = aaRef.equalsIgnoreCase(aaAlt); if (debug) Gpr.debug( "SEQUENCE [ " + aaStart + " , " + aaEnd + " ]: '" + aaRef + "'" // + "\n\tAA Ref : '" + variantEffect.getAaRef() + "'" // + "\n\tAA Alt : '" + variantEffect.getAaAlt() + "'" // + "\n\tAA Alt (net) : '" + aaAlt + "'" // + "\n\tDup? : " + dup); return dup; }
/** Annotate a VCF entry */ public boolean annotate(VcfEntry vcfEntry) throws IOException { boolean annotated = false; Set<String> idSet = new HashSet<>(); Map<String, String> infos = new HashMap<>(); boolean exists = false; // --- // Find all matching database entries // Note that QueryResult.variantVcfEntry can be 'null' // --- List<QueryResult> queryResults = new LinkedList<>(); Set<VcfEntry> uniqueVcfEntries = new HashSet<>(); for (Variant var : vcfEntry.variants()) { // Skip huge structural variants if (var.isStructuralHuge()) continue; // Query database Collection<VariantVcfEntry> results = query(var); // Make sure we add all found VcfEntries for (VariantVcfEntry dbEntry : results) uniqueVcfEntries.add(dbEntry.getVcfEntry()); // Add query and result QueryResult qr = new QueryResult(var, results); queryResults.add(qr); if (debug) Gpr.debug("Adding QueryResult: " + qr); } // Try to find INFO fields that we might have not seen before if (useAllInfoFields) { for (VcfEntry ve : uniqueVcfEntries) discoverInfoFields(ve); } // Add INFO fields using 'REF' data findDbInfoRef(infos, uniqueVcfEntries); // --- // Annotate all fields // --- for (QueryResult qr : queryResults) { if (debug) Gpr.debug("Processing QueryResult: " + qr); if (useId) findDbId(idSet, qr); if (existsInfoField != null) exists |= findDbExists(qr); if (useInfoFields) findDbInfo(infos, qr); } // Annotate input vcfEntry annotated |= annotateIds(vcfEntry, idSet); annotated |= annotateInfo(vcfEntry, infos); if (exists) annotateExists(vcfEntry); return annotated; }
/**
 * Append a value to the comma-separated list stored in 'info' for field 'infoFieldName'.
 *
 * <p>A null 'newValue' is ignored unless 'annotateEmpty' is set, in which case "." is appended.
 *
 * @param info map from INFO field name to comma-separated values (updated in place)
 * @param infoFieldName INFO field to update
 * @param newValue value to append; may be null
 */
void findDbInfoAddValue(Map<String, String> info, String infoFieldName, String newValue) {
  boolean missing = (newValue == null);
  if (missing && !annotateEmpty) return;

  if (debug) Gpr.debug("\tINFO:" + infoFieldName + "\tnewValue: " + newValue);

  // Missing values are represented as '.'
  String addition = missing ? "." : newValue;

  String previous = info.get(infoFieldName);
  String merged = (previous == null ? "" : previous + ",") + addition;
  info.put(infoFieldName, merged);
}
/** Variant after exon end: the check is made against the transcript's own CDS and protein */
@Test
public void test_apply_variant_09() {
  Gpr.debug("Test");

  int variantPos = 410;
  Variant variant = new Variant(transcript.getParent(), variantPos, "A", "T");

  checkApplySnp(variant, transcript.cds(), transcript.protein(), 1, 300, 399);
}
/** Rare amino acid: compare effects for 'tests/rareAa.txt' on genome 'testHg3765Chr22' */
@Test
public void test_30_RareAa() {
  Gpr.debug("Test");
  CompareEffects comp = new CompareEffects("testHg3765Chr22", randSeed, verbose);
  comp.snpEffect("tests/rareAa.txt", null, true);
}
/**
 * Collect the non-empty values of INFO field 'infoFieldName' from all results of a query.
 *
 * @param infoFieldName INFO field to look up in each database entry
 * @param qr query result whose 'results' are scanned (null elements are skipped)
 * @return the values joined by ',', or null if no non-empty value was found
 */
protected String findDbInfo(String infoFieldName, QueryResult qr) {
  if (debug) Gpr.debug("Finding DB data for INFO field: " + infoFieldName);

  StringBuilder joined = new StringBuilder();
  for (VariantVcfEntry dbEntry : qr.results) {
    if (dbEntry == null) continue;

    String value = dbEntry.getVcfEntry().getInfo(infoFieldName);
    if (VcfEntry.isEmpty(value)) continue;

    if (debug) Gpr.debug("\tFound: " + value);
    if (joined.length() > 0) joined.append(',');
    joined.append(value);
  }

  if (joined.length() <= 0) return null;
  return joined.toString();
}
/** Parse a 'line' from a 'vcfFileIterator' */ public void parse() { // Parse line String fields[] = line.split("\t", 10); // Only pare the fist 9 fields (i.e. do not parse genotypes) // Is line OK? if (fields.length >= 4) { // Chromosome and position. VCF files are one-base, so inOffset should be 1. chromosomeName = fields[0].trim(); // Chromosome Chromosome chromo = vcfFileIterator.getChromosome(chromosomeName); parent = chromo; vcfFileIterator.sanityCheckChromo(chromosomeName, chromo); // Sanity check // Start start = vcfFileIterator.parsePosition(vcfFileIterator.readField(fields, 1)); // ID (e.g. might indicate dbSnp) id = vcfFileIterator.readField(fields, 2); // REF ref = vcfFileIterator.readField(fields, 3).toUpperCase(); // Reference and change strandMinus = false; // Strand is always positive (defined in VCF spec.) // ALT altStr = vcfFileIterator.readField(fields, 4).toUpperCase(); parseAlts(altStr); // Quality String qStr = vcfFileIterator.readField(fields, 5); if (!qStr.isEmpty()) quality = Gpr.parseDoubleSafe(qStr); else quality = null; // Filter filter = vcfFileIterator.readField(fields, 6); // Filter parameters // INFO fields infoStr = vcfFileIterator.readField(fields, 7); info = null; // Start & End coordinates are anchored to the reference genome, thus based on REF field (ALT // is not taken into account) parseEnd(altStr); // Genotype format format = null; if (fields.length > 8) format = vcfFileIterator.readField( fields, 8); // This field is optional, So it can be null or EMPTY ('.') // Add genotype fields (lazy parse) if (fields.length > 9) genotypeFieldsStr = fields[9]; } else throw new RuntimeException( "Impropper VCF entry: Not enough fields (missing tab separators?).\n" + line); }
/** Query database and find results matching 'variant' */ protected Collection<VariantVcfEntry> query(Variant variant) { // Query database Collection<VariantVcfEntry> results = dbVcf.query(variant); // Filter results to match 'variant' List<VariantVcfEntry> list = new LinkedList<>(); for (VariantVcfEntry dbEntry : results) { if (match(variant, dbEntry)) { if (debug) Gpr.debug("dbEntry matches query\tvariant: " + variant + "\tdbEntry: " + dbEntry); list.add(dbEntry); } else { if (debug) Gpr.debug("dbEntry does NOT match query\tvariant: " + variant + "\tdbEntry: " + dbEntry); } } if (debug) Gpr.debug("Match query results: " + list.size()); return list; }
/** Parse genotype string (sparse matrix) and set all entries using 'value' */ void parseSparseGt(String str, byte gt[], int valueInt) { if ((str == null) || (str.isEmpty()) || (str.equals("true"))) return; // Split comma separated indeces String idxs[] = str.split(","); byte value = (byte) valueInt; // Set all entries for (String idx : idxs) { int i = Gpr.parseIntSafe(idx); gt[i] = value; } }
/** Query VCF entries intersecting 'marker' at node 'idx' */ protected void queryIntersects(Interval queryMarker, int idx, Markers results) { if (intersectFilePosStart[idx] == null) return; if (debug) Gpr.debug("queryIntersects\tidx: " + idx); // Read entries from disk List<VcfEntry> vcfEntries = readEntries(idx); // Find matching entries for (VcfEntry ve : vcfEntries) { // If any variant within the vcfEntry intersects the query // marker, we store this VCF entry as a result for (Variant var : ve.variants()) { if (var.intersects(queryMarker)) { if (debug) Gpr.debug("\tAdding matching result: " + ve); results.add(ve); break; // Store this entry only once } } // Past query's end coordinate? We don't need to look any further if (queryMarker.getEnd() < ve.getStart()) return; } }
@Override public String toString() { return "Annotate VCF db:\n" // + "\n\tannotateEmpty :" + annotateEmpty // + "\n\texistsInfoField :" + existsInfoField // + "\n\tprependInfoFieldName :" + prependInfoFieldName // + "\n\tuseRefAlt :" + useRefAlt // + "\n\tdbVcf:\n" + Gpr.prependEachLine("\t\t", dbVcf) // ; }
/** Variant right before exon end */ @Test public void test_apply_variant_06() { Gpr.debug("Test"); Variant variant = new Variant(transcript.getParent(), 399, "G", "A"); String expectedCds = "atgtccgcaggtgaaggcatacacgctgcgcgtatactgatgttacctcgatggattttgtcagaaatatggtgcccaggacgcgaagggcatattatgg" // Exon[0] + "tgtttgggaattcacgggcacggttctgcagcaagctgaattggcagctcggcataaatcccgaccccatcgtcacgcacggatcaattcatcctcaacA" .toLowerCase() // Exon[1] + "ggtagaggaaaagcacctaacccccattgagcaggatctctttcgtaatactctgtatcgattaccgatttatttgattccccacatttatttcatcggg" // Exon[2] ; checkApplySnp(variant, expectedCds, null, 1, 300, 399); }
/**
 * Query the index for all VCF entries intersecting 'queryMarker'.
 *
 * @param queryMarker interval being queried
 * @return a new 'Markers' collection filled by searching from node 0
 */
@Override
public Markers query(Interval queryMarker) {
  if (debug) {
    String coordinates =
        queryMarker.getChromosomeName()
            + ":"
            + queryMarker.getStart()
            + "-"
            + queryMarker.getEnd();
    Gpr.debug("Query: " + coordinates + "\t" + queryMarker);
  }

  Markers results = new Markers();
  query(queryMarker, 0, results);
  return results;
}
void add(String trId, String seq, int lineNum, boolean check) { // Repeated transcript Id? => Check that Protein is the same if (check && (proteinByTrId.get(trId) != null) && (!proteinByTrId.get(trId).equals(seq))) // System.err.println( "ERROR: Different protein for the same transcript ID. This should never happen!!!" // + "\n\tLine number : " + lineNum // + "\n\tTranscript ID : '" + trId + "'" // + "\n\tProtein : " + proteinByTrId.get(trId) // + "\n\tProtein (new) : " + seq // ); // Use whole trId proteinByTrId.put(trId, seq); // Add it to the hash if (debug) Gpr.debug("Adding proteinByTrId{'" + trId + "'} :\t" + seq); }
/** * Query index to find all VCF entries intersecting 'marker', starting from node 'idx' Store VCF * entries in 'results' */ protected void query(Interval queryMarker, int idx, Markers results) { // Negative index? Nothing to do if (idx < 0) return; if (debug) Gpr.debug("Node: " + toString(idx) + (results.isEmpty() ? "" : "\n\tResults: " + results)); // Check all intervals intersecting queryIntersects(queryMarker, idx, results); // Recurse left or right int midPos = mid[idx]; if ((queryMarker.getEnd() < midPos) && (left[idx] >= 0)) { query(queryMarker, left[idx], results); } if ((midPos < queryMarker.getStart()) && (right[idx] >= 0)) { query(queryMarker, right[idx], results); } }
/** * Read Proteins from a file Format: Tab-separated format, containing "sequence \t transcriptId" */ void readProteinFileTxt() { // Load file String proteinData = Gpr.readFile(proteinFile); String proteinLines[] = proteinData.split("\n"); // Parse each line int lineNum = 1; for (String proteinLine : proteinLines) { // Split tab separated fields String field[] = proteinLine.split("\\s+"); // Parse fields if (field.length >= 2) { // OK Parse fields String seq = field[1].trim(); String trId = field[0].trim(); add(trId, seq, lineNum, true); } lineNum++; } }
/** Read sequences from features file */ void readProteinFileFeatures(FeaturesFile featuresFile) { for (Features features : featuresFile) { String trIdPrev = null; for (Feature f : features.getFeatures()) { // Find all CDS if (f.getType() == Type.GENE) { // Clean up trId trIdPrev = null; } else if (f.getType() == Type.MRNA) { // Save trId, so that next CDS record can find it trIdPrev = f.getTranscriptId(); } else if (f.getType() == Type.CDS) { // Add CDS 'translation' record // Try using the transcript ID found in the previous record String trId = trIdPrev; if (trId == null) trId = f.getTranscriptId(); String seq = f.getAasequence(); if (debug) Gpr.debug(trId + "\t" + seq); if ((trId != null) && (seq != null)) add(trId, seq, -1, true); } } } }
public String toString( String tabs, double thresholdEntropy, double thresholdP, int thresholdCount) { if (getTotalCount() == 0) return ""; StringBuilder sb = new StringBuilder(); double p[] = p(); for (int idx = 0; idx < 4; idx++) { char base = BASES[idx]; AcgtTree n = nodes[idx]; if (n != null) { sb.append( String.format( "%s%s%s: %d\te:%4.3f\tp:%4.2f\n", tabs, name, base, counts[idx], n.entropy(), p[idx])); if (((n.entropy() <= thresholdEntropy) || (p[idx] >= thresholdP)) // && (counts[idx] >= thresholdCount) // ) { Gpr.debug( "Name:" + n.name + "\tIdx:" + +idx + "\tEntropy: " + n.entropy() + "\tP:" + p[idx] + "\tCount:" + counts[idx]); sb.append(n.toString(tabs + "\t", thresholdEntropy, thresholdP, thresholdCount)); } } } return sb.toString(); }
/** Read and parse genes file */ protected void readRefSeqFile() { try { int count = 0; BufferedReader reader = Gpr.reader(fileName); if (reader == null) return; // Error for (lineNum = 1; reader.ready(); lineNum++) { line = reader.readLine(); // Skip headers if (!line.startsWith("#")) { String fields[] = line.split("\t"); if (fields.length >= 9) { // Parse fields int fieldNum = 0; String id = fields[fieldNum++]; String chromoName = fields[fieldNum++]; boolean strandMinus = fields[fieldNum++].equals("-"); int txstart = parsePosition(fields[fieldNum++]); int txend = parsePosition(fields[fieldNum++]) - 1; // Our internal database representations of coordinates always have a // zero-based start and a one-based end (Reference: // http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1 ) int cdsStart = parsePosition(fields[fieldNum++]); int cdsEnd = parsePosition(fields[fieldNum++]) - 1; // Our internal database representations of coordinates always have a // zero-based start and a one-based end (Reference: // http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1 ) int exonCount = Gpr.parseIntSafe(fields[fieldNum++]); String exonStarts = fields[fieldNum++]; String exonEnds = fields[ fieldNum++]; // Our internal database representations of coordinates always have // a zero-based start and a one-based end (Reference: // http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1 ) String proteinId = fields[fieldNum++]; // String alignId = fields[fieldNum++]; // Not used // --- // Create // ---- Chromosome chromo = getOrCreateChromosome(chromoName); // Is it protein coding? boolean isCoding = !proteinId.isEmpty(); // Protein ID assigned? 
// Create IDs String trId = uniqueTrId(id); // Get or create gene Gene gene = findOrCreateGene(proteinId, trId, chromo, txstart, txend, strandMinus, isCoding); // Create transcript Transcript tr = new Transcript(gene, txstart, txend, strandMinus, trId); tr.setProteinCoding(isCoding); add(tr); // Add Exons and CDS String exStartStr[] = exonStarts.split(","); String exEndStr[] = exonEnds.split(","); for (int i = 0; i < exonCount; i++) { // Exons int exStart = parsePosition(exStartStr[i]); int exEnd = parsePosition(exEndStr[i]) - 1; // Our internal database representations of coordinates always have a // zero-based start and a one-based end (Reference: // http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1 ) String exId = trId + ".ex." + (i + 1); Exon ex = new Exon(tr, exStart, exEnd, strandMinus, exId, i); add(ex); // CDS (ony if intersects) if ((exStart <= cdsEnd) && (exEnd >= cdsStart)) { Cds cds = new Cds( tr, Math.max(cdsStart, exStart), Math.min(cdsEnd, exEnd), strandMinus, exId); add(cds); } } count++; if (count % MARK == 0) System.out.print('.'); if (count % (100 * MARK) == 0) System.out.print("\n\t"); } } } reader.close(); } catch (Exception e) { Gpr.debug("Offending line (lineNum: " + lineNum + "): '" + line + "'"); throw new RuntimeException(e); } }
/**
 * Get an INFO field as a 'double' number. The norm specifies the data type as 'FLOAT', that is
 * why the name of this method might be not intuitive.
 *
 * @param key INFO field name
 * @return the parsed value, or NaN when the field is not present
 */
public double getInfoFloat(String key) {
  // INFO is parsed on first use
  if (info == null) parseInfo();

  String raw = info.get(key);
  return (raw == null) ? Double.NaN : Gpr.parseDoubleSafe(raw);
}
/** Create a variant */ List<Variant> variants(Chromosome chromo, int start, String reference, String alt, String id) { List<Variant> list = null; if (alt != null) alt = alt.toUpperCase(); if (alt == null || alt.isEmpty() || alt.equals(reference)) { // Non-variant list = Variant.factory(chromo, start, reference, null, id, false); } else if (alt.charAt(0) == '<') { // Structural variants if (alt.startsWith("<DEL")) { // Case: Deletion // 2 321682 . T <DEL> 6 PASS // IMPRECISE;SVTYPE=DEL;END=321887;SVLEN=-105;CIPOS=-56,20;CIEND=-10,62 String ch = ref; int startNew = start; if (end > start) { startNew = start + reference.length(); int size = end - startNew + 1; char change[] = new char[size]; for (int i = 0; i < change.length; i++) change[i] = reference.length() > i ? reference.charAt(i) : 'N'; ch = new String(change); } list = Variant.factory(chromo, startNew, ch, "", id, false); } else if (alt.startsWith("<INV")) { // Inversion int startNew = start + reference.length(); Variant var = new Variant(chromo, startNew, end, id); var.setVariantType(VariantType.INV); list = new LinkedList<>(); list.add(var); } else if (alt.startsWith("<DUP")) { // Duplication int startNew = start + reference.length(); Variant var = new Variant(chromo, startNew, end, id); var.setVariantType(VariantType.DUP); list = new LinkedList<>(); list.add(var); } } else if ((alt.indexOf('[') >= 0) || (alt.indexOf(']') >= 0)) { // Translocations // Parse ALT string boolean left = alt.indexOf(']') >= 0; String sep = (left ? "\\]" : "\\["); String tpos[] = alt.split(sep); String pos = tpos[1]; boolean before = (alt.indexOf(']') == 0) || (alt.indexOf('[') == 0); String altBases = (before ? 
tpos[2] : tpos[0]); // Parse 'chr:start' String posSplit[] = pos.split(":"); String trChrName = posSplit[0]; Chromosome trChr = chromo.getGenome().getOrCreateChromosome(trChrName); int trStart = Gpr.parseIntSafe(posSplit[1]) - 1; VariantBnd var = new VariantBnd(chromo, start, ref, altBases, trChr, trStart, left, before); list = new LinkedList<>(); list.add(var); } else if (reference.length() == alt.length()) { // Case: SNP, MNP if (reference.length() == 1) { // SNPs // 20 3 . C G . PASS DP=100 list = Variant.factory(chromo, start, reference, alt, id, true); } else { // MNPs // 20 3 . TC AT . PASS DP=100 // Sometimes the first bases are the same and we can trim them int startDiff = Integer.MAX_VALUE; for (int i = 0; i < reference.length(); i++) if (reference.charAt(i) != alt.charAt(i)) startDiff = Math.min(startDiff, i); // MNPs // Sometimes the last bases are the same and we can trim them int endDiff = 0; for (int i = reference.length() - 1; i >= 0; i--) if (reference.charAt(i) != alt.charAt(i)) endDiff = Math.max(endDiff, i); String newRef = reference.substring(startDiff, endDiff + 1); String newAlt = alt.substring(startDiff, endDiff + 1); list = Variant.factory(chromo, start + startDiff, newRef, newAlt, id, true); } } else { // Short Insertions, Deletions or Mixed Variants (substitutions) VcfRefAltAlign align = new VcfRefAltAlign(alt, reference); align.align(); int startDiff = align.getOffset(); switch (align.getVariantType()) { case DEL: // Case: Deletion // 20 2 . TC T . PASS DP=100 // 20 2 . AGAC AAC . PASS DP=100 String ref = ""; String ch = align.getAlignment(); if (!ch.startsWith("-")) throw new RuntimeException( "Deletion '" + ch + "' does not start with '-'. This should never happen!"); list = Variant.factory(chromo, start + startDiff, ref, ch, id, true); break; case INS: // Case: Insertion of A { tC ; tCA } tC is the reference allele // 20 2 . TC TCA . 
PASS DP=100 ch = align.getAlignment(); ref = ""; if (!ch.startsWith("+")) throw new RuntimeException( "Insertion '" + ch + "' does not start with '+'. This should never happen!"); list = Variant.factory(chromo, start + startDiff, ref, ch, id, true); break; case MIXED: // Case: Mixed variant (substitution) reference = reference.substring(startDiff); alt = alt.substring(startDiff); list = Variant.factory(chromo, start + startDiff, reference, alt, id, true); break; default: // Other change type? throw new RuntimeException( "Unsupported VCF change type '" + align.getVariantType() + "'\n\tRef: " + reference + "'\n\tAlt: '" + alt + "'\n\tVcfEntry: " + this); } } // --- // Add original 'ALT' field as genotype // --- if (list == null) list = new LinkedList<>(); for (Variant variant : list) variant.setGenotype(alt); return list; }
/**
 * Get an INFO field as a 'long' number. The norm specifies the data type as 'INT', that is why
 * the name of this method might be not intuitive.
 *
 * @param key INFO field name
 * @return the parsed value, or 0 when the field is not present
 */
public long getInfoInt(String key) {
  // INFO is parsed on first use
  if (info == null) parseInfo();

  String raw = info.get(key);
  return (raw == null) ? 0 : Gpr.parseLongSafe(raw);
}