List<VcfEntry> readEntries(int idx) { // Cached? if (cachedLeafNodeIdx == idx) return cachedLeafNode; List<VcfEntry> vcfEntries = intersect[idx]; if (vcfEntries != null) return vcfEntries; try { // There might be several non-contiguous file regions int len = intersectFilePosStart[idx].length; // Read each file region vcfEntries = new ArrayList<VcfEntry>(); Set<VcfEntry> added = new HashSet<>(); for (int i = 0; i < len; i++) { if (debug) Gpr.debug( "\tintersect[" + idx + "][" + i + "]:\t[" + intersectFilePosStart[idx][i] + " , " + intersectFilePosEnd[idx][i] + " ]"); long startPos = intersectFilePosStart[idx][i]; long endPos = intersectFilePosEnd[idx][i]; // No cache? Read from file vcf.seek(startPos); // Read entries from file for (VcfEntry ve : vcf) { if (added.add(ve)) { // Make sure we add entries only once vcfEntries.add(ve); if (debug) Gpr.debug("\tParsing VcfEntry [" + vcf.getFilePointer() + "]: " + ve); } // Finished reading? if (vcf.getFilePointer() >= endPos) break; } } // Cache data if (isLeaf(idx)) { cachedLeafNodeIdx = idx; cachedLeafNode = vcfEntries; } else if (intersect[idx] == null) { // Cache non-leaf nodes, which have very few intersect entries intersect[idx] = vcfEntries; } return vcfEntries; } catch (IOException e) { throw new RuntimeException(e); } }
/** * Is this variant a duplication * * <p>Reference: http://www.hgvs.org/mutnomen/disc.html#dupins ...the description "dup" (see * Standards) may by definition only be used when the additional copy is directly 3'-flanking of * the original copy (tandem duplication) */ protected boolean isDuplication() { // --- // Simple duplications can be obtained by looking into AA.Ref / AA.Alt // --- String aaRef = variantEffect.getAaRef().toUpperCase(); String aaAlt = variantEffect.getAaAlt().toUpperCase(); // Compare to ALT sequence String dupAaRef = aaRef + aaRef; if (debug) Gpr.debug("AA.Ref: '" + aaRef + "'\tAA.Alt: '" + aaAlt); if (aaAlt.equals(dupAaRef)) return true; // --- // Duplications need to look into the protein sequence // --- // Extract sequence from genomic coordinates before variant String protein = tr.protein(); if (protein == null) return false; // Cannot calculate duplication // Calculate net amino acid change aaAlt = variantEffect.getAaNetChange(); // Get previous AA sequence int aaEnd = variantEffect.getCodonNum(); int aaStart = aaEnd - aaAlt.length(); if (aaStart < 0 || aaEnd > protein.length()) return false; aaRef = protein.substring(aaStart, aaEnd); // Compare to ALT sequence boolean dup = aaRef.equalsIgnoreCase(aaAlt); if (debug) Gpr.debug( "SEQUENCE [ " + aaStart + " , " + aaEnd + " ]: '" + aaRef + "'" // + "\n\tAA Ref : '" + variantEffect.getAaRef() + "'" // + "\n\tAA Alt : '" + variantEffect.getAaAlt() + "'" // + "\n\tAA Alt (net) : '" + aaAlt + "'" // + "\n\tDup? : " + dup); return dup; }
/** Annotate a VCF entry */ public boolean annotate(VcfEntry vcfEntry) throws IOException { boolean annotated = false; Set<String> idSet = new HashSet<>(); Map<String, String> infos = new HashMap<>(); boolean exists = false; // --- // Find all matching database entries // Note that QueryResult.variantVcfEntry can be 'null' // --- List<QueryResult> queryResults = new LinkedList<>(); Set<VcfEntry> uniqueVcfEntries = new HashSet<>(); for (Variant var : vcfEntry.variants()) { // Skip huge structural variants if (var.isStructuralHuge()) continue; // Query database Collection<VariantVcfEntry> results = query(var); // Make sure we add all found VcfEntries for (VariantVcfEntry dbEntry : results) uniqueVcfEntries.add(dbEntry.getVcfEntry()); // Add query and result QueryResult qr = new QueryResult(var, results); queryResults.add(qr); if (debug) Gpr.debug("Adding QueryResult: " + qr); } // Try to find INFO fields that we might have not seen before if (useAllInfoFields) { for (VcfEntry ve : uniqueVcfEntries) discoverInfoFields(ve); } // Add INFO fields using 'REF' data findDbInfoRef(infos, uniqueVcfEntries); // --- // Annotate all fields // --- for (QueryResult qr : queryResults) { if (debug) Gpr.debug("Processing QueryResult: " + qr); if (useId) findDbId(idSet, qr); if (existsInfoField != null) exists |= findDbExists(qr); if (useInfoFields) findDbInfo(infos, qr); } // Annotate input vcfEntry annotated |= annotateIds(vcfEntry, idSet); annotated |= annotateInfo(vcfEntry, infos); if (exists) annotateExists(vcfEntry); return annotated; }
/**
 * Append a value to the comma-separated list stored in 'info' for field 'infoFieldName'.
 *
 * <p>A null 'newValue' is ignored unless 'annotateEmpty' is set, in which case it is recorded as
 * ".".
 *
 * @param info map from INFO field name to its comma-separated values (modified in place)
 * @param infoFieldName name of the INFO field
 * @param newValue value to append, may be null
 */
void findDbInfoAddValue(Map<String, String> info, String infoFieldName, String newValue) {
  boolean missing = (newValue == null);
  if (missing && !annotateEmpty) return;

  if (debug) Gpr.debug("\tINFO:" + infoFieldName + "\tnewValue: " + newValue);

  // Start from the previous values (if any), then append the new one
  StringBuilder values = new StringBuilder();
  String previous = info.get(infoFieldName);
  if (previous != null) values.append(previous).append(',');
  values.append(missing ? "." : newValue);

  info.put(infoFieldName, values.toString());
}
/** Rare amino acid: compare effects for 'tests/rareAa.txt' on genome 'testHg3765Chr22' */
@Test
public void test_30_RareAa() {
  Gpr.debug("Test");
  CompareEffects compareEffects = new CompareEffects("testHg3765Chr22", randSeed, verbose);
  compareEffects.snpEffect("tests/rareAa.txt", null, true);
}
/** Variant after exon end: the transcript's own CDS and protein are passed as expected values */
@Test
public void test_apply_variant_09() {
  Gpr.debug("Test");

  // SNP at position 410; the exon coordinates passed to the check are 300-399
  Variant snpAfterExon = new Variant(transcript.getParent(), 410, "A", "T");

  checkApplySnp(snpAfterExon, transcript.cds(), transcript.protein(), 1, 300, 399);
}
/**
 * Collect all non-empty values of INFO field 'infoFieldName' found in the query results.
 *
 * @param infoFieldName name of the INFO field to look for
 * @param qr query result whose 'results' are scanned (null items are skipped)
 * @return comma-separated values in iteration order, or null if no value was found
 */
protected String findDbInfo(String infoFieldName, QueryResult qr) {
  if (debug) Gpr.debug("Finding DB data for INFO field: " + infoFieldName);

  StringBuilder values = new StringBuilder();
  for (VariantVcfEntry dbEntry : qr.results) {
    if (dbEntry == null) continue;

    String value = dbEntry.getVcfEntry().getInfo(infoFieldName);
    if (VcfEntry.isEmpty(value)) continue;

    if (debug) Gpr.debug("\tFound: " + value);
    if (values.length() > 0) values.append(',');
    values.append(value);
  }

  if (values.length() == 0) return null; // Nothing found
  return values.toString();
}
/** Query database and find results matching 'variant' */ protected Collection<VariantVcfEntry> query(Variant variant) { // Query database Collection<VariantVcfEntry> results = dbVcf.query(variant); // Filter results to match 'variant' List<VariantVcfEntry> list = new LinkedList<>(); for (VariantVcfEntry dbEntry : results) { if (match(variant, dbEntry)) { if (debug) Gpr.debug("dbEntry matches query\tvariant: " + variant + "\tdbEntry: " + dbEntry); list.add(dbEntry); } else { if (debug) Gpr.debug("dbEntry does NOT match query\tvariant: " + variant + "\tdbEntry: " + dbEntry); } } if (debug) Gpr.debug("Match query results: " + list.size()); return list; }
/** Query VCF entries intersecting 'marker' at node 'idx' */ protected void queryIntersects(Interval queryMarker, int idx, Markers results) { if (intersectFilePosStart[idx] == null) return; if (debug) Gpr.debug("queryIntersects\tidx: " + idx); // Read entries from disk List<VcfEntry> vcfEntries = readEntries(idx); // Find matching entries for (VcfEntry ve : vcfEntries) { // If any variant within the vcfEntry intersects the query // marker, we store this VCF entry as a result for (Variant var : ve.variants()) { if (var.intersects(queryMarker)) { if (debug) Gpr.debug("\tAdding matching result: " + ve); results.add(ve); break; // Store this entry only once } } // Past query's end coordinate? We don't need to look any further if (queryMarker.getEnd() < ve.getStart()) return; } }
/** Variant right before exon end */ @Test public void test_apply_variant_06() { Gpr.debug("Test"); Variant variant = new Variant(transcript.getParent(), 399, "G", "A"); String expectedCds = "atgtccgcaggtgaaggcatacacgctgcgcgtatactgatgttacctcgatggattttgtcagaaatatggtgcccaggacgcgaagggcatattatgg" // Exon[0] + "tgtttgggaattcacgggcacggttctgcagcaagctgaattggcagctcggcataaatcccgaccccatcgtcacgcacggatcaattcatcctcaacA" .toLowerCase() // Exon[1] + "ggtagaggaaaagcacctaacccccattgagcaggatctctttcgtaatactctgtatcgattaccgatttatttgattccccacatttatttcatcggg" // Exon[2] ; checkApplySnp(variant, expectedCds, null, 1, 300, 399); }
/**
 * Query the index for all VCF entries intersecting 'queryMarker'.
 *
 * @param queryMarker interval being queried
 * @return a new 'Markers' collection holding the entries found, searching from node 0
 */
@Override
public Markers query(Interval queryMarker) {
  if (debug) {
    String region =
        queryMarker.getChromosomeName()
            + ":"
            + queryMarker.getStart()
            + "-"
            + queryMarker.getEnd();
    Gpr.debug("Query: " + region + "\t" + queryMarker);
  }

  // Search starts at node 0 and accumulates matches in 'found'
  Markers found = new Markers();
  query(queryMarker, 0, found);
  return found;
}
/** * Query index to find all VCF entries intersecting 'marker', starting from node 'idx' Store VCF * entries in 'results' */ protected void query(Interval queryMarker, int idx, Markers results) { // Negative index? Nothing to do if (idx < 0) return; if (debug) Gpr.debug("Node: " + toString(idx) + (results.isEmpty() ? "" : "\n\tResults: " + results)); // Check all intervals intersecting queryIntersects(queryMarker, idx, results); // Recurse left or right int midPos = mid[idx]; if ((queryMarker.getEnd() < midPos) && (left[idx] >= 0)) { query(queryMarker, left[idx], results); } if ((midPos < queryMarker.getStart()) && (right[idx] >= 0)) { query(queryMarker, right[idx], results); } }
void add(String trId, String seq, int lineNum, boolean check) { // Repeated transcript Id? => Check that Protein is the same if (check && (proteinByTrId.get(trId) != null) && (!proteinByTrId.get(trId).equals(seq))) // System.err.println( "ERROR: Different protein for the same transcript ID. This should never happen!!!" // + "\n\tLine number : " + lineNum // + "\n\tTranscript ID : '" + trId + "'" // + "\n\tProtein : " + proteinByTrId.get(trId) // + "\n\tProtein (new) : " + seq // ); // Use whole trId proteinByTrId.put(trId, seq); // Add it to the hash if (debug) Gpr.debug("Adding proteinByTrId{'" + trId + "'} :\t" + seq); }
/** Read sequences from features file */ void readProteinFileFeatures(FeaturesFile featuresFile) { for (Features features : featuresFile) { String trIdPrev = null; for (Feature f : features.getFeatures()) { // Find all CDS if (f.getType() == Type.GENE) { // Clean up trId trIdPrev = null; } else if (f.getType() == Type.MRNA) { // Save trId, so that next CDS record can find it trIdPrev = f.getTranscriptId(); } else if (f.getType() == Type.CDS) { // Add CDS 'translation' record // Try using the transcript ID found in the previous record String trId = trIdPrev; if (trId == null) trId = f.getTranscriptId(); String seq = f.getAasequence(); if (debug) Gpr.debug(trId + "\t" + seq); if ((trId != null) && (seq != null)) add(trId, seq, -1, true); } } } }
public String toString( String tabs, double thresholdEntropy, double thresholdP, int thresholdCount) { if (getTotalCount() == 0) return ""; StringBuilder sb = new StringBuilder(); double p[] = p(); for (int idx = 0; idx < 4; idx++) { char base = BASES[idx]; AcgtTree n = nodes[idx]; if (n != null) { sb.append( String.format( "%s%s%s: %d\te:%4.3f\tp:%4.2f\n", tabs, name, base, counts[idx], n.entropy(), p[idx])); if (((n.entropy() <= thresholdEntropy) || (p[idx] >= thresholdP)) // && (counts[idx] >= thresholdCount) // ) { Gpr.debug( "Name:" + n.name + "\tIdx:" + +idx + "\tEntropy: " + n.entropy() + "\tP:" + p[idx] + "\tCount:" + counts[idx]); sb.append(n.toString(tabs + "\t", thresholdEntropy, thresholdP, thresholdCount)); } } } return sb.toString(); }
/** Read and parse genes file */ protected void readRefSeqFile() { try { int count = 0; BufferedReader reader = Gpr.reader(fileName); if (reader == null) return; // Error for (lineNum = 1; reader.ready(); lineNum++) { line = reader.readLine(); // Skip headers if (!line.startsWith("#")) { String fields[] = line.split("\t"); if (fields.length >= 9) { // Parse fields int fieldNum = 0; String id = fields[fieldNum++]; String chromoName = fields[fieldNum++]; boolean strandMinus = fields[fieldNum++].equals("-"); int txstart = parsePosition(fields[fieldNum++]); int txend = parsePosition(fields[fieldNum++]) - 1; // Our internal database representations of coordinates always have a // zero-based start and a one-based end (Reference: // http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1 ) int cdsStart = parsePosition(fields[fieldNum++]); int cdsEnd = parsePosition(fields[fieldNum++]) - 1; // Our internal database representations of coordinates always have a // zero-based start and a one-based end (Reference: // http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1 ) int exonCount = Gpr.parseIntSafe(fields[fieldNum++]); String exonStarts = fields[fieldNum++]; String exonEnds = fields[ fieldNum++]; // Our internal database representations of coordinates always have // a zero-based start and a one-based end (Reference: // http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1 ) String proteinId = fields[fieldNum++]; // String alignId = fields[fieldNum++]; // Not used // --- // Create // ---- Chromosome chromo = getOrCreateChromosome(chromoName); // Is it protein coding? boolean isCoding = !proteinId.isEmpty(); // Protein ID assigned? 
// Create IDs String trId = uniqueTrId(id); // Get or create gene Gene gene = findOrCreateGene(proteinId, trId, chromo, txstart, txend, strandMinus, isCoding); // Create transcript Transcript tr = new Transcript(gene, txstart, txend, strandMinus, trId); tr.setProteinCoding(isCoding); add(tr); // Add Exons and CDS String exStartStr[] = exonStarts.split(","); String exEndStr[] = exonEnds.split(","); for (int i = 0; i < exonCount; i++) { // Exons int exStart = parsePosition(exStartStr[i]); int exEnd = parsePosition(exEndStr[i]) - 1; // Our internal database representations of coordinates always have a // zero-based start and a one-based end (Reference: // http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1 ) String exId = trId + ".ex." + (i + 1); Exon ex = new Exon(tr, exStart, exEnd, strandMinus, exId, i); add(ex); // CDS (ony if intersects) if ((exStart <= cdsEnd) && (exEnd >= cdsStart)) { Cds cds = new Cds( tr, Math.max(cdsStart, exStart), Math.min(cdsEnd, exEnd), strandMinus, exId); add(cds); } } count++; if (count % MARK == 0) System.out.print('.'); if (count % (100 * MARK) == 0) System.out.print("\n\t"); } } } reader.close(); } catch (Exception e) { Gpr.debug("Offending line (lineNum: " + lineNum + "): '" + line + "'"); throw new RuntimeException(e); } }