/**
 * Record one observation of an amino acid sequence.
 *
 * The counter is looked up (or created on first use) under the key built
 * from {@code category}, {@code contrVoc} and {@code description}, and the
 * entry for {@code sequence} is then incremented via {@code CountByType.inc}.
 *
 * @param category    first component passed to {@code key(...)}
 * @param contrVoc    second component passed to {@code key(...)}
 * @param description third component passed to {@code key(...)}
 * @param sequence    amino acid sequence whose counter is incremented
 */
void countAaSequence(String category, String contrVoc, String description, String sequence) {
	countAaSequenceByType.getOrCreate(key(category, contrVoc, description)).inc(sequence);
}
/** Show annotations counters in a table */ void analyzeSequenceConservation() { if (verbose) Timer.showStdErr( "Sequence conservation analysis." // + "\n\tAA sequence length : " + 1 // + "\n\tMin AA count : " + HIGHLY_CONSERVED_AA_COUNT // + "\n\tMin AA conservation : " + HIGHLY_CONSERVED_AA_PERCENT // ); ArrayList<String> keys = new ArrayList<String>(); keys.addAll(countAaSequenceByType.keySet()); Collections.sort(keys); // Show title StringBuilder title = new StringBuilder(); for (char aa : GprSeq.AMINO_ACIDS) title.append(aa + "\t"); title.append("\t" + title); if (verbose) System.out.println( "Amino acid regions:\n\tTotal\tMax count\tAvg len\tConservation\tCatergory\tControlled Vocabulary\t" + title + "\tOther AA sequences:"); // Show AA counts for each 'key' for (String key : keys) { long seqLen = 0, totalSeqs = 0, maxCount = 0; CountByType cbt = countAaSequenceByType.get(key); long total = cbt.sum(); boolean highlyConservedAaSequence = false; StringBuilder sb = new StringBuilder(); // For each single amino acid "sequence" for (char aa : GprSeq.AMINO_ACIDS) { long count = cbt.get("" + aa); if (count > 0) { seqLen += 1 * count; totalSeqs += count; maxCount = Math.max(maxCount, count); sb.append(count); double perc = ((double) count) / total; // We estimate that if most AA are the same, then changing this AA can cause a high impact // in protein coding if ((perc > HIGHLY_CONSERVED_AA_PERCENT) && (total >= HIGHLY_CONSERVED_AA_COUNT)) highlyConservedAaSequence = true; } sb.append("\t"); } // Sequences of more than one AA for (String aas : cbt.keySet()) { long count = cbt.get(aas); double perc = ((double) count) / total; if (aas.length() > 1) { seqLen += aas.length() * count; totalSeqs += count; maxCount = Math.max(maxCount, count); sb.append(String.format("\t" + aas + ":" + count)); if ((perc > HIGHLY_CONSERVED_AA_PERCENT) && (total >= HIGHLY_CONSERVED_AA_COUNT)) highlyConservedAaSequence = true; } } long avgLen = seqLen / totalSeqs; // Show line if (verbose) 
System.out.println( // "\t" + total // + "\t" + maxCount // + "\t" + avgLen // + "\t" + (highlyConservedAaSequence ? "High" : "") // + "\t" + key // + "\t" + sb // ); // Mark highly conserved if (highlyConservedAaSequence) { int count = 0; for (Marker m : markers) { NextProt nextProt = (NextProt) m; if (m.getId().equals(key)) { nextProt.setHighlyConservedAaSequence(true); count++; } } if (verbose) Timer.showStdErr( "NextProt " + count + " markers type '" + key + "' marked as highly conserved AA sequence"); } } }