/** * Gather data from transcript * * @param isoformRef * @param aaStart * @param aaEnd * @param sequence * @param subSeq * @return */ TranscriptData transcriptData( String isoformRef, int aaStart, int aaEnd, String sequence, String subSeq) { String trId = trIdByUniqueName.get(isoformRef); TranscriptData trData = new TranscriptData(); if (trId != null) { Transcript tr = trById.get(trId); if (tr != null) { trData.tr = tr; String protein = tr.protein(); // Remove trailing stop codon ('*') if (!protein.isEmpty() && (protein.charAt(protein.length() - 1) == '*')) protein = protein.substring(0, protein.length() - 1); // Sanity check: Do protein sequences match? if (protein.equals(sequence)) { proteinOk.add(trId); if ((aaStart >= 0) && (aaEnd >= aaStart)) { // Try to map to chromosome position int cdsBase2Pos[] = tr.cdsBaseNumber2ChrPos(); int codonStart = aaStart * 3; int codonEnd = (aaEnd + 1) * 3 - 1; if (tr.isStrandPlus()) { trData.chrPosStart = cdsBase2Pos[codonStart]; trData.chrPosEnd = cdsBase2Pos[codonEnd]; } else { trData.chrPosStart = cdsBase2Pos[codonEnd]; trData.chrPosEnd = cdsBase2Pos[codonStart]; } trData.chrName = tr.getChromosomeName(); // More sanity checks trData.codon = tr.cds().substring(codonStart, codonEnd + 1); trData.aa = CodonTables.getInstance().aa(trData.codon, genome, trData.chrName); if (!subSeq.equals(trData.aa)) Timer.showStdErr( "WARNING: AA differ: " // + "\tUniqueName" + isoformRef // + "\tEnsembl ID: " + trId // + "\tEnsembl AA: " + trData.aa // + "\tNextProt AA:" + subSeq // + "\n"); else trData.ok = true; // All sanity checks passed } } else { if (!proteinDifferences.contains(trId)) Timer.showStdErr( "WARNING: Protein sequences differ: " // + "\tUniqueName" + isoformRef // + "\tEnsembl ID: " + trId // + "\n\tEnsembl (" + protein.length() + "): " + protein // + "\n\tNextProt (" + sequence.length() + "): " + sequence // + "\n"); proteinDifferences.add(trId); } } } return trData; }
/** Compare all CDS */ double cdsCompare() { int i = 1; if (verbose) { // Show labels System.err.println("\tLabels:"); System.err.println("\t\t'+' : OK"); System.err.println("\t\t'.' : Missing"); System.err.println("\t\t'*' : Error"); System.err.print("\t"); } // Compare all genes for (Gene gint : config.getGenome().getGenes()) for (Transcript tint : gint) { String cds = tint.cds().toUpperCase(); String mRna = tint.mRna().toUpperCase(); String cdsReference = cdsByTrId.get(tint.getId()); if (cdsReference != null) cdsReference = cdsReference.toUpperCase(); if (cdsReference == null) { if (debug) System.err.println( "\nWARNING:Cannot find reference CDS for transcript '" + tint.getId() + "'"); else if (verbose) System.out.print('.'); totalNotFound++; } else if (cds.isEmpty()) { if (debug) System.err.println("\nWARNING:Empty CDS for transcript '" + tint.getId() + "'"); else if (verbose) System.out.print('.'); totalNotFound++; } else if (cds.equals(cdsReference)) { totalOk++; if (verbose) System.out.print('+'); // Sanity check: Start and stop codons if ((cds != null) && (cds.length() >= 3)) { CodonTable ctable = CodonTables.getInstance().getTable(config.getGenome(), tint.getChromosomeName()); // Check start codon String startCodon = cds.substring(0, 3); if (!ctable.isStart(startCodon)) { if (debug) System.err.println( "\nWARNING: CDS for transcript '" + tint.getId() + "' does not start with a start codon:\t" + startCodon + "\t" + cds); totalWarnings++; } // Check stop codon String stopCodon = cds.substring(cds.length() - 3, cds.length()); if (!ctable.isStop(stopCodon)) { if (debug) System.err.println( "\nWARNING: CDS for transcript '" + tint.getId() + "' does not end with a stop codon:\t" + stopCodon + "\t" + cds); totalWarnings++; } } } else if (mRna.equals(cdsReference)) { // May be the file has mRNA instead of CDS? totalOk++; if (verbose) System.out.print('+'); } else if ((mRna.length() < cdsReference .length()) // CDS longer than mRNA? May be it is actually an mRNA + poly-A tail // (instead of a CDS) && cdsReference .substring(mRna.length()) .replace('A', ' ') .trim() .isEmpty() // May be it is an mRNA and it has a ploy-A tail added && cdsReference.substring(0, mRna.length()).equals(mRna) // Compare cutting poly-A tail ) { // OK, it was a mRNA + polyA totalOk++; if (verbose) System.out.print('+'); } else if ((mRna.length() > cdsReference.length()) // PolyA in the reference? && mRna.substring(cdsReference.length()).replace('A', ' ').trim().isEmpty() // && mRna.substring(0, cdsReference.length()).equals(mRna) // ) { // OK, it was a mRNA + polyA totalOk++; if (verbose) System.out.print('+'); } else { if (debug || onlyOneError) { // Create a string indicating differences String diffMrna = SnpEffCmdProtein.diffStr(mRna, cdsReference); int diffMrnaCount = SnpEffCmdProtein.diffCount(mRna, cdsReference); String diffCds = SnpEffCmdProtein.diffStr(cds, cdsReference); int diffCdsCount = SnpEffCmdProtein.diffCount(cds, cdsReference); System.err.println( "\nERROR:CDS do not match for transcript " + tint.getId() + "\tStrand:" + tint.getStrand() + "\tExons: " + tint.numChilds()); if (diffMrnaCount < diffCdsCount) { System.err.println( String.format("\tsnpEff mRNA (%6d) : '%s'", mRna.length(), mRna.toLowerCase())); System.err.println( String.format("\tdiff (%6d) : '%s'", diffMrnaCount, diffMrna)); } else { System.err.println( String.format("\tsnpEff CDS (%6d) : '%s'", cds.length(), cds.toLowerCase())); System.err.println( String.format("\tdiff (%6d) : '%s'", diffCdsCount, diffCds)); } System.err.println( String.format( "\tReference (%6d) : '%s'", cdsReference.length(), cdsReference.toLowerCase())); System.err.println("Transcript details:\n" + tint); if (onlyOneError) { System.err.println("Transcript details:\n" + tint); throw new RuntimeException("DIE"); } } else if (verbose) System.out.print('*'); totalErrors++; } // Show a mark if (verbose && (i % 100 == 0)) System.out.print("\n\t"); i++; } double perc = ((double) totalErrors) / ((double) (totalErrors + totalOk)); System.out.println( "\n\tCDS check:\t" + config.getGenome().getVersion() + "\tOK: " + totalOk + "\tWarnings: " + totalWarnings + "\tNot found: " + totalNotFound + "\tErrors: " + totalErrors + "\tError percentage: " + (100 * perc) + "%"); return perc; }