/**
 * Accumulates result statistics for one test sequence.
 *
 * <p>The predicted labelling is stored in {@code labeled} (paired with the input sequence of
 * {@code training}) and the running counters / contingency tables are updated by comparing the
 * prediction with the hidden states returned by {@code training.getY(i)}:
 *
 * <ul>
 *   <li>{@code correct} / {@code incorrect}: per-position agreement;
 *   <li>{@code perfect} / {@code imperfect}: whether the whole sequence was predicted exactly;
 *   <li>{@code ctCodingNucleotide}: incremented twice per position, once with the
 *       {@code isCodingPlus} flags and once with the {@code isCodingMinus} flags;
 *   <li>{@code ctStates}: one table per state index in {@code [0, nStates)};
 *   <li>{@code ctTransitions}: one table per (fromInd, toInd) pair, evaluated on adjacent positions;
 *   <li>{@code ctExons}: updated from the exon range maps of both strands.
 * </ul>
 *
 * @param training the reference sequence supplying the input and the actual hidden states
 * @param predictedHiddenSequence predicted hidden state per position; must have the same length
 *     as {@code training}
 */
public void calcResultIncrement(TrainingSequence training, int[] predictedHiddenSequence) {
  // Validate before touching any accumulated state, so a length mismatch cannot leave a
  // prediction recorded in `labeled` without the matching statistics.
  Assert.a(training.length() == predictedHiddenSequence.length);

  // Results are built up incrementally: the labelled sequence is kept alongside the stats.
  labeled.add(new TrainingSequence(training.getInputSequence(), predictedHiddenSequence));

  int[] actualHiddenSequence = new int[training.length()];
  for (int i = 0; i < training.length(); i++) {
    actualHiddenSequence[i] = training.getY(i);
  }

  // Per-position statistics.
  boolean thisperfect = true;
  for (int i = 0; i < predictedHiddenSequence.length; ++i) {
    int predY = predictedHiddenSequence[i];
    int realY = actualHiddenSequence[i];
    if (realY == predY) {
      correct += 1;
    } else {
      incorrect += 1;
      thisperfect = false;
    }
    ctCodingNucleotide.increment(isCodingPlus(predY), isCodingPlus(realY));
    ctCodingNucleotide.increment(isCodingMinus(predY), isCodingMinus(realY));
    for (int s = 0; s < nStates; s++) {
      ctStates.get(s).increment((predY == s), (realY == s));
    }
  }
  if (thisperfect) {
    perfect++;
  } else {
    imperfect++;
  }

  // Transition statistics: compare each adjacent pair of positions against every tracked
  // (from, to) transition.
  for (int i = 1; i < predictedHiddenSequence.length; ++i) {
    int predY = predictedHiddenSequence[i];
    int realY = actualHiddenSequence[i];
    int predYp = predictedHiddenSequence[i - 1];
    int realYp = actualHiddenSequence[i - 1];
    for (int t = 0; t < nTransitions; t++) {
      boolean bPred = ((predYp == fromInd.get(t)) && (predY == toInd.get(t)));
      boolean bReal = ((realYp == fromInd.get(t)) && (realY == toInd.get(t)));
      ctTransitions.get(t).increment(bPred, bReal);
    }
  }

  // Now let's increment the contingency table for exons; note that here not counting TN's
  RangeMap predExonsPlus = new RangeMap();
  RangeMap predExonsMinus = new RangeMap();
  RangeMap realExonsPlus = new RangeMap();
  RangeMap realExonsMinus = new RangeMap();
  makeExonRangeMapFrom13SV(predictedHiddenSequence, predExonsPlus, predExonsMinus);
  makeExonRangeMapFrom13SV(actualHiddenSequence, realExonsPlus, realExonsMinus);
  incrementCTFromRangeMaps(ctExons, predExonsPlus, realExonsPlus);
  incrementCTFromRangeMaps(ctExons, predExonsMinus, realExonsMinus);
}
private void parseSeqName(TrainingSequence seq, int seqNum) { NameInputSequence nameInput = null; InputSequence<?> inputSeq = seq.getInputSequence(); if (inputSeq instanceof InputSequenceComposite) { nameInput = (NameInputSequence) inputSeq.getComponent("name"); } if (nameInput == null) { log.debug( "Sequence name not specified. Setting sequence name to 'SEQ_" + String.valueOf(seqNum) + "'"); seqName = "SEQ_" + String.valueOf(seqNum); // Create a name and return. genePrefix = "SEQ_" + String.valueOf(seqNum); offset = 0; return; } String name = nameInput.getName().trim(); int colon1, colon2, numColons; if (name.startsWith("group:") || name.startsWith("seq:")) { numColons = numOccurrences(name, ':'); if (numColons == 1) { colon1 = name.indexOf(":"); seqName = name; genePrefix = name.substring(colon1 + 1, name.length()); offset = 0; return; } else if (numColons == 2) { colon1 = name.indexOf(":"); colon2 = name.lastIndexOf(":"); seqName = name.substring(0, colon2); genePrefix = name.substring(colon1 + 1, colon2); int pound = genePrefix.indexOf("#"); if (pound > 0) { genePrefix = genePrefix.substring(0, pound); } setOffset(name.substring(colon2 + 1, name.length())); return; } } log.debug( "Sequence name is in unexpected format. Setting offset=0 and sequence name='" + name + "'."); seqName = name; genePrefix = name; offset = 0; }
// This function converts a 13 state model hidden sequence to a GTF file. public void writeGTF(List<? extends TrainingSequence<?>> refStates, String filename) throws IOException { int ref, geneNum, seqCount, frame = -1; long i, exonStart, exonEnd, end; boolean inPlusExon, inMinusExon, firstExon, startCodonSplit; String strand; Writer fout = new BufferedWriter(new FileWriter(filename)); exonStart = exonEnd = 0; geneNum = 1; seqCount = 0; // Determine if model is tricycle13 or interval13. boolean interval13 = false; int prevState, state; for (TrainingSequence seq : refStates) { if (seq.length() == 0) continue; prevState = seq.getY(0); for (i = 1; i < seq.length(); i++) { state = seq.getY((int) i); if (prevState == 0 && (state == 2 || state == 3 || state == 7 || state == 8)) { interval13 = true; break; } prevState = state; } if (interval13) break; } for (TrainingSequence seq : refStates) { if (interval13) { SequenceConverter.convertSeqFromInterval13ToTricycle13(seq); } inPlusExon = false; inMinusExon = false; firstExon = true; startCodonSplit = false; parseSeqName(seq, seqCount); for (i = 0; i < seq.length(); i++) { ref = seq.getY((int) i); if (ref == 1 || ref == 2 || ref == 3) // in a plus exon { if (!inPlusExon) { exonStart = i + 1; inPlusExon = true; frame = setFrame(ref); } } else if (ref == 7 || ref == 8 || ref == 9) // in a minus exon { if (!inMinusExon) { exonStart = i + 1; inMinusExon = true; frame = setFrame(ref); if (firstExon) { if (i < 3) System.err.println( "Minus strand gene start is within 3 nucleotides of sequence start. 
No stop codon writen to GTF for gene starting at position " + (exonStart + offset)); else writeGFTLine( fout, seqName, "stop_codon", exonStart + offset - 3, exonStart + offset - 1, "-", frame, genePrefix, geneNum); } } } else if (inPlusExon && (ref == 4 || ref == 5 || ref == 6)) { // just ended an exon on plus strand, now in a plus intron strand = "+"; inPlusExon = false; exonEnd = i; if (firstExon) { if (exonEnd - exonStart + 1 < 3) { end = exonEnd + offset; startCodonSplit = true; } else { end = exonStart + offset + 2; } writeGFTLine( fout, seqName, "start_codon", exonStart + offset, end, strand, frame, genePrefix, geneNum); firstExon = false; } else if (startCodonSplit) { // at second exon that contains part of start codon Assert.a(frame == 1 || frame == 2); writeGFTLine( fout, seqName, "start_codon", exonStart + offset, exonStart + offset + frame - 1, strand, frame, genePrefix, geneNum); startCodonSplit = false; } writeGFTLine( fout, seqName, "CDS", exonStart + offset, exonEnd + offset, strand, frame, genePrefix, geneNum); } else if (inMinusExon && (ref == 10 || ref == 11 || ref == 12)) { // just ended an exon on minus strand, now in a minus intron strand = "-"; inMinusExon = false; firstExon = false; exonEnd = i; writeGFTLine( fout, seqName, "CDS", exonStart + offset, exonEnd + offset, strand, frame, genePrefix, geneNum); } else // now in intergenic region { boolean write = true; if (inPlusExon) // was in gene at previous nucleotide { strand = "+"; exonEnd = i; if (firstExon) { if (exonEnd - exonStart + 1 < 3) { System.err.println( "Single '" + strand + "' strand exon is < 3 bases for sequence '" + seqName + "'. 
exonStart=" + exonStart + " exonEnd=" + exonEnd); write = false; } else { writeGFTLine( fout, seqName, "start_codon", exonStart + offset, exonStart + offset + 2, strand, frame, genePrefix, geneNum); } } else if (startCodonSplit) { // at second exon that contains part of start codon Assert.a(frame == 1 || frame == 2); writeGFTLine( fout, seqName, "start_codon", exonStart + offset, exonStart + offset + frame - 1, strand, frame, genePrefix, geneNum); } if (write) { writeGFTLine( fout, seqName, "CDS", exonStart + offset, exonEnd + offset, strand, frame, genePrefix, geneNum); writeGFTLine( fout, seqName, "stop_codon", exonEnd + offset + 1, exonEnd + offset + 3, strand, 0, genePrefix, geneNum); } inPlusExon = false; firstExon = true; startCodonSplit = false; geneNum++; } else if (inMinusExon) { strand = "-"; long prevExonEnd = exonEnd; exonEnd = i; if (firstExon && exonEnd - exonStart + 1 < 3) { System.err.println( "Single '" + strand + "' strand exon is < 3 bases for sequence '" + seqName + "'. 
exonStart=" + exonStart + " exonEnd=" + exonEnd); } else if (exonEnd - exonStart + 1 < 3) { // this exon is < 3 bases, need to split start codon if (exonEnd - exonStart + 1 == 2) { // this exon is 2 bases writeGFTLine( fout, seqName, "start_codon", prevExonEnd + offset, prevExonEnd + offset, strand, 0, genePrefix, geneNum); writeGFTLine( fout, seqName, "CDS", exonStart + offset, exonEnd + offset, strand, frame, genePrefix, geneNum); writeGFTLine( fout, seqName, "start_codon", exonStart + offset, exonEnd + offset, strand, 2, genePrefix, geneNum); } else if (exonEnd - exonStart + 1 == 1) { // this exon is 1 base writeGFTLine( fout, seqName, "start_codon", prevExonEnd + offset - 1, prevExonEnd + offset, strand, 0, genePrefix, geneNum); writeGFTLine( fout, seqName, "CDS", exonStart + offset, exonEnd + offset, strand, frame, genePrefix, geneNum); writeGFTLine( fout, seqName, "start_codon", exonStart + offset, exonEnd + offset, strand, 1, genePrefix, geneNum); } } else { writeGFTLine( fout, seqName, "CDS", exonStart + offset, exonEnd + offset, strand, frame, genePrefix, geneNum); writeGFTLine( fout, seqName, "start_codon", exonEnd + offset - 2, exonEnd + offset, strand, 0, genePrefix, geneNum); } inMinusExon = false; firstExon = true; startCodonSplit = false; geneNum++; } } } seqCount++; } fout.close(); }