/**
 * Returns a new sequence holding the elements from position {@code start} through position
 * {@code end}. Both positions are 1-based and inclusive, so the result has
 * {@code end - start + 1} elements.
 *
 * @param start 1-based position of the first element to copy; asserted to be at least 1
 * @param end 1-based position of the last element to copy; asserted to be no greater than
 *     {@code length()} and no smaller than {@code start}
 * @return a new {@code InputSequenceObject} wrapping a freshly allocated copy of the range
 */
public InputSequence<T> subSequence(int start, int end) {
  Assert.a(start >= 1);
  Assert.a(end <= this.length());
  Assert.a(start <= end);

  final int count = end - start + 1;
  // A generic array cannot be instantiated directly, hence the Object[] allocation and cast.
  T[] copy = (T[]) new Object[count];
  // src walks the backing array (0-based), dst walks the copy.
  for (int src = start - 1, dst = 0; dst < count; ++src, ++dst) {
    copy[dst] = t[src];
  }
  return new InputSequenceObject<T>(copy);
}
// Frame is the nmber of bases in this region befor you get in frame. // That is, if frame is 0, the first three bases in this element are a codon. // If frame is 1, the first base is the end of a codon hanging over from the // end of the previous codon and the next three are the first codon in this feature. // If frame is 2, the first two bases are the end of the previous codon and the // next three are the first codon in this feature. private static int setFrame(int ref) { int frame = -1; switch (ref) { case 1: frame = 0; break; case 2: frame = 2; break; case 3: frame = 1; break; case 7: frame = 1; break; case 8: frame = 2; break; case 9: frame = 0; break; default: Assert.a(false, "Error setting frame, ref = ", ref); } return frame; }
/**
 * Tells whether a hidden state is one of the minus-strand coding states (7, 8 or 9).
 *
 * @param y hidden-state number; asserted to lie in the range 0..12
 * @return {@code true} when {@code y} is 7, 8 or 9
 */
private boolean isCodingMinus(int y) {
  Assert.a((y >= 0) && (y < 13));
  return y >= 7 && y <= 9;
}
/**
 * Tells whether a hidden state is one of the plus-strand coding states (1, 2 or 3).
 *
 * @param y hidden-state number; asserted to lie in the range 0..12
 * @return {@code true} when {@code y} is 1, 2 or 3
 */
private boolean isCodingPlus(int y) {
  Assert.a((y >= 0) && (y < 13));
  return y >= 1 && y <= 3;
}
/**
 * Folds the prediction for one test sequence into the running evaluation statistics.
 *
 * <p>The method records the predicted labeling in {@code labeled} and then updates, in order:
 * the per-position correct/incorrect counters, the coding-nucleotide contingency table (once
 * for the plus-strand test and once for the minus-strand test at every position), the
 * per-state contingency tables, the perfect/imperfect sequence counters, the per-transition
 * contingency tables, and finally the exon contingency table built from range maps of the
 * predicted and actual paths (true negatives are not counted for exons).
 *
 * <p>Results accumulate across calls; nothing is reset here.
 *
 * @param training the test sequence, supplying the input sequence and the actual hidden states
 * @param predictedHiddenSequence the predicted hidden state for every position; asserted to
 *     have the same length as {@code training}
 */
public void calcResultIncrement(TrainingSequence training, int[] predictedHiddenSequence) {
  // Check the precondition before touching any accumulated state, so a length mismatch cannot
  // leave a labeled sequence recorded without its matching statistics.
  Assert.a(training.length() == predictedHiddenSequence.length);

  // NOTE(review): this appears to be the only place in the visible code that adds to `labeled`.
  labeled.add(new TrainingSequence(training.getInputSequence(), predictedHiddenSequence));

  // Snapshot the actual hidden path into a plain array.
  int[] actualHiddenSequence = new int[training.length()];
  for (int i = 0; i < training.length(); i++) {
    actualHiddenSequence[i] = training.getY(i);
  }

  // Per-position statistics.
  boolean thisperfect = true;
  for (int i = 0; i < predictedHiddenSequence.length; ++i) {
    int predY = predictedHiddenSequence[i];
    int realY = actualHiddenSequence[i];

    if (realY == predY) {
      correct += 1;
    } else {
      incorrect += 1;
      thisperfect = false;
    }

    // Each position contributes twice to the coding table: once per strand.
    ctCodingNucleotide.increment(isCodingPlus(predY), isCodingPlus(realY));
    ctCodingNucleotide.increment(isCodingMinus(predY), isCodingMinus(realY));

    for (int s = 0; s < nStates; s++) {
      ctStates.get(s).increment((predY == s), (realY == s));
    }
  }

  // Whole-sequence statistic: a sequence is perfect only if every position matched.
  if (thisperfect) {
    perfect++;
  } else {
    imperfect++;
  }

  // Per-transition statistics over every adjacent pair of positions.
  for (int i = 1; i < predictedHiddenSequence.length; ++i) {
    int predY = predictedHiddenSequence[i];
    int realY = actualHiddenSequence[i];
    int predYp = predictedHiddenSequence[i - 1];
    int realYp = actualHiddenSequence[i - 1];
    for (int t = 0; t < nTransitions; t++) {
      boolean bPred = ((predYp == fromInd.get(t)) && (predY == toInd.get(t)));
      boolean bReal = ((realYp == fromInd.get(t)) && (realY == toInd.get(t)));
      ctTransitions.get(t).increment(bPred, bReal);
    }
  }

  // Exon-level statistics; note that true negatives are not counted here.
  RangeMap predExonsPlus = new RangeMap();
  RangeMap predExonsMinus = new RangeMap();
  RangeMap realExonsPlus = new RangeMap();
  RangeMap realExonsMinus = new RangeMap();
  makeExonRangeMapFrom13SV(predictedHiddenSequence, predExonsPlus, predExonsMinus);
  makeExonRangeMapFrom13SV(actualHiddenSequence, realExonsPlus, realExonsMinus);
  incrementCTFromRangeMaps(ctExons, predExonsPlus, realExonsPlus);
  incrementCTFromRangeMaps(ctExons, predExonsMinus, realExonsMinus);
}
public void train(int startingIndex, ModelManager modelInfo, List data) { Assert.a(allFeatureTypes.size() > 0, "No features types have been assigned."); Assert.a(startIndexes == null, "FeatureManager has already been trained."); startIx = startingIndex; // Train each of the individual FeatureManagers and calculate offsets startIndexes = new int[allFeatureTypes.size()]; totalFeatures = 0; for (int i = 0; i < startIndexes.length; ++i) { startIndexes[i] = totalFeatures + startIx; FeatureManager fm = allFeatureTypes.get(i); List compData = fm.getInputComponent() == null ? data : new ComponentList(data, fm.getInputComponent()); fm.train(totalFeatures, modelInfo, compData); totalFeatures += fm.getNumFeatures(); } }
/**
 * Asks each explicit-length edge feature manager, in list order, to evaluate its features for
 * the given segment, accumulating into {@code result}. Evaluation stops as soon as
 * {@code result} reports that it is no longer valid.
 *
 * <p>A manager that declares an input component is given that component of {@code seq};
 * otherwise it is given {@code seq} itself.
 *
 * @param seq the input sequence being evaluated
 * @param pos position passed through to each manager
 * @param length segment length passed through to each manager; asserted to be positive
 * @param prevState previous hidden state passed through to each manager
 * @param state current hidden state passed through to each manager
 * @param result accumulator the managers write their features into
 */
public void evaluateEdgeLength(
    InputSequence seq, int pos, int length, int prevState, int state, FeatureList result) {
  Assert.a(length > 0);

  for (FeatureManagerEdgeExplicitLength manager : explicitLengthEdgeFeatureTypes) {
    InputSequence view;
    if (manager.getInputComponent() == null) {
      view = seq;
    } else {
      view = seq.getComponent(manager.getInputComponent());
    }

    manager.evaluateEdgeLength(view, pos, length, prevState, state, result);

    // Once the result has been invalidated there is no point consulting further managers.
    if (!result.isValid()) {
      return;
    }
  }
}
public void addFeatureManager(String name, String inputParams, FeatureManager fm) { Assert.a(startIndexes == null, "Attempted to add a new FeatureManager after training."); if (name != null) fm.setInputComponent(name); allFeatureTypes.add(fm); // Add each feature type into the right list for evalution if (fm instanceof FeatureManagerNode) { nodeFeatureTypes.add((FeatureManagerNode) fm); } if (fm instanceof FeatureManagerEdge) { edgeFeatureTypes.add((FeatureManagerEdge) fm); } if (fm instanceof FeatureManagerNodeExplicitLength) { explicitLengthNodeFeatureTypes.add((FeatureManagerNodeExplicitLength) fm); } if (fm instanceof FeatureManagerEdgeExplicitLength) { explicitLengthEdgeFeatureTypes.add((FeatureManagerEdgeExplicitLength) fm); } }
// Outputs one line to the GTF file. // NOTE: source is assumed to be 'CONRAD', and score is assumed to be unknown and set to '.'. private static void writeGFTLine( Writer out, String seqName, String feature, long exonStart, long exonEnd, String strand, int frame, String genePrefix, int geneNum) throws IOException { Assert.a(frame == 0 || frame == 1 || frame == 2, "Frame value invalid, frame = ", frame); String geneId = genePrefix + "G_" + String.valueOf(geneNum); String transId = genePrefix + "T_" + String.valueOf(geneNum) + ".1"; out.write( seqName + "\t" + "CONRAD" + "\t" + feature + "\t" + exonStart + "\t" + exonEnd + "\t" + "." + "\t" + strand + "\t" + frame + "\t" + "gene_id \"" + geneId + "\"; transcript_id \"" + transId + "\";\n"); }
// This function converts a 13 state model hidden sequence to a GTF file. public void writeGTF(List<? extends TrainingSequence<?>> refStates, String filename) throws IOException { int ref, geneNum, seqCount, frame = -1; long i, exonStart, exonEnd, end; boolean inPlusExon, inMinusExon, firstExon, startCodonSplit; String strand; Writer fout = new BufferedWriter(new FileWriter(filename)); exonStart = exonEnd = 0; geneNum = 1; seqCount = 0; // Determine if model is tricycle13 or interval13. boolean interval13 = false; int prevState, state; for (TrainingSequence seq : refStates) { if (seq.length() == 0) continue; prevState = seq.getY(0); for (i = 1; i < seq.length(); i++) { state = seq.getY((int) i); if (prevState == 0 && (state == 2 || state == 3 || state == 7 || state == 8)) { interval13 = true; break; } prevState = state; } if (interval13) break; } for (TrainingSequence seq : refStates) { if (interval13) { SequenceConverter.convertSeqFromInterval13ToTricycle13(seq); } inPlusExon = false; inMinusExon = false; firstExon = true; startCodonSplit = false; parseSeqName(seq, seqCount); for (i = 0; i < seq.length(); i++) { ref = seq.getY((int) i); if (ref == 1 || ref == 2 || ref == 3) // in a plus exon { if (!inPlusExon) { exonStart = i + 1; inPlusExon = true; frame = setFrame(ref); } } else if (ref == 7 || ref == 8 || ref == 9) // in a minus exon { if (!inMinusExon) { exonStart = i + 1; inMinusExon = true; frame = setFrame(ref); if (firstExon) { if (i < 3) System.err.println( "Minus strand gene start is within 3 nucleotides of sequence start. 
No stop codon writen to GTF for gene starting at position " + (exonStart + offset)); else writeGFTLine( fout, seqName, "stop_codon", exonStart + offset - 3, exonStart + offset - 1, "-", frame, genePrefix, geneNum); } } } else if (inPlusExon && (ref == 4 || ref == 5 || ref == 6)) { // just ended an exon on plus strand, now in a plus intron strand = "+"; inPlusExon = false; exonEnd = i; if (firstExon) { if (exonEnd - exonStart + 1 < 3) { end = exonEnd + offset; startCodonSplit = true; } else { end = exonStart + offset + 2; } writeGFTLine( fout, seqName, "start_codon", exonStart + offset, end, strand, frame, genePrefix, geneNum); firstExon = false; } else if (startCodonSplit) { // at second exon that contains part of start codon Assert.a(frame == 1 || frame == 2); writeGFTLine( fout, seqName, "start_codon", exonStart + offset, exonStart + offset + frame - 1, strand, frame, genePrefix, geneNum); startCodonSplit = false; } writeGFTLine( fout, seqName, "CDS", exonStart + offset, exonEnd + offset, strand, frame, genePrefix, geneNum); } else if (inMinusExon && (ref == 10 || ref == 11 || ref == 12)) { // just ended an exon on minus strand, now in a minus intron strand = "-"; inMinusExon = false; firstExon = false; exonEnd = i; writeGFTLine( fout, seqName, "CDS", exonStart + offset, exonEnd + offset, strand, frame, genePrefix, geneNum); } else // now in intergenic region { boolean write = true; if (inPlusExon) // was in gene at previous nucleotide { strand = "+"; exonEnd = i; if (firstExon) { if (exonEnd - exonStart + 1 < 3) { System.err.println( "Single '" + strand + "' strand exon is < 3 bases for sequence '" + seqName + "'. 
exonStart=" + exonStart + " exonEnd=" + exonEnd); write = false; } else { writeGFTLine( fout, seqName, "start_codon", exonStart + offset, exonStart + offset + 2, strand, frame, genePrefix, geneNum); } } else if (startCodonSplit) { // at second exon that contains part of start codon Assert.a(frame == 1 || frame == 2); writeGFTLine( fout, seqName, "start_codon", exonStart + offset, exonStart + offset + frame - 1, strand, frame, genePrefix, geneNum); } if (write) { writeGFTLine( fout, seqName, "CDS", exonStart + offset, exonEnd + offset, strand, frame, genePrefix, geneNum); writeGFTLine( fout, seqName, "stop_codon", exonEnd + offset + 1, exonEnd + offset + 3, strand, 0, genePrefix, geneNum); } inPlusExon = false; firstExon = true; startCodonSplit = false; geneNum++; } else if (inMinusExon) { strand = "-"; long prevExonEnd = exonEnd; exonEnd = i; if (firstExon && exonEnd - exonStart + 1 < 3) { System.err.println( "Single '" + strand + "' strand exon is < 3 bases for sequence '" + seqName + "'. 
exonStart=" + exonStart + " exonEnd=" + exonEnd); } else if (exonEnd - exonStart + 1 < 3) { // this exon is < 3 bases, need to split start codon if (exonEnd - exonStart + 1 == 2) { // this exon is 2 bases writeGFTLine( fout, seqName, "start_codon", prevExonEnd + offset, prevExonEnd + offset, strand, 0, genePrefix, geneNum); writeGFTLine( fout, seqName, "CDS", exonStart + offset, exonEnd + offset, strand, frame, genePrefix, geneNum); writeGFTLine( fout, seqName, "start_codon", exonStart + offset, exonEnd + offset, strand, 2, genePrefix, geneNum); } else if (exonEnd - exonStart + 1 == 1) { // this exon is 1 base writeGFTLine( fout, seqName, "start_codon", prevExonEnd + offset - 1, prevExonEnd + offset, strand, 0, genePrefix, geneNum); writeGFTLine( fout, seqName, "CDS", exonStart + offset, exonEnd + offset, strand, frame, genePrefix, geneNum); writeGFTLine( fout, seqName, "start_codon", exonStart + offset, exonEnd + offset, strand, 1, genePrefix, geneNum); } } else { writeGFTLine( fout, seqName, "CDS", exonStart + offset, exonEnd + offset, strand, frame, genePrefix, geneNum); writeGFTLine( fout, seqName, "start_codon", exonEnd + offset - 2, exonEnd + offset, strand, 0, genePrefix, geneNum); } inMinusExon = false; firstExon = true; startCodonSplit = false; geneNum++; } } } seqCount++; } fout.close(); }
/**
 * Returns the total number of features counted during training.
 *
 * @return the value of {@code totalFeatures}; asserts that training has already happened
 *     (that is, {@code startIndexes} is non-null)
 */
public int getNumFeatures() {
  final boolean trained = startIndexes != null;
  Assert.a(trained, "Attempted to get number of features before training.");
  return totalFeatures;
}