コード例 #1
0
  /**
   * Builds a new sequence from the elements at 1-based positions {@code start} through
   * {@code end}, both inclusive.
   *
   * @param start 1-based position of the first element to copy; asserted to be at least 1
   * @param end 1-based position of the last element to copy; asserted to be no greater than
   *     {@code length()} and no smaller than {@code start}
   * @return a new {@code InputSequenceObject} wrapping a copy of the selected elements
   */
  public InputSequence<T> subSequence(int start, int end) {
    Assert.a(start >= 1);
    Assert.a(end <= this.length());
    Assert.a(start <= end);

    final int count = end - start + 1;
    // A generic array cannot be instantiated directly, hence the Object[] allocation and cast.
    T[] copy = (T[]) new Object[count];
    int src = start - 1; // convert the 1-based start into a 0-based index into t
    for (int dst = 0; dst < count; dst++) {
      copy[dst] = t[src];
      src++;
    }
    return new InputSequenceObject<T>(copy);
  }
コード例 #2
0
  /**
   * Maps the value {@code ref} (one of 1, 2, 3, 7, 8 or 9) to a frame.
   *
   * <p>Frame is the number of bases in this region before you get in frame. That is, if frame is
   * 0, the first three bases in this element are a codon. If frame is 1, the first base is the
   * end of a codon hanging over from the end of the previous codon and the next three are the
   * first codon in this feature. If frame is 2, the first two bases are the end of the previous
   * codon and the next three are the first codon in this feature.
   *
   * @param ref the value to translate
   * @return 0 for 1 and 9, 1 for 3 and 7, 2 for 2 and 8; for any other value the assertion is
   *     raised and, should it not abort execution, -1 is returned
   */
  private static int setFrame(int ref) {
    switch (ref) {
      case 1:
      case 9:
        return 0;
      case 3:
      case 7:
        return 1;
      case 2:
      case 8:
        return 2;
      default:
        Assert.a(false, "Error setting frame, ref = ", ref);
        // Only reached when the assertion above does not abort execution.
        return -1;
    }
  }
コード例 #3
0
 /**
  * Tells whether {@code y} is one of the values 7, 8 or 9.
  *
  * @param y value to classify; asserted to lie in the range 0 to 12 inclusive
  * @return {@code true} when {@code y} is 7, 8 or 9, {@code false} otherwise
  */
 private boolean isCodingMinus(int y) {
   Assert.a((y >= 0) && (y < 13));
   return (y >= 7) && (y <= 9);
 }
コード例 #4
0
 /**
  * Tells whether {@code y} is one of the values 1, 2 or 3.
  *
  * @param y value to classify; asserted to lie in the range 0 to 12 inclusive
  * @return {@code true} when {@code y} is 1, 2 or 3, {@code false} otherwise
  */
 private boolean isCodingPlus(int y) {
   Assert.a((y >= 0) && (y < 13));
   return (y >= 1) && (y <= 3);
 }
コード例 #5
0
  /**
   * Folds the prediction for one test sequence into the accumulated results.
   *
   * <p>The input sequence paired with the predicted labels is appended to {@code labeled}, and
   * the per-position, per-state, per-transition and per-exon counters are all incremented by
   * comparing the predicted labels with the labels carried by {@code training}.
   *
   * @param training the sequence with its reference labels
   * @param predictedHiddenSequence predicted label for every position; asserted to have the same
   *     length as {@code training}
   */
  public void calcResultIncrement(TrainingSequence training, int[] predictedHiddenSequence) {
    // Results are built up incrementally: this is where the labelled sequences get collected.
    labeled.add(new TrainingSequence(training.getInputSequence(), predictedHiddenSequence));
    Assert.a(training.length() == predictedHiddenSequence.length);

    // Pull the reference labels out into a plain array.
    int[] reference = new int[training.length()];
    for (int pos = 0; pos < training.length(); pos++) {
      reference[pos] = training.getY(pos);
    }

    // Position-wise accuracy, coding-nucleotide and per-state contingency counts.
    boolean mismatchSeen = false;
    for (int pos = 0; pos < predictedHiddenSequence.length; pos++) {
      final int guess = predictedHiddenSequence[pos];
      final int truth = reference[pos];

      if (truth != guess) {
        incorrect += 1;
        mismatchSeen = true;
      } else {
        correct += 1;
      }

      ctCodingNucleotide.increment(isCodingPlus(guess), isCodingPlus(truth));
      ctCodingNucleotide.increment(isCodingMinus(guess), isCodingMinus(truth));

      for (int state = 0; state < nStates; state++) {
        ctStates.get(state).increment((guess == state), (truth == state));
      }
    }

    // A sequence counts as perfect only when every position was predicted correctly.
    if (mismatchSeen) {
      imperfect++;
    } else {
      perfect++;
    }

    // Transition contingency counts over every pair of adjacent positions.
    for (int pos = 1; pos < predictedHiddenSequence.length; pos++) {
      final int guessPrev = predictedHiddenSequence[pos - 1];
      final int guessCur = predictedHiddenSequence[pos];
      final int truthPrev = reference[pos - 1];
      final int truthCur = reference[pos];

      for (int tr = 0; tr < nTransitions; tr++) {
        boolean predictedHere = ((guessPrev == fromInd.get(tr)) && (guessCur == toInd.get(tr)));
        boolean actualHere = ((truthPrev == fromInd.get(tr)) && (truthCur == toInd.get(tr)));
        ctTransitions.get(tr).increment(predictedHere, actualHere);
      }
    }

    // Exon-level contingency table; note that true negatives are not counted here.
    RangeMap guessedPlus = new RangeMap();
    RangeMap guessedMinus = new RangeMap();
    RangeMap actualPlus = new RangeMap();
    RangeMap actualMinus = new RangeMap();
    makeExonRangeMapFrom13SV(predictedHiddenSequence, guessedPlus, guessedMinus);
    makeExonRangeMapFrom13SV(reference, actualPlus, actualMinus);
    incrementCTFromRangeMaps(ctExons, guessedPlus, actualPlus);
    incrementCTFromRangeMaps(ctExons, guessedMinus, actualMinus);
  }
コード例 #6
0
  /**
   * Trains every registered feature manager in registration order and records the starting index
   * assigned to each one.
   *
   * @param startingIndex offset added to each recorded start index
   * @param modelInfo model information handed through to every child manager
   * @param data training data; a child that declares an input component receives a {@code
   *     ComponentList} built from this data and that component instead of the data itself
   */
  public void train(int startingIndex, ModelManager modelInfo, List data) {
    Assert.a(allFeatureTypes.size() > 0, "No features types have been assigned.");
    Assert.a(startIndexes == null, "FeatureManager has already been trained.");
    startIx = startingIndex;

    // One slot per child manager; totalFeatures is the running count of features so far.
    startIndexes = new int[allFeatureTypes.size()];
    totalFeatures = 0;
    for (int slot = 0; slot < startIndexes.length; slot++) {
      startIndexes[slot] = totalFeatures + startIx;
      FeatureManager child = allFeatureTypes.get(slot);

      List childData;
      if (child.getInputComponent() == null) {
        childData = data;
      } else {
        childData = new ComponentList(data, child.getInputComponent());
      }

      child.train(totalFeatures, modelInfo, childData);
      totalFeatures += child.getNumFeatures();
    }
  }
コード例 #7
0
 /**
  * Asks each explicit-length edge feature manager in turn to evaluate the given edge, stopping
  * as soon as {@code result} reports itself invalid.
  *
  * @param seq the input sequence; a manager that declares an input component is given that
  *     component of the sequence instead
  * @param pos position passed through to each manager
  * @param length length passed through to each manager; asserted to be positive
  * @param prevState previous state passed through to each manager
  * @param state state passed through to each manager
  * @param result feature list passed to every manager
  */
 public void evaluateEdgeLength(
     InputSequence seq, int pos, int length, int prevState, int state, FeatureList result) {
   Assert.a(length > 0);
   for (FeatureManagerEdgeExplicitLength manager : explicitLengthEdgeFeatureTypes) {
     InputSequence view;
     if (manager.getInputComponent() == null) {
       view = seq;
     } else {
       view = seq.getComponent(manager.getInputComponent());
     }
     manager.evaluateEdgeLength(view, pos, length, prevState, state, result);
     if (!result.isValid()) {
       break;
     }
   }
 }
コード例 #8
0
  /**
   * Registers a feature manager. Must be called before training.
   *
   * @param name input component to set on {@code fm}; left untouched when {@code null}
   * @param inputParams not used by this method
   * @param fm the feature manager to register
   */
  public void addFeatureManager(String name, String inputParams, FeatureManager fm) {
    Assert.a(startIndexes == null, "Attempted to add a new FeatureManager after training.");

    if (name != null) {
      fm.setInputComponent(name);
    }
    allFeatureTypes.add(fm);

    // The checks below are deliberately independent: a manager that implements several of these
    // interfaces is filed under every evaluation list that applies.
    final boolean handlesNodes = fm instanceof FeatureManagerNode;
    if (handlesNodes) {
      nodeFeatureTypes.add((FeatureManagerNode) fm);
    }
    final boolean handlesEdges = fm instanceof FeatureManagerEdge;
    if (handlesEdges) {
      edgeFeatureTypes.add((FeatureManagerEdge) fm);
    }
    final boolean handlesNodeLengths = fm instanceof FeatureManagerNodeExplicitLength;
    if (handlesNodeLengths) {
      explicitLengthNodeFeatureTypes.add((FeatureManagerNodeExplicitLength) fm);
    }
    final boolean handlesEdgeLengths = fm instanceof FeatureManagerEdgeExplicitLength;
    if (handlesEdgeLengths) {
      explicitLengthEdgeFeatureTypes.add((FeatureManagerEdgeExplicitLength) fm);
    }
  }
コード例 #9
0
  /**
   * Outputs one tab-separated line to the GTF file.
   *
   * <p>NOTE: source is assumed to be 'CONRAD', and score is assumed to be unknown and set to '.'.
   * The gene id is {@code genePrefix + "G_" + geneNum} and the transcript id is {@code genePrefix
   * + "T_" + geneNum + ".1"}.
   *
   * @param out destination writer
   * @param seqName value of the first column
   * @param feature value of the feature column
   * @param exonStart value of the start column
   * @param exonEnd value of the end column
   * @param strand value of the strand column
   * @param frame value of the frame column; asserted to be 0, 1 or 2
   * @param genePrefix prefix used for both the gene id and the transcript id
   * @param geneNum number used for both the gene id and the transcript id
   * @throws IOException if writing to {@code out} fails
   */
  private static void writeGFTLine(
      Writer out,
      String seqName,
      String feature,
      long exonStart,
      long exonEnd,
      String strand,
      int frame,
      String genePrefix,
      int geneNum)
      throws IOException {

    Assert.a(frame == 0 || frame == 1 || frame == 2, "Frame value invalid, frame = ", frame);

    final String geneId = genePrefix + "G_" + geneNum;
    final String transId = genePrefix + "T_" + geneNum + ".1";

    StringBuilder line = new StringBuilder();
    line.append(seqName).append("\t");
    line.append("CONRAD").append("\t");
    line.append(feature).append("\t");
    line.append(exonStart).append("\t");
    line.append(exonEnd).append("\t");
    line.append(".").append("\t");
    line.append(strand).append("\t");
    line.append(frame).append("\t");
    line.append("gene_id \"").append(geneId);
    line.append("\"; transcript_id \"").append(transId);
    line.append("\";\n");

    out.write(line.toString());
  }
コード例 #10
0
  /**
   * Converts 13-state model hidden sequences to a GTF file.
   *
   * <p>The output file is always closed, including when writing or sequence conversion fails
   * part-way through.
   *
   * @param refStates sequences whose hidden states are to be written
   * @param filename path of the GTF file to create
   * @throws IOException if the file cannot be opened, written or closed
   */
  public void writeGTF(List<? extends TrainingSequence<?>> refStates, String filename)
      throws IOException {
    Writer fout = new BufferedWriter(new FileWriter(filename));
    try {
      writeGTFRecords(refStates, fout);
    } finally {
      // Close on every path so a failure while writing does not leak the file handle.
      fout.close();
    }
  }

  /**
   * Writes the GTF records for every sequence in {@code refStates} to {@code fout}. The writer is
   * not closed here; that is the caller's responsibility.
   *
   * <p>As the branches below treat them: states 1-3 are plus-strand exon states, 4-6 plus-strand
   * intron states, 7-9 minus-strand exon states, 10-12 minus-strand intron states, and anything
   * else is intergenic.
   */
  private void writeGTFRecords(List<? extends TrainingSequence<?>> refStates, Writer fout)
      throws IOException {
    int ref, geneNum, seqCount, frame = -1;
    long i, exonStart, exonEnd, end;
    boolean inPlusExon, inMinusExon, firstExon, startCodonSplit;
    String strand;
    exonStart = exonEnd = 0;
    geneNum = 1;
    seqCount = 0;

    // Determine if model is tricycle13 or interval13.
    // State 0 followed directly by 2, 3, 7 or 8 is taken as the sign of an interval13 labelling.
    boolean interval13 = false;
    int prevState, state;
    for (TrainingSequence seq : refStates) {
      if (seq.length() == 0) continue;

      prevState = seq.getY(0);
      for (i = 1; i < seq.length(); i++) {
        state = seq.getY((int) i);
        if (prevState == 0 && (state == 2 || state == 3 || state == 7 || state == 8)) {
          interval13 = true;
          break;
        }
        prevState = state;
      }
      if (interval13) break;
    }

    for (TrainingSequence seq : refStates) {

      if (interval13) {
        SequenceConverter.convertSeqFromInterval13ToTricycle13(seq);
      }

      inPlusExon = false;
      inMinusExon = false;
      firstExon = true;
      startCodonSplit = false;

      // NOTE(review): presumably sets seqName and offset used below - confirm in parseSeqName.
      parseSeqName(seq, seqCount);

      for (i = 0; i < seq.length(); i++) {
        ref = seq.getY((int) i);

        if (ref == 1 || ref == 2 || ref == 3) // in a plus exon
        {
          if (!inPlusExon) {
            exonStart = i + 1;
            inPlusExon = true;
            frame = setFrame(ref);
          }
        } else if (ref == 7 || ref == 8 || ref == 9) // in a minus exon
        {
          if (!inMinusExon) {
            exonStart = i + 1;
            inMinusExon = true;
            frame = setFrame(ref);
            if (firstExon) {
              if (i < 3)
                System.err.println(
                    "Minus strand gene start is within 3 nucleotides of sequence start.  No stop codon writen to GTF for gene starting at position "
                        + (exonStart + offset));
              else
                writeGFTLine(
                    fout,
                    seqName,
                    "stop_codon",
                    exonStart + offset - 3,
                    exonStart + offset - 1,
                    "-",
                    frame,
                    genePrefix,
                    geneNum);
            }
          }
        } else if (inPlusExon
            && (ref == 4 || ref == 5
                || ref == 6)) { // just ended an exon on plus strand, now in a plus intron
          strand = "+";
          inPlusExon = false;
          exonEnd = i;
          if (firstExon) {
            if (exonEnd - exonStart + 1 < 3) {
              // First exon is shorter than a codon: the start codon continues in the next exon.
              end = exonEnd + offset;
              startCodonSplit = true;
            } else {
              end = exonStart + offset + 2;
            }
            writeGFTLine(
                fout,
                seqName,
                "start_codon",
                exonStart + offset,
                end,
                strand,
                frame,
                genePrefix,
                geneNum);
            firstExon = false;
          } else if (startCodonSplit) { // at second exon that contains part of start codon
            Assert.a(frame == 1 || frame == 2);
            writeGFTLine(
                fout,
                seqName,
                "start_codon",
                exonStart + offset,
                exonStart + offset + frame - 1,
                strand,
                frame,
                genePrefix,
                geneNum);
            startCodonSplit = false;
          }
          writeGFTLine(
              fout,
              seqName,
              "CDS",
              exonStart + offset,
              exonEnd + offset,
              strand,
              frame,
              genePrefix,
              geneNum);
        } else if (inMinusExon
            && (ref == 10 || ref == 11
                || ref == 12)) { // just ended an exon on minus strand, now in a minus intron
          strand = "-";
          inMinusExon = false;
          firstExon = false;
          exonEnd = i;
          writeGFTLine(
              fout,
              seqName,
              "CDS",
              exonStart + offset,
              exonEnd + offset,
              strand,
              frame,
              genePrefix,
              geneNum);
        } else // now in intergenic region
        {
          boolean write = true;
          if (inPlusExon) // was in gene at previous nucleotide
          {
            strand = "+";
            exonEnd = i;
            if (firstExon) {
              if (exonEnd - exonStart + 1 < 3) {
                System.err.println(
                    "Single '"
                        + strand
                        + "' strand exon is < 3 bases for sequence '"
                        + seqName
                        + "'.  exonStart="
                        + exonStart
                        + "  exonEnd="
                        + exonEnd);
                write = false;
              } else {
                writeGFTLine(
                    fout,
                    seqName,
                    "start_codon",
                    exonStart + offset,
                    exonStart + offset + 2,
                    strand,
                    frame,
                    genePrefix,
                    geneNum);
              }
            } else if (startCodonSplit) { // at second exon that contains part of start codon
              Assert.a(frame == 1 || frame == 2);
              writeGFTLine(
                  fout,
                  seqName,
                  "start_codon",
                  exonStart + offset,
                  exonStart + offset + frame - 1,
                  strand,
                  frame,
                  genePrefix,
                  geneNum);
            }
            if (write) {
              writeGFTLine(
                  fout,
                  seqName,
                  "CDS",
                  exonStart + offset,
                  exonEnd + offset,
                  strand,
                  frame,
                  genePrefix,
                  geneNum);
              writeGFTLine(
                  fout,
                  seqName,
                  "stop_codon",
                  exonEnd + offset + 1,
                  exonEnd + offset + 3,
                  strand,
                  0,
                  genePrefix,
                  geneNum);
            }
            // Gene finished: reset the per-gene state and move on to the next gene number.
            inPlusExon = false;
            firstExon = true;
            startCodonSplit = false;
            geneNum++;
          } else if (inMinusExon) {
            strand = "-";
            // Remember where the previous exon ended before overwriting exonEnd; the split
            // start codon cases below need it.
            long prevExonEnd = exonEnd;
            exonEnd = i;
            if (firstExon && exonEnd - exonStart + 1 < 3) {
              System.err.println(
                  "Single '"
                      + strand
                      + "' strand exon is < 3 bases for sequence '"
                      + seqName
                      + "'.  exonStart="
                      + exonStart
                      + "  exonEnd="
                      + exonEnd);
            } else if (exonEnd - exonStart + 1
                < 3) { // this exon is < 3 bases, need to split start codon
              if (exonEnd - exonStart + 1 == 2) { // this exon is 2 bases
                writeGFTLine(
                    fout,
                    seqName,
                    "start_codon",
                    prevExonEnd + offset,
                    prevExonEnd + offset,
                    strand,
                    0,
                    genePrefix,
                    geneNum);
                writeGFTLine(
                    fout,
                    seqName,
                    "CDS",
                    exonStart + offset,
                    exonEnd + offset,
                    strand,
                    frame,
                    genePrefix,
                    geneNum);
                writeGFTLine(
                    fout,
                    seqName,
                    "start_codon",
                    exonStart + offset,
                    exonEnd + offset,
                    strand,
                    2,
                    genePrefix,
                    geneNum);
              } else if (exonEnd - exonStart + 1 == 1) { // this exon is 1 base
                writeGFTLine(
                    fout,
                    seqName,
                    "start_codon",
                    prevExonEnd + offset - 1,
                    prevExonEnd + offset,
                    strand,
                    0,
                    genePrefix,
                    geneNum);
                writeGFTLine(
                    fout,
                    seqName,
                    "CDS",
                    exonStart + offset,
                    exonEnd + offset,
                    strand,
                    frame,
                    genePrefix,
                    geneNum);
                writeGFTLine(
                    fout,
                    seqName,
                    "start_codon",
                    exonStart + offset,
                    exonEnd + offset,
                    strand,
                    1,
                    genePrefix,
                    geneNum);
              }
            } else {
              writeGFTLine(
                  fout,
                  seqName,
                  "CDS",
                  exonStart + offset,
                  exonEnd + offset,
                  strand,
                  frame,
                  genePrefix,
                  geneNum);
              writeGFTLine(
                  fout,
                  seqName,
                  "start_codon",
                  exonEnd + offset - 2,
                  exonEnd + offset,
                  strand,
                  0,
                  genePrefix,
                  geneNum);
            }
            // Gene finished: reset the per-gene state and move on to the next gene number.
            inMinusExon = false;
            firstExon = true;
            startCodonSplit = false;
            geneNum++;
          }
        }
      }
      seqCount++;
    }
  }
コード例 #11
0
 /**
  * Reports the total number of features counted during training.
  *
  * @return the value of {@code totalFeatures}; calling this before training trips the assertion
  */
 public int getNumFeatures() {
   final boolean trained = startIndexes != null;
   Assert.a(trained, "Attempted to get number of features before training.");
   return totalFeatures;
 }