Example #1
  /**
   * Build an HMM from the XML element 'hmmelt', run its analyze() pass
   * against the shared object map, and register the HMM in 'objects'
   * under the element's 'id' attribute.
   */
  public static HMM parseHMM(Element hmmelt, TreeMap idMap, TreeMap objects) {

    HMM hmm = new HMM(hmmelt, idMap, objects);
    hmm.analyze(objects);
    objects.put(hmmelt.getAttributeValue("id"), hmm);
    return hmm;
  }
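  A hypothetical caller for the method above, assuming the surrounding class walks a JDOM document; the element name "hmm" and the helper name parseAllHMMs are illustrative, not part of the original API:

  // Hypothetical helper (not in the original source): parse every <hmm>
  // element under the root of a JDOM document. The element name "hmm"
  // and this method's name are illustrative assumptions.
  public static TreeMap parseAllHMMs(org.jdom.Document doc) {
    TreeMap idMap = new TreeMap();
    TreeMap objects = new TreeMap();
    for (Object child : doc.getRootElement().getChildren("hmm")) {
      parseHMM((org.jdom.Element) child, idMap, objects);
    }
    return objects;
  }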
  // Requires the Shogun Java bindings (modshogun); CUBE and BW_NORMAL are
  // static imports from org.shogun.EAlphabet and org.shogun.BaumWelchViterbiType.
  public static void main(String argv[]) {
    boolean reverse = false;  // do not reverse the strings when building word features
    modshogun.init_shogun_with_defaults();
    int N = 1;                // number of hidden states
    int M = 512;              // number of distinct observation symbols
    double pseudo = 1e-5;     // pseudocount to avoid zero probabilities
    int order = 3;            // build word features from character 3-grams
    int gap = 0;              // no gaps between the characters of an n-gram

    // load the cube-sequence training data
    String[] fm_train_dna = Load.load_cubes("../data/fm_train_cube.dat");

    // embed the char strings as order-3 word features
    StringCharFeatures charfeat = new StringCharFeatures(fm_train_dna, CUBE);
    StringWordFeatures feats = new StringWordFeatures(charfeat.get_alphabet());
    feats.obtain_from_char(charfeat, order - 1, order, gap, reverse);

    // train the HMM with standard Baum-Welch
    HMM hmm = new HMM(feats, N, M, pseudo);
    hmm.train();
    hmm.baum_welch_viterbi_train(BW_NORMAL);

    // evaluate the log-derivative of the likelihood w.r.t. every
    // model parameter, for every training example
    int num_examples = feats.get_num_vectors();
    int num_param = hmm.get_num_model_parameters();
    for (int i = 0; i < num_examples; i++) {
      for (int j = 0; j < num_param; j++) {
        hmm.get_log_derivative(j, i);
      }
    }

    // accumulate Viterbi best-path scores and the corresponding state sequences
    int best_path = 0;
    int best_path_state = 0;
    for (int i = 0; i < num_examples; i++) {
      best_path += hmm.best_path(i);
      for (int j = 0; j < N; j++) best_path_state += hmm.get_best_path_state(i, j);
    }

    // per-example log-likelihoods and the aggregate sample log-likelihood
    DoubleMatrix lik_example = hmm.get_log_likelihood();
    double lik_sample = hmm.get_log_likelihood_sample();

    modshogun.exit_shogun();
  }
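As given, this is a bare method; a minimal class wrapper that would make it runnable, assuming the standard Shogun Java bindings (org.shogun) and jblas for DoubleMatrix (the class name here is an illustrative choice):

import org.shogun.*;
import org.jblas.*;
import static org.shogun.EAlphabet.CUBE;
import static org.shogun.BaumWelchViterbiType.BW_NORMAL;

public class distribution_hmm {
  static {
    // the native modshogun library must be on java.library.path
    System.loadLibrary("modshogun");
  }
  // ... main method as above ...
}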
  /**
   * Train the tagger from the DocumentCollection in file 'trainingCollection'.
   * 'trainingCollection' must consist of documents which have been explicitly
   * tagged with part-of-speech information.
   */
  void train(String trainingCollection) {

    // each row of the tag table ties HMM tag posTable[i] to a 'constit'
    // annotation whose 'cat' feature holds the same POS
    for (int i = 0; i < posTable.length; i++)
      tagTable[i] = new String[] {"constit", "cat", posTable[i], posTable[i]};

    // build ergodic HMM with one state for each POS (plus start and end states)

    HMMstate startState = new HMMstate("start", "", WordFeatureHMMemitter.class);
    posh.addState(startState);
    // the start state can transition to any POS state
    for (int j = 0; j < posTable.length; j++) startState.addArc(new HMMarc(posTable[j], 0));
    HMMstate endState = new HMMstate("end", "", WordFeatureHMMemitter.class);
    posh.addState(endState);
    // every POS state can transition to every POS state and to the end state
    for (int i = 0; i < posTable.length; i++) {
      String pos = posTable[i];
      HMMstate state = new HMMstate(pos, pos, WordFeatureHMMemitter.class);
      posh.addState(state);
      for (int j = 0; j < posTable.length; j++) state.addArc(new HMMarc(posTable[j], 0));
      state.addArc(new HMMarc("end", 0));
    }
    posh.resolveNames();     // resolve arc target names to state references

    posh.resetForTraining(); // clear counts before accumulating training events
    annotator = new HMMannotator(posh);
    annotator.setTagTable(tagTable);
    annotator.setBItag(false); // emit plain POS tags, without B-/I- prefixes

    DocumentCollection col = new DocumentCollection(trainingCollection);
    col.open();
    for (int i = 0; i < col.size(); i++) {
      ExternalDocument doc = col.get(i);
      doc.open();
      System.out.println("Training from " + doc.fileName());

      // divide at endmarks (constit cat="."), adding "S" marks

      int posn = 0;
      int start = posn;
      Vector anns;
      while ((anns = doc.annotationsAt(posn, "constit")) != null) {
        Annotation ann = (Annotation) anns.get(0);
        posn = ann.span().end();
        String pos = (String) ann.get("cat");
        if (pos.equals(".")) {
          doc.annotate("S", new Span(start, posn), new FeatureSet());
          start = posn;
        }
      }
      annotator.train(doc);
      //  free up space taken by annotations on document
      doc.clearAnnotations();
    }
    posh.computeProbabilities(); // convert accumulated counts into transition/emission probabilities
  }
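  A minimal usage sketch for the method above; the enclosing tagger class HMMPOSTagger and the collection path are assumptions for illustration:

    // Hypothetical usage (class name and path are illustrative):
    HMMPOSTagger tagger = new HMMPOSTagger();
    tagger.train("data/trainingCollection"); // documents carrying 'constit' POS annotations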
  /** Load the HMM associated with this tagger from file 'fileName'. */
  public void load(String fileName) throws IOException {
    try (BufferedReader reader = new BufferedReader(new FileReader(fileName))) {
      posh.load(reader);
    }
  }

  /** Store the HMM associated with this tagger to file 'fileName'. */
  public void store(String fileName) throws IOException {
    // close the writer explicitly so the HMM is fully flushed to disk
    try (PrintWriter writer = new PrintWriter(new FileOutputStream(fileName))) {
      posh.store(writer);
    }
  }
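  A sketch of the persistence round trip these two methods enable, reusing the hypothetical HMMPOSTagger class from above:

    // Hypothetical round trip: store the trained model, then restore it
    // in a later run instead of retraining (names are illustrative).
    HMMPOSTagger tagger = new HMMPOSTagger();
    tagger.train("data/trainingCollection");
    tagger.store("models/pos.hmm");   // write the HMM parameters to disk

    // later, possibly in a different process:
    HMMPOSTagger restored = new HMMPOSTagger();
    restored.load("models/pos.hmm");  // ready to tag without retraining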