Example #1
0
  /**
   * Trains the tagger from the DocumentCollection stored in file 'trainingCollection'.
   * Every document in 'trainingCollection' must already carry explicit
   * part-of-speech annotations ("constit" annotations with a "cat" feature).
   */
  void train(String trainingCollection) {

    // One tag-table row per POS: annotation type "constit", feature "cat", value = the POS.
    for (int row = 0; row < posTable.length; row++) {
      tagTable[row] = new String[] {"constit", "cat", posTable[row], posTable[row]};
    }

    // Build an ergodic HMM: one state per POS, plus dedicated start and end states,
    // with zero-count arcs from start to every POS, POS to POS, and POS to end.
    HMMstate initial = new HMMstate("start", "", WordFeatureHMMemitter.class);
    posh.addState(initial);
    for (String pos : posTable) {
      initial.addArc(new HMMarc(pos, 0));
    }
    HMMstate terminal = new HMMstate("end", "", WordFeatureHMMemitter.class);
    posh.addState(terminal);
    for (String pos : posTable) {
      HMMstate posState = new HMMstate(pos, pos, WordFeatureHMMemitter.class);
      posh.addState(posState);
      for (String successor : posTable) {
        posState.addArc(new HMMarc(successor, 0));
      }
      posState.addArc(new HMMarc("end", 0));
    }
    posh.resolveNames();

    posh.resetForTraining();
    annotator = new HMMannotator(posh);
    annotator.setTagTable(tagTable);
    annotator.setBItag(false);

    DocumentCollection col = new DocumentCollection(trainingCollection);
    col.open();
    for (int docIndex = 0; docIndex < col.size(); docIndex++) {
      ExternalDocument doc = col.get(docIndex);
      doc.open();
      System.out.println("Training from " + doc.fileName());

      // Split the document into sentences at end marks (constit cat="."),
      // wrapping each sentence span in an "S" annotation before training.
      int sentenceStart = 0;
      int cursor = 0;
      Vector anns;
      while ((anns = doc.annotationsAt(cursor, "constit")) != null) {
        Annotation ann = (Annotation) anns.get(0);
        cursor = ann.span().end();
        String cat = (String) ann.get("cat");
        if (cat.equals(".")) {
          doc.annotate("S", new Span(sentenceStart, cursor), new FeatureSet());
          sentenceStart = cursor;
        }
      }
      annotator.train(doc);
      // Release the document's annotations to reclaim memory before the next file.
      doc.clearAnnotations();
    }
    posh.computeProbabilities();
  }
Example #2
0
  /**
   * Loads the recorded test failures, builds the document and term collections
   * from them, assigns each term its IDF weight, and then prints every word of
   * every document (as a TermItem with positions) whose TF-IDF score is positive.
   */
  public void run() {
    List<TestFailure> failures = dto.loadFailures();
    DocumentCollection documents = DocumentCollection.getInstance();
    TermCollection terms = TermCollection.getInstance();

    TestFailureUtil.buildDocumentCollection(failures, documents);
    TestFailureUtil.buildTermCollection(failures, terms);

    // idf(term) = log(N / df), N = number of documents, df = documents containing the term.
    // NOTE(review): assumes findTermFrequency is non-null and > 0 for every collected
    // term; a zero would yield Infinity here — confirm against TermCollection's contract.
    for (int i = 0; i < terms.size(); i++) {
      String term = terms.get(i);
      int count = documents.findTermFrequency(term);
      float weight = (float) Math.log((double) documents.size() / count);
      terms.setWeithg(term, weight); // (sic) misspelled setter in TermCollection's API
    }

    for (int i = 0; i < documents.size(); i++) {
      String document = documents.get(i);
      String[] words = document.split("\\s+");

      System.out.println(document);

      for (String word : words) {
        TermItem item = new TermItem();
        float tfidf = VectorSpaceModel.findTFIDF(document, word);

        item.setTerm(word);
        item.setTfidf(tfidf);

        // Record every occurrence position of the word within the document.
        // Primitive int avoids re-boxing an Integer on each loop iteration.
        int pos = -1;
        while ((pos = document.indexOf(word, pos + 1)) >= 0) {
          item.addPos(pos);
        }

        if (tfidf > 0.0) {
          System.out.println(item.toString());
        }
      }
    }
  }