示例#1
0
 /**
  * Adds <B>constit</B> annotations to an existing Document <CODE>doc</CODE> to represent the parse
  * tree structure of a set of trees <CODE>trees</CODE>. This version is provided for parse tree
  * files which include sentence offsets.
  *
  * <p>Each tree is annotated over the span running from its own offset up to the offset of the
  * next tree whose offset is known, or up to <CODE>span.end()</CODE> if there is no such tree.
  * A tree whose offset is missing (negative or null) is reported on standard error and skipped.
  * After a tree has been annotated, the first <CODE>targetAnnotation</CODE> annotation starting
  * at the tree's offset (if any) receives a 'parse' feature pointing to the tree's annotation.
  *
  * <p>If <CODE>trees</CODE> and <CODE>offsets</CODE> differ in size, an error is written to
  * standard error and no annotations are added.
  *
  * @param trees list of parse trees
  * @param offsets list of the starting position (in doc) of the text corresponding to each parse
  *     tree; a negative (or null) entry marks a tree whose offset is unknown
  * @param doc document to which annotations should be added
  * @param targetAnnotation name of annotation to get 'parse' feature pointing to parse tree
  * @param span target span; its end bounds the last sentence
  * @param jetCategories if false, use lexical categories from Penn Tree Bank; if true, use
  *     categories from Jet
  */
 public void addAnnotations(
     List<ParseTreeNode> trees,
     List<Integer> offsets,
     Document doc,
     String targetAnnotation,
     Span span,
     boolean jetCategories) {
   if (trees.size() != offsets.size()) {
     System.err.println(
         "PTBReader.addAnnotations:  mismatch between number of "
             + "trees ("
             + trees.size()
             + ") and number of offsets ("
             + offsets.size()
             + ")");
     return;
   }
   for (int i = 0; i < trees.size(); i++) {
     ParseTreeNode tree = trees.get(i);
     Integer startOffset = offsets.get(i);
     if (startOffset == null || startOffset < 0) {
       System.err.println("PTBReader.addAnnotations:  offset missing for " + " parse tree " + i);
       continue;
     }
     int start = startOffset;
     // The sentence ends where the next tree with a known offset begins.  Trees with a
     // missing offset are skipped here so that their marker value never becomes a span end.
     int end = span.end();
     for (int j = i + 1; j < offsets.size(); j++) {
       Integer next = offsets.get(j);
       if (next != null && next >= 0) {
         end = next;
         break;
       }
     }
     Span sentenceSpan = new Span(start, end);
     addAnnotations(tree, doc, sentenceSpan, jetCategories);
     // link the parse to the first target annotation starting at the sentence offset, if any
     Vector<Annotation> anns = doc.annotationsAt(start, targetAnnotation);
     if (anns != null && !anns.isEmpty()) {
       Annotation ann = anns.get(0);
       ann.put("parse", tree.ann);
     }
   }
 }
示例#2
0
  /**
   * Trains the tagger using the DocumentCollection in file 'trainingCollection'.
   * 'trainingCollection' should consist of documents which have been explicitly tagged with
   * part-of-speech information (as 'constit' annotations carrying a 'cat' feature).
   *
   * <p>The method fills in the tag table, builds an HMM with one state per part of speech plus
   * 'start' and 'end' states, then walks each document's 'constit' annotations from position 0,
   * adding an 'S' annotation each time a constituent with cat="." is reached, and trains the
   * annotator on the document. Probabilities are computed once all documents have been seen.
   *
   * @param trainingCollection name of the file describing the training DocumentCollection
   */
  void train(String trainingCollection) {

    for (int i = 0; i < posTable.length; i++) {
      tagTable[i] = new String[] {"constit", "cat", posTable[i], posTable[i]};
    }

    // build ergodic HMM with one state for each POS (plus start and end states)

    HMMstate startState = new HMMstate("start", "", WordFeatureHMMemitter.class);
    posh.addState(startState);
    for (int j = 0; j < posTable.length; j++) {
      startState.addArc(new HMMarc(posTable[j], 0));
    }
    HMMstate endState = new HMMstate("end", "", WordFeatureHMMemitter.class);
    posh.addState(endState);
    for (int i = 0; i < posTable.length; i++) {
      String pos = posTable[i];
      HMMstate state = new HMMstate(pos, pos, WordFeatureHMMemitter.class);
      posh.addState(state);
      // every POS state can be followed by any POS state or by the end state
      for (int j = 0; j < posTable.length; j++) {
        state.addArc(new HMMarc(posTable[j], 0));
      }
      state.addArc(new HMMarc("end", 0));
    }
    posh.resolveNames();

    posh.resetForTraining();
    annotator = new HMMannotator(posh);
    annotator.setTagTable(tagTable);
    annotator.setBItag(false);

    DocumentCollection col = new DocumentCollection(trainingCollection);
    col.open();
    for (int i = 0; i < col.size(); i++) {
      ExternalDocument doc = col.get(i);
      doc.open();
      System.out.println("Training from " + doc.fileName());

      // divide at endmarks (constit cat="."), adding "S" marks

      int posn = 0;
      int start = posn;
      Vector<?> anns;
      while ((anns = doc.annotationsAt(posn, "constit")) != null && !anns.isEmpty()) {
        Annotation ann = (Annotation) anns.get(0);
        int next = ann.span().end();
        if (next <= posn) {
          // a constituent which does not advance the position would make this loop spin forever
          break;
        }
        posn = next;
        // constant-first comparison:  a constituent without a 'cat' feature is simply not
        // an endmark
        if (".".equals(ann.get("cat"))) {
          doc.annotate("S", new Span(start, posn), new FeatureSet());
          start = posn;
        }
      }
      annotator.train(doc);
      //  free up space taken by annotations on document
      doc.clearAnnotations();
    }
    posh.computeProbabilities();
  }
示例#3
0
 /**
  * Creates annotations for each node in parse tree <CODE>node</CODE>.
  * These annotations are added to the parse tree and to the document
  * <CODE>doc</CODE>.  In contrast to <CODE>setAnnotations</CODE>,
  * the categories used for terminal nodes are Jet categories obtained by
  * Jet tokenization and lexical look-up.  This means that hyphenated
  * items are split, and multi-word names are reduced to a single node.
  *
  * @param node      the root of the parse tree
  * @param treeSpan  the span of the document matching the parse tree
  * @param doc       the document to which annotations will be added
  */
 private void setJetAnnotations(ParseTreeNode node, Span treeSpan, Document doc) {
   StatParser.buildParserInput(doc, treeSpan.start(), treeSpan.end(), false);
   StatParser.fixHyphenatedItems(doc);
   // end position of the name currently being consumed, or -1 when not inside a name
   int nameConstitEnd = -1;
   List<ParseTreeNode> terminals = getTerminalNodes(node);
   for (ParseTreeNode terminal : terminals) {
     // remember the original end:  terminal.end may be stretched to the name's end below
     int terminalEnd = terminal.end;
     // is there a 'name' constituent or 'hyphword' constituent here?
     Vector<Annotation> constits = doc.annotationsAt(terminal.start, "constit");
     Annotation constit = null;
     Annotation nameConstit = null;
     Annotation hyphword = null;
     if (constits != null) {
       for (Annotation c : constits) {
         // compare by value:  identity comparison only matches interned feature values
         Object cat = c.get("cat");
         if ("name".equals(cat)) {
           nameConstit = c;
         } else if ("hyphword".equals(cat)) {
           hyphword = c;
         }
         if (constit == null) {
           constit = c;
         }
       }
     }
     // a hyphword takes precedence over a name starting at the same position
     if (hyphword != null) {
       nameConstit = null;
       constit = hyphword;
     }
     // if there is a name which is not part of a hyphword, associate the
     // name with this (first) terminal node, and mark any remaining terminal
     // nodes which match tokens in the name as empty
     if (nameConstit != null) {
       terminal.end = nameConstit.end();
       terminal.ann = nameConstit;
       nameConstitEnd = nameConstit.end();
     } else if (nameConstitEnd >= 0) {
       terminal.word = null;
     } else {
       Span span = new Span(terminal.start, terminal.end);
       // locale-independent upper-casing, so tags containing 'i' survive e.g. a Turkish locale
       String pennPOS = ((String) terminal.category).toUpperCase(java.util.Locale.ROOT).intern();
       String word = terminal.word;
       terminal.ann = StatParser.buildWordDefn(doc, word, span, constit, pennPOS);
     }
     // this terminal was the last token of the name:  leave 'inside a name' mode
     if (nameConstitEnd == terminalEnd) {
       nameConstitEnd = -1;
     }
   }
   // prune parse tree:  remove a node if it has no word or children
   pruneTree(node);
   determineNonTerminalSpans(node, treeSpan.start());
   // add head links
   if (hr == null) {
     hr = HeadRule.createDefaultRule();
   }
   hr.apply(node);
   // add annotations for non-terminals:
   Jet.Parser.ParseTreeNode.makeParseAnnotations(doc, node);
 }
示例#4
0
  /**
   * Drops a trailing whitespace character from <code>buffer</code> and shrinks the annotations
   * that covered it.
   *
   * <p>Nothing happens when the buffer is empty or its last character is not whitespace.
   * Otherwise the list is scanned from its tail: each annotation whose end equals the buffer
   * length is replaced by a copy (same type and attributes) whose span ends one position
   * earlier. The scan stops at the first annotation which does not end at the buffer's end.
   * Finally the last character of the buffer is deleted.
   *
   * @param annotations annotations over the buffer's text; entries are replaced in place
   * @param buffer text buffer whose trailing whitespace character is to be removed
   */
  private void modifyAnnotationEnd(List<Annotation> annotations, StringBuilder buffer) {
    final ListIterator<Annotation> cursor = annotations.listIterator(annotations.size());
    final int length = buffer.length();

    // nothing to trim:  empty buffer, or buffer not ending in whitespace
    if (length == 0 || !Character.isWhitespace(buffer.charAt(length - 1))) {
      return;
    }

    // walk backwards for as long as annotations reach the very end of the buffer
    boolean reachesEnd = true;
    while (reachesEnd && cursor.hasPrevious()) {
      final Annotation current = cursor.previous();
      reachesEnd = current.end() == length;
      if (reachesEnd) {
        final Span shortened = new Span(current.start(), current.end() - 1);
        cursor.set(new Annotation(current.type(), shortened, current.attributes()));
      }
    }

    buffer.deleteCharAt(length - 1);
  }