Ejemplo n.º 1
0
 /**
  * Creates annotations for each node in parse tree <CODE>node</NODE>.
  * These annotations are added to the parse tree and to the document
  * <CODE>doc</CODE>.  In constrast to <CODE>setAnnotations</CODE>,
  * the categories used for terminal nodes are Jet categories obtained by
  * Jet tokenization and lexical look-up.  This means that hyphenated
  * items are split, and multi-word names are reduced to a single node.
  *
  * @param node      the root of the parse tree
  * @param treeSpan  the span of the document matching the parse tree
  * @param doc       the document to which annotations will be added
  */
 private void setJetAnnotations(ParseTreeNode node, Span treeSpan, Document doc) {
   StatParser.buildParserInput(doc, treeSpan.start(), treeSpan.end(), false);
   StatParser.fixHyphenatedItems(doc);
   int nameConstitEnd = -1;
   List<ParseTreeNode> terminals = getTerminalNodes(node);
   for (ParseTreeNode terminal : terminals) {
     int terminalEnd = terminal.end;
     // is there a 'name' constituent or 'hyphword' constituent here?
     Vector<Annotation> constits = doc.annotationsAt(terminal.start, "constit");
     Annotation constit = null;
     Annotation nameConstit = null;
     Annotation hyphword = null;
     if (constits != null) {
       for (Annotation c : constits) {
         if (c.get("cat") == "name") {
           nameConstit = c;
         } else if (c.get("cat") == "hyphword") {
           hyphword = c;
         }
         if (constit == null) constit = c;
       }
     }
     if (hyphword != null) {
       nameConstit = null;
       constit = hyphword;
     }
     // if there is a name which is not part of a hyphword, associate the
     // name with this (first) terminal node, and mark any remaining terminal
     // nodes which match tokens in the name as empty
     if (nameConstit != null) {
       terminal.end = nameConstit.end();
       terminal.ann = nameConstit;
       nameConstitEnd = nameConstit.end();
     } else if (nameConstitEnd >= 0) {
       terminal.word = null;
     } else {
       Span span = new Span(terminal.start, terminal.end);
       String pennPOS = ((String) terminal.category).toUpperCase().intern();
       String word = terminal.word;
       terminal.ann = StatParser.buildWordDefn(doc, word, span, constit, pennPOS);
     }
     if (nameConstitEnd == terminalEnd) nameConstitEnd = -1;
   }
   // prune parse tree:  remove a node if it has no word or children
   pruneTree(node);
   determineNonTerminalSpans(node, treeSpan.start());
   // add head links
   if (hr == null) hr = HeadRule.createDefaultRule();
   hr.apply(node);
   // add annotations for non-terminals:
   Jet.Parser.ParseTreeNode.makeParseAnnotations(doc, node);
 }
Ejemplo n.º 2
0
  /**
   * Adds <B>constit</B> annotations to an existing Document <CODE>doc</CODE> to represent the parse
   * tree structure <CODE>tree</CODE>.
   *
   * @param tree the parse tree (for a portion of Document doc)
   * @param doc the document
   * @param span the portion of doc covered by the parse tree
   * @param jetCategories if true, use Jet categories as terminal categories (if false, use
   *     categories read from parse trees)
   */
  public void addAnnotations(ParseTreeNode tree, Document doc, Span span, boolean jetCategories) {
    List<ParseTreeNode> terminalNodes = getTerminalNodes(tree);
    String text = doc.text();
    int offset = span.start();

    for (ParseTreeNode terminal : terminalNodes) {
      while (offset < span.end() && Character.isWhitespace(text.charAt(offset))) {
        offset++;
      }
      for (String skipString : skip) {
        if (text.startsWith(skipString, offset)) {
          offset += skipString.length();
          while (offset < span.end() && Character.isWhitespace(text.charAt(offset))) {
            offset++;
          }
          break;
        }
      }
      // match next terminal node against next word in text
      int matchLength = matchTextToTree(text, offset, terminal.word);
      if (matchLength > 0) {
        int endOffset = offset + matchLength;
        while (endOffset < span.end() && Character.isWhitespace(text.charAt(endOffset))) {
          endOffset++;
        }
        terminal.start = offset;
        terminal.end = endOffset;
        offset = endOffset;
      } else {
        System.err.println(
            "PTBReader.addAnnotations:  "
                + "Cannot determine parse tree offset for word "
                + terminal.word);
        System.err.println("  at document offset " + offset + " in sentence");
        System.err.println("  " + doc.text(span));
        return;
      }
    }

    if (jetCategories) {
      setJetAnnotations(tree, span, doc);
      StatParser.deleteUnusedConstits(doc, span, tree.ann); // <<<
    } else {
      determineNonTerminalSpans(tree, span.start());
      setAnnotations(tree, doc);
    }
  }