예제 #1
0
 /**
  * Adds <B>constit</B> annotations to an existing Document <CODE>doc</CODE> to represent the parse
  * tree structure of a set of trees <CODE>trees</CODE>. This version is provided for parse tree
  * files which include sentence offsets.
  *
  * <p>The text of tree <CODE>i</CODE> is taken to run from its own offset up to the offset of the
  * next tree that has a known (non-negative) offset, or up to <CODE>span.end()</CODE> if no later
  * tree has one. Trees whose own offset is negative are reported on standard error and skipped.
  * If the two lists differ in size, an error is reported and nothing is added.
  *
  * @param trees list of parse trees
  * @param offsets list of the starting position (in doc) of the text corresponding to each parse
  *     tree; a negative value marks a missing offset
  * @param doc document to which annotations should be added
  * @param targetAnnotation type of the annotation which, if one is found at the start of a
  *     tree's text, gets a 'parse' feature pointing to that tree's annotation
  * @param span target span; its end is used as the end of the last sentence
  * @param jetCategories if false, use lexical categories from Penn Tree Bank; if true, use
  *     categories from Jet
  */
 public void addAnnotations(
     List<ParseTreeNode> trees,
     List<Integer> offsets,
     Document doc,
     String targetAnnotation,
     Span span,
     boolean jetCategories) {
   if (trees.size() != offsets.size()) {
     System.err.println(
         "PTBReader.addAnnotations:  mismatch between number of "
             + "trees ("
             + trees.size()
             + ") and number of offsets ("
             + offsets.size()
             + ")");
     return;
   }
   for (int i = 0; i < trees.size(); i++) {
     ParseTreeNode tree = trees.get(i);
     int start = offsets.get(i);
     if (start < 0) {
       System.err.println("PTBReader.addAnnotations:  offset missing for " + " parse tree " + i);
       continue;
     }
     // The sentence ends where the next tree with a known offset begins.  A missing
     // (negative) offset must not be used as an end position, so such entries are
     // skipped; when no later offset is known, the sentence runs to the end of the span.
     int end = span.end();
     for (int j = i + 1; j < offsets.size(); j++) {
       int nextStart = offsets.get(j);
       if (nextStart >= 0) {
         end = nextStart;
         break;
       }
     }
     Span sentenceSpan = new Span(start, end);
     addAnnotations(tree, doc, sentenceSpan, jetCategories);
     // link the first target annotation found at the sentence start to the parse
     Vector<Annotation> anns = doc.annotationsAt(start, targetAnnotation);
     if (anns != null && anns.size() > 0) {
       Annotation ann = anns.get(0);
       ann.put("parse", tree.ann);
     }
   }
 }
예제 #2
0
 /**
  * Hides annotations of type <I>type</I> within span <I>span</I>: for every character position
  * from the start of the span up to (but not including) its end, each annotation of that type
  * returned by the document for that position has its 'hidden' feature set to "true".
  *
  * @param doc the document whose annotations are to be hidden
  * @param type the annotation type to hide
  * @param span the range of positions to scan
  */
 public static void hideAnnotations(Document doc, String type, Span span) {
   int position = span.start();
   while (position < span.end()) {
     Vector<Annotation> found = doc.annotationsAt(position, type);
     position++;
     // no annotations of this type at this position
     if (found == null) {
       continue;
     }
     for (Annotation annotation : found) {
       annotation.put("hidden", "true");
     }
   }
 }
예제 #3
0
 /**
  * Creates annotations for each node in parse tree <CODE>node</CODE>.
  * These annotations are added to the parse tree and to the document
  * <CODE>doc</CODE>.  In contrast to <CODE>setAnnotations</CODE>,
  * the categories used for terminal nodes are Jet categories obtained by
  * Jet tokenization and lexical look-up.  This means that hyphenated
  * items are split, and multi-word names are reduced to a single node.
  *
  * @param node      the root of the parse tree
  * @param treeSpan  the span of the document matching the parse tree
  * @param doc       the document to which annotations will be added
  */
 private void setJetAnnotations(ParseTreeNode node, Span treeSpan, Document doc) {
   StatParser.buildParserInput(doc, treeSpan.start(), treeSpan.end(), false);
   StatParser.fixHyphenatedItems(doc);
   // end position of the name currently being consumed, or -1 if none
   int nameConstitEnd = -1;
   List<ParseTreeNode> terminals = getTerminalNodes(node);
   for (ParseTreeNode terminal : terminals) {
     // remember the original end:  terminal.end may be stretched to cover a name below
     int terminalEnd = terminal.end;
     // is there a 'name' constituent or 'hyphword' constituent here?
     Vector<Annotation> constits = doc.annotationsAt(terminal.start, "constit");
     Annotation constit = null;
     Annotation nameConstit = null;
     Annotation hyphword = null;
     if (constits != null) {
       for (Annotation c : constits) {
         // compare by value:  reference comparison only works if every
         // feature value happens to be an interned string
         Object cat = c.get("cat");
         if ("name".equals(cat)) {
           nameConstit = c;
         } else if ("hyphword".equals(cat)) {
           hyphword = c;
         }
         // default to the first constituent found at this position
         if (constit == null) constit = c;
       }
     }
     // a hyphword takes precedence over a name starting at the same position
     if (hyphword != null) {
       nameConstit = null;
       constit = hyphword;
     }
     // if there is a name which is not part of a hyphword, associate the
     // name with this (first) terminal node, and mark any remaining terminal
     // nodes which match tokens in the name as empty
     if (nameConstit != null) {
       terminal.end = nameConstit.end();
       terminal.ann = nameConstit;
       nameConstitEnd = nameConstit.end();
     } else if (nameConstitEnd >= 0) {
       terminal.word = null;
     } else {
       Span span = new Span(terminal.start, terminal.end);
       String pennPOS = ((String) terminal.category).toUpperCase().intern();
       String word = terminal.word;
       terminal.ann = StatParser.buildWordDefn(doc, word, span, constit, pennPOS);
     }
     // this terminal was the last token of the name:  stop blanking terminals
     if (nameConstitEnd == terminalEnd) nameConstitEnd = -1;
   }
   // prune parse tree:  remove a node if it has no word or children
   pruneTree(node);
   determineNonTerminalSpans(node, treeSpan.start());
   // add head links
   if (hr == null) hr = HeadRule.createDefaultRule();
   hr.apply(node);
   // add annotations for non-terminals:
   Jet.Parser.ParseTreeNode.makeParseAnnotations(doc, node);
 }
예제 #4
0
  /**
   * Generates the dependency parse for a sentence, adding its arcs to 'relations'.
   *
   * <p>The sentence is built from the tokens of <CODE>doc</CODE> within <CODE>span</CODE>; an
   * annotation of one of the <CODE>SPECIAL_TOKEN</CODE> types starting at a token position is
   * used in place of the token. Part-of-speech tags are read from the 'cat' feature of existing
   * 'tagger' annotations; a token with no 'tagger' annotation gets a null part of speech. Every
   * arc returned by the parser except ROOT arcs is added to <CODE>relations</CODE> as a
   * <CODE>SyntacticRelation</CODE> carrying the document offsets of head and dependent.
   *
   * <p>If no model has been loaded, a message is printed and nothing is added. If no token is
   * found at some position within the span, the method returns without adding any relations.
   *
   * @param doc the document containing the sentence
   * @param span the span of the sentence within the document
   * @param relations the set to which the dependency relations are added
   */
  public static void parseSentence(Document doc, Span span, SyntacticRelationSet relations) {
    if (fsw == null) {
      System.out.println("DepParser:  no model loaded");
      return;
    }
    // build sentence
    List<Token> tokens = new ArrayList<Token>();
    // offset.get(k) is the document position of the token with index k;
    // token indices start at 1
    List<Integer> offset = new ArrayList<Integer>();
    offset.add(0); // don't use 0th entry
    int tokenNum = 0;
    int posn = span.start();
    while (posn < span.end()) {
      tokenNum++;
      Annotation tokenAnnotation = doc.tokenAt(posn);
      // a special-token annotation starting here replaces the plain token
      for (String s : SPECIAL_TOKEN) {
        Vector<Annotation> va = doc.annotationsAt(posn, s);
        if (va != null && va.size() > 0) {
          tokenAnnotation = va.get(0);
          break;
        }
      }
      if (tokenAnnotation == null) return;
      String tokenText = doc.normalizedText(tokenAnnotation).replaceAll(" ", "_");
      // The tagger annotation may be missing at this position; leave the part of
      // speech null in that case rather than failing with a NullPointerException.
      // NOTE(review): a sentence mixing tagged and untagged tokens is passed to the
      // parser as is -- confirm that the parser accepts this.
      Vector<Annotation> taggerAnns = doc.annotationsAt(posn, "tagger");
      String pos = null;
      if (taggerAnns != null && !taggerAnns.isEmpty()) {
        pos = (String) taggerAnns.get(0).get("cat");
      }
      tokens.add(new Token(tokenText, pos, tokenNum));
      offset.add(posn);
      // guard against an annotation which does not advance the position
      if (posn >= tokenAnnotation.end()) {
        break;
      }
      posn = tokenAnnotation.end();
    }
    Sentence sent = new Sentence(tokens);
    // parse sentence; the second argument is true when the first token has no
    // part of speech (presumably asking the parser to tag -- TODO confirm)
    Arc[] arcs =
        fsw.process(
                sent,
                tokens.size() > 0 && tokens.get(0).getPos() == null,
                true,
                true,
                true,
                true,
                true)
            .getParse()
            .getHeadArcs();

    // get dependencies
    for (Arc arc : arcs) {
      if (arc == null) continue;
      if (arc.getDependency().equalsIgnoreCase("ROOT")) continue;
      Token head = arc.getHead();
      String headText = head.getText();
      String headPos = head.getPos();
      Integer headOffset = offset.get(head.getIndex());
      Token dep = arc.getChild();
      String depText = dep.getText();
      String depPos = dep.getPos();
      Integer depOffset = offset.get(dep.getIndex());
      String type = arc.getDependency();
      SyntacticRelation r =
          new SyntacticRelation(headOffset, headText, headPos, type, depOffset, depText, depPos);
      relations.add(r);
    }
  }