/**
 * Adds <B>constit</B> annotations to an existing Document <CODE>doc</CODE> to represent the parse
 * tree structure of a set of trees <CODE>trees</CODE>. This version is provided for parse tree
 * files which include sentence offsets.
 *
 * <p>Each tree is annotated over the span running from its own offset up to the offset of the
 * next tree that has a known (non-negative) offset, or up to the end of <CODE>span</CODE> if no
 * later tree has one. After a tree is annotated, the first <CODE>targetAnnotation</CODE> found at
 * the tree's starting offset (if any) receives a 'parse' feature pointing to the tree's
 * annotation.
 *
 * <p>If the two lists differ in length, a message is written to standard error and nothing is
 * added. A tree whose own offset is negative is reported on standard error and skipped.
 *
 * @param trees list of parse trees
 * @param offsets list of the starting position (in doc) of the text corresponding to each parse
 *     tree; a negative value marks a missing offset
 * @param doc document to which annotations should be added
 * @param targetAnnotation name of annotation to get 'parse' feature pointing to parse tree
 * @param span target span; its end is used as the end of the last sentence
 * @param jetCategories if false, use lexical categories from Penn Tree Bank; if true, use
 *     categories from Jet
 */
public void addAnnotations(
    List<ParseTreeNode> trees,
    List<Integer> offsets,
    Document doc,
    String targetAnnotation,
    Span span,
    boolean jetCategories) {
  if (trees.size() != offsets.size()) {
    System.err.println(
        "PTBReader.addAnnotations: mismatch between number of "
            + "trees ("
            + trees.size()
            + ") and number of offsets ("
            + offsets.size()
            + ")");
    return;
  }
  for (int i = 0; i < trees.size(); i++) {
    ParseTreeNode tree = trees.get(i);
    int start = offsets.get(i);
    if (start < 0) {
      System.err.println("PTBReader.addAnnotations: offset missing for " + " parse tree " + i);
      continue;
    }
    // The sentence ends where the next tree with a known offset begins. A following tree whose
    // offset is missing (negative) must not be used as the end, since that would produce a span
    // with a negative end position; skip over such entries instead.
    int end = -1;
    for (int j = i + 1; j < offsets.size() && end < 0; j++) {
      end = offsets.get(j);
    }
    if (end < 0) {
      // no later tree has a usable offset: this sentence runs to the end of the target span
      end = span.end();
    }
    Span sentenceSpan = new Span(start, end);
    addAnnotations(tree, doc, sentenceSpan, jetCategories);
    // link the first target annotation starting at this sentence to the tree's annotation
    Vector<Annotation> anns = doc.annotationsAt(start, targetAnnotation);
    if (anns != null && !anns.isEmpty()) {
      Annotation ann = anns.get(0);
      ann.put("parse", tree.ann);
    }
  }
}
/** * hides (adds the 'hidden' feature) to all annotations of type <I>type</I> beginning at the * starting position of span <I>span</I>. */ public static void hideAnnotations(Document doc, String type, Span span) { for (int posn = span.start(); posn < span.end(); posn++) { Vector annotations = doc.annotationsAt(posn, type); if (annotations != null) { for (int i = 0; i < annotations.size(); i++) { Annotation ann = (Annotation) annotations.elementAt(i); ann.put("hidden", "true"); // Console.println ("Hiding " + ann); } } } }
/** * Creates annotations for each node in parse tree <CODE>node</NODE>. * These annotations are added to the parse tree and to the document * <CODE>doc</CODE>. In constrast to <CODE>setAnnotations</CODE>, * the categories used for terminal nodes are Jet categories obtained by * Jet tokenization and lexical look-up. This means that hyphenated * items are split, and multi-word names are reduced to a single node. * * @param node the root of the parse tree * @param treeSpan the span of the document matching the parse tree * @param doc the document to which annotations will be added */ private void setJetAnnotations(ParseTreeNode node, Span treeSpan, Document doc) { StatParser.buildParserInput(doc, treeSpan.start(), treeSpan.end(), false); StatParser.fixHyphenatedItems(doc); int nameConstitEnd = -1; List<ParseTreeNode> terminals = getTerminalNodes(node); for (ParseTreeNode terminal : terminals) { int terminalEnd = terminal.end; // is there a 'name' constituent or 'hyphword' constituent here? 
Vector<Annotation> constits = doc.annotationsAt(terminal.start, "constit"); Annotation constit = null; Annotation nameConstit = null; Annotation hyphword = null; if (constits != null) { for (Annotation c : constits) { if (c.get("cat") == "name") { nameConstit = c; } else if (c.get("cat") == "hyphword") { hyphword = c; } if (constit == null) constit = c; } } if (hyphword != null) { nameConstit = null; constit = hyphword; } // if there is a name which is not part of a hyphword, associate the // name with this (first) terminal node, and mark any remaining terminal // nodes which match tokens in the name as empty if (nameConstit != null) { terminal.end = nameConstit.end(); terminal.ann = nameConstit; nameConstitEnd = nameConstit.end(); } else if (nameConstitEnd >= 0) { terminal.word = null; } else { Span span = new Span(terminal.start, terminal.end); String pennPOS = ((String) terminal.category).toUpperCase().intern(); String word = terminal.word; terminal.ann = StatParser.buildWordDefn(doc, word, span, constit, pennPOS); } if (nameConstitEnd == terminalEnd) nameConstitEnd = -1; } // prune parse tree: remove a node if it has no word or children pruneTree(node); determineNonTerminalSpans(node, treeSpan.start()); // add head links if (hr == null) hr = HeadRule.createDefaultRule(); hr.apply(node); // add annotations for non-terminals: Jet.Parser.ParseTreeNode.makeParseAnnotations(doc, node); }
/** generate the dependency parse for a sentence, adding its arcs to 'relations'. */ public static void parseSentence(Document doc, Span span, SyntacticRelationSet relations) { if (fsw == null) { System.out.println("DepParser: no model loaded"); return; } // System.out.println ("parseSentence: " + doc.text(span)); // run Penn part-of-speech tagger // JetTest.tagger.annotate(doc, span, "tagger"); // build sentence List<Token> tokens = new ArrayList<Token>(); List<Integer> offset = new ArrayList<Integer>(); offset.add(0); // don't use 0th entry int tokenNum = 0; int posn = span.start(); while (posn < span.end()) { tokenNum++; Annotation tokenAnnotation = doc.tokenAt(posn); for (String s : SPECIAL_TOKEN) { Vector<Annotation> va = doc.annotationsAt(posn, s); if (va != null && va.size() > 0) { tokenAnnotation = va.get(0); break; } } if (tokenAnnotation == null) return; String tokenText = doc.normalizedText(tokenAnnotation).replaceAll(" ", "_"); Vector v = doc.annotationsAt(posn, "tagger"); Annotation a = (Annotation) v.get(0); String pos = (String) a.get("cat"); tokens.add(new Token(tokenText, pos, tokenNum)); offset.add(posn); if (posn >= tokenAnnotation.end()) { break; } posn = tokenAnnotation.end(); } Sentence sent = new Sentence(tokens); // parse sentence Arc[] arcs = fsw.process( sent, tokens.size() > 0 && tokens.get(0).getPos() == null, true, true, true, true, true) .getParse() .getHeadArcs(); // get dependencies for (Arc arc : arcs) { if (arc == null) continue; if (arc.getDependency().equalsIgnoreCase("ROOT")) continue; Token head = arc.getHead(); String headText = head.getText(); String headPos = head.getPos(); Integer headOffset = offset.get(head.getIndex()); Token dep = arc.getChild(); String depText = dep.getText(); String depPos = dep.getPos(); Integer depOffset = offset.get(dep.getIndex()); String type = arc.getDependency(); SyntacticRelation r = new SyntacticRelation(headOffset, headText, headPos, type, depOffset, depText, depPos); relations.add(r); // 
System.out.println ("parseSentence: adding relation " + r); } }