/**
 * Reads parse trees in Penn Treebank bracket notation from <CODE>in</CODE> and returns a
 * {@link Treebank} holding the trees together with a document built from them.
 *
 * <p>Trees are read one after another until <CODE>lookAhead</CODE> reports -1. Each tree is
 * given spans starting where the previous tree ended, and receives <B>constit</B> annotations.
 * Once the document has been created, every tree contributes one <B>sentence</B> annotation
 * covering its span, plus the annotations of all of its nodes.
 *
 * @param in source of the treebank text; wrapped in a PushbackReader and not closed here
 * @return the document and the list of parse trees read from <CODE>in</CODE>
 * @throws IOException propagated from the underlying read helpers
 * @throws InvalidFormatException propagated from the tree-reading helpers (presumably when the
 *     input is not well-formed treebank text)
 */
public Treebank load(Reader in) throws IOException, InvalidFormatException {
  PushbackReader input = new PushbackReader(in);
  List<ParseTreeNode> trees = new ArrayList<ParseTreeNode>();

  // each tree's span begins where the preceding tree's span ended
  int nextStart = 0;
  skipWhitespace(input);
  while (lookAhead(input) != -1) {
    ParseTreeNode root = readNode(input);
    trees.add(root);
    determineSpans(root, nextStart);
    // no document exists yet, so annotations are only attached to the tree nodes
    setAnnotations(root, null);
    nextStart = root.end;
    skipWhitespace(input);
  }

  Document doc = new Document(buildDocumentString(trees));
  for (ParseTreeNode root : trees) {
    doc.annotate("sentence", new Span(root.start, root.end), new FeatureSet());
    annotate(doc, root);
  }
  return new Treebank(doc, trees);
}
/**
 * Adds <B>constit</B> annotations to an existing Document <CODE>doc</CODE> to represent the parse
 * tree structure of a set of trees <CODE>trees</CODE>. This version is provided for parse tree
 * files which include sentence offsets.
 *
 * <p>The text of tree <I>i</I> is taken to run from <CODE>offsets.get(i)</CODE> up to the next
 * non-negative offset, or up to <CODE>span.end()</CODE> if no later offset is available. A
 * negative offset marks a tree whose position is unknown; such a tree is reported on standard
 * error and skipped. If the two lists differ in size, an error is reported and nothing is added.
 *
 * <p>After a tree has been processed, the first annotation of type <CODE>targetAnnotation</CODE>
 * starting at the tree's offset (if any) gets a 'parse' feature pointing to the tree's root
 * annotation.
 *
 * @param trees list of parse trees
 * @param offsets list of the starting position (in doc) of the text corresponding to each parse
 *     tree; a negative value means the offset is missing
 * @param doc document to which annotations should be added
 * @param targetAnnotation name of annotation to get 'parse' feature pointing to parse tree
 * @param span target span; its end bounds the last sentence
 * @param jetCategories if false, use lexical categories from Penn Tree Bank; if true, use
 *     categories from Jet
 */
public void addAnnotations(
    List<ParseTreeNode> trees,
    List<Integer> offsets,
    Document doc,
    String targetAnnotation,
    Span span,
    boolean jetCategories) {
  if (trees.size() != offsets.size()) {
    System.err.println(
        "PTBReader.addAnnotations: mismatch between number of "
            + "trees ("
            + trees.size()
            + ") and number of offsets ("
            + offsets.size()
            + ")");
    return;
  }

  for (int i = 0; i < trees.size(); i++) {
    ParseTreeNode tree = trees.get(i);
    int start = offsets.get(i);
    if (start < 0) {
      System.err.println("PTBReader.addAnnotations: offset missing for " + " parse tree " + i);
      continue;
    }

    // The sentence ends where the next sentence with a known offset begins. A following
    // "missing" (negative) offset must not be used as an end position, since that would
    // produce a span whose end lies before its start.
    int end = span.end();
    for (int j = i + 1; j < offsets.size(); j++) {
      int candidate = offsets.get(j);
      if (candidate >= 0) {
        end = candidate;
        break;
      }
    }

    Span sentenceSpan = new Span(start, end);
    addAnnotations(tree, doc, sentenceSpan, jetCategories);

    // link the target annotation starting at this offset to the root of the parse
    Vector<Annotation> anns = doc.annotationsAt(start, targetAnnotation);
    if (anns != null && anns.size() > 0) {
      Annotation ann = anns.get(0);
      ann.put("parse", tree.ann);
    }
  }
}
/**
 * Adds <B>constit</B> annotations to an existing Document <CODE>doc</CODE> to represent the parse
 * tree structure of a set of trees <CODE>trees</CODE>.
 *
 * <p>The annotations of type <CODE>targetAnnotation</CODE> within <CODE>span</CODE> are sorted by
 * their spans and paired with the trees in order: the i-th tree is aligned to the span of the
 * i-th annotation, and that annotation receives a 'parse' feature pointing to the tree's root
 * annotation. If the counts differ, a message is written to standard error and only the first
 * <CODE>min(#trees, #annotations)</CODE> pairs are processed.
 *
 * @param trees list of parse trees
 * @param doc document to which annotations should be added
 * @param targetAnnotation name of annotation to determine spans to add parse tree annotations.
 * @param span target span.
 * @param jetCategories if false, use lexical categories from Penn Tree Bank; if true, use
 *     categories from Jet
 */
public void addAnnotations(
    List<ParseTreeNode> trees,
    Document doc,
    String targetAnnotation,
    Span span,
    boolean jetCategories) {
  List<Annotation> targets = (List<Annotation>) doc.annotationsOfType(targetAnnotation, span);

  // order the target annotations by span so they line up with the tree order
  Collections.sort(
      targets,
      new Comparator<Annotation>() {
        public int compare(Annotation left, Annotation right) {
          return left.span().compareTo(right.span());
        }
      });

  int treeCount = trees.size();
  int targetCount = targets.size();
  if (treeCount != targetCount) {
    System.err.println(
        "PTBReader.addAnnotations: mismatch between number of "
            + targetAnnotation
            + " ("
            + targetCount
            + ") and number of trees ("
            + treeCount
            + ")");
  }

  // process only as many pairs as both lists can supply
  int pairCount = (treeCount < targetCount) ? treeCount : targetCount;
  for (int index = 0; index < pairCount; index++) {
    ParseTreeNode root = trees.get(index);
    addAnnotations(root, doc, targets.get(index).span(), jetCategories);
    targets.get(index).put("parse", root.ann);
  }
}
/** * Adds <B>constit</B> annotations to an existing Document <CODE>doc</CODE> to represent the parse * tree structure <CODE>tree</CODE>. * * @param tree the parse tree (for a portion of Document doc) * @param doc the document * @param span the portion of doc covered by the parse tree * @param jetCategories if true, use Jet categories as terminal categories (if false, use * categories read from parse trees) */ public void addAnnotations(ParseTreeNode tree, Document doc, Span span, boolean jetCategories) { List<ParseTreeNode> terminalNodes = getTerminalNodes(tree); String text = doc.text(); int offset = span.start(); for (ParseTreeNode terminal : terminalNodes) { while (offset < span.end() && Character.isWhitespace(text.charAt(offset))) { offset++; } for (String skipString : skip) { if (text.startsWith(skipString, offset)) { offset += skipString.length(); while (offset < span.end() && Character.isWhitespace(text.charAt(offset))) { offset++; } break; } } // match next terminal node against next word in text int matchLength = matchTextToTree(text, offset, terminal.word); if (matchLength > 0) { int endOffset = offset + matchLength; while (endOffset < span.end() && Character.isWhitespace(text.charAt(endOffset))) { endOffset++; } terminal.start = offset; terminal.end = endOffset; offset = endOffset; } else { System.err.println( "PTBReader.addAnnotations: " + "Cannot determine parse tree offset for word " + terminal.word); System.err.println(" at document offset " + offset + " in sentence"); System.err.println(" " + doc.text(span)); return; } } if (jetCategories) { setJetAnnotations(tree, span, doc); StatParser.deleteUnusedConstits(doc, span, tree.ann); // <<< } else { determineNonTerminalSpans(tree, span.start()); setAnnotations(tree, doc); } }
private void annotate(Document doc, ParseTreeNode node) { doc.addAnnotation(node.ann); if (node.children != null) { Annotation[] children = new Annotation[node.children.length]; for (int i = 0; i < node.children.length; i++) { children[i] = node.children[i].ann; } node.ann.put("children", children); for (ParseTreeNode child : node.children) { annotate(doc, child); } } if (node.children == null && isAddingTokens) { // TODO: adds `case' property doc.annotate("token", node.ann.span(), new FeatureSet()); } }
/**
 * Converts a set of Penn TreeBank files into text documents. Invoked by: PTBReader inputDir
 * outputDir. Converts all files with extension .mrg in inputDir to text documents, and writes
 * them into outputDir, using each input file's path relative to inputDir with its suffix
 * removed. Missing output directories are created.
 *
 * @param args exactly two elements: the input directory and the output directory
 * @throws Exception if a file cannot be parsed or written
 */
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    System.out.println("usage: java " + PTBReader.class.getName() + " inputDir outputDir");
    System.exit(1);
  }

  File inputDir = new File(args[0]);
  File outputDir = new File(args[1]);
  PTBReader parser = new PTBReader();

  for (File file : getFiles(inputDir, ".mrg")) {
    String outFilename = removeSuffix(getRelativePath(inputDir, file));
    File outFile = new File(outputDir, outFilename);

    // Parse first: if the input is malformed, no empty output file is left behind.
    Document doc = parser.load(file).getDocument();

    outFile.getParentFile().mkdirs();
    Writer out = new FileWriter(outFile);
    try {
      out.write(doc.text());
    } finally {
      // release the file handle even when the write fails
      out.close();
    }
  }
}
/** * Creates annotations for each node in parse tree <CODE>node</NODE>. * These annotations are added to the parse tree and to the document * <CODE>doc</CODE>. In constrast to <CODE>setAnnotations</CODE>, * the categories used for terminal nodes are Jet categories obtained by * Jet tokenization and lexical look-up. This means that hyphenated * items are split, and multi-word names are reduced to a single node. * * @param node the root of the parse tree * @param treeSpan the span of the document matching the parse tree * @param doc the document to which annotations will be added */ private void setJetAnnotations(ParseTreeNode node, Span treeSpan, Document doc) { StatParser.buildParserInput(doc, treeSpan.start(), treeSpan.end(), false); StatParser.fixHyphenatedItems(doc); int nameConstitEnd = -1; List<ParseTreeNode> terminals = getTerminalNodes(node); for (ParseTreeNode terminal : terminals) { int terminalEnd = terminal.end; // is there a 'name' constituent or 'hyphword' constituent here? 
Vector<Annotation> constits = doc.annotationsAt(terminal.start, "constit"); Annotation constit = null; Annotation nameConstit = null; Annotation hyphword = null; if (constits != null) { for (Annotation c : constits) { if (c.get("cat") == "name") { nameConstit = c; } else if (c.get("cat") == "hyphword") { hyphword = c; } if (constit == null) constit = c; } } if (hyphword != null) { nameConstit = null; constit = hyphword; } // if there is a name which is not part of a hyphword, associate the // name with this (first) terminal node, and mark any remaining terminal // nodes which match tokens in the name as empty if (nameConstit != null) { terminal.end = nameConstit.end(); terminal.ann = nameConstit; nameConstitEnd = nameConstit.end(); } else if (nameConstitEnd >= 0) { terminal.word = null; } else { Span span = new Span(terminal.start, terminal.end); String pennPOS = ((String) terminal.category).toUpperCase().intern(); String word = terminal.word; terminal.ann = StatParser.buildWordDefn(doc, word, span, constit, pennPOS); } if (nameConstitEnd == terminalEnd) nameConstitEnd = -1; } // prune parse tree: remove a node if it has no word or children pruneTree(node); determineNonTerminalSpans(node, treeSpan.start()); // add head links if (hr == null) hr = HeadRule.createDefaultRule(); hr.apply(node); // add annotations for non-terminals: Jet.Parser.ParseTreeNode.makeParseAnnotations(doc, node); }
public void apply(Document doc, List<Object> values, Span span, DateTime ref) { Map params = getParameters(); String value = (String) params.get("value"); String diff = (String) params.get("diff"); String dir = (String) params.get("dir"); DateTime val = ref; if (value != null) { value = assignValues(value, values); val = new DateTime(value); } else if (diff != null) { diff = assignValues(diff, values); Period period = new Period(diff); if (dir == null || dir.equals("plus")) { val = ref.plus(period); } else if (dir.equals("minus")) { val = ref.minus(period); } } else { val = ref; // use set_xxx for (Map.Entry entry : (Set<Map.Entry>) params.entrySet()) { Matcher m = Pattern.compile("set_(.*)").matcher((String) entry.getKey()); if (m.matches()) { String field = assignValues((String) entry.getValue(), values); String fieldName = m.group(1); if (fieldName.equals("month")) { int month = Integer.parseInt(field); val = getTimeAnnotator().normalizeMonth(val, month); } else if (fieldName.equals("day")) { int day = Integer.parseInt(field); val = val.withField(DateTimeFieldType.dayOfMonth(), day); } else { throw new InternalError(); } } } } String formattedDate = formatter.print(val); FeatureSet attrs = new FeatureSet(); attrs.put("VAL", formattedDate); doc.annotate("TIMEX2", span, attrs); }
/**
 * Creates a <B>constit</B> annotation for each node in the parse tree rooted at
 * <CODE>node</CODE>. Each annotation spans the node's start and end offsets and carries the
 * node's category as "cat", its head as "head" (only when non-zero) and its function as "func"
 * (only when non-null). The annotations are stored on the tree nodes; in addition, if
 * <CODE>doc</CODE> is non-null, they are added to the document.
 * <P>
 * Note that this method does not set the "children" attribute.
 *
 * @param node the root of the (sub)tree to annotate
 * @param doc the document receiving the annotations, or null to annotate the tree only
 */
private void setAnnotations(ParseTreeNode node, Document doc) {
  FeatureSet features = new FeatureSet();
  features.put("cat", node.category);
  if (node.head != 0) {
    features.put("head", node.head);
  }
  if (node.function != null) {
    features.put("func", node.function);
  }

  node.ann = new Annotation("constit", new Span(node.start, node.end), features);
  if (doc != null) {
    doc.addAnnotation(node.ann);
  }

  if (node.children == null) {
    return;
  }
  for (ParseTreeNode child : node.children) {
    setAnnotations(child, doc);
  }
}