/**
 * Reads every parse tree available from the given reader (Penn treebank
 * corpus text) and returns them, together with a document built from the
 * trees, as a {@link Treebank}.
 *
 * <p>Trees are read one after another until no input remains. Each tree's
 * spans are determined starting from the end offset of the tree before it
 * (0 for the first tree). Once the document exists, every tree is recorded
 * in it as a "sentence" annotation covering the tree's span, followed by
 * the annotations of the tree's nodes.
 *
 * @param in source of the treebank text; it is wrapped in a
 *           {@link PushbackReader} and is not closed by this method
 * @return a treebank holding the created document and the list of root nodes
 * @throws IOException propagated from the reading helpers
 * @throws InvalidFormatException propagated from the reading helpers;
 *         presumably signals malformed treebank input (confirm in readNode)
 */
public Treebank load(Reader in) throws IOException, InvalidFormatException {
    PushbackReader source = new PushbackReader(in);
    List<ParseTreeNode> roots = new ArrayList<ParseTreeNode>();

    // Offset at which the next tree's spans begin.
    int offset = 0;

    skipWhitespace(source);
    while (lookAhead(source) != -1) {
        ParseTreeNode root = readNode(source);
        roots.add(root);
        determineSpans(root, offset);
        setAnnotations(root, null);
        offset = root.end;

        // Position on the next tree (or on end of input) before re-testing.
        skipWhitespace(source);
    }

    Document doc = new Document(buildDocumentString(roots));
    for (int i = 0; i < roots.size(); i++) {
        ParseTreeNode root = roots.get(i);
        Span sentenceSpan = new Span(root.start, root.end);
        doc.annotate("sentence", sentenceSpan, new FeatureSet());
        annotate(doc, root);
    }

    return new Treebank(doc, roots);
}
public void apply(Document doc, List<Object> values, Span span, DateTime ref) { Map params = getParameters(); String value = (String) params.get("value"); String diff = (String) params.get("diff"); String dir = (String) params.get("dir"); DateTime val = ref; if (value != null) { value = assignValues(value, values); val = new DateTime(value); } else if (diff != null) { diff = assignValues(diff, values); Period period = new Period(diff); if (dir == null || dir.equals("plus")) { val = ref.plus(period); } else if (dir.equals("minus")) { val = ref.minus(period); } } else { val = ref; // use set_xxx for (Map.Entry entry : (Set<Map.Entry>) params.entrySet()) { Matcher m = Pattern.compile("set_(.*)").matcher((String) entry.getKey()); if (m.matches()) { String field = assignValues((String) entry.getValue(), values); String fieldName = m.group(1); if (fieldName.equals("month")) { int month = Integer.parseInt(field); val = getTimeAnnotator().normalizeMonth(val, month); } else if (fieldName.equals("day")) { int day = Integer.parseInt(field); val = val.withField(DateTimeFieldType.dayOfMonth(), day); } else { throw new InternalError(); } } } } String formattedDate = formatter.print(val); FeatureSet attrs = new FeatureSet(); attrs.put("VAL", formattedDate); doc.annotate("TIMEX2", span, attrs); }
private void annotate(Document doc, ParseTreeNode node) { doc.addAnnotation(node.ann); if (node.children != null) { Annotation[] children = new Annotation[node.children.length]; for (int i = 0; i < node.children.length; i++) { children[i] = node.children[i].ann; } node.ann.put("children", children); for (ParseTreeNode child : node.children) { annotate(doc, child); } } if (node.children == null && isAddingTokens) { // TODO: adds `case' property doc.annotate("token", node.ann.span(), new FeatureSet()); } }