/**
 * Reads all parse trees from a Penn Treebank stream and packages them, together with a document
 * generated from them, into a {@link Treebank}.
 *
 * <p>Trees are read one at a time, skipping whitespace between them, until the look-ahead
 * reports end of input (-1). For each tree, spans are determined starting from the end offset of
 * the previous tree (0 for the first one) and its annotations are set. Once the document text has
 * been built from all trees, each tree is recorded as a "sentence" annotation and its node
 * annotations are added to the document.
 *
 * @param in character source holding the treebank text
 * @return a treebank wrapping the generated document and the list of trees that were read
 * @throws IOException if raised while reading from {@code in}
 * @throws InvalidFormatException if raised by the tree reader
 */
public Treebank load(Reader in) throws IOException, InvalidFormatException {
  PushbackReader input = new PushbackReader(in);
  List<ParseTreeNode> parsed = new ArrayList<ParseTreeNode>();
  int offset = 0;

  skipWhitespace(input);
  while (lookAhead(input) != -1) {
    ParseTreeNode root = readNode(input);
    parsed.add(root);
    determineSpans(root, offset);
    setAnnotations(root, null);
    // The next tree continues where this one ended.
    offset = root.end;
    skipWhitespace(input);
  }

  Document doc = new Document(buildDocumentString(parsed));
  for (ParseTreeNode root : parsed) {
    Span sentenceSpan = new Span(root.start, root.end);
    doc.annotate("sentence", sentenceSpan, new FeatureSet());
    annotate(doc, root);
  }
  return new Treebank(doc, parsed);
}
/** generate mention annotations (with entity numbers) based on the ACE entities and mentions. */ static void addMentionTags(Document doc, AceDocument aceDoc) { ArrayList<AceEntity> entities = aceDoc.entities; for (int i = 0; i < entities.size(); i++) { AceEntity entity = entities.get(i); ArrayList<AceEntityMention> mentions = entity.mentions; for (int j = 0; j < mentions.size(); j++) { AceEntityMention mention = (AceEntityMention) mentions.get(j); // we compute a jetSpan not including trailing whitespace Span aceSpan = mention.head; // skip mentions in ChEnglish APF not aligned to any English text if (aceSpan.start() < 0) continue; Span jetSpan = new Span(aceSpan.start(), aceSpan.end() + 1); FeatureSet features = new FeatureSet("entity", new Integer(i)); if (flags.contains("types")) { features.put("type", entity.type.substring(0, 3)); if (entity.subtype != null) features.put("subtype", entity.subtype); } if (flags.contains("extents")) { String cleanExtent = mention.text.replaceAll("\n", " "); features.put("extent", AceEntityMention.addXmlEscapes(cleanExtent)); } doc.annotate("mention", jetSpan, features); } } }
/** write 'fileText' out as file 'XMLfileName' with ENAMEX tags for the names in the document */ static void addENAMEXtags(Document doc, AceDocument aceDoc) { ArrayList<AceEntity> entities = aceDoc.entities; for (int i = 0; i < entities.size(); i++) { AceEntity entity = entities.get(i); ArrayList<AceEntityName> names = entity.names; for (int j = 0; j < names.size(); j++) { AceEntityName name = names.get(j); Span aceSpan = name.extent; Span jetSpan = new Span(aceSpan.start(), aceSpan.end() + 1); doc.annotate("ENAMEX", jetSpan, new FeatureSet("TYPE", entity.type)); } // for 2004 we have to examine PRE mentions and decide which are names if (year.equals("2004")) { ArrayList<AceEntityMention> mentions = entity.mentions; for (int j = 0; j < mentions.size(); j++) { AceEntityMention mention = mentions.get(j); String htext = Resolve.normalizeName(mention.headText); String[] mentionName = Gazetteer.splitAtWS(htext); String preClass = preDict.get(htext.toLowerCase()); if (mention.type.equals("PRE")) { if (gazetteer.isNationality(mentionName) || gazetteer.isLocation(mentionName) || "N".equals(preClass)) { Span aceSpan = mention.head; Span jetSpan = new Span(aceSpan.start(), aceSpan.end() + 1); doc.annotate("ENAMEX", jetSpan, new FeatureSet("TYPE", entity.type)); } else if (preClass != null) { // do nothing } else { System.out.println( "Unclassified PRE: " + mention.text + " {" + mention.headText + ")"); unknownPre.add(htext.toLowerCase()); } } } } } }
/**
 * Adds a "timex2" annotation to {@code doc} for each time expression in the ACE document.
 *
 * <p>Only the first mention of each time expression is used; the annotation covers that
 * mention's extent. The features "val", "anchor_val", "anchor_dir", "set" and "mod" are copied
 * from the time expression whenever the corresponding field is neither null nor empty. Time
 * expressions that have no mentions are skipped.
 *
 * @param doc document that receives the annotations
 * @param aceDoc ACE document supplying the time expressions
 */
static void addTimexTags(Document doc, AceDocument aceDoc) {
  List<AceTimex> timeExpressions = aceDoc.timeExpressions;
  for (AceTimex timex : timeExpressions) {
    // Without a mention there is no extent to annotate; get(0) would throw.
    if (timex.mentions.isEmpty()) {
      continue;
    }
    AceTimexMention mention = (AceTimexMention) timex.mentions.get(0);
    Span aceSpan = mention.extent;
    Span jetSpan = new Span(aceSpan.start(), aceSpan.end() + 1);

    FeatureSet features = new FeatureSet();
    if (timex.val != null && !timex.val.equals("")) {
      features.put("val", timex.val);
    }
    if (timex.anchorVal != null && !timex.anchorVal.equals("")) {
      features.put("anchor_val", timex.anchorVal);
    }
    if (timex.anchorDir != null && !timex.anchorDir.equals("")) {
      features.put("anchor_dir", timex.anchorDir);
    }
    if (timex.set != null && !timex.set.equals("")) {
      features.put("set", timex.set);
    }
    if (timex.mod != null && !timex.mod.equals("")) {
      features.put("mod", timex.mod);
    }
    doc.annotate("timex2", jetSpan, features);
  }
}
/** generate mention annotations (with entity numbers) based on the ACE entities and mentions. */ static void addMentionTags(Document doc, AceDocument aceDoc) { ArrayList<AceEntity> entities = aceDoc.entities; for (int i = 0; i < entities.size(); i++) { AceEntity entity = (AceEntity) entities.get(i); ArrayList<AceEntityMention> mentions = entity.mentions; for (int j = 0; j < mentions.size(); j++) { AceEntityMention mention = mentions.get(j); // we compute a jetSpan not including trailing whitespace Span aceSpan = mention.head; Span jetSpan = new Span(aceSpan.start(), aceSpan.end() + 1); FeatureSet features = new FeatureSet("entity", new Integer(i)); if (showTypes) { features.put("type", entity.type.substring(0, 3)); if (entity.subtype != null) features.put("subtype", entity.subtype); } doc.annotate("mention", jetSpan, features); } } }
private void annotate(Document doc, ParseTreeNode node) { doc.addAnnotation(node.ann); if (node.children != null) { Annotation[] children = new Annotation[node.children.length]; for (int i = 0; i < node.children.length; i++) { children[i] = node.children[i].ann; } node.ann.put("children", children); for (ParseTreeNode child : node.children) { annotate(doc, child); } } if (node.children == null && isAddingTokens) { // TODO: adds `case' property doc.annotate("token", node.ann.span(), new FeatureSet()); } }