/** * Get the text value of this entity. The headTokenSpan MUST be set before calling this method! */ public String getValue() { List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class); // int lastEnd = -1; StringBuilder sb = new StringBuilder(); for (int i = headTokenSpan.start(); i < headTokenSpan.end(); i++) { CoreLabel token = tokens.get(i); // we are not guaranteed to have CharacterOffsets so we can't use them... /* Integer start = token.get(CharacterOffsetBeginAnnotation.class); Integer end = token.get(CharacterOffsetEndAnnotation.class); if (start != null && end != null) { if (lastEnd != -1 && !start.equals(lastEnd)) { sb.append(StringUtils.repeat(" ", start - lastEnd)); lastEnd = end; } } else { if (lastEnd != -1) sb.append(" "); lastEnd = 0; } */ if (i > headTokenSpan.start()) sb.append(" "); sb.append(token.word()); } return sb.toString(); }
/** generate mention annotations (with entity numbers) based on the ACE entities and mentions. */ static void addMentionTags(Document doc, AceDocument aceDoc) { ArrayList<AceEntity> entities = aceDoc.entities; for (int i = 0; i < entities.size(); i++) { AceEntity entity = entities.get(i); ArrayList<AceEntityMention> mentions = entity.mentions; for (int j = 0; j < mentions.size(); j++) { AceEntityMention mention = (AceEntityMention) mentions.get(j); // we compute a jetSpan not including trailing whitespace Span aceSpan = mention.head; // skip mentions in ChEnglish APF not aligned to any English text if (aceSpan.start() < 0) continue; Span jetSpan = new Span(aceSpan.start(), aceSpan.end() + 1); FeatureSet features = new FeatureSet("entity", new Integer(i)); if (flags.contains("types")) { features.put("type", entity.type.substring(0, 3)); if (entity.subtype != null) features.put("subtype", entity.subtype); } if (flags.contains("extents")) { String cleanExtent = mention.text.replaceAll("\n", " "); features.put("extent", AceEntityMention.addXmlEscapes(cleanExtent)); } doc.annotate("mention", jetSpan, features); } } }
/** Returns the words of the tokens in the extent span, joined by single spaces. */
public String getExtentString() {
  List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
  StringBuilder text = new StringBuilder();
  String separator = "";
  for (int idx = extentTokenSpan.start(); idx < extentTokenSpan.end(); idx++) {
    text.append(separator).append(tokens.get(idx).word());
    separator = " ";
  }
  return text.toString();
}
/**
 * Always returns the text corresponding to the extent of this object, even when
 * getValue is overridden by a subclass. Yields the empty string when either the
 * token list or the extent span is unavailable.
 */
public final String getFullValue() {
  List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
  if (tokens == null || extentTokenSpan == null) {
    return "";
  }
  StringBuilder buf = new StringBuilder();
  for (int pos = extentTokenSpan.start(); pos < extentTokenSpan.end(); pos++) {
    // single space between consecutive token words
    if (pos > extentTokenSpan.start()) {
      buf.append(" ");
    }
    buf.append(tokens.get(pos).word());
  }
  return buf.toString();
}
/** * Creates annotations for each node in parse tree <CODE>node</NODE>. * These annotations are added to the parse tree and to the document * <CODE>doc</CODE>. In constrast to <CODE>setAnnotations</CODE>, * the categories used for terminal nodes are Jet categories obtained by * Jet tokenization and lexical look-up. This means that hyphenated * items are split, and multi-word names are reduced to a single node. * * @param node the root of the parse tree * @param treeSpan the span of the document matching the parse tree * @param doc the document to which annotations will be added */ private void setJetAnnotations(ParseTreeNode node, Span treeSpan, Document doc) { StatParser.buildParserInput(doc, treeSpan.start(), treeSpan.end(), false); StatParser.fixHyphenatedItems(doc); int nameConstitEnd = -1; List<ParseTreeNode> terminals = getTerminalNodes(node); for (ParseTreeNode terminal : terminals) { int terminalEnd = terminal.end; // is there a 'name' constituent or 'hyphword' constituent here? 
Vector<Annotation> constits = doc.annotationsAt(terminal.start, "constit"); Annotation constit = null; Annotation nameConstit = null; Annotation hyphword = null; if (constits != null) { for (Annotation c : constits) { if (c.get("cat") == "name") { nameConstit = c; } else if (c.get("cat") == "hyphword") { hyphword = c; } if (constit == null) constit = c; } } if (hyphword != null) { nameConstit = null; constit = hyphword; } // if there is a name which is not part of a hyphword, associate the // name with this (first) terminal node, and mark any remaining terminal // nodes which match tokens in the name as empty if (nameConstit != null) { terminal.end = nameConstit.end(); terminal.ann = nameConstit; nameConstitEnd = nameConstit.end(); } else if (nameConstitEnd >= 0) { terminal.word = null; } else { Span span = new Span(terminal.start, terminal.end); String pennPOS = ((String) terminal.category).toUpperCase().intern(); String word = terminal.word; terminal.ann = StatParser.buildWordDefn(doc, word, span, constit, pennPOS); } if (nameConstitEnd == terminalEnd) nameConstitEnd = -1; } // prune parse tree: remove a node if it has no word or children pruneTree(node); determineNonTerminalSpans(node, treeSpan.start()); // add head links if (hr == null) hr = HeadRule.createDefaultRule(); hr.apply(node); // add annotations for non-terminals: Jet.Parser.ParseTreeNode.makeParseAnnotations(doc, node); }
/**
 * Adds <B>constit</B> annotations to an existing Document <CODE>doc</CODE> to represent the parse
 * tree structure <CODE>tree</CODE>.
 *
 * @param tree the parse tree (for a portion of Document doc)
 * @param doc the document
 * @param span the portion of doc covered by the parse tree
 * @param jetCategories if true, use Jet categories as terminal categories (if false, use
 *     categories read from parse trees)
 */
public void addAnnotations(ParseTreeNode tree, Document doc, Span span, boolean jetCategories) {
  List<ParseTreeNode> terminalNodes = getTerminalNodes(tree);
  String text = doc.text();
  // 'offset' walks forward through the raw document text, aligning each
  // terminal node of the tree with the next word of the document
  int offset = span.start();
  for (ParseTreeNode terminal : terminalNodes) {
    // skip leading whitespace before the next word
    while (offset < span.end() && Character.isWhitespace(text.charAt(offset))) {
      offset++;
    }
    // skip over any configured "skip" strings (e.g. markup) plus the
    // whitespace that follows them; at most one skip string is consumed here
    for (String skipString : skip) {
      if (text.startsWith(skipString, offset)) {
        offset += skipString.length();
        while (offset < span.end() && Character.isWhitespace(text.charAt(offset))) {
          offset++;
        }
        break;
      }
    }
    // match next terminal node against next word in text
    int matchLength = matchTextToTree(text, offset, terminal.word);
    if (matchLength > 0) {
      // extend the terminal's span through any trailing whitespace so that
      // consecutive terminal spans tile the sentence without gaps
      int endOffset = offset + matchLength;
      while (endOffset < span.end() && Character.isWhitespace(text.charAt(endOffset))) {
        endOffset++;
      }
      terminal.start = offset;
      terminal.end = endOffset;
      offset = endOffset;
    } else {
      // alignment failure: report context and abandon annotation of this tree
      System.err.println(
          "PTBReader.addAnnotations: "
              + "Cannot determine parse tree offset for word "
              + terminal.word);
      System.err.println(" at document offset " + offset + " in sentence");
      System.err.println(" " + doc.text(span));
      return;
    }
  }
  if (jetCategories) {
    setJetAnnotations(tree, span, doc);
    StatParser.deleteUnusedConstits(doc, span, tree.ann); // <<<
  } else {
    determineNonTerminalSpans(tree, span.start());
    setAnnotations(tree, doc);
  }
}
/** * hides (adds the 'hidden' feature) to all annotations of type <I>type</I> beginning at the * starting position of span <I>span</I>. */ public static void hideAnnotations(Document doc, String type, Span span) { for (int posn = span.start(); posn < span.end(); posn++) { Vector annotations = doc.annotationsAt(posn, type); if (annotations != null) { for (int i = 0; i < annotations.size(); i++) { Annotation ann = (Annotation) annotations.elementAt(i); ann.put("hidden", "true"); // Console.println ("Hiding " + ann); } } } }
/** write 'fileText' out as file 'XMLfileName' with ENAMEX tags for the names in the document */ static void addENAMEXtags(Document doc, AceDocument aceDoc) { ArrayList<AceEntity> entities = aceDoc.entities; for (int i = 0; i < entities.size(); i++) { AceEntity entity = entities.get(i); ArrayList<AceEntityName> names = entity.names; for (int j = 0; j < names.size(); j++) { AceEntityName name = names.get(j); Span aceSpan = name.extent; Span jetSpan = new Span(aceSpan.start(), aceSpan.end() + 1); doc.annotate("ENAMEX", jetSpan, new FeatureSet("TYPE", entity.type)); } // for 2004 we have to examine PRE mentions and decide which are names if (year.equals("2004")) { ArrayList<AceEntityMention> mentions = entity.mentions; for (int j = 0; j < mentions.size(); j++) { AceEntityMention mention = mentions.get(j); String htext = Resolve.normalizeName(mention.headText); String[] mentionName = Gazetteer.splitAtWS(htext); String preClass = preDict.get(htext.toLowerCase()); if (mention.type.equals("PRE")) { if (gazetteer.isNationality(mentionName) || gazetteer.isLocation(mentionName) || "N".equals(preClass)) { Span aceSpan = mention.head; Span jetSpan = new Span(aceSpan.start(), aceSpan.end() + 1); doc.annotate("ENAMEX", jetSpan, new FeatureSet("TYPE", entity.type)); } else if (preClass != null) { // do nothing } else { System.out.println( "Unclassified PRE: " + mention.text + " {" + mention.headText + ")"); unknownPre.add(htext.toLowerCase()); } } } } } }
/**
 * Adds a "timex2" annotation to {@code doc} for the first mention of each time
 * expression in {@code aceDoc}, copying the non-empty TIMEX2 attributes
 * (val, anchor_val, anchor_dir, set, mod) as features.
 */
static void addTimexTags(Document doc, AceDocument aceDoc) {
  List<AceTimex> timeExpressions = aceDoc.timeExpressions;
  for (AceTimex timex : timeExpressions) {
    AceTimexMention mention = (AceTimexMention) timex.mentions.get(0);
    Span aceSpan = mention.extent;
    // ACE extents are end-inclusive; Jet spans are end-exclusive
    Span jetSpan = new Span(aceSpan.start(), aceSpan.end() + 1);
    FeatureSet features = new FeatureSet();
    putIfNonEmpty(features, "val", timex.val);
    putIfNonEmpty(features, "anchor_val", timex.anchorVal);
    putIfNonEmpty(features, "anchor_dir", timex.anchorDir);
    putIfNonEmpty(features, "set", timex.set);
    putIfNonEmpty(features, "mod", timex.mod);
    doc.annotate("timex2", jetSpan, features);
  }
}

/** Stores {@code value} in {@code features} under {@code key} unless it is null or empty. */
private static void putIfNonEmpty(FeatureSet features, String key, String value) {
  if (value != null && !value.equals("")) features.put(key, value);
}
/** generate mention annotations (with entity numbers) based on the ACE entities and mentions. */ static void addMentionTags(Document doc, AceDocument aceDoc) { ArrayList<AceEntity> entities = aceDoc.entities; for (int i = 0; i < entities.size(); i++) { AceEntity entity = (AceEntity) entities.get(i); ArrayList<AceEntityMention> mentions = entity.mentions; for (int j = 0; j < mentions.size(); j++) { AceEntityMention mention = mentions.get(j); // we compute a jetSpan not including trailing whitespace Span aceSpan = mention.head; Span jetSpan = new Span(aceSpan.start(), aceSpan.end() + 1); FeatureSet features = new FeatureSet("entity", new Integer(i)); if (showTypes) { features.put("type", entity.type.substring(0, 3)); if (entity.subtype != null) features.put("subtype", entity.subtype); } doc.annotate("mention", jetSpan, features); } } }
/**
 * Renders this mention for debugging: the type followed by each optional field
 * (subType, mentionType, objectId, head/extent token bounds, head position,
 * head value, normalized name, coref id, type probabilities) when present.
 */
@Override
public String toString() {
  StringBuilder out = new StringBuilder("EntityMention [type=").append(type);
  if (subType != null) out.append(", subType=").append(subType);
  if (mentionType != null) out.append(", mentionType=").append(mentionType);
  if (objectId != null) out.append(", objectId=").append(objectId);
  if (headTokenSpan != null) {
    out.append(", hstart=").append(headTokenSpan.start());
    out.append(", hend=").append(headTokenSpan.end());
  }
  if (extentTokenSpan != null) {
    out.append(", estart=").append(extentTokenSpan.start());
    out.append(", eend=").append(extentTokenSpan.end());
  }
  if (syntacticHeadTokenPosition >= 0) {
    out.append(", headPosition=").append(syntacticHeadTokenPosition);
  }
  if (headTokenSpan != null) {
    out.append(", value=\"").append(getValue()).append("\"");
  }
  if (normalizedName != null) {
    out.append(", normalizedName=\"").append(normalizedName).append("\"");
  }
  out.append(", corefID=").append(corefID);
  if (typeProbabilities != null) {
    out.append(", probs=").append(probsToString());
  }
  return out.append("]").toString();
}
/** Returns the index of the first token of this mention's extent span. */
public int getExtentTokenStart() {
  return extentTokenSpan.start();
}
/** Returns the index of the first token of this mention's head span. */
public int getHeadTokenStart() {
  return headTokenSpan.start();
}
/** generate the dependency parse for a sentence, adding its arcs to 'relations'. */
public static void parseSentence(Document doc, Span span, SyntacticRelationSet relations) {
  // a loaded parser model (fsw) is required; bail out with a message otherwise
  if (fsw == null) {
    System.out.println("DepParser: no model loaded");
    return;
  }
  // System.out.println ("parseSentence: " + doc.text(span));
  // run Penn part-of-speech tagger
  // JetTest.tagger.annotate(doc, span, "tagger");
  // build sentence: walk the span token by token, collecting (text, POS, index)
  List<Token> tokens = new ArrayList<Token>();
  // offset.get(k) = document position of token k (1-based)
  List<Integer> offset = new ArrayList<Integer>();
  offset.add(0); // don't use 0th entry
  int tokenNum = 0;
  int posn = span.start();
  while (posn < span.end()) {
    tokenNum++;
    Annotation tokenAnnotation = doc.tokenAt(posn);
    // a SPECIAL_TOKEN annotation at this position overrides the plain token
    for (String s : SPECIAL_TOKEN) {
      Vector<Annotation> va = doc.annotationsAt(posn, s);
      if (va != null && va.size() > 0) {
        tokenAnnotation = va.get(0);
        break;
      }
    }
    // no token here: give up on the sentence without adding any relations
    if (tokenAnnotation == null) return;
    // parser input may not contain spaces within a token; replace with '_'
    String tokenText = doc.normalizedText(tokenAnnotation).replaceAll(" ", "_");
    // NOTE(review): assumes a "tagger" annotation exists at every token
    // position — v.get(0) will throw if the tagger has not been run. Confirm.
    Vector v = doc.annotationsAt(posn, "tagger");
    Annotation a = (Annotation) v.get(0);
    String pos = (String) a.get("cat");
    tokens.add(new Token(tokenText, pos, tokenNum));
    offset.add(posn);
    // guard against zero-length / non-advancing annotations
    if (posn >= tokenAnnotation.end()) {
      break;
    }
    posn = tokenAnnotation.end();
  }
  Sentence sent = new Sentence(tokens);
  // parse sentence
  Arc[] arcs =
      fsw.process(
              sent,
              tokens.size() > 0 && tokens.get(0).getPos() == null,
              true,
              true,
              true,
              true,
              true)
          .getParse()
          .getHeadArcs();
  // get dependencies: convert each non-ROOT arc to a SyntacticRelation,
  // mapping token indices back to document offsets via 'offset'
  for (Arc arc : arcs) {
    if (arc == null) continue;
    if (arc.getDependency().equalsIgnoreCase("ROOT")) continue;
    Token head = arc.getHead();
    String headText = head.getText();
    String headPos = head.getPos();
    Integer headOffset = offset.get(head.getIndex());
    Token dep = arc.getChild();
    String depText = dep.getText();
    String depPos = dep.getPos();
    Integer depOffset = offset.get(dep.getIndex());
    String type = arc.getDependency();
    SyntacticRelation r =
        new SyntacticRelation(headOffset, headText, headPos, type, depOffset, depText, depPos);
    relations.add(r);
    // System.out.println ("parseSentence: adding relation " + r);
  }
}