/** splits document 'doc' into sentences, adding 'sentence' annotations */ static void addSentences(Document doc) { SpecialZoner.findSpecialZones(doc); Vector<Annotation> textSegments = doc.annotationsOfType("TEXT"); if (textSegments == null) { System.out.println("No <TEXT> in document"); return; } for (Annotation ann : textSegments) { Span textSpan = ann.span(); // check document case Ace.monocase = Ace.allLowerCase(doc); // split into sentences SentenceSplitter.split(doc, textSpan); } Vector<Annotation> sentences = doc.annotationsOfType("sentence"); if (sentences != null) { int sentNo = 0; for (Annotation sentence : sentences) { sentNo++; sentence.put("ID", "SENT-" + sentNo); } } doc.removeAnnotationsOfType("dateline"); doc.removeAnnotationsOfType("textBreak"); doc.shrink("sentence"); }
static void writeDoc1(Document doc, PrintStream out) throws IOException { Vector<Annotation> entities = doc.annotationsOfType("entity"); if (entities == null) { System.err.println("No Entity: " + doc); return; } Iterator<Annotation> entityIt = entities.iterator(); int i = 0; while (entityIt.hasNext()) { Annotation entity = entityIt.next(); Vector mentions = (Vector) entity.get("mentions"); Iterator mentionIt = mentions.iterator(); String nameType = (String) entity.get("nameType"); while (mentionIt.hasNext()) { Annotation mention1 = (Annotation) mentionIt.next(); Annotation mention2 = new Annotation("refobj", mention1.span(), new FeatureSet()); mention2.put("objid", Integer.toString(i)); if (nameType != null) { mention2.put("netype", nameType); } doc.addAnnotation(mention2); } i++; } // remove other annotations. String[] annotypes = doc.getAnnotationTypes(); for (i = 0; i < annotypes.length; i++) { String t = annotypes[i]; if (!(t.equals("tagger") || t.equals("refobj") || t.equals("ENAMEX"))) { doc.removeAnnotationsOfType(t); } } writeDocRaw(doc, out); return; }
/** * performs the action, adding the specified Annotation. Returns the position of the end of the * Annotation. */ @Override public int perform(Document doc, PatternApplication patap) { Span span; HashMap bindings = patap.bestBindings; // System.out.println ("bindings (for new annotation): " + bindings); if (spanVariable == null) { span = new Span(patap.startPosition, patap.bestPosition); } else if (spanVariable.name.toString() == "0") { span = new Span(patap.startPosition, patap.startPosition); } else { Object value = bindings.get(spanVariable.name); if (value instanceof Span) { span = (Span) value; } else if (value instanceof Annotation) { span = ((Annotation) value).span(); } else { System.out.println("Value of " + spanVariable.toString() + " is not a span.or annotation"); return -1; } } if (Pat.trace) Console.println( "Annotating " + doc.text(span) + " as " + type + " " + features.substitute(bindings).toSGMLString()); hideAnnotations(doc, type, span); hideAnnotations(doc, "token", span); Annotation newAnnotation = new Annotation(type, span, features.substitute(bindings)); doc.addAnnotation(newAnnotation); if (bindingVariable != null) bindings.put(bindingVariable.name, newAnnotation); return span.end(); }
static void processFile(String fname, PrintStream out) throws IOException { System.err.println("Processing: " + fname); FileInputStream fio = new FileInputStream(new File(fname)); InputStreamReader fread = new InputStreamReader(fio, JetTest.encoding); BufferedReader fp = new BufferedReader(fread); StringBuffer buf = new StringBuffer(); int docno = 0, allsents = 0, processedsents = 0; while (true) { String line = fp.readLine(); // EOF or an empty line: the end of a Document. if (line == null || line.equals("")) { if (0 < buf.length()) { SGMLProcessor.allTags = true; Document doc = SGMLProcessor.sgmlToDoc(buf.toString(), (String[]) null); doc.setSGMLwrapMargin(0); System.err.println( "Doc-" + docno + ": sents=" + allsents + ", processed=" + processedsents); processDoc1(doc, docno); writeDoc1(doc, out); out.flush(); buf = new StringBuffer(); docno++; allsents = 0; processedsents = 0; } if (line == null) { break; } else { continue; } } if (line.startsWith("#")) { // "#" indicates a comment line. buf.append(line + "\n"); } else { allsents++; if (processedsents < MaxProcessSentences) { buf.append("<sentence>"); String[] words = line.split(" "); for (int i = 0; i < words.length; i++) { if (0 != words[i].length()) { buf.append("<token>" + words[i] + " </token>"); } } buf.append("</sentence>\n"); processedsents++; } } } fp.close(); fread.close(); fio.close(); return; }
/** generate mention annotations (with entity numbers) based on the ACE entities and mentions. */ static void addMentionTags(Document doc, AceDocument aceDoc) { ArrayList<AceEntity> entities = aceDoc.entities; for (int i = 0; i < entities.size(); i++) { AceEntity entity = entities.get(i); ArrayList<AceEntityMention> mentions = entity.mentions; for (int j = 0; j < mentions.size(); j++) { AceEntityMention mention = (AceEntityMention) mentions.get(j); // we compute a jetSpan not including trailing whitespace Span aceSpan = mention.head; // skip mentions in ChEnglish APF not aligned to any English text if (aceSpan.start() < 0) continue; Span jetSpan = new Span(aceSpan.start(), aceSpan.end() + 1); FeatureSet features = new FeatureSet("entity", new Integer(i)); if (flags.contains("types")) { features.put("type", entity.type.substring(0, 3)); if (entity.subtype != null) features.put("subtype", entity.subtype); } if (flags.contains("extents")) { String cleanExtent = mention.text.replaceAll("\n", " "); features.put("extent", AceEntityMention.addXmlEscapes(cleanExtent)); } doc.annotate("mention", jetSpan, features); } } }
static void processDoc1(Document doc, int docno) throws IOException { // process document // System.err.println ("Parsing: "+docno+"/"+doc); String script = JetTest.config.getProperty("processDocument"); // if there is a name tagger, clear its cache if (JetTest.nameTagger != null) JetTest.nameTagger.newDocument(); Span all = new Span(0, doc.length()); Control.applyScript(doc, all, script); }
/** * hides (adds the 'hidden' feature) to all annotations of type <I>type</I> beginning at the * starting position of span <I>span</I>. */ public static void hideAnnotations(Document doc, String type, Span span) { for (int posn = span.start(); posn < span.end(); posn++) { Vector annotations = doc.annotationsAt(posn, type); if (annotations != null) { for (int i = 0; i < annotations.size(); i++) { Annotation ann = (Annotation) annotations.elementAt(i); ann.put("hidden", "true"); // Console.println ("Hiding " + ann); } } } }
/** write 'fileText' out as file 'XMLfileName' with ENAMEX tags for the names in the document */ static void addENAMEXtags(Document doc, AceDocument aceDoc) { ArrayList<AceEntity> entities = aceDoc.entities; for (int i = 0; i < entities.size(); i++) { AceEntity entity = entities.get(i); ArrayList<AceEntityName> names = entity.names; for (int j = 0; j < names.size(); j++) { AceEntityName name = names.get(j); Span aceSpan = name.extent; Span jetSpan = new Span(aceSpan.start(), aceSpan.end() + 1); doc.annotate("ENAMEX", jetSpan, new FeatureSet("TYPE", entity.type)); } // for 2004 we have to examine PRE mentions and decide which are names if (year.equals("2004")) { ArrayList<AceEntityMention> mentions = entity.mentions; for (int j = 0; j < mentions.size(); j++) { AceEntityMention mention = mentions.get(j); String htext = Resolve.normalizeName(mention.headText); String[] mentionName = Gazetteer.splitAtWS(htext); String preClass = preDict.get(htext.toLowerCase()); if (mention.type.equals("PRE")) { if (gazetteer.isNationality(mentionName) || gazetteer.isLocation(mentionName) || "N".equals(preClass)) { Span aceSpan = mention.head; Span jetSpan = new Span(aceSpan.start(), aceSpan.end() + 1); doc.annotate("ENAMEX", jetSpan, new FeatureSet("TYPE", entity.type)); } else if (preClass != null) { // do nothing } else { System.out.println( "Unclassified PRE: " + mention.text + " {" + mention.headText + ")"); unknownPre.add(htext.toLowerCase()); } } } } } }
/**
 * Adds a "timex2" annotation to 'doc' for each time expression of 'aceDoc'.
 *
 * <p>The annotation covers the extent of the expression's first mention. The attributes val,
 * anchorVal, anchorDir, set and mod of the time expression are copied to the features "val",
 * "anchor_val", "anchor_dir", "set" and "mod" respectively, each only when it is neither null nor
 * the empty string.
 */
static void addTimexTags(Document doc, AceDocument aceDoc) {
  List<AceTimex> timeExpressions = aceDoc.timeExpressions;
  for (AceTimex timex : timeExpressions) {
    AceTimexMention firstMention = (AceTimexMention) timex.mentions.get(0);
    Span extent = firstMention.extent;
    Span jetSpan = new Span(extent.start(), extent.end() + 1);

    FeatureSet features = new FeatureSet();
    String[] featureNames = {"val", "anchor_val", "anchor_dir", "set", "mod"};
    String[] featureValues = {timex.val, timex.anchorVal, timex.anchorDir, timex.set, timex.mod};
    for (int k = 0; k < featureNames.length; k++) {
      String value = featureValues[k];
      if (value != null && !value.equals("")) {
        features.put(featureNames[k], value);
      }
    }

    doc.annotate("timex2", jetSpan, features);
  }
}
/**
 * parses every sentence of Document 'doc' and collects the resulting dependency relations in a
 * single SyntacticRelationSet, which is returned. Returns <CODE>null</CODE> (after printing a
 * message) if the document has no 'sentence' annotations or if no parser model has been loaded.
 */
public static SyntacticRelationSet parseDocument(Document doc) {
  Vector<Annotation> sentences = doc.annotationsOfType("sentence");
  if (sentences == null || sentences.isEmpty()) {
    System.out.println("DepParser: no sentences");
    return null;
  }
  if (fsw == null) {
    System.out.println("DepParser: no model loaded");
    return null;
  }

  SyntacticRelationSet relations = new SyntacticRelationSet();
  for (int s = 0; s < sentences.size(); s++) {
    parseSentence(doc, sentences.get(s).span(), relations);
  }
  return relations;
}
/** generate mention annotations (with entity numbers) based on the ACE entities and mentions. */ static void addMentionTags(Document doc, AceDocument aceDoc) { ArrayList<AceEntity> entities = aceDoc.entities; for (int i = 0; i < entities.size(); i++) { AceEntity entity = (AceEntity) entities.get(i); ArrayList<AceEntityMention> mentions = entity.mentions; for (int j = 0; j < mentions.size(); j++) { AceEntityMention mention = mentions.get(j); // we compute a jetSpan not including trailing whitespace Span aceSpan = mention.head; Span jetSpan = new Span(aceSpan.start(), aceSpan.end() + 1); FeatureSet features = new FeatureSet("entity", new Integer(i)); if (showTypes) { features.put("type", entity.type.substring(0, 3)); if (entity.subtype != null) features.put("subtype", entity.subtype); } doc.annotate("mention", jetSpan, features); } } }
/**
 * Writes the SGML form of 'doc' (as produced by <CODE>doc.writeSGML(null)</CODE>) to 'out' as one
 * line and flushes the stream.
 */
static void writeDocRaw(Document doc, PrintStream out) throws IOException {
  String sgml = doc.writeSGML(null).toString();
  out.println(sgml);
  out.flush();
}
/**
 * Adds annotations derived from 'aceDoc' to 'doc' (via addAnnotations) and returns the annotated
 * document in the SGML form produced by <CODE>doc.writeSGML(null)</CODE>.
 */
public static String processDocument(Document doc, AceDocument aceDoc) {
  addAnnotations(doc, aceDoc);
  String sgml = doc.writeSGML(null).toString();
  return sgml;
}
/** generate the dependency parse for a sentence, adding its arcs to 'relations'. */ public static void parseSentence(Document doc, Span span, SyntacticRelationSet relations) { if (fsw == null) { System.out.println("DepParser: no model loaded"); return; } // System.out.println ("parseSentence: " + doc.text(span)); // run Penn part-of-speech tagger // JetTest.tagger.annotate(doc, span, "tagger"); // build sentence List<Token> tokens = new ArrayList<Token>(); List<Integer> offset = new ArrayList<Integer>(); offset.add(0); // don't use 0th entry int tokenNum = 0; int posn = span.start(); while (posn < span.end()) { tokenNum++; Annotation tokenAnnotation = doc.tokenAt(posn); for (String s : SPECIAL_TOKEN) { Vector<Annotation> va = doc.annotationsAt(posn, s); if (va != null && va.size() > 0) { tokenAnnotation = va.get(0); break; } } if (tokenAnnotation == null) return; String tokenText = doc.normalizedText(tokenAnnotation).replaceAll(" ", "_"); Vector v = doc.annotationsAt(posn, "tagger"); Annotation a = (Annotation) v.get(0); String pos = (String) a.get("cat"); tokens.add(new Token(tokenText, pos, tokenNum)); offset.add(posn); if (posn >= tokenAnnotation.end()) { break; } posn = tokenAnnotation.end(); } Sentence sent = new Sentence(tokens); // parse sentence Arc[] arcs = fsw.process( sent, tokens.size() > 0 && tokens.get(0).getPos() == null, true, true, true, true, true) .getParse() .getHeadArcs(); // get dependencies for (Arc arc : arcs) { if (arc == null) continue; if (arc.getDependency().equalsIgnoreCase("ROOT")) continue; Token head = arc.getHead(); String headText = head.getText(); String headPos = head.getPos(); Integer headOffset = offset.get(head.getIndex()); Token dep = arc.getChild(); String depText = dep.getText(); String depPos = dep.getPos(); Integer depOffset = offset.get(dep.getIndex()); String type = arc.getDependency(); SyntacticRelation r = new SyntacticRelation(headOffset, headText, headPos, type, depOffset, depText, depPos); relations.add(r); // 
System.out.println ("parseSentence: adding relation " + r); } }