static void writeDoc1(Document doc, PrintStream out) throws IOException { Vector<Annotation> entities = doc.annotationsOfType("entity"); if (entities == null) { System.err.println("No Entity: " + doc); return; } Iterator<Annotation> entityIt = entities.iterator(); int i = 0; while (entityIt.hasNext()) { Annotation entity = entityIt.next(); Vector mentions = (Vector) entity.get("mentions"); Iterator mentionIt = mentions.iterator(); String nameType = (String) entity.get("nameType"); while (mentionIt.hasNext()) { Annotation mention1 = (Annotation) mentionIt.next(); Annotation mention2 = new Annotation("refobj", mention1.span(), new FeatureSet()); mention2.put("objid", Integer.toString(i)); if (nameType != null) { mention2.put("netype", nameType); } doc.addAnnotation(mention2); } i++; } // remove other annotations. String[] annotypes = doc.getAnnotationTypes(); for (i = 0; i < annotypes.length; i++) { String t = annotypes[i]; if (!(t.equals("tagger") || t.equals("refobj") || t.equals("ENAMEX"))) { doc.removeAnnotationsOfType(t); } } writeDocRaw(doc, out); return; }
/**
 * Parses every sentence of {@code doc} and gathers the resulting dependency relations.
 *
 * @param doc the document whose "sentence" annotations are parsed
 * @return a SyntacticRelationSet holding the relations added for all sentences, or {@code null}
 *     (after printing a diagnostic to standard output) when the document has no sentence
 *     annotations or no parser model is loaded
 */
public static SyntacticRelationSet parseDocument(Document doc) {
  Vector<Annotation> sentenceAnns = doc.annotationsOfType("sentence");
  boolean noSentences = sentenceAnns == null || sentenceAnns.isEmpty();
  if (noSentences) {
    System.out.println("DepParser: no sentences");
    return null;
  }
  if (fsw == null) {
    System.out.println("DepParser: no model loaded");
    return null;
  }

  SyntacticRelationSet collected = new SyntacticRelationSet();
  for (Annotation sentenceAnn : sentenceAnns) {
    parseSentence(doc, sentenceAnn.span(), collected);
  }
  return collected;
}
/** generate the dependency parse for a sentence, adding its arcs to 'relations'. */ public static void parseSentence(Document doc, Span span, SyntacticRelationSet relations) { if (fsw == null) { System.out.println("DepParser: no model loaded"); return; } // System.out.println ("parseSentence: " + doc.text(span)); // run Penn part-of-speech tagger // JetTest.tagger.annotate(doc, span, "tagger"); // build sentence List<Token> tokens = new ArrayList<Token>(); List<Integer> offset = new ArrayList<Integer>(); offset.add(0); // don't use 0th entry int tokenNum = 0; int posn = span.start(); while (posn < span.end()) { tokenNum++; Annotation tokenAnnotation = doc.tokenAt(posn); for (String s : SPECIAL_TOKEN) { Vector<Annotation> va = doc.annotationsAt(posn, s); if (va != null && va.size() > 0) { tokenAnnotation = va.get(0); break; } } if (tokenAnnotation == null) return; String tokenText = doc.normalizedText(tokenAnnotation).replaceAll(" ", "_"); Vector v = doc.annotationsAt(posn, "tagger"); Annotation a = (Annotation) v.get(0); String pos = (String) a.get("cat"); tokens.add(new Token(tokenText, pos, tokenNum)); offset.add(posn); if (posn >= tokenAnnotation.end()) { break; } posn = tokenAnnotation.end(); } Sentence sent = new Sentence(tokens); // parse sentence Arc[] arcs = fsw.process( sent, tokens.size() > 0 && tokens.get(0).getPos() == null, true, true, true, true, true) .getParse() .getHeadArcs(); // get dependencies for (Arc arc : arcs) { if (arc == null) continue; if (arc.getDependency().equalsIgnoreCase("ROOT")) continue; Token head = arc.getHead(); String headText = head.getText(); String headPos = head.getPos(); Integer headOffset = offset.get(head.getIndex()); Token dep = arc.getChild(); String depText = dep.getText(); String depPos = dep.getPos(); Integer depOffset = offset.get(dep.getIndex()); String type = arc.getDependency(); SyntacticRelation r = new SyntacticRelation(headOffset, headText, headPos, type, depOffset, depText, depPos); relations.add(r); // 
System.out.println ("parseSentence: adding relation " + r); } }