예제 #1
0
파일: APFtoXML.java 프로젝트: rgrishman/jet
 /** splits document 'doc' into sentences, adding 'sentence' annotations */
 static void addSentences(Document doc) {
   SpecialZoner.findSpecialZones(doc);
   Vector<Annotation> textSegments = doc.annotationsOfType("TEXT");
   if (textSegments == null) {
     System.out.println("No <TEXT> in document");
     return;
   }
   for (Annotation ann : textSegments) {
     Span textSpan = ann.span();
     // check document case
     Ace.monocase = Ace.allLowerCase(doc);
     // split into sentences
     SentenceSplitter.split(doc, textSpan);
   }
   Vector<Annotation> sentences = doc.annotationsOfType("sentence");
   if (sentences != null) {
     int sentNo = 0;
     for (Annotation sentence : sentences) {
       sentNo++;
       sentence.put("ID", "SENT-" + sentNo);
     }
   }
   doc.removeAnnotationsOfType("dateline");
   doc.removeAnnotationsOfType("textBreak");
   doc.shrink("sentence");
 }
예제 #2
0
 /**
  * hides (adds the 'hidden' feature) to all annotations of type <I>type</I> beginning at the
  * starting position of span <I>span</I>.
  */
 public static void hideAnnotations(Document doc, String type, Span span) {
   for (int posn = span.start(); posn < span.end(); posn++) {
     Vector annotations = doc.annotationsAt(posn, type);
     if (annotations != null) {
       for (int i = 0; i < annotations.size(); i++) {
         Annotation ann = (Annotation) annotations.elementAt(i);
         ann.put("hidden", "true");
         // Console.println ("Hiding " + ann);
       }
     }
   }
 }
예제 #3
0
 static void writeDoc1(Document doc, PrintStream out) throws IOException {
   Vector<Annotation> entities = doc.annotationsOfType("entity");
   if (entities == null) {
     System.err.println("No Entity: " + doc);
     return;
   }
   Iterator<Annotation> entityIt = entities.iterator();
   int i = 0;
   while (entityIt.hasNext()) {
     Annotation entity = entityIt.next();
     Vector mentions = (Vector) entity.get("mentions");
     Iterator mentionIt = mentions.iterator();
     String nameType = (String) entity.get("nameType");
     while (mentionIt.hasNext()) {
       Annotation mention1 = (Annotation) mentionIt.next();
       Annotation mention2 = new Annotation("refobj", mention1.span(), new FeatureSet());
       mention2.put("objid", Integer.toString(i));
       if (nameType != null) {
         mention2.put("netype", nameType);
       }
       doc.addAnnotation(mention2);
     }
     i++;
   }
   // remove other annotations.
   String[] annotypes = doc.getAnnotationTypes();
   for (i = 0; i < annotypes.length; i++) {
     String t = annotypes[i];
     if (!(t.equals("tagger") || t.equals("refobj") || t.equals("ENAMEX"))) {
       doc.removeAnnotationsOfType(t);
     }
   }
   writeDocRaw(doc, out);
   return;
 }
예제 #4
0
 /**
  * parse all the sentences in Document 'doc', returning a SyntacticRelationSet containing all the
  * dependency relations.
  */
 public static SyntacticRelationSet parseDocument(Document doc) {
   Vector<Annotation> sentences = doc.annotationsOfType("sentence");
   if (sentences == null || sentences.size() == 0) {
     System.out.println("DepParser:  no sentences");
     return null;
   }
   if (fsw == null) {
     System.out.println("DepParser:  no model loaded");
     return null;
   }
   SyntacticRelationSet relations = new SyntacticRelationSet();
   for (Annotation sentence : sentences) {
     Span span = sentence.span();
     parseSentence(doc, span, relations);
   }
   return relations;
 }
예제 #5
0
  /** generate the dependency parse for a sentence, adding its arcs to 'relations'. */
  public static void parseSentence(Document doc, Span span, SyntacticRelationSet relations) {
    if (fsw == null) {
      System.out.println("DepParser:  no model loaded");
      return;
    }
    // System.out.println ("parseSentence:  " + doc.text(span));
    // run Penn part-of-speech tagger
    // JetTest.tagger.annotate(doc, span, "tagger");
    // build sentence
    List<Token> tokens = new ArrayList<Token>();
    List<Integer> offset = new ArrayList<Integer>();
    offset.add(0); // don't use 0th entry
    int tokenNum = 0;
    int posn = span.start();
    while (posn < span.end()) {
      tokenNum++;
      Annotation tokenAnnotation = doc.tokenAt(posn);
      for (String s : SPECIAL_TOKEN) {
        Vector<Annotation> va = doc.annotationsAt(posn, s);
        if (va != null && va.size() > 0) {
          tokenAnnotation = va.get(0);
          break;
        }
      }
      if (tokenAnnotation == null) return;
      String tokenText = doc.normalizedText(tokenAnnotation).replaceAll(" ", "_");
      Vector v = doc.annotationsAt(posn, "tagger");
      Annotation a = (Annotation) v.get(0);
      String pos = (String) a.get("cat");
      tokens.add(new Token(tokenText, pos, tokenNum));
      offset.add(posn);
      if (posn >= tokenAnnotation.end()) {
        break;
      }
      posn = tokenAnnotation.end();
    }
    Sentence sent = new Sentence(tokens);
    // parse sentence
    Arc[] arcs =
        fsw.process(
                sent,
                tokens.size() > 0 && tokens.get(0).getPos() == null,
                true,
                true,
                true,
                true,
                true)
            .getParse()
            .getHeadArcs();

    // get dependencies
    for (Arc arc : arcs) {
      if (arc == null) continue;
      if (arc.getDependency().equalsIgnoreCase("ROOT")) continue;
      Token head = arc.getHead();
      String headText = head.getText();
      String headPos = head.getPos();
      Integer headOffset = offset.get(head.getIndex());
      Token dep = arc.getChild();
      String depText = dep.getText();
      String depPos = dep.getPos();
      Integer depOffset = offset.get(dep.getIndex());
      String type = arc.getDependency();
      SyntacticRelation r =
          new SyntacticRelation(headOffset, headText, headPos, type, depOffset, depText, depPos);
      relations.add(r);
      // System.out.println ("parseSentence:  adding relation " + r);
    }
  }