예제 #1
0
파일: APFtoXML.java 프로젝트: rgrishman/jet
 /** splits document 'doc' into sentences, adding 'sentence' annotations */
 static void addSentences(Document doc) {
   SpecialZoner.findSpecialZones(doc);
   Vector<Annotation> textSegments = doc.annotationsOfType("TEXT");
   if (textSegments == null) {
     System.out.println("No <TEXT> in document");
     return;
   }
   for (Annotation ann : textSegments) {
     Span textSpan = ann.span();
     // check document case
     Ace.monocase = Ace.allLowerCase(doc);
     // split into sentences
     SentenceSplitter.split(doc, textSpan);
   }
   Vector<Annotation> sentences = doc.annotationsOfType("sentence");
   if (sentences != null) {
     int sentNo = 0;
     for (Annotation sentence : sentences) {
       sentNo++;
       sentence.put("ID", "SENT-" + sentNo);
     }
   }
   doc.removeAnnotationsOfType("dateline");
   doc.removeAnnotationsOfType("textBreak");
   doc.shrink("sentence");
 }
예제 #2
0
 static void writeDoc1(Document doc, PrintStream out) throws IOException {
   Vector<Annotation> entities = doc.annotationsOfType("entity");
   if (entities == null) {
     System.err.println("No Entity: " + doc);
     return;
   }
   Iterator<Annotation> entityIt = entities.iterator();
   int i = 0;
   while (entityIt.hasNext()) {
     Annotation entity = entityIt.next();
     Vector mentions = (Vector) entity.get("mentions");
     Iterator mentionIt = mentions.iterator();
     String nameType = (String) entity.get("nameType");
     while (mentionIt.hasNext()) {
       Annotation mention1 = (Annotation) mentionIt.next();
       Annotation mention2 = new Annotation("refobj", mention1.span(), new FeatureSet());
       mention2.put("objid", Integer.toString(i));
       if (nameType != null) {
         mention2.put("netype", nameType);
       }
       doc.addAnnotation(mention2);
     }
     i++;
   }
   // remove other annotations.
   String[] annotypes = doc.getAnnotationTypes();
   for (i = 0; i < annotypes.length; i++) {
     String t = annotypes[i];
     if (!(t.equals("tagger") || t.equals("refobj") || t.equals("ENAMEX"))) {
       doc.removeAnnotationsOfType(t);
     }
   }
   writeDocRaw(doc, out);
   return;
 }
예제 #3
0
 /**
  * performs the action, adding the specified Annotation. Returns the position of the end of the
  * Annotation.
  */
 @Override
 public int perform(Document doc, PatternApplication patap) {
   Span span;
   HashMap bindings = patap.bestBindings;
   // System.out.println ("bindings (for new annotation): " + bindings);
   if (spanVariable == null) {
     span = new Span(patap.startPosition, patap.bestPosition);
   } else if (spanVariable.name.toString() == "0") {
     span = new Span(patap.startPosition, patap.startPosition);
   } else {
     Object value = bindings.get(spanVariable.name);
     if (value instanceof Span) {
       span = (Span) value;
     } else if (value instanceof Annotation) {
       span = ((Annotation) value).span();
     } else {
       System.out.println("Value of " + spanVariable.toString() + " is not a span.or annotation");
       return -1;
     }
   }
   if (Pat.trace)
     Console.println(
         "Annotating "
             + doc.text(span)
             + " as "
             + type
             + " "
             + features.substitute(bindings).toSGMLString());
   hideAnnotations(doc, type, span);
   hideAnnotations(doc, "token", span);
   Annotation newAnnotation = new Annotation(type, span, features.substitute(bindings));
   doc.addAnnotation(newAnnotation);
   if (bindingVariable != null) bindings.put(bindingVariable.name, newAnnotation);
   return span.end();
 }
예제 #4
0
  static void processFile(String fname, PrintStream out) throws IOException {
    System.err.println("Processing: " + fname);

    FileInputStream fio = new FileInputStream(new File(fname));
    InputStreamReader fread = new InputStreamReader(fio, JetTest.encoding);
    BufferedReader fp = new BufferedReader(fread);
    StringBuffer buf = new StringBuffer();

    int docno = 0, allsents = 0, processedsents = 0;
    while (true) {
      String line = fp.readLine();

      // EOF or an empty line: the end of a Document.
      if (line == null || line.equals("")) {
        if (0 < buf.length()) {
          SGMLProcessor.allTags = true;
          Document doc = SGMLProcessor.sgmlToDoc(buf.toString(), (String[]) null);
          doc.setSGMLwrapMargin(0);
          System.err.println(
              "Doc-" + docno + ": sents=" + allsents + ", processed=" + processedsents);
          processDoc1(doc, docno);
          writeDoc1(doc, out);
          out.flush();
          buf = new StringBuffer();
          docno++;
          allsents = 0;
          processedsents = 0;
        }
        if (line == null) {
          break;
        } else {
          continue;
        }
      }

      if (line.startsWith("#")) {
        // "#" indicates a comment line.
        buf.append(line + "\n");
      } else {
        allsents++;
        if (processedsents < MaxProcessSentences) {
          buf.append("<sentence>");
          String[] words = line.split(" ");
          for (int i = 0; i < words.length; i++) {
            if (0 != words[i].length()) {
              buf.append("<token>" + words[i] + " </token>");
            }
          }
          buf.append("</sentence>\n");
          processedsents++;
        }
      }
    }

    fp.close();
    fread.close();
    fio.close();
    return;
  }
예제 #5
0
파일: APFtoXML.java 프로젝트: rgrishman/jet
 /** generate mention annotations (with entity numbers) based on the ACE entities and mentions. */
 static void addMentionTags(Document doc, AceDocument aceDoc) {
   ArrayList<AceEntity> entities = aceDoc.entities;
   for (int i = 0; i < entities.size(); i++) {
     AceEntity entity = entities.get(i);
     ArrayList<AceEntityMention> mentions = entity.mentions;
     for (int j = 0; j < mentions.size(); j++) {
       AceEntityMention mention = (AceEntityMention) mentions.get(j);
       // we compute a jetSpan not including trailing whitespace
       Span aceSpan = mention.head;
       // skip mentions in ChEnglish APF not aligned to any English text
       if (aceSpan.start() < 0) continue;
       Span jetSpan = new Span(aceSpan.start(), aceSpan.end() + 1);
       FeatureSet features = new FeatureSet("entity", new Integer(i));
       if (flags.contains("types")) {
         features.put("type", entity.type.substring(0, 3));
         if (entity.subtype != null) features.put("subtype", entity.subtype);
       }
       if (flags.contains("extents")) {
         String cleanExtent = mention.text.replaceAll("\n", " ");
         features.put("extent", AceEntityMention.addXmlEscapes(cleanExtent));
       }
       doc.annotate("mention", jetSpan, features);
     }
   }
 }
예제 #6
0
 static void processDoc1(Document doc, int docno) throws IOException {
   // process document
   // System.err.println ("Parsing: "+docno+"/"+doc);
   String script = JetTest.config.getProperty("processDocument");
   // if there is a name tagger, clear its cache
   if (JetTest.nameTagger != null) JetTest.nameTagger.newDocument();
   Span all = new Span(0, doc.length());
   Control.applyScript(doc, all, script);
 }
예제 #7
0
 /**
  * hides (adds the 'hidden' feature) to all annotations of type <I>type</I> beginning at the
  * starting position of span <I>span</I>.
  */
 public static void hideAnnotations(Document doc, String type, Span span) {
   for (int posn = span.start(); posn < span.end(); posn++) {
     Vector annotations = doc.annotationsAt(posn, type);
     if (annotations != null) {
       for (int i = 0; i < annotations.size(); i++) {
         Annotation ann = (Annotation) annotations.elementAt(i);
         ann.put("hidden", "true");
         // Console.println ("Hiding " + ann);
       }
     }
   }
 }
예제 #8
0
파일: APFtoXML.java 프로젝트: rgrishman/jet
 /** write 'fileText' out as file 'XMLfileName' with ENAMEX tags for the names in the document */
 static void addENAMEXtags(Document doc, AceDocument aceDoc) {
   ArrayList<AceEntity> entities = aceDoc.entities;
   for (int i = 0; i < entities.size(); i++) {
     AceEntity entity = entities.get(i);
     ArrayList<AceEntityName> names = entity.names;
     for (int j = 0; j < names.size(); j++) {
       AceEntityName name = names.get(j);
       Span aceSpan = name.extent;
       Span jetSpan = new Span(aceSpan.start(), aceSpan.end() + 1);
       doc.annotate("ENAMEX", jetSpan, new FeatureSet("TYPE", entity.type));
     }
     // for 2004 we have to examine PRE mentions and decide which are names
     if (year.equals("2004")) {
       ArrayList<AceEntityMention> mentions = entity.mentions;
       for (int j = 0; j < mentions.size(); j++) {
         AceEntityMention mention = mentions.get(j);
         String htext = Resolve.normalizeName(mention.headText);
         String[] mentionName = Gazetteer.splitAtWS(htext);
         String preClass = preDict.get(htext.toLowerCase());
         if (mention.type.equals("PRE")) {
           if (gazetteer.isNationality(mentionName)
               || gazetteer.isLocation(mentionName)
               || "N".equals(preClass)) {
             Span aceSpan = mention.head;
             Span jetSpan = new Span(aceSpan.start(), aceSpan.end() + 1);
             doc.annotate("ENAMEX", jetSpan, new FeatureSet("TYPE", entity.type));
           } else if (preClass != null) {
             // do nothing
           } else {
             System.out.println(
                 "Unclassified PRE: " + mention.text + " {" + mention.headText + ")");
             unknownPre.add(htext.toLowerCase());
           }
         }
       }
     }
   }
 }
예제 #9
0
파일: APFtoXML.java 프로젝트: rgrishman/jet
 static void addTimexTags(Document doc, AceDocument aceDoc) {
   List<AceTimex> timeExpressions = aceDoc.timeExpressions;
   for (AceTimex timex : timeExpressions) {
     AceTimexMention mention = (AceTimexMention) timex.mentions.get(0);
     Span aceSpan = mention.extent;
     Span jetSpan = new Span(aceSpan.start(), aceSpan.end() + 1);
     FeatureSet features = new FeatureSet();
     if (timex.val != null && !timex.val.equals("")) features.put("val", timex.val);
     if (timex.anchorVal != null && !timex.anchorVal.equals(""))
       features.put("anchor_val", timex.anchorVal);
     if (timex.anchorDir != null && !timex.anchorDir.equals(""))
       features.put("anchor_dir", timex.anchorDir);
     if (timex.set != null && !timex.set.equals("")) features.put("set", timex.set);
     if (timex.mod != null && !timex.mod.equals("")) features.put("mod", timex.mod);
     doc.annotate("timex2", jetSpan, features);
   }
 }
예제 #10
0
 /**
  * parse all the sentences in Document 'doc', returning a SyntacticRelationSet containing all the
  * dependency relations.
  */
 public static SyntacticRelationSet parseDocument(Document doc) {
   Vector<Annotation> sentences = doc.annotationsOfType("sentence");
   if (sentences == null || sentences.size() == 0) {
     System.out.println("DepParser:  no sentences");
     return null;
   }
   if (fsw == null) {
     System.out.println("DepParser:  no model loaded");
     return null;
   }
   SyntacticRelationSet relations = new SyntacticRelationSet();
   for (Annotation sentence : sentences) {
     Span span = sentence.span();
     parseSentence(doc, span, relations);
   }
   return relations;
 }
예제 #11
0
 /** generate mention annotations (with entity numbers) based on the ACE entities and mentions. */
 static void addMentionTags(Document doc, AceDocument aceDoc) {
   ArrayList<AceEntity> entities = aceDoc.entities;
   for (int i = 0; i < entities.size(); i++) {
     AceEntity entity = (AceEntity) entities.get(i);
     ArrayList<AceEntityMention> mentions = entity.mentions;
     for (int j = 0; j < mentions.size(); j++) {
       AceEntityMention mention = mentions.get(j);
       // we compute a jetSpan not including trailing whitespace
       Span aceSpan = mention.head;
       Span jetSpan = new Span(aceSpan.start(), aceSpan.end() + 1);
       FeatureSet features = new FeatureSet("entity", new Integer(i));
       if (showTypes) {
         features.put("type", entity.type.substring(0, 3));
         if (entity.subtype != null) features.put("subtype", entity.subtype);
       }
       doc.annotate("mention", jetSpan, features);
     }
   }
 }
예제 #12
0
 static void writeDocRaw(Document doc, PrintStream out) throws IOException {
   out.println(doc.writeSGML(null).toString());
   out.flush();
   return;
 }
예제 #13
0
파일: APFtoXML.java 프로젝트: rgrishman/jet
 public static String processDocument(Document doc, AceDocument aceDoc) {
   addAnnotations(doc, aceDoc);
   return doc.writeSGML(null).toString();
 }
예제 #14
0
  /** generate the dependency parse for a sentence, adding its arcs to 'relations'. */
  public static void parseSentence(Document doc, Span span, SyntacticRelationSet relations) {
    if (fsw == null) {
      System.out.println("DepParser:  no model loaded");
      return;
    }
    // System.out.println ("parseSentence:  " + doc.text(span));
    // run Penn part-of-speech tagger
    // JetTest.tagger.annotate(doc, span, "tagger");
    // build sentence
    List<Token> tokens = new ArrayList<Token>();
    List<Integer> offset = new ArrayList<Integer>();
    offset.add(0); // don't use 0th entry
    int tokenNum = 0;
    int posn = span.start();
    while (posn < span.end()) {
      tokenNum++;
      Annotation tokenAnnotation = doc.tokenAt(posn);
      for (String s : SPECIAL_TOKEN) {
        Vector<Annotation> va = doc.annotationsAt(posn, s);
        if (va != null && va.size() > 0) {
          tokenAnnotation = va.get(0);
          break;
        }
      }
      if (tokenAnnotation == null) return;
      String tokenText = doc.normalizedText(tokenAnnotation).replaceAll(" ", "_");
      Vector v = doc.annotationsAt(posn, "tagger");
      Annotation a = (Annotation) v.get(0);
      String pos = (String) a.get("cat");
      tokens.add(new Token(tokenText, pos, tokenNum));
      offset.add(posn);
      if (posn >= tokenAnnotation.end()) {
        break;
      }
      posn = tokenAnnotation.end();
    }
    Sentence sent = new Sentence(tokens);
    // parse sentence
    Arc[] arcs =
        fsw.process(
                sent,
                tokens.size() > 0 && tokens.get(0).getPos() == null,
                true,
                true,
                true,
                true,
                true)
            .getParse()
            .getHeadArcs();

    // get dependencies
    for (Arc arc : arcs) {
      if (arc == null) continue;
      if (arc.getDependency().equalsIgnoreCase("ROOT")) continue;
      Token head = arc.getHead();
      String headText = head.getText();
      String headPos = head.getPos();
      Integer headOffset = offset.get(head.getIndex());
      Token dep = arc.getChild();
      String depText = dep.getText();
      String depPos = dep.getPos();
      Integer depOffset = offset.get(dep.getIndex());
      String type = arc.getDependency();
      SyntacticRelation r =
          new SyntacticRelation(headOffset, headText, headPos, type, depOffset, depText, depPos);
      relations.add(r);
      // System.out.println ("parseSentence:  adding relation " + r);
    }
  }