コード例 #1
0
  /**
   * Get the text value of this entity. The headTokenSpan MUST be set before calling this method!
   */
  public String getValue() {
    List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
    // int lastEnd = -1;
    StringBuilder sb = new StringBuilder();
    for (int i = headTokenSpan.start(); i < headTokenSpan.end(); i++) {
      CoreLabel token = tokens.get(i);

      // we are not guaranteed to have CharacterOffsets so we can't use them...
      /*
      Integer start = token.get(CharacterOffsetBeginAnnotation.class);
      Integer end = token.get(CharacterOffsetEndAnnotation.class);

      if (start != null && end != null) {
        if (lastEnd != -1 && !start.equals(lastEnd)) {
          sb.append(StringUtils.repeat(" ", start - lastEnd));
          lastEnd = end;
        }
      } else {
        if (lastEnd != -1) sb.append(" ");
        lastEnd = 0;
      }
        */
      if (i > headTokenSpan.start()) sb.append(" ");

      sb.append(token.word());
    }

    return sb.toString();
  }
コード例 #2
0
ファイル: APFtoXML.java プロジェクト: rgrishman/jet
 /** generate mention annotations (with entity numbers) based on the ACE entities and mentions. */
 static void addMentionTags(Document doc, AceDocument aceDoc) {
   ArrayList<AceEntity> entities = aceDoc.entities;
   for (int i = 0; i < entities.size(); i++) {
     AceEntity entity = entities.get(i);
     ArrayList<AceEntityMention> mentions = entity.mentions;
     for (int j = 0; j < mentions.size(); j++) {
       AceEntityMention mention = (AceEntityMention) mentions.get(j);
       // we compute a jetSpan not including trailing whitespace
       Span aceSpan = mention.head;
       // skip mentions in ChEnglish APF not aligned to any English text
       if (aceSpan.start() < 0) continue;
       Span jetSpan = new Span(aceSpan.start(), aceSpan.end() + 1);
       FeatureSet features = new FeatureSet("entity", new Integer(i));
       if (flags.contains("types")) {
         features.put("type", entity.type.substring(0, 3));
         if (entity.subtype != null) features.put("subtype", entity.subtype);
       }
       if (flags.contains("extents")) {
         String cleanExtent = mention.text.replaceAll("\n", " ");
         features.put("extent", AceEntityMention.addXmlEscapes(cleanExtent));
       }
       doc.annotate("mention", jetSpan, features);
     }
   }
 }
コード例 #3
0
 public String getExtentString() {
   List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
   StringBuilder sb = new StringBuilder();
   for (int i = extentTokenSpan.start(); i < extentTokenSpan.end(); i++) {
     CoreLabel token = tokens.get(i);
     if (i > extentTokenSpan.start()) sb.append(" ");
     sb.append(token.word());
   }
   return sb.toString();
 }
コード例 #4
0
 /**
  * Always returns the text corresponding to the extent of this object, even when getValue is
  * overridden by subclass.
  */
 public final String getFullValue() {
   List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
   StringBuilder sb = new StringBuilder();
   if (tokens != null && extentTokenSpan != null) {
     for (int i = extentTokenSpan.start(); i < extentTokenSpan.end(); i++) {
       if (i > extentTokenSpan.start()) sb.append(" ");
       sb.append(tokens.get(i).word());
     }
   }
   return sb.toString();
 }
コード例 #5
0
 /**
  * Creates annotations for each node in parse tree <CODE>node</NODE>.
  * These annotations are added to the parse tree and to the document
  * <CODE>doc</CODE>.  In constrast to <CODE>setAnnotations</CODE>,
  * the categories used for terminal nodes are Jet categories obtained by
  * Jet tokenization and lexical look-up.  This means that hyphenated
  * items are split, and multi-word names are reduced to a single node.
  *
  * @param node      the root of the parse tree
  * @param treeSpan  the span of the document matching the parse tree
  * @param doc       the document to which annotations will be added
  */
 private void setJetAnnotations(ParseTreeNode node, Span treeSpan, Document doc) {
   StatParser.buildParserInput(doc, treeSpan.start(), treeSpan.end(), false);
   StatParser.fixHyphenatedItems(doc);
   int nameConstitEnd = -1;
   List<ParseTreeNode> terminals = getTerminalNodes(node);
   for (ParseTreeNode terminal : terminals) {
     int terminalEnd = terminal.end;
     // is there a 'name' constituent or 'hyphword' constituent here?
     Vector<Annotation> constits = doc.annotationsAt(terminal.start, "constit");
     Annotation constit = null;
     Annotation nameConstit = null;
     Annotation hyphword = null;
     if (constits != null) {
       for (Annotation c : constits) {
         if (c.get("cat") == "name") {
           nameConstit = c;
         } else if (c.get("cat") == "hyphword") {
           hyphword = c;
         }
         if (constit == null) constit = c;
       }
     }
     if (hyphword != null) {
       nameConstit = null;
       constit = hyphword;
     }
     // if there is a name which is not part of a hyphword, associate the
     // name with this (first) terminal node, and mark any remaining terminal
     // nodes which match tokens in the name as empty
     if (nameConstit != null) {
       terminal.end = nameConstit.end();
       terminal.ann = nameConstit;
       nameConstitEnd = nameConstit.end();
     } else if (nameConstitEnd >= 0) {
       terminal.word = null;
     } else {
       Span span = new Span(terminal.start, terminal.end);
       String pennPOS = ((String) terminal.category).toUpperCase().intern();
       String word = terminal.word;
       terminal.ann = StatParser.buildWordDefn(doc, word, span, constit, pennPOS);
     }
     if (nameConstitEnd == terminalEnd) nameConstitEnd = -1;
   }
   // prune parse tree:  remove a node if it has no word or children
   pruneTree(node);
   determineNonTerminalSpans(node, treeSpan.start());
   // add head links
   if (hr == null) hr = HeadRule.createDefaultRule();
   hr.apply(node);
   // add annotations for non-terminals:
   Jet.Parser.ParseTreeNode.makeParseAnnotations(doc, node);
 }
コード例 #6
0
  /**
   * Adds <B>constit</B> annotations to an existing Document <CODE>doc</CODE> to represent the parse
   * tree structure <CODE>tree</CODE>.
   *
   * @param tree the parse tree (for a portion of Document doc)
   * @param doc the document
   * @param span the portion of doc covered by the parse tree
   * @param jetCategories if true, use Jet categories as terminal categories (if false, use
   *     categories read from parse trees)
   */
  public void addAnnotations(ParseTreeNode tree, Document doc, Span span, boolean jetCategories) {
    List<ParseTreeNode> terminalNodes = getTerminalNodes(tree);
    String text = doc.text();
    int offset = span.start();

    for (ParseTreeNode terminal : terminalNodes) {
      while (offset < span.end() && Character.isWhitespace(text.charAt(offset))) {
        offset++;
      }
      for (String skipString : skip) {
        if (text.startsWith(skipString, offset)) {
          offset += skipString.length();
          while (offset < span.end() && Character.isWhitespace(text.charAt(offset))) {
            offset++;
          }
          break;
        }
      }
      // match next terminal node against next word in text
      int matchLength = matchTextToTree(text, offset, terminal.word);
      if (matchLength > 0) {
        int endOffset = offset + matchLength;
        while (endOffset < span.end() && Character.isWhitespace(text.charAt(endOffset))) {
          endOffset++;
        }
        terminal.start = offset;
        terminal.end = endOffset;
        offset = endOffset;
      } else {
        System.err.println(
            "PTBReader.addAnnotations:  "
                + "Cannot determine parse tree offset for word "
                + terminal.word);
        System.err.println("  at document offset " + offset + " in sentence");
        System.err.println("  " + doc.text(span));
        return;
      }
    }

    if (jetCategories) {
      setJetAnnotations(tree, span, doc);
      StatParser.deleteUnusedConstits(doc, span, tree.ann); // <<<
    } else {
      determineNonTerminalSpans(tree, span.start());
      setAnnotations(tree, doc);
    }
  }
コード例 #7
0
 /**
  * hides (adds the 'hidden' feature) to all annotations of type <I>type</I> beginning at the
  * starting position of span <I>span</I>.
  */
 public static void hideAnnotations(Document doc, String type, Span span) {
   for (int posn = span.start(); posn < span.end(); posn++) {
     Vector annotations = doc.annotationsAt(posn, type);
     if (annotations != null) {
       for (int i = 0; i < annotations.size(); i++) {
         Annotation ann = (Annotation) annotations.elementAt(i);
         ann.put("hidden", "true");
         // Console.println ("Hiding " + ann);
       }
     }
   }
 }
コード例 #8
0
ファイル: APFtoXML.java プロジェクト: rgrishman/jet
 /** write 'fileText' out as file 'XMLfileName' with ENAMEX tags for the names in the document */
 static void addENAMEXtags(Document doc, AceDocument aceDoc) {
   ArrayList<AceEntity> entities = aceDoc.entities;
   for (int i = 0; i < entities.size(); i++) {
     AceEntity entity = entities.get(i);
     ArrayList<AceEntityName> names = entity.names;
     for (int j = 0; j < names.size(); j++) {
       AceEntityName name = names.get(j);
       Span aceSpan = name.extent;
       Span jetSpan = new Span(aceSpan.start(), aceSpan.end() + 1);
       doc.annotate("ENAMEX", jetSpan, new FeatureSet("TYPE", entity.type));
     }
     // for 2004 we have to examine PRE mentions and decide which are names
     if (year.equals("2004")) {
       ArrayList<AceEntityMention> mentions = entity.mentions;
       for (int j = 0; j < mentions.size(); j++) {
         AceEntityMention mention = mentions.get(j);
         String htext = Resolve.normalizeName(mention.headText);
         String[] mentionName = Gazetteer.splitAtWS(htext);
         String preClass = preDict.get(htext.toLowerCase());
         if (mention.type.equals("PRE")) {
           if (gazetteer.isNationality(mentionName)
               || gazetteer.isLocation(mentionName)
               || "N".equals(preClass)) {
             Span aceSpan = mention.head;
             Span jetSpan = new Span(aceSpan.start(), aceSpan.end() + 1);
             doc.annotate("ENAMEX", jetSpan, new FeatureSet("TYPE", entity.type));
           } else if (preClass != null) {
             // do nothing
           } else {
             System.out.println(
                 "Unclassified PRE: " + mention.text + " {" + mention.headText + ")");
             unknownPre.add(htext.toLowerCase());
           }
         }
       }
     }
   }
 }
コード例 #9
0
ファイル: APFtoXML.java プロジェクト: rgrishman/jet
 static void addTimexTags(Document doc, AceDocument aceDoc) {
   List<AceTimex> timeExpressions = aceDoc.timeExpressions;
   for (AceTimex timex : timeExpressions) {
     AceTimexMention mention = (AceTimexMention) timex.mentions.get(0);
     Span aceSpan = mention.extent;
     Span jetSpan = new Span(aceSpan.start(), aceSpan.end() + 1);
     FeatureSet features = new FeatureSet();
     if (timex.val != null && !timex.val.equals("")) features.put("val", timex.val);
     if (timex.anchorVal != null && !timex.anchorVal.equals(""))
       features.put("anchor_val", timex.anchorVal);
     if (timex.anchorDir != null && !timex.anchorDir.equals(""))
       features.put("anchor_dir", timex.anchorDir);
     if (timex.set != null && !timex.set.equals("")) features.put("set", timex.set);
     if (timex.mod != null && !timex.mod.equals("")) features.put("mod", timex.mod);
     doc.annotate("timex2", jetSpan, features);
   }
 }
コード例 #10
0
ファイル: APFtoCorefXML.java プロジェクト: rgrishman/jet
 /** generate mention annotations (with entity numbers) based on the ACE entities and mentions. */
 static void addMentionTags(Document doc, AceDocument aceDoc) {
   ArrayList<AceEntity> entities = aceDoc.entities;
   for (int i = 0; i < entities.size(); i++) {
     AceEntity entity = (AceEntity) entities.get(i);
     ArrayList<AceEntityMention> mentions = entity.mentions;
     for (int j = 0; j < mentions.size(); j++) {
       AceEntityMention mention = mentions.get(j);
       // we compute a jetSpan not including trailing whitespace
       Span aceSpan = mention.head;
       Span jetSpan = new Span(aceSpan.start(), aceSpan.end() + 1);
       FeatureSet features = new FeatureSet("entity", new Integer(i));
       if (showTypes) {
         features.put("type", entity.type.substring(0, 3));
         if (entity.subtype != null) features.put("subtype", entity.subtype);
       }
       doc.annotate("mention", jetSpan, features);
     }
   }
 }
コード例 #11
0
 @Override
 public String toString() {
   return "EntityMention [type="
       + type
       + (subType != null ? ", subType=" + subType : "")
       + (mentionType != null ? ", mentionType=" + mentionType : "")
       + (objectId != null ? ", objectId=" + objectId : "")
       + (headTokenSpan != null
           ? ", hstart=" + headTokenSpan.start() + ", hend=" + headTokenSpan.end()
           : "")
       + (extentTokenSpan != null
           ? ", estart=" + extentTokenSpan.start() + ", eend=" + extentTokenSpan.end()
           : "")
       + (syntacticHeadTokenPosition >= 0 ? ", headPosition=" + syntacticHeadTokenPosition : "")
       + (headTokenSpan != null ? ", value=\"" + getValue() + "\"" : "")
       + (normalizedName != null ? ", normalizedName=\"" + normalizedName + "\"" : "")
       + ", corefID="
       + corefID
       + (typeProbabilities != null ? ", probs=" + probsToString() : "")
       + "]";
 }
コード例 #12
0
 public int getExtentTokenStart() {
   return extentTokenSpan.start();
 }
コード例 #13
0
 public int getHeadTokenStart() {
   return headTokenSpan.start();
 }
コード例 #14
0
ファイル: DepParser.java プロジェクト: rgrishman/jet
  /** generate the dependency parse for a sentence, adding its arcs to 'relations'. */
  public static void parseSentence(Document doc, Span span, SyntacticRelationSet relations) {
    if (fsw == null) {
      System.out.println("DepParser:  no model loaded");
      return;
    }
    // System.out.println ("parseSentence:  " + doc.text(span));
    // run Penn part-of-speech tagger
    // JetTest.tagger.annotate(doc, span, "tagger");
    // build sentence
    List<Token> tokens = new ArrayList<Token>();
    List<Integer> offset = new ArrayList<Integer>();
    offset.add(0); // don't use 0th entry
    int tokenNum = 0;
    int posn = span.start();
    while (posn < span.end()) {
      tokenNum++;
      Annotation tokenAnnotation = doc.tokenAt(posn);
      for (String s : SPECIAL_TOKEN) {
        Vector<Annotation> va = doc.annotationsAt(posn, s);
        if (va != null && va.size() > 0) {
          tokenAnnotation = va.get(0);
          break;
        }
      }
      if (tokenAnnotation == null) return;
      String tokenText = doc.normalizedText(tokenAnnotation).replaceAll(" ", "_");
      Vector v = doc.annotationsAt(posn, "tagger");
      Annotation a = (Annotation) v.get(0);
      String pos = (String) a.get("cat");
      tokens.add(new Token(tokenText, pos, tokenNum));
      offset.add(posn);
      if (posn >= tokenAnnotation.end()) {
        break;
      }
      posn = tokenAnnotation.end();
    }
    Sentence sent = new Sentence(tokens);
    // parse sentence
    Arc[] arcs =
        fsw.process(
                sent,
                tokens.size() > 0 && tokens.get(0).getPos() == null,
                true,
                true,
                true,
                true,
                true)
            .getParse()
            .getHeadArcs();

    // get dependencies
    for (Arc arc : arcs) {
      if (arc == null) continue;
      if (arc.getDependency().equalsIgnoreCase("ROOT")) continue;
      Token head = arc.getHead();
      String headText = head.getText();
      String headPos = head.getPos();
      Integer headOffset = offset.get(head.getIndex());
      Token dep = arc.getChild();
      String depText = dep.getText();
      String depPos = dep.getPos();
      Integer depOffset = offset.get(dep.getIndex());
      String type = arc.getDependency();
      SyntacticRelation r =
          new SyntacticRelation(headOffset, headText, headPos, type, depOffset, depText, depPos);
      relations.add(r);
      // System.out.println ("parseSentence:  adding relation " + r);
    }
  }