Example #1
  public static void fillInParseAnnotations(
      boolean verbose, boolean buildGraphs, CoreMap sentence, Tree tree) {
    // make sure all tree nodes are CoreLabels
    // TODO: why isn't this always true? something fishy is going on
    ParserAnnotatorUtils.convertToCoreLabels(tree);

    // index nodes, i.e., add start and end token positions to all nodes
    // this is needed by other annotators downstream, e.g., the NFLAnnotator
    tree.indexSpans(0);

    sentence.set(TreeAnnotation.class, tree);
    if (verbose) {
      System.err.println("Tree is:");
      tree.pennPrint(System.err);
    }

    if (buildGraphs) {
      // generate the dependency graph
      SemanticGraph deps = generateCollapsedDependencies(tree);
      SemanticGraph uncollapsedDeps = generateUncollapsedDependencies(tree);
      SemanticGraph ccDeps = generateCCProcessedDependencies(tree);
      if (verbose) {
        System.err.println("SDs:");
        System.err.println(deps.toString("plain"));
      }
      sentence.set(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class, deps);
      sentence.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, uncollapsedDeps);
      sentence.set(
          SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class, ccDeps);
    }

    setMissingTags(sentence, tree);
  }
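A minimal driver sketch (not from the source), assuming this method lives alongside the generate*Dependencies helpers it calls and that the sentence has already been tokenized; the parser model path is the stock English PCFG:

  public static void parseAndFill(CoreMap sentence) {
    // load the standard English PCFG model (the path is an assumption)
    LexicalizedParser parser =
        LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
    Tree tree = parser.parse(tokens);
    // verbose=false, buildGraphs=true: store the tree plus all three dependency graphs
    fillInParseAnnotations(false, true, sentence, tree);
  }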
Example #2
  public static void addFigerAnnotationToDocument(Annotation d) throws SQLException {

    List<CoreMap> sentences = d.get(CoreAnnotations.SentencesAnnotation.class);
    Set<String> entityIds = new HashSet<String>();
    for (CoreMap sen : sentences) {
      List<Triple<Pair<Integer, Integer>, String, Float>> nelAnnotation =
          sen.get(NamedEntityLinkingAnnotation.class);
      for (Triple<Pair<Integer, Integer>, String, Float> t : nelAnnotation) {
        String id = t.second;
        if (!id.equals("null")) {
          entityIds.add(id);
        }
      }
    }
    Map<String, Set<String>> idTypeMap = bigQuery(entityIds);
    // add type onto sentences
    for (CoreMap sen : sentences) {
      List<Triple<Pair<Integer, Integer>, String, Float>> nelAnnotation =
          sen.get(NamedEntityLinkingAnnotation.class);
      List<Triple<Set<String>, Integer, Integer>> figerData = new ArrayList<>();
      for (Triple<Pair<Integer, Integer>, String, Float> t : nelAnnotation) {
        Integer start = t.first.first;
        Integer end = t.first.second;
        Set<String> types = null;
        if (!t.second.equals("null")) {
          types = idTypeMap.get(GuidMidConversion.convertBackward(t.second));
        }
        Triple<Set<String>, Integer, Integer> figerTrip = new Triple<>(types, start, end);
        figerData.add(figerTrip);
      }
      sen.set(FigerAnnotation.class, figerData);
    }
  }
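A hypothetical read-back of the result: after the method above runs, each sentence carries a FigerAnnotation whose triples pair a (possibly null) FIGER type set with token start/end offsets:

  public static void printFigerTypes(Annotation d) {
    for (CoreMap sen : d.get(CoreAnnotations.SentencesAnnotation.class)) {
      for (Triple<Set<String>, Integer, Integer> t : sen.get(FigerAnnotation.class)) {
        // t.first is null when the entity id was "null" or unknown to bigQuery
        System.out.printf("tokens [%d, %d): %s%n", t.second, t.third, t.first);
      }
    }
  }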
Example #3
 private static void addLemma(
     Morphology morpha,
     Class<? extends CoreAnnotation<String>> ann,
     CoreMap map,
     String word,
     String tag) {
   if (tag.length() > 0) {
     String phrasalVerb = phrasalVerb(morpha, word, tag);
     if (phrasalVerb == null) {
       map.set(ann, morpha.lemma(word, tag));
     } else {
       map.set(ann, phrasalVerb);
     }
   } else {
     map.set(ann, morpha.stem(word));
   }
 }
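A small usage sketch, assuming it is invoked from within the same class; Morphology and LemmaAnnotation are stock CoreNLP:

 private static void demoAddLemma() {
   Morphology morpha = new Morphology();
   CoreLabel token = new CoreLabel();
   token.setWord("running");
   // tagged as a verb: goes through morpha.lemma (or phrasalVerb) and yields "run"
   addLemma(morpha, CoreAnnotations.LemmaAnnotation.class, token, "running", "VBG");
   // empty tag: falls back to morpha.stem
   addLemma(morpha, CoreAnnotations.LemmaAnnotation.class, token, "running", "");
 }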
Example #4
 private List<CoreMap> toCoreMaps(
     CoreMap annotation, List<TimeExpression> timeExpressions, SUTime.TimeIndex timeIndex) {
   if (timeExpressions == null) return null;
   List<CoreMap> coreMaps = new ArrayList<CoreMap>(timeExpressions.size());
   for (TimeExpression te : timeExpressions) {
     CoreMap cm = te.getAnnotation();
     SUTime.Temporal temporal = te.getTemporal();
     if (temporal != null) {
       String origText = annotation.get(CoreAnnotations.TextAnnotation.class);
       String text = cm.get(CoreAnnotations.TextAnnotation.class);
       if (origText != null) {
         // Make sure the text is from original (and not from concatenated tokens)
         ChunkAnnotationUtils.annotateChunkText(cm, annotation);
         text = cm.get(CoreAnnotations.TextAnnotation.class);
       }
       Map<String, String> timexAttributes;
       try {
         timexAttributes = temporal.getTimexAttributes(timeIndex);
         if (options.includeRange) {
           SUTime.Temporal rangeTemporal = temporal.getRange();
           if (rangeTemporal != null) {
             timexAttributes.put("range", rangeTemporal.toString());
           }
         }
       } catch (Exception e) {
         logger.log(
             Level.WARNING,
             "Failed to get attributes from " + text + ", timeIndex " + timeIndex,
             e);
         continue;
       }
       Timex timex;
       try {
         timex = Timex.fromMap(text, timexAttributes);
       } catch (Exception e) {
         logger.log(
             Level.WARNING,
             "Failed to process " + text + " with attributes " + timexAttributes,
             e);
         continue;
       }
       // only attach the annotation when a Timex was actually produced
       if (timex != null) {
         cm.set(TimexAnnotation.class, timex);
         coreMaps.add(cm);
       } else {
         logger.warning("No timex expression for: " + text);
       }
     }
   }
   return coreMaps;
 }
Example #5
  public List<TimeExpression> extractTimeExpressions(CoreMap annotation, String docDateStr) {
    List<CoreMap> mergedNumbers = NumberNormalizer.findAndMergeNumbers(annotation);
    annotation.set(CoreAnnotations.NumerizedTokensAnnotation.class, mergedNumbers);

    // TODO: docDate may not have century....
    SUTime.Time docDate = timexPatterns.parseDateTime(docDateStr);

    List<? extends MatchedExpression> matchedExpressions =
        expressionExtractor.extractExpressions(annotation);
    List<TimeExpression> timeExpressions = new ArrayList<TimeExpression>(matchedExpressions.size());
    for (MatchedExpression expr : matchedExpressions) {
      if (expr instanceof TimeExpression) {
        timeExpressions.add((TimeExpression) expr);
      } else {
        timeExpressions.add(new TimeExpression(expr));
      }
    }

    // Add back nested time expressions for ranges....
    // For now only one level of nesting...
    if (options.includeNested) {
      List<TimeExpression> nestedTimeExpressions = new ArrayList<TimeExpression>();
      for (TimeExpression te : timeExpressions) {
        if (te.isIncludeNested()) {
          List<? extends CoreMap> children =
              te.getAnnotation().get(TimeExpression.ChildrenAnnotation.class);
          if (children != null) {
            for (CoreMap child : children) {
              TimeExpression childTe = child.get(TimeExpression.Annotation.class);
              if (childTe != null) {
                nestedTimeExpressions.add(childTe);
              }
            }
          }
        }
      }
      timeExpressions.addAll(nestedTimeExpressions);
    }
    Collections.sort(timeExpressions, MatchedExpression.EXPR_TOKEN_OFFSETS_NESTED_FIRST_COMPARATOR);
    timeExpressions = filterInvalidTimeExpressions(timeExpressions);

    // Some resolving is done even if docDate null...
    if (/* docDate != null && */ timeExpressions != null) {
      resolveTimeExpressions(annotation, timeExpressions, docDate);
    }
    // Annotate timex
    return timeExpressions;
  }
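A hedged example call, assuming an instance of the enclosing extractor and a tokenized sentence-level CoreMap; the document date string is illustrative:

  public void demoExtractTimes(CoreMap sentenceAnnotation) {
    // the doc date grounds relative expressions such as "next Tuesday"
    List<TimeExpression> times = extractTimeExpressions(sentenceAnnotation, "2013-07-14");
    for (TimeExpression te : times) {
      System.out.println(te.getText() + " -> " + te.getTemporal());
    }
  }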
Example #6
  /** Reads an annotation from the given filename using the requested input format. */
  public static List<Annotation> getAnnotations(
      StanfordCoreNLP tokenizer, Input inputFormat, String filename, boolean filterUnknown) {
    switch (inputFormat) {
      case TEXT:
        {
          String text = IOUtils.slurpFileNoExceptions(filename);
          Annotation annotation = new Annotation(text);
          tokenizer.annotate(annotation);
          List<Annotation> annotations = Generics.newArrayList();
          for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
            Annotation nextAnnotation =
                new Annotation(sentence.get(CoreAnnotations.TextAnnotation.class));
            nextAnnotation.set(
                CoreAnnotations.SentencesAnnotation.class, Collections.singletonList(sentence));
            annotations.add(nextAnnotation);
          }
          return annotations;
        }
      case TREES:
        {
          List<Tree> trees;
          if (filterUnknown) {
            trees = SentimentUtils.readTreesWithGoldLabels(filename);
            trees = SentimentUtils.filterUnknownRoots(trees);
          } else {
            trees = Generics.newArrayList();
            MemoryTreebank treebank = new MemoryTreebank("utf-8");
            treebank.loadPath(filename, null);
            for (Tree tree : treebank) {
              trees.add(tree);
            }
          }

          List<Annotation> annotations = Generics.newArrayList();
          for (Tree tree : trees) {
            CoreMap sentence = new Annotation(Sentence.listToString(tree.yield()));
            sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree);
            List<CoreMap> sentences = Collections.singletonList(sentence);
            Annotation annotation = new Annotation("");
            annotation.set(CoreAnnotations.SentencesAnnotation.class, sentences);
            annotations.add(annotation);
          }
          return annotations;
        }
      default:
        throw new IllegalArgumentException("Unknown format " + inputFormat);
    }
  }
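A hypothetical driver: for the TEXT case the tokenizer argument only needs to tokenize and sentence-split, which a minimal StanfordCoreNLP pipeline provides:

  public static void demoGetAnnotations(String filename) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit");
    StanfordCoreNLP tokenizer = new StanfordCoreNLP(props);
    // one Annotation per sentence of the input file; filterUnknown only matters for TREES
    List<Annotation> annotations = getAnnotations(tokenizer, Input.TEXT, filename, false);
    System.out.println(annotations.size() + " sentence annotations");
  }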
Example #7
  /**
   * Converts NamedEntityTagAnnotation tags into {@link EntityMention}s. This finds the longest
   * sequence of NamedEntityTagAnnotation tags of the matching type.
   *
   * @param sentence A sentence annotated with NamedEntityTagAnnotation
   */
  public void makeAnnotationFromAllNERTags(CoreMap sentence) {
    List<CoreLabel> words = sentence.get(CoreAnnotations.TokensAnnotation.class);
    List<EntityMention> mentions =
        sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
    assert words != null;
    if (mentions == null) {
      this.logger.info("mentions are null");
      mentions = new ArrayList<>();
    }

    for (int start = 0; start < words.size(); start++) {

      int end;
      // find the end of the current run of identical, non-background NE tags
      String lastneTag = null;
      String ne = null;
      for (end = start; end < words.size(); end++) {
        ne = words.get(end).get(NamedEntityTagAnnotation.class);
        if (ne.equals(SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL)
            || (lastneTag != null && !ne.equals(lastneTag))) {
          break;
        }
        lastneTag = ne;
      }

      if (end > start) {

        // found a match!
        String entityType = this.getEntityTypeForTag(lastneTag);
        EntityMention m =
            EntityMentionFactory.constructEntityMention(
                EntityMention.makeUniqueId(),
                sentence,
                new Span(start, end),
                new Span(start, end),
                entityType,
                null,
                null);
        // TODO: changed entityType in the above sentence to nerTag - Sonal
        logger.info("Created " + entityType + " entity mention: " + m);
        start = end - 1;
        mentions.add(m);
      }
    }

    sentence.set(MachineReadingAnnotations.EntityMentionsAnnotation.class, mentions);
  }
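A sketch of feeding this converter, assuming an instance of the enclosing class (the method is non-static) and a standard NER pipeline:

  public void demoMakeMentions(String text) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation doc = new Annotation(text);
    pipeline.annotate(doc);
    for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
      // each maximal run of identical, non-background NE tags becomes one EntityMention
      makeAnnotationFromAllNERTags(sentence);
    }
  }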
Example #8
  /**
   * Converts NamedEntityTagAnnotation tags into {@link EntityMention}s. This finds the longest
   * sequence of NamedEntityTagAnnotation tags of the matching type.
   *
   * @param sentence A sentence, ideally annotated with NamedEntityTagAnnotation
   * @param nerTag The name of the NER tag to copy, e.g. "DATE".
   * @param entityType The type of the {@link EntityMention} objects created
   */
  public static void makeAnnotationFromGivenNERTag(
      CoreMap sentence, String nerTag, String entityType) {
    List<CoreLabel> words = sentence.get(CoreAnnotations.TokensAnnotation.class);
    List<EntityMention> mentions =
        sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
    assert words != null;
    assert mentions != null;

    for (int start = 0; start < words.size(); start++) {
      int end;
      // find the first token after start that isn't tagged nerTag
      for (end = start; end < words.size(); end++) {
        String ne = words.get(end).get(NamedEntityTagAnnotation.class);
        // flipped equals() so a token with no NER tag at all doesn't NPE
        if (!nerTag.equals(ne)) {
          break;
        }
      }

      if (end > start) {

        // found a match!
        EntityMention m =
            EntityMentionFactory.constructEntityMention(
                EntityMention.makeUniqueId(),
                sentence,
                new Span(start, end),
                new Span(start, end),
                entityType,
                null,
                null);
        logger.info("Created " + entityType + " entity mention: " + m);
        start = end - 1;
        mentions.add(m);
      }
    }

    sentence.set(MachineReadingAnnotations.EntityMentionsAnnotation.class, mentions);
  }
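Unlike the previous variant, this one asserts that a mention list is already attached, so a hypothetical caller seeds one first:

  public static void demoDateMentions(CoreMap sentence) {
    // the method asserts a non-null mention list, so attach an empty one up front
    sentence.set(
        MachineReadingAnnotations.EntityMentionsAnnotation.class,
        new ArrayList<EntityMention>());
    // copy every run of "DATE" NER tags into DATE-typed EntityMentions
    makeAnnotationFromGivenNERTag(sentence, "DATE", "DATE");
  }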
Example #9
  @Override
  public Document nextDoc() throws Exception {
    List<List<CoreLabel>> allWords = new ArrayList<List<CoreLabel>>();
    List<Tree> allTrees = new ArrayList<Tree>();

    CoNLL2011DocumentReader.Document conllDoc = reader.getNextDocument();
    if (conllDoc == null) {
      return null;
    }

    Annotation anno = conllDoc.getAnnotation();
    List<CoreMap> sentences = anno.get(CoreAnnotations.SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
      if (!Constants.USE_GOLD_PARSES && !replicateCoNLL) {
        // Remove tree from annotation and replace with a parse from the Stanford parser
        sentence.remove(TreeCoreAnnotations.TreeAnnotation.class);
      } else {
        Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
        // generate the dependency graph
        try {
          SemanticGraph deps =
              SemanticGraphFactory.makeFromTree(
                  tree, SemanticGraphFactory.Mode.COLLAPSED, includeExtras, lemmatize, threadSafe);
          SemanticGraph basicDeps =
              SemanticGraphFactory.makeFromTree(
                  tree, SemanticGraphFactory.Mode.BASIC, includeExtras, lemmatize, threadSafe);
          sentence.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, basicDeps);
          sentence.set(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class, deps);
        } catch (Exception e) {
          logger.log(
              Level.WARNING,
              "Exception caught during extraction of Stanford dependencies. Will ignore and continue...",
              e);
        }
      }
    }

    String preSpeaker = null;
    int utterance = -1;
    for (CoreLabel token : anno.get(CoreAnnotations.TokensAnnotation.class)) {
      if (!token.containsKey(CoreAnnotations.SpeakerAnnotation.class)) {
        token.set(CoreAnnotations.SpeakerAnnotation.class, "");
      }
      String curSpeaker = token.get(CoreAnnotations.SpeakerAnnotation.class);
      if (!curSpeaker.equals(preSpeaker)) {
        utterance++;
        preSpeaker = curSpeaker;
      }
      token.set(CoreAnnotations.UtteranceAnnotation.class, utterance);
    }

    // Run pipeline
    stanfordProcessor.annotate(anno);

    for (CoreMap sentence : anno.get(CoreAnnotations.SentencesAnnotation.class)) {
      allWords.add(sentence.get(CoreAnnotations.TokensAnnotation.class));
      allTrees.add(sentence.get(TreeCoreAnnotations.TreeAnnotation.class));
    }

    // Initialize gold mentions
    List<List<Mention>> allGoldMentions = extractGoldMentions(conllDoc);

    List<List<Mention>> allPredictedMentions;
    if (Constants.USE_GOLD_MENTIONS) {
      // allPredictedMentions = allGoldMentions;
      // make a copy of the gold mentions, since mentions may later be merged
      // and mention IDs changed
      allPredictedMentions = makeCopy(allGoldMentions);
    } else if (Constants.USE_GOLD_MENTION_BOUNDARIES) {
      allPredictedMentions =
          ((RuleBasedCorefMentionFinder) mentionFinder)
              .filterPredictedMentions(allGoldMentions, anno, dictionaries);
    } else {
      allPredictedMentions = mentionFinder.extractPredictedMentions(anno, maxID, dictionaries);
    }

    try {
      recallErrors(allGoldMentions, allPredictedMentions, anno);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    Document doc = arrange(anno, allWords, allTrees, allPredictedMentions, allGoldMentions, true);
    doc.conllDoc = conllDoc;
    return doc;
  }
Example #10
  @Override
  public Document nextDoc() throws Exception {
    List<List<CoreLabel>> allWords = new ArrayList<List<CoreLabel>>();
    List<Tree> allTrees = new ArrayList<Tree>();
    List<List<Mention>> allGoldMentions = new ArrayList<List<Mention>>();
    List<List<Mention>> allPredictedMentions;
    List<CoreMap> allSentences = new ArrayList<CoreMap>();
    Annotation docAnno = new Annotation("");

    Pattern docPattern =
        Pattern.compile("<DOC>(.*?)</DOC>", Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
    Pattern sentencePattern =
        Pattern.compile(
            "(<s>|<hl>|<dd>|<DATELINE>)(.*?)(</s>|</hl>|</dd>|</DATELINE>)",
            Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
    Matcher docMatcher = docPattern.matcher(fileContents);
    if (!docMatcher.find(currentOffset)) return null;

    currentOffset = docMatcher.end();
    String doc = docMatcher.group(1);
    Matcher sentenceMatcher = sentencePattern.matcher(doc);
    String ner = null;

    // Maintain current document ID.
    Pattern docIDPattern =
        Pattern.compile("<DOCNO>(.*?)</DOCNO>", Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
    Matcher docIDMatcher = docIDPattern.matcher(doc);
    if (docIDMatcher.find()) currentDocumentID = docIDMatcher.group(1);
    else currentDocumentID = "documentAfter " + currentDocumentID;

    while (sentenceMatcher.find()) {
      String sentenceString = sentenceMatcher.group(2);
      List<CoreLabel> words =
          tokenizerFactory.getTokenizer(new StringReader(sentenceString)).tokenize();

      // FIXING TOKENIZATION PROBLEMS
      for (int i = 0; i < words.size(); i++) {
        CoreLabel w = words.get(i);
        if (i > 0 && w.word().equals("$")) {
          if (!words.get(i - 1).word().endsWith("PRP") && !words.get(i - 1).word().endsWith("WP"))
            continue;
          words.get(i - 1).set(CoreAnnotations.TextAnnotation.class, words.get(i - 1).word() + "$");
          words.remove(i);
          i--;
        } else if (w.word().equals("\\/")) {
          if (words.get(i - 1).word().equals("</COREF>")) continue;
          w.set(
              CoreAnnotations.TextAnnotation.class,
              words.get(i - 1).word() + "\\/" + words.get(i + 1).word());
          words.remove(i + 1);
          words.remove(i - 1);
        }
      }
      // END FIXING TOKENIZATION PROBLEMS

      List<CoreLabel> sentence = new ArrayList<CoreLabel>();
      // MUC accepts embedded coref mentions, so we need to keep a stack for the mentions currently
      // open
      Stack<Mention> stack = new Stack<Mention>();
      List<Mention> mentions = new ArrayList<Mention>();

      allWords.add(sentence);
      allGoldMentions.add(mentions);

      for (CoreLabel word : words) {
        String w = word.get(CoreAnnotations.TextAnnotation.class);
        // found regular token: WORD/POS
        if (!w.startsWith("<") && w.contains("\\/") && w.lastIndexOf("\\/") != w.length() - 2) {
          int i = w.lastIndexOf("\\/");
          String w1 = w.substring(0, i);
          // we do NOT set POS info here. We take the POS tags from the parser!
          word.set(CoreAnnotations.TextAnnotation.class, w1);
          word.remove(CoreAnnotations.OriginalTextAnnotation.class);
          if (Constants.USE_GOLD_NE) {
            if (ner != null) {
              word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
            } else {
              word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
            }
          }
          sentence.add(word);
        }
        // found the start SGML tag for a NE, e.g., "<ORGANIZATION>"
        else if (w.startsWith("<") && !w.startsWith("<COREF") && !w.startsWith("</")) {
          Pattern nerPattern = Pattern.compile("<(.*?)>");
          Matcher m = nerPattern.matcher(w);
          m.find();
          ner = m.group(1);
        }
        // found the end SGML tag for a NE, e.g., "</ORGANIZATION>"
        else if (w.startsWith("</") && !w.startsWith("</COREF")) {
          Pattern nerPattern = Pattern.compile("</(.*?)>");
          Matcher m = nerPattern.matcher(w);
          m.find();
          String ner1 = m.group(1);
          if (ner != null && !ner.equals(ner1))
            throw new RuntimeException("Unmatched NE labels in MUC file: " + ner + " v. " + ner1);
          ner = null;
        }
        // found the start SGML tag for a coref mention
        else if (w.startsWith("<COREF")) {
          Mention mention = new Mention();
          // position of this mention in the sentence
          mention.startIndex = sentence.size();

          // extract GOLD info about this coref chain. needed for eval
          Pattern idPattern = Pattern.compile("ID=\"(.*?)\"");
          Pattern refPattern = Pattern.compile("REF=\"(.*?)\"");

          Matcher m = idPattern.matcher(w);
          m.find();
          mention.mentionID = Integer.parseInt(m.group(1));

          m = refPattern.matcher(w);
          if (m.find()) {
            mention.originalRef = Integer.parseInt(m.group(1));
          }

          // open mention. keep track of all open mentions using the stack
          stack.push(mention);
        }
        // found the end SGML tag for a coref mention
        else if (w.equals("</COREF>")) {
          Mention mention = stack.pop();
          mention.endIndex = sentence.size();

          // this is a closed mention. add it to the final list of mentions
          // System.err.printf("Found MENTION: ID=%d, REF=%d\n", mention.mentionID,
          // mention.originalRef);
          mentions.add(mention);
        } else {
          word.remove(CoreAnnotations.OriginalTextAnnotation.class);
          if (Constants.USE_GOLD_NE) {
            if (ner != null) {
              word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
            } else {
              word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
            }
          }
          sentence.add(word);
        }
      }
      StringBuilder textContent = new StringBuilder();
      for (int i = 0; i < sentence.size(); i++) {
        CoreLabel w = sentence.get(i);
        w.set(CoreAnnotations.IndexAnnotation.class, i + 1);
        w.set(CoreAnnotations.UtteranceAnnotation.class, 0);
        if (i > 0) textContent.append(" ");
        textContent.append(w.getString(CoreAnnotations.TextAnnotation.class));
      }
      CoreMap sentCoreMap = new Annotation(textContent.toString());
      allSentences.add(sentCoreMap);
      sentCoreMap.set(CoreAnnotations.TokensAnnotation.class, sentence);
    }

    // assign goldCorefClusterID
    Map<Integer, Mention> idMention = Generics.newHashMap(); // temporary use
    for (List<Mention> goldMentions : allGoldMentions) {
      for (Mention m : goldMentions) {
        idMention.put(m.mentionID, m);
      }
    }
    for (List<Mention> goldMentions : allGoldMentions) {
      for (Mention m : goldMentions) {
        if (m.goldCorefClusterID == -1) {
          if (m.originalRef == -1) m.goldCorefClusterID = m.mentionID;
          else {
            int ref = m.originalRef;
            // follow the REF chain until we reach a mention with an assigned
            // cluster or the chain root (originalRef == -1)
            while (true) {
              Mention m2 = idMention.get(ref);
              if (m2.goldCorefClusterID != -1) {
                m.goldCorefClusterID = m2.goldCorefClusterID;
                break;
              } else if (m2.originalRef == -1) {
                m2.goldCorefClusterID = m2.mentionID;
                m.goldCorefClusterID = m2.goldCorefClusterID;
                break;
              } else {
                ref = m2.originalRef;
              }
            }
          }
        }
      }
    }

    docAnno.set(CoreAnnotations.SentencesAnnotation.class, allSentences);
    stanfordProcessor.annotate(docAnno);

    if (allSentences.size() != allWords.size())
      throw new IllegalStateException("allSentences != allWords");
    for (int i = 0; i < allSentences.size(); i++) {
      List<CoreLabel> annotatedSent =
          allSentences.get(i).get(CoreAnnotations.TokensAnnotation.class);
      List<CoreLabel> unannotatedSent = allWords.get(i);
      List<Mention> mentionInSent = allGoldMentions.get(i);
      for (Mention m : mentionInSent) {
        m.dependency =
            allSentences
                .get(i)
                .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
      }
      if (annotatedSent.size() != unannotatedSent.size()) {
        throw new IllegalStateException("annotatedSent != unannotatedSent");
      }
      for (int j = 0, sz = annotatedSent.size(); j < sz; j++) {
        CoreLabel annotatedWord = annotatedSent.get(j);
        CoreLabel unannotatedWord = unannotatedSent.get(j);
        if (!annotatedWord
            .get(CoreAnnotations.TextAnnotation.class)
            .equals(unannotatedWord.get(CoreAnnotations.TextAnnotation.class))) {
          throw new IllegalStateException("annotatedWord != unannotatedWord");
        }
      }
      allWords.set(i, annotatedSent);
      allTrees.add(allSentences.get(i).get(TreeCoreAnnotations.TreeAnnotation.class));
    }

    // extract predicted mentions
    if (Constants.USE_GOLD_MENTIONS) allPredictedMentions = allGoldMentions;
    else
      allPredictedMentions = mentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries);

    // add the relevant fields to mentions and order them for coref
    return arrange(docAnno, allWords, allTrees, allPredictedMentions, allGoldMentions, true);
  }
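Both nextDoc() implementations follow the same reader contract, returning null once input is exhausted; a hypothetical driver loop (process is a stand-in consumer):

  public void readAllDocs() throws Exception {
    Document doc;
    while ((doc = nextDoc()) != null) {
      // hand each arranged Document to downstream coref resolution / scoring
      process(doc); // process(...) is hypothetical
    }
  }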
Example #11
  public void annotate(CoreMap document) throws IOException {
    // write input file in GUTime format
    Element inputXML = toInputXML(document);
    File inputFile = File.createTempFile("gutime", ".input");

    // Document doc = new Document(inputXML);
    PrintWriter inputWriter = new PrintWriter(inputFile);
    inputWriter.println(inputXML.toXML());
    // new XMLOutputter().output(inputXML, inputWriter);
    inputWriter.close();

    boolean useFirstDate =
        (!document.has(CoreAnnotations.CalendarAnnotation.class)
            && !document.has(CoreAnnotations.DocDateAnnotation.class));

    ArrayList<String> args = new ArrayList<String>();
    args.add("perl");
    args.add("-I" + this.gutimePath.getPath());
    args.add(new File(this.gutimePath, "TimeTag.pl").getPath());
    if (useFirstDate) args.add("-FDNW");
    args.add(inputFile.getPath());
    // run GUTime on the input file
    ProcessBuilder process = new ProcessBuilder(args);

    StringWriter outputWriter = new StringWriter();
    SystemUtils.run(process, outputWriter, null);
    String output = outputWriter.getBuffer().toString();
    // drop anything the tagger emits after the closing </DOC>
    Pattern docClose = Pattern.compile("</DOC>.*", Pattern.DOTALL);
    output = docClose.matcher(output).replaceAll("</DOC>");

    // parse the GUTime output
    Element outputXML;
    try {
      Document newNodeDocument = new Builder().build(output, "");
      outputXML = newNodeDocument.getRootElement();
    } catch (ParsingException ex) {
      throw new RuntimeException(
          String.format(
              "error:\n%s\ninput:\n%s\noutput:\n%s", ex, IOUtils.slurpFile(inputFile), output));
    }
    /*
    try {
      outputXML = new SAXBuilder().build(new StringReader(output)).getRootElement();
    } catch (JDOMException e) {
      throw new RuntimeException(String.format("error:\n%s\ninput:\n%s\noutput:\n%s",
      		e, IOUtils.slurpFile(inputFile), output));
    } */
    inputFile.delete();

    // get Timex annotations
    List<CoreMap> timexAnns = toTimexCoreMaps(outputXML, document);
    document.set(TimexAnnotations.class, timexAnns);
    if (outputResults) {
      System.out.println(timexAnns);
    }

    // align Timex annotations to sentences
    int timexIndex = 0;
    for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
      int sentBegin = beginOffset(sentence);
      int sentEnd = endOffset(sentence);

      // skip times before the sentence
      while (timexIndex < timexAnns.size() && beginOffset(timexAnns.get(timexIndex)) < sentBegin) {
        ++timexIndex;
      }

      // determine times within the sentence
      int sublistBegin = timexIndex;
      int sublistEnd = timexIndex;
      while (timexIndex < timexAnns.size()
          && sentBegin <= beginOffset(timexAnns.get(timexIndex))
          && endOffset(timexAnns.get(timexIndex)) <= sentEnd) {
        ++sublistEnd;
        ++timexIndex;
      }

      // set the sentence timexes
      sentence.set(TimexAnnotations.class, timexAnns.subList(sublistBegin, sublistEnd));
    }
  }
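A hedged driver for this GUTime wrapper: tokenize and sentence-split the text, set a document date (without one, the -FDNW flag above tells GUTime to use the first date it finds), then read back the timexes; the pipeline setup is an assumption:

  public void demoGutime(String text) throws IOException {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation document = new Annotation(text);
    pipeline.annotate(document);
    document.set(CoreAnnotations.DocDateAnnotation.class, "2013-07-14");
    annotate(document);
    // annotate() set both document-level and per-sentence TimexAnnotations
    System.out.println(document.get(TimexAnnotations.class));
  }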
Example #12
 private static List<CoreMap> toTimexCoreMaps(Element docElem, CoreMap originalDocument) {
   // --Collect Token Offsets
   HashMap<Integer, Integer> beginMap = new HashMap<Integer, Integer>();
   HashMap<Integer, Integer> endMap = new HashMap<Integer, Integer>();
   boolean haveTokenOffsets = true;
   for (CoreMap sent : originalDocument.get(CoreAnnotations.SentencesAnnotation.class)) {
     for (CoreLabel token : sent.get(CoreAnnotations.TokensAnnotation.class)) {
       Integer tokBegin = token.get(CoreAnnotations.TokenBeginAnnotation.class);
       Integer tokEnd = token.get(CoreAnnotations.TokenEndAnnotation.class);
       if (tokBegin == null || tokEnd == null) {
         haveTokenOffsets = false;
       }
       int charBegin = token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
       int charEnd = token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
       beginMap.put(charBegin, tokBegin);
       endMap.put(charEnd, tokEnd);
     }
   }
   // --Set Timexes
   List<CoreMap> timexMaps = new ArrayList<CoreMap>();
   int offset = 0;
   Element textElem = docElem.getFirstChildElement("text");
   for (int i = 0; i < textElem.getChildCount(); i++) {
     Node content = textElem.getChild(i);
     if (content instanceof Text) {
       Text text = (Text) content;
       offset += text.getValue().length();
     } else if (content instanceof Element) {
       Element child = (Element) content;
       if (child.getLocalName().equals("TIMEX3")) {
         Timex timex = new Timex(child);
         if (child.getChildCount() != 1) {
           throw new RuntimeException("TIMEX3 should only contain text " + child);
         }
         String timexText = child.getValue();
         CoreMap timexMap = new ArrayCoreMap();
         // (timex)
         timexMap.set(TimexAnnotation.class, timex);
         // (text)
         timexMap.set(CoreAnnotations.TextAnnotation.class, timexText);
         // (characters)
         int charBegin = offset;
         timexMap.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, charBegin);
         offset += timexText.length();
         int charEnd = offset;
         timexMap.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, charEnd);
         // (tokens)
         if (haveTokenOffsets) {
           Integer tokBegin = beginMap.get(charBegin);
           int searchStep = 1; // if no exact match, search around the character offset
           while (tokBegin == null) {
             tokBegin = beginMap.get(charBegin - searchStep);
             if (tokBegin == null) {
               tokBegin = beginMap.get(charBegin + searchStep);
             }
             searchStep += 1;
           }
           searchStep = 1;
           Integer tokEnd = endMap.get(charEnd);
           while (tokEnd == null) {
             tokEnd = endMap.get(charEnd - searchStep);
             if (tokEnd == null) {
               tokEnd = endMap.get(charEnd + searchStep);
             }
             searchStep += 1;
           }
           timexMap.set(CoreAnnotations.TokenBeginAnnotation.class, tokBegin);
           timexMap.set(CoreAnnotations.TokenEndAnnotation.class, tokEnd);
         }
         // (add)
         timexMaps.add(timexMap);
       } else {
         throw new RuntimeException("unexpected element " + child);
       }
     } else {
       throw new RuntimeException("unexpected content " + content);
     }
   }
   return timexMaps;
 }
Example #13
  /**
   * Label entities in an ExtractionSentence. Assumes the classifier has already been trained.
   *
   * @param sentence ExtractionSentence that we want to extract entities from
   * @return an ExtractionSentence with text content, tree and entities set. Relations will not be
   *     set.
   */
  private CoreMap extractEntities(CoreMap sentence, int sentCount) {
    // don't add answer annotations
    List<CoreLabel> testSentence =
        AnnotationUtils.sentenceEntityMentionsToCoreLabels(
            sentence, false, annotationsToSkip, null, useSubTypes, useBIO);

    // now label the sentence
    List<CoreLabel> annotatedSentence = this.classifier.classify(testSentence);
    if (logger.isLoggable(Level.FINEST)) {
      logger.finest("CLASSFIER OUTPUT: " + annotatedSentence);
    }

    List<EntityMention> extractedEntities = new ArrayList<>();
    int i = 0;

    // variables which keep track of partially seen entities (i.e. we've seen
    // some but not all the words in them so far)
    String lastType = null;
    int startIndex = -1;

    //
    // note that labels may be in the BIO or just the IO format. we must handle both transparently
    //
    for (CoreLabel label : annotatedSentence) {
      String type = label.get(AnswerAnnotation.class);
      if (type.equals(SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL)) {
        type = null;
      }

      // this is an entity end boundary followed by O
      if (type == null && lastType != null) {
        makeEntityMention(sentence, startIndex, i, lastType, extractedEntities, sentCount);
        logger.info("Found entity: " + extractedEntities.get(extractedEntities.size() - 1));
        startIndex = -1;
      }

      // entity start preceded by an O
      else if (lastType == null && type != null) {
        startIndex = i;
      }

      // entity end followed by another entity of different type
      else if (lastType != null
          && type != null
          && (type.startsWith("B-")
              || (lastType.startsWith("I-") && type.startsWith("I-") && !lastType.equals(type))
              || (notBIO(lastType) && notBIO(type) && !lastType.equals(type)))) {
        makeEntityMention(sentence, startIndex, i, lastType, extractedEntities, sentCount);
        logger.info("Found entity: " + extractedEntities.get(extractedEntities.size() - 1));
        startIndex = i;
      }

      lastType = type;
      i++;
    }

    // close out an entity that runs to the end of the sentence
    if (lastType != null && startIndex != -1) {
      makeEntityMention(sentence, startIndex, i, lastType, extractedEntities, sentCount);
      logger.info("Found entity: " + extractedEntities.get(extractedEntities.size() - 1));
    }

    // replace the original annotation with the predicted entities
    sentence.set(MachineReadingAnnotations.EntityMentionsAnnotation.class, extractedEntities);
    logger.finest("EXTRACTED ENTITIES: ");
    for (EntityMention e : extractedEntities) {
      if (logger.isLoggable(Level.FINEST)) {
        logger.finest("\t" + e);
      }
    }

    postprocessSentence(sentence, sentCount);

    return sentence;
  }
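The heart of the loop above is the boundary test between consecutive labels; a standalone restatement (not in the source) covering both BIO and plain IO tagging, reusing the snippet's notBIO helper:

  // true when the label at the current position opens a new mention
  private static boolean startsNewMention(String lastType, String type) {
    if (type == null) return false; // background symbol never opens a mention
    if (lastType == null) return true; // O -> X boundary
    return type.startsWith("B-")
        || (lastType.startsWith("I-") && type.startsWith("I-") && !lastType.equals(type))
        || (notBIO(lastType) && notBIO(type) && !lastType.equals(type));
  }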