コード例 #1
0
 public void resetDocs() {
   super.resetDocs();
   reader.reset();
 }
コード例 #2
0
  @Override
  public Document nextDoc() throws Exception {
    List<List<CoreLabel>> allWords = new ArrayList<List<CoreLabel>>();
    List<Tree> allTrees = new ArrayList<Tree>();

    CoNLL2011DocumentReader.Document conllDoc = reader.getNextDocument();
    if (conllDoc == null) {
      return null;
    }

    Annotation anno = conllDoc.getAnnotation();
    List<CoreMap> sentences = anno.get(CoreAnnotations.SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
      if (!Constants.USE_GOLD_PARSES && !replicateCoNLL) {
        // Remove tree from annotation and replace with parse using stanford parser
        sentence.remove(TreeCoreAnnotations.TreeAnnotation.class);
      } else {
        Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
        // generate the dependency graph
        try {
          SemanticGraph deps =
              SemanticGraphFactory.makeFromTree(
                  tree, SemanticGraphFactory.Mode.COLLAPSED, includeExtras, lemmatize, threadSafe);
          SemanticGraph basicDeps =
              SemanticGraphFactory.makeFromTree(
                  tree, SemanticGraphFactory.Mode.BASIC, includeExtras, lemmatize, threadSafe);
          sentence.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, basicDeps);
          sentence.set(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class, deps);
        } catch (Exception e) {
          logger.log(
              Level.WARNING,
              "Exception caught during extraction of Stanford dependencies. Will ignore and continue...",
              e);
        }
      }
    }

    String preSpeaker = null;
    int utterance = -1;
    for (CoreLabel token : anno.get(CoreAnnotations.TokensAnnotation.class)) {
      if (!token.containsKey(CoreAnnotations.SpeakerAnnotation.class)) {
        token.set(CoreAnnotations.SpeakerAnnotation.class, "");
      }
      String curSpeaker = token.get(CoreAnnotations.SpeakerAnnotation.class);
      if (!curSpeaker.equals(preSpeaker)) {
        utterance++;
        preSpeaker = curSpeaker;
      }
      token.set(CoreAnnotations.UtteranceAnnotation.class, utterance);
    }

    // Run pipeline
    stanfordProcessor.annotate(anno);

    for (CoreMap sentence : anno.get(CoreAnnotations.SentencesAnnotation.class)) {
      allWords.add(sentence.get(CoreAnnotations.TokensAnnotation.class));
      allTrees.add(sentence.get(TreeCoreAnnotations.TreeAnnotation.class));
    }

    // Initialize gold mentions
    List<List<Mention>> allGoldMentions = extractGoldMentions(conllDoc);

    List<List<Mention>> allPredictedMentions;
    if (Constants.USE_GOLD_MENTIONS) {
      // allPredictedMentions = allGoldMentions;
      // Make copy of gold mentions since mentions may be later merged, mentionID's changed and
      // stuff
      allPredictedMentions = makeCopy(allGoldMentions);
    } else if (Constants.USE_GOLD_MENTION_BOUNDARIES) {
      allPredictedMentions =
          ((RuleBasedCorefMentionFinder) mentionFinder)
              .filterPredictedMentions(allGoldMentions, anno, dictionaries);
    } else {
      allPredictedMentions = mentionFinder.extractPredictedMentions(anno, maxID, dictionaries);
    }

    try {
      recallErrors(allGoldMentions, allPredictedMentions, anno);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    Document doc = arrange(anno, allWords, allTrees, allPredictedMentions, allGoldMentions, true);
    doc.conllDoc = conllDoc;
    return doc;
  }