Esempio n. 1
0
  /**
   * Sets UtteranceAnnotation for tokens inside quotations (a default UtteranceAnnotation of 0 is
   * given for everything else) and writes a placeholder speaker on tokens that lack real speaker
   * information.
   *
   * <p>Every opening quote increments {@code maxUtter}; the opening quote token itself is skipped
   * and receives neither annotation. A token counts as having no speaker information when it has
   * no SpeakerAnnotation, an empty one, or one starting with {@code "PER"}; such tokens get the
   * speaker {@code "PER" + <their UtteranceAnnotation>}.
   *
   * <p>When called with {@code normalQuotationType == false} and {@code maxUtter} is still 0
   * afterwards, the method runs once more with straight double quotes enabled.
   *
   * @param results sentences whose tokens are annotated in place
   * @param normalQuotationType whether a straight double quote ({@code "}) also opens and closes
   *     quotations, in addition to {@code ``} and {@code ''}
   */
  private void markQuotations(List<CoreMap> results, boolean normalQuotationType) {
    boolean inQuote = false;
    for (CoreMap sentence : results) {
      for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
        String text = token.get(CoreAnnotations.TextAnnotation.class);

        // Speaker status is decided up front, before this token is modified below.
        boolean noSpeakerInfo = true;
        if (token.containsKey(CoreAnnotations.SpeakerAnnotation.class)) {
          String speaker = token.get(CoreAnnotations.SpeakerAnnotation.class);
          noSpeakerInfo = speaker.equals("") || speaker.startsWith("PER");
        }

        // A straight quote toggles the state, so it opens only when outside a quotation and
        // closes only when inside one.
        boolean straightQuote = normalQuotationType && text.equals("\"");

        if (text.equals("``") || (!inQuote && straightQuote)) {
          inQuote = true;
          maxUtter++;
          continue;
        }
        if (text.equals("''") || (inQuote && straightQuote)) {
          inQuote = false;
        }

        if (inQuote) {
          token.set(CoreAnnotations.UtteranceAnnotation.class, maxUtter);
        }
        if (noSpeakerInfo) {
          token.set(
              CoreAnnotations.SpeakerAnnotation.class,
              "PER" + token.get(CoreAnnotations.UtteranceAnnotation.class));
        }
      }
    }

    // Nothing was found with ``/'' alone: retry, this time accepting straight double quotes.
    if (maxUtter == 0 && !normalQuotationType) {
      markQuotations(results, true);
    }
  }
    /**
     * Converts one document, given as newline-separated column rows, into a list of labels.
     *
     * <p>Blank rows are skipped (but still counted in {@code lineCount}). Each remaining row is
     * split with {@code whitePattern} and turned into a {@link CoreLabel} using {@code map}. If
     * the label has an AnswerAnnotation but no GoldAnswerAnnotation, the answer is copied into
     * the gold answer.
     *
     * @param doc the raw document text
     * @return one label per non-blank row, in input order
     * @throws RuntimeException rethrown unchanged after the offending row is printed to stderr
     */
    @Override
    public List<CoreLabel> apply(String doc) {
      // Progress marker on stderr for every 1000th document (never for the very first one).
      boolean atProgressMark = num > 0 && num % 1000 == 0;
      if (atProgressMark) {
        System.err.print("[" + num + "]");
      }
      num++;

      List<CoreLabel> tokens = new ArrayList<>();

      for (String row : doc.split("\n")) {
        ++lineCount;
        if (row.trim().isEmpty()) {
          continue;
        }

        String[] columns = whitePattern.split(row);
        // TODO: this could be faster if the map were converted once into an array of CoreLabel
        // keys (Class<? extends CoreAnnotation<?>>) that are then instantiated directly; that
        // would need a new CoreLabel constructor.
        CoreLabel label;
        try {
          label = new CoreLabel(map, columns);
          // The map usually specifies only the answer column, so mirror it into the gold answer
          // unless something was put there explicitly.
          boolean goldMissing = !label.containsKey(CoreAnnotations.GoldAnswerAnnotation.class);
          if (goldMissing && label.containsKey(CoreAnnotations.AnswerAnnotation.class)) {
            label.set(
                CoreAnnotations.GoldAnswerAnnotation.class,
                label.get(CoreAnnotations.AnswerAnnotation.class));
          }
        } catch (RuntimeException ex) {
          System.err.println("Error on line " + lineCount + ": " + row);
          throw ex;
        }
        tokens.add(label);
      }

      return tokens;
    }
Esempio n. 3
0
 /**
  * Assigns a paragraph index to every token and copies it onto each predicted mention.
  *
  * <p>A new paragraph starts whenever a token's begin offset lies more than two characters past
  * the end offset of the previous token that had offsets. Tokens without a
  * CharacterOffsetBeginAnnotation get paragraph -1. Each mention takes the paragraph of its head
  * word, and {@code numParagraph} is set to the last index handed out.
  */
 private void setParagraphAnnotation() {
   int currentParagraph = 0;
   // Starts far enough back that the first token with offsets opens paragraph 1.
   int lastEnd = -10;

   for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
     for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
       if (!token.containsKey(CoreAnnotations.CharacterOffsetBeginAnnotation.class)) {
         token.set(CoreAnnotations.ParagraphAnnotation.class, -1);
         continue;
       }

       int begin = token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
       if (begin > lastEnd + 2) {
         currentParagraph++;
       }
       token.set(CoreAnnotations.ParagraphAnnotation.class, currentParagraph);
       lastEnd = token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
     }
   }

   for (List<Mention> sentenceMentions : predictedOrderedMentionsBySentence) {
     for (Mention mention : sentenceMentions) {
       mention.paragraph = mention.headWord.get(CoreAnnotations.ParagraphAnnotation.class);
     }
   }

   numParagraph = currentParagraph;
 }
  /**
   * Reads the next CoNLL document, annotates it, and arranges it into a {@link Document}.
   *
   * <p>Steps, in order: build (or discard) per-sentence parse information, number utterances by
   * speaker change, run {@code stanfordProcessor}, collect tokens and trees per sentence, obtain
   * gold and predicted mentions, and hand everything to {@code arrange}.
   *
   * @return the arranged document, or {@code null} when the reader yields no further document
   * @throws Exception propagated from the reader and the processing steps; an IOException from
   *     {@code recallErrors} is wrapped in a RuntimeException
   */
  @Override
  public Document nextDoc() throws Exception {
    CoNLL2011DocumentReader.Document rawDoc = reader.getNextDocument();
    if (rawDoc == null) {
      return null;
    }

    Annotation docAnnotation = rawDoc.getAnnotation();

    // Either keep the provided trees and derive dependencies from them, or drop the trees so the
    // pipeline run further down supplies its own parse.
    boolean keepProvidedTrees = Constants.USE_GOLD_PARSES || replicateCoNLL;
    for (CoreMap sent : docAnnotation.get(CoreAnnotations.SentencesAnnotation.class)) {
      if (!keepProvidedTrees) {
        sent.remove(TreeCoreAnnotations.TreeAnnotation.class);
        continue;
      }

      Tree providedTree = sent.get(TreeCoreAnnotations.TreeAnnotation.class);
      try {
        // Both graphs are built before either is stored, so a failure leaves the sentence
        // without any dependency annotation.
        SemanticGraph collapsed =
            SemanticGraphFactory.makeFromTree(
                providedTree,
                SemanticGraphFactory.Mode.COLLAPSED,
                includeExtras,
                lemmatize,
                threadSafe);
        SemanticGraph basic =
            SemanticGraphFactory.makeFromTree(
                providedTree,
                SemanticGraphFactory.Mode.BASIC,
                includeExtras,
                lemmatize,
                threadSafe);
        sent.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, basic);
        sent.set(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class, collapsed);
      } catch (Exception ex) {
        logger.log(
            Level.WARNING,
            "Exception caught during extraction of Stanford dependencies. Will ignore and continue...",
            ex);
      }
    }

    // Number utterances: the index advances each time the speaker differs from the previous
    // token's speaker. Tokens without a speaker are given the empty string.
    String lastSpeaker = null;
    int utteranceIndex = -1;
    for (CoreLabel tok : docAnnotation.get(CoreAnnotations.TokensAnnotation.class)) {
      if (!tok.containsKey(CoreAnnotations.SpeakerAnnotation.class)) {
        tok.set(CoreAnnotations.SpeakerAnnotation.class, "");
      }
      String speaker = tok.get(CoreAnnotations.SpeakerAnnotation.class);
      boolean sameSpeaker = speaker.equals(lastSpeaker);
      if (!sameSpeaker) {
        utteranceIndex++;
        lastSpeaker = speaker;
      }
      tok.set(CoreAnnotations.UtteranceAnnotation.class, utteranceIndex);
    }

    // Run the annotation pipeline.
    stanfordProcessor.annotate(docAnnotation);

    // Gather per-sentence tokens and trees after the pipeline has run.
    List<List<CoreLabel>> wordsBySentence = new ArrayList<List<CoreLabel>>();
    List<Tree> treesBySentence = new ArrayList<Tree>();
    for (CoreMap sent : docAnnotation.get(CoreAnnotations.SentencesAnnotation.class)) {
      wordsBySentence.add(sent.get(CoreAnnotations.TokensAnnotation.class));
      treesBySentence.add(sent.get(TreeCoreAnnotations.TreeAnnotation.class));
    }

    // Gold mentions come straight from the CoNLL document.
    List<List<Mention>> goldMentions = extractGoldMentions(rawDoc);

    List<List<Mention>> predictedMentions;
    if (Constants.USE_GOLD_MENTIONS) {
      // Work on a copy rather than the gold lists themselves: mentions may later be merged or
      // have their mention IDs changed.
      predictedMentions = makeCopy(goldMentions);
    } else if (Constants.USE_GOLD_MENTION_BOUNDARIES) {
      RuleBasedCorefMentionFinder ruleBasedFinder = (RuleBasedCorefMentionFinder) mentionFinder;
      predictedMentions =
          ruleBasedFinder.filterPredictedMentions(goldMentions, docAnnotation, dictionaries);
    } else {
      predictedMentions =
          mentionFinder.extractPredictedMentions(docAnnotation, maxID, dictionaries);
    }

    try {
      recallErrors(goldMentions, predictedMentions, docAnnotation);
    } catch (IOException ioe) {
      throw new RuntimeException(ioe);
    }

    Document arranged =
        arrange(
            docAnnotation, wordsBySentence, treesBySentence, predictedMentions, goldMentions, true);
    arranged.conllDoc = rawDoc;
    return arranged;
  }