예제 #1
0
  private String findNextParagraphSpeaker(
      List<CoreMap> paragraph, int paragraphOffset, Dictionaries dict) {
    CoreMap lastSent = paragraph.get(paragraph.size() - 1);
    String speaker = "";
    for (CoreLabel w : lastSent.get(CoreAnnotations.TokensAnnotation.class)) {
      if (w.get(CoreAnnotations.LemmaAnnotation.class).equals("report")
          || w.get(CoreAnnotations.LemmaAnnotation.class).equals("say")) {
        String word = w.get(CoreAnnotations.TextAnnotation.class);
        SemanticGraph dependency =
            lastSent.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
        IndexedWord t = dependency.getNodeByWordPattern(word);

        for (Pair<GrammaticalRelation, IndexedWord> child : dependency.childPairs(t)) {
          if (child.first().getShortName().equals("nsubj")) {
            int subjectIndex = child.second().index(); // start from 1
            IntTuple headPosition = new IntTuple(2);
            headPosition.set(0, paragraph.size() - 1 + paragraphOffset);
            headPosition.set(1, subjectIndex - 1);
            if (mentionheadPositions.containsKey(headPosition)
                && mentionheadPositions.get(headPosition).nerString.startsWith("PER")) {
              speaker = Integer.toString(mentionheadPositions.get(headPosition).mentionID);
            }
          }
        }
      }
    }
    return speaker;
  }
예제 #2
0
  private boolean findSpeaker(
      int utterNum,
      int sentNum,
      List<CoreMap> sentences,
      int startIndex,
      int endIndex,
      Dictionaries dict) {
    List<CoreLabel> sent = sentences.get(sentNum).get(CoreAnnotations.TokensAnnotation.class);
    for (int i = startIndex; i < endIndex; i++) {
      if (sent.get(i).get(CoreAnnotations.UtteranceAnnotation.class) != 0) continue;
      String lemma = sent.get(i).get(CoreAnnotations.LemmaAnnotation.class);
      String word = sent.get(i).get(CoreAnnotations.TextAnnotation.class);
      if (dict.reportVerb.contains(lemma)) {
        // find subject
        SemanticGraph dependency =
            sentences
                .get(sentNum)
                .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
        IndexedWord w = dependency.getNodeByWordPattern(word);

        if (w != null) {
          for (Pair<GrammaticalRelation, IndexedWord> child : dependency.childPairs(w)) {
            if (child.first().getShortName().equals("nsubj")) {
              String subjectString = child.second().word();
              int subjectIndex = child.second().index(); // start from 1
              IntTuple headPosition = new IntTuple(2);
              headPosition.set(0, sentNum);
              headPosition.set(1, subjectIndex - 1);
              String speaker;
              if (mentionheadPositions.containsKey(headPosition)) {
                speaker = Integer.toString(mentionheadPositions.get(headPosition).mentionID);
              } else {
                speaker = subjectString;
              }
              speakers.put(utterNum, speaker);
              return true;
            }
          }
        } else {
          SieveCoreferenceSystem.logger.warning("Cannot find node in dependency for word " + word);
        }
      }
    }
    return false;
  }
예제 #3
0
    public CorefMention(Mention m, IntTuple pos) {
      mentionType = m.mentionType;
      number = m.number;
      gender = m.gender;
      animacy = m.animacy;
      startIndex = m.startIndex + 1;
      endIndex = m.endIndex + 1;
      headIndex = m.headIndex + 1;
      corefClusterID = m.corefClusterID;
      sentNum = m.sentNum + 1;
      mentionID = m.mentionID;
      mentionSpan = m.spanToString();

      // index starts from 1
      position = new IntTuple(2);
      position.set(0, pos.get(0) + 1);
      position.set(1, pos.get(1) + 1);

      m.headWord.set(CorefCoreAnnotations.CorefClusterIdAnnotation.class, corefClusterID);
    }
예제 #4
0
 private String findParagraphSpeaker(
     List<CoreMap> paragraph,
     int paragraphUtterIndex,
     String nextParagraphSpeaker,
     int paragraphOffset,
     Dictionaries dict) {
   if (!speakers.containsKey(paragraphUtterIndex)) {
     if (!nextParagraphSpeaker.equals("")) {
       speakers.put(paragraphUtterIndex, nextParagraphSpeaker);
     } else { // find the speaker of this paragraph (John, nbc news)
       CoreMap lastSent = paragraph.get(paragraph.size() - 1);
       String speaker = "";
       boolean hasVerb = false;
       for (int i = 0; i < lastSent.get(CoreAnnotations.TokensAnnotation.class).size(); i++) {
         CoreLabel w = lastSent.get(CoreAnnotations.TokensAnnotation.class).get(i);
         String pos = w.get(CoreAnnotations.PartOfSpeechAnnotation.class);
         String ner = w.get(CoreAnnotations.NamedEntityTagAnnotation.class);
         if (pos.startsWith("V")) {
           hasVerb = true;
           break;
         }
         if (ner.startsWith("PER")) {
           IntTuple headPosition = new IntTuple(2);
           headPosition.set(0, paragraph.size() - 1 + paragraphOffset);
           headPosition.set(1, i);
           if (mentionheadPositions.containsKey(headPosition)) {
             speaker = Integer.toString(mentionheadPositions.get(headPosition).mentionID);
           }
         }
       }
       if (!hasVerb && !speaker.equals("")) {
         speakers.put(paragraphUtterIndex, speaker);
       }
     }
   }
   return findNextParagraphSpeaker(paragraph, paragraphOffset, dict);
 }
예제 #5
0
  /** initialize positions and corefClusters (put each mention in each CorefCluster) */
  private void initializeCorefCluster() {
    for (int i = 0; i < predictedOrderedMentionsBySentence.size(); i++) {
      for (int j = 0; j < predictedOrderedMentionsBySentence.get(i).size(); j++) {
        Mention m = predictedOrderedMentionsBySentence.get(i).get(j);
        if (allPredictedMentions.containsKey(m.mentionID)) {
          SieveCoreferenceSystem.logger.warning("WARNING: Already contain mention " + m.mentionID);
          Mention m1 = allPredictedMentions.get(m.mentionID);
          SieveCoreferenceSystem.logger.warning(
              "OLD mention: " + m1.spanToString() + "[" + m1.startIndex + "," + m1.endIndex + "]");
          SieveCoreferenceSystem.logger.warning(
              "NEW mention: " + m.spanToString() + "[" + m.startIndex + "," + m.endIndex + "]");
          //          SieveCoreferenceSystem.debugPrintMentions(System.err, "PREDICTED ORDERED",
          // predictedOrderedMentionsBySentence);
          //          SieveCoreferenceSystem.debugPrintMentions(System.err, "GOLD ORDERED",
          // goldOrderedMentionsBySentence);
        }
        assert (!allPredictedMentions.containsKey(m.mentionID));
        allPredictedMentions.put(m.mentionID, m);

        IntTuple pos = new IntTuple(2);
        pos.set(0, i);
        pos.set(1, j);
        positions.put(m, pos);
        m.sentNum = i;

        assert (!corefClusters.containsKey(m.mentionID));
        corefClusters.put(
            m.mentionID, new CorefCluster(m.mentionID, Generics.newHashSet(Arrays.asList(m))));
        m.corefClusterID = m.mentionID;

        IntTuple headPosition = new IntTuple(2);
        headPosition.set(0, i);
        headPosition.set(1, m.headIndex);
        mentionheadPositions.put(headPosition, m);
      }
    }
  }
예제 #6
0
 @Override
 public boolean equals(Object aThat) {
   if (this == aThat) return true;
   if (!(aThat instanceof CorefMention)) return false;
   CorefMention that = (CorefMention) aThat;
   if (mentionType != that.mentionType) return false;
   if (number != that.number) return false;
   if (gender != that.gender) return false;
   if (animacy != that.animacy) return false;
   if (startIndex != that.startIndex) return false;
   if (endIndex != that.endIndex) return false;
   if (headIndex != that.headIndex) return false;
   if (corefClusterID != that.corefClusterID) return false;
   if (mentionID != that.mentionID) return false;
   if (sentNum != that.sentNum) return false;
   if (!position.equals(that.position)) return false;
   // we ignore MentionSpan as it is constructed from the tokens
   // the mention is a span of, so if we know those spans are the
   // same, we should be able to ignore the actual text
   return true;
 }
예제 #7
0
 @Override
 public int hashCode() {
   return position.hashCode();
 }
예제 #8
0
  /** Extract gold coref link information */
  protected void extractGoldLinks() {
    //    List<List<Mention>> orderedMentionsBySentence = this.getOrderedMentions();
    List<Pair<IntTuple, IntTuple>> links = new ArrayList<Pair<IntTuple, IntTuple>>();

    // position of each mention in the input matrix, by id
    Map<Integer, IntTuple> positions = Generics.newHashMap();
    // positions of antecedents
    Map<Integer, List<IntTuple>> antecedents = Generics.newHashMap();
    for (int i = 0; i < goldOrderedMentionsBySentence.size(); i++) {
      for (int j = 0; j < goldOrderedMentionsBySentence.get(i).size(); j++) {
        Mention m = goldOrderedMentionsBySentence.get(i).get(j);
        int id = m.mentionID;
        IntTuple pos = new IntTuple(2);
        pos.set(0, i);
        pos.set(1, j);
        positions.put(id, pos);
        antecedents.put(id, new ArrayList<IntTuple>());
      }
    }

    //    SieveCoreferenceSystem.debugPrintMentions(System.err, "", goldOrderedMentionsBySentence);
    for (List<Mention> mentions : goldOrderedMentionsBySentence) {
      for (Mention m : mentions) {
        int id = m.mentionID;
        IntTuple src = positions.get(id);

        assert (src != null);
        if (m.originalRef >= 0) {
          IntTuple dst = positions.get(m.originalRef);
          if (dst == null) {
            throw new RuntimeException("Cannot find gold mention with ID=" + m.originalRef);
          }

          // to deal with cataphoric annotation
          while (dst.get(0) > src.get(0) || (dst.get(0) == src.get(0) && dst.get(1) > src.get(1))) {
            Mention dstMention = goldOrderedMentionsBySentence.get(dst.get(0)).get(dst.get(1));
            m.originalRef = dstMention.originalRef;
            dstMention.originalRef = id;

            if (m.originalRef < 0) break;
            dst = positions.get(m.originalRef);
          }
          if (m.originalRef < 0) continue;

          // A B C: if A<-B, A<-C => make a link B<-C
          for (int k = dst.get(0); k <= src.get(0); k++) {
            for (int l = 0; l < goldOrderedMentionsBySentence.get(k).size(); l++) {
              if (k == dst.get(0) && l < dst.get(1)) continue;
              if (k == src.get(0) && l > src.get(1)) break;
              IntTuple missed = new IntTuple(2);
              missed.set(0, k);
              missed.set(1, l);
              if (links.contains(new Pair<IntTuple, IntTuple>(missed, dst))) {
                antecedents.get(id).add(missed);
                links.add(new Pair<IntTuple, IntTuple>(src, missed));
              }
            }
          }

          links.add(new Pair<IntTuple, IntTuple>(src, dst));

          assert (antecedents.get(id) != null);
          antecedents.get(id).add(dst);

          List<IntTuple> ants = antecedents.get(m.originalRef);
          assert (ants != null);
          for (IntTuple ant : ants) {
            antecedents.get(id).add(ant);
            links.add(new Pair<IntTuple, IntTuple>(src, ant));
          }
        }
      }
    }
    goldLinks = links;
  }
  public void annotate(Annotation annotation) {
    try {
      List<Tree> trees = new ArrayList<Tree>();
      List<List<CoreLabel>> sentences = new ArrayList<List<CoreLabel>>();

      // extract trees and sentence words
      // we are only supporting the new annotation standard for this Annotator!
      if (annotation.containsKey(CoreAnnotations.SentencesAnnotation.class)) {
        int sentNum = 0;
        for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
          List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
          sentences.add(tokens);
          Tree tree = sentence.get(TreeAnnotation.class);
          trees.add(tree);

          MentionExtractor.mergeLabels(tree, tokens);
          MentionExtractor.initializeUtterance(tokens);
        }
      } else {
        System.err.println(
            "ERROR: this coreference resolution system requires SentencesAnnotation!");
        return;
      }

      // extract all possible mentions
      RuleBasedCorefMentionFinder finder = new RuleBasedCorefMentionFinder();
      List<List<Mention>> allUnprocessedMentions =
          finder.extractPredictedMentions(annotation, 0, corefSystem.dictionaries());

      // add the relevant info to mentions and order them for coref
      Document document =
          mentionExtractor.arrange(annotation, sentences, trees, allUnprocessedMentions);
      List<List<Mention>> orderedMentions = document.getOrderedMentions();
      if (VERBOSE) {
        for (int i = 0; i < orderedMentions.size(); i++) {
          System.err.printf("Mentions in sentence #%d:\n", i);
          for (int j = 0; j < orderedMentions.get(i).size(); j++) {
            System.err.println(
                "\tMention #" + j + ": " + orderedMentions.get(i).get(j).spanToString());
          }
        }
      }

      Map<Integer, CorefChain> result = corefSystem.coref(document);
      annotation.set(CorefChainAnnotation.class, result);

      // for backward compatibility
      if (OLD_FORMAT) {
        List<Pair<IntTuple, IntTuple>> links = SieveCoreferenceSystem.getLinks(result);

        if (VERBOSE) {
          System.err.printf("Found %d coreference links:\n", links.size());
          for (Pair<IntTuple, IntTuple> link : links) {
            System.err.printf(
                "LINK (%d, %d) -> (%d, %d)\n",
                link.first.get(0), link.first.get(1), link.second.get(0), link.second.get(1));
          }
        }

        //
        // save the coref output as CorefGraphAnnotation
        //
        List<List<CoreLabel>> sents = new ArrayList<List<CoreLabel>>();
        for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
          List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
          sents.add(tokens);
        }
        // this graph is stored in CorefGraphAnnotation -- the raw links found by the coref system
        List<Pair<IntTuple, IntTuple>> graph = new ArrayList<Pair<IntTuple, IntTuple>>();

        for (Pair<IntTuple, IntTuple> link : links) {
          //
          // Note: all offsets in the graph start at 1 (not at 0!)
          //       we do this for consistency reasons, as indices for syntactic dependencies start
          // at 1
          //
          int srcSent = link.first.get(0);
          int srcTok = orderedMentions.get(srcSent - 1).get(link.first.get(1) - 1).headIndex + 1;
          int dstSent = link.second.get(0);
          int dstTok = orderedMentions.get(dstSent - 1).get(link.second.get(1) - 1).headIndex + 1;
          IntTuple dst = new IntTuple(2);
          dst.set(0, dstSent);
          dst.set(1, dstTok);
          IntTuple src = new IntTuple(2);
          src.set(0, srcSent);
          src.set(1, srcTok);
          graph.add(new Pair<IntTuple, IntTuple>(src, dst));
        }
        annotation.set(CorefGraphAnnotation.class, graph);

        for (CorefChain corefChain : result.values()) {
          if (corefChain.getCorefMentions().size() < 2) continue;
          Set<CoreLabel> coreferentTokens = new HashSet<CoreLabel>();
          Set<CoreLabel> cyclicCoreferentTokens = new HashSet<CoreLabel>();
          for (CorefMention mention : corefChain.getCorefMentions()) {
            CoreMap sentence = annotation.get(SentencesAnnotation.class).get(mention.sentNum - 1);
            CoreLabel token = sentence.get(TokensAnnotation.class).get(mention.headIndex - 1);

            // this stuff is so things will mostly work without us replacing all
            // tokens with CyclicCoreLabels whenever coref is run (which maybe
            // wouldn't be a terrible thing?)
            coreferentTokens.add(token);
            cyclicCoreferentTokens.add(new CyclicCoreLabel(token));
          }
          for (CoreLabel token : coreferentTokens) {
            token.set(CorefClusterAnnotation.class, cyclicCoreferentTokens);
          }
          for (CoreLabel token : cyclicCoreferentTokens) {
            token.set(CorefClusterAnnotation.class, cyclicCoreferentTokens);
          }
        }
      }
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }