public List<List<Mention>> extractGoldMentions(CoNLL2011DocumentReader.Document conllDoc) { List<CoreMap> sentences = conllDoc.getAnnotation().get(CoreAnnotations.SentencesAnnotation.class); List<List<Mention>> allGoldMentions = new ArrayList<List<Mention>>(); CollectionValuedMap<String, CoreMap> corefChainMap = conllDoc.getCorefChainMap(); for (int i = 0; i < sentences.size(); i++) { allGoldMentions.add(new ArrayList<Mention>()); } int maxCorefClusterId = -1; for (String corefIdStr : corefChainMap.keySet()) { int id = Integer.parseInt(corefIdStr); if (id > maxCorefClusterId) { maxCorefClusterId = id; } } int newMentionID = maxCorefClusterId + 1; for (String corefIdStr : corefChainMap.keySet()) { int id = Integer.parseInt(corefIdStr); int clusterMentionCnt = 0; for (CoreMap m : corefChainMap.get(corefIdStr)) { clusterMentionCnt++; Mention mention = new Mention(); mention.goldCorefClusterID = id; if (clusterMentionCnt == 1) { // First mention in cluster mention.mentionID = id; mention.originalRef = -1; } else { mention.mentionID = newMentionID; mention.originalRef = id; newMentionID++; } if (maxID < mention.mentionID) maxID = mention.mentionID; int sentIndex = m.get(CoreAnnotations.SentenceIndexAnnotation.class); CoreMap sent = sentences.get(sentIndex); mention.startIndex = m.get(CoreAnnotations.TokenBeginAnnotation.class) - sent.get(CoreAnnotations.TokenBeginAnnotation.class); mention.endIndex = m.get(CoreAnnotations.TokenEndAnnotation.class) - sent.get(CoreAnnotations.TokenBeginAnnotation.class); // will be set by arrange mention.originalSpan = m.get(CoreAnnotations.TokensAnnotation.class); // Mention dependency is collapsed dependency for sentence mention.dependency = sentences .get(sentIndex) .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class); allGoldMentions.get(sentIndex).add(mention); } } return allGoldMentions; }
/** Mark twin mentions: All mention boundaries should be matched */ private void findTwinMentionsStrict() { for (int sentNum = 0; sentNum < goldOrderedMentionsBySentence.size(); sentNum++) { List<Mention> golds = goldOrderedMentionsBySentence.get(sentNum); List<Mention> predicts = predictedOrderedMentionsBySentence.get(sentNum); // For CoNLL training there are some documents with gold mentions with the same position // offsets // See // /scr/nlp/data/conll-2011/v2/data/train/data/english/annotations/nw/wsj/09/wsj_0990.v2_auto_conll // (Packwood - Roth) CollectionValuedMap<IntPair, Mention> goldMentionPositions = new CollectionValuedMap<IntPair, Mention>(); for (Mention g : golds) { IntPair ip = new IntPair(g.startIndex, g.endIndex); if (goldMentionPositions.containsKey(ip)) { StringBuilder existingMentions = new StringBuilder(); for (Mention eg : goldMentionPositions.get(ip)) { if (existingMentions.length() > 0) { existingMentions.append(","); } existingMentions.append(eg.mentionID); } SieveCoreferenceSystem.logger.warning( "WARNING: gold mentions with the same offsets: " + ip + " mentions=" + g.mentionID + "," + existingMentions + ", " + g.spanToString()); } // assert(!goldMentionPositions.containsKey(ip)); goldMentionPositions.add(new IntPair(g.startIndex, g.endIndex), g); } for (Mention p : predicts) { IntPair pos = new IntPair(p.startIndex, p.endIndex); if (goldMentionPositions.containsKey(pos)) { Collection<Mention> cm = goldMentionPositions.get(pos); Mention g = cm.iterator().next(); cm.remove(g); p.mentionID = g.mentionID; p.twinless = false; g.twinless = false; } } // temp: for making easy to recognize twinless mention for (Mention p : predicts) { if (p.twinless) p.mentionID += 10000; } } }
/** Mark twin mentions: heads of the mentions are matched */
private void findTwinMentionsRelaxed() {
  for (int sentNum = 0; sentNum < goldOrderedMentionsBySentence.size(); sentNum++) {
    List<Mention> golds = goldOrderedMentionsBySentence.get(sentNum);
    List<Mention> predicts = predictedOrderedMentionsBySentence.get(sentNum);

    // Index golds by exact (start, end) span and, separately, by head token index.
    Map<IntPair, Mention> goldBySpan = Generics.newHashMap();
    Map<Integer, LinkedList<Mention>> goldsByHead = Generics.newHashMap();
    for (Mention g : golds) {
      goldBySpan.put(new IntPair(g.startIndex, g.endIndex), g);
      LinkedList<Mention> headBucket = goldsByHead.get(g.headIndex);
      if (headBucket == null) {
        headBucket = new LinkedList<Mention>();
        goldsByHead.put(g.headIndex, headBucket);
      }
      headBucket.add(g);
    }

    // Pass 1: pair predictions with golds on exact span; unmatched predictions
    // are retried in the second pass.
    List<Mention> unmatched = new ArrayList<Mention>();
    for (Mention p : predicts) {
      Mention g = goldBySpan.get(new IntPair(p.startIndex, p.endIndex));
      if (g == null) {
        unmatched.add(p);
        continue;
      }
      p.mentionID = g.mentionID;
      p.twinless = false;
      g.twinless = false;
      // A span-matched gold may no longer be claimed by a head-only match.
      goldsByHead.get(g.headIndex).remove(g);
      if (goldsByHead.get(g.headIndex).isEmpty()) {
        goldsByHead.remove(g.headIndex);
      }
    }

    // Pass 2: fall back to matching on the head token index alone.
    for (Mention p : unmatched) {
      LinkedList<Mention> headBucket = goldsByHead.get(p.headIndex);
      if (headBucket == null) {
        continue;
      }
      Mention g = headBucket.poll();
      p.mentionID = g.mentionID;
      p.twinless = false;
      g.twinless = false;
      if (goldsByHead.get(g.headIndex).isEmpty()) {
        goldsByHead.remove(g.headIndex);
      }
    }
  }
}
/**
 * Makes a per-sentence copy of the given mentions, duplicating only the fields
 * needed downstream. The copy is shallow: {@code originalSpan} and
 * {@code dependency} still reference the same objects as the source mentions.
 */
public static List<List<Mention>> makeCopy(List<List<Mention>> mentions) {
  List<List<Mention>> copy = new ArrayList<List<Mention>>(mentions.size());
  for (List<Mention> sentence : mentions) {
    List<Mention> sentenceCopy = new ArrayList<Mention>(sentence.size());
    for (Mention original : sentence) {
      Mention duplicate = new Mention();
      duplicate.goldCorefClusterID = original.goldCorefClusterID;
      duplicate.mentionID = original.mentionID;
      duplicate.startIndex = original.startIndex;
      duplicate.endIndex = original.endIndex;
      duplicate.originalSpan = original.originalSpan;
      duplicate.dependency = original.dependency;
      sentenceCopy.add(duplicate);
    }
    copy.add(sentenceCopy);
  }
  return copy;
}
/**
 * When there is no mentionID information (without gold annotation), assign mention IDs.
 *
 * <p>If any mention in any sentence still has the sentinel id {@code -1}, ALL mentions are
 * renumbered sequentially (0, 1, 2, ...) in sentence order; otherwise the existing ids are
 * kept untouched.
 */
protected void assignOriginalID() {
  List<List<Mention>> orderedMentionsBySentence = this.getOrderedMentions();
  boolean hasOriginalID = true;
  // One unset id is enough to trigger renumbering, so stop scanning at the first -1
  // (the original code kept iterating over every remaining mention).
  search:
  for (List<Mention> l : orderedMentionsBySentence) {
    for (Mention m : l) {
      if (m.mentionID == -1) {
        hasOriginalID = false;
        break search;
      }
    }
  }
  if (!hasOriginalID) {
    int id = 0;
    for (List<Mention> l : orderedMentionsBySentence) {
      for (Mention m : l) {
        m.mentionID = id++;
      }
    }
  }
}
/**
 * Reads the next MUC document from {@code fileContents} (starting at {@code currentOffset}),
 * tokenizes its sentences, collects gold coref mentions from inline {@code <COREF>} SGML tags,
 * resolves gold cluster ids, runs the Stanford pipeline over the raw text, and hands everything
 * to {@code arrange} for ordering.
 *
 * @return the next {@code Document}, or {@code null} when no further {@code <DOC>} block exists
 * @throws Exception propagated from tokenization, annotation, or arranging
 */
@Override
public Document nextDoc() throws Exception {
  List<List<CoreLabel>> allWords = new ArrayList<List<CoreLabel>>();
  List<Tree> allTrees = new ArrayList<Tree>();
  List<List<Mention>> allGoldMentions = new ArrayList<List<Mention>>();
  List<List<Mention>> allPredictedMentions;
  List<CoreMap> allSentences = new ArrayList<CoreMap>();
  Annotation docAnno = new Annotation("");

  // NOTE(review): "+" works here only because the flag constants are distinct bit masks;
  // "|" would be the conventional way to combine Pattern flags.
  Pattern docPattern =
      Pattern.compile("<DOC>(.*?)</DOC>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
  Pattern sentencePattern =
      Pattern.compile(
          "(<s>|<hl>|<dd>|<DATELINE>)(.*?)(</s>|</hl>|</dd>|</DATELINE>)",
          Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
  Matcher docMatcher = docPattern.matcher(fileContents);
  // End of input: no further <DOC> block after the current offset.
  if (!docMatcher.find(currentOffset)) return null;
  currentOffset = docMatcher.end();
  String doc = docMatcher.group(1);
  Matcher sentenceMatcher = sentencePattern.matcher(doc);
  // NE label currently open (set/cleared by the SGML NE tags below); null = outside any NE.
  String ner = null;

  // Maintain current document ID.
  Pattern docIDPattern =
      Pattern.compile("<DOCNO>(.*?)</DOCNO>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
  Matcher docIDMatcher = docIDPattern.matcher(doc);
  if (docIDMatcher.find()) currentDocumentID = docIDMatcher.group(1);
  else currentDocumentID = "documentAfter " + currentDocumentID;

  while (sentenceMatcher.find()) {
    String sentenceString = sentenceMatcher.group(2);
    List<CoreLabel> words =
        tokenizerFactory.getTokenizer(new StringReader(sentenceString)).tokenize();

    // FIXING TOKENIZATION PROBLEMS
    // This loop mutates `words` while indexing it, adjusting `i` after removals;
    // the two fixes below re-merge tokens the tokenizer split apart.
    for (int i = 0; i < words.size(); i++) {
      CoreLabel w = words.get(i);
      // Case 1: a "$" split off a preceding token ending in PRP or WP
      // (tokens here are still in WORD/POS form, e.g. "he/PRP").
      if (i > 0 && w.word().equals("$")) {
        if (!words.get(i - 1).word().endsWith("PRP") && !words.get(i - 1).word().endsWith("WP"))
          continue;
        words.get(i - 1).set(CoreAnnotations.TextAnnotation.class, words.get(i - 1).word() + "$");
        words.remove(i);
        i--;
      } else if (w.word().equals("\\/")) {
        // Case 2: an escaped slash split into its own token; glue neighbor tokens back
        // around it (skip if the left neighbor is a closing COREF tag).
        if (words.get(i - 1).word().equals("</COREF>")) continue;
        w.set(
            CoreAnnotations.TextAnnotation.class,
            words.get(i - 1).word() + "\\/" + words.get(i + 1).word());
        words.remove(i + 1);
        words.remove(i - 1);
        // NOTE(review): `i` is not decremented after removing index i-1, so the token that
        // shifted into position i is skipped on the next iteration — presumably benign for
        // this data, but verify if consecutive "\\/" tokens can occur.
      }
    }
    // END FIXING TOKENIZATION PROBLEMS

    List<CoreLabel> sentence = new ArrayList<CoreLabel>();
    // MUC accepts embedded coref mentions, so we need to keep a stack for the mentions currently
    // open
    Stack<Mention> stack = new Stack<Mention>();
    List<Mention> mentions = new ArrayList<Mention>();
    allWords.add(sentence);
    allGoldMentions.add(mentions);

    for (CoreLabel word : words) {
      String w = word.get(CoreAnnotations.TextAnnotation.class);
      // found regular token: WORD/POS
      if (!w.startsWith("<") && w.contains("\\/") && w.lastIndexOf("\\/") != w.length() - 2) {
        int i = w.lastIndexOf("\\/");
        String w1 = w.substring(0, i);
        // we do NOT set POS info here. We take the POS tags from the parser!
        word.set(CoreAnnotations.TextAnnotation.class, w1);
        word.remove(CoreAnnotations.OriginalTextAnnotation.class);
        if (Constants.USE_GOLD_NE) {
          // Tag the token with the currently open gold NE label, or "O" outside any NE.
          if (ner != null) {
            word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
          } else {
            word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
          }
        }
        sentence.add(word);
      }
      // found the start SGML tag for a NE, e.g., "<ORGANIZATION>"
      else if (w.startsWith("<") && !w.startsWith("<COREF") && !w.startsWith("</")) {
        Pattern nerPattern = Pattern.compile("<(.*?)>");
        Matcher m = nerPattern.matcher(w);
        m.find();
        ner = m.group(1);
      }
      // found the end SGML tag for a NE, e.g., "</ORGANIZATION>"
      else if (w.startsWith("</") && !w.startsWith("</COREF")) {
        Pattern nerPattern = Pattern.compile("</(.*?)>");
        Matcher m = nerPattern.matcher(w);
        m.find();
        String ner1 = m.group(1);
        // Closing tag must match the NE currently open.
        if (ner != null && !ner.equals(ner1))
          throw new RuntimeException("Unmatched NE labels in MUC file: " + ner + " v. " + ner1);
        ner = null;
      }
      // found the start SGML tag for a coref mention
      else if (w.startsWith("<COREF")) {
        Mention mention = new Mention();
        // position of this mention in the sentence
        mention.startIndex = sentence.size();
        // extract GOLD info about this coref chain. needed for eval
        Pattern idPattern = Pattern.compile("ID=\"(.*?)\"");
        Pattern refPattern = Pattern.compile("REF=\"(.*?)\"");
        Matcher m = idPattern.matcher(w);
        m.find();
        mention.mentionID = Integer.parseInt(m.group(1));
        m = refPattern.matcher(w);
        if (m.find()) {
          mention.originalRef = Integer.parseInt(m.group(1));
        }
        // open mention. keep track of all open mentions using the stack
        stack.push(mention);
      }
      // found the end SGML tag for a coref mention
      else if (w.equals("</COREF>")) {
        Mention mention = stack.pop();
        mention.endIndex = sentence.size();
        // this is a closed mention. add it to the final list of mentions
        // System.err.printf("Found MENTION: ID=%d, REF=%d\n", mention.mentionID,
        // mention.originalRef);
        mentions.add(mention);
      } else {
        // Any other token (no POS suffix): keep it, optionally tagging gold NE as above.
        word.remove(CoreAnnotations.OriginalTextAnnotation.class);
        if (Constants.USE_GOLD_NE) {
          if (ner != null) {
            word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
          } else {
            word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
          }
        }
        sentence.add(word);
      }
    }

    // Rebuild the sentence text (space-joined tokens) and set 1-based token indices.
    StringBuilder textContent = new StringBuilder();
    for (int i = 0; i < sentence.size(); i++) {
      CoreLabel w = sentence.get(i);
      w.set(CoreAnnotations.IndexAnnotation.class, i + 1);
      w.set(CoreAnnotations.UtteranceAnnotation.class, 0);
      if (i > 0) textContent.append(" ");
      textContent.append(w.getString(CoreAnnotations.TextAnnotation.class));
    }
    CoreMap sentCoreMap = new Annotation(textContent.toString());
    allSentences.add(sentCoreMap);
    sentCoreMap.set(CoreAnnotations.TokensAnnotation.class, sentence);
  }

  // assign goldCorefClusterID
  Map<Integer, Mention> idMention = Generics.newHashMap(); // temporary use
  for (List<Mention> goldMentions : allGoldMentions) {
    for (Mention m : goldMentions) {
      idMention.put(m.mentionID, m);
    }
  }
  for (List<Mention> goldMentions : allGoldMentions) {
    for (Mention m : goldMentions) {
      if (m.goldCorefClusterID == -1) {
        if (m.originalRef == -1) m.goldCorefClusterID = m.mentionID;
        else {
          // Follow the REF chain until reaching a mention with a resolved cluster id
          // or a chain head (originalRef == -1), then adopt that cluster id.
          int ref = m.originalRef;
          while (true) {
            Mention m2 = idMention.get(ref);
            if (m2.goldCorefClusterID != -1) {
              m.goldCorefClusterID = m2.goldCorefClusterID;
              break;
            } else if (m2.originalRef == -1) {
              m2.goldCorefClusterID = m2.mentionID;
              m.goldCorefClusterID = m2.goldCorefClusterID;
              break;
            } else {
              ref = m2.originalRef;
            }
          }
        }
      }
    }
  }

  docAnno.set(CoreAnnotations.SentencesAnnotation.class, allSentences);
  // Run the full Stanford pipeline (POS, parse, dependencies, ...) over the document.
  stanfordProcessor.annotate(docAnno);

  // Sanity checks: the annotated sentences/tokens must align 1:1 with our own tokenization.
  if (allSentences.size() != allWords.size())
    throw new IllegalStateException("allSentences != allWords");
  for (int i = 0; i < allSentences.size(); i++) {
    List<CoreLabel> annotatedSent =
        allSentences.get(i).get(CoreAnnotations.TokensAnnotation.class);
    List<CoreLabel> unannotatedSent = allWords.get(i);
    List<Mention> mentionInSent = allGoldMentions.get(i);
    for (Mention m : mentionInSent) {
      m.dependency =
          allSentences
              .get(i)
              .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
    }
    if (annotatedSent.size() != unannotatedSent.size()) {
      throw new IllegalStateException("annotatedSent != unannotatedSent");
    }
    for (int j = 0, sz = annotatedSent.size(); j < sz; j++) {
      CoreLabel annotatedWord = annotatedSent.get(j);
      CoreLabel unannotatedWord = unannotatedSent.get(j);
      if (!annotatedWord
          .get(CoreAnnotations.TextAnnotation.class)
          .equals(unannotatedWord.get(CoreAnnotations.TextAnnotation.class))) {
        throw new IllegalStateException("annotatedWord != unannotatedWord");
      }
    }
    // Replace our raw tokens with the pipeline-annotated ones and collect the parse tree.
    allWords.set(i, annotatedSent);
    allTrees.add(allSentences.get(i).get(TreeCoreAnnotations.TreeAnnotation.class));
  }

  // extract predicted mentions
  if (Constants.USE_GOLD_MENTIONS) allPredictedMentions = allGoldMentions;
  else
    allPredictedMentions = mentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries);

  // add the relevant fields to mentions and order them for coref
  return arrange(docAnno, allWords, allTrees, allPredictedMentions, allGoldMentions, true);
}