private String findNextParagraphSpeaker( List<CoreMap> paragraph, int paragraphOffset, Dictionaries dict) { CoreMap lastSent = paragraph.get(paragraph.size() - 1); String speaker = ""; for (CoreLabel w : lastSent.get(CoreAnnotations.TokensAnnotation.class)) { if (w.get(CoreAnnotations.LemmaAnnotation.class).equals("report") || w.get(CoreAnnotations.LemmaAnnotation.class).equals("say")) { String word = w.get(CoreAnnotations.TextAnnotation.class); SemanticGraph dependency = lastSent.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class); IndexedWord t = dependency.getNodeByWordPattern(word); for (Pair<GrammaticalRelation, IndexedWord> child : dependency.childPairs(t)) { if (child.first().getShortName().equals("nsubj")) { int subjectIndex = child.second().index(); // start from 1 IntTuple headPosition = new IntTuple(2); headPosition.set(0, paragraph.size() - 1 + paragraphOffset); headPosition.set(1, subjectIndex - 1); if (mentionheadPositions.containsKey(headPosition) && mentionheadPositions.get(headPosition).nerString.startsWith("PER")) { speaker = Integer.toString(mentionheadPositions.get(headPosition).mentionID); } } } } } return speaker; }
private boolean findSpeaker( int utterNum, int sentNum, List<CoreMap> sentences, int startIndex, int endIndex, Dictionaries dict) { List<CoreLabel> sent = sentences.get(sentNum).get(CoreAnnotations.TokensAnnotation.class); for (int i = startIndex; i < endIndex; i++) { if (sent.get(i).get(CoreAnnotations.UtteranceAnnotation.class) != 0) continue; String lemma = sent.get(i).get(CoreAnnotations.LemmaAnnotation.class); String word = sent.get(i).get(CoreAnnotations.TextAnnotation.class); if (dict.reportVerb.contains(lemma)) { // find subject SemanticGraph dependency = sentences .get(sentNum) .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class); IndexedWord w = dependency.getNodeByWordPattern(word); if (w != null) { for (Pair<GrammaticalRelation, IndexedWord> child : dependency.childPairs(w)) { if (child.first().getShortName().equals("nsubj")) { String subjectString = child.second().word(); int subjectIndex = child.second().index(); // start from 1 IntTuple headPosition = new IntTuple(2); headPosition.set(0, sentNum); headPosition.set(1, subjectIndex - 1); String speaker; if (mentionheadPositions.containsKey(headPosition)) { speaker = Integer.toString(mentionheadPositions.get(headPosition).mentionID); } else { speaker = subjectString; } speakers.put(utterNum, speaker); return true; } } } else { SieveCoreferenceSystem.logger.warning("Cannot find node in dependency for word " + word); } } } return false; }
public CorefMention(Mention m, IntTuple pos) { mentionType = m.mentionType; number = m.number; gender = m.gender; animacy = m.animacy; startIndex = m.startIndex + 1; endIndex = m.endIndex + 1; headIndex = m.headIndex + 1; corefClusterID = m.corefClusterID; sentNum = m.sentNum + 1; mentionID = m.mentionID; mentionSpan = m.spanToString(); // index starts from 1 position = new IntTuple(2); position.set(0, pos.get(0) + 1); position.set(1, pos.get(1) + 1); m.headWord.set(CorefCoreAnnotations.CorefClusterIdAnnotation.class, corefClusterID); }
private String findParagraphSpeaker( List<CoreMap> paragraph, int paragraphUtterIndex, String nextParagraphSpeaker, int paragraphOffset, Dictionaries dict) { if (!speakers.containsKey(paragraphUtterIndex)) { if (!nextParagraphSpeaker.equals("")) { speakers.put(paragraphUtterIndex, nextParagraphSpeaker); } else { // find the speaker of this paragraph (John, nbc news) CoreMap lastSent = paragraph.get(paragraph.size() - 1); String speaker = ""; boolean hasVerb = false; for (int i = 0; i < lastSent.get(CoreAnnotations.TokensAnnotation.class).size(); i++) { CoreLabel w = lastSent.get(CoreAnnotations.TokensAnnotation.class).get(i); String pos = w.get(CoreAnnotations.PartOfSpeechAnnotation.class); String ner = w.get(CoreAnnotations.NamedEntityTagAnnotation.class); if (pos.startsWith("V")) { hasVerb = true; break; } if (ner.startsWith("PER")) { IntTuple headPosition = new IntTuple(2); headPosition.set(0, paragraph.size() - 1 + paragraphOffset); headPosition.set(1, i); if (mentionheadPositions.containsKey(headPosition)) { speaker = Integer.toString(mentionheadPositions.get(headPosition).mentionID); } } } if (!hasVerb && !speaker.equals("")) { speakers.put(paragraphUtterIndex, speaker); } } } return findNextParagraphSpeaker(paragraph, paragraphOffset, dict); }
/** initialize positions and corefClusters (put each mention in each CorefCluster) */ private void initializeCorefCluster() { for (int i = 0; i < predictedOrderedMentionsBySentence.size(); i++) { for (int j = 0; j < predictedOrderedMentionsBySentence.get(i).size(); j++) { Mention m = predictedOrderedMentionsBySentence.get(i).get(j); if (allPredictedMentions.containsKey(m.mentionID)) { SieveCoreferenceSystem.logger.warning("WARNING: Already contain mention " + m.mentionID); Mention m1 = allPredictedMentions.get(m.mentionID); SieveCoreferenceSystem.logger.warning( "OLD mention: " + m1.spanToString() + "[" + m1.startIndex + "," + m1.endIndex + "]"); SieveCoreferenceSystem.logger.warning( "NEW mention: " + m.spanToString() + "[" + m.startIndex + "," + m.endIndex + "]"); // SieveCoreferenceSystem.debugPrintMentions(System.err, "PREDICTED ORDERED", // predictedOrderedMentionsBySentence); // SieveCoreferenceSystem.debugPrintMentions(System.err, "GOLD ORDERED", // goldOrderedMentionsBySentence); } assert (!allPredictedMentions.containsKey(m.mentionID)); allPredictedMentions.put(m.mentionID, m); IntTuple pos = new IntTuple(2); pos.set(0, i); pos.set(1, j); positions.put(m, pos); m.sentNum = i; assert (!corefClusters.containsKey(m.mentionID)); corefClusters.put( m.mentionID, new CorefCluster(m.mentionID, Generics.newHashSet(Arrays.asList(m)))); m.corefClusterID = m.mentionID; IntTuple headPosition = new IntTuple(2); headPosition.set(0, i); headPosition.set(1, m.headIndex); mentionheadPositions.put(headPosition, m); } } }
@Override public boolean equals(Object aThat) { if (this == aThat) return true; if (!(aThat instanceof CorefMention)) return false; CorefMention that = (CorefMention) aThat; if (mentionType != that.mentionType) return false; if (number != that.number) return false; if (gender != that.gender) return false; if (animacy != that.animacy) return false; if (startIndex != that.startIndex) return false; if (endIndex != that.endIndex) return false; if (headIndex != that.headIndex) return false; if (corefClusterID != that.corefClusterID) return false; if (mentionID != that.mentionID) return false; if (sentNum != that.sentNum) return false; if (!position.equals(that.position)) return false; // we ignore MentionSpan as it is constructed from the tokens // the mention is a span of, so if we know those spans are the // same, we should be able to ignore the actual text return true; }
/**
 * Hashes on the position only. This stays consistent with {@code equals}, which requires
 * equal positions for two mentions to be equal.
 *
 * @return the hash code of {@code position}
 */
@Override
public int hashCode() {
  return position.hashCode();
}
/** Extract gold coref link information */ protected void extractGoldLinks() { // List<List<Mention>> orderedMentionsBySentence = this.getOrderedMentions(); List<Pair<IntTuple, IntTuple>> links = new ArrayList<Pair<IntTuple, IntTuple>>(); // position of each mention in the input matrix, by id Map<Integer, IntTuple> positions = Generics.newHashMap(); // positions of antecedents Map<Integer, List<IntTuple>> antecedents = Generics.newHashMap(); for (int i = 0; i < goldOrderedMentionsBySentence.size(); i++) { for (int j = 0; j < goldOrderedMentionsBySentence.get(i).size(); j++) { Mention m = goldOrderedMentionsBySentence.get(i).get(j); int id = m.mentionID; IntTuple pos = new IntTuple(2); pos.set(0, i); pos.set(1, j); positions.put(id, pos); antecedents.put(id, new ArrayList<IntTuple>()); } } // SieveCoreferenceSystem.debugPrintMentions(System.err, "", goldOrderedMentionsBySentence); for (List<Mention> mentions : goldOrderedMentionsBySentence) { for (Mention m : mentions) { int id = m.mentionID; IntTuple src = positions.get(id); assert (src != null); if (m.originalRef >= 0) { IntTuple dst = positions.get(m.originalRef); if (dst == null) { throw new RuntimeException("Cannot find gold mention with ID=" + m.originalRef); } // to deal with cataphoric annotation while (dst.get(0) > src.get(0) || (dst.get(0) == src.get(0) && dst.get(1) > src.get(1))) { Mention dstMention = goldOrderedMentionsBySentence.get(dst.get(0)).get(dst.get(1)); m.originalRef = dstMention.originalRef; dstMention.originalRef = id; if (m.originalRef < 0) break; dst = positions.get(m.originalRef); } if (m.originalRef < 0) continue; // A B C: if A<-B, A<-C => make a link B<-C for (int k = dst.get(0); k <= src.get(0); k++) { for (int l = 0; l < goldOrderedMentionsBySentence.get(k).size(); l++) { if (k == dst.get(0) && l < dst.get(1)) continue; if (k == src.get(0) && l > src.get(1)) break; IntTuple missed = new IntTuple(2); missed.set(0, k); missed.set(1, l); if (links.contains(new Pair<IntTuple, 
IntTuple>(missed, dst))) { antecedents.get(id).add(missed); links.add(new Pair<IntTuple, IntTuple>(src, missed)); } } } links.add(new Pair<IntTuple, IntTuple>(src, dst)); assert (antecedents.get(id) != null); antecedents.get(id).add(dst); List<IntTuple> ants = antecedents.get(m.originalRef); assert (ants != null); for (IntTuple ant : ants) { antecedents.get(id).add(ant); links.add(new Pair<IntTuple, IntTuple>(src, ant)); } } } } goldLinks = links; }
public void annotate(Annotation annotation) { try { List<Tree> trees = new ArrayList<Tree>(); List<List<CoreLabel>> sentences = new ArrayList<List<CoreLabel>>(); // extract trees and sentence words // we are only supporting the new annotation standard for this Annotator! if (annotation.containsKey(CoreAnnotations.SentencesAnnotation.class)) { int sentNum = 0; for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) { List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class); sentences.add(tokens); Tree tree = sentence.get(TreeAnnotation.class); trees.add(tree); MentionExtractor.mergeLabels(tree, tokens); MentionExtractor.initializeUtterance(tokens); } } else { System.err.println( "ERROR: this coreference resolution system requires SentencesAnnotation!"); return; } // extract all possible mentions RuleBasedCorefMentionFinder finder = new RuleBasedCorefMentionFinder(); List<List<Mention>> allUnprocessedMentions = finder.extractPredictedMentions(annotation, 0, corefSystem.dictionaries()); // add the relevant info to mentions and order them for coref Document document = mentionExtractor.arrange(annotation, sentences, trees, allUnprocessedMentions); List<List<Mention>> orderedMentions = document.getOrderedMentions(); if (VERBOSE) { for (int i = 0; i < orderedMentions.size(); i++) { System.err.printf("Mentions in sentence #%d:\n", i); for (int j = 0; j < orderedMentions.get(i).size(); j++) { System.err.println( "\tMention #" + j + ": " + orderedMentions.get(i).get(j).spanToString()); } } } Map<Integer, CorefChain> result = corefSystem.coref(document); annotation.set(CorefChainAnnotation.class, result); // for backward compatibility if (OLD_FORMAT) { List<Pair<IntTuple, IntTuple>> links = SieveCoreferenceSystem.getLinks(result); if (VERBOSE) { System.err.printf("Found %d coreference links:\n", links.size()); for (Pair<IntTuple, IntTuple> link : links) { System.err.printf( "LINK (%d, %d) -> (%d, %d)\n", link.first.get(0), 
link.first.get(1), link.second.get(0), link.second.get(1)); } } // // save the coref output as CorefGraphAnnotation // List<List<CoreLabel>> sents = new ArrayList<List<CoreLabel>>(); for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) { List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class); sents.add(tokens); } // this graph is stored in CorefGraphAnnotation -- the raw links found by the coref system List<Pair<IntTuple, IntTuple>> graph = new ArrayList<Pair<IntTuple, IntTuple>>(); for (Pair<IntTuple, IntTuple> link : links) { // // Note: all offsets in the graph start at 1 (not at 0!) // we do this for consistency reasons, as indices for syntactic dependencies start // at 1 // int srcSent = link.first.get(0); int srcTok = orderedMentions.get(srcSent - 1).get(link.first.get(1) - 1).headIndex + 1; int dstSent = link.second.get(0); int dstTok = orderedMentions.get(dstSent - 1).get(link.second.get(1) - 1).headIndex + 1; IntTuple dst = new IntTuple(2); dst.set(0, dstSent); dst.set(1, dstTok); IntTuple src = new IntTuple(2); src.set(0, srcSent); src.set(1, srcTok); graph.add(new Pair<IntTuple, IntTuple>(src, dst)); } annotation.set(CorefGraphAnnotation.class, graph); for (CorefChain corefChain : result.values()) { if (corefChain.getCorefMentions().size() < 2) continue; Set<CoreLabel> coreferentTokens = new HashSet<CoreLabel>(); Set<CoreLabel> cyclicCoreferentTokens = new HashSet<CoreLabel>(); for (CorefMention mention : corefChain.getCorefMentions()) { CoreMap sentence = annotation.get(SentencesAnnotation.class).get(mention.sentNum - 1); CoreLabel token = sentence.get(TokensAnnotation.class).get(mention.headIndex - 1); // this stuff is so things will mostly work without us replacing all // tokens with CyclicCoreLabels whenever coref is run (which maybe // wouldn't be a terrible thing?) 
coreferentTokens.add(token); cyclicCoreferentTokens.add(new CyclicCoreLabel(token)); } for (CoreLabel token : coreferentTokens) { token.set(CorefClusterAnnotation.class, cyclicCoreferentTokens); } for (CoreLabel token : cyclicCoreferentTokens) { token.set(CorefClusterAnnotation.class, cyclicCoreferentTokens); } } } } catch (Exception e) { throw new RuntimeException(e); } }