/**
 * Check one mention is the speaker of the other mention.
 *
 * <p>Returns {@code true} only when all of the following hold:
 * <ul>
 *   <li>{@code ant} is a non-plural mention whose lower-cased span is listed in
 *       {@code dict.firstPersonPronouns};</li>
 *   <li>{@code m} and {@code ant} are in the same sentence;</li>
 *   <li>exactly one quotation-mark token ({@code ``} or {@code ''}) lies strictly between the
 *       two head words;</li>
 *   <li>a dependency node whose word equals the head word of {@code m} is the {@code nsubj}
 *       of a parent whose lemma is listed in {@code dict.reportVerb}.</li>
 * </ul>
 *
 * @param m the candidate speaker mention
 * @param ant the candidate first-person pronoun mention
 * @param dict dictionaries supplying the first-person pronoun and reporting-verb lists
 * @return {@code true} if {@code m} is judged to be the speaker of {@code ant}
 */
public static boolean isSpeaker(Mention m, Mention ant, Dictionaries dict) {
  // Lower-case with a fixed locale: under e.g. a Turkish default locale "I" would become a
  // dotless "ı" and never match the dictionary entry.
  String antText = ant.spanToString().toLowerCase(java.util.Locale.ROOT);
  if (!dict.firstPersonPronouns.contains(antText)
      || ant.number == Number.PLURAL
      || ant.sentNum != m.sentNum) {
    return false;
  }

  // Count quotation marks strictly between the two head words.
  int countQuotationMark = 0;
  int from = Math.min(m.headIndex, ant.headIndex) + 1;
  int to = Math.max(m.headIndex, ant.headIndex);
  for (int i = from; i < to; i++) {
    String word = m.sentenceWords.get(i).get(CoreAnnotations.TextAnnotation.class);
    if (word.equals("``") || word.equals("''")) {
      countQuotationMark++;
    }
  }
  if (countQuotationMark != 1) {
    return false;
  }

  // getNodeByWordPattern interprets its argument as a regular expression, so the literal head
  // word must be quoted; otherwise tokens containing metacharacters ('.', '*', '(' ...) match
  // the wrong node or make pattern compilation fail.
  // NOTE(review): this still selects a node by word text, not by position, so a repeated word
  // may resolve to a node other than the head token - confirm whether an index lookup is wanted.
  String headText = m.sentenceWords.get(m.headIndex).get(CoreAnnotations.TextAnnotation.class);
  IndexedWord w = m.dependency.getNodeByWordPattern(java.util.regex.Pattern.quote(headText));
  if (w == null) {
    return false;
  }

  for (Pair<GrammaticalRelation, IndexedWord> parent : m.dependency.parentPairs(w)) {
    if (parent.first().getShortName().equals("nsubj")
        && dict.reportVerb.contains(
            parent.second().get(CoreAnnotations.LemmaAnnotation.class))) {
      return true;
    }
  }
  return false;
}
/**
 * Assigns speakers to the utterances of a conversation.
 *
 * <p>First, every predicted mention that has a predicate nominative whose span is "i"
 * (case-insensitively) is recorded in {@code speakers}, keyed by the utterance index of the
 * mention's head word and valued by the mention ID as a string.
 *
 * <p>Second, the sentences of the annotation are grouped into paragraphs: consecutive sentences
 * whose first token carries the same utterance index. Each completed paragraph is handed to
 * {@code findParagraphSpeaker}, and the value it returns is passed on to the call for the next
 * paragraph.
 *
 * @param dict dictionaries forwarded to {@code findParagraphSpeaker}
 */
private void findSpeakersInConversation(Dictionaries dict) {
  for (List<Mention> l : predictedOrderedMentionsBySentence) {
    for (Mention m : l) {
      if (m.predicateNominatives == null) {
        continue;
      }
      for (Mention a : m.predicateNominatives) {
        // Fixed locale: with a Turkish default locale "I" lower-cases to a dotless "ı" and
        // would never equal "i".
        if (a.spanToString().toLowerCase(java.util.Locale.ROOT).equals("i")) {
          speakers.put(
              m.headWord.get(CoreAnnotations.UtteranceAnnotation.class),
              Integer.toString(m.mentionID));
        }
      }
    }
  }

  List<CoreMap> paragraph = new ArrayList<CoreMap>();
  int paragraphUtterIndex = 0;
  String nextParagraphSpeaker = "";
  // Number of sentences contained in the paragraphs already processed.
  int paragraphOffset = 0;
  for (CoreMap sent : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
    // The utterance index of a sentence is read from its first token.
    int currentUtter =
        sent.get(CoreAnnotations.TokensAnnotation.class)
            .get(0)
            .get(CoreAnnotations.UtteranceAnnotation.class);
    if (paragraphUtterIndex != currentUtter) {
      // Utterance changed: close the current paragraph and start a new one.
      nextParagraphSpeaker =
          findParagraphSpeaker(
              paragraph, paragraphUtterIndex, nextParagraphSpeaker, paragraphOffset, dict);
      paragraphUtterIndex = currentUtter;
      paragraphOffset += paragraph.size();
      paragraph = new ArrayList<CoreMap>();
    }
    paragraph.add(sent);
  }
  // Process the trailing paragraph; its return value is not needed any more.
  findParagraphSpeaker(
      paragraph, paragraphUtterIndex, nextParagraphSpeaker, paragraphOffset, dict);
}
/** Mark twin mentions: All mention boundaries should be matched */ private void findTwinMentionsStrict() { for (int sentNum = 0; sentNum < goldOrderedMentionsBySentence.size(); sentNum++) { List<Mention> golds = goldOrderedMentionsBySentence.get(sentNum); List<Mention> predicts = predictedOrderedMentionsBySentence.get(sentNum); // For CoNLL training there are some documents with gold mentions with the same position // offsets // See // /scr/nlp/data/conll-2011/v2/data/train/data/english/annotations/nw/wsj/09/wsj_0990.v2_auto_conll // (Packwood - Roth) CollectionValuedMap<IntPair, Mention> goldMentionPositions = new CollectionValuedMap<IntPair, Mention>(); for (Mention g : golds) { IntPair ip = new IntPair(g.startIndex, g.endIndex); if (goldMentionPositions.containsKey(ip)) { StringBuilder existingMentions = new StringBuilder(); for (Mention eg : goldMentionPositions.get(ip)) { if (existingMentions.length() > 0) { existingMentions.append(","); } existingMentions.append(eg.mentionID); } SieveCoreferenceSystem.logger.warning( "WARNING: gold mentions with the same offsets: " + ip + " mentions=" + g.mentionID + "," + existingMentions + ", " + g.spanToString()); } // assert(!goldMentionPositions.containsKey(ip)); goldMentionPositions.add(new IntPair(g.startIndex, g.endIndex), g); } for (Mention p : predicts) { IntPair pos = new IntPair(p.startIndex, p.endIndex); if (goldMentionPositions.containsKey(pos)) { Collection<Mention> cm = goldMentionPositions.get(pos); Mention g = cm.iterator().next(); cm.remove(g); p.mentionID = g.mentionID; p.twinless = false; g.twinless = false; } } // temp: for making easy to recognize twinless mention for (Mention p : predicts) { if (p.twinless) p.mentionID += 10000; } } }
/** initialize positions and corefClusters (put each mention in each CorefCluster) */ private void initializeCorefCluster() { for (int i = 0; i < predictedOrderedMentionsBySentence.size(); i++) { for (int j = 0; j < predictedOrderedMentionsBySentence.get(i).size(); j++) { Mention m = predictedOrderedMentionsBySentence.get(i).get(j); if (allPredictedMentions.containsKey(m.mentionID)) { SieveCoreferenceSystem.logger.warning("WARNING: Already contain mention " + m.mentionID); Mention m1 = allPredictedMentions.get(m.mentionID); SieveCoreferenceSystem.logger.warning( "OLD mention: " + m1.spanToString() + "[" + m1.startIndex + "," + m1.endIndex + "]"); SieveCoreferenceSystem.logger.warning( "NEW mention: " + m.spanToString() + "[" + m.startIndex + "," + m.endIndex + "]"); // SieveCoreferenceSystem.debugPrintMentions(System.err, "PREDICTED ORDERED", // predictedOrderedMentionsBySentence); // SieveCoreferenceSystem.debugPrintMentions(System.err, "GOLD ORDERED", // goldOrderedMentionsBySentence); } assert (!allPredictedMentions.containsKey(m.mentionID)); allPredictedMentions.put(m.mentionID, m); IntTuple pos = new IntTuple(2); pos.set(0, i); pos.set(1, j); positions.put(m, pos); m.sentNum = i; assert (!corefClusters.containsKey(m.mentionID)); corefClusters.put( m.mentionID, new CorefCluster(m.mentionID, Generics.newHashSet(Arrays.asList(m)))); m.corefClusterID = m.mentionID; IntTuple headPosition = new IntTuple(2); headPosition.set(0, i); headPosition.set(1, m.headIndex); mentionheadPositions.put(headPosition, m); } } }
public CorefMention(Mention m, IntTuple pos) { mentionType = m.mentionType; number = m.number; gender = m.gender; animacy = m.animacy; startIndex = m.startIndex + 1; endIndex = m.endIndex + 1; headIndex = m.headIndex + 1; corefClusterID = m.corefClusterID; sentNum = m.sentNum + 1; mentionID = m.mentionID; mentionSpan = m.spanToString(); // index starts from 1 position = new IntTuple(2); position.set(0, pos.get(0) + 1); position.set(1, pos.get(1) + 1); m.headWord.set(CorefCoreAnnotations.CorefClusterIdAnnotation.class, corefClusterID); }