/** Mark twin mentions: All mention boundaries should be matched */ private void findTwinMentionsStrict() { for (int sentNum = 0; sentNum < goldOrderedMentionsBySentence.size(); sentNum++) { List<Mention> golds = goldOrderedMentionsBySentence.get(sentNum); List<Mention> predicts = predictedOrderedMentionsBySentence.get(sentNum); // For CoNLL training there are some documents with gold mentions with the same position // offsets // See // /scr/nlp/data/conll-2011/v2/data/train/data/english/annotations/nw/wsj/09/wsj_0990.v2_auto_conll // (Packwood - Roth) CollectionValuedMap<IntPair, Mention> goldMentionPositions = new CollectionValuedMap<IntPair, Mention>(); for (Mention g : golds) { IntPair ip = new IntPair(g.startIndex, g.endIndex); if (goldMentionPositions.containsKey(ip)) { StringBuilder existingMentions = new StringBuilder(); for (Mention eg : goldMentionPositions.get(ip)) { if (existingMentions.length() > 0) { existingMentions.append(","); } existingMentions.append(eg.mentionID); } SieveCoreferenceSystem.logger.warning( "WARNING: gold mentions with the same offsets: " + ip + " mentions=" + g.mentionID + "," + existingMentions + ", " + g.spanToString()); } // assert(!goldMentionPositions.containsKey(ip)); goldMentionPositions.add(new IntPair(g.startIndex, g.endIndex), g); } for (Mention p : predicts) { IntPair pos = new IntPair(p.startIndex, p.endIndex); if (goldMentionPositions.containsKey(pos)) { Collection<Mention> cm = goldMentionPositions.get(pos); Mention g = cm.iterator().next(); cm.remove(g); p.mentionID = g.mentionID; p.twinless = false; g.twinless = false; } } // temp: for making easy to recognize twinless mention for (Mention p : predicts) { if (p.twinless) p.mentionID += 10000; } } }
/**
 * Mark twin mentions: heads of the mentions are matched.
 *
 * <p>Works sentence by sentence in two passes. In the first pass a predicted mention whose
 * (startIndex, endIndex) span equals a gold mention's span takes over that gold mention's
 * {@code mentionID}, and both get {@code twinless = false}. In the second pass every predicted
 * mention left over from the first pass is paired with the first still-available gold mention
 * that has the same {@code headIndex}, if there is one.
 *
 * <p>When several gold mentions share a span, only the last one in list order is reachable by
 * exact-span lookup, because later entries overwrite earlier ones in the span map.
 */
private void findTwinMentionsRelaxed() {
  for (int sentNum = 0; sentNum < goldOrderedMentionsBySentence.size(); sentNum++) {
    List<Mention> golds = goldOrderedMentionsBySentence.get(sentNum);
    List<Mention> predicts = predictedOrderedMentionsBySentence.get(sentNum);

    // Index the gold mentions by exact span and, separately, by head position.
    Map<IntPair, Mention> goldMentionPositions = Generics.newHashMap();
    Map<Integer, LinkedList<Mention>> goldMentionHeadPositions = Generics.newHashMap();
    for (Mention g : golds) {
      goldMentionPositions.put(new IntPair(g.startIndex, g.endIndex), g);
      LinkedList<Mention> sameHead = goldMentionHeadPositions.get(g.headIndex);
      if (sameHead == null) {
        sameHead = new LinkedList<Mention>();
        goldMentionHeadPositions.put(g.headIndex, sameHead);
      }
      sameHead.add(g);
    }

    // Pass 1: exact span matches.
    List<Mention> remains = new ArrayList<Mention>();
    for (Mention p : predicts) {
      Mention g = goldMentionPositions.get(new IntPair(p.startIndex, p.endIndex));
      if (g == null) {
        remains.add(p);
        continue;
      }
      p.mentionID = g.mentionID;
      p.twinless = false;
      g.twinless = false;

      // Take the matched gold mention out of the head index so pass 2 cannot reuse it.
      // The bucket can already be gone: the span map still holds g after its first match, so
      // a second predicted mention with the same span reaches this point after the bucket was
      // emptied and removed. Without the null check that case throws NullPointerException.
      LinkedList<Mention> sameHead = goldMentionHeadPositions.get(g.headIndex);
      if (sameHead != null) {
        sameHead.remove(g);
        if (sameHead.isEmpty()) {
          goldMentionHeadPositions.remove(g.headIndex);
        }
      }
    }

    // Pass 2: head matches for the predicted mentions that had no exact span match.
    for (Mention r : remains) {
      LinkedList<Mention> sameHead = goldMentionHeadPositions.get(r.headIndex);
      if (sameHead == null) {
        continue;
      }
      // Buckets are dropped as soon as they become empty, so a present bucket has an element.
      Mention g = sameHead.poll();
      r.mentionID = g.mentionID;
      r.twinless = false;
      g.twinless = false;
      if (sameHead.isEmpty()) {
        goldMentionHeadPositions.remove(g.headIndex);
      }
    }
  }
}