/**
 * Mark twin mentions under the strict criterion: a predicted mention and a gold mention are
 * twins only when they are in the same sentence and their (startIndex, endIndex) boundaries
 * match exactly. A matched prediction adopts the gold mention's mentionID and both sides get
 * twinless = false; unmatched predictions keep twinless == true.
 *
 * <p>Side effects: mutates mentionID/twinless on the Mention objects held in
 * goldOrderedMentionsBySentence and predictedOrderedMentionsBySentence. As a temporary
 * debugging aid, every still-twinless predicted mention has its mentionID shifted by +10000
 * at the end so it is easy to recognize in output.
 */
private void findTwinMentionsStrict() {
  for (int sentNum = 0; sentNum < goldOrderedMentionsBySentence.size(); sentNum++) {
    List<Mention> golds = goldOrderedMentionsBySentence.get(sentNum);
    List<Mention> predicts = predictedOrderedMentionsBySentence.get(sentNum);
    // For CoNLL training there are some documents with gold mentions with the same position
    // offsets
    // See
    // /scr/nlp/data/conll-2011/v2/data/train/data/english/annotations/nw/wsj/09/wsj_0990.v2_auto_conll
    // (Packwood - Roth)
    // Multimap from token-offset pair to the gold mentions at that position; duplicated gold
    // spans are kept (and warned about below) rather than dropped.
    CollectionValuedMap<IntPair, Mention> goldMentionPositions =
        new CollectionValuedMap<IntPair, Mention>();
    for (Mention g : golds) {
      IntPair ip = new IntPair(g.startIndex, g.endIndex);
      if (goldMentionPositions.containsKey(ip)) {
        // Duplicate gold span: build a comma-separated list of the ids already seen here.
        StringBuilder existingMentions = new StringBuilder();
        for (Mention eg : goldMentionPositions.get(ip)) {
          if (existingMentions.length() > 0) {
            existingMentions.append(",");
          }
          existingMentions.append(eg.mentionID);
        }
        SieveCoreferenceSystem.logger.warning(
            "WARNING: gold mentions with the same offsets: "
                + ip
                + " mentions="
                + g.mentionID
                + ","
                + existingMentions
                + ", "
                + g.spanToString());
      }
      // assert(!goldMentionPositions.containsKey(ip));
      goldMentionPositions.add(new IntPair(g.startIndex, g.endIndex), g);
    }
    // Pair each predicted mention with one gold mention at the same offsets; the matched gold
    // is removed from its position bucket so it cannot be matched again.
    for (Mention p : predicts) {
      IntPair pos = new IntPair(p.startIndex, p.endIndex);
      if (goldMentionPositions.containsKey(pos)) {
        Collection<Mention> cm = goldMentionPositions.get(pos);
        Mention g = cm.iterator().next();
        cm.remove(g);
        p.mentionID = g.mentionID; // prediction adopts its twin's gold id
        p.twinless = false;
        g.twinless = false;
      }
    }
    // temp: for making easy to recognize twinless mention
    for (Mention p : predicts) {
      if (p.twinless) p.mentionID += 10000;
    }
  }
}
public List<List<Mention>> extractGoldMentions(CoNLL2011DocumentReader.Document conllDoc) { List<CoreMap> sentences = conllDoc.getAnnotation().get(CoreAnnotations.SentencesAnnotation.class); List<List<Mention>> allGoldMentions = new ArrayList<List<Mention>>(); CollectionValuedMap<String, CoreMap> corefChainMap = conllDoc.getCorefChainMap(); for (int i = 0; i < sentences.size(); i++) { allGoldMentions.add(new ArrayList<Mention>()); } int maxCorefClusterId = -1; for (String corefIdStr : corefChainMap.keySet()) { int id = Integer.parseInt(corefIdStr); if (id > maxCorefClusterId) { maxCorefClusterId = id; } } int newMentionID = maxCorefClusterId + 1; for (String corefIdStr : corefChainMap.keySet()) { int id = Integer.parseInt(corefIdStr); int clusterMentionCnt = 0; for (CoreMap m : corefChainMap.get(corefIdStr)) { clusterMentionCnt++; Mention mention = new Mention(); mention.goldCorefClusterID = id; if (clusterMentionCnt == 1) { // First mention in cluster mention.mentionID = id; mention.originalRef = -1; } else { mention.mentionID = newMentionID; mention.originalRef = id; newMentionID++; } if (maxID < mention.mentionID) maxID = mention.mentionID; int sentIndex = m.get(CoreAnnotations.SentenceIndexAnnotation.class); CoreMap sent = sentences.get(sentIndex); mention.startIndex = m.get(CoreAnnotations.TokenBeginAnnotation.class) - sent.get(CoreAnnotations.TokenBeginAnnotation.class); mention.endIndex = m.get(CoreAnnotations.TokenEndAnnotation.class) - sent.get(CoreAnnotations.TokenBeginAnnotation.class); // will be set by arrange mention.originalSpan = m.get(CoreAnnotations.TokensAnnotation.class); // Mention dependency is collapsed dependency for sentence mention.dependency = sentences .get(sentIndex) .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class); allGoldMentions.get(sentIndex).add(mention); } } return allGoldMentions; }
/** * Load a collection of parse trees from the file of given name. Each tree may optionally be * encased in parens to allow for Penn Treebank style trees. This methods implements the <code> * FileProcessor</code> interface. * * @param file file to load a tree from */ public void processFile(File file) { TreeReader tr = null; // SRL stuff CollectionValuedMap<Integer, String> srlMap = null; if (this.srlMap != null) { // there must be a better way ... String filename = file.getAbsolutePath(); for (String suffix : this.srlMap.keySet()) { if (filename.endsWith(suffix)) { srlMap = this.srlMap.get(suffix); break; } } if (srlMap == null) { System.err.println("could not find SRL entries for file: " + file); } } try { // maybe print file name to stdout to get some feedback if (PRINT_FILENAMES) { System.err.println(file); } // could throw an IO exception if can't open for reading tr = treeReaderFactory() .newTreeReader( new BufferedReader(new InputStreamReader(new FileInputStream(file), encoding()))); int sentIndex = 0; Tree pt; while ((pt = tr.readTree()) != null) { if (pt.label() instanceof HasIndex) { // so we can trace where this tree came from HasIndex hi = (HasIndex) pt.label(); hi.setDocID(file.getName()); hi.setSentIndex(sentIndex); } if (srlMap == null) { parseTrees.add(pt); } else { Collection<String> srls = srlMap.get(sentIndex); // pt.pennPrint(); // System.err.println(srls); parseTrees.add(pt); if (srls.isEmpty()) { // parseTrees.add(pt); } else { for (String srl : srls) { // Tree t = pt.deepCopy(); String[] bits = srl.split("\\s+"); int verbIndex = Integer.parseInt(bits[0]); String lemma = bits[2].split("\\.")[0]; // Tree verb = Trees.getTerminal(t, verbIndex); Tree verb = Trees.getTerminal(pt, verbIndex); // ((CoreLabel)verb.label()).set(SRLIDAnnotation.class, SRL_ID.REL); ((CoreLabel) verb.label()).set(CoreAnnotations.CoNLLPredicateAnnotation.class, true); for (int i = 4; i < bits.length; i++) { String arg = bits[i]; String[] bits1; if (arg.indexOf("ARGM") >= 
0) { bits1 = arg.split("-"); } else { bits1 = arg.split("-"); } String locs = bits1[0]; String argType = bits1[1]; if (argType.equals("rel")) { continue; } for (String loc : locs.split("[*,]")) { bits1 = loc.split(":"); int term = Integer.parseInt(bits1[0]); int height = Integer.parseInt(bits1[1]); // Tree t1 = Trees.getPreTerminal(t, term); Tree t1 = Trees.getPreTerminal(pt, term); for (int j = 0; j < height; j++) { // t1 = t1.parent(t); t1 = t1.parent(pt); } Map<Integer, String> roleMap = ((CoreLabel) t1.label()).get(CoreAnnotations.CoNLLSRLAnnotation.class); if (roleMap == null) { roleMap = new HashMap<Integer, String>(); ((CoreLabel) t1.label()).set(CoreAnnotations.CoNLLSRLAnnotation.class, roleMap); } roleMap.put(verbIndex, argType); // ((CoreLabel)t1.label()).set(SRLIDAnnotation.class, // SRL_ID.ARG); } } // for (Tree t1 : t) { // if (t1.isLeaf()) { continue; } // CoreLabel fl = (CoreLabel)t1.label(); // if (fl.value() == null) { continue; } // if (!fl.has(SRLIDAnnotation.class)) { // boolean allNone = true; // for (Tree t2 : t1) { // SRL_ID s = ((CoreLabel)t2.label()).get(SRLIDAnnotation.class); // if (s == SRL_ID.ARG || s == SRL_ID.REL) { // allNone = false; // break; // } // } // if (allNone) { // fl.set(SRLIDAnnotation.class, SRL_ID.ALL_NO); // } else { // fl.set(SRLIDAnnotation.class, SRL_ID.NO); // } // } // } // parseTrees.add(t); } } } sentIndex++; } } catch (IOException e) { System.err.println("loadTree IO Exception: " + e + " in file " + file); } finally { try { if (tr != null) { tr.close(); // important: closes file even if error! } } catch (IOException e) { // do nothin' } } }