/**
 * Heuristically identifies speakers in conversational text.
 *
 * <p>Two strategies are applied: (1) a predicate nominative equal to "I" ties an utterance
 * to the mention itself (e.g. "I am John" implies the speaker of that utterance is "John");
 * (2) consecutive sentences sharing an utterance index are grouped into a "paragraph" and
 * handed to {@code findParagraphSpeaker} for resolution.
 *
 * @param dict dictionaries used by the paragraph-speaker heuristics
 */
private void findSpeakersInConversation(Dictionaries dict) {
  // Strategy 1: "I am X" — record the mention id as the speaker of its utterance.
  for (List<Mention> sentMentions : predictedOrderedMentionsBySentence) {
    for (Mention mention : sentMentions) {
      if (mention.predicateNominatives == null) {
        continue;
      }
      for (Mention nominative : mention.predicateNominatives) {
        if (nominative.spanToString().toLowerCase().equals("i")) {
          speakers.put(
              mention.headWord.get(CoreAnnotations.UtteranceAnnotation.class),
              Integer.toString(mention.mentionID));
        }
      }
    }
  }

  // Strategy 2: group sentences by utterance index and resolve each group's speaker.
  List<CoreMap> paragraph = new ArrayList<CoreMap>();
  int paragraphUtterIndex = 0;
  String nextParagraphSpeaker = "";
  int paragraphOffset = 0;
  for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
    int currentUtter =
        sentence
            .get(CoreAnnotations.TokensAnnotation.class)
            .get(0)
            .get(CoreAnnotations.UtteranceAnnotation.class);
    if (paragraphUtterIndex != currentUtter) {
      // Utterance changed: close out the current paragraph before starting a new one.
      nextParagraphSpeaker =
          findParagraphSpeaker(
              paragraph, paragraphUtterIndex, nextParagraphSpeaker, paragraphOffset, dict);
      paragraphUtterIndex = currentUtter;
      paragraphOffset += paragraph.size();
      paragraph = new ArrayList<CoreMap>();
    }
    paragraph.add(sentence);
  }
  // Flush the trailing paragraph (the returned next-speaker is intentionally unused here).
  findParagraphSpeaker(paragraph, paragraphUtterIndex, nextParagraphSpeaker, paragraphOffset, dict);
}
public List<List<Mention>> extractGoldMentions(CoNLL2011DocumentReader.Document conllDoc) { List<CoreMap> sentences = conllDoc.getAnnotation().get(CoreAnnotations.SentencesAnnotation.class); List<List<Mention>> allGoldMentions = new ArrayList<List<Mention>>(); CollectionValuedMap<String, CoreMap> corefChainMap = conllDoc.getCorefChainMap(); for (int i = 0; i < sentences.size(); i++) { allGoldMentions.add(new ArrayList<Mention>()); } int maxCorefClusterId = -1; for (String corefIdStr : corefChainMap.keySet()) { int id = Integer.parseInt(corefIdStr); if (id > maxCorefClusterId) { maxCorefClusterId = id; } } int newMentionID = maxCorefClusterId + 1; for (String corefIdStr : corefChainMap.keySet()) { int id = Integer.parseInt(corefIdStr); int clusterMentionCnt = 0; for (CoreMap m : corefChainMap.get(corefIdStr)) { clusterMentionCnt++; Mention mention = new Mention(); mention.goldCorefClusterID = id; if (clusterMentionCnt == 1) { // First mention in cluster mention.mentionID = id; mention.originalRef = -1; } else { mention.mentionID = newMentionID; mention.originalRef = id; newMentionID++; } if (maxID < mention.mentionID) maxID = mention.mentionID; int sentIndex = m.get(CoreAnnotations.SentenceIndexAnnotation.class); CoreMap sent = sentences.get(sentIndex); mention.startIndex = m.get(CoreAnnotations.TokenBeginAnnotation.class) - sent.get(CoreAnnotations.TokenBeginAnnotation.class); mention.endIndex = m.get(CoreAnnotations.TokenEndAnnotation.class) - sent.get(CoreAnnotations.TokenBeginAnnotation.class); // will be set by arrange mention.originalSpan = m.get(CoreAnnotations.TokensAnnotation.class); // Mention dependency is collapsed dependency for sentence mention.dependency = sentences .get(sentIndex) .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class); allGoldMentions.get(sentIndex).add(mention); } } return allGoldMentions; }
/** * Generate the training features from the CoNLL input file. * * @return Dataset of feature vectors * @throws Exception */ public GeneralDataset<String, String> generateFeatureVectors(Properties props) throws Exception { GeneralDataset<String, String> dataset = new Dataset<>(); Dictionaries dict = new Dictionaries(props); MentionExtractor mentionExtractor = new CoNLLMentionExtractor(dict, props, new Semantics(dict)); Document document; while ((document = mentionExtractor.nextDoc()) != null) { setTokenIndices(document); document.extractGoldCorefClusters(); Map<Integer, CorefCluster> entities = document.goldCorefClusters; // Generate features for coreferent mentions with class label 1 for (CorefCluster entity : entities.values()) { for (Mention mention : entity.getCorefMentions()) { // Ignore verbal mentions if (mention.headWord.tag().startsWith("V")) continue; IndexedWord head = mention.dependency.getNodeByIndexSafe(mention.headWord.index()); if (head == null) continue; ArrayList<String> feats = mention.getSingletonFeatures(dict); dataset.add(new BasicDatum<>(feats, "1")); } } // Generate features for singletons with class label 0 ArrayList<CoreLabel> gold_heads = new ArrayList<>(); for (Mention gold_men : document.allGoldMentions.values()) { gold_heads.add(gold_men.headWord); } for (Mention predicted_men : document.allPredictedMentions.values()) { SemanticGraph dep = predicted_men.dependency; IndexedWord head = dep.getNodeByIndexSafe(predicted_men.headWord.index()); if (head == null) continue; // Ignore verbal mentions if (predicted_men.headWord.tag().startsWith("V")) continue; // If the mention is in the gold set, it is not a singleton and thus ignore if (gold_heads.contains(predicted_men.headWord)) continue; dataset.add(new BasicDatum<>(predicted_men.getSingletonFeatures(dict), "0")); } } dataset.summaryStatistics(); return dataset; }
/**
 * Check one mention is the speaker of the other mention.
 *
 * <p>Heuristic: {@code ant} must be a singular first-person pronoun in the same sentence as
 * {@code m}; exactly one quotation mark must lie strictly between their head words (so one
 * is inside a quote and the other outside); and {@code m}'s head must be the subject of a
 * reporting verb (e.g. "said").
 */
public static boolean isSpeaker(Mention m, Mention ant, Dictionaries dict) {
  // Antecedent must be a singular first-person pronoun in the same sentence.
  boolean firstPerson = dict.firstPersonPronouns.contains(ant.spanToString().toLowerCase());
  if (!firstPerson || ant.number == Number.PLURAL || ant.sentNum != m.sentNum) {
    return false;
  }

  // Count quotation marks strictly between the two head words.
  int lo = Math.min(m.headIndex, ant.headIndex);
  int hi = Math.max(m.headIndex, ant.headIndex);
  int countQuotationMark = 0;
  for (int i = lo + 1; i < hi; i++) {
    String word = m.sentenceWords.get(i).get(CoreAnnotations.TextAnnotation.class);
    if (word.equals("``") || word.equals("''")) {
      countQuotationMark++;
    }
  }
  if (countQuotationMark != 1) {
    return false;
  }

  // The mention's head must be the nsubj of a reporting verb.
  IndexedWord node =
      m.dependency.getNodeByWordPattern(
          m.sentenceWords.get(m.headIndex).get(CoreAnnotations.TextAnnotation.class));
  if (node == null) {
    return false;
  }
  for (Pair<GrammaticalRelation, IndexedWord> parent : m.dependency.parentPairs(node)) {
    if (parent.first().getShortName().equals("nsubj")
        && dict.reportVerb.contains(parent.second().get(CoreAnnotations.LemmaAnnotation.class))) {
      return true;
    }
  }
  return false;
}
/** Mark twin mentions: All mention boundaries should be matched */ private void findTwinMentionsStrict() { for (int sentNum = 0; sentNum < goldOrderedMentionsBySentence.size(); sentNum++) { List<Mention> golds = goldOrderedMentionsBySentence.get(sentNum); List<Mention> predicts = predictedOrderedMentionsBySentence.get(sentNum); // For CoNLL training there are some documents with gold mentions with the same position // offsets // See // /scr/nlp/data/conll-2011/v2/data/train/data/english/annotations/nw/wsj/09/wsj_0990.v2_auto_conll // (Packwood - Roth) CollectionValuedMap<IntPair, Mention> goldMentionPositions = new CollectionValuedMap<IntPair, Mention>(); for (Mention g : golds) { IntPair ip = new IntPair(g.startIndex, g.endIndex); if (goldMentionPositions.containsKey(ip)) { StringBuilder existingMentions = new StringBuilder(); for (Mention eg : goldMentionPositions.get(ip)) { if (existingMentions.length() > 0) { existingMentions.append(","); } existingMentions.append(eg.mentionID); } SieveCoreferenceSystem.logger.warning( "WARNING: gold mentions with the same offsets: " + ip + " mentions=" + g.mentionID + "," + existingMentions + ", " + g.spanToString()); } // assert(!goldMentionPositions.containsKey(ip)); goldMentionPositions.add(new IntPair(g.startIndex, g.endIndex), g); } for (Mention p : predicts) { IntPair pos = new IntPair(p.startIndex, p.endIndex); if (goldMentionPositions.containsKey(pos)) { Collection<Mention> cm = goldMentionPositions.get(pos); Mention g = cm.iterator().next(); cm.remove(g); p.mentionID = g.mentionID; p.twinless = false; g.twinless = false; } } // temp: for making easy to recognize twinless mention for (Mention p : predicts) { if (p.twinless) p.mentionID += 10000; } } }
public void setKeywordsWikiMiner() { try { WikipediaAnnotator annotator = new WikipediaAnnotator(); long annstartTime = System.currentTimeMillis(); HashMap<String, Label.Sense[]> ment2ent = annotator.annotate(document); long annendTime = System.currentTimeMillis(); long diff1 = (annendTime - annstartTime); System.out.println("Time taken by annotater : " + diff1 + " milliseconds"); for (String key : ment2ent.keySet()) { // sunny:adding code to check if sense is within freebase dataset // adding it only if we find it. // Vector<Label.Sense> updatedsenses = new Vector(); // for(Label.Sense s : ment2ent.get(key)){ // String entity = s.getTitle().replace(" ", "_"); // String freebaseid = // WikiToFreebaseIDMap.getInstance().getFreeBaseID("\"/wikipedia/en_title/" + entity + // "\""); // // if(freebaseid != null){ // updatedsenses.add(s); // } // } // // Label.Sense[] sensearray = new Label.Sense[updatedsenses.size()]; // // updatedsenses.toArray(sensearray); LabelSense senses = new LabelSense(ment2ent.get(key)); // LabelSense senses = new LabelSense(sensearray); Mention mention = new Mention(); // System.out.println("key from ment2ent : " + key); String ment = key.split("_")[0]; // System.out.println("ment from ment2ent : " + ment); int off = Integer.parseInt(key.split("_")[1]); mention.key = ment; mention.name = ment; mention.length = ment.length(); mention.offset = off; mention.context = getContext(off, mention.length, contextSize); mention.contextAroundMention = getContext(off, mention.length, 10); mention.senses = senses; keywords.add(mention); } } catch (Exception e) { e.printStackTrace(); } }
/**
 * Loads the ground-truth mentions into {@code groundMention} and {@code keywords}.
 *
 * <p>Fixes two defects in the previous version: (1) {@code m.offset} and {@code m.name}
 * were read (for the context windows and the {@code contextAroundMention} suffix) before
 * they were assigned, so the windows were centered at offset 0 and "null" was appended;
 * (2) the results of {@code replaceAll} were discarded — Java Strings are immutable, so
 * the "\sand"/"\snot" stripping never took effect. The results are now assigned back.
 *
 * @param groundtruth the gold mention annotations (offset + length into {@code document})
 * @throws Exception if a substring range is invalid for the document
 */
public void setGroundMention(ArrayList<XMLTagInfo> groundtruth) throws Exception {
  groundMention.clear();
  keywords.clear();
  for (int i = 0; i < groundtruth.size(); i++) {
    int off = groundtruth.get(i).offset;
    int len = groundtruth.get(i).length;
    groundMention.add(document.substring(off, off + len));

    Mention m = new Mention();
    // Populate identifying fields first: the context windows below depend on them.
    m.name = document.substring(off, off + len);
    m.offset = off;
    m.length = len;

    int context_lo = Math.max(0, off - contextSize);
    int context_hi = Math.min(document.length() - 1, off + contextSize);
    String contextString = document.substring(context_lo, context_hi);
    m.context = contextString.replaceAll("[^0-9a-z\\sA-Z/\\-]", "").toLowerCase();

    int con_lo = Math.max(0, m.offset - 10);
    int con_hi = Math.min(document.length() - 1, m.offset + 10);
    m.contextAroundMention =
        document.substring(con_lo, con_hi).replaceAll("[^0-9a-z\\sA-Z]", " ").toLowerCase();
    m.contextAroundMention += " " + m.name.replaceAll("[^0-9a-z\\sA-Z]", " ").toLowerCase();

    // Strip "and"/"not" from both context windows (assignment is required:
    // replaceAll returns a new String and does not mutate the receiver).
    m.context = m.context.replaceAll("\\sand", "");
    m.contextAroundMention = m.contextAroundMention.replaceAll("\\sand", "");
    m.context = m.context.replaceAll("\\snot", "");
    m.contextAroundMention = m.contextAroundMention.replaceAll("\\snot", "");

    keywords.add(m);
  }
}
/** Mark twin mentions: heads of the mentions are matched */
private void findTwinMentionsRelaxed() {
  for (int sentNum = 0; sentNum < goldOrderedMentionsBySentence.size(); sentNum++) {
    List<Mention> golds = goldOrderedMentionsBySentence.get(sentNum);
    List<Mention> predicts = predictedOrderedMentionsBySentence.get(sentNum);

    // Index gold mentions both by exact span and by head token index.
    Map<IntPair, Mention> goldMentionPositions = Generics.newHashMap();
    Map<Integer, LinkedList<Mention>> goldMentionHeadPositions = Generics.newHashMap();
    for (Mention gold : golds) {
      goldMentionPositions.put(new IntPair(gold.startIndex, gold.endIndex), gold);
      if (!goldMentionHeadPositions.containsKey(gold.headIndex)) {
        goldMentionHeadPositions.put(gold.headIndex, new LinkedList<Mention>());
      }
      goldMentionHeadPositions.get(gold.headIndex).add(gold);
    }

    // Pass 1: exact-span matches adopt the gold id; matched golds leave the head index.
    List<Mention> remains = new ArrayList<Mention>();
    for (Mention predicted : predicts) {
      IntPair span = new IntPair(predicted.startIndex, predicted.endIndex);
      Mention gold = goldMentionPositions.get(span);
      if (gold != null) {
        predicted.mentionID = gold.mentionID;
        predicted.twinless = false;
        gold.twinless = false;
        goldMentionHeadPositions.get(gold.headIndex).remove(gold);
        if (goldMentionHeadPositions.get(gold.headIndex).isEmpty()) {
          goldMentionHeadPositions.remove(gold.headIndex);
        }
      } else {
        remains.add(predicted);
      }
    }

    // Pass 2: leftovers match on head index alone, consuming remaining golds FIFO.
    for (Mention predicted : remains) {
      if (goldMentionHeadPositions.containsKey(predicted.headIndex)) {
        Mention gold = goldMentionHeadPositions.get(predicted.headIndex).poll();
        predicted.mentionID = gold.mentionID;
        predicted.twinless = false;
        gold.twinless = false;
        if (goldMentionHeadPositions.get(gold.headIndex).isEmpty()) {
          goldMentionHeadPositions.remove(gold.headIndex);
        }
      }
    }
  }
}
/** When there is no mentionID information (without gold annotation), assign mention IDs */
protected void assignOriginalID() {
  List<List<Mention>> orderedMentionsBySentence = this.getOrderedMentions();

  // Reassign only when at least one mention lacks an id (-1); a partial gold
  // annotation with all ids present is left untouched.
  boolean needsIds = false;
  for (List<Mention> sentence : orderedMentionsBySentence) {
    for (Mention mention : sentence) {
      if (mention.mentionID == -1) {
        needsIds = true;
      }
    }
  }

  if (needsIds) {
    // Number every mention sequentially in sentence order.
    int nextId = 0;
    for (List<Mention> sentence : orderedMentionsBySentence) {
      for (Mention mention : sentence) {
        mention.mentionID = nextId++;
      }
    }
  }
}
public List<ClusteredMention> runCoreference(Document doc) { // --Overhead startTrack("Testing " + doc.id); // (variables) List<ClusteredMention> rtn = new ArrayList<ClusteredMention>(doc.getMentions().size()); List<Mention> mentions = doc.getMentions(); int singletons = 0; // --Run Classifier for (int i = 0; i < mentions.size(); i++) { // (variables) Mention onPrix = mentions.get(i); int coreferentWith = -1; // (get mention it is coreferent with) for (int j = i - 1; j >= 0; j--) { ClusteredMention cand = rtn.get(j); boolean coreferent = classifier.classOf( new RVFDatum<Boolean, Feature>(extractor.extractFeatures(Pair.make(onPrix, cand)))); if (coreferent) { coreferentWith = j; break; } } if (coreferentWith < 0) { singletons += 1; rtn.add(onPrix.markSingleton()); } else { // log("Mention " + onPrix + " coreferent with " + mentions.get(coreferentWith)); rtn.add(onPrix.markCoreferent(rtn.get(coreferentWith))); } } // log("" + singletons + " singletons"); // --Return endTrack("Testing " + doc.id); return rtn; }
/** Set paragraph index */
private void setParagraphAnnotation() {
  int paragraphIndex = 0;
  // Sentinel ensures the very first token with offsets starts a new paragraph.
  int previousOffset = -10;
  for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
    for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
      if (!token.containsKey(CoreAnnotations.CharacterOffsetBeginAnnotation.class)) {
        // No character offsets available: paragraph is unknown.
        token.set(CoreAnnotations.ParagraphAnnotation.class, -1);
        continue;
      }
      // A gap of more than two characters since the previous token's end marks a
      // paragraph break (e.g. a blank line between tokens).
      if (token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) > previousOffset + 2) {
        paragraphIndex++;
      }
      token.set(CoreAnnotations.ParagraphAnnotation.class, paragraphIndex);
      previousOffset = token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
    }
  }
  // Propagate paragraph ids from head words onto the predicted mentions.
  for (List<Mention> sentMentions : predictedOrderedMentionsBySentence) {
    for (Mention mention : sentMentions) {
      mention.paragraph = mention.headWord.get(CoreAnnotations.ParagraphAnnotation.class);
    }
  }
  numParagraph = paragraphIndex;
}
/**
 * Copies the mention lists: each Mention becomes a fresh object carrying the id, cluster,
 * and span-index fields, while {@code originalSpan} and {@code dependency} remain shared
 * references to the originals.
 *
 * @param mentions per-sentence mention lists to copy
 * @return a new list-of-lists with copied mentions
 */
public static List<List<Mention>> makeCopy(List<List<Mention>> mentions) {
  List<List<Mention>> copy = new ArrayList<List<Mention>>(mentions.size());
  for (List<Mention> sentence : mentions) {
    List<Mention> sentenceCopy = new ArrayList<Mention>(sentence.size());
    for (Mention original : sentence) {
      Mention clone = new Mention();
      clone.goldCorefClusterID = original.goldCorefClusterID;
      clone.mentionID = original.mentionID;
      clone.startIndex = original.startIndex;
      clone.endIndex = original.endIndex;
      clone.originalSpan = original.originalSpan;
      clone.dependency = original.dependency;
      sentenceCopy.add(clone);
    }
    copy.add(sentenceCopy);
  }
  return copy;
}
/**
 * Builds an output coreference mention from an internal {@link Mention}.
 *
 * <p>All positional fields (token indices, sentence number, position tuple) are converted
 * from the internal 0-based indexing to the 1-based indexing used in output.
 *
 * @param m the internal mention to copy from
 * @param pos the (sentence, mention) position of {@code m}, 0-based
 */
public CorefMention(Mention m, IntTuple pos) {
  mentionType = m.mentionType;
  number = m.number;
  gender = m.gender;
  animacy = m.animacy;
  // Convert token/sentence indices from 0-based to 1-based.
  startIndex = m.startIndex + 1;
  endIndex = m.endIndex + 1;
  headIndex = m.headIndex + 1;
  corefClusterID = m.corefClusterID;
  sentNum = m.sentNum + 1;
  mentionID = m.mentionID;
  mentionSpan = m.spanToString();

  // index starts from 1
  position = new IntTuple(2);
  position.set(0, pos.get(0) + 1);
  position.set(1, pos.get(1) + 1);

  // Side effect: record the cluster id on the source mention's head token.
  m.headWord.set(CorefCoreAnnotations.CorefClusterIdAnnotation.class, corefClusterID);
}
/**
 * initialize positions and corefClusters (put each mention in each CorefCluster)
 *
 * <p>Every predicted mention is registered in {@code allPredictedMentions}, given its
 * (sentence, mention) position, and placed in its own singleton {@link CorefCluster}
 * whose id equals the mention id. Duplicate mention ids indicate an upstream extraction
 * problem: both spans are logged before the assertion fires.
 */
private void initializeCorefCluster() {
  for (int i = 0; i < predictedOrderedMentionsBySentence.size(); i++) {
    for (int j = 0; j < predictedOrderedMentionsBySentence.get(i).size(); j++) {
      Mention m = predictedOrderedMentionsBySentence.get(i).get(j);
      // Log both offending spans before asserting uniqueness below.
      if (allPredictedMentions.containsKey(m.mentionID)) {
        SieveCoreferenceSystem.logger.warning("WARNING: Already contain mention " + m.mentionID);
        Mention m1 = allPredictedMentions.get(m.mentionID);
        SieveCoreferenceSystem.logger.warning(
            "OLD mention: " + m1.spanToString() + "[" + m1.startIndex + "," + m1.endIndex + "]");
        SieveCoreferenceSystem.logger.warning(
            "NEW mention: " + m.spanToString() + "[" + m.startIndex + "," + m.endIndex + "]");
      }
      assert (!allPredictedMentions.containsKey(m.mentionID));
      allPredictedMentions.put(m.mentionID, m);

      // Record the (sentence, mention-within-sentence) position.
      IntTuple pos = new IntTuple(2);
      pos.set(0, i);
      pos.set(1, j);
      positions.put(m, pos);
      m.sentNum = i;

      // Each mention starts in its own singleton cluster, keyed by its own id.
      assert (!corefClusters.containsKey(m.mentionID));
      corefClusters.put(
          m.mentionID, new CorefCluster(m.mentionID, Generics.newHashSet(Arrays.asList(m))));
      m.corefClusterID = m.mentionID;

      // Also index the mention by its head token position (sentence, head index).
      IntTuple headPosition = new IntTuple(2);
      headPosition.set(0, i);
      headPosition.set(1, m.headIndex);
      mentionheadPositions.put(headPosition, m);
    }
  }
}
/**
 * Merges adjacent single-token keywords into the longest multi-word mention (up to
 * {@code maxLength} whitespace-separated words) for which the sense inventory returns
 * candidate senses.
 *
 * <p>For each unconsumed keyword, candidate phrases of 1..maxLength words starting at the
 * keyword's offset are built from the raw document text; the longest phrase with senses
 * wins and the keywords it covers are marked as consumed. Keywords with no matching
 * phrase are kept as-is when they are valid, non-article tokens.
 *
 * @param maxLength maximum number of words in a merged mention
 */
public void consolidateMentions(int maxLength) {
  // Lazily create the local thesaurus unless the Wikisaurus server is used instead.
  if (!Config.Server && thesaurus == null) {
    thesaurus = new Wikisaurus();
  }
  ClientWikisauras obj = new ClientWikisauras();
  ArrayList<Mention> mentions = new ArrayList<Mention>();
  mentions.addAll(keywords);
  // keywords is rebuilt from scratch with the consolidated mentions.
  keywords = new ArrayList<Mention>();
  // token_type[i]: 0 = unconsumed, 1 = absorbed into a multi-word mention, 2 = article token.
  Integer[] token_type = new Integer[mentions.size()];
  for (int i = 0; i < token_type.length; i++) token_type[i] = 0;
  int curr_offset = 0;
  String curr_mention = "";
  for (int i = 0; i < mentions.size(); i++) {
    if (token_type[i] != 0) {
      continue;
    }
    curr_offset = mentions.get(i).offset;
    curr_mention = mentions.get(i).name;
    // allWords[k] holds the candidate phrase of k+1 words starting at curr_offset;
    // allOffset[k] holds the document offset where that phrase ends.
    String[] allWords = new String[maxLength];
    Integer[] allOffset = new Integer[maxLength];
    String currWord = curr_mention;
    Integer currWordEnd = curr_offset + curr_mention.length() + 1;
    allWords[0] = currWord;
    allOffset[0] = curr_offset;
    int k = 1;
    for (; k < maxLength; k++) {
      // Extend the phrase to the next space in the raw document text.
      currWordEnd = document.indexOf(" ", currWordEnd + 1);
      if (currWordEnd == -1) currWordEnd = document.length();
      if (curr_offset < 0 || curr_offset >= document.length()) {
        k--;
        break;
      }
      currWord = document.substring(curr_offset, currWordEnd);
      allWords[k] = currWord;
      allOffset[k] = currWordEnd;
      if (currWordEnd >= document.length()) break;
    }
    if (k == maxLength) k--;
    // Back off: try the longest candidate phrase first, shrinking one word at a time.
    for (; k >= 0; k--) {
      LabelSense senses = null;
      try {
        if (Config.Server) senses = obj.getSenses(allWords[k]);
        else {
          Label.Sense[] temp = thesaurus.getSenses(allWords[k]);
          if (temp != null) senses = new LabelSense(temp);
        }
      } catch (Exception e) {
        e.printStackTrace();
        System.exit(1);
      }
      if (null != senses) {
        Mention new_mention = new Mention();
        new_mention.name = allWords[k];
        new_mention.length = new_mention.name.length();
        new_mention.offset = curr_offset;
        new_mention.context = getContext(curr_offset, new_mention.length, contextSize);
        new_mention.contextAroundMention = getContext(curr_offset, new_mention.length, 10);
        // Only a single-word mention inherits the original keyword's lookup key.
        if (k == 0) new_mention.key = mentions.get(i).key;
        new_mention.senses = senses;
        System.out.println("wikiminer candidate for : " + new_mention.name);
        for (int ic = 0; ic < senses.wikiMinerCandidate.length; ++ic) {
          System.out.println(
              "\t" + senses.wikiMinerCandidate[ic] + " " + senses.wikiMinerProbability[ic]);
        }
        keywords.add(new_mention);
        if (!isArticleToken(curr_mention)) {
          // Mark every original keyword covered by the accepted phrase as consumed.
          for (int j = i;
              j < mentions.size()
                  && mentions.get(j).offset < (new_mention.offset + new_mention.length);
              j++) token_type[j] = 1;
        } else {
          token_type[i] = 2;
        }
        break;
      }
    }
    // No phrase matched: keep the original keyword if it is a valid, non-article token.
    if (token_type[i] == 0 && !isArticleToken(curr_mention) && isValidToken(curr_mention)) {
      keywords.add(mentions.get(i));
    }
  }
}
public void setKeywords(boolean stem) throws Exception { tagged_document = tagger.tagString(document); ArrayList<String> tokens = new ArrayList<String>(); // System.out.println("tagged document : " + tagged_document); StringTokenizer str = new StringTokenizer(tagged_document); while (str.hasMoreTokens()) { String token = str.nextToken(); if (token == null || "".equals(token) || " ".equals(token)) continue; if (!Stopwords.isStopword(token.split("_")[0]) || noun_tags.contains(token.split("_")[1]) || adj_tags.contains(token.split("_")[1]) || extra_tags.contains(token.split("_")[1])) tokens.add(token); // System.out.println("token : " + token); if (!Stopwords.isStopword(token.split("_")[0])) { // System.out.println("token added."); tokens.add(token); } } String prev_tag = null; // if previous token was a noun then add n-gram // noun clause int curr_offset = 0, currbyte = 0; for (int i = 0; i < tokens.size(); i++) { // System.out.print(" "+tokens.get(i)); if (tokens.get(i) == null) continue; Matcher matcher = pattern.matcher(tokens.get(i)); matcher.find(); String word = matcher.group(1); String tag = matcher.group(2); // System.out.println("word: " + word + " tag: " + tag); if (word == null || "".equals(word)) { prev_tag = null; continue; } String token = word.replaceAll("[^0-9a-z\\sA-Z/\\-]", ""); if ("".equals(token) || "/".equals(token)) { prev_tag = null; continue; } if (!(noun_tags.contains(tag) || adj_tags.contains(tag) || extra_tags.contains(tag))) { prev_tag = null; continue; } Mention mention = new Mention(); if (tag.equals("JJ")) { String temp = TestJAWS.getNounForm(token); if (temp != null && !"".equals(temp)) { mention.key = temp; prev_tag = null; } else { mention.key = token; prev_tag = null; } } else { mention.key = token; } mention.name = word; mention.length = word.length(); curr_offset = document.indexOf(word, curr_offset); mention.offset = curr_offset; mention.context = getContext(curr_offset, mention.length, contextSize); mention.contextAroundMention 
= getContext(curr_offset, mention.length, 10); // StringTokenizer str1 = new StringTokenizer(contextString); // while(str1.hasMoreTokens()){ // String w=str1.nextToken(); // if (w == null || "".equals(w) || " ".equals(w)) continue; // mention.context.add(w); // } // parseContext(mention); // System.out.println("mention.name : " + mention.name + " offset : " + mention.offset); keywords.add(mention); } // System.out.println("Keywords: "+getMentionNames()); consolidateMentions(6); // consolidateMentions(4); }
// for collective training as we already have ground mentions public void setKeywordsTraining( HashMap<String, ArrayList<XMLTagInfo>> groundMapWiki, HashMap<String, ArrayList<XMLTagInfo>> groundMapManual, String file) { ArrayList<XMLTagInfo> mapForTrainFile = groundMapWiki.get(file); for (int i = 0; i < mapForTrainFile.size(); i++) { Mention mention = new Mention(); mention.key = mapForTrainFile.get(i).mention; mention.name = mapForTrainFile.get(i).mention; mention.length = mapForTrainFile.get(i).length; mention.offset = mapForTrainFile.get(i).offset; if (null == mention.name) mention.name = document.substring(mention.offset, mention.offset + mention.length); if (mention.offset < document.length() - 1) { int context_lo = Math.max(0, (mention.offset) - contextSize); int context_hi = Math.min(document.length() - 1, (mention.offset) + contextSize); String contextString = document.substring(context_lo, context_hi); mention.context = contextString.replaceAll("[^0-9a-z\\sA-Z/\\-]", ""); int con_lo = Math.max(0, mention.offset - 10); int con_hi = Math.min(document.length() - 1, mention.offset + 10); mention.contextAroundMention = document.substring(con_lo, con_hi).replaceAll("[^0-9a-z\\sA-Z]", " "); mention.contextAroundMention += " " + mention.name.replaceAll("[^0-9a-z\\sA-Z]", " "); keywords.add(mention); } } if (groundMapManual != null) { ArrayList<XMLTagInfo> mapForTrainFile1 = groundMapManual.get(file); for (int i = 0; i < mapForTrainFile1.size(); i++) { Mention mention = new Mention(); mention.key = mapForTrainFile1.get(i).mention; mention.name = mapForTrainFile1.get(i).mention; mention.length = mapForTrainFile1.get(i).mention.length(); mention.offset = mapForTrainFile1.get(i).offset; int context_lo = Math.max(0, mention.offset - contextSize); int context_hi = Math.min(document.length() - 1, mention.offset + contextSize); String contextString = document.substring(context_lo, context_hi); mention.context = contextString.replaceAll("[^0-9a-z\\sA-Z/\\-]", ""); int 
con_lo = Math.max(0, mention.offset - 10); int con_hi = Math.min(document.length() - 1, mention.offset + 10); mention.contextAroundMention = document.substring(con_lo, con_hi).replaceAll("[^0-9a-z\\sA-Z]", " "); mention.contextAroundMention += " " + mention.name.replaceAll("[^0-9a-z\\sA-Z]", " "); keywords.add(mention); } } }
/**
 * Reads the next MUC-formatted document from {@code fileContents}, starting at
 * {@code currentOffset}: tokenizes each sentence, extracts the gold coreference mentions
 * from inline {@code <COREF ...>} SGML tags (and gold NE tags when enabled), runs the
 * Stanford pipeline over the stripped text, and returns the arranged {@link Document}.
 *
 * @return the next Document, or null when no further {@code <DOC>} block is found
 * @throws Exception propagated from tokenization/annotation; also throws
 *     IllegalStateException when re-annotated tokens disagree with the originals
 */
@Override
public Document nextDoc() throws Exception {
  List<List<CoreLabel>> allWords = new ArrayList<List<CoreLabel>>();
  List<Tree> allTrees = new ArrayList<Tree>();
  List<List<Mention>> allGoldMentions = new ArrayList<List<Mention>>();
  List<List<Mention>> allPredictedMentions;
  List<CoreMap> allSentences = new ArrayList<CoreMap>();
  Annotation docAnno = new Annotation("");
  // <DOC>...</DOC> delimits one document; sentences may appear as <s>, <hl>, <dd>, or <DATELINE>.
  Pattern docPattern =
      Pattern.compile("<DOC>(.*?)</DOC>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
  Pattern sentencePattern =
      Pattern.compile(
          "(<s>|<hl>|<dd>|<DATELINE>)(.*?)(</s>|</hl>|</dd>|</DATELINE>)",
          Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
  Matcher docMatcher = docPattern.matcher(fileContents);
  // No more documents in the file.
  if (!docMatcher.find(currentOffset)) return null;
  currentOffset = docMatcher.end();
  String doc = docMatcher.group(1);
  Matcher sentenceMatcher = sentencePattern.matcher(doc);
  // Currently open gold NE tag (e.g. ORGANIZATION), null when outside any NE span.
  String ner = null;
  // Maintain current document ID.
  Pattern docIDPattern =
      Pattern.compile("<DOCNO>(.*?)</DOCNO>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
  Matcher docIDMatcher = docIDPattern.matcher(doc);
  if (docIDMatcher.find()) currentDocumentID = docIDMatcher.group(1);
  else currentDocumentID = "documentAfter " + currentDocumentID;
  while (sentenceMatcher.find()) {
    String sentenceString = sentenceMatcher.group(2);
    List<CoreLabel> words =
        tokenizerFactory.getTokenizer(new StringReader(sentenceString)).tokenize();
    // FIXING TOKENIZATION PROBLEMS
    // Mutates 'words' in place; index arithmetic below depends on this exact order.
    for (int i = 0; i < words.size(); i++) {
      CoreLabel w = words.get(i);
      if (i > 0 && w.word().equals("$")) {
        // Re-attach "$" to a preceding PRP/WP token (e.g. "PRP $" -> "PRP$").
        if (!words.get(i - 1).word().endsWith("PRP") && !words.get(i - 1).word().endsWith("WP"))
          continue;
        words.get(i - 1).set(CoreAnnotations.TextAnnotation.class, words.get(i - 1).word() + "$");
        words.remove(i);
        i--;
      } else if (w.word().equals("\\/")) {
        // Re-join tokens split around an escaped slash: "a \/ b" -> "a\/b".
        if (words.get(i - 1).word().equals("</COREF>")) continue;
        w.set(
            CoreAnnotations.TextAnnotation.class,
            words.get(i - 1).word() + "\\/" + words.get(i + 1).word());
        words.remove(i + 1);
        words.remove(i - 1);
      }
    }
    // END FIXING TOKENIZATION PROBLEMS
    List<CoreLabel> sentence = new ArrayList<CoreLabel>();
    // MUC accepts embedded coref mentions, so we need to keep a stack for the mentions currently
    // open
    Stack<Mention> stack = new Stack<Mention>();
    List<Mention> mentions = new ArrayList<Mention>();
    allWords.add(sentence);
    allGoldMentions.add(mentions);
    for (CoreLabel word : words) {
      String w = word.get(CoreAnnotations.TextAnnotation.class);
      // found regular token: WORD/POS
      if (!w.startsWith("<") && w.contains("\\/") && w.lastIndexOf("\\/") != w.length() - 2) {
        int i = w.lastIndexOf("\\/");
        String w1 = w.substring(0, i);
        // we do NOT set POS info here. We take the POS tags from the parser!
        word.set(CoreAnnotations.TextAnnotation.class, w1);
        word.remove(CoreAnnotations.OriginalTextAnnotation.class);
        if (Constants.USE_GOLD_NE) {
          if (ner != null) {
            word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
          } else {
            word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
          }
        }
        sentence.add(word);
      }
      // found the start SGML tag for a NE, e.g., "<ORGANIZATION>"
      else if (w.startsWith("<") && !w.startsWith("<COREF") && !w.startsWith("</")) {
        Pattern nerPattern = Pattern.compile("<(.*?)>");
        Matcher m = nerPattern.matcher(w);
        m.find();
        ner = m.group(1);
      }
      // found the end SGML tag for a NE, e.g., "</ORGANIZATION>"
      else if (w.startsWith("</") && !w.startsWith("</COREF")) {
        Pattern nerPattern = Pattern.compile("</(.*?)>");
        Matcher m = nerPattern.matcher(w);
        m.find();
        String ner1 = m.group(1);
        if (ner != null && !ner.equals(ner1))
          throw new RuntimeException("Unmatched NE labels in MUC file: " + ner + " v. " + ner1);
        ner = null;
      }
      // found the start SGML tag for a coref mention
      else if (w.startsWith("<COREF")) {
        Mention mention = new Mention();
        // position of this mention in the sentence
        mention.startIndex = sentence.size();
        // extract GOLD info about this coref chain. needed for eval
        Pattern idPattern = Pattern.compile("ID=\"(.*?)\"");
        Pattern refPattern = Pattern.compile("REF=\"(.*?)\"");
        Matcher m = idPattern.matcher(w);
        m.find();
        mention.mentionID = Integer.parseInt(m.group(1));
        m = refPattern.matcher(w);
        if (m.find()) {
          mention.originalRef = Integer.parseInt(m.group(1));
        }
        // open mention. keep track of all open mentions using the stack
        stack.push(mention);
      }
      // found the end SGML tag for a coref mention
      else if (w.equals("</COREF>")) {
        Mention mention = stack.pop();
        mention.endIndex = sentence.size();
        // this is a closed mention. add it to the final list of mentions
        // System.err.printf("Found MENTION: ID=%d, REF=%d\n", mention.mentionID,
        // mention.originalRef);
        mentions.add(mention);
      } else {
        // Plain token without a POS suffix.
        word.remove(CoreAnnotations.OriginalTextAnnotation.class);
        if (Constants.USE_GOLD_NE) {
          if (ner != null) {
            word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
          } else {
            word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
          }
        }
        sentence.add(word);
      }
    }
    // Rebuild the plain sentence text and assign 1-based token indices.
    StringBuilder textContent = new StringBuilder();
    for (int i = 0; i < sentence.size(); i++) {
      CoreLabel w = sentence.get(i);
      w.set(CoreAnnotations.IndexAnnotation.class, i + 1);
      w.set(CoreAnnotations.UtteranceAnnotation.class, 0);
      if (i > 0) textContent.append(" ");
      textContent.append(w.getString(CoreAnnotations.TextAnnotation.class));
    }
    CoreMap sentCoreMap = new Annotation(textContent.toString());
    allSentences.add(sentCoreMap);
    sentCoreMap.set(CoreAnnotations.TokensAnnotation.class, sentence);
  }
  // assign goldCorefClusterID
  Map<Integer, Mention> idMention = Generics.newHashMap(); // temporary use
  for (List<Mention> goldMentions : allGoldMentions) {
    for (Mention m : goldMentions) {
      idMention.put(m.mentionID, m);
    }
  }
  // Resolve each mention's cluster ID by following REF chains to their root mention.
  for (List<Mention> goldMentions : allGoldMentions) {
    for (Mention m : goldMentions) {
      if (m.goldCorefClusterID == -1) {
        if (m.originalRef == -1) m.goldCorefClusterID = m.mentionID;
        else {
          int ref = m.originalRef;
          while (true) {
            Mention m2 = idMention.get(ref);
            if (m2.goldCorefClusterID != -1) {
              m.goldCorefClusterID = m2.goldCorefClusterID;
              break;
            } else if (m2.originalRef == -1) {
              m2.goldCorefClusterID = m2.mentionID;
              m.goldCorefClusterID = m2.goldCorefClusterID;
              break;
            } else {
              ref = m2.originalRef;
            }
          }
        }
      }
    }
  }
  docAnno.set(CoreAnnotations.SentencesAnnotation.class, allSentences);
  // Run the full pipeline (POS, parse, dependencies, ...) over the stripped text.
  stanfordProcessor.annotate(docAnno);
  if (allSentences.size() != allWords.size())
    throw new IllegalStateException("allSentences != allWords");
  // Sanity-check the re-annotated tokens against the originals, and copy over trees/dependencies.
  for (int i = 0; i < allSentences.size(); i++) {
    List<CoreLabel> annotatedSent =
        allSentences.get(i).get(CoreAnnotations.TokensAnnotation.class);
    List<CoreLabel> unannotatedSent = allWords.get(i);
    List<Mention> mentionInSent = allGoldMentions.get(i);
    for (Mention m : mentionInSent) {
      m.dependency =
          allSentences
              .get(i)
              .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
    }
    if (annotatedSent.size() != unannotatedSent.size()) {
      throw new IllegalStateException("annotatedSent != unannotatedSent");
    }
    for (int j = 0, sz = annotatedSent.size(); j < sz; j++) {
      CoreLabel annotatedWord = annotatedSent.get(j);
      CoreLabel unannotatedWord = unannotatedSent.get(j);
      if (!annotatedWord
          .get(CoreAnnotations.TextAnnotation.class)
          .equals(unannotatedWord.get(CoreAnnotations.TextAnnotation.class))) {
        throw new IllegalStateException("annotatedWord != unannotatedWord");
      }
    }
    allWords.set(i, annotatedSent);
    allTrees.add(allSentences.get(i).get(TreeCoreAnnotations.TreeAnnotation.class));
  }
  // extract predicted mentions
  if (Constants.USE_GOLD_MENTIONS) allPredictedMentions = allGoldMentions;
  else
    allPredictedMentions = mentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries);
  // add the relevant fields to mentions and order them for coref
  return arrange(docAnno, allWords, allTrees, allPredictedMentions, allGoldMentions, true);
}
public void train(Collection<Pair<Document, List<Entity>>> trainingData) { startTrack("Training"); // --Variables RVFDataset<Boolean, Feature> dataset = new RVFDataset<Boolean, Feature>(); LinearClassifierFactory<Boolean, Feature> fact = new LinearClassifierFactory<Boolean, Feature>(); // --Feature Extraction startTrack("Feature Extraction"); for (Pair<Document, List<Entity>> datum : trainingData) { // (document variables) Document doc = datum.getFirst(); List<Entity> goldClusters = datum.getSecond(); List<Mention> mentions = doc.getMentions(); Map<Mention, Entity> goldEntities = Entity.mentionToEntityMap(goldClusters); startTrack("Document " + doc.id); // (for each mention...) for (int i = 0; i < mentions.size(); i++) { // (get the mention and its cluster) Mention onPrix = mentions.get(i); Entity source = goldEntities.get(onPrix); if (source == null) { throw new IllegalArgumentException("Mention has no gold entity: " + onPrix); } // (for each previous mention...) int oldSize = dataset.size(); for (int j = i - 1; j >= 0; j--) { // (get previous mention and its cluster) Mention cand = mentions.get(j); Entity target = goldEntities.get(cand); if (target == null) { throw new IllegalArgumentException("Mention has no gold entity: " + cand); } // (extract features) Counter<Feature> feats = extractor.extractFeatures(Pair.make(onPrix, cand.markCoreferent(target))); // (add datum) dataset.add(new RVFDatum<Boolean, Feature>(feats, target == source)); // (stop if if (target == source) { break; } } // logf("Mention %s (%d datums)", onPrix.toString(), dataset.size() - oldSize); } endTrack("Document " + doc.id); } endTrack("Feature Extraction"); // --Train Classifier startTrack("Minimizer"); this.classifier = fact.trainClassifier(dataset); endTrack("Minimizer"); // --Dump Weights startTrack("Features"); // (get labels to print) Set<Boolean> labels = new HashSet<Boolean>(); labels.add(true); // (print features) for (Triple<Feature, Boolean, Double> featureInfo : 
this.classifier.getTopFeatures(labels, 0.0, true, 100, true)) { Feature feature = featureInfo.first(); Boolean label = featureInfo.second(); Double magnitude = featureInfo.third(); // log(FORCE,new DecimalFormat("0.000").format(magnitude) + " [" + label + "] " + feature); } end_Track("Features"); endTrack("Training"); }
/**
 * Extract gold coref link information.
 *
 * <p>Builds {@code goldLinks}: a list of (anaphor-position, antecedent-position) pairs derived
 * from the gold mentions' {@code originalRef} chains. Positions are (sentence index, mention
 * index) {@code IntTuple}s. Cataphoric annotations (a REF pointing forward in the text) are
 * rewritten by swapping the direction, and transitive links are propagated so that every mention
 * is linked to all of its antecedents' antecedents.
 */
protected void extractGoldLinks() {
  // List<List<Mention>> orderedMentionsBySentence = this.getOrderedMentions();
  List<Pair<IntTuple, IntTuple>> links = new ArrayList<Pair<IntTuple, IntTuple>>();
  // position of each mention in the input matrix, by id
  Map<Integer, IntTuple> positions = Generics.newHashMap();
  // positions of antecedents
  Map<Integer, List<IntTuple>> antecedents = Generics.newHashMap();
  // First pass: record every mention's (sentence, mention) position keyed by mention ID.
  for (int i = 0; i < goldOrderedMentionsBySentence.size(); i++) {
    for (int j = 0; j < goldOrderedMentionsBySentence.get(i).size(); j++) {
      Mention m = goldOrderedMentionsBySentence.get(i).get(j);
      int id = m.mentionID;
      IntTuple pos = new IntTuple(2);
      pos.set(0, i);
      pos.set(1, j);
      positions.put(id, pos);
      antecedents.put(id, new ArrayList<IntTuple>());
    }
  }
  // SieveCoreferenceSystem.debugPrintMentions(System.err, "", goldOrderedMentionsBySentence);
  for (List<Mention> mentions : goldOrderedMentionsBySentence) {
    for (Mention m : mentions) {
      int id = m.mentionID;
      IntTuple src = positions.get(id);
      assert (src != null);
      // Only mentions with a gold antecedent reference produce links.
      if (m.originalRef >= 0) {
        IntTuple dst = positions.get(m.originalRef);
        if (dst == null) {
          throw new RuntimeException("Cannot find gold mention with ID=" + m.originalRef);
        }
        // to deal with cataphoric annotation
        // While the referent appears AFTER this mention, swap the link direction so the
        // antecedent always precedes the anaphor; mutates originalRef on both mentions.
        while (dst.get(0) > src.get(0) || (dst.get(0) == src.get(0) && dst.get(1) > src.get(1))) {
          Mention dstMention = goldOrderedMentionsBySentence.get(dst.get(0)).get(dst.get(1));
          m.originalRef = dstMention.originalRef;
          dstMention.originalRef = id;
          if (m.originalRef < 0) break;
          dst = positions.get(m.originalRef);
        }
        if (m.originalRef < 0) continue;
        // A B C: if A<-B, A<-C => make a link B<-C
        // Scan every mention between dst and src (inclusive) in reading order.
        for (int k = dst.get(0); k <= src.get(0); k++) {
          for (int l = 0; l < goldOrderedMentionsBySentence.get(k).size(); l++) {
            if (k == dst.get(0) && l < dst.get(1)) continue;
            if (k == src.get(0) && l > src.get(1)) break;
            IntTuple missed = new IntTuple(2);
            missed.set(0, k);
            missed.set(1, l);
            // If 'missed' is already linked to the same antecedent, link src to it too.
            if (links.contains(new Pair<IntTuple, IntTuple>(missed, dst))) {
              antecedents.get(id).add(missed);
              links.add(new Pair<IntTuple, IntTuple>(src, missed));
            }
          }
        }
        links.add(new Pair<IntTuple, IntTuple>(src, dst));
        assert (antecedents.get(id) != null);
        antecedents.get(id).add(dst);
        // Inherit all of the antecedent's own antecedents (transitive closure of the chain).
        List<IntTuple> ants = antecedents.get(m.originalRef);
        assert (ants != null);
        for (IntTuple ant : ants) {
          antecedents.get(id).add(ant);
          links.add(new Pair<IntTuple, IntTuple>(src, ant));
        }
      }
    }
  }
  goldLinks = links;
}
private <E> Feature feature( Class<E> clazz, Pair<Mention, ClusteredMention> input, Option<Double> count) { // --Variables Mention onPrix = input.getFirst(); // the first mention (referred to as m_i in the handout) Mention candidate = input.getSecond().mention; // the second mention (referred to as m_j in the handout) Entity candidateCluster = input.getSecond().entity; // the cluster containing the second mention // --Features:w if (clazz.equals(Feature.ExactMatch.class)) { // (exact string match) return new Feature.ExactMatch(onPrix.gloss().equals(candidate.gloss())); } else if (clazz.equals(Feature.SentenceDist.class)) { return new Feature.SentenceDist( Math.abs( onPrix.doc.indexOfMention(onPrix) - candidate.doc.indexOfMention(candidate))); } else if (clazz.equals(Feature.MentionDist.class)) { return new Feature.MentionDist( Math.abs( onPrix.doc.indexOfSentence(onPrix.sentence) - candidate.doc.indexOfSentence(candidate.sentence))); } else if (clazz.equals(Feature.EitherHeadWordPronoun.class)) { return new Feature.EitherHeadWordPronoun( Pronoun.isSomePronoun(onPrix.gloss()) || Pronoun.isSomePronoun(candidate.gloss())); } else if (clazz.equals(Feature.CandidateNERTag.class)) { return new Feature.CandidateNERTag(candidate.headToken().nerTag()); } else if (clazz.equals(Feature.CandidateSpeaker.class)) { return new Feature.CandidateSpeaker(candidate.headToken().speaker()); } else if (clazz.equals(Feature.FixedSpeaker.class)) { return new Feature.FixedSpeaker(onPrix.headToken().speaker()); } else if (clazz.equals(Feature.HeadWordMatch.class)) { return new Feature.HeadWordMatch(onPrix.equals(candidate.headWord())); } else if (clazz.equals(Feature.HeadWordLemmaMatch.class)) { return new Feature.HeadWordLemmaMatch( onPrix.headToken().lemma().equals(candidate.headToken().lemma())); } else if (clazz.equals(Feature.FixedNERTag.class)) { return new Feature.FixedNERTag(onPrix.headToken().nerTag()); } else if (clazz.equals(Feature.SpeakerMatch.class)) { return new 
Feature.SpeakerMatch( candidate.headToken().speaker().equals(onPrix.headToken().speaker())); } else if (clazz.equals(Feature.NERTagMatch.class)) { return new Feature.NERTagMatch( candidate.headToken().nerTag().equals(onPrix.headToken().nerTag())); } else if (clazz.equals(Feature.CandidatePOSTag.class)) { return new Feature.CandidatePOSTag(candidate.headToken().posTag()); } else if (clazz.equals(Feature.FixedPOSTag.class)) { return new Feature.FixedPOSTag(onPrix.headToken().posTag()); } else if (clazz.equals(Feature.GenderMatch.class)) { Pair<Boolean, Boolean> match = Util.haveGenderAndAreSameGender(onPrix, candidate); boolean finalMatch = (!match.getFirst() || match.getSecond()); return new Feature.GenderMatch(finalMatch); } else if (clazz.equals(Feature.NumberMatch.class)) { Pair<Boolean, Boolean> match = Util.haveNumberAndAreSameNumber(onPrix, candidate); boolean finalMatch = (!match.getFirst() || match.getSecond()); return new Feature.NumberMatch(finalMatch); } // } else if(clazz.equals(Feature.NewFeature.class) { /* * TODO: Add features to return for specific classes. Implement calculating values of features here. */ else { throw new IllegalArgumentException("Unregistered feature: " + clazz); } }
/** Process discourse information */ protected void processDiscourse(Dictionaries dict) { docType = findDocType(dict); markQuotations(this.annotation.get(CoreAnnotations.SentencesAnnotation.class), false); findSpeakers(dict); // find 'speaker mention' for each mention for (Mention m : allPredictedMentions.values()) { int utter = m.headWord.get(CoreAnnotations.UtteranceAnnotation.class); String speaker = m.headWord.get(CoreAnnotations.SpeakerAnnotation.class); if (speaker != null) { // Populate speaker info SpeakerInfo speakerInfo = speakerInfoMap.get(speaker); if (speakerInfo == null) { speakerInfoMap.put(speaker, speakerInfo = new SpeakerInfo(speaker)); // span indicates this is the speaker if (Rules.mentionMatchesSpeaker(m, speakerInfo, true)) { m.speakerInfo = speakerInfo; } } if (NumberMatchingRegex.isDecimalInteger(speaker)) { try { int speakerMentionID = Integer.parseInt(speaker); if (utter != 0) { // Add pairs of mention id and the mention id of the speaker speakerPairs.add(new Pair<Integer, Integer>(m.mentionID, speakerMentionID)); // speakerPairs.add(new Pair<Integer, Integer>(speakerMentionID, // m.mentionID)); } } catch (Exception e) { // no mention found for the speaker // nothing to do } } } // set generic 'you' : e.g., you know in conversation if (docType != DocType.ARTICLE && m.person == Person.YOU && m.endIndex < m.sentenceWords.size() - 1 && m.sentenceWords .get(m.endIndex) .get(CoreAnnotations.TextAnnotation.class) .equalsIgnoreCase("know")) { m.generic = true; } } // now that we have identified the speakers, first pass to check if mentions should cluster with // the speakers for (Mention m : allPredictedMentions.values()) { if (m.speakerInfo == null) { for (SpeakerInfo speakerInfo : speakerInfoMap.values()) { if (speakerInfo.hasRealSpeakerName()) { // do loose match - assumes that there isn't that many speakers.... if (Rules.mentionMatchesSpeaker(m, speakerInfo, false)) { m.speakerInfo = speakerInfo; break; } } } } } }