public void setGroundMention(ArrayList<XMLTagInfo> groundtruth) throws Exception {
  groundMention.clear();
  keywords.clear();
  for (int i = 0; i < groundtruth.size(); i++) {
    int off = groundtruth.get(i).offset;
    int len = groundtruth.get(i).length;
    groundMention.add(document.substring(off, off + len));

    Mention m = new Mention();
    // Set name/offset/length before they are used to build the context strings
    // (the previous ordering read m.offset and m.name before they were assigned).
    m.name = document.substring(off, off + len);
    m.offset = off;
    m.length = len;

    // Wide context window around the mention, clipped to the document bounds.
    int context_lo = Math.max(0, off - contextSize);
    int context_hi = Math.min(document.length() - 1, off + contextSize);
    String contextString = document.substring(context_lo, context_hi);
    m.context = contextString.replaceAll("[^0-9a-z\\sA-Z/\\-]", "").toLowerCase();

    // Narrow (+/- 10 character) window immediately around the mention.
    int con_lo = Math.max(0, m.offset - 10);
    int con_hi = Math.min(document.length() - 1, m.offset + 10);
    m.contextAroundMention =
        document.substring(con_lo, con_hi).replaceAll("[^0-9a-z\\sA-Z]", " ").toLowerCase();
    m.contextAroundMention += " " + m.name.replaceAll("[^0-9a-z\\sA-Z]", " ").toLowerCase();

    // Strings are immutable, so the replaceAll results must be assigned back.
    m.context = m.context.replaceAll("\\sand", "").replaceAll("\\snot", "");
    m.contextAroundMention =
        m.contextAroundMention.replaceAll("\\sand", "").replaceAll("\\snot", "");

    keywords.add(m);
  }
}
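/*
 * Illustration only: both context fields above are cleaned with the same kind of character
 * whitelist. The helper below is a minimal sketch of that normalization as a reusable method;
 * the name normalizeContextSketch is hypothetical and not part of the original class.
 */
private static String normalizeContextSketch(String raw) {
  // keep digits, letters, whitespace, '/' and '-', then lower-case the result
  return raw.replaceAll("[^0-9a-z\\sA-Z/\\-]", "").toLowerCase();
}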
// for collective training, as we already have ground mentions
public void setKeywordsTraining(
    HashMap<String, ArrayList<XMLTagInfo>> groundMapWiki,
    HashMap<String, ArrayList<XMLTagInfo>> groundMapManual,
    String file) {
  ArrayList<XMLTagInfo> mapForTrainFile = groundMapWiki.get(file);
  for (int i = 0; i < mapForTrainFile.size(); i++) {
    Mention mention = new Mention();
    mention.key = mapForTrainFile.get(i).mention;
    mention.name = mapForTrainFile.get(i).mention;
    mention.length = mapForTrainFile.get(i).length;
    mention.offset = mapForTrainFile.get(i).offset;
    if (null == mention.name)
      mention.name = document.substring(mention.offset, mention.offset + mention.length);
    if (mention.offset < document.length() - 1) {
      int context_lo = Math.max(0, mention.offset - contextSize);
      int context_hi = Math.min(document.length() - 1, mention.offset + contextSize);
      String contextString = document.substring(context_lo, context_hi);
      mention.context = contextString.replaceAll("[^0-9a-z\\sA-Z/\\-]", "");
      int con_lo = Math.max(0, mention.offset - 10);
      int con_hi = Math.min(document.length() - 1, mention.offset + 10);
      mention.contextAroundMention =
          document.substring(con_lo, con_hi).replaceAll("[^0-9a-z\\sA-Z]", " ");
      mention.contextAroundMention += " " + mention.name.replaceAll("[^0-9a-z\\sA-Z]", " ");
      keywords.add(mention);
    }
  }

  if (groundMapManual != null) {
    ArrayList<XMLTagInfo> mapForTrainFile1 = groundMapManual.get(file);
    for (int i = 0; i < mapForTrainFile1.size(); i++) {
      Mention mention = new Mention();
      mention.key = mapForTrainFile1.get(i).mention;
      mention.name = mapForTrainFile1.get(i).mention;
      mention.length = mapForTrainFile1.get(i).mention.length();
      mention.offset = mapForTrainFile1.get(i).offset;
      int context_lo = Math.max(0, mention.offset - contextSize);
      int context_hi = Math.min(document.length() - 1, mention.offset + contextSize);
      String contextString = document.substring(context_lo, context_hi);
      mention.context = contextString.replaceAll("[^0-9a-z\\sA-Z/\\-]", "");
      int con_lo = Math.max(0, mention.offset - 10);
      int con_hi = Math.min(document.length() - 1, mention.offset + 10);
      mention.contextAroundMention =
          document.substring(con_lo, con_hi).replaceAll("[^0-9a-z\\sA-Z]", " ");
      mention.contextAroundMention += " " + mention.name.replaceAll("[^0-9a-z\\sA-Z]", " ");
      keywords.add(mention);
    }
  }
}
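/*
 * Usage sketch for setKeywordsTraining (illustrative only; it assumes XMLTagInfo exposes
 * public mention/offset/length fields and a no-arg constructor, which matches how the fields
 * are read above but is not otherwise confirmed here):
 */
// XMLTagInfo tag = new XMLTagInfo();
// tag.mention = "Barack Obama";
// tag.offset = 120;
// tag.length = 12;
// HashMap<String, ArrayList<XMLTagInfo>> wiki = new HashMap<String, ArrayList<XMLTagInfo>>();
// wiki.put("doc42.xml", new ArrayList<XMLTagInfo>(java.util.Arrays.asList(tag)));
// setKeywordsTraining(wiki, null, "doc42.xml");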
public void setKeywordsWikiMiner() {
  try {
    WikipediaAnnotator annotator = new WikipediaAnnotator();
    long annstartTime = System.currentTimeMillis();
    HashMap<String, Label.Sense[]> ment2ent = annotator.annotate(document);
    long annendTime = System.currentTimeMillis();
    long diff1 = annendTime - annstartTime;
    System.out.println("Time taken by annotator : " + diff1 + " milliseconds");

    for (String key : ment2ent.keySet()) {
      // (disabled) An earlier version filtered the senses down to those whose title resolves
      // to a Freebase id via WikiToFreebaseIDMap.getInstance().getFreeBaseID(...) before
      // wrapping them in a LabelSense.
      LabelSense senses = new LabelSense(ment2ent.get(key));

      // Keys are of the form "<mention>_<offset>".
      Mention mention = new Mention();
      String ment = key.split("_")[0];
      int off = Integer.parseInt(key.split("_")[1]);
      mention.key = ment;
      mention.name = ment;
      mention.length = ment.length();
      mention.offset = off;
      mention.context = getContext(off, mention.length, contextSize);
      mention.contextAroundMention = getContext(off, mention.length, 10);
      mention.senses = senses;
      keywords.add(mention);
    }
  } catch (Exception e) {
    e.printStackTrace();
  }
}
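/*
 * getContext(offset, length, window) is defined elsewhere in this class. The sketch below shows
 * one plausible implementation that is consistent with how it is called here, assuming it clips
 * the window to the document bounds and applies the same character cleanup used by the methods
 * above. The name getContextSketch is hypothetical; this is not the project's actual helper.
 */
private String getContextSketch(int offset, int length, int window) {
  int lo = Math.max(0, offset - window);
  int hi = Math.min(document.length(), offset + length + window);
  return document.substring(lo, hi).replaceAll("[^0-9a-z\\sA-Z/\\-]", " ").toLowerCase();
}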
public void consolidateMentions(int maxLength) {
  if (!Config.Server && thesaurus == null) {
    thesaurus = new Wikisaurus();
  }
  ClientWikisauras obj = new ClientWikisauras();

  ArrayList<Mention> mentions = new ArrayList<Mention>();
  mentions.addAll(keywords);
  keywords = new ArrayList<Mention>();

  // token_type: 0 = untouched, 1 = absorbed into a longer mention, 2 = article token.
  Integer[] token_type = new Integer[mentions.size()];
  for (int i = 0; i < token_type.length; i++) token_type[i] = 0;

  int curr_offset = 0;
  String curr_mention = "";
  for (int i = 0; i < mentions.size(); i++) {
    if (token_type[i] != 0) continue;

    curr_offset = mentions.get(i).offset;
    curr_mention = mentions.get(i).name;

    // Build candidate phrases of 1..maxLength tokens starting at the current mention.
    String[] allWords = new String[maxLength];
    Integer[] allOffset = new Integer[maxLength];
    String currWord = curr_mention;
    Integer currWordEnd = curr_offset + curr_mention.length() + 1;
    allWords[0] = currWord;
    allOffset[0] = curr_offset;
    int k = 1;
    for (; k < maxLength; k++) {
      currWordEnd = document.indexOf(" ", currWordEnd + 1);
      if (currWordEnd == -1) currWordEnd = document.length();
      if (curr_offset < 0 || curr_offset >= document.length()) {
        k--;
        break;
      }
      currWord = document.substring(curr_offset, currWordEnd);
      allWords[k] = currWord;
      allOffset[k] = currWordEnd;
      if (currWordEnd >= document.length()) break;
    }
    if (k == maxLength) k--;

    // Try the longest candidate phrase first and back off to shorter ones.
    for (; k >= 0; k--) {
      LabelSense senses = null;
      try {
        if (Config.Server) {
          senses = obj.getSenses(allWords[k]);
        } else {
          Label.Sense[] temp = thesaurus.getSenses(allWords[k]);
          if (temp != null) senses = new LabelSense(temp);
          // (disabled) A hard-coded Freebase title lookup (via WikiToFreebaseIDMap) and a
          // Lucene phrase-search fallback (via LuceneIndexWrapper) used to propose additional
          // candidates here; both were commented out because they are too slow for long texts.
        }
      }
      catch (Exception e) {
        e.printStackTrace();
        System.exit(1);
      }
      if (null != senses) {
        // (disabled) An additional filter used to keep only candidates that resolve to a
        // Freebase id; it remains commented out.
        Mention new_mention = new Mention();
        new_mention.name = allWords[k];
        new_mention.length = new_mention.name.length();
        new_mention.offset = curr_offset;
        new_mention.context = getContext(curr_offset, new_mention.length, contextSize);
        new_mention.contextAroundMention = getContext(curr_offset, new_mention.length, 10);
        if (k == 0) new_mention.key = mentions.get(i).key;
        new_mention.senses = senses;

        System.out.println("wikiminer candidate for : " + new_mention.name);
        for (int ic = 0; ic < senses.wikiMinerCandidate.length; ++ic) {
          System.out.println(
              "\t" + senses.wikiMinerCandidate[ic] + " " + senses.wikiMinerProbability[ic]);
        }
        keywords.add(new_mention);

        if (!isArticleToken(curr_mention)) {
          // Mark every original mention covered by the consolidated span as absorbed.
          for (int j = i;
              j < mentions.size()
                  && mentions.get(j).offset < (new_mention.offset + new_mention.length);
              j++) {
            token_type[j] = 1;
          }
        } else {
          token_type[i] = 2;
        }
        break;
      }
    }
    // No sense was found for any candidate phrase: keep the original single-token mention.
    if (token_type[i] == 0 && !isArticleToken(curr_mention) && isValidToken(curr_mention)) {
      keywords.add(mentions.get(i));
    }
  }
}
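/*
 * isArticleToken(...) and isValidToken(...) are referenced above but defined elsewhere in this
 * class. The sketches below show behaviour that would be consistent with how they are used
 * (articles such as "a"/"an"/"the" do not absorb following tokens; a kept single token must
 * contain at least one alphanumeric character). They are illustrative assumptions, not the
 * original implementations, and the "Sketch" suffixes are hypothetical names.
 */
private boolean isArticleTokenSketch(String token) {
  String t = token == null ? "" : token.trim().toLowerCase();
  return t.equals("a") || t.equals("an") || t.equals("the");
}

private boolean isValidTokenSketch(String token) {
  // require a non-empty token with at least one letter or digit
  return token != null && token.trim().matches(".*[0-9a-zA-Z].*");
}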
public void setKeywords(boolean stem) throws Exception {
  tagged_document = tagger.tagString(document);
  ArrayList<String> tokens = new ArrayList<String>();

  StringTokenizer str = new StringTokenizer(tagged_document);
  while (str.hasMoreTokens()) {
    String token = str.nextToken();
    if (token == null || "".equals(token) || " ".equals(token)) continue;
    // Keep the token once if it is a non-stopword or carries a noun/adjective/extra tag
    // (previously non-stopwords were added a second time, producing duplicate mentions).
    if (!Stopwords.isStopword(token.split("_")[0])
        || noun_tags.contains(token.split("_")[1])
        || adj_tags.contains(token.split("_")[1])
        || extra_tags.contains(token.split("_")[1])) {
      tokens.add(token);
    }
  }

  String prev_tag = null; // if the previous token was a noun, an n-gram noun clause could be added
  int curr_offset = 0, currbyte = 0;
  for (int i = 0; i < tokens.size(); i++) {
    if (tokens.get(i) == null) continue;

    Matcher matcher = pattern.matcher(tokens.get(i));
    if (!matcher.find()) {
      // skip tokens that do not match the expected "word_TAG" pattern
      prev_tag = null;
      continue;
    }
    String word = matcher.group(1);
    String tag = matcher.group(2);
    if (word == null || "".equals(word)) {
      prev_tag = null;
      continue;
    }
    String token = word.replaceAll("[^0-9a-z\\sA-Z/\\-]", "");
    if ("".equals(token) || "/".equals(token)) {
      prev_tag = null;
      continue;
    }
    if (!(noun_tags.contains(tag) || adj_tags.contains(tag) || extra_tags.contains(tag))) {
      prev_tag = null;
      continue;
    }

    Mention mention = new Mention();
    if (tag.equals("JJ")) {
      // For adjectives, prefer the corresponding noun form as the key when one exists.
      String temp = TestJAWS.getNounForm(token);
      if (temp != null && !"".equals(temp)) {
        mention.key = temp;
      } else {
        mention.key = token;
      }
      prev_tag = null;
    } else {
      mention.key = token;
    }
    mention.name = word;
    mention.length = word.length();
    curr_offset = document.indexOf(word, curr_offset);
    mention.offset = curr_offset;
    mention.context = getContext(curr_offset, mention.length, contextSize);
    mention.contextAroundMention = getContext(curr_offset, mention.length, 10);
    keywords.add(mention);
  }

  consolidateMentions(6);
}
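/*
 * Typical call order, assuming these methods live on a per-document wrapper object (the
 * enclosing class and its constructor are not shown here, so the names below are illustrative):
 * POS-tag the text and collect candidate mentions, which then merges adjacent tokens into
 * longer mentions via consolidateMentions(6); alternatively, let the Wikipedia annotator
 * propose mentions directly.
 */
// DocumentMentions doc = new DocumentMentions(rawText); // hypothetical constructor
// doc.setKeywords(false);        // tags the document and extracts noun/adjective mentions;
//                                // consolidateMentions(6) runs internally at the end
// doc.setKeywordsWikiMiner();    // or: use the WikipediaAnnotator to propose mentions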