// returns the set of documents where the term occurs - binary version (presence only, no frequencies)
Set<Integer> occursin_binary(String v) throws IOException {
  Set<Integer> vecv = new HashSet<Integer>();
  Lexicon<String> lex = index.getLexicon();
  LexiconEntry le = lex.getLexiconEntry(v);
  IterablePosting postings = inv.getPostings(le);
  while (postings.next() != IterablePosting.EOL) {
    vecv.add(postings.getId());
  }
  return vecv;
}
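// Hedged usage sketch (cooccurrence_doccount is a hypothetical helper, not part of the
// original class): the document co-occurrence of two terms is simply the size of the
// intersection of the posting sets returned by occursin_binary().
int cooccurrence_doccount(String w, String u) throws IOException {
  Set<Integer> docsofw = occursin_binary(w);
  docsofw.retainAll(occursin_binary(u)); // keep only documents containing both terms
  return docsofw.size();
}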
/**
 * Builds a CooccurenceMap by iterating over the documents of the collection. It counts document
 * co-occurrence, i.e. it does not consider the frequency of two terms within a document.
 * Complexity: O(d * t * t/2) = O(d t^2), where d is the number of documents in the collection and
 * t is the average number of terms per document (i.e. t = average document length).
 */
public void build_full_cooccurencemap_docversion() throws IOException {
  PostingIndex di = index.getDirectIndex();
  DocumentIndex doi = index.getDocumentIndex();
  Lexicon<String> lex = index.getLexicon();
  for (int docid = 0; docid < doi.getNumberOfDocuments(); docid++) {
    if (docid % 1000 == 0)
      System.out.println(
          "Processing... " + 100.0 * ((double) docid) / doi.getNumberOfDocuments() + "%");
    IterablePosting postings = di.getPostings(doi.getDocumentEntry(docid));
    Vector<String> seenterms = new Vector<String>();
    while (postings.next() != IterablePosting.EOL) {
      Map.Entry<String, LexiconEntry> lee = lex.getLexiconEntry(postings.getId());
      String termw = lee.getKey();
      // skip terms that are too rare or too frequent to be informative
      if (lee.getValue().getFrequency() < this.rarethreshold
          || lee.getValue().getFrequency() > this.topthreshold) continue;
      HashMap<String, Integer> w_cooccurence = new HashMap<String, Integer>();
      if (this.cooccurencemap.containsKey(termw)) {
        w_cooccurence = this.cooccurencemap.get(termw);
        this.cooccurencemap.remove(termw);
      }
      Iterator<String> it = seenterms.iterator();
      while (it.hasNext()) {
        String termu = it.next();
        // increment the count of the pair (termw, termu)
        int count = 1;
        if (w_cooccurence.containsKey(termu)) {
          count = count + w_cooccurence.get(termu);
          w_cooccurence.remove(termu);
        }
        w_cooccurence.put(termu, count);
        // and now the symmetric update for (termu, termw)
        HashMap<String, Integer> u_cooccurence = new HashMap<String, Integer>();
        if (cooccurencemap.containsKey(termu)) {
          u_cooccurence = cooccurencemap.get(termu);
          cooccurencemap.remove(termu);
        }
        int countu = 1;
        if (u_cooccurence.containsKey(termw)) {
          countu = countu + u_cooccurence.get(termw);
          u_cooccurence.remove(termw);
        }
        u_cooccurence.put(termw, countu);
        cooccurencemap.put(termu, u_cooccurence);
      }
      cooccurencemap.put(termw, w_cooccurence);
      seenterms.add(termw); // only terms within the frequency thresholds are added
    }
  }
}
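// Hedged refactoring sketch (increment_pair is a hypothetical helper, not part of the
// original class): the symmetric update in build_full_cooccurencemap_docversion() can be
// expressed as two calls to a single increment, assuming cooccurencemap is declared as a
// HashMap<String, HashMap<String, Integer>>.
private void increment_pair(String a, String b) {
  cooccurencemap
      .computeIfAbsent(a, k -> new HashMap<String, Integer>())
      .merge(b, 1, Integer::sum); // add 1, or start the counter at 1 if absent
}
// Usage inside the seenterms loop: increment_pair(termw, termu); increment_pair(termu, termw);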
// returns the documents where the term occurs, mapped to its within-document frequency
HashMap<Integer, Integer> occursin(String v) throws IOException {
  HashMap<Integer, Integer> docsofv = new HashMap<Integer, Integer>();
  Lexicon<String> lex = index.getLexicon();
  LexiconEntry lev = lex.getLexiconEntry(v);
  IterablePosting postings = inv.getPostings(lev);
  while (postings.next() != IterablePosting.EOL) {
    docsofv.put(postings.getId(), postings.getFrequency());
  }
  return docsofv;
}
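// Hedged sketch (cooccurrence_minfreq is a hypothetical helper, not part of the original
// class): a frequency-aware counterpart to the binary intersection, accumulating
// min(tf_w, tf_u) over shared documents, as in the disabled variant inside
// build_full_cooccurencemap() below.
int cooccurrence_minfreq(String w, String u) throws IOException {
  HashMap<Integer, Integer> docsofw = occursin(w);
  HashMap<Integer, Integer> docsofu = occursin(u);
  // iterate over the smaller map for efficiency
  HashMap<Integer, Integer> small = docsofw.size() <= docsofu.size() ? docsofw : docsofu;
  HashMap<Integer, Integer> large = (small == docsofw) ? docsofu : docsofw;
  int total = 0;
  for (Map.Entry<Integer, Integer> e : small.entrySet()) {
    Integer other = large.get(e.getKey());
    if (other != null) total += Math.min(e.getValue(), other);
  }
  return total;
}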
/**
 * @param args
 * @throws IOException
 * @throws ClassNotFoundException
 */
public static void main(String[] args) throws IOException, ClassNotFoundException {
  CooccurenceMap coccmap = new CooccurenceMap();
  System.setProperty("terrier.home", args[0]);
  coccmap.set_index(args[1], "data");
  coccmap.setRarethreshold(500);
  coccmap.setTopthreshold(coccmap.index.getCollectionStatistics().getNumberOfDocuments() / 1000);
  coccmap.build_full_cooccurencemap_docversion();
  coccmap.writemap(args[2]);
  System.out.println("Size written " + coccmap.cooccurencemap.size());

  /*
  System.out.println("Reading map from file");
  CooccurenceMap coccmapr = coccmap.readmap(args[2]);
  coccmapr.set_index(args[1], "data");
  System.out.println("Size read " + coccmapr.cooccurencemap.size());
  */
  CooccurenceMap coccmapr = coccmap;

  // Testing loop: examine only the first 5 terms, checking each stored co-occurrence
  // count against the posting-list intersection computed from the inverted index.
  int count = 5;
  for (String w : coccmapr.cooccurencemap.keySet()) {
    if (count > 0) {
      count--;
      System.out.println(w);
      HashMap<String, Integer> w_cooccurence = coccmapr.cooccurencemap.get(w);
      for (String u : w_cooccurence.keySet()) {
        System.out.println("\t" + u + ": " + w_cooccurence.get(u));
        Set<Integer> vecw = new HashSet<Integer>();
        Lexicon<String> lex = coccmapr.index.getLexicon();
        LexiconEntry le = lex.getLexiconEntry(w);
        IterablePosting postings = coccmapr.inv.getPostings(le);
        while (postings.next() != IterablePosting.EOL) {
          vecw.add(postings.getId());
        }
        Set<Integer> vecu = new HashSet<Integer>();
        LexiconEntry leu = lex.getLexiconEntry(u);
        IterablePosting postingsu = coccmapr.inv.getPostings(leu);
        while (postingsu.next() != IterablePosting.EOL) {
          vecu.add(postingsu.getId());
        }
        Set<Integer> intersection = new HashSet<Integer>(vecw); // use the copy constructor
        intersection.retainAll(vecu);
        System.out.println(
            "\tintersection: " + intersection.size()
                + " size w: " + vecw.size()
                + " size u: " + vecu.size());
      }
    }
  }

  // spot checks: the stored counts should be symmetric in the two terms
  System.out.println("co-occurrence(holiday,meeting) = " + coccmap.get_w_u("holiday", "meeting"));
  System.out.println("co-occurrence(meeting,holiday) = " + coccmap.get_w_u("meeting", "holiday"));
  System.out.println("co-occurrence(risk,economy) = " + coccmap.get_w_u("risk", "economy"));
  System.out.println("co-occurrence(economy,risk) = " + coccmap.get_w_u("economy", "risk"));
  System.out.println("co-occurrence(dollar,million) = " + coccmap.get_w_u("dollar", "million"));
  System.out.println("co-occurrence(million,dollar) = " + coccmap.get_w_u("million", "dollar"));
}
/**
 * Builds a CooccurenceMap by iterating over the vocabulary of the collection. It counts document
 * co-occurrence, i.e. it does not consider the frequency of two terms within a document.
 * Complexity: O(n^2 * p), where n is the number of terms in the vocabulary and p is the average
 * posting-list length, since every pair of terms requires one posting-set intersection. Note:
 * this currently goes out of heap space on DOTGOV with 5GB of RAM allocated to the JVM.
 */
void build_full_cooccurencemap() throws IOException {
  Lexicon<String> lex = index.getLexicon();
  Iterator<Entry<String, LexiconEntry>> itw = lex.iterator();
  int prcount = 1;
  // iterating over all possible w
  while (itw.hasNext()) {
    Entry<String, LexiconEntry> lw = itw.next();
    String termw = lw.getKey();
    if (lw.getValue().getFrequency() < this.rarethreshold
        || lw.getValue().getFrequency() > this.topthreshold) continue;
    if (prcount % 1000 == 0)
      System.out.println(
          "Processing... "
              + 100.0 * ((double) prcount)
                  / this.index.getCollectionStatistics().getNumberOfUniqueTerms()
              + "%");
    prcount++;
    HashMap<String, Integer> w_cooccurence = new HashMap<String, Integer>();
    if (cooccurencemap.containsKey(termw)) {
      w_cooccurence = cooccurencemap.get(termw);
      cooccurencemap.remove(termw);
    }
    Set<Integer> docsofw = occursin_binary(termw);
    // iterating over all possible u
    Iterator<Entry<String, LexiconEntry>> itu = lex.iterator();
    while (itu.hasNext()) {
      Entry<String, LexiconEntry> lu = itu.next();
      String termu = lu.getKey();
      if (lu.getValue().getFrequency() < this.rarethreshold
          || lu.getValue().getFrequency() > this.topthreshold) continue;
      Set<Integer> docsofu = occursin_binary(termu);
      Set<Integer> intersection = new HashSet<Integer>(docsofw); // use the copy constructor
      intersection.retainAll(docsofu);
      int count = intersection.size();
      if (w_cooccurence.containsKey(termu)) {
        count = count + w_cooccurence.get(termu);
        w_cooccurence.remove(termu);
      }
      w_cooccurence.put(termu, count);
      // The disabled variant below counts frequencies instead, accumulating
      // min(tf_w, tf_u) over shared documents; it requires the frequency-aware
      // occursin() maps rather than the binary sets used above.
      /*
      if (docsofw.size() <= docsofu.size()) {
        for (Integer docidw : docsofw.keySet()) {
          if (docsofu.containsKey(docidw)) { // then w and u co-occur
            Integer count = (Integer) Math.min(docsofw.get(docidw), docsofu.get(docidw));
            if (w_cooccurence.containsKey(termu)) {
              count = count + w_cooccurence.get(termu);
              w_cooccurence.remove(termu);
            }
            w_cooccurence.put(termu, count);
            System.out.println("\t\t" + termw + " " + termu + " = " + count);
          }
        }
      } else {
        for (Integer docidu : docsofu.keySet()) {
          if (docsofw.containsKey(docidu)) { // then w and u co-occur
            Integer count = (Integer) Math.min(docsofw.get(docidu), docsofu.get(docidu));
            if (w_cooccurence.containsKey(termu)) {
              count = count + w_cooccurence.get(termu);
              w_cooccurence.remove(termu);
            }
            w_cooccurence.put(termu, count);
            System.out.println("\t\t" + termw + " " + termu + " = " + count);
          }
        }
      }
      */
    }
    cooccurencemap.put(termw, w_cooccurence);
  }
}
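// Hedged optimization sketch (build_cooccurencemap_triangular is hypothetical, not part of
// the original class): document co-occurrence is symmetric, so each unordered pair needs
// only one intersection. A triangular loop over the filtered vocabulary roughly halves the
// work of build_full_cooccurencemap(); filtered_terms is assumed to hold the terms passing
// the rare/top frequency thresholds, and cooccurencemap is assumed to be a
// HashMap<String, HashMap<String, Integer>>.
void build_cooccurencemap_triangular(List<String> filtered_terms) throws IOException {
  for (int i = 0; i < filtered_terms.size(); i++) {
    String termw = filtered_terms.get(i);
    Set<Integer> docsofw = occursin_binary(termw);
    for (int j = i + 1; j < filtered_terms.size(); j++) {
      String termu = filtered_terms.get(j);
      Set<Integer> intersection = new HashSet<Integer>(docsofw);
      intersection.retainAll(occursin_binary(termu));
      int count = intersection.size();
      if (count == 0) continue; // storing only co-occurring pairs also saves heap space
      cooccurencemap.computeIfAbsent(termw, k -> new HashMap<String, Integer>()).put(termu, count);
      cooccurencemap.computeIfAbsent(termu, k -> new HashMap<String, Integer>()).put(termw, count);
    }
  }
}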