// returns the documents where v occurs - binary version (document ids only, no frequencies)
Set<Integer> occursin_binary(String v) throws IOException {
    Set<Integer> vecv = new HashSet<Integer>();
    Lexicon<String> lex = index.getLexicon();
    LexiconEntry le = lex.getLexiconEntry(v);
    if (le == null) return vecv; // term v does not appear in the index
    IterablePosting postings = inv.getPostings(le);
    while (postings.next() != IterablePosting.EOL) {
        vecv.add(postings.getId());
    }
    return vecv;
}
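// Usage sketch (hypothetical helper, not in the original class): the binary posting sets
// returned above give document-level co-occurrence directly via set intersection, assuming
// occursin_binary is available on the same object.
int binary_cooccurrence(String w, String u) throws IOException {
    Set<Integer> both = occursin_binary(w);
    both.retainAll(occursin_binary(u)); // keep only documents containing both terms
    return both.size();
}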
/**
 * Builds a CooccurenceMap by iterating over the documents of the collection. It counts document
 * co-occurrence, i.e. it does not consider the frequency of two terms within a document.
 * Complexity: O(d * t * t/2) = O(d t^2), where d is the number of documents in the collection
 * and t is the average number of terms per document, i.e. the average document length.
 */
public void build_full_cooccurencemap_docversion() throws IOException {
    PostingIndex di = index.getDirectIndex();
    DocumentIndex doi = index.getDocumentIndex();
    Lexicon<String> lex = index.getLexicon();
    for (int docid = 0; docid < doi.getNumberOfDocuments(); docid++) {
        if (docid % 1000 == 0)
            System.out.println("Processing... " + 100.0 * ((double) docid) / doi.getNumberOfDocuments() + "%");
        IterablePosting postings = di.getPostings(doi.getDocumentEntry(docid));
        Vector<String> seenterms = new Vector<String>();
        while (postings.next() != IterablePosting.EOL) {
            Map.Entry<String, LexiconEntry> lee = lex.getLexiconEntry(postings.getId());
            String termw = lee.getKey();
            // skip terms that are too rare or too frequent to be informative
            if (lee.getValue().getFrequency() < this.rarethreshold
                    || lee.getValue().getFrequency() > this.topthreshold) continue;
            HashMap<String, Integer> w_cooccurence = this.cooccurencemap.get(termw);
            if (w_cooccurence == null) w_cooccurence = new HashMap<String, Integer>();
            for (String termu : seenterms) {
                // increment the count of termu in termw's co-occurrence map...
                int count = 1;
                if (w_cooccurence.containsKey(termu)) count += w_cooccurence.get(termu);
                w_cooccurence.put(termu, count);
                // ...and symmetrically, the count of termw in termu's map
                HashMap<String, Integer> u_cooccurence = cooccurencemap.get(termu);
                if (u_cooccurence == null) u_cooccurence = new HashMap<String, Integer>();
                int countu = 1;
                if (u_cooccurence.containsKey(termw)) countu += u_cooccurence.get(termw);
                u_cooccurence.put(termw, countu);
                cooccurencemap.put(termu, u_cooccurence);
            }
            cooccurencemap.put(termw, w_cooccurence);
            seenterms.add(termw); // only terms within the thresholds are added
        }
    }
}
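// The per-pair increment above can be written more compactly with Map.merge (Java 8+).
// A minimal sketch of the same update, shown separately so the original loop stays untouched:
void increment(HashMap<String, Integer> counts, String term) {
    counts.merge(term, 1, Integer::sum); // insert 1, or add 1 to the existing count
}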
// returns the documents where v occurs, mapped to v's frequency in each document
HashMap<Integer, Integer> occursin(String v) throws IOException {
    HashMap<Integer, Integer> docsofv = new HashMap<Integer, Integer>();
    Lexicon<String> lex = index.getLexicon();
    LexiconEntry lev = lex.getLexiconEntry(v);
    if (lev == null) return docsofv; // term v does not appear in the index
    IterablePosting postings = inv.getPostings(lev);
    while (postings.next() != IterablePosting.EOL) {
        docsofv.put(postings.getId(), postings.getFrequency());
    }
    return docsofv;
}
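// Usage sketch (hypothetical helper): unlike occursin_binary, occursin keeps within-document
// frequencies, so the overlap of two terms can be weighted, e.g. by the smaller of the two
// counts in each shared document.
int weighted_cooccurrence(String w, String u) throws IOException {
    HashMap<Integer, Integer> docsw = occursin(w);
    HashMap<Integer, Integer> docsu = occursin(u);
    int total = 0;
    for (Map.Entry<Integer, Integer> e : docsw.entrySet()) {
        Integer fu = docsu.get(e.getKey());
        if (fu != null) total += Math.min(e.getValue(), fu); // both terms occur in this document
    }
    return total;
}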
protected void assignScores(int i, AccumulatorResultSet rs, final IterablePosting postings)
        throws IOException {
    int docid;
    double score;
    // each of the first 16 query terms gets its own bit in the occurrence mask
    short mask = 0;
    if (i < 16) mask = (short) (1 << i);
    while (postings.next() != IterablePosting.EOL) {
        score = plm.score(i);
        docid = postings.getId();
        // a positive score for an unseen document adds it to the retrieved set;
        // a negative score for an already-seen document removes it
        if ((!rs.scoresMap.contains(docid)) && (score > 0.0d)) numberOfRetrievedDocuments++;
        else if ((rs.scoresMap.contains(docid)) && (score < 0.0d)) numberOfRetrievedDocuments--;
        rs.scoresMap.adjustOrPutValue(docid, score, score);
        rs.occurrencesMap.put(docid, (short) (rs.occurrencesMap.get(docid) | mask));
    }
}
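// The occurrencesMap mask records which of the first 16 query terms matched each document.
// A minimal decoding sketch (hypothetical helper, not part of the original class):
static int matchedTerms(short mask) {
    // mask & 0xFFFF zero-extends the short so bitCount sees only the 16 flag bits
    return Integer.bitCount(mask & 0xFFFF);
}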
/** Print a list of the postings to standard out */
@Override
public void print() {
    try {
        int entryIndex = 0;
        while (this.hasNext()) {
            IterablePosting ip = this.next();
            entryIndex += this.getEntriesSkipped();
            System.out.print(entryIndex + " ");
            while (ip.next() != IterablePosting.EOL) {
                System.out.print(ip.toString());
                System.out.print(" ");
            }
            System.out.println();
            entryIndex++;
        }
    } catch (Exception e) {
        logger.error("Error during print()", e);
    }
}
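// Usage sketch, assuming a Terrier 4 on-disk index at the given path and prefix (the path
// and prefix are placeholders; the cast follows Terrier's untyped input-stream accessor):
public static void printInverted(String path, String prefix) throws Exception {
    Index index = Index.createIndex(path, prefix);
    InvertedIndexInputStream invStream =
            (InvertedIndexInputStream) index.getIndexStructureInputStream("inverted");
    invStream.print();
}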
/**
 * Builds the co-occurrence map for an index and writes it to file.
 * @param args args[0] is terrier.home, args[1] the index path, args[2] the output map file
 * @throws IOException
 * @throws ClassNotFoundException
 */
public static void main(String[] args) throws IOException, ClassNotFoundException {
    CooccurenceMap coccmap = new CooccurenceMap();
    System.setProperty("terrier.home", args[0]);
    coccmap.set_index(args[1], "data");
    coccmap.setRarethreshold(500);
    coccmap.setTopthreshold(coccmap.index.getCollectionStatistics().getNumberOfDocuments() / 1000);
    coccmap.build_full_cooccurencemap_docversion();
    coccmap.writemap(args[2]);
    System.out.println("Size written: " + coccmap.cooccurencemap.size());
    /*
    // To verify serialization, the map can instead be read back from file:
    System.out.println("Reading map from file");
    CooccurenceMap coccmapr = coccmap.readmap(args[2]);
    coccmapr.set_index(args[1], "data");
    System.out.println("Size read " + coccmapr.cooccurencemap.size());
    */
    CooccurenceMap coccmapr = coccmap;
    // testing loop: examine only the first 5 terms and check the stored counts against
    // the sizes of the posting-list intersections
    int count = 5;
    for (String w : coccmapr.cooccurencemap.keySet()) {
        if (count > 0) {
            count--;
            System.out.println(w);
            HashMap<String, Integer> w_cooccurence = coccmapr.cooccurencemap.get(w);
            for (String u : w_cooccurence.keySet()) {
                System.out.println("\t" + u + ": " + w_cooccurence.get(u));
                Set<Integer> vecw = new HashSet<Integer>();
                Lexicon<String> lex = coccmapr.index.getLexicon();
                LexiconEntry le = lex.getLexiconEntry(w);
                IterablePosting postings = coccmapr.inv.getPostings(le);
                while (postings.next() != IterablePosting.EOL) {
                    vecw.add(postings.getId());
                }
                Set<Integer> vecu = new HashSet<Integer>();
                LexiconEntry leu = lex.getLexiconEntry(u);
                IterablePosting postingsu = coccmapr.inv.getPostings(leu);
                while (postingsu.next() != IterablePosting.EOL) {
                    vecu.add(postingsu.getId());
                }
                Set<Integer> intersection = new HashSet<Integer>(vecw); // copy constructor
                intersection.retainAll(vecu);
                System.out.println("\tintersection: " + intersection.size()
                        + " size w: " + vecw.size() + " size u: " + vecu.size());
            }
        }
    }
    System.out.println("co-occurrence(holiday,meeting) = " + coccmap.get_w_u("holiday", "meeting"));
    System.out.println("co-occurrence(meeting,holiday) = " + coccmap.get_w_u("meeting", "holiday"));
    System.out.println("co-occurrence(risk,economy) = " + coccmap.get_w_u("risk", "economy"));
    System.out.println("co-occurrence(economy,risk) = " + coccmap.get_w_u("economy", "risk"));
    System.out.println("co-occurrence(dollar,million) = " + coccmap.get_w_u("dollar", "million"));
    System.out.println("co-occurrence(million,dollar) = " + coccmap.get_w_u("million", "dollar"));
}
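// Since the map is built symmetrically, get_w_u(w, u) and get_w_u(u, w) should always agree,
// as the paired printlns above illustrate. A minimal check sketch (hypothetical helper,
// assuming get_w_u returns an int count):
static void checkSymmetry(CooccurenceMap map, String w, String u) {
    int wu = map.get_w_u(w, u);
    int uw = map.get_w_u(u, w);
    if (wu != uw)
        System.err.println("asymmetric counts for (" + w + ", " + u + "): " + wu + " vs " + uw);
}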