/** * Builds a CooccurenceMap by iterating over the documents of the collection. It counts document * co-occurence, i.e. it doesn't consider the frequency of two terms in a document. Complexity: * O(d * t *t/2) = O(d t^2) where d is the number of documents in the collection and t is the * average number of terms per documents. Note that t = avg doc len */ public void build_full_cooccurencemap_docversion() throws IOException { PostingIndex di = index.getDirectIndex(); DocumentIndex doi = index.getDocumentIndex(); Lexicon<String> lex = index.getLexicon(); for (int docid = 0; docid < doi.getNumberOfDocuments(); docid++) { if (docid % 1000 == 0) System.out.println( "Processing... " + 100.0 * ((double) docid) / doi.getNumberOfDocuments() + "%"); IterablePosting postings = di.getPostings(doi.getDocumentEntry(docid)); Vector<String> seenterms = new Vector<String>(); while (postings.next() != IterablePosting.EOL) { Map.Entry<String, LexiconEntry> lee = lex.getLexiconEntry(postings.getId()); String termw = lee.getKey(); if (lee.getValue().getFrequency() < this.rarethreshold || lee.getValue().getFrequency() > this.topthreshold) continue; HashMap<String, Integer> w_cooccurence = new HashMap<String, Integer>(); if (this.cooccurencemap.containsKey(termw)) { w_cooccurence = this.cooccurencemap.get(termw); this.cooccurencemap.remove(termw); } Iterator<String> it = seenterms.iterator(); while (it.hasNext()) { String termu = it.next(); int count = 1; if (w_cooccurence.containsKey(termu)) { count = count + w_cooccurence.get(termu); w_cooccurence.remove(termu); } w_cooccurence.put(termu, count); // System.out.println(termw + ": " + w_cooccurence); // and now I need to do the symmetric HashMap<String, Integer> u_cooccurence = new HashMap<String, Integer>(); if (cooccurencemap.containsKey(termu)) { u_cooccurence = cooccurencemap.get(termu); cooccurencemap.remove(termu); } int countu = 1; if (u_cooccurence.containsKey(termw)) { countu = countu + u_cooccurence.get(termw); u_cooccurence.remove(termw); } u_cooccurence.put(termw, count); cooccurencemap.put(termu, u_cooccurence); // System.out.println(termu + ": " + u_cooccurence); } cooccurencemap.put(termw, w_cooccurence); seenterms.add(termw); // I add only the termw that are within the thresholds } } }
/** * Runs the actual query expansion * * @see * org.terrier.querying.PostProcess#process(org.terrier.querying.Manager,org.terrier.querying.SearchRequest) */ public void process(Manager manager, SearchRequest q) { Index index = getIndex(manager); lastIndex = index; documentIndex = index.getDocumentIndex(); invertedIndex = index.getInvertedIndex(); lexicon = index.getLexicon(); collStats = index.getCollectionStatistics(); directIndex = index.getDirectIndex(); metaIndex = index.getMetaIndex(); if (directIndex == null) { logger.error("This index does not have a direct index. Query expansion disabled!!"); return; } logger.debug("Starting query expansion post-processing."); // get the query expansion model to use String qeModel = q.getControl("qemodel"); if (qeModel == null || qeModel.length() == 0) { logger.warn( "qemodel control not set for QueryExpansion" + " post process. Using default model Bo1"); qeModel = "Bo1"; } setQueryExpansionModel(getQueryExpansionModel(qeModel)); if (logger.isDebugEnabled()) { logger.info("query expansion model: " + QEModel.getInfo()); } MatchingQueryTerms queryTerms = ((Request) q).getMatchingQueryTerms(); if (queryTerms == null) { logger.warn("No query terms for this query. Skipping QE"); return; } // get the expanded query terms try { expandQuery(queryTerms, (Request) q); } catch (IOException ioe) { logger.error("IOException while expanding query, skipping QE", ioe); return; } if (logger.isDebugEnabled()) { logger.info("query length after expansion: " + queryTerms.length()); logger.info("Expanded query: "); } final String[] newQueryTerms = queryTerms.getTerms(); StringBuilder newQuery = new StringBuilder(); for (int i = 0; i < newQueryTerms.length; i++) { try { if (logger.isDebugEnabled()) { logger.info( (i + 1) + ": " + newQueryTerms[i] + ", normalisedFrequency: " + Rounding.toString(queryTerms.getTermWeight(newQueryTerms[i]), 4)); } newQuery.append(newQueryTerms[i]); newQuery.append('^'); newQuery.append(Rounding.toString(queryTerms.getTermWeight(newQueryTerms[i]), 9)); newQuery.append(' '); } catch (NullPointerException npe) { logger.error("Nullpointer exception occured in Query Expansion dumping of new Query", npe); } } logger.debug("NEWQUERY " + q.getQueryID() + " " + newQuery.toString()); lastExpandedQuery = newQuery.toString(); q.setControl("QE.ExpandedQuery", newQuery.toString()); final boolean no2ndPass = Boolean.parseBoolean(ApplicationSetup.getProperty("qe.no.2nd.matching", "false")); if (no2ndPass) { return; } // run retrieval process again for the expanded query logger.info("Accessing inverted file for expanded query " + q.getQueryID()); manager.runMatching(q); }