/**
 * Builds a CooccurenceMap by iterating over the documents of the collection. It counts document
 * co-occurrence, i.e. it does not consider the frequency of two terms within a document.
 * Complexity: O(d * t^2 / 2) = O(d t^2), where d is the number of documents in the collection
 * and t is the average number of terms per document (i.e. t = average document length).
 */
public void build_full_cooccurencemap_docversion() throws IOException {
  PostingIndex di = index.getDirectIndex();
  DocumentIndex doi = index.getDocumentIndex();
  Lexicon<String> lex = index.getLexicon();
  for (int docid = 0; docid < doi.getNumberOfDocuments(); docid++) {
    if (docid % 1000 == 0)
      System.out.println(
          "Processing... " + 100.0 * ((double) docid) / doi.getNumberOfDocuments() + "%");
    IterablePosting postings = di.getPostings(doi.getDocumentEntry(docid));
    Vector<String> seenterms = new Vector<String>();
    while (postings.next() != IterablePosting.EOL) {
      Map.Entry<String, LexiconEntry> lee = lex.getLexiconEntry(postings.getId());
      String termw = lee.getKey();
      if (lee.getValue().getFrequency() < this.rarethreshold
          || lee.getValue().getFrequency() > this.topthreshold) continue;
      HashMap<String, Integer> w_cooccurence = new HashMap<String, Integer>();
      if (this.cooccurencemap.containsKey(termw)) {
        w_cooccurence = this.cooccurencemap.get(termw);
        this.cooccurencemap.remove(termw);
      }
      Iterator<String> it = seenterms.iterator();
      while (it.hasNext()) {
        String termu = it.next();
        int count = 1;
        if (w_cooccurence.containsKey(termu)) {
          count = count + w_cooccurence.get(termu);
          w_cooccurence.remove(termu);
        }
        w_cooccurence.put(termu, count);
        // System.out.println(termw + ": " + w_cooccurence);
        // and now the symmetric update for termu
        HashMap<String, Integer> u_cooccurence = new HashMap<String, Integer>();
        if (cooccurencemap.containsKey(termu)) {
          u_cooccurence = cooccurencemap.get(termu);
          cooccurencemap.remove(termu);
        }
        int countu = 1;
        if (u_cooccurence.containsKey(termw)) {
          countu = countu + u_cooccurence.get(termw);
          u_cooccurence.remove(termw);
        }
        u_cooccurence.put(termw, countu); // was put(termw, count): the symmetric entry must use countu
        cooccurencemap.put(termu, u_cooccurence);
        // System.out.println(termu + ": " + u_cooccurence);
      }
      cooccurencemap.put(termw, w_cooccurence);
      seenterms.add(termw); // only terms within the frequency thresholds are added
    }
  }
}
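/*
 * Illustrative sketch (not part of the original class): the symmetric increment performed above
 * can be expressed as a single helper. It assumes cooccurencemap is a Map<String, HashMap<String,
 * Integer>>, as suggested by its usage above; the helper name is hypothetical. Using put() to
 * overwrite existing entries also avoids the remove()/put() pair on each update.
 */
private static void incrementSymmetric(
    Map<String, HashMap<String, Integer>> map, String termw, String termu) {
  HashMap<String, Integer> w = map.get(termw);
  if (w == null) {
    w = new HashMap<String, Integer>();
    map.put(termw, w);
  }
  Integer cw = w.get(termu);
  w.put(termu, cw == null ? 1 : cw + 1); // put() replaces the old value, no remove() needed

  HashMap<String, Integer> u = map.get(termu);
  if (u == null) {
    u = new HashMap<String, Integer>();
    map.put(termu, u);
  }
  Integer cu = u.get(termw);
  u.put(termw, cu == null ? 1 : cu + 1); // symmetric entry uses its own count
}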
/** Closes the used structures. */ public void close() { try { index.close(); } catch (IOException ioe) { logger.warn("Problem closing index", ioe); } }
/**
 * Creates the inverted index from the already created direct index, document index and lexicon.
 * It saves block information and possibly field information as well.
 *
 * @see org.terrier.indexing.Indexer#createInvertedIndex()
 */
public void createInvertedIndex() {
  if (currentIndex == null) {
    currentIndex = Index.createIndex(path, prefix);
    if (currentIndex == null) {
      logger.error("No index at (" + path + "," + prefix + ") to build an inverted index for");
      return; // without an index there is nothing to invert
    }
  }
  long beginTimestamp = System.currentTimeMillis();
  if (currentIndex.getCollectionStatistics().getNumberOfUniqueTerms() == 0) {
    logger.error("Index has no terms. Inverted index creation aborted.");
    return;
  }
  if (currentIndex.getCollectionStatistics().getNumberOfDocuments() == 0) {
    logger.error("Index has no documents. Inverted index creation aborted.");
    return;
  }
  logger.info("Started building the block inverted index...");
  invertedIndexBuilder = new BlockInvertedIndexBuilder(currentIndex, "inverted");
  invertedIndexBuilder.createInvertedIndex();
  this.finishedInvertedIndexBuild();
  try {
    currentIndex.flush();
  } catch (IOException ioe) {
    logger.error("Cannot flush index: ", ioe);
  }
  long endTimestamp = System.currentTimeMillis();
  logger.info("Finished building the block inverted index...");
  long seconds = (endTimestamp - beginTimestamp) / 1000;
  logger.info("Time elapsed for inverted file: " + seconds + " seconds");
}
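/*
 * Minimal usage sketch (assumption, not part of the original class): driving this method against
 * an index whose direct structures already exist on disk at path/prefix. The BlockIndexer
 * construction is an assumption about how this indexer is instantiated; the variable names are
 * hypothetical.
 */
// BlockIndexer indexer = new BlockIndexer(path, prefix);
// indexer.createInvertedIndex(); // inverts the existing direct index in place
// Index built = Index.createIndex(path, prefix);
// System.out.println(built.getCollectionStatistics().getNumberOfUniqueTerms() + " unique terms");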
/** Loads the index(es) from disk. */
protected void loadIndex() {
  long startLoading = System.currentTimeMillis();
  index = Index.createIndex();
  if (index == null) {
    logger.fatal("Failed to load index. Perhaps index files are missing");
  }
  long endLoading = System.currentTimeMillis();
  if (logger.isInfoEnabled())
    logger.info("time to initialise index : " + ((endLoading - startLoading) / 1000.0D));
}
// returns the set of documents in which term v occurs - binary (presence-only) version
Set<Integer> occursin_binary(String v) throws IOException {
  Set<Integer> vecv = new HashSet<Integer>();
  Lexicon<String> lex = index.getLexicon();
  LexiconEntry le = lex.getLexiconEntry(v);
  IterablePosting postings = inv.getPostings(le);
  while (postings.next() != IterablePosting.EOL) {
    vecv.add(postings.getId());
  }
  return vecv;
}
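/*
 * Illustrative sketch (not part of the original class): the document-level co-occurrence of two
 * terms, obtained by intersecting the posting sets returned by occursin_binary above. This is the
 * same intersection step used per term pair in build_full_cooccurencemap below; the method name
 * is hypothetical and it assumes the same index/inv fields as the surrounding code.
 */
int documentCooccurrence(String termw, String termu) throws IOException {
  Set<Integer> docsofw = occursin_binary(termw);
  Set<Integer> docsofu = occursin_binary(termu);
  Set<Integer> intersection = new HashSet<Integer>(docsofw); // copy before retainAll
  intersection.retainAll(docsofu);
  return intersection.size(); // number of documents containing both terms
}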
/**
 * Prints the results
 *
 * @param pw PrintWriter the file to write the results to.
 * @param q SearchRequest the search request to get results from.
 */
public void printResults(PrintWriter pw, SearchRequest q) throws IOException {
  ResultSet set = q.getResultSet();
  int[] docids = set.getDocids();
  double[] scores = set.getScores();
  int minimum = RESULTS_LENGTH;
  // if the minimum number of documents to display is larger than the
  // number of documents in the result set, cap it at the result size
  if (minimum > set.getResultSize()) minimum = set.getResultSize();
  if (verbose)
    if (set.getResultSize() > 0)
      pw.write("\n\tDisplaying 1-" + set.getResultSize() + " results\n");
    else pw.write("\n\tNo results\n");
  if (set.getResultSize() == 0) return;
  int metaKeyId = 0;
  final int metaKeyCount = metaKeys.length;
  String[][] docNames = new String[metaKeyCount][];
  for (String metaIndexDocumentKey : metaKeys) {
    if (set.hasMetaItems(metaIndexDocumentKey)) {
      docNames[metaKeyId] = set.getMetaItems(metaIndexDocumentKey);
    } else {
      final MetaIndex metaIndex = index.getMetaIndex();
      docNames[metaKeyId] = metaIndex.getItems(metaIndexDocumentKey, docids);
    }
    metaKeyId++;
  }
  StringBuilder sbuffer = new StringBuilder();
  // the results are ordered in ascending order with respect to the score:
  // the document with the highest score has score scores[scores.length - 1]
  // and docid docids[docids.length - 1].
  int start = 0;
  int end = minimum;
  for (int i = start; i < end; i++) {
    if (scores[i] <= 0d) continue;
    sbuffer.append(i);
    sbuffer.append(" ");
    for (metaKeyId = 0; metaKeyId < metaKeyCount; metaKeyId++) {
      sbuffer.append(docNames[metaKeyId][i]);
      sbuffer.append(" ");
    }
    sbuffer.append(docids[i]);
    sbuffer.append(" ");
    sbuffer.append(scores[i]);
    sbuffer.append('\n');
  }
  // System.out.println(sbuffer.toString());
  pw.write(sbuffer.toString());
  pw.flush();
  // pw.write("finished outputting\n");
}
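/*
 * Example of the line format written by printResults above (values are illustrative only):
 * each result line is "<rank> <meta item(s)> <docid> <score>", e.g. with metaKeys = {"docno"}:
 *
 *   0 GX001-02-1234567 1247748 12.8731
 *   1 GX220-75-0000000 33012 11.0402
 */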
// returns the documents in which term v occurs, mapped to the within-document frequency of v
HashMap<Integer, Integer> occursin(String v) throws IOException {
  HashMap<Integer, Integer> docsofv = new HashMap<Integer, Integer>();
  // MetaIndex meta = index.getMetaIndex();
  Lexicon<String> lex = index.getLexicon();
  LexiconEntry lev = lex.getLexiconEntry(v);
  IterablePosting postings = inv.getPostings(lev);
  while (postings.next() != IterablePosting.EOL) {
    docsofv.put(postings.getId(), postings.getFrequency());
  }
  return docsofv;
}
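/*
 * Illustrative sketch (not part of the original class): a frequency-aware co-occurrence count
 * built on occursin above, summing min(tf_w, tf_u) over the documents that contain both terms.
 * It mirrors the commented-out frequency-counting variant in build_full_cooccurencemap; the
 * method name is hypothetical.
 */
int frequencyCooccurrence(String termw, String termu) throws IOException {
  HashMap<Integer, Integer> docsofw = occursin(termw);
  HashMap<Integer, Integer> docsofu = occursin(termu);
  // iterate over the smaller posting map for efficiency
  HashMap<Integer, Integer> smaller = docsofw.size() <= docsofu.size() ? docsofw : docsofu;
  HashMap<Integer, Integer> larger = (smaller == docsofw) ? docsofu : docsofw;
  int count = 0;
  for (Map.Entry<Integer, Integer> e : smaller.entrySet()) {
    Integer other = larger.get(e.getKey());
    if (other != null) count += Math.min(e.getValue(), other); // both terms occur in this doc
  }
  return count;
}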
/** Extracts the docnos for the given result set. */
protected String[] obtainDocnos(
    final String metaIndexDocumentKey, final SearchRequest q, final ResultSet set)
    throws IOException {
  String[] docnos;
  if (set.hasMetaItems(metaIndexDocumentKey)) {
    docnos = set.getMetaItems(metaIndexDocumentKey);
  } else {
    final MetaIndex metaIndex = index.getMetaIndex();
    docnos = metaIndex.getItems(metaIndexDocumentKey, set.getDocids());
  }
  return docnos;
}
/**
 * Runs the actual query expansion
 *
 * @see
 * org.terrier.querying.PostProcess#process(org.terrier.querying.Manager,org.terrier.querying.SearchRequest)
 */
public void process(Manager manager, SearchRequest q) {
  Index index = getIndex(manager);
  lastIndex = index;
  documentIndex = index.getDocumentIndex();
  invertedIndex = index.getInvertedIndex();
  lexicon = index.getLexicon();
  collStats = index.getCollectionStatistics();
  directIndex = index.getDirectIndex();
  metaIndex = index.getMetaIndex();
  if (directIndex == null) {
    logger.error("This index does not have a direct index. Query expansion disabled!!");
    return;
  }
  logger.debug("Starting query expansion post-processing.");
  // get the query expansion model to use
  String qeModel = q.getControl("qemodel");
  if (qeModel == null || qeModel.length() == 0) {
    logger.warn(
        "qemodel control not set for QueryExpansion post process. Using default model Bo1");
    qeModel = "Bo1";
  }
  setQueryExpansionModel(getQueryExpansionModel(qeModel));
  if (logger.isDebugEnabled()) {
    logger.info("query expansion model: " + QEModel.getInfo());
  }
  MatchingQueryTerms queryTerms = ((Request) q).getMatchingQueryTerms();
  if (queryTerms == null) {
    logger.warn("No query terms for this query. Skipping QE");
    return;
  }
  // get the expanded query terms
  try {
    expandQuery(queryTerms, (Request) q);
  } catch (IOException ioe) {
    logger.error("IOException while expanding query, skipping QE", ioe);
    return;
  }
  if (logger.isDebugEnabled()) {
    logger.info("query length after expansion: " + queryTerms.length());
    logger.info("Expanded query: ");
  }
  final String[] newQueryTerms = queryTerms.getTerms();
  StringBuilder newQuery = new StringBuilder();
  for (int i = 0; i < newQueryTerms.length; i++) {
    try {
      if (logger.isDebugEnabled()) {
        logger.info(
            (i + 1)
                + ": "
                + newQueryTerms[i]
                + ", normalisedFrequency: "
                + Rounding.toString(queryTerms.getTermWeight(newQueryTerms[i]), 4));
      }
      newQuery.append(newQueryTerms[i]);
      newQuery.append('^');
      newQuery.append(Rounding.toString(queryTerms.getTermWeight(newQueryTerms[i]), 9));
      newQuery.append(' ');
    } catch (NullPointerException npe) {
      logger.error("NullPointerException occurred in Query Expansion dumping of new Query", npe);
    }
  }
  logger.debug("NEWQUERY " + q.getQueryID() + " " + newQuery.toString());
  lastExpandedQuery = newQuery.toString();
  q.setControl("QE.ExpandedQuery", newQuery.toString());
  final boolean no2ndPass =
      Boolean.parseBoolean(ApplicationSetup.getProperty("qe.no.2nd.matching", "false"));
  if (no2ndPass) {
    return;
  }
  // run the retrieval process again for the expanded query
  logger.info("Accessing inverted file for expanded query " + q.getQueryID());
  manager.runMatching(q);
}
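/*
 * Illustrative sketch (not part of the original class): the loop above serialises the expanded
 * query as whitespace-separated "term^weight" pairs, which is what ends up in the
 * QE.ExpandedQuery control and the NEWQUERY debug line, e.g.
 * "information^0.152340112 retrieval^0.140098871 expansion^0.087654321" (values illustrative).
 * A hypothetical helper that reads this format back into a term-to-weight map:
 */
static Map<String, Double> parseExpandedQuery(String expanded) {
  Map<String, Double> weights = new HashMap<String, Double>();
  for (String pair : expanded.trim().split("\\s+")) {
    int caret = pair.lastIndexOf('^');
    if (caret <= 0) continue; // skip malformed or empty tokens
    weights.put(pair.substring(0, caret), Double.parseDouble(pair.substring(caret + 1)));
  }
  return weights;
}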
public void readIndex() throws IOException { index = terrierIndexFactory.readIndex(indexId, language); documentIndex = index.getDocumentIndex(); metaIndex = index.getMetaIndex(); }
/** * @param args * @throws IOException * @throws InterruptedException */ public static void main(String[] args) throws IOException, InterruptedException { System.out.println("Usage: "); System.out.println("args[0]: path to terrier.home"); System.out.println("args[1]: path to index"); System.out.println("args[2]: path to trec query file"); System.out.println("args[3]: path to result file (including name of result file)"); System.setProperty("terrier.home", args[0]); Index index = Index.createIndex(args[1], "data"); System.out.println(index.getEnd()); HashMap<String, String> trecqueries = new HashMap<String, String>(); BufferedReader br = new BufferedReader(new FileReader(args[2])); String line = null; while ((line = br.readLine()) != null) { String[] input = line.split(" "); String qid = input[0]; String query = ""; for (int i = 1; i < input.length; i++) query = query + " " + input[i]; query = query.replaceAll("-", " "); query = query.replaceAll("\\p{Punct}", ""); query = query.substring(1, query.length()); trecqueries.put(qid, query.toLowerCase()); } br.close(); double[] muvalues = {100.0, 500.0, 1000.0, 1500.0, 2000.0, 2500.0, 3000.0, 3500.0, 4000.0}; for (int i = 0; i < muvalues.length; i++) { double mu = muvalues[i]; TranslationLMManager tlm = new TranslationLMManager(index); tlm.setTranslation("null"); tlm.setDirMu(mu); TRECDocnoOutputFormat TRECoutput = new TRECDocnoOutputFormat(index); PrintWriter pt = new PrintWriter(new File(args[3] + "_dir_mu_" + String.valueOf(mu) + ".txt")); /* TranslationLMManager tlm_theory = new TranslationLMManager(index); tlm_theory.setTranslation("dir_theory"); tlm_theory.setDirMu(mu); TRECDocnoOutputFormat TRECoutput_theory = new TRECDocnoOutputFormat(index); PrintWriter pt_theory = new PrintWriter(new File(args[3]+"_dir_theory_mu_" + String.valueOf(mu) + ".txt")); */ for (String qid : trecqueries.keySet()) { String query = trecqueries.get(qid); System.out.println(query + " - " + qid); System.out.println("Scoring with Dir LM; mu=" + mu); // scoring with LM dir Request rq = new Request(); rq.setOriginalQuery(query); rq.setIndex(index); rq.setQueryID(qid); rq = tlm.runMatching(rq, "null", "dir"); /* DocumentIndex doi = index.getDocumentIndex(); MetaIndex meta = index.getMetaIndex(); int docid = 1247748; //docids are 0-based DocumentIndexEntry die = doi.getDocumentEntry(docid); System.out.println(meta.getItem("docno", docid) + ":" + die.getDocumentLength()); die = doi.getDocumentEntry(docid+1); System.out.println(meta.getItem("docno", docid+1) + ":" + die.getDocumentLength()); meta. die = doi.getDocumentEntry(docid+2); System.out.println(meta.getItem("docno", docid+2) + ":" + die.getDocumentLength()); */ TRECoutput.printResults(pt, rq, "dir", "Q0", 1000); /* Request rq_theory = new Request(); rq_theory.setOriginalQuery(query); rq_theory.setIndex(index); rq_theory.setQueryID(qid); rq_theory = tlm_theory.runMatching(rq_theory, "dir_theory", "dir"); TRECoutput_theory.printResults(pt_theory, rq_theory, "dir_theory", "Q0", 1000); */ } pt.flush(); pt.close(); // pt_theory.flush(); // pt_theory.close(); } }
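/*
 * Illustrative note (assumption): the query-file parsing loop above expects one topic per line,
 * with the topic id first and the query terms after it, separated by single spaces, e.g.
 *
 *   301 international organized crime
 *   302 poliomyelitis and post polio
 *
 * Hyphens are turned into spaces and all remaining punctuation is stripped before the query is
 * lower-cased and stored.
 */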
// TODO if this class extends BasicIndexer, then this method could be inherited public void createDirectIndex(Collection[] collections) { logger.info( "BlockIndexer creating direct index" + (Boolean.parseBoolean( ApplicationSetup.getProperty("block.delimiters.enabled", "false")) ? " delimited-block indexing enabled" : "")); currentIndex = Index.createNewIndex(path, prefix); lexiconBuilder = FieldScore.FIELDS_COUNT > 0 ? new LexiconBuilder( currentIndex, "lexicon", new BlockFieldLexiconMap(FieldScore.FIELDS_COUNT), FieldLexiconEntry.class.getName()) : new LexiconBuilder( currentIndex, "lexicon", new BlockLexiconMap(), BlockLexiconEntry.class.getName()); // lexiconBuilder = new BlockLexiconBuilder(currentIndex, "lexicon"); try { directIndexBuilder = FieldScore.FIELDS_COUNT > 0 ? new BlockFieldDirectInvertedOutputStream( currentIndex.getPath() + ApplicationSetup.FILE_SEPARATOR + currentIndex.getPrefix() + "." + "direct" + BitIn.USUAL_EXTENSION) : new BlockDirectInvertedOutputStream( currentIndex.getPath() + ApplicationSetup.FILE_SEPARATOR + currentIndex.getPrefix() + "." + "direct" + BitIn.USUAL_EXTENSION); } catch (IOException ioe) { logger.error("Cannot make DirectInvertedOutputStream:", ioe); } docIndexBuilder = new DocumentIndexBuilder(currentIndex, "document"); metaBuilder = createMetaIndexBuilder(); emptyDocIndexEntry = (FieldScore.FIELDS_COUNT > 0) ? new FieldDocumentIndexEntry(FieldScore.FIELDS_COUNT) : new BasicDocumentIndexEntry(); // int LexiconCount = 0; int numberOfDocuments = 0; // int numberOfTokens = 0; // long startBunchOfDocuments = System.currentTimeMillis(); final boolean boundaryDocsEnabled = BUILDER_BOUNDARY_DOCUMENTS.size() > 0; boolean stopIndexing = false; for (int collectionNo = 0; !stopIndexing && collectionNo < collections.length; collectionNo++) { Collection collection = collections[collectionNo]; long startCollection = System.currentTimeMillis(); boolean notLastDoc = false; // while(notLastDoc = collection.hasNext()) { while ((notLastDoc = collection.nextDocument())) { // get the next document from the collection // String docid = collection.getDocid(); // Document doc = collection.next(); Document doc = collection.getDocument(); if (doc == null) continue; numberOfDocuments++; // setup for parsing createDocumentPostings(); String term; numOfTokensInDocument = 0; numOfTokensInBlock = 0; blockId = 0; // get each term in the document while (!doc.endOfDocument()) { if ((term = doc.getNextTerm()) != null && !term.equals("")) { termFields = doc.getFields(); // pass term into TermPipeline (stop, stem etc) pipeline_first.processTerm(term); // the term pipeline will eventually add the term to this // object. } if (MAX_TOKENS_IN_DOCUMENT > 0 && numOfTokensInDocument > MAX_TOKENS_IN_DOCUMENT) break; } // if we didn't index all tokens from document, // we need to get to the end of the document. 
      while (!doc.endOfDocument()) doc.getNextTerm();
      // we now have all terms in the DocumentTree
      pipeline_first.reset();
      // process DocumentTree (tree of terms)
      try {
        if (termsInDocument.getDocumentLength() == 0) {
          // this document is empty: add the minimum to the document index
          indexEmpty(doc.getAllProperties());
        } else {
          /* index this document */
          // numberOfTokens += numOfTokensInDocument;
          indexDocument(doc.getAllProperties(), termsInDocument);
        }
      } catch (Exception ioe) {
        logger.error("Failed to index " + doc.getProperty("docno"), ioe);
      }
      if (MAX_DOCS_PER_BUILDER > 0 && numberOfDocuments >= MAX_DOCS_PER_BUILDER) {
        stopIndexing = true;
        break;
      }
      if (boundaryDocsEnabled && BUILDER_BOUNDARY_DOCUMENTS.contains(doc.getProperty("docno"))) {
        stopIndexing = true;
        break;
      }
    }
    long endCollection = System.currentTimeMillis();
    long secs = ((endCollection - startCollection) / 1000);
    logger.info(
        "Collection #" + collectionNo + " took " + secs + " seconds to index "
            + "(" + numberOfDocuments + " documents)\n");
    if (secs > 3600)
      logger.info(
          "Rate: " + ((double) numberOfDocuments / ((double) secs / 3600.0d)) + " docs/hour");
    if (!notLastDoc) {
      try {
        collection.close();
      } catch (IOException e) {
        logger.warn("Couldn't close collection", e);
      }
    }
  }
  /* end of the collection has been reached */
  finishedDirectIndexBuild();
  currentIndex.addIndexStructure(
      "direct",
      "org.terrier.structures.BlockDirectIndex",
      "org.terrier.structures.Index,java.lang.String,java.lang.Class",
      "index,structureName,"
          + (FieldScore.FIELDS_COUNT > 0
              ? fieldDirectIndexPostingIteratorClass
              : basicDirectIndexPostingIteratorClass));
  currentIndex.addIndexStructureInputStream(
      "direct",
      "org.terrier.structures.BlockDirectIndexInputStream",
      "org.terrier.structures.Index,java.lang.String,java.lang.Class",
      "index,structureName,"
          + (FieldScore.FIELDS_COUNT > 0
              ? fieldDirectIndexPostingIteratorClass
              : basicDirectIndexPostingIteratorClass));
  currentIndex.setIndexProperty("index.direct.fields.count", "" + FieldScore.FIELDS_COUNT);
  currentIndex.setIndexProperty(
      "index.direct.fields.names", ArrayUtils.join(FieldScore.FIELD_NAMES, ","));
  if (FieldScore.FIELDS_COUNT > 0) {
    currentIndex.addIndexStructure(
        "document-factory",
        FieldDocumentIndexEntry.Factory.class.getName(),
        "java.lang.String",
        "${index.direct.fields.count}");
  } else {
    currentIndex.addIndexStructure(
        "document-factory", BasicDocumentIndexEntry.Factory.class.getName(), "", "");
  }
  /* flush the index buffers */
  directIndexBuilder.close();
  docIndexBuilder.finishedCollections();
  /* and then merge all the temporary lexicons */
  lexiconBuilder.finishedDirectIndexBuild();
  try {
    metaBuilder.close();
  } catch (IOException ioe) {
    logger.error("Could not finish MetaIndexBuilder: ", ioe);
  }
  if (FieldScore.FIELDS_COUNT > 0) {
    currentIndex.addIndexStructure(
        "lexicon-valuefactory",
        FieldLexiconEntry.Factory.class.getName(),
        "java.lang.String",
        "${index.direct.fields.count}");
  }
  /* reset the in-memory mapping of terms to term codes. */
  TermCodes.reset();
  System.gc();
  try {
    currentIndex.flush();
  } catch (IOException ioe) {
    logger.error("Could not flush index properties: ", ioe);
  }
}
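/*
 * Minimal end-to-end sketch (assumption: createDirectIndex above and createInvertedIndex earlier
 * in this section belong to the same block indexer class). The helper name is hypothetical; it
 * simply chains the two build phases in the order the structures depend on each other.
 */
public void buildFullIndex(Collection[] collections) {
  createDirectIndex(collections); // direct index, lexicon, document and meta structures
  createInvertedIndex(); // then invert the direct index into the inverted file
}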
/** {@inheritDoc} */ public void setIndex(Index index) { metaIndex = index.getMetaIndex(); }
public void set_index(String index_path, String index_prefix) { this.index = Index.createIndex(index_path, index_prefix); this.inv = index.getInvertedIndex(); }
/**
 * Builds a CooccurenceMap by iterating over the vocabulary of the collection. It counts document
 * co-occurrence, i.e. it does not consider the frequency of two terms within a document.
 * Complexity: roughly O(v^2 * p), where v is the number of vocabulary terms within the frequency
 * thresholds and p is the average posting-list length. Note: this currently goes out of heap
 * space on DOTGOV with 5GB of RAM allocated to the JVM.
 */
void build_full_cooccurencemap() throws IOException {
  Lexicon<String> lex = index.getLexicon();
  Iterator<Entry<String, LexiconEntry>> itw = lex.iterator();
  int prcount = 1;
  // iterating over all possible w
  while (itw.hasNext()) {
    Entry<String, LexiconEntry> lw = itw.next();
    String termw = lw.getKey();
    if (lw.getValue().getFrequency() < this.rarethreshold
        || lw.getValue().getFrequency() > this.topthreshold) continue;
    if (prcount % 1000 == 0)
      System.out.println(
          "Processing... "
              + 100.0 * ((double) prcount)
                  / this.index.getCollectionStatistics().getNumberOfUniqueTerms()
              + "%");
    prcount++;
    // LexiconEntry lew = lw.getValue();
    // System.out.println("analysing " + termw);
    HashMap<String, Integer> w_cooccurence = new HashMap<String, Integer>();
    if (cooccurencemap.containsKey(termw)) {
      w_cooccurence = cooccurencemap.get(termw);
      cooccurencemap.remove(termw);
    }
    Set<Integer> docsofw = occursin_binary(termw);
    Iterator<Entry<String, LexiconEntry>> itu = lex.iterator();
    while (itu.hasNext()) {
      Entry<String, LexiconEntry> lu = itu.next();
      String termu = lu.getKey();
      if (lu.getValue().getFrequency() < this.rarethreshold
          || lu.getValue().getFrequency() > this.topthreshold) continue;
      // System.out.println("\tmeasuring co-occurrence with " + termu);
      // LexiconEntry leu = lu.getValue();
      Set<Integer> docsofu = occursin_binary(termu);
      Set<Integer> intersection = new HashSet<Integer>(docsofw); // use the copy constructor
      intersection.retainAll(docsofu);
      int count = intersection.size();
      if (w_cooccurence.containsKey(termu)) {
        count = count + w_cooccurence.get(termu);
        w_cooccurence.remove(termu);
      }
      w_cooccurence.put(termu, count);
      // System.out.println("\t\t" + termw + " " + termu + " = " + count);
      // System.out.println(docsofw.size() + " " + docsofu.size() + " " + diff.entriesInCommon());
      // The next bit of code instead counts frequencies (it requires the HashMap version of
      // occursin, which maps docids to within-document frequencies):
      /*
      if (docsofw.size() <= docsofu.size()) {
        for (Integer docidw : docsofw.keySet()) {
          if (docsofu.containsKey(docidw)) { // then w and u co-occur
            Integer count = (Integer) Math.min(docsofw.get(docidw), docsofu.get(docidw));
            if (w_cooccurence.containsKey(termu)) {
              count = count + w_cooccurence.get(termu);
              w_cooccurence.remove(termu);
            }
            w_cooccurence.put(termu, count);
            System.out.println("\t\t" + termw + " " + termu + " = " + count);
          }
        }
      } else {
        for (Integer docidu : docsofu.keySet()) {
          if (docsofw.containsKey(docidu)) { // then w and u co-occur
            Integer count = (Integer) Math.min(docsofw.get(docidu), docsofu.get(docidu));
            if (w_cooccurence.containsKey(termu)) {
              count = count + w_cooccurence.get(termu);
              w_cooccurence.remove(termu);
            }
            w_cooccurence.put(termu, count);
            System.out.println("\t\t" + termw + " " + termu + " = " + count);
          }
        }
      }
      */
    }
    cooccurencemap.put(termw, w_cooccurence);
    // System.out.println(termw + ": " + w_cooccurence);
  }
}
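/*
 * Worked cost comparison (figures are illustrative assumptions, not measurements): with
 * v = 100,000 vocabulary terms inside the thresholds and an average posting-list length of
 * p = 1,000 documents, the vocabulary-driven build above performs on the order of v^2 = 10^10
 * pair intersections, each touching ~p postings, i.e. ~10^13 set operations. The document-driven
 * version, build_full_cooccurencemap_docversion, is O(d * t^2): for d = 1,000,000 documents of
 * average length t = 400 terms that is ~1.6 * 10^11 map updates. Under these assumptions the
 * document-driven variant is the cheaper of the two, which matches the out-of-heap note above.
 */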