public FatResultsMatching(Index i) throws IOException {
  filename = ApplicationSetup.getProperty("fat.results.matching.file", null);
  maxResults =
      Integer.parseInt(ApplicationSetup.getProperty("fat.results.matching.max.results", "0"));
  if (filename == null)
    throw new IllegalArgumentException("fat.results.matching.file needs to be specified");
  logger.info("Reading fat resultsets from " + filename);
  dis = new DataInputStream(Files.openFileStream(filename));
}
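// A hedged configuration sketch: FatResultsMatching replays previously saved
// "fat" result sets from a file rather than matching against an index. The
// property keys come from the constructor above; the path and limit values
// are illustrative.
//
//   fat.results.matching.file=/path/to/fat.res
//   fat.results.matching.max.results=1000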
/** load the feedback selector, based on the property <tt>qe.feedback.selector</tt> */
protected FeedbackSelector getFeedbackSelector(Request rq) {
  String[] names =
      ApplicationSetup.getProperty("qe.feedback.selector", "PseudoRelevanceFeedbackSelector")
          .split("\\s*,\\s*");
  FeedbackSelector rtr = null;
  for (int i = names.length - 1; i >= 0; i--) {
    String name = names[i];
    if (!name.contains(".")) name = "org.terrier.querying." + name;
    else if (name.startsWith("uk.ac.gla.terrier"))
      name = name.replaceAll("uk.ac.gla.terrier", "org.terrier");
    FeedbackSelector next = null;
    try {
      Class<? extends FeedbackSelector> nextClass =
          Class.forName(name).asSubclass(FeedbackSelector.class);
      if (names.length - 1 == i) {
        next = nextClass.newInstance();
      } else {
        next = nextClass.getConstructor(FeedbackSelector.class).newInstance(rtr);
      }
      rtr = next;
    } catch (Exception e) {
      logger.error("Problem loading a FeedbackSelector called " + name, e);
      return null;
    }
    rtr.setIndex(lastIndex); // TODO index should come from Request
  }
  return rtr;
}
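// Hedged example of selector chaining: the comma-separated names in
// qe.feedback.selector are instantiated right-to-left, so the last name is
// built with its no-argument constructor, and each earlier name must expose a
// constructor taking a FeedbackSelector, which receives the previously built
// instance. "MyFilteringSelector" below is a hypothetical wrapper class:
//
//   qe.feedback.selector=MyFilteringSelector,PseudoRelevanceFeedbackSelector
//
// yields new MyFilteringSelector(new PseudoRelevanceFeedbackSelector()).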
/** default constructor */
public RelevanceFeedbackSelector() {
  String feedbackFilename =
      ApplicationSetup.getProperty(
          "qe.feedback.filename",
          ApplicationSetup.TERRIER_ETC + ApplicationSetup.FILE_SEPARATOR + "feedback");
  this.loadRelevanceInformation(feedbackFilename);
}
/**
 * Prints the results for the given search request to the specified destination.
 *
 * @param pw PrintWriter the destination to which the results are written.
 * @param q SearchRequest the object encapsulating the query and the results.
 * @param method String the run tag written at the end of each result line.
 * @param iteration String the iteration value written in each result line.
 * @param _RESULTS_LENGTH int the maximum number of results to write, or 0 for all.
 */
public void printResults(
    final PrintWriter pw,
    final SearchRequest q,
    String method,
    String iteration,
    int _RESULTS_LENGTH)
    throws IOException {
  final ResultSet set = q.getResultSet();
  final String metaIndexDocumentKey =
      ApplicationSetup.getProperty("trec.querying.outputformat.docno.meta.key", "docno");
  final double[] scores = set.getScores();
  if (set.getResultSize() == 0) {
    logger.warn("No results retrieved for query " + q.getQueryID());
    return;
  }
  String[] docnos = obtainDocnos(metaIndexDocumentKey, q, set);
  final int maximum =
      _RESULTS_LENGTH > set.getResultSize() || _RESULTS_LENGTH == 0
          ? set.getResultSize()
          : _RESULTS_LENGTH;
  logger.debug("Writing " + maximum + " results for query " + q.getQueryID());
  final String queryIdExpanded = q.getQueryID() + " " + iteration + " ";
  final String methodExpanded = " " + method + ApplicationSetup.EOL;
  StringBuilder sbuffer = new StringBuilder();
  // the results are ordered in descending order with respect to the score
  int limit = 10000;
  int counter = 0;
  for (int i = 0; i < maximum; i++) {
    if (scores[i] == Double.NEGATIVE_INFINITY) continue;
    sbuffer.append(queryIdExpanded);
    sbuffer.append(docnos[i]);
    sbuffer.append(" ");
    sbuffer.append(i);
    sbuffer.append(" ");
    sbuffer.append(scores[i]);
    sbuffer.append(methodExpanded);
    counter++;
    // periodically flush the buffer to bound memory use
    if (counter % limit == 0) {
      pw.write(sbuffer.toString());
      sbuffer = new StringBuilder();
      pw.flush();
    }
  }
  pw.write(sbuffer.toString());
  pw.flush();
}
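// For illustration, each record written above follows the TREC run format:
// query id, iteration, docno, rank, score, run tag. The docnos, scores and
// run tag below are invented:
//
//   401 0 FT911-3456 0 14.0523 PL2c1.0
//   401 0 FT911-0120 1 13.9871 PL2c1.0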
/**
 * Returns the object that is to be the end of the TermPipeline. This method is used at
 * construction time of the parent object.
 *
 * @return TermPipeline the last component of the term pipeline.
 */
protected TermPipeline getEndOfPipeline() {
  // if using delimited blocks
  if (Boolean.parseBoolean(ApplicationSetup.getProperty("block.delimiters.enabled", "false"))) {
    String delim = ApplicationSetup.getProperty("block.delimiters", "").trim();
    if (Boolean.parseBoolean(ApplicationSetup.getProperty("lowercase", "true")))
      delim = delim.toLowerCase();
    String[] delims = delim.split("\\s*,\\s*");
    final boolean indexDelims =
        Boolean.parseBoolean(
            ApplicationSetup.getProperty("block.delimiters.index.terms", "false"));
    final boolean countDelims =
        Boolean.parseBoolean(
            ApplicationSetup.getProperty("block.delimiters.index.doclength", "true"));
    return (FieldScore.USE_FIELD_INFORMATION)
        ? new DelimFieldTermProcessor(delims, indexDelims, countDelims)
        : new DelimTermProcessor(delims, indexDelims, countDelims);
  } else if (FieldScore.USE_FIELD_INFORMATION) {
    return new FieldTermProcessor();
  }
  return new BasicTermProcessor();
}
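// A minimal, hedged configuration sketch for delimited-block indexing, using
// only the property names read above (the delimiter values are illustrative):
//
//   block.delimiters.enabled=true
//   block.delimiters=##,%%
//   block.delimiters.index.terms=false
//   block.delimiters.index.doclength=true
//
// With these settings getEndOfPipeline() returns a DelimTermProcessor, or a
// DelimFieldTermProcessor when field information is in use.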
/**
 * Implements the Markov Random Field model. See Metzler &amp; Croft, SIGIR 2005. Note that this
 * implementation does not utilise the frequency of a tuple in the collection - instead, this is
 * assumed to be a constant, as per the implementation in the Ivory retrieval system.
 *
 * <b>Properties:</b>
 *
 * <ul>
 * <li><i>See properties for DependenceScoreModifier</i>
 * <li><tt>mrf.mu</tt> - the mu parameter of the MRF model's Dirichlet smoothing.
 * </ul>
 *
 * <b>References:</b> Metzler, D. and Croft, W.B., "A Markov Random Field Model for Term
 * Dependencies," Proceedings of the 28th annual international ACM SIGIR conference on Research
 * and development in information retrieval (SIGIR 2005), 472-479, 2005.
 *
 * @author Craig Macdonald
 * @since 3.0
 */
public class MRFDependenceScoreModifier extends DependenceScoreModifier {

  protected double MU =
      Double.parseDouble(
          ApplicationSetup.getProperty(
              "mrf.mu", ApplicationSetup.getProperty("proximity.norm2.c", "4000d")));
  double defaultDf;
  double defaultCf;

  @Override
  public boolean modifyScores(Index index, MatchingQueryTerms terms, ResultSet set) {
    // re-read mu on each invocation, so property changes take effect
    MU =
        Double.parseDouble(
            ApplicationSetup.getProperty(
                "mrf.mu", ApplicationSetup.getProperty("proximity.norm2.c", "4000d")));
    return super.modifyScores(index, terms, set);
  }

  @Override
  protected double scoreFDSD(int matchingNGrams, int _docLength) {
    final double mu = MU;
    double docLength = (double) _docLength;
    double tf = (double) matchingNGrams;
    return w_o
        * (Idf.log(1 + (tf / (mu * (defaultCf / super.numTokens))))
            + Idf.log(mu / (docLength + mu)));
  }

  /** {@inheritDoc} */
  public void setCollectionStatistics(CollectionStatistics cs, Index _index) {
    super.setCollectionStatistics(cs, _index);
    w_o =
        Double.parseDouble(
            ApplicationSetup.getProperty(
                "proximity." + super.ngramLength + ".w_o",
                ApplicationSetup.getProperty("proximity.w_o", "1.0d")));
    // these statistics are as used by the Ivory system
    defaultDf = ((double) cs.getNumberOfDocuments()) / 100.0d;
    defaultCf = defaultDf * 2;
  }
}
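// A self-contained sketch of the scoreFDSD() computation above, with the
// Terrier dependencies inlined. Natural logarithms are used for illustration
// (Terrier's Idf.log may use a different base), and all numbers are invented;
// defaultCf / numTokens plays the role of the constant background tuple model.
public class MRFScoreSketch {
  public static void main(String[] args) {
    double w_o = 1.0d;                          // proximity.w_o
    double mu = 4000d;                          // mrf.mu
    double numTokens = 1.0e8;                   // tokens in the collection
    double numDocs = 1.0e6;                     // documents in the collection
    double defaultCf = 2d * (numDocs / 100.0d); // as set in setCollectionStatistics()
    double tf = 3d;                             // matching n-grams in the document
    double docLength = 250d;
    double score =
        w_o
            * (Math.log(1 + tf / (mu * (defaultCf / numTokens)))
                + Math.log(mu / (docLength + mu)));
    System.out.println("MRF ngram score: " + score);
  }
}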
static String[] getModelNames() throws Exception {
  String[] modelNames =
      ArrayUtils.parseCommaDelimitedString(
          ApplicationSetup.getProperty("fat.featured.scoring.matching.features", ""));
  if (modelNames.length == 1 && modelNames[0].equals("FILE")) {
    String filename =
        ApplicationSetup.getProperty("fat.featured.scoring.matching.features.file", null);
    if (filename == null)
      throw new IllegalArgumentException(
          "fat.featured.scoring.matching.features.file needs to be specified");
    filename = ApplicationSetup.makeAbsolute(filename, ApplicationSetup.TERRIER_ETC);
    String line = null;
    final BufferedReader br = Files.openFileReader(filename);
    final List<String> models = new ArrayList<String>();
    while ((line = br.readLine()) != null) {
      // ignore lines starting with a comment
      if (line.startsWith("#")) continue;
      // remove trailing comments, and skip lines left blank
      line = line.replaceAll("#.+$", "").trim();
      if (line.isEmpty()) continue;
      models.add(line);
    }
    br.close();
    modelNames = models.toArray(new String[models.size()]);
  }
  return modelNames;
}
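// Hedged example of the feature file read when the property is set to FILE.
// The feature names are hypothetical; the comment handling matches the parsing
// above (whole-line comments skipped, trailing comments stripped):
//
//   fat.featured.scoring.matching.features=FILE
//   fat.featured.scoring.matching.features.file=features.list
//
//   # features.list
//   WMODEL:BM25
//   WMODEL:PL2   # trailing comments are removed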
/**
 * load the expansion terms, as per the property <tt>qe.expansion.terms.class</tt>. Defaults to
 * DFRBagExpansionTerms.
 *
 * @return an ExpansionTerms instance, which may or may not wrap other ExpansionTerms instances
 */
protected ExpansionTerms getExpansionTerms() {
  String[] expanderNames =
      ApplicationSetup.getProperty("qe.expansion.terms.class", "DFRBagExpansionTerms")
          .split("\\s*,\\s*");
  ExpansionTerms rtr = null;
  // foreach name, starting from the last, finishing with the first
  for (int i = expanderNames.length - 1; i >= 0; i--) {
    String expanderName = expanderNames[i];
    ExpansionTerms next = null;
    if (!expanderName.contains(".")) expanderName = "org.terrier.querying." + expanderName;
    else if (expanderName.startsWith("uk.ac.gla.terrier"))
      expanderName = expanderName.replaceAll("uk.ac.gla.terrier", "org.terrier");
    try {
      Class<? extends ExpansionTerms> clz =
          Class.forName(expanderName).asSubclass(ExpansionTerms.class);
      if (expanderNames.length - 1 == i) {
        next =
            clz.getConstructor(
                    CollectionStatistics.class,
                    Lexicon.class,
                    PostingIndex.class,
                    DocumentIndex.class)
                .newInstance(collStats, lexicon, directIndex, documentIndex);
      } else {
        next = clz.getConstructor(ExpansionTerms.class).newInstance(rtr);
      }
      rtr = next;
    } catch (Exception e) {
      logger.error("Error during getExpansionTerms", e);
      return null;
    }
  }
  return rtr;
}
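// As with getFeedbackSelector(), a hedged chaining example: the last name in
// qe.expansion.terms.class is constructed against the index structures, and
// every earlier name wraps the previous instance via its ExpansionTerms
// constructor. "MyRescoringExpansionTerms" is a hypothetical decorator:
//
//   qe.expansion.terms.class=MyRescoringExpansionTerms,DFRBagExpansionTerms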
/**
 * This class performs interactive querying at the command line. It asks for a query on Standard
 * Input, and then displays the document IDs that match the given query.
 *
 * <p><b>Properties:</b>
 *
 * <ul>
 * <li><tt>interactive.model</tt> - which weighting model to use, defaults to PL2
 * <li><tt>interactive.matching</tt> - which Matching class to use, defaults to Matching
 * <li><tt>interactive.manager</tt> - which Manager class to use, defaults to Manager
 * </ul>
 *
 * @author Gianni Amati, Vassilis Plachouras, Ben He, Craig Macdonald
 */
public class InteractiveQuerying {
  /** The logger used */
  protected static final Logger logger = Logger.getLogger(InteractiveQuerying.class);
  /** Change to lowercase? */
  protected static final boolean lowercase =
      Boolean.parseBoolean(ApplicationSetup.getProperty("lowercase", "true"));
  /** display user prompts */
  protected boolean verbose = true;
  /** the number of processed queries. */
  protected int matchingCount = 0;
  /** The file to store the output to. */
  protected PrintWriter resultFile = new PrintWriter(System.out);
  /**
   * The name of the manager object that handles the queries. Set by property
   * <tt>interactive.manager</tt>, defaults to Manager.
   */
  protected String managerName = ApplicationSetup.getProperty("interactive.manager", "Manager");
  /** The query manager. */
  protected Manager queryingManager;
  /** The weighting model used. */
  protected String wModel = ApplicationSetup.getProperty("interactive.model", "PL2");
  /** The matching model used. */
  protected String mModel = ApplicationSetup.getProperty("interactive.matching", "Matching");
  /** The data structures used. */
  protected Index index;
  /** The maximum number of presented results. */
  protected static int RESULTS_LENGTH =
      Integer.parseInt(ApplicationSetup.getProperty("interactive.output.format.length", "1000"));

  protected String[] metaKeys =
      ApplicationSetup.getProperty("interactive.output.meta.keys", "docno").split("\\s*,\\s*");

  /** A default constructor initialises the index, and the Manager. */
  public InteractiveQuerying() {
    loadIndex();
    createManager();
  }

  /**
   * Create a querying manager. This method should be overriden if another matching model is
   * required.
   */
  protected void createManager() {
    try {
      if (managerName.indexOf('.') == -1) managerName = "org.terrier.querying." + managerName;
      else if (managerName.startsWith("uk.ac.gla.terrier"))
        managerName = managerName.replaceAll("uk.ac.gla.terrier", "org.terrier");
      queryingManager =
          (Manager)
              (Class.forName(managerName)
                  .getConstructor(new Class[] {Index.class})
                  .newInstance(new Object[] {index}));
    } catch (Exception e) {
      logger.error("Problem loading Manager (" + managerName + "): ", e);
    }
  }

  /** Loads the index from disk. */
  protected void loadIndex() {
    long startLoading = System.currentTimeMillis();
    index = Index.createIndex();
    if (index == null) {
      logger.fatal("Failed to load index. Perhaps index files are missing");
    }
    long endLoading = System.currentTimeMillis();
    if (logger.isInfoEnabled())
      logger.info("time to initialise index : " + ((endLoading - startLoading) / 1000.0D));
  }

  /** Closes the used structures. */
  public void close() {
    try {
      index.close();
    } catch (IOException ioe) {
      logger.warn("Problem closing index", ioe);
    }
  }

  /**
   * According to the given parameters, it sets up the correct matching class.
   *
   * @param queryId String the query identifier to use.
   * @param query String the query to process.
   * @param cParameter double the value of the parameter to use.
   */
  public void processQuery(String queryId, String query, double cParameter) {
    SearchRequest srq = queryingManager.newSearchRequest(queryId, query);
    srq.setControl("c", Double.toString(cParameter));
    srq.addMatchingModel(mModel, wModel);
    matchingCount++;
    queryingManager.runPreProcessing(srq);
    queryingManager.runMatching(srq);
    queryingManager.runPostProcessing(srq);
    queryingManager.runPostFilters(srq);
    try {
      printResults(resultFile, srq);
    } catch (IOException ioe) {
      logger.error("Problem displaying results", ioe);
    }
  }

  /**
   * Performs matching using the weighting model specified in the setup, possibly combined with
   * additional evidence. It reads queries interactively from standard input and, for each one,
   * retrieves and scores the matching documents, writing the results to the result file.
   * Entering an empty line, "quit" or "exit" terminates the loop.
   *
   * @param cParameter the value of c
   */
  public void processQueries(double cParameter) {
    try {
      // prepare console input
      InputStreamReader consoleReader = new InputStreamReader(System.in);
      BufferedReader consoleInput = new BufferedReader(consoleReader);
      String query;
      int qid = 1;
      if (verbose) System.out.print("Please enter your query: ");
      while ((query = consoleInput.readLine()) != null) {
        if (query.length() == 0
            || query.toLowerCase().equals("quit")
            || query.toLowerCase().equals("exit")) {
          return;
        }
        processQuery("" + (qid++), lowercase ? query.toLowerCase() : query, cParameter);
        if (verbose) System.out.print("Please enter your query: ");
      }
    } catch (IOException ioe) {
      logger.error(
          "Input/Output exception while performing the matching. Stack trace follows.", ioe);
    }
  }

  /**
   * Prints the results
   *
   * @param pw PrintWriter the file to write the results to.
   * @param q SearchRequest the search request to get results from.
   */
  public void printResults(PrintWriter pw, SearchRequest q) throws IOException {
    ResultSet set = q.getResultSet();
    int[] docids = set.getDocids();
    double[] scores = set.getScores();
    int minimum = RESULTS_LENGTH;
    // if the requested number of documents is more than the
    // number of documents in the results, use the result size
    if (minimum > set.getResultSize()) minimum = set.getResultSize();
    if (verbose)
      if (set.getResultSize() > 0)
        pw.write("\n\tDisplaying 1-" + set.getResultSize() + " results\n");
      else pw.write("\n\tNo results\n");
    if (set.getResultSize() == 0) return;
    int metaKeyId = 0;
    final int metaKeyCount = metaKeys.length;
    String[][] docNames = new String[metaKeyCount][];
    for (String metaIndexDocumentKey : metaKeys) {
      if (set.hasMetaItems(metaIndexDocumentKey)) {
        docNames[metaKeyId] = set.getMetaItems(metaIndexDocumentKey);
      } else {
        final MetaIndex metaIndex = index.getMetaIndex();
        docNames[metaKeyId] = metaIndex.getItems(metaIndexDocumentKey, docids);
      }
      metaKeyId++;
    }
    StringBuilder sbuffer = new StringBuilder();
    // the results are ordered in descending order with respect to the score,
    // i.e. the document with the highest score is at index 0
    int start = 0;
    int end = minimum;
    for (int i = start; i < end; i++) {
      if (scores[i] <= 0d) continue;
      sbuffer.append(i);
      sbuffer.append(" ");
      for (metaKeyId = 0; metaKeyId < metaKeyCount; metaKeyId++) {
        sbuffer.append(docNames[metaKeyId][i]);
        sbuffer.append(" ");
      }
      sbuffer.append(docids[i]);
      sbuffer.append(" ");
      sbuffer.append(scores[i]);
      sbuffer.append('\n');
    }
    pw.write(sbuffer.toString());
    pw.flush();
  }

  /**
   * Starts the interactive query application.
   *
   * @param args the command line arguments.
   */
  public static void main(String[] args) {
    InteractiveQuerying iq = new InteractiveQuerying();
    if (args.length == 0) {
      iq.processQueries(1.0);
    } else if (args.length == 1 && args[0].equals("--noverbose")) {
      iq.verbose = false;
      iq.processQueries(1.0);
    } else {
      iq.verbose = false;
      StringBuilder s = new StringBuilder();
      for (int i = 0; i < args.length; i++) {
        s.append(args[i]);
        s.append(" ");
      }
      iq.processQuery("CMDLINE", s.toString(), 1.0);
    }
  }
}
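// Hedged usage sketch (package name as in Terrier 3.x; the exact launcher may
// differ per distribution). Queries can be typed interactively, piped on
// standard input, or passed as arguments for a single "CMDLINE" query:
//
//   echo "information retrieval" | java org.terrier.applications.InteractiveQuerying
//   java org.terrier.applications.InteractiveQuerying --noverbose < queries.txt
//   java org.terrier.applications.InteractiveQuerying markov random fields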
/**
 * Runs the actual query expansion
 *
 * @see
 * org.terrier.querying.PostProcess#process(org.terrier.querying.Manager,org.terrier.querying.SearchRequest)
 */
public void process(Manager manager, SearchRequest q) {
  Index index = getIndex(manager);
  lastIndex = index;
  documentIndex = index.getDocumentIndex();
  invertedIndex = index.getInvertedIndex();
  lexicon = index.getLexicon();
  collStats = index.getCollectionStatistics();
  directIndex = index.getDirectIndex();
  metaIndex = index.getMetaIndex();
  if (directIndex == null) {
    logger.error("This index does not have a direct index. Query expansion disabled!");
    return;
  }
  logger.debug("Starting query expansion post-processing.");
  // get the query expansion model to use
  String qeModel = q.getControl("qemodel");
  if (qeModel == null || qeModel.length() == 0) {
    logger.warn(
        "qemodel control not set for QueryExpansion post process. Using default model Bo1");
    qeModel = "Bo1";
  }
  setQueryExpansionModel(getQueryExpansionModel(qeModel));
  if (logger.isDebugEnabled()) {
    logger.debug("query expansion model: " + QEModel.getInfo());
  }
  MatchingQueryTerms queryTerms = ((Request) q).getMatchingQueryTerms();
  if (queryTerms == null) {
    logger.warn("No query terms for this query. Skipping QE");
    return;
  }
  // get the expanded query terms
  try {
    expandQuery(queryTerms, (Request) q);
  } catch (IOException ioe) {
    logger.error("IOException while expanding query, skipping QE", ioe);
    return;
  }
  if (logger.isDebugEnabled()) {
    logger.debug("query length after expansion: " + queryTerms.length());
    logger.debug("Expanded query: ");
  }
  final String[] newQueryTerms = queryTerms.getTerms();
  StringBuilder newQuery = new StringBuilder();
  for (int i = 0; i < newQueryTerms.length; i++) {
    try {
      if (logger.isDebugEnabled()) {
        logger.debug(
            (i + 1)
                + ": "
                + newQueryTerms[i]
                + ", normalisedFrequency: "
                + Rounding.toString(queryTerms.getTermWeight(newQueryTerms[i]), 4));
      }
      newQuery.append(newQueryTerms[i]);
      newQuery.append('^');
      newQuery.append(Rounding.toString(queryTerms.getTermWeight(newQueryTerms[i]), 9));
      newQuery.append(' ');
    } catch (NullPointerException npe) {
      logger.error("NullPointerException occurred while dumping the expanded query", npe);
    }
  }
  logger.debug("NEWQUERY " + q.getQueryID() + " " + newQuery.toString());
  lastExpandedQuery = newQuery.toString();
  q.setControl("QE.ExpandedQuery", newQuery.toString());
  final boolean no2ndPass =
      Boolean.parseBoolean(ApplicationSetup.getProperty("qe.no.2nd.matching", "false"));
  if (no2ndPass) {
    return;
  }
  // run the retrieval process again for the expanded query
  logger.info("Accessing inverted file for expanded query " + q.getQueryID());
  manager.runMatching(q);
}
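// The expanded query assembled above weights each term with a caret suffix;
// a hedged example of the resulting control value (terms and weights are
// invented):
//
//   QE.ExpandedQuery = retrieval^1.000000000 expansion^0.352817493 terrier^0.281004152
//
// Setting qe.no.2nd.matching=true stores this string but skips the second
// matching pass.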
// TODO if this class extends BasicIndexer, then this method could be inherited
public void createDirectIndex(Collection[] collections) {
  logger.info(
      "BlockIndexer creating direct index"
          + (Boolean.parseBoolean(
                  ApplicationSetup.getProperty("block.delimiters.enabled", "false"))
              ? " delimited-block indexing enabled"
              : ""));
  currentIndex = Index.createNewIndex(path, prefix);
  lexiconBuilder =
      FieldScore.FIELDS_COUNT > 0
          ? new LexiconBuilder(
              currentIndex,
              "lexicon",
              new BlockFieldLexiconMap(FieldScore.FIELDS_COUNT),
              FieldLexiconEntry.class.getName())
          : new LexiconBuilder(
              currentIndex, "lexicon", new BlockLexiconMap(), BlockLexiconEntry.class.getName());
  try {
    directIndexBuilder =
        FieldScore.FIELDS_COUNT > 0
            ? new BlockFieldDirectInvertedOutputStream(
                currentIndex.getPath()
                    + ApplicationSetup.FILE_SEPARATOR
                    + currentIndex.getPrefix()
                    + "."
                    + "direct"
                    + BitIn.USUAL_EXTENSION)
            : new BlockDirectInvertedOutputStream(
                currentIndex.getPath()
                    + ApplicationSetup.FILE_SEPARATOR
                    + currentIndex.getPrefix()
                    + "."
                    + "direct"
                    + BitIn.USUAL_EXTENSION);
  } catch (IOException ioe) {
    logger.error("Cannot make DirectInvertedOutputStream:", ioe);
  }
  docIndexBuilder = new DocumentIndexBuilder(currentIndex, "document");
  metaBuilder = createMetaIndexBuilder();
  emptyDocIndexEntry =
      (FieldScore.FIELDS_COUNT > 0)
          ? new FieldDocumentIndexEntry(FieldScore.FIELDS_COUNT)
          : new BasicDocumentIndexEntry();
  int numberOfDocuments = 0;
  final boolean boundaryDocsEnabled = BUILDER_BOUNDARY_DOCUMENTS.size() > 0;
  boolean stopIndexing = false;
  for (int collectionNo = 0; !stopIndexing && collectionNo < collections.length; collectionNo++) {
    Collection collection = collections[collectionNo];
    long startCollection = System.currentTimeMillis();
    boolean notLastDoc = false;
    while ((notLastDoc = collection.nextDocument())) {
      // get the next document from the collection
      Document doc = collection.getDocument();
      if (doc == null) continue;
      numberOfDocuments++;
      // set up for parsing
      createDocumentPostings();
      String term;
      numOfTokensInDocument = 0;
      numOfTokensInBlock = 0;
      blockId = 0;
      // get each term in the document
      while (!doc.endOfDocument()) {
        if ((term = doc.getNextTerm()) != null && !term.equals("")) {
          termFields = doc.getFields();
          // pass the term into the TermPipeline (stop, stem etc.);
          // the pipeline will eventually add the term to this object
          pipeline_first.processTerm(term);
        }
        if (MAX_TOKENS_IN_DOCUMENT > 0 && numOfTokensInDocument > MAX_TOKENS_IN_DOCUMENT) break;
      }
      // if we didn't index all tokens from the document,
      // we need to get to the end of the document
      while (!doc.endOfDocument()) doc.getNextTerm();
      // we now have all terms in the DocumentTree
      pipeline_first.reset();
      // process DocumentTree (tree of terms)
      try {
        if (termsInDocument.getDocumentLength() == 0) {
          // this document is empty, add the minimum to the document index
          indexEmpty(doc.getAllProperties());
        } else {
          /* index this document */
          indexDocument(doc.getAllProperties(), termsInDocument);
        }
      } catch (Exception ioe) {
        logger.error("Failed to index " + doc.getProperty("docno"), ioe);
      }
      if (MAX_DOCS_PER_BUILDER > 0 && numberOfDocuments >= MAX_DOCS_PER_BUILDER) {
        stopIndexing = true;
        break;
      }
      if (boundaryDocsEnabled && BUILDER_BOUNDARY_DOCUMENTS.contains(doc.getProperty("docno"))) {
        stopIndexing = true;
        break;
      }
    }
    long endCollection = System.currentTimeMillis();
    long secs = ((endCollection - startCollection) / 1000);
    logger.info(
        "Collection #"
            + collectionNo
            + " took "
            + secs
            + " seconds to index "
            + "("
            + numberOfDocuments
            + " documents)\n");
    if (secs > 3600)
      logger.info(
          "Rate: " + ((double) numberOfDocuments / ((double) secs / 3600.0d)) + " docs/hour");
    if (!notLastDoc) {
      try {
        collection.close();
      } catch (IOException e) {
        logger.warn("Couldn't close collection", e);
      }
    }
  }
  /* the end of the collection has been reached */
  finishedDirectIndexBuild();
  currentIndex.addIndexStructure(
      "direct",
      "org.terrier.structures.BlockDirectIndex",
      "org.terrier.structures.Index,java.lang.String,java.lang.Class",
      "index,structureName,"
          + (FieldScore.FIELDS_COUNT > 0
              ? fieldDirectIndexPostingIteratorClass
              : basicDirectIndexPostingIteratorClass));
  currentIndex.addIndexStructureInputStream(
      "direct",
      "org.terrier.structures.BlockDirectIndexInputStream",
      "org.terrier.structures.Index,java.lang.String,java.lang.Class",
      "index,structureName,"
          + (FieldScore.FIELDS_COUNT > 0
              ? fieldDirectIndexPostingIteratorClass
              : basicDirectIndexPostingIteratorClass));
  currentIndex.setIndexProperty("index.direct.fields.count", "" + FieldScore.FIELDS_COUNT);
  currentIndex.setIndexProperty(
      "index.direct.fields.names", ArrayUtils.join(FieldScore.FIELD_NAMES, ","));
  if (FieldScore.FIELDS_COUNT > 0) {
    currentIndex.addIndexStructure(
        "document-factory",
        FieldDocumentIndexEntry.Factory.class.getName(),
        "java.lang.String",
        "${index.direct.fields.count}");
  } else {
    currentIndex.addIndexStructure(
        "document-factory", BasicDocumentIndexEntry.Factory.class.getName(), "", "");
  }
  /* flush the index buffers */
  directIndexBuilder.close();
  docIndexBuilder.finishedCollections();
  /* and then merge all the temporary lexicons */
  lexiconBuilder.finishedDirectIndexBuild();
  try {
    metaBuilder.close();
  } catch (IOException ioe) {
    logger.error("Could not finish MetaIndexBuilder: ", ioe);
  }
  if (FieldScore.FIELDS_COUNT > 0) {
    currentIndex.addIndexStructure(
        "lexicon-valuefactory",
        FieldLexiconEntry.Factory.class.getName(),
        "java.lang.String",
        "${index.direct.fields.count}");
  }
  /* reset the in-memory mapping of terms to term codes */
  TermCodes.reset();
  System.gc();
  try {
    currentIndex.flush();
  } catch (IOException ioe) {
    logger.error("Could not flush index properties: ", ioe);
  }
}
/**
 * This class is used for reading the queries from TREC topic files.
 *
 * <p><b>Properties:</b>
 *
 * <ul>
 * <li><tt>trecquery.ignore.desc.narr.name.tokens</tt> - should the tokens DESCRIPTION and
 * NARRATIVE in the desc and narr fields be ignored? Defaults to true.
 * <li><tt>tokeniser</tt> - name of the Tokeniser class to use to tokenise topics. Defaults to
 * EnglishTokeniser.
 * <li><tt>trec.encoding</tt> - used to set the encoding of TREC topic files. Defaults to the
 * system's default encoding.
 * </ul>
 *
 * @author Ben He &amp; Craig Macdonald
 */
public class TRECQuery implements QuerySource {
  /** The logger used for this class */
  protected static final Logger logger = Logger.getLogger(TRECQuery.class);
  /**
   * Value of <tt>trecquery.ignore.desc.narr.name.tokens</tt> - should the tokens DESCRIPTION and
   * NARRATIVE in the desc and narr fields be ignored? Defaults to true.
   */
  protected static final boolean IGNORE_DESC_NARR_NAME_TOKENS =
      Boolean.parseBoolean(
          ApplicationSetup.getProperty("trecquery.ignore.desc.narr.name.tokens", "true"));
  /** Encoding to be used to open all files. */
  protected static String desiredEncoding =
      ApplicationSetup.getProperty("trec.encoding", Charset.defaultCharset().name());
  /** The topic files used in this object */
  protected String[] topicFiles;
  /** The queries in the topic files. */
  protected String[] queries;
  /** The query identifiers in the topic files. */
  protected String[] query_ids;
  /** The index of the queries. */
  protected int index;

  /**
   * Extracts and stores all the queries from query files.
   *
   * @param queryfilenames String[] the names of files containing topics.
   * @param vecStringQueries Vector a vector containing the queries as strings.
   * @param vecStringIds Vector a vector containing the query identifiers as strings.
   * @return boolean true if some queries were successfully extracted.
   */
  public boolean extractQuery(
      String[] queryfilenames, Vector<String> vecStringQueries, Vector<String> vecStringIds) {
    boolean rtn = false;
    for (int i = 0; i < queryfilenames.length; i++) {
      if (extractQuery(queryfilenames[i], vecStringQueries, vecStringIds)) rtn = true;
    }
    return rtn;
  }

  /**
   * Extracts and stores all the queries from a query file.
   *
   * @param queryfilename String the name of a file containing topics.
   * @param vecStringQueries Vector a vector containing the queries as strings.
   * @param vecStringIds Vector a vector containing the query identifiers as strings.
   * @return boolean true if some queries were successfully extracted.
   */
  public boolean extractQuery(
      String queryfilename, Vector<String> vecStringQueries, Vector<String> vecStringIds) {
    boolean gotSome = false;
    try {
      BufferedReader br;
      if (!Files.exists(queryfilename) || !Files.canRead(queryfilename)) {
        logger.error(
            "The topics file " + queryfilename + " does not exist, or it cannot be read.");
        return false;
      } else {
        br = Files.openFileReader(queryfilename, desiredEncoding);
        TRECFullTokenizer queryTokenizer =
            new TRECFullTokenizer(
                new TagSet(TagSet.TREC_QUERY_TAGS), new TagSet(TagSet.EMPTY_TAGS), br);
        queryTokenizer.setIgnoreMissingClosingTags(true);
        while (!queryTokenizer.isEndOfFile()) {
          String docnoToken = null;
          StringBuilder query = new StringBuilder();
          boolean seenDescriptionToken = !IGNORE_DESC_NARR_NAME_TOKENS;
          boolean seenNarrativeToken = !IGNORE_DESC_NARR_NAME_TOKENS;
          while (!queryTokenizer.isEndOfDocument()) {
            String token = queryTokenizer.nextToken();
            if (token == null || token.length() == 0 || queryTokenizer.inTagToSkip()) continue;
            if (queryTokenizer.inDocnoTag()) {
              // The tokenizer is constructed from the trimmed version of the contents
              // of the query number tag, so that the last token extracted from it is
              // always the query number, and not an empty string
              StringTokenizer docnoTokens = new StringTokenizer(token.trim(), " ");
              while (docnoTokens.hasMoreTokens()) docnoToken = docnoTokens.nextToken().trim();
            } else if (queryTokenizer.inTagToProcess()) {
              // Removed the code that checks if "description" and
              // "narrative" appear in "desc" and "narr", respectively.
              // THIS WILL HURT THE RETRIEVAL PERFORMANCE. Therefore,
              // it is recommended to add these words to the stopword list.
              if (!seenDescriptionToken
                  && queryTokenizer.currentTag().toUpperCase().equals("DESC")
                  && token.toUpperCase().equals("DESCRIPTION")) continue;
              if (!seenNarrativeToken
                  && queryTokenizer.currentTag().toUpperCase().equals("NARR")
                  && token.toUpperCase().equals("NARRATIVE")) continue;
              query.append(token);
              query.append(' ');
            }
          }
          queryTokenizer.nextDocument();
          if (query.length() == 0) continue;
          vecStringQueries.add(query.toString().trim());
          vecStringIds.add(docnoToken.trim());
          gotSome = true;
        }
        // after processing each query file, close the BufferedReader
        br.close();
      }
    } catch (IOException ioe) {
      logger.error(
          "Input/Output exception while extracting queries from the topic file named "
              + queryfilename,
          ioe);
    }
    return gotSome;
  }

  /**
   * Constructs an instance of TRECQuery, that reads and stores all the queries from the files
   * defined in the trec.topics property.
   */
  public TRECQuery() {
    try {
      String[] files =
          ArrayUtils.parseCommaDelimitedString(ApplicationSetup.getProperty("trec.topics", ""));
      assert files.length > 0;
      Vector<String> vecStringQueries = new Vector<String>();
      Vector<String> vecStringQueryIDs = new Vector<String>();
      Vector<String> vecStringFiles = new Vector<String>();
      for (int i = 0; i < files.length; i++) {
        if (this.extractQuery(files[i], vecStringQueries, vecStringQueryIDs)) {
          vecStringFiles.add(files[i]);
        }
      }
      // topicFiles records the files that parsed successfully,
      // not the queries themselves
      this.topicFiles = vecStringFiles.toArray(new String[0]);
      this.queries = vecStringQueries.toArray(new String[0]);
      this.query_ids = vecStringQueryIDs.toArray(new String[0]);
      this.index = 0;
    } catch (Exception ioe) {
      logger.error("Problem getting trec.topics property:", ioe);
      return;
    }
  }

  /**
   * Constructs an instance of TRECQuery that reads and stores all the queries from the specified
   * query file.
   *
   * @param queryfile File the file containing the queries.
   */
  public TRECQuery(File queryfile) {
    this(queryfile.getName());
  }

  /**
   * Constructs an instance of TRECQuery that reads and stores all the queries from the specified
   * query files.
   *
   * @param queryfiles File[] the files containing the queries.
   */
  public TRECQuery(File[] queryfiles) {
    Vector<String> vecStringQueries = new Vector<String>();
    Vector<String> vecStringQueryIDs = new Vector<String>();
    String[] files = new String[queryfiles.length];
    for (int i = 0; i < queryfiles.length; i++) files[i] = queryfiles[i].getName();
    if (this.extractQuery(files, vecStringQueries, vecStringQueryIDs)) this.topicFiles = files;
    if (topicFiles == null)
      logger.error(
          "Topic files were specified, but none could be parsed correctly to obtain any topics."
              + " Check you have the correct topic files specified, and that TrecQueryTags"
              + " properties are correct.");
    this.queries = vecStringQueries.toArray(new String[0]);
    this.query_ids = vecStringQueryIDs.toArray(new String[0]);
    this.index = 0;
  }

  /**
   * Constructs an instance of TRECQuery that reads and stores all the queries from a file with
   * the specified filename.
   *
   * @param queryfilename String the name of the file containing all the queries.
   */
  public TRECQuery(String queryfilename) {
    Vector<String> vecStringQueries = new Vector<String>();
    Vector<String> vecStringQueryIDs = new Vector<String>();
    if (this.extractQuery(queryfilename, vecStringQueries, vecStringQueryIDs))
      this.topicFiles = new String[] {queryfilename};
    if (topicFiles == null)
      logger.error(
          "Topic files were specified, but none could be parsed correctly to obtain any topics."
              + " Check you have the correct topic files specified, and that TrecQueryTags"
              + " properties are correct.");
    this.queries = vecStringQueries.toArray(new String[0]);
    this.query_ids = vecStringQueryIDs.toArray(new String[0]);
    this.index = 0;
  }

  /**
   * Constructs an instance of TRECQuery that reads and stores all the queries from files with
   * the specified filenames.
   *
   * @param queryfilenames String[] the names of the files containing all the queries.
   */
  public TRECQuery(String[] queryfilenames) {
    Vector<String> vecStringQueries = new Vector<String>();
    Vector<String> vecStringQueryIDs = new Vector<String>();
    if (this.extractQuery(queryfilenames, vecStringQueries, vecStringQueryIDs))
      this.topicFiles = queryfilenames;
    if (topicFiles == null)
      logger.error(
          "Topic files were specified, but none could be parsed correctly to obtain any topics."
              + " Check you have the correct topic files specified, and that TrecQueryTags"
              + " properties are correct.");
    this.queries = vecStringQueries.toArray(new String[0]);
    this.query_ids = vecStringQueryIDs.toArray(new String[0]);
    this.index = 0;
  }

  // /**
  // * @deprecated As of Terrier 3.5
  // * Extracts and stores all the queries from
  // * the topic files, specified in the file
  // * with default name <tt>trec.topics.list</tt>.
  // */
  // protected void extractQuery() {
  //   try {
  //     // open the query file
  //     BufferedReader addressQueryFile = Files.openFileReader(ApplicationSetup.TREC_TOPICS_LIST);
  //     ArrayList<String> parsedTopicFiles = new ArrayList<String>(1);
  //     String queryFilename;
  //     Vector<String> vecStringQueries = new Vector<String>();
  //     Vector<String> vecStringQueryIDs = new Vector<String>();
  //     int fileCount = 0;
  //     while ((queryFilename = addressQueryFile.readLine()) != null) {
  //       if (queryFilename.startsWith("#") || queryFilename.equals(""))
  //         continue;
  //       fileCount++;
  //       boolean rtr = extractQuery(queryFilename, vecStringQueries, vecStringQueryIDs);
  //       if (rtr)
  //         parsedTopicFiles.add(queryFilename);
  //     }
  //     if (fileCount == 0) {
  //       logger.error("No topic files found in " + ApplicationSetup.TREC_TOPICS_LIST + " - please check");
  //     }
  //     if (fileCount > 0 && parsedTopicFiles.size() == 0) {
  //       logger.error("Topic files were specified, but none could be parsed correctly to obtain any topics."
  //           + " Check you have the correct topic files specified, and that TrecQueryTags properties are correct.");
  //     }
  //     this.queries = (String[]) vecStringQueries.toArray(new String[0]);
  //     this.query_ids = (String[]) vecStringQueryIDs.toArray(new String[0]);
  //     this.topicFiles = (String[]) parsedTopicFiles.toArray(new String[0]);
  //     addressQueryFile.close();
  //   } catch (IOException ioe) {
  //     logger.error("Input/Output exception while performing the matching.", ioe);
  //   }
  // }

  /**
   * Returns the index of the last obtained query.
   *
   * @return int the index of the last obtained query.
   */
  public int getIndexOfCurrentQuery() {
    return index - 1;
  }

  /**
   * Returns the number of the queries read from the processed topic files.
   *
   * @return int the number of topics contained in the processed topic files.
   */
  public int getNumberOfQueries() {
    return queries.length;
  }

  /** Returns the filenames of the topic files from which the queries were extracted. */
  public String[] getInfo() {
    return this.topicFiles;
  }

  /** @deprecated */
  public String[] getTopicFilenames() {
    return getInfo();
  }

  /**
   * Return the query for the given query number.
   *
   * @return String the string representing the query.
   * @param queryNo String The number of a query.
   */
  public String getQuery(String queryNo) {
    for (int i = 0; i < query_ids.length; i++)
      if (query_ids[i].equals(queryNo)) return queries[i];
    return null;
  }

  /**
   * Test if there are more queries to process.
   *
   * @return boolean true if there are more queries to process, otherwise returns false.
   * @deprecated
   */
  public boolean hasMoreQueries() {
    return hasNext();
  }

  /** {@inheritDoc} */
  public boolean hasNext() {
    return index != queries.length;
  }

  /**
   * Returns a query.
   *
   * @return String the next query.
   * @deprecated
   */
  public String nextQuery() {
    return next();
  }

  /** {@inheritDoc} */
  public String next() {
    if (index == queries.length) return null;
    return queries[index++];
  }

  /** {@inheritDoc} */
  public String getQueryId() {
    return query_ids[index == 0 ? 0 : index - 1];
  }

  /**
   * Returns the query ids.
   *
   * @return String array containing the query ids.
   * @since 2.2
   */
  public String[] getQueryIds() {
    return query_ids;
  }

  /**
   * Returns the queries in an array of strings.
   *
   * @return String[] an array containing the strings that represent the queries.
   */
  public String[] toArray() {
    return (String[]) queries.clone();
  }

  /** {@inheritDoc} */
  public void reset() {
    this.index = 0;
  }

  /** {@inheritDoc} */
  public void remove() {
    throw new UnsupportedOperationException();
  }

  /**
   * main
   *
   * @param args the command line arguments: args[0] is the name of a topic file.
   */
  public static void main(String[] args) {
    TRECQuery source = new TRECQuery(args[0]);
    while (source.hasNext()) {
      String query = source.next();
      String id = source.getQueryId();
      System.out.println(id + ": " + query);
    }
  }
}
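// Hedged usage sketch for the main() above (package name assumed to be
// org.terrier.structures as in Terrier 3.x; the topics file name is
// illustrative). Each query is printed as "id: text":
//
//   java org.terrier.structures.TRECQuery topics.401-450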