@Override
 public boolean modifyScores(Index index, MatchingQueryTerms terms, ResultSet set) {
   // Refresh MU from configuration on every invocation, so that a change to the
   // mrf.mu (or legacy proximity.norm2.c) property takes effect between queries.
   final String muProperty =
       ApplicationSetup.getProperty(
           "mrf.mu", ApplicationSetup.getProperty("proximity.norm2.c", "4000d"));
   MU = Double.parseDouble(muProperty);
   return super.modifyScores(index, terms, set);
 }
 /**
  * Constructs the matcher, opening the fat result-set file named by the
  * <tt>fat.results.matching.file</tt> property.
  *
  * @param i the index (unused here; kept for the standard Matching constructor signature)
  * @throws IOException if the fat results file cannot be opened
  * @throws IllegalArgumentException if <tt>fat.results.matching.file</tt> is not set
  */
 public FatResultsMatching(Index i) throws IOException {
   filename = ApplicationSetup.getProperty("fat.results.matching.file", null);
   // 0 means "no cap" on the number of results read per query
   maxResults =
       Integer.parseInt(ApplicationSetup.getProperty("fat.results.matching.max.results", "0"));
   if (filename == null) {
     throw new IllegalArgumentException("fat.results.matching.file needs to be specified");
   }
   logger.info("Reading fat resultsets from " + filename);
   dis = new DataInputStream(Files.openFileStream(filename));
 }
 /** {@inheritDoc} */
 @Override
 public void setCollectionStatistics(CollectionStatistics cs, Index _index) {
   super.setCollectionStatistics(cs, _index);
   // w_o: weight of the proximity component; the per-ngram-length property
   // (proximity.<n>.w_o) overrides the global proximity.w_o, default 1.0.
   w_o =
       Double.parseDouble(
           ApplicationSetup.getProperty(
               "proximity." + super.ngramLength + ".w_o",
               ApplicationSetup.getProperty("proximity.w_o", "1.0d")));
   // these statistics are as used by Ivory system: assume each tuple occurs in
   // 1% of documents, and twice in each matching document
   defaultDf = ((double) cs.getNumberOfDocuments()) / 100.0d;
   defaultCf = defaultDf * 2;
 }
  /**
   * Loads the feedback selector chain, based on the comma-delimited property
   * <tt>qe.feedback.selector</tt> (default <tt>PseudoRelevanceFeedbackSelector</tt>).
   * Selectors are instantiated last-to-first; the last-named selector is built with its
   * no-arg constructor, and each preceding one wraps the selector built so far.
   *
   * @param rq the current request (currently unused; the index comes from lastIndex)
   * @return the outermost FeedbackSelector, or null if any selector failed to load
   */
  protected FeedbackSelector getFeedbackSelector(Request rq) {
    String[] names =
        ApplicationSetup.getProperty("qe.feedback.selector", "PseudoRelevanceFeedbackSelector")
            .split("\\s*,\\s*");
    FeedbackSelector rtr = null;
    for (int i = names.length - 1; i >= 0; i--) {
      String name = names[i];
      // unqualified names live in org.terrier.querying; legacy package names are rewritten
      if (!name.contains(".")) name = "org.terrier.querying." + name;
      else if (name.startsWith("uk.ac.gla.terrier"))
        // literal replacement — replaceAll() would treat the dots as regex wildcards
        name = name.replace("uk.ac.gla.terrier", "org.terrier");

      FeedbackSelector next = null;
      try {
        Class<? extends FeedbackSelector> nextClass =
            Class.forName(name).asSubclass(FeedbackSelector.class);
        if (names.length - 1 == i) {
          // getDeclaredConstructor().newInstance() replaces the deprecated Class.newInstance()
          next = nextClass.getDeclaredConstructor().newInstance();
        } else {
          next = nextClass.getConstructor(FeedbackSelector.class).newInstance(rtr);
        }

        rtr = next;
      } catch (Exception e) {
        logger.error("Problem loading a FeedbackSelector called " + name, e);
        return null;
      }
      rtr.setIndex(lastIndex); // TODO index should come from Request
    }
    return rtr;
  }
 /** default constructor */
 public RelevanceFeedbackSelector() {
   String feedbackFilename =
       ApplicationSetup.getProperty(
           "qe.feedback.filename",
           ApplicationSetup.TERRIER_ETC + ApplicationSetup.FILE_SEPARATOR + "feedback");
   this.loadRelevanceInformation(feedbackFilename);
 }
  /**
   * Prints the results for the given search request, using the specified destination.
   *
   * @param pw PrintWriter the destination where to save the results.
   * @param q SearchRequest the object encapsulating the query and the results.
   * @param method the run tag written in the last column of each result line.
   * @param iteration the iteration tag written in the second column of each result line.
   * @param _RESULTS_LENGTH maximum number of results to write; 0 means all retrieved results.
   * @throws IOException if an I/O failure occurs while writing the results.
   */
  public void printResults(
      final PrintWriter pw,
      final SearchRequest q,
      String method,
      String iteration,
      int _RESULTS_LENGTH)
      throws IOException {
    final ResultSet set = q.getResultSet();
    final String metaIndexDocumentKey =
        ApplicationSetup.getProperty("trec.querying.outputformat.docno.meta.key", "docno");
    final double[] scores = set.getScores();
    if (set.getResultSize() == 0) {
      logger.warn("No results retrieved for query " + q.getQueryID());
      return;
    }
    String[] docnos = obtainDocnos(metaIndexDocumentKey, q, set);

    // write at most _RESULTS_LENGTH results; 0 means no cap
    final int maximum =
        _RESULTS_LENGTH > set.getResultSize() || _RESULTS_LENGTH == 0
            ? set.getResultSize()
            : _RESULTS_LENGTH;
    logger.debug("Writing " + maximum + " results for query " + q.getQueryID());

    // precompute the per-line prefix (qid + iteration) and suffix (run tag + EOL)
    final String queryIdExpanded = q.getQueryID() + " " + iteration + " ";
    final String methodExpanded = " " + method + ApplicationSetup.EOL;
    StringBuilder sbuffer = new StringBuilder();
    // the results are ordered in descending order with respect to the score;
    // flush the buffer every `limit` lines to bound memory use
    final int limit = 10000;
    int counter = 0;
    for (int i = 0; i < maximum; i++) {
      // NEGATIVE_INFINITY marks documents excluded from the ranking
      if (scores[i] == Double.NEGATIVE_INFINITY) continue;
      sbuffer.append(queryIdExpanded);
      sbuffer.append(docnos[i]);
      sbuffer.append(" ");
      sbuffer.append(i);
      sbuffer.append(" ");
      sbuffer.append(scores[i]);
      sbuffer.append(methodExpanded);
      counter++;
      if (counter % limit == 0) {
        pw.write(sbuffer.toString());
        // reuse the builder instead of the previous redundant
        // `sbuffer = null; sbuffer = new StringBuilder();`
        sbuffer.setLength(0);
        pw.flush();
      }
    }
    pw.write(sbuffer.toString());
    pw.flush();
  }
 /**
  * Returns the object that is to be the end of the TermPipeline. This method is used at
  * construction time of the parent object.
  *
  * @return TermPipeline the last component of the term pipeline.
  */
 protected TermPipeline getEndOfPipeline() {
   final boolean delimitersEnabled =
       Boolean.parseBoolean(ApplicationSetup.getProperty("block.delimiters.enabled", "false"));
   // delimited-block indexing: terms between configured delimiter tokens form blocks
   if (delimitersEnabled) {
     String delim = ApplicationSetup.getProperty("block.delimiters", "").trim();
     if (Boolean.parseBoolean(ApplicationSetup.getProperty("lowercase", "true"))) {
       delim = delim.toLowerCase();
     }
     final String[] delims = delim.split("\\s*,\\s*");
     final boolean indexDelims =
         Boolean.parseBoolean(
             ApplicationSetup.getProperty("block.delimiters.index.terms", "false"));
     final boolean countDelims =
         Boolean.parseBoolean(
             ApplicationSetup.getProperty("block.delimiters.index.doclength", "true"));
     if (FieldScore.USE_FIELD_INFORMATION) {
       return new DelimFieldTermProcessor(delims, indexDelims, countDelims);
     }
     return new DelimTermProcessor(delims, indexDelims, countDelims);
   }
   // no delimiters: pick the field-aware processor when field information is in use
   if (FieldScore.USE_FIELD_INFORMATION) {
     return new FieldTermProcessor();
   }
   return new BasicTermProcessor();
 }
/**
 * Implements Markov Random Fields. See Metzler & Croft, SIGIR 2005. Note that this implementation
 * does not utilise the frequency of a tuple in the collection - instead, this is assumed to be a
 * constant, as per the implementation in the Ivory retrieval system. <b>Properties:</b>
 *
 * <ul>
 *   <li><i>See properties for DependenceScoreModifier</i>
 *   <li><tt>mrf.mu</tt> - Mu of MRF model, in the Dirichlet model.
 * </ul>
 *
 * <b>References:</b> Metzler, D. and Croft, W.B., "A Markov Random Field Model for Term
 * Dependencies," Proceedings of the 28th annual international ACM SIGIR conference on Research and
 * development in information retrieval (SIGIR 2005), 472-479, 2005
 *
 * @author Craig Macdonald
 * @since 3.0
 */
public class MRFDependenceScoreModifier extends DependenceScoreModifier {

  /** Mu of the MRF Dirichlet model; refreshed from configuration on each modifyScores() call. */
  protected double MU = parseMu();
  /** Assumed document frequency of a tuple: 1% of the documents in the collection. */
  double defaultDf;
  /** Assumed collection frequency of a tuple: twice its assumed document frequency. */
  double defaultCf;

  /** Reads mu from <tt>mrf.mu</tt>, falling back to <tt>proximity.norm2.c</tt>, then 4000. */
  private static double parseMu() {
    return Double.parseDouble(
        ApplicationSetup.getProperty(
            "mrf.mu", ApplicationSetup.getProperty("proximity.norm2.c", "4000d")));
  }

  @Override
  public boolean modifyScores(Index index, MatchingQueryTerms terms, ResultSet set) {
    // re-read mu so property changes take effect between queries (single parse point
    // shared with the field initialiser, instead of the previous duplicated expression)
    MU = parseMu();
    return super.modifyScores(index, terms, set);
  }

  @Override
  protected double scoreFDSD(int matchingNGrams, int _docLength) {
    final double mu = MU;
    double docLength = (double) _docLength;
    double tf = (double) matchingNGrams;
    // Dirichlet-smoothed language-model score of the tuple, weighted by w_o;
    // defaultCf / numTokens stands in for the tuple's collection probability
    return w_o
        * (Idf.log(1 + (tf / (mu * (defaultCf / super.numTokens))))
            + Idf.log(mu / (docLength + mu)));
  }

  /** {@inheritDoc} */
  @Override
  public void setCollectionStatistics(CollectionStatistics cs, Index _index) {
    super.setCollectionStatistics(cs, _index);
    w_o =
        Double.parseDouble(
            ApplicationSetup.getProperty(
                "proximity." + super.ngramLength + ".w_o",
                ApplicationSetup.getProperty("proximity.w_o", "1.0d")));
    // these statistics are as used by Ivory system
    defaultDf = ((double) cs.getNumberOfDocuments()) / 100.0d;
    defaultCf = defaultDf * 2;
  }
}
 /**
  * Obtains the feature model names from the property
  * <tt>fat.featured.scoring.matching.features</tt>. If that property is the single token
  * <tt>FILE</tt>, the names are instead read from the file named by
  * <tt>fat.featured.scoring.matching.features.file</tt>, one model per line, where
  * <tt>#</tt> introduces a comment.
  *
  * @return the array of model names
  * @throws Exception if the feature file cannot be read
  * @throws IllegalArgumentException if FILE is specified but no filename property is set
  */
 static String[] getModelNames() throws Exception {
   String[] modelNames =
       ArrayUtils.parseCommaDelimitedString(
           ApplicationSetup.getProperty("fat.featured.scoring.matching.features", ""));
   if (modelNames.length == 1 && modelNames[0].equals("FILE")) {
     String filename =
         ApplicationSetup.getProperty("fat.featured.scoring.matching.features.file", null);
     if (filename == null)
       throw new IllegalArgumentException(
           "fat.featured.scoring.matching.features.file must be set when features=FILE");
     filename = ApplicationSetup.makeAbsolute(filename, ApplicationSetup.TERRIER_ETC);
     final List<String> models = new ArrayList<String>();
     // try-with-resources: the reader was previously leaked if readLine() threw
     try (BufferedReader br = Files.openFileReader(filename)) {
       String line = null;
       while ((line = br.readLine()) != null) {
         // ignore lines starting with comments
         if (line.startsWith("#")) continue;
         // remove trailing comments ("#.*" also strips a bare trailing '#',
         // which the previous "#.+" pattern missed)
         line = line.replaceAll("#.*$", "").trim();
         // skip lines that are empty once comments are stripped, so that blank
         // lines do not become empty model names
         if (line.isEmpty()) continue;
         models.add(line);
       }
     }
     modelNames = models.toArray(new String[models.size()]);
   }
   return modelNames;
 }
  /**
   * load the expansion terms, as per the property <tt>qe.expansion.terms.class</tt>. Defaults to
   * DFRBagExpansionTerms.
   *
   * @return an ExpansionTerms instance, which may or may not wrap other ExpansionTerms instances
   */
  protected ExpansionTerms getExpansionTerms() {
    final String[] classNames =
        ApplicationSetup.getProperty("qe.expansion.terms.class", "DFRBagExpansionTerms")
            .split("\\s*,\\s*");
    ExpansionTerms chain = null;

    // Build the chain from the innermost (last-named) expander outwards: the innermost
    // one is constructed from the index data structures, each outer one wraps the chain.
    for (int idx = classNames.length - 1; idx >= 0; idx--) {
      String className = classNames[idx];
      // unqualified names live in org.terrier.querying; old package names are rewritten
      if (!className.contains(".")) {
        className = "org.terrier.querying." + className;
      } else if (className.startsWith("uk.ac.gla.terrier")) {
        className = className.replaceAll("uk.ac.gla.terrier", "org.terrier");
      }

      try {
        final Class<? extends ExpansionTerms> clz =
            Class.forName(className).asSubclass(ExpansionTerms.class);
        if (idx == classNames.length - 1) {
          chain =
              clz.getConstructor(
                      CollectionStatistics.class,
                      Lexicon.class,
                      PostingIndex.class,
                      DocumentIndex.class)
                  .newInstance(collStats, lexicon, directIndex, documentIndex);
        } else {
          chain = clz.getConstructor(ExpansionTerms.class).newInstance(chain);
        }
      } catch (Exception e) {
        logger.error("Errory during GetExpansionTerms", e);
        return null;
      }
    }
    return chain;
  }
// Example #11
  /**
   * Constructs an instance of TRECQuery, that reads and stores all the queries from the files
   * defined in the trec.topics property.
   */
  public TRECQuery() {
    try {
      String files[] =
          ArrayUtils.parseCommaDelimitedString(ApplicationSetup.getProperty("trec.topics", ""));
      assert files.length > 0;
      Vector<String> vecStringQueries = new Vector<String>();
      Vector<String> vecStringQueryIDs = new Vector<String>();
      Vector<String> vecStringFiles = new Vector<String>();
      for (int i = 0; i < files.length; i++) {
        // record only the files from which at least one query was extracted
        if (this.extractQuery(files[i], vecStringQueries, vecStringQueryIDs)) {
          vecStringFiles.add(files[i]);
        }
      }

      // BUG FIX: topicFiles was previously populated from vecStringQueries, i.e. the
      // query strings rather than the names of the topic files they came from.
      this.topicFiles = vecStringFiles.toArray(new String[0]);
      this.queries = vecStringQueries.toArray(new String[0]);
      this.query_ids = vecStringQueryIDs.toArray(new String[0]);
      this.index = 0;
    } catch (Exception ioe) {
      logger.error("Problem getting trec.topics property:", ioe);
      return;
    }
  }
/**
 * This class performs interactive querying at the command line. It asks for a query on Standard
 * Input, and then displays the document IDs that match the given query.
 *
 * <p><b>Properties:</b>
 *
 * <ul>
 *   <li><tt>interactive.model</tt> - which weighting model to use, defaults to PL2
 *   <li><tt>interactive.matching</tt> - which Matching class to use, defaults to Matching
 *   <li><tt>interactive.manager</tt> - which Manager class to use, defaults to Manager
 * </ul>
 *
 * @author Gianni Amati, Vassilis Plachouras, Ben He, Craig Macdonald
 */
public class InteractiveQuerying {
  /** The logger used */
  protected static final Logger logger = Logger.getLogger(InteractiveQuerying.class);

  /** Change to lowercase? Controlled by the <tt>lowercase</tt> property, defaults to true. */
  protected static final boolean lowercase =
      Boolean.parseBoolean(ApplicationSetup.getProperty("lowercase", "true"));
  /** display user prompts */
  protected boolean verbose = true;
  /** the number of processed queries. */
  protected int matchingCount = 0;
  /** The file to store the output to. */
  protected PrintWriter resultFile = new PrintWriter(System.out);
  /**
   * The name of the manager object that handles the queries. Set by property
   * <tt>interactive.manager</tt>, defaults to Manager.
   */
  protected String managerName = ApplicationSetup.getProperty("interactive.manager", "Manager");
  /** The query manager. */
  protected Manager queryingManager;
  /** The weighting model used. */
  protected String wModel = ApplicationSetup.getProperty("interactive.model", "PL2");
  /** The matching model used. */
  protected String mModel = ApplicationSetup.getProperty("interactive.matching", "Matching");
  /** The data structures used. */
  protected Index index;
  /** The maximum number of presented results. */
  protected static int RESULTS_LENGTH =
      Integer.parseInt(ApplicationSetup.getProperty("interactive.output.format.length", "1000"));

  /** Meta index keys whose values are printed for each result; defaults to docno. */
  protected String[] metaKeys =
      ApplicationSetup.getProperty("interactive.output.meta.keys", "docno").split("\\s*,\\s*");

  /** A default constructor initialises the index, and the Manager. */
  public InteractiveQuerying() {
    loadIndex();
    createManager();
  }

  /**
   * Create a querying manager. This method should be overriden if another matching model is
   * required.
   */
  protected void createManager() {
    try {
      // unqualified names live in org.terrier.querying; legacy package names are rewritten
      if (managerName.indexOf('.') == -1) managerName = "org.terrier.querying." + managerName;
      else if (managerName.startsWith("uk.ac.gla.terrier"))
        managerName = managerName.replaceAll("uk.ac.gla.terrier", "org.terrier");
      // instantiate the manager reflectively with the loaded index
      queryingManager =
          (Manager)
              (Class.forName(managerName)
                  .getConstructor(new Class[] {Index.class})
                  .newInstance(new Object[] {index}));
    } catch (Exception e) {
      logger.error("Problem loading Manager (" + managerName + "): ", e);
    }
  }

  /** Loads index(s) from disk. */
  protected void loadIndex() {
    long startLoading = System.currentTimeMillis();
    index = Index.createIndex();
    // note: execution continues with a null index after this fatal log
    if (index == null) {
      logger.fatal("Failed to load index. Perhaps index files are missing");
    }
    long endLoading = System.currentTimeMillis();
    if (logger.isInfoEnabled())
      logger.info("time to intialise index : " + ((endLoading - startLoading) / 1000.0D));
  }
  /** Closes the used structures. */
  public void close() {
    try {
      index.close();
    } catch (IOException ioe) {
      logger.warn("Problem closing index", ioe);
    }
  }
  /**
   * According to the given parameters, it sets up the correct matching class.
   *
   * @param queryId String the query identifier to use.
   * @param query String the query to process.
   * @param cParameter double the value of the parameter to use.
   */
  public void processQuery(String queryId, String query, double cParameter) {
    SearchRequest srq = queryingManager.newSearchRequest(queryId, query);
    srq.setControl("c", Double.toString(cParameter));
    srq.addMatchingModel(mModel, wModel);
    matchingCount++;
    // full retrieval pipeline: pre-process, match, post-process, post-filter
    queryingManager.runPreProcessing(srq);
    queryingManager.runMatching(srq);
    queryingManager.runPostProcessing(srq);
    queryingManager.runPostFilters(srq);
    try {
      printResults(resultFile, srq);
    } catch (IOException ioe) {
      logger.error("Problem displaying results", ioe);
    }
  }
  /**
   * Performs the matching using the specified weighting model from the setup and possibly a
   * combination of evidence mechanism. It parses the file with the queries (the name of the file is
   * defined in the address_query file), creates the file of results, and for each query, gets the
   * relevant documents, scores them, and outputs the results to the result file.
   *
   * @param cParameter the value of c
   */
  public void processQueries(double cParameter) {
    try {
      // prepare console input
      InputStreamReader consoleReader = new InputStreamReader(System.in);
      BufferedReader consoleInput = new BufferedReader(consoleReader);
      String query;
      int qid = 1;
      if (verbose) System.out.print("Please enter your query: ");
      // loop until EOF, an empty query, or an explicit quit/exit command
      while ((query = consoleInput.readLine()) != null) {
        if (query.length() == 0
            || query.toLowerCase().equals("quit")
            || query.toLowerCase().equals("exit")) {
          return;
        }
        processQuery("" + (qid++), lowercase ? query.toLowerCase() : query, cParameter);
        if (verbose) System.out.print("Please enter your query: ");
      }
    } catch (IOException ioe) {
      logger.error(
          "Input/Output exception while performing the matching. Stack trace follows.", ioe);
    }
  }
  /**
   * Prints the results
   *
   * @param pw PrintWriter the file to write the results to.
   * @param q SearchRequest the search request to get results from.
   */
  public void printResults(PrintWriter pw, SearchRequest q) throws IOException {
    ResultSet set = q.getResultSet();
    int[] docids = set.getDocids();
    double[] scores = set.getScores();
    int minimum = RESULTS_LENGTH;
    // if the minimum number of documents is more than the
    // number of documents in the results, aw.length, then
    // set minimum = aw.length
    if (minimum > set.getResultSize()) minimum = set.getResultSize();
    if (verbose)
      if (set.getResultSize() > 0)
        pw.write("\n\tDisplaying 1-" + set.getResultSize() + " results\n");
      else pw.write("\n\tNo results\n");
    if (set.getResultSize() == 0) return;

    // resolve each requested meta key to a column of values, preferring values
    // already attached to the result set over a meta index lookup
    int metaKeyId = 0;
    final int metaKeyCount = metaKeys.length;
    String[][] docNames = new String[metaKeyCount][];
    for (String metaIndexDocumentKey : metaKeys) {
      if (set.hasMetaItems(metaIndexDocumentKey)) {
        docNames[metaKeyId] = set.getMetaItems(metaIndexDocumentKey);
      } else {
        final MetaIndex metaIndex = index.getMetaIndex();
        docNames[metaKeyId] = metaIndex.getItems(metaIndexDocumentKey, docids);
      }
      metaKeyId++;
    }

    StringBuilder sbuffer = new StringBuilder();
    // NOTE(review): the loop prints entries [0, minimum) and skips non-positive
    // scores; the original comment here claimed ascending score order with the
    // best document last, which does not match this iteration — confirm ordering.
    int start = 0;
    int end = minimum;
    for (int i = start; i < end; i++) {
      if (scores[i] <= 0d) continue;
      sbuffer.append(i);
      sbuffer.append(" ");
      for (metaKeyId = 0; metaKeyId < metaKeyCount; metaKeyId++) {
        sbuffer.append(docNames[metaKeyId][i]);
        sbuffer.append(" ");
      }
      sbuffer.append(docids[i]);
      sbuffer.append(" ");
      sbuffer.append(scores[i]);
      sbuffer.append('\n');
    }
    pw.write(sbuffer.toString());
    pw.flush();
  }
  /**
   * Starts the interactive query application.
   *
   * @param args the command line arguments.
   */
  public static void main(String[] args) {
    InteractiveQuerying iq = new InteractiveQuerying();
    if (args.length == 0) {
      // no arguments: interactive prompt loop
      iq.processQueries(1.0);
    } else if (args.length == 1 && args[0].equals("--noverbose")) {
      iq.verbose = false;
      iq.processQueries(1.0);
    } else {
      // arguments form a single query, run once non-interactively
      iq.verbose = false;
      StringBuilder s = new StringBuilder();
      for (int i = 0; i < args.length; i++) {
        s.append(args[i]);
        s.append(" ");
      }
      iq.processQuery("CMDLINE", s.toString(), 1.0);
    }
  }
}
  /**
   * Runs the actual query expansion
   *
   * @see
   *     org.terrier.querying.PostProcess#process(org.terrier.querying.Manager,org.terrier.querying.SearchRequest)
   */
  public void process(Manager manager, SearchRequest q) {
    // cache the index data structures used by the expansion step
    Index index = getIndex(manager);
    lastIndex = index;
    documentIndex = index.getDocumentIndex();
    invertedIndex = index.getInvertedIndex();
    lexicon = index.getLexicon();
    collStats = index.getCollectionStatistics();
    directIndex = index.getDirectIndex();
    metaIndex = index.getMetaIndex();
    // QE needs the direct index to read the contents of feedback documents
    if (directIndex == null) {
      logger.error("This index does not have a direct index. Query expansion disabled!!");
      return;
    }
    logger.debug("Starting query expansion post-processing.");
    // get the query expansion model to use
    String qeModel = q.getControl("qemodel");
    if (qeModel == null || qeModel.length() == 0) {
      logger.warn(
          "qemodel control not set for QueryExpansion" + " post process. Using default model Bo1");
      qeModel = "Bo1";
    }
    setQueryExpansionModel(getQueryExpansionModel(qeModel));
    // NOTE(review): guarded by isDebugEnabled() but logged at info level — confirm intended
    if (logger.isDebugEnabled()) {
      logger.info("query expansion model: " + QEModel.getInfo());
    }
    MatchingQueryTerms queryTerms = ((Request) q).getMatchingQueryTerms();
    if (queryTerms == null) {
      logger.warn("No query terms for this query. Skipping QE");
      return;
    }
    // get the expanded query terms
    try {
      expandQuery(queryTerms, (Request) q);
    } catch (IOException ioe) {
      logger.error("IOException while expanding query, skipping QE", ioe);
      return;
    }
    if (logger.isDebugEnabled()) {
      logger.info("query length after expansion: " + queryTerms.length());
      logger.info("Expanded query: ");
    }
    // rebuild the query string as term^weight pairs from the expanded term set
    final String[] newQueryTerms = queryTerms.getTerms();
    StringBuilder newQuery = new StringBuilder();
    for (int i = 0; i < newQueryTerms.length; i++) {
      try {
        if (logger.isDebugEnabled()) {
          logger.info(
              (i + 1)
                  + ": "
                  + newQueryTerms[i]
                  + ", normalisedFrequency: "
                  + Rounding.toString(queryTerms.getTermWeight(newQueryTerms[i]), 4));
        }
        newQuery.append(newQueryTerms[i]);
        newQuery.append('^');
        newQuery.append(Rounding.toString(queryTerms.getTermWeight(newQueryTerms[i]), 9));
        newQuery.append(' ');
      } catch (NullPointerException npe) {
        logger.error("Nullpointer exception occured in Query Expansion dumping of new Query", npe);
      }
    }

    logger.debug("NEWQUERY " + q.getQueryID() + " " + newQuery.toString());
    lastExpandedQuery = newQuery.toString();
    q.setControl("QE.ExpandedQuery", newQuery.toString());
    // qe.no.2nd.matching suppresses the second retrieval pass over the expanded query
    final boolean no2ndPass =
        Boolean.parseBoolean(ApplicationSetup.getProperty("qe.no.2nd.matching", "false"));
    if (no2ndPass) {
      return;
    }

    // run retrieval process again for the expanded query
    logger.info("Accessing inverted file for expanded query " + q.getQueryID());
    manager.runMatching(q);
  }
  // TODO if this class extends BasicIndexer, then this method could be inherited
  /**
   * Builds the block direct index (and its lexicon, document index and meta index) over the
   * given collections, honouring field indexing, delimited-block indexing, per-builder
   * document limits and boundary documents, then registers the resulting structures on the
   * index and flushes it.
   *
   * @param collections the collections to index
   */
  public void createDirectIndex(Collection[] collections) {
    logger.info(
        "BlockIndexer creating direct index"
            + (Boolean.parseBoolean(
                    ApplicationSetup.getProperty("block.delimiters.enabled", "false"))
                ? " delimited-block indexing enabled"
                : ""));
    currentIndex = Index.createNewIndex(path, prefix);
    // field-aware structures are used whenever fields are configured
    lexiconBuilder =
        FieldScore.FIELDS_COUNT > 0
            ? new LexiconBuilder(
                currentIndex,
                "lexicon",
                new BlockFieldLexiconMap(FieldScore.FIELDS_COUNT),
                FieldLexiconEntry.class.getName())
            : new LexiconBuilder(
                currentIndex, "lexicon", new BlockLexiconMap(), BlockLexiconEntry.class.getName());
    try {
      directIndexBuilder =
          FieldScore.FIELDS_COUNT > 0
              ? new BlockFieldDirectInvertedOutputStream(
                  currentIndex.getPath()
                      + ApplicationSetup.FILE_SEPARATOR
                      + currentIndex.getPrefix()
                      + "."
                      + "direct"
                      + BitIn.USUAL_EXTENSION)
              : new BlockDirectInvertedOutputStream(
                  currentIndex.getPath()
                      + ApplicationSetup.FILE_SEPARATOR
                      + currentIndex.getPrefix()
                      + "."
                      + "direct"
                      + BitIn.USUAL_EXTENSION);
    } catch (IOException ioe) {
      // NOTE(review): indexing continues with a null directIndexBuilder after this error
      logger.error("Cannot make DirectInvertedOutputStream:", ioe);
    }
    docIndexBuilder = new DocumentIndexBuilder(currentIndex, "document");
    metaBuilder = createMetaIndexBuilder();
    emptyDocIndexEntry =
        (FieldScore.FIELDS_COUNT > 0)
            ? new FieldDocumentIndexEntry(FieldScore.FIELDS_COUNT)
            : new BasicDocumentIndexEntry();

    int numberOfDocuments = 0;
    final boolean boundaryDocsEnabled = BUILDER_BOUNDARY_DOCUMENTS.size() > 0;
    boolean stopIndexing = false;
    for (int collectionNo = 0; !stopIndexing && collectionNo < collections.length; collectionNo++) {
      Collection collection = collections[collectionNo];
      long startCollection = System.currentTimeMillis();
      boolean notLastDoc = false;
      while ((notLastDoc = collection.nextDocument())) {
        // get the next document from the collection
        Document doc = collection.getDocument();

        if (doc == null) continue;

        numberOfDocuments++;
        // setup for parsing
        createDocumentPostings();
        String term;
        numOfTokensInDocument = 0;
        numOfTokensInBlock = 0;
        blockId = 0;
        // get each term in the document
        while (!doc.endOfDocument()) {
          if ((term = doc.getNextTerm()) != null && !term.equals("")) {
            termFields = doc.getFields();
            // pass term into TermPipeline (stop, stem etc)
            pipeline_first.processTerm(term);
            // the term pipeline will eventually add the term to this
            // object.
          }
          // stop reading terms once the per-document token cap is exceeded
          if (MAX_TOKENS_IN_DOCUMENT > 0 && numOfTokensInDocument > MAX_TOKENS_IN_DOCUMENT) break;
        }
        // if we didn't index all tokens from document,
        // we need to get to the end of the document.
        while (!doc.endOfDocument()) doc.getNextTerm();
        // we now have all terms in the DocumentTree

        pipeline_first.reset();
        // process DocumentTree (tree of terms)
        try {
          if (termsInDocument.getDocumentLength() == 0) {
            // this document is empty, add the
            // minimum to the document index
            indexEmpty(doc.getAllProperties());
          } else {
              /* index this document */
            indexDocument(doc.getAllProperties(), termsInDocument);
          }
        } catch (Exception ioe) {
          logger.error("Failed to index " + doc.getProperty("docno"), ioe);
        }
        // stop early when the per-builder document cap is reached
        if (MAX_DOCS_PER_BUILDER > 0 && numberOfDocuments >= MAX_DOCS_PER_BUILDER) {
          stopIndexing = true;
          break;
        }

        // stop early when a configured boundary document is encountered
        if (boundaryDocsEnabled && BUILDER_BOUNDARY_DOCUMENTS.contains(doc.getProperty("docno"))) {
          stopIndexing = true;
          break;
        }
      }
      long endCollection = System.currentTimeMillis();
      long secs = ((endCollection - startCollection) / 1000);
      logger.info(
          "Collection #"
              + collectionNo
              + " took "
              + secs
              + "seconds to index "
              + "("
              + numberOfDocuments
              + " documents)\n");
      if (secs > 3600)
        logger.info(
            "Rate: " + ((double) numberOfDocuments / ((double) secs / 3600.0d)) + " docs/hour");

      // only close the collection if it was fully consumed (not stopped early)
      if (!notLastDoc) {
        try {
          collection.close();
        } catch (IOException e) {
          logger.warn("Couldnt close collection", e);
        }
      }
    }

    /* end of the collection has been reached */
    finishedDirectIndexBuild();
    // register the direct index structures (field-aware posting iterators when fields are on)
    currentIndex.addIndexStructure(
        "direct",
        "org.terrier.structures.BlockDirectIndex",
        "org.terrier.structures.Index,java.lang.String,java.lang.Class",
        "index,structureName,"
            + (FieldScore.FIELDS_COUNT > 0
                ? fieldDirectIndexPostingIteratorClass
                : basicDirectIndexPostingIteratorClass));
    currentIndex.addIndexStructureInputStream(
        "direct",
        "org.terrier.structures.BlockDirectIndexInputStream",
        "org.terrier.structures.Index,java.lang.String,java.lang.Class",
        "index,structureName,"
            + (FieldScore.FIELDS_COUNT > 0
                ? fieldDirectIndexPostingIteratorClass
                : basicDirectIndexPostingIteratorClass));
    currentIndex.setIndexProperty("index.direct.fields.count", "" + FieldScore.FIELDS_COUNT);
    currentIndex.setIndexProperty(
        "index.direct.fields.names", ArrayUtils.join(FieldScore.FIELD_NAMES, ","));
    if (FieldScore.FIELDS_COUNT > 0) {
      currentIndex.addIndexStructure(
          "document-factory",
          FieldDocumentIndexEntry.Factory.class.getName(),
          "java.lang.String",
          "${index.direct.fields.count}");
    } else {
      currentIndex.addIndexStructure(
          "document-factory", BasicDocumentIndexEntry.Factory.class.getName(), "", "");
    }
    /* flush the index buffers */
    directIndexBuilder.close();
    docIndexBuilder.finishedCollections();
    /* and then merge all the temporary lexicons */
    lexiconBuilder.finishedDirectIndexBuild();
    try {
      metaBuilder.close();
    } catch (IOException ioe) {
      logger.error("Could not finish MetaIndexBuilder: ", ioe);
    }
    if (FieldScore.FIELDS_COUNT > 0) {
      currentIndex.addIndexStructure(
          "lexicon-valuefactory",
          FieldLexiconEntry.Factory.class.getName(),
          "java.lang.String",
          "${index.direct.fields.count}");
    }
    /* reset the in-memory mapping of terms to term codes.*/
    TermCodes.reset();
    System.gc();
    try {
      currentIndex.flush();
    } catch (IOException ioe) {
      logger.error("Could not flush index properties: ", ioe);
    }
  }
// Example #15
/**
 * This class is used for reading the queries from TREC topic files.
 *
 * <p><b>Properties:</b>
 *
 * <ul>
 *   <li><tt>trecquery.ignore.desc.narr.name.tokens</tt> - should the tokens DESCRIPTION and
 *       NARRATIVE in the desc and narr fields be ignored? Defaults to true
 *   <li><tt>tokeniser</tt> - name of the Tokeniser class to use to tokenise topics. Defaults to
 *       EnglishTokeniser.
 *   <li><tt>trec.encoding</tt> - use to set the encoding of TREC topic files. Defaults to the
 *       system's default encoding.
 * </ul>
 *
 * @author Ben He &amp; Craig Macdonald
 */
public class TRECQuery implements QuerySource {
  /** The logger used for this class */
  protected static final Logger logger = Logger.getLogger(TRECQuery.class);

  /**
   * Value of <tt>trecquery.ignore.desc.narr.name.tokens</tt> - should the tokens DESCRIPTION and
   * NARRATIVE in the desc and narr fields be ignored? Defaults to true.
   */
  protected static final boolean IGNORE_DESC_NARR_NAME_TOKENS =
      Boolean.parseBoolean(
          ApplicationSetup.getProperty("trecquery.ignore.desc.narr.name.tokens", "true"));

  /** Encoding to be used to open all files. */
  protected static String desiredEncoding =
      ApplicationSetup.getProperty("trec.encoding", Charset.defaultCharset().name());

  /** The topic files used in this object */
  protected String[] topicFiles;

  /** The queries in the topic files. */
  protected String[] queries;

  /** The query identifiers in the topic files. */
  protected String[] query_ids;

  /** The index of the queries - cursor for the hasNext()/next() iteration. */
  protected int index;

  /**
   * Extracts and stores all the queries from query files.
   *
   * @param queryfilenames String the name of files containing topics.
   * @param vecStringQueries Vector a vector containing the queries as strings.
   * @param vecStringIds Vector a vector containing the query identifiers as strings.
   * @return boolean true if some queries were successfully extracted.
   */
  public boolean extractQuery(
      String[] queryfilenames, Vector<String> vecStringQueries, Vector<String> vecStringIds) {
    boolean rtn = false;
    for (int i = 0; i < queryfilenames.length; i++) {
      // true if ANY file yields at least one topic
      if (extractQuery(queryfilenames[i], vecStringQueries, vecStringIds)) rtn = true;
    }
    return rtn;
  }

  /**
   * Extracts and stores all the queries from a query file.
   *
   * @param queryfilename String the name of a file containing topics.
   * @param vecStringQueries Vector a vector containing the queries as strings.
   * @param vecStringIds Vector a vector containing the query identifiers as strings.
   * @return boolean true if some queries were successfully extracted.
   */
  public boolean extractQuery(
      String queryfilename, Vector<String> vecStringQueries, Vector<String> vecStringIds) {
    boolean gotSome = false;
    try {
      BufferedReader br;
      if (!Files.exists(queryfilename) || !Files.canRead(queryfilename)) {
        logger.error("The topics file " + queryfilename + " does not exist, or it cannot be read.");
        return false;
      } else {
        br = Files.openFileReader(queryfilename, desiredEncoding);
        TRECFullTokenizer queryTokenizer =
            new TRECFullTokenizer(
                new TagSet(TagSet.TREC_QUERY_TAGS), new TagSet(TagSet.EMPTY_TAGS), br);
        queryTokenizer.setIgnoreMissingClosingTags(true);
        while (!queryTokenizer.isEndOfFile()) {
          String docnoToken = null;
          StringBuilder query = new StringBuilder();
          boolean seenDescriptionToken = !IGNORE_DESC_NARR_NAME_TOKENS;
          boolean seenNarrativeToken = !IGNORE_DESC_NARR_NAME_TOKENS;
          while (!queryTokenizer.isEndOfDocument()) {
            String token = queryTokenizer.nextToken();
            if (token == null || token.length() == 0 || queryTokenizer.inTagToSkip()) continue;

            if (queryTokenizer.inDocnoTag()) {
              // The tokenizer is constructed from the trimmed version of the contents
              // of the query number tag, so that the last token extracted from it, is
              // always the query number, and not an empty string
              StringTokenizer docnoTokens = new StringTokenizer(token.trim(), " ");
              while (docnoTokens.hasMoreTokens()) docnoToken = docnoTokens.nextToken().trim();
            } else if (queryTokenizer.inTagToProcess()) {
              // Removed the code that checks if "description" and
              // "narrative" appear in "desc" and "narr", respective.
              // THIS WILL HURT THE RETRIEVAL PERFORMANCE. Therefore,
              // it is recommended to add these words in the stopword
              // list.
              if (!seenDescriptionToken
                  && queryTokenizer.currentTag().toUpperCase().equals("DESC")
                  && token.toUpperCase().equals("DESCRIPTION")) continue;
              if (!seenNarrativeToken
                  && queryTokenizer.currentTag().toUpperCase().equals("NARR")
                  && token.toUpperCase().equals("NARRATIVE")) continue;
              query.append(token);
              query.append(' ');
            }
          }
          queryTokenizer.nextDocument();
          if (query.length() == 0) continue;
          // guard: a topic with text but no number tag would otherwise NPE on trim()
          if (docnoToken == null) {
            logger.warn(
                "Skipping a topic without an identifier tag in file " + queryfilename);
            continue;
          }
          vecStringQueries.add(query.toString().trim());
          vecStringIds.add(docnoToken.trim());

          gotSome = true;
        }
        // after processing each query file, close the BufferedReader
        br.close();
      }

    } catch (IOException ioe) {
      logger.error(
          "Input/Output exception while extracting queries from the topic file named "
              + queryfilename,
          ioe);
    }
    return gotSome;
  }

  /**
   * Constructs an instance of TRECQuery, that reads and stores all the queries from the files
   * defined in the trec.topics property.
   */
  public TRECQuery() {
    try {
      String files[] =
          ArrayUtils.parseCommaDelimitedString(ApplicationSetup.getProperty("trec.topics", ""));
      assert files.length > 0;
      Vector<String> vecStringQueries = new Vector<String>();
      Vector<String> vecStringQueryIDs = new Vector<String>();
      Vector<String> vecStringFiles = new Vector<String>();
      for (int i = 0; i < files.length; i++) {
        if (this.extractQuery(files[i], vecStringQueries, vecStringQueryIDs)) {
          vecStringFiles.add(files[i]);
        }
      }

      // BUGFIX: previously assigned vecStringQueries (the query strings) to topicFiles,
      // leaving vecStringFiles unused and getInfo() returning queries instead of file names.
      this.topicFiles = vecStringFiles.toArray(new String[0]);
      store(vecStringQueries, vecStringQueryIDs);
    } catch (Exception ioe) {
      logger.error("Problem getting trec.topics property:", ioe);
      return;
    }
  }

  /**
   * Constructs an instance of TRECQuery that reads and stores all the queries from the specified
   * query file.
   *
   * @param queryfile File the file containing the queries.
   */
  public TRECQuery(File queryfile) {
    // NOTE(review): getName() drops any directory component of the path; getPath() may have
    // been intended — confirm against callers before changing.
    this(queryfile.getName());
  }

  /**
   * Constructs an instance of TRECQuery that reads and stores all the queries from the specified
   * query files.
   *
   * @param queryfiles File the file containing the queries.
   */
  public TRECQuery(File[] queryfiles) {
    Vector<String> vecStringQueries = new Vector<String>();
    Vector<String> vecStringQueryIDs = new Vector<String>();
    String[] files = new String[queryfiles.length];
    for (int i = 0; i < queryfiles.length; i++) files[i] = queryfiles[i].getName();
    if (this.extractQuery(files, vecStringQueries, vecStringQueryIDs)) this.topicFiles = files;
    checkParsedTopicFiles();
    store(vecStringQueries, vecStringQueryIDs);
  }

  /**
   * Constructs an instance of TRECQuery that reads and stores all the queries from a file with the
   * specified filename.
   *
   * @param queryfilename String the name of the file containing all the queries.
   */
  public TRECQuery(String queryfilename) {
    Vector<String> vecStringQueries = new Vector<String>();
    Vector<String> vecStringQueryIDs = new Vector<String>();
    if (this.extractQuery(queryfilename, vecStringQueries, vecStringQueryIDs))
      this.topicFiles = new String[] {queryfilename};
    checkParsedTopicFiles();
    store(vecStringQueries, vecStringQueryIDs);
  }

  /**
   * Constructs an instance of TRECQuery that reads and stores all the queries from files with the
   * specified filenames.
   *
   * @param queryfilenames String[] the name of the files containing all the queries.
   */
  public TRECQuery(String[] queryfilenames) {
    Vector<String> vecStringQueries = new Vector<String>();
    Vector<String> vecStringQueryIDs = new Vector<String>();
    if (this.extractQuery(queryfilenames, vecStringQueries, vecStringQueryIDs))
      this.topicFiles = queryfilenames;
    checkParsedTopicFiles();
    store(vecStringQueries, vecStringQueryIDs);
  }

  /** Logs an error if no topic file could be parsed (i.e. topicFiles is still null). */
  private void checkParsedTopicFiles() {
    if (topicFiles == null)
      logger.error(
          "Topic files were specified, but none could be parsed correctly to obtain any topics."
              + " Check you have the correct topic files specified, and that TrecQueryTags properties are correct.");
  }

  /** Copies the extracted queries and identifiers into this instance and resets the cursor. */
  private void store(Vector<String> vecStringQueries, Vector<String> vecStringQueryIDs) {
    this.queries = vecStringQueries.toArray(new String[0]);
    this.query_ids = vecStringQueryIDs.toArray(new String[0]);
    this.index = 0;
  }

  /**
   * Returns the index of the last obtained query.
   *
   * @return int the index of the last obtained query.
   */
  public int getIndexOfCurrentQuery() {
    return index - 1;
  }

  /**
   * Returns the number of the queries read from the processed topic files.
   *
   * @return int the number of topics contained in the processed topic files.
   */
  public int getNumberOfQueries() {
    return queries.length;
  }

  /** Returns the filenames of the topic files from which the queries were extracted */
  public String[] getInfo() {
    return this.topicFiles;
  }

  /** @deprecated use {@link #getInfo()} instead */
  @Deprecated
  public String[] getTopicFilenames() {
    return getInfo();
  }

  /**
   * Return the query for the given query number.
   *
   * @return String the string representing the query.
   * @param queryNo String The number of a query.
   */
  public String getQuery(String queryNo) {
    for (int i = 0; i < query_ids.length; i++) if (query_ids[i].equals(queryNo)) return queries[i];
    return null;
  }

  /**
   * Test if there are more queries to process.
   *
   * @return boolean true if there are more queries to process, otherwise returns false.
   * @deprecated use {@link #hasNext()} instead
   */
  @Deprecated
  public boolean hasMoreQueries() {
    return hasNext();
  }

  /** {@inheritDoc} */
  public boolean hasNext() {
    return index != queries.length;
  }

  /**
   * Returns a query.
   *
   * @return String the next query.
   * @deprecated use {@link #next()} instead
   */
  @Deprecated
  public String nextQuery() {
    return next();
  }

  /** {@inheritDoc} */
  public String next() {
    if (index == queries.length) return null;
    return queries[index++];
  }

  /** {@inheritDoc} */
  public String getQueryId() {
    // before the first next() call, report the first id rather than indexing at -1
    return query_ids[index == 0 ? 0 : index - 1];
  }

  /**
   * Returns the query ids
   *
   * @return String array containing the query ids.
   * @since 2.2
   */
  public String[] getQueryIds() {
    return query_ids;
  }

  /**
   * Returns the queries in an array of strings
   *
   * @return String[] an array containing the strings that represent the queries.
   */
  public String[] toArray() {
    return queries.clone();
  }

  /** {@inheritDoc} */
  public void reset() {
    this.index = 0;
  }

  /** {@inheritDoc} */
  public void remove() {
    throw new UnsupportedOperationException();
  }

  /**
   * Reads the topic file named by args[0] and prints each query id and query to stdout.
   *
   * @param args args[0] is the topic file to parse
   */
  public static void main(String[] args) {
    TRECQuery source = new TRECQuery(args[0]);
    while (source.hasNext()) {
      String query = source.next();
      String id = source.getQueryId();
      System.out.println(id + ": " + query);
    }
  }
}