Example #1
  /**
   * Builds the co-occurrence map by iterating over the documents of the collection. It counts
   * document-level co-occurrence, i.e. it does not take into account how often two terms appear
   * within the same document. Complexity: O(d * t^2 / 2) = O(d t^2), where d is the number of
   * documents in the collection and t is the average number of terms per document (i.e. the
   * average document length).
   */
  public void build_full_cooccurencemap_docversion() throws IOException {
    PostingIndex di = index.getDirectIndex();
    DocumentIndex doi = index.getDocumentIndex();
    Lexicon<String> lex = index.getLexicon();
    for (int docid = 0; docid < doi.getNumberOfDocuments(); docid++) {
      if (docid % 1000 == 0)
        System.out.println(
            "Processing... " + 100.0 * ((double) docid) / doi.getNumberOfDocuments() + "%");
      IterablePosting postings = di.getPostings(doi.getDocumentEntry(docid));
      Vector<String> seenterms = new Vector<String>();
      while (postings.next() != IterablePosting.EOL) {
        Map.Entry<String, LexiconEntry> lee = lex.getLexiconEntry(postings.getId());
        String termw = lee.getKey();
        if (lee.getValue().getFrequency() < this.rarethreshold
            || lee.getValue().getFrequency() > this.topthreshold) continue;

        // fetch (or create) the co-occurrence counts already accumulated for termw
        HashMap<String, Integer> w_cooccurence = new HashMap<String, Integer>();
        if (this.cooccurencemap.containsKey(termw)) {
          w_cooccurence = this.cooccurencemap.get(termw);
          this.cooccurencemap.remove(termw);
        }
        Iterator<String> it = seenterms.iterator();
        while (it.hasNext()) {
          String termu = it.next();
          int count = 1;
          if (w_cooccurence.containsKey(termu)) {
            count = count + w_cooccurence.get(termu);
            w_cooccurence.remove(termu);
          }
          w_cooccurence.put(termu, count);

          // update the symmetric entry: termu -> termw
          HashMap<String, Integer> u_cooccurence = new HashMap<String, Integer>();
          if (cooccurencemap.containsKey(termu)) {
            u_cooccurence = cooccurencemap.get(termu);
            cooccurencemap.remove(termu);
          }
          int countu = 1;
          if (u_cooccurence.containsKey(termw)) {
            countu = countu + u_cooccurence.get(termw);
            u_cooccurence.remove(termw);
          }
          u_cooccurence.put(termw, countu);
          cooccurencemap.put(termu, u_cooccurence);
        }

        cooccurencemap.put(termw, w_cooccurence);
        seenterms.add(termw); // only terms within the frequency thresholds are added to seenterms
      }
    }
  }
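For each term within the frequency thresholds, the method above stores a nested map from every co-occurring term to the number of documents in which the two appear together. As a minimal sketch of how that structure could be consulted (a hypothetical helper, not part of the original class, assuming cooccurencemap is a HashMap<String, HashMap<String, Integer>> field), a lookup might read:

  /** Hypothetical helper (illustration only): returns the number of documents in which
      termw and termu co-occur, or 0 if the pair was never counted. */
  public int getCooccurrenceCount(String termw, String termu) {
    HashMap<String, Integer> w_cooccurence = this.cooccurencemap.get(termw);
    if (w_cooccurence == null) return 0; // termw never indexed, or outside the thresholds
    Integer count = w_cooccurence.get(termu);
    return count == null ? 0 : count;
  }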
Example #2
  /**
   * Runs the actual query expansion
   *
   * @see
   *     org.terrier.querying.PostProcess#process(org.terrier.querying.Manager,org.terrier.querying.SearchRequest)
   */
  public void process(Manager manager, SearchRequest q) {
    Index index = getIndex(manager);
    lastIndex = index;
    documentIndex = index.getDocumentIndex();
    invertedIndex = index.getInvertedIndex();
    lexicon = index.getLexicon();
    collStats = index.getCollectionStatistics();
    directIndex = index.getDirectIndex();
    metaIndex = index.getMetaIndex();
    if (directIndex == null) {
      logger.error("This index does not have a direct index. Query expansion disabled!!");
      return;
    }
    logger.debug("Starting query expansion post-processing.");
    // get the query expansion model to use
    String qeModel = q.getControl("qemodel");
    if (qeModel == null || qeModel.length() == 0) {
      logger.warn(
          "qemodel control not set for QueryExpansion" + " post process. Using default model Bo1");
      qeModel = "Bo1";
    }
    setQueryExpansionModel(getQueryExpansionModel(qeModel));
    if (logger.isDebugEnabled()) {
      logger.info("query expansion model: " + QEModel.getInfo());
    }
    MatchingQueryTerms queryTerms = ((Request) q).getMatchingQueryTerms();
    if (queryTerms == null) {
      logger.warn("No query terms for this query. Skipping QE");
      return;
    }
    // get the expanded query terms
    try {
      expandQuery(queryTerms, (Request) q);
    } catch (IOException ioe) {
      logger.error("IOException while expanding query, skipping QE", ioe);
      return;
    }
    if (logger.isDebugEnabled()) {
      logger.info("query length after expansion: " + queryTerms.length());
      logger.info("Expanded query: ");
    }
    final String[] newQueryTerms = queryTerms.getTerms();
    StringBuilder newQuery = new StringBuilder();
    for (int i = 0; i < newQueryTerms.length; i++) {
      try {
        if (logger.isDebugEnabled()) {
          logger.info(
              (i + 1)
                  + ": "
                  + newQueryTerms[i]
                  + ", normalisedFrequency: "
                  + Rounding.toString(queryTerms.getTermWeight(newQueryTerms[i]), 4));
        }
        newQuery.append(newQueryTerms[i]);
        newQuery.append('^');
        newQuery.append(Rounding.toString(queryTerms.getTermWeight(newQueryTerms[i]), 9));
        newQuery.append(' ');
      } catch (NullPointerException npe) {
        logger.error("Nullpointer exception occured in Query Expansion dumping of new Query", npe);
      }
    }

    logger.debug("NEWQUERY " + q.getQueryID() + " " + newQuery.toString());
    lastExpandedQuery = newQuery.toString();
    q.setControl("QE.ExpandedQuery", newQuery.toString());
    final boolean no2ndPass =
        Boolean.parseBoolean(ApplicationSetup.getProperty("qe.no.2nd.matching", "false"));
    if (no2ndPass) {
      return;
    }

    // run retrieval process again for the expanded query
    logger.info("Accessing inverted file for expanded query " + q.getQueryID());
    manager.runMatching(q);
  }
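The loop above serialises the expanded query as space-separated term^weight tokens (weights rounded to 9 decimal places) and stores the result in the QE.ExpandedQuery control before the second matching pass. As a rough illustration (a hypothetical helper, not part of the Terrier API), that string could be turned back into term/weight pairs like this:

  /** Hypothetical helper (illustration only): parses the "term^weight term^weight ..." string
      written into the QE.ExpandedQuery control back into an ordered term-to-weight map. */
  public static java.util.Map<String, Double> parseExpandedQuery(String expandedQuery) {
    java.util.Map<String, Double> weights = new java.util.LinkedHashMap<String, Double>();
    for (String token : expandedQuery.trim().split("\\s+")) {
      int caret = token.lastIndexOf('^');
      if (caret <= 0) continue; // skip tokens without a term^weight structure
      weights.put(token.substring(0, caret), Double.parseDouble(token.substring(caret + 1)));
    }
    return weights;
  }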