@Override
  public ScoredDocuments rerank(ScoredDocuments docs, RerankerContext context) {
    IndexReader reader = context.getIndexSearcher().getIndexReader();

    for (int i = 0; i < docs.documents.length; i++) {
      Terms terms = null;
      try {
        terms = reader.getTermVector(docs.ids[i], StatusField.TEXT.name);
      } catch (IOException e) {
        continue;
      }

      String qid = context.getQueryId().replaceFirst("^MB0*", "");
      String docid = docs.documents[i].getField(StatusField.ID.name).stringValue();

      out.print(qrels.getRelevanceGrade(qid, docid));
      out.print(" qid:" + qid);
      out.print(" 1:" + docs.scores[i]);

      float[] intFeatures = this.extractorChain.extractAll(docs.documents[i], terms, context);

      for (int j = 0; j < intFeatures.length; j++) {
        out.print(" " + (j + 2) + ":" + intFeatures[j]);
      }

      out.print(" # docid:" + docid);
      out.print("\n");
    }

    return docs;
  }
Esempio n. 2
0
  /**
   * We will implement this according to the Lucene specification the formula used: sum ( IDF(qi) *
   * (df(qi,D) * (k+1)) / (df(qi,D) + k * (1-b + b*|D| / avgFL)) IDF and avgFL computation are
   * described above.
   *
   * @param doc
   * @param terms
   * @param context
   * @return
   */
  @Override
  public float extract(Document doc, Terms terms, RerankerContext context) {
    Set<String> queryTokens = new HashSet<>(context.getQueryTokens());

    TermsEnum termsEnum = null;
    try {
      termsEnum = terms.iterator();
    } catch (IOException e) {
      LOG.warn("Error computing BM25, unable to retrieve terms enum");
      return 0.0f;
    }

    IndexReader reader = context.getIndexSearcher().getIndexReader();
    long maxDocs = reader.numDocs();
    long sumTotalTermFreq = getSumTermFrequency(reader, context.getField());
    // Compute by iterating
    long docSize = 0L;

    // NOTE df cannot be retrieved just from the term vector,
    // the term vector here is only a partial term vector that treats this as if we only have 1
    // document in the index
    Map<String, Integer> docFreqMap = null;
    try {
      docFreqMap = getDocFreqs(reader, context.getQueryTokens(), context.getField());
    } catch (IOException e) {
      LOG.warn("Unable to retrieve document frequencies.");
      docFreqMap = new HashMap<>();
    }

    Map<String, Long> termFreqMap = new HashMap<>();
    try {
      while (termsEnum.next() != null) {
        String termString = termsEnum.term().utf8ToString();
        docSize += termsEnum.totalTermFreq();
        if (queryTokens.contains(termString)) {
          termFreqMap.put(termString, termsEnum.totalTermFreq());
        }
      }
    } catch (IOException e) {
      LOG.warn("Unable to retrieve termsEnum, treating as 0");
    }

    float score = 0.0f;
    // Iterate over the query tokens
    double avgFL = computeAvgFL(sumTotalTermFreq, maxDocs);
    for (String token : queryTokens) {
      long docFreq = docFreqMap.containsKey(token) ? docFreqMap.get(token) : 0;
      double termFreq = termFreqMap.containsKey(token) ? termFreqMap.get(token) : 0;
      double numerator = (this.k1 + 1) * termFreq;
      double docLengthFactor = this.b * (docSize / avgFL);
      double denominator = termFreq + (this.k1) * (1 - this.b + docLengthFactor);
      score += computeIDF(docFreq, maxDocs) * numerator / denominator;
    }

    return score;
  }