Example #1
 @Override
 public void createWeight(QueryContext context, IndexSearcher searcher) throws IOException {
   TFIDFSimilarity sim = asTFIDF(searcher.getSimilarity(), field);
   if (sim == null) {
     throw new UnsupportedOperationException(
         "requires a TFIDFSimilarity (such as DefaultSimilarity)");
   }
   int docfreq = searcher.getIndexReader().docFreq(new Term(indexedField, indexedBytes));
   float idf = sim.idf(docfreq, searcher.getIndexReader().maxDoc());
   FuncValues result = new ConstDoubleDocValues(idf, this);
   context.put(this, result);
 }
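A note on Example #1: the cached FuncValues holds a single idf constant for the term. As a minimal standalone sketch (not Lucene code; the class name here is made up), the formula DefaultSimilarity uses is 1 + ln(numDocs / (docFreq + 1)):

public class IdfSketch {
  // idf(docFreq, numDocs) = 1 + ln(numDocs / (docFreq + 1)), the DefaultSimilarity formula
  static float idf(long docFreq, long numDocs) {
    return (float) (Math.log(numDocs / (double) (docFreq + 1)) + 1.0);
  }

  public static void main(String[] args) {
    // A term found in 10 of 1000 docs is rarer, so it scores higher than one found in 500 of 1000.
    System.out.println(idf(10, 1000));  // ~5.51
    System.out.println(idf(500, 1000)); // ~1.69
  }
}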
Example #2
  /**
   * Create a PriorityQueue from a word->tf map.
   *
   * @param words a map of words keyed on the word(String) with Int objects as the values.
   * @param fieldNames an array of field names to override defaults.
   */
  private PriorityQueue<ScoreTerm> createQueue(Map<String, Int> words, String... fieldNames)
      throws IOException {
    // have collected all words in doc and their freqs
    int numDocs = ir.numDocs();
    final int limit = Math.min(maxQueryTerms, words.size());
    FreqQ queue = new FreqQ(limit); // will order words by score

    for (String word : words.keySet()) { // for every word
      int tf = words.get(word).x; // term freq in the source doc
      if (minTermFreq > 0 && tf < minTermFreq) {
        continue; // filter out words that don't occur enough times in the source
      }

      // go through all the fields and find the largest document frequency
      String topField = fieldNames[0];
      int docFreq = 0;
      for (String fieldName : fieldNames) {
        int freq = ir.docFreq(new Term(fieldName, word));
        topField = (freq > docFreq) ? fieldName : topField;
        docFreq = (freq > docFreq) ? freq : docFreq;
      }

      if (minDocFreq > 0 && docFreq < minDocFreq) {
        continue; // filter out words that don't occur in enough docs
      }

      if (docFreq > maxDocFreq) {
        continue; // filter out words that occur in too many docs
      }

      if (docFreq == 0) {
        continue; // index update problem?
      }

      float idf = similarity.idf(docFreq, numDocs);
      float score = tf * idf;

      if (queue.size() < limit) {
        // there is still space in the queue
        queue.add(new ScoreTerm(word, topField, score, idf, docFreq, tf));
      } else {
        ScoreTerm term = queue.top();
        if (term.score < score) { // update the smallest in the queue in place and update the queue.
          term.update(word, topField, score, idf, docFreq, tf);
          queue.updateTop();
        }
      }
    }
    return queue;
  }
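Example #2 (and Example #3 below) relies on a bounded top-k pattern: a fixed-size min-heap whose weakest entry is replaced whenever a better-scoring term shows up. A standalone sketch of that pattern using java.util.PriorityQueue (FreqQ is Lucene's own heap; the names below are illustrative only):

import java.util.PriorityQueue;

// Illustrative only: the bounded "keep the top-k terms by score" pattern used above.
public class TopKSketch {
  record Scored(String term, float score) {}

  public static void main(String[] args) {
    int limit = 3;
    // Min-heap ordered by score, so the weakest surviving candidate sits at the head.
    PriorityQueue<Scored> queue =
        new PriorityQueue<>((a, b) -> Float.compare(a.score(), b.score()));
    float[] scores = {2.0f, 5.0f, 1.0f, 7.0f, 3.0f};
    for (int i = 0; i < scores.length; i++) {
      Scored candidate = new Scored("term" + i, scores[i]);
      if (queue.size() < limit) {
        queue.add(candidate); // still room in the queue
      } else if (queue.peek().score() < candidate.score()) {
        queue.poll();         // evict the current weakest...
        queue.add(candidate); // ...and keep the better candidate
      }
    }
    System.out.println(queue); // the three highest-scoring candidates (heap order, weakest at the head)
  }
}

Lucene's own PriorityQueue instead updates the top element in place and calls updateTop(), which avoids the allocation of a new entry on every replacement.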
Example #3
  /**
   * Create a PriorityQueue from a word-&gt;tf map.
   *
   * @param perFieldTermFrequencies a per field map of words keyed on the word(String) with Int
   *     objects as the values.
   */
  private PriorityQueue<ScoreTerm> createQueue(
      Map<String, Map<String, Int>> perFieldTermFrequencies) throws IOException {
    // have collected all words in doc and their freqs
    int numDocs = ir.numDocs();
    final int limit = Math.min(maxQueryTerms, this.getTermsCount(perFieldTermFrequencies));
    FreqQ queue = new FreqQ(limit); // will order words by score
    for (Map.Entry<String, Map<String, Int>> entry : perFieldTermFrequencies.entrySet()) {
      Map<String, Int> perWordTermFrequencies = entry.getValue();
      String fieldName = entry.getKey();

      for (Map.Entry<String, Int> tfEntry : perWordTermFrequencies.entrySet()) { // for every word
        String word = tfEntry.getKey();
        int tf = tfEntry.getValue().x; // term freq in the source doc
        if (minTermFreq > 0 && tf < minTermFreq) {
          continue; // filter out words that don't occur enough times in the source
        }

        int docFreq = ir.docFreq(new Term(fieldName, word));

        if (minDocFreq > 0 && docFreq < minDocFreq) {
          continue; // filter out words that don't occur in enough docs
        }

        if (docFreq > maxDocFreq) {
          continue; // filter out words that occur in too many docs
        }

        if (docFreq == 0) {
          continue; // index update problem?
        }

        float idf = similarity.idf(docFreq, numDocs);
        float score = tf * idf;

        if (queue.size() < limit) {
          // there is still space in the queue
          queue.add(new ScoreTerm(word, fieldName, score, idf, docFreq, tf));
        } else {
          ScoreTerm term = queue.top();
          if (term.score < score) { // update the smallest in the queue in place and update the queue.
            term.update(word, fieldName, score, idf, docFreq, tf);
            queue.updateTop();
          }
        }
      }
    }
    return queue;
  }
Example #4
  public void testHyperbolicSweetSpot() {

    SweetSpotSimilarity ss =
        new SweetSpotSimilarity() {
          @Override
          public float tf(int freq) {
            return hyperbolicTf(freq);
          }
        };
    ss.setHyperbolicTfFactors(3.3f, 7.7f, Math.E, 5.0f);

    TFIDFSimilarity s = ss;

    for (int i = 1; i <= 1000; i++) {
      assertTrue("MIN tf: i=" + i + " : s=" + s.tf(i), 3.3f <= s.tf(i));
      assertTrue("MAX tf: i=" + i + " : s=" + s.tf(i), s.tf(i) <= 7.7f);
    }
    assertEquals("MID tf", 3.3f + (7.7f - 3.3f) / 2.0f, s.tf(5), 0.00001f);

    // stupidity
    assertEquals("tf zero", 0.0f, s.tf(0), 0.0f);
  }
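The assertions in Example #4 only pin down the shape of hyperbolicTf: values stay within [min, max], the curve crosses the midpoint at freq == xoffset, and tf(0) is 0. A standalone sketch of a tanh-style curve with exactly those properties, using the same factors as the test; treat the exact expression as an assumption for illustration, not necessarily SweetSpotSimilarity's implementation:

// Illustrative only: a curve bounded by (min, max) with its midpoint at xoffset.
public class HyperbolicTfSketch {
  static float hyperbolicTf(float min, float max, double base, float xoffset, int freq) {
    if (freq == 0) {
      return 0.0f; // matches the "tf zero" assertion above
    }
    double x = freq - xoffset;
    // tanh-like term in the chosen base: ranges over (-1, 1), keeping the result inside (min, max)
    double t = (Math.pow(base, x) - Math.pow(base, -x)) / (Math.pow(base, x) + Math.pow(base, -x));
    if (Double.isNaN(t)) {
      t = x > 0 ? 1.0 : -1.0; // pow overflowed for large |x|; the curve has already saturated
    }
    return (float) (min + (max - min) / 2.0 * (t + 1.0));
  }

  public static void main(String[] args) {
    // Same factors as the test: min=3.3, max=7.7, base=e, xoffset=5
    System.out.println(hyperbolicTf(3.3f, 7.7f, Math.E, 5.0f, 1));    // just above 3.3
    System.out.println(hyperbolicTf(3.3f, 7.7f, Math.E, 5.0f, 5));    // 5.5, the midpoint
    System.out.println(hyperbolicTf(3.3f, 7.7f, Math.E, 5.0f, 1000)); // saturated at 7.7
  }
}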
Example #5
  public void testSweetSpotTf() {

    SweetSpotSimilarity ss = new SweetSpotSimilarity();

    TFIDFSimilarity d = new DefaultSimilarity();
    TFIDFSimilarity s = ss;

    // tf equal

    ss.setBaselineTfFactors(0.0f, 0.0f);

    for (int i = 1; i < 1000; i++) {
      assertEquals("tf: i=" + i, d.tf(i), s.tf(i), 0.0f);
    }

    // tf higher

    ss.setBaselineTfFactors(1.0f, 0.0f);

    for (int i = 1; i < 1000; i++) {
      assertTrue("tf: i=" + i + " : d=" + d.tf(i) + " < s=" + s.tf(i), d.tf(i) < s.tf(i));
    }

    // tf flat

    ss.setBaselineTfFactors(1.0f, 6.0f);
    for (int i = 1; i <= 6; i++) {
      assertEquals("tf flat1: i=" + i, 1.0f, s.tf(i), 0.0f);
    }
    ss.setBaselineTfFactors(2.0f, 6.0f);
    for (int i = 1; i <= 6; i++) {
      assertEquals("tf flat2: i=" + i, 2.0f, s.tf(i), 0.0f);
    }
    for (int i = 6; i <= 1000; i++) {
      assertTrue("tf: i=" + i + " : s=" + s.tf(i) + " < d=" + d.tf(i), s.tf(i) < d.tf(i));
    }

    // stupidity
    assertEquals("tf zero", 0.0f, s.tf(0), 0.0f);
  }
Example #6
  private void addTerms(IndexReader reader, FieldVals f, ScoreTermQueue q) throws IOException {
    if (f.queryString == null) return;
    final Terms terms = MultiFields.getTerms(reader, f.fieldName);
    if (terms == null) {
      return;
    }
    try (TokenStream ts = analyzer.tokenStream(f.fieldName, f.queryString)) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

      int corpusNumDocs = reader.numDocs();
      HashSet<String> processedTerms = new HashSet<>();
      ts.reset();
      while (ts.incrementToken()) {
        String term = termAtt.toString();
        if (!processedTerms.contains(term)) {
          processedTerms.add(term);
          ScoreTermQueue variantsQ =
              new ScoreTermQueue(
                  MAX_VARIANTS_PER_TERM); // maxNum variants considered for any one term
          float minScore = 0;
          Term startTerm = new Term(f.fieldName, term);
          AttributeSource atts = new AttributeSource();
          MaxNonCompetitiveBoostAttribute maxBoostAtt =
              atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
          SlowFuzzyTermsEnum fe =
              new SlowFuzzyTermsEnum(terms, atts, startTerm, f.minSimilarity, f.prefixLength);
          // store the df so all variants use same idf
          int df = reader.docFreq(startTerm);
          int numVariants = 0;
          int totalVariantDocFreqs = 0;
          BytesRef possibleMatch;
          BoostAttribute boostAtt = fe.attributes().addAttribute(BoostAttribute.class);
          while ((possibleMatch = fe.next()) != null) {
            numVariants++;
            totalVariantDocFreqs += fe.docFreq();
            float score = boostAtt.getBoost();
            if (variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore) {
              ScoreTerm st =
                  new ScoreTerm(
                      new Term(startTerm.field(), BytesRef.deepCopyOf(possibleMatch)),
                      score,
                      startTerm);
              variantsQ.insertWithOverflow(st);
              minScore = variantsQ.top().score; // maintain minScore
            }
            maxBoostAtt.setMaxNonCompetitiveBoost(
                variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY);
          }

          if (numVariants > 0) {
            int avgDf = totalVariantDocFreqs / numVariants;
            if (df == 0) { // no direct match we can use as df for all variants
              df = avgDf; // use avg df of all variants
            }

            // take the top variants (scored by edit distance) and reset the score
            // to include an IDF factor then add to the global queue for ranking
            // overall top query terms
            int size = variantsQ.size();
            for (int i = 0; i < size; i++) {
              ScoreTerm st = variantsQ.pop();
              st.score = (st.score * st.score) * sim.idf(df, corpusNumDocs);
              q.insertWithOverflow(st);
            }
          }
        }
      }
      ts.end();
    }
  }