Code example #1
    /**
     * This {@code accept} method (historically {@code termCompare} in FuzzyTermEnum) uses
     * Levenshtein distance to calculate the distance between the given term and the candidate
     * term.
     *
     * <p>If the minSimilarity is &gt;= 1.0, this uses the maxEdits as the comparison. Otherwise,
     * this method uses the following logic to calculate similarity.
     *
     * <pre>
     *   similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
     * </pre>
     *
     * where distance is the Levenshtein distance for the two words.
     */
    @Override
    protected final AcceptStatus accept(BytesRef term) {
      if (StringHelper.startsWith(term, prefixBytesRef)) {
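        // convert the term's UTF-8 bytes to UTF-32 code points before measuring distance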
        utf32.copyUTF8Bytes(term);
        final int distance =
            calcDistance(utf32.ints(), realPrefixLength, utf32.length() - realPrefixLength);

        // Integer.MIN_VALUE is the sentinel that Levenshtein stopped early
        if (distance == Integer.MIN_VALUE) {
          return AcceptStatus.NO;
        }
        // no need to compute similarity if raw is true and distance > maxEdits
        if (raw && distance > maxEdits) {
          return AcceptStatus.NO;
        }
        final float similarity =
            calcSimilarity(distance, (utf32.length() - realPrefixLength), text.length);

        // if raw is true, the previous check already guarantees distance <= maxEdits,
        // so raw alone is sufficient here
        if (raw || similarity > minSimilarity) {
          boostAtt.setBoost((similarity - minSimilarity) * scale_factor);
          return AcceptStatus.YES;
        } else {
          return AcceptStatus.NO;
        }
      } else {
        return AcceptStatus.END;
      }
    }
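
To make the similarity formula in the Javadoc concrete, the following standalone sketch (illustration only, not Lucene code) applies the same calculation to a hardcoded word pair; the levenshtein helper is a plain dynamic-programming implementation added just for this demo.

    // Standalone illustration of the similarity formula above (not Lucene code).
    public class SimilarityDemo {
      // Plain dynamic-programming Levenshtein distance, written for this demo only.
      static int levenshtein(String a, String b) {
        int[][] d = new int[a.length() + 1][b.length() + 1];
        for (int i = 0; i <= a.length(); i++) d[i][0] = i;
        for (int j = 0; j <= b.length(); j++) d[0][j] = j;
        for (int i = 1; i <= a.length(); i++) {
          for (int j = 1; j <= b.length(); j++) {
            int cost = a.charAt(i - 1) == b.charAt(j - 1) ? 0 : 1;
            d[i][j] =
                Math.min(Math.min(d[i - 1][j] + 1, d[i][j - 1] + 1), d[i - 1][j - 1] + cost);
          }
        }
        return d[a.length()][b.length()];
      }

      public static void main(String[] args) {
        String text = "lucene", target = "lucane"; // one substitution apart
        int prefixLength = 0;
        int distance = levenshtein(text, target); // 1
        float similarity =
            1 - ((float) distance / (float) (prefixLength + Math.min(text.length(), target.length())));
        System.out.println(similarity); // prints 0.8333334
      }
    }

With a distance of 1 and a shared length of 6, the score is 1 - 1/6 ≈ 0.83, which would clear a typical minSimilarity such as 0.5.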
Code example #2
  private void addTerms(IndexReader reader, FieldVals f, ScoreTermQueue q) throws IOException {
    if (f.queryString == null) return;
    final Terms terms = MultiFields.getTerms(reader, f.fieldName);
    if (terms == null) {
      return;
    }
    try (TokenStream ts = analyzer.tokenStream(f.fieldName, f.queryString)) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

      int corpusNumDocs = reader.numDocs();
      HashSet<String> processedTerms = new HashSet<>();
      ts.reset();
      while (ts.incrementToken()) {
        String term = termAtt.toString();
        if (processedTerms.add(term)) { // Set.add returns false for already-seen terms
          // cap the number of fuzzy variants considered for any one source term
          ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM);
          float minScore = 0;
          Term startTerm = new Term(f.fieldName, term);
          AttributeSource atts = new AttributeSource();
          MaxNonCompetitiveBoostAttribute maxBoostAtt =
              atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
          SlowFuzzyTermsEnum fe =
              new SlowFuzzyTermsEnum(terms, atts, startTerm, f.minSimilarity, f.prefixLength);
          // store the df so all variants use same idf
          int df = reader.docFreq(startTerm);
          int numVariants = 0;
          int totalVariantDocFreqs = 0;
          BytesRef possibleMatch;
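          // the enum reports each accepted variant's similarity score via this attribute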
          BoostAttribute boostAtt = fe.attributes().addAttribute(BoostAttribute.class);
          while ((possibleMatch = fe.next()) != null) {
            numVariants++;
            totalVariantDocFreqs += fe.docFreq();
            float score = boostAtt.getBoost();
            if (variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore) {
              ScoreTerm st =
                  new ScoreTerm(
                      new Term(startTerm.field(), BytesRef.deepCopyOf(possibleMatch)),
                      score,
                      startTerm);
              variantsQ.insertWithOverflow(st);
              minScore = variantsQ.top().score; // maintain minScore
            }
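            // advertise the lowest boost that can still compete, letting the enum
            // skip non-competitive terms once the variant queue is full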
            maxBoostAtt.setMaxNonCompetitiveBoost(
                variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY);
          }

          if (numVariants > 0) {
            int avgDf = totalVariantDocFreqs / numVariants;
            if (df == 0) {
              // no direct match we can use as df for all variants: use their average df
              df = avgDf;
            }

            // take the top variants (scored by edit distance) and reset the score
            // to include an IDF factor then add to the global queue for ranking
            // overall top query terms
            int size = variantsQ.size();
            for (int i = 0; i < size; i++) {
              ScoreTerm st = variantsQ.pop();
              st.score = (st.score * st.score) * sim.idf(df, corpusNumDocs);
              q.insertWithOverflow(st);
            }
          }
        }
      }
      ts.end();
    }
  }
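
The addTerms method above is private and runs as part of FuzzyLikeThisQuery's rewrite; callers only touch the public API. Below is a minimal usage sketch, assuming a Lucene 4.x-era setup where FuzzyLikeThisQuery lives in the sandbox module; the reader and analyzer are supplied by the caller, and "lucene"/"body" are placeholder term and field values.

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.sandbox.queries.FuzzyLikeThisQuery;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.TopDocs;

    public class FuzzyLikeThisDemo {
      static TopDocs search(IndexReader reader, Analyzer analyzer) throws Exception {
        // keep at most 32 fuzzy terms in the final query
        FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(32, analyzer);
        // queue fuzzy variants of "lucene" in field "body":
        // minSimilarity = 0.5, prefixLength = 2
        flt.addTerms("lucene", "body", 0.5f, 2);
        // rewrite triggers the variant expansion performed by addTerms above
        Query rewritten = flt.rewrite(reader);
        return new IndexSearcher(reader).search(rewritten, 10);
      }
    }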