Example #1
0
  /**
   * Get all the terms in the index with low edit distance from the supplied terms.
   *
   * <p>When {@code maxEdits} is 0 no fuzzy query is run: each search term is looked up directly
   * and kept only if its document frequency in the field is greater than zero. Otherwise one
   * {@link FuzzyQuery} per search term is combined (as optional clauses) into a single
   * {@link BooleanQuery}, that query is rewritten against the index, and the terms extracted
   * from the rewritten query are returned.
   *
   * @param reader the index
   * @param luceneName the field to search in
   * @param searchTerms search terms
   * @param maxEdits maximum edit distance (Levenshtein algorithm) for matches (i.e. lower is more
   *     similar); 0 means exact matches only
   * @return the set of terms in the index that are close to our search terms
   * @throws BooleanQuery.TooManyClauses if the expansion resulted in too many terms
   * @throws RuntimeException wrapping any {@link IOException} raised while reading the index
   */
  public static Set<String> getMatchingTermsFromIndex(
      IndexReader reader, String luceneName, Collection<String> searchTerms, int maxEdits) {
    if (maxEdits == 0) {
      // Exact match; don't use fuzzy query (slow)
      Set<String> exactMatches = new HashSet<>();
      try {
        for (String term : searchTerms) {
          if (reader.docFreq(new Term(luceneName, term)) > 0) {
            exactMatches.add(term);
          }
        }
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
      return exactMatches;
    }

    // One fuzzy clause per search term; SHOULD makes the combined query a union of them.
    BooleanQuery q = new BooleanQuery();
    for (String s : searchTerms) {
      q.add(new FuzzyQuery(new Term(luceneName, s), maxEdits), Occur.SHOULD);
    }

    try {
      // Rewriting expands each fuzzy clause into the concrete index terms it matches,
      // so the terms of the rewritten query are exactly the expansion we want.
      Query rewritten = q.rewrite(reader);
      Set<String> terms = new HashSet<>();
      for (WeightedTerm wt : QueryTermExtractor.getTerms(rewritten)) {
        terms.add(wt.getTerm());
      }
      return terms;
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }