    /**
     * The termCompare method in FuzzyTermEnum uses the Levenshtein distance between the given
     * term and the candidate term.
     *
     * <p>If minSimilarity is &gt;= 1.0, the raw edit distance is compared directly against
     * maxEdits. Otherwise, this method uses the following formula to calculate similarity:
     *
     * <pre>
     *   similarity = 1 - ((float) distance / (float) (prefixLength + Math.min(textlen, targetlen)));
     * </pre>
     *
     * where distance is the Levenshtein distance between the two words.
     */
    @Override
    protected final AcceptStatus accept(BytesRef term) {
      if (StringHelper.startsWith(term, prefixBytesRef)) {
        utf32.copyUTF8Bytes(term);
        final int distance =
            calcDistance(utf32.ints(), realPrefixLength, utf32.length() - realPrefixLength);

        // Integer.MIN_VALUE is the sentinel that Levenshtein stopped early
        if (distance == Integer.MIN_VALUE) {
          return AcceptStatus.NO;
        }
        // no need to calc similarity if raw is true and distance > maxEdits
        if (raw && distance > maxEdits) {
          return AcceptStatus.NO;
        }
        final float similarity =
            calcSimilarity(distance, (utf32.length() - realPrefixLength), text.length);

        // if raw is true, distance must already be <= maxEdits
        // given the previous if statement
        if (raw || similarity > minSimilarity) {
          boostAtt.setBoost((similarity - minSimilarity) * scale_factor);
          return AcceptStatus.YES;
        } else {
          return AcceptStatus.NO;
        }
      } else {
        return AcceptStatus.END;
      }
    }
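
For intuition, here is a minimal standalone sketch of the similarity formula from the Javadoc above; the helper name and parameters are illustrative only, not part of the Lucene API:

    // Minimal sketch of the Javadoc formula, assuming distance is a plain
    // Levenshtein distance computed over the terms after their shared prefix.
    static float similarity(int distance, int prefixLength, int textLen, int targetLen) {
      return 1.0f - ((float) distance / (float) (prefixLength + Math.min(textLen, targetLen)));
    }

    // e.g. distance = 1, prefixLength = 0, textLen = 5, targetLen = 6:
    // similarity = 1 - 1/5 = 0.8, so the term is accepted when minSimilarity < 0.8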
Example #2
  private UserDictionary(List<String[]> featureEntries) throws IOException {

    int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
    // TODO: should we allow multiple segmentations per input 'phrase'?
    // the old treemap didn't support this either, and i'm not sure if it's needed/useful?

    // sort by surface form so the FST builder receives its inputs in sorted order
    featureEntries.sort((left, right) -> left[0].compareTo(right[0]));

    List<String> data = new ArrayList<>(featureEntries.size());
    List<int[]> segmentations = new ArrayList<>(featureEntries.size());

    PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
    Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput);
    IntsRefBuilder scratch = new IntsRefBuilder();
    long ord = 0;

    for (String[] values : featureEntries) {
      // values: [0] surface form, [1] segmentation, [2] readings, [3] part-of-speech
      String[] segmentation = values[1].replaceAll("  *", " ").split(" ");
      String[] readings = values[2].replaceAll("  *", " ").split(" ");
      String pos = values[3];

      if (segmentation.length != readings.length) {
        throw new RuntimeException(
            "Illegal user dictionary entry "
                + values[0]
                + " - the number of segmentations ("
                + segmentation.length
                + ")"
                + " does not the match number of readings ("
                + readings.length
                + ")");
      }

      // layout: [0] = first wordId for this entry, then each segment's length
      int[] wordIdAndLength = new int[segmentation.length + 1];
      wordIdAndLength[0] = wordId;
      for (int i = 0; i < segmentation.length; i++) {
        wordIdAndLength[i + 1] = segmentation[i].length();
        data.add(readings[i] + INTERNAL_SEPARATOR + pos);
        wordId++;
      }
      // add mapping to FST
      String token = values[0];
      scratch.grow(token.length());
      scratch.setLength(token.length());
      // copy the surface form's UTF-16 code units as the FST input labels
      for (int i = 0; i < token.length(); i++) {
        scratch.setIntAt(i, (int) token.charAt(i));
      }
      fstBuilder.add(scratch.get(), ord);
      segmentations.add(wordIdAndLength);
      ord++;
    }
    this.fst = new TokenInfoFST(fstBuilder.finish(), false);
    this.data = data.toArray(new String[data.size()]);
    this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
  }
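
A hedged usage sketch: the constructor above is private, so this assumes the UserDictionary.open(Reader) factory found in Lucene's Kuromoji module, which parses CSV rows of surface form, space-separated segmentation, readings, and part-of-speech into the featureEntries list; the dictionary entry below is illustrative:

    // Assumed entry point: UserDictionary.open(Reader); the CSV row is an
    // example entry in the format (surface,segmentation,readings,part-of-speech).
    Reader reader =
        new StringReader("関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞\n");
    UserDictionary dictionary = UserDictionary.open(reader);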