/**
 * This {@code accept} method in FuzzyTermEnum uses Levenshtein distance to calculate the
 * distance between the given term and the comparing term.
 *
 * <p>If the minSimilarity is >= 1.0, this uses the maxEdits as the comparison. Otherwise,
 * this method uses the following logic to calculate similarity.
 *
 * <pre>
 * similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
 * </pre>
 *
 * where distance is the Levenshtein distance for the two words.
 */
@Override
protected final AcceptStatus accept(BytesRef term) {
  if (StringHelper.startsWith(term, prefixBytesRef)) {
    utf32.copyUTF8Bytes(term);
    final int distance =
        calcDistance(utf32.ints(), realPrefixLength, utf32.length() - realPrefixLength);

    // Integer.MIN_VALUE is the sentinel that Levenshtein stopped early
    if (distance == Integer.MIN_VALUE) {
      return AcceptStatus.NO;
    }
    // no need to calc similarity, if raw is true and distance > maxEdits
    if (raw && distance > maxEdits) {
      return AcceptStatus.NO;
    }
    final float similarity =
        calcSimilarity(distance, (utf32.length() - realPrefixLength), text.length);

    // if raw is true, then distance must also be <= maxEdits by now,
    // given the previous if statement
    if (raw || similarity > minSimilarity) {
      boostAtt.setBoost((similarity - minSimilarity) * scale_factor);
      return AcceptStatus.YES;
    } else {
      return AcceptStatus.NO;
    }
  } else {
    return AcceptStatus.END;
  }
}
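// A minimal standalone sketch of the similarity formula documented in the Javadoc above.
// The helper name similaritySketch and the sample pair ("kitten" vs. "sitting", Levenshtein
// distance 3) are illustrative assumptions, not part of the original source.
static float similaritySketch() {
  int distance = 3;     // Levenshtein("kitten", "sitting")
  int prefixLength = 0; // assume no common prefix was stripped
  int textLen = 6;      // "kitten".length()
  int targetLen = 7;    // "sitting".length()
  // similarity = 1 - 3 / (0 + min(6, 7)) = 1 - 0.5 = 0.5
  return 1 - ((float) distance / (float) (prefixLength + Math.min(textLen, targetLen)));
}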
private UserDictionary(List<String[]> featureEntries) throws IOException {
  int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
  // TODO: should we allow multiple segmentations per input 'phrase'?
  // the old treemap didn't support this either, and i'm not sure if it's needed/useful?

  featureEntries.sort((left, right) -> left[0].compareTo(right[0]));

  List<String> data = new ArrayList<>(featureEntries.size());
  List<int[]> segmentations = new ArrayList<>(featureEntries.size());

  PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
  Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput);
  IntsRefBuilder scratch = new IntsRefBuilder();
  long ord = 0;

  for (String[] values : featureEntries) {
    String[] segmentation = values[1].replaceAll(" +", " ").split(" ");
    String[] readings = values[2].replaceAll(" +", " ").split(" ");
    String pos = values[3];

    if (segmentation.length != readings.length) {
      throw new RuntimeException(
          "Illegal user dictionary entry "
              + values[0]
              + " - the number of segmentations ("
              + segmentation.length
              + ")"
              + " does not match the number of readings ("
              + readings.length
              + ")");
    }

    int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length...
    wordIdAndLength[0] = wordId;
    for (int i = 0; i < segmentation.length; i++) {
      wordIdAndLength[i + 1] = segmentation[i].length();
      data.add(readings[i] + INTERNAL_SEPARATOR + pos);
      wordId++;
    }

    // add mapping to FST
    String token = values[0];
    scratch.grow(token.length());
    scratch.setLength(token.length());
    for (int i = 0; i < token.length(); i++) {
      scratch.setIntAt(i, (int) token.charAt(i));
    }
    fstBuilder.add(scratch.get(), ord);
    segmentations.add(wordIdAndLength);
    ord++;
  }

  this.fst = new TokenInfoFST(fstBuilder.finish(), false);
  this.data = data.toArray(new String[data.size()]);
  this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
}
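// A minimal sketch of how the constructor above packs one entry into wordIdAndLength.
// The method name packSketch and the sample entry are illustrative assumptions: for
// values = {"関西国際空港", "関西 国際 空港", "カンサイ コクサイ クウコウ", "カスタム名詞"}
// and a starting wordId w, this yields {w, 2, 2, 2} (the wordId offset, then the character
// length of each segment), while the constructor also appends one "reading<sep>pos" row
// to data per segment and maps the surface form "関西国際空港" to its ordinal in the FST.
static int[] packSketch(String[] values, int wordId) {
  String[] segmentation = values[1].split(" +"); // split the segmentation field on spaces
  int[] wordIdAndLength = new int[segmentation.length + 1];
  wordIdAndLength[0] = wordId;
  for (int i = 0; i < segmentation.length; i++) {
    wordIdAndLength[i + 1] = segmentation[i].length();
  }
  return wordIdAndLength;
}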