コード例 #1
0
  /**
   * Recursively walks the arcs leaving {@code node}, decoding the bytes on the current path into
   * one character at a time and storing it in {@code candidate[candIndex]}. For every decoded
   * character three strategies are tried in turn ("any to two" replacement, "any to one"
   * replacement, and the general edit-distance step); each may register the current candidate via
   * {@code addCandidate} and/or recurse into the arc's end node.
   *
   * <p>The method works on shared instance state ({@code byteBuffer}, {@code charBuffer}, {@code
   * candidate}, {@code hMatrix}), so the order of statements matters.
   *
   * @param depth Index used to address {@code hMatrix} and passed to {@code cuted}/{@code ed}; it
   *     is incremented only by the "general" branch.
   * @param node The automaton node whose outgoing arcs are enumerated.
   * @param prevBytes Bytes collected from preceding arcs that have not yet decoded to a character
   *     (empty when the previous arc completed a character).
   * @param wordIndex Current position in the word being corrected, as passed to the match helpers.
   * @param candIndex Index in {@code candidate} at which the next decoded character is stored.
   */
  private void findRepl(
      final int depth,
      final int node,
      final byte[] prevBytes,
      final int wordIndex,
      final int candIndex) {
    // Scratch variable: assigned inside the conditions below, then handed to addCandidate.
    int dist = 0;
    for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) {
      // Rebuild the byte sequence for this arc: pending bytes followed by the arc's label.
      byteBuffer = BufferUtils.clearAndEnsureCapacity(byteBuffer, prevBytes.length + 1);
      byteBuffer.put(prevBytes);
      byteBuffer.put(fsa.getArcLabel(arc));
      // Remember how many bytes were written; needed if they must be carried over to the next arc.
      final int bufPos = byteBuffer.position();
      byteBuffer.flip();
      decoder.reset();
      // FIXME: this isn't correct -- there are no checks for overflows and no decoder flush. This
      // probably should not be done here at all: the decoder should run once on the accumulated
      // temporary byte buffer (the current path), and only when there is a chance that this buffer
      // can become a replacement candidate (isEndOfCandidate). Because we assume candidates are
      // valid input strings (this is verified when building the dictionary), that would save a lot
      // of conversions.
      final CoderResult c = decoder.decode(byteBuffer, charBuffer, true);
      if (c.isMalformed()) {
        // Assume that only valid encodings are present in the dictionary, so a malformed result is
        // presumably an incomplete multi-byte sequence -- TODO confirm. Carry the bytes read so far
        // over to the next arc instead of rejecting the path.
        final byte[] prev = new byte[bufPos];
        byteBuffer.position(0);
        byteBuffer.get(prev);
        if (!fsa.isArcTerminal(arc)) {
          findRepl(
              depth,
              fsa.getEndNode(arc),
              prev,
              wordIndex,
              candIndex); // note: depth is not incremented
        }
        byteBuffer.clear();
      } else if (!c.isError()) { // unmappable characters are silently discarded
        // A character was decoded: place it in the candidate and reset both buffers before any
        // recursive call reuses them.
        charBuffer.flip();
        candidate[candIndex] = charBuffer.get();
        charBuffer.clear();
        byteBuffer.clear();

        int lengthReplacement;
        // replacement "any to two"
        if ((lengthReplacement = matchAnyToTwo(wordIndex, candIndex)) > 0) {
          // the replacement takes place at the end of the candidate
          // NOTE(review): this reads hMatrix at (depth - 1, depth - 1), which is (-1, -1) when
          // depth is 0; presumably hMatrix tolerates that index -- confirm.
          if (isEndOfCandidate(arc, wordIndex)
              && (dist = hMatrix.get(depth - 1, depth - 1)) <= effectEditDistance) {
            if (Math.abs(wordLen - 1 - (wordIndex + lengthReplacement - 2)) > 0) {
              // there are extra letters in the word after the replacement
              dist = dist + Math.abs(wordLen - 1 - (wordIndex + lengthReplacement - 2));
            }
            if (dist <= effectEditDistance) {
              addCandidate(candIndex, dist);
            }
          }
          if (isArcNotTerminal(arc, candIndex)) {
            // Temporarily overwrite the diagonal cell with the previous diagonal value for the
            // duration of the recursive call, then restore the saved value.
            int x = hMatrix.get(depth, depth);
            hMatrix.set(depth, depth, hMatrix.get(depth - 1, depth - 1));
            findRepl(
                Math.max(0, depth),
                fsa.getEndNode(arc),
                new byte[0],
                wordIndex + lengthReplacement - 1,
                candIndex + 1);
            hMatrix.set(depth, depth, x);
          }
        }
        // replacement "any to one"
        if ((lengthReplacement = matchAnyToOne(wordIndex, candIndex)) > 0) {
          // the replacement takes place at the end of the candidate
          if (isEndOfCandidate(arc, wordIndex)
              && (dist = hMatrix.get(depth, depth)) <= effectEditDistance) {
            if (Math.abs(wordLen - 1 - (wordIndex + lengthReplacement - 1)) > 0) {
              // there are extra letters in the word after the replacement
              dist = dist + Math.abs(wordLen - 1 - (wordIndex + lengthReplacement - 1));
            }
            if (dist <= effectEditDistance) {
              addCandidate(candIndex, dist);
            }
          }
          if (isArcNotTerminal(arc, candIndex)) {
            // Continue on the same depth, skipping the replaced part of the word.
            findRepl(
                depth,
                fsa.getEndNode(arc),
                new byte[0],
                wordIndex + lengthReplacement,
                candIndex + 1);
          }
        }
        // general: proceed only while the cut-off distance stays within the allowed limit
        if (cuted(depth, wordIndex, candIndex) <= effectEditDistance) {
          if ((isEndOfCandidate(arc, wordIndex))
              && (dist = ed(wordLen - 1 - (wordIndex - depth), depth, wordLen - 1, candIndex))
                  <= effectEditDistance) {
            addCandidate(candIndex, dist);
          }
          if (isArcNotTerminal(arc, candIndex)) {
            // The only branch that advances depth, word position and candidate position together.
            findRepl(depth + 1, fsa.getEndNode(arc), new byte[0], wordIndex + 1, candIndex + 1);
          }
        }
      }
    }
  }
コード例 #2
0
  /**
   * Produces spelling suggestions for a word using K. Oflazer's algorithm (see Jan Daciuk's s_fsa
   * package, spell.cc, for background).
   *
   * <p>The word first goes through the dictionary's input conversion pairs. If the converted word
   * is non-empty, shorter than {@code MAX_WORD_LENGTH} and not already in the dictionary, variants
   * obtained from the replacement pairs are looked up directly (optionally in lower-case,
   * upper-case and capitalized form). Only when none of those is found does the edit-distance
   * search over the automaton run. The results are sorted, passed through the output conversion
   * pairs and de-duplicated while keeping their order.
   *
   * @param w The original misspelled word.
   * @return A list of suggested replacements; empty if there are none.
   */
  public List<String> findReplacements(final String w) {
    final String word =
        dictionaryMetadata.getInputConversionPairs().isEmpty()
            ? w
            : DictionaryLookup.applyReplacements(w, dictionaryMetadata.getInputConversionPairs());

    candidates.clear();

    final boolean needsSuggestions =
        word.length() > 0 && word.length() < MAX_WORD_LENGTH && !isInDictionary(word);
    if (needsSuggestions) {
      // Words that still require the (expensive) automaton search.
      final List<String> pending = new ArrayList<String>();

      if (replacementsTheRest == null || word.length() <= MIN_WORD_LENGTH) {
        pending.add(word);
      } else {
        for (final String variant : getAllReplacements(word, 0, 0)) {
          boolean matched = isInDictionary(variant);
          if (matched) {
            candidates.add(new CandidateData(variant, 0));
          } else if (dictionaryMetadata.isConvertingCase()) {
            final String lower = variant.toLowerCase(dictionaryMetadata.getLocale());
            final String upper = variant.toUpperCase(dictionaryMetadata.getLocale());

            // Candidates are added in the form found in the dictionary, never as mixed-case input.
            if (isInDictionary(lower)) {
              candidates.add(new CandidateData(lower, 0));
              matched = true;
            }
            if (isInDictionary(upper)) {
              candidates.add(new CandidateData(upper, 0));
              matched = true;
            }
            if (lower.length() > 1) {
              final String capitalized =
                  Character.toUpperCase(lower.charAt(0)) + lower.substring(1);
              if (isInDictionary(capitalized)) {
                candidates.add(new CandidateData(capitalized, 0));
                matched = true;
              }
            }
          }

          if (!matched) {
            pending.add(variant);
          }
        }
      }

      // A hit obtained through the replacement pairs (i.e. typical errors) is considered good
      // enough; the full search runs only when there was none.
      if (candidates.isEmpty()) {
        for (int idx = 0; idx < pending.size(); idx++) {
          // Counter value for this word: 2 for the first one, 3 for the second, and so on.
          final int rank = idx + 2;
          if (rank > UPPER_SEARCH_LIMIT) {
            // for performance reasons, do not search too deeply
            break;
          }

          wordProcessed = pending.get(idx).toCharArray();
          wordLen = wordProcessed.length;
          if (rank > 2 && wordLen < MIN_WORD_LENGTH) {
            // three-letter replacements make little sense anyway
            break;
          }

          candidate = new char[MAX_WORD_LENGTH];
          candLen = candidate.length;
          if (wordLen <= editDistance) {
            effectEditDistance = wordLen - 1;
          } else {
            effectEditDistance = editDistance;
          }
          charBuffer = BufferUtils.clearAndEnsureCapacity(charBuffer, MAX_WORD_LENGTH);
          byteBuffer = BufferUtils.clearAndEnsureCapacity(byteBuffer, MAX_WORD_LENGTH);
          findRepl(0, fsa.getRootNode(), new byte[0], 0, 0);
        }
      }
    }

    Collections.sort(candidates);

    // An insertion-ordered set drops duplicates without disturbing the sorted order.
    final Set<String> unique = new LinkedHashSet<String>();
    for (final CandidateData data : candidates) {
      final String converted =
          DictionaryLookup.applyReplacements(
                  data.getWord(), dictionaryMetadata.getOutputConversionPairs())
              .toString();
      unique.add(converted);
    }
    return new ArrayList<String>(unique);
  }