Ejemplo n.º 1
0
  /**
   * Computes a single cell of the edit-distance matrix and stores it in {@code hMatrix}.
   *
   * <p>When the characters at {@code wordIndex} and {@code candIndex} compare equal (via {@code
   * areEqual}) the value of cell {@code (i, j)} is carried over unchanged. Otherwise the result is
   * one plus the minimum of three previously stored cells: the deletion cell {@code (i + 1, j)},
   * the insertion cell {@code (i, j + 1)}, and either the transposition cell {@code (i - 1, j - 1)}
   * (when the last two characters of both words are swapped) or the replacement cell {@code (i,
   * j)}. The result is written to cell {@code (i + 1, j + 1)} before being returned.
   *
   * @param i length of first word (here: misspelled) - 1
   * @param j length of second word (here: candidate) - 1
   * @param wordIndex index into {@code wordProcessed} of the character being compared
   * @param candIndex index into {@code candidate} of the character being compared
   * @return the value stored at {@code (i + 1, j + 1)}, i.e. the edit distance between the two
   *     words. Remarks: See Oflazer.
   */
  public int ed(final int i, final int j, final int wordIndex, final int candIndex) {
    final int distance;

    if (areEqual(wordProcessed[wordIndex], candidate[candIndex])) {
      // Last characters are the same: no additional cost.
      distance = hMatrix.get(i, j);
    } else {
      // NOTE(review): the swap test below compares with == while the test above uses areEqual;
      // confirm whether that asymmetry is intended.
      final boolean lastTwoSwapped =
          wordIndex > 0
              && candIndex > 0
              && wordProcessed[wordIndex] == candidate[candIndex - 1]
              && wordProcessed[wordIndex - 1] == candidate[candIndex];

      // Transposition (e.g. ababab, ababba) looks two cells back on the diagonal;
      // replacement (e.g. ababa, ababb) looks one cell back.
      final int diagonalCost = lastTwoSwapped ? hMatrix.get(i - 1, j - 1) : hMatrix.get(i, j);
      final int deletionCost = hMatrix.get(i + 1, j); // deletion, e.g. abab, aba
      final int insertionCost = hMatrix.get(i, j + 1); // insertion, e.g. aba, abab

      distance = 1 + min(diagonalCost, deletionCost, insertionCost);
    }

    hMatrix.set(i + 1, j + 1, distance);
    return distance;
  }
Ejemplo n.º 2
0
  /**
   * Recursively explores the automaton from {@code node}, building {@code candidate} one decoded
   * character at a time and registering (via {@code addCandidate}) every candidate whose distance
   * stays within {@code effectEditDistance}.
   *
   * <p>For each outgoing arc the label byte is appended to {@code prevBytes} and decoded. If the
   * decoder reports the bytes as malformed (taken here to mean an incomplete multi-byte sequence),
   * the search recurses with the accumulated bytes. Otherwise the decoded character is stored in
   * {@code candidate[candIndex]} and three alternatives are tried in turn: an "any to two"
   * replacement, an "any to one" replacement, and the general cut-off edit distance step.
   *
   * @param depth index into {@code hMatrix} used for this step; only the general branch passes
   *     {@code depth + 1} to the recursive call
   * @param node automaton node whose outgoing arcs are enumerated
   * @param prevBytes bytes collected from preceding arcs that have not yet decoded to a character
   * @param wordIndex current position in the input word (presumably an index into the processed
   *     word -- confirm against {@code matchAnyToOne}/{@code matchAnyToTwo})
   * @param candIndex position in {@code candidate} that receives the decoded character
   */
  private void findRepl(
      final int depth,
      final int node,
      final byte[] prevBytes,
      final int wordIndex,
      final int candIndex) {
    int dist = 0;
    for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) {
      // Rebuild the byte sequence for this arc: the pending bytes followed by the arc's label.
      byteBuffer = BufferUtils.clearAndEnsureCapacity(byteBuffer, prevBytes.length + 1);
      byteBuffer.put(prevBytes);
      byteBuffer.put(fsa.getArcLabel(arc));
      // Remember how many bytes were written; needed below to copy them out after decoding.
      final int bufPos = byteBuffer.position();
      byteBuffer.flip();
      decoder.reset();
      // FIXME: this isn't correct -- no checks for overflows, no decoder flush. I don't think this
      // should be in here either: the decoder should run once on the accumulated temporary byte
      // buffer (current path), and only when there's a potential that this buffer can become a
      // replacement candidate (isEndOfCandidate). Because we assume candidates are valid input
      // strings (this is verified when building the dictionary), it would save a lot of
      // conversions.
      final CoderResult c = decoder.decode(byteBuffer, charBuffer, true);
      if (c.isMalformed()) { // assume that only valid encodings are there, so a malformed result
        // is treated as an incomplete multi-byte sequence: carry the bytes over to the next arc
        final byte[] prev = new byte[bufPos];
        byteBuffer.position(0);
        byteBuffer.get(prev);
        if (!fsa.isArcTerminal(arc)) {
          findRepl(
              depth,
              fsa.getEndNode(arc),
              prev,
              wordIndex,
              candIndex); // note: depth is not incremented
        }
        byteBuffer.clear();
      } else if (!c.isError()) { // unmappable characters are silently discarded
        // Only the first decoded char is consumed and placed at the current candidate position.
        charBuffer.flip();
        candidate[candIndex] = charBuffer.get();
        charBuffer.clear();
        byteBuffer.clear();

        int lengthReplacement;
        // replacement "any to two"
        if ((lengthReplacement = matchAnyToTwo(wordIndex, candIndex)) > 0) {
          // the replacement takes place at the end of the candidate
          if (isEndOfCandidate(arc, wordIndex)
              && (dist = hMatrix.get(depth - 1, depth - 1)) <= effectEditDistance) {
            if (Math.abs(wordLen - 1 - (wordIndex + lengthReplacement - 2)) > 0) {
              // there are extra letters in the word after the replacement
              dist = dist + Math.abs(wordLen - 1 - (wordIndex + lengthReplacement - 2));
            }
            if (dist <= effectEditDistance) {
              addCandidate(candIndex, dist);
            }
          }
          if (isArcNotTerminal(arc, candIndex)) {
            // Temporarily overwrite the diagonal cell with the previous diagonal value for the
            // recursive call, then restore the saved value afterwards.
            int x = hMatrix.get(depth, depth);
            hMatrix.set(depth, depth, hMatrix.get(depth - 1, depth - 1));
            // depth is passed on without incrementing (clamped to be non-negative)
            findRepl(
                Math.max(0, depth),
                fsa.getEndNode(arc),
                new byte[0],
                wordIndex + lengthReplacement - 1,
                candIndex + 1);
            hMatrix.set(depth, depth, x);
          }
        }
        // replacement "any to one"
        if ((lengthReplacement = matchAnyToOne(wordIndex, candIndex)) > 0) {
          // the replacement takes place at the end of the candidate
          if (isEndOfCandidate(arc, wordIndex)
              && (dist = hMatrix.get(depth, depth)) <= effectEditDistance) {
            if (Math.abs(wordLen - 1 - (wordIndex + lengthReplacement - 1)) > 0) {
              // there are extra letters in the word after the replacement
              dist = dist + Math.abs(wordLen - 1 - (wordIndex + lengthReplacement - 1));
            }
            if (dist <= effectEditDistance) {
              addCandidate(candIndex, dist);
            }
          }
          if (isArcNotTerminal(arc, candIndex)) {
            // Continue past the replaced span of the word; depth is passed on unchanged.
            findRepl(
                depth,
                fsa.getEndNode(arc),
                new byte[0],
                wordIndex + lengthReplacement,
                candIndex + 1);
          }
        }
        // general: proceed only while the cut-off edit distance is within the allowed limit
        if (cuted(depth, wordIndex, candIndex) <= effectEditDistance) {
          if ((isEndOfCandidate(arc, wordIndex))
              && (dist = ed(wordLen - 1 - (wordIndex - depth), depth, wordLen - 1, candIndex))
                  <= effectEditDistance) {
            addCandidate(candIndex, dist);
          }
          if (isArcNotTerminal(arc, candIndex)) {
            // Advance by one in depth, word position and candidate position.
            findRepl(depth + 1, fsa.getEndNode(arc), new byte[0], wordIndex + 1, candIndex + 1);
          }
        }
      }
    }
  }