/** * Calculates edit distance. * * @param i length of first word (here: misspelled) - 1; * @param j length of second word (here: candidate) - 1. * @param wordIndex (TODO: javadoc?) * @param candIndex (TODO: javadoc?) * @return Edit distance between the two words. Remarks: See Oflazer. */ public int ed(final int i, final int j, final int wordIndex, final int candIndex) { int result; int a, b, c; if (areEqual(wordProcessed[wordIndex], candidate[candIndex])) { // last characters are the same result = hMatrix.get(i, j); } else if (wordIndex > 0 && candIndex > 0 && wordProcessed[wordIndex] == candidate[candIndex - 1] && wordProcessed[wordIndex - 1] == candidate[candIndex]) { // last two characters are transposed a = hMatrix.get(i - 1, j - 1); // transposition, e.g. ababab, ababba b = hMatrix.get(i + 1, j); // deletion, e.g. abab, aba c = hMatrix.get(i, j + 1); // insertion e.g. aba, abab result = 1 + min(a, b, c); } else { // otherwise a = hMatrix.get(i, j); // replacement, e.g. ababa, ababb b = hMatrix.get(i + 1, j); // deletion, e.g. ab, a c = hMatrix.get(i, j + 1); // insertion e.g. a, ab result = 1 + min(a, b, c); } hMatrix.set(i + 1, j + 1, result); return result; }
private void findRepl( final int depth, final int node, final byte[] prevBytes, final int wordIndex, final int candIndex) { int dist = 0; for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) { byteBuffer = BufferUtils.clearAndEnsureCapacity(byteBuffer, prevBytes.length + 1); byteBuffer.put(prevBytes); byteBuffer.put(fsa.getArcLabel(arc)); final int bufPos = byteBuffer.position(); byteBuffer.flip(); decoder.reset(); // FIXME: this isn't correct -- no checks for overflows, no decoder flush. I don't think this // should be in here // too, the decoder should run once on accumulated temporary byte buffer (current path) only // when there's // a potential that this buffer can become a replacement candidate (isEndOfCandidate). Because // we assume candidates // are valid input strings (this is verified when building the dictionary), it's save a lot of // conversions. final CoderResult c = decoder.decode(byteBuffer, charBuffer, true); if (c.isMalformed()) { // assume that only valid // encodings are there final byte[] prev = new byte[bufPos]; byteBuffer.position(0); byteBuffer.get(prev); if (!fsa.isArcTerminal(arc)) { findRepl( depth, fsa.getEndNode(arc), prev, wordIndex, candIndex); // note: depth is not incremented } byteBuffer.clear(); } else if (!c.isError()) { // unmappable characters are silently discarded charBuffer.flip(); candidate[candIndex] = charBuffer.get(); charBuffer.clear(); byteBuffer.clear(); int lengthReplacement; // replacement "any to two" if ((lengthReplacement = matchAnyToTwo(wordIndex, candIndex)) > 0) { // the replacement takes place at the end of the candidate if (isEndOfCandidate(arc, wordIndex) && (dist = hMatrix.get(depth - 1, depth - 1)) <= effectEditDistance) { if (Math.abs(wordLen - 1 - (wordIndex + lengthReplacement - 2)) > 0) { // there are extra letters in the word after the replacement dist = dist + Math.abs(wordLen - 1 - (wordIndex + lengthReplacement - 2)); } if (dist <= effectEditDistance) { addCandidate(candIndex, dist); } } if (isArcNotTerminal(arc, candIndex)) { int x = hMatrix.get(depth, depth); hMatrix.set(depth, depth, hMatrix.get(depth - 1, depth - 1)); findRepl( Math.max(0, depth), fsa.getEndNode(arc), new byte[0], wordIndex + lengthReplacement - 1, candIndex + 1); hMatrix.set(depth, depth, x); } } // replacement "any to one" if ((lengthReplacement = matchAnyToOne(wordIndex, candIndex)) > 0) { // the replacement takes place at the end of the candidate if (isEndOfCandidate(arc, wordIndex) && (dist = hMatrix.get(depth, depth)) <= effectEditDistance) { if (Math.abs(wordLen - 1 - (wordIndex + lengthReplacement - 1)) > 0) { // there are extra letters in the word after the replacement dist = dist + Math.abs(wordLen - 1 - (wordIndex + lengthReplacement - 1)); } if (dist <= effectEditDistance) { addCandidate(candIndex, dist); } } if (isArcNotTerminal(arc, candIndex)) { findRepl( depth, fsa.getEndNode(arc), new byte[0], wordIndex + lengthReplacement, candIndex + 1); } } // general if (cuted(depth, wordIndex, candIndex) <= effectEditDistance) { if ((isEndOfCandidate(arc, wordIndex)) && (dist = ed(wordLen - 1 - (wordIndex - depth), depth, wordLen - 1, candIndex)) <= effectEditDistance) { addCandidate(candIndex, dist); } if (isArcNotTerminal(arc, candIndex)) { findRepl(depth + 1, fsa.getEndNode(arc), new byte[0], wordIndex + 1, candIndex + 1); } } } } }