/** * Get the frequency value for a word form. It is taken from the first entry with this word form. * * @param word the word to be tested * @return frequency value in range: 0..FREQ_RANGE-1 (0: less frequent). */ public int getFrequency(final CharSequence word) { if (!dictionaryMetadata.isFrequencyIncluded()) { return 0; } final byte separator = dictionaryMetadata.getSeparator(); try { byteBuffer = charSequenceToBytes(word); } catch (UnmappableInputException e) { return 0; } final MatchResult match = matcher.match(matchResult, byteBuffer.array(), 0, byteBuffer.remaining(), rootNode); if (match.kind == SEQUENCE_IS_A_PREFIX) { final int arc = fsa.getArc(match.node, separator); if (arc != 0 && !fsa.isArcFinal(arc)) { finalStatesIterator.restartFrom(fsa.getEndNode(arc)); if (finalStatesIterator.hasNext()) { final ByteBuffer bb = finalStatesIterator.next(); final byte[] ba = bb.array(); final int bbSize = bb.remaining(); // the last byte contains the frequency after a separator return ba[bbSize - 1] - FIRST_RANGE_CODE; } } } return 0; }
// by Jaume Ortola private boolean areEqual(final char x, final char y) { if (x == y) { return true; } if (dictionaryMetadata.getEquivalentChars() != null && dictionaryMetadata.getEquivalentChars().containsKey(x) && dictionaryMetadata.getEquivalentChars().get(x).contains(y)) { return true; } if (dictionaryMetadata.isIgnoringDiacritics()) { String xn = Normalizer.normalize(Character.toString(x), Form.NFD); String yn = Normalizer.normalize(Character.toString(y), Form.NFD); if (xn.charAt(0) == yn.charAt(0)) { // avoid case conversion, if possible return true; } if (dictionaryMetadata.isConvertingCase()) { // again case conversion only when needed -- we // do not need String.lowercase because we only check // single characters, so a cheaper method is enough if (Character.isLetter(xn.charAt(0))) { boolean testNeeded = Character.isLowerCase(xn.charAt(0)) != Character.isLowerCase(yn.charAt(0)); if (testNeeded) { return Character.toLowerCase(xn.charAt(0)) == Character.toLowerCase(yn.charAt(0)); } } } return xn.charAt(0) == yn.charAt(0); } return false; }
/**
 * Creates a speller backed by the given dictionary.
 *
 * <p>The constructor stores the dictionary's automaton and metadata, creates the traversal and
 * iteration helpers over the automaton, obtains the encoder and decoder from the metadata and
 * finally builds the replacement maps.
 *
 * @param dictionary the dictionary whose <code>fsa</code> and <code>metadata</code> are used
 * @param editDistance the edit distance stored in this speller and passed to the
 *     <code>HMatrix</code>
 * @throws IllegalArgumentException if the automaton's root node is 0 or the dictionary metadata
 *     is null
 */
public Speller(final Dictionary dictionary, final int editDistance) {
  this.editDistance = editDistance;
  this.hMatrix = new HMatrix(editDistance, MAX_WORD_LENGTH);
  this.dictionaryMetadata = dictionary.metadata;
  this.rootNode = dictionary.fsa.getRootNode();
  this.fsa = dictionary.fsa;
  this.matcher = new FSATraversal(fsa);
  this.finalStatesIterator = new ByteSequenceIterator(fsa, rootNode);
  // Note: both validity checks run after the helpers above have already been constructed.
  if (rootNode == 0) {
    throw new IllegalArgumentException("Dictionary must have at least the root node.");
  }
  if (dictionaryMetadata == null) {
    throw new IllegalArgumentException("Dictionary metadata must not be null.");
  }
  encoder = dictionaryMetadata.getEncoder();
  decoder = dictionaryMetadata.getDecoder();
  // Multibyte separator will result in an exception here.
  // The returned value is deliberately discarded; the call is made only for that check.
  dictionaryMetadata.getSeparatorAsChar();
  this.createReplacementsMaps();
}
/**
 * Checks whether the node reached through the given arc has an outgoing, non-terminal arc
 * labelled with the dictionary separator. Always false when the dictionary is currently
 * considered not to contain separators.
 *
 * @param arc the arc whose end node is inspected
 * @return true if a non-terminal separator arc leaves the end node of <code>arc</code>
 */
private boolean isBeforeSeparator(final int arc) {
  if (!containsSeparators) {
    return false;
  }
  final int separatorArc = fsa.getArc(fsa.getEndNode(arc), dictionaryMetadata.getSeparator());
  if (separatorArc == 0) {
    return false;
  }
  return !fsa.isArcTerminal(separatorArc);
}
/** * Propose suggestions for misspelled run-on words. This algorithm is inspired by spell.cc in * s_fsa package by Jan Daciuk. * * @param original The original misspelled word. * @return The list of suggested pairs, as space-concatenated strings. */ public List<String> replaceRunOnWords(final String original) { final List<String> candidates = new ArrayList<String>(); if (!isInDictionary( DictionaryLookup.applyReplacements( original, dictionaryMetadata.getInputConversionPairs())) && dictionaryMetadata.isSupportingRunOnWords()) { for (int i = 1; i < original.length(); i++) { // chop from left to right final CharSequence firstCh = original.subSequence(0, i); if (isInDictionary(firstCh) && isInDictionary(original.subSequence(i, original.length()))) { if (!dictionaryMetadata.getOutputConversionPairs().isEmpty()) { candidates.add(firstCh + " " + original.subSequence(i, original.length())); } else { candidates.add( DictionaryLookup.applyReplacements( firstCh + " " + original.subSequence(i, original.length()), dictionaryMetadata.getOutputConversionPairs()) .toString()); } } } } return candidates; }
/** * Checks whether the word is misspelled, by performing a series of checks according to properties * of the dictionary. * * <p>If the flag <code>fsa.dict.speller.ignore-punctuation</code> is set, then all non-alphabetic * characters are considered to be correctly spelled. * * <p>If the flag <code>fsa.dict.speller.ignore-numbers</code> is set, then all words containing * decimal digits are considered to be correctly spelled. * * <p>If the flag <code>fsa.dict.speller.ignore-camel-case</code> is set, then all CamelCase words * are considered to be correctly spelled. * * <p>If the flag <code>fsa.dict.speller.ignore-all-uppercase</code> is set, then all alphabetic * words composed of only uppercase characters are considered to be correctly spelled. * * <p>Otherwise, the word is checked in the dictionary. If the test fails, and the dictionary does * not perform any case conversions (as set by <code>fsa.dict.speller.convert-case</code> flag), * then the method returns false. In case of case conversions, it is checked whether a non-mixed * case word is found in its lowercase version in the dictionary, and for all-uppercase words, * whether the word is found in the dictionary with the initial uppercase letter. 
* * @param word - the word to be checked * @return true if the word is misspelled */ public boolean isMisspelled(final String word) { // dictionaries usually do not contain punctuation String wordToCheck = word; if (!dictionaryMetadata.getInputConversionPairs().isEmpty()) { wordToCheck = DictionaryLookup.applyReplacements(word, dictionaryMetadata.getInputConversionPairs()); } boolean isAlphabetic = wordToCheck.length() != 1 || isAlphabetic(wordToCheck.charAt(0)); return wordToCheck.length() > 0 && (!dictionaryMetadata.isIgnoringPunctuation() || isAlphabetic) && (!dictionaryMetadata.isIgnoringNumbers() || containsNoDigit(wordToCheck)) && !(dictionaryMetadata.isIgnoringCamelCase() && isCamelCase(wordToCheck)) && !(dictionaryMetadata.isIgnoringAllUppercase() && isAlphabetic && isAllUppercase(wordToCheck)) && !isInDictionary(wordToCheck) && (!dictionaryMetadata.isConvertingCase() || !(!isMixedCase(wordToCheck) && (isInDictionary(wordToCheck.toLowerCase(dictionaryMetadata.getLocale())) || isAllUppercase(wordToCheck) && isInDictionary(initialUppercase(wordToCheck))))); }
/** * Test whether the word is found in the dictionary. * * @param word the word to be tested * @return True if it is found. */ public boolean isInDictionary(final CharSequence word) { try { byteBuffer = charSequenceToBytes(word); } catch (UnmappableInputException e) { return false; } // Try to find a partial match in the dictionary. final MatchResult match = matcher.match(matchResult, byteBuffer.array(), 0, byteBuffer.remaining(), rootNode); if (match.kind == EXACT_MATCH) { containsSeparators = false; return true; } return containsSeparators && match.kind == SEQUENCE_IS_A_PREFIX && byteBuffer.remaining() > 0 && fsa.getArc(match.node, dictionaryMetadata.getSeparator()) != 0; }
private void createReplacementsMaps() { for (Map.Entry<String, List<String>> entry : dictionaryMetadata.getReplacementPairs().entrySet()) { for (String s : entry.getValue()) { // replacements any to one // the new key is the target of the replacement pair if (s.length() == 1) { if (!replacementsAnyToOne.containsKey(s.charAt(0))) { List<char[]> charList = new ArrayList<char[]>(); charList.add(entry.getKey().toCharArray()); replacementsAnyToOne.put(s.charAt(0), charList); } else { replacementsAnyToOne.get(s.charAt(0)).add(entry.getKey().toCharArray()); } } // replacements any to two // the new key is the target of the replacement pair else if (s.length() == 2) { if (!replacementsAnyToTwo.containsKey(s)) { List<char[]> charList = new ArrayList<char[]>(); charList.add(entry.getKey().toCharArray()); replacementsAnyToTwo.put(s, charList); } else { replacementsAnyToTwo.get(s).add(entry.getKey().toCharArray()); } } else { if (!replacementsTheRest.containsKey(entry.getKey())) { List<String> charList = new ArrayList<String>(); charList.add(s); replacementsTheRest.put(entry.getKey(), charList); } else { replacementsTheRest.get(entry.getKey()).add(s); } } } } }
/**
 * Reports whether the underlying dictionary is configured to perform case conversions, as
 * stated by its metadata.
 *
 * @return true if the dictionary metadata indicates that case is converted
 * @since 1.9
 */
public boolean convertsCase() {
  final boolean caseConversionEnabled = dictionaryMetadata.isConvertingCase();
  return caseConversionEnabled;
}
/**
 * Tells whether traversal may continue past the given arc: the arc must not be terminal and,
 * when the dictionary is considered to contain separators, the candidate character at the given
 * index must not be the separator character.
 *
 * @param arc the arc to test
 * @param candIndex index into the current candidate character array
 * @return true if the arc is not terminal and the candidate character is not a separator
 */
private boolean isArcNotTerminal(final int arc, final int candIndex) {
  if (fsa.isArcTerminal(arc)) {
    return false;
  }
  if (!containsSeparators) {
    return true;
  }
  return candidate[candIndex] != dictionaryMetadata.getSeparatorAsChar();
}
/** * Find suggestions by using K. Oflazer's algorithm. See Jan Daciuk's s_fsa package, spell.cc for * further explanation. * * @param w The original misspelled word. * @return A list of suggested replacements. */ public List<String> findReplacements(final String w) { String word = w; if (!dictionaryMetadata.getInputConversionPairs().isEmpty()) { word = DictionaryLookup.applyReplacements(w, dictionaryMetadata.getInputConversionPairs()); } candidates.clear(); if (word.length() > 0 && word.length() < MAX_WORD_LENGTH && !isInDictionary(word)) { List<String> wordsToCheck = new ArrayList<String>(); if (replacementsTheRest != null && word.length() > MIN_WORD_LENGTH) { for (final String wordChecked : getAllReplacements(word, 0, 0)) { boolean found = false; if (isInDictionary(wordChecked)) { candidates.add(new CandidateData(wordChecked, 0)); found = true; } else if (dictionaryMetadata.isConvertingCase()) { String lowerWord = wordChecked.toLowerCase(dictionaryMetadata.getLocale()); String upperWord = wordChecked.toUpperCase(dictionaryMetadata.getLocale()); if (isInDictionary(lowerWord)) { // add the word as it is in the dictionary, not mixed-case versions of it candidates.add(new CandidateData(lowerWord, 0)); found = true; } if (isInDictionary(upperWord)) { candidates.add(new CandidateData(upperWord, 0)); found = true; } if (lowerWord.length() > 1) { String firstupperWord = Character.toUpperCase(lowerWord.charAt(0)) + lowerWord.substring(1); if (isInDictionary(firstupperWord)) { candidates.add(new CandidateData(firstupperWord, 0)); found = true; } } } if (!found) { wordsToCheck.add(wordChecked); } } } else { wordsToCheck.add(word); } // If at least one candidate was found with the replacement pairs (which are usual errors), // probably there is no need for more candidates if (candidates.isEmpty()) { int i = 1; for (final String wordChecked : wordsToCheck) { i++; if (i > UPPER_SEARCH_LIMIT) { // for performance reasons, do not search too deeply break; } wordProcessed = 
wordChecked.toCharArray(); wordLen = wordProcessed.length; if (wordLen < MIN_WORD_LENGTH && i > 2) { // three-letter replacements make little sense anyway break; } candidate = new char[MAX_WORD_LENGTH]; candLen = candidate.length; effectEditDistance = wordLen <= editDistance ? wordLen - 1 : editDistance; charBuffer = BufferUtils.clearAndEnsureCapacity(charBuffer, MAX_WORD_LENGTH); byteBuffer = BufferUtils.clearAndEnsureCapacity(byteBuffer, MAX_WORD_LENGTH); final byte[] prevBytes = new byte[0]; findRepl(0, fsa.getRootNode(), prevBytes, 0, 0); } } } Collections.sort(candidates); // Use a linked set to avoid duplicates and preserve the ordering of candidates. final Set<String> candStringSet = new LinkedHashSet<String>(); for (final CandidateData cd : candidates) { candStringSet.add( DictionaryLookup.applyReplacements( cd.getWord(), dictionaryMetadata.getOutputConversionPairs()) .toString()); } final List<String> candStringList = new ArrayList<String>(candStringSet.size()); candStringList.addAll(candStringSet); return candStringList; }
/**
 * Builds a variant of the word in which the first character is kept as it is and the remainder
 * is lowercased using the dictionary's locale.
 *
 * @param wordToCheck the word to convert; must contain at least one character
 * @return the word with everything after its first character lowercased
 */
private CharSequence initialUppercase(final String wordToCheck) {
  final String head = wordToCheck.substring(0, 1);
  final String tail = wordToCheck.substring(1).toLowerCase(dictionaryMetadata.getLocale());
  return head + tail;
}