  private boolean lookup() {
    /*
     * debugging code:
     * String thisLookup = word.toString();
     * boolean added = lookups.add(thisLookup);
     * if (!added) {
     *   System.out.println("######extra lookup:" + thisLookup);
     *   // occasional extra lookups aren't necessarily errors... could happen by diff manipulations
     *   // throw new RuntimeException("######extra lookup:" + thisLookup);
     * } else {
     *   // System.out.println("new lookup:" + thisLookup);
     * }
     */
    matchedEntry = dict_ht.get(word.getArray(), 0, word.size());
    return matchedEntry != null;
  }
  private DictEntry wordInDict() {
    /*
     * if (matchedEntry != null) {
     *   if (dict_ht.get(word.getArray(), 0, word.size()) != matchedEntry) {
     *     System.out.println("Uh oh... cached entry doesn't match");
     *   }
     *   return matchedEntry;
     * }
     */
    if (matchedEntry != null) return matchedEntry;
    DictEntry e = dict_ht.get(word.getArray(), 0, word.length());
    if (e != null && !e.exception) {
      matchedEntry = e; // only cache if it's not an exception.
    }
    // lookups.add(word.toString());
    return e;
  }
  private CharArrayMap<CharArraySet> convertPhraseSet(CharArraySet phraseSet) {
    CharArrayMap<CharArraySet> phraseMap = new CharArrayMap<>(100, false);
    Iterator<Object> phraseIt = phraseSet.iterator();
    while (phraseIt != null && phraseIt.hasNext()) {
      char[] phrase = (char[]) phraseIt.next();
      Log.debug("'" + new String(phrase) + "'");

      char[] firstTerm = getFirstTerm(phrase);
      Log.debug("'" + new String(firstTerm) + "'");

      // group each phrase under its first term so incrementToken() can detect
      // a possible phrase start with a single map lookup
      CharArraySet itsPhrases = phraseMap.get(firstTerm, 0, firstTerm.length);
      if (itsPhrases == null) {
        itsPhrases = new CharArraySet(5, false);
        phraseMap.put(new String(firstTerm), itsPhrases);
      }
      itsPhrases.add(phrase);
    }
    return phraseMap;
  }
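  /*
   * Illustrative sketch (not part of the original filter): the map built above is
   * keyed by the first term of each phrase. For example, assuming a phrase set
   * containing "big apple" and "big apple pie", both phrases end up grouped under
   * the single key "big":
   *
   *   CharArraySet phrases = new CharArraySet(16, false);
   *   phrases.add("big apple".toCharArray());
   *   phrases.add("big apple pie".toCharArray());
   *   CharArrayMap<CharArraySet> byFirstTerm = convertPhraseSet(phrases);
   *   // byFirstTerm.get("big".toCharArray(), 0, 3) -> { "big apple", "big apple pie" }
   *
   * incrementToken() only needs this first-term lookup to decide whether an incoming
   * token can start a phrase; the candidate set is then narrowed token by token.
   */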
  @Override
  public boolean incrementToken() throws IOException {
    if (!emitSingleTokens && unusedTokens.size() > 0) {
      Log.debug("emitting unused phrases");
      // emit these until the queue is empty before emitting any new stuff
      Token aToken = unusedTokens.remove(0);
      emit(aToken);
      return true;
    }

    if (lastToken != null) {
      emit(lastToken);
      lastToken = null;
      return true;
    }

    char[] nextToken = nextToken();
    // if (nextToken != null) System.out.println("nextToken: " + new String(nextToken));

    if (nextToken == null) {
      if (lastValid != null) {
        emit(lastValid);
        lastValid = null;
        return true;
      }

      if (emitSingleTokens && currentSetToCheck != null && currentSetToCheck.size() > 0) {
        char[] phrase = getFirst(currentSetToCheck);
        char[] lastTok = getCurrentBuffer(new char[0]);
        if (phrase != null && endsWith(lastTok, phrase)) {
          currentSetToCheck = remove(currentSetToCheck, phrase);
          emit(phrase);
          return true;
        }
      } else if (!emitSingleTokens && currentSetToCheck != null && currentSetToCheck.size() > 0) {
        if (lastEmitted != null
            && !equals(fixWhitespace(lastEmitted), getCurrentBuffer(new char[0]))) {
          discardCharTokens(currentPhrase, unusedTokens);
          currentSetToCheck = null;
          if (unusedTokens.size() > 0) {
            Token aToken = unusedTokens.remove(0);
            Log.debug("emitting putback token");
            emit(aToken);
            return true;
          }
        }
      }

      if (lastEmitted == null && (currentPhrase != null && currentPhrase.length() > 0)) {
        char[] lastTok = getCurrentBuffer(new char[0]);
        if (currentSetToCheck.contains(lastTok, 0, lastTok.length)) {
          emit(lastTok);
          currentPhrase.setLength(0);
          return true;
        } else if (!emitSingleTokens) {
          discardCharTokens(currentPhrase, unusedTokens);
          currentSetToCheck = null;
          currentPhrase.setLength(0);
          if (unusedTokens.size() > 0) {
            Token aToken = unusedTokens.remove(0);
            Log.debug("emitting putback token");
            emit(aToken);
            return true;
          }
        }
      }

      return false;
    }

    // if emitSingleTokens, remember this token so it can also be emitted as-is
    if (emitSingleTokens) {
      lastToken = nextToken;
    }

    if (currentSetToCheck == null || currentSetToCheck.size() == 0) {
      Log.debug("Checking for phrase start on '" + new String(nextToken) + "'");

      if (phraseMap.keySet().contains(nextToken, 0, nextToken.length)) {
        // get the phrase set for this token and make it the currentSetToCheck
        currentSetToCheck = phraseMap.get(nextToken, 0, nextToken.length);
        if (currentPhrase == null) currentPhrase = new StringBuffer();
        else currentPhrase.setLength(0);
        currentPhrase.append(nextToken);
        return incrementToken();
      } else {
        emit(nextToken);
        // clear lastToken
        lastToken = null;
        return true;
      }
    } else {
      // add the token to the current phrase buffer.
      char[] currentBuffer = getCurrentBuffer(nextToken);

      if (currentSetToCheck.contains(currentBuffer, 0, currentBuffer.length)) {
        // if it's the only valid phrase, emit it;
        // if there is a longer one, wait to see if it will be matched;
        // if the longer one breaks on the next token, emit this one...
        currentSetToCheck = remove(currentSetToCheck, currentBuffer);

        if (currentSetToCheck.size() == 0) {
          emit(currentBuffer);
          lastValid = null;
          --positionIncr;
        } else {
          if (emitSingleTokens) {
            lastToken = currentBuffer;
            return true;
          }
          lastValid = currentBuffer;
        }

        if (phraseMap.keySet().contains(nextToken, 0, nextToken.length)) {
          // this token can also start a new phrase: make its set the currentSetToCheck
          currentSetToCheck = phraseMap.get(nextToken, 0, nextToken.length);
          if (currentPhrase == null) currentPhrase = new StringBuffer();
          else currentPhrase.setLength(0);
          currentPhrase.append(nextToken);
        }

        return (lastValid != null) ? incrementToken() : true;
      }

      if (phraseMap.keySet().contains(nextToken, 0, nextToken.length)) {
        // this token can also start a new phrase: add its phrase set to currentSetToCheck
        // System.out.println("starting new phrase with " + new String(nextToken));
        // does this add all of the set? if not, need an iterator loop
        CharArraySet newSet = phraseMap.get(nextToken, 0, nextToken.length);
        Iterator<Object> phraseIt = newSet.iterator();
        while (phraseIt != null && phraseIt.hasNext()) {
          char[] phrase = (char[]) phraseIt.next();
          currentSetToCheck.add(phrase);
        }
      }

      // for each phrase in currentSetToCheck:
      // if there is a phrase prefix match, get the next token recursively
      Iterator<Object> phraseIt = currentSetToCheck.iterator();
      while (phraseIt != null && phraseIt.hasNext()) {
        char[] phrase = (char[]) phraseIt.next();
        if (startsWith(phrase, currentBuffer)) {
          return incrementToken();
        }
      }

      if (lastValid != null) {
        emit(lastValid);
        lastValid = null;
        return true;
      }

      if (!emitSingleTokens) {
        // the current phrase didn't match fully: put the tokens back
        // into the unusedTokens list
        discardCharTokens(currentPhrase, unusedTokens);
        currentPhrase.setLength(0);
        currentSetToCheck = null;

        if (unusedTokens.size() > 0) {
          Token aToken = unusedTokens.remove(0);
          Log.debug("emitting putback token");
          emit(aToken);
          return true;
        }
      }
      currentSetToCheck = null;

      Log.debug("returning at end.");
      return incrementToken();
    }
  }
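  /*
   * Illustrative sketch (not from the original source): driving the filter through the
   * standard TokenStream contract. The constructor shape below is an assumption -- the
   * actual class may take different or additional arguments -- but the consume loop is
   * the usual Lucene pattern. With the phrase "wheel chair" configured and
   * emitSingleTokens == false, the input "wheel chair access" would be expected to come
   * out as the two tokens "wheel chair" and "access".
   *
   *   CharArraySet phrases = new CharArraySet(16, false);
   *   phrases.add("wheel chair".toCharArray());
   *
   *   Tokenizer source = new WhitespaceTokenizer();
   *   source.setReader(new StringReader("wheel chair access"));
   *   TokenStream ts = new AutoPhrasingTokenFilter(source, phrases, false); // assumed signature
   *   CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
   *
   *   ts.reset();
   *   while (ts.incrementToken()) {
   *     System.out.println(termAtt.toString());
   *   }
   *   ts.end();
   *   ts.close();
   */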
  /** Stems the text in the token. Returns true if changed. */
  boolean stem(char[] term, int len) {

    result = null;

    k = len - 1;
    if ((k <= 1) || (k >= MaxWordLen - 1)) {
      return false; // don't stem
    }

    // first check the stemmer dictionaries, and avoid using the
    // cache if it's in there.
    DictEntry entry = dict_ht.get(term, 0, len);
    if (entry != null) {
      if (entry.root != null) {
        result = entry.root;
        return true;
      }
      return false;
    }

    /*
     * caching off is normally faster
     * if (cache == null) initializeStemHash();
     *
     * // now check the cache, before we copy chars to "word"
     * if (cache != null) {
     *   String val = cache.get(term, 0, len);
     *   if (val != null) {
     *     if (val != SAME) {
     *       result = val;
     *       return true;
     *     }
     *     return false;
     *   }
     * }
     */

    word.reset();
    // allocate enough space so that an expansion is never needed
    word.reserve(len + 10);
    for (int i = 0; i < len; i++) {
      char ch = term[i];
      if (!isAlpha(ch)) return false; // don't stem
      // don't lowercase... it's a requirement that lowercase filter be
      // used before this stemmer.
      word.unsafeWrite(ch);
    }

    matchedEntry = null;
    /*
     * lookups.clear();
     * lookups.add(word.toString());
     */

    /*
     * This while loop will never be executed more than one time; it is here
     * only to allow the break statement to be used to escape as soon as a word
     * is recognized
     */
    while (true) {
      // YCS: extra lookup()s were inserted so we don't need to
      // do an extra wordInDict() here.
      plural();
      if (matched()) break;
      pastTense();
      if (matched()) break;
      aspect();
      if (matched()) break;
      ityEndings();
      if (matched()) break;
      nessEndings();
      if (matched()) break;
      ionEndings();
      if (matched()) break;
      erAndOrEndings();
      if (matched()) break;
      lyEndings();
      if (matched()) break;
      alEndings();
      if (matched()) break;
      entry = wordInDict();
      iveEndings();
      if (matched()) break;
      izeEndings();
      if (matched()) break;
      mentEndings();
      if (matched()) break;
      bleEndings();
      if (matched()) break;
      ismEndings();
      if (matched()) break;
      icEndings();
      if (matched()) break;
      ncyEndings();
      if (matched()) break;
      nceEndings();
      matched();
      break;
    }

    /*
     * try for a direct mapping (allows for cases like `Italian'->`Italy' and
     * `Italians'->`Italy')
     */
    entry = matchedEntry;
    if (entry != null) {
      result = entry.root; // may be null, which means that "word" is the stem
    }

    /*
     * caching off is normally faster
     * if (cache != null && cache.size() < maxCacheSize) {
     *   char[] key = new char[len];
     *   System.arraycopy(term, 0, key, 0, len);
     *   if (result != null) {
     *     cache.put(key, result);
     *   } else {
     *     cache.put(key, word.toString());
     *   }
     * }
     */

    /*
     * if (entry == null) {
     *   if (!word.toString().equals(new String(term, 0, len))) {
     *     System.out.println("CASE:" + word.toString() + "," + new String(term, 0, len));
     *   }
     * }
     */

    // no entry matched means result is "word"
    return true;
  }
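  /*
   * Illustrative sketch (not part of this class): in Lucene this stemmer is normally
   * driven through org.apache.lucene.analysis.en.KStemFilter, and -- as the comment in
   * stem() notes -- the input must already be lower-cased. A minimal chain:
   *
   *   Tokenizer source = new WhitespaceTokenizer();
   *   source.setReader(new StringReader("Walking the walked paths"));
   *   TokenStream ts = new LowerCaseFilter(source); // required before the stemmer
   *   ts = new KStemFilter(ts);                     // calls stem(term, len) per token
   *   CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
   *
   *   ts.reset();
   *   while (ts.incrementToken()) {
   *     System.out.println(termAtt.toString());     // "walking" and "walked" should stem to "walk"
   *   }
   *   ts.end();
   *   ts.close();
   */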