/** * Returns a copy of the given map as a {@link CharArrayMap}. If the given map is a {@link * CharArrayMap} the ignoreCase property will be preserved. * * @param map a map to copy * @return a copy of the given map as a {@link CharArrayMap}. If the given map is a {@link * CharArrayMap} the ignoreCase property as well as the matchVersion will be of the given map * will be preserved. */ @SuppressWarnings("unchecked") public static <V> CharArrayMap<V> copy(final Map<?, ? extends V> map) { if (map == EMPTY_MAP) return emptyMap(); if (map instanceof CharArrayMap) { CharArrayMap<V> m = (CharArrayMap<V>) map; // use fast path instead of iterating all values // this is even on very small sets ~10 times faster than iterating final char[][] keys = new char[m.keys.length][]; System.arraycopy(m.keys, 0, keys, 0, keys.length); final V[] values = (V[]) new Object[m.values.length]; System.arraycopy(m.values, 0, values, 0, values.length); m = new CharArrayMap<>(m); m.keys = keys; m.values = values; return m; } // In jdk-9b54 or later, a plain diamond causes compile error with "-source 1.7": return new CharArrayMap<V>(map, false); }
/**
 * Looks the current contents of {@code word} up in {@code dict_ht} and records the outcome in
 * {@code matchedEntry} (which becomes {@code null} on a miss).
 *
 * @return {@code true} if the dictionary contained an entry for the current word
 */
private boolean lookup() {
  // Historical debugging aid (disabled): every looked-up word was added to a "lookups" set and
  // repeated lookups were printed; occasional repeats were not considered errors.
  final DictEntry hit = dict_ht.get(word.getArray(), 0, word.size());
  matchedEntry = hit;
  return hit != null;
}
/**
 * Groups the phrases of {@code phraseSet} by their first term.
 *
 * <p>For every phrase the first term is obtained from {@code getFirstTerm}; the phrase is then
 * added to the {@link CharArraySet} stored under that term, creating the set on first use.
 *
 * @param phraseSet the phrases to index; its elements are cast to {@code char[]}
 * @return a map from first term to the set of phrases starting with that term
 */
private CharArrayMap convertPhraseSet(CharArraySet phraseSet) {
  CharArrayMap<CharArraySet> byFirstTerm = new CharArrayMap<CharArraySet>(100, false);

  for (Iterator<Object> cursor = phraseSet.iterator(); cursor != null && cursor.hasNext(); ) {
    char[] phrase = (char[]) cursor.next();
    Log.debug("'" + new String(phrase) + "'");

    char[] head = getFirstTerm(phrase);
    Log.debug("'" + new String(head) + "'");

    CharArraySet bucket = byFirstTerm.get(head, 0, head.length);
    if (bucket == null) {
      // first phrase seen for this leading term: start a new bucket
      bucket = new CharArraySet(5, false);
      byFirstTerm.put(new String(head), bucket);
    }
    bucket.add(phrase);
  }

  return byFirstTerm;
}
private DictEntry wordInDict() { /** * * if (matchedEntry != null) { if (dict_ht.get(word.getArray(), 0, word.size()) != * matchedEntry) { System.out.println("Uh oh... cached entry doesn't match"); } return * matchedEntry; } * */ if (matchedEntry != null) return matchedEntry; DictEntry e = dict_ht.get(word.getArray(), 0, word.length()); if (e != null && !e.exception) { matchedEntry = e; // only cache if it's not an exception. } // lookups.add(word.toString()); return e; }
/**
 * Produces the next output token, combining input tokens into phrases found in {@code phraseMap}.
 *
 * <p>Order of work as implemented below:
 *
 * <ol>
 *   <li>When {@code emitSingleTokens} is off, tokens waiting in {@code unusedTokens} are emitted
 *       first, one per call.
 *   <li>A pending {@code lastToken} is emitted next.
 *   <li>Otherwise the next input token is fetched with {@code nextToken()}. A {@code null} result
 *       is handled by flushing whatever state is still pending ({@code lastValid}, a partially
 *       collected phrase, put-back tokens).
 *   <li>With no candidate phrases active, the token either starts a new candidate set (looked up
 *       in {@code phraseMap}) or is emitted as is.
 *   <li>With candidates active, the token is appended to the phrase buffer and the buffer is
 *       checked for a complete match, then for a prefix match; if neither holds, the collected
 *       tokens are put back (unless {@code emitSingleTokens}) and processing restarts.
 * </ol>
 *
 * <p>The method calls itself recursively whenever more input is needed before something can be
 * emitted.
 *
 * @return {@code false} only on the path where {@code nextToken()} returned {@code null} and no
 *     pending state yielded a token; every other path returns {@code true} or the result of a
 *     recursive call
 * @throws IOException declared by the signature; presumably raised while reading upstream
 *     tokens - TODO confirm against {@code nextToken()}
 */
@Override
public boolean incrementToken() throws IOException {
  if (!emitSingleTokens && unusedTokens.size() > 0) {
    Log.debug("emitting unused phrases");
    // emit these until the queue is empty before emitting any new stuff
    Token aToken = unusedTokens.remove(0);
    emit(aToken);
    return true;
  }
  // a token parked by an earlier call goes out before any new input is read
  if (lastToken != null) {
    emit(lastToken);
    lastToken = null;
    return true;
  }
  char[] nextToken = nextToken();
  // if (nextToken != null) System.out.println( "nextToken: " + new String( nextToken ));
  if (nextToken == null) {
    // No further token available: flush pending state.
    if (lastValid != null) {
      // a shorter phrase matched earlier while a longer one was still being awaited
      emit(lastValid);
      lastValid = null;
      return true;
    }
    if (emitSingleTokens && currentSetToCheck != null && currentSetToCheck.size() > 0) {
      char[] phrase = getFirst(currentSetToCheck);
      // getCurrentBuffer is called with an empty token here, i.e. nothing new is supplied
      char[] lastTok = getCurrentBuffer(new char[0]);
      if (phrase != null && endsWith(lastTok, phrase)) {
        currentSetToCheck = remove(currentSetToCheck, phrase);
        emit(phrase);
        return true;
      }
    } else if (!emitSingleTokens && currentSetToCheck != null && currentSetToCheck.size() > 0) {
      if (lastEmitted != null
          && !equals(fixWhitespace(lastEmitted), getCurrentBuffer(new char[0]))) {
        // buffer differs from what was last emitted: give the collected tokens back
        discardCharTokens(currentPhrase, unusedTokens);
        currentSetToCheck = null;
        if (unusedTokens.size() > 0) {
          Token aToken = unusedTokens.remove(0);
          Log.debug("emitting putback token");
          emit(aToken);
          return true;
        }
      }
    }
    if (lastEmitted == null && (currentPhrase != null && currentPhrase.length() > 0)) {
      char[] lastTok = getCurrentBuffer(new char[0]);
      // NOTE(review): currentSetToCheck is dereferenced without a null check here although the
      // branches above treat null as a possible value - confirm it cannot be null on this path.
      if (currentSetToCheck.contains(lastTok, 0, lastTok.length)) {
        emit(lastTok);
        currentPhrase.setLength(0);
        return true;
      } else if (!emitSingleTokens) {
        discardCharTokens(currentPhrase, unusedTokens);
        currentSetToCheck = null;
        currentPhrase.setLength(0);
        if (unusedTokens.size() > 0) {
          Token aToken = unusedTokens.remove(0);
          Log.debug("emitting putback token");
          emit(aToken);
          return true;
        }
      }
    }
    return false;
  }
  // if emitSingleToken, set lastToken = nextToken
  if (emitSingleTokens) {
    lastToken = nextToken;
  }
  if (currentSetToCheck == null || currentSetToCheck.size() == 0) {
    // No candidate phrases active: this token can only start a phrase or stand alone.
    Log.debug("Checking for phrase start on '" + new String(nextToken) + "'");
    if (phraseMap.keySet().contains(nextToken, 0, nextToken.length)) {
      // get the phrase set for this token, add it to currentSetTocheck
      currentSetToCheck = phraseMap.get(nextToken, 0, nextToken.length);
      if (currentPhrase == null) currentPhrase = new StringBuffer();
      else currentPhrase.setLength(0);
      currentPhrase.append(nextToken);
      // nothing emitted yet; recurse to continue with the following token
      return incrementToken();
    } else {
      emit(nextToken);
      // clear lastToken
      lastToken = null;
      return true;
    }
  } else {
    // add token to the current string buffer.
    char[] currentBuffer = getCurrentBuffer(nextToken);
    if (currentSetToCheck.contains(currentBuffer, 0, currentBuffer.length)) {
      // if its the only one valid, emit it
      // if there is a longer one, wait to see if it will be matched
      // if the longer one breaks on the next token, emit this one...
      // emit the current phrase
      currentSetToCheck = remove(currentSetToCheck, currentBuffer);
      if (currentSetToCheck.size() == 0) {
        emit(currentBuffer);
        lastValid = null;
        --positionIncr;
      } else {
        if (emitSingleTokens) {
          // park the matched phrase; it is emitted by the lastToken branch of the next call
          lastToken = currentBuffer;
          return true;
        }
        lastValid = currentBuffer;
      }
      if (phraseMap.keySet().contains(nextToken, 0, nextToken.length)) {
        // get the phrase set for this token, add it to currentPhrasesTocheck
        currentSetToCheck = phraseMap.get(nextToken, 0, nextToken.length);
        if (currentPhrase == null) currentPhrase = new StringBuffer();
        else currentPhrase.setLength(0);
        currentPhrase.append(nextToken);
      }
      // lastValid set means a longer candidate is still open, so keep reading
      return (lastValid != null) ? incrementToken() : true;
    }
    if (phraseMap.keySet().contains(nextToken, 0, nextToken.length)) {
      // get the phrase set for this token, add it to currentPhrasesTocheck
      // System.out.println( "starting new phrase with " + new String( nextToken ) );
      // does this add all of the set? if not need iterator loop
      CharArraySet newSet = phraseMap.get(nextToken, 0, nextToken.length);
      Iterator<Object> phraseIt = newSet.iterator();
      while (phraseIt != null && phraseIt.hasNext()) {
        char[] phrase = (char[]) phraseIt.next();
        currentSetToCheck.add(phrase);
      }
    }
    // for each phrase in currentSetToCheck -
    // if there is a phrase prefix match, get the next token recursively
    Iterator<Object> phraseIt = currentSetToCheck.iterator();
    while (phraseIt != null && phraseIt.hasNext()) {
      char[] phrase = (char[]) phraseIt.next();
      if (startsWith(phrase, currentBuffer)) {
        return incrementToken();
      }
    }
    // no candidate can still match: fall back to the last complete match, if any
    if (lastValid != null) {
      emit(lastValid);
      lastValid = null;
      return true;
    }
    if (!emitSingleTokens) {
      // current phrase didn't match fully: put the tokens back
      // into the unusedTokens list
      discardCharTokens(currentPhrase, unusedTokens);
      currentPhrase.setLength(0);
      currentSetToCheck = null;
      if (unusedTokens.size() > 0) {
        Token aToken = unusedTokens.remove(0);
        Log.debug("emitting putback token");
        emit(aToken);
        return true;
      }
    }
    currentSetToCheck = null;
    Log.debug("returning at end.");
    return incrementToken();
  }
}
/**
 * Tells whether {@code phrase} is a key of {@code phraseMap}.
 *
 * @param phrase the characters to look up; only dereferenced when {@code phraseMap} is set
 * @return {@code false} if {@code phraseMap} is {@code null} or does not contain the key
 */
private boolean isPhrase(char[] phrase) {
  if (phraseMap == null) {
    return false;
  }
  return phraseMap.containsKey(phrase, 0, phrase.length);
}
/**
 * Returns an unmodifiable {@link CharArrayMap} for the given map, so that an internal map can be
 * handed out for "read-only" use.
 *
 * <p>The shared empty map is returned for the empty-map instance and for any map that reports
 * itself empty; a map that already is an {@code UnmodifiableCharArrayMap} is returned as is;
 * everything else is wrapped in a new {@code UnmodifiableCharArrayMap}.
 *
 * @param map a map for which the unmodifiable map is returned.
 * @return an unmodifiable {@link CharArrayMap}.
 * @throws NullPointerException if the given map is <code>null</code>.
 */
public static <V> CharArrayMap<V> unmodifiableMap(CharArrayMap<V> map) {
  if (map == null) {
    throw new NullPointerException("Given map is null");
  }
  if (map != emptyMap() && !map.isEmpty()) {
    if (map instanceof UnmodifiableCharArrayMap) {
      // already read-only, no second wrapper needed
      return map;
    }
    return new UnmodifiableCharArrayMap<>(map);
  }
  return emptyMap();
}
/**
 * Builds the stemmer dictionary.
 *
 * <p>Sources are loaded in this order, and the first occurrence of a word wins; a later duplicate
 * is skipped and reported on {@code System.out} together with the number of the source it came
 * from:
 *
 * <ol>
 *   <li>{@code exceptionWords} - each word gets its own entry, created with the word itself and
 *       the flag {@code true};
 *   <li>{@code directConflations} - column 0 is the key, column 1 is passed to the entry;
 *   <li>{@code countryNationality} - same layout as the direct conflations;
 *   <li>{@code KStemData1} .. {@code KStemData8} - all mapped to one shared default entry
 *       ({@code new DictEntry(null, false)});
 *   <li>{@code supplementDict} - shared default entry;
 *   <li>{@code properNouns} - shared default entry.
 * </ol>
 *
 * @return the populated dictionary
 */
private static CharArrayMap<DictEntry> initializeDictHash() {
  // Allocate the table exactly once (it used to be built twice, discarding the first instance).
  CharArrayMap<DictEntry> d = new CharArrayMap<DictEntry>(Version.LUCENE_31, 1000, false);

  for (int i = 0; i < exceptionWords.length; i++) {
    if (!d.containsKey(exceptionWords[i])) {
      d.put(exceptionWords[i], new DictEntry(exceptionWords[i], true));
    } else {
      System.out.println("Warning: Entry [" + exceptionWords[i] + "] already in dictionary 1");
    }
  }

  for (int i = 0; i < directConflations.length; i++) {
    if (!d.containsKey(directConflations[i][0])) {
      d.put(directConflations[i][0], new DictEntry(directConflations[i][1], false));
    } else {
      System.out.println(
          "Warning: Entry [" + directConflations[i][0] + "] already in dictionary 2");
    }
  }

  for (int i = 0; i < countryNationality.length; i++) {
    if (!d.containsKey(countryNationality[i][0])) {
      d.put(countryNationality[i][0], new DictEntry(countryNationality[i][1], false));
    } else {
      System.out.println(
          "Warning: Entry [" + countryNationality[i][0] + "] already in dictionary 3");
    }
  }

  // Every remaining word shares this single entry instance.
  DictEntry defaultEntry = new DictEntry(null, false);

  // The eight KStemData tables are all reported as source "4".
  addDefaultEntries(d, KStemData1.data, defaultEntry, "4");
  addDefaultEntries(d, KStemData2.data, defaultEntry, "4");
  addDefaultEntries(d, KStemData3.data, defaultEntry, "4");
  addDefaultEntries(d, KStemData4.data, defaultEntry, "4");
  addDefaultEntries(d, KStemData5.data, defaultEntry, "4");
  addDefaultEntries(d, KStemData6.data, defaultEntry, "4");
  addDefaultEntries(d, KStemData7.data, defaultEntry, "4");
  addDefaultEntries(d, KStemData8.data, defaultEntry, "4");

  addDefaultEntries(d, supplementDict, defaultEntry, "5");
  addDefaultEntries(d, properNouns, defaultEntry, "6");

  return d;
}

/**
 * Maps every word of {@code words} that is not yet in {@code dict} to {@code sharedEntry};
 * words already present are left untouched and reported on {@code System.out}.
 *
 * @param dict the dictionary being populated
 * @param words the words to add
 * @param sharedEntry the entry stored for each newly added word
 * @param dictionaryId the source number printed in the duplicate warning
 */
private static void addDefaultEntries(
    CharArrayMap<DictEntry> dict, String[] words, DictEntry sharedEntry, String dictionaryId) {
  for (String candidate : words) {
    if (!dict.containsKey(candidate)) {
      dict.put(candidate, sharedEntry);
    } else {
      System.out.println(
          "Warning: Entry [" + candidate + "] already in dictionary " + dictionaryId);
    }
  }
}
/**
 * Stems the text in the token.
 *
 * <p>The outcome is left in the fields {@code result} and {@code word}: when this method returns
 * {@code true} and {@code result} is non-null, {@code result} is the stem; when it returns
 * {@code true} and {@code result} is {@code null}, the stem is the current content of
 * {@code word}.
 *
 * <p>Note that the final {@code return true} is unconditional, so {@code true} does not by itself
 * prove that the stem differs from the input.
 *
 * @param term buffer holding the characters of the term; expected to be lowercased already (see
 *     the comment in the copy loop)
 * @param len number of characters of {@code term} to use
 * @return {@code false} when the term is left alone - it has at most 2 or at least
 *     {@code MaxWordLen} characters, it is in the dictionary with a {@code null} root, or it
 *     contains a character rejected by {@code isAlpha}; {@code true} otherwise
 */
boolean stem(char[] term, int len) {
  result = null;
  // k is the index of the last character
  k = len - 1;
  if ((k <= 1) || (k >= MaxWordLen - 1)) {
    return false; // don't stem
  }
  // first check the stemmer dictionaries, and avoid using the
  // cache if it's in there.
  DictEntry entry = dict_ht.get(term, 0, len);
  if (entry != null) {
    if (entry.root != null) {
      // dictionary supplies an explicit root for this term
      result = entry.root;
      return true;
    }
    // known word without a root: keep it unchanged
    return false;
  }
  /*
   * Disabled stem cache ("caching off is normally faster"):
   *
   * if (cache == null) initializeStemHash();
   *
   * // now check the cache, before we copy chars to "word"
   * if (cache != null) {
   *   String val = cache.get(term, 0, len);
   *   if (val != null) {
   *     if (val != SAME) { result = val; return true; }
   *     return false;
   *   }
   * }
   */
  word.reset();
  // allocate enough space so that an expansion is never needed
  word.reserve(len + 10);
  for (int i = 0; i < len; i++) {
    char ch = term[i];
    if (!isAlpha(ch)) return false; // don't stem
    // don't lowercase... it's a requirement that lowercase filter be
    // used before this stemmer.
    word.unsafeWrite(ch);
  }
  // forget any entry matched for a previous term
  matchedEntry = null;
  /*
   * Disabled debugging: lookups.clear(); lookups.add(word.toString());
   */
  /*
   * This while loop will never be executed more than one time; it is here
   * only to allow the break statement to be used to escape as soon as a word
   * is recognized
   */
  while (true) {
    // YCS: extra lookup()s were inserted so we don't need to
    // do an extra wordInDict() here.
    // The calls below run in a fixed order; each is presumably responsible for one family of
    // endings (as the names suggest) and the chain stops at the first one after which
    // matched() reports success.
    plural();
    if (matched()) break;
    pastTense();
    if (matched()) break;
    aspect();
    if (matched()) break;
    ityEndings();
    if (matched()) break;
    nessEndings();
    if (matched()) break;
    ionEndings();
    if (matched()) break;
    erAndOrEndings();
    if (matched()) break;
    lyEndings();
    if (matched()) break;
    alEndings();
    if (matched()) break;
    // The returned value is overwritten after the loop; the call matters because wordInDict()
    // may set matchedEntry as a side effect.
    entry = wordInDict();
    iveEndings();
    if (matched()) break;
    izeEndings();
    if (matched()) break;
    mentEndings();
    if (matched()) break;
    bleEndings();
    if (matched()) break;
    ismEndings();
    if (matched()) break;
    icEndings();
    if (matched()) break;
    ncyEndings();
    if (matched()) break;
    nceEndings();
    // result ignored: this is the last step, the loop ends either way
    matched();
    break;
  }
  /*
   * try for a direct mapping (allows for cases like `Italian'->`Italy' and
   * `Italians'->`Italy')
   */
  entry = matchedEntry;
  if (entry != null) {
    result = entry.root; // may be null, which means that "word" is the stem
  }
  /*
   * Disabled stem cache ("caching off is normally faster"):
   *
   * if (cache != null && cache.size() < maxCacheSize) {
   *   char[] key = new char[len];
   *   System.arraycopy(term, 0, key, 0, len);
   *   if (result != null) { cache.put(key, result); } else { cache.put(key, word.toString()); }
   * }
   */
  /*
   * Disabled debugging:
   *
   * if (entry == null) {
   *   if (!word.toString().equals(new String(term,0,len))) {
   *     System.out.println("CASE:" + word.toString() + "," + new String(term,0,len));
   *   }
   * }
   */
  // no entry matched means result is "word"
  return true;
}