/**
 * Records one occurrence of a word in the index, unless the word is filtered out.
 *
 * <p>Words are filtered when they are shorter than three characters, when the
 * dictionary reports a frequency of zero for them (very rare words, e.g. spelling
 * mistakes), or when the reported frequency exceeds 90000 (extremely common
 * "stop words", see http://en.wikipedia.org/wiki/Stop_words).
 *
 * <p>The stop-word definition is completely arbitrary; it has more to do with
 * how much computation a single appengine node can handle. Ideally no words
 * would be ignored, but that would require sharding the word index across a
 * number of serving machines.
 *
 * @param str    the word to index
 * @param fileid identifier of the file the word occurred in
 * @param pos    position of the occurrence
 * @return {@code true} if the occurrence was recorded, {@code false} if the
 *         word was filtered out
 */
public boolean addWord(String str, long fileid, long pos) {
    // Very short strings are never indexed.
    if (str.length() < 3) {
        return false;
    }

    // Frequency 0 means the dictionary does not know the word; anything above
    // 90000 is treated as a stop word.
    final int frequency = dict.getFrequency(str);
    if (frequency == 0 || frequency > 90000) {
        return false;
    }

    // Fetch the per-file entries for this word, creating the list on first use.
    ArrayList<FilePositions> entries = idx.get(str);
    if (entries == null) {
        entries = new ArrayList<FilePositions>();
        idx.put(str, entries);
    }

    // Only the most recently added entry is compared against fileid; a new
    // entry is started when there is none yet or the file differs.
    final boolean startNewEntry =
            entries.size() == 0 || entries.get(entries.size() - 1).fileid != fileid;

    final FilePositions target;
    if (startNewEntry) {
        target = new FilePositions();
        target.fileid = fileid;
        target.poslist = new ArrayList<Long>();
        entries.add(target);
    } else {
        target = entries.get(entries.size() - 1);
    }

    target.poslist.add(pos);
    return true;
}
/**
 * Returns the highest frequency that any of the given dictionaries reports for a word.
 *
 * @param dictionaries the dictionaries to consult, keyed by name; keys whose
 *                     lookup yields {@code null} are skipped
 * @param word         the word to look up
 * @return {@link Dictionary#NOT_A_PROBABILITY} when {@code TextUtils.isEmpty(word)}
 *         is true; otherwise the largest value returned by
 *         {@code getFrequency(word)} across the dictionaries, never less than
 *         {@code -1} (which is also the result when no dictionary is consulted)
 */
public static int getMaxFrequency(
        final ConcurrentHashMap<String, Dictionary> dictionaries, CharSequence word) {
    if (TextUtils.isEmpty(word)) {
        return Dictionary.NOT_A_PROBABILITY;
    }

    // Running maximum; -1 is the floor so lower reported values never win.
    int best = -1;
    for (final String name : dictionaries.keySet()) {
        // The lookup is repeated per key, so an entry removed while iterating
        // shows up as null here and is simply ignored.
        final Dictionary candidate = dictionaries.get(name);
        if (candidate != null) {
            best = Math.max(best, candidate.getFrequency(word));
        }
    }
    return best;
}