/** * Analyze a piece of text * * @param text the text to be analyzed */ public void analyze(StringBuffer text) { if (ngrams != null) { ngrams.clear(); sorted = null; ngramcounts = null; } word.clear().append(SEPARATOR); for (int i = 0; i < text.length(); i++) { char c = Character.toLowerCase(text.charAt(i)); if (Character.isLetter(c)) { add(word.append(c)); } else { // found word boundary if (word.length() > 1) { // we have a word! add(word.append(SEPARATOR)); word.clear().append(SEPARATOR); } } } if (word.length() > 1) { // we have a word! add(word.append(SEPARATOR)); } normalize(); }
/**
 * Add the last NGrams from the specified word, i.e. every n-gram that ends
 * at the word's final character, for each length from {@code minLength} up
 * to the smaller of {@code maxLength} and the word length.
 */
private void add(QuickStringBuffer word) {
    int end = word.length();
    if (end < minLength) {
        // Too short to contain even the smallest n-gram.
        return;
    }

    int longest = (maxLength < end) ? maxLength : end;
    int size = minLength;
    while (size <= longest) {
        add(word.subSequence(end - size, end));
        size++;
    }
}
/**
 * Add every contiguous sub-sequence of length {@code n} found in the word,
 * from left to right.
 *
 * @param word the word to take the sub-sequences from
 * @param n sequence length
 */
private void add(StringBuffer word, int n) {
    int start = 0;
    while (start <= word.length() - n) {
        add(word.subSequence(start, start + n));
        start++;
    }
}
/**
 * Add ngrams from a single word to this profile.
 *
 * <p>N-gram lengths run from {@code minLength} up to {@code maxLength}, but
 * always stay strictly below the word length, so no n-gram spanning the
 * entire word is added here.
 *
 * @param word is the word to add
 */
public void add(StringBuffer word) {
    for (int size = minLength; !(size > maxLength || size >= word.length()); size++) {
        add(word, size);
    }
}
/**
 * Add ngrams from a token to this profile.
 *
 * <p>The token's term text is wrapped in {@code SEPARATOR} on both sides
 * before being added as a word.
 *
 * @param t is the Token to be added
 */
public void add(Token t) {
    StringBuffer delimited = new StringBuffer();
    delimited.append(SEPARATOR);
    delimited.append(t.termText());
    delimited.append(SEPARATOR);
    add(delimited);
}