public void test() throws Exception {
  final CharArraySet cas = new CharArraySet(3, false);
  cas.add("jjp");
  cas.add("wlmwoknt");
  cas.add("tcgyreo");
  final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
  builder.add("mtqlpi", "");
  builder.add("mwoknt", "jjp");
  builder.add("tcgyreo", "zpfpajyws");
  final NormalizeCharMap map = builder.build();
  Analyzer a =
      new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          Tokenizer t = new MockTokenizer(MockTokenFilter.ENGLISH_STOPSET, false, -65);
          TokenFilter f = new CommonGramsFilter(t, cas);
          return new TokenStreamComponents(t, f);
        }

        @Override
        protected Reader initReader(String fieldName, Reader reader) {
          reader = new MockCharFilter(reader, 0);
          reader = new MappingCharFilter(map, reader);
          reader = new TestRandomChains.CheckThatYouDidntReadAnythingReaderWrapper(reader);
          return reader;
        }
      };
  checkAnalysisConsistency(random(), a, false, "wmgddzunizdomqyj");
  a.close();
}
public void testWithKeywordAttribute() throws IOException {
  CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
  set.add("yourselves");
  Tokenizer tokenizer =
      new MockTokenizer(new StringReader("yourselves yours"), MockTokenizer.WHITESPACE, false);
  TokenStream filter = new PorterStemFilter(new KeywordMarkerFilter(tokenizer, set));
  assertTokenStreamContents(filter, new String[] {"yourselves", "your"});
}
public void testWithKeywordAttribute() throws IOException {
  CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
  set.add("ساهدهات");
  MockTokenizer tokenStream =
      new MockTokenizer(new StringReader("ساهدهات"), MockTokenizer.WHITESPACE, false);
  ArabicStemFilter filter = new ArabicStemFilter(new KeywordMarkerFilter(tokenStream, set));
  assertTokenStreamContents(filter, new String[] {"ساهدهات"});
}
public void testWithKeywordAttribute() throws IOException {
  CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
  set.add("fischen");
  GermanStemFilter filter =
      new GermanStemFilter(
          new KeywordMarkerFilter(
              new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader("Fischen Trinken")),
              set));
  assertTokenStreamContents(filter, new String[] {"fischen", "trink"});
}
public void testWithKeywordAttribute() throws IOException {
  CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
  set.add("hole");
  CzechStemFilter filter =
      new CzechStemFilter(
          new KeywordMarkerFilter(
              new MockTokenizer(new StringReader("hole desek"), MockTokenizer.WHITESPACE, false),
              set));
  assertTokenStreamContents(filter, new String[] {"hole", "desk"});
}
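// A minimal companion sketch for the four tests above (assumed to run in the same
// BaseTokenStreamTestCase context; input words are illustrative): instead of asserting stemmer
// output, it inspects the KeywordAttribute directly to show which terms the marker filter flags.
public void testKeywordAttributeIsSet() throws IOException {
  CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
  set.add("yourselves");
  Tokenizer tokenizer =
      new MockTokenizer(new StringReader("yourselves yours"), MockTokenizer.WHITESPACE, false);
  TokenStream stream = new KeywordMarkerFilter(tokenizer, set);
  CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
  KeywordAttribute keywordAtt = stream.addAttribute(KeywordAttribute.class);
  stream.reset();
  while (stream.incrementToken()) {
    // expected: "yourselves" is flagged as a keyword, "yours" is not
    System.out.println(termAtt.toString() + " keyword=" + keywordAtt.isKeyword());
  }
  stream.end();
  stream.close();
}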
private CharArraySet getWordSet(ResourceLoader loader, String wordFiles, boolean ignoreCase)
    throws IOException {
  List<String> files = splitFileNames(wordFiles);
  CharArraySet words = null;
  if (files.size() > 0) {
    // default stopwords list has 35 or so words, but maybe don't make it that
    // big to start
    words = new CharArraySet(files.size() * 10, ignoreCase);
    for (String file : files) {
      List<String> wlist = getLines(loader, file.trim());
      words.addAll(StopFilter.makeStopSet(wlist, ignoreCase));
    }
  }
  return words;
}
private static CharArraySet resolveNamedWords(
    Collection<String> words, Map<String, Set<?>> namedWords, boolean ignoreCase) {
  if (namedWords == null) {
    return new CharArraySet(words, ignoreCase);
  }
  CharArraySet setWords = new CharArraySet(words.size(), ignoreCase);
  for (String word : words) {
    if (namedWords.containsKey(word)) {
      setWords.addAll(namedWords.get(word));
    } else {
      setWords.add(word);
    }
  }
  return setWords;
}
/**
 * Create a new KeywordMarkerFilter that marks the current token as a keyword, via the
 * {@link KeywordAttribute}, if the token's term buffer is contained in the given set.
 *
 * @param in TokenStream to filter
 * @param keywordSet the keyword set used to look up the current term buffer
 */
public KeywordMarkerFilter(final TokenStream in, final Set<?> keywordSet) {
  this(
      in,
      keywordSet instanceof CharArraySet
          ? (CharArraySet) keywordSet
          : CharArraySet.copy(Version.LUCENE_31, keywordSet));
}
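// A hedged usage sketch for the constructor above (input text and keyword values are
// illustrative): a plain java.util.Set goes through CharArraySet.copy(...), while a CharArraySet
// is used directly without copying.
Set<String> plainKeywords = new HashSet<String>(Arrays.asList("C++", "C#"));
Tokenizer t1 = new WhitespaceTokenizer(Version.LUCENE_31, new StringReader("C++ rocks"));
TokenStream marked1 = new KeywordMarkerFilter(t1, plainKeywords); // copied into a CharArraySet

CharArraySet fastKeywords = new CharArraySet(Version.LUCENE_31, Arrays.asList("C++", "C#"), false);
Tokenizer t2 = new WhitespaceTokenizer(Version.LUCENE_31, new StringReader("C# rocks"));
TokenStream marked2 = new KeywordMarkerFilter(t2, fastKeywords); // used as-is, no copy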
private CharArraySet remove(CharArraySet fromSet, char[] charArray) {
  // System.out.println( "remove from: " + new String( charArray ));
  CharArraySet newSet = new CharArraySet(5, false);
  Iterator<Object> phraseIt = currentSetToCheck.iterator();
  while (phraseIt != null && phraseIt.hasNext()) {
    char[] phrase = (char[]) phraseIt.next();
    // if (!equals( phrase, charArray) && (startsWith( charArray, phrase ) || endsWith( charArray, phrase))) {
    if (!equals(phrase, charArray) && startsWith(phrase, charArray)
        || endsWith(charArray, phrase)) {
      newSet.add(phrase);
    } else {
      // System.out.println( "removing " + new String( phrase ));
    }
  }
  return newSet;
}
/**
 * Creates {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} used to tokenize all
 * the text in the provided {@link Reader}.
 *
 * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from a {@link
 *     StandardTokenizer} filtered with {@link StandardFilter}, {@link LowerCaseFilter}, {@link
 *     StopFilter}, {@link KeywordMarkerFilter} if a stem exclusion set is provided, and {@link
 *     SnowballFilter}
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  if (matchVersion.onOrAfter(Version.LUCENE_31)) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if (!stemExclusionSet.isEmpty()) result = new KeywordMarkerFilter(result, stemExclusionSet);
    result = new SnowballFilter(result, new org.tartarus.snowball.ext.RussianStemmer());
    return new TokenStreamComponents(source, result);
  } else {
    final Tokenizer source = new RussianLetterTokenizer(matchVersion, reader);
    TokenStream result = new LowerCaseFilter(matchVersion, source);
    result = new StopFilter(matchVersion, result, stopwords);
    if (!stemExclusionSet.isEmpty()) result = new KeywordMarkerFilter(result, stemExclusionSet);
    result = new SnowballFilter(result, new org.tartarus.snowball.ext.RussianStemmer());
    return new TokenStreamComponents(source, result);
  }
}
/**
 * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} which tokenizes all
 * the text in the provided {@link Reader}.
 *
 * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from a
 *     {@link StandardTokenizer} filtered with {@link StandardFilter}, {@link LowerCaseFilter},
 *     {@link StopFilter}, {@link KeywordMarkerFilter} if a stem exclusion set is provided, and
 *     {@link BulgarianStemFilter}.
 */
@Override
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
  final Tokenizer source = new StandardTokenizer(matchVersion, reader);
  TokenStream result = new StandardFilter(matchVersion, source);
  result = new LowerCaseFilter(matchVersion, result);
  result = new StopFilter(matchVersion, result, stopwords);
  if (!stemExclusionSet.isEmpty()) result = new KeywordMarkerFilter(result, stemExclusionSet);
  result = new BulgarianStemFilter(result);
  return new TokenStreamComponents(source, result);
}
/**
 * Creates {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} used to tokenize all
 * the text in the provided {@link Reader}.
 *
 * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from a {@link
 *     StandardTokenizer} filtered with {@link StandardFilter}, {@link LowerCaseFilter}, {@link
 *     StopFilter}, {@link SetKeywordMarkerFilter} if a stem exclusion set is provided, and
 *     {@link SnowballFilter}
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  final Tokenizer source = new StandardTokenizer();
  TokenStream result = new StandardFilter(source);
  result = new LowerCaseFilter(result);
  result = new StopFilter(result, stopwords);
  if (!stemExclusionSet.isEmpty()) result = new SetKeywordMarkerFilter(result, stemExclusionSet);
  result = new SnowballFilter(result, new org.tartarus.snowball.ext.RussianStemmer());
  return new TokenStreamComponents(source, result);
}
@Override
public final boolean incrementToken() throws IOException {
  if (input.incrementToken()) {
    if (keywordSet.contains(termAtt.buffer(), 0, termAtt.length())) {
      keywordAttr.setKeyword(true);
    }
    return true;
  } else {
    return false;
  }
}
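// A hedged sketch of the consumer side (SkipKeywordsStemFilter and stem(...) are illustrative
// placeholders, not Lucene classes): downstream stem filters typically check KeywordAttribute
// and leave marked tokens untouched, which is what makes the marking in incrementToken() above
// effective.
public final class SkipKeywordsStemFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);

  protected SkipKeywordsStemFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (!keywordAtt.isKeyword()) {
      // only unmarked tokens are rewritten
      char[] stemmed = stem(termAtt.buffer(), termAtt.length());
      termAtt.copyBuffer(stemmed, 0, stemmed.length);
    }
    return true;
  }

  private char[] stem(char[] buffer, int length) {
    // placeholder: a real implementation would apply a stemming algorithm here
    return java.util.Arrays.copyOf(buffer, length);
  }
}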
/**
 * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} which tokenizes all
 * the text in the provided {@link Reader}.
 *
 * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from a
 *     {@link StandardTokenizer} filtered with {@link StandardFilter}, {@link
 *     IrishLowerCaseFilter}, {@link StopFilter}, {@link SetKeywordMarkerFilter} if a stem
 *     exclusion set is provided, and {@link SnowballFilter}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  final Tokenizer source = new StandardTokenizer();
  TokenStream result = new StandardFilter(source);
  result = new StopFilter(result, HYPHENATIONS);
  result = new ElisionFilter(result, DEFAULT_ARTICLES);
  result = new IrishLowerCaseFilter(result);
  result = new StopFilter(result, stopwords);
  if (!stemExclusionSet.isEmpty()) result = new SetKeywordMarkerFilter(result, stemExclusionSet);
  result = new SnowballFilter(result, new IrishStemmer());
  return new TokenStreamComponents(source, result);
}
@Override
public TokenStream getTokenStream(Tokenizer tokenizer, CharArraySet stemExclusionSet) {
  TokenStream stream = new StandardFilter(matchVersion, tokenizer);
  if (caseInsensitive) stream = new LowerCaseFilter(matchVersion, stream);
  if (useStopWords)
    stream = new StopFilter(matchVersion, stream, SpanishAnalyzer.getDefaultStopSet());
  if (useStem) {
    if (!stemExclusionSet.isEmpty()) stream = new SetKeywordMarkerFilter(stream, stemExclusionSet);
    stream = new SpanishLightStemFilter(stream);
  }
  return stream;
}
private CharArrayMap convertPhraseSet(CharArraySet phraseSet) {
  CharArrayMap<CharArraySet> phraseMap = new CharArrayMap(100, false);
  Iterator<Object> phraseIt = phraseSet.iterator();
  while (phraseIt != null && phraseIt.hasNext()) {
    char[] phrase = (char[]) phraseIt.next();
    Log.debug("'" + new String(phrase) + "'");
    char[] firstTerm = getFirstTerm(phrase);
    Log.debug("'" + new String(firstTerm) + "'");
    CharArraySet itsPhrases = phraseMap.get(firstTerm, 0, firstTerm.length);
    if (itsPhrases == null) {
      itsPhrases = new CharArraySet(5, false);
      phraseMap.put(new String(firstTerm), itsPhrases);
    }
    itsPhrases.add(phrase);
  }
  return phraseMap;
}
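// An illustrative sketch of what convertPhraseSet produces (phrase values are made up): phrases
// are grouped under their first term, so a lookup on the first token of the input finds every
// candidate phrase that could start there.
CharArraySet phrases = new CharArraySet(100, false);
phrases.add("big apple");
phrases.add("big data");
phrases.add("new york");
// after conversion the map contains:
//   "big" -> { "big apple", "big data" }
//   "new" -> { "new york" }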
/**
 * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} which tokenizes all
 * the text in the provided {@link Reader}.
 *
 * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from a
 *     {@link StandardTokenizer} filtered with {@link StandardFilter}, {@link LowerCaseFilter},
 *     {@link StopFilter}, {@link SetKeywordMarkerFilter} if a stem exclusion set is provided,
 *     and {@link SpanishLightStemFilter}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  final Tokenizer source;
  if (getVersion().onOrAfter(Version.LUCENE_4_7_0)) {
    source = new StandardTokenizer();
  } else {
    source = new StandardTokenizer40();
  }
  TokenStream result = new StandardFilter(source);
  result = new LowerCaseFilter(result);
  result = new StopFilter(result, stopwords);
  if (!stemExclusionSet.isEmpty()) result = new SetKeywordMarkerFilter(result, stemExclusionSet);
  result = new SpanishLightStemFilter(result);
  return new TokenStreamComponents(source, result);
}
/**
 * Creates {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} used to tokenize all
 * the text in the provided {@link Reader}.
 *
 * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from a {@link
 *     StandardTokenizer} filtered with {@link LowerCaseFilter}, {@link DecimalDigitFilter},
 *     {@link StopFilter}, {@link ArabicNormalizationFilter}, {@link SetKeywordMarkerFilter} if a
 *     stem exclusion set is provided, and {@link ArabicStemFilter}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  final Tokenizer source = new StandardTokenizer();
  TokenStream result = new LowerCaseFilter(source);
  if (getVersion().onOrAfter(Version.LUCENE_5_4_0)) {
    result = new DecimalDigitFilter(result);
  }
  // the order here is important: the stopword list is not normalized!
  result = new StopFilter(result, stopwords);
  // TODO maybe we should make ArabicNormalization filter also KeywordAttribute aware?!
  result = new ArabicNormalizationFilter(result);
  if (!stemExclusionSet.isEmpty()) {
    result = new SetKeywordMarkerFilter(result, stemExclusionSet);
  }
  return new TokenStreamComponents(source, new ArabicStemFilter(result));
}
/**
 * Creates {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} used to tokenize all
 * the text in the provided {@link Reader}.
 *
 * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from a {@link
 *     StandardTokenizer} filtered with {@link LowerCaseFilter}, {@link IndicNormalizationFilter},
 *     {@link HindiNormalizationFilter}, {@link KeywordMarkerFilter} if a stem exclusion set is
 *     provided, {@link HindiStemFilter}, and Hindi Stop words
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  final Tokenizer source;
  if (matchVersion.onOrAfter(Version.LUCENE_36)) {
    source = new StandardTokenizer(matchVersion, reader);
  } else {
    source = new IndicTokenizer(matchVersion, reader);
  }
  TokenStream result = new LowerCaseFilter(matchVersion, source);
  if (!stemExclusionSet.isEmpty()) result = new KeywordMarkerFilter(result, stemExclusionSet);
  result = new IndicNormalizationFilter(result);
  result = new HindiNormalizationFilter(result);
  result = new StopFilter(matchVersion, result, stopwords);
  result = new HindiStemFilter(result);
  return new TokenStreamComponents(source, result);
}
@Inject
public WordDelimiterTokenFilterFactory(
    Index index,
    @IndexSettings Settings indexSettings,
    Environment env,
    @Assisted String name,
    @Assisted Settings settings) {
  super(index, indexSettings, name, settings);

  // Sample Format for the type table:
  // $ => DIGIT
  // % => DIGIT
  // . => DIGIT
  // \u002C => DIGIT
  // \u200D => ALPHANUM
  List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
  if (charTypeTableValues == null) {
    this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
  } else {
    this.charTypeTable = parseTypes(charTypeTableValues);
  }

  int flags = 0;
  // If set, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
  flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true);
  // If set, causes number subwords to be generated: "500-42" => "500" "42"
  flags |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true);
  // If set, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
  flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false);
  // If set, causes maximum runs of number parts to be catenated: "500-42" => "50042"
  flags |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false);
  // If set, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
  flags |= getFlag(CATENATE_ALL, settings, "catenate_all", false);
  // If set, causes "PowerShot" to be two tokens ("Power-Shot" remains two parts regardless)
  flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true);
  // If set, includes original words in subwords: "500-42" => "500" "42" "500-42"
  flags |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false);
  // If set, causes "j2se" to be three tokens: "j" "2" "se"
  flags |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true);
  // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
  flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
  // If not null, this is the set of tokens to protect from being delimited
  Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words", version);
  this.protoWords =
      protectedWords == null ? null : CharArraySet.copy(Lucene.VERSION, protectedWords);
  this.flags = flags;
}
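// A hedged sketch of what the getFlag(...) calls above amount to (the flag constants come from
// Lucene's WordDelimiterFilter; the settings lookup is simplified to plain booleans for
// illustration): each enabled option contributes one bit to the combined configuration int.
int flags = 0;
boolean generateWordParts = true; // "generate_word_parts"
boolean catenateWords = false;    // "catenate_words"
if (generateWordParts) {
  flags |= WordDelimiterFilter.GENERATE_WORD_PARTS;
}
if (catenateWords) {
  flags |= WordDelimiterFilter.CATENATE_WORDS;
}
// the resulting int is what the factory later passes to the word-delimiter filter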
@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
  final StreamLemmasFilter src =
      new StreamLemmasFilter(reader, hebMorphLemmatizer, commonWords, lemmaFilter);
  src.setAlwaysSaveMarkedOriginal(alwaysSaveMarkedOriginal);
  src.setSuffixForExactMatch(suffixForExactMatch);

  TokenStream tok = new SynonymFilter(src, acronymMergingMap, false);
  if (commonWords != null && commonWords.size() > 0)
    tok = new CommonGramsFilter(matchVersion, tok, commonWords, false);
  tok = new SuffixKeywordFilter(tok, '$');

  return new TokenStreamComponents(src, tok) {
    @Override
    protected void setReader(final Reader reader) throws IOException {
      super.setReader(reader);
    }
  };
}
private static class DefaultSetHolder {
  /** @deprecated (3.1) remove this for Lucene 5.0 */
  @Deprecated
  static final CharArraySet DEFAULT_STOP_SET_30 =
      CharArraySet.unmodifiableSet(
          new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(RUSSIAN_STOP_WORDS_30), false));

  static final CharArraySet DEFAULT_STOP_SET;

  static {
    try {
      DEFAULT_STOP_SET =
          WordlistLoader.getSnowballWordSet(
              IOUtils.getDecodingReader(
                  SnowballFilter.class, DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8),
              Version.LUCENE_CURRENT);
    } catch (IOException ex) {
      // default set should always be present as it is part of the
      // distribution (JAR)
      throw new RuntimeException("Unable to load default stopword set", ex);
    }
  }
}
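// A hedged sketch of the accessor that usually sits next to such a holder (the method name
// follows the convention used by Lucene analyzers; confirm against the enclosing class): the
// holder defers loading the bundled stopword file until the set is first requested.
public static CharArraySet getDefaultStopSet() {
  return DefaultSetHolder.DEFAULT_STOP_SET;
}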
private char[] getFirst(CharArraySet charSet) {
  if (charSet.isEmpty()) return null;
  Iterator<Object> phraseIt = charSet.iterator();
  return (char[]) phraseIt.next();
}
/**
 * Builds an analyzer with the given stop words and a stem exclusion set. If a stem exclusion set
 * is provided this analyzer will add a {@link KeywordMarkerFilter} before {@link
 * BulgarianStemFilter}.
 */
public BulgarianAnalyzer(
    Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
  super(matchVersion, stopwords);
  this.stemExclusionSet =
      CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
}
@Override
public boolean matches(char s[], int len) {
  return super.matches(s, len) && !exceptions.contains(s, 0, len);
}
/**
 * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is provided
 * this analyzer will add a {@link SetKeywordMarkerFilter} before stemming.
 *
 * @param stopwords a stopword set
 * @param stemExclusionSet a set of terms not to be stemmed
 */
public SpanishAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
  super(stopwords);
  this.stemExclusionSet =
      CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
}
/**
 * Builds an analyzer with the given stop words and a set of words to be excluded from the {@link
 * CzechStemFilter}.
 *
 * @param matchVersion Lucene version to match
 * @param stopwords a stopword set
 * @param stemExclusionTable a stemming exclusion set
 */
public CzechAnalyzer(
    Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable) {
  super(matchVersion, stopwords);
  this.stemExclusionTable =
      CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable));
}
@Override
public boolean incrementToken() throws IOException {
  while (true) {
    if (!hasSavedState) {
      // process a new input word
      if (!input.incrementToken()) {
        return false;
      }
      int termLength = termAttribute.length();
      char[] termBuffer = termAttribute.buffer();
      accumPosInc += posIncAttribute.getPositionIncrement();

      iterator.setText(termBuffer, termLength);
      iterator.next();

      // word of no delimiters, or protected word: just return it
      if ((iterator.current == 0 && iterator.end == termLength)
          || (protWords != null && protWords.contains(termBuffer, 0, termLength))) {
        posIncAttribute.setPositionIncrement(accumPosInc);
        accumPosInc = 0;
        first = false;
        return true;
      }

      // word of simply delimiters
      if (iterator.end == WordDelimiterIterator.DONE && !has(PRESERVE_ORIGINAL)) {
        // if the posInc is 1, simply ignore it in the accumulation
        // TODO: proper hole adjustment (FilteringTokenFilter-like) instead of this previous
        // logic!
        if (posIncAttribute.getPositionIncrement() == 1 && !first) {
          accumPosInc--;
        }
        continue;
      }

      saveState();
      hasOutputToken = false;
      hasOutputFollowingOriginal = !has(PRESERVE_ORIGINAL);
      lastConcatCount = 0;

      if (has(PRESERVE_ORIGINAL)) {
        posIncAttribute.setPositionIncrement(accumPosInc);
        accumPosInc = 0;
        first = false;
        return true;
      }
    }

    // at the end of the string, output any concatenations
    if (iterator.end == WordDelimiterIterator.DONE) {
      if (!concat.isEmpty()) {
        if (flushConcatenation(concat)) {
          buffer();
          continue;
        }
      }

      if (!concatAll.isEmpty()) {
        // only if we haven't output this same combo above!
        if (concatAll.subwordCount > lastConcatCount) {
          concatAll.writeAndClear();
          buffer();
          continue;
        }
        concatAll.clear();
      }

      if (bufferedPos < bufferedLen) {
        if (bufferedPos == 0) {
          sorter.sort(0, bufferedLen);
        }
        clearAttributes();
        restoreState(buffered[bufferedPos++]);
        if (first && posIncAttribute.getPositionIncrement() == 0) {
          // can easily happen with strange combinations (e.g. not outputting numbers, but
          // concat-all)
          posIncAttribute.setPositionIncrement(1);
        }
        first = false;
        return true;
      }

      // no saved concatenations, on to the next input word
      bufferedPos = bufferedLen = 0;
      hasSavedState = false;
      continue;
    }

    // word surrounded by delimiters: always output
    if (iterator.isSingleWord()) {
      generatePart(true);
      iterator.next();
      first = false;
      return true;
    }

    int wordType = iterator.type();

    // do we already have queued up incompatible concatenations?
    if (!concat.isEmpty() && (concat.type & wordType) == 0) {
      if (flushConcatenation(concat)) {
        hasOutputToken = false;
        buffer();
        continue;
      }
      hasOutputToken = false;
    }

    // add subwords depending upon options
    if (shouldConcatenate(wordType)) {
      if (concat.isEmpty()) {
        concat.type = wordType;
      }
      concatenate(concat);
    }

    // add all subwords (catenateAll)
    if (has(CATENATE_ALL)) {
      concatenate(concatAll);
    }

    // if we should output the word or number part
    if (shouldGenerateParts(wordType)) {
      generatePart(false);
      buffer();
    }

    iterator.next();
  }
}
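// A hedged end-to-end sketch (flag choice, input text, and the protected word are illustrative;
// the three-argument constructor matches Lucene 5.x WordDelimiterFilter): "PowerShot" is split on
// the case change, while the protected "Wi-Fi" passes through unchanged.
CharArraySet protWords = new CharArraySet(Arrays.asList("Wi-Fi"), true);
int flags =
    WordDelimiterFilter.GENERATE_WORD_PARTS
        | WordDelimiterFilter.GENERATE_NUMBER_PARTS
        | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE;
Tokenizer tok = new WhitespaceTokenizer();
tok.setReader(new StringReader("PowerShot Wi-Fi"));
TokenStream stream = new WordDelimiterFilter(tok, flags, protWords);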
@Override
public boolean incrementToken() throws IOException {
  if (!emitSingleTokens && unusedTokens.size() > 0) {
    Log.debug("emitting unused phrases");
    // emit these until the queue is empty before emitting any new stuff
    Token aToken = unusedTokens.remove(0);
    emit(aToken);
    return true;
  }

  if (lastToken != null) {
    emit(lastToken);
    lastToken = null;
    return true;
  }

  char[] nextToken = nextToken();
  // if (nextToken != null) System.out.println( "nextToken: " + new String( nextToken ));
  if (nextToken == null) {
    if (lastValid != null) {
      emit(lastValid);
      lastValid = null;
      return true;
    }

    if (emitSingleTokens && currentSetToCheck != null && currentSetToCheck.size() > 0) {
      char[] phrase = getFirst(currentSetToCheck);
      char[] lastTok = getCurrentBuffer(new char[0]);
      if (phrase != null && endsWith(lastTok, phrase)) {
        currentSetToCheck = remove(currentSetToCheck, phrase);
        emit(phrase);
        return true;
      }
    } else if (!emitSingleTokens && currentSetToCheck != null && currentSetToCheck.size() > 0) {
      if (lastEmitted != null
          && !equals(fixWhitespace(lastEmitted), getCurrentBuffer(new char[0]))) {
        discardCharTokens(currentPhrase, unusedTokens);
        currentSetToCheck = null;
        if (unusedTokens.size() > 0) {
          Token aToken = unusedTokens.remove(0);
          Log.debug("emitting putback token");
          emit(aToken);
          return true;
        }
      }
    }

    if (lastEmitted == null && (currentPhrase != null && currentPhrase.length() > 0)) {
      char[] lastTok = getCurrentBuffer(new char[0]);
      if (currentSetToCheck.contains(lastTok, 0, lastTok.length)) {
        emit(lastTok);
        currentPhrase.setLength(0);
        return true;
      } else if (!emitSingleTokens) {
        discardCharTokens(currentPhrase, unusedTokens);
        currentSetToCheck = null;
        currentPhrase.setLength(0);
        if (unusedTokens.size() > 0) {
          Token aToken = unusedTokens.remove(0);
          Log.debug("emitting putback token");
          emit(aToken);
          return true;
        }
      }
    }
    return false;
  }

  // if emitSingleToken, set lastToken = nextToken
  if (emitSingleTokens) {
    lastToken = nextToken;
  }

  if (currentSetToCheck == null || currentSetToCheck.size() == 0) {
    Log.debug("Checking for phrase start on '" + new String(nextToken) + "'");
    if (phraseMap.keySet().contains(nextToken, 0, nextToken.length)) {
      // get the phrase set for this token, add it to currentSetTocheck
      currentSetToCheck = phraseMap.get(nextToken, 0, nextToken.length);
      if (currentPhrase == null) currentPhrase = new StringBuffer();
      else currentPhrase.setLength(0);
      currentPhrase.append(nextToken);
      return incrementToken();
    } else {
      emit(nextToken);
      // clear lastToken
      lastToken = null;
      return true;
    }
  } else {
    // add token to the current string buffer.
    char[] currentBuffer = getCurrentBuffer(nextToken);
    if (currentSetToCheck.contains(currentBuffer, 0, currentBuffer.length)) {
      // if its the only one valid, emit it
      // if there is a longer one, wait to see if it will be matched
      // if the longer one breaks on the next token, emit this one...
      // emit the current phrase
      currentSetToCheck = remove(currentSetToCheck, currentBuffer);
      if (currentSetToCheck.size() == 0) {
        emit(currentBuffer);
        lastValid = null;
        --positionIncr;
      } else {
        if (emitSingleTokens) {
          lastToken = currentBuffer;
          return true;
        }
        lastValid = currentBuffer;
      }

      if (phraseMap.keySet().contains(nextToken, 0, nextToken.length)) {
        // get the phrase set for this token, add it to currentPhrasesTocheck
        currentSetToCheck = phraseMap.get(nextToken, 0, nextToken.length);
        if (currentPhrase == null) currentPhrase = new StringBuffer();
        else currentPhrase.setLength(0);
        currentPhrase.append(nextToken);
      }
      return (lastValid != null) ? incrementToken() : true;
    }

    if (phraseMap.keySet().contains(nextToken, 0, nextToken.length)) {
      // get the phrase set for this token, add it to currentPhrasesTocheck
      // System.out.println( "starting new phrase with " + new String( nextToken ) );
      // does this add all of the set? if not need iterator loop
      CharArraySet newSet = phraseMap.get(nextToken, 0, nextToken.length);
      Iterator<Object> phraseIt = newSet.iterator();
      while (phraseIt != null && phraseIt.hasNext()) {
        char[] phrase = (char[]) phraseIt.next();
        currentSetToCheck.add(phrase);
      }
    }

    // for each phrase in currentSetToCheck -
    // if there is a phrase prefix match, get the next token recursively
    Iterator<Object> phraseIt = currentSetToCheck.iterator();
    while (phraseIt != null && phraseIt.hasNext()) {
      char[] phrase = (char[]) phraseIt.next();
      if (startsWith(phrase, currentBuffer)) {
        return incrementToken();
      }
    }

    if (lastValid != null) {
      emit(lastValid);
      lastValid = null;
      return true;
    }

    if (!emitSingleTokens) {
      // current phrase didn't match fully: put the tokens back
      // into the unusedTokens list
      discardCharTokens(currentPhrase, unusedTokens);
      currentPhrase.setLength(0);
      currentSetToCheck = null;
      if (unusedTokens.size() > 0) {
        Token aToken = unusedTokens.remove(0);
        Log.debug("emitting putback token");
        emit(aToken);
        return true;
      }
    }

    currentSetToCheck = null;
    Log.debug("returning at end.");
    return incrementToken();
  }
}
/**
 * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is provided
 * this analyzer will add a {@link KeywordMarkerFilter} before stemming.
 *
 * @param matchVersion lucene compatibility version
 * @param stopwords a stopword set
 * @param stemExclusionSet a set of terms not to be stemmed
 */
public DanishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
  super(matchVersion, stopwords);
  this.stemExclusionSet =
      CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
}
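// A hedged construction sketch (the version constant and the excluded term are illustrative):
// terms placed in the stem-exclusion set are marked as keywords and therefore skip the Danish
// stemmer, while everything else is stemmed normally.
CharArraySet exclusions =
    new CharArraySet(Version.LUCENE_36, Arrays.asList("undervisning"), false);
Analyzer analyzer =
    new DanishAnalyzer(Version.LUCENE_36, DanishAnalyzer.getDefaultStopSet(), exclusions);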