private static class DefaultSetHolder { /** @deprecated (3.1) remove this for Lucene 5.0 */ @Deprecated static final CharArraySet DEFAULT_STOP_SET_30 = CharArraySet.unmodifiableSet( new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(RUSSIAN_STOP_WORDS_30), false)); static final CharArraySet DEFAULT_STOP_SET; static { try { DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet( IOUtils.getDecodingReader( SnowballFilter.class, DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) throw new RuntimeException("Unable to load default stopword set", ex); } } }
/** * Builds an analyzer with the given stop words and a stem exclusion set. If a stem exclusion set * is provided this analyzer will add a {@link KeywordMarkerFilter} before {@link * BulgarianStemFilter}. */ public BulgarianAnalyzer( Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) { super(matchVersion, stopwords); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet)); }
/** * Builds an analyzer with the given stop words and a set of work to be excluded from the {@link * CzechStemFilter}. * * @param matchVersion Lucene version to match * @param stopwords a stopword set * @param stemExclusionTable a stemming exclusion set */ public CzechAnalyzer( Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable) { super(matchVersion, stopwords); this.stemExclusionTable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable)); }
/** * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is provided * this analyzer will add a {@link KeywordMarkerFilter} before stemming. * * @param matchVersion lucene compatibility version * @param stopwords a stopword set * @param stemExclusionSet a set of terms not to be stemmed */ public DanishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) { super(matchVersion, stopwords); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet)); }
/** * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is provided * this analyzer will add a {@link SetKeywordMarkerFilter} before stemming. * * @param stopwords a stopword set * @param stemExclusionSet a set of terms not to be stemmed */ public SpanishAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) { super(stopwords); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet)); }
/** {@link Analyzer} for Irish. */ public final class IrishAnalyzer extends StopwordAnalyzerBase { private final CharArraySet stemExclusionSet; /** File containing default Irish stopwords. */ public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt"; private static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet(new CharArraySet(Arrays.asList("d", "m", "b"), true)); /** * When StandardTokenizer splits t‑athair into {t, athair}, we don't want to cause a position * increment, otherwise there will be problems with phrase queries versus tAthair (which would not * have a gap). */ private static final CharArraySet HYPHENATIONS = CharArraySet.unmodifiableSet(new CharArraySet(Arrays.asList("h", "n", "t"), true)); /** * Returns an unmodifiable instance of the default stop words set. * * @return default stop words set. */ public static CharArraySet getDefaultStopSet() { return DefaultSetHolder.DEFAULT_STOP_SET; } /** * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class accesses the * static final set the first time.; */ private static class DefaultSetHolder { static final CharArraySet DEFAULT_STOP_SET; static { try { DEFAULT_STOP_SET = loadStopwordSet(false, IrishAnalyzer.class, DEFAULT_STOPWORD_FILE, "#"); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) throw new RuntimeException("Unable to load default stopword set"); } } } /** Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. */ public IrishAnalyzer() { this(DefaultSetHolder.DEFAULT_STOP_SET); } /** * Builds an analyzer with the given stop words. * * @param stopwords a stopword set */ public IrishAnalyzer(CharArraySet stopwords) { this(stopwords, CharArraySet.EMPTY_SET); } /** * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is provided * this analyzer will add a {@link SetKeywordMarkerFilter} before stemming. * * @param stopwords a stopword set * @param stemExclusionSet a set of terms not to be stemmed */ public IrishAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) { super(stopwords); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet)); } /** * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} which tokenizes all * the text in the provided {@link Reader}. * * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from an * {@link StandardTokenizer} filtered with {@link StandardFilter}, {@link * IrishLowerCaseFilter}, {@link StopFilter} , {@link SetKeywordMarkerFilter} if a stem * exclusion set is provided and {@link SnowballFilter}. */ @Override protected TokenStreamComponents createComponents(String fieldName) { final Tokenizer source = new StandardTokenizer(); TokenStream result = new StandardFilter(source); result = new StopFilter(result, HYPHENATIONS); result = new ElisionFilter(result, DEFAULT_ARTICLES); result = new IrishLowerCaseFilter(result); result = new StopFilter(result, stopwords); if (!stemExclusionSet.isEmpty()) result = new SetKeywordMarkerFilter(result, stemExclusionSet); result = new SnowballFilter(result, new IrishStemmer()); return new TokenStreamComponents(source, result); } }