private static class DefaultSetHolder {
    /** @deprecated (3.1) remove this for Lucene 5.0 */
    @Deprecated
    static final CharArraySet DEFAULT_STOP_SET_30 =
        CharArraySet.unmodifiableSet(
            new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(RUSSIAN_STOP_WORDS_30), false));

    static final CharArraySet DEFAULT_STOP_SET;

    static {
      try {
        DEFAULT_STOP_SET =
            WordlistLoader.getSnowballWordSet(
                IOUtils.getDecodingReader(
                    SnowballFilter.class, DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8),
                Version.LUCENE_CURRENT);
      } catch (IOException ex) {
        // default set should always be present as it is part of the
        // distribution (JAR)
        throw new RuntimeException("Unable to load default stopword set", ex);
      }
    }
  }
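
A brief usage sketch of the holder idiom above (assuming, as the RUSSIAN_STOP_WORDS_30 reference suggests, that this nested class sits inside RussianAnalyzer): the JVM only initializes DefaultSetHolder, and therefore only reads the bundled stopword file, when the set is first requested.

// Hypothetical caller; Lucene 3.x/4.x-era API, matching the Version.LUCENE_CURRENT usage above.
CharArraySet defaultStops = RussianAnalyzer.getDefaultStopSet(); // first access runs the static initializer
Analyzer analyzer = new RussianAnalyzer(Version.LUCENE_CURRENT, defaultStops);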
 /**
  * Builds an analyzer with the given stop words and a stem exclusion set. If a stem exclusion set
  * is provided this analyzer will add a {@link KeywordMarkerFilter} before {@link
  * BulgarianStemFilter}.
  */
 public BulgarianAnalyzer(
     Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
   super(matchVersion, stopwords);
   this.stemExclusionSet =
       CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
 }
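
A hedged usage sketch for this constructor (the Czech and Danish constructors below follow the same three-argument pattern); the protected term is purely illustrative:

// Lucene 4.x-era API; assumes org.apache.lucene.analysis.bg.BulgarianAnalyzer and
// org.apache.lucene.analysis.util.CharArraySet.
CharArraySet noStem = new CharArraySet(
    Version.LUCENE_CURRENT, Arrays.asList("софтуер"), false); // hypothetical term to keep unstemmed
Analyzer analyzer = new BulgarianAnalyzer(
    Version.LUCENE_CURRENT, BulgarianAnalyzer.getDefaultStopSet(), noStem);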
 /**
  * Builds an analyzer with the given stop words and a set of words to be excluded from the {@link
  * CzechStemFilter}.
  *
  * @param matchVersion Lucene version to match
  * @param stopwords a stopword set
  * @param stemExclusionTable a stemming exclusion set
  */
 public CzechAnalyzer(
     Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable) {
   super(matchVersion, stopwords);
   this.stemExclusionTable =
       CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable));
 }
 /**
  * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is provided
  * this analyzer will add a {@link KeywordMarkerFilter} before stemming.
  *
  * @param matchVersion lucene compatibility version
  * @param stopwords a stopword set
  * @param stemExclusionSet a set of terms not to be stemmed
  */
 public DanishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
   super(matchVersion, stopwords);
   this.stemExclusionSet =
       CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
 }
 /**
  * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is provided
  * this analyzer will add a {@link SetKeywordMarkerFilter} before stemming.
  *
  * @param stopwords a stopword set
  * @param stemExclusionSet a set of terms not to be stemmed
  */
 public SpanishAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
   super(stopwords);
   this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
 }
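
Unlike the Version-threaded constructors above, this SpanishAnalyzer snippet targets the newer API in which CharArraySet.copy takes only the set to copy. A minimal construction sketch under that assumption (the excluded term is hypothetical):

// Assumes the newer, Version-free analysis-common API: SpanishAnalyzer and a
// CharArraySet constructor without a Version argument.
CharArraySet noStem = new CharArraySet(Arrays.asList("lucene"), true); // hypothetical term to protect
Analyzer analyzer = new SpanishAnalyzer(SpanishAnalyzer.getDefaultStopSet(), noStem);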
/** {@link Analyzer} for Irish. */
public final class IrishAnalyzer extends StopwordAnalyzerBase {
  private final CharArraySet stemExclusionSet;

  /** File containing default Irish stopwords. */
  public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";

  private static final CharArraySet DEFAULT_ARTICLES =
      CharArraySet.unmodifiableSet(new CharArraySet(Arrays.asList("d", "m", "b"), true));

  /**
   * When StandardTokenizer splits t-athair into {t, athair}, we don't want to cause a position
   * increment, otherwise there will be problems with phrase queries versus tAthair (which would not
   * have a gap).
   */
  private static final CharArraySet HYPHENATIONS =
      CharArraySet.unmodifiableSet(new CharArraySet(Arrays.asList("h", "n", "t"), true));

  /**
   * Returns an unmodifiable instance of the default stop words set.
   *
   * @return default stop words set.
   */
  public static CharArraySet getDefaultStopSet() {
    return DefaultSetHolder.DEFAULT_STOP_SET;
  }

  /**
   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class accesses the
   * static final set the first time.
   */
  private static class DefaultSetHolder {
    static final CharArraySet DEFAULT_STOP_SET;

    static {
      try {
        DEFAULT_STOP_SET = loadStopwordSet(false, IrishAnalyzer.class, DEFAULT_STOPWORD_FILE, "#");
      } catch (IOException ex) {
        // default set should always be present as it is part of the
        // distribution (JAR)
        throw new RuntimeException("Unable to load default stopword set", ex);
      }
    }
  }

  /** Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. */
  public IrishAnalyzer() {
    this(DefaultSetHolder.DEFAULT_STOP_SET);
  }

  /**
   * Builds an analyzer with the given stop words.
   *
   * @param stopwords a stopword set
   */
  public IrishAnalyzer(CharArraySet stopwords) {
    this(stopwords, CharArraySet.EMPTY_SET);
  }

  /**
   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is provided
   * this analyzer will add a {@link SetKeywordMarkerFilter} before stemming.
   *
   * @param stopwords a stopword set
   * @param stemExclusionSet a set of terms not to be stemmed
   */
  public IrishAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
    super(stopwords);
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
  }

  /**
   * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} which tokenizes all
   * the text in the provided {@link Reader}.
   *
   * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from a
   *     {@link StandardTokenizer} filtered with {@link StandardFilter}, {@link ElisionFilter},
   *     {@link IrishLowerCaseFilter}, {@link StopFilter}, {@link SetKeywordMarkerFilter} if a stem
   *     exclusion set is provided, and {@link SnowballFilter}.
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    result = new StopFilter(result, HYPHENATIONS);
    result = new ElisionFilter(result, DEFAULT_ARTICLES);
    result = new IrishLowerCaseFilter(result);
    result = new StopFilter(result, stopwords);
    if (!stemExclusionSet.isEmpty()) result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    result = new SnowballFilter(result, new IrishStemmer());
    return new TokenStreamComponents(source, result);
  }
}
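
A minimal sketch of driving the analyzer above (hypothetical field name and sample text; assumes the standard Lucene analysis and token-attribute packages this class already depends on):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IrishAnalyzerDemo {
  public static void main(String[] args) throws IOException {
    try (Analyzer analyzer = new IrishAnalyzer()) {
      // StandardTokenizer splits "t-athair" into {t, athair}; the leading "t" is then
      // removed by the HYPHENATIONS StopFilter defined above, so phrase positions line
      // up with the unhyphenated spelling.
      TokenStream ts = analyzer.tokenStream("body", "an t-athair");
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(term.toString());
      }
      ts.end();
      ts.close();
    }
  }
}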