/** * same as {@link #getWordSet(ResourceLoader, String, boolean)}, except the input is in snowball * format. */ protected final CharArraySet getSnowballWordSet( ResourceLoader loader, String wordFiles, boolean ignoreCase) throws IOException { List<String> files = splitFileNames(wordFiles); CharArraySet words = null; if (files.size() > 0) { // default stopwords list has 35 or so words, but maybe don't make it that // big to start words = new CharArraySet(files.size() * 10, ignoreCase); for (String file : files) { InputStream stream = null; Reader reader = null; try { stream = loader.openResource(file.trim()); CharsetDecoder decoder = StandardCharsets.UTF_8 .newDecoder() .onMalformedInput(CodingErrorAction.REPORT) .onUnmappableCharacter(CodingErrorAction.REPORT); reader = new InputStreamReader(stream, decoder); WordlistLoader.getSnowballWordSet(reader, words); } finally { IOUtils.closeWhileHandlingException(reader, stream); } } } return words; }
/**
 * Builds an analyzer with the stop words from the given file. The file is read through a UTF-8
 * decoding reader and the resulting word set is handed to the constructor that takes the
 * already-loaded stop words.
 *
 * @see WordlistLoader#getWordSet(Reader, Version)
 * @param matchVersion Lucene version to match; see <a href="#version">above</a>. Also passed to
 *     {@code WordlistLoader.getWordSet}.
 * @param stopwords File to read stop words from
 * @throws IOException if obtaining the decoding reader or loading the word set fails
 */
public PhaidraAnalyzer(Version matchVersion, File stopwords) throws IOException {
  this(
      matchVersion,
      WordlistLoader.getWordSet(
          IOUtils.getDecodingReader(stopwords, IOUtils.CHARSET_UTF_8), matchVersion));
}
/**
 * Builds an analyzer with the stop words from the given reader. The word set is loaded with
 * {@code WordlistLoader.getWordSet} and handed to the constructor that takes the already-loaded
 * stop words.
 *
 * <p>NOTE(review): whether the supplied reader is closed by {@code WordlistLoader.getWordSet} is
 * not visible from this constructor; confirm before relying on either behavior.
 *
 * @see WordlistLoader#getWordSet(Reader, Version)
 * @param matchVersion Lucene version to match; see <a href="#version">above</a>. Also passed to
 *     {@code WordlistLoader.getWordSet}.
 * @param stopwords Reader to read stop words from
 * @throws IOException if loading the word set from the reader fails
 */
public PhaidraAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
  this(matchVersion, WordlistLoader.getWordSet(stopwords, matchVersion));
}
/**
 * Reads a resource line by line, treating its content as UTF-8.
 *
 * @param loader used to open the resource
 * @param resource name of the resource to open
 * @return the lines produced by {@link WordlistLoader#getLines} for the opened stream
 * @throws IOException if the resource cannot be opened or read
 */
protected final List<String> getLines(ResourceLoader loader, String resource)
    throws IOException {
  // The opened stream is handed straight to WordlistLoader; this method does not close it itself.
  InputStream resourceStream = loader.openResource(resource);
  return WordlistLoader.getLines(resourceStream, StandardCharsets.UTF_8);
}
/**
 * Builds an analyzer with the stop words from the given file. The word set is loaded with
 * {@code WordlistLoader.getWordSet(File)} and handed to the constructor that takes the
 * already-loaded stop words.
 *
 * <p>NOTE(review): no charset is specified here, so the file's encoding is whatever {@code
 * WordlistLoader.getWordSet(File)} chooses; confirm it is not the platform default.
 *
 * @see WordlistLoader#getWordSet(File)
 * @param matchVersion See <a href="#version">above</a>. Forwarded to the delegated constructor
 *     only; it is not passed to the word list loader.
 * @param stopwordsFile File to load stop words from
 * @throws IOException if loading the word set from the file fails
 */
public StopAnalyzer(Version matchVersion, File stopwordsFile) throws IOException {
  this(matchVersion, WordlistLoader.getWordSet(stopwordsFile));
}