public static List<String> tokenizeString(String textFile) throws IOException { EnglishAnalyzer ena = new EnglishAnalyzer(Version.LUCENE_4_10_4); TokenStream tokenStream = ena.tokenStream(textFile.trim(), new StringReader(textFile.trim())); // StringBuilder sb = new StringBuilder(); OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) { int startOffset = offsetAttribute.startOffset(); int endOffset = offsetAttribute.endOffset(); String term = charTermAttribute.toString(); tokens.add(term); // sb.append(term + " "); } return tokens; }
/**
 * Creates the provider and builds the {@link EnglishAnalyzer} it holds.
 *
 * <p>The analyzer is constructed from {@code version}, the stop words returned by
 * {@code Analysis.parseStopWords} (falling back to {@link EnglishAnalyzer#getDefaultStopSet()})
 * and the stem exclusions returned by {@code Analysis.parseStemExclusion} (falling back to
 * {@code CharArraySet.EMPTY_SET}).
 *
 * <p>NOTE(review): {@code version} is not declared in this constructor; presumably it is a field
 * inherited from the superclass. Also confirm that an {@code Analysis.parseStopWords} overload
 * accepting a version as its fourth argument is available; the {@code Analysis} class shown
 * alongside this code only declares overloads ending in the default set or a boolean.
 *
 * @param index forwarded to the superclass constructor
 * @param indexSettings forwarded to the superclass constructor
 * @param env passed to {@code Analysis.parseStopWords}
 * @param name forwarded to the superclass constructor
 * @param settings forwarded to the superclass constructor and read for stop words and stem
 *     exclusions
 */
@Inject
public EnglishAnalyzerProvider(
    Index index,
    @IndexSettings Settings indexSettings,
    Environment env,
    @Assisted String name,
    @Assisted Settings settings) {
  super(index, indexSettings, name, settings);
  this.analyzer =
      new EnglishAnalyzer(
          version,
          // stop words: configured value, or the English defaults
          Analysis.parseStopWords(env, settings, EnglishAnalyzer.getDefaultStopSet(), version),
          // stem exclusions: configured value, or none
          Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
}
public class Analysis { public static Version parseAnalysisVersion( @IndexSettings Settings indexSettings, Settings settings, ESLogger logger) { // check for explicit version on the specific analyzer component String sVersion = settings.get("version"); if (sVersion != null) { return Lucene.parseVersion(sVersion, Lucene.ANALYZER_VERSION, logger); } // check for explicit version on the index itself as default for all analysis components sVersion = indexSettings.get("index.analysis.version"); if (sVersion != null) { return Lucene.parseVersion(sVersion, Lucene.ANALYZER_VERSION, logger); } // resolve the analysis version based on the version the index was created with return org.elasticsearch.Version.indexCreated(indexSettings).luceneVersion; } public static boolean isNoStopwords(Settings settings) { String value = settings.get("stopwords"); return value != null && "_none_".equals(value); } public static CharArraySet parseStemExclusion( Settings settings, CharArraySet defaultStemExclusion) { String value = settings.get("stem_exclusion"); if (value != null) { if ("_none_".equals(value)) { return CharArraySet.EMPTY_SET; } else { // LUCENE 4 UPGRADE: Should be settings.getAsBoolean("stem_exclusion_case", false)? return new CharArraySet(Strings.commaDelimitedListToSet(value), false); } } String[] stemExclusion = settings.getAsArray("stem_exclusion", null); if (stemExclusion != null) { // LUCENE 4 UPGRADE: Should be settings.getAsBoolean("stem_exclusion_case", false)? 
return new CharArraySet(Arrays.asList(stemExclusion), false); } else { return defaultStemExclusion; } } public static final ImmutableMap<String, Set<?>> namedStopWords = MapBuilder.<String, Set<?>>newMapBuilder() .put("_arabic_", ArabicAnalyzer.getDefaultStopSet()) .put("_armenian_", ArmenianAnalyzer.getDefaultStopSet()) .put("_basque_", BasqueAnalyzer.getDefaultStopSet()) .put("_brazilian_", BrazilianAnalyzer.getDefaultStopSet()) .put("_bulgarian_", BulgarianAnalyzer.getDefaultStopSet()) .put("_catalan_", CatalanAnalyzer.getDefaultStopSet()) .put("_czech_", CzechAnalyzer.getDefaultStopSet()) .put("_danish_", DanishAnalyzer.getDefaultStopSet()) .put("_dutch_", DutchAnalyzer.getDefaultStopSet()) .put("_english_", EnglishAnalyzer.getDefaultStopSet()) .put("_finnish_", FinnishAnalyzer.getDefaultStopSet()) .put("_french_", FrenchAnalyzer.getDefaultStopSet()) .put("_galician_", GalicianAnalyzer.getDefaultStopSet()) .put("_german_", GermanAnalyzer.getDefaultStopSet()) .put("_greek_", GreekAnalyzer.getDefaultStopSet()) .put("_hindi_", HindiAnalyzer.getDefaultStopSet()) .put("_hungarian_", HungarianAnalyzer.getDefaultStopSet()) .put("_indonesian_", IndonesianAnalyzer.getDefaultStopSet()) .put("_irish_", IrishAnalyzer.getDefaultStopSet()) .put("_italian_", ItalianAnalyzer.getDefaultStopSet()) .put("_latvian_", LatvianAnalyzer.getDefaultStopSet()) .put("_lithuanian_", LithuanianAnalyzer.getDefaultStopSet()) .put("_norwegian_", NorwegianAnalyzer.getDefaultStopSet()) .put("_persian_", PersianAnalyzer.getDefaultStopSet()) .put("_portuguese_", PortugueseAnalyzer.getDefaultStopSet()) .put("_romanian_", RomanianAnalyzer.getDefaultStopSet()) .put("_russian_", RussianAnalyzer.getDefaultStopSet()) .put("_sorani_", SoraniAnalyzer.getDefaultStopSet()) .put("_spanish_", SpanishAnalyzer.getDefaultStopSet()) .put("_swedish_", SwedishAnalyzer.getDefaultStopSet()) .put("_thai_", ThaiAnalyzer.getDefaultStopSet()) .put("_turkish_", TurkishAnalyzer.getDefaultStopSet()) .immutableMap(); public 
static CharArraySet parseWords( Environment env, Settings settings, String name, CharArraySet defaultWords, Map<String, Set<?>> namedWords, boolean ignoreCase) { String value = settings.get(name); if (value != null) { if ("_none_".equals(value)) { return CharArraySet.EMPTY_SET; } else { return resolveNamedWords(Strings.commaDelimitedListToSet(value), namedWords, ignoreCase); } } List<String> pathLoadedWords = getWordList(env, settings, name); if (pathLoadedWords != null) { return resolveNamedWords(pathLoadedWords, namedWords, ignoreCase); } return defaultWords; } public static CharArraySet parseCommonWords( Environment env, Settings settings, CharArraySet defaultCommonWords, boolean ignoreCase) { return parseWords( env, settings, "common_words", defaultCommonWords, namedStopWords, ignoreCase); } public static CharArraySet parseArticles(Environment env, Settings settings) { return parseWords( env, settings, "articles", null, null, settings.getAsBoolean("articles_case", false)); } public static CharArraySet parseStopWords( Environment env, Settings settings, CharArraySet defaultStopWords) { return parseStopWords( env, settings, defaultStopWords, settings.getAsBoolean("stopwords_case", false)); } public static CharArraySet parseStopWords( Environment env, Settings settings, CharArraySet defaultStopWords, boolean ignoreCase) { return parseWords(env, settings, "stopwords", defaultStopWords, namedStopWords, ignoreCase); } private static CharArraySet resolveNamedWords( Collection<String> words, Map<String, Set<?>> namedWords, boolean ignoreCase) { if (namedWords == null) { return new CharArraySet(words, ignoreCase); } CharArraySet setWords = new CharArraySet(words.size(), ignoreCase); for (String word : words) { if (namedWords.containsKey(word)) { setWords.addAll(namedWords.get(word)); } else { setWords.add(word); } } return setWords; } public static CharArraySet getWordSet(Environment env, Settings settings, String settingsPrefix) { List<String> wordList = 
getWordList(env, settings, settingsPrefix); if (wordList == null) { return null; } return new CharArraySet(wordList, settings.getAsBoolean(settingsPrefix + "_case", false)); } /** * Fetches a list of words from the specified settings file. The list should either be available * at the key specified by settingsPrefix or in a file specified by settingsPrefix + _path. * * @throws IllegalArgumentException If the word list cannot be found at either key. */ public static List<String> getWordList(Environment env, Settings settings, String settingPrefix) { String wordListPath = settings.get(settingPrefix + "_path", null); if (wordListPath == null) { String[] explicitWordList = settings.getAsArray(settingPrefix, null); if (explicitWordList == null) { return null; } else { return Arrays.asList(explicitWordList); } } final Path wordListFile = env.configFile().resolve(wordListPath); try (BufferedReader reader = FileSystemUtils.newBufferedReader(wordListFile.toUri().toURL(), StandardCharsets.UTF_8)) { return loadWordList(reader, "#"); } catch (IOException ioe) { String message = String.format( Locale.ROOT, "IOException while reading %s_path: %s", settingPrefix, ioe.getMessage()); throw new IllegalArgumentException(message); } } public static List<String> loadWordList(Reader reader, String comment) throws IOException { final List<String> result = new ArrayList<>(); BufferedReader br = null; try { if (reader instanceof BufferedReader) { br = (BufferedReader) reader; } else { br = new BufferedReader(reader); } String word = null; while ((word = br.readLine()) != null) { if (!Strings.hasText(word)) { continue; } if (!word.startsWith(comment)) { result.add(word.trim()); } } } finally { if (br != null) br.close(); } return result; } /** * @return null If no settings set for "settingsPrefix" then return <code>null</code>. * @throws IllegalArgumentException If the Reader can not be instantiated. 
*/ public static Reader getReaderFromFile(Environment env, Settings settings, String settingPrefix) { String filePath = settings.get(settingPrefix, null); if (filePath == null) { return null; } final Path path = env.configFile().resolve(filePath); try { return FileSystemUtils.newBufferedReader(path.toUri().toURL(), StandardCharsets.UTF_8); } catch (IOException ioe) { String message = String.format( Locale.ROOT, "IOException while reading %s_path: %s", settingPrefix, ioe.getMessage()); throw new IllegalArgumentException(message); } } /** * Check whether the provided token stream is able to provide character terms. * * <p>Although most analyzers generate character terms (CharTermAttribute), some token only * contain binary terms (BinaryTermAttribute, CharTermAttribute being a special type of * BinaryTermAttribute), such as {@link NumericTokenStream} and unsuitable for highlighting and * more-like-this queries which expect character terms. */ public static boolean isCharacterTokenStream(TokenStream tokenStream) { try { tokenStream.addAttribute(CharTermAttribute.class); return true; } catch (IllegalArgumentException e) { return false; } } /** * Check whether {@link TokenStream}s generated with <code>analyzer</code> provide with character * terms. * * @see #isCharacterTokenStream(TokenStream) */ public static boolean generatesCharacterTokenStream(Analyzer analyzer, String fieldName) throws IOException { try (TokenStream ts = analyzer.tokenStream(fieldName, "")) { return isCharacterTokenStream(ts); } } }