public void testDefaultsKuromojiAnalysis() throws IOException {
    AnalysisService analysisService = createAnalysisService();

    TokenizerFactory tokenizerFactory = analysisService.tokenizer("kuromoji_tokenizer");
    assertThat(tokenizerFactory, instanceOf(KuromojiTokenizerFactory.class));

    TokenFilterFactory filterFactory = analysisService.tokenFilter("kuromoji_part_of_speech");
    assertThat(filterFactory, instanceOf(KuromojiPartOfSpeechFilterFactory.class));

    filterFactory = analysisService.tokenFilter("kuromoji_readingform");
    assertThat(filterFactory, instanceOf(KuromojiReadingFormFilterFactory.class));

    filterFactory = analysisService.tokenFilter("kuromoji_baseform");
    assertThat(filterFactory, instanceOf(KuromojiBaseFormFilterFactory.class));

    filterFactory = analysisService.tokenFilter("kuromoji_stemmer");
    assertThat(filterFactory, instanceOf(KuromojiKatakanaStemmerFactory.class));

    filterFactory = analysisService.tokenFilter("ja_stop");
    assertThat(filterFactory, instanceOf(JapaneseStopTokenFilterFactory.class));

    NamedAnalyzer analyzer = analysisService.analyzer("kuromoji");
    assertThat(analyzer.analyzer(), instanceOf(JapaneseAnalyzer.class));

    analyzer = analysisService.analyzer("my_analyzer");
    assertThat(analyzer.analyzer(), instanceOf(CustomAnalyzer.class));
    assertThat(analyzer.analyzer().tokenStream(null, new StringReader("")),
        instanceOf(JapaneseTokenizer.class));

    CharFilterFactory charFilterFactory = analysisService.charFilter("kuromoji_iteration_mark");
    assertThat(charFilterFactory, instanceOf(KuromojiIterationMarkCharFilterFactory.class));
}
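// A minimal sketch (not part of the original suite) showing the standard Lucene consumption
// loop for the JapaneseAnalyzer wired in above; the method name, field name, and text are
// arbitrary. Assumes org.apache.lucene.analysis.ja.JapaneseAnalyzer and
// org.apache.lucene.analysis.tokenattributes.CharTermAttribute are importable.
public void japaneseAnalyzerSketch() throws IOException {
    try (Analyzer analyzer = new JapaneseAnalyzer();
         TokenStream stream = analyzer.tokenStream("field", "図書館で資料をコピーした")) {
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset(); // mandatory before the first incrementToken()
        while (stream.incrementToken()) {
            System.out.println(term.toString());
        }
        stream.end(); // finalizes offsets
    }
}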
@Test
public void testDefault() throws IOException {
    Settings settings = ImmutableSettings.settingsBuilder()
        .put("index.analysis.filter.limit_default.type", "limit")
        .build();
    AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
    {
        TokenFilterFactory tokenFilter = analysisService.tokenFilter("limit_default");
        String source = "the quick brown fox";
        String[] expected = new String[] {"the"};
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader(source));
        assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
    }
    {
        TokenFilterFactory tokenFilter = analysisService.tokenFilter("limit");
        String source = "the quick brown fox";
        String[] expected = new String[] {"the"};
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader(source));
        assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
    }
}
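// A minimal sketch (not part of the original suite) of the Lucene filter behind the "limit"
// type, assuming org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter. The test
// above implies the unconfigured factory defaults to one token; here the count is passed
// explicitly to reproduce that behavior.
public void limitDefaultSketch() throws IOException {
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("the quick brown fox"));
    TokenStream limited = new LimitTokenCountFilter(tokenizer, 1);
    assertTokenStreamContents(limited, new String[] {"the"});
}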
public void testKatakanaStemFilter() throws IOException {
    AnalysisService analysisService = createAnalysisService();
    TokenFilterFactory tokenFilter = analysisService.tokenFilter("kuromoji_stemmer");
    assertThat(tokenFilter, instanceOf(KuromojiKatakanaStemmerFactory.class));
    String source = "明後日パーティーに行く予定がある。図書館で資料をコピーしました。";
    Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tokenizer.setReader(new StringReader(source));

    // パーティー should be stemmed by default
    // コピー should not be stemmed (shorter than the minimum length)
    String[] expected_tokens_katakana = new String[] {
        "明後日", "パーティ", "に", "行く", "予定", "が", "ある",
        "図書館", "で", "資料", "を", "コピー", "し", "まし", "た"
    };
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens_katakana);

    tokenFilter = analysisService.tokenFilter("kuromoji_ks");
    assertThat(tokenFilter, instanceOf(KuromojiKatakanaStemmerFactory.class));
    tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tokenizer.setReader(new StringReader(source));

    // パーティー should not be stemmed since min len == 6
    // コピー should not be stemmed
    expected_tokens_katakana = new String[] {
        "明後日", "パーティー", "に", "行く", "予定", "が", "ある",
        "図書館", "で", "資料", "を", "コピー", "し", "まし", "た"
    };
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens_katakana);
}
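// A minimal sketch (not part of the original suite) of the Lucene filter behind
// kuromoji_stemmer, assuming org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilter.
// Its second constructor argument is the minimum katakana length to stem (4 by default,
// which is why three-character コピー is untouched); with 6, as "kuromoji_ks" is presumably
// configured, the five-character パーティー also survives unstemmed.
public void katakanaStemSketch() throws IOException {
    Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tokenizer.setReader(new StringReader("パーティー"));
    TokenStream stemmed = new JapaneseKatakanaStemFilter(tokenizer, 6);
    assertSimpleTSOutput(stemmed, new String[] {"パーティー"});
}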
@Test
public void testSettings() throws IOException {
    {
        // max_token_count = 3, consuming the rest of the stream: only three tokens emitted
        Settings settings = ImmutableSettings.settingsBuilder()
            .put("index.analysis.filter.limit_1.type", "limit")
            .put("index.analysis.filter.limit_1.max_token_count", 3)
            .put("index.analysis.filter.limit_1.consume_all_tokens", true)
            .build();
        AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
        TokenFilterFactory tokenFilter = analysisService.tokenFilter("limit_1");
        String source = "the quick brown fox";
        String[] expected = new String[] {"the", "quick", "brown"};
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader(source));
        assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
    }
    {
        // consume_all_tokens = false yields the same three tokens
        Settings settings = ImmutableSettings.settingsBuilder()
            .put("index.analysis.filter.limit_1.type", "limit")
            .put("index.analysis.filter.limit_1.max_token_count", 3)
            .put("index.analysis.filter.limit_1.consume_all_tokens", false)
            .build();
        AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
        TokenFilterFactory tokenFilter = analysisService.tokenFilter("limit_1");
        String source = "the quick brown fox";
        String[] expected = new String[] {"the", "quick", "brown"};
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader(source));
        assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
    }
    {
        // a max_token_count larger than the stream passes everything through
        Settings settings = ImmutableSettings.settingsBuilder()
            .put("index.analysis.filter.limit_1.type", "limit")
            .put("index.analysis.filter.limit_1.max_token_count", 17)
            .put("index.analysis.filter.limit_1.consume_all_tokens", true)
            .build();
        AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
        TokenFilterFactory tokenFilter = analysisService.tokenFilter("limit_1");
        String source = "the quick brown fox";
        String[] expected = new String[] {"the", "quick", "brown", "fox"};
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader(source));
        assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
    }
}
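// A minimal sketch (not part of the original suite) of the configured case above, assuming
// org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter. The consume_all_tokens
// flag only controls whether the rest of the input is still read through (useful when a
// downstream consumer needs the true end offset); the emitted tokens are the same either way.
public void limitSettingsSketch() throws IOException {
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("the quick brown fox"));
    TokenStream limited = new LimitTokenCountFilter(tokenizer, 3, true);
    assertTokenStreamContents(limited, new String[] {"the", "quick", "brown"});
}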
public void testCamelCaseOverride() throws IOException {
    Settings settings = Settings.builder()
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .build();
    Settings indexSettings = settingsBuilder()
        .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
        .put("index.analysis.filter.wordDelimiter.type", "word_delimiter")
        .put("index.analysis.filter.wordDelimiter.split_on_numerics", false)
        .put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
        .putArray("index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter")
        .put("index.analysis.analyzer.custom_analyzer_1.tokenizer", "whitespace")
        .putArray("index.analysis.analyzer.custom_analyzer_1.filter", "lowercase", "word_delimiter")
        .build();
    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
    AnalysisService analysisService = new AnalysisRegistry(null, new Environment(settings)).build(idxSettings);

    // the explicit "wordDelimiter" filter definition overrides the built-in camelCase alias
    TokenFilterFactory wordDelimiter = analysisService.tokenFilter("word_delimiter");
    TokenFilterFactory override = analysisService.tokenFilter("wordDelimiter");
    assertNotEquals(wordDelimiter.name(), override.name());
    assertNotSame(analysisService.tokenFilter("wordDelimiter"), analysisService.tokenFilter("word_delimiter"));
    assertSame(analysisService.tokenFilter("porterStem"), analysisService.tokenFilter("porter_stem"));

    // unconfigured: camelCase and snake_case names resolve to the same pre-built filter
    IndexSettings idxSettings1 = IndexSettingsModule.newIndexSettings("index",
        settingsBuilder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build());
    AnalysisService analysisService1 = new AnalysisRegistry(null, new Environment(settings)).build(idxSettings1);
    assertSame(analysisService1.tokenFilter("wordDelimiter"), analysisService1.tokenFilter("word_delimiter"));
    assertSame(analysisService1.tokenFilter("porterStem"), analysisService1.tokenFilter("porter_stem"));
}
public void testPartOfSpeechFilterFactory() throws IOException {
    AnalysisService analysisService = createAnalysisService();
    TokenFilterFactory tokenFilter = analysisService.tokenFilter("kuromoji_pos");
    assertThat(tokenFilter, instanceOf(KuromojiPartOfSpeechFilterFactory.class));
    String source = "私は制限スピードを超える。";
    // the configured stoptags drop the verb 超える
    String[] expected = new String[] {"私", "は", "制限", "スピード", "を"};
    Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tokenizer.setReader(new StringReader(source));
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
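// A minimal sketch (not part of the original suite) of the Lucene filter behind
// kuromoji_part_of_speech, assuming org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilter.
// The stop tag 動詞-自立 (independent verb) is a hypothetical choice that reproduces the
// expected output above by dropping 超える.
public void partOfSpeechStopSketch() throws IOException {
    Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tokenizer.setReader(new StringReader("私は制限スピードを超える。"));
    TokenStream filtered = new JapanesePartOfSpeechStopFilter(tokenizer, Collections.singleton("動詞-自立"));
    assertSimpleTSOutput(filtered, new String[] {"私", "は", "制限", "スピード", "を"});
}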
public void testJapaneseStopFilterFactory() throws IOException {
    AnalysisService analysisService = createAnalysisService();
    TokenFilterFactory tokenFilter = analysisService.tokenFilter("ja_stop");
    assertThat(tokenFilter, instanceOf(JapaneseStopTokenFilterFactory.class));
    String source = "私は制限スピードを超える。";
    String[] expected = new String[] {"私", "制限", "超える"};
    Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tokenizer.setReader(new StringReader(source));
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
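// A minimal sketch (not part of the original suite) of what presumably backs "ja_stop":
// Lucene's StopFilter over the default Japanese stop set (which covers particles such as
// は and を), here extended with スピード on the assumption that the test resource adds it,
// since the expected output above drops that noun as well.
public void japaneseStopSketch() throws IOException {
    CharArraySet stopSet = new CharArraySet(JapaneseAnalyzer.getDefaultStopSet(), false);
    stopSet.add("スピード"); // hypothetical custom entry, inferred from the expected tokens
    Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tokenizer.setReader(new StringReader("私は制限スピードを超える。"));
    TokenStream filtered = new StopFilter(tokenizer, stopSet);
    assertSimpleTSOutput(filtered, new String[] {"私", "制限", "超える"});
}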
@Test
public void testHanOnly() throws IOException {
    AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromClassPath(RESOURCE);
    TokenFilterFactory tokenFilter = analysisService.tokenFilter("cjk_han_only");
    String source = "多くの学生が試験に落ちた。";
    String[] expected = new String[] {"多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た"};
    Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
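// A minimal sketch (not part of the original suite) of the Lucene filter presumably behind
// "cjk_han_only", assuming org.apache.lucene.analysis.cjk.CJKBigramFilter and a current
// no-arg StandardTokenizer. Restricting the flags to HAN bigrams only adjacent Han (kanji)
// characters, which is why 学生 and 試験 come out as bigrams while the hiragana and the
// isolated kanji 多 and 落 pass through as single characters.
public void cjkHanOnlySketch() throws IOException {
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader("多くの学生が試験に落ちた。"));
    TokenStream bigrams = new CJKBigramFilter(tokenizer, CJKBigramFilter.HAN);
    assertTokenStreamContents(bigrams,
        new String[] {"多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た"});
}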
public void testReadingFormFilterFactory() throws IOException {
    AnalysisService analysisService = createAnalysisService();
    TokenFilterFactory tokenFilter = analysisService.tokenFilter("kuromoji_rf");
    assertThat(tokenFilter, instanceOf(KuromojiReadingFormFilterFactory.class));
    String source = "今夜はロバート先生と話した";
    String[] expected_tokens_romaji = new String[] {"kon'ya", "ha", "robato", "sensei", "to", "hanashi", "ta"};
    Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tokenizer.setReader(new StringReader(source));
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens_romaji);

    tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tokenizer.setReader(new StringReader(source));
    String[] expected_tokens_katakana = new String[] {"コンヤ", "ハ", "ロバート", "センセイ", "ト", "ハナシ", "タ"};
    tokenFilter = analysisService.tokenFilter("kuromoji_readingform");
    assertThat(tokenFilter, instanceOf(KuromojiReadingFormFilterFactory.class));
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens_katakana);
}
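// A minimal sketch (not part of the original suite) of the Lucene filter behind both
// factories above, assuming org.apache.lucene.analysis.ja.JapaneseReadingFormFilter. Its
// boolean argument selects romaji output; the test implies "kuromoji_rf" is configured with
// use_romaji = true while "kuromoji_readingform" keeps the katakana default.
public void readingFormSketch() throws IOException {
    Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tokenizer.setReader(new StringReader("寿司"));
    TokenStream romaji = new JapaneseReadingFormFilter(tokenizer, true); // true -> romaji
    assertSimpleTSOutput(romaji, new String[] {"sushi"});
}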
public void testCaseInsensitiveMapping() throws IOException {
    AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysisService.tokenFilter("my_keep_filter");
    assertThat(tokenFilter, instanceOf(KeepWordFilterFactory.class));
    String source = "hello small world";
    String[] expected = new String[] {"hello", "world"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[] {1, 2});
}
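// A minimal sketch (not part of the original suite) of the Lucene filter behind "keep",
// assuming org.apache.lucene.analysis.miscellaneous.KeepWordFilter. The CharArraySet is a
// hypothetical stand-in for the word list in the test resource; true enables the
// case-insensitive matching the test name refers to. The position-increment assertion
// shows that the dropped token still advances positions.
public void keepWordSketch() throws IOException {
    CharArraySet keepSet = new CharArraySet(Arrays.asList("hello", "world"), true);
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("hello small world"));
    TokenStream kept = new KeepWordFilter(tokenizer, keepSet);
    assertTokenStreamContents(kept, new String[] {"hello", "world"}, new int[] {1, 2});
}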
public void testFillerToken() throws IOException {
    AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysisService.tokenFilter("shingle_filler");
    String source = "simon the sorcerer";
    String[] expected = new String[] {"simon FILLER", "simon FILLER sorcerer", "FILLER sorcerer"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    TokenStream stream = new StopFilter(tokenizer, StopFilter.makeStopSet("the"));
    assertTokenStreamContents(tokenFilter.create(stream), expected);
}
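// A minimal sketch (not part of the original suite) of what presumably backs
// "shingle_filler", assuming ShingleFilter.setFillerToken: the position gap left by the
// removed stopword is filled with the configured token inside each shingle. The shingle
// sizes and disabled unigrams below are hypothetical choices that reproduce the output.
public void fillerTokenSketch() throws IOException {
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("simon the sorcerer"));
    TokenStream stopped = new StopFilter(tokenizer, StopFilter.makeStopSet("the"));
    ShingleFilter shingles = new ShingleFilter(stopped, 2, 3);
    shingles.setOutputUnigrams(false);
    shingles.setFillerToken("FILLER");
    assertTokenStreamContents(shingles,
        new String[] {"simon FILLER", "simon FILLER sorcerer", "FILLER sorcerer"});
}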
public void testInverseMappingNoShingles() throws IOException {
    AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysisService.tokenFilter("shingle_inverse");
    assertThat(tokenFilter, instanceOf(ShingleTokenFilterFactory.class));
    String source = "the quick";
    String[] expected = new String[] {"the", "quick"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
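// A minimal sketch (not part of the original suite) of how "shingle_inverse" presumably
// achieves this pass-through, assuming ShingleFilter.setOutputUnigramsIfNoShingles: with a
// hypothetical minimum shingle size of 3, a two-token input forms no shingles, so the
// unigrams are emitted instead of nothing.
public void noShinglesSketch() throws IOException {
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("the quick"));
    ShingleFilter shingles = new ShingleFilter(tokenizer, 3, 3);
    shingles.setOutputUnigrams(false);
    shingles.setOutputUnigramsIfNoShingles(true);
    assertTokenStreamContents(shingles, new String[] {"the", "quick"});
}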
public void testDefault() throws IOException {
    AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysisService.tokenFilter("shingle");
    String source = "the quick brown fox";
    String[] expected = new String[] {"the", "the quick", "quick", "quick brown", "brown", "brown fox", "fox"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
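// A minimal sketch (not part of the original suite) of the Lucene filter behind the
// "shingle" type, assuming org.apache.lucene.analysis.shingle.ShingleFilter. With the
// defaults (shingle size 2, unigrams kept) it reproduces the interleaved unigram/bigram
// output asserted above.
public void shingleDefaultSketch() throws IOException {
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("the quick brown fox"));
    ShingleFilter shingles = new ShingleFilter(tokenizer, 2, 2);
    shingles.setOutputUnigrams(true); // the default, shown for clarity
    assertTokenStreamContents(shingles,
        new String[] {"the", "the quick", "quick", "quick brown", "brown", "brown fox", "fox"});
}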
public void testDefault() throws IOException {
    AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(
        settingsBuilder()
            .put("path.home", createTempDir().toString())
            .put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
            .build());
    TokenFilterFactory tokenFilter = analysisService.tokenFilter("my_ascii_folding");
    String source = "Ansprüche";
    String[] expected = new String[] {"Anspruche"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
public void testPreserveOriginal() throws IOException {
    AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(
        settingsBuilder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
            .put("index.analysis.filter.my_ascii_folding.preserve_original", true)
            .build());
    TokenFilterFactory tokenFilter = analysisService.tokenFilter("my_ascii_folding");
    String source = "Ansprüche";
    String[] expected = new String[] {"Anspruche", "Ansprüche"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
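// A minimal sketch (not part of the original suite) of the Lucene filter behind
// "asciifolding" in the two tests above, assuming
// org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter. Its boolean argument maps
// to preserve_original: when true, the folded token is emitted followed by the original at
// the same position.
public void asciiFoldingSketch() throws IOException {
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("Ansprüche"));
    TokenStream folded = new ASCIIFoldingFilter(tokenizer, true);
    assertTokenStreamContents(folded, new String[] {"Anspruche", "Ansprüche"});
}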
public void testLoadWithoutSettings() throws IOException {
    AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysisService.tokenFilter("keep");
    Assert.assertNull(tokenFilter);
}