@Test public void testKatakanaStemFilter() throws IOException { AnalysisService analysisService = createAnalysisService(); TokenFilterFactory tokenFilter = analysisService.tokenFilter("kuromoji_stemmer"); assertThat(tokenFilter, instanceOf(KuromojiKatakanaStemmerFactory.class)); String source = "明後日パーティーに行く予定がある。図書館で資料をコピーしました。"; Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH); tokenizer.setReader(new StringReader(source)); // パーティー should be stemmed by default // (min len) コピー should not be stemmed String[] expected_tokens_katakana = new String[] { "明後日", "パーティ", "に", "行く", "予定", "が", "ある", "図書館", "で", "資料", "を", "コピー", "し", "まし", "た" }; assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens_katakana); tokenFilter = analysisService.tokenFilter("kuromoji_ks"); assertThat(tokenFilter, instanceOf(KuromojiKatakanaStemmerFactory.class)); tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH); tokenizer.setReader(new StringReader(source)); // パーティー should not be stemmed since min len == 6 // コピー should not be stemmed expected_tokens_katakana = new String[] { "明後日", "パーティー", "に", "行く", "予定", "が", "ある", "図書館", "で", "資料", "を", "コピー", "し", "まし", "た" }; assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens_katakana); }
@Test public void testBaseFormFilterFactory() throws IOException { AnalysisService analysisService = createAnalysisService(); TokenFilterFactory tokenFilter = analysisService.tokenFilter("kuromoji_pos"); assertThat(tokenFilter, instanceOf(KuromojiPartOfSpeechFilterFactory.class)); String source = "私は制限スピードを超える。"; String[] expected = new String[] {"私", "は", "制限", "スピード", "を"}; Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH); tokenizer.setReader(new StringReader(source)); assertSimpleTSOutput(tokenFilter.create(tokenizer), expected); }
@Test public void testJapaneseStopFilterFactory() throws IOException { AnalysisService analysisService = createAnalysisService(); TokenFilterFactory tokenFilter = analysisService.tokenFilter("ja_stop"); assertThat(tokenFilter, instanceOf(JapaneseStopTokenFilterFactory.class)); String source = "私は制限スピードを超える。"; String[] expected = new String[] {"私", "制限", "超える"}; Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH); tokenizer.setReader(new StringReader(source)); assertSimpleTSOutput(tokenFilter.create(tokenizer), expected); }
@Test public void testReadingFormFilterFactory() throws IOException { AnalysisService analysisService = createAnalysisService(); TokenFilterFactory tokenFilter = analysisService.tokenFilter("kuromoji_rf"); assertThat(tokenFilter, instanceOf(KuromojiReadingFormFilterFactory.class)); String source = "今夜はロバート先生と話した"; String[] expected_tokens_romaji = new String[] {"kon'ya", "ha", "robato", "sensei", "to", "hanashi", "ta"}; Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH); tokenizer.setReader(new StringReader(source)); assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens_romaji); tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH); tokenizer.setReader(new StringReader(source)); String[] expected_tokens_katakana = new String[] {"コンヤ", "ハ", "ロバート", "センセイ", "ト", "ハナシ", "タ"}; tokenFilter = analysisService.tokenFilter("kuromoji_readingform"); assertThat(tokenFilter, instanceOf(KuromojiReadingFormFilterFactory.class)); assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens_katakana); }