public void testDefaultsKuromojiAnalysis() throws IOException {
    AnalysisService analysisService = createAnalysisService();

    TokenizerFactory tokenizerFactory = analysisService.tokenizer("kuromoji_tokenizer");
    assertThat(tokenizerFactory, instanceOf(KuromojiTokenizerFactory.class));

    TokenFilterFactory filterFactory = analysisService.tokenFilter("kuromoji_part_of_speech");
    assertThat(filterFactory, instanceOf(KuromojiPartOfSpeechFilterFactory.class));

    filterFactory = analysisService.tokenFilter("kuromoji_readingform");
    assertThat(filterFactory, instanceOf(KuromojiReadingFormFilterFactory.class));

    filterFactory = analysisService.tokenFilter("kuromoji_baseform");
    assertThat(filterFactory, instanceOf(KuromojiBaseFormFilterFactory.class));

    filterFactory = analysisService.tokenFilter("kuromoji_stemmer");
    assertThat(filterFactory, instanceOf(KuromojiKatakanaStemmerFactory.class));

    filterFactory = analysisService.tokenFilter("ja_stop");
    assertThat(filterFactory, instanceOf(JapaneseStopTokenFilterFactory.class));

    NamedAnalyzer analyzer = analysisService.analyzer("kuromoji");
    assertThat(analyzer.analyzer(), instanceOf(JapaneseAnalyzer.class));

    analyzer = analysisService.analyzer("my_analyzer");
    assertThat(analyzer.analyzer(), instanceOf(CustomAnalyzer.class));
    assertThat(
        analyzer.analyzer().tokenStream(null, new StringReader("")),
        instanceOf(JapaneseTokenizer.class));

    CharFilterFactory charFilterFactory = analysisService.charFilter("kuromoji_iteration_mark");
    assertThat(charFilterFactory, instanceOf(KuromojiIterationMarkCharFilterFactory.class));
}
 @Test
 public void testDefault() throws IOException {
   Settings settings =
       ImmutableSettings.settingsBuilder()
           .put("index.analysis.filter.limit_default.type", "limit")
           .build();
   AnalysisService analysisService =
       AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
   {
     TokenFilterFactory tokenFilter = analysisService.tokenFilter("limit_default");
     String source = "the quick brown fox";
     String[] expected = new String[] {"the"};
     Tokenizer tokenizer = new WhitespaceTokenizer();
     tokenizer.setReader(new StringReader(source));
     assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
   }
   {
     TokenFilterFactory tokenFilter = analysisService.tokenFilter("limit");
     String source = "the quick brown fox";
     String[] expected = new String[] {"the"};
     Tokenizer tokenizer = new WhitespaceTokenizer();
     tokenizer.setReader(new StringReader(source));
     assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
   }
 }
  public void testKatakanaStemFilter() throws IOException {
    AnalysisService analysisService = createAnalysisService();
    TokenFilterFactory tokenFilter = analysisService.tokenFilter("kuromoji_stemmer");
    assertThat(tokenFilter, instanceOf(KuromojiKatakanaStemmerFactory.class));
    String source = "明後日パーティーに行く予定がある。図書館で資料をコピーしました。";

    Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tokenizer.setReader(new StringReader(source));

    // パーティー meets the default minimum length (4) and is stemmed to パーティ;
    // コピー (3 chars) is below that minimum and is left unstemmed
    String[] expected_tokens_katakana =
        new String[] {
          "明後日", "パーティ", "に", "行く", "予定", "が", "ある", "図書館", "で", "資料", "を", "コピー", "し", "まし", "た"
        };
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens_katakana);

    tokenFilter = analysisService.tokenFilter("kuromoji_ks");
    assertThat(tokenFilter, instanceOf(KuromojiKatakanaStemmerFactory.class));
    tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tokenizer.setReader(new StringReader(source));

    // kuromoji_ks raises the minimum length to 6, so neither パーティー nor コピー
    // is stemmed
    expected_tokens_katakana =
        new String[] {
          "明後日", "パーティー", "に", "行く", "予定", "が", "ある", "図書館", "で", "資料", "を", "コピー", "し", "まし", "た"
        };
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens_katakana);
  }
  @Test
  public void testSettings() throws IOException {
    {
      Settings settings =
          ImmutableSettings.settingsBuilder()
              .put("index.analysis.filter.limit_1.type", "limit")
              .put("index.analysis.filter.limit_1.max_token_count", 3)
              .put("index.analysis.filter.limit_1.consume_all_tokens", true)
              .build();
      AnalysisService analysisService =
          AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
      TokenFilterFactory tokenFilter = analysisService.tokenFilter("limit_1");
      String source = "the quick brown fox";
      String[] expected = new String[] {"the", "quick", "brown"};
      Tokenizer tokenizer = new WhitespaceTokenizer();
      tokenizer.setReader(new StringReader(source));
      assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
    }
    {
      Settings settings =
          ImmutableSettings.settingsBuilder()
              .put("index.analysis.filter.limit_1.type", "limit")
              .put("index.analysis.filter.limit_1.max_token_count", 3)
              .put("index.analysis.filter.limit_1.consume_all_tokens", false)
              .build();
      AnalysisService analysisService =
          AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
      TokenFilterFactory tokenFilter = analysisService.tokenFilter("limit_1");
      String source = "the quick brown fox";
      String[] expected = new String[] {"the", "quick", "brown"};
      Tokenizer tokenizer = new WhitespaceTokenizer();
      tokenizer.setReader(new StringReader(source));
      assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
    }

    {
      Settings settings =
          ImmutableSettings.settingsBuilder()
              .put("index.analysis.filter.limit_1.type", "limit")
              .put("index.analysis.filter.limit_1.max_token_count", 17)
              .put("index.analysis.filter.limit_1.consume_all_tokens", true)
              .build();
      AnalysisService analysisService =
          AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
      TokenFilterFactory tokenFilter = analysisService.tokenFilter("limit_1");
      String source = "the quick brown fox";
      String[] expected = new String[] {"the", "quick", "brown", "fox"};
      Tokenizer tokenizer = new WhitespaceTokenizer();
      tokenizer.setReader(new StringReader(source));
      assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
    }
  }
  public void testCamelCaseOverride() throws IOException {
    Settings settings =
        Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .build();
    Settings indexSettings =
        settingsBuilder()
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("index.analysis.filter.wordDelimiter.type", "word_delimiter")
            .put("index.analysis.filter.wordDelimiter.split_on_numerics", false)
            .put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
            .putArray(
                "index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter")
            .put("index.analysis.analyzer.custom_analyzer_1.tokenizer", "whitespace")
            .putArray(
                "index.analysis.analyzer.custom_analyzer_1.filter", "lowercase", "word_delimiter")
            .build();
    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
    AnalysisService analysisService =
        new AnalysisRegistry(null, new Environment(settings)).build(idxSettings);

    TokenFilterFactory word_delimiter = analysisService.tokenFilter("word_delimiter");
    TokenFilterFactory override = analysisService.tokenFilter("wordDelimiter");
    assertNotEquals(word_delimiter.name(), override.name());
    assertNotSame(
        analysisService.tokenFilter("wordDelimiter"),
        analysisService.tokenFilter("word_delimiter"));
    assertSame(
        analysisService.tokenFilter("porterStem"), analysisService.tokenFilter("porter_stem"));

    // unconfigured
    IndexSettings idxSettings1 =
        IndexSettingsModule.newIndexSettings(
            "index",
            settingsBuilder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build());
    AnalysisService analysisService1 =
        new AnalysisRegistry(null, new Environment(settings)).build(idxSettings1);
    assertSame(
        analysisService1.tokenFilter("wordDelimiter"),
        analysisService1.tokenFilter("word_delimiter"));
    assertSame(
        analysisService1.tokenFilter("porterStem"), analysisService1.tokenFilter("porter_stem"));
  }
 public void testPartOfSpeechFilterFactory() throws IOException {
   AnalysisService analysisService = createAnalysisService();
   TokenFilterFactory tokenFilter = analysisService.tokenFilter("kuromoji_pos");
   assertThat(tokenFilter, instanceOf(KuromojiPartOfSpeechFilterFactory.class));
   String source = "私は制限スピードを超える。";
   String[] expected = new String[] {"私", "は", "制限", "スピード", "を"};
   Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
   tokenizer.setReader(new StringReader(source));
   assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
 }
 public void testJapaneseStopFilterFactory() throws IOException {
   AnalysisService analysisService = createAnalysisService();
   TokenFilterFactory tokenFilter = analysisService.tokenFilter("ja_stop");
   assertThat(tokenFilter, instanceOf(JapaneseStopTokenFilterFactory.class));
   String source = "私は制限スピードを超える。";
   String[] expected = new String[] {"私", "制限", "超える"};
   Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
   tokenizer.setReader(new StringReader(source));
   assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
 }
 @Test
 public void testHanOnly() throws IOException {
   AnalysisService analysisService =
       AnalysisTestsHelper.createAnalysisServiceFromClassPath(createTempDir(), RESOURCE);
   TokenFilterFactory tokenFilter = analysisService.tokenFilter("cjk_han_only");
   String source = "多くの学生が試験に落ちた。";
   String[] expected = new String[] {"多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た"};
   Tokenizer tokenizer = new StandardTokenizer();
   tokenizer.setReader(new StringReader(source));
   assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
 }
  public void testReadingFormFilterFactory() throws IOException {
    AnalysisService analysisService = createAnalysisService();
    TokenFilterFactory tokenFilter = analysisService.tokenFilter("kuromoji_rf");
    assertThat(tokenFilter, instanceOf(KuromojiReadingFormFilterFactory.class));
    String source = "今夜はロバート先生と話した";
    String[] expected_tokens_romaji =
        new String[] {"kon'ya", "ha", "robato", "sensei", "to", "hanashi", "ta"};

    Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tokenizer.setReader(new StringReader(source));

    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens_romaji);

    tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tokenizer.setReader(new StringReader(source));
    String[] expected_tokens_katakana = new String[] {"コンヤ", "ハ", "ロバート", "センセイ", "ト", "ハナシ", "タ"};
    tokenFilter = analysisService.tokenFilter("kuromoji_readingform");
    assertThat(tokenFilter, instanceOf(KuromojiReadingFormFilterFactory.class));
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens_katakana);
  }
 public void testCaseInsensitiveMapping() throws IOException {
   AnalysisService analysisService =
       AnalysisTestsHelper.createAnalysisServiceFromClassPath(createTempDir(), RESOURCE);
   TokenFilterFactory tokenFilter = analysisService.tokenFilter("my_keep_filter");
   assertThat(tokenFilter, instanceOf(KeepWordFilterFactory.class));
   String source = "hello small world";
   String[] expected = new String[] {"hello", "world"};
   Tokenizer tokenizer = new WhitespaceTokenizer();
   tokenizer.setReader(new StringReader(source));
   assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[] {1, 2});
 }
 public void testFillerToken() throws IOException {
   AnalysisService analysisService =
       AnalysisTestsHelper.createAnalysisServiceFromClassPath(createTempDir(), RESOURCE);
   TokenFilterFactory tokenFilter = analysisService.tokenFilter("shingle_filler");
   String source = "simon the sorcerer";
   String[] expected = new String[] {"simon FILLER", "simon FILLER sorcerer", "FILLER sorcerer"};
   Tokenizer tokenizer = new WhitespaceTokenizer();
   tokenizer.setReader(new StringReader(source));
   TokenStream stream = new StopFilter(tokenizer, StopFilter.makeStopSet("the"));
   assertTokenStreamContents(tokenFilter.create(stream), expected);
 }
 public void testInverseMappingNoShingles() throws IOException {
   AnalysisService analysisService =
       AnalysisTestsHelper.createAnalysisServiceFromClassPath(createTempDir(), RESOURCE);
   TokenFilterFactory tokenFilter = analysisService.tokenFilter("shingle_inverse");
   assertThat(tokenFilter, instanceOf(ShingleTokenFilterFactory.class));
   String source = "the quick";
   String[] expected = new String[] {"the", "quick"};
   Tokenizer tokenizer = new WhitespaceTokenizer();
   tokenizer.setReader(new StringReader(source));
   assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
 }
 public void testDefault() throws IOException {
   AnalysisService analysisService =
       AnalysisTestsHelper.createAnalysisServiceFromClassPath(createTempDir(), RESOURCE);
   TokenFilterFactory tokenFilter = analysisService.tokenFilter("shingle");
   String source = "the quick brown fox";
   String[] expected =
       new String[] {"the", "the quick", "quick", "quick brown", "brown", "brown fox", "fox"};
   Tokenizer tokenizer = new WhitespaceTokenizer();
   tokenizer.setReader(new StringReader(source));
   assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
 }
 public void testDefault() throws IOException {
   AnalysisService analysisService =
       AnalysisTestsHelper.createAnalysisServiceFromSettings(
           settingsBuilder()
               .put("path.home", createTempDir().toString())
               .put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
               .build());
   TokenFilterFactory tokenFilter = analysisService.tokenFilter("my_ascii_folding");
   String source = "Ansprüche";
   String[] expected = new String[] {"Anspruche"};
   Tokenizer tokenizer = new WhitespaceTokenizer();
   tokenizer.setReader(new StringReader(source));
   assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
 }
 public void testPreserveOriginal() throws IOException {
   AnalysisService analysisService =
       AnalysisTestsHelper.createAnalysisServiceFromSettings(
           settingsBuilder()
               .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
               .put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
               .put("index.analysis.filter.my_ascii_folding.preserve_original", true)
               .build());
   TokenFilterFactory tokenFilter = analysisService.tokenFilter("my_ascii_folding");
   String source = "Ansprüche";
   String[] expected = new String[] {"Anspruche", "Ansprüche"};
   Tokenizer tokenizer = new WhitespaceTokenizer();
   tokenizer.setReader(new StringReader(source));
   assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
 }
 public void testLoadWithoutSettings() throws IOException {
   AnalysisService analysisService =
       AnalysisTestsHelper.createAnalysisServiceFromClassPath(createTempDir(), RESOURCE);
   TokenFilterFactory tokenFilter = analysisService.tokenFilter("keep");
   Assert.assertNull(tokenFilter);
 }
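
 // assertSimpleTSOutput is called above but not defined in this excerpt. A minimal
 // sketch of what it is assumed to do (drain the stream, comparing terms in order);
 // the name and behavior are inferred from its call sites, not from a confirmed source:
 public static void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
   stream.reset();
   CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
   int i = 0;
   while (stream.incrementToken()) {
     assertTrue("more tokens than expected", i < expected.length);
     assertEquals("term mismatch at index " + i, expected[i++], termAttr.toString());
   }
   assertEquals("not all expected tokens were produced", expected.length, i);
   stream.end();
   stream.close();
 }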