public void testDefaultsKuromojiAnalysis() throws IOException {
    AnalysisService analysisService = createAnalysisService();

    // Each kuromoji component must resolve to its dedicated factory type.
    TokenizerFactory tokenizerFactory = analysisService.tokenizer("kuromoji_tokenizer");
    assertThat(tokenizerFactory, instanceOf(KuromojiTokenizerFactory.class));

    TokenFilterFactory filterFactory = analysisService.tokenFilter("kuromoji_part_of_speech");
    assertThat(filterFactory, instanceOf(KuromojiPartOfSpeechFilterFactory.class));

    filterFactory = analysisService.tokenFilter("kuromoji_readingform");
    assertThat(filterFactory, instanceOf(KuromojiReadingFormFilterFactory.class));

    filterFactory = analysisService.tokenFilter("kuromoji_baseform");
    assertThat(filterFactory, instanceOf(KuromojiBaseFormFilterFactory.class));

    filterFactory = analysisService.tokenFilter("kuromoji_stemmer");
    assertThat(filterFactory, instanceOf(KuromojiKatakanaStemmerFactory.class));

    filterFactory = analysisService.tokenFilter("ja_stop");
    assertThat(filterFactory, instanceOf(JapaneseStopTokenFilterFactory.class));

    NamedAnalyzer analyzer = analysisService.analyzer("kuromoji");
    assertThat(analyzer.analyzer(), instanceOf(JapaneseAnalyzer.class));

    analyzer = analysisService.analyzer("my_analyzer");
    assertThat(analyzer.analyzer(), instanceOf(CustomAnalyzer.class));
    // Close the stream we open for the instanceOf check: Lucene TokenStreams must be
    // closed before the analyzer can hand out new streams for this thread (the
    // original left it dangling).
    try (TokenStream stream = analyzer.analyzer().tokenStream(null, new StringReader(""))) {
      assertThat(stream, instanceOf(JapaneseTokenizer.class));
    }

    CharFilterFactory charFilterFactory = analysisService.charFilter("kuromoji_iteration_mark");
    assertThat(charFilterFactory, instanceOf(KuromojiIterationMarkCharFilterFactory.class));
  }
  public void testConfigureCamelCaseTokenFilter() throws IOException {
    // Node-level settings: only a home path is required to build the analysis module.
    Settings settings =
        Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .build();
    // Two custom analyzers: one referencing the configured "wordDelimiter" filter
    // (split_on_numerics disabled), one referencing the built-in "word_delimiter".
    Settings indexSettings =
        Settings.builder()
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("index.analysis.filter.wordDelimiter.type", "word_delimiter")
            .put("index.analysis.filter.wordDelimiter.split_on_numerics", false)
            .put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
            .putArray(
                "index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter")
            .put("index.analysis.analyzer.custom_analyzer_1.tokenizer", "whitespace")
            .putArray(
                "index.analysis.analyzer.custom_analyzer_1.filter", "lowercase", "word_delimiter")
            .build();

    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);

    IndexAnalyzers indexAnalyzers =
        new AnalysisModule(new Environment(settings), emptyList())
            .getAnalysisRegistry()
            .build(idxSettings);

    // split_on_numerics=false keeps the digits attached to the surrounding letters.
    try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer")) {
      assertNotNull(custom_analyser);
      List<String> token = collectTokens(custom_analyser.tokenStream("foo", "J2SE j2ee"));
      assertEquals(token.toString(), 2, token.size());
      assertEquals("j2se", token.get(0));
      assertEquals("j2ee", token.get(1));
    }

    // The default word_delimiter splits on letter/number transitions.
    try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer_1")) {
      assertNotNull(custom_analyser);
      List<String> token = collectTokens(custom_analyser.tokenStream("foo", "J2SE j2ee"));
      assertEquals(token.toString(), 6, token.size());
      assertEquals("j", token.get(0));
      assertEquals("2", token.get(1));
      assertEquals("se", token.get(2));
      assertEquals("j", token.get(3));
      assertEquals("2", token.get(4));
      assertEquals("ee", token.get(5));
    }
  }

  /**
   * Fully consumes {@code tokenStream} and returns the emitted terms in order.
   *
   * <p>Honors the Lucene TokenStream contract: {@code reset()} before consumption, then
   * {@code end()} and {@code close()} so the analyzer's reusable components are released
   * (the original inline loops skipped end()/close(), leaking the streams).
   */
  private static List<String> collectTokens(TokenStream tokenStream) throws IOException {
    List<String> tokens = new ArrayList<>();
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
      tokens.add(charTermAttribute.toString());
    }
    tokenStream.end();
    tokenStream.close();
    return tokens;
  }
 @Override
 public void close() {
   // Only index-scoped analyzers are owned by this service; global/indices-scoped
   // ones are shared and must not be closed here.
   for (NamedAnalyzer analyzer : analyzers.values()) {
     if (analyzer.scope() == AnalyzerScope.INDEX) {
       try {
         analyzer.close();
       } catch (NullPointerException e) {
         // because analyzers are aliased, they might be closed several times
         // an NPE is thrown in this case, so ignore....
       } catch (Exception e) {
         // parameterized form avoids eager string concatenation when debug is off,
         // consistent with the other close() implementation in this codebase
         logger.debug("failed to close analyzer {}", analyzer);
       }
     }
   }
 }
 @Override
 public void close() {
   // Release only the analyzers this index owns; shared-scope analyzers are
   // managed elsewhere and left untouched.
   for (NamedAnalyzer namedAnalyzer : analyzers.values()) {
     if (namedAnalyzer.scope() != AnalyzerScope.INDEX) {
       continue;
     }
     try {
       namedAnalyzer.close();
     } catch (NullPointerException ignored) {
       // Aliased analyzers may be closed more than once, which surfaces as an
       // NPE on the second close — deliberately swallowed.
       // TODO: Analyzer's can no longer have aliases in indices created in 5.x and beyond,
       // so we only allow the aliases for analyzers on indices created pre 5.x for backwards
       // compatibility.  Once pre 5.0 indices are no longer supported, this check should be
       // removed.
     } catch (Exception e) {
       logger.debug("failed to close analyzer {}", namedAnalyzer);
     }
   }
 }
  public void testDefaultsCompoundAnalysis() throws Exception {
    // Analyzer definitions come from the bundled stop.json fixture.
    String json = "/org/elasticsearch/index/analysis/stop.json";
    Settings nodeSettings =
        Settings.builder()
            .loadFromStream(json, getClass().getResourceAsStream(json))
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .build();
    IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("index", nodeSettings);
    AnalysisService analysisService = createAnalysisService(indexSettings, nodeSettings);

    // Both configured analyzers are stop-word based, so a phrase made entirely
    // of stop words must yield zero tokens from each of them.
    for (String analyzerName : new String[] {"analyzer1", "analyzer2"}) {
      NamedAnalyzer namedAnalyzer = analysisService.analyzer(analyzerName);
      assertTokenStreamContents(
          namedAnalyzer.tokenStream("test", "to be or not to be"), new String[0]);
    }
  }
// Example no. 6
  public void testHtmlStripCharFilter() throws Exception {
    // Custom analyzer: standard tokenizer preceded by the html_strip char filter.
    Settings settings =
        settingsBuilder()
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("index.analysis.analyzer.custom_with_char_filter.tokenizer", "standard")
            .putArray("index.analysis.analyzer.custom_with_char_filter.char_filter", "html_strip")
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .build();
    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
    AnalysisService analysisService =
        new AnalysisRegistry(null, new Environment(settings)).build(idxSettings);

    NamedAnalyzer htmlStripAnalyzer = analysisService.analyzer("custom_with_char_filter");

    // Analyze the same input twice: the second pass proves the char filter is
    // reinitialized correctly when the analyzer's components are reused.
    for (int pass = 0; pass < 2; pass++) {
      assertTokenStreamContents(
          htmlStripAnalyzer.tokenStream("test", "<b>hello</b>!"), new String[] {"hello"});
    }
  }
// Example no. 7
  @Test
  public void testMappingCharFilter() throws Exception {
    Index index = new Index("test");
    // "mapping" char filter rewriting "ph"->"f" and "qu"->"q" ahead of the
    // standard tokenizer.
    Settings settings =
        settingsBuilder()
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("index.analysis.char_filter.my_mapping.type", "mapping")
            .putArray("index.analysis.char_filter.my_mapping.mappings", "ph=>f", "qu=>q")
            .put("index.analysis.analyzer.custom_with_char_filter.tokenizer", "standard")
            .putArray("index.analysis.analyzer.custom_with_char_filter.char_filter", "my_mapping")
            .put("path.home", createTempDir().toString())
            .build();
    // Parent injector supplies node-level services; the child injector layers the
    // index-scoped analysis bindings on top.
    Injector parent =
        new ModulesBuilder()
            .add(
                new SettingsModule(settings),
                new EnvironmentModule(new Environment(settings)),
                new IndicesAnalysisModule())
            .createInjector();
    Injector child =
        new ModulesBuilder()
            .add(
                new IndexSettingsModule(index, settings),
                new IndexNameModule(index),
                new AnalysisModule(settings, parent.getInstance(IndicesAnalysisService.class)))
            .createChildInjector(parent);

    NamedAnalyzer mappingAnalyzer =
        child.getInstance(AnalysisService.class).analyzer("custom_with_char_filter");

    // Analyze twice; the second pass verifies the char filter is reinitialized
    // correctly when the analyzer's components are reused.
    for (int pass = 0; pass < 2; pass++) {
      assertTokenStreamContents(
          mappingAnalyzer.tokenStream("test", "jeff quit phish"),
          new String[] {"jeff", "qit", "fish"});
    }
  }
  /**
   * Builds the analyzer from {@code analyzerFactory} and registers it under {@code name},
   * wiring up any legacy aliases declared via the (pre-5.0 only)
   * {@code index.analysis.analyzer.<name>.alias} setting.
   *
   * @param name the registration name for the analyzer
   * @param analyzerFactory provider that creates the underlying Lucene analyzer
   * @param analyzerAliases out-parameter collecting alias-to-analyzer mappings
   * @param analyzers out-parameter collecting name-to-analyzer registrations
   * @throws IllegalArgumentException if the provider returns a null analyzer, or an alias
   *     is declared on an index created on/after 5.0.0-alpha6
   * @throws IllegalStateException if {@code name} or one of its aliases is already taken
   */
  private void processAnalyzerFactory(
      String name,
      AnalyzerProvider<?> analyzerFactory,
      Map<String, NamedAnalyzer> analyzerAliases,
      Map<String, NamedAnalyzer> analyzers) {
    /*
     * Lucene defaults positionIncrementGap to 0 in all analyzers but
     * Elasticsearch defaults them to 0 only before version 2.0
     * and 100 afterwards so we override the positionIncrementGap if it
     * doesn't match here.
     */
    int overridePositionIncrementGap = TextFieldMapper.Defaults.POSITION_INCREMENT_GAP;
    if (analyzerFactory instanceof CustomAnalyzerProvider) {
      ((CustomAnalyzerProvider) analyzerFactory).build(this);
      /*
       * Custom analyzers already default to the correct, version
       * dependent positionIncrementGap and the user is able to
       * configure the positionIncrementGap directly on the analyzer so
       * we disable overriding the positionIncrementGap to preserve the
       * user's setting.
       */
      overridePositionIncrementGap = Integer.MIN_VALUE;
    }
    Analyzer analyzerF = analyzerFactory.get();
    if (analyzerF == null) {
      throw new IllegalArgumentException(
          "analyzer [" + analyzerFactory.name() + "] created null analyzer");
    }
    NamedAnalyzer analyzer;
    if (analyzerF instanceof NamedAnalyzer) {
      // if we got a named analyzer back, use it...
      analyzer = (NamedAnalyzer) analyzerF;
      if (overridePositionIncrementGap >= 0
          && analyzer.getPositionIncrementGap(analyzer.name()) != overridePositionIncrementGap) {
        // unless the positionIncrementGap needs to be overridden
        analyzer = new NamedAnalyzer(analyzer, overridePositionIncrementGap);
      }
    } else {
      analyzer =
          new NamedAnalyzer(name, analyzerFactory.scope(), analyzerF, overridePositionIncrementGap);
    }
    if (analyzers.containsKey(name)) {
      throw new IllegalStateException("already registered analyzer with name: " + name);
    }
    analyzers.put(name, analyzer);
    // TODO: remove alias support completely when we no longer support pre 5.0 indices
    final String analyzerAliasKey = "index.analysis.analyzer." + analyzerFactory.name() + ".alias";
    if (indexSettings.getSettings().get(analyzerAliasKey) != null) {
      if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_5_0_0_alpha6)) {
        // do not allow alias creation if the index was created on or after v5.0 alpha6
        throw new IllegalArgumentException("setting [" + analyzerAliasKey + "] is not supported");
      }

      // the setting is now removed but we only support it for loading indices created before v5.0
      deprecationLogger.deprecated(
          "setting [{}] is only allowed on index [{}] because it was created before 5.x; "
              + "analyzer aliases can no longer be created on new indices.",
          analyzerAliasKey,
          index().getName());
      // de-dupe the configured aliases, then register each exactly once
      Set<String> aliases =
          Sets.newHashSet(indexSettings.getSettings().getAsArray(analyzerAliasKey));
      for (String alias : aliases) {
        if (analyzerAliases.putIfAbsent(alias, analyzer) != null) {
          throw new IllegalStateException(
              "alias ["
                  + alias
                  + "] is already used by ["
                  + analyzerAliases.get(alias).name()
                  + "]");
        }
      }
    }
  }
  /**
   * Wires up the tokenizer/char-filter/token-filter registries and materializes every
   * analyzer provider into a {@link NamedAnalyzer}.
   *
   * <p>Injects "default", "default_search" and "default_search_quoted" providers when
   * absent, applies the version-dependent positionIncrementGap override, expands legacy
   * analyzer aliases, and rejects reserved ('_'-prefixed) analyzer names.
   *
   * @throws IllegalArgumentException if a provider yields a null analyzer, no default
   *     analyzer is configured, "default_index" is used on a 3.0+ index, or an analyzer
   *     name starts with '_'
   * @throws IllegalStateException if two providers register the same analyzer name
   */
  public AnalysisService(
      IndexSettings indexSettings,
      Map<String, AnalyzerProvider> analyzerProviders,
      Map<String, TokenizerFactory> tokenizerFactoryFactories,
      Map<String, CharFilterFactory> charFilterFactoryFactories,
      Map<String, TokenFilterFactory> tokenFilterFactoryFactories) {
    super(indexSettings);
    this.tokenizers = unmodifiableMap(tokenizerFactoryFactories);
    this.charFilters = unmodifiableMap(charFilterFactoryFactories);
    this.tokenFilters = unmodifiableMap(tokenFilterFactoryFactories);
    // copy before mutating so the caller's map does not observe the injected defaults
    analyzerProviders = new HashMap<>(analyzerProviders);

    if (!analyzerProviders.containsKey("default")) {
      analyzerProviders.put(
          "default",
          new StandardAnalyzerProvider(
              indexSettings, null, "default", Settings.Builder.EMPTY_SETTINGS));
    }
    if (!analyzerProviders.containsKey("default_search")) {
      analyzerProviders.put("default_search", analyzerProviders.get("default"));
    }
    if (!analyzerProviders.containsKey("default_search_quoted")) {
      analyzerProviders.put("default_search_quoted", analyzerProviders.get("default_search"));
    }

    Map<String, NamedAnalyzer> analyzers = new HashMap<>();
    for (Map.Entry<String, AnalyzerProvider> entry : analyzerProviders.entrySet()) {
      AnalyzerProvider analyzerFactory = entry.getValue();
      String name = entry.getKey();
      /*
       * Lucene defaults positionIncrementGap to 0 in all analyzers but
       * Elasticsearch defaults them to 0 only before version 2.0
       * and 100 afterwards so we override the positionIncrementGap if it
       * doesn't match here.
       */
      int overridePositionIncrementGap = StringFieldMapper.Defaults.POSITION_INCREMENT_GAP;
      if (analyzerFactory instanceof CustomAnalyzerProvider) {
        ((CustomAnalyzerProvider) analyzerFactory).build(this);
        /*
         * Custom analyzers already default to the correct, version
         * dependent positionIncrementGap and the user is able to
         * configure the positionIncrementGap directly on the analyzer so
         * we disable overriding the positionIncrementGap to preserve the
         * user's setting.
         */
        overridePositionIncrementGap = Integer.MIN_VALUE;
      }
      Analyzer analyzerF = analyzerFactory.get();
      if (analyzerF == null) {
        throw new IllegalArgumentException(
            "analyzer [" + analyzerFactory.name() + "] created null analyzer");
      }
      NamedAnalyzer analyzer;
      if (analyzerF instanceof NamedAnalyzer) {
        // if we got a named analyzer back, use it...
        analyzer = (NamedAnalyzer) analyzerF;
        if (overridePositionIncrementGap >= 0
            && analyzer.getPositionIncrementGap(analyzer.name()) != overridePositionIncrementGap) {
          // unless the positionIncrementGap needs to be overridden
          analyzer = new NamedAnalyzer(analyzer, overridePositionIncrementGap);
        }
      } else {
        analyzer =
            new NamedAnalyzer(
                name, analyzerFactory.scope(), analyzerF, overridePositionIncrementGap);
      }
      if (analyzers.containsKey(name)) {
        throw new IllegalStateException("already registered analyzer with name: " + name);
      }
      analyzers.put(name, analyzer);
      // NOTE(review): the two alias blocks below appear to register the same aliases
      // twice -- getAsArray also splits comma-delimited values -- so the first block
      // looks redundant; confirm Settings.getAsArray semantics before removing either.
      String strAliases =
          this.indexSettings
              .getSettings()
              .get("index.analysis.analyzer." + analyzerFactory.name() + ".alias");
      if (strAliases != null) {
        for (String alias : Strings.commaDelimitedListToStringArray(strAliases)) {
          analyzers.put(alias, analyzer);
        }
      }
      String[] aliases =
          this.indexSettings
              .getSettings()
              .getAsArray("index.analysis.analyzer." + analyzerFactory.name() + ".alias");
      for (String alias : aliases) {
        analyzers.put(alias, analyzer);
      }
    }

    NamedAnalyzer defaultAnalyzer = analyzers.get("default");
    if (defaultAnalyzer == null) {
      throw new IllegalArgumentException("no default analyzer configured");
    }
    if (analyzers.containsKey("default_index")) {
      final Version createdVersion = indexSettings.getIndexVersionCreated();
      if (createdVersion.onOrAfter(Version.V_3_0_0)) {
        throw new IllegalArgumentException(
            "setting [index.analysis.analyzer.default_index] is not supported anymore, use [index.analysis.analyzer.default] instead for index ["
                + index().getName()
                + "]");
      } else {
        deprecationLogger.deprecated(
            "setting [index.analysis.analyzer.default_index] is deprecated, use [index.analysis.analyzer.default] instead for index [{}]",
            index().getName());
      }
    }
    defaultIndexAnalyzer =
        analyzers.containsKey("default_index") ? analyzers.get("default_index") : defaultAnalyzer;
    defaultSearchAnalyzer =
        analyzers.containsKey("default_search") ? analyzers.get("default_search") : defaultAnalyzer;
    defaultSearchQuoteAnalyzer =
        analyzers.containsKey("default_search_quote")
            ? analyzers.get("default_search_quote")
            : defaultSearchAnalyzer;

    // names starting with '_' are reserved for internal analyzers
    for (Map.Entry<String, NamedAnalyzer> analyzer : analyzers.entrySet()) {
      if (analyzer.getKey().startsWith("_")) {
        throw new IllegalArgumentException(
            "analyzer name must not start with '_'. got \"" + analyzer.getKey() + "\"");
      }
    }
    this.analyzers = unmodifiableMap(analyzers);
  }