Example #1
 public final ImmutableList<AnalyzeToken> analyze(final String field, final String text)
     throws IoException {
   AnalyzeResponse response;
   try {
     response =
         client
             .admin()
             .indices()
             .prepareAnalyze(text)
             .setField(field)
             .setIndex(indexName)
             .execute()
             .actionGet();
   } catch (final ElasticsearchException e) {
     throw exceptionFactory.newIoException(
         e, String.format("error analyzing text with index %s", indexName));
   }
   return ImmutableList.copyOf(response.getTokens());
 }
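
For context, a minimal caller sketch. The receiver name searchClient and the field/text values are assumptions standing in for whatever object hosts analyze; everything else uses only the types shown above:

 // Hypothetical usage; "searchClient" is a made-up name for an instance of the class above.
 ImmutableList<AnalyzeToken> tokens = searchClient.analyze("title", "Quick brown fox");
 for (AnalyzeToken token : tokens) {
   // Each AnalyzeToken carries the term plus its position and character offsets.
   System.out.printf(
       "term=%s position=%d offsets=[%d,%d]%n",
       token.getTerm(), token.getPosition(), token.getStartOffset(), token.getEndOffset());
 }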

Example #2
 public void testAnalyze() {
   createIndexWithAlias();
   assertAcked(
       client()
           .admin()
           .indices()
           .preparePutMapping("test")
           .setType("test")
           .setSource("field", "type=text,analyzer=keyword"));
   ensureYellow("test");
   AnalyzeResponse analyzeResponse =
       client()
           .admin()
           .indices()
           .prepareAnalyze("this is a test")
           .setIndex(indexOrAlias())
           .setField("field")
           .get();
   assertThat(analyzeResponse.getTokens().size(), equalTo(1));
   assertThat(analyzeResponse.getTokens().get(0).getTerm(), equalTo("this is a test"));
 }
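
testAnalyze relies on two helpers from the enclosing test class that are not shown here. A sketch of their likely shape, inferred from how they are used (the real definitions may differ):

 // Assumed helper shapes; "test" and "alias" mirror the names used in the test above.
 private void createIndexWithAlias() {
   assertAcked(prepareCreate("test").addAlias(new Alias("alias")));
 }

 private static String indexOrAlias() {
   // Randomly address the index directly or through its alias.
   return randomBoolean() ? "test" : "alias";
 }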

Example #3
  /**
   * Simple upgrade test for analyzers to make sure they analyze to the same tokens after an upgrade.
   * TODO: we need this for random tokenizers / token filters as well.
   */
  @Test
  public void testAnalyzerTokensAfterUpgrade()
      throws IOException, ExecutionException, InterruptedException {
    int numFields =
        randomIntBetween(PreBuiltAnalyzers.values().length, PreBuiltAnalyzers.values().length * 10);
    StringBuilder builder = new StringBuilder();
    String[] fields = new String[numFields * 2];
    int fieldId = 0;
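     // Interleave (field name, mapping) pairs: even slots hold names, odd slots hold the
     // "type=...,analyzer=..." mapping strings consumed by addMapping below.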
    for (int i = 0; i < fields.length; i++) {
      fields[i++] = "field_" + fieldId++;
      String analyzer = randomAnalyzer();
      fields[i] = "type=string,analyzer=" + analyzer;
    }
    assertAcked(prepareCreate("test").addMapping("type", fields).setSettings(indexSettings()));
    ensureYellow();
    InputOutput[] inout = new InputOutput[numFields];
    for (int i = 0; i < numFields; i++) {
      String input;
      Matcher matcher;
      do {
        // In Lucene 4.10, a bug was fixed in StandardTokenizer which was causing breaks on
        // complex characters. The bug was fixed without backcompat Version handling, so testing
        // between >= 4.10 and <= 4.9 can cause differences when the random string generated
        // contains these complex characters. To mitigate the problem, we skip any strings
        // containing these characters.
        // TODO: only skip strings containing complex chars when comparing against ES <= 1.3.x
        input = TestUtil.randomAnalysisString(getRandom(), 100, false);
        matcher = complexUnicodeChars.matcher(input);
      } while (matcher.find());

      AnalyzeResponse test =
          client().admin().indices().prepareAnalyze("test", input).setField("field_" + i).get();
      inout[i] = new InputOutput(test, input, "field_" + i);
    }

    logClusterState();
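     // Rolling upgrade: move one node at a time to the new version, waiting for yellow
     // cluster health after each step, until no node is left to upgrade.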
    boolean upgraded;
    do {
      logClusterState();
      upgraded = backwardsCluster().upgradeOneNode();
      ensureYellow();
    } while (upgraded);

    for (int i = 0; i < inout.length; i++) {
      InputOutput inputOutput = inout[i];
      AnalyzeResponse test =
          client()
              .admin()
              .indices()
              .prepareAnalyze("test", inputOutput.input)
              .setField(inputOutput.field)
              .get();
      List<AnalyzeResponse.AnalyzeToken> tokens = test.getTokens();
      List<AnalyzeResponse.AnalyzeToken> expectedTokens = inputOutput.response.getTokens();
      assertThat(
          "size mismatch field: "
              + fields[i * 2]
              + " analyzer: "
              + fields[i * 2 + 1]
              + " input: "
              + BaseTokenStreamTestCase.escape(inputOutput.input),
          expectedTokens.size(),
          equalTo(tokens.size()));
      for (int j = 0; j < tokens.size(); j++) {
        String msg =
            "failed for term: "
                + expectedTokens.get(j).getTerm()
                + " field: "
                + fields[i * 2]
                + " analyzer: "
                + fields[i * 2 + 1]
                + " input: "
                + BaseTokenStreamTestCase.escape(inputOutput.input);
        assertThat(
            msg,
            BaseTokenStreamTestCase.escape(expectedTokens.get(j).getTerm()),
            equalTo(BaseTokenStreamTestCase.escape(tokens.get(j).getTerm())));
        assertThat(msg, expectedTokens.get(j).getPosition(), equalTo(tokens.get(j).getPosition()));
        assertThat(
            msg, expectedTokens.get(j).getStartOffset(), equalTo(tokens.get(j).getStartOffset()));
        assertThat(
            msg, expectedTokens.get(j).getEndOffset(), equalTo(tokens.get(j).getEndOffset()));
        assertThat(msg, expectedTokens.get(j).getType(), equalTo(tokens.get(j).getType()));
      }
    }
  }
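
The test also references members of the enclosing class that are not shown: complexUnicodeChars (a precompiled java.util.regex.Pattern matching the problematic characters described in the comment above), the InputOutput holder, and randomAnalyzer(). A sketch of the latter two, with shapes inferred from their usage here; assumptions, not the verbatim source:

  // Sketch only: constructor order and field names inferred from the test above.
  private static final class InputOutput {
    final AnalyzeResponse response; // tokens captured before the upgrade
    final String input;             // the random analysis string
    final String field;             // the field (and hence analyzer) that was used

    InputOutput(AnalyzeResponse response, String input, String field) {
      this.response = response;
      this.input = input;
      this.field = field;
    }
  }

  // Sketch only: returns the lowercase name of a random pre-built analyzer.
  private String randomAnalyzer() {
    return randomFrom(PreBuiltAnalyzers.values()).name().toLowerCase(Locale.ROOT);
  }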