/**
 * Analyzes {@code text} using the analyzer configured for {@code field} on this instance's
 * index and returns the resulting tokens.
 *
 * @param field the mapped field whose analyzer should be applied
 * @param text the text to analyze
 * @return an immutable snapshot of the tokens produced by the analyzer
 * @throws IoExceptionT if the underlying Elasticsearch call fails
 */
public final ImmutableList<AnalyzeToken> analyze(final String field, final String text)
    throws IoExceptionT {
  final AnalyzeResponse analyzeResponse;
  try {
    analyzeResponse =
        client
            .admin()
            .indices()
            .prepareAnalyze(text)
            .setField(field)
            .setIndex(indexName)
            .execute()
            .actionGet();
  } catch (final ElasticsearchException e) {
    // Translate the client failure into the caller-facing exception type,
    // preserving the cause and naming the index for context.
    throw exceptionFactory.newIoException(
        e, String.format("error analyzing text with index %s", indexName));
  }
  return ImmutableList.copyOf(analyzeResponse.getTokens());
}
/**
 * Verifies that an analyze request routed through an alias honors the field's mapped
 * analyzer: with the {@code keyword} analyzer the input must come back as a single,
 * unsplit token.
 */
public void testAnalyze() {
  createIndexWithAlias();

  // Map "field" with the keyword analyzer so the entire input is emitted as one token.
  assertAcked(
      client()
          .admin()
          .indices()
          .preparePutMapping("test")
          .setType("test")
          .setSource("field", "type=text,analyzer=keyword"));
  ensureYellow("test");

  final String input = "this is a test";
  final AnalyzeResponse response =
      client()
          .admin()
          .indices()
          .prepareAnalyze(input)
          .setIndex(indexOrAlias())
          .setField("field")
          .get();

  // Keyword analyzer must not tokenize: exactly one token, equal to the whole input.
  assertThat(response.getTokens().size(), equalTo(1));
  assertThat(response.getTokens().get(0).getTerm(), equalTo(input));
}
/** * Simple upgrade test for analyzers to make sure they analyze to the same tokens after upgrade * TODO we need this for random tokenizers / tokenfilters as well */ @Test public void testAnalyzerTokensAfterUpgrade() throws IOException, ExecutionException, InterruptedException { int numFields = randomIntBetween(PreBuiltAnalyzers.values().length, PreBuiltAnalyzers.values().length * 10); StringBuilder builder = new StringBuilder(); String[] fields = new String[numFields * 2]; int fieldId = 0; for (int i = 0; i < fields.length; i++) { fields[i++] = "field_" + fieldId++; String analyzer = randomAnalyzer(); fields[i] = "type=string,analyzer=" + analyzer; } assertAcked(prepareCreate("test").addMapping("type", fields).setSettings(indexSettings())); ensureYellow(); InputOutput[] inout = new InputOutput[numFields]; for (int i = 0; i < numFields; i++) { String input; Matcher matcher; do { // In Lucene 4.10, a bug was fixed in StandardTokenizer which was causing breaks on complex // characters. // The bug was fixed without backcompat Version handling, so testing between >=4.10 vs <= // 4.9 can // cause differences when the random string generated contains these complex characters. To // mitigate // the problem, we skip any strings containing these characters. 
// TODO: only skip strings containing complex chars when comparing against ES <= 1.3.x input = TestUtil.randomAnalysisString(getRandom(), 100, false); matcher = complexUnicodeChars.matcher(input); } while (matcher.find()); AnalyzeResponse test = client().admin().indices().prepareAnalyze("test", input).setField("field_" + i).get(); inout[i] = new InputOutput(test, input, "field_" + i); } logClusterState(); boolean upgraded; do { logClusterState(); upgraded = backwardsCluster().upgradeOneNode(); ensureYellow(); } while (upgraded); for (int i = 0; i < inout.length; i++) { InputOutput inputOutput = inout[i]; AnalyzeResponse test = client() .admin() .indices() .prepareAnalyze("test", inputOutput.input) .setField(inputOutput.field) .get(); List<AnalyzeResponse.AnalyzeToken> tokens = test.getTokens(); List<AnalyzeResponse.AnalyzeToken> expectedTokens = inputOutput.response.getTokens(); assertThat( "size mismatch field: " + fields[i * 2] + " analyzer: " + fields[i * 2 + 1] + " input: " + BaseTokenStreamTestCase.escape(inputOutput.input), expectedTokens.size(), equalTo(tokens.size())); for (int j = 0; j < tokens.size(); j++) { String msg = "failed for term: " + expectedTokens.get(j).getTerm() + " field: " + fields[i * 2] + " analyzer: " + fields[i * 2 + 1] + " input: " + BaseTokenStreamTestCase.escape(inputOutput.input); assertThat( msg, BaseTokenStreamTestCase.escape(expectedTokens.get(j).getTerm()), equalTo(BaseTokenStreamTestCase.escape(tokens.get(j).getTerm()))); assertThat(msg, expectedTokens.get(j).getPosition(), equalTo(tokens.get(j).getPosition())); assertThat( msg, expectedTokens.get(j).getStartOffset(), equalTo(tokens.get(j).getStartOffset())); assertThat( msg, expectedTokens.get(j).getEndOffset(), equalTo(tokens.get(j).getEndOffset())); assertThat(msg, expectedTokens.get(j).getType(), equalTo(tokens.get(j).getType())); } } }