// Parses one field of a direct candidate generator definition; anything not handled by the
// shared spell-checker settings is interpreted as a generator-specific option.
private void parseCandidateGenerator(XContentParser parser, SearchContext context, String fieldName,
        PhraseSuggestionContext.DirectCandidateGenerator generator) throws IOException {
    if (!SuggestUtils.parseDirectSpellcheckerSettings(parser, fieldName, generator)) {
        if ("field".equals(fieldName)) {
            generator.setField(parser.text());
        } else if ("size".equals(fieldName)) {
            generator.size(parser.intValue());
        } else if ("pre_filter".equals(fieldName) || "preFilter".equals(fieldName)) {
            String analyzerName = parser.text();
            Analyzer analyzer = context.mapperService().analysisService().analyzer(analyzerName);
            if (analyzer == null) {
                throw new ElasticSearchIllegalArgumentException("Analyzer [" + analyzerName + "] doesn't exist");
            }
            generator.preFilter(analyzer);
        } else if ("post_filter".equals(fieldName) || "postFilter".equals(fieldName)) {
            String analyzerName = parser.text();
            Analyzer analyzer = context.mapperService().analysisService().analyzer(analyzerName);
            if (analyzer == null) {
                throw new ElasticSearchIllegalArgumentException("Analyzer [" + analyzerName + "] doesn't exist");
            }
            generator.postFilter(analyzer);
        } else {
            throw new ElasticSearchIllegalArgumentException("CandidateGenerator doesn't support [" + fieldName + "]");
        }
    }
}
public SuggestionSearchContext.SuggestionContext parse(XContentParser parser, SearchContext context) throws IOException {
    PhraseSuggestionContext suggestion = new PhraseSuggestionContext(suggester);
    XContentParser.Token token;
    String fieldName = null;
    boolean gramSizeSet = false;
    while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
        if (token == XContentParser.Token.FIELD_NAME) {
            fieldName = parser.currentName();
        } else if (token.isValue()) {
            // simple single-value options
            if (!SuggestUtils.parseSuggestContext(parser, context, fieldName, suggestion)) {
                if ("real_word_error_likelihood".equals(fieldName)) {
                    suggestion.setRealWordErrorLikelihood(parser.floatValue());
                    if (suggestion.realworldErrorLikelyhood() <= 0.0) {
                        throw new ElasticSearchIllegalArgumentException("real_word_error_likelihood must be > 0.0");
                    }
                } else if ("confidence".equals(fieldName)) {
                    suggestion.setConfidence(parser.floatValue());
                    if (suggestion.confidence() < 0.0) {
                        throw new ElasticSearchIllegalArgumentException("confidence must be >= 0.0");
                    }
                } else if ("separator".equals(fieldName)) {
                    suggestion.setSeparator(new BytesRef(parser.text()));
                } else if ("max_errors".equals(fieldName)) {
                    suggestion.setMaxErrors(parser.floatValue());
                    if (suggestion.maxErrors() <= 0.0) {
                        throw new ElasticSearchIllegalArgumentException("max_errors must be > 0.0");
                    }
                } else if ("gram_size".equals(fieldName)) {
                    suggestion.setGramSize(parser.intValue());
                    if (suggestion.gramSize() < 1) {
                        throw new ElasticSearchIllegalArgumentException("gram_size must be >= 1");
                    }
                    gramSizeSet = true;
                } else if ("force_unigrams".equals(fieldName)) {
                    suggestion.setRequireUnigram(parser.booleanValue());
                }
            }
        } else if (token == Token.START_ARRAY) {
            if ("direct_generator".equals(fieldName)) {
                // for now we only have a single type of generators
                while ((token = parser.nextToken()) == Token.START_OBJECT) {
                    PhraseSuggestionContext.DirectCandidateGenerator generator = new PhraseSuggestionContext.DirectCandidateGenerator();
                    while ((token = parser.nextToken()) != Token.END_OBJECT) {
                        if (token == XContentParser.Token.FIELD_NAME) {
                            fieldName = parser.currentName();
                        }
                        if (token.isValue()) {
                            parseCandidateGenerator(parser, context, fieldName, generator);
                        }
                    }
                    verifyGenerator(context, generator);
                    suggestion.addGenerator(generator);
                }
            } else {
                throw new ElasticSearchIllegalArgumentException("suggester[phrase] doesn't support array field [" + fieldName + "]");
            }
        } else if (token == Token.START_OBJECT) {
            // object fields configure the smoothing model: linear, laplace or stupid_backoff
            if ("linear".equals(fieldName)) {
                ensureNoSmoothing(suggestion);
                final double[] lambdas = new double[3];
                while ((token = parser.nextToken()) != Token.END_OBJECT) {
                    if (token == XContentParser.Token.FIELD_NAME) {
                        fieldName = parser.currentName();
                    }
                    if (token.isValue()) {
                        if ("trigram_lambda".equals(fieldName)) {
                            lambdas[0] = parser.doubleValue();
                            if (lambdas[0] < 0) {
                                throw new ElasticSearchIllegalArgumentException("trigram_lambda must be positive");
                            }
                        }
                        if ("bigram_lambda".equals(fieldName)) {
                            lambdas[1] = parser.doubleValue();
                            if (lambdas[1] < 0) {
                                throw new ElasticSearchIllegalArgumentException("bigram_lambda must be positive");
                            }
                        }
                        if ("unigram_lambda".equals(fieldName)) {
                            lambdas[2] = parser.doubleValue();
                            if (lambdas[2] < 0) {
                                throw new ElasticSearchIllegalArgumentException("unigram_lambda must be positive");
                            }
                        }
                    }
                }
                double sum = 0.0d;
                for (int i = 0; i < lambdas.length; i++) {
                    sum += lambdas[i];
                }
                if (Math.abs(sum - 1.0) > 0.001) {
                    throw new ElasticSearchIllegalArgumentException("linear smoothing lambdas must sum to 1");
                }
                suggestion.setModel(new WordScorer.WordScorerFactory() {
                    @Override
                    public WordScorer newScorer(IndexReader reader, String field, double realWordLikelyhood, BytesRef separator) throws IOException {
                        return new LinearInterpoatingScorer(reader, field, realWordLikelyhood, separator, lambdas[0], lambdas[1], lambdas[2]);
                    }
                });
            } else if ("laplace".equals(fieldName)) {
                ensureNoSmoothing(suggestion);
                double theAlpha = 0.5;
                while ((token = parser.nextToken()) != Token.END_OBJECT) {
                    if (token == XContentParser.Token.FIELD_NAME) {
                        fieldName = parser.currentName();
                    }
                    if (token.isValue()) {
                        if ("alpha".equals(fieldName)) {
                            theAlpha = parser.doubleValue();
                        }
                    }
                }
                final double alpha = theAlpha;
                suggestion.setModel(new WordScorer.WordScorerFactory() {
                    @Override
                    public WordScorer newScorer(IndexReader reader, String field, double realWordLikelyhood, BytesRef separator) throws IOException {
                        return new LaplaceScorer(reader, field, realWordLikelyhood, separator, alpha);
                    }
                });
            } else if ("stupid_backoff".equals(fieldName)) {
                ensureNoSmoothing(suggestion);
                double theDiscount = 0.4;
                while ((token = parser.nextToken()) != Token.END_OBJECT) {
                    if (token == XContentParser.Token.FIELD_NAME) {
                        fieldName = parser.currentName();
                    }
                    if (token.isValue()) {
                        if ("discount".equals(fieldName)) {
                            theDiscount = parser.doubleValue();
                        }
                    }
                }
                final double discount = theDiscount;
                suggestion.setModel(new WordScorer.WordScorerFactory() {
                    @Override
                    public WordScorer newScorer(IndexReader reader, String field, double realWordLikelyhood, BytesRef separator) throws IOException {
                        return new StupidBackoffScorer(reader, field, realWordLikelyhood, separator, discount);
                    }
                });
            } else {
                throw new ElasticSearchIllegalArgumentException("suggester[phrase] doesn't support object field [" + fieldName + "]");
            }
        } else {
            throw new ElasticSearchIllegalArgumentException("suggester[phrase] doesn't support field [" + fieldName + "]");
        }
    }

    // defaults and validation once the request body has been consumed
    if (suggestion.getField() == null) {
        throw new ElasticSearchIllegalArgumentException("The required field option is missing");
    }
    if (suggestion.model() == null) {
        suggestion.setModel(LaplaceScorer.FACTORY);
    }
    if (!gramSizeSet || suggestion.generators().isEmpty()) {
        final ShingleTokenFilterFactory shingleFilterFactory = SuggestUtils.getShingleFilterFactory(
                suggestion.getAnalyzer() == null ? context.mapperService().fieldSearchAnalyzer(suggestion.getField()) : suggestion.getAnalyzer());
        if (!gramSizeSet) {
            // try to detect the shingle size
            if (shingleFilterFactory != null) {
                suggestion.setGramSize(shingleFilterFactory.getMaxShingleSize());
                if (suggestion.getAnalyzer() == null && shingleFilterFactory.getMinShingleSize() > 1 && !shingleFilterFactory.getOutputUnigrams()) {
                    throw new ElasticSearchIllegalArgumentException("The default analyzer for field: [" + suggestion.getField()
                            + "] doesn't emit unigrams. If this is intentional try to set the analyzer explicitly");
                }
            }
        }
        if (suggestion.generators().isEmpty()) {
            if (shingleFilterFactory != null && shingleFilterFactory.getMinShingleSize() > 1
                    && !shingleFilterFactory.getOutputUnigrams() && suggestion.getRequireUnigram()) {
                throw new ElasticSearchIllegalArgumentException("The default candidate generator for phrase suggest can't operate on field: ["
                        + suggestion.getField() + "] since it doesn't emit unigrams. If this is intentional try to set the candidate generator field explicitly");
            }
            // use a default generator on the same field
            DirectCandidateGenerator generator = new DirectCandidateGenerator();
            generator.setField(suggestion.getField());
            suggestion.addGenerator(generator);
        }
    }
    return suggestion;
}
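The three smoothing branches above only collect parameters; the actual probability mixing happens inside the scorer implementations. As a rough illustration of what linear-interpolation smoothing computes, the following minimal sketch combines trigram, bigram, and unigram estimates with the same lambdas (and the same sum-to-one constraint) that the parser enforces. The counts and class name are hypothetical; this is not the LinearInterpoatingScorer code.

// Minimal, self-contained sketch of linear-interpolation smoothing.
// NOT the LinearInterpoatingScorer implementation; all counts are made up for illustration.
public class LinearInterpolationSketch {
    public static void main(String[] args) {
        // Hypothetical n-gram counts for scoring the word "baz" after "foo bar".
        double trigramCount = 2, bigramContextCount = 10;   // count("foo bar baz") / count("foo bar")
        double bigramCount = 12, unigramContextCount = 40;  // count("bar baz") / count("bar")
        double unigramCount = 50, totalTerms = 10000;       // count("baz") / total term count

        // Mirrors trigram_lambda / bigram_lambda / unigram_lambda; must sum to 1,
        // which is exactly what the parser checks before building the scorer.
        double trigramLambda = 0.7, bigramLambda = 0.2, unigramLambda = 0.1;

        double p = trigramLambda * (trigramCount / bigramContextCount)
                 + bigramLambda * (bigramCount / unigramContextCount)
                 + unigramLambda * (unigramCount / totalTerms);
        System.out.println("interpolated P(baz | foo bar) = " + p);
    }
}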
/*
 * More ideas:
 * - add the ability to find whitespace problems -> we could build a poor man's decompounder with our index based on an automaton?
 * - add the ability to build different error models, maybe based on a confusion matrix?
 * - try to combine a token with its subsequent token to find / detect word splits (optional)
 *   - for this to work we need some way to define the position length of a candidate
 * - phonetic filters could be interesting here too for candidate selection
 */
@Override
public Suggestion<? extends Entry<? extends Option>> innerExecute(String name, PhraseSuggestionContext suggestion,
        IndexSearcher searcher, CharsRefBuilder spare) throws IOException {
    double realWordErrorLikelihood = suggestion.realworldErrorLikelyhood();
    final PhraseSuggestion response = new PhraseSuggestion(name, suggestion.getSize());
    final IndexReader indexReader = searcher.getIndexReader();
    List<PhraseSuggestionContext.DirectCandidateGenerator> generators = suggestion.generators();
    final int numGenerators = generators.size();
    final List<CandidateGenerator> gens = new ArrayList<>(generators.size());
    // build one candidate generator per configured field, skipping fields with no terms
    for (int i = 0; i < numGenerators; i++) {
        PhraseSuggestionContext.DirectCandidateGenerator generator = generators.get(i);
        DirectSpellChecker directSpellChecker = SuggestUtils.getDirectSpellChecker(generator);
        Terms terms = MultiFields.getTerms(indexReader, generator.field());
        if (terms != null) {
            gens.add(new DirectCandidateGenerator(directSpellChecker, generator.field(), generator.suggestMode(),
                    indexReader, realWordErrorLikelihood, generator.size(), generator.preFilter(), generator.postFilter(), terms));
        }
    }
    final String suggestField = suggestion.getField();
    final Terms suggestTerms = MultiFields.getTerms(indexReader, suggestField);
    if (gens.size() > 0 && suggestTerms != null) {
        // run the noisy-channel checker over the analyzed input using the configured word scorer
        final NoisyChannelSpellChecker checker = new NoisyChannelSpellChecker(realWordErrorLikelihood, suggestion.getRequireUnigram(), suggestion.getTokenLimit());
        final BytesRef separator = suggestion.separator();
        TokenStream stream = checker.tokenStream(suggestion.getAnalyzer(), suggestion.getText(), spare, suggestion.getField());

        WordScorer wordScorer = suggestion.model().newScorer(indexReader, suggestTerms, suggestField, realWordErrorLikelihood, separator);
        Result checkerResult = checker.getCorrections(stream,
                new MultiCandidateGeneratorWrapper(suggestion.getShardSize(), gens.toArray(new CandidateGenerator[gens.size()])),
                suggestion.maxErrors(), suggestion.getShardSize(), wordScorer, suggestion.confidence(), suggestion.gramSize());

        PhraseSuggestion.Entry resultEntry = buildResultEntry(suggestion, spare, checkerResult.cutoffScore);
        response.addTerm(resultEntry);

        final BytesRefBuilder byteSpare = new BytesRefBuilder();
        final EarlyTerminatingCollector collector = Lucene.createExistsCollector();
        final CompiledScript collateScript;
        if (suggestion.getCollateQueryScript() != null) {
            collateScript = suggestion.getCollateQueryScript();
        } else if (suggestion.getCollateFilterScript() != null) {
            collateScript = suggestion.getCollateFilterScript();
        } else {
            collateScript = null;
        }
        final boolean collatePrune = (collateScript != null) && suggestion.collatePrune();
        for (int i = 0; i < checkerResult.corrections.length; i++) {
            Correction correction = checkerResult.corrections[i];
            spare.copyUTF8Bytes(correction.join(SEPARATOR, byteSpare, null, null));
            boolean collateMatch = true;
            if (collateScript != null) {
                // Checks if the template query collateScript yields any documents
                // from the index for a correction; collateMatch is updated accordingly.
                final Map<String, Object> vars = suggestion.getCollateScriptParams();
                vars.put(SUGGESTION_TEMPLATE_VAR_NAME, spare.toString());
                final ExecutableScript executable = scriptService.executable(collateScript, vars);
                final BytesReference querySource = (BytesReference) executable.run();
                final ParsedQuery parsedQuery;
                if (suggestion.getCollateFilterScript() != null) {
                    parsedQuery = suggestion.getQueryParserService().parse(
                            QueryBuilders.constantScoreQuery(QueryBuilders.wrapperQuery(querySource)));
                } else {
                    parsedQuery = suggestion.getQueryParserService().parse(querySource);
                }
                collateMatch = Lucene.exists(searcher, parsedQuery.query(), collector);
            }
            if (!collateMatch && !collatePrune) {
                continue;
            }
            Text phrase = new StringText(spare.toString());
            Text highlighted = null;
            if (suggestion.getPreTag() != null) {
                spare.copyUTF8Bytes(correction.join(SEPARATOR, byteSpare, suggestion.getPreTag(), suggestion.getPostTag()));
                highlighted = new StringText(spare.toString());
            }
            if (collatePrune) {
                resultEntry.addOption(new Suggestion.Entry.Option(phrase, highlighted, (float) (correction.score), collateMatch));
            } else {
                resultEntry.addOption(new Suggestion.Entry.Option(phrase, highlighted, (float) (correction.score)));
            }
        }
    } else {
        response.addTerm(buildResultEntry(suggestion, spare, Double.MIN_VALUE));
    }
    return response;
}
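For intuition about the ranking the checker performs above: a noisy-channel model weighs how plausible an edit is (the channel, driven by the candidate generators and real_word_error_likelihood) against how likely the resulting phrase is (the language model, i.e. the smoothed word scorer). The sketch below illustrates that trade-off with hypothetical, precomputed scores; it is not the NoisyChannelSpellChecker implementation.

// Minimal sketch of noisy-channel ranking with made-up scores (NOT NoisyChannelSpellChecker).
public class NoisyChannelSketch {

    static class Candidate {
        final String phrase;
        final double channelScore;       // how plausible the edits are (error model)
        final double languageModelScore; // how likely the phrase is (smoothed n-gram model)

        Candidate(String phrase, double channelScore, double languageModelScore) {
            this.phrase = phrase;
            this.channelScore = channelScore;
            this.languageModelScore = languageModelScore;
        }
    }

    public static void main(String[] args) {
        Candidate[] candidates = {
                new Candidate("noble prize", 0.9, 1e-7), // keep the input as-is
                new Candidate("nobel prize", 0.6, 1e-4), // correction with a much better LM score
                new Candidate("noble price", 0.7, 1e-6)
        };
        Candidate best = candidates[0];
        for (Candidate c : candidates) {
            double score = c.channelScore * c.languageModelScore;
            System.out.println(c.phrase + " -> " + score);
            if (score > best.channelScore * best.languageModelScore) {
                best = c;
            }
        }
        // A correction only outranks the original input when its language-model gain
        // outweighs the edit penalty.
        System.out.println("best correction: " + best.phrase);
    }
}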
@Test
public void testDuellCompletions() throws IOException, NoSuchFieldException, SecurityException, IllegalArgumentException, IllegalAccessException {
    final boolean preserveSeparators = getRandom().nextBoolean();
    final boolean preservePositionIncrements = getRandom().nextBoolean();
    final boolean usePayloads = getRandom().nextBoolean();
    final int options = preserveSeparators ? AnalyzingSuggester.PRESERVE_SEP : 0;

    XAnalyzingSuggester reference = new XAnalyzingSuggester(new StandardAnalyzer(), null, new StandardAnalyzer(),
            options, 256, -1, preservePositionIncrements, null, false, 1, XAnalyzingSuggester.SEP_LABEL,
            XAnalyzingSuggester.PAYLOAD_SEP, XAnalyzingSuggester.END_BYTE, XAnalyzingSuggester.HOLE_CHARACTER);
    // sample random titles and weights as suggester inputs
    LineFileDocs docs = new LineFileDocs(getRandom());
    int num = scaledRandomIntBetween(150, 300);
    final String[] titles = new String[num];
    final long[] weights = new long[num];
    for (int i = 0; i < titles.length; i++) {
        Document nextDoc = docs.nextDoc();
        IndexableField field = nextDoc.getField("title");
        titles[i] = field.stringValue();
        weights[i] = between(0, 100);
    }
    docs.close();
    final InputIterator primaryIter = new InputIterator() {
        int index = 0;
        long currentWeight = -1;

        @Override
        public BytesRef next() throws IOException {
            if (index < titles.length) {
                currentWeight = weights[index];
                return new BytesRef(titles[index++]);
            }
            return null;
        }

        @Override
        public long weight() {
            return currentWeight;
        }

        @Override
        public BytesRef payload() {
            return null;
        }

        @Override
        public boolean hasPayloads() {
            return false;
        }

        @Override
        public Set<BytesRef> contexts() {
            return null;
        }

        @Override
        public boolean hasContexts() {
            return false;
        }
    };
    InputIterator iter;
    if (usePayloads) {
        // wrap the primary iterator so each entry carries its weight as a payload
        iter = new InputIterator() {
            @Override
            public long weight() {
                return primaryIter.weight();
            }

            @Override
            public BytesRef next() throws IOException {
                return primaryIter.next();
            }

            @Override
            public BytesRef payload() {
                return new BytesRef(Long.toString(weight()));
            }

            @Override
            public boolean hasPayloads() {
                return true;
            }

            @Override
            public Set<BytesRef> contexts() {
                return null;
            }

            @Override
            public boolean hasContexts() {
                return false;
            }
        };
    } else {
        iter = primaryIter;
    }
    reference.build(iter);
    PostingsFormat provider = PostingsFormat.forName(Lucene.LATEST_POSTINGS_FORMAT);
    NamedAnalyzer namedAnalzyer = new NamedAnalyzer("foo", new StandardAnalyzer());
    final CompletionFieldMapper mapper = new CompletionFieldMapper(new Names("foo"), namedAnalzyer, namedAnalzyer,
            provider, null, usePayloads, preserveSeparators, preservePositionIncrements, Integer.MAX_VALUE,
            indexSettings, AbstractFieldMapper.MultiFields.empty(), null, ContextMapping.EMPTY_MAPPING);
    Lookup buildAnalyzingLookup = buildAnalyzingLookup(mapper, titles, titles, weights);
    Field field = buildAnalyzingLookup.getClass().getDeclaredField("maxAnalyzedPathsForOneInput");
    field.setAccessible(true);
    Field refField = reference.getClass().getDeclaredField("maxAnalyzedPathsForOneInput");
    refField.setAccessible(true);
    assertThat(refField.get(reference), equalTo(field.get(buildAnalyzingLookup)));

    // compare lookups for random prefixes of the first analyzed term of each title
    for (int i = 0; i < titles.length; i++) {
        int res = between(1, 10);
        final StringBuilder builder = new StringBuilder();
        SuggestUtils.analyze(namedAnalzyer.tokenStream("foo", titles[i]), new SuggestUtils.TokenConsumer() {
            @Override
            public void nextToken() throws IOException {
                if (builder.length() == 0) {
                    builder.append(this.charTermAttr.toString());
                }
            }
        });
        String firstTerm = builder.toString();
        String prefix = firstTerm.isEmpty() ? "" : firstTerm.substring(0, between(1, firstTerm.length()));
        List<LookupResult> refLookup = reference.lookup(prefix, false, res);
        List<LookupResult> lookup = buildAnalyzingLookup.lookup(prefix, false, res);
        assertThat(refLookup.toString(), lookup.size(), equalTo(refLookup.size()));
        for (int j = 0; j < refLookup.size(); j++) {
            assertThat(lookup.get(j).key, equalTo(refLookup.get(j).key));
            assertThat("prefix: " + prefix + " " + j + " -- mismatch cost: " + lookup.get(j).key + " - " + lookup.get(j).value
                    + " | " + refLookup.get(j).key + " - " + refLookup.get(j).value,
                    lookup.get(j).value, equalTo(refLookup.get(j).value));
            assertThat(lookup.get(j).payload, equalTo(refLookup.get(j).payload));
            if (usePayloads) {
                assertThat(lookup.get(j).payload.utf8ToString(), equalTo(Long.toString(lookup.get(j).value)));
            }
        }
    }
}