/**
 * Offers a candidate correction path to the bounded top-N priority queue.
 *
 * <p>The incoming {@code score} is in log space; it is converted to a linear
 * probability before being compared against {@code cutoffScore} and the
 * current queue contents. When the queue is full, the weakest entry is
 * overwritten in place and re-sifted instead of being removed and re-added.
 */
private void updateTop( CandidateSet[] candidates, Candidate[] path, PriorityQueue<Correction> corrections, double cutoffScore, double score) throws IOException {
  // Scores arrive in log space; convert back to a linear probability.
  final double linearScore = Math.exp(score);
  // Sanity check: the incremental score must agree with a full re-score of the path.
  assert Math.abs(linearScore - score(path, candidates)) < 0.00001;
  if (linearScore <= cutoffScore) {
    return; // below the confidence cutoff — never enters the queue
  }
  if (corrections.size() < maxNumCorrections) {
    // Queue not yet full: insert a defensive copy of the current path so later
    // mutations of `path` cannot corrupt the queued correction.
    final Candidate[] copy = new Candidate[candidates.length];
    System.arraycopy(path, 0, copy, 0, path.length);
    corrections.add(new Correction(linearScore, copy));
  } else if (corrections.top().compareTo(linearScore, path) < 0) {
    // Queue full and this path beats the weakest entry: overwrite the head
    // in place and let the heap re-sift it (avoids an alloc per replacement).
    final Correction weakest = corrections.top();
    System.arraycopy(path, 0, weakest.candidates, 0, path.length);
    weakest.score = linearScore;
    corrections.updateTop();
  }
}
/*
 * More ideas:
 * - add the ability to find whitespace problems -> we could build a poor man's
 *   decompounder with our index based on an automaton?
 * - add the ability to build different error models, maybe based on a confusion matrix?
 * - try to combine a token with its subsequent token to find / detect word splits (optional)
 *   - for this to work we need some way to define the position length of a candidate
 * - phonetic filters could be interesting here too, for candidate selection
 */
/**
 * Runs the phrase suggester for a single suggestion request.
 *
 * <p>Builds one {@code DirectCandidateGenerator} per configured generator (skipping
 * fields with no terms in the index), feeds the analyzed suggest text through the
 * noisy-channel spell checker, and turns each surviving correction into a suggestion
 * option — optionally filtered and/or pruned by a collate script executed against the
 * index.
 *
 * @param name       the name under which the suggestion is returned
 * @param suggestion per-request configuration (field, analyzer, scoring model,
 *                   collate scripts, highlighting tags, ...)
 * @param searcher   searcher over the shard this suggester runs on
 * @param spare      scratch buffer reused for UTF-8 -> char conversions
 * @return the populated {@link PhraseSuggestion}; contains a single (possibly empty) entry
 * @throws IOException on index access failure
 */
@Override
public Suggestion<? extends Entry<? extends Option>> innerExecute(
    String name, PhraseSuggestionContext suggestion, IndexSearcher searcher, CharsRefBuilder spare)
    throws IOException {
  double realWordErrorLikelihood = suggestion.realworldErrorLikelyhood();
  final PhraseSuggestion response = new PhraseSuggestion(name, suggestion.getSize());
  final IndexReader indexReader = searcher.getIndexReader();
  List<PhraseSuggestionContext.DirectCandidateGenerator> generators = suggestion.generators();
  final int numGenerators = generators.size();
  final List<CandidateGenerator> gens = new ArrayList<>(generators.size());
  // One candidate generator per configured source field; fields that have no
  // terms on this shard are silently skipped (terms == null below).
  for (int i = 0; i < numGenerators; i++) {
    PhraseSuggestionContext.DirectCandidateGenerator generator = generators.get(i);
    DirectSpellChecker directSpellChecker = SuggestUtils.getDirectSpellChecker(generator);
    Terms terms = MultiFields.getTerms(indexReader, generator.field());
    if (terms != null) {
      gens.add(
          new DirectCandidateGenerator(
              directSpellChecker,
              generator.field(),
              generator.suggestMode(),
              indexReader,
              realWordErrorLikelihood,
              generator.size(),
              generator.preFilter(),
              generator.postFilter(),
              terms));
    }
  }
  final String suggestField = suggestion.getField();
  final Terms suggestTerms = MultiFields.getTerms(indexReader, suggestField);
  // Only run the checker when there is at least one usable generator AND the
  // suggest field itself has terms; otherwise return a single empty entry.
  if (gens.size() > 0 && suggestTerms != null) {
    final NoisyChannelSpellChecker checker =
        new NoisyChannelSpellChecker(
            realWordErrorLikelihood, suggestion.getRequireUnigram(), suggestion.getTokenLimit());
    final BytesRef separator = suggestion.separator();
    // NOTE(review): `stream` is never closed here — presumably getCorrections()
    // consumes and closes it; confirm against the TokenStream workflow contract.
    TokenStream stream =
        checker.tokenStream(
            suggestion.getAnalyzer(), suggestion.getText(), spare, suggestion.getField());
    WordScorer wordScorer =
        suggestion
            .model()
            .newScorer(indexReader, suggestTerms, suggestField, realWordErrorLikelihood, separator);
    Result checkerResult =
        checker.getCorrections(
            stream,
            new MultiCandidateGeneratorWrapper(
                suggestion.getShardSize(), gens.toArray(new CandidateGenerator[gens.size()])),
            suggestion.maxErrors(),
            suggestion.getShardSize(),
            wordScorer,
            suggestion.confidence(),
            suggestion.gramSize());
    PhraseSuggestion.Entry resultEntry =
        buildResultEntry(suggestion, spare, checkerResult.cutoffScore);
    response.addTerm(resultEntry);
    final BytesRefBuilder byteSpare = new BytesRefBuilder();
    final EarlyTerminatingCollector collector = Lucene.createExistsCollector();
    // The collate query script takes precedence over the collate filter script;
    // null means no collation at all.
    final CompiledScript collateScript;
    if (suggestion.getCollateQueryScript() != null) {
      collateScript = suggestion.getCollateQueryScript();
    } else if (suggestion.getCollateFilterScript() != null) {
      collateScript = suggestion.getCollateFilterScript();
    } else {
      collateScript = null;
    }
    // Pruning only makes sense when a collate script exists: with prune enabled,
    // non-matching corrections are kept but flagged; without it they are dropped.
    final boolean collatePrune = (collateScript != null) && suggestion.collatePrune();
    for (int i = 0; i < checkerResult.corrections.length; i++) {
      Correction correction = checkerResult.corrections[i];
      spare.copyUTF8Bytes(correction.join(SEPARATOR, byteSpare, null, null));
      boolean collateMatch = true;
      if (collateScript != null) {
        // Checks if the template query collateScript yields any documents
        // from the index for a correction; collateMatch is updated accordingly.
        // NOTE(review): this mutates the map returned by getCollateScriptParams()
        // on every iteration — presumably it returns a per-call copy; verify.
        final Map<String, Object> vars = suggestion.getCollateScriptParams();
        vars.put(SUGGESTION_TEMPLATE_VAR_NAME, spare.toString());
        final ExecutableScript executable = scriptService.executable(collateScript, vars);
        final BytesReference querySource = (BytesReference) executable.run();
        final ParsedQuery parsedQuery;
        if (suggestion.getCollateFilterScript() != null) {
          // Filter script: wrap in a constant-score query so no scoring is done.
          parsedQuery =
              suggestion
                  .getQueryParserService()
                  .parse(
                      QueryBuilders.constantScoreQuery(QueryBuilders.wrapperQuery(querySource)));
        } else {
          parsedQuery = suggestion.getQueryParserService().parse(querySource);
        }
        collateMatch = Lucene.exists(searcher, parsedQuery.query(), collector);
      }
      // Without pruning, a non-matching correction is discarded outright.
      if (!collateMatch && !collatePrune) {
        continue;
      }
      Text phrase = new StringText(spare.toString());
      Text highlighted = null;
      if (suggestion.getPreTag() != null) {
        // Re-join the correction with highlight tags around changed tokens.
        spare.copyUTF8Bytes(
            correction.join(
                SEPARATOR, byteSpare, suggestion.getPreTag(), suggestion.getPostTag()));
        highlighted = new StringText(spare.toString());
      }
      if (collatePrune) {
        // Pruning mode carries the collateMatch flag into the option.
        resultEntry.addOption(
            new Suggestion.Entry.Option(
                phrase, highlighted, (float) (correction.score), collateMatch));
      } else {
        resultEntry.addOption(
            new Suggestion.Entry.Option(phrase, highlighted, (float) (correction.score)));
      }
    }
  } else {
    // No generators or no terms: emit an empty entry.
    // NOTE(review): Double.MIN_VALUE is the smallest POSITIVE double, not the most
    // negative value — confirm that buildResultEntry treats it as "no cutoff".
    response.addTerm(buildResultEntry(suggestion, spare, Double.MIN_VALUE));
  }
  return response;
}