private void runTest(Class<? extends Lookup> lookupClass, boolean supportsExactWeights) throws Exception { // Add all input keys. Lookup lookup = lookupClass.newInstance(); TermFreq[] keys = new TermFreq[this.keys.length]; for (int i = 0; i < keys.length; i++) keys[i] = new TermFreq(this.keys[i], i); lookup.build(new TermFreqArrayIterator(keys)); // Store the suggester. File storeDir = TEMP_DIR; lookup.store(new FileOutputStream(new File(storeDir, "lookup.dat"))); // Re-read it from disk. lookup = lookupClass.newInstance(); lookup.load(new FileInputStream(new File(storeDir, "lookup.dat"))); // Assert validity. Random random = random(); long previous = Long.MIN_VALUE; for (TermFreq k : keys) { List<LookupResult> list = lookup.lookup(_TestUtil.bytesToCharSequence(k.term, random), false, 1); assertEquals(1, list.size()); LookupResult lookupResult = list.get(0); assertNotNull(k.term.utf8ToString(), lookupResult.key); if (supportsExactWeights) { assertEquals(k.term.utf8ToString(), k.v, lookupResult.value); } else { assertTrue(lookupResult.value + ">=" + previous, lookupResult.value >= previous); previous = lookupResult.value; } } }
@Override public void build(SolrCore core, SolrIndexSearcher searcher) throws IOException { LOG.info("build()"); if (sourceLocation == null) { reader = searcher.getIndexReader(); dictionary = new HighFrequencyDictionary(reader, field, threshold); } else { try { final String fileDelim = ","; if (sourceLocation.contains(fileDelim)) { String[] files = sourceLocation.split(fileDelim); Reader[] readers = new Reader[files.length]; for (int i = 0; i < files.length; i++) { Reader reader = new InputStreamReader( core.getResourceLoader().openResource(files[i]), IOUtils.CHARSET_UTF_8); readers[i] = reader; } dictionary = new MultipleFileDictionary(readers); } else { dictionary = new FileDictionary( new InputStreamReader( core.getResourceLoader().openResource(sourceLocation), IOUtils.CHARSET_UTF_8)); } } catch (UnsupportedEncodingException e) { // should not happen LOG.error("should not happen", e); } } lookup.build(dictionary); if (storeDir != null) { File target = new File(storeDir, factory.storeFileName()); if (!lookup.store(new FileOutputStream(target))) { if (sourceLocation == null) { assert reader != null && field != null; LOG.error( "Store Lookup build from index on field: " + field + " failed reader has: " + reader.maxDoc() + " docs"); } else { LOG.error("Store Lookup build from sourceloaction: " + sourceLocation + " failed"); } } else { LOG.info("Stored suggest data to: " + target.getAbsolutePath()); } } }
/**
 * Writes suggest data with a randomly chosen provider (old V1 format or current), reloads
 * it with the current provider, and verifies a lookup against the reloaded data — i.e.
 * checks backward compatibility of the completion postings format.
 */
@Test
public void testCompletionPostingsFormat() throws IOException {
  AnalyzingCompletionLookupProviderV1 providerV1 =
      new AnalyzingCompletionLookupProviderV1(true, false, true, true);
  AnalyzingCompletionLookupProvider currentProvider =
      new AnalyzingCompletionLookupProvider(true, false, true, true);
  List<Completion090PostingsFormat.CompletionLookupProvider> providers =
      Lists.newArrayList(providerV1, currentProvider);
  // Randomly pick which format version the data is written in; it must always be readable
  // by the current provider.
  Completion090PostingsFormat.CompletionLookupProvider randomProvider =
      providers.get(getRandom().nextInt(providers.size()));
  RAMDirectory dir = new RAMDirectory();
  writeData(dir, randomProvider);
  IndexInput input = dir.openInput("foo.txt", IOContext.DEFAULT);
  LookupFactory load = currentProvider.load(input);
  // Fix: the IndexInput was previously never closed. load() appears to consume the input
  // eagerly — NOTE(review): confirm the provider does not hold the input open lazily.
  input.close();
  PostingsFormat format = PostingsFormat.forName(Lucene.LATEST_POSTINGS_FORMAT);
  NamedAnalyzer analyzer = new NamedAnalyzer("foo", new StandardAnalyzer());
  Lookup lookup =
      load.getLookup(
          new CompletionFieldMapper(
              new Names("foo"),
              analyzer,
              analyzer,
              format,
              null,
              true,
              true,
              true,
              Integer.MAX_VALUE,
              indexSettings,
              AbstractFieldMapper.MultiFields.empty(),
              null,
              ContextMapping.EMPTY_MAPPING),
          new CompletionSuggestionContext(null));
  List<LookupResult> result = lookup.lookup("ge", false, 10);
  assertThat(result.get(0).key.toString(), equalTo("Generator - Foo Fighters"));
  assertThat(result.get(0).payload.utf8ToString(), equalTo("id:10"));
  dir.close();
}
@Override public String init(NamedList config, SolrCore core) { LOG.info("init: " + config); String name = super.init(config, core); threshold = config.get(THRESHOLD_TOKEN_FREQUENCY) == null ? 0.0f : (Float) config.get(THRESHOLD_TOKEN_FREQUENCY); sourceLocation = (String) config.get(LOCATION); lookupImpl = (String) config.get(LOOKUP_IMPL); IndexSchema schema = core.getLatestSchema(); suggestionAnalyzerFieldTypeName = (String) config.get(SUGGESTION_ANALYZER_FIELDTYPE); if (schema.getFieldTypes().containsKey(suggestionAnalyzerFieldTypeName)) { FieldType fieldType = schema.getFieldTypes().get(suggestionAnalyzerFieldTypeName); suggestionAnalyzer = fieldType.getQueryAnalyzer(); } // support the old classnames without -Factory for config file backwards compatibility. if (lookupImpl == null || "org.apache.solr.spelling.suggest.jaspell.JaspellLookup".equals(lookupImpl)) { lookupImpl = JaspellLookupFactory.class.getName(); } else if ("org.apache.solr.spelling.suggest.tst.TSTLookup".equals(lookupImpl)) { lookupImpl = TSTLookupFactory.class.getName(); } else if ("org.apache.solr.spelling.suggest.fst.FSTLookup".equals(lookupImpl)) { lookupImpl = FSTLookupFactory.class.getName(); } factory = core.getResourceLoader().newInstance(lookupImpl, LookupFactory.class); lookup = factory.create(config, core); String store = (String) config.get(STORE_DIR); if (store != null) { storeDir = new File(store); if (!storeDir.isAbsolute()) { storeDir = new File(core.getDataDir() + File.separator + storeDir); } if (!storeDir.exists()) { storeDir.mkdirs(); } else { // attempt reload of the stored lookup try { lookup.load(new FileInputStream(new File(storeDir, factory.storeFileName()))); } catch (IOException e) { LOG.warn("Loading stored lookup data failed", e); } } } return name; }
/**
 * Returns autocomplete suggestions for the last word of {@code query}, each prefixed with
 * the words preceding it, based on a TST lookup built over page titles in the index.
 *
 * <p>NOTE(review): the index reader and lookup are rebuilt on every call, which is
 * expensive — consider caching them if this is on a hot path.
 *
 * @param query user input; only the text after the last space is completed
 * @return suggestions (never null); empty on I/O failure
 */
public List<String> autocomplete(String query) {
  List<String> result = new ArrayList<String>();
  String prefix = "";
  query = query.trim();
  if (query.contains(" ")) {
    // Complete only the last word; everything before it is kept as a literal prefix.
    prefix = query.substring(0, query.lastIndexOf(' ')) + ' ';
    query = query.substring(query.lastIndexOf(' ') + 1);
  }
  try {
    // Fix: the Directory and IndexReader were previously never closed, leaking file
    // handles on every call.
    Directory indexPathDir = FSDirectory.open(new File(SearcherConfiguration.getIndexPath()));
    try {
      IndexReader ir = DirectoryReader.open(indexPathDir);
      try {
        Lookup lookup = new TSTLookup();
        Dictionary dictionary = new LuceneDictionary(ir, WebPage.TITLE);
        lookup.build(dictionary);
        List<LookupResult> resultsList = lookup.lookup(query, false, 30);
        for (LookupResult lr : resultsList) {
          String v = lr.key.toString();
          // Don't suggest the exact word the user already typed.
          if (!v.equals(query)) {
            result.add(prefix + v);
          }
        }
      } finally {
        ir.close();
      }
    } finally {
      indexPathDir.close();
    }
  } catch (IOException e) {
    e.printStackTrace();
  }
  return result;
}
@Override public void reload(SolrCore core, SolrIndexSearcher searcher) throws IOException { LOG.info("reload()"); if (dictionary == null && storeDir != null) { // this may be a firstSearcher event, try loading it FileInputStream is = new FileInputStream(new File(storeDir, factory.storeFileName())); try { if (lookup.load(is)) { return; // loaded ok } } finally { IOUtils.closeWhileHandlingException(is); } LOG.debug("load failed, need to build Lookup again"); } // loading was unsuccessful - build it again build(core, searcher); }
/**
 * Executes a completion suggestion: for each index segment, obtains the segment's
 * completion {@link Lookup}, collects options matching the query text, merges duplicate
 * keys across segments (keeping the highest score), and returns the top {@code size}
 * options ordered by {@code scoreComparator}.
 *
 * @throws ElasticsearchException if the target field is not mapped as a completion field
 * @throws IOException on index access failure
 */
@Override
protected Suggest.Suggestion<
        ? extends Suggest.Suggestion.Entry<? extends Suggest.Suggestion.Entry.Option>>
    innerExecute(
        String name,
        CompletionSuggestionContext suggestionContext,
        IndexReader indexReader,
        CharsRef spare)
        throws IOException {
  // Completion suggestions are only valid on completion-mapped fields.
  if (suggestionContext.mapper() == null
      || !(suggestionContext.mapper() instanceof CompletionFieldMapper)) {
    throw new ElasticsearchException(
        "Field [" + suggestionContext.getField() + "] is not a completion suggest field");
  }
  CompletionSuggestion completionSuggestion =
      new CompletionSuggestion(name, suggestionContext.getSize());
  // Decode the UTF-8 query text into the reusable CharsRef scratch buffer.
  UnicodeUtil.UTF8toUTF16(suggestionContext.getText(), spare);
  CompletionSuggestion.Entry completionSuggestEntry =
      new CompletionSuggestion.Entry(new StringText(spare.toString()), 0, spare.length());
  completionSuggestion.addTerm(completionSuggestEntry);
  String fieldName = suggestionContext.getField();
  // Deduplicates options across segments by suggestion key; capacity assumes each segment
  // may contribute up to `size` distinct options.
  Map<String, CompletionSuggestion.Entry.Option> results =
      Maps.newHashMapWithExpectedSize(
          indexReader.leaves().size() * suggestionContext.getSize());
  for (AtomicReaderContext atomicReaderContext : indexReader.leaves()) {
    AtomicReader atomicReader = atomicReaderContext.reader();
    Terms terms = atomicReader.fields().terms(fieldName);
    if (terms instanceof Completion090PostingsFormat.CompletionTerms) {
      // Only segments written with the completion postings format carry a lookup.
      final Completion090PostingsFormat.CompletionTerms lookupTerms =
          (Completion090PostingsFormat.CompletionTerms) terms;
      final Lookup lookup =
          lookupTerms.getLookup(suggestionContext.mapper(), suggestionContext);
      if (lookup == null) {
        // we don't have a lookup for this segment.. this might be possible if a merge dropped all
        // docs from the segment that had a value in this segment.
        continue;
      }
      List<Lookup.LookupResult> lookupResults =
          lookup.lookup(spare, false, suggestionContext.getSize());
      for (Lookup.LookupResult res : lookupResults) {
        final String key = res.key.toString();
        final float score = res.value;
        final Option value = results.get(key);
        if (value == null) {
          // First time this key is seen: record it with its score and payload.
          final Option option =
              new CompletionSuggestion.Entry.Option(
                  new StringText(key),
                  score,
                  res.payload == null ? null : new BytesArray(res.payload));
          results.put(key, option);
        } else if (value.getScore() < score) {
          // Same key from another segment with a higher score: keep the better variant,
          // replacing both score and payload.
          value.setScore(score);
          value.setPayload(res.payload == null ? null : new BytesArray(res.payload));
        }
      }
    }
  }
  // Sort merged options with scoreComparator (declared elsewhere on this class — presumably
  // highest score first) and keep only the requested number.
  final List<CompletionSuggestion.Entry.Option> options =
      new ArrayList<CompletionSuggestion.Entry.Option>(results.values());
  CollectionUtil.introSort(options, scoreComparator);
  int optionCount = Math.min(suggestionContext.getSize(), options.size());
  for (int i = 0; i < optionCount; i++) {
    completionSuggestEntry.addOption(options.get(i));
  }
  return completionSuggestion;
}
@Override public SpellingResult getSuggestions(SpellingOptions options) throws IOException { LOG.debug("getSuggestions: " + options.tokens); if (lookup == null) { LOG.info("Lookup is null - invoke spellchecker.build first"); return EMPTY_RESULT; } SpellingResult res = new SpellingResult(); CharsRef scratch = new CharsRef(); for (Token currentToken : options.tokens) { scratch.chars = currentToken.buffer(); scratch.offset = 0; scratch.length = currentToken.length(); boolean onlyMorePopular = (options.suggestMode == SuggestMode.SUGGEST_MORE_POPULAR) && !(lookup instanceof WFSTCompletionLookup) && !(lookup instanceof AnalyzingSuggester); // get more than the requested suggestions as a lot get collapsed by the corrections List<LookupResult> suggestions = lookup.lookup(scratch, onlyMorePopular, options.count * 10); if (suggestions == null || suggestions.size() == 0) { continue; } if (options.suggestMode != SuggestMode.SUGGEST_MORE_POPULAR) { Collections.sort(suggestions); } final LinkedHashMap<String, Integer> lhm = new LinkedHashMap<String, Integer>(); for (LookupResult lr : suggestions) { String suggestion = lr.key.toString(); if (this.suggestionAnalyzer != null) { String correction = getAnalyzerResult(suggestion); // multiple could map to the same, so don't repeat suggestions if (!isStringNullOrEmpty(correction)) { if (lhm.containsKey(correction)) { lhm.put(correction, lhm.get(correction) + (int) lr.value); } else { lhm.put(correction, (int) lr.value); } } } else { lhm.put(suggestion, (int) lr.value); } if (lhm.size() >= options.count) { break; } } // sort by new doc frequency Map<String, Integer> orderedMap = null; if (options.suggestMode != SuggestMode.SUGGEST_MORE_POPULAR) { // retain the sort order from above orderedMap = lhm; } else { orderedMap = new TreeMap<String, Integer>( new Comparator<String>() { @Override public int compare(String s1, String s2) { return lhm.get(s2).compareTo(lhm.get(s1)); } }); orderedMap.putAll(lhm); } for (Map.Entry<String, Integer> 
entry : orderedMap.entrySet()) { res.add(currentToken, entry.getKey(), entry.getValue()); } } return res; }
/**
 * Duel test: builds the same randomized completion data both through a reference
 * {@link XAnalyzingSuggester} and through the completion field mapper pipeline, then
 * asserts both produce identical lookups (keys, weights, payloads) for random prefixes.
 */
@Test
public void testDuellCompletions()
    throws IOException,
        NoSuchFieldException,
        SecurityException,
        IllegalArgumentException,
        IllegalAccessException {
  // Randomize the suggester options so every run covers a different configuration.
  final boolean preserveSeparators = getRandom().nextBoolean();
  final boolean preservePositionIncrements = getRandom().nextBoolean();
  final boolean usePayloads = getRandom().nextBoolean();
  final int options = preserveSeparators ? AnalyzingSuggester.PRESERVE_SEP : 0;

  // Reference implementation built directly, bypassing the postings format.
  XAnalyzingSuggester reference =
      new XAnalyzingSuggester(
          new StandardAnalyzer(),
          null,
          new StandardAnalyzer(),
          options,
          256,
          -1,
          preservePositionIncrements,
          null,
          false,
          1,
          XAnalyzingSuggester.SEP_LABEL,
          XAnalyzingSuggester.PAYLOAD_SEP,
          XAnalyzingSuggester.END_BYTE,
          XAnalyzingSuggester.HOLE_CHARACTER);
  LineFileDocs docs = new LineFileDocs(getRandom());
  int num = scaledRandomIntBetween(150, 300);
  final String[] titles = new String[num];
  final long[] weights = new long[num];
  // Sample random titles and random weights as the shared input corpus.
  for (int i = 0; i < titles.length; i++) {
    Document nextDoc = docs.nextDoc();
    IndexableField field = nextDoc.getField("title");
    titles[i] = field.stringValue();
    weights[i] = between(0, 100);
  }
  docs.close();

  // Iterator feeding (title, weight) pairs to the reference suggester; no payloads.
  final InputIterator primaryIter =
      new InputIterator() {
        int index = 0;
        long currentWeight = -1;

        @Override
        public BytesRef next() throws IOException {
          if (index < titles.length) {
            // weight() must report the weight of the entry most recently returned by next().
            currentWeight = weights[index];
            return new BytesRef(titles[index++]);
          }
          return null;
        }

        @Override
        public long weight() {
          return currentWeight;
        }

        @Override
        public BytesRef payload() {
          return null;
        }

        @Override
        public boolean hasPayloads() {
          return false;
        }

        @Override
        public Set<BytesRef> contexts() {
          return null;
        }

        @Override
        public boolean hasContexts() {
          return false;
        }
      };
  InputIterator iter;
  if (usePayloads) {
    // Wrap the primary iterator to synthesize a payload that encodes each entry's weight,
    // so payload round-tripping can be asserted below.
    iter =
        new InputIterator() {
          @Override
          public long weight() {
            return primaryIter.weight();
          }

          @Override
          public BytesRef next() throws IOException {
            return primaryIter.next();
          }

          @Override
          public BytesRef payload() {
            return new BytesRef(Long.toString(weight()));
          }

          @Override
          public boolean hasPayloads() {
            return true;
          }

          @Override
          public Set<BytesRef> contexts() {
            return null;
          }

          @Override
          public boolean hasContexts() {
            return false;
          }
        };
  } else {
    iter = primaryIter;
  }
  reference.build(iter);

  // Build the same data through the CompletionFieldMapper path.
  PostingsFormat provider = PostingsFormat.forName(Lucene.LATEST_POSTINGS_FORMAT);
  NamedAnalyzer namedAnalzyer = new NamedAnalyzer("foo", new StandardAnalyzer());
  final CompletionFieldMapper mapper =
      new CompletionFieldMapper(
          new Names("foo"),
          namedAnalzyer,
          namedAnalzyer,
          provider,
          null,
          usePayloads,
          preserveSeparators,
          preservePositionIncrements,
          Integer.MAX_VALUE,
          indexSettings,
          AbstractFieldMapper.MultiFields.empty(),
          null,
          ContextMapping.EMPTY_MAPPING);
  Lookup buildAnalyzingLookup = buildAnalyzingLookup(mapper, titles, titles, weights);
  // Compare the private maxAnalyzedPathsForOneInput fields via reflection — both builds
  // must have analyzed the inputs identically.
  Field field = buildAnalyzingLookup.getClass().getDeclaredField("maxAnalyzedPathsForOneInput");
  field.setAccessible(true);
  Field refField = reference.getClass().getDeclaredField("maxAnalyzedPathsForOneInput");
  refField.setAccessible(true);
  assertThat(refField.get(reference), equalTo(field.get(buildAnalyzingLookup)));

  for (int i = 0; i < titles.length; i++) {
    int res = between(1, 10);
    // Extract the first analyzed token of the title to derive a realistic query prefix.
    final StringBuilder builder = new StringBuilder();
    SuggestUtils.analyze(
        namedAnalzyer.tokenStream("foo", titles[i]),
        new SuggestUtils.TokenConsumer() {
          @Override
          public void nextToken() throws IOException {
            if (builder.length() == 0) {
              builder.append(this.charTermAttr.toString());
            }
          }
        });
    String firstTerm = builder.toString();
    // Query with a random-length prefix of the first term (empty if the title analyzed away).
    String prefix = firstTerm.isEmpty() ? "" : firstTerm.substring(0, between(1, firstTerm.length()));
    List<LookupResult> refLookup = reference.lookup(prefix, false, res);
    List<LookupResult> lookup = buildAnalyzingLookup.lookup(prefix, false, res);
    assertThat(refLookup.toString(), lookup.size(), equalTo(refLookup.size()));
    for (int j = 0; j < refLookup.size(); j++) {
      // Both implementations must agree on key, weight, and payload, in the same order.
      assertThat(lookup.get(j).key, equalTo(refLookup.get(j).key));
      assertThat(
          "prefix: "
              + prefix
              + " "
              + j
              + " -- missmatch cost: "
              + lookup.get(j).key
              + " - "
              + lookup.get(j).value
              + " | "
              + refLookup.get(j).key
              + " - "
              + refLookup.get(j).value,
          lookup.get(j).value,
          equalTo(refLookup.get(j).value));
      assertThat(lookup.get(j).payload, equalTo(refLookup.get(j).payload));
      if (usePayloads) {
        // The synthesized payload encodes the weight as a decimal string (see iterator above).
        assertThat(
            lookup.get(j).payload.utf8ToString(), equalTo(Long.toString(lookup.get(j).value)));
      }
    }
  }
}