Query createCandidateQuery(IndexReader indexReader) throws IOException { List<Term> extractedTerms = new ArrayList<>(); // include extractionResultField:failed, because docs with this term have no // extractedTermsField // and otherwise we would fail to return these docs. Docs that failed query term extraction // always need to be verified by MemoryIndex: extractedTerms.add(new Term(extractionResultField.name(), EXTRACTION_FAILED)); LeafReader reader = indexReader.leaves().get(0).reader(); Fields fields = reader.fields(); for (String field : fields) { Terms terms = fields.terms(field); if (terms == null) { continue; } BytesRef fieldBr = new BytesRef(field); TermsEnum tenum = terms.iterator(); for (BytesRef term = tenum.next(); term != null; term = tenum.next()) { BytesRefBuilder builder = new BytesRefBuilder(); builder.append(fieldBr); builder.append(FIELD_VALUE_SEPARATOR); builder.append(term); extractedTerms.add(new Term(queryTermsField.name(), builder.toBytesRef())); } } return new TermsQuery(extractedTerms); }
/** * Returns total in-heap bytes used by all suggesters. This method has CPU cost <code> * O(numIndexedFields)</code>. * * @param fieldNamePatterns if non-null, any completion field name matching any of these patterns * will break out its in-heap bytes separately in the returned {@link CompletionStats} */ public CompletionStats completionStats(IndexReader indexReader, String... fieldNamePatterns) { CompletionStats completionStats = new CompletionStats(); for (LeafReaderContext atomicReaderContext : indexReader.leaves()) { LeafReader atomicReader = atomicReaderContext.reader(); try { Fields fields = atomicReader.fields(); for (String fieldName : fields) { Terms terms = fields.terms(fieldName); if (terms instanceof CompletionTerms) { CompletionTerms completionTerms = (CompletionTerms) terms; completionStats.add(completionTerms.stats(fieldNamePatterns)); } } } catch (IOException ioe) { logger.error("Could not get completion stats", ioe); } } return completionStats; }
public static DocSet createDocSet(SolrIndexSearcher searcher, Term term) throws IOException { DirectoryReader reader = searcher.getRawReader(); // raw reader to avoid extra wrapping overhead int maxDoc = searcher.getIndexReader().maxDoc(); int smallSetSize = smallSetSize(maxDoc); String field = term.field(); BytesRef termVal = term.bytes(); int maxCount = 0; int firstReader = -1; List<LeafReaderContext> leaves = reader.leaves(); PostingsEnum[] postList = new PostingsEnum [leaves .size()]; // use array for slightly higher scanning cost, but fewer memory // allocations for (LeafReaderContext ctx : leaves) { assert leaves.get(ctx.ord) == ctx; LeafReader r = ctx.reader(); Fields f = r.fields(); Terms t = f.terms(field); if (t == null) continue; // field is missing TermsEnum te = t.iterator(); if (te.seekExact(termVal)) { maxCount += te.docFreq(); postList[ctx.ord] = te.postings(null, PostingsEnum.NONE); if (firstReader < 0) firstReader = ctx.ord; } } if (maxCount == 0) { return DocSet.EMPTY; } if (maxCount <= smallSetSize) { return createSmallSet(leaves, postList, maxCount, firstReader); } return createBigSet(leaves, postList, maxDoc, firstReader); }
public OrdWrappedTermsEnum(LeafReader reader) throws IOException { assert indexedTermsArray != null; assert 0 != indexedTermsArray.length; termsEnum = reader.fields().terms(field).iterator(); }
/** * Returns a list of terms in the specified field along with the corresponding count of documents * in the set that match that constraint. This method uses the FilterCache to get the intersection * count between <code>docs</code> and the DocSet for each term in the filter. * * @see FacetParams#FACET_LIMIT * @see FacetParams#FACET_ZEROS * @see FacetParams#FACET_MISSING */ public NamedList<Integer> getFacetTermEnumCounts( SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing, String sort, String prefix, String contains, boolean ignoreCase, SolrParams params) throws IOException { /* :TODO: potential optimization... * cache the Terms with the highest docFreq and try them first * don't enum if we get our max from them */ // Minimum term docFreq in order to use the filterCache for that term. int minDfFilterCache = global.getFieldInt(field, FacetParams.FACET_ENUM_CACHE_MINDF, 0); // make sure we have a set that is fast for random access, if we will use it for that DocSet fastForRandomSet = docs; if (minDfFilterCache > 0 && docs instanceof SortedIntDocSet) { SortedIntDocSet sset = (SortedIntDocSet) docs; fastForRandomSet = new HashDocSet(sset.getDocs(), 0, sset.size()); } IndexSchema schema = searcher.getSchema(); LeafReader r = searcher.getLeafReader(); FieldType ft = schema.getFieldType(field); boolean sortByCount = sort.equals("count") || sort.equals("true"); final int maxsize = limit >= 0 ? offset + limit : Integer.MAX_VALUE - 1; final BoundedTreeSet<CountPair<BytesRef, Integer>> queue = sortByCount ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(maxsize) : null; final NamedList<Integer> res = new NamedList<>(); int min = mincount - 1; // the smallest value in the top 'N' values int off = offset; int lim = limit >= 0 ? limit : Integer.MAX_VALUE; BytesRef prefixTermBytes = null; if (prefix != null) { String indexedPrefix = ft.toInternal(prefix); prefixTermBytes = new BytesRef(indexedPrefix); } Fields fields = r.fields(); Terms terms = fields == null ? null : fields.terms(field); TermsEnum termsEnum = null; SolrIndexSearcher.DocsEnumState deState = null; BytesRef term = null; if (terms != null) { termsEnum = terms.iterator(); // TODO: OPT: if seek(ord) is supported for this termsEnum, then we could use it for // facet.offset when sorting by index order. if (prefixTermBytes != null) { if (termsEnum.seekCeil(prefixTermBytes) == TermsEnum.SeekStatus.END) { termsEnum = null; } else { term = termsEnum.term(); } } else { // position termsEnum on first term term = termsEnum.next(); } } PostingsEnum postingsEnum = null; CharsRefBuilder charsRef = new CharsRefBuilder(); if (docs.size() >= mincount) { while (term != null) { if (prefixTermBytes != null && !StringHelper.startsWith(term, prefixTermBytes)) break; if (contains == null || contains(term.utf8ToString(), contains, ignoreCase)) { int df = termsEnum.docFreq(); // If we are sorting, we can use df>min (rather than >=) since we // are going in index order. For certain term distributions this can // make a large difference (for example, many terms with df=1). if (df > 0 && df > min) { int c; if (df >= minDfFilterCache) { // use the filter cache if (deState == null) { deState = new SolrIndexSearcher.DocsEnumState(); deState.fieldName = field; deState.liveDocs = r.getLiveDocs(); deState.termsEnum = termsEnum; deState.postingsEnum = postingsEnum; } c = searcher.numDocs(docs, deState); postingsEnum = deState.postingsEnum; } else { // iterate over TermDocs to calculate the intersection // TODO: specialize when base docset is a bitset or hash set (skipDocs)? or does it // matter for this? // TODO: do this per-segment for better efficiency (MultiDocsEnum just uses base class // impl) // TODO: would passing deleted docs lead to better efficiency over checking the // fastForRandomSet? postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE); c = 0; if (postingsEnum instanceof MultiPostingsEnum) { MultiPostingsEnum.EnumWithSlice[] subs = ((MultiPostingsEnum) postingsEnum).getSubs(); int numSubs = ((MultiPostingsEnum) postingsEnum).getNumSubs(); for (int subindex = 0; subindex < numSubs; subindex++) { MultiPostingsEnum.EnumWithSlice sub = subs[subindex]; if (sub.postingsEnum == null) continue; int base = sub.slice.start; int docid; while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { if (fastForRandomSet.exists(docid + base)) c++; } } } else { int docid; while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { if (fastForRandomSet.exists(docid)) c++; } } } if (sortByCount) { if (c > min) { BytesRef termCopy = BytesRef.deepCopyOf(term); queue.add(new CountPair<>(termCopy, c)); if (queue.size() >= maxsize) min = queue.last().val; } } else { if (c >= mincount && --off < 0) { if (--lim < 0) break; ft.indexedToReadable(term, charsRef); res.add(charsRef.toString(), c); } } } } term = termsEnum.next(); } } if (sortByCount) { for (CountPair<BytesRef, Integer> p : queue) { if (--off >= 0) continue; if (--lim < 0) break; ft.indexedToReadable(p.key, charsRef); res.add(charsRef.toString(), p.val); } } if (missing) { res.add(null, getFieldMissingCount(searcher, docs, field)); } return res; }
private void duellReaders(CompositeReader other, LeafReader memIndexReader) throws IOException { Fields memFields = memIndexReader.fields(); for (String field : MultiFields.getFields(other)) { Terms memTerms = memFields.terms(field); Terms iwTerms = memIndexReader.terms(field); if (iwTerms == null) { assertNull(memTerms); } else { NumericDocValues normValues = MultiDocValues.getNormValues(other, field); NumericDocValues memNormValues = memIndexReader.getNormValues(field); if (normValues != null) { // mem idx always computes norms on the fly assertNotNull(memNormValues); assertEquals(normValues.get(0), memNormValues.get(0)); } assertNotNull(memTerms); assertEquals(iwTerms.getDocCount(), memTerms.getDocCount()); assertEquals(iwTerms.getSumDocFreq(), memTerms.getSumDocFreq()); assertEquals(iwTerms.getSumTotalTermFreq(), memTerms.getSumTotalTermFreq()); TermsEnum iwTermsIter = iwTerms.iterator(); TermsEnum memTermsIter = memTerms.iterator(); if (iwTerms.hasPositions()) { final boolean offsets = iwTerms.hasOffsets() && memTerms.hasOffsets(); while (iwTermsIter.next() != null) { assertNotNull(memTermsIter.next()); assertEquals(iwTermsIter.term(), memTermsIter.term()); PostingsEnum iwDocsAndPos = iwTermsIter.postings(null, PostingsEnum.ALL); PostingsEnum memDocsAndPos = memTermsIter.postings(null, PostingsEnum.ALL); while (iwDocsAndPos.nextDoc() != PostingsEnum.NO_MORE_DOCS) { assertEquals(iwDocsAndPos.docID(), memDocsAndPos.nextDoc()); assertEquals(iwDocsAndPos.freq(), memDocsAndPos.freq()); for (int i = 0; i < iwDocsAndPos.freq(); i++) { assertEquals( "term: " + iwTermsIter.term().utf8ToString(), iwDocsAndPos.nextPosition(), memDocsAndPos.nextPosition()); if (offsets) { assertEquals(iwDocsAndPos.startOffset(), memDocsAndPos.startOffset()); assertEquals(iwDocsAndPos.endOffset(), memDocsAndPos.endOffset()); } if (iwTerms.hasPayloads()) { assertEquals(iwDocsAndPos.getPayload(), memDocsAndPos.getPayload()); } } } } } else { while (iwTermsIter.next() != null) { assertEquals(iwTermsIter.term(), memTermsIter.term()); PostingsEnum iwDocsAndPos = iwTermsIter.postings(null); PostingsEnum memDocsAndPos = memTermsIter.postings(null); while (iwDocsAndPos.nextDoc() != PostingsEnum.NO_MORE_DOCS) { assertEquals(iwDocsAndPos.docID(), memDocsAndPos.nextDoc()); assertEquals(iwDocsAndPos.freq(), memDocsAndPos.freq()); } } } } } }