void fillBucket(SimpleOrderedMap<Object> bucket, Query q, DocSet result) throws IOException {
  boolean needDocSet = freq.getFacetStats().size() > 0 || freq.getSubFacets().size() > 0;

  // TODO: always collect counts or not???

  int count;

  if (result != null) {
    count = result.size();
  } else if (needDocSet) {
    if (q == null) {
      result = fcontext.base;
      // result.incref(); // OFF-HEAP
    } else {
      result = fcontext.searcher.getDocSet(q, fcontext.base);
    }
    count = result.size();
  } else {
    if (q == null) {
      count = fcontext.base.size();
    } else {
      count = fcontext.searcher.numDocs(q, fcontext.base);
    }
  }

  try {
    processStats(bucket, result, count);
    processSubs(bucket, q, result);
  } finally {
    if (result != null) {
      // result.decref(); // OFF-HEAP
      result = null;
    }
  }
}
void processSubs(SimpleOrderedMap<Object> response, Query filter, DocSet domain) throws IOException {
  // TODO: what if a zero bucket has a sub-facet with an exclusion that would yield results?
  // Should we check for domain-altering exclusions, or even ask the sub-facet for
  // its domain and then only skip it if it's 0?
  if (domain == null || (domain.size() == 0 && !freq.processEmpty)) {
    return;
  }

  for (Map.Entry<String, FacetRequest> sub : freq.getSubFacets().entrySet()) {
    // make a new context for each sub-facet since they can change the domain
    FacetContext subContext = fcontext.sub(filter, domain);
    FacetProcessor subProcessor = sub.getValue().createFacetProcessor(subContext);

    if (fcontext.getDebugInfo() != null) { // if fcontext.debugInfo != null, it means rb.debug() == true
      FacetDebugInfo fdebug = new FacetDebugInfo();
      subContext.setDebugInfo(fdebug);
      fcontext.getDebugInfo().addChild(fdebug);
      fdebug.setReqDescription(sub.getValue().getFacetDescription());
      fdebug.setProcessor(subProcessor.getClass().getSimpleName());
      if (subContext.filter != null) fdebug.setFilter(subContext.filter.toString());

      final RTimer timer = new RTimer();
      subProcessor.process();
      long timeElapsed = (long) timer.getTime();
      fdebug.setElapse(timeElapsed);
      fdebug.putInfoItem("domainSize", (long) subContext.base.size());
    } else {
      subProcessor.process();
    }

    response.add(sub.getKey(), subProcessor.getResponse());
  }
}
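// ---------------------------------------------------------------------------
// When debug is enabled above, each sub-facet gets its own FacetDebugInfo
// child and its process() call is wall-clock timed via RTimer. A minimal
// standalone sketch of that time-and-record pattern, using System.nanoTime()
// in place of Solr's RTimer; the class and method names below are
// illustrative, not Solr APIs.

import java.util.LinkedHashMap;
import java.util.Map;

public class SubTaskTimingSketch {
  // Run each sub-task and record its elapsed wall-clock time in milliseconds,
  // keyed by sub-task name -- the shape of the per-sub-facet debug bookkeeping.
  static Map<String, Long> timeAll(Map<String, Runnable> subTasks) {
    Map<String, Long> elapsed = new LinkedHashMap<>();
    for (Map.Entry<String, Runnable> e : subTasks.entrySet()) {
      long start = System.nanoTime();
      e.getValue().run();
      elapsed.put(e.getKey(), (System.nanoTime() - start) / 1_000_000L);
    }
    return elapsed;
  }

  public static void main(String[] args) {
    Map<String, Runnable> tasks = new LinkedHashMap<>();
    tasks.put("subfacetA", () -> { /* work */ });
    tasks.put("subfacetB", () -> { /* work */ });
    System.out.println(timeAll(tasks)); // e.g. {subfacetA=0, subfacetB=0}
  }
}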
/**
 * Returns a list of terms in the specified field along with the corresponding count of documents
 * in the set that match that constraint. This method uses the FilterCache to get the intersection
 * count between <code>docs</code> and the DocSet for each term in the filter.
 *
 * @see FacetParams#FACET_LIMIT
 * @see FacetParams#FACET_ZEROS
 * @see FacetParams#FACET_MISSING
 */
public NamedList<Integer> getFacetTermEnumCounts(
    SolrIndexSearcher searcher,
    DocSet docs,
    String field,
    int offset,
    int limit,
    int mincount,
    boolean missing,
    String sort,
    String prefix,
    String contains,
    boolean ignoreCase,
    SolrParams params)
    throws IOException {

  /* :TODO: potential optimization...
   * cache the Terms with the highest docFreq and try them first
   * don't enum if we get our max from them
   */

  // Minimum term docFreq in order to use the filterCache for that term.
  int minDfFilterCache = global.getFieldInt(field, FacetParams.FACET_ENUM_CACHE_MINDF, 0);

  // make sure we have a set that is fast for random access, if we will use it for that
  DocSet fastForRandomSet = docs;
  if (minDfFilterCache > 0 && docs instanceof SortedIntDocSet) {
    SortedIntDocSet sset = (SortedIntDocSet) docs;
    fastForRandomSet = new HashDocSet(sset.getDocs(), 0, sset.size());
  }

  IndexSchema schema = searcher.getSchema();
  LeafReader r = searcher.getLeafReader();
  FieldType ft = schema.getFieldType(field);

  boolean sortByCount = sort.equals("count") || sort.equals("true");
  final int maxsize = limit >= 0 ? offset + limit : Integer.MAX_VALUE - 1;
  final BoundedTreeSet<CountPair<BytesRef, Integer>> queue =
      sortByCount ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(maxsize) : null;
  final NamedList<Integer> res = new NamedList<>();

  int min = mincount - 1; // the smallest value in the top 'N' values
  int off = offset;
  int lim = limit >= 0 ? limit : Integer.MAX_VALUE;

  BytesRef prefixTermBytes = null;
  if (prefix != null) {
    String indexedPrefix = ft.toInternal(prefix);
    prefixTermBytes = new BytesRef(indexedPrefix);
  }

  Fields fields = r.fields();
  Terms terms = fields == null ? null : fields.terms(field);
  TermsEnum termsEnum = null;
  SolrIndexSearcher.DocsEnumState deState = null;
  BytesRef term = null;

  if (terms != null) {
    termsEnum = terms.iterator();

    // TODO: OPT: if seek(ord) is supported for this termsEnum, then we could use it for
    // facet.offset when sorting by index order.

    if (prefixTermBytes != null) {
      if (termsEnum.seekCeil(prefixTermBytes) == TermsEnum.SeekStatus.END) {
        termsEnum = null;
      } else {
        term = termsEnum.term();
      }
    } else {
      // position termsEnum on first term
      term = termsEnum.next();
    }
  }

  PostingsEnum postingsEnum = null;
  CharsRefBuilder charsRef = new CharsRefBuilder();

  if (docs.size() >= mincount) {
    while (term != null) {
      if (prefixTermBytes != null && !StringHelper.startsWith(term, prefixTermBytes)) break;

      if (contains == null || contains(term.utf8ToString(), contains, ignoreCase)) {
        int df = termsEnum.docFreq();

        // If we are sorting, we can use df>min (rather than >=) since we
        // are going in index order. For certain term distributions this can
        // make a large difference (for example, many terms with df=1).
        if (df > 0 && df > min) {
          int c;

          if (df >= minDfFilterCache) {
            // use the filter cache
            if (deState == null) {
              deState = new SolrIndexSearcher.DocsEnumState();
              deState.fieldName = field;
              deState.liveDocs = r.getLiveDocs();
              deState.termsEnum = termsEnum;
              deState.postingsEnum = postingsEnum;
            }

            c = searcher.numDocs(docs, deState);

            postingsEnum = deState.postingsEnum;
          } else {
            // iterate over TermDocs to calculate the intersection

            // TODO: specialize when base docset is a bitset or hash set (skipDocs)? or does it
            // matter for this?
            // TODO: do this per-segment for better efficiency (MultiDocsEnum just uses base class
            // impl)
            // TODO: would passing deleted docs lead to better efficiency over checking the
            // fastForRandomSet?
            postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
            c = 0;

            if (postingsEnum instanceof MultiPostingsEnum) {
              MultiPostingsEnum.EnumWithSlice[] subs = ((MultiPostingsEnum) postingsEnum).getSubs();
              int numSubs = ((MultiPostingsEnum) postingsEnum).getNumSubs();
              for (int subindex = 0; subindex < numSubs; subindex++) {
                MultiPostingsEnum.EnumWithSlice sub = subs[subindex];
                if (sub.postingsEnum == null) continue;
                int base = sub.slice.start;
                int docid;
                while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                  if (fastForRandomSet.exists(docid + base)) c++;
                }
              }
            } else {
              int docid;
              while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                if (fastForRandomSet.exists(docid)) c++;
              }
            }
          }

          if (sortByCount) {
            if (c > min) {
              BytesRef termCopy = BytesRef.deepCopyOf(term);
              queue.add(new CountPair<>(termCopy, c));
              if (queue.size() >= maxsize) min = queue.last().val;
            }
          } else {
            if (c >= mincount && --off < 0) {
              if (--lim < 0) break;
              ft.indexedToReadable(term, charsRef);
              res.add(charsRef.toString(), c);
            }
          }
        }
      }
      term = termsEnum.next();
    }
  }

  if (sortByCount) {
    for (CountPair<BytesRef, Integer> p : queue) {
      if (--off >= 0) continue;
      if (--lim < 0) break;
      ft.indexedToReadable(p.key, charsRef);
      res.add(charsRef.toString(), p.val);
    }
  }

  if (missing) {
    res.add(null, getFieldMissingCount(searcher, docs, field));
  }

  return res;
}
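// ---------------------------------------------------------------------------
// getFacetTermEnumCounts chooses between two intersection strategies per term:
// searcher.numDocs(docs, deState) goes through the filterCache when the term's
// docFreq is at least minDfFilterCache, and otherwise the term's postings are
// walked directly against the random-access fastForRandomSet. A minimal
// standalone sketch of the postings-walk strategy, with an int[] and a HashSet
// standing in for the PostingsEnum and the DocSet (all names below are
// illustrative, not Solr APIs):

import java.util.HashSet;
import java.util.Set;

public class IntersectionCountSketch {
  // Count how many doc IDs from a term's postings also appear in the base set.
  // Mirrors the postings-walk branch above: advance the iterator, probe the
  // random-access set, and tally hits.
  static int intersectionCount(int[] postings, Set<Integer> baseDocs) {
    int c = 0;
    for (int docid : postings) {
      if (baseDocs.contains(docid)) c++; // analogous to fastForRandomSet.exists(docid)
    }
    return c;
  }

  public static void main(String[] args) {
    Set<Integer> baseDocs = new HashSet<>(Set.of(1, 4, 7, 9));
    int[] termPostings = {0, 1, 2, 7, 8};
    System.out.println(intersectionCount(termPostings, baseDocs)); // prints 2
  }
}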
public NamedList get(String[] fields, DocSet baseDocs) throws IOException, ParseException {
  if (this.crcget == null) {
    // No CRC key supplied: walk the base doc set directly (up to limit_offset
    // docs), group the rows, and return the top items.
    this.container = this.parse.createContainer(fields, baseDocs, this.reader, this.searcher, this.req);
    DocIterator iter = baseDocs.iterator();
    this.recordCount.inc(baseDocs.size());

    Doclist res = new Doclist(this.parse.limit_offset);
    int doc = -1;
    while (iter.hasNext()) {
      doc = iter.nextDoc();
      res.add(doc);
      if (res.index >= this.parse.limit_offset) {
        break;
      }
    }

    PriorityQueue<SelectDetailRow> topItems = this.transGroupValue(res, fields);
    this.container.free();
    return this.toNameList(topItems);
  }

  // CRC key supplied: resolve the previously cached rows for the requested CRC
  // list and return the stored field values, joined by the group-join separator.
  String hostkey = String.valueOf(this.getkeyCrc());
  ConcurrentHashMap<Long, String> cache = MdrillUtils.CRC_CACHE_SIZE.remove(crcget + "@" + hostkey);
  NamedList rtn = new NamedList();
  Map<Long, String> crcvalue = new HashMap<Long, String>();
  rtn.add("fdtcre", crcvalue);
  if (cache != null) {
    MapFieldSelector selector = new MapFieldSelector(fields);
    FieldType[] ftlist = new FieldType[fields.length];
    IndexSchema schema = this.searcher.getSchema();
    for (int j = 0; j < fields.length; j++) {
      ftlist[j] = schema.getFieldType(fields[j]);
    }

    String crcliststr = params.get("mdrill.crc.key.get.crclist");
    if (crcliststr != null) {
      String[] crclist = crcliststr.split(",");
      for (String s : crclist) {
        Long crc = Long.parseLong(s);
        String v = cache.get(crc);
        if (v == null) {
          continue;
        }
        String[] cols = v.split(UniqConfig.GroupJoinString(), -1);
        if (cols.length < 2) {
          continue;
        }

        int doc = Integer.parseInt(cols[0]);
        SortGroupVal buff = new SortGroupVal();
        buff.groupbuff.append("-");
        buff.groupbuff.append(UniqConfig.GroupJoinString());
        buff.groupbuff.append("-");
        Document docfields = this.reader.document(doc, selector);
        if (docfields == null) {
          // stored document is gone: emit a placeholder value for every field
          for (int j = 0; j < fields.length; j++) {
            buff.groupbuff.append(UniqConfig.GroupJoinString());
            buff.groupbuff.append(EncodeUtils.encode("-"));
          }
          if (!crcvalue.containsKey(crc)) {
            crcvalue.put(crc, buff.groupbuff.toString());
          }
        } else {
          for (int j = 0; j < fields.length; j++) {
            buff.groupbuff.append(UniqConfig.GroupJoinString());
            Fieldable fv = docfields.getFieldable(fields[j]);
            if (fv != null) {
              buff.groupbuff.append(ftlist[j].toExternal(fv));
            } else {
              buff.groupbuff.append(EncodeUtils.encode("-"));
            }
          }
          crcvalue.put(crc, buff.groupbuff.toString());
        }
      }
    }
  }
  return rtn;
}
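// ---------------------------------------------------------------------------
// The cache-hit path above rebuilds each row by joining the stored field
// values with UniqConfig.GroupJoinString(), substituting an encoded "-"
// placeholder when a field value (or the whole stored document) is missing.
// A standalone sketch of that row-join shape; the separator and placeholder
// below are illustrative stand-ins, not mdrill's actual configuration:

import java.util.StringJoiner;

public class RowJoinSketch {
  static final String SEP = "\u0001"; // stand-in for UniqConfig.GroupJoinString()

  // Join field values into one cacheable row string, falling back to a
  // placeholder for any missing value -- the shape built in buff.groupbuff.
  static String joinRow(String[] values) {
    StringJoiner row = new StringJoiner(SEP);
    row.add("-").add("-"); // leading sentinel columns, as in the prefix above
    for (String v : values) {
      row.add(v != null ? v : "-");
    }
    return row.toString();
  }

  public static void main(String[] args) {
    System.out.println(joinRow(new String[] {"alice", null, "42"}));
  }
}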
public static NamedList<Integer> getCounts(
    SolrIndexSearcher searcher,
    DocSet docs,
    String fieldName,
    int offset,
    int limit,
    int mincount,
    boolean missing,
    String sort,
    String prefix)
    throws IOException {
  SchemaField schemaField = searcher.getSchema().getField(fieldName);
  FieldType ft = schemaField.getType();
  NamedList<Integer> res = new NamedList<Integer>();

  final SortedSetDocValues si; // for term lookups only
  OrdinalMap ordinalMap = null; // for mapping per-segment ords to global ones
  if (schemaField.multiValued()) {
    si = searcher.getAtomicReader().getSortedSetDocValues(fieldName);
    if (si instanceof MultiSortedSetDocValues) {
      ordinalMap = ((MultiSortedSetDocValues) si).mapping;
    }
  } else {
    SortedDocValues single = searcher.getAtomicReader().getSortedDocValues(fieldName);
    si = single == null ? null : new SingletonSortedSetDocValues(single);
    if (single instanceof MultiSortedDocValues) {
      ordinalMap = ((MultiSortedDocValues) single).mapping;
    }
  }

  if (si == null) {
    return finalize(res, searcher, schemaField, docs, -1, missing);
  }
  if (si.getValueCount() >= Integer.MAX_VALUE) {
    throw new UnsupportedOperationException(
        "Currently this faceting method is limited to " + Integer.MAX_VALUE + " unique terms");
  }

  final BytesRef br = new BytesRef();

  final BytesRef prefixRef;
  if (prefix == null) {
    prefixRef = null;
  } else if (prefix.length() == 0) {
    prefix = null;
    prefixRef = null;
  } else {
    prefixRef = new BytesRef(prefix);
  }

  int startTermIndex, endTermIndex;
  if (prefix != null) {
    startTermIndex = (int) si.lookupTerm(prefixRef);
    if (startTermIndex < 0) startTermIndex = -startTermIndex - 1;
    prefixRef.append(UnicodeUtil.BIG_TERM);
    endTermIndex = (int) si.lookupTerm(prefixRef);
    assert endTermIndex < 0;
    endTermIndex = -endTermIndex - 1;
  } else {
    startTermIndex = -1;
    endTermIndex = (int) si.getValueCount();
  }

  final int nTerms = endTermIndex - startTermIndex;
  int missingCount = -1;
  final CharsRef charsRef = new CharsRef(10);
  if (nTerms > 0 && docs.size() >= mincount) {

    // count collection array only needs to be as big as the number of terms we are
    // going to collect counts for.
    final int[] counts = new int[nTerms];

    Filter filter = docs.getTopFilter();
    List<AtomicReaderContext> leaves = searcher.getTopReaderContext().leaves();
    for (int subIndex = 0; subIndex < leaves.size(); subIndex++) {
      AtomicReaderContext leaf = leaves.get(subIndex);
      DocIdSet dis = filter.getDocIdSet(leaf, null); // solr docsets already exclude any deleted docs
      DocIdSetIterator disi = null;
      if (dis != null) {
        disi = dis.iterator();
      }
      if (disi != null) {
        if (schemaField.multiValued()) {
          SortedSetDocValues sub = leaf.reader().getSortedSetDocValues(fieldName);
          if (sub == null) {
            sub = SortedSetDocValues.EMPTY;
          }
          if (sub instanceof SingletonSortedSetDocValues) {
            // some codecs may optimize SORTED_SET storage for single-valued fields
            final SortedDocValues values = ((SingletonSortedSetDocValues) sub).getSortedDocValues();
            accumSingle(counts, startTermIndex, values, disi, subIndex, ordinalMap);
          } else {
            accumMulti(counts, startTermIndex, sub, disi, subIndex, ordinalMap);
          }
        } else {
          SortedDocValues sub = leaf.reader().getSortedDocValues(fieldName);
          if (sub == null) {
            sub = SortedDocValues.EMPTY;
          }
          accumSingle(counts, startTermIndex, sub, disi, subIndex, ordinalMap);
        }
      }
    }

    if (startTermIndex == -1) {
      missingCount = counts[0];
    }

    // IDEA: we could also maintain a count of "other"... everything that fell outside
    // of the top 'N'

    int off = offset;
    int lim = limit >= 0 ? limit : Integer.MAX_VALUE;

    if (sort.equals(FacetParams.FACET_SORT_COUNT) || sort.equals(FacetParams.FACET_SORT_COUNT_LEGACY)) {
      int maxsize = limit > 0 ? offset + limit : Integer.MAX_VALUE - 1;
      maxsize = Math.min(maxsize, nTerms);
      LongPriorityQueue queue = new LongPriorityQueue(Math.min(maxsize, 1000), maxsize, Long.MIN_VALUE);

      int min = mincount - 1; // the smallest value in the top 'N' values
      for (int i = (startTermIndex == -1) ? 1 : 0; i < nTerms; i++) {
        int c = counts[i];
        if (c > min) {
          // NOTE: we use c>min rather than c>=min as an optimization because we are going in
          // index order, so we already know that the keys are ordered. This can be very
          // important if a lot of the counts are repeated (like zero counts would be).

          // smaller term numbers sort higher, so subtract the term number instead
          long pair = (((long) c) << 32) + (Integer.MAX_VALUE - i);
          boolean displaced = queue.insert(pair);
          if (displaced) min = (int) (queue.top() >>> 32);
        }
      }

      // if we are deep paging, we don't have to order the highest "offset" counts.
      int collectCount = Math.max(0, queue.size() - off);
      assert collectCount <= lim;

      // the start and end indexes of our list "sorted" (starting with the highest value)
      int sortedIdxStart = queue.size() - (collectCount - 1);
      int sortedIdxEnd = queue.size() + 1;
      final long[] sorted = queue.sort(collectCount);

      for (int i = sortedIdxStart; i < sortedIdxEnd; i++) {
        long pair = sorted[i];
        int c = (int) (pair >>> 32);
        int tnum = Integer.MAX_VALUE - (int) pair;
        si.lookupOrd(startTermIndex + tnum, br);
        ft.indexedToReadable(br, charsRef);
        res.add(charsRef.toString(), c);
      }
    } else {
      // add results in index order
      int i = (startTermIndex == -1) ? 1 : 0;
      if (mincount <= 0) {
        // if mincount<=0, then we won't discard any terms and we know exactly
        // where to start.
        i += off;
        off = 0;
      }

      for (; i < nTerms; i++) {
        int c = counts[i];
        if (c < mincount || --off >= 0) continue;
        if (--lim < 0) break;
        si.lookupOrd(startTermIndex + i, br);
        ft.indexedToReadable(br, charsRef);
        res.add(charsRef.toString(), c);
      }
    }
  }

  return finalize(res, searcher, schemaField, docs, missingCount, missing);
}
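// ---------------------------------------------------------------------------
// The count-sorted branch above packs each (count, term ordinal) pair into a
// single long so a plain long priority queue orders entries by count, with
// lower ordinals (earlier terms) winning ties. A standalone sketch of that
// encoding; the class and method names below are illustrative:

public class PackedCountOrdSketch {
  // Pack the count into the high 32 bits and an inverted ordinal into the low
  // 32 bits: comparing the longs then compares counts first, and for equal
  // counts the smaller ordinal (earlier term) yields the larger long.
  static long pack(int count, int ord) {
    return (((long) count) << 32) + (Integer.MAX_VALUE - ord);
  }

  static int count(long pair) {
    return (int) (pair >>> 32);
  }

  static int ord(long pair) {
    return Integer.MAX_VALUE - (int) pair;
  }

  public static void main(String[] args) {
    long a = pack(5, 10); // count 5, term ordinal 10
    long b = pack(5, 3);  // same count, earlier term -> sorts higher
    long c = pack(7, 99); // higher count always wins
    assert c > b && b > a;
    System.out.println(count(c) + " " + ord(c)); // prints "7 99"
  }
}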