@Override
public FacetProcessor createFacetProcessor(FacetContext fcontext) {
  SchemaField sf = fcontext.searcher.getSchema().getField(field);
  FieldType ft = sf.getType();
  boolean multiToken = sf.multiValued() || ft.multiValuedFieldCache();

  LegacyNumericType ntype = ft.getNumericType();
  // ensure we can support the requested options for numeric faceting:
  if (ntype != null) {
    if (prefix != null) {
      throw new SolrException(
          SolrException.ErrorCode.BAD_REQUEST,
          "Doesn't make sense to set facet prefix on a numeric field");
    }
    if (mincount == 0) {
      // TODO if indexed=true then we could add support
      throw new SolrException(
          SolrException.ErrorCode.BAD_REQUEST,
          "Numeric fields do not support facet mincount=0; try indexing as terms");
    }
  }

  // TODO auto-pick ENUM/STREAM SOLR-9351 when index asc and DocSet cardinality is *not* much
  // smaller than term cardinality
  if (method == FacetMethod.ENUM) { // at the moment these two are the same
    method = FacetMethod.STREAM;
  }
  if (method == FacetMethod.STREAM
      && sf.indexed()
      && "index".equals(sortVariable)
      && sortDirection == SortDirection.asc) {
    return new FacetFieldProcessorByEnumTermsStream(fcontext, this, sf);
  }

  // TODO if method=UIF and not single-valued numerics then simply choose that now?
  // TODO add FieldType.getDocValuesType()

  if (!multiToken) {
    if (mincount > 0 && prefix == null && (ntype != null || method == FacetMethod.DVHASH)) {
      // TODO can we auto-pick for strings when term cardinality is much greater than DocSet
      // cardinality? or if we don't know cardinality but DocSet size is very small
      return new FacetFieldProcessorByHashDV(fcontext, this, sf);
    } else if (ntype == null) {
      // single valued string...
      return new FacetFieldProcessorByArrayDV(fcontext, this, sf);
    } else {
      throw new SolrException(
          SolrException.ErrorCode.SERVER_ERROR, "Couldn't pick facet algorithm for field " + sf);
    }
  }

  // multi-valued after this point

  if (sf.hasDocValues() || method == FacetMethod.DV) {
    // single and multi-valued string docValues
    return new FacetFieldProcessorByArrayDV(fcontext, this, sf);
  }

  // Top-level multi-valued field cache (UIF)
  return new FacetFieldProcessorByArrayUIF(fcontext, this, sf);
}
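// Illustration only (hypothetical, not part of the Solr API): the selection order above,
// condensed into a pure function over plain booleans so the priority is easier to see.
// ProcessorPick, pick, and the parameter names are invented for this sketch; the real
// decision reads the same facts from FacetContext, SchemaField, and FieldType.
class ProcessorPick {
  enum Pick { ENUM_STREAM, HASH_DV, ARRAY_DV, ARRAY_UIF }

  static Pick pick(
      boolean streamEligible, // method==STREAM && sf.indexed() && sort is index asc
      boolean multiToken,     // sf.multiValued() || ft.multiValuedFieldCache()
      boolean numeric,        // ft.getNumericType() != null
      boolean hasDocValues,   // sf.hasDocValues()
      boolean wantDvHash,     // method == DVHASH
      boolean wantDv,         // method == DV
      int mincount,
      String prefix) {
    if (streamEligible) return Pick.ENUM_STREAM;
    if (!multiToken) {
      if (mincount > 0 && prefix == null && (numeric || wantDvHash)) return Pick.HASH_DV;
      if (!numeric) return Pick.ARRAY_DV; // single-valued string
      throw new IllegalStateException("no algorithm"); // mirrors the SERVER_ERROR branch
    }
    if (hasDocValues || wantDv) return Pick.ARRAY_DV; // multi-valued string docValues
    return Pick.ARRAY_UIF; // top-level multi-valued field cache (UIF)
  }
}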
/**
 * Term counts for use in field faceting that respects the specified mincount - if mincount is
 * null, the "zeros" param is consulted for the appropriate backcompat default.
 *
 * @see FacetParams#FACET_ZEROS
 */
private NamedList<Integer> getTermCounts(String field, Integer mincount, ParsedParams parsed)
    throws IOException {
  final SolrParams params = parsed.params;
  final DocSet docs = parsed.docs;
  final int threads = parsed.threads;
  int offset = params.getFieldInt(field, FacetParams.FACET_OFFSET, 0);
  int limit = params.getFieldInt(field, FacetParams.FACET_LIMIT, 100);
  if (limit == 0) return new NamedList<>();
  if (mincount == null) {
    Boolean zeros = params.getFieldBool(field, FacetParams.FACET_ZEROS);
    // mincount = (zeros!=null && zeros) ? 0 : 1;
    mincount = (zeros != null && !zeros) ? 1 : 0; // current default is to include zeros.
  }
  boolean missing = params.getFieldBool(field, FacetParams.FACET_MISSING, false);
  // default to sorting if there is a limit.
  String sort =
      params.getFieldParam(
          field,
          FacetParams.FACET_SORT,
          limit > 0 ? FacetParams.FACET_SORT_COUNT : FacetParams.FACET_SORT_INDEX);
  String prefix = params.getFieldParam(field, FacetParams.FACET_PREFIX);
  String contains = params.getFieldParam(field, FacetParams.FACET_CONTAINS);
  boolean ignoreCase = params.getFieldBool(field, FacetParams.FACET_CONTAINS_IGNORE_CASE, false);

  NamedList<Integer> counts;
  SchemaField sf = searcher.getSchema().getField(field);
  FieldType ft = sf.getType();

  // determine what type of faceting method to use
  final String methodStr = params.getFieldParam(field, FacetParams.FACET_METHOD);
  FacetMethod method = null;
  if (FacetParams.FACET_METHOD_enum.equals(methodStr)) {
    method = FacetMethod.ENUM;
  } else if (FacetParams.FACET_METHOD_fcs.equals(methodStr)) {
    method = FacetMethod.FCS;
  } else if (FacetParams.FACET_METHOD_fc.equals(methodStr)) {
    method = FacetMethod.FC;
  }

  if (method == FacetMethod.ENUM && TrieField.getMainValuePrefix(ft) != null) {
    // enum can't deal with trie fields that index several terms per value
    method = sf.multiValued() ? FacetMethod.FC : FacetMethod.FCS;
  }

  if (method == null && ft instanceof BoolField) {
    // Always use filters for booleans... we know the number of values is very small.
    method = FacetMethod.ENUM;
  }

  final boolean multiToken = sf.multiValued() || ft.multiValuedFieldCache();
  if (ft.getNumericType() != null && !sf.multiValued()) {
    // the per-segment approach is optimal for numeric field types since there
    // are no global ords to merge and no need to create an expensive
    // top-level reader
    method = FacetMethod.FCS;
  }

  if (method == null) {
    // TODO: default to per-segment or not?
    method = FacetMethod.FC;
  }

  if (method == FacetMethod.FCS && multiToken) {
    // only fc knows how to deal with multi-token fields
    method = FacetMethod.FC;
  }

  if (method == FacetMethod.ENUM && sf.hasDocValues()) {
    // only fc can handle docvalues types
    method = FacetMethod.FC;
  }

  if (params.getFieldBool(field, GroupParams.GROUP_FACET, false)) {
    counts =
        getGroupedCounts(
            searcher, docs, field, multiToken, offset, limit, mincount, missing, sort, prefix,
            contains, ignoreCase);
  } else {
    assert method != null;
    switch (method) {
      case ENUM:
        assert TrieField.getMainValuePrefix(ft) == null;
        counts =
            getFacetTermEnumCounts(
                searcher, docs, field, offset, limit, mincount, missing, sort, prefix, contains,
                ignoreCase, params);
        break;
      case FCS:
        assert !multiToken;
        if (ft.getNumericType() != null && !sf.multiValued()) {
          // force numeric faceting
          if (prefix != null && !prefix.isEmpty()) {
            throw new SolrException(
                ErrorCode.BAD_REQUEST,
                FacetParams.FACET_PREFIX + " is not supported on numeric types");
          }
          if (contains != null && !contains.isEmpty()) {
            throw new SolrException(
                ErrorCode.BAD_REQUEST,
                FacetParams.FACET_CONTAINS + " is not supported on numeric types");
          }
          counts =
              NumericFacets.getCounts(
                  searcher, docs, field, offset, limit, mincount, missing, sort);
        } else {
          PerSegmentSingleValuedFaceting ps =
              new PerSegmentSingleValuedFaceting(
                  searcher, docs, field, offset, limit, mincount, missing, sort, prefix,
                  contains, ignoreCase);
          Executor executor = threads == 0 ? directExecutor : facetExecutor;
          ps.setNumThreads(threads);
          counts = ps.getFacetCounts(executor);
        }
        break;
      case FC:
        counts =
            DocValuesFacets.getCounts(
                searcher, docs, field, offset, limit, mincount, missing, sort, prefix, contains,
                ignoreCase);
        break;
      default:
        throw new AssertionError();
    }
  }

  return counts;
}
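// Illustration only (hypothetical, not Solr API): the facet.method coercion rules above as a
// stand-alone function. The real code mutates a local `method` variable in place; the class,
// method, and parameter names here are invented stand-ins for the schema checks it performs.
class MethodCoercionSketch {
  enum Method { ENUM, FCS, FC }

  static Method resolve(
      Method requested,            // parsed from the facet.method param, may be null
      boolean trieMultiTerm,       // TrieField.getMainValuePrefix(ft) != null
      boolean multiValued,         // sf.multiValued()
      boolean multiToken,          // multiValued || ft.multiValuedFieldCache()
      boolean isBool,              // ft instanceof BoolField
      boolean singleValuedNumeric, // numeric type and not multiValued
      boolean hasDocValues) {      // sf.hasDocValues()
    Method m = requested;
    if (m == Method.ENUM && trieMultiTerm) m = multiValued ? Method.FC : Method.FCS;
    if (m == null && isBool) m = Method.ENUM;            // tiny value space: filters are cheap
    if (singleValuedNumeric) m = Method.FCS;             // per-segment: no global ords to merge
    if (m == null) m = Method.FC;                        // default
    if (m == Method.FCS && multiToken) m = Method.FC;    // only fc handles multi-token fields
    if (m == Method.ENUM && hasDocValues) m = Method.FC; // only fc handles docValues types
    return m;
  }
}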
public static NamedList<Integer> getCounts(
    SolrIndexSearcher searcher,
    DocSet docs,
    String fieldName,
    int offset,
    int limit,
    int mincount,
    boolean missing,
    String sort,
    String prefix,
    String contains,
    boolean ignoreCase)
    throws IOException {
  SchemaField schemaField = searcher.getSchema().getField(fieldName);
  FieldType ft = schemaField.getType();
  NamedList<Integer> res = new NamedList<>();

  // TODO: remove multiValuedFieldCache(), check dv type / uninversion type?
  final boolean multiValued = schemaField.multiValued() || ft.multiValuedFieldCache();

  final SortedSetDocValues si; // for term lookups only
  OrdinalMap ordinalMap = null; // for mapping per-segment ords to global ones
  if (multiValued) {
    si = searcher.getLeafReader().getSortedSetDocValues(fieldName);
    if (si instanceof MultiSortedSetDocValues) {
      ordinalMap = ((MultiSortedSetDocValues) si).mapping;
    }
  } else {
    SortedDocValues single = searcher.getLeafReader().getSortedDocValues(fieldName);
    si = single == null ? null : DocValues.singleton(single);
    if (single instanceof MultiSortedDocValues) {
      ordinalMap = ((MultiSortedDocValues) single).mapping;
    }
  }

  if (si == null) {
    return finalize(res, searcher, schemaField, docs, -1, missing);
  }
  if (si.getValueCount() >= Integer.MAX_VALUE) {
    throw new UnsupportedOperationException(
        "Currently this faceting method is limited to " + Integer.MAX_VALUE + " unique terms");
  }

  final BytesRefBuilder prefixRef;
  if (prefix == null) {
    prefixRef = null;
  } else if (prefix.length() == 0) {
    prefix = null;
    prefixRef = null;
  } else {
    prefixRef = new BytesRefBuilder();
    prefixRef.copyChars(prefix);
  }

  int startTermIndex, endTermIndex;
  if (prefix != null) {
    startTermIndex = (int) si.lookupTerm(prefixRef.get());
    if (startTermIndex < 0) startTermIndex = -startTermIndex - 1;
    prefixRef.append(UnicodeUtil.BIG_TERM);
    endTermIndex = (int) si.lookupTerm(prefixRef.get());
    assert endTermIndex < 0;
    endTermIndex = -endTermIndex - 1;
  } else {
    startTermIndex = -1;
    endTermIndex = (int) si.getValueCount();
  }

  final int nTerms = endTermIndex - startTermIndex;
  int missingCount = -1;
  final CharsRefBuilder charsRef = new CharsRefBuilder();
  if (nTerms > 0 && docs.size() >= mincount) {

    // count collection array only needs to be as big as the number of terms we are
    // going to collect counts for.
    final int[] counts = new int[nTerms];

    Filter filter = docs.getTopFilter();
    List<LeafReaderContext> leaves = searcher.getTopReaderContext().leaves();
    for (int subIndex = 0; subIndex < leaves.size(); subIndex++) {
      LeafReaderContext leaf = leaves.get(subIndex);
      // solr docsets already exclude any deleted docs
      DocIdSet dis = filter.getDocIdSet(leaf, null);
      DocIdSetIterator disi = null;
      if (dis != null) {
        disi = dis.iterator();
      }
      if (disi != null) {
        if (multiValued) {
          SortedSetDocValues sub = leaf.reader().getSortedSetDocValues(fieldName);
          if (sub == null) {
            sub = DocValues.emptySortedSet();
          }
          final SortedDocValues singleton = DocValues.unwrapSingleton(sub);
          if (singleton != null) {
            // some codecs may optimize SORTED_SET storage for single-valued fields
            accumSingle(counts, startTermIndex, singleton, disi, subIndex, ordinalMap);
          } else {
            accumMulti(counts, startTermIndex, sub, disi, subIndex, ordinalMap);
          }
        } else {
          SortedDocValues sub = leaf.reader().getSortedDocValues(fieldName);
          if (sub == null) {
            sub = DocValues.emptySorted();
          }
          accumSingle(counts, startTermIndex, sub, disi, subIndex, ordinalMap);
        }
      }
    }

    if (startTermIndex == -1) {
      missingCount = counts[0];
    }

    // IDEA: we could also maintain a count of "other"... everything that fell outside
    // of the top 'N'
    int off = offset;
    int lim = limit >= 0 ? limit : Integer.MAX_VALUE;

    if (sort.equals(FacetParams.FACET_SORT_COUNT)
        || sort.equals(FacetParams.FACET_SORT_COUNT_LEGACY)) {
      int maxsize = limit > 0 ? offset + limit : Integer.MAX_VALUE - 1;
      maxsize = Math.min(maxsize, nTerms);
      LongPriorityQueue queue =
          new LongPriorityQueue(Math.min(maxsize, 1000), maxsize, Long.MIN_VALUE);

      int min = mincount - 1; // the smallest value in the top 'N' values
      for (int i = (startTermIndex == -1) ? 1 : 0; i < nTerms; i++) {
        int c = counts[i];
        if (contains != null) {
          final BytesRef term = si.lookupOrd(startTermIndex + i);
          if (!SimpleFacets.contains(term.utf8ToString(), contains, ignoreCase)) {
            continue;
          }
        }
        if (c > min) {
          // NOTE: we use c>min rather than c>=min as an optimization because we are going in
          // index order, so we already know that the keys are ordered. This can be very
          // important if a lot of the counts are repeated (like zero counts would be).

          // smaller term numbers sort higher, so subtract the term number instead
          long pair = (((long) c) << 32) + (Integer.MAX_VALUE - i);
          boolean displaced = queue.insert(pair);
          if (displaced) min = (int) (queue.top() >>> 32);
        }
      }

      // if we are deep paging, we don't have to order the highest "offset" counts.
      int collectCount = Math.max(0, queue.size() - off);
      assert collectCount <= lim;

      // the start and end indexes of our list "sorted" (starting with the highest value)
      int sortedIdxStart = queue.size() - (collectCount - 1);
      int sortedIdxEnd = queue.size() + 1;
      final long[] sorted = queue.sort(collectCount);

      for (int i = sortedIdxStart; i < sortedIdxEnd; i++) {
        long pair = sorted[i];
        int c = (int) (pair >>> 32);
        int tnum = Integer.MAX_VALUE - (int) pair;
        final BytesRef term = si.lookupOrd(startTermIndex + tnum);
        ft.indexedToReadable(term, charsRef);
        res.add(charsRef.toString(), c);
      }

    } else {
      // add results in index order
      int i = (startTermIndex == -1) ? 1 : 0;
      if (mincount <= 0 && contains == null) {
        // if mincount<=0 and we're not examining the values for contains, then
        // we won't discard any terms and we know exactly where to start.
        i += off;
        off = 0;
      }

      for (; i < nTerms; i++) {
        int c = counts[i];
        if (c < mincount) continue;
        BytesRef term = null;
        if (contains != null) {
          term = si.lookupOrd(startTermIndex + i);
          if (!SimpleFacets.contains(term.utf8ToString(), contains, ignoreCase)) {
            continue;
          }
        }
        if (--off >= 0) continue;
        if (--lim < 0) break;
        if (term == null) {
          term = si.lookupOrd(startTermIndex + i);
        }
        ft.indexedToReadable(term, charsRef);
        res.add(charsRef.toString(), c);
      }
    }
  }

  return finalize(res, searcher, schemaField, docs, missingCount, missing);
}
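// Illustration only: lookupTerm(...) in the prefix handling above follows the same
// -(insertionPoint)-1 convention as Arrays.binarySearch, and appending BIG_TERM builds a
// probe that sorts after every term sharing the prefix, so the negated result is the
// exclusive end of the range. A minimal stand-alone sketch of that trick, where "\uFFFF"
// is an approximate stand-in for UnicodeUtil.BIG_TERM:
import java.util.Arrays;

class PrefixRangeDemo {
  public static void main(String[] args) {
    String[] terms = {"apple", "banana", "band", "bandana", "cherry"}; // sorted, like the ord space
    String prefix = "ban";

    int start = Arrays.binarySearch(terms, prefix);
    if (start < 0) start = -start - 1; // first term >= prefix

    int end = Arrays.binarySearch(terms, prefix + "\uFFFF");
    end = -end - 1; // first term after all "ban*" terms; the probe never matches exactly

    System.out.println(Arrays.asList(terms).subList(start, end)); // [banana, band, bandana]
  }
}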
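// Illustration only: the count-sorted branch above packs (count, ordinal) into one long,
// (count << 32) + (Integer.MAX_VALUE - ord), so a single long comparison orders by count
// descending with ties broken by smaller ordinal (i.e. index order). A stand-alone
// demonstration of the packing and unpacking, using java.util in place of LongPriorityQueue:
import java.util.Arrays;

class PairPackingDemo {
  public static void main(String[] args) {
    int[] counts = {5, 9, 5, 1}; // counts[ord] for ords 0..3
    long[] pairs = new long[counts.length];
    for (int ord = 0; ord < counts.length; ord++) {
      // larger count => larger pair; equal counts => smaller ord yields the larger pair,
      // so walking the sorted array backwards gives count desc, ordinal asc
      pairs[ord] = (((long) counts[ord]) << 32) + (Integer.MAX_VALUE - ord);
    }
    Arrays.sort(pairs);
    for (int i = pairs.length - 1; i >= 0; i--) {
      int c = (int) (pairs[i] >>> 32);              // count from the high 32 bits
      int ord = Integer.MAX_VALUE - (int) pairs[i]; // ordinal recovered from the low 32 bits
      System.out.println("ord=" + ord + " count=" + c);
    }
    // prints: ord=1 count=9, ord=0 count=5, ord=2 count=5, ord=3 count=1
  }
}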