@Test
public void testSvValues() throws IOException {
  int numDocs = 1000000;
  int numOrdinals = numDocs / 4;
  Map<Integer, Long> controlDocToOrdinal = new HashMap<>();
  OrdinalsBuilder builder = new OrdinalsBuilder(numDocs);
  long ordinal = builder.currentOrdinal();
  // assign exactly one ordinal per doc, starting a new ordinal every numOrdinals docs
  for (int doc = 0; doc < numDocs; doc++) {
    if (doc % numOrdinals == 0) {
      ordinal = builder.nextOrdinal();
    }
    controlDocToOrdinal.put(doc, ordinal);
    builder.addDoc(doc);
  }

  Ordinals ords = builder.build(ImmutableSettings.EMPTY);
  assertThat(ords, instanceOf(SinglePackedOrdinals.class));
  RandomAccessOrds docs = ords.ordinals();
  // single-valued ordinals must unwrap to a plain SortedDocValues view
  final SortedDocValues singleOrds = DocValues.unwrapSingleton(docs);
  assertNotNull(singleOrds);

  for (Map.Entry<Integer, Long> entry : controlDocToOrdinal.entrySet()) {
    assertThat(entry.getValue(), equalTo((long) singleOrds.getOrd(entry.getKey())));
  }
}
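The assertNotNull above relies on the singleton/unwrap contract: DocValues.unwrapSingleton only recovers a SortedDocValues from a SortedSetDocValues that was created via DocValues.singleton(...), which is presumably how SinglePackedOrdinals exposes its single-valued ordinals. A minimal round-trip sketch of that contract, using Lucene's empty implementations as stand-ins:

@Test
public void testUnwrapSingletonRoundTrip() {
  // wrap a single-valued source, then unwrap it again; unwrapSingleton returns the original instance
  SortedDocValues sorted = DocValues.emptySorted();
  SortedSetDocValues wrapped = DocValues.singleton(sorted);
  assertSame(sorted, DocValues.unwrapSingleton(wrapped));
}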
@Override
public MultiGeoPointValues getGeoPointValues() {
  final RandomAccessOrds ords = ordinals.ordinals();
  final SortedDocValues singleOrds = DocValues.unwrapSingleton(ords);
  if (singleOrds != null) {
    // single-valued field: read the ordinal directly and reuse one mutable GeoPoint
    final GeoPoint point = new GeoPoint();
    final GeoPointValues values = new GeoPointValues() {
      @Override
      public GeoPoint get(int docID) {
        final int ord = singleOrds.getOrd(docID);
        if (ord >= 0) {
          return point.reset(lat.get(ord), lon.get(ord));
        }
        return point.reset(Double.NaN, Double.NaN);
      }
    };
    return FieldData.singleton(values, DocValues.docsWithValue(singleOrds, maxDoc));
  } else {
    // multi-valued field: resolve each ordinal of the current document to a point
    final GeoPoint point = new GeoPoint();
    return new MultiGeoPointValues() {
      @Override
      public GeoPoint valueAt(int index) {
        final long ord = ords.ordAt(index);
        if (ord >= 0) {
          return point.reset(lat.get(ord), lon.get(ord));
        }
        return point.reset(Double.NaN, Double.NaN);
      }

      @Override
      public void setDocument(int docId) {
        ords.setDocument(docId);
      }

      @Override
      public int count() {
        return ords.cardinality();
      }
    };
  }
}
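A sketch of how a caller might consume the MultiGeoPointValues returned above (a hypothetical helper, not part of the snippet): setDocument positions the values on a document, count reports how many points it has, and valueAt returns a shared mutable GeoPoint that must be copied if it needs to outlive the loop.

static void collectPoints(MultiGeoPointValues points, int docId, List<GeoPoint> out) {
  points.setDocument(docId);
  for (int i = 0; i < points.count(); i++) {
    // valueAt reuses a single GeoPoint instance, so copy it before storing
    GeoPoint p = points.valueAt(i);
    out.add(new GeoPoint(p.lat(), p.lon()));
  }
}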
private static SortedNumericDocValues withOrdinals(Ordinals ordinals, final LongValues values, int maxDoc) {
  final RandomAccessOrds ords = ordinals.ordinals();
  final SortedDocValues singleOrds = DocValues.unwrapSingleton(ords);
  if (singleOrds != null) {
    // single-valued field: expose a NumericDocValues view over the unwrapped ordinals
    final NumericDocValues singleValues = new NumericDocValues() {
      @Override
      public long get(int docID) {
        final int ord = singleOrds.getOrd(docID);
        if (ord >= 0) {
          return values.get(ord);
        } else {
          return 0;
        }
      }
    };
    return DocValues.singleton(singleValues, DocValues.docsWithValue(ords, maxDoc));
  } else {
    // multi-valued field: resolve each ordinal of the current document to its long value
    return new SortedNumericDocValues() {
      @Override
      public long valueAt(int index) {
        return values.get(ords.ordAt(index));
      }

      @Override
      public void setDocument(int doc) {
        ords.setDocument(doc);
      }

      @Override
      public int count() {
        return ords.cardinality();
      }
    };
  }
}
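For reference, a minimal consumer of the SortedNumericDocValues built above (a hypothetical helper, assuming the pre-iterator setDocument/count/valueAt API used throughout these snippets):

static long sumValues(SortedNumericDocValues dv, int docId) {
  // position the values on the document, then read its count() values
  dv.setDocument(docId);
  long sum = 0;
  for (int i = 0; i < dv.count(); i++) {
    sum += dv.valueAt(i);
  }
  return sum;
}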
public static NamedList<Integer> getCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName,
    int offset, int limit, int mincount, boolean missing, String sort, String prefix,
    String contains, boolean ignoreCase) throws IOException {
  SchemaField schemaField = searcher.getSchema().getField(fieldName);
  FieldType ft = schemaField.getType();
  NamedList<Integer> res = new NamedList<>();

  // TODO: remove multiValuedFieldCache(), check dv type / uninversion type?
  final boolean multiValued = schemaField.multiValued() || ft.multiValuedFieldCache();

  final SortedSetDocValues si;  // for term lookups only
  OrdinalMap ordinalMap = null; // for mapping per-segment ords to global ones
  if (multiValued) {
    si = searcher.getLeafReader().getSortedSetDocValues(fieldName);
    if (si instanceof MultiSortedSetDocValues) {
      ordinalMap = ((MultiSortedSetDocValues) si).mapping;
    }
  } else {
    SortedDocValues single = searcher.getLeafReader().getSortedDocValues(fieldName);
    si = single == null ? null : DocValues.singleton(single);
    if (single instanceof MultiSortedDocValues) {
      ordinalMap = ((MultiSortedDocValues) single).mapping;
    }
  }

  if (si == null) {
    return finalize(res, searcher, schemaField, docs, -1, missing);
  }
  if (si.getValueCount() >= Integer.MAX_VALUE) {
    throw new UnsupportedOperationException(
        "Currently this faceting method is limited to " + Integer.MAX_VALUE + " unique terms");
  }

  final BytesRefBuilder prefixRef;
  if (prefix == null) {
    prefixRef = null;
  } else if (prefix.length() == 0) {
    prefix = null;
    prefixRef = null;
  } else {
    prefixRef = new BytesRefBuilder();
    prefixRef.copyChars(prefix);
  }

  int startTermIndex, endTermIndex;
  if (prefix != null) {
    startTermIndex = (int) si.lookupTerm(prefixRef.get());
    if (startTermIndex < 0) startTermIndex = -startTermIndex - 1;
    prefixRef.append(UnicodeUtil.BIG_TERM);
    endTermIndex = (int) si.lookupTerm(prefixRef.get());
    assert endTermIndex < 0;
    endTermIndex = -endTermIndex - 1;
  } else {
    startTermIndex = -1;
    endTermIndex = (int) si.getValueCount();
  }

  final int nTerms = endTermIndex - startTermIndex;
  int missingCount = -1;
  final CharsRefBuilder charsRef = new CharsRefBuilder();
  if (nTerms > 0 && docs.size() >= mincount) {

    // count collection array only needs to be as big as the number of terms we are
    // going to collect counts for.
    final int[] counts = new int[nTerms];

    Filter filter = docs.getTopFilter();
    List<LeafReaderContext> leaves = searcher.getTopReaderContext().leaves();
    for (int subIndex = 0; subIndex < leaves.size(); subIndex++) {
      LeafReaderContext leaf = leaves.get(subIndex);
      DocIdSet dis = filter.getDocIdSet(leaf, null); // solr docsets already exclude any deleted docs
      DocIdSetIterator disi = null;
      if (dis != null) {
        disi = dis.iterator();
      }
      if (disi != null) {
        if (multiValued) {
          SortedSetDocValues sub = leaf.reader().getSortedSetDocValues(fieldName);
          if (sub == null) {
            sub = DocValues.emptySortedSet();
          }
          final SortedDocValues singleton = DocValues.unwrapSingleton(sub);
          if (singleton != null) {
            // some codecs may optimize SORTED_SET storage for single-valued fields
            accumSingle(counts, startTermIndex, singleton, disi, subIndex, ordinalMap);
          } else {
            accumMulti(counts, startTermIndex, sub, disi, subIndex, ordinalMap);
          }
        } else {
          SortedDocValues sub = leaf.reader().getSortedDocValues(fieldName);
          if (sub == null) {
            sub = DocValues.emptySorted();
          }
          accumSingle(counts, startTermIndex, sub, disi, subIndex, ordinalMap);
        }
      }
    }

    if (startTermIndex == -1) {
      missingCount = counts[0];
    }

    // IDEA: we could also maintain a count of "other"... everything that fell outside
    // of the top 'N'

    int off = offset;
    int lim = limit >= 0 ? limit : Integer.MAX_VALUE;

    if (sort.equals(FacetParams.FACET_SORT_COUNT) || sort.equals(FacetParams.FACET_SORT_COUNT_LEGACY)) {
      int maxsize = limit > 0 ? offset + limit : Integer.MAX_VALUE - 1;
      maxsize = Math.min(maxsize, nTerms);
      LongPriorityQueue queue = new LongPriorityQueue(Math.min(maxsize, 1000), maxsize, Long.MIN_VALUE);

      int min = mincount - 1; // the smallest value in the top 'N' values
      for (int i = (startTermIndex == -1) ? 1 : 0; i < nTerms; i++) {
        int c = counts[i];
        if (contains != null) {
          final BytesRef term = si.lookupOrd(startTermIndex + i);
          if (!SimpleFacets.contains(term.utf8ToString(), contains, ignoreCase)) {
            continue;
          }
        }
        if (c > min) {
          // NOTE: we use c>min rather than c>=min as an optimization because we are going in
          // index order, so we already know that the keys are ordered. This can be very
          // important if a lot of the counts are repeated (like zero counts would be).

          // smaller term numbers sort higher, so subtract the term number instead
          long pair = (((long) c) << 32) + (Integer.MAX_VALUE - i);
          boolean displaced = queue.insert(pair);
          if (displaced) min = (int) (queue.top() >>> 32);
        }
      }

      // if we are deep paging, we don't have to order the highest "offset" counts.
      int collectCount = Math.max(0, queue.size() - off);
      assert collectCount <= lim;

      // the start and end indexes of our list "sorted" (starting with the highest value)
      int sortedIdxStart = queue.size() - (collectCount - 1);
      int sortedIdxEnd = queue.size() + 1;
      final long[] sorted = queue.sort(collectCount);

      for (int i = sortedIdxStart; i < sortedIdxEnd; i++) {
        long pair = sorted[i];
        int c = (int) (pair >>> 32);
        int tnum = Integer.MAX_VALUE - (int) pair;
        final BytesRef term = si.lookupOrd(startTermIndex + tnum);
        ft.indexedToReadable(term, charsRef);
        res.add(charsRef.toString(), c);
      }

    } else {
      // add results in index order
      int i = (startTermIndex == -1) ? 1 : 0;
      if (mincount <= 0 && contains == null) {
        // if mincount<=0 and we're not examining the values for contains, then
        // we won't discard any terms and we know exactly where to start.
        i += off;
        off = 0;
      }

      for (; i < nTerms; i++) {
        int c = counts[i];
        if (c < mincount) continue;
        BytesRef term = null;
        if (contains != null) {
          term = si.lookupOrd(startTermIndex + i);
          if (!SimpleFacets.contains(term.utf8ToString(), contains, ignoreCase)) {
            continue;
          }
        }
        if (--off >= 0) continue;
        if (--lim < 0) break;
        if (term == null) {
          term = si.lookupOrd(startTermIndex + i);
        }
        ft.indexedToReadable(term, charsRef);
        res.add(charsRef.toString(), c);
      }
    }
  }

  return finalize(res, searcher, schemaField, docs, missingCount, missing);
}
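The count-sorted branch above packs each candidate into a single long so the priority queue can order entries by count and break ties by term index. A small, self-contained illustration with hypothetical numbers of how the encoding and decoding round-trip:

public class FacetPairPackingDemo {
  public static void main(String[] args) {
    int count = 42;    // facet count (high 32 bits)
    int termIndex = 7; // term number within the collected range (low 32 bits, inverted)
    long pair = (((long) count) << 32) + (Integer.MAX_VALUE - termIndex);

    // decoding mirrors the loop over queue.sort(collectCount) above
    int decodedCount = (int) (pair >>> 32);
    int decodedTerm = Integer.MAX_VALUE - (int) pair;

    System.out.println(decodedCount == count);    // true
    System.out.println(decodedTerm == termIndex); // true
    // larger counts yield larger packed longs; for equal counts, smaller term indexes do
  }
}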