public void testDocValues() throws IOException {
  assertU(adoc("id", "1"));
  assertU(commit());
  try (SolrCore core = h.getCoreInc()) {
    final RefCounted<SolrIndexSearcher> searcherRef = core.openNewSearcher(true, true);
    final SolrIndexSearcher searcher = searcherRef.get();
    try {
      final LeafReader reader = searcher.getLeafReader();
      assertEquals(1, reader.numDocs());
      final FieldInfos infos = reader.getFieldInfos();
      assertEquals(DocValuesType.NUMERIC, infos.fieldInfo("floatdv").getDocValuesType());
      assertEquals(DocValuesType.NUMERIC, infos.fieldInfo("intdv").getDocValuesType());
      assertEquals(DocValuesType.NUMERIC, infos.fieldInfo("doubledv").getDocValuesType());
      assertEquals(DocValuesType.NUMERIC, infos.fieldInfo("longdv").getDocValuesType());
      assertEquals(DocValuesType.SORTED, infos.fieldInfo("stringdv").getDocValuesType());

      assertEquals((long) Float.floatToIntBits(1), reader.getNumericDocValues("floatdv").get(0));
      assertEquals(2L, reader.getNumericDocValues("intdv").get(0));
      assertEquals(Double.doubleToLongBits(3), reader.getNumericDocValues("doubledv").get(0));
      assertEquals(4L, reader.getNumericDocValues("longdv").get(0));

      final IndexSchema schema = core.getLatestSchema();
      final SchemaField floatDv = schema.getField("floatdv");
      final SchemaField intDv = schema.getField("intdv");
      final SchemaField doubleDv = schema.getField("doubledv");
      final SchemaField longDv = schema.getField("longdv");

      FunctionValues values =
          floatDv
              .getType()
              .getValueSource(floatDv, null)
              .getValues(null, searcher.getLeafReader().leaves().get(0));
      assertEquals(1f, values.floatVal(0), 0f);
      assertEquals(1f, values.objectVal(0));
      values =
          intDv
              .getType()
              .getValueSource(intDv, null)
              .getValues(null, searcher.getLeafReader().leaves().get(0));
      assertEquals(2, values.intVal(0));
      assertEquals(2, values.objectVal(0));
      values =
          doubleDv
              .getType()
              .getValueSource(doubleDv, null)
              .getValues(null, searcher.getLeafReader().leaves().get(0));
      assertEquals(3d, values.doubleVal(0), 0d);
      assertEquals(3d, values.objectVal(0));
      values =
          longDv
              .getType()
              .getValueSource(longDv, null)
              .getValues(null, searcher.getLeafReader().leaves().get(0));
      assertEquals(4L, values.longVal(0));
      assertEquals(4L, values.objectVal(0));
    } finally {
      searcherRef.decref();
    }
  }
}
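For context, the numeric assertions above work because Lucene NUMERIC doc values store float and double values as their raw IEEE-754 bit patterns widened to a long, which is why the test compares against Float.floatToIntBits and Double.doubleToLongBits. A minimal stand-alone sketch of that round trip, using only JDK calls and not part of the original test:

// Illustrative only: decoding the bit patterns that NUMERIC doc values return.
long floatBits = Float.floatToIntBits(1f);                   // what getNumericDocValues("floatdv").get(0) yields
float decodedFloat = Float.intBitsToFloat((int) floatBits);  // 1.0f again

long doubleBits = Double.doubleToLongBits(3d);               // what getNumericDocValues("doubledv").get(0) yields
double decodedDouble = Double.longBitsToDouble(doubleBits);  // 3.0d again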
public void writeResponse() throws IOException {
  Boolean omitHeader = req.getParams().getBool(CommonParams.OMIT_HEADER);
  if (omitHeader != null && omitHeader) rsp.getValues().remove("responseHeader");
  SolrIndexSearcher searcher = req.getSearcher();
  if (liveDocs == null) {
    liveDocs = searcher.getLeafReader().getLiveDocs();
  }
  int maxDoc = searcher.maxDoc();
  try {
    // responseWriter.write(sw,req,rsp);
    ReturnFields fields = rsp.getReturnFields(); // return everything
    Set<String> fnames = fields.getLuceneFieldNames();
    int docCounter = 0;
    for (int i = 0; i < maxDoc; i++) {
      if (liveDocs != null && !liveDocs.get(i)) {
        continue;
      }
      Document doc = searcher.doc(i);
      SolrDocument sdoc = toSolrDocument(doc, schema);
      writeSolrDocument(null, sdoc, fields, docCounter++);
      getWriter().write("\n");
    }
  } finally {
    close();
    writer.write('\n'); // ending with a newline looks much better from the command line
    writer.close();
  }
}
public static Terms getTermVector(int docId, String fieldname, SolrIndexSearcher solrIndexSearcher)
    throws JATEException {
  try {
    Terms vector = solrIndexSearcher.getLeafReader().getTermVector(docId, fieldname);
    if (vector == null) throw new JATEException("Cannot find expected field: " + fieldname);
    return vector;
  } catch (IOException ioe) {
    StringBuilder sb =
        new StringBuilder(
            String.format("Cannot find expected field: %s. Error stacktrace:\n", fieldname));
    sb.append(org.apache.commons.lang.exception.ExceptionUtils.getFullStackTrace(ioe));
    throw new JATEException(sb.toString());
  }
}
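A hedged sketch of how the Terms object returned by this helper might be consumed; the variables docId, field, and searcher are assumptions for illustration and are not part of the utility above:

// Illustrative only: iterate the per-document term vector returned by the helper.
Terms vector = getTermVector(docId, field, searcher);
TermsEnum termsEnum = vector.iterator();
BytesRef term;
while ((term = termsEnum.next()) != null) {
  // term text and its frequency within this single document
  String text = term.utf8ToString();
  long freq = termsEnum.totalTermFreq();
  System.out.println(text + " -> " + freq);
}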
/**
 * Returns a list of terms in the specified field along with the corresponding count of documents
 * in the set that match that constraint. This method uses the FilterCache to get the intersection
 * count between <code>docs</code> and the DocSet for each term in the filter.
 *
 * @see FacetParams#FACET_LIMIT
 * @see FacetParams#FACET_ZEROS
 * @see FacetParams#FACET_MISSING
 */
public NamedList<Integer> getFacetTermEnumCounts(
    SolrIndexSearcher searcher,
    DocSet docs,
    String field,
    int offset,
    int limit,
    int mincount,
    boolean missing,
    String sort,
    String prefix,
    String contains,
    boolean ignoreCase,
    SolrParams params)
    throws IOException {

  /* :TODO: potential optimization...
   * cache the Terms with the highest docFreq and try them first
   * don't enum if we get our max from them
   */

  // Minimum term docFreq in order to use the filterCache for that term.
  int minDfFilterCache = global.getFieldInt(field, FacetParams.FACET_ENUM_CACHE_MINDF, 0);

  // make sure we have a set that is fast for random access, if we will use it for that
  DocSet fastForRandomSet = docs;
  if (minDfFilterCache > 0 && docs instanceof SortedIntDocSet) {
    SortedIntDocSet sset = (SortedIntDocSet) docs;
    fastForRandomSet = new HashDocSet(sset.getDocs(), 0, sset.size());
  }

  IndexSchema schema = searcher.getSchema();
  LeafReader r = searcher.getLeafReader();
  FieldType ft = schema.getFieldType(field);

  boolean sortByCount = sort.equals("count") || sort.equals("true");
  final int maxsize = limit >= 0 ? offset + limit : Integer.MAX_VALUE - 1;
  final BoundedTreeSet<CountPair<BytesRef, Integer>> queue =
      sortByCount ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(maxsize) : null;
  final NamedList<Integer> res = new NamedList<>();

  int min = mincount - 1; // the smallest value in the top 'N' values
  int off = offset;
  int lim = limit >= 0 ? limit : Integer.MAX_VALUE;

  BytesRef prefixTermBytes = null;
  if (prefix != null) {
    String indexedPrefix = ft.toInternal(prefix);
    prefixTermBytes = new BytesRef(indexedPrefix);
  }

  Fields fields = r.fields();
  Terms terms = fields == null ? null : fields.terms(field);
  TermsEnum termsEnum = null;
  SolrIndexSearcher.DocsEnumState deState = null;
  BytesRef term = null;
  if (terms != null) {
    termsEnum = terms.iterator();

    // TODO: OPT: if seek(ord) is supported for this termsEnum, then we could use it for
    // facet.offset when sorting by index order.

    if (prefixTermBytes != null) {
      if (termsEnum.seekCeil(prefixTermBytes) == TermsEnum.SeekStatus.END) {
        termsEnum = null;
      } else {
        term = termsEnum.term();
      }
    } else {
      // position termsEnum on first term
      term = termsEnum.next();
    }
  }

  PostingsEnum postingsEnum = null;
  CharsRefBuilder charsRef = new CharsRefBuilder();

  if (docs.size() >= mincount) {
    while (term != null) {

      if (prefixTermBytes != null && !StringHelper.startsWith(term, prefixTermBytes)) break;

      if (contains == null || contains(term.utf8ToString(), contains, ignoreCase)) {
        int df = termsEnum.docFreq();

        // If we are sorting, we can use df>min (rather than >=) since we
        // are going in index order. For certain term distributions this can
        // make a large difference (for example, many terms with df=1).
        if (df > 0 && df > min) {
          int c;

          if (df >= minDfFilterCache) {
            // use the filter cache
            if (deState == null) {
              deState = new SolrIndexSearcher.DocsEnumState();
              deState.fieldName = field;
              deState.liveDocs = r.getLiveDocs();
              deState.termsEnum = termsEnum;
              deState.postingsEnum = postingsEnum;
            }

            c = searcher.numDocs(docs, deState);

            postingsEnum = deState.postingsEnum;
          } else {
            // iterate over TermDocs to calculate the intersection

            // TODO: specialize when base docset is a bitset or hash set (skipDocs)? or does it
            // matter for this?
            // TODO: do this per-segment for better efficiency (MultiDocsEnum just uses base class
            // impl)
            // TODO: would passing deleted docs lead to better efficiency over checking the
            // fastForRandomSet?
            postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
            c = 0;

            if (postingsEnum instanceof MultiPostingsEnum) {
              MultiPostingsEnum.EnumWithSlice[] subs = ((MultiPostingsEnum) postingsEnum).getSubs();
              int numSubs = ((MultiPostingsEnum) postingsEnum).getNumSubs();
              for (int subindex = 0; subindex < numSubs; subindex++) {
                MultiPostingsEnum.EnumWithSlice sub = subs[subindex];
                if (sub.postingsEnum == null) continue;
                int base = sub.slice.start;
                int docid;
                while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                  if (fastForRandomSet.exists(docid + base)) c++;
                }
              }
            } else {
              int docid;
              while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                if (fastForRandomSet.exists(docid)) c++;
              }
            }
          }

          if (sortByCount) {
            if (c > min) {
              BytesRef termCopy = BytesRef.deepCopyOf(term);
              queue.add(new CountPair<>(termCopy, c));
              if (queue.size() >= maxsize) min = queue.last().val;
            }
          } else {
            if (c >= mincount && --off < 0) {
              if (--lim < 0) break;
              ft.indexedToReadable(term, charsRef);
              res.add(charsRef.toString(), c);
            }
          }
        }
      }
      term = termsEnum.next();
    }
  }

  if (sortByCount) {
    for (CountPair<BytesRef, Integer> p : queue) {
      if (--off >= 0) continue;
      if (--lim < 0) break;
      ft.indexedToReadable(p.key, charsRef);
      res.add(charsRef.toString(), p.val);
    }
  }

  if (missing) {
    res.add(null, getFieldMissingCount(searcher, docs, field));
  }

  return res;
}
public static NamedList<Integer> getCounts(
    SolrIndexSearcher searcher,
    DocSet docs,
    String fieldName,
    int offset,
    int limit,
    int mincount,
    boolean missing,
    String sort,
    String prefix,
    String contains,
    boolean ignoreCase)
    throws IOException {
  SchemaField schemaField = searcher.getSchema().getField(fieldName);
  FieldType ft = schemaField.getType();
  NamedList<Integer> res = new NamedList<>();

  // TODO: remove multiValuedFieldCache(), check dv type / uninversion type?
  final boolean multiValued = schemaField.multiValued() || ft.multiValuedFieldCache();

  final SortedSetDocValues si; // for term lookups only
  OrdinalMap ordinalMap = null; // for mapping per-segment ords to global ones
  if (multiValued) {
    si = searcher.getLeafReader().getSortedSetDocValues(fieldName);
    if (si instanceof MultiSortedSetDocValues) {
      ordinalMap = ((MultiSortedSetDocValues) si).mapping;
    }
  } else {
    SortedDocValues single = searcher.getLeafReader().getSortedDocValues(fieldName);
    si = single == null ? null : DocValues.singleton(single);
    if (single instanceof MultiSortedDocValues) {
      ordinalMap = ((MultiSortedDocValues) single).mapping;
    }
  }
  if (si == null) {
    return finalize(res, searcher, schemaField, docs, -1, missing);
  }
  if (si.getValueCount() >= Integer.MAX_VALUE) {
    throw new UnsupportedOperationException(
        "Currently this faceting method is limited to " + Integer.MAX_VALUE + " unique terms");
  }

  final BytesRefBuilder prefixRef;
  if (prefix == null) {
    prefixRef = null;
  } else if (prefix.length() == 0) {
    prefix = null;
    prefixRef = null;
  } else {
    prefixRef = new BytesRefBuilder();
    prefixRef.copyChars(prefix);
  }

  int startTermIndex, endTermIndex;
  if (prefix != null) {
    startTermIndex = (int) si.lookupTerm(prefixRef.get());
    if (startTermIndex < 0) startTermIndex = -startTermIndex - 1;
    prefixRef.append(UnicodeUtil.BIG_TERM);
    endTermIndex = (int) si.lookupTerm(prefixRef.get());
    assert endTermIndex < 0;
    endTermIndex = -endTermIndex - 1;
  } else {
    startTermIndex = -1;
    endTermIndex = (int) si.getValueCount();
  }

  final int nTerms = endTermIndex - startTermIndex;
  int missingCount = -1;
  final CharsRefBuilder charsRef = new CharsRefBuilder();
  if (nTerms > 0 && docs.size() >= mincount) {

    // count collection array only needs to be as big as the number of terms we are
    // going to collect counts for.
    final int[] counts = new int[nTerms];

    Filter filter = docs.getTopFilter();
    List<LeafReaderContext> leaves = searcher.getTopReaderContext().leaves();
    for (int subIndex = 0; subIndex < leaves.size(); subIndex++) {
      LeafReaderContext leaf = leaves.get(subIndex);
      DocIdSet dis = filter.getDocIdSet(leaf, null); // solr docsets already exclude any deleted docs
      DocIdSetIterator disi = null;
      if (dis != null) {
        disi = dis.iterator();
      }
      if (disi != null) {
        if (multiValued) {
          SortedSetDocValues sub = leaf.reader().getSortedSetDocValues(fieldName);
          if (sub == null) {
            sub = DocValues.emptySortedSet();
          }
          final SortedDocValues singleton = DocValues.unwrapSingleton(sub);
          if (singleton != null) {
            // some codecs may optimize SORTED_SET storage for single-valued fields
            accumSingle(counts, startTermIndex, singleton, disi, subIndex, ordinalMap);
          } else {
            accumMulti(counts, startTermIndex, sub, disi, subIndex, ordinalMap);
          }
        } else {
          SortedDocValues sub = leaf.reader().getSortedDocValues(fieldName);
          if (sub == null) {
            sub = DocValues.emptySorted();
          }
          accumSingle(counts, startTermIndex, sub, disi, subIndex, ordinalMap);
        }
      }
    }

    if (startTermIndex == -1) {
      missingCount = counts[0];
    }

    // IDEA: we could also maintain a count of "other"... everything that fell outside
    // of the top 'N'

    int off = offset;
    int lim = limit >= 0 ? limit : Integer.MAX_VALUE;

    if (sort.equals(FacetParams.FACET_SORT_COUNT)
        || sort.equals(FacetParams.FACET_SORT_COUNT_LEGACY)) {
      int maxsize = limit > 0 ? offset + limit : Integer.MAX_VALUE - 1;
      maxsize = Math.min(maxsize, nTerms);
      LongPriorityQueue queue =
          new LongPriorityQueue(Math.min(maxsize, 1000), maxsize, Long.MIN_VALUE);

      int min = mincount - 1; // the smallest value in the top 'N' values
      for (int i = (startTermIndex == -1) ? 1 : 0; i < nTerms; i++) {
        int c = counts[i];
        if (contains != null) {
          final BytesRef term = si.lookupOrd(startTermIndex + i);
          if (!SimpleFacets.contains(term.utf8ToString(), contains, ignoreCase)) {
            continue;
          }
        }
        if (c > min) {
          // NOTE: we use c>min rather than c>=min as an optimization because we are going in
          // index order, so we already know that the keys are ordered. This can be very
          // important if a lot of the counts are repeated (like zero counts would be).

          // smaller term numbers sort higher, so subtract the term number instead
          long pair = (((long) c) << 32) + (Integer.MAX_VALUE - i);
          boolean displaced = queue.insert(pair);
          if (displaced) min = (int) (queue.top() >>> 32);
        }
      }

      // if we are deep paging, we don't have to order the highest "offset" counts.
      int collectCount = Math.max(0, queue.size() - off);
      assert collectCount <= lim;

      // the start and end indexes of our list "sorted" (starting with the highest value)
      int sortedIdxStart = queue.size() - (collectCount - 1);
      int sortedIdxEnd = queue.size() + 1;
      final long[] sorted = queue.sort(collectCount);

      for (int i = sortedIdxStart; i < sortedIdxEnd; i++) {
        long pair = sorted[i];
        int c = (int) (pair >>> 32);
        int tnum = Integer.MAX_VALUE - (int) pair;
        final BytesRef term = si.lookupOrd(startTermIndex + tnum);
        ft.indexedToReadable(term, charsRef);
        res.add(charsRef.toString(), c);
      }

    } else {
      // add results in index order
      int i = (startTermIndex == -1) ? 1 : 0;
      if (mincount <= 0 && contains == null) {
        // if mincount<=0 and we're not examining the values for contains, then
        // we won't discard any terms and we know exactly where to start.
        i += off;
        off = 0;
      }

      for (; i < nTerms; i++) {
        int c = counts[i];
        if (c < mincount) continue;
        BytesRef term = null;
        if (contains != null) {
          term = si.lookupOrd(startTermIndex + i);
          if (!SimpleFacets.contains(term.utf8ToString(), contains, ignoreCase)) {
            continue;
          }
        }
        if (--off >= 0) continue;
        if (--lim < 0) break;
        if (term == null) {
          term = si.lookupOrd(startTermIndex + i);
        }
        ft.indexedToReadable(term, charsRef);
        res.add(charsRef.toString(), c);
      }
    }
  }

  return finalize(res, searcher, schemaField, docs, missingCount, missing);
}
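The count-sorted branch above packs each (count, term ordinal) pair into a single long so the LongPriorityQueue can order them: the count sits in the high 32 bits and Integer.MAX_VALUE minus the ordinal in the low 32 bits, so equal counts tie-break toward smaller ordinals. A small stand-alone sketch of the same packing and unpacking, with illustrative values that are not part of the original method:

// Illustrative sketch of the bit packing used by the sort-by-count branch.
int c = 42; // term count
int i = 7;  // term index relative to startTermIndex
long pair = (((long) c) << 32) + (Integer.MAX_VALUE - i);

// unpack exactly as the loop over "sorted" does
int count = (int) (pair >>> 32);            // 42
int tnum = Integer.MAX_VALUE - (int) pair;  // 7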