/** * Returns a count of the documents in the set which do not have any terms for the specified * field. * * @see FacetParams#FACET_MISSING */ public static int getFieldMissingCount(SolrIndexSearcher searcher, DocSet docs, String fieldName) throws IOException { SchemaField sf = searcher.getSchema().getField(fieldName); DocSet hasVal = searcher.getDocSet(sf.getType().getRangeQuery(null, sf, null, null, false, false)); return docs.andNotSize(hasVal); }
static Query getFieldMissingQuery(SolrIndexSearcher searcher, String fieldName) throws IOException { SchemaField sf = searcher.getSchema().getField(fieldName); Query hasVal = sf.getType().getRangeQuery(null, sf, null, null, false, false); BooleanQuery.Builder noVal = new BooleanQuery.Builder(); noVal.add(hasVal, BooleanClause.Occur.MUST_NOT); return noVal.build(); }
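A note on the helper above: a BooleanQuery holding only a MUST_NOT clause matches nothing when executed on its own, so the positive side has to come from the caller or from Solr's DocSet machinery. Below is a minimal plain-Lucene sketch of the same "missing field" idea with an explicit MatchAllDocsQuery anchor; the field name and the newStringRange factory are illustrative assumptions, not part of the method above.

// Hedged plain-Lucene sketch (assumed Lucene 5/6-era API, hypothetical field name).
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermRangeQuery;

public class MissingFieldQueryExample {
  static Query missing(String field) {
    // Matches any document with at least one term in the field (open-ended range).
    Query hasVal = TermRangeQuery.newStringRange(field, null, null, false, false);
    return new BooleanQuery.Builder()
        .add(new MatchAllDocsQuery(), BooleanClause.Occur.MUST) // positive anchor clause
        .add(hasVal, BooleanClause.Occur.MUST_NOT)              // exclude docs that have a value
        .build();
  }
}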
@SuppressWarnings("unused") static DocSet getFieldMissing(SolrIndexSearcher searcher, DocSet docs, String fieldName) throws IOException { SchemaField sf = searcher.getSchema().getField(fieldName); DocSet hasVal = searcher.getDocSet(sf.getType().getRangeQuery(null, sf, null, null, false, false)); DocSet answer = docs.andNot(hasVal); // hasVal.decref(); // OFF-HEAP return answer; }
/** * Computes the term->count counts for the specified term values relative to the documents * in the given docset. * * @param field the name of the field to compute term counts against * @param parsed contains the docset to compute term counts relative to * @param terms a list of term values (in the specified field) to compute the counts for */ protected NamedList<Integer> getListedTermCounts( String field, final ParsedParams parsed, List<String> terms) throws IOException { FieldType ft = searcher.getSchema().getFieldType(field); NamedList<Integer> res = new NamedList<>(); for (String term : terms) { String internal = ft.toInternal(term); int count = searcher.numDocs(new TermQuery(new Term(field, internal)), parsed.docs); res.add(term, count); } return res; }
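For context, a hedged client-side sketch of how an explicit term list is usually requested: the {!terms=...} local parameter on facet.field. The field and term values below are made up, and the routing of that parameter to this method should be treated as an assumption, not a guarantee.

// Hedged SolrJ sketch (hypothetical field "category" and term values).
import org.apache.solr.client.solrj.SolrQuery;

public class ListedTermCountsRequestExample {
  public static void main(String[] args) {
    SolrQuery q = new SolrQuery("*:*");
    q.setFacet(true);
    // Assumption: the {!terms=...} local parameter asks for counts of exactly these terms.
    q.add("facet.field", "{!terms='books,music'}category");
    System.out.println(q);
  }
}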
public static SolrReaderSetScorer createReaderSetScorer( Weight weight, AtomicReaderContext context, Bits acceptDocs, SolrIndexSearcher searcher, String authorities, AtomicReader reader) throws IOException { DocSet readableDocSet = (DocSet) searcher.cacheLookup(CacheConstants.ALFRESCO_READER_CACHE, authorities); if (readableDocSet == null) { String[] auths = authorities.substring(1).split(authorities.substring(0, 1)); readableDocSet = new BitDocSet(new FixedBitSet(searcher.maxDoc())); BooleanQuery bQuery = new BooleanQuery(); for (String current : auths) { bQuery.add(new TermQuery(new Term(QueryConstants.FIELD_READER, current)), Occur.SHOULD); } DocSet aclDocs = searcher.getDocSet(bQuery); BooleanQuery aQuery = new BooleanQuery(); for (DocIterator it = aclDocs.iterator(); it.hasNext(); /**/ ) { int docID = it.nextDoc(); // Obtain the ACL ID for this ACL doc. long aclID = searcher.getAtomicReader().getNumericDocValues(QueryConstants.FIELD_ACLID).get(docID); SchemaField schemaField = searcher.getSchema().getField(QueryConstants.FIELD_ACLID); Query query = schemaField.getType().getFieldQuery(null, schemaField, Long.toString(aclID)); aQuery.add(query, Occur.SHOULD); if ((aQuery.clauses().size() > 999) || !it.hasNext()) { DocSet docsForAclId = searcher.getDocSet(aQuery); readableDocSet = readableDocSet.union(docsForAclId); aQuery = new BooleanQuery(); } } // Exclude the ACL docs from the results, we only want real docs that match. // Probably not very efficient, what we really want is remove(docID) readableDocSet = readableDocSet.andNot(aclDocs); searcher.cacheInsert(CacheConstants.ALFRESCO_READER_CACHE, authorities, readableDocSet); } // TODO: cache the full set? e.g. searcher.cacheInsert(CacheConstants.ALFRESCO_READERSET_CACHE, // authorities, readableDocSet) // plus check of course, for presence in cache at start of method. return new SolrReaderSetScorer(weight, readableDocSet, context, acceptDocs, searcher); }
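The authorities argument above uses a "first character is the delimiter" convention. Below is a standalone sketch of that parsing step; it quotes the delimiter with Pattern.quote as a defensive variation (the method above passes the raw character straight to String.split), and the authority values are hypothetical.

import java.util.Arrays;
import java.util.regex.Pattern;

public class AuthorityStringExample {
  // First character is the delimiter; the remainder is the delimited authority list.
  static String[] parseAuthorities(String authorities) {
    String delimiter = authorities.substring(0, 1);
    return authorities.substring(1).split(Pattern.quote(delimiter));
  }

  public static void main(String[] args) {
    System.out.println(Arrays.toString(parseAuthorities("|GROUP_EVERYONE|jsmith|ROLE_ADMIN")));
    // prints: [GROUP_EVERYONE, jsmith, ROLE_ADMIN]
  }
}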
/** * Generates a list of highlighted query fragments for each item in a list of documents, or * returns null if highlighting is disabled. * * @param docs query results * @param query the query * @param req the current request * @param defaultFields default list of fields to summarize * @return NamedList containing a NamedList for each document, which in turn contains * (field, summary) pairs. */ @Override @SuppressWarnings("unchecked") public NamedList<Object> doHighlighting( DocList docs, Query query, SolrQueryRequest req, String[] defaultFields) throws IOException { SolrParams params = req.getParams(); if (!isHighlightingEnabled(params)) return null; SolrIndexSearcher searcher = req.getSearcher(); IndexSchema schema = searcher.getSchema(); NamedList fragments = new SimpleOrderedMap(); String[] fieldNames = getHighlightFields(query, req, defaultFields); Set<String> fset = new HashSet<String>(); { // pre-fetch documents using the Searcher's doc cache for (String f : fieldNames) { fset.add(f); } // fetch unique key if one exists. SchemaField keyField = schema.getUniqueKeyField(); if (null != keyField) fset.add(keyField.getName()); } // get FastVectorHighlighter instance out of the processing loop FastVectorHighlighter fvh = new FastVectorHighlighter( // FVH cannot process the hl.usePhraseHighlighter parameter on a per-field basis params.getBool(HighlightParams.USE_PHRASE_HIGHLIGHTER, true), // FVH cannot process the hl.requireFieldMatch parameter on a per-field basis params.getBool(HighlightParams.FIELD_MATCH, false)); fvh.setPhraseLimit(params.getInt(HighlightParams.PHRASE_LIMIT, Integer.MAX_VALUE)); FieldQuery fieldQuery = fvh.getFieldQuery(query, searcher.getIndexReader()); // Highlight each document DocIterator iterator = docs.iterator(); for (int i = 0; i < docs.size(); i++) { int docId = iterator.nextDoc(); Document doc = searcher.doc(docId, fset); NamedList docSummaries = new SimpleOrderedMap(); for (String fieldName : fieldNames) { fieldName = fieldName.trim(); if (useFastVectorHighlighter(params, schema, fieldName)) doHighlightingByFastVectorHighlighter( fvh, fieldQuery, req, docSummaries, docId, doc, fieldName); else doHighlightingByHighlighter(query, req, docSummaries, docId, doc, fieldName); } String printId = schema.printableUniqueKey(doc); fragments.add(printId == null ? null : printId, docSummaries); } return fragments; }
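A hedged client-side sketch of the request parameters doHighlighting reads (hl, hl.fl, hl.snippets, hl.usePhraseHighlighter, hl.requireFieldMatch, hl.phraseLimit); the query string and field names are illustrative only.

import org.apache.solr.client.solrj.SolrQuery;

public class HighlightRequestExample {
  public static void main(String[] args) {
    SolrQuery q = new SolrQuery("title:lucene");   // hypothetical query
    q.setHighlight(true);                          // hl=true
    q.addHighlightField("content");                // hl.fl=content
    q.setHighlightSnippets(3);                     // hl.snippets=3
    q.set("hl.usePhraseHighlighter", true);        // read when building the FastVectorHighlighter
    q.set("hl.requireFieldMatch", false);
    q.set("hl.phraseLimit", 5000);
    System.out.println(q);
  }
}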
private void mockDocument(int docId, double price, double discount, boolean isCloseout) throws IOException { Document doc = PowerMockito.mock(Document.class); when(searcher.doc(eq(docId), any(Set.class))).thenReturn(doc); when(searcher.getSchema()).thenReturn(schema); IndexableField priceField = mock(IndexableField.class); when(doc.getField(FIELD_PRICE)).thenReturn(priceField); when(priceField.numericValue()).thenReturn(new Double(price)); IndexableField discountField = mock(IndexableField.class); when(doc.getField(FIELD_DISCOUNT)).thenReturn(discountField); when(discountField.numericValue()).thenReturn(new Double(discount)); IndexableField closeoutField = mock(IndexableField.class); when(doc.getField(FIELD_CLOSEOUT)).thenReturn(closeoutField); when(closeoutField.stringValue()).thenReturn(isCloseout ? "T" : "F"); }
@Override public void process(ResponseBuilder rb) throws IOException { SolrParams params = rb.req.getParams(); if (!params.getBool(COMPONENT_NAME, false)) return; SolrIndexSearcher searcher = rb.req.getSearcher(); IndexSchema schema = searcher.getSchema(); if (schema.getUniqueKeyField() == null) return; ResultContext rc = (ResultContext) rb.rsp.getResponse(); DocList docs = rc.getDocList(); if (docs.hasScores()) { processScores(rb, docs, schema, searcher); } else { processIds(rb, docs, schema, searcher); } }
@Override public FunctionValues getValues(Map context, LeafReaderContext readerContext) throws IOException { final int off = readerContext.docBase; final LeafReader r; Object o = context.get("searcher"); if (o instanceof SolrIndexSearcher) { SolrIndexSearcher is = (SolrIndexSearcher) o; SchemaField sf = is.getSchema().getFieldOrNull(field); if (sf != null && sf.hasDocValues() == false && sf.multiValued() == false && sf.getType().getNumericType() != null) { // it's a single-valued numeric field: we must currently create insanity :( List<LeafReaderContext> leaves = is.getIndexReader().leaves(); LeafReader insaneLeaves[] = new LeafReader[leaves.size()]; int upto = 0; for (LeafReaderContext raw : leaves) { insaneLeaves[upto++] = Insanity.wrapInsanity(raw.reader(), field); } r = SlowCompositeReaderWrapper.wrap(new MultiReader(insaneLeaves)); } else { // reuse ordinalmap r = ((SolrIndexSearcher) o).getLeafReader(); } } else { IndexReader topReader = ReaderUtil.getTopLevelContext(readerContext).reader(); r = SlowCompositeReaderWrapper.wrap(topReader); } // if it's e.g. tokenized/multivalued, emulate old behavior of single-valued fc final SortedDocValues sindex = SortedSetSelector.wrap(DocValues.getSortedSet(r, field), SortedSetSelector.Type.MIN); final int end = sindex.getValueCount(); return new IntDocValues(this) { @Override public int intVal(int doc) { return (end - sindex.getOrd(doc + off) - 1); } }; }
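The intVal() above returns end - ord - 1, i.e. it reverses the term ordinal against the value count, so an ascending sort on the returned value orders terms descending. A tiny standalone illustration with made-up numbers:

public class ReverseOrdExample {
  public static void main(String[] args) {
    int valueCount = 4; // e.g. ordinals 0..3 for terms in ascending order
    for (int ord = 0; ord < valueCount; ord++) {
      System.out.println(ord + " -> " + (valueCount - ord - 1)); // 0->3, 1->2, 2->1, 3->0
    }
  }
}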
private void doHighlightingByHighlighter( Query query, SolrQueryRequest req, NamedList docSummaries, int docId, Document doc, String fieldName) throws IOException { SolrParams params = req.getParams(); String[] docTexts = doc.getValues(fieldName); // according to Document javadoc, doc.getValues() never returns null. check empty instead of // null if (docTexts.length == 0) return; SolrIndexSearcher searcher = req.getSearcher(); IndexSchema schema = searcher.getSchema(); TokenStream tstream = null; int numFragments = getMaxSnippets(fieldName, params); boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params); String[] summaries = null; List<TextFragment> frags = new ArrayList<TextFragment>(); TermOffsetsTokenStream tots = null; // to be non-null iff we're using TermOffsets optimization try { TokenStream tvStream = TokenSources.getTokenStream(searcher.getIndexReader(), docId, fieldName); if (tvStream != null) { tots = new TermOffsetsTokenStream(tvStream); } } catch (IllegalArgumentException e) { // No problem. But we can't use TermOffsets optimization. } for (int j = 0; j < docTexts.length; j++) { if (tots != null) { // if we're using TermOffsets optimization, then get the next // field value's TokenStream (i.e. get field j's TokenStream) from tots: tstream = tots.getMultiValuedTokenStream(docTexts[j].length()); } else { // fall back to analyzer tstream = createAnalyzerTStream(schema, fieldName, docTexts[j]); } Highlighter highlighter; if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true"))) { // TODO: this is not always necessary - eventually we would like to avoid this wrap // when it is not needed. tstream = new CachingTokenFilter(tstream); // get highlighter highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tstream); // after highlighter initialization, reset tstream since construction of highlighter already // used it tstream.reset(); } else { // use "the old way" highlighter = getHighlighter(query, fieldName, req); } int maxCharsToAnalyze = params.getFieldInt( fieldName, HighlightParams.MAX_CHARS, Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE); if (maxCharsToAnalyze < 0) { highlighter.setMaxDocCharsToAnalyze(docTexts[j].length()); } else { highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze); } try { TextFragment[] bestTextFragments = highlighter.getBestTextFragments( tstream, docTexts[j], mergeContiguousFragments, numFragments); for (int k = 0; k < bestTextFragments.length; k++) { if ((bestTextFragments[k] != null) && (bestTextFragments[k].getScore() > 0)) { frags.add(bestTextFragments[k]); } } } catch (InvalidTokenOffsetsException e) { throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } } // sort such that the fragments with the highest score come first Collections.sort( frags, new Comparator<TextFragment>() { public int compare(TextFragment arg0, TextFragment arg1) { return Math.round(arg1.getScore() - arg0.getScore()); } }); // convert fragments back into text // TODO: we can include score and position information in output as snippet attributes if (frags.size() > 0) { ArrayList<String> fragTexts = new ArrayList<String>(); for (TextFragment fragment : frags) { if ((fragment != null) && (fragment.getScore() > 0)) { fragTexts.add(fragment.toString()); } if (fragTexts.size() >= numFragments) break; } summaries = fragTexts.toArray(new String[0]); if (summaries.length > 0) docSummaries.add(fieldName, summaries); } // no summaries were made, copy text from alternate field if (summaries == null || 
summaries.length == 0) { alternateField(docSummaries, params, doc, fieldName); } }
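One detail worth noting in the fragment sort above: Math.round(b.getScore() - a.getScore()) maps score differences smaller than 0.5 to zero, so near-equal fragments compare as ties and keep their existing order. The sketch below is a stricter descending comparator using Float.compare; it is a variation for illustration, not the behavior of the code above.

import java.util.Comparator;
import org.apache.lucene.search.highlight.TextFragment;

public class FragmentOrderingExample {
  static final Comparator<TextFragment> BY_SCORE_DESC = new Comparator<TextFragment>() {
    @Override
    public int compare(TextFragment a, TextFragment b) {
      return Float.compare(b.getScore(), a.getScore()); // higher score first, strict ordering
    }
  };
}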
/** * Returns a list of terms in the specified field along with the corresponding count of documents * in the set that match that constraint. This method uses the FilterCache to get the intersection * count between <code>docs</code> and the DocSet for each term in the filter. * * @see FacetParams#FACET_LIMIT * @see FacetParams#FACET_ZEROS * @see FacetParams#FACET_MISSING */ public NamedList<Integer> getFacetTermEnumCounts( SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing, String sort, String prefix, String contains, boolean ignoreCase, SolrParams params) throws IOException { /* :TODO: potential optimization... * cache the Terms with the highest docFreq and try them first * don't enum if we get our max from them */ // Minimum term docFreq in order to use the filterCache for that term. int minDfFilterCache = global.getFieldInt(field, FacetParams.FACET_ENUM_CACHE_MINDF, 0); // make sure we have a set that is fast for random access, if we will use it for that DocSet fastForRandomSet = docs; if (minDfFilterCache > 0 && docs instanceof SortedIntDocSet) { SortedIntDocSet sset = (SortedIntDocSet) docs; fastForRandomSet = new HashDocSet(sset.getDocs(), 0, sset.size()); } IndexSchema schema = searcher.getSchema(); LeafReader r = searcher.getLeafReader(); FieldType ft = schema.getFieldType(field); boolean sortByCount = sort.equals("count") || sort.equals("true"); final int maxsize = limit >= 0 ? offset + limit : Integer.MAX_VALUE - 1; final BoundedTreeSet<CountPair<BytesRef, Integer>> queue = sortByCount ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(maxsize) : null; final NamedList<Integer> res = new NamedList<>(); int min = mincount - 1; // the smallest value in the top 'N' values int off = offset; int lim = limit >= 0 ? limit : Integer.MAX_VALUE; BytesRef prefixTermBytes = null; if (prefix != null) { String indexedPrefix = ft.toInternal(prefix); prefixTermBytes = new BytesRef(indexedPrefix); } Fields fields = r.fields(); Terms terms = fields == null ? null : fields.terms(field); TermsEnum termsEnum = null; SolrIndexSearcher.DocsEnumState deState = null; BytesRef term = null; if (terms != null) { termsEnum = terms.iterator(); // TODO: OPT: if seek(ord) is supported for this termsEnum, then we could use it for // facet.offset when sorting by index order. if (prefixTermBytes != null) { if (termsEnum.seekCeil(prefixTermBytes) == TermsEnum.SeekStatus.END) { termsEnum = null; } else { term = termsEnum.term(); } } else { // position termsEnum on first term term = termsEnum.next(); } } PostingsEnum postingsEnum = null; CharsRefBuilder charsRef = new CharsRefBuilder(); if (docs.size() >= mincount) { while (term != null) { if (prefixTermBytes != null && !StringHelper.startsWith(term, prefixTermBytes)) break; if (contains == null || contains(term.utf8ToString(), contains, ignoreCase)) { int df = termsEnum.docFreq(); // If we are sorting, we can use df>min (rather than >=) since we // are going in index order. For certain term distributions this can // make a large difference (for example, many terms with df=1). 
if (df > 0 && df > min) { int c; if (df >= minDfFilterCache) { // use the filter cache if (deState == null) { deState = new SolrIndexSearcher.DocsEnumState(); deState.fieldName = field; deState.liveDocs = r.getLiveDocs(); deState.termsEnum = termsEnum; deState.postingsEnum = postingsEnum; } c = searcher.numDocs(docs, deState); postingsEnum = deState.postingsEnum; } else { // iterate over TermDocs to calculate the intersection // TODO: specialize when base docset is a bitset or hash set (skipDocs)? or does it // matter for this? // TODO: do this per-segment for better efficiency (MultiDocsEnum just uses base class // impl) // TODO: would passing deleted docs lead to better efficiency over checking the // fastForRandomSet? postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE); c = 0; if (postingsEnum instanceof MultiPostingsEnum) { MultiPostingsEnum.EnumWithSlice[] subs = ((MultiPostingsEnum) postingsEnum).getSubs(); int numSubs = ((MultiPostingsEnum) postingsEnum).getNumSubs(); for (int subindex = 0; subindex < numSubs; subindex++) { MultiPostingsEnum.EnumWithSlice sub = subs[subindex]; if (sub.postingsEnum == null) continue; int base = sub.slice.start; int docid; while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { if (fastForRandomSet.exists(docid + base)) c++; } } } else { int docid; while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { if (fastForRandomSet.exists(docid)) c++; } } } if (sortByCount) { if (c > min) { BytesRef termCopy = BytesRef.deepCopyOf(term); queue.add(new CountPair<>(termCopy, c)); if (queue.size() >= maxsize) min = queue.last().val; } } else { if (c >= mincount && --off < 0) { if (--lim < 0) break; ft.indexedToReadable(term, charsRef); res.add(charsRef.toString(), c); } } } } term = termsEnum.next(); } } if (sortByCount) { for (CountPair<BytesRef, Integer> p : queue) { if (--off >= 0) continue; if (--lim < 0) break; ft.indexedToReadable(p.key, charsRef); res.add(charsRef.toString(), p.val); } } if (missing) { res.add(null, getFieldMissingCount(searcher, docs, field)); } return res; }
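The non-cached branch above walks a term's postings and counts how many of its documents fall inside the base set via fastForRandomSet.exists(docid). A self-contained simulation of that counting step (plain Java, made-up doc ids standing in for postings):

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class TermIntersectionCountExample {
  static int count(int[] postingsForTerm, Set<Integer> fastForRandomSet) {
    int c = 0;
    for (int docId : postingsForTerm) {
      if (fastForRandomSet.contains(docId)) c++; // analogous to fastForRandomSet.exists(docid)
    }
    return c;
  }

  public static void main(String[] args) {
    Set<Integer> baseDocs = new HashSet<>(Arrays.asList(2, 3, 5, 8, 13));
    int[] postingsForTerm = {1, 2, 3, 8, 21};
    System.out.println(count(postingsForTerm, baseDocs)); // 3
  }
}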
public static NamedList<Integer> getCounts( SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort, String prefix) throws IOException { SchemaField schemaField = searcher.getSchema().getField(fieldName); FieldType ft = schemaField.getType(); NamedList<Integer> res = new NamedList<Integer>(); final SortedSetDocValues si; // for term lookups only OrdinalMap ordinalMap = null; // for mapping per-segment ords to global ones if (schemaField.multiValued()) { si = searcher.getAtomicReader().getSortedSetDocValues(fieldName); if (si instanceof MultiSortedSetDocValues) { ordinalMap = ((MultiSortedSetDocValues) si).mapping; } } else { SortedDocValues single = searcher.getAtomicReader().getSortedDocValues(fieldName); si = single == null ? null : new SingletonSortedSetDocValues(single); if (single instanceof MultiSortedDocValues) { ordinalMap = ((MultiSortedDocValues) single).mapping; } } if (si == null) { return finalize(res, searcher, schemaField, docs, -1, missing); } if (si.getValueCount() >= Integer.MAX_VALUE) { throw new UnsupportedOperationException( "Currently this faceting method is limited to " + Integer.MAX_VALUE + " unique terms"); } final BytesRef br = new BytesRef(); final BytesRef prefixRef; if (prefix == null) { prefixRef = null; } else if (prefix.length() == 0) { prefix = null; prefixRef = null; } else { prefixRef = new BytesRef(prefix); } int startTermIndex, endTermIndex; if (prefix != null) { startTermIndex = (int) si.lookupTerm(prefixRef); if (startTermIndex < 0) startTermIndex = -startTermIndex - 1; prefixRef.append(UnicodeUtil.BIG_TERM); endTermIndex = (int) si.lookupTerm(prefixRef); assert endTermIndex < 0; endTermIndex = -endTermIndex - 1; } else { startTermIndex = -1; endTermIndex = (int) si.getValueCount(); } final int nTerms = endTermIndex - startTermIndex; int missingCount = -1; final CharsRef charsRef = new CharsRef(10); if (nTerms > 0 && docs.size() >= mincount) { // count collection array only needs to be as big as the number of terms we are // going to collect counts for. final int[] counts = new int[nTerms]; Filter filter = docs.getTopFilter(); List<AtomicReaderContext> leaves = searcher.getTopReaderContext().leaves(); for (int subIndex = 0; subIndex < leaves.size(); subIndex++) { AtomicReaderContext leaf = leaves.get(subIndex); DocIdSet dis = filter.getDocIdSet(leaf, null); // solr docsets already exclude any deleted docs DocIdSetIterator disi = null; if (dis != null) { disi = dis.iterator(); } if (disi != null) { if (schemaField.multiValued()) { SortedSetDocValues sub = leaf.reader().getSortedSetDocValues(fieldName); if (sub == null) { sub = SortedSetDocValues.EMPTY; } if (sub instanceof SingletonSortedSetDocValues) { // some codecs may optimize SORTED_SET storage for single-valued fields final SortedDocValues values = ((SingletonSortedSetDocValues) sub).getSortedDocValues(); accumSingle(counts, startTermIndex, values, disi, subIndex, ordinalMap); } else { accumMulti(counts, startTermIndex, sub, disi, subIndex, ordinalMap); } } else { SortedDocValues sub = leaf.reader().getSortedDocValues(fieldName); if (sub == null) { sub = SortedDocValues.EMPTY; } accumSingle(counts, startTermIndex, sub, disi, subIndex, ordinalMap); } } } if (startTermIndex == -1) { missingCount = counts[0]; } // IDEA: we could also maintain a count of "other"... everything that fell outside // of the top 'N' int off = offset; int lim = limit >= 0 ? 
limit : Integer.MAX_VALUE; if (sort.equals(FacetParams.FACET_SORT_COUNT) || sort.equals(FacetParams.FACET_SORT_COUNT_LEGACY)) { int maxsize = limit > 0 ? offset + limit : Integer.MAX_VALUE - 1; maxsize = Math.min(maxsize, nTerms); LongPriorityQueue queue = new LongPriorityQueue(Math.min(maxsize, 1000), maxsize, Long.MIN_VALUE); int min = mincount - 1; // the smallest value in the top 'N' values for (int i = (startTermIndex == -1) ? 1 : 0; i < nTerms; i++) { int c = counts[i]; if (c > min) { // NOTE: we use c>min rather than c>=min as an optimization because we are going in // index order, so we already know that the keys are ordered. This can be very // important if a lot of the counts are repeated (like zero counts would be). // smaller term numbers sort higher, so subtract the term number instead long pair = (((long) c) << 32) + (Integer.MAX_VALUE - i); boolean displaced = queue.insert(pair); if (displaced) min = (int) (queue.top() >>> 32); } } // if we are deep paging, we don't have to order the highest "offset" counts. int collectCount = Math.max(0, queue.size() - off); assert collectCount <= lim; // the start and end indexes of our list "sorted" (starting with the highest value) int sortedIdxStart = queue.size() - (collectCount - 1); int sortedIdxEnd = queue.size() + 1; final long[] sorted = queue.sort(collectCount); for (int i = sortedIdxStart; i < sortedIdxEnd; i++) { long pair = sorted[i]; int c = (int) (pair >>> 32); int tnum = Integer.MAX_VALUE - (int) pair; si.lookupOrd(startTermIndex + tnum, br); ft.indexedToReadable(br, charsRef); res.add(charsRef.toString(), c); } } else { // add results in index order int i = (startTermIndex == -1) ? 1 : 0; if (mincount <= 0) { // if mincount<=0, then we won't discard any terms and we know exactly // where to start. i += off; off = 0; } for (; i < nTerms; i++) { int c = counts[i]; if (c < mincount || --off >= 0) continue; if (--lim < 0) break; si.lookupOrd(startTermIndex + i, br); ft.indexedToReadable(br, charsRef); res.add(charsRef.toString(), c); } } } return finalize(res, searcher, schemaField, docs, missingCount, missing); }
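The sort-by-count branch above packs each (count, term number) pair into one long for the LongPriorityQueue: the count goes in the high 32 bits and Integer.MAX_VALUE - termNum in the low 32 bits, so larger counts win and ties go to the smaller (earlier) term number. A standalone demonstration of the packing and unpacking, with illustrative values:

public class CountOrdPackingExample {
  static long pack(int count, int termNum) {
    return (((long) count) << 32) + (Integer.MAX_VALUE - termNum);
  }

  static int unpackCount(long pair) {
    return (int) (pair >>> 32);
  }

  static int unpackTermNum(long pair) {
    return Integer.MAX_VALUE - (int) pair;
  }

  public static void main(String[] args) {
    long a = pack(7, 3);
    long b = pack(7, 9);
    long c = pack(12, 50);
    System.out.println(c > a && a > b); // true: higher count wins, then lower term number
    System.out.println(unpackCount(a) + " " + unpackTermNum(a)); // 7 3
  }
}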
private void doHighlightingByHighlighter( Query query, SolrQueryRequest req, NamedList docSummaries, int docId, Document doc, String fieldName) throws IOException { final SolrIndexSearcher searcher = req.getSearcher(); final IndexSchema schema = searcher.getSchema(); // TODO: Currently in trunk highlighting numeric fields is broken (Lucene) - // so we disable them until fixed (see LUCENE-3080)! // BEGIN: Hack final SchemaField schemaField = schema.getFieldOrNull(fieldName); if (schemaField != null && ((schemaField.getType() instanceof org.apache.solr.schema.TrieField) || (schemaField.getType() instanceof org.apache.solr.schema.TrieDateField))) return; // END: Hack SolrParams params = req.getParams(); IndexableField[] docFields = doc.getFields(fieldName); List<String> listFields = new ArrayList<String>(); for (IndexableField field : docFields) { listFields.add(field.stringValue()); } // preserve order of values in a multiValued list boolean preserveMulti = params.getFieldBool(fieldName, HighlightParams.PRESERVE_MULTI, false); String[] docTexts = (String[]) listFields.toArray(new String[listFields.size()]); // according to Document javadoc, doc.getValues() never returns null. check empty instead of // null if (docTexts.length == 0) return; TokenStream tstream = null; int numFragments = getMaxSnippets(fieldName, params); boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params); String[] summaries = null; List<TextFragment> frags = new ArrayList<TextFragment>(); TermOffsetsTokenStream tots = null; // to be non-null iff we're using TermOffsets optimization TokenStream tvStream = TokenSources.getTokenStreamWithOffsets(searcher.getIndexReader(), docId, fieldName); if (tvStream != null) { tots = new TermOffsetsTokenStream(tvStream); } for (int j = 0; j < docTexts.length; j++) { if (tots != null) { // if we're using TermOffsets optimization, then get the next // field value's TokenStream (i.e. 
get field j's TokenStream) from tots: tstream = tots.getMultiValuedTokenStream(docTexts[j].length()); } else { // fall back to analyzer tstream = createAnalyzerTStream(schema, fieldName, docTexts[j]); } int maxCharsToAnalyze = params.getFieldInt( fieldName, HighlightParams.MAX_CHARS, Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE); Highlighter highlighter; if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true"))) { if (maxCharsToAnalyze < 0) { tstream = new CachingTokenFilter(tstream); } else { tstream = new CachingTokenFilter(new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze)); } // get highlighter highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tstream); // after highlighter initialization, reset tstream since construction of highlighter already // used it tstream.reset(); } else { // use "the old way" highlighter = getHighlighter(query, fieldName, req); } if (maxCharsToAnalyze < 0) { highlighter.setMaxDocCharsToAnalyze(docTexts[j].length()); } else { highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze); } try { TextFragment[] bestTextFragments = highlighter.getBestTextFragments( tstream, docTexts[j], mergeContiguousFragments, numFragments); for (int k = 0; k < bestTextFragments.length; k++) { if (preserveMulti) { if (bestTextFragments[k] != null) { frags.add(bestTextFragments[k]); } } else { if ((bestTextFragments[k] != null) && (bestTextFragments[k].getScore() > 0)) { frags.add(bestTextFragments[k]); } } } } catch (InvalidTokenOffsetsException e) { throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } } // sort such that the fragments with the highest score come first if (!preserveMulti) { Collections.sort( frags, new Comparator<TextFragment>() { @Override public int compare(TextFragment arg0, TextFragment arg1) { return Math.round(arg1.getScore() - arg0.getScore()); } }); } // convert fragments back into text // TODO: we can include score and position information in output as snippet attributes if (frags.size() > 0) { ArrayList<String> fragTexts = new ArrayList<String>(); for (TextFragment fragment : frags) { if (preserveMulti) { if (fragment != null) { fragTexts.add(fragment.toString()); } } else { if ((fragment != null) && (fragment.getScore() > 0)) { fragTexts.add(fragment.toString()); } } if (fragTexts.size() >= numFragments && !preserveMulti) break; } summaries = fragTexts.toArray(new String[0]); if (summaries.length > 0) docSummaries.add(fieldName, summaries); } // no summaries were made, copy text from alternate field if (summaries == null || summaries.length == 0) { alternateField(docSummaries, params, doc, fieldName); } }
public NamedList<Integer> getGroupedCounts( SolrIndexSearcher searcher, DocSet base, String field, boolean multiToken, int offset, int limit, int mincount, boolean missing, String sort, String prefix, String contains, boolean ignoreCase) throws IOException { GroupingSpecification groupingSpecification = rb.getGroupingSpec(); final String groupField = groupingSpecification != null ? groupingSpecification.getFields()[0] : null; if (groupField == null) { throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "Specify the group.field as parameter or local parameter"); } BytesRef prefixBytesRef = prefix != null ? new BytesRef(prefix) : null; final TermGroupFacetCollector collector = TermGroupFacetCollector.createTermGroupFacetCollector( groupField, field, multiToken, prefixBytesRef, 128); SchemaField sf = searcher.getSchema().getFieldOrNull(groupField); if (sf != null && sf.hasDocValues() == false && sf.multiValued() == false && sf.getType().getNumericType() != null) { // it's a single-valued numeric field: we must currently create insanity :( // there isn't a GroupedFacetCollector that works on numerics right now... searcher.search( base.getTopFilter(), new FilterCollector(collector) { @Override public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException { LeafReader insane = Insanity.wrapInsanity(context.reader(), groupField); return in.getLeafCollector(insane.getContext()); } }); } else { searcher.search(base.getTopFilter(), collector); } boolean orderByCount = sort.equals(FacetParams.FACET_SORT_COUNT) || sort.equals(FacetParams.FACET_SORT_COUNT_LEGACY); TermGroupFacetCollector.GroupedFacetResult result = collector.mergeSegmentResults( limit < 0 ? Integer.MAX_VALUE : (offset + limit), mincount, orderByCount); CharsRefBuilder charsRef = new CharsRefBuilder(); FieldType facetFieldType = searcher.getSchema().getFieldType(field); NamedList<Integer> facetCounts = new NamedList<>(); List<TermGroupFacetCollector.FacetEntry> scopedEntries = result.getFacetEntries(offset, limit < 0 ? Integer.MAX_VALUE : limit); for (TermGroupFacetCollector.FacetEntry facetEntry : scopedEntries) { // :TODO:can we do contains earlier than this to make it more efficient? if (contains != null && !contains(facetEntry.getValue().utf8ToString(), contains, ignoreCase)) { continue; } facetFieldType.indexedToReadable(facetEntry.getValue(), charsRef); facetCounts.add(charsRef.toString(), facetEntry.getCount()); } if (missing) { facetCounts.add(null, result.getTotalMissingCount()); } return facetCounts; }
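A hedged client-side sketch of the parameters that lead into getGroupedCounts above: grouping enabled, a group.field (read from the GroupingSpecification), and group.facet=true alongside an ordinary facet.field. Field names are illustrative, and the exact routing depends on how the request handler is configured.

import org.apache.solr.client.solrj.SolrQuery;

public class GroupedFacetRequestExample {
  public static void main(String[] args) {
    SolrQuery q = new SolrQuery("*:*");
    q.set("group", true);
    q.set("group.field", "storeId");   // hypothetical grouping field
    q.set("group.facet", true);        // per-group facet counts instead of per-document counts
    q.setFacet(true);
    q.addFacetField("category");       // hypothetical facet field
    System.out.println(q);
  }
}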
/** * Term counts for use in field faceting that respects the specified mincount - if mincount is * null, the "zeros" param is consulted for the appropriate backcompat default * * @see FacetParams#FACET_ZEROS */ private NamedList<Integer> getTermCounts(String field, Integer mincount, ParsedParams parsed) throws IOException { final SolrParams params = parsed.params; final DocSet docs = parsed.docs; final int threads = parsed.threads; int offset = params.getFieldInt(field, FacetParams.FACET_OFFSET, 0); int limit = params.getFieldInt(field, FacetParams.FACET_LIMIT, 100); if (limit == 0) return new NamedList<>(); if (mincount == null) { Boolean zeros = params.getFieldBool(field, FacetParams.FACET_ZEROS); // mincount = (zeros!=null && zeros) ? 0 : 1; mincount = (zeros != null && !zeros) ? 1 : 0; // current default is to include zeros. } boolean missing = params.getFieldBool(field, FacetParams.FACET_MISSING, false); // default to sorting if there is a limit. String sort = params.getFieldParam( field, FacetParams.FACET_SORT, limit > 0 ? FacetParams.FACET_SORT_COUNT : FacetParams.FACET_SORT_INDEX); String prefix = params.getFieldParam(field, FacetParams.FACET_PREFIX); String contains = params.getFieldParam(field, FacetParams.FACET_CONTAINS); boolean ignoreCase = params.getFieldBool(field, FacetParams.FACET_CONTAINS_IGNORE_CASE, false); NamedList<Integer> counts; SchemaField sf = searcher.getSchema().getField(field); FieldType ft = sf.getType(); // determine what type of faceting method to use final String methodStr = params.getFieldParam(field, FacetParams.FACET_METHOD); FacetMethod method = null; if (FacetParams.FACET_METHOD_enum.equals(methodStr)) { method = FacetMethod.ENUM; } else if (FacetParams.FACET_METHOD_fcs.equals(methodStr)) { method = FacetMethod.FCS; } else if (FacetParams.FACET_METHOD_fc.equals(methodStr)) { method = FacetMethod.FC; } if (method == FacetMethod.ENUM && TrieField.getMainValuePrefix(ft) != null) { // enum can't deal with trie fields that index several terms per value method = sf.multiValued() ? FacetMethod.FC : FacetMethod.FCS; } if (method == null && ft instanceof BoolField) { // Always use filters for booleans... we know the number of values is very small. method = FacetMethod.ENUM; } final boolean multiToken = sf.multiValued() || ft.multiValuedFieldCache(); if (ft.getNumericType() != null && !sf.multiValued()) { // the per-segment approach is optimal for numeric field types since there // are no global ords to merge and no need to create an expensive // top-level reader method = FacetMethod.FCS; } if (method == null) { // TODO: default to per-segment or not? 
method = FacetMethod.FC; } if (method == FacetMethod.FCS && multiToken) { // only fc knows how to deal with multi-token fields method = FacetMethod.FC; } if (method == FacetMethod.ENUM && sf.hasDocValues()) { // only fc can handle docvalues types method = FacetMethod.FC; } if (params.getFieldBool(field, GroupParams.GROUP_FACET, false)) { counts = getGroupedCounts( searcher, docs, field, multiToken, offset, limit, mincount, missing, sort, prefix, contains, ignoreCase); } else { assert method != null; switch (method) { case ENUM: assert TrieField.getMainValuePrefix(ft) == null; counts = getFacetTermEnumCounts( searcher, docs, field, offset, limit, mincount, missing, sort, prefix, contains, ignoreCase, params); break; case FCS: assert !multiToken; if (ft.getNumericType() != null && !sf.multiValued()) { // force numeric faceting if (prefix != null && !prefix.isEmpty()) { throw new SolrException( ErrorCode.BAD_REQUEST, FacetParams.FACET_PREFIX + " is not supported on numeric types"); } if (contains != null && !contains.isEmpty()) { throw new SolrException( ErrorCode.BAD_REQUEST, FacetParams.FACET_CONTAINS + " is not supported on numeric types"); } counts = NumericFacets.getCounts( searcher, docs, field, offset, limit, mincount, missing, sort); } else { PerSegmentSingleValuedFaceting ps = new PerSegmentSingleValuedFaceting( searcher, docs, field, offset, limit, mincount, missing, sort, prefix, contains, ignoreCase); Executor executor = threads == 0 ? directExecutor : facetExecutor; ps.setNumThreads(threads); counts = ps.getFacetCounts(executor); } break; case FC: counts = DocValuesFacets.getCounts( searcher, docs, field, offset, limit, mincount, missing, sort, prefix, contains, ignoreCase); break; default: throw new AssertionError(); } } return counts; }
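For reference, a hedged SolrJ sketch of the per-field facet parameters getTermCounts consults (facet.limit, facet.mincount, facet.sort, facet.prefix, facet.method, with f.<field>. overrides); the field names and values are made up.

import org.apache.solr.client.solrj.SolrQuery;

public class FacetMethodRequestExample {
  public static void main(String[] args) {
    SolrQuery q = new SolrQuery("*:*");
    q.setFacet(true);
    q.addFacetField("manufacturer");              // hypothetical facet field
    q.setFacetMinCount(1);                        // facet.mincount=1
    q.setFacetLimit(20);                          // facet.limit=20
    q.set("facet.sort", "count");
    q.set("f.manufacturer.facet.method", "enum"); // per-field method hint: enum / fc / fcs
    System.out.println(q);
  }
}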
@SuppressWarnings("unchecked") private static SimpleOrderedMap<Object> getIndexedFieldsInfo( final SolrIndexSearcher searcher, final Set<String> fields, final int numTerms) throws Exception { IndexReader reader = searcher.getReader(); IndexSchema schema = searcher.getSchema(); // Walk the term enum and keep a priority queue for each map in our set Map<String, TopTermQueue> ttinfo = null; if (numTerms > 0) { ttinfo = getTopTerms(reader, fields, numTerms, null); } SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<Object>(); Collection<String> fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL); for (String fieldName : fieldNames) { if (fields != null && !fields.contains(fieldName)) { continue; // if a field is specified, only them } SimpleOrderedMap<Object> f = new SimpleOrderedMap<Object>(); SchemaField sfield = schema.getFieldOrNull(fieldName); FieldType ftype = (sfield == null) ? null : sfield.getType(); f.add("type", (ftype == null) ? null : ftype.getTypeName()); f.add("schema", getFieldFlags(sfield)); if (sfield != null && schema.isDynamicField(sfield.getName()) && schema.getDynamicPattern(sfield.getName()) != null) { f.add("dynamicBase", schema.getDynamicPattern(sfield.getName())); } // If numTerms==0, the call is just asking for a quick field list if (ttinfo != null && sfield != null && sfield.indexed()) { Query q = new TermRangeQuery(fieldName, null, null, false, false); TopDocs top = searcher.search(q, 1); if (top.totalHits > 0) { // Find a document with this field try { Document doc = searcher.doc(top.scoreDocs[0].doc); Fieldable fld = doc.getFieldable(fieldName); if (fld != null) { f.add("index", getFieldFlags(fld)); } else { // it is a non-stored field... f.add("index", "(unstored field)"); } } catch (Exception ex) { log.warn("error reading field: " + fieldName); } } f.add("docs", top.totalHits); TopTermQueue topTerms = ttinfo.get(fieldName); if (topTerms != null) { f.add("distinct", topTerms.distinctTerms); // Include top terms f.add("topTerms", topTerms.toNamedList(searcher.getSchema())); // Add a histogram f.add("histogram", topTerms.histogram.toNamedList()); } } // Add the field finfo.add(fieldName, f); } return finfo; }
public static SolrAuthoritySetScorer createAuthoritySetScorer( SolrIndexSearcher searcher, Similarity similarity, String authorities, SolrIndexReader reader) throws IOException { // Get hold of solr top level searcher // Execute query with caching // translate results to leaf docs // build ordered doc list Properties p = searcher.getSchema().getResourceLoader().getCoreProperties(); boolean doPermissionChecks = Boolean.parseBoolean(p.getProperty("alfresco.doPermissionChecks", "true")); Query key = new SolrAuthoritySetQuery(authorities); DocSet answer = (DocSet) searcher.cacheLookup(AlfrescoSolrEventListener.ALFRESCO_AUTHORITY_CACHE, key); if (answer != null) { return new SolrAuthoritySetScorer(similarity, answer, reader); } HashSet<String> globalReaders = (HashSet<String>) searcher.cacheLookup( AlfrescoSolrEventListener.ALFRESCO_CACHE, AlfrescoSolrEventListener.KEY_GLOBAL_READERS); String[] auths = authorities.substring(1).split(authorities.substring(0, 1)); boolean hasGlobalRead = false; for (String auth : auths) { if (globalReaders.contains(auth)) { hasGlobalRead = true; break; } } if (hasGlobalRead || (doPermissionChecks == false)) { // can read all OpenBitSet allLeafDocs = (OpenBitSet) searcher.cacheLookup( AlfrescoSolrEventListener.ALFRESCO_CACHE, AlfrescoSolrEventListener.KEY_ALL_LEAF_DOCS); return new SolrAuthoritySetScorer(similarity, new BitDocSet(allLeafDocs), reader); } DocSet readableDocSet = searcher.getDocSet(new SolrReaderSetQuery(authorities)); if (globalReaders.contains(PermissionService.OWNER_AUTHORITY)) { DocSet authorityOwnedDocs = searcher.getDocSet(new SolrOwnerSetQuery(authorities)); DocSet toCache = readableDocSet.union(authorityOwnedDocs); searcher.cacheInsert(AlfrescoSolrEventListener.ALFRESCO_AUTHORITY_CACHE, key, toCache); return new SolrAuthoritySetScorer(similarity, toCache, reader); } else { // for docs that I own that have owner Read rights DocSet ownerReadableDocSet = searcher.getDocSet(new SolrReaderSetQuery("|" + PermissionService.OWNER_AUTHORITY)); DocSet authorityOwnedDocs = searcher.getDocSet(new SolrOwnerSetQuery(authorities)); DocSet docsAuthorityOwnsAndCanRead = ownerReadableDocSet.intersection(authorityOwnedDocs); DocSet toCache = readableDocSet.union(docsAuthorityOwnsAndCanRead); searcher.cacheInsert(AlfrescoSolrEventListener.ALFRESCO_AUTHORITY_CACHE, key, toCache); return new SolrAuthoritySetScorer(similarity, toCache, reader); } }