static Query parseQueryString( ExtendedCommonTermsQuery query, Object queryString, String field, Analyzer analyzer, String lowFreqMinimumShouldMatch, String highFreqMinimumShouldMatch) throws IOException { // Logic similar to QueryParser#getFieldQuery int count = 0; try (TokenStream source = analyzer.tokenStream(field, queryString.toString())) { source.reset(); CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class); BytesRefBuilder builder = new BytesRefBuilder(); while (source.incrementToken()) { // UTF-8 builder.copyChars(termAtt); query.add(new Term(field, builder.toBytesRef())); count++; } } if (count == 0) { return null; } query.setLowFreqMinimumNumberShouldMatch(lowFreqMinimumShouldMatch); query.setHighFreqMinimumNumberShouldMatch(highFreqMinimumShouldMatch); return query; }
/** Unmarshals a string-based field value. */ protected static Object unmarshalStringSortValue(Object value) { if (null == value) { return null; } BytesRefBuilder spare = new BytesRefBuilder(); String stringVal = (String) value; spare.copyChars(stringVal); return spare.get(); }
// NOTE: while it's tempting to make this public, since // caller's parser likely knows the // numInput/numOutputWords, sneaky exceptions, much later // on, will result if these values are wrong; so we always // recompute ourselves to be safe: private void add( CharsRef input, int numInputWords, CharsRef output, int numOutputWords, boolean includeOrig) { // first convert to UTF-8 if (numInputWords <= 0) { throw new IllegalArgumentException("numInputWords must be > 0 (got " + numInputWords + ")"); } if (input.length <= 0) { throw new IllegalArgumentException("input.length must be > 0 (got " + input.length + ")"); } if (numOutputWords <= 0) { throw new IllegalArgumentException( "numOutputWords must be > 0 (got " + numOutputWords + ")"); } if (output.length <= 0) { throw new IllegalArgumentException("output.length must be > 0 (got " + output.length + ")"); } assert !hasHoles(input) : "input has holes: " + input; assert !hasHoles(output) : "output has holes: " + output; // System.out.println("fmap.add input=" + input + " numInputWords=" + numInputWords + " // output=" + output + " numOutputWords=" + numOutputWords); utf8Scratch.copyChars(output.chars, output.offset, output.length); // lookup in hash int ord = words.add(utf8Scratch.get()); if (ord < 0) { // already exists in our hash ord = (-ord) - 1; // System.out.println(" output=" + output + " old ord=" + ord); } else { // System.out.println(" output=" + output + " new ord=" + ord); } MapEntry e = workingSet.get(input); if (e == null) { e = new MapEntry(); workingSet.put( CharsRef.deepCopyOf(input), e); // make a copy, since we will keep around in our map } e.ords.add(ord); e.includeOrig |= includeOrig; maxHorizontalContext = Math.max(maxHorizontalContext, numInputWords); maxHorizontalContext = Math.max(maxHorizontalContext, numOutputWords); }
/** Given the readable value, return the term value that will match it. */ public void readableToIndexed(CharSequence val, BytesRefBuilder result) { final String internal = readableToIndexed(val.toString()); result.copyChars(internal); }
public static NamedList<Integer> getCounts( SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort, String prefix, String contains, boolean ignoreCase) throws IOException { SchemaField schemaField = searcher.getSchema().getField(fieldName); FieldType ft = schemaField.getType(); NamedList<Integer> res = new NamedList<>(); // TODO: remove multiValuedFieldCache(), check dv type / uninversion type? final boolean multiValued = schemaField.multiValued() || ft.multiValuedFieldCache(); final SortedSetDocValues si; // for term lookups only OrdinalMap ordinalMap = null; // for mapping per-segment ords to global ones if (multiValued) { si = searcher.getLeafReader().getSortedSetDocValues(fieldName); if (si instanceof MultiSortedSetDocValues) { ordinalMap = ((MultiSortedSetDocValues) si).mapping; } } else { SortedDocValues single = searcher.getLeafReader().getSortedDocValues(fieldName); si = single == null ? null : DocValues.singleton(single); if (single instanceof MultiSortedDocValues) { ordinalMap = ((MultiSortedDocValues) single).mapping; } } if (si == null) { return finalize(res, searcher, schemaField, docs, -1, missing); } if (si.getValueCount() >= Integer.MAX_VALUE) { throw new UnsupportedOperationException( "Currently this faceting method is limited to " + Integer.MAX_VALUE + " unique terms"); } final BytesRefBuilder prefixRef; if (prefix == null) { prefixRef = null; } else if (prefix.length() == 0) { prefix = null; prefixRef = null; } else { prefixRef = new BytesRefBuilder(); prefixRef.copyChars(prefix); } int startTermIndex, endTermIndex; if (prefix != null) { startTermIndex = (int) si.lookupTerm(prefixRef.get()); if (startTermIndex < 0) startTermIndex = -startTermIndex - 1; prefixRef.append(UnicodeUtil.BIG_TERM); endTermIndex = (int) si.lookupTerm(prefixRef.get()); assert endTermIndex < 0; endTermIndex = -endTermIndex - 1; } else { startTermIndex = -1; endTermIndex = (int) si.getValueCount(); } final int nTerms = endTermIndex - startTermIndex; int missingCount = -1; final CharsRefBuilder charsRef = new CharsRefBuilder(); if (nTerms > 0 && docs.size() >= mincount) { // count collection array only needs to be as big as the number of terms we are // going to collect counts for. final int[] counts = new int[nTerms]; Filter filter = docs.getTopFilter(); List<LeafReaderContext> leaves = searcher.getTopReaderContext().leaves(); for (int subIndex = 0; subIndex < leaves.size(); subIndex++) { LeafReaderContext leaf = leaves.get(subIndex); DocIdSet dis = filter.getDocIdSet(leaf, null); // solr docsets already exclude any deleted docs DocIdSetIterator disi = null; if (dis != null) { disi = dis.iterator(); } if (disi != null) { if (multiValued) { SortedSetDocValues sub = leaf.reader().getSortedSetDocValues(fieldName); if (sub == null) { sub = DocValues.emptySortedSet(); } final SortedDocValues singleton = DocValues.unwrapSingleton(sub); if (singleton != null) { // some codecs may optimize SORTED_SET storage for single-valued fields accumSingle(counts, startTermIndex, singleton, disi, subIndex, ordinalMap); } else { accumMulti(counts, startTermIndex, sub, disi, subIndex, ordinalMap); } } else { SortedDocValues sub = leaf.reader().getSortedDocValues(fieldName); if (sub == null) { sub = DocValues.emptySorted(); } accumSingle(counts, startTermIndex, sub, disi, subIndex, ordinalMap); } } } if (startTermIndex == -1) { missingCount = counts[0]; } // IDEA: we could also maintain a count of "other"... everything that fell outside // of the top 'N' int off = offset; int lim = limit >= 0 ? limit : Integer.MAX_VALUE; if (sort.equals(FacetParams.FACET_SORT_COUNT) || sort.equals(FacetParams.FACET_SORT_COUNT_LEGACY)) { int maxsize = limit > 0 ? offset + limit : Integer.MAX_VALUE - 1; maxsize = Math.min(maxsize, nTerms); LongPriorityQueue queue = new LongPriorityQueue(Math.min(maxsize, 1000), maxsize, Long.MIN_VALUE); int min = mincount - 1; // the smallest value in the top 'N' values for (int i = (startTermIndex == -1) ? 1 : 0; i < nTerms; i++) { int c = counts[i]; if (contains != null) { final BytesRef term = si.lookupOrd(startTermIndex + i); if (!SimpleFacets.contains(term.utf8ToString(), contains, ignoreCase)) { continue; } } if (c > min) { // NOTE: we use c>min rather than c>=min as an optimization because we are going in // index order, so we already know that the keys are ordered. This can be very // important if a lot of the counts are repeated (like zero counts would be). // smaller term numbers sort higher, so subtract the term number instead long pair = (((long) c) << 32) + (Integer.MAX_VALUE - i); boolean displaced = queue.insert(pair); if (displaced) min = (int) (queue.top() >>> 32); } } // if we are deep paging, we don't have to order the highest "offset" counts. int collectCount = Math.max(0, queue.size() - off); assert collectCount <= lim; // the start and end indexes of our list "sorted" (starting with the highest value) int sortedIdxStart = queue.size() - (collectCount - 1); int sortedIdxEnd = queue.size() + 1; final long[] sorted = queue.sort(collectCount); for (int i = sortedIdxStart; i < sortedIdxEnd; i++) { long pair = sorted[i]; int c = (int) (pair >>> 32); int tnum = Integer.MAX_VALUE - (int) pair; final BytesRef term = si.lookupOrd(startTermIndex + tnum); ft.indexedToReadable(term, charsRef); res.add(charsRef.toString(), c); } } else { // add results in index order int i = (startTermIndex == -1) ? 1 : 0; if (mincount <= 0 && contains == null) { // if mincount<=0 and we're not examining the values for contains, then // we won't discard any terms and we know exactly where to start. i += off; off = 0; } for (; i < nTerms; i++) { int c = counts[i]; if (c < mincount) continue; BytesRef term = null; if (contains != null) { term = si.lookupOrd(startTermIndex + i); if (!SimpleFacets.contains(term.utf8ToString(), contains, ignoreCase)) { continue; } } if (--off >= 0) continue; if (--lim < 0) break; if (term == null) { term = si.lookupOrd(startTermIndex + i); } ft.indexedToReadable(term, charsRef); res.add(charsRef.toString(), c); } } } return finalize(res, searcher, schemaField, docs, missingCount, missing); }