private void process(int groupOrd, int facetOrd) {
  if (facetOrd < startFacetOrd || facetOrd >= endFacetOrd) {
    return;
  }

  int segmentGroupedFacetsIndex = groupOrd * (facetFieldNumTerms + 1) + facetOrd;
  if (segmentGroupedFacetHits.exists(segmentGroupedFacetsIndex)) {
    return;
  }

  segmentTotalCount++;
  segmentFacetCounts[facetOrd]++;

  segmentGroupedFacetHits.put(segmentGroupedFacetsIndex);

  BytesRef groupKey;
  if (groupOrd == -1) {
    groupKey = null;
  } else {
    groupKey = BytesRef.deepCopyOf(groupFieldTermsIndex.lookupOrd(groupOrd));
  }

  final BytesRef facetValue;
  if (facetOrd == facetFieldNumTerms) {
    facetValue = null;
  } else {
    facetValue = BytesRef.deepCopyOf(facetFieldDocTermOrds.lookupOrd(facetOrd));
  }
  groupedFacetHits.add(new GroupedFacetHit(groupKey, facetValue));
}
@Override
public void collect(int doc) throws IOException {
  if (doc > facetFieldTermsIndex.docID()) {
    facetFieldTermsIndex.advance(doc);
  }

  int facetOrd;
  if (doc == facetFieldTermsIndex.docID()) {
    facetOrd = facetFieldTermsIndex.ordValue();
  } else {
    facetOrd = -1;
  }

  if (facetOrd < startFacetOrd || facetOrd >= endFacetOrd) {
    return;
  }

  if (doc > groupFieldTermsIndex.docID()) {
    groupFieldTermsIndex.advance(doc);
  }

  int groupOrd;
  if (doc == groupFieldTermsIndex.docID()) {
    groupOrd = groupFieldTermsIndex.ordValue();
  } else {
    groupOrd = -1;
  }

  int segmentGroupedFacetsIndex =
      groupOrd * (facetFieldTermsIndex.getValueCount() + 1) + facetOrd;
  if (segmentGroupedFacetHits.exists(segmentGroupedFacetsIndex)) {
    return;
  }

  segmentTotalCount++;
  segmentFacetCounts[facetOrd + 1]++;

  segmentGroupedFacetHits.put(segmentGroupedFacetsIndex);

  BytesRef groupKey;
  if (groupOrd == -1) {
    groupKey = null;
  } else {
    groupKey = BytesRef.deepCopyOf(groupFieldTermsIndex.lookupOrd(groupOrd));
  }

  BytesRef facetKey;
  if (facetOrd == -1) {
    facetKey = null;
  } else {
    facetKey = BytesRef.deepCopyOf(facetFieldTermsIndex.lookupOrd(facetOrd));
  }

  groupedFacetHits.add(new GroupedFacetHit(groupKey, facetKey));
}
public Query getQuery(Element e) throws ParserException {
  String fieldName = DOMUtils.getAttributeWithInheritanceOrFail(e, "fieldName");
  String text = DOMUtils.getNonBlankTextOrFail(e);

  BooleanQuery bq = new BooleanQuery(DOMUtils.getAttribute(e, "disableCoord", false));
  bq.setMinimumNumberShouldMatch(DOMUtils.getAttribute(e, "minimumNumberShouldMatch", 0));
  try {
    TokenStream ts = analyzer.tokenStream(fieldName, new StringReader(text));
    TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
    Term term = null;
    BytesRef bytes = termAtt.getBytesRef();
    ts.reset();
    while (ts.incrementToken()) {
      termAtt.fillBytesRef();
      term = new Term(fieldName, BytesRef.deepCopyOf(bytes));
      bq.add(new BooleanClause(new TermQuery(term), BooleanClause.Occur.SHOULD));
    }
    ts.end();
    ts.close();
  } catch (IOException ioe) {
    throw new RuntimeException("Error constructing terms from index:" + ioe);
  }

  bq.setBoost(DOMUtils.getAttribute(e, "boost", 1.0f));
  return bq;
}
public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn) {
  if (part == null || analyzerIn == null) return null;

  try (TokenStream source = analyzerIn.tokenStream(field, part)) {
    source.reset();

    TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
    BytesRef bytes = termAtt.getBytesRef();

    if (!source.incrementToken())
      throw new SolrException(
          SolrException.ErrorCode.BAD_REQUEST,
          "analyzer returned no terms for multiTerm term: " + part);
    termAtt.fillBytesRef();
    if (source.incrementToken())
      throw new SolrException(
          SolrException.ErrorCode.BAD_REQUEST,
          "analyzer returned too many terms for multiTerm term: " + part);

    source.end();
    return BytesRef.deepCopyOf(bytes);
  } catch (IOException e) {
    throw new SolrException(
        SolrException.ErrorCode.BAD_REQUEST, "error analyzing range part: " + part, e);
  }
}
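// Hedged usage sketch (not part of the original source): assumes the analyzeMultiTerm helper
// above is reachable as a static method and that Lucene's KeywordAnalyzer and TermRangeQuery
// are on the classpath. It shows the intended calling pattern: each range endpoint is
// normalized to a single indexed term, and the deepCopyOf above makes the returned bytes safe
// to keep after the TokenStream is closed.
static Query rangeQuerySketch(String field, String lowerPart, String upperPart) {
  Analyzer analyzer = new KeywordAnalyzer(); // emits the whole input as one token
  BytesRef lower = analyzeMultiTerm(field, lowerPart, analyzer);
  BytesRef upper = analyzeMultiTerm(field, upperPart, analyzer);
  return new TermRangeQuery(field, lower, upper, true, true);
}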
private void getPrefixTerms(
    ObjectHashSet<Term> terms, final Term prefix, final IndexReader reader) throws IOException {
  // SlowCompositeReaderWrapper could be used... but this would merge all terms from each segment
  // into one terms instance, which is very expensive. Therefore I think it is better to iterate
  // over each leaf individually.
  List<LeafReaderContext> leaves = reader.leaves();
  for (LeafReaderContext leaf : leaves) {
    Terms _terms = leaf.reader().terms(field);
    if (_terms == null) {
      continue;
    }

    TermsEnum termsEnum = _terms.iterator();
    TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefix.bytes());
    if (TermsEnum.SeekStatus.END == seekStatus) {
      continue;
    }

    for (BytesRef term = termsEnum.term(); term != null; term = termsEnum.next()) {
      if (!StringHelper.startsWith(term, prefix.bytes())) {
        break;
      }

      terms.add(new Term(field, BytesRef.deepCopyOf(term)));
      if (terms.size() >= maxExpansions) {
        return;
      }
    }
  }
}
@Override
public void seekExact(BytesRef target, TermState otherState) {
  if (!target.equals(term)) {
    state.copyFrom(otherState);
    term = BytesRef.deepCopyOf(target);
    seekPending = true;
  }
}
@Override
public PostingsConsumer startTerm(BytesRef text) throws IOException {
  assert state == TermsConsumerState.INITIAL
      || state == TermsConsumerState.START && lastPostingsConsumer.docFreq == 0;
  state = TermsConsumerState.START;
  assert lastTerm == null || in.getComparator().compare(text, lastTerm) > 0;
  lastTerm = BytesRef.deepCopyOf(text);
  return lastPostingsConsumer =
      new AssertingPostingsConsumer(in.startTerm(text), fieldInfo, visitedDocs);
}
// used only by assert
private boolean checkDeleteTerm(Term term) {
  if (term != null) {
    assert lastDeleteTerm == null || term.compareTo(lastDeleteTerm) > 0
        : "lastTerm=" + lastDeleteTerm + " vs term=" + term;
  }
  // TODO: we re-use term now in our merged iterable, but we shouldn't clone, instead copy for
  // this assert
  lastDeleteTerm =
      term == null ? null : new Term(term.field(), BytesRef.deepCopyOf(term.bytes));
  return true;
}
public void testIterator() throws IOException {
  int length = randomIntBetween(10, PAGE_SIZE * randomIntBetween(2, 8));
  BytesReference pbr = newBytesReference(length);
  BytesRefIterator iterator = pbr.iterator();
  BytesRef ref;
  BytesRefBuilder builder = new BytesRefBuilder();
  while ((ref = iterator.next()) != null) {
    builder.append(ref);
  }
  assertArrayEquals(pbr.toBytes(), BytesRef.deepCopyOf(builder.toBytesRef()).bytes);
}
@Override
public void finishTerm(BytesRef text, TermStats stats) throws IOException {
  assert stats.docFreq > 0;
  // if (DEBUG) System.out.println("BTTW.finishTerm term=" + fieldInfo.name + ":"
  //     + toString(text) + " seg=" + segment + " df=" + stats.docFreq);
  blockBuilder.add(Util.toIntsRef(text, scratchIntsRef), noOutputs.getNoOutput());
  pending.add(new PendingTerm(BytesRef.deepCopyOf(text), stats));
  postingsWriter.finishTerm(stats);
  numTerms++;
}
@Override
public void collect(int doc) throws IOException {
  if (doc > groupFieldTermsIndex.docID()) {
    groupFieldTermsIndex.advance(doc);
  }

  int groupOrd;
  if (doc == groupFieldTermsIndex.docID()) {
    groupOrd = groupFieldTermsIndex.ordValue();
  } else {
    groupOrd = -1;
  }

  if (facetFieldNumTerms == 0) {
    int segmentGroupedFacetsIndex = groupOrd * (facetFieldNumTerms + 1);
    if (facetPrefix != null || segmentGroupedFacetHits.exists(segmentGroupedFacetsIndex)) {
      return;
    }

    segmentTotalCount++;
    segmentFacetCounts[facetFieldNumTerms]++;

    segmentGroupedFacetHits.put(segmentGroupedFacetsIndex);
    BytesRef groupKey;
    if (groupOrd == -1) {
      groupKey = null;
    } else {
      groupKey = BytesRef.deepCopyOf(groupFieldTermsIndex.lookupOrd(groupOrd));
    }
    groupedFacetHits.add(new GroupedFacetHit(groupKey, null));
    return;
  }

  if (doc > facetFieldDocTermOrds.docID()) {
    facetFieldDocTermOrds.advance(doc);
  }
  boolean empty = true;
  if (doc == facetFieldDocTermOrds.docID()) {
    long ord;
    while ((ord = facetFieldDocTermOrds.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
      process(groupOrd, (int) ord);
      empty = false;
    }
  }

  if (empty) {
    // this facet ord is reserved for docs not containing facet field.
    process(groupOrd, facetFieldNumTerms);
  }
}
public void testSliceIterator() throws IOException {
  int length = randomIntBetween(10, PAGE_SIZE * randomIntBetween(2, 8));
  BytesReference pbr = newBytesReference(length);
  int sliceOffset = randomIntBetween(0, pbr.length());
  int sliceLength = randomIntBetween(pbr.length() - sliceOffset, pbr.length() - sliceOffset);
  BytesReference slice = pbr.slice(sliceOffset, sliceLength);
  BytesRefIterator iterator = slice.iterator();
  BytesRef ref = null;
  BytesRefBuilder builder = new BytesRefBuilder();
  while ((ref = iterator.next()) != null) {
    builder.append(ref);
  }
  assertArrayEquals(slice.toBytes(), BytesRef.deepCopyOf(builder.toBytesRef()).bytes);
}
/** tests intersect: TODO start at a random term! */
public void testIntersect() throws Exception {
  for (int i = 0; i < numIterations; i++) {
    String reg = AutomatonTestUtil.randomRegexp(random());
    Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton();
    CompiledAutomaton ca =
        new CompiledAutomaton(automaton, SpecialOperations.isFinite(automaton), false);
    TermsEnum te = MultiFields.getTerms(reader, "field").intersect(ca, null);
    Automaton expected = BasicOperations.intersection(termsAutomaton, automaton);

    TreeSet<BytesRef> found = new TreeSet<BytesRef>();
    while (te.next() != null) {
      found.add(BytesRef.deepCopyOf(te.term()));
    }

    Automaton actual = BasicAutomata.makeStringUnion(found);
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }
}
private int countTerms(MultiTermQuery q) throws Exception {
  final Terms terms = MultiFields.getTerms(reader, q.getField());
  if (terms == null) return 0;
  final TermsEnum termEnum = q.getTermsEnum(terms);
  assertNotNull(termEnum);
  int count = 0;
  BytesRef cur, last = null;
  while ((cur = termEnum.next()) != null) {
    count++;
    if (last != null) {
      assertTrue(last.compareTo(cur) < 0);
    }
    last = BytesRef.deepCopyOf(cur);
  }
  // LUCENE-3314: the results after next() already returned null are undefined,
  // assertNull(termEnum.next());
  return count;
}
public void testSorted() throws Exception {
  Directory dir = newDirectory();
  Document doc = new Document();
  Field field = new SortedDocValuesField("bytes", new BytesRef());
  doc.add(field);
  IndexWriterConfig iwc = newIndexWriterConfig(random(), null);
  iwc.setMergePolicy(newLogMergePolicy());
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);

  int numDocs = TEST_NIGHTLY ? atLeast(500) : atLeast(50);
  for (int i = 0; i < numDocs; i++) {
    BytesRef ref = new BytesRef(TestUtil.randomUnicodeString(random()));
    field.setBytesValue(ref);
    if (random().nextInt(7) == 0) {
      iw.addDocument(new Document());
    }
    iw.addDocument(doc);
    if (random().nextInt(17) == 0) {
      iw.commit();
    }
  }
  DirectoryReader ir = iw.getReader();
  iw.forceMerge(1);
  DirectoryReader ir2 = iw.getReader();
  LeafReader merged = getOnlyLeafReader(ir2);
  iw.close();

  SortedDocValues multi = MultiDocValues.getSortedValues(ir, "bytes");
  SortedDocValues single = merged.getSortedDocValues("bytes");
  assertEquals(single.getValueCount(), multi.getValueCount());
  for (int i = 0; i < numDocs; i++) {
    // check ord
    assertEquals(single.getOrd(i), multi.getOrd(i));
    // check value
    final BytesRef expected = BytesRef.deepCopyOf(single.get(i));
    final BytesRef actual = multi.get(i);
    assertEquals(expected, actual);
  }
  ir.close();
  ir2.close();
  dir.close();
}
protected List<BytesRef> analyze(String text, String field, Analyzer analyzer)
    throws IOException {
  List<BytesRef> bytesRefs = new ArrayList<>();
  try (TokenStream tokenStream = analyzer.tokenStream(field, text)) {
    TermToBytesRefAttribute termAttribute =
        tokenStream.getAttribute(TermToBytesRefAttribute.class);
    BytesRef bytesRef = termAttribute.getBytesRef();
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
      termAttribute.fillBytesRef();
      bytesRefs.add(BytesRef.deepCopyOf(bytesRef));
    }
    tokenStream.end();
  }
  return bytesRefs;
}
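// Hedged usage sketch (not part of the original source): assumes the analyze(...) helper above
// is visible and that Lucene's WhitespaceAnalyzer is available. The deep copies in the helper
// matter because TermToBytesRefAttribute reuses a single BytesRef across incrementToken() calls,
// so each element of the returned list must be an independent copy.
List<BytesRef> tokensSketch() throws IOException {
  try (Analyzer analyzer = new WhitespaceAnalyzer()) {
    // yields three independent BytesRef copies: "quick", "brown", "fox"
    return analyze("quick brown fox", "body", analyzer);
  }
}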
@Override
public void collect(int doc) throws IOException {
  if (doc > index.docID()) {
    index.advance(doc);
  }
  int key;
  if (doc == index.docID()) {
    key = index.ordValue();
  } else {
    key = -1;
  }
  if (!ordSet.exists(key)) {
    ordSet.put(key);
    final BytesRef term;
    if (key == -1) {
      term = null;
    } else {
      term = BytesRef.deepCopyOf(index.lookupOrd(key));
    }
    groups.add(term);
  }
}
@Override
public SpanQuery getSpanQuery(Element e) throws ParserException {
  String fieldName = DOMUtils.getAttributeWithInheritanceOrFail(e, "fieldName");
  String value = DOMUtils.getNonBlankTextOrFail(e);

  List<SpanQuery> clausesList = new ArrayList<>();

  try (TokenStream ts = analyzer.tokenStream(fieldName, value)) {
    TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      SpanTermQuery stq =
          new SpanTermQuery(new Term(fieldName, BytesRef.deepCopyOf(termAtt.getBytesRef())));
      clausesList.add(stq);
    }
    ts.end();
    SpanOrQuery soq = new SpanOrQuery(clausesList.toArray(new SpanQuery[clausesList.size()]));
    float boost = DOMUtils.getAttribute(e, "boost", 1.0f);
    return new SpanBoostQuery(soq, boost);
  } catch (IOException ioe) {
    throw new ParserException("IOException parsing value:" + value);
  }
}
@Override
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset)
    throws IOException {

  if (DEBUG)
    System.out.println(
        "PW pos=" + position + " payload=" + (payload == null ? "null" : payload.length + " bytes"));

  if (pendingCount == pending.length) {
    push();
  }

  if (pendingCount == -1) {
    // We've already seen too many docs for this term --
    // just forward to our fallback writer
    wrappedPostingsWriter.addPosition(position, payload, startOffset, endOffset);
  } else {
    // buffer up
    final Position pos = pending[pendingCount++];
    pos.pos = position;
    pos.startOffset = startOffset;
    pos.endOffset = endOffset;
    pos.docID = currentDoc.docID;
    if (payload != null && payload.length > 0) {
      if (pos.payload == null) {
        pos.payload = BytesRef.deepCopyOf(payload);
      } else {
        pos.payload.copyBytes(payload);
      }
    } else if (pos.payload != null) {
      pos.payload.length = 0;
    }
  }
}
private static void duelFieldDataBytes(
    Random random,
    AtomicReaderContext context,
    IndexFieldData<?> left,
    IndexFieldData<?> right,
    Preprocessor pre)
    throws Exception {
  AtomicFieldData leftData = random.nextBoolean() ? left.load(context) : left.loadDirect(context);
  AtomicFieldData rightData =
      random.nextBoolean() ? right.load(context) : right.loadDirect(context);

  int numDocs = context.reader().maxDoc();
  SortedBinaryDocValues leftBytesValues = leftData.getBytesValues();
  SortedBinaryDocValues rightBytesValues = rightData.getBytesValues();
  BytesRef leftSpare = new BytesRef();
  BytesRef rightSpare = new BytesRef();

  for (int i = 0; i < numDocs; i++) {
    leftBytesValues.setDocument(i);
    rightBytesValues.setDocument(i);
    int numValues = leftBytesValues.count();
    assertThat(numValues, equalTo(rightBytesValues.count()));
    BytesRef previous = null;
    for (int j = 0; j < numValues; j++) {
      rightSpare.copyBytes(rightBytesValues.valueAt(j));
      leftSpare.copyBytes(leftBytesValues.valueAt(j));
      if (previous != null) {
        assertThat(pre.compare(previous, rightSpare), lessThan(0));
      }
      previous = BytesRef.deepCopyOf(rightSpare);
      pre.toString(rightSpare);
      pre.toString(leftSpare);
      assertThat(pre.toString(leftSpare), equalTo(pre.toString(rightSpare)));
    }
  }
}
@Override
public void process(ResponseBuilder rb) throws IOException {
  SolrParams params = rb.req.getParams();
  if (!params.getBool(TermsParams.TERMS, false)) return;

  String[] fields = params.getParams(TermsParams.TERMS_FIELD);

  NamedList<Object> termsResult = new SimpleOrderedMap<>();
  rb.rsp.add("terms", termsResult);

  if (fields == null || fields.length == 0) return;

  int limit = params.getInt(TermsParams.TERMS_LIMIT, 10);
  if (limit < 0) {
    limit = Integer.MAX_VALUE;
  }

  String lowerStr = params.get(TermsParams.TERMS_LOWER);
  String upperStr = params.get(TermsParams.TERMS_UPPER);
  boolean upperIncl = params.getBool(TermsParams.TERMS_UPPER_INCLUSIVE, false);
  boolean lowerIncl = params.getBool(TermsParams.TERMS_LOWER_INCLUSIVE, true);
  boolean sort =
      !TermsParams.TERMS_SORT_INDEX.equals(
          params.get(TermsParams.TERMS_SORT, TermsParams.TERMS_SORT_COUNT));
  int freqmin = params.getInt(TermsParams.TERMS_MINCOUNT, 1);
  int freqmax = params.getInt(TermsParams.TERMS_MAXCOUNT, UNLIMITED_MAX_COUNT);
  if (freqmax < 0) {
    freqmax = Integer.MAX_VALUE;
  }
  String prefix = params.get(TermsParams.TERMS_PREFIX_STR);
  String regexp = params.get(TermsParams.TERMS_REGEXP_STR);
  Pattern pattern = regexp != null ? Pattern.compile(regexp, resolveRegexpFlags(params)) : null;

  boolean raw = params.getBool(TermsParams.TERMS_RAW, false);

  final AtomicReader indexReader = rb.req.getSearcher().getAtomicReader();
  Fields lfields = indexReader.fields();

  for (String field : fields) {
    NamedList<Integer> fieldTerms = new NamedList<>();
    termsResult.add(field, fieldTerms);

    Terms terms = lfields == null ? null : lfields.terms(field);
    if (terms == null) {
      // no terms for this field
      continue;
    }

    FieldType ft = raw ? null : rb.req.getSchema().getFieldTypeNoEx(field);
    if (ft == null) ft = new StrField();

    // prefix must currently be text
    BytesRef prefixBytes = prefix == null ? null : new BytesRef(prefix);

    BytesRef upperBytes = null;
    if (upperStr != null) {
      upperBytes = new BytesRef();
      ft.readableToIndexed(upperStr, upperBytes);
    }

    BytesRef lowerBytes;
    if (lowerStr == null) {
      // If no lower bound was specified, use the prefix
      lowerBytes = prefixBytes;
    } else {
      lowerBytes = new BytesRef();
      if (raw) {
        // TODO: how to handle binary? perhaps we don't for "raw"... or if the field exists
        // perhaps we detect if the FieldType is non-character and expect hex if so?
        lowerBytes = new BytesRef(lowerStr);
      } else {
        lowerBytes = new BytesRef();
        ft.readableToIndexed(lowerStr, lowerBytes);
      }
    }

    TermsEnum termsEnum = terms.iterator(null);
    BytesRef term = null;

    if (lowerBytes != null) {
      if (termsEnum.seekCeil(lowerBytes) == TermsEnum.SeekStatus.END) {
        termsEnum = null;
      } else {
        term = termsEnum.term();
        // Only advance the enum if we are excluding the lower bound and the lower Term actually
        // matches
        if (lowerIncl == false && term.equals(lowerBytes)) {
          term = termsEnum.next();
        }
      }
    } else {
      // position termsEnum on first term
      term = termsEnum.next();
    }

    int i = 0;
    BoundedTreeSet<CountPair<BytesRef, Integer>> queue =
        (sort ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(limit) : null);
    CharsRef external = new CharsRef();
    while (term != null && (i < limit || sort)) {
      boolean externalized = false; // did we fill in "external" yet for this term?

      // stop if the prefix doesn't match
      if (prefixBytes != null && !StringHelper.startsWith(term, prefixBytes)) break;

      if (pattern != null) {
        // indexed text or external text?
        // TODO: support "raw" mode?
        ft.indexedToReadable(term, external);
        externalized = true;
        if (!pattern.matcher(external).matches()) {
          term = termsEnum.next();
          continue;
        }
      }

      if (upperBytes != null) {
        int upperCmp = term.compareTo(upperBytes);
        // if we are past the upper term, or equal to it (when don't include upper) then stop.
        if (upperCmp > 0 || (upperCmp == 0 && !upperIncl)) break;
      }

      // This is a good term in the range. Check if mincount/maxcount conditions are satisfied.
      int docFreq = termsEnum.docFreq();
      if (docFreq >= freqmin && docFreq <= freqmax) {
        // add the term to the list
        if (sort) {
          queue.add(new CountPair<>(BytesRef.deepCopyOf(term), docFreq));
        } else {
          // TODO: handle raw somehow
          if (!externalized) {
            ft.indexedToReadable(term, external);
          }
          fieldTerms.add(external.toString(), docFreq);
          i++;
        }
      }

      term = termsEnum.next();
    }

    if (sort) {
      for (CountPair<BytesRef, Integer> item : queue) {
        if (i >= limit) break;
        ft.indexedToReadable(item.key, external);
        fieldTerms.add(external.toString(), item.val);
        i++;
      }
    }
  }
}
@Override
public void collectLeaf(PostingsEnum postings, int position, Term term) throws IOException {
  if (postings.getPayload() != null) {
    payloads.add(BytesRef.deepCopyOf(postings.getPayload()));
  }
}
/** Creates a query builder given a query provided as a {@link BytesReference} */
public WrapperQueryBuilder(BytesReference source) {
  if (source == null || source.length() == 0) {
    throw new IllegalArgumentException("query source text cannot be null or empty");
  }
  this.source = BytesRef.deepCopyOf(source.toBytesRef()).bytes;
}
/**
 * Returns a list of terms in the specified field along with the corresponding count of documents
 * in the set that match that constraint. This method uses the FilterCache to get the intersection
 * count between <code>docs</code> and the DocSet for each term in the filter.
 *
 * @see FacetParams#FACET_LIMIT
 * @see FacetParams#FACET_ZEROS
 * @see FacetParams#FACET_MISSING
 */
public NamedList<Integer> getFacetTermEnumCounts(
    SolrIndexSearcher searcher,
    DocSet docs,
    String field,
    int offset,
    int limit,
    int mincount,
    boolean missing,
    String sort,
    String prefix,
    String contains,
    boolean ignoreCase,
    SolrParams params)
    throws IOException {

  /* :TODO: potential optimization...
   * cache the Terms with the highest docFreq and try them first
   * don't enum if we get our max from them
   */

  // Minimum term docFreq in order to use the filterCache for that term.
  int minDfFilterCache = global.getFieldInt(field, FacetParams.FACET_ENUM_CACHE_MINDF, 0);

  // make sure we have a set that is fast for random access, if we will use it for that
  DocSet fastForRandomSet = docs;
  if (minDfFilterCache > 0 && docs instanceof SortedIntDocSet) {
    SortedIntDocSet sset = (SortedIntDocSet) docs;
    fastForRandomSet = new HashDocSet(sset.getDocs(), 0, sset.size());
  }

  IndexSchema schema = searcher.getSchema();
  LeafReader r = searcher.getLeafReader();
  FieldType ft = schema.getFieldType(field);

  boolean sortByCount = sort.equals("count") || sort.equals("true");
  final int maxsize = limit >= 0 ? offset + limit : Integer.MAX_VALUE - 1;
  final BoundedTreeSet<CountPair<BytesRef, Integer>> queue =
      sortByCount ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(maxsize) : null;
  final NamedList<Integer> res = new NamedList<>();

  int min = mincount - 1; // the smallest value in the top 'N' values
  int off = offset;
  int lim = limit >= 0 ? limit : Integer.MAX_VALUE;

  BytesRef prefixTermBytes = null;
  if (prefix != null) {
    String indexedPrefix = ft.toInternal(prefix);
    prefixTermBytes = new BytesRef(indexedPrefix);
  }

  Fields fields = r.fields();
  Terms terms = fields == null ? null : fields.terms(field);
  TermsEnum termsEnum = null;
  SolrIndexSearcher.DocsEnumState deState = null;
  BytesRef term = null;
  if (terms != null) {
    termsEnum = terms.iterator();

    // TODO: OPT: if seek(ord) is supported for this termsEnum, then we could use it for
    // facet.offset when sorting by index order.

    if (prefixTermBytes != null) {
      if (termsEnum.seekCeil(prefixTermBytes) == TermsEnum.SeekStatus.END) {
        termsEnum = null;
      } else {
        term = termsEnum.term();
      }
    } else {
      // position termsEnum on first term
      term = termsEnum.next();
    }
  }

  PostingsEnum postingsEnum = null;
  CharsRefBuilder charsRef = new CharsRefBuilder();

  if (docs.size() >= mincount) {
    while (term != null) {

      if (prefixTermBytes != null && !StringHelper.startsWith(term, prefixTermBytes)) break;

      if (contains == null || contains(term.utf8ToString(), contains, ignoreCase)) {
        int df = termsEnum.docFreq();

        // If we are sorting, we can use df>min (rather than >=) since we
        // are going in index order. For certain term distributions this can
        // make a large difference (for example, many terms with df=1).
        if (df > 0 && df > min) {
          int c;

          if (df >= minDfFilterCache) {
            // use the filter cache

            if (deState == null) {
              deState = new SolrIndexSearcher.DocsEnumState();
              deState.fieldName = field;
              deState.liveDocs = r.getLiveDocs();
              deState.termsEnum = termsEnum;
              deState.postingsEnum = postingsEnum;
            }

            c = searcher.numDocs(docs, deState);

            postingsEnum = deState.postingsEnum;
          } else {
            // iterate over TermDocs to calculate the intersection

            // TODO: specialize when base docset is a bitset or hash set (skipDocs)? or does it
            // matter for this?
            // TODO: do this per-segment for better efficiency (MultiDocsEnum just uses base class
            // impl)
            // TODO: would passing deleted docs lead to better efficiency over checking the
            // fastForRandomSet?
            postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
            c = 0;

            if (postingsEnum instanceof MultiPostingsEnum) {
              MultiPostingsEnum.EnumWithSlice[] subs = ((MultiPostingsEnum) postingsEnum).getSubs();
              int numSubs = ((MultiPostingsEnum) postingsEnum).getNumSubs();
              for (int subindex = 0; subindex < numSubs; subindex++) {
                MultiPostingsEnum.EnumWithSlice sub = subs[subindex];
                if (sub.postingsEnum == null) continue;
                int base = sub.slice.start;
                int docid;
                while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                  if (fastForRandomSet.exists(docid + base)) c++;
                }
              }
            } else {
              int docid;
              while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                if (fastForRandomSet.exists(docid)) c++;
              }
            }
          }

          if (sortByCount) {
            if (c > min) {
              BytesRef termCopy = BytesRef.deepCopyOf(term);
              queue.add(new CountPair<>(termCopy, c));
              if (queue.size() >= maxsize) min = queue.last().val;
            }
          } else {
            if (c >= mincount && --off < 0) {
              if (--lim < 0) break;
              ft.indexedToReadable(term, charsRef);
              res.add(charsRef.toString(), c);
            }
          }
        }
      }
      term = termsEnum.next();
    }
  }

  if (sortByCount) {
    for (CountPair<BytesRef, Integer> p : queue) {
      if (--off >= 0) continue;
      if (--lim < 0) break;
      ft.indexedToReadable(p.key, charsRef);
      res.add(charsRef.toString(), p.val);
    }
  }

  if (missing) {
    res.add(null, getFieldMissingCount(searcher, docs, field));
  }

  return res;
}
public FieldAndTerm(FieldAndTerm other) {
  field = other.field;
  term = BytesRef.deepCopyOf(other.term);
}
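// Hedged illustration (not from the original sources) of why these call sites rely on
// BytesRef.deepCopyOf: enumeration APIs such as TermsEnum reuse one backing BytesRef, so a
// shallow reference stored in a collection would be silently overwritten as the enum advances.
// The helper name below is hypothetical.
static List<BytesRef> collectAllTermsSketch(TermsEnum termsEnum) throws IOException {
  List<BytesRef> copies = new ArrayList<>();
  for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
    copies.add(BytesRef.deepCopyOf(term)); // private copy, safe after the enum moves on
  }
  return copies;
}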
private void addTerms(IndexReader reader, FieldVals f, ScoreTermQueue q) throws IOException {
  if (f.queryString == null) return;
  final Terms terms = MultiFields.getTerms(reader, f.fieldName);
  if (terms == null) {
    return;
  }
  try (TokenStream ts = analyzer.tokenStream(f.fieldName, f.queryString)) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

    int corpusNumDocs = reader.numDocs();
    HashSet<String> processedTerms = new HashSet<>();
    ts.reset();
    while (ts.incrementToken()) {
      String term = termAtt.toString();
      if (!processedTerms.contains(term)) {
        processedTerms.add(term);
        ScoreTermQueue variantsQ =
            new ScoreTermQueue(MAX_VARIANTS_PER_TERM); // maxNum variants considered for any one term
        float minScore = 0;
        Term startTerm = new Term(f.fieldName, term);
        AttributeSource atts = new AttributeSource();
        MaxNonCompetitiveBoostAttribute maxBoostAtt =
            atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
        SlowFuzzyTermsEnum fe =
            new SlowFuzzyTermsEnum(terms, atts, startTerm, f.minSimilarity, f.prefixLength);
        // store the df so all variants use same idf
        int df = reader.docFreq(startTerm);
        int numVariants = 0;
        int totalVariantDocFreqs = 0;
        BytesRef possibleMatch;
        BoostAttribute boostAtt = fe.attributes().addAttribute(BoostAttribute.class);
        while ((possibleMatch = fe.next()) != null) {
          numVariants++;
          totalVariantDocFreqs += fe.docFreq();
          float score = boostAtt.getBoost();
          if (variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore) {
            ScoreTerm st =
                new ScoreTerm(
                    new Term(startTerm.field(), BytesRef.deepCopyOf(possibleMatch)),
                    score,
                    startTerm);
            variantsQ.insertWithOverflow(st);
            minScore = variantsQ.top().score; // maintain minScore
          }
          maxBoostAtt.setMaxNonCompetitiveBoost(
              variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY);
        }

        if (numVariants > 0) {
          int avgDf = totalVariantDocFreqs / numVariants;
          if (df == 0) { // no direct match we can use as df for all variants
            df = avgDf; // use avg df of all variants
          }

          // take the top variants (scored by edit distance) and reset the score
          // to include an IDF factor then add to the global queue for ranking
          // overall top query terms
          int size = variantsQ.size();
          for (int i = 0; i < size; i++) {
            ScoreTerm st = variantsQ.pop();
            st.score = (st.score * st.score) * sim.idf(df, corpusNumDocs);
            q.insertWithOverflow(st);
          }
        }
      }
    }
    ts.end();
  }
}
// tries to make more dups than testSortedSet
public void testSortedSetWithDups() throws Exception {
  Directory dir = newDirectory();

  IndexWriterConfig iwc = newIndexWriterConfig(random(), null);
  iwc.setMergePolicy(newLogMergePolicy());
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);

  int numDocs = TEST_NIGHTLY ? atLeast(500) : atLeast(50);
  for (int i = 0; i < numDocs; i++) {
    Document doc = new Document();
    int numValues = random().nextInt(5);
    for (int j = 0; j < numValues; j++) {
      doc.add(
          new SortedSetDocValuesField(
              "bytes", new BytesRef(TestUtil.randomSimpleString(random(), 2))));
    }
    iw.addDocument(doc);
    if (random().nextInt(17) == 0) {
      iw.commit();
    }
  }
  DirectoryReader ir = iw.getReader();
  iw.forceMerge(1);
  DirectoryReader ir2 = iw.getReader();
  LeafReader merged = getOnlyLeafReader(ir2);
  iw.close();

  SortedSetDocValues multi = MultiDocValues.getSortedSetValues(ir, "bytes");
  SortedSetDocValues single = merged.getSortedSetDocValues("bytes");
  if (multi == null) {
    assertNull(single);
  } else {
    assertEquals(single.getValueCount(), multi.getValueCount());
    // check values
    for (long i = 0; i < single.getValueCount(); i++) {
      final BytesRef expected = BytesRef.deepCopyOf(single.lookupOrd(i));
      final BytesRef actual = multi.lookupOrd(i);
      assertEquals(expected, actual);
    }
    // check ord list
    for (int i = 0; i < numDocs; i++) {
      single.setDocument(i);
      ArrayList<Long> expectedList = new ArrayList<>();
      long ord;
      while ((ord = single.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
        expectedList.add(ord);
      }

      multi.setDocument(i);
      int upto = 0;
      while ((ord = multi.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
        assertEquals(expectedList.get(upto).longValue(), ord);
        upto++;
      }
      assertEquals(expectedList.size(), upto);
    }
  }
  ir.close();
  ir2.close();
  dir.close();
}
/** Call this only once (if you subclass!) */
protected void uninvert(final LeafReader reader, Bits liveDocs, final BytesRef termPrefix)
    throws IOException {
  final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
  if (checkForDocValues && info != null && info.getDocValuesType() != DocValuesType.NONE) {
    throw new IllegalStateException(
        "Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
  }
  // System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix);
  final long startTime = System.nanoTime();
  prefix = termPrefix == null ? null : BytesRef.deepCopyOf(termPrefix);

  final int maxDoc = reader.maxDoc();
  // immediate term numbers, or the index into the byte[] representing the last number
  final int[] index = new int[maxDoc];
  final int[] lastTerm = new int[maxDoc]; // last term we saw for this document
  final byte[][] bytes = new byte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts)

  final Terms terms = reader.terms(field);
  if (terms == null) {
    // No terms
    return;
  }

  final TermsEnum te = terms.iterator();
  final BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef();
  // System.out.println("seekStart=" + seekStart.utf8ToString());
  if (te.seekCeil(seekStart) == TermsEnum.SeekStatus.END) {
    // No terms match
    return;
  }

  // For our "term index wrapper"
  final List<BytesRef> indexedTerms = new ArrayList<>();
  final PagedBytes indexedTermsBytes = new PagedBytes(15);

  // we need a minimum of 9 bytes, but round up to 12 since the space would
  // be wasted with most allocators anyway.
  byte[] tempArr = new byte[12];

  //
  // enumerate all terms, and build an intermediate form of the un-inverted field.
  //
  // During this intermediate form, every document has a (potential) byte[]
  // and the int[maxDoc()] array either contains the termNumber list directly
  // or the *end* offset of the termNumber list in its byte array (for faster
  // appending and faster creation of the final form).
  //
  // idea... if things are too large while building, we could do a range of docs
  // at a time (but it would be a fair amount slower to build)
  // could also do ranges in parallel to take advantage of multiple CPUs

  // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
  // values. This requires going over the field first to find the most
  // frequent terms ahead of time.

  int termNum = 0;
  postingsEnum = null;

  // Loop begins with te positioned to first term (we call
  // seek above):
  for (; ; ) {
    final BytesRef t = te.term();
    if (t == null || (termPrefix != null && !StringHelper.startsWith(t, termPrefix))) {
      break;
    }
    // System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum);

    visitTerm(te, termNum);

    if ((termNum & indexIntervalMask) == 0) {
      // Index this term
      sizeOfIndexedStrings += t.length;
      BytesRef indexedTerm = new BytesRef();
      indexedTermsBytes.copy(t, indexedTerm);
      // TODO: really should 1) strip off useless suffix,
      // and 2) use FST not array/PagedBytes
      indexedTerms.add(indexedTerm);
    }

    final int df = te.docFreq();
    if (df <= maxTermDocFreq) {

      postingsEnum = te.postings(postingsEnum, PostingsEnum.NONE);

      // dF, but takes deletions into account
      int actualDF = 0;

      for (; ; ) {
        int doc = postingsEnum.nextDoc();
        if (doc == DocIdSetIterator.NO_MORE_DOCS) {
          break;
        }
        // System.out.println(" chunk=" + chunk + " docs");
        actualDF++;
        termInstances++;

        // System.out.println(" docID=" + doc);
        // add TNUM_OFFSET to the term number to make room for special reserved values:
        // 0 (end term) and 1 (index into byte array follows)
        int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
        lastTerm[doc] = termNum;
        int val = index[doc];

        if ((val & 0xff) == 1) {
          // index into byte array (actually the end of
          // the doc-specific byte[] when building)
          int pos = val >>> 8;
          int ilen = vIntSize(delta);
          byte[] arr = bytes[doc];
          int newend = pos + ilen;
          if (newend > arr.length) {
            // We avoid a doubling strategy to lower memory usage.
            // this faceting method isn't for docs with many terms.
            // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit
            // boundary.
            // TODO: figure out what array lengths we can round up to w/o actually using more
            // memory
            // (how much space does a byte[] take up? Is data preceded by a 32 bit length only?
            // It should be safe to round up to the nearest 32 bits in any case.
            int newLen = (newend + 3) & 0xfffffffc; // 4 byte alignment
            byte[] newarr = new byte[newLen];
            System.arraycopy(arr, 0, newarr, 0, pos);
            arr = newarr;
            bytes[doc] = newarr;
          }
          pos = writeInt(delta, arr, pos);
          index[doc] = (pos << 8) | 1; // update pointer to end index in byte[]
        } else {
          // OK, this int has data in it... find the end (a zero starting byte - not
          // part of another number, hence not following a byte with the high bit set).
          int ipos;
          if (val == 0) {
            ipos = 0;
          } else if ((val & 0x0000ff80) == 0) {
            ipos = 1;
          } else if ((val & 0x00ff8000) == 0) {
            ipos = 2;
          } else if ((val & 0xff800000) == 0) {
            ipos = 3;
          } else {
            ipos = 4;
          }

          // System.out.println(" ipos=" + ipos);

          int endPos = writeInt(delta, tempArr, ipos);
          // System.out.println(" endpos=" + endPos);
          if (endPos <= 4) {
            // System.out.println(" fits!");
            // value will fit in the integer... move bytes back
            for (int j = ipos; j < endPos; j++) {
              val |= (tempArr[j] & 0xff) << (j << 3);
            }
            index[doc] = val;
          } else {
            // value won't fit... move integer into byte[]
            for (int j = 0; j < ipos; j++) {
              tempArr[j] = (byte) val;
              val >>>= 8;
            }
            // point at the end index in the byte[]
            index[doc] = (endPos << 8) | 1;
            bytes[doc] = tempArr;
            tempArr = new byte[12];
          }
        }
      }
      setActualDocFreq(termNum, actualDF);
    }

    termNum++;
    if (te.next() == null) {
      break;
    }
  }

  numTermsInField = termNum;

  long midPoint = System.nanoTime();

  if (termInstances == 0) {
    // we didn't invert anything
    // lower memory consumption.
    tnums = null;
  } else {
    this.index = index;

    //
    // transform intermediate form into the final form, building a single byte[]
    // at a time, and releasing the intermediate byte[]s as we go to avoid
    // increasing the memory footprint.
    //

    for (int pass = 0; pass < 256; pass++) {
      byte[] target = tnums[pass];
      int pos = 0; // end in target;
      if (target != null) {
        pos = target.length;
      } else {
        target = new byte[4096];
      }

      // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
      // where pp is the pass (which array we are building), and xx is all values.
      // each pass shares the same byte[] for termNumber lists.
      for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24)) {
        int lim = Math.min(docbase + (1 << 16), maxDoc);
        for (int doc = docbase; doc < lim; doc++) {
          // System.out.println(" pass=" + pass + " process docID=" + doc);
          int val = index[doc];
          if ((val & 0xff) == 1) {
            int len = val >>> 8;
            // System.out.println(" ptr pos=" + pos);
            index[doc] = (pos << 8) | 1; // change index to point to start of array
            if ((pos & 0xff000000) != 0) {
              // we only have 24 bits for the array index
              throw new IllegalStateException(
                  "Too many values for UnInvertedField faceting on field " + field);
            }
            byte[] arr = bytes[doc];
            /*
            for (byte b : arr) {
              // System.out.println(" b=" + Integer.toHexString((int) b));
            }
            */
            bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM
            if (target.length <= pos + len) {
              int newlen = target.length;
              /*
               * we don't have to worry about the array getting too large since the "pos" param
               * will overflow first (only 24 bits available)
               * if ((newlen<<1) <= 0) {
               *   // overflow...
               *   newlen = Integer.MAX_VALUE;
               *   if (newlen <= pos + len) {
               *     throw new SolrException(400,"Too many terms to uninvert field!");
               *   }
               * } else {
               *   while (newlen <= pos + len) newlen<<=1;  // doubling strategy
               * }
               */
              while (newlen <= pos + len) newlen <<= 1; // doubling strategy
              byte[] newtarget = new byte[newlen];
              System.arraycopy(target, 0, newtarget, 0, pos);
              target = newtarget;
            }
            System.arraycopy(arr, 0, target, pos, len);
            pos += len + 1; // skip single byte at end and leave it 0 for terminator
          }
        }
      }

      // shrink array
      if (pos < target.length) {
        byte[] newtarget = new byte[pos];
        System.arraycopy(target, 0, newtarget, 0, pos);
        target = newtarget;
      }

      tnums[pass] = target;

      if ((pass << 16) > maxDoc) break;
    }
  }
  indexedTermsArray = indexedTerms.toArray(new BytesRef[indexedTerms.size()]);

  long endTime = System.nanoTime();

  total_time = (int) TimeUnit.MILLISECONDS.convert(endTime - startTime, TimeUnit.NANOSECONDS);
  phase1_time = (int) TimeUnit.MILLISECONDS.convert(midPoint - startTime, TimeUnit.NANOSECONDS);
}
@Override
public void reflectWith(AttributeReflector reflector) {
  fillBytesRef();
  reflector.reflect(TermToBytesRefAttribute.class, "bytes", BytesRef.deepCopyOf(bytes));
}
private void assertTermsSeeking(Terms leftTerms, Terms rightTerms) throws Exception {
  TermsEnum leftEnum = null;
  TermsEnum rightEnum = null;

  // just an upper bound
  int numTests = atLeast(20);
  Random random = random();

  // collect this number of terms from the left side
  HashSet<BytesRef> tests = new HashSet<BytesRef>();
  int numPasses = 0;
  while (numPasses < 10 && tests.size() < numTests) {
    leftEnum = leftTerms.iterator(leftEnum);
    BytesRef term = null;
    while ((term = leftEnum.next()) != null) {
      int code = random.nextInt(10);
      if (code == 0) {
        // the term
        tests.add(BytesRef.deepCopyOf(term));
      } else if (code == 1) {
        // truncated subsequence of term
        term = BytesRef.deepCopyOf(term);
        if (term.length > 0) {
          // truncate it
          term.length = random.nextInt(term.length);
        }
      } else if (code == 2) {
        // term, but ensure a non-zero offset
        byte newbytes[] = new byte[term.length + 5];
        System.arraycopy(term.bytes, term.offset, newbytes, 5, term.length);
        tests.add(new BytesRef(newbytes, 5, term.length));
      }
    }
    numPasses++;
  }

  ArrayList<BytesRef> shuffledTests = new ArrayList<BytesRef>(tests);
  Collections.shuffle(shuffledTests, random);

  for (BytesRef b : shuffledTests) {
    leftEnum = leftTerms.iterator(leftEnum);
    rightEnum = rightTerms.iterator(rightEnum);

    assertEquals(leftEnum.seekExact(b), rightEnum.seekExact(b));
    assertEquals(leftEnum.seekExact(b), rightEnum.seekExact(b));

    SeekStatus leftStatus;
    SeekStatus rightStatus;

    leftStatus = leftEnum.seekCeil(b);
    rightStatus = rightEnum.seekCeil(b);
    assertEquals(leftStatus, rightStatus);
    if (leftStatus != SeekStatus.END) {
      assertEquals(leftEnum.term(), rightEnum.term());
    }

    leftStatus = leftEnum.seekCeil(b);
    rightStatus = rightEnum.seekCeil(b);
    assertEquals(leftStatus, rightStatus);
    if (leftStatus != SeekStatus.END) {
      assertEquals(leftEnum.term(), rightEnum.term());
    }
  }
}