@Override
public SeekStatus seekCeil(BytesRef target) throws IOException {

  // already here
  if (term != null && term.equals(target)) {
    return SeekStatus.FOUND;
  }

  int startIdx = Arrays.binarySearch(indexedTermsArray, target);

  if (startIdx >= 0) {
    // we hit the term exactly... lucky us!
    TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(target);
    assert seekStatus == TermsEnum.SeekStatus.FOUND;
    ord = startIdx << indexIntervalBits;
    setTerm();
    assert term != null;
    return SeekStatus.FOUND;
  }

  // we didn't hit the term exactly
  startIdx = -startIdx - 1;

  if (startIdx == 0) {
    // our target occurs *before* the first term
    TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(target);
    assert seekStatus == TermsEnum.SeekStatus.NOT_FOUND;
    ord = 0;
    setTerm();
    assert term != null;
    return SeekStatus.NOT_FOUND;
  }

  // back up to the start of the block
  startIdx--;

  if ((ord >> indexIntervalBits) == startIdx && term != null && term.compareTo(target) <= 0) {
    // we are already in the right block and the current term is before the term we want,
    // so we don't need to seek.
  } else {
    // seek to the right block
    TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(indexedTermsArray[startIdx]);
    assert seekStatus == TermsEnum.SeekStatus.FOUND;
    ord = startIdx << indexIntervalBits;
    setTerm();
    assert term != null; // should be non-null since it's in the index
  }

  while (term != null && term.compareTo(target) < 0) {
    next();
  }

  if (term == null) {
    return SeekStatus.END;
  } else if (term.compareTo(target) == 0) {
    return SeekStatus.FOUND;
  } else {
    return SeekStatus.NOT_FOUND;
  }
}
IntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  super();
  // if (TEST) System.out.println("Enum init, startTerm=" + startTerm);
  this.fst = dict;
  this.fstReader = fst.getBytesReader();
  this.fstOutputs = dict.outputs;
  this.fsa = compiled.runAutomaton;
  this.level = -1;
  this.stack = new Frame[16];
  for (int i = 0; i < stack.length; i++) {
    this.stack[i] = new Frame();
  }

  Frame frame;
  frame = loadVirtualFrame(newFrame());
  this.level++;
  frame = loadFirstFrame(newFrame());
  pushFrame(frame);

  this.meta = null;
  this.metaUpto = 1;
  this.decoded = false;
  this.pending = false;

  if (startTerm == null) {
    pending = isAccept(topFrame());
  } else {
    doSeekCeil(startTerm);
    pending = !startTerm.equals(term) && isValid(topFrame()) && isAccept(topFrame());
  }
}
@Override
public boolean equals(Object obj) {
  if (this == obj) return true;
  if (!super.equals(obj)) return false;
  if (getClass() != obj.getClass()) return false;
  TermRangeQuery other = (TermRangeQuery) obj;
  if (includeLower != other.includeLower) return false;
  if (includeUpper != other.includeUpper) return false;
  if (lowerTerm == null) {
    if (other.lowerTerm != null) return false;
  } else if (!lowerTerm.equals(other.lowerTerm)) return false;
  if (upperTerm == null) {
    if (other.upperTerm != null) return false;
  } else if (!upperTerm.equals(other.upperTerm)) return false;
  return true;
}
@Override
public void seekExact(BytesRef target, TermState otherState) {
  // only copy the state if we are not already positioned on the target term
  if (!target.equals(term)) {
    state.copyFrom(otherState);
    term = BytesRef.deepCopyOf(target);
    seekPending = true;
  }
}
/**
 * Returns <code>true</code> iff the length and the checksums are the same,
 * otherwise <code>false</code>.
 */
public boolean isSame(StoreFileMetaData other) {
  if (checksum == null || other.checksum == null) {
    // we can't tell if either one is null, so we return false in this case!
    // this is why we don't use equals for this!
    return false;
  }
  return length == other.length && checksum.equals(other.checksum) && hash.equals(other.hash);
}
@Override
public boolean equals(Object obj) {
  if (sameClassAs(obj) == false) {
    return false;
  }
  TypeQuery that = (TypeQuery) obj;
  return type.equals(that.type);
}
@Override
public boolean equals(Object obj) {
  if (obj == this) return true;

  if (obj instanceof Token) {
    final Token other = (Token) obj;
    return (flags == other.flags
        && (payload == null ? other.payload == null : payload.equals(other.payload))
        && super.equals(obj));
  } else {
    return false;
  }
}
@Override
public boolean equals(Object obj) {
  if (this == obj) return true;
  if (obj == null) return false;
  if (getClass() != obj.getClass()) return false;
  Term other = (Term) obj;
  if (field == null) {
    if (other.field != null) return false;
  } else if (!field.equals(other.field)) return false;
  if (bytes == null) {
    if (other.bytes != null) return false;
  } else if (!bytes.equals(other.bytes)) return false;
  return true;
}
@Override
public void finishTerm(BytesRef text, TermStats stats) throws IOException {
  assert state == TermsConsumerState.START;
  state = TermsConsumerState.INITIAL;
  assert text.equals(lastTerm);
  assert stats.docFreq > 0; // otherwise, this method should not be called.
  assert stats.docFreq == lastPostingsConsumer.docFreq;
  sumDocFreq += stats.docFreq;
  if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY) {
    assert stats.totalTermFreq == -1;
  } else {
    assert stats.totalTermFreq == lastPostingsConsumer.totalTermFreq;
    sumTotalTermFreq += stats.totalTermFreq;
  }
  in.finishTerm(text, stats);
}
@Override
public String next() throws IOException {
  while (true) {
    SimpleTextUtil.readLine(in, scratch);
    if (scratch.equals(END)) {
      current = null;
      return null;
    }
    if (StringHelper.startsWith(scratch, FIELD)) {
      return current = new String(scratch.bytes, scratch.offset + FIELD.length,
          scratch.length - FIELD.length, "UTF-8");
    }
  }
}
@Override
public void build(TermFreqIterator iterator) throws IOException {
  BytesRef scratch = new BytesRef();
  TermFreqIterator iter =
      new WFSTTermFreqIteratorWrapper(iterator, BytesRef.getUTF8SortedAsUnicodeComparator());
  IntsRef scratchInts = new IntsRef();
  BytesRef previous = null;
  PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
  Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
  while ((scratch = iter.next()) != null) {
    long cost = iter.weight();
    if (previous == null) {
      previous = new BytesRef();
    } else if (scratch.equals(previous)) {
      continue; // for duplicate suggestions, the best weight is actually added
    }
    Util.toIntsRef(scratch, scratchInts);
    builder.add(scratchInts, cost);
    previous.copyBytes(scratch);
  }
  fst = builder.finish();
}
private void loadTerms() throws IOException {
  PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false);
  final Builder<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>> b;
  final PairOutputs<Long, Long> outputsInner =
      new PairOutputs<Long, Long>(posIntOutputs, posIntOutputs);
  final PairOutputs<Long, PairOutputs.Pair<Long, Long>> outputs =
      new PairOutputs<Long, PairOutputs.Pair<Long, Long>>(posIntOutputs, outputsInner);
  b = new Builder<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>>(
      FST.INPUT_TYPE.BYTE1, outputs);
  IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
  in.seek(termsStart);
  final BytesRef lastTerm = new BytesRef(10);
  long lastDocsStart = -1;
  int docFreq = 0;
  long totalTermFreq = 0;
  OpenBitSet visitedDocs = new OpenBitSet();
  final IntsRef scratchIntsRef = new IntsRef();
  while (true) {
    SimpleTextUtil.readLine(in, scratch);
    if (scratch.equals(END) || StringHelper.startsWith(scratch, FIELD)) {
      if (lastDocsStart != -1) {
        b.add(Util.toIntsRef(lastTerm, scratchIntsRef),
            outputs.newPair(lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq)));
        sumTotalTermFreq += totalTermFreq;
      }
      break;
    } else if (StringHelper.startsWith(scratch, DOC)) {
      docFreq++;
      sumDocFreq++;
      UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset + DOC.length,
          scratch.length - DOC.length, scratchUTF16);
      int docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
      visitedDocs.set(docID);
    } else if (StringHelper.startsWith(scratch, FREQ)) {
      UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset + FREQ.length,
          scratch.length - FREQ.length, scratchUTF16);
      totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
    } else if (StringHelper.startsWith(scratch, TERM)) {
      if (lastDocsStart != -1) {
        b.add(Util.toIntsRef(lastTerm, scratchIntsRef),
            outputs.newPair(lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq)));
      }
      lastDocsStart = in.getFilePointer();
      final int len = scratch.length - TERM.length;
      if (len > lastTerm.length) {
        lastTerm.grow(len);
      }
      System.arraycopy(scratch.bytes, TERM.length, lastTerm.bytes, 0, len);
      lastTerm.length = len;
      docFreq = 0;
      sumTotalTermFreq += totalTermFreq;
      totalTermFreq = 0;
      termCount++;
    }
  }
  docCount = (int) visitedDocs.cardinality();
  fst = b.finish();
  /*
  PrintStream ps = new PrintStream("out.dot");
  fst.toDot(ps);
  ps.close();
  System.out.println("SAVED out.dot");
  */
  // System.out.println("FST " + fst.sizeInBytes());
}
@Override
public void process(ResponseBuilder rb) throws IOException {
  SolrParams params = rb.req.getParams();
  if (!params.getBool(TermsParams.TERMS, false)) return;

  String[] fields = params.getParams(TermsParams.TERMS_FIELD);

  NamedList<Object> termsResult = new SimpleOrderedMap<>();
  rb.rsp.add("terms", termsResult);

  if (fields == null || fields.length == 0) return;

  int limit = params.getInt(TermsParams.TERMS_LIMIT, 10);
  if (limit < 0) {
    limit = Integer.MAX_VALUE;
  }

  String lowerStr = params.get(TermsParams.TERMS_LOWER);
  String upperStr = params.get(TermsParams.TERMS_UPPER);
  boolean upperIncl = params.getBool(TermsParams.TERMS_UPPER_INCLUSIVE, false);
  boolean lowerIncl = params.getBool(TermsParams.TERMS_LOWER_INCLUSIVE, true);
  boolean sort = !TermsParams.TERMS_SORT_INDEX.equals(
      params.get(TermsParams.TERMS_SORT, TermsParams.TERMS_SORT_COUNT));
  int freqmin = params.getInt(TermsParams.TERMS_MINCOUNT, 1);
  int freqmax = params.getInt(TermsParams.TERMS_MAXCOUNT, UNLIMITED_MAX_COUNT);
  if (freqmax < 0) {
    freqmax = Integer.MAX_VALUE;
  }
  String prefix = params.get(TermsParams.TERMS_PREFIX_STR);
  String regexp = params.get(TermsParams.TERMS_REGEXP_STR);
  Pattern pattern = regexp != null ? Pattern.compile(regexp, resolveRegexpFlags(params)) : null;

  boolean raw = params.getBool(TermsParams.TERMS_RAW, false);

  final AtomicReader indexReader = rb.req.getSearcher().getAtomicReader();
  Fields lfields = indexReader.fields();

  for (String field : fields) {
    NamedList<Integer> fieldTerms = new NamedList<>();
    termsResult.add(field, fieldTerms);

    Terms terms = lfields == null ? null : lfields.terms(field);
    if (terms == null) {
      // no terms for this field
      continue;
    }

    FieldType ft = raw ? null : rb.req.getSchema().getFieldTypeNoEx(field);
    if (ft == null) ft = new StrField();

    // prefix must currently be text
    BytesRef prefixBytes = prefix == null ? null : new BytesRef(prefix);

    BytesRef upperBytes = null;
    if (upperStr != null) {
      upperBytes = new BytesRef();
      ft.readableToIndexed(upperStr, upperBytes);
    }

    BytesRef lowerBytes;
    if (lowerStr == null) {
      // If no lower bound was specified, use the prefix
      lowerBytes = prefixBytes;
    } else {
      lowerBytes = new BytesRef();
      if (raw) {
        // TODO: how to handle binary? perhaps we don't for "raw"... or if the field exists
        // perhaps we detect if the FieldType is non-character and expect hex if so?
        lowerBytes = new BytesRef(lowerStr);
      } else {
        ft.readableToIndexed(lowerStr, lowerBytes);
      }
    }

    TermsEnum termsEnum = terms.iterator(null);
    BytesRef term = null;

    if (lowerBytes != null) {
      if (termsEnum.seekCeil(lowerBytes) == TermsEnum.SeekStatus.END) {
        termsEnum = null;
      } else {
        term = termsEnum.term();
        // Only advance the enum if we are excluding the lower bound and the lower Term actually
        // matches
        if (lowerIncl == false && term.equals(lowerBytes)) {
          term = termsEnum.next();
        }
      }
    } else {
      // position termsEnum on first term
      term = termsEnum.next();
    }

    int i = 0;
    BoundedTreeSet<CountPair<BytesRef, Integer>> queue =
        (sort ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(limit) : null);
    CharsRef external = new CharsRef();
    while (term != null && (i < limit || sort)) {
      boolean externalized = false; // did we fill in "external" yet for this term?

      // stop if the prefix doesn't match
      if (prefixBytes != null && !StringHelper.startsWith(term, prefixBytes)) break;

      if (pattern != null) {
        // indexed text or external text?
        // TODO: support "raw" mode?
        ft.indexedToReadable(term, external);
        externalized = true;
        if (!pattern.matcher(external).matches()) {
          term = termsEnum.next();
          continue;
        }
      }

      if (upperBytes != null) {
        int upperCmp = term.compareTo(upperBytes);
        // if we are past the upper term, or equal to it (when don't include upper) then stop.
        if (upperCmp > 0 || (upperCmp == 0 && !upperIncl)) break;
      }

      // This is a good term in the range. Check if mincount/maxcount conditions are satisfied.
      int docFreq = termsEnum.docFreq();
      if (docFreq >= freqmin && docFreq <= freqmax) {
        // add the term to the list
        if (sort) {
          queue.add(new CountPair<>(BytesRef.deepCopyOf(term), docFreq));
        } else {
          // TODO: handle raw somehow
          if (!externalized) {
            ft.indexedToReadable(term, external);
          }
          fieldTerms.add(external.toString(), docFreq);
          i++;
        }
      }

      term = termsEnum.next();
    }

    if (sort) {
      for (CountPair<BytesRef, Integer> item : queue) {
        if (i >= limit) break;
        ft.indexedToReadable(item.key, external);
        fieldTerms.add(external.toString(), item.val);
        i++;
      }
    }
  }
}
@Override
public boolean equalsSameType(Object other) {
  MutableValueStr b = (MutableValueStr) other;
  return value.equals(b.value) && exists == b.exists;
}
@Override
public Facet facet() {
  if (current != null) {
    missing += current.counts[0];
    total += current.total - current.counts[0];
    // if we have values for this one, add it
    if (current.values.ordinals().getNumOrds() > 1) {
      aggregators.add(current);
    }
  }

  AggregatorPriorityQueue queue = new AggregatorPriorityQueue(aggregators.size());

  for (ReaderAggregator aggregator : aggregators) {
    if (aggregator.nextPosition()) {
      queue.add(aggregator);
    }
  }

  // YACK, we repeat the same logic, but once with an optimized priority queue for smaller sizes
  if (size < EntryPriorityQueue.LIMIT) {
    // optimize to use priority size
    EntryPriorityQueue ordered = new EntryPriorityQueue(size, comparatorType.comparator());

    while (queue.size() > 0) {
      ReaderAggregator agg = queue.top();
      // we need to makeSafe it, since we end up pushing it... (can we get around this?)
      BytesRef value = agg.values.makeSafe(agg.current);
      int count = 0;
      do {
        count += agg.counts[agg.position];
        if (agg.nextPosition()) {
          agg = queue.updateTop();
        } else {
          // we are done with this reader
          queue.pop();
          agg = queue.top();
        }
      } while (agg != null && value.equals(agg.current));

      if (count > minCount) {
        if (excluded != null && excluded.contains(value)) {
          continue;
        }
        // LUCENE 4 UPGRADE: use Lucene's RegexCapabilities
        if (matcher != null && !matcher.reset(value.utf8ToString()).matches()) {
          continue;
        }
        InternalStringTermsFacet.TermEntry entry =
            new InternalStringTermsFacet.TermEntry(value, count);
        ordered.insertWithOverflow(entry);
      }
    }

    InternalStringTermsFacet.TermEntry[] list =
        new InternalStringTermsFacet.TermEntry[ordered.size()];
    for (int i = ordered.size() - 1; i >= 0; i--) {
      list[i] = (InternalStringTermsFacet.TermEntry) ordered.pop();
    }

    for (ReaderAggregator aggregator : aggregators) {
      CacheRecycler.pushIntArray(aggregator.counts);
    }

    return new InternalStringTermsFacet(facetName, comparatorType, size,
        Arrays.asList(list), missing, total);
  }

  BoundedTreeSet<InternalStringTermsFacet.TermEntry> ordered =
      new BoundedTreeSet<InternalStringTermsFacet.TermEntry>(comparatorType.comparator(), size);

  while (queue.size() > 0) {
    ReaderAggregator agg = queue.top();
    // we need to makeSafe it, since we end up pushing it... (can we work around that?)
    BytesRef value = agg.values.makeSafe(agg.current);
    int count = 0;
    do {
      count += agg.counts[agg.position];
      if (agg.nextPosition()) {
        agg = queue.updateTop();
      } else {
        // we are done with this reader
        queue.pop();
        agg = queue.top();
      }
    } while (agg != null && value.equals(agg.current));

    if (count > minCount) {
      if (excluded != null && excluded.contains(value)) {
        continue;
      }
      // LUCENE 4 UPGRADE: use Lucene's RegexCapabilities
      if (matcher != null && !matcher.reset(value.utf8ToString()).matches()) {
        continue;
      }
      InternalStringTermsFacet.TermEntry entry =
          new InternalStringTermsFacet.TermEntry(value, count);
      ordered.add(entry);
    }
  }

  for (ReaderAggregator aggregator : aggregators) {
    CacheRecycler.pushIntArray(aggregator.counts);
  }

  return new InternalStringTermsFacet(facetName, comparatorType, size, ordered, missing, total);
}