/* Does initial decode of next block of terms; this
   doesn't actually decode the docFreq, totalTermFreq,
   postings details (frq/prx offset, etc.) metadata;
   it just loads them as byte[] blobs which are then
   decoded on-demand if the metadata is ever requested
   for any term in this block.  This enables terms-only
   intensive consumers (eg certain MTQs, respelling) to
   not pay the price of decoding metadata they won't
   use. */
private boolean nextBlock() throws IOException {

  // TODO: we still lazy-decode the byte[] for each
  // term (the suffix), but, if we decoded
  // all N terms up front then seeking could do a fast
  // bsearch w/in the block...

  // System.out.println("BTR.nextBlock() fp=" + in.getFilePointer() + " this=" + this);
  state.blockFilePointer = in.getFilePointer();
  blockTermCount = in.readVInt();
  // System.out.println("  blockTermCount=" + blockTermCount);
  if (blockTermCount == 0) {
    return false;
  }
  termBlockPrefix = in.readVInt();

  // term suffixes:
  int len = in.readVInt();
  if (termSuffixes.length < len) {
    termSuffixes = new byte[ArrayUtil.oversize(len, 1)];
  }
  // System.out.println("  termSuffixes len=" + len);
  in.readBytes(termSuffixes, 0, len);
  termSuffixesReader.reset(termSuffixes, 0, len);

  // docFreq, totalTermFreq
  len = in.readVInt();
  if (docFreqBytes.length < len) {
    docFreqBytes = new byte[ArrayUtil.oversize(len, 1)];
  }
  // System.out.println("  freq bytes len=" + len);
  in.readBytes(docFreqBytes, 0, len);
  freqReader.reset(docFreqBytes, 0, len);

  // metadata
  len = in.readVInt();
  if (bytes == null) {
    bytes = new byte[ArrayUtil.oversize(len, 1)];
    bytesReader = new ByteArrayDataInput();
  } else if (bytes.length < len) {
    bytes = new byte[ArrayUtil.oversize(len, 1)];
  }
  in.readBytes(bytes, 0, len);
  bytesReader.reset(bytes, 0, len);

  metaDataUpto = 0;
  state.termBlockOrd = 0;

  indexIsCurrent = false;
  // System.out.println("  indexIsCurrent=" + indexIsCurrent);

  return true;
}
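// Illustrative sketch, not part of the original class: how the lazy blob
// loading above pays off.  A terms-only consumer that never asks for
// docFreq()/postings() never triggers decodeMetaData(), so the
// docFreqBytes/metadata blobs read by nextBlock() are loaded but never
// decoded.  Assuming "terms" is any org.apache.lucene.index.Terms backed
// by this codec:
//
//   TermsEnum termsEnum = terms.iterator();
//   while (termsEnum.next() != null) {
//     // inspect termsEnum.term() bytes only; since docFreq() and
//     // totalTermFreq() are never called, the per-term metadata decode
//     // cost is never paid for any term in any block
//   }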
public SegmentTermsEnum() throws IOException {
  in = BlockTermsReader.this.in.clone();
  in.seek(termsStartPointer);
  indexEnum = indexReader.getFieldEnum(fieldInfo);
  doOrd = indexReader.supportsOrd();
  fieldTerm.field = fieldInfo.name;
  state = postingsReader.newTermState();
  state.totalTermFreq = -1;
  state.ord = -1;

  termSuffixes = new byte[128];
  docFreqBytes = new byte[64];
  // System.out.println("BTR.enum init this=" + this + " postingsReader=" + postingsReader);
  longs = new long[longsSize];
}
@Override
public void copyFrom(TermState _other) {
  super.copyFrom(_other);
  SepTermState other = (SepTermState) _other;
  if (docIndex == null) {
    docIndex = other.docIndex.clone();
  } else {
    docIndex.copyFrom(other.docIndex);
  }
  if (other.freqIndex != null) {
    if (freqIndex == null) {
      freqIndex = other.freqIndex.clone();
    } else {
      freqIndex.copyFrom(other.freqIndex);
    }
  } else {
    freqIndex = null;
  }
  if (other.posIndex != null) {
    if (posIndex == null) {
      posIndex = other.posIndex.clone();
    } else {
      posIndex.copyFrom(other.posIndex);
    }
  } else {
    posIndex = null;
  }
  payloadFP = other.payloadFP;
  skipFP = other.skipFP;
}
@Override
public void seekExact(long ord) throws IOException {
  // System.out.println("BTR.seek by ord ord=" + ord);
  if (indexEnum == null) {
    throw new IllegalStateException("terms index was not loaded");
  }

  assert ord < numTerms;

  // TODO: if ord is in same terms block and
  // after current ord, we should avoid this seek just
  // like we do in the seek(BytesRef) case
  in.seek(indexEnum.seek(ord));
  boolean result = nextBlock();

  // Block must exist since ord < numTerms:
  assert result;

  indexIsCurrent = true;
  didIndexNext = false;
  seekPending = false;

  state.ord = indexEnum.ord() - 1;
  assert state.ord >= -1 : "ord=" + state.ord;
  term.copyBytes(indexEnum.term());

  // Now, scan:
  int left = (int) (ord - state.ord);
  while (left > 0) {
    final BytesRef term = _next();
    assert term != null;
    left--;
    assert indexIsCurrent;
  }
}
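// Hypothetical usage sketch, not from the original source: the cost of
// seekExact(ord) above is one terms-index seek plus a linear _next()
// scan of at most one block's worth of terms, so with ords enabled a
// caller can jump to an absolute position cheaply:
//
//   termsEnum.seekExact(42);          // position on the term with ord 42
//   BytesRef t = termsEnum.term();    // term bytes are now valid
//   assert termsEnum.ord() == 42;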
@Override
public BytesRef next() throws IOException {
  // System.out.println("BTR.next() seekPending=" + seekPending + " pendingSeekCount=" + state.termBlockOrd);

  // If seek was previously called and the term was cached,
  // usually caller is just going to pull a D/&PEnum or get
  // docFreq, etc.  But, if they then call next(),
  // this method catches up all internal state so next()
  // works properly:
  if (seekPending) {
    assert !indexIsCurrent;
    in.seek(state.blockFilePointer);
    final int pendingSeekCount = state.termBlockOrd;
    boolean result = nextBlock();

    final long savOrd = state.ord;

    // Block must exist since seek(TermState) was called w/ a
    // TermState previously returned by this enum when positioned
    // on a real term:
    assert result;

    while (state.termBlockOrd < pendingSeekCount) {
      BytesRef nextResult = _next();
      assert nextResult != null;
    }
    seekPending = false;
    state.ord = savOrd;
  }
  return _next();
}
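// Hedged sketch, not part of the original source, of the catch-up that
// next() performs: after seekExact(target, termState) only the cached
// TermState and term bytes were copied; no block was read.  A subsequent
// next() therefore re-reads the block at blockFilePointer and replays
// termBlockOrd terms before advancing:
//
//   termsEnum.seekExact(cachedTerm, cachedState);  // cheap: no block I/O yet
//   BytesRef following = termsEnum.next();         // pays the replay cost above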
@Override
public TermState termState() throws IOException {
  // System.out.println("BTR.termState this=" + this);
  decodeMetaData();
  TermState ts = state.clone();
  // System.out.println("  return ts=" + ts);
  return ts;
}
private void decodeMetaData() throws IOException {
  // System.out.println("BTR.decodeMetadata mdUpto=" + metaDataUpto + " vs termCount=" + state.termBlockOrd + " state=" + state);
  if (!seekPending) {
    // TODO: cutover to random-access API
    // here.... really stupid that we have to decode N
    // wasted term metadata just to get to the N+1th
    // that we really need...

    // lazily catch up on metadata decode:
    final int limit = state.termBlockOrd;
    boolean absolute = metaDataUpto == 0;
    // TODO: better API would be "jump straight to term=N"???
    while (metaDataUpto < limit) {
      // System.out.println("  decode mdUpto=" + metaDataUpto);

      // TODO: we could make "tiers" of metadata, ie,
      // decode docFreq/totalTF but don't decode postings
      // metadata; this way caller could get
      // docFreq/totalTF w/o paying decode cost for
      // postings

      // TODO: if docFreq were bulk decoded we could
      // just skipN here:

      // docFreq, totalTermFreq
      state.docFreq = freqReader.readVInt();
      // System.out.println("    dF=" + state.docFreq);
      if (fieldInfo.getIndexOptions() != IndexOptions.DOCS) {
        state.totalTermFreq = state.docFreq + freqReader.readVLong();
        // System.out.println("    totTF=" + state.totalTermFreq);
      }
      // metadata
      for (int i = 0; i < longs.length; i++) {
        longs[i] = bytesReader.readVLong();
      }
      postingsReader.decodeTerm(longs, bytesReader, fieldInfo, state, absolute);
      metaDataUpto++;
      absolute = false;
    }
  } else {
    // System.out.println("  skip! seekPending");
  }
}
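// Illustrative sketch (not from the original source) of when the lazy
// catch-up above actually runs.  Decode is deferred until a stats or
// postings call; "isInteresting" is a hypothetical filter:
//
//   while (termsEnum.next() != null) {            // no metadata decode yet
//     if (isInteresting(termsEnum.term())) {
//       int df = termsEnum.docFreq();             // first decode: loops
//     }                                           // metaDataUpto up to
//   }                                             // state.termBlockOrd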
@Override
public void seekExact(BytesRef target, TermState otherState) {
  // System.out.println("BTR.seekExact termState target=" + target.utf8ToString() + " " + target + " this=" + this);
  assert otherState != null && otherState instanceof BlockTermState;
  assert !doOrd || ((BlockTermState) otherState).ord < numTerms;
  state.copyFrom(otherState);
  seekPending = true;
  indexIsCurrent = false;
  term.copyBytes(target);
}
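// Hypothetical round-trip, not part of the original source, pairing the
// seekExact(term, state) above with termState(): a caller can capture a
// position and later restore it in O(1), without a terms-dict seek.
// Note that indexIsCurrent is cleared above, so a later seekCeil() cannot
// use its in-block fast path until the enum re-seeks:
//
//   BytesRef savedTerm = BytesRef.deepCopyOf(termsEnum.term());
//   TermState savedState = termsEnum.termState();  // decodes metadata once
//   // ... reposition or reuse the enum elsewhere ...
//   termsEnum.seekExact(savedTerm, savedState);    // sets seekPending=true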
@Override
public TermState termState() throws IOException {
  decodeMetaData();
  return state.clone();
}
// TODO: we may want an alternate mode here which is
// "if you are about to return NOT_FOUND I won't use
// the terms data from that"; eg FuzzyTermsEnum will
// (usually) just immediately call seek again if we
// return NOT_FOUND so it's a waste for us to fill in
// the term that was actually NOT_FOUND
@Override
public SeekStatus seekCeil(final BytesRef target) throws IOException {

  if (indexEnum == null) {
    throw new IllegalStateException("terms index was not loaded");
  }

  // System.out.println("BTR.seek seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + term().utf8ToString() + " " + term() + " indexIsCurrent=" + indexIsCurrent + " didIndexNext=" + didIndexNext + " seekPending=" + seekPending + " divisor=" + indexReader.getDivisor() + " this=" + this);
  if (didIndexNext) {
    if (nextIndexTerm == null) {
      // System.out.println("  nextIndexTerm=null");
    } else {
      // System.out.println("  nextIndexTerm=" + nextIndexTerm.utf8ToString());
    }
  }

  boolean doSeek = true;

  // See if we can avoid seeking, because target term
  // is after current term but before next index term:
  if (indexIsCurrent) {

    final int cmp = BytesRef.getUTF8SortedAsUnicodeComparator().compare(term.get(), target);

    if (cmp == 0) {
      // Already at the requested term
      return SeekStatus.FOUND;
    } else if (cmp < 0) {

      // Target term is after current term
      if (!didIndexNext) {
        if (indexEnum.next() == -1) {
          nextIndexTerm = null;
        } else {
          nextIndexTerm = indexEnum.term();
        }
        // System.out.println("  now do index next() nextIndexTerm=" + (nextIndexTerm == null ? "null" : nextIndexTerm.utf8ToString()));
        didIndexNext = true;
      }

      if (nextIndexTerm == null
          || BytesRef.getUTF8SortedAsUnicodeComparator().compare(target, nextIndexTerm) < 0) {
        // Optimization: requested term is within the
        // same term block we are now in; skip seeking
        // (but do scanning):
        doSeek = false;
        // System.out.println("  skip seek: nextIndexTerm=" + (nextIndexTerm == null ? "null" : nextIndexTerm.utf8ToString()));
      }
    }
  }

  if (doSeek) {
    // System.out.println("  seek");

    // Ask terms index to find biggest indexed term (=
    // first term in a block) that's <= our text:
    in.seek(indexEnum.seek(target));
    boolean result = nextBlock();

    // Block must exist since, at least, the indexed term
    // is in the block:
    assert result;

    indexIsCurrent = true;
    didIndexNext = false;

    if (doOrd) {
      state.ord = indexEnum.ord() - 1;
    }

    term.copyBytes(indexEnum.term());
    // System.out.println("  seek: term=" + term.utf8ToString());
  } else {
    // System.out.println("  skip seek");
    if (state.termBlockOrd == blockTermCount && !nextBlock()) {
      indexIsCurrent = false;
      return SeekStatus.END;
    }
  }

  seekPending = false;

  int common = 0;

  // Scan within block.  We could do this by calling
  // _next() and testing the resulting term, but this
  // is wasteful.  Instead, we first confirm the
  // target matches the common prefix of this block,
  // and then we scan the term bytes directly from the
  // termSuffixesReader's byte[], saving a copy into
  // the BytesRef term per term.  Only when we return
  // do we then copy the bytes into the term.

  while (true) {

    // First, see if target term matches common prefix
    // in this block:
    if (common < termBlockPrefix) {
      final int cmp = (term.byteAt(common) & 0xFF) - (target.bytes[target.offset + common] & 0xFF);
      if (cmp < 0) {

        // TODO: maybe we should store common prefix
        // in block header?  (instead of relying on
        // last term of previous block)

        // Target's prefix is after the common block
        // prefix, so term cannot be in this block
        // but it could be in next block.  We
        // must scan to end-of-block to set common
        // prefix for next block:
        if (state.termBlockOrd < blockTermCount) {
          while (state.termBlockOrd < blockTermCount - 1) {
            state.termBlockOrd++;
            state.ord++;
            termSuffixesReader.skipBytes(termSuffixesReader.readVInt());
          }
          final int suffix = termSuffixesReader.readVInt();
          term.setLength(termBlockPrefix + suffix);
          term.grow(term.length());
          termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
        }
        state.ord++;

        if (!nextBlock()) {
          indexIsCurrent = false;
          return SeekStatus.END;
        }
        common = 0;

      } else if (cmp > 0) {
        // Target's prefix is before the common prefix
        // of this block, so we position to start of
        // block and return NOT_FOUND:
        assert state.termBlockOrd == 0;

        final int suffix = termSuffixesReader.readVInt();
        term.setLength(termBlockPrefix + suffix);
        term.grow(term.length());
        termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
        return SeekStatus.NOT_FOUND;
      } else {
        common++;
      }

      continue;
    }

    // Test every term in this block
    while (true) {
      state.termBlockOrd++;
      state.ord++;

      final int suffix = termSuffixesReader.readVInt();

      // We know the prefix matches, so just compare the new suffix:
      final int termLen = termBlockPrefix + suffix;
      int bytePos = termSuffixesReader.getPosition();

      boolean next = false;
      final int limit = target.offset + (termLen < target.length ? termLen : target.length);
      int targetPos = target.offset + termBlockPrefix;
      while (targetPos < limit) {
        final int cmp = (termSuffixes[bytePos++] & 0xFF) - (target.bytes[targetPos++] & 0xFF);
        if (cmp < 0) {
          // Current term is still before the target;
          // keep scanning
          next = true;
          break;
        } else if (cmp > 0) {
          // Done!  Current term is after target. Stop
          // here, fill in real term, return NOT_FOUND.
          term.setLength(termBlockPrefix + suffix);
          term.grow(term.length());
          termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
          // System.out.println("  NOT_FOUND");
          return SeekStatus.NOT_FOUND;
        }
      }

      if (!next && target.length <= termLen) {
        term.setLength(termBlockPrefix + suffix);
        term.grow(term.length());
        termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);

        if (target.length == termLen) {
          // Done!  Exact match.  Stop here, fill in
          // real term, return FOUND.
          // System.out.println("  FOUND");
          return SeekStatus.FOUND;
        } else {
          // System.out.println("  NOT_FOUND");
          return SeekStatus.NOT_FOUND;
        }
      }

      if (state.termBlockOrd == blockTermCount) {
        // Must pre-fill term for next block's common prefix
        term.setLength(termBlockPrefix + suffix);
        term.grow(term.length());
        termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
        break;
      } else {
        termSuffixesReader.skipBytes(suffix);
      }
    }

    // The purpose of the terms dict index is to seek
    // the enum to the closest index term before the
    // term we are looking for.  So, we should never
    // cross another index term (besides the first
    // one) while we are scanning:

    assert indexIsCurrent;

    if (!nextBlock()) {
      // System.out.println("  END");
      indexIsCurrent = false;
      return SeekStatus.END;
    }
    common = 0;
  }
}
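// Behavioral sketch of the three outcomes above (illustrative, not part
// of the original source), assuming a field whose terms are
// {"apple", "banana"}:
//
//   termsEnum.seekCeil(new BytesRef("apple"));   // FOUND; term() == "apple"
//   termsEnum.seekCeil(new BytesRef("axe"));     // NOT_FOUND; positioned on
//                                                // "banana", the ceiling term
//   termsEnum.seekCeil(new BytesRef("zebra"));   // END; enum is exhausted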