@Override public void seekExact(long ord) throws IOException { // System.out.println("BTR.seek by ord ord=" + ord); if (indexEnum == null) { throw new IllegalStateException("terms index was not loaded"); } assert ord < numTerms; // TODO: if ord is in same terms block and // after current ord, we should avoid this seek just // like we do in the seek(BytesRef) case in.seek(indexEnum.seek(ord)); boolean result = nextBlock(); // Block must exist since ord < numTerms: assert result; indexIsCurrent = true; didIndexNext = false; seekPending = false; state.ord = indexEnum.ord() - 1; assert state.ord >= -1 : "ord=" + state.ord; term.copyBytes(indexEnum.term()); // Now, scan: int left = (int) (ord - state.ord); while (left > 0) { final BytesRef term = _next(); assert term != null; left--; assert indexIsCurrent; } }
@Override public void seekExact(BytesRef target, TermState otherState) { // System.out.println("BTR.seekExact termState target=" + target.utf8ToString() + " " + // target + " this=" + this); assert otherState != null && otherState instanceof BlockTermState; assert !doOrd || ((BlockTermState) otherState).ord < numTerms; state.copyFrom(otherState); seekPending = true; indexIsCurrent = false; term.copyBytes(target); }
@Override public void seekExact(BytesRef target, TermState otherState) { // if (DEBUG) { // System.out.println("BTTR.seekExact termState seg=" + segment + " target=" + // target.utf8ToString() + " " + target + " state=" + otherState); // } assert clearEOF(); if (target.compareTo(term.get()) != 0 || !termExists) { assert otherState != null && otherState instanceof BlockTermState; currentFrame = staticFrame; currentFrame.state.copyFrom(otherState); term.copyBytes(target); currentFrame.metaDataUpto = currentFrame.getTermBlockOrd(); assert currentFrame.metaDataUpto > 0; validIndexPrefix = 0; } else { // if (DEBUG) { // System.out.println(" skip seek: already on target state=" + currentFrame.state); // } } }
/**
 * Seeks towards {@code target} by walking the FST terms index and then scanning the selected
 * block.
 *
 * <p>The method runs in three phases:
 *
 * <ol>
 *   <li>If the enum is already seeked ({@code currentFrame != staticFrame}), compare
 *       {@code target} with the current term and re-use the saved arcs and frames for the
 *       shared prefix; otherwise push the root frame.
 *   <li>Continue following index arcs for the remaining target bytes, pushing a frame for each
 *       final arc.
 *   <li>Once the index is exhausted or the target is consumed, scan the current frame's block
 *       for the target.
 * </ol>
 *
 * @param target the term to seek to
 * @return {@code FOUND} if the enum is already on {@code target} and {@code termExists} is set,
 *     or whatever {@code scanToTerm} reports when that is not {@code END}; when the block scan
 *     reports {@code END}, {@code NOT_FOUND} if {@code next()} yields a term and {@code END}
 *     if it yields {@code null}
 * @throws IllegalStateException if {@code fr.index} is {@code null}
 * @throws IOException declared by this method; presumably raised by the index and block reads
 */
@Override
public SeekStatus seekCeil(final BytesRef target) throws IOException {
  if (fr.index == null) {
    throw new IllegalStateException("terms index was not loaded");
  }

  // Make room for the target bytes that the index walk below writes into term.
  term.grow(1 + target.length);

  assert clearEOF();

  FST.Arc<Pair<BytesRef, Long>> arc;
  int targetUpto;
  Pair<BytesRef, Long> output;

  targetBeforeCurrentLength = currentFrame.ord;

  if (currentFrame != staticFrame) {

    // We are already seek'd; find the common
    // prefix of new seek term vs current term and
    // re-use the corresponding seek state.  For
    // example, if app first seeks to foobar, then
    // seeks to foobaz, we can re-use the seek state
    // for the first 5 bytes.

    arc = arcs[0];
    assert arc.isFinal();
    output = arc.output;
    targetUpto = 0;

    IDVersionSegmentTermsEnumFrame lastFrame = stack[0];
    assert validIndexPrefix <= term.length();

    // Only the first validIndexPrefix bytes of term have saved arcs/frames to re-use.
    final int targetLimit = Math.min(target.length, validIndexPrefix);

    int cmp = 0;

    // TODO: we should write our vLong backwards (MSB
    // first) to get better sharing from the FST

    // First compare up to valid seek frames:
    while (targetUpto < targetLimit) {
      cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF);
      if (cmp != 0) {
        break;
      }
      // Bytes match: pick up the arc saved for this position (asserted to carry this label).
      arc = arcs[1 + targetUpto];
      assert arc.label == (target.bytes[target.offset + targetUpto] & 0xFF)
          : "arc.label="
              + (char) arc.label
              + " targetLabel="
              + (char) (target.bytes[target.offset + targetUpto] & 0xFF);
      // TODO: we could save the outputs in local
      // byte[][] instead of making new objs ever
      // seek; but, often the FST doesn't have any
      // shared bytes (but this could change if we
      // reverse vLong byte order)
      if (arc.output != VersionBlockTreeTermsWriter.NO_OUTPUT) {
        output = VersionBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
      }
      // The index walk further down pushes one frame per final arc, so a final arc here
      // means the next frame on the stack belongs to this prefix.
      if (arc.isFinal()) {
        lastFrame = stack[1 + lastFrame.ord];
      }
      targetUpto++;
    }

    if (cmp == 0) {
      // Remember how far the re-usable state reaches; targetUpto is restored to this below.
      final int targetUptoMid = targetUpto;
      // Second compare the rest of the term, but
      // don't save arc/output/frame:
      final int targetLimit2 = Math.min(target.length, term.length());
      while (targetUpto < targetLimit2) {
        cmp =
            (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF);
        if (cmp != 0) {
          break;
        }
        targetUpto++;
      }

      if (cmp == 0) {
        // All shared bytes are equal, so the shorter of the two sorts first.
        cmp = term.length() - target.length;
      }
      targetUpto = targetUptoMid;
    }

    if (cmp < 0) {
      // Common case: target term is after current
      // term, ie, app is seeking multiple terms
      // in sorted order
      currentFrame = lastFrame;

    } else if (cmp > 0) {
      // Uncommon case: target term
      // is before current term; this means we can
      // keep the currentFrame but we must rewind it
      // (so we scan from the start)
      targetBeforeCurrentLength = 0;
      currentFrame = lastFrame;
      currentFrame.rewind();
    } else {
      // Target is exactly the same as current term
      assert term.length() == target.length;
      if (termExists) {
        return SeekStatus.FOUND;
      } else {
        // Same bytes, but the current term does not exist: fall through to the index walk.
      }
    }

  } else {

    targetBeforeCurrentLength = -1;
    arc = fr.index.getFirstArc(arcs[0]);

    // Empty string prefix must have an output (block) in the index!
    assert arc.isFinal();
    assert arc.output != null;

    // No seek state to re-use: start from the root frame.
    output = arc.output;

    currentFrame = staticFrame;

    // term.length = 0;
    targetUpto = 0;
    currentFrame =
        pushFrame(
            arc, VersionBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput), 0);
  }

  // We are done sharing the common prefix with the incoming target and where we are currently
  // seek'd; now continue walking the index:
  while (targetUpto < target.length) {

    final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF;

    final FST.Arc<Pair<BytesRef, Long>> nextArc =
        fr.index.findTargetArc(targetLabel, arc, getArc(1 + targetUpto), fstReader);

    if (nextArc == null) {

      // Index is exhausted: no arc for this byte, so the answer lies in the current frame.
      validIndexPrefix = currentFrame.prefix;
      // validIndexPrefix = targetUpto;

      currentFrame.scanToFloorFrame(target);

      currentFrame.loadBlock();

      final SeekStatus result = currentFrame.scanToTerm(target, false);
      if (result == SeekStatus.END) {
        // The block scan ran off the end: record target as a non-existent current term and
        // let next() decide between NOT_FOUND and END.
        term.copyBytes(target);
        termExists = false;

        if (next() != null) {
          return SeekStatus.NOT_FOUND;
        } else {
          return SeekStatus.END;
        }
      } else {
        return result;
      }
    } else {
      // Follow this arc
      term.setByteAt(targetUpto, (byte) targetLabel);
      arc = nextArc;
      // Aggregate output as we go:
      assert arc.output != null;
      if (arc.output != VersionBlockTreeTermsWriter.NO_OUTPUT) {
        output = VersionBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
      }

      targetUpto++;

      if (arc.isFinal()) {
        // A final arc gets its own frame, keyed by the prefix length consumed so far.
        currentFrame =
            pushFrame(
                arc,
                VersionBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput),
                targetUpto);
      }
    }
  }

  // The whole target was consumed by the index walk; scan the deepest frame reached.
  // validIndexPrefix = targetUpto;
  validIndexPrefix = currentFrame.prefix;

  currentFrame.scanToFloorFrame(target);

  currentFrame.loadBlock();

  final SeekStatus result = currentFrame.scanToTerm(target, false);

  if (result == SeekStatus.END) {
    // Same END handling as inside the loop: next() decides between NOT_FOUND and END.
    term.copyBytes(target);
    termExists = false;
    if (next() != null) {
      return SeekStatus.NOT_FOUND;
    } else {
      return SeekStatus.END;
    }
  } else {
    return result;
  }
}
// TODO: we may want an alternate mode here which is
// "if you are about to return NOT_FOUND I won't use
// the terms data from that"; eg FuzzyTermsEnum will
// (usually) just immediately call seek again if we
// return NOT_FOUND so it's a waste for us to fill in
// the term that was actually NOT_FOUND
/**
 * Seeks towards {@code target} using the terms index to pick a block, then scans term suffixes
 * inside that block (and following blocks) until the target is matched or passed.
 *
 * <p>The index seek is skipped when the index is current and {@code target} falls after the
 * current term but before the next index term. In that case only the scan is performed.
 *
 * @param target the term to seek to
 * @return {@code FOUND} on an exact match (including when the enum is already on
 *     {@code target}); {@code NOT_FOUND} when the scan stops on a term that compares after
 *     {@code target}; {@code END} when {@code nextBlock()} reports no further block
 * @throws IllegalStateException if {@code indexEnum} is {@code null}
 * @throws IOException declared by this method; presumably raised by the underlying reads
 */
@Override
public SeekStatus seekCeil(final BytesRef target) throws IOException {

  if (indexEnum == null) {
    throw new IllegalStateException("terms index was not loaded");
  }

  // Leftover scaffolding for debug output; both branches are intentionally empty.
  if (didIndexNext) {
    if (nextIndexTerm == null) {
      // (debug print removed)
    } else {
      // (debug print removed)
    }
  }

  boolean doSeek = true;

  // See if we can avoid seeking, because target term
  // is after current term but before next index term:
  if (indexIsCurrent) {

    final int cmp = BytesRef.getUTF8SortedAsUnicodeComparator().compare(term.get(), target);

    if (cmp == 0) {
      // Already at the requested term
      return SeekStatus.FOUND;
    } else if (cmp < 0) {

      // Target term is after current term
      if (!didIndexNext) {
        // Look ahead once in the index and cache the result; -1 from next() is treated as
        // "no further index term".
        if (indexEnum.next() == -1) {
          nextIndexTerm = null;
        } else {
          nextIndexTerm = indexEnum.term();
        }
        didIndexNext = true;
      }

      if (nextIndexTerm == null
          || BytesRef.getUTF8SortedAsUnicodeComparator().compare(target, nextIndexTerm) < 0) {
        // Optimization: requested term is within the
        // same term block we are now in; skip seeking
        // (but do scanning):
        doSeek = false;
      }
    }
  }

  if (doSeek) {

    // Ask terms index to find biggest indexed term (=
    // first term in a block) that's <= our text:
    in.seek(indexEnum.seek(target));
    boolean result = nextBlock();

    // Block must exist since, at least, the indexed term
    // is in the block:
    assert result;

    indexIsCurrent = true;
    didIndexNext = false;

    if (doOrd) {
      // One before the index term's ordinal; the scan below increments ord per term read.
      state.ord = indexEnum.ord() - 1;
    }

    term.copyBytes(indexEnum.term());
  } else {
    // Seek skipped. If the current block is already fully consumed, move to the next one.
    if (state.termBlockOrd == blockTermCount && !nextBlock()) {
      indexIsCurrent = false;
      return SeekStatus.END;
    }
  }

  seekPending = false;

  int common = 0;

  // Scan within block.  We could do this by calling
  // _next() and testing the resulting term, but this
  // is wasteful.  Instead, we first confirm the
  // target matches the common prefix of this block,
  // and then we scan the term bytes directly from the
  // termSuffixesreader's byte[], saving a copy into
  // the BytesRef term per term.  Only when we return
  // do we then copy the bytes into the term.

  while (true) {

    // First, see if target term matches common prefix
    // in this block:
    if (common < termBlockPrefix) {
      // NOTE(review): target.bytes is indexed at offset + common with no visible check that
      // common < target.length. Presumably safe for the targets callers pass, but confirm.
      final int cmp =
          (term.byteAt(common) & 0xFF) - (target.bytes[target.offset + common] & 0xFF);
      if (cmp < 0) {

        // TODO: maybe we should store common prefix
        // in block header?  (instead of relying on
        // last term of previous block)

        // Target's prefix is after the common block
        // prefix, so term cannot be in this block
        // but it could be in next block.  We
        // must scan to end-of-block to set common
        // prefix for next block:
        if (state.termBlockOrd < blockTermCount) {
          // Skip every remaining suffix except the last one...
          while (state.termBlockOrd < blockTermCount - 1) {
            state.termBlockOrd++;
            state.ord++;
            termSuffixesReader.skipBytes(termSuffixesReader.readVInt());
          }
          // ...and materialize the last term of the block into term.
          final int suffix = termSuffixesReader.readVInt();
          term.setLength(termBlockPrefix + suffix);
          term.grow(term.length());
          termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
        }
        state.ord++;

        if (!nextBlock()) {
          indexIsCurrent = false;
          return SeekStatus.END;
        }
        // New block: restart the prefix comparison from the first byte.
        common = 0;

      } else if (cmp > 0) {
        // Target's prefix is before the common prefix
        // of this block, so we position to start of
        // block and return NOT_FOUND:
        assert state.termBlockOrd == 0;

        final int suffix = termSuffixesReader.readVInt();
        term.setLength(termBlockPrefix + suffix);
        term.grow(term.length());
        termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
        return SeekStatus.NOT_FOUND;
      } else {
        common++;
      }

      // Keep checking prefix bytes until the whole block prefix has been matched.
      continue;
    }

    // Test every term in this block
    while (true) {
      state.termBlockOrd++;
      state.ord++;

      final int suffix = termSuffixesReader.readVInt();

      // We know the prefix matches, so just compare the new suffix:
      final int termLen = termBlockPrefix + suffix;
      int bytePos = termSuffixesReader.getPosition();

      // Set when the current term compares before target, i.e. scanning must continue.
      boolean next = false;
      // Compare only up to the shorter of the two lengths.
      final int limit = target.offset + (termLen < target.length ? termLen : target.length);
      int targetPos = target.offset + termBlockPrefix;
      while (targetPos < limit) {
        // Bytes are read straight from termSuffixes at bytePos, without advancing the reader.
        final int cmp = (termSuffixes[bytePos++] & 0xFF) - (target.bytes[targetPos++] & 0xFF);
        if (cmp < 0) {
          // Current term is still before the target;
          // keep scanning
          next = true;
          break;
        } else if (cmp > 0) {
          // Done!  Current term is after target. Stop
          // here, fill in real term, return NOT_FOUND.
          term.setLength(termBlockPrefix + suffix);
          term.grow(term.length());
          termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
          return SeekStatus.NOT_FOUND;
        }
      }

      // All compared bytes were equal and the term is at least as long as target: the term
      // either equals target or has target as a proper prefix.
      if (!next && target.length <= termLen) {
        term.setLength(termBlockPrefix + suffix);
        term.grow(term.length());
        termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);

        if (target.length == termLen) {
          // Done!  Exact match.  Stop here, fill in
          // real term, return FOUND.
          return SeekStatus.FOUND;
        } else {
          // Term is longer than target, so it sorts after it.
          return SeekStatus.NOT_FOUND;
        }
      }

      if (state.termBlockOrd == blockTermCount) {
        // Must pre-fill term for next block's common prefix
        term.setLength(termBlockPrefix + suffix);
        term.grow(term.length());
        termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
        break;
      } else {
        // Not a match and not the last term: step the reader over this suffix.
        termSuffixesReader.skipBytes(suffix);
      }
    }

    // The purpose of the terms dict index is to seek
    // the enum to the closest index term before the
    // term we are looking for.  So, we should never
    // cross another index term (besides the first
    // one) while we are scanning:

    assert indexIsCurrent;

    if (!nextBlock()) {
      indexIsCurrent = false;
      return SeekStatus.END;
    }
    // New block: restart the prefix comparison from the first byte.
    common = 0;
  }
}