예제 #1
0
      @Override
      public void seekExact(long ord) throws IOException {
        // System.out.println("BTR.seek by ord ord=" + ord);
        if (indexEnum == null) {
          throw new IllegalStateException("terms index was not loaded");
        }

        assert ord < numTerms;

        // TODO: if ord is in same terms block and
        // after current ord, we should avoid this seek just
        // like we do in the seek(BytesRef) case
        in.seek(indexEnum.seek(ord));
        boolean result = nextBlock();

        // Block must exist since ord < numTerms:
        assert result;

        indexIsCurrent = true;
        didIndexNext = false;
        seekPending = false;

        state.ord = indexEnum.ord() - 1;
        assert state.ord >= -1 : "ord=" + state.ord;
        term.copyBytes(indexEnum.term());

        // Now, scan:
        int left = (int) (ord - state.ord);
        while (left > 0) {
          final BytesRef term = _next();
          assert term != null;
          left--;
          assert indexIsCurrent;
        }
      }
예제 #2
0
 @Override
 public void seekExact(BytesRef target, TermState otherState) {
   // System.out.println("BTR.seekExact termState target=" + target.utf8ToString() + " " +
   // target + " this=" + this);
   assert otherState != null && otherState instanceof BlockTermState;
   assert !doOrd || ((BlockTermState) otherState).ord < numTerms;
   state.copyFrom(otherState);
   seekPending = true;
   indexIsCurrent = false;
   term.copyBytes(target);
 }
 @Override
 public void seekExact(BytesRef target, TermState otherState) {
   // if (DEBUG) {
   //   System.out.println("BTTR.seekExact termState seg=" + segment + " target=" +
   // target.utf8ToString() + " " + target + " state=" + otherState);
   // }
   assert clearEOF();
   if (target.compareTo(term.get()) != 0 || !termExists) {
     assert otherState != null && otherState instanceof BlockTermState;
     currentFrame = staticFrame;
     currentFrame.state.copyFrom(otherState);
     term.copyBytes(target);
     currentFrame.metaDataUpto = currentFrame.getTermBlockOrd();
     assert currentFrame.metaDataUpto > 0;
     validIndexPrefix = 0;
   } else {
     // if (DEBUG) {
     //   System.out.println("  skip seek: already on target state=" + currentFrame.state);
     // }
   }
 }
  @Override
  public SeekStatus seekCeil(final BytesRef target) throws IOException {
    if (fr.index == null) {
      throw new IllegalStateException("terms index was not loaded");
    }

    term.grow(1 + target.length);

    assert clearEOF();

    // if (DEBUG) {
    // System.out.println("\nBTTR.seekCeil seg=" + segment + " target=" + fieldInfo.name + ":" +
    // target.utf8ToString() + " " + target + " current=" + brToString(term) + " (exists?=" +
    // termExists + ") validIndexPrefix=  " + validIndexPrefix);
    // printSeekState();
    // }

    FST.Arc<Pair<BytesRef, Long>> arc;
    int targetUpto;
    Pair<BytesRef, Long> output;

    targetBeforeCurrentLength = currentFrame.ord;

    if (currentFrame != staticFrame) {

      // We are already seek'd; find the common
      // prefix of new seek term vs current term and
      // re-use the corresponding seek state.  For
      // example, if app first seeks to foobar, then
      // seeks to foobaz, we can re-use the seek state
      // for the first 5 bytes.

      // if (DEBUG) {
      // System.out.println("  re-use current seek state validIndexPrefix=" + validIndexPrefix);
      // }

      arc = arcs[0];
      assert arc.isFinal();
      output = arc.output;
      targetUpto = 0;

      IDVersionSegmentTermsEnumFrame lastFrame = stack[0];
      assert validIndexPrefix <= term.length();

      final int targetLimit = Math.min(target.length, validIndexPrefix);

      int cmp = 0;

      // TODO: we should write our vLong backwards (MSB
      // first) to get better sharing from the FST

      // First compare up to valid seek frames:
      while (targetUpto < targetLimit) {
        cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF);
        // if (DEBUG) {
        // System.out.println("    cycle targetUpto=" + targetUpto + " (vs limit=" + targetLimit +
        // ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + "
        // vs termLabel=" + (char) (term.bytes[targetUpto]) + ")"   + " arc.output=" + arc.output +
        // " output=" + output);
        // }
        if (cmp != 0) {
          break;
        }
        arc = arcs[1 + targetUpto];
        assert arc.label == (target.bytes[target.offset + targetUpto] & 0xFF)
            : "arc.label="
                + (char) arc.label
                + " targetLabel="
                + (char) (target.bytes[target.offset + targetUpto] & 0xFF);
        // TODO: we could save the outputs in local
        // byte[][] instead of making new objs ever
        // seek; but, often the FST doesn't have any
        // shared bytes (but this could change if we
        // reverse vLong byte order)
        if (arc.output != VersionBlockTreeTermsWriter.NO_OUTPUT) {
          output = VersionBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
        }
        if (arc.isFinal()) {
          lastFrame = stack[1 + lastFrame.ord];
        }
        targetUpto++;
      }

      if (cmp == 0) {
        final int targetUptoMid = targetUpto;
        // Second compare the rest of the term, but
        // don't save arc/output/frame:
        final int targetLimit2 = Math.min(target.length, term.length());
        while (targetUpto < targetLimit2) {
          cmp =
              (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF);
          // if (DEBUG) {
          // System.out.println("    cycle2 targetUpto=" + targetUpto + " (vs limit=" + targetLimit
          // + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto])
          // + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")");
          // }
          if (cmp != 0) {
            break;
          }
          targetUpto++;
        }

        if (cmp == 0) {
          cmp = term.length() - target.length;
        }
        targetUpto = targetUptoMid;
      }

      if (cmp < 0) {
        // Common case: target term is after current
        // term, ie, app is seeking multiple terms
        // in sorted order
        // if (DEBUG) {
        // System.out.println("  target is after current (shares prefixLen=" + targetUpto + ");
        // clear frame.scanned ord=" + lastFrame.ord);
        // }
        currentFrame = lastFrame;

      } else if (cmp > 0) {
        // Uncommon case: target term
        // is before current term; this means we can
        // keep the currentFrame but we must rewind it
        // (so we scan from the start)
        targetBeforeCurrentLength = 0;
        // if (DEBUG) {
        // System.out.println("  target is before current (shares prefixLen=" + targetUpto + ");
        // rewind frame ord=" + lastFrame.ord);
        // }
        currentFrame = lastFrame;
        currentFrame.rewind();
      } else {
        // Target is exactly the same as current term
        assert term.length() == target.length;
        if (termExists) {
          // if (DEBUG) {
          // System.out.println("  target is same as current; return FOUND");
          // }
          return SeekStatus.FOUND;
        } else {
          // if (DEBUG) {
          // System.out.println("  target is same as current but term doesn't exist");
          // }
        }
      }

    } else {

      targetBeforeCurrentLength = -1;
      arc = fr.index.getFirstArc(arcs[0]);

      // Empty string prefix must have an output (block) in the index!
      assert arc.isFinal();
      assert arc.output != null;

      // if (DEBUG) {
      // System.out.println("    no seek state; push root frame");
      // }

      output = arc.output;

      currentFrame = staticFrame;

      // term.length = 0;
      targetUpto = 0;
      currentFrame =
          pushFrame(
              arc, VersionBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput), 0);
    }

    // if (DEBUG) {
    // System.out.println("  start index loop targetUpto=" + targetUpto + " output=" + output + "
    // currentFrame.ord+1=" + currentFrame.ord + " targetBeforeCurrentLength=" +
    // targetBeforeCurrentLength);
    // }

    // We are done sharing the common prefix with the incoming target and where we are currently
    // seek'd; now continue walking the index:
    while (targetUpto < target.length) {

      final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF;

      final FST.Arc<Pair<BytesRef, Long>> nextArc =
          fr.index.findTargetArc(targetLabel, arc, getArc(1 + targetUpto), fstReader);

      if (nextArc == null) {

        // Index is exhausted
        // if (DEBUG) {
        //   System.out.println("    index: index exhausted label=" + ((char) targetLabel) + " " +
        // toHex(targetLabel));
        // }

        validIndexPrefix = currentFrame.prefix;
        // validIndexPrefix = targetUpto;

        currentFrame.scanToFloorFrame(target);

        currentFrame.loadBlock();

        final SeekStatus result = currentFrame.scanToTerm(target, false);
        if (result == SeekStatus.END) {
          term.copyBytes(target);
          termExists = false;

          if (next() != null) {
            // if (DEBUG) {
            // System.out.println("  return NOT_FOUND term=" + brToString(term) + " " + term);
            // }
            return SeekStatus.NOT_FOUND;
          } else {
            // if (DEBUG) {
            // System.out.println("  return END");
            // }
            return SeekStatus.END;
          }
        } else {
          // if (DEBUG) {
          // System.out.println("  return " + result + " term=" + brToString(term) + " " + term);
          // }
          return result;
        }
      } else {
        // Follow this arc
        term.setByteAt(targetUpto, (byte) targetLabel);
        arc = nextArc;
        // Aggregate output as we go:
        assert arc.output != null;
        if (arc.output != VersionBlockTreeTermsWriter.NO_OUTPUT) {
          output = VersionBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
        }

        // if (DEBUG) {
        // System.out.println("    index: follow label=" + toHex(target.bytes[target.offset +
        // targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput);
        // }
        targetUpto++;

        if (arc.isFinal()) {
          // if (DEBUG) System.out.println("    arc is final!");
          currentFrame =
              pushFrame(
                  arc,
                  VersionBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput),
                  targetUpto);
          // if (DEBUG) System.out.println("    curFrame.ord=" + currentFrame.ord + " hasTerms=" +
          // currentFrame.hasTerms);
        }
      }
    }

    // validIndexPrefix = targetUpto;
    validIndexPrefix = currentFrame.prefix;

    currentFrame.scanToFloorFrame(target);

    currentFrame.loadBlock();

    final SeekStatus result = currentFrame.scanToTerm(target, false);

    if (result == SeekStatus.END) {
      term.copyBytes(target);
      termExists = false;
      if (next() != null) {
        // if (DEBUG) {
        // System.out.println("  return NOT_FOUND term=" + term.utf8ToString() + " " + term);
        // }
        return SeekStatus.NOT_FOUND;
      } else {
        // if (DEBUG) {
        // System.out.println("  return END");
        // }
        return SeekStatus.END;
      }
    } else {
      return result;
    }
  }
예제 #5
0
      // TODO: we may want an alternate mode here which is
      // "if you are about to return NOT_FOUND I won't use
      // the terms data from that"; eg FuzzyTermsEnum will
      // (usually) just immediately call seek again if we
      // return NOT_FOUND so it's a waste for us to fill in
      // the term that was actually NOT_FOUND
      @Override
      public SeekStatus seekCeil(final BytesRef target) throws IOException {

        if (indexEnum == null) {
          throw new IllegalStateException("terms index was not loaded");
        }

        // System.out.println("BTR.seek seg=" + segment + " target=" + fieldInfo.name + ":" +
        // target.utf8ToString() + " " + target + " current=" + term().utf8ToString() + " " + term()
        // + " indexIsCurrent=" + indexIsCurrent + " didIndexNext=" + didIndexNext + " seekPending="
        // + seekPending + " divisor=" + indexReader.getDivisor() + " this="  + this);
        if (didIndexNext) {
          if (nextIndexTerm == null) {
            // System.out.println("  nextIndexTerm=null");
          } else {
            // System.out.println("  nextIndexTerm=" + nextIndexTerm.utf8ToString());
          }
        }

        boolean doSeek = true;

        // See if we can avoid seeking, because target term
        // is after current term but before next index term:
        if (indexIsCurrent) {

          final int cmp = BytesRef.getUTF8SortedAsUnicodeComparator().compare(term.get(), target);

          if (cmp == 0) {
            // Already at the requested term
            return SeekStatus.FOUND;
          } else if (cmp < 0) {

            // Target term is after current term
            if (!didIndexNext) {
              if (indexEnum.next() == -1) {
                nextIndexTerm = null;
              } else {
                nextIndexTerm = indexEnum.term();
              }
              // System.out.println("  now do index next() nextIndexTerm=" + (nextIndexTerm == null
              // ? "null" : nextIndexTerm.utf8ToString()));
              didIndexNext = true;
            }

            if (nextIndexTerm == null
                || BytesRef.getUTF8SortedAsUnicodeComparator().compare(target, nextIndexTerm) < 0) {
              // Optimization: requested term is within the
              // same term block we are now in; skip seeking
              // (but do scanning):
              doSeek = false;
              // System.out.println("  skip seek: nextIndexTerm=" + (nextIndexTerm == null ? "null"
              // : nextIndexTerm.utf8ToString()));
            }
          }
        }

        if (doSeek) {
          // System.out.println("  seek");

          // Ask terms index to find biggest indexed term (=
          // first term in a block) that's <= our text:
          in.seek(indexEnum.seek(target));
          boolean result = nextBlock();

          // Block must exist since, at least, the indexed term
          // is in the block:
          assert result;

          indexIsCurrent = true;
          didIndexNext = false;

          if (doOrd) {
            state.ord = indexEnum.ord() - 1;
          }

          term.copyBytes(indexEnum.term());
          // System.out.println("  seek: term=" + term.utf8ToString());
        } else {
          // System.out.println("  skip seek");
          if (state.termBlockOrd == blockTermCount && !nextBlock()) {
            indexIsCurrent = false;
            return SeekStatus.END;
          }
        }

        seekPending = false;

        int common = 0;

        // Scan within block.  We could do this by calling
        // _next() and testing the resulting term, but this
        // is wasteful.  Instead, we first confirm the
        // target matches the common prefix of this block,
        // and then we scan the term bytes directly from the
        // termSuffixesreader's byte[], saving a copy into
        // the BytesRef term per term.  Only when we return
        // do we then copy the bytes into the term.

        while (true) {

          // First, see if target term matches common prefix
          // in this block:
          if (common < termBlockPrefix) {
            final int cmp =
                (term.byteAt(common) & 0xFF) - (target.bytes[target.offset + common] & 0xFF);
            if (cmp < 0) {

              // TODO: maybe we should store common prefix
              // in block header?  (instead of relying on
              // last term of previous block)

              // Target's prefix is after the common block
              // prefix, so term cannot be in this block
              // but it could be in next block.  We
              // must scan to end-of-block to set common
              // prefix for next block:
              if (state.termBlockOrd < blockTermCount) {
                while (state.termBlockOrd < blockTermCount - 1) {
                  state.termBlockOrd++;
                  state.ord++;
                  termSuffixesReader.skipBytes(termSuffixesReader.readVInt());
                }
                final int suffix = termSuffixesReader.readVInt();
                term.setLength(termBlockPrefix + suffix);
                term.grow(term.length());
                termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
              }
              state.ord++;

              if (!nextBlock()) {
                indexIsCurrent = false;
                return SeekStatus.END;
              }
              common = 0;

            } else if (cmp > 0) {
              // Target's prefix is before the common prefix
              // of this block, so we position to start of
              // block and return NOT_FOUND:
              assert state.termBlockOrd == 0;

              final int suffix = termSuffixesReader.readVInt();
              term.setLength(termBlockPrefix + suffix);
              term.grow(term.length());
              termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
              return SeekStatus.NOT_FOUND;
            } else {
              common++;
            }

            continue;
          }

          // Test every term in this block
          while (true) {
            state.termBlockOrd++;
            state.ord++;

            final int suffix = termSuffixesReader.readVInt();

            // We know the prefix matches, so just compare the new suffix:
            final int termLen = termBlockPrefix + suffix;
            int bytePos = termSuffixesReader.getPosition();

            boolean next = false;
            final int limit = target.offset + (termLen < target.length ? termLen : target.length);
            int targetPos = target.offset + termBlockPrefix;
            while (targetPos < limit) {
              final int cmp = (termSuffixes[bytePos++] & 0xFF) - (target.bytes[targetPos++] & 0xFF);
              if (cmp < 0) {
                // Current term is still before the target;
                // keep scanning
                next = true;
                break;
              } else if (cmp > 0) {
                // Done!  Current term is after target. Stop
                // here, fill in real term, return NOT_FOUND.
                term.setLength(termBlockPrefix + suffix);
                term.grow(term.length());
                termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
                // System.out.println("  NOT_FOUND");
                return SeekStatus.NOT_FOUND;
              }
            }

            if (!next && target.length <= termLen) {
              term.setLength(termBlockPrefix + suffix);
              term.grow(term.length());
              termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);

              if (target.length == termLen) {
                // Done!  Exact match.  Stop here, fill in
                // real term, return FOUND.
                // System.out.println("  FOUND");
                return SeekStatus.FOUND;
              } else {
                // System.out.println("  NOT_FOUND");
                return SeekStatus.NOT_FOUND;
              }
            }

            if (state.termBlockOrd == blockTermCount) {
              // Must pre-fill term for next block's common prefix
              term.setLength(termBlockPrefix + suffix);
              term.grow(term.length());
              termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
              break;
            } else {
              termSuffixesReader.skipBytes(suffix);
            }
          }

          // The purpose of the terms dict index is to seek
          // the enum to the closest index term before the
          // term we are looking for.  So, we should never
          // cross another index term (besides the first
          // one) while we are scanning:

          assert indexIsCurrent;

          if (!nextBlock()) {
            // System.out.println("  END");
            indexIsCurrent = false;
            return SeekStatus.END;
          }
          common = 0;
        }
      }