/* Does initial decode of next block of terms; this
      doesn't actually decode the docFreq, totalTermFreq,
      postings details (frq/prx offset, etc.) metadata;
      it just loads them as byte[] blobs which are then
      decoded on-demand if the metadata is ever requested
      for any term in this block.  This enables terms-only
      intensive consumes (eg certain MTQs, respelling) to
      not pay the price of decoding metadata they won't
      use. */
      private boolean nextBlock() throws IOException {

        // TODO: we still lazy-decode the byte[] for each
        // term (the suffix), but, if we decoded
        // all N terms up front then seeking could do a fast
        // bsearch w/in the block...

        // System.out.println("BTR.nextBlock() fp=" + in.getFilePointer() + " this=" + this);
        state.blockFilePointer = in.getFilePointer();
        blockTermCount = in.readVInt();
        // System.out.println("  blockTermCount=" + blockTermCount);
        if (blockTermCount == 0) {
          return false;
        }
        termBlockPrefix = in.readVInt();

        // term suffixes:
        int len = in.readVInt();
        if (termSuffixes.length < len) {
          termSuffixes = new byte[ArrayUtil.oversize(len, 1)];
        }
        // System.out.println("  termSuffixes len=" + len);
        in.readBytes(termSuffixes, 0, len);
        termSuffixesReader.reset(termSuffixes, 0, len);

        // docFreq, totalTermFreq
        len = in.readVInt();
        if (docFreqBytes.length < len) {
          docFreqBytes = new byte[ArrayUtil.oversize(len, 1)];
        }
        // System.out.println("  freq bytes len=" + len);
        in.readBytes(docFreqBytes, 0, len);
        freqReader.reset(docFreqBytes, 0, len);

        // metadata
        len = in.readVInt();
        if (bytes == null) {
          bytes = new byte[ArrayUtil.oversize(len, 1)];
          bytesReader = new ByteArrayDataInput();
        } else if (bytes.length < len) {
          bytes = new byte[ArrayUtil.oversize(len, 1)];
        }
        in.readBytes(bytes, 0, len);
        bytesReader.reset(bytes, 0, len);

        metaDataUpto = 0;
        state.termBlockOrd = 0;

        indexIsCurrent = false;
        // System.out.println("  indexIsCurrent=" + indexIsCurrent);

        return true;
      }
      public SegmentTermsEnum() throws IOException {
        in = BlockTermsReader.this.in.clone();
        in.seek(termsStartPointer);
        indexEnum = indexReader.getFieldEnum(fieldInfo);
        doOrd = indexReader.supportsOrd();
        fieldTerm.field = fieldInfo.name;
        state = postingsReader.newTermState();
        state.totalTermFreq = -1;
        state.ord = -1;

        termSuffixes = new byte[128];
        docFreqBytes = new byte[64];
        // System.out.println("BTR.enum init this=" + this + " postingsReader=" + postingsReader);
        longs = new long[longsSize];
      }
 @Override
 public void copyFrom(TermState _other) {
   super.copyFrom(_other);
   SepTermState other = (SepTermState) _other;
   if (docIndex == null) {
     docIndex = other.docIndex.clone();
   } else {
     docIndex.copyFrom(other.docIndex);
   }
   if (other.freqIndex != null) {
     if (freqIndex == null) {
       freqIndex = other.freqIndex.clone();
     } else {
       freqIndex.copyFrom(other.freqIndex);
     }
   } else {
     freqIndex = null;
   }
   if (other.posIndex != null) {
     if (posIndex == null) {
       posIndex = other.posIndex.clone();
     } else {
       posIndex.copyFrom(other.posIndex);
     }
   } else {
     posIndex = null;
   }
   payloadFP = other.payloadFP;
   skipFP = other.skipFP;
 }
      @Override
      public void seekExact(long ord) throws IOException {
        // System.out.println("BTR.seek by ord ord=" + ord);
        if (indexEnum == null) {
          throw new IllegalStateException("terms index was not loaded");
        }

        assert ord < numTerms;

        // TODO: if ord is in same terms block and
        // after current ord, we should avoid this seek just
        // like we do in the seek(BytesRef) case
        in.seek(indexEnum.seek(ord));
        boolean result = nextBlock();

        // Block must exist since ord < numTerms:
        assert result;

        indexIsCurrent = true;
        didIndexNext = false;
        seekPending = false;

        state.ord = indexEnum.ord() - 1;
        assert state.ord >= -1 : "ord=" + state.ord;
        term.copyBytes(indexEnum.term());

        // Now, scan:
        int left = (int) (ord - state.ord);
        while (left > 0) {
          final BytesRef term = _next();
          assert term != null;
          left--;
          assert indexIsCurrent;
        }
      }
      @Override
      public BytesRef next() throws IOException {
        // System.out.println("BTR.next() seekPending=" + seekPending + " pendingSeekCount=" +
        // state.termBlockOrd);

        // If seek was previously called and the term was cached,
        // usually caller is just going to pull a D/&PEnum or get
        // docFreq, etc.  But, if they then call next(),
        // this method catches up all internal state so next()
        // works properly:
        if (seekPending) {
          assert !indexIsCurrent;
          in.seek(state.blockFilePointer);
          final int pendingSeekCount = state.termBlockOrd;
          boolean result = nextBlock();

          final long savOrd = state.ord;

          // Block must exist since seek(TermState) was called w/ a
          // TermState previously returned by this enum when positioned
          // on a real term:
          assert result;

          while (state.termBlockOrd < pendingSeekCount) {
            BytesRef nextResult = _next();
            assert nextResult != null;
          }
          seekPending = false;
          state.ord = savOrd;
        }
        return _next();
      }
 @Override
 public TermState termState() throws IOException {
   // System.out.println("BTR.termState this=" + this);
   decodeMetaData();
   TermState ts = state.clone();
   // System.out.println("  return ts=" + ts);
   return ts;
 }
      private void decodeMetaData() throws IOException {
        // System.out.println("BTR.decodeMetadata mdUpto=" + metaDataUpto + " vs termCount=" +
        // state.termBlockOrd + " state=" + state);
        if (!seekPending) {
          // TODO: cutover to random-access API
          // here.... really stupid that we have to decode N
          // wasted term metadata just to get to the N+1th
          // that we really need...

          // lazily catch up on metadata decode:
          final int limit = state.termBlockOrd;
          boolean absolute = metaDataUpto == 0;
          // TODO: better API would be "jump straight to term=N"???
          while (metaDataUpto < limit) {
            // System.out.println("  decode mdUpto=" + metaDataUpto);
            // TODO: we could make "tiers" of metadata, ie,
            // decode docFreq/totalTF but don't decode postings
            // metadata; this way caller could get
            // docFreq/totalTF w/o paying decode cost for
            // postings

            // TODO: if docFreq were bulk decoded we could
            // just skipN here:

            // docFreq, totalTermFreq
            state.docFreq = freqReader.readVInt();
            // System.out.println("    dF=" + state.docFreq);
            if (fieldInfo.getIndexOptions() != IndexOptions.DOCS) {
              state.totalTermFreq = state.docFreq + freqReader.readVLong();
              // System.out.println("    totTF=" + state.totalTermFreq);
            }
            // metadata
            for (int i = 0; i < longs.length; i++) {
              longs[i] = bytesReader.readVLong();
            }
            postingsReader.decodeTerm(longs, bytesReader, fieldInfo, state, absolute);
            metaDataUpto++;
            absolute = false;
          }
        } else {
          // System.out.println("  skip! seekPending");
        }
      }
 @Override
 public void seekExact(BytesRef target, TermState otherState) {
   // System.out.println("BTR.seekExact termState target=" + target.utf8ToString() + " " +
   // target + " this=" + this);
   assert otherState != null && otherState instanceof BlockTermState;
   assert !doOrd || ((BlockTermState) otherState).ord < numTerms;
   state.copyFrom(otherState);
   seekPending = true;
   indexIsCurrent = false;
   term.copyBytes(target);
 }
 @Override
 public TermState termState() throws IOException {
   decodeMetaData();
   return state.clone();
 }
Exemple #10
0
      // TODO: we may want an alternate mode here which is
      // "if you are about to return NOT_FOUND I won't use
      // the terms data from that"; eg FuzzyTermsEnum will
      // (usually) just immediately call seek again if we
      // return NOT_FOUND so it's a waste for us to fill in
      // the term that was actually NOT_FOUND
      @Override
      public SeekStatus seekCeil(final BytesRef target) throws IOException {

        if (indexEnum == null) {
          throw new IllegalStateException("terms index was not loaded");
        }

        // System.out.println("BTR.seek seg=" + segment + " target=" + fieldInfo.name + ":" +
        // target.utf8ToString() + " " + target + " current=" + term().utf8ToString() + " " + term()
        // + " indexIsCurrent=" + indexIsCurrent + " didIndexNext=" + didIndexNext + " seekPending="
        // + seekPending + " divisor=" + indexReader.getDivisor() + " this="  + this);
        if (didIndexNext) {
          if (nextIndexTerm == null) {
            // System.out.println("  nextIndexTerm=null");
          } else {
            // System.out.println("  nextIndexTerm=" + nextIndexTerm.utf8ToString());
          }
        }

        boolean doSeek = true;

        // See if we can avoid seeking, because target term
        // is after current term but before next index term:
        if (indexIsCurrent) {

          final int cmp = BytesRef.getUTF8SortedAsUnicodeComparator().compare(term.get(), target);

          if (cmp == 0) {
            // Already at the requested term
            return SeekStatus.FOUND;
          } else if (cmp < 0) {

            // Target term is after current term
            if (!didIndexNext) {
              if (indexEnum.next() == -1) {
                nextIndexTerm = null;
              } else {
                nextIndexTerm = indexEnum.term();
              }
              // System.out.println("  now do index next() nextIndexTerm=" + (nextIndexTerm == null
              // ? "null" : nextIndexTerm.utf8ToString()));
              didIndexNext = true;
            }

            if (nextIndexTerm == null
                || BytesRef.getUTF8SortedAsUnicodeComparator().compare(target, nextIndexTerm) < 0) {
              // Optimization: requested term is within the
              // same term block we are now in; skip seeking
              // (but do scanning):
              doSeek = false;
              // System.out.println("  skip seek: nextIndexTerm=" + (nextIndexTerm == null ? "null"
              // : nextIndexTerm.utf8ToString()));
            }
          }
        }

        if (doSeek) {
          // System.out.println("  seek");

          // Ask terms index to find biggest indexed term (=
          // first term in a block) that's <= our text:
          in.seek(indexEnum.seek(target));
          boolean result = nextBlock();

          // Block must exist since, at least, the indexed term
          // is in the block:
          assert result;

          indexIsCurrent = true;
          didIndexNext = false;

          if (doOrd) {
            state.ord = indexEnum.ord() - 1;
          }

          term.copyBytes(indexEnum.term());
          // System.out.println("  seek: term=" + term.utf8ToString());
        } else {
          // System.out.println("  skip seek");
          if (state.termBlockOrd == blockTermCount && !nextBlock()) {
            indexIsCurrent = false;
            return SeekStatus.END;
          }
        }

        seekPending = false;

        int common = 0;

        // Scan within block.  We could do this by calling
        // _next() and testing the resulting term, but this
        // is wasteful.  Instead, we first confirm the
        // target matches the common prefix of this block,
        // and then we scan the term bytes directly from the
        // termSuffixesreader's byte[], saving a copy into
        // the BytesRef term per term.  Only when we return
        // do we then copy the bytes into the term.

        while (true) {

          // First, see if target term matches common prefix
          // in this block:
          if (common < termBlockPrefix) {
            final int cmp =
                (term.byteAt(common) & 0xFF) - (target.bytes[target.offset + common] & 0xFF);
            if (cmp < 0) {

              // TODO: maybe we should store common prefix
              // in block header?  (instead of relying on
              // last term of previous block)

              // Target's prefix is after the common block
              // prefix, so term cannot be in this block
              // but it could be in next block.  We
              // must scan to end-of-block to set common
              // prefix for next block:
              if (state.termBlockOrd < blockTermCount) {
                while (state.termBlockOrd < blockTermCount - 1) {
                  state.termBlockOrd++;
                  state.ord++;
                  termSuffixesReader.skipBytes(termSuffixesReader.readVInt());
                }
                final int suffix = termSuffixesReader.readVInt();
                term.setLength(termBlockPrefix + suffix);
                term.grow(term.length());
                termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
              }
              state.ord++;

              if (!nextBlock()) {
                indexIsCurrent = false;
                return SeekStatus.END;
              }
              common = 0;

            } else if (cmp > 0) {
              // Target's prefix is before the common prefix
              // of this block, so we position to start of
              // block and return NOT_FOUND:
              assert state.termBlockOrd == 0;

              final int suffix = termSuffixesReader.readVInt();
              term.setLength(termBlockPrefix + suffix);
              term.grow(term.length());
              termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
              return SeekStatus.NOT_FOUND;
            } else {
              common++;
            }

            continue;
          }

          // Test every term in this block
          while (true) {
            state.termBlockOrd++;
            state.ord++;

            final int suffix = termSuffixesReader.readVInt();

            // We know the prefix matches, so just compare the new suffix:
            final int termLen = termBlockPrefix + suffix;
            int bytePos = termSuffixesReader.getPosition();

            boolean next = false;
            final int limit = target.offset + (termLen < target.length ? termLen : target.length);
            int targetPos = target.offset + termBlockPrefix;
            while (targetPos < limit) {
              final int cmp = (termSuffixes[bytePos++] & 0xFF) - (target.bytes[targetPos++] & 0xFF);
              if (cmp < 0) {
                // Current term is still before the target;
                // keep scanning
                next = true;
                break;
              } else if (cmp > 0) {
                // Done!  Current term is after target. Stop
                // here, fill in real term, return NOT_FOUND.
                term.setLength(termBlockPrefix + suffix);
                term.grow(term.length());
                termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
                // System.out.println("  NOT_FOUND");
                return SeekStatus.NOT_FOUND;
              }
            }

            if (!next && target.length <= termLen) {
              term.setLength(termBlockPrefix + suffix);
              term.grow(term.length());
              termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);

              if (target.length == termLen) {
                // Done!  Exact match.  Stop here, fill in
                // real term, return FOUND.
                // System.out.println("  FOUND");
                return SeekStatus.FOUND;
              } else {
                // System.out.println("  NOT_FOUND");
                return SeekStatus.NOT_FOUND;
              }
            }

            if (state.termBlockOrd == blockTermCount) {
              // Must pre-fill term for next block's common prefix
              term.setLength(termBlockPrefix + suffix);
              term.grow(term.length());
              termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
              break;
            } else {
              termSuffixesReader.skipBytes(suffix);
            }
          }

          // The purpose of the terms dict index is to seek
          // the enum to the closest index term before the
          // term we are looking for.  So, we should never
          // cross another index term (besides the first
          // one) while we are scanning:

          assert indexIsCurrent;

          if (!nextBlock()) {
            // System.out.println("  END");
            indexIsCurrent = false;
            return SeekStatus.END;
          }
          common = 0;
        }
      }