@Override
 public long ramBytesUsed() {
   return BASE_RAM_BYTES_USED
       + RamUsageEstimator.sizeOf(offsets)
       + RamUsageEstimator.sizeOf(scratch.bytes())
       + RamUsageEstimator.sizeOf(scratchUTF16.chars());
 }
Пример #2
0
      /* Decodes only the term bytes of the next term.  If caller then asks for
      metadata, ie docFreq, totalTermFreq or pulls a D/&PEnum, we then (lazily)
      decode all metadata up to the current term. */
      private BytesRef _next() throws IOException {
        // System.out.println("BTR._next seg=" + segment + " this=" + this + " termCount=" +
        // state.termBlockOrd + " (vs " + blockTermCount + ")");
        if (state.termBlockOrd == blockTermCount && !nextBlock()) {
          // System.out.println("  eof");
          indexIsCurrent = false;
          return null;
        }

        // TODO: cutover to something better for these ints!  simple64?
        final int suffix = termSuffixesReader.readVInt();
        // System.out.println("  suffix=" + suffix);

        term.setLength(termBlockPrefix + suffix);
        term.grow(term.length());
        termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
        state.termBlockOrd++;

        // NOTE: meaningless in the non-ord case
        state.ord++;

        // System.out.println("  return term=" + fieldInfo.name + ":" + term.utf8ToString() + " " +
        // term + " tbOrd=" + state.termBlockOrd);
        return term.get();
      }
 private void readField(BytesRef type, FieldInfo fieldInfo, StoredFieldVisitor visitor)
     throws IOException {
   readLine();
   assert StringHelper.startsWith(scratch.get(), VALUE);
   if (type == TYPE_STRING) {
     byte[] bytes = new byte[scratch.length() - VALUE.length];
     System.arraycopy(scratch.bytes(), VALUE.length, bytes, 0, bytes.length);
     visitor.stringField(fieldInfo, bytes);
   } else if (type == TYPE_BINARY) {
     byte[] copy = new byte[scratch.length() - VALUE.length];
     System.arraycopy(scratch.bytes(), VALUE.length, copy, 0, copy.length);
     visitor.binaryField(fieldInfo, copy);
   } else if (type == TYPE_INT) {
     scratchUTF16.copyUTF8Bytes(scratch.bytes(), VALUE.length, scratch.length() - VALUE.length);
     visitor.intField(fieldInfo, Integer.parseInt(scratchUTF16.toString()));
   } else if (type == TYPE_LONG) {
     scratchUTF16.copyUTF8Bytes(scratch.bytes(), VALUE.length, scratch.length() - VALUE.length);
     visitor.longField(fieldInfo, Long.parseLong(scratchUTF16.toString()));
   } else if (type == TYPE_FLOAT) {
     scratchUTF16.copyUTF8Bytes(scratch.bytes(), VALUE.length, scratch.length() - VALUE.length);
     visitor.floatField(fieldInfo, Float.parseFloat(scratchUTF16.toString()));
   } else if (type == TYPE_DOUBLE) {
     scratchUTF16.copyUTF8Bytes(scratch.bytes(), VALUE.length, scratch.length() - VALUE.length);
     visitor.doubleField(fieldInfo, Double.parseDouble(scratchUTF16.toString()));
   }
 }
 @SuppressWarnings("unused")
 private void printSeekState(PrintStream out) throws IOException {
   if (currentFrame == staticFrame) {
     out.println("  no prior seek");
   } else {
     out.println("  prior seek state:");
     int ord = 0;
     boolean isSeekFrame = true;
     while (true) {
       IDVersionSegmentTermsEnumFrame f = getFrame(ord);
       assert f != null;
       final BytesRef prefix = new BytesRef(term.bytes(), 0, f.prefix);
       if (f.nextEnt == -1) {
         out.println(
             "    frame "
                 + (isSeekFrame ? "(seek)" : "(next)")
                 + " ord="
                 + ord
                 + " fp="
                 + f.fp
                 + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "")
                 + " prefixLen="
                 + f.prefix
                 + " prefix="
                 + brToString(prefix)
                 + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")"))
                 + " hasTerms="
                 + f.hasTerms
                 + " isFloor="
                 + f.isFloor
                 + " code="
                 + ((f.fp << VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS)
                     + (f.hasTerms ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS : 0)
                     + (f.isFloor ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR : 0))
                 + " isLastInFloor="
                 + f.isLastInFloor
                 + " mdUpto="
                 + f.metaDataUpto
                 + " tbOrd="
                 + f.getTermBlockOrd());
       } else {
         out.println(
             "    frame "
                 + (isSeekFrame ? "(seek, loaded)" : "(next, loaded)")
                 + " ord="
                 + ord
                 + " fp="
                 + f.fp
                 + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "")
                 + " prefixLen="
                 + f.prefix
                 + " prefix="
                 + brToString(prefix)
                 + " nextEnt="
                 + f.nextEnt
                 + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")"))
                 + " hasTerms="
                 + f.hasTerms
                 + " isFloor="
                 + f.isFloor
                 + " code="
                 + ((f.fp << VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS)
                     + (f.hasTerms ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS : 0)
                     + (f.isFloor ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR : 0))
                 + " lastSubFP="
                 + f.lastSubFP
                 + " isLastInFloor="
                 + f.isLastInFloor
                 + " mdUpto="
                 + f.metaDataUpto
                 + " tbOrd="
                 + f.getTermBlockOrd());
       }
       if (fr.index != null) {
         assert !isSeekFrame || f.arc != null : "isSeekFrame=" + isSeekFrame + " f.arc=" + f.arc;
         if (f.prefix > 0 && isSeekFrame && f.arc.label != (term.byteAt(f.prefix - 1) & 0xFF)) {
           out.println(
               "      broken seek state: arc.label="
                   + (char) f.arc.label
                   + " vs term byte="
                   + (char) (term.byteAt(f.prefix - 1) & 0xFF));
           throw new RuntimeException("seek state is broken");
         }
         Pair<BytesRef, Long> output = Util.get(fr.index, prefix);
         if (output == null) {
           out.println("      broken seek state: prefix is not final in index");
           throw new RuntimeException("seek state is broken");
         } else if (isSeekFrame && !f.isFloor) {
           final ByteArrayDataInput reader =
               new ByteArrayDataInput(
                   output.output1.bytes, output.output1.offset, output.output1.length);
           final long codeOrig = reader.readVLong();
           final long code =
               (f.fp << VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS)
                   | (f.hasTerms ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS : 0)
                   | (f.isFloor ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR : 0);
           if (codeOrig != code) {
             out.println(
                 "      broken seek state: output code="
                     + codeOrig
                     + " doesn't match frame code="
                     + code);
             throw new RuntimeException("seek state is broken");
           }
         }
       }
       if (f == currentFrame) {
         break;
       }
       if (f.prefix == validIndexPrefix) {
         isSeekFrame = false;
       }
       ord++;
     }
   }
 }
    /** Builds an {@link SynonymMap} and returns it. */
    public SynonymMap build() throws IOException {
      ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
      // TODO: are we using the best sharing options?
      org.apache.lucene.util.fst.Builder<BytesRef> builder =
          new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE4, outputs);

      BytesRefBuilder scratch = new BytesRefBuilder();
      ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();

      final Set<Integer> dedupSet;

      if (dedup) {
        dedupSet = new HashSet<>();
      } else {
        dedupSet = null;
      }

      final byte[] spare = new byte[5];

      Set<CharsRef> keys = workingSet.keySet();
      CharsRef sortedKeys[] = keys.toArray(new CharsRef[keys.size()]);
      Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator());

      final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();

      // System.out.println("fmap.build");
      for (int keyIdx = 0; keyIdx < sortedKeys.length; keyIdx++) {
        CharsRef input = sortedKeys[keyIdx];
        MapEntry output = workingSet.get(input);

        int numEntries = output.ords.size();
        // output size, assume the worst case
        int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry

        scratch.grow(estimatedSize);
        scratchOutput.reset(scratch.bytes());

        // now write our output data:
        int count = 0;
        for (int i = 0; i < numEntries; i++) {
          if (dedupSet != null) {
            // box once
            final Integer ent = output.ords.get(i);
            if (dedupSet.contains(ent)) {
              continue;
            }
            dedupSet.add(ent);
          }
          scratchOutput.writeVInt(output.ords.get(i));
          count++;
        }

        final int pos = scratchOutput.getPosition();
        scratchOutput.writeVInt(count << 1 | (output.includeOrig ? 0 : 1));
        final int pos2 = scratchOutput.getPosition();
        final int vIntLen = pos2 - pos;

        // Move the count + includeOrig to the front of the byte[]:
        System.arraycopy(scratch.bytes(), pos, spare, 0, vIntLen);
        System.arraycopy(scratch.bytes(), 0, scratch.bytes(), vIntLen, pos);
        System.arraycopy(spare, 0, scratch.bytes(), 0, vIntLen);

        if (dedupSet != null) {
          dedupSet.clear();
        }

        scratch.setLength(scratchOutput.getPosition());
        // System.out.println("  add input=" + input + " output=" + scratch + " offset=" +
        // scratch.offset + " length=" + scratch.length + " count=" + count);
        builder.add(Util.toUTF32(input, scratchIntsRef), scratch.toBytesRef());
      }

      FST<BytesRef> fst = builder.finish();
      return new SynonymMap(fst, words, maxHorizontalContext);
    }
 private int parseIntAt(int offset) {
   scratchUTF16.copyUTF8Bytes(scratch.bytes(), offset, scratch.length() - offset);
   return ArrayUtil.parseInt(scratchUTF16.chars(), 0, scratchUTF16.length());
 }
 // helper method to strip strip away 'prefix' from 'scratch' and return as String
 private String stripPrefix(BytesRefBuilder scratch, BytesRef prefix) throws IOException {
   return new String(
       scratch.bytes(), prefix.length, scratch.length() - prefix.length, StandardCharsets.UTF_8);
 }
Пример #8
0
      // TODO: we may want an alternate mode here which is
      // "if you are about to return NOT_FOUND I won't use
      // the terms data from that"; eg FuzzyTermsEnum will
      // (usually) just immediately call seek again if we
      // return NOT_FOUND so it's a waste for us to fill in
      // the term that was actually NOT_FOUND
      @Override
      public SeekStatus seekCeil(final BytesRef target) throws IOException {

        if (indexEnum == null) {
          throw new IllegalStateException("terms index was not loaded");
        }

        // System.out.println("BTR.seek seg=" + segment + " target=" + fieldInfo.name + ":" +
        // target.utf8ToString() + " " + target + " current=" + term().utf8ToString() + " " + term()
        // + " indexIsCurrent=" + indexIsCurrent + " didIndexNext=" + didIndexNext + " seekPending="
        // + seekPending + " divisor=" + indexReader.getDivisor() + " this="  + this);
        if (didIndexNext) {
          if (nextIndexTerm == null) {
            // System.out.println("  nextIndexTerm=null");
          } else {
            // System.out.println("  nextIndexTerm=" + nextIndexTerm.utf8ToString());
          }
        }

        boolean doSeek = true;

        // See if we can avoid seeking, because target term
        // is after current term but before next index term:
        if (indexIsCurrent) {

          final int cmp = BytesRef.getUTF8SortedAsUnicodeComparator().compare(term.get(), target);

          if (cmp == 0) {
            // Already at the requested term
            return SeekStatus.FOUND;
          } else if (cmp < 0) {

            // Target term is after current term
            if (!didIndexNext) {
              if (indexEnum.next() == -1) {
                nextIndexTerm = null;
              } else {
                nextIndexTerm = indexEnum.term();
              }
              // System.out.println("  now do index next() nextIndexTerm=" + (nextIndexTerm == null
              // ? "null" : nextIndexTerm.utf8ToString()));
              didIndexNext = true;
            }

            if (nextIndexTerm == null
                || BytesRef.getUTF8SortedAsUnicodeComparator().compare(target, nextIndexTerm) < 0) {
              // Optimization: requested term is within the
              // same term block we are now in; skip seeking
              // (but do scanning):
              doSeek = false;
              // System.out.println("  skip seek: nextIndexTerm=" + (nextIndexTerm == null ? "null"
              // : nextIndexTerm.utf8ToString()));
            }
          }
        }

        if (doSeek) {
          // System.out.println("  seek");

          // Ask terms index to find biggest indexed term (=
          // first term in a block) that's <= our text:
          in.seek(indexEnum.seek(target));
          boolean result = nextBlock();

          // Block must exist since, at least, the indexed term
          // is in the block:
          assert result;

          indexIsCurrent = true;
          didIndexNext = false;

          if (doOrd) {
            state.ord = indexEnum.ord() - 1;
          }

          term.copyBytes(indexEnum.term());
          // System.out.println("  seek: term=" + term.utf8ToString());
        } else {
          // System.out.println("  skip seek");
          if (state.termBlockOrd == blockTermCount && !nextBlock()) {
            indexIsCurrent = false;
            return SeekStatus.END;
          }
        }

        seekPending = false;

        int common = 0;

        // Scan within block.  We could do this by calling
        // _next() and testing the resulting term, but this
        // is wasteful.  Instead, we first confirm the
        // target matches the common prefix of this block,
        // and then we scan the term bytes directly from the
        // termSuffixesreader's byte[], saving a copy into
        // the BytesRef term per term.  Only when we return
        // do we then copy the bytes into the term.

        while (true) {

          // First, see if target term matches common prefix
          // in this block:
          if (common < termBlockPrefix) {
            final int cmp =
                (term.byteAt(common) & 0xFF) - (target.bytes[target.offset + common] & 0xFF);
            if (cmp < 0) {

              // TODO: maybe we should store common prefix
              // in block header?  (instead of relying on
              // last term of previous block)

              // Target's prefix is after the common block
              // prefix, so term cannot be in this block
              // but it could be in next block.  We
              // must scan to end-of-block to set common
              // prefix for next block:
              if (state.termBlockOrd < blockTermCount) {
                while (state.termBlockOrd < blockTermCount - 1) {
                  state.termBlockOrd++;
                  state.ord++;
                  termSuffixesReader.skipBytes(termSuffixesReader.readVInt());
                }
                final int suffix = termSuffixesReader.readVInt();
                term.setLength(termBlockPrefix + suffix);
                term.grow(term.length());
                termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
              }
              state.ord++;

              if (!nextBlock()) {
                indexIsCurrent = false;
                return SeekStatus.END;
              }
              common = 0;

            } else if (cmp > 0) {
              // Target's prefix is before the common prefix
              // of this block, so we position to start of
              // block and return NOT_FOUND:
              assert state.termBlockOrd == 0;

              final int suffix = termSuffixesReader.readVInt();
              term.setLength(termBlockPrefix + suffix);
              term.grow(term.length());
              termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
              return SeekStatus.NOT_FOUND;
            } else {
              common++;
            }

            continue;
          }

          // Test every term in this block
          while (true) {
            state.termBlockOrd++;
            state.ord++;

            final int suffix = termSuffixesReader.readVInt();

            // We know the prefix matches, so just compare the new suffix:
            final int termLen = termBlockPrefix + suffix;
            int bytePos = termSuffixesReader.getPosition();

            boolean next = false;
            final int limit = target.offset + (termLen < target.length ? termLen : target.length);
            int targetPos = target.offset + termBlockPrefix;
            while (targetPos < limit) {
              final int cmp = (termSuffixes[bytePos++] & 0xFF) - (target.bytes[targetPos++] & 0xFF);
              if (cmp < 0) {
                // Current term is still before the target;
                // keep scanning
                next = true;
                break;
              } else if (cmp > 0) {
                // Done!  Current term is after target. Stop
                // here, fill in real term, return NOT_FOUND.
                term.setLength(termBlockPrefix + suffix);
                term.grow(term.length());
                termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
                // System.out.println("  NOT_FOUND");
                return SeekStatus.NOT_FOUND;
              }
            }

            if (!next && target.length <= termLen) {
              term.setLength(termBlockPrefix + suffix);
              term.grow(term.length());
              termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);

              if (target.length == termLen) {
                // Done!  Exact match.  Stop here, fill in
                // real term, return FOUND.
                // System.out.println("  FOUND");
                return SeekStatus.FOUND;
              } else {
                // System.out.println("  NOT_FOUND");
                return SeekStatus.NOT_FOUND;
              }
            }

            if (state.termBlockOrd == blockTermCount) {
              // Must pre-fill term for next block's common prefix
              term.setLength(termBlockPrefix + suffix);
              term.grow(term.length());
              termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
              break;
            } else {
              termSuffixesReader.skipBytes(suffix);
            }
          }

          // The purpose of the terms dict index is to seek
          // the enum to the closest index term before the
          // term we are looking for.  So, we should never
          // cross another index term (besides the first
          // one) while we are scanning:

          assert indexIsCurrent;

          if (!nextBlock()) {
            // System.out.println("  END");
            indexIsCurrent = false;
            return SeekStatus.END;
          }
          common = 0;
        }
      }
 private String readString(int offset, BytesRefBuilder scratch) {
   scratchUTF16.copyUTF8Bytes(scratch.bytes(), offset, scratch.length() - offset);
   return scratchUTF16.toString();
 }
  @Override
  public Fields get(int doc) throws IOException {
    SortedMap<String, SimpleTVTerms> fields = new TreeMap<>();
    in.seek(offsets[doc]);
    readLine();
    assert StringHelper.startsWith(scratch.get(), NUMFIELDS);
    int numFields = parseIntAt(NUMFIELDS.length);
    if (numFields == 0) {
      return null; // no vectors for this doc
    }
    for (int i = 0; i < numFields; i++) {
      readLine();
      assert StringHelper.startsWith(scratch.get(), FIELD);
      // skip fieldNumber:
      parseIntAt(FIELD.length);

      readLine();
      assert StringHelper.startsWith(scratch.get(), FIELDNAME);
      String fieldName = readString(FIELDNAME.length, scratch);

      readLine();
      assert StringHelper.startsWith(scratch.get(), FIELDPOSITIONS);
      boolean positions = Boolean.parseBoolean(readString(FIELDPOSITIONS.length, scratch));

      readLine();
      assert StringHelper.startsWith(scratch.get(), FIELDOFFSETS);
      boolean offsets = Boolean.parseBoolean(readString(FIELDOFFSETS.length, scratch));

      readLine();
      assert StringHelper.startsWith(scratch.get(), FIELDPAYLOADS);
      boolean payloads = Boolean.parseBoolean(readString(FIELDPAYLOADS.length, scratch));

      readLine();
      assert StringHelper.startsWith(scratch.get(), FIELDTERMCOUNT);
      int termCount = parseIntAt(FIELDTERMCOUNT.length);

      SimpleTVTerms terms = new SimpleTVTerms(offsets, positions, payloads);
      fields.put(fieldName, terms);

      BytesRefBuilder term = new BytesRefBuilder();
      for (int j = 0; j < termCount; j++) {
        readLine();
        assert StringHelper.startsWith(scratch.get(), TERMTEXT);
        int termLength = scratch.length() - TERMTEXT.length;
        term.grow(termLength);
        term.setLength(termLength);
        System.arraycopy(scratch.bytes(), TERMTEXT.length, term.bytes(), 0, termLength);

        SimpleTVPostings postings = new SimpleTVPostings();
        terms.terms.put(term.toBytesRef(), postings);

        readLine();
        assert StringHelper.startsWith(scratch.get(), TERMFREQ);
        postings.freq = parseIntAt(TERMFREQ.length);

        if (positions || offsets) {
          if (positions) {
            postings.positions = new int[postings.freq];
            if (payloads) {
              postings.payloads = new BytesRef[postings.freq];
            }
          }

          if (offsets) {
            postings.startOffsets = new int[postings.freq];
            postings.endOffsets = new int[postings.freq];
          }

          for (int k = 0; k < postings.freq; k++) {
            if (positions) {
              readLine();
              assert StringHelper.startsWith(scratch.get(), POSITION);
              postings.positions[k] = parseIntAt(POSITION.length);
              if (payloads) {
                readLine();
                assert StringHelper.startsWith(scratch.get(), PAYLOAD);
                if (scratch.length() - PAYLOAD.length == 0) {
                  postings.payloads[k] = null;
                } else {
                  byte payloadBytes[] = new byte[scratch.length() - PAYLOAD.length];
                  System.arraycopy(
                      scratch.bytes(), PAYLOAD.length, payloadBytes, 0, payloadBytes.length);
                  postings.payloads[k] = new BytesRef(payloadBytes);
                }
              }
            }

            if (offsets) {
              readLine();
              assert StringHelper.startsWith(scratch.get(), STARTOFFSET);
              postings.startOffsets[k] = parseIntAt(STARTOFFSET.length);

              readLine();
              assert StringHelper.startsWith(scratch.get(), ENDOFFSET);
              postings.endOffsets[k] = parseIntAt(ENDOFFSET.length);
            }
          }
        }
      }
    }
    return new SimpleTVFields(fields);
  }