Пример #1
0
      /**
       * Performs the initial, shallow load of the next block of terms. The term suffixes, the
       * docFreq/totalTermFreq data and the postings metadata (frq/prx offset, etc.) are each read
       * as an opaque byte[] blob and handed to their reader; nothing is decoded here. Decoding
       * happens on demand if the data is ever requested for a term in this block, so terms-only
       * consumers (eg certain MTQs, respelling) do not pay for metadata they won't use.
       *
       * @return {@code false} if the block's term count is zero, {@code true} once a block is loaded
       * @throws IOException if reading from the underlying input fails
       */
      private boolean nextBlock() throws IOException {

        // TODO: each term's suffix is still decoded lazily; if all N terms were decoded up
        // front, seeking could do a fast bsearch w/in the block...

        // Record where this block starts before consuming its header.
        state.blockFilePointer = in.getFilePointer();
        blockTermCount = in.readVInt();
        if (blockTermCount == 0) {
          return false;
        }
        termBlockPrefix = in.readVInt();

        // Blob 1: term suffixes. Buffers are reused and only re-allocated when too small.
        final int suffixLen = in.readVInt();
        if (suffixLen > termSuffixes.length) {
          termSuffixes = new byte[ArrayUtil.oversize(suffixLen, 1)];
        }
        in.readBytes(termSuffixes, 0, suffixLen);
        termSuffixesReader.reset(termSuffixes, 0, suffixLen);

        // Blob 2: docFreq, totalTermFreq.
        final int freqLen = in.readVInt();
        if (freqLen > docFreqBytes.length) {
          docFreqBytes = new byte[ArrayUtil.oversize(freqLen, 1)];
        }
        in.readBytes(docFreqBytes, 0, freqLen);
        freqReader.reset(docFreqBytes, 0, freqLen);

        // Blob 3: metadata. Its buffer and reader are created lazily on the first block.
        final int metaLen = in.readVInt();
        final boolean firstMetadataLoad = bytes == null;
        if (firstMetadataLoad || bytes.length < metaLen) {
          bytes = new byte[ArrayUtil.oversize(metaLen, 1)];
        }
        if (firstMetadataLoad) {
          bytesReader = new ByteArrayDataInput();
        }
        in.readBytes(bytes, 0, metaLen);
        bytesReader.reset(bytes, 0, metaLen);

        // Start over at the first term of the freshly loaded block.
        state.termBlockOrd = 0;
        metaDataUpto = 0;
        indexIsCurrent = false;

        return true;
      }
 /**
  * Decodes the weight stored as a 4-byte int at the tail of {@code scratch} and shrinks
  * {@code scratch.length} by 5 (the int plus the one separator byte in front of it) so that
  * only the suggestion remains.
  *
  * @param scratch the encoded entry; its {@code length} is reduced in place
  * @param tmpInput reusable reader, repositioned over {@code scratch}
  * @return the decoded int weight, widened to long
  */
 @Override
 protected long decode(BytesRef scratch, ByteArrayDataInput tmpInput) {
   // Honor scratch.offset: the entry may be a slice that does not start at index 0 of the
   // backing array, in which case reset(bytes) alone would read the wrong region.
   tmpInput.reset(scratch.bytes, scratch.offset, scratch.length);
   tmpInput.skipBytes(scratch.length - 4); // suggestion + separator
   scratch.length -= 5; // sep + int
   return tmpInput.readInt();
 }
Пример #3
0
  /**
   * Interleaves all output tokens of a matched rule onto {@code futureOutputs}, starting at the
   * {@code nextRead} slot, and marks the matched input slots in {@code futureInputs}.
   *
   * <p>The encoded rule starts with a vInt whose lowest bit carries the keepOrig flag (a clear
   * bit means keepOrig) and whose remaining bits are the number of outputs; one vInt word id per
   * output follows.
   *
   * @param bytes encoded rule output
   * @param matchInputLength number of input tokens the rule matched
   * @param matchEndOffset end offset of the last matched input token
   */
  private void addOutput(BytesRef bytes, int matchInputLength, int matchEndOffset) {
    bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);

    final int header = bytesReader.readVInt();
    final boolean keepOrig = (header & 0x1) == 0;
    final int outputCount = header >>> 1;

    for (int remaining = outputCount; remaining > 0; remaining--) {
      synonyms.words.get(bytesReader.readVInt(), scratchBytes);
      scratchChars.copyUTF8Bytes(scratchBytes);

      final int wordEnd = scratchChars.length();
      int tokenStart = 0;
      int slot = nextRead;

      // Walk one position past the last char so the final token is flushed as well.
      for (int pos = 0; pos <= wordEnd; pos++) {
        final boolean atEnd = pos == wordEnd;
        if (!atEnd && scratchChars.charAt(pos) != SynonymMap.WORD_SEPARATOR) {
          continue;
        }

        final int outputLen = pos - tokenStart;
        // Caller is not allowed to have empty string in
        // the output:
        assert outputLen > 0 : "output contains empty string: " + scratchChars;

        // A rule with a single output token takes the endOffset of the last input token it
        // matched. With more than one output token no particular endOffset can be picked, so
        // -1 is passed and the endOffset of the overlapped input token is inherited.
        final boolean singleToken = atEnd && tokenStart == 0;
        final int endOffset = singleToken ? matchEndOffset : -1;
        final int posLen = (singleToken && keepOrig) ? matchInputLength : 1;

        futureOutputs[slot].add(scratchChars.chars(), tokenStart, outputLen, endOffset, posLen);

        tokenStart = pos + 1;
        slot = rollIncr(slot);
        assert futureOutputs[slot].posIncr == 1
            : "outputUpto=" + slot + " vs nextWrite=" + nextWrite;
      }
    }

    // Flag every input slot covered by this match.
    int inputSlot = nextRead;
    for (int remaining = matchInputLength; remaining > 0; remaining--) {
      futureInputs[inputSlot].keepOrig |= keepOrig;
      futureInputs[inputSlot].matched = true;
      inputSlot = rollIncr(inputSlot);
    }
  }
Пример #4
0
 /**
  * Decodes the payload at the tail of {@code scratch}. The last two bytes hold the payload
  * size as a short and the payload bytes sit immediately before them. On return
  * {@code scratch.length} has been reduced by the payload and its 2-byte size.
  *
  * @param scratch the encoded entry; its {@code length} is reduced in place
  * @param tmpInput reusable reader, repositioned over {@code scratch}
  * @return a newly allocated {@link BytesRef} holding a copy of the payload bytes
  */
 protected BytesRef decodePayload(BytesRef scratch, ByteArrayDataInput tmpInput) {
   // Honor scratch.offset so that slices of a larger backing array decode correctly.
   tmpInput.reset(scratch.bytes, scratch.offset, scratch.length);
   tmpInput.skipBytes(scratch.length - 2); // skip to payload size
   short payloadLength = tmpInput.readShort(); // read payload size
   // setPosition takes an absolute index into the backing array, hence the explicit offset.
   tmpInput.setPosition(
       scratch.offset + scratch.length - 2 - payloadLength); // setPosition to start of payload
   BytesRef payloadScratch = new BytesRef(payloadLength);
   tmpInput.readBytes(payloadScratch.bytes, 0, payloadLength); // read payload
   payloadScratch.length = payloadLength;
   scratch.length -= 2; // payload length info (short)
   scratch.length -= payloadLength; // payload
   return payloadScratch;
 }
 /**
  * Loads the encoded values of {@code docId} and decodes them into {@code longs}, setting
  * {@code count} to the number of decoded values (0 when the document's bytes are empty).
  *
  * @param docId the document whose values should be decoded
  */
 @Override
 public void setDocument(int docId) {
   bytes = values.get(docId);
   in.reset(bytes.bytes, bytes.offset, bytes.length);

   if (in.eof()) {
     count = 0;
     return;
   }

   // first value uses vLong on top of zig-zag encoding, then deltas are encoded using vLong
   long running = ByteUtils.zigZagDecode(ByteUtils.readVLong(in));
   longs[0] = running;
   count = 1;

   while (!in.eof()) {
     longs = ArrayUtil.grow(longs, count + 1);
     final int slot = count++;
     running += ByteUtils.readVLong(in);
     longs[slot] = running;
   }
 }
  /**
   * Pushes a frame we seek'd to. The frame's flags and file pointer are decoded from the vLong
   * at the start of {@code frameData.output1}; its max ID version is derived from
   * {@code frameData.output2}.
   *
   * @param arc the FST arc that led to this frame
   * @param frameData pair of encoded frame bytes (output1) and version value (output2)
   * @param length passed through to the file-pointer based {@code pushFrame} overload
   * @return the frame obtained for ordinal {@code currentFrame.ord + 1}
   * @throws IOException if the delegated {@code pushFrame} call fails
   */
  IDVersionSegmentTermsEnumFrame pushFrame(
      FST.Arc<Pair<BytesRef, Long>> arc, Pair<BytesRef, Long> frameData, int length)
      throws IOException {
    final BytesRef encoded = frameData.output1;
    scratchReader.reset(encoded.bytes, encoded.offset, encoded.length);

    // The low OUTPUT_FLAGS_NUM_BITS bits hold the flags; the rest is the file pointer.
    final long header = scratchReader.readVLong();
    final long filePointer = header >>> VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS;

    final IDVersionSegmentTermsEnumFrame frame = getFrame(currentFrame.ord + 1);
    frame.maxIDVersion = Long.MAX_VALUE - frameData.output2;

    final boolean hasTerms = (header & VersionBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS) != 0;
    frame.hasTerms = hasTerms;
    frame.hasTermsOrig = hasTerms;

    frame.isFloor = (header & VersionBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR) != 0;
    if (frame.isFloor) {
      // The floor data follows the header vLong, so the reader is already positioned on it.
      frame.setFloorData(scratchReader, encoded);
    }

    pushFrame(arc, filePointer, length);

    return frame;
  }
Пример #7
0
 /**
  * Decodes the contexts at the tail of {@code scratch}. The last two bytes hold the number of
  * contexts as a short; walking backwards from there, each context is stored as its bytes
  * followed by a 2-byte length. On return {@code scratch.length} has been reduced by everything
  * that was consumed.
  *
  * @param scratch the encoded entry; its {@code length} is reduced in place
  * @param tmpInput reusable reader, repositioned over {@code scratch}
  * @return a new set holding a copy of each decoded context
  */
 protected Set<BytesRef> decodeContexts(BytesRef scratch, ByteArrayDataInput tmpInput) {
   // Honor scratch.offset so that slices of a larger backing array decode correctly.
   tmpInput.reset(scratch.bytes, scratch.offset, scratch.length);
   tmpInput.skipBytes(scratch.length - 2); // skip to context set size
   short ctxSetSize = tmpInput.readShort();
   scratch.length -= 2;
   final Set<BytesRef> contextSet = new HashSet<>();
   for (short i = 0; i < ctxSetSize; i++) {
     // setPosition takes an absolute index into the backing array, hence the explicit offset.
     tmpInput.setPosition(scratch.offset + scratch.length - 2);
     short curContextLength = tmpInput.readShort();
     scratch.length -= 2;
     tmpInput.setPosition(scratch.offset + scratch.length - curContextLength);
     BytesRef contextSpare = new BytesRef(curContextLength);
     tmpInput.readBytes(contextSpare.bytes, 0, curContextLength);
     contextSpare.length = curContextLength;
     contextSet.add(contextSpare);
     scratch.length -= curContextLength;
   }
   return contextSet;
 }
Пример #8
0
 /**
  * Decodes the weight stored as an 8-byte long at the tail of {@code scratch} and shrinks
  * {@code scratch.length} by 8 so that the weight is no longer part of it.
  *
  * @param scratch the encoded entry; its {@code length} is reduced in place
  * @param tmpInput reusable reader, repositioned over {@code scratch}
  * @return the decoded weight
  */
 protected long decode(BytesRef scratch, ByteArrayDataInput tmpInput) {
   // Honor scratch.offset: the entry may be a slice that does not start at index 0 of the
   // backing array, in which case reset(bytes) alone would read the wrong region.
   tmpInput.reset(scratch.bytes, scratch.offset, scratch.length);
   tmpInput.skipBytes(scratch.length - 8); // suggestion
   scratch.length -= 8; // long
   return tmpInput.readLong();
 }
  /**
   * Builds the completion structures from the given term/weight iterator.
   *
   * <p>Every term is written to a temporary file prefixed with its encoded int weight, the file
   * is sorted externally, and the sorted entries are read back and assigned to one of
   * {@code buckets} buckets based on their line position (entries with the same score as the
   * previous entry share its bucket). The result is stored in {@code higherWeightsCompletion}
   * and {@code normalCompletion}. Both temporary files are deleted before returning.
   *
   * @param tfit source of terms and weights
   * @throws IllegalArgumentException if {@code tfit} is a {@code TermFreqPayloadIterator}
   * @throws IOException if creating, writing, sorting or reading the temporary files fails
   */
  @Override
  public void build(TermFreqIterator tfit) throws IOException {
    if (tfit instanceof TermFreqPayloadIterator) {
      throw new IllegalArgumentException("this suggester doesn't support payloads");
    }
    // NOTE(review): both temp files are created before the try block, so if the second
    // createTempFile call throws, the first file is not deleted here.
    File tempInput =
        File.createTempFile(
            FSTCompletionLookup.class.getSimpleName(), ".input", Sort.defaultTempDir());
    File tempSorted =
        File.createTempFile(
            FSTCompletionLookup.class.getSimpleName(), ".sorted", Sort.defaultTempDir());

    Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput);
    Sort.ByteSequencesReader reader = null;
    ExternalRefSorter sorter = null;

    // Push the encoded weights up front before sequences to sort them. For now, assume they are
    // non-negative. If negative values are allowed some trickery needs to be done to find their
    // byte order.
    boolean success = false;
    try {
      byte[] buffer = new byte[0];
      ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
      BytesRef spare;
      while ((spare = tfit.next()) != null) {
        // Make room for the 4-byte weight plus the term bytes.
        if (spare.length + 4 >= buffer.length) {
          buffer = ArrayUtil.grow(buffer, spare.length + 4);
        }

        // Entry layout: [int encoded weight][term bytes].
        output.reset(buffer);
        output.writeInt(encodeWeight(tfit.weight()));
        output.writeBytes(spare.bytes, spare.offset, spare.length);
        writer.write(buffer, 0, output.getPosition());
      }
      writer.close();

      // We don't know the distribution of scores and we need to bucket them, so we'll sort
      // and divide into equal buckets.
      SortInfo info = new Sort().sort(tempInput, tempSorted);
      tempInput.delete();
      FSTCompletionBuilder builder =
          new FSTCompletionBuilder(
              buckets, sorter = new ExternalRefSorter(new Sort()), sharedTailLength);

      final int inputLines = info.lines;
      reader = new Sort.ByteSequencesReader(tempSorted);
      long line = 0;
      int previousBucket = 0;
      int previousScore = 0;
      ByteArrayDataInput input = new ByteArrayDataInput();
      BytesRef tmp1 = new BytesRef();
      BytesRef tmp2 = new BytesRef();
      while (reader.read(tmp1)) {
        // NOTE(review): reads from index 0 of tmp1.bytes, i.e. assumes reader.read leaves
        // tmp1.offset at 0 -- confirm against Sort.ByteSequencesReader.
        input.reset(tmp1.bytes);
        int currentScore = input.readInt();

        // Entries with equal scores stay in the same bucket; otherwise the bucket is the
        // entry's relative position in the sorted file scaled to the number of buckets.
        int bucket;
        if (line > 0 && currentScore == previousScore) {
          bucket = previousBucket;
        } else {
          bucket = (int) (line * buckets / inputLines);
        }
        previousScore = currentScore;
        previousBucket = bucket;

        // Only append the input, discard the weight.
        // tmp2 is a view over tmp1's array that starts right after the weight just read.
        tmp2.bytes = tmp1.bytes;
        tmp2.offset = input.getPosition();
        tmp2.length = tmp1.length - input.getPosition();
        builder.add(tmp2, bucket);

        line++;
      }

      // The two FSTCompletions share the same automaton.
      this.higherWeightsCompletion = builder.build();
      this.normalCompletion =
          new FSTCompletion(higherWeightsCompletion.getFST(), false, exactMatchFirst);

      success = true;
    } finally {
      // On success, close normally; on failure, use the WhileHandlingException variant
      // (presumably so a close error does not mask the original exception -- confirm).
      if (success) IOUtils.close(reader, writer, sorter);
      else IOUtils.closeWhileHandlingException(reader, writer, sorter);

      // Always remove the temporary files, whether or not the build succeeded.
      tempInput.delete();
      tempSorted.delete();
    }
  }