@Override
 public int compare(BytesRef left, BytesRef right) {
   // decode() may mutate the BytesRef it is handed, so hand it shallow copies
   // and leave the callers' refs untouched.
   aliasInto(left, leftScratch);
   aliasInto(right, rightScratch);
   final long costOfLeft = decode(leftScratch, input);
   final long costOfRight = decode(rightScratch, input);
   if (hasPayloads) {
     decodePayload(leftScratch, input);
     decodePayload(rightScratch, input);
   }
   if (hasContexts) {
     decodeContexts(leftScratch, input);
     decodeContexts(rightScratch, input);
   }
   // Primary order comes from the decoded bytes; equal entries fall back to
   // their decoded cost (weight).
   final int byBytes = comparator.compare(leftScratch, rightScratch);
   return byBytes != 0 ? byBytes : Long.compare(costOfLeft, costOfRight);
 }

 /** Points {@code dst} at {@code src}'s backing array without copying any bytes. */
 private static void aliasInto(BytesRef src, BytesRef dst) {
   dst.bytes = src.bytes;
   dst.offset = src.offset;
   dst.length = src.length;
 }
  /**
   * Serializes this set into a single {@link BytesRef}: a header (encoding
   * ordinal, pruned flag, element count) followed by every element's bytes.
   * {@code bytes.offset} is used as a running write cursor by the
   * {@code Bytes.write*} helpers, then flipped so the caller sees
   * {@code [0, length)}.
   */
  @Override
  public BytesRef writeToBytes() {
    long start = System.nanoTime();
    int size = set.size();

    BytesRef bytes = new BytesRef(new byte[HEADER_SIZE + (int) bytesUsed.get()]);

    // Encode encoding type
    Bytes.writeInt(bytes, this.getEncoding().ordinal());

    // Encode flag
    bytes.bytes[bytes.offset++] = (byte) (this.isPruned() ? 1 : 0);

    // Encode size of the set
    Bytes.writeInt(bytes, size);

    // Encode longs. Reuse the already-computed size rather than re-evaluating
    // set.size() on every loop iteration.
    BytesRef reusable = new BytesRef();
    for (int i = 0; i < size; i++) {
      this.set.get(i, reusable);
      Bytes.writeBytesRef(reusable, bytes);
    }

    logger.debug(
        "Serialized {} terms - took {} ms", this.size(), (System.nanoTime() - start) / 1000000);

    // Flip: expose the written region [0, length) to the caller.
    bytes.length = bytes.offset;
    bytes.offset = 0;
    return bytes;
  }
 /**
  * Ensure we own term.bytes so that it's safe to modify. We detect via a kluge in which
  * cellsByLevel[0].termBuf is non-null, which is a pre-allocated for use to replace term.bytes.
  */
 void ensureOwnTermBytes() {
   final NRCell bottomCell = cellsByLevel[0];
   if (bottomCell.termBuf == null) {
     return; // we already own the bytes
   }
   // Copy the shared term data into our pre-allocated buffer and take ownership.
   System.arraycopy(term.bytes, term.offset, bottomCell.termBuf, 0, term.length);
   term.bytes = bottomCell.termBuf;
   term.offset = 0;
   bottomCell.termBuf = null; // consumed; null now signals ownership
 }
 /**
  * Copies the contents of {@code from} into {@code to}, growing {@code to}'s
  * backing array when it is too small. Afterwards {@code to} starts at offset 0
  * and spans {@code from.length} bytes.
  */
 protected static void copy(BytesRef from, BytesRef to) {
   final int needed = from.length;
   if (to.bytes.length < needed) {
     // Oversize the new array to amortize the cost of future growth.
     to.bytes = new byte[ArrayUtil.oversize(needed, RamUsageEstimator.NUM_BYTES_BYTE)];
   }
   System.arraycopy(from.bytes, from.offset, to.bytes, 0, needed);
   to.offset = 0;
   to.length = needed;
 }
 /**
  * Returns this cell's token bytes without the leaf marker, sharing the term's
  * backing array by reference (no copy). Allocates a holder only when
  * {@code result} is null.
  */
 @Override
 public BytesRef getTokenBytesNoLeaf(BytesRef result) {
   final BytesRef out = (result == null) ? new BytesRef() : result;
   out.bytes = term.bytes;
   out.offset = term.offset;
   // Trim the visible length to this cell's level.
   out.length = termLenByLevel[cellLevel];
   assert out.length <= term.length;
   return out;
 }
    /**
     * Reads one compressed block of term frequencies from {@code in} and
     * decompresses it into {@code termFreqBuffer}, then clears the pending flag.
     */
    private void decodeTermFreqs() throws IOException {
      // logger.debug("Decode Term Freq in Node: {}", this.hashCode());
      // logger.debug("Decode Term Freq in Node at {}", in.getFilePointer());
      // Fill the reusable compressed buffer with exactly one block's bytes.
      in.readBytes(termFreqCompressedBuffer.bytes, 0, termFreqCompressedBufferLength);
      termFreqCompressedBuffer.offset = 0;
      termFreqCompressedBuffer.length = termFreqCompressedBufferLength;
      nodDecompressor.decompress(termFreqCompressedBuffer, termFreqBuffer);
      // set length limit based on block size, as certain decompressor with
      // large window size can set it larger than the blockSize, e.g., AFor
      termFreqBuffer.length = termFreqBlockSize;

      termFreqReadPending = false;
    }
    /**
     * Reads one compressed block of node lengths from {@code in} and
     * decompresses it into {@code nodLenBuffer}, then clears the pending flag.
     */
    private void decodeNodeLengths() throws IOException {
      // logger.debug("Decode Nodes Length: {}", this.hashCode());
      // logger.debug("Decode Nodes Length at {}", in.getFilePointer());
      // Fill the reusable compressed buffer with exactly one block's bytes.
      in.readBytes(nodLenCompressedBuffer.bytes, 0, nodLenCompressedBufferLength);
      nodLenCompressedBuffer.offset = 0;
      nodLenCompressedBuffer.length = nodLenCompressedBufferLength;
      nodDecompressor.decompress(nodLenCompressedBuffer, nodLenBuffer);
      // set length limit based on block size, as certain decompressor with
      // large window size can set it larger than the blockSize, e.g., AFor
      nodLenBuffer.length = nodLenBlockSize;

      nodLenReadPending = false;
    }
Example #8
0
 /**
  * Verifies that a uid built from type "foo" and id "bar" splits back into its
  * two parts, both when the bytes start at offset 0 and when they sit at a
  * non-zero offset inside a larger array.
  */
 @Test
 public void testCreateAndSplitId() {
   BytesRef createUid = Uid.createUidAsBytes("foo", "bar");
   BytesRef[] splitUidIntoTypeAndId = Uid.splitUidIntoTypeAndId(createUid);
   // assertThat's contract is (actual, matcher): put the computed value first
   // so failure messages report actual vs. expected correctly.
   assertThat(splitUidIntoTypeAndId[0].utf8ToString(), equalTo("foo"));
   assertThat(splitUidIntoTypeAndId[1].utf8ToString(), equalTo("bar"));
   // split also with an offset
   BytesRef ref = new BytesRef(createUid.length + 10);
   ref.offset = 9;
   ref.length = createUid.length;
   System.arraycopy(createUid.bytes, createUid.offset, ref.bytes, ref.offset, ref.length);
   splitUidIntoTypeAndId = Uid.splitUidIntoTypeAndId(ref);
   assertThat(splitUidIntoTypeAndId[0].utf8ToString(), equalTo("foo"));
   assertThat(splitUidIntoTypeAndId[1].utf8ToString(), equalTo("bar"));
 }
  /**
   * Compresses {@code input} into {@code output}, one frame at a time. Each
   * frame is preceded by a one-byte compressor code selecting the codec used
   * to encode it.
   *
   * <p>NOTE(review): termination relies on each
   * {@code compressors[code].compress(input, output)} call advancing
   * {@code input.offset} and {@code output.offset} as a side effect — confirm
   * against the compressor contract. Here {@code input.length} appears to act
   * as an end bound for {@code input.offset} (set by
   * {@code prepareInputBuffer}), not as an element count.
   */
  @Override
  public void compress(final IntsRef input, final BytesRef output) {
    // Input must be a whole number of 32-int frames.
    assert input.ints.length % 32 == 0;
    final int[] uncompressedData = input.ints;
    final byte[] compressedData = output.bytes;

    // prepare the input buffer before starting the compression
    this.prepareInputBuffer(input);

    while (input.offset < input.length) {
      for (final long compressorCode :
          this.frameCompressorCodes(uncompressedData, input.offset, input.length)) {
        // Write the selector byte, then let the chosen codec encode the frame.
        compressedData[output.offset] = (byte) compressorCode;
        this.compressors[(int) compressorCode].compress(input, output);
      }
    }

    // flip buffer
    input.offset = 0;
    output.length = output.offset;
    output.offset = 0;
  }
 /**
  * Returns the next byte sequence from the underlying reader, or {@code null}
  * once exhausted. The reader is closed cleanly at end of input; on any
  * failure it is closed while suppressing secondary exceptions, and the
  * original failure is rethrown.
  */
 @Override
 public BytesRef next() throws IOException {
   if (scratch == null) {
     return null; // already exhausted
   }
   try {
     final byte[] line = reader.read();
     if (line == null) {
       // End of input: release the reader and mark this iterator exhausted.
       IOUtils.close(reader);
       scratch = null;
     } else {
       scratch.bytes = line;
       scratch.offset = 0;
       scratch.length = line.length;
     }
     return scratch;
   } catch (Throwable t) {
     // Close without masking the original failure (precise rethrow keeps the
     // declared IOException signature valid).
     IOUtils.closeWhileHandlingException(reader);
     throw t;
   }
 }
  /**
   * Decodes a cell from {@code term}: the level is looked up from the term
   * length, a trailing 0 byte marks a leaf, and the term bytes are shared by
   * reference (not copied) into the scratch cell hierarchy.
   */
  @Override
  public Cell readCell(BytesRef term, Cell scratch) {
    if (scratch == null) scratch = getWorldCell();

    // We decode level #, leaf boolean, and populate bytes by reference. We don't decode the stack.

    // reverse lookup term length to the level and hence the cell
    NRCell[] cellsByLevel = ((NRCell) scratch).cellsByLevel;
    boolean isLeaf = term.bytes[term.offset + term.length - 1] == 0;
    int lenNoLeaf = isLeaf ? term.length - 1 : term.length;

    NRCell result = cellsByLevel[levelByTermLen[lenNoLeaf]];
    // Stash the previously-owned buffer so ensureOwnTermBytes() can later copy
    // the shared bytes into it if the term needs to be modified.
    if (cellsByLevel[0].termBuf == null)
      cellsByLevel[0].termBuf = result.term.bytes; // a kluge; see cell.ensureOwnTermBytes()
    result.term.bytes = term.bytes;
    result.term.offset = term.offset;
    result.term.length = lenNoLeaf; // technically this isn't used but may help debugging
    result.reset();
    if (isLeaf) result.setLeaf();

    result.cellNumber = -1; // lazy decode flag

    return result;
  }
  /**
   * Builds the completion automata from {@code tfit}. Each entry is written to
   * a temp file with its encoded weight prepended, the file is sorted so that
   * entries can be divided into {@code buckets} roughly equal weight buckets,
   * and the sorted entries (weight stripped) are fed to the FST builder.
   *
   * @param tfit source of terms and weights; payloads are not supported
   * @throws IllegalArgumentException if {@code tfit} carries payloads
   * @throws IOException on temp-file, sort, or FST-build failure
   */
  @Override
  public void build(TermFreqIterator tfit) throws IOException {
    if (tfit instanceof TermFreqPayloadIterator) {
      throw new IllegalArgumentException("this suggester doesn't support payloads");
    }
    File tempInput =
        File.createTempFile(
            FSTCompletionLookup.class.getSimpleName(), ".input", Sort.defaultTempDir());
    File tempSorted =
        File.createTempFile(
            FSTCompletionLookup.class.getSimpleName(), ".sorted", Sort.defaultTempDir());

    Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput);
    Sort.ByteSequencesReader reader = null;
    ExternalRefSorter sorter = null;

    // Push floats up front before sequences to sort them. For now, assume they are non-negative.
    // If negative floats are allowed some trickery needs to be done to find their byte order.
    boolean success = false;
    try {
      byte[] buffer = new byte[0];
      ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
      BytesRef spare;
      while ((spare = tfit.next()) != null) {
        // Grow the reusable buffer to hold the 4-byte weight plus the term.
        if (spare.length + 4 >= buffer.length) {
          buffer = ArrayUtil.grow(buffer, spare.length + 4);
        }

        output.reset(buffer);
        output.writeInt(encodeWeight(tfit.weight()));
        output.writeBytes(spare.bytes, spare.offset, spare.length);
        writer.write(buffer, 0, output.getPosition());
      }
      writer.close();

      // We don't know the distribution of scores and we need to bucket them, so we'll sort
      // and divide into equal buckets.
      SortInfo info = new Sort().sort(tempInput, tempSorted);
      tempInput.delete();
      FSTCompletionBuilder builder =
          new FSTCompletionBuilder(
              buckets, sorter = new ExternalRefSorter(new Sort()), sharedTailLength);

      final int inputLines = info.lines;
      reader = new Sort.ByteSequencesReader(tempSorted);
      long line = 0;
      int previousBucket = 0;
      int previousScore = 0;
      ByteArrayDataInput input = new ByteArrayDataInput();
      BytesRef tmp1 = new BytesRef();
      BytesRef tmp2 = new BytesRef();
      while (reader.read(tmp1)) {
        input.reset(tmp1.bytes);
        int currentScore = input.readInt();

        // Entries with identical scores must land in the same bucket so that
        // equal weights don't straddle a bucket boundary.
        int bucket;
        if (line > 0 && currentScore == previousScore) {
          bucket = previousBucket;
        } else {
          bucket = (int) (line * buckets / inputLines);
        }
        previousScore = currentScore;
        previousBucket = bucket;

        // Only append the input, discard the weight.
        tmp2.bytes = tmp1.bytes;
        tmp2.offset = input.getPosition();
        tmp2.length = tmp1.length - input.getPosition();
        builder.add(tmp2, bucket);

        line++;
      }

      // The two FSTCompletions share the same automaton.
      this.higherWeightsCompletion = builder.build();
      this.normalCompletion =
          new FSTCompletion(higherWeightsCompletion.getFST(), false, exactMatchFirst);

      success = true;
    } finally {
      // Close quietly on failure so the original exception propagates;
      // temp files are removed in all cases.
      if (success) IOUtils.close(reader, writer, sorter);
      else IOUtils.closeWhileHandlingException(reader, writer, sorter);

      tempInput.delete();
      tempSorted.delete();
    }
  }