@Override
public int compare(BytesRef left, BytesRef right) {
  // Make shallow copies in case decode changes the BytesRef:
  leftScratch.bytes = left.bytes;
  leftScratch.offset = left.offset;
  leftScratch.length = left.length;
  rightScratch.bytes = right.bytes;
  rightScratch.offset = right.offset;
  rightScratch.length = right.length;

  long leftCost = decode(leftScratch, input);
  long rightCost = decode(rightScratch, input);

  if (hasPayloads) {
    decodePayload(leftScratch, input);
    decodePayload(rightScratch, input);
  }
  if (hasContexts) {
    decodeContexts(leftScratch, input);
    decodeContexts(rightScratch, input);
  }

  int cmp = comparator.compare(leftScratch, rightScratch);
  if (cmp != 0) {
    return cmp;
  }
  return Long.compare(leftCost, rightCost);
}
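// Hedged sketch (not from the original source): illustrates why the comparator above copies only
// the pointer fields into its scratch instances. A decode step that strips a header advances
// offset/length on the scratch copy, while the caller's BytesRef stays untouched because only the
// backing byte[] is shared, not the position fields.
import org.apache.lucene.util.BytesRef;

class ShallowCopySketch {
  public static void main(String[] args) {
    BytesRef original = new BytesRef(new byte[] {42, 1, 2, 3});
    BytesRef scratch = new BytesRef();
    scratch.bytes = original.bytes;     // share the array
    scratch.offset = original.offset;
    scratch.length = original.length;

    // Simulate a decode step that consumes a one-byte header from the scratch copy only.
    scratch.offset += 1;
    scratch.length -= 1;

    System.out.println(original.length + " vs " + scratch.length); // prints "4 vs 3"
  }
}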
@Override
public BytesRef writeToBytes() {
  long start = System.nanoTime();
  int size = set.size();
  BytesRef bytes = new BytesRef(new byte[HEADER_SIZE + (int) bytesUsed.get()]);

  // Encode encoding type
  Bytes.writeInt(bytes, this.getEncoding().ordinal());

  // Encode flag
  bytes.bytes[bytes.offset++] = (byte) (this.isPruned() ? 1 : 0);

  // Encode size of the set
  Bytes.writeInt(bytes, size);

  // Encode longs
  BytesRef reusable = new BytesRef();
  for (int i = 0; i < this.set.size(); i++) {
    this.set.get(i, reusable);
    Bytes.writeBytesRef(reusable, bytes);
  }

  logger.debug("Serialized {} terms - took {} ms",
      this.size(), (System.nanoTime() - start) / 1000000);

  bytes.length = bytes.offset;
  bytes.offset = 0;
  return bytes;
}
/**
 * Ensure we own term.bytes so that it's safe to modify. We detect this via a kluge in which
 * cellsByLevel[0].termBuf is non-null; that buffer is pre-allocated for use as a replacement
 * for term.bytes.
 */
void ensureOwnTermBytes() {
  NRCell cell0 = cellsByLevel[0];
  if (cell0.termBuf == null) {
    return; // we already own the bytes
  }
  System.arraycopy(term.bytes, term.offset, cell0.termBuf, 0, term.length);
  term.bytes = cell0.termBuf;
  term.offset = 0;
  cell0.termBuf = null;
}
protected static void copy(BytesRef from, BytesRef to) {
  if (to.bytes.length < from.length) {
    to.bytes = new byte[ArrayUtil.oversize(from.length, RamUsageEstimator.NUM_BYTES_BYTE)];
  }
  to.offset = 0;
  to.length = from.length;
  System.arraycopy(from.bytes, from.offset, to.bytes, 0, from.length);
}
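// Hedged usage sketch (not from the original source): the helper above turns a possibly sliced
// BytesRef (non-zero offset) into a private, zero-offset copy, growing the destination array only
// when needed so the same destination instance can be reused across many calls.
BytesRef source = new BytesRef("some term");
BytesRef dest = new BytesRef();   // reused between calls; starts with an empty backing array
copy(source, dest);               // dest now owns its own bytes, with offset == 0
assert dest.bytesEquals(source);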
@Override
public BytesRef getTokenBytesNoLeaf(BytesRef result) {
  if (result == null) {
    result = new BytesRef();
  }
  result.bytes = term.bytes;
  result.offset = term.offset;
  result.length = termLenByLevel[cellLevel];
  assert result.length <= term.length;
  return result;
}
private void decodeTermFreqs() throws IOException {
  // logger.debug("Decode Term Freq in Node: {}", this.hashCode());
  // logger.debug("Decode Term Freq in Node at {}", in.getFilePointer());
  in.readBytes(termFreqCompressedBuffer.bytes, 0, termFreqCompressedBufferLength);
  termFreqCompressedBuffer.offset = 0;
  termFreqCompressedBuffer.length = termFreqCompressedBufferLength;
  nodDecompressor.decompress(termFreqCompressedBuffer, termFreqBuffer);
  // Set the length limit based on the block size, as certain decompressors with a large
  // window size can set it larger than the block size, e.g., AFor.
  termFreqBuffer.length = termFreqBlockSize;
  termFreqReadPending = false;
}
private void decodeNodeLengths() throws IOException {
  // logger.debug("Decode Nodes Length: {}", this.hashCode());
  // logger.debug("Decode Nodes Length at {}", in.getFilePointer());
  in.readBytes(nodLenCompressedBuffer.bytes, 0, nodLenCompressedBufferLength);
  nodLenCompressedBuffer.offset = 0;
  nodLenCompressedBuffer.length = nodLenCompressedBufferLength;
  nodDecompressor.decompress(nodLenCompressedBuffer, nodLenBuffer);
  // Set the length limit based on the block size, as certain decompressors with a large
  // window size can set it larger than the block size, e.g., AFor.
  nodLenBuffer.length = nodLenBlockSize;
  nodLenReadPending = false;
}
@Test
public void testCreateAndSplitId() {
  BytesRef createUid = Uid.createUidAsBytes("foo", "bar");
  BytesRef[] splitUidIntoTypeAndId = Uid.splitUidIntoTypeAndId(createUid);
  assertThat("foo", equalTo(splitUidIntoTypeAndId[0].utf8ToString()));
  assertThat("bar", equalTo(splitUidIntoTypeAndId[1].utf8ToString()));

  // split also with an offset
  BytesRef ref = new BytesRef(createUid.length + 10);
  ref.offset = 9;
  ref.length = createUid.length;
  System.arraycopy(createUid.bytes, createUid.offset, ref.bytes, ref.offset, ref.length);
  splitUidIntoTypeAndId = Uid.splitUidIntoTypeAndId(ref);
  assertThat("foo", equalTo(splitUidIntoTypeAndId[0].utf8ToString()));
  assertThat("bar", equalTo(splitUidIntoTypeAndId[1].utf8ToString()));
}
@Override
public void compress(final IntsRef input, final BytesRef output) {
  assert input.ints.length % 32 == 0;
  final int[] uncompressedData = input.ints;
  final byte[] compressedData = output.bytes;

  // prepare the input buffer before starting the compression
  this.prepareInputBuffer(input);

  while (input.offset < input.length) {
    for (final long compressorCode :
        this.frameCompressorCodes(uncompressedData, input.offset, input.length)) {
      compressedData[output.offset] = (byte) compressorCode;
      this.compressors[(int) compressorCode].compress(input, output);
    }
  }

  // flip buffer
  input.offset = 0;
  output.length = output.offset;
  output.offset = 0;
}
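// Hedged sketch (not from the original source): the "flip buffer" idiom at the end of the method
// above mirrors java.nio.ByteBuffer.flip(). While writing, offset serves as the write cursor;
// before handing the buffer to a reader, length is set to the number of bytes written and offset
// is rewound to 0.
BytesRef out = new BytesRef(new byte[16]);
out.bytes[out.offset++] = (byte) 7;   // write phase: offset advances
out.bytes[out.offset++] = (byte) 9;
out.length = out.offset;              // flip: record how many bytes are valid
out.offset = 0;                       // rewind for the consumer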
@Override
public BytesRef next() throws IOException {
  if (scratch == null) {
    return null;
  }
  boolean success = false;
  try {
    byte[] next = reader.read();
    if (next != null) {
      scratch.bytes = next;
      scratch.length = next.length;
      scratch.offset = 0;
    } else {
      IOUtils.close(reader);
      scratch = null;
    }
    success = true;
    return scratch;
  } finally {
    if (!success) {
      IOUtils.closeWhileHandlingException(reader);
    }
  }
}
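// Hedged usage sketch (not from the original source): a typical drain loop for the iterator above.
// next() returns null once the underlying reader is exhausted and closed, so callers loop until
// null. The returned BytesRef is the reused scratch instance, so it must be copied if it needs to
// outlive the next call. "iterator" and process(...) are hypothetical placeholders.
BytesRef ref;
while ((ref = iterator.next()) != null) {
  process(BytesRef.deepCopyOf(ref));  // deep copy because the scratch is overwritten on the next call
}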
@Override
public Cell readCell(BytesRef term, Cell scratch) {
  if (scratch == null) {
    scratch = getWorldCell();
  }
  // We decode level #, leaf boolean, and populate bytes by reference. We don't decode the stack.

  // reverse lookup term length to the level and hence the cell
  NRCell[] cellsByLevel = ((NRCell) scratch).cellsByLevel;
  boolean isLeaf = term.bytes[term.offset + term.length - 1] == 0;
  int lenNoLeaf = isLeaf ? term.length - 1 : term.length;
  NRCell result = cellsByLevel[levelByTermLen[lenNoLeaf]];
  if (cellsByLevel[0].termBuf == null) {
    cellsByLevel[0].termBuf = result.term.bytes; // a kluge; see cell.ensureOwnTermBytes()
  }
  result.term.bytes = term.bytes;
  result.term.offset = term.offset;
  result.term.length = lenNoLeaf; // technically this isn't used but may help debugging
  result.reset();
  if (isLeaf) {
    result.setLeaf();
  }
  result.cellNumber = -1; // lazy decode flag
  return result;
}
@Override
public void build(TermFreqIterator tfit) throws IOException {
  if (tfit instanceof TermFreqPayloadIterator) {
    throw new IllegalArgumentException("this suggester doesn't support payloads");
  }
  File tempInput = File.createTempFile(
      FSTCompletionLookup.class.getSimpleName(), ".input", Sort.defaultTempDir());
  File tempSorted = File.createTempFile(
      FSTCompletionLookup.class.getSimpleName(), ".sorted", Sort.defaultTempDir());

  Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput);
  Sort.ByteSequencesReader reader = null;
  ExternalRefSorter sorter = null;

  // Push floats up front before sequences to sort them. For now, assume they are non-negative.
  // If negative floats are allowed some trickery needs to be done to find their byte order.
  boolean success = false;
  try {
    byte[] buffer = new byte[0];
    ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
    BytesRef spare;
    while ((spare = tfit.next()) != null) {
      if (spare.length + 4 >= buffer.length) {
        buffer = ArrayUtil.grow(buffer, spare.length + 4);
      }
      output.reset(buffer);
      output.writeInt(encodeWeight(tfit.weight()));
      output.writeBytes(spare.bytes, spare.offset, spare.length);
      writer.write(buffer, 0, output.getPosition());
    }
    writer.close();

    // We don't know the distribution of scores and we need to bucket them, so we'll sort
    // and divide into equal buckets.
    SortInfo info = new Sort().sort(tempInput, tempSorted);
    tempInput.delete();
    FSTCompletionBuilder builder = new FSTCompletionBuilder(
        buckets, sorter = new ExternalRefSorter(new Sort()), sharedTailLength);

    final int inputLines = info.lines;
    reader = new Sort.ByteSequencesReader(tempSorted);
    long line = 0;
    int previousBucket = 0;
    int previousScore = 0;
    ByteArrayDataInput input = new ByteArrayDataInput();
    BytesRef tmp1 = new BytesRef();
    BytesRef tmp2 = new BytesRef();
    while (reader.read(tmp1)) {
      input.reset(tmp1.bytes);
      int currentScore = input.readInt();

      int bucket;
      if (line > 0 && currentScore == previousScore) {
        bucket = previousBucket;
      } else {
        bucket = (int) (line * buckets / inputLines);
      }
      previousScore = currentScore;
      previousBucket = bucket;

      // Only append the input, discard the weight.
      tmp2.bytes = tmp1.bytes;
      tmp2.offset = input.getPosition();
      tmp2.length = tmp1.length - input.getPosition();
      builder.add(tmp2, bucket);

      line++;
    }

    // The two FSTCompletions share the same automaton.
    this.higherWeightsCompletion = builder.build();
    this.normalCompletion = new FSTCompletion(
        higherWeightsCompletion.getFST(), false, exactMatchFirst);
    success = true;
  } finally {
    if (success) {
      IOUtils.close(reader, writer, sorter);
    } else {
      IOUtils.closeWhileHandlingException(reader, writer, sorter);
    }
    tempInput.delete();
    tempSorted.delete();
  }
}