/**
 * Copies {@code length} bytes from the pool, starting at the absolute pool offset
 * {@code offset}, into the given {@link BytesRef}. On return the ref's {@code offset} is
 * {@code 0} and its {@code length} is {@code length}.
 *
 * <p>Note: this method allows to copy across block boundaries, including ranges that span more
 * than two blocks.
 *
 * @param bytes the destination; {@code bytes.grow(length)} is called on it before copying
 * @param offset absolute start offset into the pool
 * @param length number of bytes to copy; must be non-negative
 * @return the {@code bytes} instance that was passed in
 * @throws IndexOutOfBoundsException if {@code length} is negative
 */
public final BytesRef copyFrom(final BytesRef bytes, final int offset, final int length) {
  if (length < 0) {
    throw new IndexOutOfBoundsException("length must be non-negative: " + length);
  }
  bytes.offset = 0;
  bytes.grow(length);
  bytes.length = length;

  int bufferIndex = offset >> BYTE_BLOCK_SHIFT;
  int pos = offset & BYTE_BLOCK_MASK;
  int destPos = 0;
  int remaining = length;
  while (remaining > 0) {
    // Never read past the end of the current block; only the first block can start mid-way.
    final int chunk = Math.min(remaining, BYTE_BLOCK_SIZE - pos);
    System.arraycopy(buffers[bufferIndex++], pos, bytes.bytes, destPos, chunk);
    destPos += chunk;
    remaining -= chunk;
    pos = 0;
  }
  return bytes;
}
// Fill in a BytesRef from term's length & bytes encoded in // byte block public final BytesRef setBytesRef(BytesRef term, int textStart) { final byte[] bytes = term.bytes = buffers[textStart >> BYTE_BLOCK_SHIFT]; int pos = textStart & BYTE_BLOCK_MASK; if ((bytes[pos] & 0x80) == 0) { // length is 1 byte term.length = bytes[pos]; term.offset = pos + 1; } else { // length is 2 bytes term.length = (bytes[pos] & 0x7f) + ((bytes[pos + 1] & 0xff) << 7); term.offset = pos + 2; } assert term.length >= 0; return term; }
/**
 * Resolves a {@link BytesRef} whose {@code offset} holds an absolute offset into this
 * {@link ByteBlockPool}: the ref's {@code bytes} is pointed at the block containing that
 * offset and its {@code offset} is rewritten to the position within that block. The ref's
 * {@code length} is left untouched.
 *
 * @param bytes the ref to dereference in place
 * @return the {@code bytes} instance that was passed in
 */
public final BytesRef deref(BytesRef bytes) {
  final int absolute = bytes.offset;
  bytes.bytes = buffers[absolute >> BYTE_BLOCK_SHIFT];
  bytes.offset = absolute & BYTE_BLOCK_MASK;
  return bytes;
}
@Override public void build(TermFreqIterator tfit) throws IOException { if (tfit instanceof TermFreqPayloadIterator) { throw new IllegalArgumentException("this suggester doesn't support payloads"); } File tempInput = File.createTempFile( FSTCompletionLookup.class.getSimpleName(), ".input", Sort.defaultTempDir()); File tempSorted = File.createTempFile( FSTCompletionLookup.class.getSimpleName(), ".sorted", Sort.defaultTempDir()); Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput); Sort.ByteSequencesReader reader = null; ExternalRefSorter sorter = null; // Push floats up front before sequences to sort them. For now, assume they are non-negative. // If negative floats are allowed some trickery needs to be done to find their byte order. boolean success = false; try { byte[] buffer = new byte[0]; ByteArrayDataOutput output = new ByteArrayDataOutput(buffer); BytesRef spare; while ((spare = tfit.next()) != null) { if (spare.length + 4 >= buffer.length) { buffer = ArrayUtil.grow(buffer, spare.length + 4); } output.reset(buffer); output.writeInt(encodeWeight(tfit.weight())); output.writeBytes(spare.bytes, spare.offset, spare.length); writer.write(buffer, 0, output.getPosition()); } writer.close(); // We don't know the distribution of scores and we need to bucket them, so we'll sort // and divide into equal buckets. 
SortInfo info = new Sort().sort(tempInput, tempSorted); tempInput.delete(); FSTCompletionBuilder builder = new FSTCompletionBuilder( buckets, sorter = new ExternalRefSorter(new Sort()), sharedTailLength); final int inputLines = info.lines; reader = new Sort.ByteSequencesReader(tempSorted); long line = 0; int previousBucket = 0; int previousScore = 0; ByteArrayDataInput input = new ByteArrayDataInput(); BytesRef tmp1 = new BytesRef(); BytesRef tmp2 = new BytesRef(); while (reader.read(tmp1)) { input.reset(tmp1.bytes); int currentScore = input.readInt(); int bucket; if (line > 0 && currentScore == previousScore) { bucket = previousBucket; } else { bucket = (int) (line * buckets / inputLines); } previousScore = currentScore; previousBucket = bucket; // Only append the input, discard the weight. tmp2.bytes = tmp1.bytes; tmp2.offset = input.getPosition(); tmp2.length = tmp1.length - input.getPosition(); builder.add(tmp2, bucket); line++; } // The two FSTCompletions share the same automaton. this.higherWeightsCompletion = builder.build(); this.normalCompletion = new FSTCompletion(higherWeightsCompletion.getFST(), false, exactMatchFirst); success = true; } finally { if (success) IOUtils.close(reader, writer, sorter); else IOUtils.closeWhileHandlingException(reader, writer, sorter); tempInput.delete(); tempSorted.delete(); } }