/* Does initial decode of next block of terms; this doesn't actually decode the docFreq, totalTermFreq, postings details (frq/prx offset, etc.) metadata; it just loads them as byte[] blobs which are then decoded on-demand if the metadata is ever requested for any term in this block. This enables terms-only intensive consumes (eg certain MTQs, respelling) to not pay the price of decoding metadata they won't use. */ private boolean nextBlock() throws IOException { // TODO: we still lazy-decode the byte[] for each // term (the suffix), but, if we decoded // all N terms up front then seeking could do a fast // bsearch w/in the block... // System.out.println("BTR.nextBlock() fp=" + in.getFilePointer() + " this=" + this); state.blockFilePointer = in.getFilePointer(); blockTermCount = in.readVInt(); // System.out.println(" blockTermCount=" + blockTermCount); if (blockTermCount == 0) { return false; } termBlockPrefix = in.readVInt(); // term suffixes: int len = in.readVInt(); if (termSuffixes.length < len) { termSuffixes = new byte[ArrayUtil.oversize(len, 1)]; } // System.out.println(" termSuffixes len=" + len); in.readBytes(termSuffixes, 0, len); termSuffixesReader.reset(termSuffixes, 0, len); // docFreq, totalTermFreq len = in.readVInt(); if (docFreqBytes.length < len) { docFreqBytes = new byte[ArrayUtil.oversize(len, 1)]; } // System.out.println(" freq bytes len=" + len); in.readBytes(docFreqBytes, 0, len); freqReader.reset(docFreqBytes, 0, len); // metadata len = in.readVInt(); if (bytes == null) { bytes = new byte[ArrayUtil.oversize(len, 1)]; bytesReader = new ByteArrayDataInput(); } else if (bytes.length < len) { bytes = new byte[ArrayUtil.oversize(len, 1)]; } in.readBytes(bytes, 0, len); bytesReader.reset(bytes, 0, len); metaDataUpto = 0; state.termBlockOrd = 0; indexIsCurrent = false; // System.out.println(" indexIsCurrent=" + indexIsCurrent); return true; }
@Override protected long decode(BytesRef scratch, ByteArrayDataInput tmpInput) { tmpInput.reset(scratch.bytes); tmpInput.skipBytes(scratch.length - 4); // suggestion + separator scratch.length -= 5; // sep + long return tmpInput.readInt(); }
// Interleaves all output tokens onto the futureOutputs: private void addOutput(BytesRef bytes, int matchInputLength, int matchEndOffset) { bytesReader.reset(bytes.bytes, bytes.offset, bytes.length); final int code = bytesReader.readVInt(); final boolean keepOrig = (code & 0x1) == 0; final int count = code >>> 1; // System.out.println(" addOutput count=" + count + " keepOrig=" + keepOrig); for (int outputIDX = 0; outputIDX < count; outputIDX++) { synonyms.words.get(bytesReader.readVInt(), scratchBytes); // System.out.println(" outIDX=" + outputIDX + " bytes=" + scratchBytes.length); scratchChars.copyUTF8Bytes(scratchBytes); int lastStart = 0; final int chEnd = lastStart + scratchChars.length(); int outputUpto = nextRead; for (int chIDX = lastStart; chIDX <= chEnd; chIDX++) { if (chIDX == chEnd || scratchChars.charAt(chIDX) == SynonymMap.WORD_SEPARATOR) { final int outputLen = chIDX - lastStart; // Caller is not allowed to have empty string in // the output: assert outputLen > 0 : "output contains empty string: " + scratchChars; final int endOffset; final int posLen; if (chIDX == chEnd && lastStart == 0) { // This rule had a single output token, so, we set // this output's endOffset to the current // endOffset (ie, endOffset of the last input // token it matched): endOffset = matchEndOffset; posLen = keepOrig ? matchInputLength : 1; } else { // This rule has more than one output token; we // can't pick any particular endOffset for this // case, so, we inherit the endOffset for the // input token which this output overlaps: endOffset = -1; posLen = 1; } futureOutputs[outputUpto].add( scratchChars.chars(), lastStart, outputLen, endOffset, posLen); // System.out.println(" " + new String(scratchChars.chars, lastStart, outputLen) + " // outputUpto=" + outputUpto); lastStart = 1 + chIDX; // System.out.println(" slot=" + outputUpto + " keepOrig=" + keepOrig); outputUpto = rollIncr(outputUpto); assert futureOutputs[outputUpto].posIncr == 1 : "outputUpto=" + outputUpto + " vs nextWrite=" + nextWrite; } } } int upto = nextRead; for (int idx = 0; idx < matchInputLength; idx++) { futureInputs[upto].keepOrig |= keepOrig; futureInputs[upto].matched = true; upto = rollIncr(upto); } }
/** decodes the payload at the current position */ protected BytesRef decodePayload(BytesRef scratch, ByteArrayDataInput tmpInput) { tmpInput.reset(scratch.bytes); tmpInput.skipBytes(scratch.length - 2); // skip to payload size short payloadLength = tmpInput.readShort(); // read payload size tmpInput.setPosition(scratch.length - 2 - payloadLength); // setPosition to start of payload BytesRef payloadScratch = new BytesRef(payloadLength); tmpInput.readBytes(payloadScratch.bytes, 0, payloadLength); // read payload payloadScratch.length = payloadLength; scratch.length -= 2; // payload length info (short) scratch.length -= payloadLength; // payload return payloadScratch; }
@Override public void setDocument(int docId) { bytes = values.get(docId); in.reset(bytes.bytes, bytes.offset, bytes.length); if (!in.eof()) { // first value uses vLong on top of zig-zag encoding, then deltas are encoded using vLong long previousValue = longs[0] = ByteUtils.zigZagDecode(ByteUtils.readVLong(in)); count = 1; while (!in.eof()) { longs = ArrayUtil.grow(longs, count + 1); previousValue = longs[count++] = previousValue + ByteUtils.readVLong(in); } } else { count = 0; } }
// Pushes a frame we seek'd to IDVersionSegmentTermsEnumFrame pushFrame( FST.Arc<Pair<BytesRef, Long>> arc, Pair<BytesRef, Long> frameData, int length) throws IOException { scratchReader.reset( frameData.output1.bytes, frameData.output1.offset, frameData.output1.length); final long code = scratchReader.readVLong(); final long fpSeek = code >>> VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS; final IDVersionSegmentTermsEnumFrame f = getFrame(1 + currentFrame.ord); f.maxIDVersion = Long.MAX_VALUE - frameData.output2; f.hasTerms = (code & VersionBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS) != 0; f.hasTermsOrig = f.hasTerms; f.isFloor = (code & VersionBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR) != 0; if (f.isFloor) { f.setFloorData(scratchReader, frameData.output1); } pushFrame(arc, fpSeek, length); return f; }
/** decodes the contexts at the current position */ protected Set<BytesRef> decodeContexts(BytesRef scratch, ByteArrayDataInput tmpInput) { tmpInput.reset(scratch.bytes); tmpInput.skipBytes(scratch.length - 2); // skip to context set size short ctxSetSize = tmpInput.readShort(); scratch.length -= 2; final Set<BytesRef> contextSet = new HashSet<>(); for (short i = 0; i < ctxSetSize; i++) { tmpInput.setPosition(scratch.length - 2); short curContextLength = tmpInput.readShort(); scratch.length -= 2; tmpInput.setPosition(scratch.length - curContextLength); BytesRef contextSpare = new BytesRef(curContextLength); tmpInput.readBytes(contextSpare.bytes, 0, curContextLength); contextSpare.length = curContextLength; contextSet.add(contextSpare); scratch.length -= curContextLength; } return contextSet; }
/** decodes the weight at the current position */ protected long decode(BytesRef scratch, ByteArrayDataInput tmpInput) { tmpInput.reset(scratch.bytes); tmpInput.skipBytes(scratch.length - 8); // suggestion scratch.length -= 8; // long return tmpInput.readLong(); }
@Override public void build(TermFreqIterator tfit) throws IOException { if (tfit instanceof TermFreqPayloadIterator) { throw new IllegalArgumentException("this suggester doesn't support payloads"); } File tempInput = File.createTempFile( FSTCompletionLookup.class.getSimpleName(), ".input", Sort.defaultTempDir()); File tempSorted = File.createTempFile( FSTCompletionLookup.class.getSimpleName(), ".sorted", Sort.defaultTempDir()); Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput); Sort.ByteSequencesReader reader = null; ExternalRefSorter sorter = null; // Push floats up front before sequences to sort them. For now, assume they are non-negative. // If negative floats are allowed some trickery needs to be done to find their byte order. boolean success = false; try { byte[] buffer = new byte[0]; ByteArrayDataOutput output = new ByteArrayDataOutput(buffer); BytesRef spare; while ((spare = tfit.next()) != null) { if (spare.length + 4 >= buffer.length) { buffer = ArrayUtil.grow(buffer, spare.length + 4); } output.reset(buffer); output.writeInt(encodeWeight(tfit.weight())); output.writeBytes(spare.bytes, spare.offset, spare.length); writer.write(buffer, 0, output.getPosition()); } writer.close(); // We don't know the distribution of scores and we need to bucket them, so we'll sort // and divide into equal buckets. SortInfo info = new Sort().sort(tempInput, tempSorted); tempInput.delete(); FSTCompletionBuilder builder = new FSTCompletionBuilder( buckets, sorter = new ExternalRefSorter(new Sort()), sharedTailLength); final int inputLines = info.lines; reader = new Sort.ByteSequencesReader(tempSorted); long line = 0; int previousBucket = 0; int previousScore = 0; ByteArrayDataInput input = new ByteArrayDataInput(); BytesRef tmp1 = new BytesRef(); BytesRef tmp2 = new BytesRef(); while (reader.read(tmp1)) { input.reset(tmp1.bytes); int currentScore = input.readInt(); int bucket; if (line > 0 && currentScore == previousScore) { bucket = previousBucket; } else { bucket = (int) (line * buckets / inputLines); } previousScore = currentScore; previousBucket = bucket; // Only append the input, discard the weight. tmp2.bytes = tmp1.bytes; tmp2.offset = input.getPosition(); tmp2.length = tmp1.length - input.getPosition(); builder.add(tmp2, bucket); line++; } // The two FSTCompletions share the same automaton. this.higherWeightsCompletion = builder.build(); this.normalCompletion = new FSTCompletion(higherWeightsCompletion.getFST(), false, exactMatchFirst); success = true; } finally { if (success) IOUtils.close(reader, writer, sorter); else IOUtils.closeWhileHandlingException(reader, writer, sorter); tempInput.delete(); tempSorted.delete(); } }