@Override public List<LookupResult> lookup(CharSequence key, boolean onlyMorePopular, int num) { assert num > 0; BytesRef scratch = new BytesRef(key); int prefixLength = scratch.length; Arc<Long> arc = new Arc<Long>(); // match the prefix portion exactly Long prefixOutput = null; try { prefixOutput = lookupPrefix(scratch, arc); } catch (IOException bogus) { throw new RuntimeException(bogus); } if (prefixOutput == null) { return Collections.<LookupResult>emptyList(); } List<LookupResult> results = new ArrayList<LookupResult>(num); CharsRef spare = new CharsRef(); if (exactFirst && arc.isFinal()) { spare.grow(scratch.length); UnicodeUtil.UTF8toUTF16(scratch, spare); results.add( new LookupResult(spare.toString(), decodeWeight(prefixOutput + arc.nextFinalOutput))); if (--num == 0) { return results; // that was quick } } // complete top-N MinResult<Long> completions[] = null; try { completions = Util.shortestPaths(fst, arc, weightComparator, num); } catch (IOException bogus) { throw new RuntimeException(bogus); } BytesRef suffix = new BytesRef(8); for (MinResult<Long> completion : completions) { scratch.length = prefixLength; // append suffix Util.toBytesRef(completion.input, suffix); scratch.append(suffix); spare.grow(scratch.length); UnicodeUtil.UTF8toUTF16(scratch, spare); results.add( new LookupResult(spare.toString(), decodeWeight(prefixOutput + completion.output))); } return results; }
@Override public void seekExact(long ord) throws IOException { // TODO: would be better to make this simpler and faster. // but we dont want to introduce a bug that corrupts our enum state! bytesReader.setPosition(0); fst.getFirstArc(firstArc); IntsRef output = Util.getByOutput(fst, ord, bytesReader, firstArc, scratchArc, scratchInts); BytesRefBuilder scratchBytes = new BytesRefBuilder(); scratchBytes.clear(); Util.toBytesRef(output, scratchBytes); // TODO: we could do this lazily, better to try to push into FSTEnum though? in.seekExact(scratchBytes.get()); }
// TODO: maybe we could add bulk-add method to // Builder? Takes FST and unions it w/ current // FST. private void append(Builder<BytesRef> builder, FST<BytesRef> subIndex) throws IOException { final BytesRefFSTEnum<BytesRef> subIndexEnum = new BytesRefFSTEnum<BytesRef>(subIndex); BytesRefFSTEnum.InputOutput<BytesRef> indexEnt; while ((indexEnt = subIndexEnum.next()) != null) { // if (DEBUG) { // System.out.println(" add sub=" + indexEnt.input + " " + indexEnt.input + " output=" // + indexEnt.output); // } builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output); } }
@Override public void finishTerm(BytesRef text, TermStats stats) throws IOException { assert stats.docFreq > 0; // if (DEBUG) System.out.println("BTTW.finishTerm term=" + fieldInfo.name + ":" + // toString(text) + " seg=" + segment + " df=" + stats.docFreq); blockBuilder.add(Util.toIntsRef(text, scratchIntsRef), noOutputs.getNoOutput()); pending.add(new PendingTerm(BytesRef.deepCopyOf(text), stats)); postingsWriter.finishTerm(stats); numTerms++; }
/** * Load frame for target arc(node) on fst, so that arc.label >= label and * !fsa.reject(arc.label) */ Frame loadCeilFrame(int label, Frame top, Frame frame) throws IOException { FST.Arc<FSTTermOutputs.TermData> arc = frame.fstArc; arc = Util.readCeilArc(label, fst, top.fstArc, arc, fstReader); if (arc == null) { return null; } frame.fsaState = fsa.step(top.fsaState, arc.label); // if (TEST) System.out.println(" loadCeil frame="+frame); if (frame.fsaState == -1) { return loadNextFrame(top, frame); } return frame; }
@Override public void add(BytesRef text, TermStats stats, long termsFilePointer) throws IOException { if (text.length == 0) { // We already added empty string in ctor assert termsFilePointer == startTermsFilePointer; return; } final int lengthSave = text.length; text.length = indexedTermPrefixLength(lastTerm, text); try { fstBuilder.add(Util.toIntsRef(text, scratchIntsRef), termsFilePointer); } finally { text.length = lengthSave; } lastTerm.copyBytes(text); }
@Override public void build(TermFreqIterator iterator) throws IOException { BytesRef scratch = new BytesRef(); TermFreqIterator iter = new WFSTTermFreqIteratorWrapper(iterator, BytesRef.getUTF8SortedAsUnicodeComparator()); IntsRef scratchInts = new IntsRef(); BytesRef previous = null; PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs); while ((scratch = iter.next()) != null) { long cost = iter.weight(); if (previous == null) { previous = new BytesRef(); } else if (scratch.equals(previous)) { continue; // for duplicate suggestions, the best weight is actually // added } Util.toIntsRef(scratch, scratchInts); builder.add(scratchInts, cost); previous.copyBytes(scratch); } fst = builder.finish(); }
private void loadTerms() throws IOException { PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false); final Builder<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>> b; final PairOutputs<Long, Long> outputsInner = new PairOutputs<Long, Long>(posIntOutputs, posIntOutputs); final PairOutputs<Long, PairOutputs.Pair<Long, Long>> outputs = new PairOutputs<Long, PairOutputs.Pair<Long, Long>>(posIntOutputs, outputsInner); b = new Builder<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>>( FST.INPUT_TYPE.BYTE1, outputs); IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone(); in.seek(termsStart); final BytesRef lastTerm = new BytesRef(10); long lastDocsStart = -1; int docFreq = 0; long totalTermFreq = 0; OpenBitSet visitedDocs = new OpenBitSet(); final IntsRef scratchIntsRef = new IntsRef(); while (true) { SimpleTextUtil.readLine(in, scratch); if (scratch.equals(END) || StringHelper.startsWith(scratch, FIELD)) { if (lastDocsStart != -1) { b.add( Util.toIntsRef(lastTerm, scratchIntsRef), outputs.newPair( lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq))); sumTotalTermFreq += totalTermFreq; } break; } else if (StringHelper.startsWith(scratch, DOC)) { docFreq++; sumDocFreq++; UnicodeUtil.UTF8toUTF16( scratch.bytes, scratch.offset + DOC.length, scratch.length - DOC.length, scratchUTF16); int docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length); visitedDocs.set(docID); } else if (StringHelper.startsWith(scratch, FREQ)) { UnicodeUtil.UTF8toUTF16( scratch.bytes, scratch.offset + FREQ.length, scratch.length - FREQ.length, scratchUTF16); totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length); } else if (StringHelper.startsWith(scratch, TERM)) { if (lastDocsStart != -1) { b.add( Util.toIntsRef(lastTerm, scratchIntsRef), outputs.newPair( lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq))); } lastDocsStart = in.getFilePointer(); final int len = scratch.length - 
TERM.length; if (len > lastTerm.length) { lastTerm.grow(len); } System.arraycopy(scratch.bytes, TERM.length, lastTerm.bytes, 0, len); lastTerm.length = len; docFreq = 0; sumTotalTermFreq += totalTermFreq; totalTermFreq = 0; termCount++; } } docCount = (int) visitedDocs.cardinality(); fst = b.finish(); /* PrintStream ps = new PrintStream("out.dot"); fst.toDot(ps); ps.close(); System.out.println("SAVED out.dot"); */ // System.out.println("FST " + fst.sizeInBytes()); }
/**
 * Debug helper: prints one line per frame from ordinal 0 up to and including {@code
 * currentFrame}, and cross-checks each frame against the terms index when one is present.
 *
 * <p>Frames are labelled "seek" until a frame whose prefix length equals {@code
 * validIndexPrefix} has been printed, and "next" afterwards. A frame with {@code nextEnt !=
 * -1} is reported as loaded and additionally shows its entry position and {@code lastSubFP}.
 *
 * @param out destination for the dump
 * @throws IOException if the index lookup fails
 * @throws RuntimeException with message "seek state is broken" if a frame disagrees with the
 *     index
 */
@SuppressWarnings("unused")
private void printSeekState(PrintStream out) throws IOException {
  if (currentFrame == staticFrame) {
    out.println(" no prior seek");
    return;
  }

  out.println(" prior seek state:");
  boolean isSeekFrame = true;
  for (int ord = 0; ; ord++) {
    IDVersionSegmentTermsEnumFrame f = getFrame(ord);
    assert f != null;
    final BytesRef prefix = new BytesRef(term.bytes(), 0, f.prefix);

    // The loaded and unloaded variants differ only in the fragments computed here.
    final boolean loaded = f.nextEnt != -1;
    final String kind;
    if (loaded) {
      kind = isSeekFrame ? "(seek, loaded)" : "(next, loaded)";
    } else {
      kind = isSeekFrame ? "(seek)" : "(next)";
    }
    final String floorInfo = f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "";
    final String entInfo =
        loaded ? (" nextEnt=" + f.nextEnt + (" (of " + f.entCount + ")")) : "";
    final String subFpInfo = loaded ? (" lastSubFP=" + f.lastSubFP) : "";

    out.println(
        " frame "
            + kind
            + " ord="
            + ord
            + " fp="
            + f.fp
            + floorInfo
            + " prefixLen="
            + f.prefix
            + " prefix="
            + brToString(prefix)
            + entInfo
            + " hasTerms="
            + f.hasTerms
            + " isFloor="
            + f.isFloor
            + " code="
            + ((f.fp << VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS)
                + (f.hasTerms ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS : 0)
                + (f.isFloor ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR : 0))
            + subFpInfo
            + " isLastInFloor="
            + f.isLastInFloor
            + " mdUpto="
            + f.metaDataUpto
            + " tbOrd="
            + f.getTermBlockOrd());

    if (fr.index != null) {
      assert !isSeekFrame || f.arc != null : "isSeekFrame=" + isSeekFrame + " f.arc=" + f.arc;

      // A seek frame's arc label must equal the last byte of its prefix.
      if (f.prefix > 0
          && isSeekFrame
          && f.arc.label != (term.byteAt(f.prefix - 1) & 0xFF)) {
        out.println(
            " broken seek state: arc.label="
                + (char) f.arc.label
                + " vs term byte="
                + (char) (term.byteAt(f.prefix - 1) & 0xFF));
        throw new RuntimeException("seek state is broken");
      }

      Pair<BytesRef, Long> output = Util.get(fr.index, prefix);
      if (output == null) {
        out.println(" broken seek state: prefix is not final in index");
        throw new RuntimeException("seek state is broken");
      }
      if (isSeekFrame && !f.isFloor) {
        // Compare the code stored in the index with the one derived from the frame.
        final ByteArrayDataInput reader =
            new ByteArrayDataInput(
                output.output1.bytes, output.output1.offset, output.output1.length);
        final long codeOrig = reader.readVLong();
        final long code =
            (f.fp << VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS)
                | (f.hasTerms ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS : 0)
                | (f.isFloor ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR : 0);
        if (codeOrig != code) {
          out.println(
              " broken seek state: output code="
                  + codeOrig
                  + " doesn't match frame code="
                  + code);
          throw new RuntimeException("seek state is broken");
        }
      }
    }

    if (f == currentFrame) {
      break;
    }
    if (f.prefix == validIndexPrefix) {
      isSeekFrame = false;
    }
  }
}
/** Builds an {@link SynonymMap} and returns it. */ public SynonymMap build() throws IOException { ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); // TODO: are we using the best sharing options? org.apache.lucene.util.fst.Builder<BytesRef> builder = new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE4, outputs); BytesRefBuilder scratch = new BytesRefBuilder(); ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput(); final Set<Integer> dedupSet; if (dedup) { dedupSet = new HashSet<>(); } else { dedupSet = null; } final byte[] spare = new byte[5]; Set<CharsRef> keys = workingSet.keySet(); CharsRef sortedKeys[] = keys.toArray(new CharsRef[keys.size()]); Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator()); final IntsRefBuilder scratchIntsRef = new IntsRefBuilder(); // System.out.println("fmap.build"); for (int keyIdx = 0; keyIdx < sortedKeys.length; keyIdx++) { CharsRef input = sortedKeys[keyIdx]; MapEntry output = workingSet.get(input); int numEntries = output.ords.size(); // output size, assume the worst case int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry scratch.grow(estimatedSize); scratchOutput.reset(scratch.bytes()); // now write our output data: int count = 0; for (int i = 0; i < numEntries; i++) { if (dedupSet != null) { // box once final Integer ent = output.ords.get(i); if (dedupSet.contains(ent)) { continue; } dedupSet.add(ent); } scratchOutput.writeVInt(output.ords.get(i)); count++; } final int pos = scratchOutput.getPosition(); scratchOutput.writeVInt(count << 1 | (output.includeOrig ? 
0 : 1)); final int pos2 = scratchOutput.getPosition(); final int vIntLen = pos2 - pos; // Move the count + includeOrig to the front of the byte[]: System.arraycopy(scratch.bytes(), pos, spare, 0, vIntLen); System.arraycopy(scratch.bytes(), 0, scratch.bytes(), vIntLen, pos); System.arraycopy(spare, 0, scratch.bytes(), 0, vIntLen); if (dedupSet != null) { dedupSet.clear(); } scratch.setLength(scratchOutput.getPosition()); // System.out.println(" add input=" + input + " output=" + scratch + " offset=" + // scratch.offset + " length=" + scratch.length + " count=" + count); builder.add(Util.toUTF32(input, scratchIntsRef), scratch.toBytesRef()); } FST<BytesRef> fst = builder.finish(); return new SynonymMap(fst, words, maxHorizontalContext); }
public void compileIndex(List<PendingBlock> floorBlocks, RAMOutputStream scratchBytes) throws IOException { assert (isFloor && floorBlocks != null && floorBlocks.size() != 0) || (!isFloor && floorBlocks == null) : "isFloor=" + isFloor + " floorBlocks=" + floorBlocks; assert scratchBytes.getFilePointer() == 0; // TODO: try writing the leading vLong in MSB order // (opposite of what Lucene does today), for better // outputs sharing in the FST scratchBytes.writeVLong(encodeOutput(fp, hasTerms, isFloor)); if (isFloor) { scratchBytes.writeVInt(floorBlocks.size()); for (PendingBlock sub : floorBlocks) { assert sub.floorLeadByte != -1; // if (DEBUG) { // System.out.println(" write floorLeadByte=" + // Integer.toHexString(sub.floorLeadByte&0xff)); // } scratchBytes.writeByte((byte) sub.floorLeadByte); assert sub.fp > fp; scratchBytes.writeVLong((sub.fp - fp) << 1 | (sub.hasTerms ? 1 : 0)); } } final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); final Builder<BytesRef> indexBuilder = new Builder<BytesRef>( FST.INPUT_TYPE.BYTE1, 0, 0, true, false, Integer.MAX_VALUE, outputs, null, false, PackedInts.COMPACT, true, 15); // if (DEBUG) { // System.out.println(" compile index for prefix=" + prefix); // } // indexBuilder.DEBUG = false; final byte[] bytes = new byte[(int) scratchBytes.getFilePointer()]; assert bytes.length > 0; scratchBytes.writeTo(bytes, 0); indexBuilder.add( Util.toIntsRef(prefix, scratchIntsRef), new BytesRef(bytes, 0, bytes.length)); scratchBytes.reset(); // Copy over index for all sub-blocks if (subIndices != null) { for (FST<BytesRef> subIndex : subIndices) { append(indexBuilder, subIndex); } } if (floorBlocks != null) { for (PendingBlock sub : floorBlocks) { if (sub.subIndices != null) { for (FST<BytesRef> subIndex : sub.subIndices) { append(indexBuilder, subIndex); } } sub.subIndices = null; } } index = indexBuilder.finish(); subIndices = null; /* Writer w = new OutputStreamWriter(new FileOutputStream("out.dot")); Util.toDot(index, w, 
false, false); System.out.println("SAVED to out.dot"); w.close(); */ }