// TODO: maybe we could add bulk-add method to // Builder? Takes FST and unions it w/ current // FST. private void append(Builder<BytesRef> builder, FST<BytesRef> subIndex) throws IOException { final BytesRefFSTEnum<BytesRef> subIndexEnum = new BytesRefFSTEnum<BytesRef>(subIndex); BytesRefFSTEnum.InputOutput<BytesRef> indexEnt; while ((indexEnt = subIndexEnum.next()) != null) { // if (DEBUG) { // System.out.println(" add sub=" + indexEnt.input + " " + indexEnt.input + " output=" // + indexEnt.output); // } builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output); } }
@Override public void finishTerm(BytesRef text, TermStats stats) throws IOException { assert stats.docFreq > 0; // if (DEBUG) System.out.println("BTTW.finishTerm term=" + fieldInfo.name + ":" + // toString(text) + " seg=" + segment + " df=" + stats.docFreq); blockBuilder.add(Util.toIntsRef(text, scratchIntsRef), noOutputs.getNoOutput()); pending.add(new PendingTerm(BytesRef.deepCopyOf(text), stats)); postingsWriter.finishTerm(stats); numTerms++; }
@Override public void add(BytesRef text, TermStats stats, long termsFilePointer) throws IOException { if (text.length == 0) { // We already added empty string in ctor assert termsFilePointer == startTermsFilePointer; return; } final int lengthSave = text.length; text.length = indexedTermPrefixLength(lastTerm, text); try { fstBuilder.add(Util.toIntsRef(text, scratchIntsRef), termsFilePointer); } finally { text.length = lengthSave; } lastTerm.copyBytes(text); }
@Override public void build(TermFreqIterator iterator) throws IOException { BytesRef scratch = new BytesRef(); TermFreqIterator iter = new WFSTTermFreqIteratorWrapper(iterator, BytesRef.getUTF8SortedAsUnicodeComparator()); IntsRef scratchInts = new IntsRef(); BytesRef previous = null; PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs); while ((scratch = iter.next()) != null) { long cost = iter.weight(); if (previous == null) { previous = new BytesRef(); } else if (scratch.equals(previous)) { continue; // for duplicate suggestions, the best weight is actually // added } Util.toIntsRef(scratch, scratchInts); builder.add(scratchInts, cost); previous.copyBytes(scratch); } fst = builder.finish(); }
private void loadTerms() throws IOException { PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false); final Builder<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>> b; final PairOutputs<Long, Long> outputsInner = new PairOutputs<Long, Long>(posIntOutputs, posIntOutputs); final PairOutputs<Long, PairOutputs.Pair<Long, Long>> outputs = new PairOutputs<Long, PairOutputs.Pair<Long, Long>>(posIntOutputs, outputsInner); b = new Builder<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>>( FST.INPUT_TYPE.BYTE1, outputs); IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone(); in.seek(termsStart); final BytesRef lastTerm = new BytesRef(10); long lastDocsStart = -1; int docFreq = 0; long totalTermFreq = 0; OpenBitSet visitedDocs = new OpenBitSet(); final IntsRef scratchIntsRef = new IntsRef(); while (true) { SimpleTextUtil.readLine(in, scratch); if (scratch.equals(END) || StringHelper.startsWith(scratch, FIELD)) { if (lastDocsStart != -1) { b.add( Util.toIntsRef(lastTerm, scratchIntsRef), outputs.newPair( lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq))); sumTotalTermFreq += totalTermFreq; } break; } else if (StringHelper.startsWith(scratch, DOC)) { docFreq++; sumDocFreq++; UnicodeUtil.UTF8toUTF16( scratch.bytes, scratch.offset + DOC.length, scratch.length - DOC.length, scratchUTF16); int docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length); visitedDocs.set(docID); } else if (StringHelper.startsWith(scratch, FREQ)) { UnicodeUtil.UTF8toUTF16( scratch.bytes, scratch.offset + FREQ.length, scratch.length - FREQ.length, scratchUTF16); totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length); } else if (StringHelper.startsWith(scratch, TERM)) { if (lastDocsStart != -1) { b.add( Util.toIntsRef(lastTerm, scratchIntsRef), outputs.newPair( lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq))); } lastDocsStart = in.getFilePointer(); final int len = scratch.length - TERM.length; if (len > lastTerm.length) { lastTerm.grow(len); } System.arraycopy(scratch.bytes, TERM.length, lastTerm.bytes, 0, len); lastTerm.length = len; docFreq = 0; sumTotalTermFreq += totalTermFreq; totalTermFreq = 0; termCount++; } } docCount = (int) visitedDocs.cardinality(); fst = b.finish(); /* PrintStream ps = new PrintStream("out.dot"); fst.toDot(ps); ps.close(); System.out.println("SAVED out.dot"); */ // System.out.println("FST " + fst.sizeInBytes()); }
public void compileIndex(List<PendingBlock> floorBlocks, RAMOutputStream scratchBytes) throws IOException { assert (isFloor && floorBlocks != null && floorBlocks.size() != 0) || (!isFloor && floorBlocks == null) : "isFloor=" + isFloor + " floorBlocks=" + floorBlocks; assert scratchBytes.getFilePointer() == 0; // TODO: try writing the leading vLong in MSB order // (opposite of what Lucene does today), for better // outputs sharing in the FST scratchBytes.writeVLong(encodeOutput(fp, hasTerms, isFloor)); if (isFloor) { scratchBytes.writeVInt(floorBlocks.size()); for (PendingBlock sub : floorBlocks) { assert sub.floorLeadByte != -1; // if (DEBUG) { // System.out.println(" write floorLeadByte=" + // Integer.toHexString(sub.floorLeadByte&0xff)); // } scratchBytes.writeByte((byte) sub.floorLeadByte); assert sub.fp > fp; scratchBytes.writeVLong((sub.fp - fp) << 1 | (sub.hasTerms ? 1 : 0)); } } final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); final Builder<BytesRef> indexBuilder = new Builder<BytesRef>( FST.INPUT_TYPE.BYTE1, 0, 0, true, false, Integer.MAX_VALUE, outputs, null, false, PackedInts.COMPACT, true, 15); // if (DEBUG) { // System.out.println(" compile index for prefix=" + prefix); // } // indexBuilder.DEBUG = false; final byte[] bytes = new byte[(int) scratchBytes.getFilePointer()]; assert bytes.length > 0; scratchBytes.writeTo(bytes, 0); indexBuilder.add( Util.toIntsRef(prefix, scratchIntsRef), new BytesRef(bytes, 0, bytes.length)); scratchBytes.reset(); // Copy over index for all sub-blocks if (subIndices != null) { for (FST<BytesRef> subIndex : subIndices) { append(indexBuilder, subIndex); } } if (floorBlocks != null) { for (PendingBlock sub : floorBlocks) { if (sub.subIndices != null) { for (FST<BytesRef> subIndex : sub.subIndices) { append(indexBuilder, subIndex); } } sub.subIndices = null; } } index = indexBuilder.finish(); subIndices = null; /* Writer w = new OutputStreamWriter(new FileOutputStream("out.dot")); Util.toDot(index, w, false, false); System.out.println("SAVED to out.dot"); w.close(); */ }