/**
 * Writes the {@code count} pending entries (terms and sub-blocks) that share the
 * {@code prefixLength}-byte prefix of {@code prevTerm} out as one or more on-disk blocks,
 * then replaces them on the {@code pending} stack with a single {@link PendingBlock}.
 *
 * <p>If everything fits in one block ({@code count <= maxItemsInBlock}) a single
 * non-floor block is written. Otherwise the entries are greedily segmented into a
 * primary block plus "floor" blocks, keyed by the first suffix byte after the prefix.
 *
 * <p>Side effects: temporarily overwrites {@code prevTerm.ints[prevTerm.offset + prefixLength]}
 * (restored before returning), grows the shared scratch arrays {@code subBytes},
 * {@code subTermCounts}, {@code subSubCounts}, {@code subTermCountSums}, and updates
 * {@code lastBlockIndex}.
 *
 * @param prevTerm     term whose leading bytes supply the shared prefix; mutated in
 *                     place during floor segmentation but restored on exit
 * @param prefixLength number of leading ints of {@code prevTerm} shared by all entries
 * @param count        how many entries at the top of {@code pending} belong to this prefix
 * @throws IOException if writing a block to the terms output fails
 */
void writeBlocks(IntsRef prevTerm, int prefixLength, int count) throws IOException {
  if (prefixLength == 0 || count <= maxItemsInBlock) {
    // Easy case: not floor block.  Eg, prefix is "foo",
    // and we found 30 terms/sub-blocks starting w/ that
    // prefix, and minItemsInBlock <= 30 <=
    // maxItemsInBlock.
    final PendingBlock nonFloorBlock = writeBlock(prevTerm, prefixLength, prefixLength, count, count, 0, false, -1, true);
    nonFloorBlock.compileIndex(null, scratchBytes);
    pending.add(nonFloorBlock);
  } else {
    // Floor block case.  Eg, prefix is "foo" but we
    // have 100 terms/sub-blocks starting w/ that
    // prefix.  We segment the entries into a primary
    // block and following floor blocks using the first
    // label in the suffix to assign to floor blocks.

    // TODO: we could store min & max suffix start byte
    // in each block, to make floor blocks authoritative

    // Save the label currently at the split position; we overwrite it below when
    // writing floor blocks and must restore it before returning.
    final int savLabel = prevTerm.ints[prevTerm.offset + prefixLength];

    // Count up how many items fall under
    // each unique label after the prefix.

    // TODO: this is wasteful since the builder had
    // already done this (partitioned these sub-terms
    // according to their leading prefix byte)

    final List<PendingEntry> slice = pending.subList(pending.size() - count, pending.size());
    int lastSuffixLeadLabel = -1;
    int termCount = 0;   // terms seen under the current lead label
    int subCount = 0;    // sub-blocks seen under the current lead label
    int numSubs = 0;     // number of distinct lead labels recorded so far

    for (PendingEntry ent : slice) {
      // First byte in the suffix of this term
      final int suffixLeadLabel;
      if (ent.isTerm) {
        PendingTerm term = (PendingTerm) ent;
        if (term.term.length == prefixLength) {
          // Suffix is 0, ie prefix 'foo' and term is
          // 'foo' so the term has empty string suffix
          // in this block
          assert lastSuffixLeadLabel == -1;
          assert numSubs == 0;
          suffixLeadLabel = -1;
        } else {
          suffixLeadLabel = term.term.bytes[term.term.offset + prefixLength] & 0xff;
        }
      } else {
        PendingBlock block = (PendingBlock) ent;
        assert block.prefix.length > prefixLength;
        suffixLeadLabel = block.prefix.bytes[block.prefix.offset + prefixLength] & 0xff;
      }

      if (suffixLeadLabel != lastSuffixLeadLabel && (termCount + subCount) != 0) {
        // Lead label changed: flush the counts accumulated for the previous label.
        if (subBytes.length == numSubs) {
          subBytes = ArrayUtil.grow(subBytes);
          subTermCounts = ArrayUtil.grow(subTermCounts);
          subSubCounts = ArrayUtil.grow(subSubCounts);
        }
        subBytes[numSubs] = lastSuffixLeadLabel;
        lastSuffixLeadLabel = suffixLeadLabel;
        subTermCounts[numSubs] = termCount;
        subSubCounts[numSubs] = subCount;
        termCount = subCount = 0;
        numSubs++;
      }

      if (ent.isTerm) {
        termCount++;
      } else {
        subCount++;
      }
    }

    // Flush the counts for the final lead label.
    if (subBytes.length == numSubs) {
      subBytes = ArrayUtil.grow(subBytes);
      subTermCounts = ArrayUtil.grow(subTermCounts);
      subSubCounts = ArrayUtil.grow(subSubCounts);
    }

    subBytes[numSubs] = lastSuffixLeadLabel;
    subTermCounts[numSubs] = termCount;
    subSubCounts[numSubs] = subCount;
    numSubs++;

    if (subTermCountSums.length < numSubs) {
      subTermCountSums = ArrayUtil.grow(subTermCountSums, numSubs);
    }

    // Roll up (backwards) the termCounts; postings impl
    // needs this to know where to pull the term slice
    // from its pending terms stack:
    int sum = 0;
    for (int idx = numSubs - 1; idx >= 0; idx--) {
      sum += subTermCounts[idx];
      subTermCountSums[idx] = sum;
    }

    // TODO: make a better segmenter?  It'd have to
    // absorb the too-small end blocks backwards into
    // the previous blocks

    // Naive greedy segmentation; this is not always
    // best (it can produce a too-small block as the
    // last block):
    int pendingCount = 0;
    int startLabel = subBytes[0];
    int curStart = count;
    subCount = 0;

    final List<PendingBlock> floorBlocks = new ArrayList<PendingBlock>();
    PendingBlock firstBlock = null;

    for (int sub = 0; sub < numSubs; sub++) {
      pendingCount += subTermCounts[sub] + subSubCounts[sub];
      subCount++;

      // Greedily make a floor block as soon as we've
      // crossed the min count
      if (pendingCount >= minItemsInBlock) {
        final int curPrefixLength;
        if (startLabel == -1) {
          // Empty-suffix entry: the block keeps the bare prefix.
          curPrefixLength = prefixLength;
        } else {
          curPrefixLength = 1 + prefixLength;
          // floor term:
          prevTerm.ints[prevTerm.offset + prefixLength] = startLabel;
        }
        final PendingBlock floorBlock = writeBlock(prevTerm, prefixLength, curPrefixLength, curStart, pendingCount, subTermCountSums[1 + sub], true, startLabel, curStart == pendingCount);
        if (firstBlock == null) {
          firstBlock = floorBlock;
        } else {
          floorBlocks.add(floorBlock);
        }
        curStart -= pendingCount;
        pendingCount = 0;

        assert minItemsInBlock == 1 || subCount > 1 : "minItemsInBlock=" + minItemsInBlock + " subCount=" + subCount + " sub=" + sub + " of " + numSubs + " subTermCount=" + subTermCountSums[sub] + " subSubCount=" + subSubCounts[sub] + " depth=" + prefixLength;
        subCount = 0;
        startLabel = subBytes[sub + 1];

        if (curStart == 0) {
          break;
        }

        if (curStart <= maxItemsInBlock) {
          // remainder is small enough to fit into a
          // block.  NOTE that this may be too small (<
          // minItemsInBlock); need a true segmenter
          // here
          assert startLabel != -1;
          assert firstBlock != null;
          prevTerm.ints[prevTerm.offset + prefixLength] = startLabel;
          floorBlocks.add(writeBlock(prevTerm, prefixLength, prefixLength + 1, curStart, curStart, 0, true, startLabel, true));
          break;
        }
      }
    }

    // Restore the label we clobbered during floor segmentation.
    prevTerm.ints[prevTerm.offset + prefixLength] = savLabel;

    assert firstBlock != null;
    firstBlock.compileIndex(floorBlocks, scratchBytes);

    pending.add(firstBlock);
  }
  lastBlockIndex = pending.size() - 1;
}
/**
 * Builds the FST terms index for this block, folding in the indices of all
 * sub-blocks and (when this is a floor block) of the trailing floor blocks.
 *
 * <p>The root output for this block's prefix is serialized through
 * {@code scratchBytes}: the encoded file pointer / hasTerms / isFloor vLong,
 * followed — for floor blocks — by the floor-block count and, per floor block,
 * its lead byte and its delta file pointer shifted left one with the
 * {@code hasTerms} bit in the low position.
 *
 * <p>Side effects: resets {@code scratchBytes}, stores the finished FST in
 * {@code index}, and nulls {@code subIndices} on this block and on every floor
 * block so the absorbed sub-indices can be garbage collected.
 *
 * @param floorBlocks  the floor blocks following this one; must be non-empty when
 *                     {@code isFloor}, and {@code null} otherwise (asserted)
 * @param scratchBytes reusable scratch output; must be empty on entry (asserted)
 * @throws IOException if building the FST fails
 */
public void compileIndex(List<PendingBlock> floorBlocks, RAMOutputStream scratchBytes) throws IOException {

  assert (isFloor && floorBlocks != null && floorBlocks.size() != 0) || (!isFloor && floorBlocks == null) : "isFloor=" + isFloor + " floorBlocks=" + floorBlocks;

  assert scratchBytes.getFilePointer() == 0;

  // TODO: try writing the leading vLong in MSB order
  // (opposite of what Lucene does today), for better
  // outputs sharing in the FST
  scratchBytes.writeVLong(encodeOutput(fp, hasTerms, isFloor));
  if (isFloor) {
    scratchBytes.writeVInt(floorBlocks.size());
    for (PendingBlock sub : floorBlocks) {
      assert sub.floorLeadByte != -1;
      scratchBytes.writeByte((byte) sub.floorLeadByte);
      assert sub.fp > fp;
      // Delta-encode the floor block's file pointer; low bit flags hasTerms.
      scratchBytes.writeVLong((sub.fp - fp) << 1 | (sub.hasTerms ? 1 : 0));
    }
  }

  final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
  final Builder<BytesRef> indexBuilder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, 0, 0, true, false, Integer.MAX_VALUE, outputs, null, false, PackedInts.COMPACT, true, 15);

  // Copy scratchBytes into a fresh array: the FST output must own its bytes
  // since scratchBytes is reset and reused below.
  final byte[] bytes = new byte[(int) scratchBytes.getFilePointer()];
  assert bytes.length > 0;
  scratchBytes.writeTo(bytes, 0);
  indexBuilder.add(Util.toIntsRef(prefix, scratchIntsRef), new BytesRef(bytes, 0, bytes.length));
  scratchBytes.reset();

  // Copy over index for all sub-blocks
  if (subIndices != null) {
    for (FST<BytesRef> subIndex : subIndices) {
      append(indexBuilder, subIndex);
    }
  }

  if (floorBlocks != null) {
    for (PendingBlock sub : floorBlocks) {
      if (sub.subIndices != null) {
        for (FST<BytesRef> subIndex : sub.subIndices) {
          append(indexBuilder, subIndex);
        }
      }
      // Release absorbed sub-indices for GC.
      sub.subIndices = null;
    }
  }

  index = indexBuilder.finish();
  subIndices = null;
}