@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
  PostingsWriterBase postingsWriter = new SepPostingsWriter(state, new MockIntFactory(blockSize));

  boolean success = false;
  TermsIndexWriterBase indexWriter;
  try {
    indexWriter = new FixedGapTermsIndexWriter(state);
    success = true;
  } finally {
    if (!success) {
      postingsWriter.close();
    }
  }

  success = false;
  try {
    FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter);
    success = true;
    return ret;
  } finally {
    if (!success) {
      try {
        postingsWriter.close();
      } finally {
        indexWriter.close();
      }
    }
  }
}
@Override
public void finishDoc() throws IOException {
  if (DEBUG) System.out.println("PW finishDoc");
  if (pendingCount == -1) {
    wrappedPostingsWriter.finishDoc();
  }
}
@Override
public void start(IndexOutput termsOut) throws IOException {
  this.termsOut = termsOut;
  CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT);
  termsOut.writeVInt(pending.length); // encode maxPositions in header
  wrappedPostingsWriter.start(termsOut);
}
// Currently, this instance is re-used across fields, so
// our parent calls setField whenever the field changes
@Override
public void setField(FieldInfo fieldInfo) {
  this.indexOptions = fieldInfo.getIndexOptions();
  if (DEBUG) System.out.println("PW field=" + fieldInfo.name + " indexOptions=" + indexOptions);
  storePayloads = fieldInfo.hasPayloads();
  wrappedPostingsWriter.setField(fieldInfo);
  // DEBUG = BlockTreeTermsWriter.DEBUG;
}
@Override
public void flushTermsBlock(int start, int count) throws IOException {
  if (DEBUG) {
    System.out.println("PW: flushTermsBlock start=" + start + " count=" + count
        + " pendingTerms.size()=" + pendingTerms.size());
  }

  int wrappedCount = 0;
  assert buffer.getFilePointer() == 0;
  assert start >= count;

  final int limit = pendingTerms.size() - start + count;

  for (int idx = pendingTerms.size() - start; idx < limit; idx++) {
    final PendingTerm term = pendingTerms.get(idx);
    if (term == null) {
      wrappedCount++;
    } else {
      buffer.writeVInt(term.bytes.length);
      buffer.writeBytes(term.bytes, 0, term.bytes.length);
    }
  }

  termsOut.writeVInt((int) buffer.getFilePointer());
  buffer.writeTo(termsOut);
  buffer.reset();

  // TODO: this could be somewhat costly since
  // pendingTerms.size() could be biggish?
  int futureWrappedCount = 0;
  final int limit2 = pendingTerms.size();
  for (int idx = limit; idx < limit2; idx++) {
    if (pendingTerms.get(idx) == null) {
      futureWrappedCount++;
    }
  }

  // Remove the terms we just wrote:
  pendingTerms.subList(pendingTerms.size() - start, limit).clear();

  if (DEBUG) {
    System.out.println("PW: len=" + buffer.getFilePointer() + " fp=" + termsOut.getFilePointer()
        + " futureWrappedCount=" + futureWrappedCount + " wrappedCount=" + wrappedCount);
  }

  // TODO: can we avoid calling this if all terms
  // were inlined...? Eg for a "primary key" field, the
  // wrapped codec is never invoked...
  wrappedPostingsWriter.flushTermsBlock(futureWrappedCount + wrappedCount, wrappedCount);
}
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
  PostingsWriterBase docsWriter = new Lucene40PostingsWriter(state);
  boolean success = false;
  try {
    FieldsConsumer ret = new AppendingTermsWriter(state, docsWriter,
        BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
    success = true;
    return ret;
  } finally {
    if (!success) {
      docsWriter.close();
    }
  }
}
@Override
public void startDoc(int docID, int termDocFreq) throws IOException {
  assert docID >= 0 : "got docID=" + docID;

  /*
  if (termID != -1) {
    if (docID == 0) {
      baseDocID = termID;
    } else if (baseDocID + docID != termID) {
      throw new RuntimeException("WRITE: baseDocID=" + baseDocID + " docID=" + docID + " termID=" + termID);
    }
  }
  */

  if (DEBUG) System.out.println("PW doc=" + docID);

  if (pendingCount == pending.length) {
    push();
    if (DEBUG) System.out.println("PW: wrapped.finishDoc");
    wrappedPostingsWriter.finishDoc();
  }

  if (pendingCount != -1) {
    assert pendingCount < pending.length;
    currentDoc = pending[pendingCount];
    currentDoc.docID = docID;
    if (indexOptions == IndexOptions.DOCS_ONLY) {
      pendingCount++;
    } else if (indexOptions == IndexOptions.DOCS_AND_FREQS) {
      pendingCount++;
      currentDoc.termFreq = termDocFreq;
    } else {
      currentDoc.termFreq = termDocFreq;
    }
  } else {
    // We've already seen too many docs for this term --
    // just forward to our fallback writer
    wrappedPostingsWriter.startDoc(docID, termDocFreq);
  }
}
@Override
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
  if (DEBUG) {
    System.out.println("PW pos=" + position + " payload="
        + (payload == null ? "null" : payload.length + " bytes"));
  }

  if (pendingCount == pending.length) {
    push();
  }

  if (pendingCount == -1) {
    // We've already seen too many docs for this term --
    // just forward to our fallback writer
    wrappedPostingsWriter.addPosition(position, payload, startOffset, endOffset);
  } else {
    // buffer up
    final Position pos = pending[pendingCount++];
    pos.pos = position;
    pos.startOffset = startOffset;
    pos.endOffset = endOffset;
    pos.docID = currentDoc.docID;
    if (payload != null && payload.length > 0) {
      if (pos.payload == null) {
        pos.payload = BytesRef.deepCopyOf(payload);
      } else {
        pos.payload.copyBytes(payload);
      }
    } else if (pos.payload != null) {
      pos.payload.length = 0;
    }
  }
}
// Pushes pending positions to the wrapped codec
private void push() throws IOException {
  if (DEBUG) System.out.println("PW now push @ " + pendingCount + " wrapped=" + wrappedPostingsWriter);
  assert pendingCount == pending.length;

  wrappedPostingsWriter.startTerm();

  // Flush all buffered docs
  if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
    Position doc = null;
    for (Position pos : pending) {
      if (doc == null) {
        doc = pos;
        if (DEBUG) System.out.println("PW: wrapped.startDoc docID=" + doc.docID + " tf=" + doc.termFreq);
        wrappedPostingsWriter.startDoc(doc.docID, doc.termFreq);
      } else if (doc.docID != pos.docID) {
        assert pos.docID > doc.docID;
        if (DEBUG) System.out.println("PW: wrapped.finishDoc");
        wrappedPostingsWriter.finishDoc();
        doc = pos;
        if (DEBUG) System.out.println("PW: wrapped.startDoc docID=" + doc.docID + " tf=" + doc.termFreq);
        wrappedPostingsWriter.startDoc(doc.docID, doc.termFreq);
      }
      if (DEBUG) System.out.println("PW: wrapped.addPos pos=" + pos.pos);
      wrappedPostingsWriter.addPosition(pos.pos, pos.payload, pos.startOffset, pos.endOffset);
    }
    // wrappedPostingsWriter.finishDoc();
  } else {
    for (Position doc : pending) {
      wrappedPostingsWriter.startDoc(doc.docID, indexOptions == IndexOptions.DOCS_ONLY ? 0 : doc.termFreq);
    }
  }
  pendingCount = -1;
}
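// Note (not in the original source): a summary of the buffering policy implemented by
// startDoc/addPosition/push above. pendingCount is the fill level of the pending[] buffer:
//
//   0 <= pendingCount < pending.length  -> postings for the current term are buffered and,
//                                          if the term stays small, inlined by finishTerm
//   pendingCount == pending.length      -> push() replays the buffer into wrappedPostingsWriter
//   pendingCount == -1                  -> the term overflowed; further startDoc/addPosition/
//                                          finishDoc calls are forwarded to the wrapped writer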
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
  // We pull this before the seed intentionally, because it's not consumed at runtime
  // (the skipInterval is written into the postings header)
  int skipInterval = _TestUtil.nextInt(seedRandom, 2, 10);

  if (LuceneTestCase.VERBOSE) {
    System.out.println("MockRandomCodec: skipInterval=" + skipInterval);
  }

  final long seed = seedRandom.nextLong();

  if (LuceneTestCase.VERBOSE) {
    System.out.println("MockRandomCodec: writing to seg=" + state.segmentName
        + " formatID=" + state.segmentSuffix + " seed=" + seed);
  }

  final String seedFileName = IndexFileNames.segmentFileName(state.segmentName, state.segmentSuffix, SEED_EXT);
  final IndexOutput out = state.directory.createOutput(seedFileName, state.context);
  try {
    out.writeLong(seed);
  } finally {
    out.close();
  }

  final Random random = new Random(seed);

  random.nextInt(); // consume a random for buffersize

  PostingsWriterBase postingsWriter;
  if (random.nextBoolean()) {
    postingsWriter = new SepPostingsWriter(state, new MockIntStreamFactory(random), skipInterval);
  } else {
    if (LuceneTestCase.VERBOSE) {
      System.out.println("MockRandomCodec: writing Standard postings");
    }
    postingsWriter = new Lucene40PostingsWriter(state, skipInterval);
  }

  if (random.nextBoolean()) {
    final int totTFCutoff = _TestUtil.nextInt(random, 1, 20);
    if (LuceneTestCase.VERBOSE) {
      System.out.println("MockRandomCodec: writing pulsing postings with totTFCutoff=" + totTFCutoff);
    }
    postingsWriter = new PulsingPostingsWriter(totTFCutoff, postingsWriter);
  }

  final FieldsConsumer fields;

  if (random.nextBoolean()) {
    // Use BlockTree terms dict
    if (LuceneTestCase.VERBOSE) {
      System.out.println("MockRandomCodec: writing BlockTree terms dict");
    }

    // TODO: would be nice to allow 1 but this is very
    // slow to write
    final int minTermsInBlock = _TestUtil.nextInt(random, 2, 100);
    final int maxTermsInBlock = Math.max(2, (minTermsInBlock - 1) * 2 + random.nextInt(100));

    boolean success = false;
    try {
      fields = new BlockTreeTermsWriter(state, postingsWriter, minTermsInBlock, maxTermsInBlock);
      success = true;
    } finally {
      if (!success) {
        postingsWriter.close();
      }
    }
  } else {
    if (LuceneTestCase.VERBOSE) {
      System.out.println("MockRandomCodec: writing Block terms dict");
    }

    boolean success = false;

    final TermsIndexWriterBase indexWriter;
    try {
      if (random.nextBoolean()) {
        state.termIndexInterval = _TestUtil.nextInt(random, 1, 100);
        if (LuceneTestCase.VERBOSE) {
          System.out.println("MockRandomCodec: fixed-gap terms index (tii=" + state.termIndexInterval + ")");
        }
        indexWriter = new FixedGapTermsIndexWriter(state);
      } else {
        final VariableGapTermsIndexWriter.IndexTermSelector selector;
        final int n2 = random.nextInt(3);
        if (n2 == 0) {
          final int tii = _TestUtil.nextInt(random, 1, 100);
          selector = new VariableGapTermsIndexWriter.EveryNTermSelector(tii);
          if (LuceneTestCase.VERBOSE) {
            System.out.println("MockRandomCodec: variable-gap terms index (tii=" + tii + ")");
          }
        } else if (n2 == 1) {
          final int docFreqThresh = _TestUtil.nextInt(random, 2, 100);
          final int tii = _TestUtil.nextInt(random, 1, 100);
          selector = new VariableGapTermsIndexWriter.EveryNOrDocFreqTermSelector(docFreqThresh, tii);
        } else {
          final long seed2 = random.nextLong();
          final int gap = _TestUtil.nextInt(random, 2, 40);
          if (LuceneTestCase.VERBOSE) {
            System.out.println("MockRandomCodec: random-gap terms index (max gap=" + gap + ")");
          }
          selector = new VariableGapTermsIndexWriter.IndexTermSelector() {
            final Random rand = new Random(seed2);

            @Override
            public boolean isIndexTerm(BytesRef term, TermStats stats) {
              return rand.nextInt(gap) == gap / 2;
            }

            @Override
            public void newField(FieldInfo fieldInfo) {
            }
          };
        }
        indexWriter = new VariableGapTermsIndexWriter(state, selector);
      }
      success = true;
    } finally {
      if (!success) {
        postingsWriter.close();
      }
    }

    success = false;
    try {
      fields = new BlockTermsWriter(indexWriter, state, postingsWriter);
      success = true;
    } finally {
      if (!success) {
        try {
          postingsWriter.close();
        } finally {
          indexWriter.close();
        }
      }
    }
  }

  return fields;
}
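// Illustration (not in the original source): a minimal sketch of how the consumer side could
// read back the seed that fieldsConsumer wrote above, so the same Random choices can be
// replayed when opening the segment. The helper name and its parameters are hypothetical;
// only IndexFileNames.segmentFileName, Directory.openInput and IndexInput.readLong are assumed
// from the usual Lucene store/index imports.
private static long readSeed(Directory dir, String segmentName, String segmentSuffix, IOContext context) throws IOException {
  final String seedFileName = IndexFileNames.segmentFileName(segmentName, segmentSuffix, SEED_EXT);
  final IndexInput in = dir.openInput(seedFileName, context);
  try {
    // the same long written by out.writeLong(seed) in fieldsConsumer
    return in.readLong();
  } finally {
    in.close();
  }
}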
@Override
public void close() throws IOException {
  wrappedPostingsWriter.close();
}
/** Called when we are done adding docs to this term */
@Override
public void finishTerm(TermStats stats) throws IOException {
  if (DEBUG) {
    System.out.println("PW finishTerm docCount=" + stats.docFreq + " pendingCount=" + pendingCount
        + " pendingTerms.size()=" + pendingTerms.size());
  }

  assert pendingCount > 0 || pendingCount == -1;

  if (pendingCount == -1) {
    wrappedPostingsWriter.finishTerm(stats);
    // Must add null entry to record terms that our
    // wrapped postings impl added
    pendingTerms.add(null);
  } else {
    // There were few enough total occurrences for this
    // term, so we fully inline our postings data into
    // terms dict, now:

    // TODO: it'd be better to share this encoding logic
    // in some inner codec that knows how to write a
    // single doc / single position, etc. This way if a
    // given codec wants to store other interesting
    // stuff, it could use this pulsing codec to do so

    if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
      int lastDocID = 0;
      int pendingIDX = 0;
      int lastPayloadLength = -1;
      int lastOffsetLength = -1;
      while (pendingIDX < pendingCount) {
        final Position doc = pending[pendingIDX];

        final int delta = doc.docID - lastDocID;
        lastDocID = doc.docID;

        if (DEBUG) System.out.println(" write doc=" + doc.docID + " freq=" + doc.termFreq);

        if (doc.termFreq == 1) {
          buffer.writeVInt((delta << 1) | 1);
        } else {
          buffer.writeVInt(delta << 1);
          buffer.writeVInt(doc.termFreq);
        }

        int lastPos = 0;
        int lastOffset = 0;
        for (int posIDX = 0; posIDX < doc.termFreq; posIDX++) {
          final Position pos = pending[pendingIDX++];
          assert pos.docID == doc.docID;
          final int posDelta = pos.pos - lastPos;
          lastPos = pos.pos;
          if (DEBUG) System.out.println(" write pos=" + pos.pos);
          final int payloadLength = pos.payload == null ? 0 : pos.payload.length;
          if (storePayloads) {
            if (payloadLength != lastPayloadLength) {
              buffer.writeVInt((posDelta << 1) | 1);
              buffer.writeVInt(payloadLength);
              lastPayloadLength = payloadLength;
            } else {
              buffer.writeVInt(posDelta << 1);
            }
          } else {
            buffer.writeVInt(posDelta);
          }

          if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
            // System.out.println("write=" + pos.startOffset + "," + pos.endOffset);
            int offsetDelta = pos.startOffset - lastOffset;
            int offsetLength = pos.endOffset - pos.startOffset;
            if (offsetLength != lastOffsetLength) {
              buffer.writeVInt(offsetDelta << 1 | 1);
              buffer.writeVInt(offsetLength);
            } else {
              buffer.writeVInt(offsetDelta << 1);
            }
            lastOffset = pos.startOffset;
            lastOffsetLength = offsetLength;
          }

          if (payloadLength > 0) {
            assert storePayloads;
            buffer.writeBytes(pos.payload.bytes, 0, pos.payload.length);
          }
        }
      }
    } else if (indexOptions == IndexOptions.DOCS_AND_FREQS) {
      int lastDocID = 0;
      for (int posIDX = 0; posIDX < pendingCount; posIDX++) {
        final Position doc = pending[posIDX];
        final int delta = doc.docID - lastDocID;
        assert doc.termFreq != 0;
        if (doc.termFreq == 1) {
          buffer.writeVInt((delta << 1) | 1);
        } else {
          buffer.writeVInt(delta << 1);
          buffer.writeVInt(doc.termFreq);
        }
        lastDocID = doc.docID;
      }
    } else if (indexOptions == IndexOptions.DOCS_ONLY) {
      int lastDocID = 0;
      for (int posIDX = 0; posIDX < pendingCount; posIDX++) {
        final Position doc = pending[posIDX];
        buffer.writeVInt(doc.docID - lastDocID);
        lastDocID = doc.docID;
      }
    }

    final byte[] bytes = new byte[(int) buffer.getFilePointer()];
    buffer.writeTo(bytes, 0);
    pendingTerms.add(new PendingTerm(bytes));
    buffer.reset();
  }

  pendingCount = 0;
}
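// Illustration (not in the original source): a self-contained sketch of how the inlined
// DOCS_AND_FREQS bytes produced by finishTerm above could be decoded. readVInt mirrors the
// standard Lucene VInt layout (7 data bits per byte, high bit set on continuation bytes);
// the class and method names are hypothetical and exist only to make the encoding concrete.
class InlinedPostingsExample {

  // Reads one variable-length int starting at pos[0], advancing pos[0] past it.
  static int readVInt(byte[] bytes, int[] pos) {
    byte b = bytes[pos[0]++];
    int value = b & 0x7F;
    for (int shift = 7; (b & 0x80) != 0; shift += 7) {
      b = bytes[pos[0]++];
      value |= (b & 0x7F) << shift;
    }
    return value;
  }

  // Doc deltas are left-shifted by one; the low bit is set when termFreq == 1, in which
  // case the explicit freq VInt is omitted (exactly the DOCS_AND_FREQS branch above).
  static void dumpDocsAndFreqs(byte[] bytes) {
    final int[] pos = {0};
    int docID = 0;
    while (pos[0] < bytes.length) {
      final int code = readVInt(bytes, pos);
      docID += code >>> 1;
      final int freq = (code & 1) != 0 ? 1 : readVInt(bytes, pos);
      System.out.println("doc=" + docID + " freq=" + freq);
    }
  }
}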