/**
 * Handles the first occurrence of a term within the current field: sets its frequency to one
 * and, depending on which vector options are enabled, buffers the occurrence's offsets and/or
 * position.
 *
 * @param termID index of the term's slot in the postings arrays
 */
@Override
void newTerm(final int termID) {
  assert docState.testPoint("TermVectorsTermsWriterPerField.newTerm start");

  final TermVectorsPostingsArray termPostings =
      (TermVectorsPostingsArray) termsHashPerField.postingsArray;

  // First time this term is seen, so its frequency starts at one.
  termPostings.freqs[termID] = 1;

  if (doVectorOffsets) {
    // Token offsets are shifted by the field's running offset.
    final int start = fieldState.offset + offsetAttribute.startOffset();
    final int end = fieldState.offset + offsetAttribute.endOffset();
    final int length = end - start;

    // Stream 1 receives the start offset followed by the token length.
    termsHashPerField.writeVInt(1, start);
    termsHashPerField.writeVInt(1, length);
    termPostings.lastOffsets[termID] = end;
  }

  if (doVectorPositions) {
    final int position = fieldState.position;

    // Stream 0 receives the position; it is also remembered as the term's last position.
    termsHashPerField.writeVInt(0, position);
    termPostings.lastPositions[termID] = position;
  }
}
/** * Called once per field per document if term vectors are enabled, to write the vectors to * RAMOutputStream, which is then quickly flushed to the real term vectors files in the Directory. */ @Override void finish() throws IOException { assert docState.testPoint("TermVectorsTermsWriterPerField.finish start"); final int numPostings = termsHashPerField.bytesHash.size(); final BytesRef flushTerm = perThread.flushTerm; assert numPostings >= 0; if (!doVectors || numPostings == 0) return; if (numPostings > maxNumPostings) maxNumPostings = numPostings; final IndexOutput tvf = perThread.doc.perDocTvf; // This is called once, after inverting all occurrences // of a given field in the doc. At this point we flush // our hash into the DocWriter. assert fieldInfo.storeTermVector; assert perThread.vectorFieldsInOrder(fieldInfo); perThread.doc.addField(termsHashPerField.fieldInfo.number); TermVectorsPostingsArray postings = (TermVectorsPostingsArray) termsHashPerField.postingsArray; // TODO: we may want to make this sort in same order // as Codec's terms dict? 
final int[] termIDs = termsHashPerField.sortPostings(BytesRef.getUTF8SortedAsUnicodeComparator()); tvf.writeVInt(numPostings); byte bits = 0x0; if (doVectorPositions) bits |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR; if (doVectorOffsets) bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR; tvf.writeByte(bits); int lastLen = 0; byte[] lastBytes = null; int lastStart = 0; final ByteSliceReader reader = perThread.vectorSliceReader; final ByteBlockPool termBytePool = perThread.termsHashPerThread.termBytePool; for (int j = 0; j < numPostings; j++) { final int termID = termIDs[j]; final int freq = postings.freqs[termID]; // Get BytesRef termBytePool.setBytesRef(flushTerm, postings.textStarts[termID]); // Compute common byte prefix between last term and // this term int prefix = 0; if (j > 0) { while (prefix < lastLen && prefix < flushTerm.length) { if (lastBytes[lastStart + prefix] != flushTerm.bytes[flushTerm.offset + prefix]) { break; } prefix++; } } lastLen = flushTerm.length; lastBytes = flushTerm.bytes; lastStart = flushTerm.offset; final int suffix = flushTerm.length - prefix; tvf.writeVInt(prefix); tvf.writeVInt(suffix); tvf.writeBytes(flushTerm.bytes, lastStart + prefix, suffix); tvf.writeVInt(freq); if (doVectorPositions) { termsHashPerField.initReader(reader, termID, 0); reader.writeTo(tvf); } if (doVectorOffsets) { termsHashPerField.initReader(reader, termID, 1); reader.writeTo(tvf); } } termsHashPerField.reset(); // NOTE: we clear, per-field, at the thread level, // because term vectors fully write themselves on each // field; this saves RAM (eg if large doc has two large // fields w/ term vectors on) because we recycle/reuse // all RAM after each field: perThread.termsHashPerThread.reset(false); }