@Override
void newTerm(final int termID) {
  assert docState.testPoint("TermVectorsTermsWriterPerField.newTerm start");

  TermVectorsPostingsArray postings = (TermVectorsPostingsArray) termsHashPerField.postingsArray;
  postings.freqs[termID] = 1;

  if (doVectorOffsets) {
    int startOffset = fieldState.offset + offsetAttribute.startOffset();
    int endOffset = fieldState.offset + offsetAttribute.endOffset();

    // First occurrence of this term: write absolute values into
    // stream 1 (offsets), and remember the end offset so later
    // occurrences can be delta-encoded against it.
    termsHashPerField.writeVInt(1, startOffset);
    termsHashPerField.writeVInt(1, endOffset - startOffset);
    postings.lastOffsets[termID] = endOffset;
  }

  if (doVectorPositions) {
    // Stream 0 holds positions; record the absolute first position.
    termsHashPerField.writeVInt(0, fieldState.position);
    postings.lastPositions[termID] = fieldState.position;
  }
}
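// Hedged reconstruction: the companion addTerm(...) override is not part of
// this excerpt, so the body below is a sketch of how repeat occurrences of a
// term would consume the lastOffsets/lastPositions state that newTerm records.
// Where newTerm writes absolute values, later occurrences write deltas:
@Override
void addTerm(final int termID) {
  assert docState.testPoint("TermVectorsTermsWriterPerField.addTerm start");

  TermVectorsPostingsArray postings = (TermVectorsPostingsArray) termsHashPerField.postingsArray;
  postings.freqs[termID]++;

  if (doVectorOffsets) {
    int startOffset = fieldState.offset + offsetAttribute.startOffset();
    int endOffset = fieldState.offset + offsetAttribute.endOffset();

    // Delta against the previous occurrence's end offset.
    termsHashPerField.writeVInt(1, startOffset - postings.lastOffsets[termID]);
    termsHashPerField.writeVInt(1, endOffset - startOffset);
    postings.lastOffsets[termID] = endOffset;
  }

  if (doVectorPositions) {
    // Delta against the previous occurrence's position.
    termsHashPerField.writeVInt(0, fieldState.position - postings.lastPositions[termID]);
    postings.lastPositions[termID] = fieldState.position;
  }
}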
@Override
boolean start(Fieldable[] fields, int count) {
  doVectors = false;
  doVectorPositions = false;
  doVectorOffsets = false;

  for (int i = 0; i < count; i++) {
    Fieldable field = fields[i];
    if (field.isIndexed() && field.isTermVectorStored()) {
      doVectors = true;
      doVectorPositions |= field.isStorePositionWithTermVector();
      doVectorOffsets |= field.isStoreOffsetWithTermVector();
    }
  }

  if (doVectors) {
    if (perThread.doc == null) {
      perThread.doc = termsWriter.getPerDoc();
      perThread.doc.docID = docState.docID;
      assert perThread.doc.numVectorFields == 0;
      assert 0 == perThread.doc.perDocTvf.length();
      assert 0 == perThread.doc.perDocTvf.getFilePointer();
    }

    assert perThread.doc.docID == docState.docID;

    if (termsHashPerField.bytesHash.size() != 0) {
      // Only necessary if previous doc hit a
      // non-aborting exception while writing vectors in
      // this field:
      termsHashPerField.reset();
      perThread.termsHashPerThread.reset(false);
    }
  }

  // TODO: only if needed for performance
  // perThread.postingsCount = 0;

  return doVectors;
}
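// Illustration only (caller-side, not part of this class): for start(...)
// above to return true, the document must index the field with term vectors
// enabled. A hedged sketch using the 3.x org.apache.lucene.document.Field API:
//
//   Document doc = new Document();
//   doc.add(new Field("body", "the quick brown fox",
//                     Field.Store.NO, Field.Index.ANALYZED,
//                     Field.TermVector.WITH_POSITIONS_OFFSETS));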
/** Trims the term hash back down after flushing, sized by the largest field seen, and resets the high-water mark. */
void shrinkHash() {
  termsHashPerField.shrinkHash(maxNumPostings);
  maxNumPostings = 0;
}
/**
 * Called once per field per document if term vectors are enabled, to write the vectors to
 * RAMOutputStream, which is then quickly flushed to the real term vectors files in the Directory.
 */
@Override
void finish() throws IOException {
  assert docState.testPoint("TermVectorsTermsWriterPerField.finish start");

  final int numPostings = termsHashPerField.bytesHash.size();
  final BytesRef flushTerm = perThread.flushTerm;

  assert numPostings >= 0;

  if (!doVectors || numPostings == 0)
    return;

  if (numPostings > maxNumPostings)
    maxNumPostings = numPostings;

  final IndexOutput tvf = perThread.doc.perDocTvf;

  // This is called once, after inverting all occurrences
  // of a given field in the doc.  At this point we flush
  // our hash into the DocWriter.

  assert fieldInfo.storeTermVector;
  assert perThread.vectorFieldsInOrder(fieldInfo);

  perThread.doc.addField(termsHashPerField.fieldInfo.number);
  TermVectorsPostingsArray postings = (TermVectorsPostingsArray) termsHashPerField.postingsArray;

  // TODO: we may want to make this sort in same order
  // as Codec's terms dict?
  final int[] termIDs = termsHashPerField.sortPostings(BytesRef.getUTF8SortedAsUnicodeComparator());

  tvf.writeVInt(numPostings);
  byte bits = 0x0;
  if (doVectorPositions)
    bits |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
  if (doVectorOffsets)
    bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
  tvf.writeByte(bits);

  int lastLen = 0;
  byte[] lastBytes = null;
  int lastStart = 0;

  final ByteSliceReader reader = perThread.vectorSliceReader;
  final ByteBlockPool termBytePool = perThread.termsHashPerThread.termBytePool;

  for (int j = 0; j < numPostings; j++) {
    final int termID = termIDs[j];
    final int freq = postings.freqs[termID];

    // Get BytesRef
    termBytePool.setBytesRef(flushTerm, postings.textStarts[termID]);

    // Compute common byte prefix between last term and
    // this term
    int prefix = 0;
    if (j > 0) {
      while (prefix < lastLen && prefix < flushTerm.length) {
        if (lastBytes[lastStart + prefix] != flushTerm.bytes[flushTerm.offset + prefix]) {
          break;
        }
        prefix++;
      }
    }

    lastLen = flushTerm.length;
    lastBytes = flushTerm.bytes;
    lastStart = flushTerm.offset;

    final int suffix = flushTerm.length - prefix;
    tvf.writeVInt(prefix);
    tvf.writeVInt(suffix);
    tvf.writeBytes(flushTerm.bytes, lastStart + prefix, suffix);
    tvf.writeVInt(freq);

    if (doVectorPositions) {
      termsHashPerField.initReader(reader, termID, 0);
      reader.writeTo(tvf);
    }

    if (doVectorOffsets) {
      termsHashPerField.initReader(reader, termID, 1);
      reader.writeTo(tvf);
    }
  }

  termsHashPerField.reset();

  // NOTE: we clear, per-field, at the thread level,
  // because term vectors fully write themselves on each
  // field; this saves RAM (eg if large doc has two large
  // fields w/ term vectors on) because we recycle/reuse
  // all RAM after each field:
  perThread.termsHashPerThread.reset(false);
}
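// For reference, a hedged sketch of the matching read side of the prefix
// compression written above (the real logic lives in TermVectorsReader;
// variable names here are illustrative). Each term is rebuilt by reusing the
// shared prefix of the previously decoded term, then reading the new suffix
// bytes from an IndexInput:
//
//   byte[] lastTermBytes = new byte[0];
//   for (int i = 0; i < numTerms; i++) {
//     int prefix = in.readVInt();
//     int suffix = in.readVInt();
//     byte[] termBytes = new byte[prefix + suffix];
//     System.arraycopy(lastTermBytes, 0, termBytes, 0, prefix);
//     in.readBytes(termBytes, prefix, suffix);
//     int freq = in.readVInt();
//     // ... then freq positions and/or offsets, depending on the bits byte
//     lastTermBytes = termBytes;
//   }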