SepDocsEnum init(FieldInfo fieldInfo, SepTermState termState, Bits liveDocs) throws IOException {
  this.liveDocs = liveDocs;
  this.indexOptions = fieldInfo.getIndexOptions();
  omitTF = indexOptions == IndexOptions.DOCS_ONLY;
  storePayloads = fieldInfo.hasPayloads();

  // TODO: can't we only do this if consumer
  // skipped consuming the previous docs?
  docIndex.copyFrom(termState.docIndex);
  docIndex.seek(docReader);

  if (!omitTF) {
    freqIndex.copyFrom(termState.freqIndex);
    freqIndex.seek(freqReader);
  }

  docFreq = termState.docFreq;
  // NOTE: unused if docFreq < skipMinimum:
  skipFP = termState.skipFP;
  count = 0;
  doc = -1;
  accum = 0;
  freq = 1;
  skipped = false;

  return this;
}
private void readFields(IndexInput meta, FieldInfos infos) throws IOException {
  for (int fieldNumber = meta.readInt(); fieldNumber != -1; fieldNumber = meta.readInt()) {
    FieldInfo info = infos.fieldInfo(fieldNumber);
    if (info == null) {
      throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta);
    } else if (!info.hasNorms()) {
      throw new CorruptIndexException("Invalid field: " + info.name, meta);
    }
    NormsEntry entry = new NormsEntry();
    entry.docsWithFieldOffset = meta.readLong();
    entry.numDocsWithField = meta.readInt();
    entry.bytesPerNorm = meta.readByte();
    switch (entry.bytesPerNorm) {
      case 0:
      case 1:
      case 2:
      case 4:
      case 8:
        break;
      default:
        throw new CorruptIndexException(
            "Invalid bytesPerValue: " + entry.bytesPerNorm + ", field: " + info.name, meta);
    }
    entry.normsOffset = meta.readLong();
    norms.put(info.number, entry);
  }
}
public void testDocValuesUnstored() throws IOException {
  Directory dir = newDirectory();
  IndexWriterConfig iwconfig =
      newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
  iwconfig.setMergePolicy(newLogMergePolicy());
  IndexWriter writer = new IndexWriter(dir, iwconfig);
  for (int i = 0; i < 50; i++) {
    Document doc = new Document();
    doc.add(new NumericDocValuesField("dv", i));
    doc.add(new TextField("docId", "" + i, Field.Store.YES));
    writer.addDocument(doc);
  }
  DirectoryReader r = writer.getReader();
  SlowCompositeReaderWrapper slow = new SlowCompositeReaderWrapper(r);
  FieldInfos fi = slow.getFieldInfos();
  FieldInfo dvInfo = fi.fieldInfo("dv");
  assertTrue(dvInfo.hasDocValues());
  NumericDocValues dv = slow.getNumericDocValues("dv");
  for (int i = 0; i < 50; i++) {
    assertEquals(i, dv.get(i));
    StoredDocument d = slow.document(i);
    // cannot use d.get("dv") due to another bug!
    assertNull(d.getField("dv"));
    assertEquals(Integer.toString(i), d.get("docId"));
  }
  slow.close();
  writer.close();
  dir.close();
}
// Currently, this instance is re-used across fields, so
// our parent calls setField whenever the field changes
@Override
public void setField(FieldInfo fieldInfo) {
  this.indexOptions = fieldInfo.getIndexOptions();
  if (DEBUG) System.out.println("PW field=" + fieldInfo.name + " indexOptions=" + indexOptions);
  storePayloads = fieldInfo.hasPayloads();
  wrappedPostingsWriter.setField(fieldInfo);
  // DEBUG = BlockTreeTermsWriter.DEBUG;
}
public FieldInfo add(FieldInfo fi) {
  // IMPORTANT - reuse the field number if possible for consistent field numbers across segments
  return addOrUpdateInternal(
      fi.name,
      fi.number,
      fi.hasVectors(),
      fi.omitsNorms(),
      fi.hasPayloads(),
      fi.getIndexOptions(),
      fi.getDocValuesType());
}
private FieldInfo addOrUpdateInternal(
    String name,
    int preferredFieldNumber,
    boolean storeTermVector,
    boolean omitNorms,
    boolean storePayloads,
    IndexOptions indexOptions,
    DocValuesType docValues) {
  if (docValues == null) {
    throw new NullPointerException("DocValuesType cannot be null");
  }

  FieldInfo fi = fieldInfo(name);
  if (fi == null) {
    // This field wasn't yet added to this in-RAM
    // segment's FieldInfo, so now we get a global
    // number for this field. If the field was seen
    // before then we'll get the same name and number,
    // else we'll allocate a new one:
    final int fieldNumber = globalFieldNumbers.addOrGet(name, preferredFieldNumber, docValues);
    fi =
        new FieldInfo(
            name,
            fieldNumber,
            storeTermVector,
            omitNorms,
            storePayloads,
            indexOptions,
            docValues,
            -1,
            null);
    assert !byName.containsKey(fi.name);
    globalFieldNumbers.verifyConsistent(Integer.valueOf(fi.number), fi.name, fi.getDocValuesType());
    byName.put(fi.name, fi);
  } else {
    fi.update(storeTermVector, omitNorms, storePayloads, indexOptions);

    if (docValues != DocValuesType.NONE) {
      // Only pay the synchronization cost if fi does not already have a DVType
      boolean updateGlobal = fi.getDocValuesType() == DocValuesType.NONE;
      if (updateGlobal) {
        // Must also update docValuesType map so it's
        // aware of this field's DocValuesType. This will throw IllegalArgumentException if
        // an illegal type change was attempted.
        globalFieldNumbers.setDocValuesType(fi.number, name, docValues);
      }

      fi.setDocValuesType(docValues); // this will also perform the consistency check.
    }
  }
  return fi;
}
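// The comments above describe a "global field number" registry: the same field name
// must map to the same number across segments, and a preferred number is honored when
// it is still free. The following is a minimal, Lucene-free sketch of that idea for
// illustration only; the class name, fields, and addOrGet signature here are
// assumptions, not the actual FieldNumbers implementation.
import java.util.HashMap;
import java.util.Map;

public class GlobalFieldNumbersSketch {
  private final Map<String, Integer> nameToNumber = new HashMap<>();
  private final Map<Integer, String> numberToName = new HashMap<>();
  private int lowestUnassigned = 0;

  // preferredNumber mirrors preferredFieldNumber above (hypothetical simplification)
  synchronized int addOrGet(String name, int preferredNumber) {
    Integer existing = nameToNumber.get(name);
    if (existing != null) {
      return existing; // same name => same number, consistent across segments
    }
    int number;
    if (preferredNumber >= 0 && !numberToName.containsKey(preferredNumber)) {
      number = preferredNumber; // honor the caller's number if it is still free
    } else {
      while (numberToName.containsKey(lowestUnassigned)) {
        lowestUnassigned++;
      }
      number = lowestUnassigned; // otherwise allocate the lowest unused number
    }
    nameToNumber.put(name, number);
    numberToName.put(number, name);
    return number;
  }

  public static void main(String[] args) {
    GlobalFieldNumbersSketch g = new GlobalFieldNumbersSketch();
    System.out.println(g.addOrGet("title", -1)); // 0
    System.out.println(g.addOrGet("body", -1));  // 1
    System.out.println(g.addOrGet("title", -1)); // 0 again: reused for the next segment
  }
}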
@Override
public NumericDocValues getNormValues(String field) throws IOException {
  NumericDocValues dv = super.getNormValues(field);
  FieldInfo fi = getFieldInfos().fieldInfo(field);
  if (dv != null) {
    assert fi != null;
    assert fi.hasNorms();
    return new AssertingNumericDocValues(dv, maxDoc());
  } else {
    assert fi == null || fi.hasNorms() == false;
    return null;
  }
}
@Override
public SortedSetDocValues getSortedSetDocValues(String field) throws IOException {
  SortedSetDocValues dv = super.getSortedSetDocValues(field);
  FieldInfo fi = getFieldInfos().fieldInfo(field);
  if (dv != null) {
    assert fi != null;
    assert fi.getDocValuesType() == DocValuesType.SORTED_SET;
    return new AssertingSortedSetDocValues(dv, maxDoc());
  } else {
    assert fi == null || fi.getDocValuesType() != DocValuesType.SORTED_SET;
    return null;
  }
}
@Override
public BinaryDocValues getBinaryDocValues(String field) throws IOException {
  BinaryDocValues dv = super.getBinaryDocValues(field);
  FieldInfo fi = getFieldInfos().fieldInfo(field);
  if (dv != null) {
    assert fi != null;
    assert fi.getDocValuesType() == DocValuesType.BINARY;
    return new AssertingBinaryDocValues(dv, maxDoc());
  } else {
    assert fi == null || fi.getDocValuesType() != DocValuesType.BINARY;
    return null;
  }
}
@Override
public NumericDocValues getNumericDocValues(String field) throws IOException {
  NumericDocValues dv = super.getNumericDocValues(field);
  FieldInfo fi = getFieldInfos().fieldInfo(field);
  if (dv != null) {
    assert fi != null;
    assert fi.getDocValuesType() == DocValuesType.NUMERIC;
    return new AssertingNumericDocValues(dv, maxDoc());
  } else {
    assert fi == null || fi.getDocValuesType() != DocValuesType.NUMERIC;
    return null;
  }
}
@Override
public void writeField(FieldInfo fieldInfo, PointsReader reader) throws IOException {
  PointValues values = reader.getValues(fieldInfo.name);
  boolean singleValuePerDoc = values.size() == values.getDocCount();

  try (BKDWriter writer =
      new BKDWriter(
          writeState.segmentInfo.maxDoc(),
          writeState.directory,
          writeState.segmentInfo.name,
          fieldInfo.getPointDimensionCount(),
          fieldInfo.getPointNumBytes(),
          maxPointsInLeafNode,
          maxMBSortInHeap,
          values.size(),
          singleValuePerDoc)) {

    if (values instanceof MutablePointValues) {
      final long fp = writer.writeField(dataOut, fieldInfo.name, (MutablePointValues) values);
      if (fp != -1) {
        indexFPs.put(fieldInfo.name, fp);
      }
      return;
    }

    values.intersect(
        new IntersectVisitor() {
          @Override
          public void visit(int docID) {
            throw new IllegalStateException();
          }

          @Override
          public void visit(int docID, byte[] packedValue) throws IOException {
            writer.add(packedValue, docID);
          }

          @Override
          public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
            return Relation.CELL_CROSSES_QUERY;
          }
        });

    // We could have 0 points on merge since all docs with dimensional fields may be deleted:
    if (writer.getPointCount() > 0) {
      indexFPs.put(fieldInfo.name, writer.finish(dataOut));
    }
  }
}
SepDocsAndPositionsEnum init(FieldInfo fieldInfo, SepTermState termState, Bits liveDocs)
    throws IOException {
  this.liveDocs = liveDocs;
  storePayloads = fieldInfo.hasPayloads();
  // System.out.println("Sep D&P init");

  // TODO: can't we only do this if consumer
  // skipped consuming the previous docs?
  docIndex.copyFrom(termState.docIndex);
  docIndex.seek(docReader);
  // System.out.println(" docIndex=" + docIndex);

  freqIndex.copyFrom(termState.freqIndex);
  freqIndex.seek(freqReader);
  // System.out.println(" freqIndex=" + freqIndex);

  posIndex.copyFrom(termState.posIndex);
  // System.out.println(" posIndex=" + posIndex);
  posSeekPending = true;
  payloadPending = false;

  payloadFP = termState.payloadFP;
  skipFP = termState.skipFP;
  // System.out.println(" skipFP=" + skipFP);

  docFreq = termState.docFreq;
  count = 0;
  doc = -1;
  accum = 0;
  pendingPosCount = 0;
  pendingPayloadBytes = 0;
  skipped = false;

  return this;
}
@Override
public DocsAndPositionsEnum docsAndPositions(
    FieldInfo fieldInfo,
    BlockTermState _termState,
    Bits liveDocs,
    DocsAndPositionsEnum reuse,
    int flags)
    throws IOException {
  assert fieldInfo.getIndexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
  final SepTermState termState = (SepTermState) _termState;
  SepDocsAndPositionsEnum postingsEnum;
  if (reuse == null || !(reuse instanceof SepDocsAndPositionsEnum)) {
    postingsEnum = new SepDocsAndPositionsEnum();
  } else {
    postingsEnum = (SepDocsAndPositionsEnum) reuse;
    if (postingsEnum.startDocIn != docIn) {
      // If you are using ParallelReader, and pass in a
      // reused DocsAndPositionsEnum, it could have come
      // from another reader also using sep codec
      postingsEnum = new SepDocsAndPositionsEnum();
    }
  }
  return postingsEnum.init(fieldInfo, termState, liveDocs);
}
@Override
public boolean hasOffsets() {
  return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
      >= 0;
}
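// Several snippets here (hasOffsets above, finishDoc, the FieldInfos constructor) rely on
// the same idiom: the IndexOptions constants are declared from least to most detailed, so
// opts.compareTo(X) >= 0 reads as "this field indexes at least X". A small, hedged
// illustration follows; it assumes the org.apache.lucene.index.IndexOptions enum of
// Lucene 5+ and is not part of the surrounding reader/writer code.
import org.apache.lucene.index.IndexOptions;

public class IndexOptionsOrderingSketch {
  public static void main(String[] args) {
    IndexOptions opts = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
    // true: positions imply freqs are indexed
    System.out.println(opts.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0);
    // false: this field does not index offsets
    System.out.println(opts.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0);
  }
}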
@Override
public Bits getDocsWithField(FieldInfo field) throws IOException {
  if (field.getDocValuesType() == DocValuesType.SORTED_SET) {
    return DocValues.docsWithValue(getSortedSet(field), maxDoc);
  } else {
    return new Bits.MatchAllBits(maxDoc);
  }
}
InsaneReader(LeafReader in, String insaneField) {
  super(in);
  this.insaneField = insaneField;
  ArrayList<FieldInfo> filteredInfos = new ArrayList<>();
  for (FieldInfo fi : in.getFieldInfos()) {
    if (fi.name.equals(insaneField)) {
      filteredInfos.add(
          new FieldInfo(
              fi.name,
              fi.number,
              fi.hasVectors(),
              fi.omitsNorms(),
              fi.hasPayloads(),
              fi.getIndexOptions(),
              DocValuesType.NONE,
              -1,
              Collections.emptyMap(),
              fi.getPointDimensionCount(),
              fi.getPointNumBytes()));
    } else {
      filteredInfos.add(fi);
    }
  }
  fieldInfos = new FieldInfos(filteredInfos.toArray(new FieldInfo[filteredInfos.size()]));
}
public FSTTermsReader(SegmentReadState state, PostingsReaderBase postingsReader) throws IOException {
  final String termsFileName =
      IndexFileNames.segmentFileName(
          state.segmentInfo.name, state.segmentSuffix, FSTTermsWriter.TERMS_EXTENSION);

  this.postingsReader = postingsReader;
  final IndexInput in = state.directory.openInput(termsFileName, state.context);

  boolean success = false;
  try {
    version = readHeader(in);
    if (version >= FSTTermsWriter.TERMS_VERSION_CHECKSUM) {
      CodecUtil.checksumEntireFile(in);
    }
    this.postingsReader.init(in);
    seekDir(in);

    final FieldInfos fieldInfos = state.fieldInfos;
    final int numFields = in.readVInt();
    for (int i = 0; i < numFields; i++) {
      int fieldNumber = in.readVInt();
      FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber);
      long numTerms = in.readVLong();
      long sumTotalTermFreq =
          fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY ? -1 : in.readVLong();
      long sumDocFreq = in.readVLong();
      int docCount = in.readVInt();
      int longsSize = in.readVInt();
      TermsReader current =
          new TermsReader(fieldInfo, in, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize);
      TermsReader previous = fields.put(fieldInfo.name, current);
      checkFieldSummary(state.segmentInfo, in, current, previous);
    }
    success = true;
  } finally {
    if (success) {
      IOUtils.close(in);
    } else {
      IOUtils.closeWhileHandlingException(in);
    }
  }
}
void createDocumentNode(final DocumentDescriptor inDescriptor) throws IOException {
  try {
    _document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
    _rootNode = _document.createElement("document");
  } catch (ParserConfigurationException e) {
    e.printStackTrace();
    System.exit(1);
  }

  AtomicReader segmentReader = _indexReader.leaves().get(inDescriptor.segmentNumber).reader();

  _rootNode.setAttribute("id", DocumentIdOperations.documentDescriptorToId(inDescriptor));
  // TODO: implement the proper way of building a title from the production report
  _rootNode.setAttribute("title", buildDocumentTitle(segmentReader, inDescriptor));
  _rootNode.setAttribute("path", "ruscorpora.ru");
  _rootNode.setAttribute(
      "tagging", Attributes.getDocAttribute(segmentReader, inDescriptor.localId, "tagging"));
  _rootNode.setAttribute("snippets", "0");

  Element attributesNode = _document.createElement("attributes");
  _rootNode.appendChild(attributesNode);

  FieldInfos fields = segmentReader.getFieldInfos();
  for (int fieldIndex = 0; fieldIndex != fields.size(); ++fieldIndex) {
    FieldInfo field = fields.fieldInfo(fieldIndex);
    // TODO: understand why field may turn into null
    if (field == null) {
      continue;
    }
    String name = field.name;
    if (Attributes.ATTRIBUTES.contains(name)
        || Attributes.ATTRIBUTES_FOR_REPORT.contains(name)
        || Attributes.ATTRIBUTES_FOR_WORD_INFO.contains(name)
        || !field.hasDocValues()) {
      // it's a word attribute
      continue;
    }
    Element attrNode = _document.createElement("attr");
    attrNode.setAttribute("name", name);
    attrNode.setAttribute(
        "value", Attributes.getDocAttribute(segmentReader, inDescriptor.localId, name));
    attributesNode.appendChild(attrNode);
  }
}
@Override
public void finishDoc() throws IOException {
  assert state == PostingsConsumerState.START;
  state = PostingsConsumerState.INITIAL;
  if (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
    assert positionCount == 0; // we should not have fed any positions!
  } else {
    assert positionCount == freq;
  }
  in.finishDoc();
}
/** Constructs a new FieldInfos from an array of FieldInfo objects */
public FieldInfos(FieldInfo[] infos) {
  boolean hasVectors = false;
  boolean hasProx = false;
  boolean hasPayloads = false;
  boolean hasOffsets = false;
  boolean hasFreq = false;
  boolean hasNorms = false;
  boolean hasDocValues = false;

  for (FieldInfo info : infos) {
    if (info.number < 0) {
      throw new IllegalArgumentException(
          "illegal field number: " + info.number + " for field " + info.name);
    }
    FieldInfo previous = byNumber.put(info.number, info);
    if (previous != null) {
      throw new IllegalArgumentException(
          "duplicate field numbers: " + previous.name + " and " + info.name + " have: " + info.number);
    }
    previous = byName.put(info.name, info);
    if (previous != null) {
      throw new IllegalArgumentException(
          "duplicate field names: " + previous.number + " and " + info.number + " have: " + info.name);
    }

    hasVectors |= info.hasVectors();
    hasProx |= info.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
    hasFreq |= info.getIndexOptions() != IndexOptions.DOCS;
    hasOffsets |=
        info.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
    hasNorms |= info.hasNorms();
    hasDocValues |= info.getDocValuesType() != DocValuesType.NONE;
    hasPayloads |= info.hasPayloads();
  }

  this.hasVectors = hasVectors;
  this.hasProx = hasProx;
  this.hasPayloads = hasPayloads;
  this.hasOffsets = hasOffsets;
  this.hasFreq = hasFreq;
  this.hasNorms = hasNorms;
  this.hasDocValues = hasDocValues;
  this.values = Collections.unmodifiableCollection(byNumber.values());
}
@BeforeClass
public static void beforeClass() throws Exception {
  testDoc = new Document();
  fieldInfos = new FieldInfos.Builder();
  DocHelper.setupDoc(testDoc);
  for (IndexableField field : testDoc.getFields()) {
    FieldInfo fieldInfo = fieldInfos.getOrAdd(field.name());
    IndexableFieldType ift = field.fieldType();
    fieldInfo.setIndexOptions(ift.indexOptions());
    if (ift.omitNorms()) {
      fieldInfo.setOmitsNorms();
    }
    fieldInfo.setDocValuesType(ift.docValuesType());
  }
  dir = newDirectory();
  IndexWriterConfig conf =
      newIndexWriterConfig(new MockAnalyzer(random())).setMergePolicy(newLogMergePolicy());
  conf.getMergePolicy().setNoCFSRatio(0.0);
  IndexWriter writer = new IndexWriter(dir, conf);
  writer.addDocument(testDoc);
  writer.close();
}
private FieldInfo addOrUpdateInternal(
    String name,
    int preferredFieldNumber,
    boolean isIndexed,
    boolean storeTermVector,
    boolean omitNorms,
    boolean storePayloads,
    IndexOptions indexOptions,
    DocValuesType docValues,
    DocValuesType normType) {
  FieldInfo fi = fieldInfo(name);
  if (fi == null) {
    // This field wasn't yet added to this in-RAM
    // segment's FieldInfo, so now we get a global
    // number for this field. If the field was seen
    // before then we'll get the same name and number,
    // else we'll allocate a new one:
    final int fieldNumber = globalFieldNumbers.addOrGet(name, preferredFieldNumber, docValues);
    fi =
        new FieldInfo(
            name,
            isIndexed,
            fieldNumber,
            storeTermVector,
            omitNorms,
            storePayloads,
            indexOptions,
            docValues,
            normType,
            null);
    assert !byName.containsKey(fi.name);
    assert globalFieldNumbers.containsConsistent(
        Integer.valueOf(fi.number), fi.name, fi.getDocValuesType());
    byName.put(fi.name, fi);
  } else {
    fi.update(isIndexed, storeTermVector, omitNorms, storePayloads, indexOptions);

    if (docValues != null) {
      // only pay the synchronization cost if fi does not already have a DVType
      boolean updateGlobal = !fi.hasDocValues();
      fi.setDocValuesType(docValues); // this will also perform the consistency check.
      if (updateGlobal) {
        // must also update docValuesType map so it's
        // aware of this field's DocValueType
        globalFieldNumbers.setDocValuesType(fi.number, name, docValues);
      }
    }

    if (!fi.omitsNorms() && normType != null) {
      fi.setNormValueType(normType);
    }
  }
  return fi;
}
@Override
public void nextTerm(FieldInfo fieldInfo, BlockTermState _termState) throws IOException {
  final SepTermState termState = (SepTermState) _termState;
  final boolean isFirstTerm = termState.termBlockOrd == 0;
  // System.out.println("SEPR.nextTerm termCount=" + termState.termBlockOrd + " isFirstTerm=" +
  // isFirstTerm + " bytesReader.pos=" + termState.bytesReader.getPosition());
  // System.out.println(" docFreq=" + termState.docFreq);
  termState.docIndex.read(termState.bytesReader, isFirstTerm);
  // System.out.println(" docIndex=" + termState.docIndex);
  if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
    termState.freqIndex.read(termState.bytesReader, isFirstTerm);
    if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
      // System.out.println(" freqIndex=" + termState.freqIndex);
      termState.posIndex.read(termState.bytesReader, isFirstTerm);
      // System.out.println(" posIndex=" + termState.posIndex);
      if (fieldInfo.hasPayloads()) {
        if (isFirstTerm) {
          termState.payloadFP = termState.bytesReader.readVLong();
        } else {
          termState.payloadFP += termState.bytesReader.readVLong();
        }
        // System.out.println(" payloadFP=" + termState.payloadFP);
      }
    }
  }

  if (termState.docFreq >= skipMinimum) {
    // System.out.println(" readSkip @ " + termState.bytesReader.getPosition());
    if (isFirstTerm) {
      termState.skipFP = termState.bytesReader.readVLong();
    } else {
      termState.skipFP += termState.bytesReader.readVLong();
    }
    // System.out.println(" skipFP=" + termState.skipFP);
  } else if (isFirstTerm) {
    termState.skipFP = 0;
  }
}
@Override
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset)
    throws IOException {
  assert state == PostingsConsumerState.START;
  assert positionCount < freq;
  positionCount++;
  assert position >= lastPosition || position == -1; /* we still allow -1 from old 3.x indexes */
  lastPosition = position;
  if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) {
    assert startOffset >= 0;
    assert startOffset >= lastStartOffset;
    lastStartOffset = startOffset;
    assert endOffset >= startOffset;
  } else {
    assert startOffset == -1;
    assert endOffset == -1;
  }
  if (payload != null) {
    assert fieldInfo.hasPayloads();
  }
  in.addPosition(position, payload, startOffset, endOffset);
}
@Override
public void finishTerm(BytesRef text, TermStats stats) throws IOException {
  assert state == TermsConsumerState.START;
  state = TermsConsumerState.INITIAL;
  assert text.equals(lastTerm);
  assert stats.docFreq > 0; // otherwise, this method should not be called.
  assert stats.docFreq == lastPostingsConsumer.docFreq;
  sumDocFreq += stats.docFreq;
  if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY) {
    assert stats.totalTermFreq == -1;
  } else {
    assert stats.totalTermFreq == lastPostingsConsumer.totalTermFreq;
    sumTotalTermFreq += stats.totalTermFreq;
  }
  in.finishTerm(text, stats);
}
@Override
public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
  assert state == TermsConsumerState.INITIAL
      || state == TermsConsumerState.START && lastPostingsConsumer.docFreq == 0;
  state = TermsConsumerState.FINISHED;
  assert docCount >= 0;
  assert docCount == visitedDocs.cardinality();
  assert sumDocFreq >= docCount;
  assert sumDocFreq == this.sumDocFreq;
  if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY) {
    assert sumTotalTermFreq == -1;
  } else {
    assert sumTotalTermFreq >= sumDocFreq;
    assert sumTotalTermFreq == this.sumTotalTermFreq;
  }
  in.finish(sumTotalTermFreq, sumDocFreq, docCount);
}
// Finishes all terms in this field
@Override
public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
  if (numTerms > 0) {
    blockBuilder.finish();

    // We better have one final "root" block:
    assert pending.size() == 1 && !pending.get(0).isTerm
        : "pending.size()=" + pending.size() + " pending=" + pending;
    final PendingBlock root = (PendingBlock) pending.get(0);
    assert root.prefix.length == 0;
    assert root.index.getEmptyOutput() != null;

    this.sumTotalTermFreq = sumTotalTermFreq;
    this.sumDocFreq = sumDocFreq;
    this.docCount = docCount;

    // Write FST to index
    indexStartFP = indexOut.getFilePointer();
    root.index.save(indexOut);
    // System.out.println("  write FST " + indexStartFP + " field=" + fieldInfo.name);

    // if (SAVE_DOT_FILES || DEBUG) {
    //   final String dotFileName = segment + "_" + fieldInfo.name + ".dot";
    //   Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName));
    //   Util.toDot(root.index, w, false, false);
    //   System.out.println("SAVED to " + dotFileName);
    //   w.close();
    // }

    fields.add(
        new FieldMetaData(
            fieldInfo,
            ((PendingBlock) pending.get(0)).index.getEmptyOutput(),
            numTerms,
            indexStartFP,
            sumTotalTermFreq,
            sumDocFreq,
            docCount));
  } else {
    assert sumTotalTermFreq == 0
        || fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY && sumTotalTermFreq == -1;
    assert sumDocFreq == 0;
    assert docCount == 0;
  }
}
@Override
public void startDoc(int docID, int freq) throws IOException {
  assert state == PostingsConsumerState.INITIAL;
  state = PostingsConsumerState.START;
  assert docID >= 0;
  if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY) {
    assert freq == -1;
    this.freq = 0; // we don't expect any positions here
  } else {
    assert freq > 0;
    this.freq = freq;
    totalTermFreq += freq;
  }
  this.positionCount = 0;
  this.lastPosition = 0;
  this.lastStartOffset = 0;
  docFreq++;
  visitedDocs.set(docID);
  in.startDoc(docID, freq);
}
/** Produce _X.nrm if any document had a field with norms not disabled */
@Override
public void flush(Map<String, InvertedDocEndConsumerPerField> fieldsToFlush, SegmentWriteState state)
    throws IOException {
  boolean success = false;
  boolean anythingFlushed = false;
  try {
    if (state.fieldInfos.hasNorms()) {
      for (FieldInfo fi : state.fieldInfos) {
        final NormsConsumerPerField toWrite = (NormsConsumerPerField) fieldsToFlush.get(fi.name);
        // we must check the final value of omitNorms for the fieldinfo, it could have
        // changed for this field since the first time we added it.
        if (!fi.omitsNorms()) {
          if (toWrite != null && toWrite.initialized()) {
            anythingFlushed = true;
            final Type type = toWrite.flush(state.segmentInfo.getDocCount());
            assert fi.getNormType() == type;
          } else if (fi.isIndexed()) {
            anythingFlushed = true;
            assert fi.getNormType() == null : "got " + fi.getNormType() + "; field=" + fi.name;
          }
        }
      }
    }
    success = true;
    if (!anythingFlushed && consumer != null) {
      consumer.abort();
    }
  } finally {
    if (success) {
      IOUtils.close(consumer);
    } else {
      IOUtils.closeWhileHandlingException(consumer);
    }
  }
}
/** Call this only once (if you subclass!) */
protected void uninvert(final LeafReader reader, Bits liveDocs, final BytesRef termPrefix)
    throws IOException {
  final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
  if (checkForDocValues && info != null && info.getDocValuesType() != DocValuesType.NONE) {
    throw new IllegalStateException(
        "Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
  }
  // System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix);
  final long startTime = System.nanoTime();
  prefix = termPrefix == null ? null : BytesRef.deepCopyOf(termPrefix);

  final int maxDoc = reader.maxDoc();
  final int[] index = new int[maxDoc]; // immediate term numbers, or the index into the byte[] representing the last number
  final int[] lastTerm = new int[maxDoc]; // last term we saw for this document
  final byte[][] bytes = new byte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts)

  final Terms terms = reader.terms(field);
  if (terms == null) {
    // No terms
    return;
  }

  final TermsEnum te = terms.iterator();
  final BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef();
  // System.out.println("seekStart=" + seekStart.utf8ToString());
  if (te.seekCeil(seekStart) == TermsEnum.SeekStatus.END) {
    // No terms match
    return;
  }

  // For our "term index wrapper"
  final List<BytesRef> indexedTerms = new ArrayList<>();
  final PagedBytes indexedTermsBytes = new PagedBytes(15);

  // we need a minimum of 9 bytes, but round up to 12 since the space would
  // be wasted with most allocators anyway.
  byte[] tempArr = new byte[12];

  //
  // enumerate all terms, and build an intermediate form of the un-inverted field.
  //
  // During this intermediate form, every document has a (potential) byte[]
  // and the int[maxDoc()] array either contains the termNumber list directly
  // or the *end* offset of the termNumber list in its byte array (for faster
  // appending and faster creation of the final form).
  //
  // idea... if things are too large while building, we could do a range of docs
  // at a time (but it would be a fair amount slower to build)
  // could also do ranges in parallel to take advantage of multiple CPUs
  //

  // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
  // values. This requires going over the field first to find the most
  // frequent terms ahead of time.
  int termNum = 0;
  postingsEnum = null;

  // Loop begins with te positioned to first term (we call
  // seek above):
  for (; ; ) {
    final BytesRef t = te.term();
    if (t == null || (termPrefix != null && !StringHelper.startsWith(t, termPrefix))) {
      break;
    }
    // System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum);

    visitTerm(te, termNum);

    if ((termNum & indexIntervalMask) == 0) {
      // Index this term
      sizeOfIndexedStrings += t.length;
      BytesRef indexedTerm = new BytesRef();
      indexedTermsBytes.copy(t, indexedTerm);
      // TODO: really should 1) strip off useless suffix,
      // and 2) use FST not array/PagedBytes
      indexedTerms.add(indexedTerm);
    }

    final int df = te.docFreq();
    if (df <= maxTermDocFreq) {

      postingsEnum = te.postings(postingsEnum, PostingsEnum.NONE);

      // dF, but takes deletions into account
      int actualDF = 0;

      for (; ; ) {
        int doc = postingsEnum.nextDoc();
        if (doc == DocIdSetIterator.NO_MORE_DOCS) {
          break;
        }
        // System.out.println(" chunk=" + chunk + " docs");
        actualDF++;
        termInstances++;

        // System.out.println(" docID=" + doc);
        // add TNUM_OFFSET to the term number to make room for special reserved values:
        // 0 (end term) and 1 (index into byte array follows)
        int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
        lastTerm[doc] = termNum;
        int val = index[doc];

        if ((val & 0xff) == 1) {
          // index into byte array (actually the end of
          // the doc-specific byte[] when building)
          int pos = val >>> 8;
          int ilen = vIntSize(delta);
          byte[] arr = bytes[doc];
          int newend = pos + ilen;
          if (newend > arr.length) {
            // We avoid a doubling strategy to lower memory usage.
            // this faceting method isn't for docs with many terms.
            // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit boundary.
            // TODO: figure out what array lengths we can round up to w/o actually using more memory
            // (how much space does a byte[] take up? Is data preceded by a 32 bit length only?
            // It should be safe to round up to the nearest 32 bits in any case.
            int newLen = (newend + 3) & 0xfffffffc; // 4 byte alignment
            byte[] newarr = new byte[newLen];
            System.arraycopy(arr, 0, newarr, 0, pos);
            arr = newarr;
            bytes[doc] = newarr;
          }
          pos = writeInt(delta, arr, pos);
          index[doc] = (pos << 8) | 1; // update pointer to end index in byte[]
        } else {
          // OK, this int has data in it... find the end (a zero starting byte - not
          // part of another number, hence not following a byte with the high bit set).
          int ipos;
          if (val == 0) {
            ipos = 0;
          } else if ((val & 0x0000ff80) == 0) {
            ipos = 1;
          } else if ((val & 0x00ff8000) == 0) {
            ipos = 2;
          } else if ((val & 0xff800000) == 0) {
            ipos = 3;
          } else {
            ipos = 4;
          }

          // System.out.println(" ipos=" + ipos);

          int endPos = writeInt(delta, tempArr, ipos);
          // System.out.println(" endpos=" + endPos);
          if (endPos <= 4) {
            // System.out.println(" fits!");
            // value will fit in the integer... move bytes back
            for (int j = ipos; j < endPos; j++) {
              val |= (tempArr[j] & 0xff) << (j << 3);
            }
            index[doc] = val;
          } else {
            // value won't fit... move integer into byte[]
            for (int j = 0; j < ipos; j++) {
              tempArr[j] = (byte) val;
              val >>>= 8;
            }
            // point at the end index in the byte[]
            index[doc] = (endPos << 8) | 1;
            bytes[doc] = tempArr;
            tempArr = new byte[12];
          }
        }
      }
      setActualDocFreq(termNum, actualDF);
    }

    termNum++;
    if (te.next() == null) {
      break;
    }
  }

  numTermsInField = termNum;

  long midPoint = System.nanoTime();

  if (termInstances == 0) {
    // we didn't invert anything
    // lower memory consumption.
    tnums = null;
  } else {
    this.index = index;

    //
    // transform intermediate form into the final form, building a single byte[]
    // at a time, and releasing the intermediate byte[]s as we go to avoid
    // increasing the memory footprint.
    //
    for (int pass = 0; pass < 256; pass++) {
      byte[] target = tnums[pass];
      int pos = 0; // end in target;
      if (target != null) {
        pos = target.length;
      } else {
        target = new byte[4096];
      }

      // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
      // where pp is the pass (which array we are building), and xx is all values.
      // each pass shares the same byte[] for termNumber lists.
      for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24)) {
        int lim = Math.min(docbase + (1 << 16), maxDoc);
        for (int doc = docbase; doc < lim; doc++) {
          // System.out.println(" pass=" + pass + " process docID=" + doc);
          int val = index[doc];
          if ((val & 0xff) == 1) {
            int len = val >>> 8;
            // System.out.println(" ptr pos=" + pos);
            index[doc] = (pos << 8) | 1; // change index to point to start of array
            if ((pos & 0xff000000) != 0) {
              // we only have 24 bits for the array index
              throw new IllegalStateException(
                  "Too many values for UnInvertedField faceting on field " + field);
            }
            byte[] arr = bytes[doc];
            /*
            for (byte b : arr) {
              // System.out.println(" b=" + Integer.toHexString((int) b));
            }
            */
            bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM
            if (target.length <= pos + len) {
              int newlen = target.length;
              /*
               * we don't have to worry about the array getting too large since the "pos" param
               * will overflow first (only 24 bits available)
               * if ((newlen << 1) <= 0) {
               *   // overflow...
               *   newlen = Integer.MAX_VALUE;
               *   if (newlen <= pos + len) {
               *     throw new SolrException(400, "Too many terms to uninvert field!");
               *   }
               * } else {
               *   while (newlen <= pos + len) newlen <<= 1; // doubling strategy
               * }
               */
              while (newlen <= pos + len) newlen <<= 1; // doubling strategy
              byte[] newtarget = new byte[newlen];
              System.arraycopy(target, 0, newtarget, 0, pos);
              target = newtarget;
            }
            System.arraycopy(arr, 0, target, pos, len);
            pos += len + 1; // skip single byte at end and leave it 0 for terminator
          }
        }
      }

      // shrink array
      if (pos < target.length) {
        byte[] newtarget = new byte[pos];
        System.arraycopy(target, 0, newtarget, 0, pos);
        target = newtarget;
      }

      tnums[pass] = target;

      if ((pass << 16) > maxDoc) break;
    }
  }
  indexedTermsArray = indexedTerms.toArray(new BytesRef[indexedTerms.size()]);

  long endTime = System.nanoTime();

  total_time = (int) TimeUnit.MILLISECONDS.convert(endTime - startTime, TimeUnit.NANOSECONDS);
  phase1_time = (int) TimeUnit.MILLISECONDS.convert(midPoint - startTime, TimeUnit.NANOSECONDS);
}
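// The comments inside uninvert() above describe each document's term-number list as a
// sequence of deltas, vInt-encoded with TNUM_OFFSET added so the byte values 0 (list
// terminator) and 1 (the "pointer" marker in the low byte of index[doc]) stay reserved.
// The following is a minimal, self-contained sketch of that packing for illustration
// only: it is not Lucene's DocTermOrds code, TNUM_OFFSET = 2 is assumed, and the
// LSB-first vInt layout here may differ from the actual writeInt/vIntSize byte order.
import java.util.ArrayList;
import java.util.List;

public class TermNumListSketch {
  static final int TNUM_OFFSET = 2; // assumed reserved-value offset, as described above

  // vInt-encode the deltas of an ascending term-number list, ending with a 0 terminator
  static byte[] encode(int[] termNums) {
    List<Byte> out = new ArrayList<>();
    int last = 0;
    for (int t : termNums) {
      int delta = t - last + TNUM_OFFSET; // always >= TNUM_OFFSET, so never 0
      last = t;
      while ((delta & ~0x7F) != 0) { // more than 7 bits left: emit a continuation byte
        out.add((byte) ((delta & 0x7F) | 0x80));
        delta >>>= 7;
      }
      out.add((byte) delta); // final byte has the high bit clear
    }
    out.add((byte) 0); // 0 terminates the per-document list
    byte[] arr = new byte[out.size()];
    for (int i = 0; i < arr.length; i++) arr[i] = out.get(i);
    return arr;
  }

  // Decode back to absolute term numbers, stopping at the 0 terminator
  static List<Integer> decode(byte[] arr) {
    List<Integer> termNums = new ArrayList<>();
    int last = 0, pos = 0;
    for (; ; ) {
      int delta = 0, shift = 0, b;
      do {
        b = arr[pos++] & 0xFF;
        delta |= (b & 0x7F) << shift;
        shift += 7;
      } while ((b & 0x80) != 0);
      if (delta == 0) break; // terminator
      last += delta - TNUM_OFFSET;
      termNums.add(last);
    }
    return termNums;
  }

  public static void main(String[] args) {
    int[] terms = {3, 7, 42};
    System.out.println(decode(encode(terms))); // prints [3, 7, 42]
  }
}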