/** make sure we downgrade positions and payloads correctly */
public void testMixing() throws Exception {
  // no positions
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);

  Directory dir = newDirectory();
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir);

  for (int i = 0; i < 20; i++) {
    Document doc = new Document();
    if (i < 19 && random().nextBoolean()) {
      for (int j = 0; j < 50; j++) {
        doc.add(new TextField("foo", "i have positions", Field.Store.NO));
      }
    } else {
      for (int j = 0; j < 50; j++) {
        doc.add(new Field("foo", "i have no positions", ft));
      }
    }
    iw.addDocument(doc);
    iw.commit();
  }

  if (random().nextBoolean()) {
    iw.forceMerge(1);
  }

  DirectoryReader ir = iw.getReader();
  FieldInfos fis = MultiFields.getMergedFieldInfos(ir);
  assertEquals(IndexOptions.DOCS_AND_FREQS, fis.fieldInfo("foo").getIndexOptions());
  assertFalse(fis.fieldInfo("foo").hasPayloads());
  iw.close();
  ir.close();
  dir.close(); // checkindex
}

public void testDocValues() throws IOException {
  assertU(adoc("id", "1"));
  assertU(commit());
  try (SolrCore core = h.getCoreInc()) {
    final RefCounted<SolrIndexSearcher> searcherRef = core.openNewSearcher(true, true);
    final SolrIndexSearcher searcher = searcherRef.get();
    try {
      final LeafReader reader = searcher.getLeafReader();
      assertEquals(1, reader.numDocs());
      final FieldInfos infos = reader.getFieldInfos();
      assertEquals(DocValuesType.NUMERIC, infos.fieldInfo("floatdv").getDocValuesType());
      assertEquals(DocValuesType.NUMERIC, infos.fieldInfo("intdv").getDocValuesType());
      assertEquals(DocValuesType.NUMERIC, infos.fieldInfo("doubledv").getDocValuesType());
      assertEquals(DocValuesType.NUMERIC, infos.fieldInfo("longdv").getDocValuesType());
      assertEquals(DocValuesType.SORTED, infos.fieldInfo("stringdv").getDocValuesType());

      assertEquals((long) Float.floatToIntBits(1), reader.getNumericDocValues("floatdv").get(0));
      assertEquals(2L, reader.getNumericDocValues("intdv").get(0));
      assertEquals(Double.doubleToLongBits(3), reader.getNumericDocValues("doubledv").get(0));
      assertEquals(4L, reader.getNumericDocValues("longdv").get(0));

      final IndexSchema schema = core.getLatestSchema();
      final SchemaField floatDv = schema.getField("floatdv");
      final SchemaField intDv = schema.getField("intdv");
      final SchemaField doubleDv = schema.getField("doubledv");
      final SchemaField longDv = schema.getField("longdv");

      FunctionValues values =
          floatDv
              .getType()
              .getValueSource(floatDv, null)
              .getValues(null, searcher.getLeafReader().leaves().get(0));
      assertEquals(1f, values.floatVal(0), 0f);
      assertEquals(1f, values.objectVal(0));

      values =
          intDv
              .getType()
              .getValueSource(intDv, null)
              .getValues(null, searcher.getLeafReader().leaves().get(0));
      assertEquals(2, values.intVal(0));
      assertEquals(2, values.objectVal(0));

      values =
          doubleDv
              .getType()
              .getValueSource(doubleDv, null)
              .getValues(null, searcher.getLeafReader().leaves().get(0));
      assertEquals(3d, values.doubleVal(0), 0d);
      assertEquals(3d, values.objectVal(0));

      values =
          longDv
              .getType()
              .getValueSource(longDv, null)
              .getValues(null, searcher.getLeafReader().leaves().get(0));
      assertEquals(4L, values.longVal(0));
      assertEquals(4L, values.objectVal(0));
    } finally {
      searcherRef.decref();
    }
  }
}

// Tests whether merging of docs that have different
// omitTermFreqAndPositions for the same field works
public void testMixedMerge() throws Exception {
  Directory ram = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random());
  IndexWriter writer =
      new IndexWriter(
          ram,
          newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer)
              .setMaxBufferedDocs(3)
              .setMergePolicy(newLogMergePolicy(2)));
  Document d = new Document();

  // this field will have Tf
  Field f1 = newField("f1", "This field has term freqs", normalType);
  d.add(f1);

  // this field will NOT have Tf
  Field f2 = newField("f2", "This field has NO Tf in all docs", omitType);
  d.add(f2);

  for (int i = 0; i < 30; i++) {
    writer.addDocument(d);
  }

  // now we add another document which has term freqs for field f2 and not for f1,
  // and verify that the SegmentMerger keeps things consistent
  d = new Document();

  // Reverse
  f1 = newField("f1", "This field has term freqs", omitType);
  d.add(f1);

  f2 = newField("f2", "This field has NO Tf in all docs", normalType);
  d.add(f2);

  for (int i = 0; i < 30; i++) {
    writer.addDocument(d);
  }

  // force merge
  writer.forceMerge(1);
  // flush
  writer.close();

  SegmentReader reader = getOnlySegmentReader(DirectoryReader.open(ram));
  FieldInfos fi = reader.getFieldInfos();
  assertEquals(
      "OmitTermFreqAndPositions field bit should be set.",
      IndexOptions.DOCS_ONLY,
      fi.fieldInfo("f1").getIndexOptions());
  assertEquals(
      "OmitTermFreqAndPositions field bit should be set.",
      IndexOptions.DOCS_ONLY,
      fi.fieldInfo("f2").getIndexOptions());

  reader.close();
  ram.close();
}

public void testDocValuesUnstored() throws IOException {
  Directory dir = newDirectory();
  IndexWriterConfig iwconfig =
      newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
  iwconfig.setMergePolicy(newLogMergePolicy());
  IndexWriter writer = new IndexWriter(dir, iwconfig);
  for (int i = 0; i < 50; i++) {
    Document doc = new Document();
    doc.add(new NumericDocValuesField("dv", i));
    doc.add(new TextField("docId", "" + i, Field.Store.YES));
    writer.addDocument(doc);
  }
  DirectoryReader r = writer.getReader();
  SlowCompositeReaderWrapper slow = new SlowCompositeReaderWrapper(r);
  FieldInfos fi = slow.getFieldInfos();
  FieldInfo dvInfo = fi.fieldInfo("dv");
  assertTrue(dvInfo.hasDocValues());
  NumericDocValues dv = slow.getNumericDocValues("dv");
  for (int i = 0; i < 50; i++) {
    assertEquals(i, dv.get(i));
    StoredDocument d = slow.document(i);
    // cannot use d.get("dv") due to another bug!
    assertNull(d.getField("dv"));
    assertEquals(Integer.toString(i), d.get("docId"));
  }
  slow.close();
  writer.close();
  dir.close();
}

private void readFields(IndexInput meta, FieldInfos infos) throws IOException {
  for (int fieldNumber = meta.readInt(); fieldNumber != -1; fieldNumber = meta.readInt()) {
    FieldInfo info = infos.fieldInfo(fieldNumber);
    if (info == null) {
      throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta);
    } else if (!info.hasNorms()) {
      throw new CorruptIndexException("Invalid field: " + info.name, meta);
    }
    NormsEntry entry = new NormsEntry();
    entry.docsWithFieldOffset = meta.readLong();
    entry.numDocsWithField = meta.readInt();
    entry.bytesPerNorm = meta.readByte();
    switch (entry.bytesPerNorm) {
      case 0:
      case 1:
      case 2:
      case 4:
      case 8:
        break;
      default:
        throw new CorruptIndexException(
            "Invalid bytesPerValue: " + entry.bytesPerNorm + ", field: " + info.name, meta);
    }
    entry.normsOffset = meta.readLong();
    norms.put(info.number, entry);
  }
}

public void testFieldNames() throws Exception {
  Directory dir1 = getDir1(random());
  Directory dir2 = getDir2(random());
  ParallelLeafReader pr =
      new ParallelLeafReader(
          SlowCompositeReaderWrapper.wrap(DirectoryReader.open(dir1)),
          SlowCompositeReaderWrapper.wrap(DirectoryReader.open(dir2)));
  FieldInfos fieldInfos = pr.getFieldInfos();
  assertEquals(4, fieldInfos.size());
  assertNotNull(fieldInfos.fieldInfo("f1"));
  assertNotNull(fieldInfos.fieldInfo("f2"));
  assertNotNull(fieldInfos.fieldInfo("f3"));
  assertNotNull(fieldInfos.fieldInfo("f4"));
  pr.close();
  dir1.close();
  dir2.close();
}

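// A hedged sketch of what the getDir1(...)/getDir2(...) helpers used above presumably build
// (their bodies are not shown here): two small indexes whose documents carry fields "f1"/"f2"
// and "f3"/"f4" respectively, so the ParallelLeafReader exposes exactly four FieldInfos.
// The helper name is hypothetical and assumes a LuceneTestCase context (newDirectory,
// newTextField are its test utilities).
private Directory getDir1Sketch(Random random) throws IOException {
  Directory dir = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(random, dir);
  Document d = new Document();
  d.add(newTextField("f1", "v1", Field.Store.YES));
  d.add(newTextField("f2", "v2", Field.Store.YES));
  w.addDocument(d);
  w.close();
  return dir;
}
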
// Make sure that first adding docs that do not omitNorms for
// field X, then adding docs that do omitNorms for that same
// field, produces the expected FieldInfos after a merge.
public void testMixedRAM() throws Exception {
  Directory ram = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random());
  IndexWriter writer =
      new IndexWriter(
          ram,
          newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer)
              .setMaxBufferedDocs(10)
              .setMergePolicy(newLogMergePolicy(2)));
  Document d = new Document();

  // this field will have norms
  Field f1 = newTextField("f1", "This field has norms", Field.Store.NO);
  d.add(f1);

  // this field will NOT have norms
  FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
  customType.setOmitNorms(true);
  Field f2 = newField("f2", "This field has NO norms in all docs", customType);
  d.add(f2);

  for (int i = 0; i < 5; i++) {
    writer.addDocument(d);
  }
  for (int i = 0; i < 20; i++) {
    writer.addDocument(d);
  }

  // force merge
  writer.forceMerge(1);
  // flush
  writer.close();

  SegmentReader reader = getOnlySegmentReader(DirectoryReader.open(ram));
  FieldInfos fi = reader.getFieldInfos();
  assertTrue("OmitNorms field bit should not be set.", !fi.fieldInfo("f1").omitsNorms());
  assertTrue("OmitNorms field bit should be set.", fi.fieldInfo("f2").omitsNorms());

  reader.close();
  ram.close();
}

private int readFields(IndexInput meta, FieldInfos infos) throws IOException {
  int numEntries = 0;
  int fieldNumber = meta.readVInt();
  while (fieldNumber != -1) {
    numEntries++;
    FieldInfo info = infos.fieldInfo(fieldNumber);
    if (info == null) {
      // trickier to validate more: because we re-use for norms, because we use multiple entries
      // for "composite" types like sortedset, etc.
      throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta);
    }
    int fieldType = meta.readByte();
    if (fieldType == NUMBER) {
      NumericEntry entry = new NumericEntry();
      entry.offset = meta.readLong();
      entry.format = meta.readByte();
      switch (entry.format) {
        case DELTA_COMPRESSED:
        case TABLE_COMPRESSED:
        case GCD_COMPRESSED:
        case UNCOMPRESSED:
          break;
        default:
          throw new CorruptIndexException("Unknown format: " + entry.format, meta);
      }
      if (entry.format != UNCOMPRESSED) {
        entry.packedIntsVersion = meta.readVInt();
      }
      numerics.put(info.name, entry);
    } else if (fieldType == BYTES) {
      BinaryEntry entry = new BinaryEntry();
      entry.offset = meta.readLong();
      entry.numBytes = meta.readLong();
      entry.minLength = meta.readVInt();
      entry.maxLength = meta.readVInt();
      if (entry.minLength != entry.maxLength) {
        entry.packedIntsVersion = meta.readVInt();
        entry.blockSize = meta.readVInt();
      }
      binaries.put(info.name, entry);
    } else if (fieldType == FST) {
      FSTEntry entry = new FSTEntry();
      entry.offset = meta.readLong();
      entry.numOrds = meta.readVLong();
      fsts.put(info.name, entry);
    } else {
      throw new CorruptIndexException("invalid entry type: " + fieldType, meta);
    }
    fieldNumber = meta.readVInt();
  }
  return numEntries;
}

@Override
public void visitDocument(int n, StoredFieldVisitor visitor) throws IOException {
  in.seek(offsets[n]);

  while (true) {
    readLine();
    if (StringHelper.startsWith(scratch.get(), FIELD) == false) {
      break;
    }
    int fieldNumber = parseIntAt(FIELD.length);
    FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber);
    readLine();
    assert StringHelper.startsWith(scratch.get(), NAME);
    readLine();
    assert StringHelper.startsWith(scratch.get(), TYPE);

    final BytesRef type;
    if (equalsAt(TYPE_STRING, scratch.get(), TYPE.length)) {
      type = TYPE_STRING;
    } else if (equalsAt(TYPE_BINARY, scratch.get(), TYPE.length)) {
      type = TYPE_BINARY;
    } else if (equalsAt(TYPE_INT, scratch.get(), TYPE.length)) {
      type = TYPE_INT;
    } else if (equalsAt(TYPE_LONG, scratch.get(), TYPE.length)) {
      type = TYPE_LONG;
    } else if (equalsAt(TYPE_FLOAT, scratch.get(), TYPE.length)) {
      type = TYPE_FLOAT;
    } else if (equalsAt(TYPE_DOUBLE, scratch.get(), TYPE.length)) {
      type = TYPE_DOUBLE;
    } else {
      throw new RuntimeException("unknown field type");
    }

    switch (visitor.needsField(fieldInfo)) {
      case YES:
        readField(type, fieldInfo, visitor);
        break;
      case NO:
        readLine();
        assert StringHelper.startsWith(scratch.get(), VALUE);
        break;
      case STOP:
        return;
    }
  }
}

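// Usage sketch for the visitDocument(...) above: stored-field visitors are normally driven
// through IndexReader.document(int, StoredFieldVisitor). A DocumentStoredFieldVisitor answers
// needsField(...) with YES only for the requested field names, so the values of all other
// fields are skipped rather than materialized. The helper name below is hypothetical; on some
// 4.x/5.0-dev branches getDocument() returns StoredDocument instead of Document.
static Document loadSelectedFields(IndexReader reader, int docID, String... fields)
    throws IOException {
  DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor(fields);
  reader.document(docID, visitor); // dispatches into visitDocument(...) for this codec
  return visitor.getDocument();
}
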
public FSTTermsReader(SegmentReadState state, PostingsReaderBase postingsReader)
    throws IOException {
  final String termsFileName =
      IndexFileNames.segmentFileName(
          state.segmentInfo.name, state.segmentSuffix, FSTTermsWriter.TERMS_EXTENSION);

  this.postingsReader = postingsReader;
  final IndexInput in = state.directory.openInput(termsFileName, state.context);

  boolean success = false;
  try {
    version = readHeader(in);
    if (version >= FSTTermsWriter.TERMS_VERSION_CHECKSUM) {
      CodecUtil.checksumEntireFile(in);
    }
    this.postingsReader.init(in);
    seekDir(in);

    final FieldInfos fieldInfos = state.fieldInfos;
    final int numFields = in.readVInt();
    for (int i = 0; i < numFields; i++) {
      int fieldNumber = in.readVInt();
      FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber);
      long numTerms = in.readVLong();
      long sumTotalTermFreq =
          fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY ? -1 : in.readVLong();
      long sumDocFreq = in.readVLong();
      int docCount = in.readVInt();
      int longsSize = in.readVInt();
      TermsReader current =
          new TermsReader(
              fieldInfo, in, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize);
      TermsReader previous = fields.put(fieldInfo.name, current);
      checkFieldSummary(state.segmentInfo, in, current, previous);
    }
    success = true;
  } finally {
    if (success) {
      IOUtils.close(in);
    } else {
      IOUtils.closeWhileHandlingException(in);
    }
  }
}

// Tests whether the DocumentWriter correctly enables the
// omitNorms bit in the FieldInfo
public void testOmitNorms() throws Exception {
  Directory ram = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random());
  IndexWriter writer = new IndexWriter(ram, newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
  Document d = new Document();

  // this field will have norms
  Field f1 = newTextField("f1", "This field has norms", Field.Store.NO);
  d.add(f1);

  // this field will NOT have norms
  FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
  customType.setOmitNorms(true);
  Field f2 = newField("f2", "This field has NO norms in all docs", customType);
  d.add(f2);

  writer.addDocument(d);
  writer.forceMerge(1);

  // now we add another document which omits norms for f1 and keeps them for f2,
  // and verify that the SegmentMerger keeps things consistent
  d = new Document();

  // Reverse
  d.add(newField("f1", "This field has norms", customType));
  d.add(newTextField("f2", "This field has NO norms in all docs", Field.Store.NO));

  writer.addDocument(d);

  // force merge
  writer.forceMerge(1);
  // flush
  writer.close();

  SegmentReader reader = getOnlySegmentReader(DirectoryReader.open(ram));
  FieldInfos fi = reader.getFieldInfos();
  assertTrue("OmitNorms field bit should be set.", fi.fieldInfo("f1").omitsNorms());
  assertTrue("OmitNorms field bit should be set.", fi.fieldInfo("f2").omitsNorms());

  reader.close();
  ram.close();
}

void createDocumentNode(final DocumentDescriptor inDescriptor) throws IOException {
  try {
    _document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
    _rootNode = _document.createElement("document");
  } catch (ParserConfigurationException e) {
    e.printStackTrace();
    System.exit(1);
  }

  AtomicReader segmentReader = _indexReader.leaves().get(inDescriptor.segmentNumber).reader();
  _rootNode.setAttribute("id", DocumentIdOperations.documentDescriptorToId(inDescriptor));
  // TODO: implement the proper way of building a title from the production report
  _rootNode.setAttribute("title", buildDocumentTitle(segmentReader, inDescriptor));
  _rootNode.setAttribute("path", "ruscorpora.ru");
  _rootNode.setAttribute(
      "tagging", Attributes.getDocAttribute(segmentReader, inDescriptor.localId, "tagging"));
  _rootNode.setAttribute("snippets", "0");

  Element attributesNode = _document.createElement("attributes");
  _rootNode.appendChild(attributesNode);

  FieldInfos fields = segmentReader.getFieldInfos();
  for (int fieldIndex = 0; fieldIndex != fields.size(); ++fieldIndex) {
    FieldInfo field = fields.fieldInfo(fieldIndex);
    // TODO: understand why field may turn into null
    if (field == null) {
      continue;
    }
    String name = field.name;
    if (Attributes.ATTRIBUTES.contains(name)
        || Attributes.ATTRIBUTES_FOR_REPORT.contains(name)
        || Attributes.ATTRIBUTES_FOR_WORD_INFO.contains(name)
        || !field.hasDocValues()) {
      // it's a word attribute
      continue;
    }
    Element attrNode = _document.createElement("attr");
    attrNode.setAttribute("name", name);
    attrNode.setAttribute(
        "value", Attributes.getDocAttribute(segmentReader, inDescriptor.localId, name));
    attributesNode.appendChild(attrNode);
  }
}

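// Note on the "field may turn into null" TODO above: field numbers are not guaranteed to be
// contiguous, so probing fields.fieldInfo(i) for every i up to fields.size() can hit gaps.
// FieldInfos is Iterable<FieldInfo>, so iterating it directly avoids the null check entirely.
// A small hypothetical helper sketching that alternative (same doc-values filter as above):
private static List<String> docValuesFieldNames(AtomicReader segmentReader) {
  List<String> names = new ArrayList<String>();
  for (FieldInfo field : segmentReader.getFieldInfos()) {
    if (field.hasDocValues()) {
      names.add(field.name);
    }
  }
  return names;
}
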
final Document doc(int n, FieldSelector fieldSelector) throws IOException {
  indexStream.seek(n * 8L);
  long position = indexStream.readLong();
  fieldsStream.seek(position);

  Document doc = new Document();
  int numFields = fieldsStream.readVInt();
  for (int i = 0; i < numFields; i++) {
    int fieldNumber = fieldsStream.readVInt();
    FieldInfo fi = fieldInfos.fieldInfo(fieldNumber);
    FieldSelectorResult acceptField =
        fieldSelector == null ? FieldSelectorResult.LOAD : fieldSelector.accept(fi.name);

    byte bits = fieldsStream.readByte();
    boolean compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0;
    boolean tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0;
    boolean binary = (bits & FieldsWriter.FIELD_IS_BINARY) != 0;

    // TODO: Find an alternative approach here if this list continues to grow beyond the
    // list of 5 or 6 currently here. See Lucene 762 for discussion
    if (acceptField.equals(FieldSelectorResult.LOAD)) {
      addField(doc, fi, binary, compressed, tokenize);
    } else if (acceptField.equals(FieldSelectorResult.LOAD_FOR_MERGE)) {
      addFieldForMerge(doc, fi, binary, compressed, tokenize);
    } else if (acceptField.equals(FieldSelectorResult.LOAD_AND_BREAK)) {
      addField(doc, fi, binary, compressed, tokenize);
      break; // Get out of this loop
    } else if (acceptField.equals(FieldSelectorResult.LAZY_LOAD)) {
      addFieldLazy(doc, fi, binary, compressed, tokenize);
    } else if (acceptField.equals(FieldSelectorResult.SIZE)) {
      skipField(binary, compressed, addFieldSize(doc, fi, binary, compressed));
    } else if (acceptField.equals(FieldSelectorResult.SIZE_AND_BREAK)) {
      addFieldSize(doc, fi, binary, compressed);
      break;
    } else {
      skipField(binary, compressed);
    }
  }

  return doc;
}

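// A hedged illustration of how the FieldSelectorResult branches above get exercised from the
// caller side (Lucene 2.x/3.x FieldSelector API). The field names "title" and "body" and the
// helper name are made up: "title" is loaded eagerly and stops further field reads, "body" is
// loaded lazily, everything else is skipped.
private static Document loadWithSelector(IndexReader reader, int docID) throws IOException {
  FieldSelector selector =
      new FieldSelector() {
        public FieldSelectorResult accept(String fieldName) {
          if ("title".equals(fieldName)) {
            return FieldSelectorResult.LOAD_AND_BREAK;
          } else if ("body".equals(fieldName)) {
            return FieldSelectorResult.LAZY_LOAD;
          }
          return FieldSelectorResult.NO_LOAD;
        }
      };
  return reader.document(docID, selector);
}
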
/**
 * Safe (but, slowish) default method to write every vector field in the document. This default
 * implementation requires that the vectors implement both Fields.size and Terms.size.
 */
protected final void addAllDocVectors(Fields vectors, FieldInfos fieldInfos) throws IOException {
  if (vectors == null) {
    startDocument(0);
    return;
  }

  final int numFields = vectors.size();
  if (numFields == -1) {
    throw new IllegalStateException("vectors.size() must be implemented (it returned -1)");
  }
  startDocument(numFields);

  final FieldsEnum fieldsEnum = vectors.iterator();
  String fieldName;
  String lastFieldName = null;

  while ((fieldName = fieldsEnum.next()) != null) {
    final FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldName);

    assert lastFieldName == null || fieldName.compareTo(lastFieldName) > 0
        : "lastFieldName=" + lastFieldName + " fieldName=" + fieldName;
    lastFieldName = fieldName;

    final Terms terms = fieldsEnum.terms();
    if (terms == null) {
      // FieldsEnum shouldn't lie...
      continue;
    }

    final int numTerms = (int) terms.size();
    if (numTerms == -1) {
      throw new IllegalStateException("terms.size() must be implemented (it returned -1)");
    }
    final TermsEnum termsEnum = terms.iterator(null);
    DocsAndPositionsEnum docsAndPositionsEnum = null;

    boolean startedField = false;

    // NOTE: this is tricky, because TermVectors allow
    // indexing offsets but NOT positions. So we must
    // lazily init the field by checking whether first
    // position we see is -1 or not.

    int termCount = 0;
    while (termsEnum.next() != null) {
      termCount++;

      final int freq = (int) termsEnum.totalTermFreq();

      if (startedField) {
        startTerm(termsEnum.term(), freq);
      }

      // TODO: we need a "query" API where we can ask (via
      // flex API) what this term was indexed with...

      // Both positions & offsets:
      docsAndPositionsEnum = termsEnum.docsAndPositions(null, null, true);
      final boolean hasOffsets;
      boolean hasPositions = false;
      if (docsAndPositionsEnum == null) {
        // Fallback: no offsets
        docsAndPositionsEnum = termsEnum.docsAndPositions(null, null, false);
        hasOffsets = false;
      } else {
        hasOffsets = true;
      }

      if (docsAndPositionsEnum != null) {
        final int docID = docsAndPositionsEnum.nextDoc();
        assert docID != DocIdSetIterator.NO_MORE_DOCS;
        assert docsAndPositionsEnum.freq() == freq;

        for (int posUpto = 0; posUpto < freq; posUpto++) {
          final int pos = docsAndPositionsEnum.nextPosition();
          if (!startedField) {
            assert numTerms > 0;
            hasPositions = pos != -1;
            startField(fieldInfo, numTerms, hasPositions, hasOffsets);
            startTerm(termsEnum.term(), freq);
            startedField = true;
          }
          final int startOffset;
          final int endOffset;
          if (hasOffsets) {
            startOffset = docsAndPositionsEnum.startOffset();
            endOffset = docsAndPositionsEnum.endOffset();
            assert startOffset != -1;
            assert endOffset != -1;
          } else {
            startOffset = -1;
            endOffset = -1;
          }
          assert !hasPositions || pos >= 0;
          addPosition(pos, startOffset, endOffset);
        }
      } else {
        if (!startedField) {
          assert numTerms > 0;
          startField(fieldInfo, numTerms, hasPositions, hasOffsets);
          startTerm(termsEnum.term(), freq);
          startedField = true;
        }
      }
    }
    assert termCount == numTerms;
  }
}

// Tests whether the DocumentWriter correctly enables the
// omitTermFreqAndPositions bit in the FieldInfo
public void testPositions() throws Exception {
  Directory ram = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random());
  IndexWriter writer = new IndexWriter(ram, newIndexWriterConfig(analyzer));
  Document d = new Document();

  // f1,f2,f3: docs only
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setIndexOptions(IndexOptions.DOCS);

  Field f1 = newField("f1", "This field has docs only", ft);
  d.add(f1);

  Field f2 = newField("f2", "This field has docs only", ft);
  d.add(f2);

  Field f3 = newField("f3", "This field has docs only", ft);
  d.add(f3);

  FieldType ft2 = new FieldType(TextField.TYPE_NOT_STORED);
  ft2.setIndexOptions(IndexOptions.DOCS_AND_FREQS);

  // f4,f5,f6: docs and freqs
  Field f4 = newField("f4", "This field has docs and freqs", ft2);
  d.add(f4);

  Field f5 = newField("f5", "This field has docs and freqs", ft2);
  d.add(f5);

  Field f6 = newField("f6", "This field has docs and freqs", ft2);
  d.add(f6);

  FieldType ft3 = new FieldType(TextField.TYPE_NOT_STORED);
  ft3.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);

  // f7,f8,f9: docs/freqs/positions
  Field f7 = newField("f7", "This field has docs and freqs and positions", ft3);
  d.add(f7);

  Field f8 = newField("f8", "This field has docs and freqs and positions", ft3);
  d.add(f8);

  Field f9 = newField("f9", "This field has docs and freqs and positions", ft3);
  d.add(f9);

  writer.addDocument(d);
  writer.forceMerge(1);

  // now we add another document which has docs-only for f1, f4, f7, docs/freqs for f2, f5, f8,
  // and docs/freqs/positions for f3, f6, f9
  d = new Document();

  // f1,f4,f7: docs only
  f1 = newField("f1", "This field has docs only", ft);
  d.add(f1);

  f4 = newField("f4", "This field has docs only", ft);
  d.add(f4);

  f7 = newField("f7", "This field has docs only", ft);
  d.add(f7);

  // f2, f5, f8: docs and freqs
  f2 = newField("f2", "This field has docs and freqs", ft2);
  d.add(f2);

  f5 = newField("f5", "This field has docs and freqs", ft2);
  d.add(f5);

  f8 = newField("f8", "This field has docs and freqs", ft2);
  d.add(f8);

  // f3, f6, f9: docs and freqs and positions
  f3 = newField("f3", "This field has docs and freqs and positions", ft3);
  d.add(f3);

  f6 = newField("f6", "This field has docs and freqs and positions", ft3);
  d.add(f6);

  f9 = newField("f9", "This field has docs and freqs and positions", ft3);
  d.add(f9);

  writer.addDocument(d);

  // force merge
  writer.forceMerge(1);
  // flush
  writer.close();

  SegmentReader reader = getOnlySegmentReader(DirectoryReader.open(ram));
  FieldInfos fi = reader.getFieldInfos();
  // docs + docs = docs
  assertEquals(IndexOptions.DOCS, fi.fieldInfo("f1").getIndexOptions());
  // docs + docs/freqs = docs
  assertEquals(IndexOptions.DOCS, fi.fieldInfo("f2").getIndexOptions());
  // docs + docs/freqs/pos = docs
  assertEquals(IndexOptions.DOCS, fi.fieldInfo("f3").getIndexOptions());
  // docs/freqs + docs = docs
  assertEquals(IndexOptions.DOCS, fi.fieldInfo("f4").getIndexOptions());
  // docs/freqs + docs/freqs = docs/freqs
  assertEquals(IndexOptions.DOCS_AND_FREQS, fi.fieldInfo("f5").getIndexOptions());
  // docs/freqs + docs/freqs/pos = docs/freqs
  assertEquals(IndexOptions.DOCS_AND_FREQS, fi.fieldInfo("f6").getIndexOptions());
  // docs/freqs/pos + docs = docs
  assertEquals(IndexOptions.DOCS, fi.fieldInfo("f7").getIndexOptions());
  // docs/freqs/pos + docs/freqs = docs/freqs
  assertEquals(IndexOptions.DOCS_AND_FREQS, fi.fieldInfo("f8").getIndexOptions());
  // docs/freqs/pos + docs/freqs/pos = docs/freqs/pos
  assertEquals(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, fi.fieldInfo("f9").getIndexOptions());

  reader.close();
  ram.close();
}

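// The assertions above all follow one rule: when two segments disagree on a field's
// IndexOptions, the merged field keeps the weaker of the two. A hypothetical helper mirroring
// that rule (a sketch of the behavior the test checks, not Lucene's actual merge code):
static IndexOptions mergedIndexOptions(IndexOptions a, IndexOptions b) {
  // enum constants are ordered DOCS < DOCS_AND_FREQS < DOCS_AND_FREQS_AND_POSITIONS < ...
  return a.compareTo(b) <= 0 ? a : b;
}
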
@Override
public void merge(MergeState mergeState) throws IOException {
  if (mergeState.segmentInfo.getIndexSort() != null) {
    // TODO: can we gain back some optos even if index is sorted? E.g. if sort results in large
    // chunks of contiguous docs from one sub being copied over...?
    super.merge(mergeState);
    return;
  }

  for (PointsReader reader : mergeState.pointsReaders) {
    if (reader instanceof Lucene60PointsReader == false) {
      // We can only bulk merge when all to-be-merged segments use our format:
      super.merge(mergeState);
      return;
    }
  }
  for (PointsReader reader : mergeState.pointsReaders) {
    if (reader != null) {
      reader.checkIntegrity();
    }
  }

  for (FieldInfo fieldInfo : mergeState.mergeFieldInfos) {
    if (fieldInfo.getPointDimensionCount() != 0) {
      if (fieldInfo.getPointDimensionCount() == 1) {
        boolean singleValuePerDoc = true;

        // Worst case total maximum size (if none of the points are deleted):
        long totMaxSize = 0;
        for (int i = 0; i < mergeState.pointsReaders.length; i++) {
          PointsReader reader = mergeState.pointsReaders[i];
          if (reader != null) {
            FieldInfos readerFieldInfos = mergeState.fieldInfos[i];
            FieldInfo readerFieldInfo = readerFieldInfos.fieldInfo(fieldInfo.name);
            if (readerFieldInfo != null && readerFieldInfo.getPointDimensionCount() > 0) {
              PointValues values = reader.getValues(fieldInfo.name);
              if (values != null) {
                totMaxSize += values.size();
                singleValuePerDoc &= values.size() == values.getDocCount();
              }
            }
          }
        }

        // System.out.println("MERGE: field=" + fieldInfo.name);
        // Optimize the 1D case to use BKDWriter.merge, which does a single merge sort of the
        // already sorted incoming segments, instead of trying to sort all points again as if
        // we were simply reindexing them:
        try (BKDWriter writer =
            new BKDWriter(
                writeState.segmentInfo.maxDoc(),
                writeState.directory,
                writeState.segmentInfo.name,
                fieldInfo.getPointDimensionCount(),
                fieldInfo.getPointNumBytes(),
                maxPointsInLeafNode,
                maxMBSortInHeap,
                totMaxSize,
                singleValuePerDoc)) {
          List<BKDReader> bkdReaders = new ArrayList<>();
          List<MergeState.DocMap> docMaps = new ArrayList<>();
          for (int i = 0; i < mergeState.pointsReaders.length; i++) {
            PointsReader reader = mergeState.pointsReaders[i];

            if (reader != null) {
              // we confirmed this up above
              assert reader instanceof Lucene60PointsReader;
              Lucene60PointsReader reader60 = (Lucene60PointsReader) reader;

              // NOTE: we cannot just use the merged fieldInfo.number (instead of resolving to
              // this reader's FieldInfo as we do below) because field numbers can easily be
              // different when addIndexes(Directory...) copies over segments from another index:
              FieldInfos readerFieldInfos = mergeState.fieldInfos[i];
              FieldInfo readerFieldInfo = readerFieldInfos.fieldInfo(fieldInfo.name);
              if (readerFieldInfo != null && readerFieldInfo.getPointDimensionCount() > 0) {
                BKDReader bkdReader = reader60.readers.get(readerFieldInfo.number);
                if (bkdReader != null) {
                  bkdReaders.add(bkdReader);
                  docMaps.add(mergeState.docMaps[i]);
                }
              }
            }
          }

          long fp = writer.merge(dataOut, docMaps, bkdReaders);
          if (fp != -1) {
            indexFPs.put(fieldInfo.name, fp);
          }
        }
      } else {
        mergeOneField(mergeState, fieldInfo);
      }
    }
  }

  finish();
}

public static List<Element> handlePropertyName(
    String[] propertyNames,
    ServiceContext context,
    boolean freq,
    int maxRecords,
    String cswServiceSpecificConstraint,
    LuceneConfig luceneConfig)
    throws Exception {

  List<Element> domainValuesList = null;

  if (Log.isDebugEnabled(Geonet.CSW))
    Log.debug(
        Geonet.CSW,
        "Handling property names '"
            + Arrays.toString(propertyNames)
            + "' with max records of "
            + maxRecords);

  for (int i = 0; i < propertyNames.length; i++) {

    if (i == 0) domainValuesList = new ArrayList<Element>();

    // Initialize list of values element.
    Element listOfValues = null;

    // Generate DomainValues element
    Element domainValues = new Element("DomainValues", Csw.NAMESPACE_CSW);

    // FIXME what should be the type ???
    domainValues.setAttribute("type", "csw:Record");

    String property = propertyNames[i].trim();

    // Set propertyName in any case.
    Element pn = new Element("PropertyName", Csw.NAMESPACE_CSW);
    domainValues.addContent(pn.setText(property));

    GeonetContext gc = (GeonetContext) context.getHandlerContext(Geonet.CONTEXT_NAME);
    SearchManager sm = gc.getSearchmanager();
    IndexAndTaxonomy indexAndTaxonomy = sm.getNewIndexReader(null);
    try {
      GeonetworkMultiReader reader = indexAndTaxonomy.indexReader;
      BooleanQuery groupsQuery = (BooleanQuery) CatalogSearcher.getGroupsQuery(context);
      BooleanQuery query = null;

      // Apply CSW service specific constraint
      if (StringUtils.isNotEmpty(cswServiceSpecificConstraint)) {
        Query constraintQuery =
            CatalogSearcher.getCswServiceSpecificConstraintQuery(
                cswServiceSpecificConstraint, luceneConfig);

        query = new BooleanQuery();

        BooleanClause.Occur occur = LuceneUtils.convertRequiredAndProhibitedToOccur(true, false);

        query.add(groupsQuery, occur);
        query.add(constraintQuery, occur);
      } else {
        query = groupsQuery;
      }

      List<Pair<String, Boolean>> sortFields =
          Collections.singletonList(Pair.read(Geonet.SearchResult.SortBy.RELEVANCE, true));
      Sort sort = LuceneSearcher.makeSort(sortFields, context.getLanguage(), false);
      CachingWrapperFilter filter = null;
      Pair<TopDocs, Element> searchResults =
          LuceneSearcher.doSearchAndMakeSummary(
              maxRecords,
              0,
              maxRecords,
              context.getLanguage(),
              null,
              reader,
              query,
              filter,
              sort,
              null,
              false,
              false,
              false,
              false // Scoring is useless for GetDomain operation
              );
      TopDocs hits = searchResults.one();

      try {
        // Get mapped lucene field in CSW configuration
        String indexField = CatalogConfiguration.getFieldMapping().get(property.toLowerCase());
        if (indexField != null) property = indexField;

        // check whether the requested property is present in the index (via its field infos)
        FieldInfos fi = new SlowCompositeReaderWrapper(reader).getFieldInfos();
        if (fi.fieldInfo(property) == null) continue;

        boolean isRange = false;
        if (CatalogConfiguration.getGetRecordsRangeFields().contains(property)) isRange = true;

        if (isRange) listOfValues = new Element("RangeOfValues", Csw.NAMESPACE_CSW);
        else listOfValues = new Element("ListOfValues", Csw.NAMESPACE_CSW);

        Set<String> fields = new HashSet<String>();
        fields.add(property);
        fields.add("_isTemplate");

        // parse each document in the index
        String[] fieldValues;
        SortedSet<String> sortedValues = new TreeSet<String>();
        HashMap<String, Integer> duplicateValues = new HashMap<String, Integer>();
        for (int j = 0; j < hits.scoreDocs.length; j++) {
          DocumentStoredFieldVisitor selector = new DocumentStoredFieldVisitor(fields);
          reader.document(hits.scoreDocs[j].doc, selector);
          Document doc = selector.getDocument();

          // Skip templates and subTemplates
          String[] isTemplate = doc.getValues("_isTemplate");
          if (isTemplate.length > 0 && !isTemplate[0].equals("n")) continue;

          // Get doc values for specified property
          fieldValues = doc.getValues(property);
          if (fieldValues == null) continue;

          addtoSortedSet(sortedValues, fieldValues, duplicateValues);
        }

        SummaryComparator valuesComparator =
            new SummaryComparator(SortOption.FREQUENCY, Type.STRING, context.getLanguage(), null);
        TreeSet<Map.Entry<String, Integer>> sortedValuesFrequency =
            new TreeSet<Map.Entry<String, Integer>>(valuesComparator);
        sortedValuesFrequency.addAll(duplicateValues.entrySet());

        if (freq) return createValuesByFrequency(sortedValuesFrequency);
        else listOfValues.addContent(createValuesElement(sortedValues, isRange));
      } finally {
        // No children means that the catalog was unable to determine
        // anything about the specified parameter
        if (listOfValues != null && listOfValues.getChildren().size() != 0)
          domainValues.addContent(listOfValues);

        // Add current DomainValues to the list
        domainValuesList.add(domainValues);
      }
    } finally {
      sm.releaseIndexReader(indexAndTaxonomy);
    }
  }
  return domainValuesList;
}