static { // Id IdFielType = new FieldType(); IdFielType.setStored(true); IdFielType.setTokenized(false); IdFielType.setOmitNorms(true); IdFielType.setIndexOptions(IndexOptions.DOCS); IdFielType.freeze(); // content ContentFielType = new FieldType(); ContentFielType.setStored(false); ContentFielType.setTokenized(true); ContentFielType.setOmitNorms(false); ContentFielType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); ContentFielType.freeze(); // title TitleFielType = new FieldType(); TitleFielType.setStored(true); TitleFielType.setTokenized(true); TitleFielType.setOmitNorms(false); TitleFielType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); TitleFielType.freeze(); // onlyForStore OnLyStoreFieldType = new FieldType(); OnLyStoreFieldType.setStored(true); OnLyStoreFieldType.setTokenized(false); OnLyStoreFieldType.setOmitNorms(false); OnLyStoreFieldType.setIndexOptions(IndexOptions.NONE); OnLyStoreFieldType.freeze(); }
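// Illustrative use of the four frozen types above; a minimal sketch that assumes an open
// IndexWriter named "writer" and example field names/values that are not from the source.
Document doc = new Document();
doc.add(new Field("id", "doc-001", IdFielType));                              // stored, untokenized, DOCS only
doc.add(new Field("title", "Lucene field types", TitleFielType));             // stored and analyzed
doc.add(new Field("content", "full body text ...", ContentFielType));         // analyzed with positions, not stored
doc.add(new Field("url", "http://example.com/doc-001", OnLyStoreFieldType));  // stored only, never indexed
writer.addDocument(doc);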
static { LABEL_FIELD_TYPE.setStored(true); LABEL_FIELD_TYPE.setTokenized(true); LABEL_FIELD_TYPE.freeze(); URI_FIELD_TYPE.setStored(true); URI_FIELD_TYPE.setTokenized(false); URI_FIELD_TYPE.freeze(); FIELD_TYPE.setStored(true); FIELD_TYPE.freeze(); }
// LUCENE-1727: make sure doc fields are stored in order public void testStoredFieldsOrder() throws Throwable { Directory d = newDirectory(); IndexWriter w = new IndexWriter(d, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))); Document doc = new Document(); FieldType customType = new FieldType(); customType.setStored(true); doc.add(newField("zzz", "a b c", customType)); doc.add(newField("aaa", "a b c", customType)); doc.add(newField("zzz", "1 2 3", customType)); w.addDocument(doc); IndexReader r = w.getReader(); Document doc2 = r.document(0); Iterator<IndexableField> it = doc2.getFields().iterator(); assertTrue(it.hasNext()); Field f = (Field) it.next(); assertEquals(f.name(), "zzz"); assertEquals(f.stringValue(), "a b c"); assertTrue(it.hasNext()); f = (Field) it.next(); assertEquals(f.name(), "aaa"); assertEquals(f.stringValue(), "a b c"); assertTrue(it.hasNext()); f = (Field) it.next(); assertEquals(f.name(), "zzz"); assertEquals(f.stringValue(), "1 2 3"); assertFalse(it.hasNext()); r.close(); w.close(); d.close(); }
/** * Used for adding a document when a field needs to be created from a type and a string. * * <p>By default, the indexed value is the same as the stored value (taken from toInternal()). * Having a different representation for external, internal, and indexed would present quite a few * problems given the current Lucene architecture. An analyzer for adding docs would need to * translate internal->indexed while an analyzer for querying would need to translate * external->indexed. * * <p>The only other alternative to having internal==indexed would be to have internal==external. * In this case, toInternal should convert to the indexed representation, toExternal() should do * nothing, and createField() should *not* call toInternal, but use the external value and set * tokenized=true to get Lucene to convert to the internal(indexed) form. :TODO: clean up and * clarify this explanation. * * @see #toInternal */ public StorableField createField(SchemaField field, Object value, float boost) { if (!field.indexed() && !field.stored()) { if (log.isTraceEnabled()) log.trace("Ignoring unindexed/unstored field: " + field); return null; } String val; try { val = toInternal(value.toString()); } catch (RuntimeException e) { throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, "Error while creating field '" + field + "' from value '" + value + "'", e); } if (val == null) return null; org.apache.lucene.document.FieldType newType = new org.apache.lucene.document.FieldType(); newType.setTokenized(field.isTokenized()); newType.setStored(field.stored()); newType.setOmitNorms(field.omitNorms()); newType.setIndexOptions(field.indexed() ? getIndexOptions(field, val) : IndexOptions.NONE); newType.setStoreTermVectors(field.storeTermVector()); newType.setStoreTermVectorOffsets(field.storeTermOffsets()); newType.setStoreTermVectorPositions(field.storeTermPositions()); newType.setStoreTermVectorPayloads(field.storeTermPayloads()); return createField(field.getName(), val, newType, boost); }
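// A minimal usage sketch of createField(); the schema lookup and the "title" field name are
// assumptions for illustration, not taken from the source.
SchemaField sf = schema.getField("title");                        // an indexed + stored field (assumed)
StorableField f = sf.getType().createField(sf, "Lucene in Action", 1.0f);
// f is null when the field is neither indexed nor stored, or when toInternal() yields null;
// otherwise it carries the internal (indexed == stored) representation described above.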
// LUCENE-325: test forceMergeDeletes without waiting, when // many adjacent merges are required public void testForceMergeDeletes3() throws IOException { Directory dir = newDirectory(); IndexWriter writer = new IndexWriter( dir, newIndexWriterConfig(new MockAnalyzer(random())) .setMaxBufferedDocs(2) .setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH) .setMergePolicy(newLogMergePolicy(50))); FieldType customType = new FieldType(); customType.setStored(true); FieldType customType1 = new FieldType(TextField.TYPE_NOT_STORED); customType1.setTokenized(false); customType1.setStoreTermVectors(true); customType1.setStoreTermVectorPositions(true); customType1.setStoreTermVectorOffsets(true); Document document = new Document(); Field storedField = newField("stored", "stored", customType); document.add(storedField); Field termVectorField = newField("termVector", "termVector", customType1); document.add(termVectorField); Field idField = newStringField("id", "", Field.Store.NO); document.add(idField); for (int i = 0; i < 98; i++) { idField.setStringValue("" + i); writer.addDocument(document); } writer.close(); IndexReader ir = DirectoryReader.open(dir); assertEquals(98, ir.maxDoc()); assertEquals(98, ir.numDocs()); ir.close(); IndexWriterConfig dontMergeConfig = new IndexWriterConfig(new MockAnalyzer(random())).setMergePolicy(NoMergePolicy.INSTANCE); writer = new IndexWriter(dir, dontMergeConfig); for (int i = 0; i < 98; i += 2) { writer.deleteDocuments(new Term("id", "" + i)); } writer.close(); ir = DirectoryReader.open(dir); assertEquals(49, ir.numDocs()); ir.close(); writer = new IndexWriter( dir, newIndexWriterConfig(new MockAnalyzer(random())).setMergePolicy(newLogMergePolicy(3))); writer.forceMergeDeletes(false); writer.close(); ir = DirectoryReader.open(dir); assertEquals(49, ir.maxDoc()); assertEquals(49, ir.numDocs()); ir.close(); dir.close(); }
static { FIELD_TYPE.setIndexed(true); FIELD_TYPE.setTokenized(false); FIELD_TYPE.setStored(true); FIELD_TYPE.setOmitNorms(true); FIELD_TYPE.setIndexOptions(IndexOptions.DOCS_ONLY); FIELD_TYPE.freeze(); }
@Test public void testName() throws Exception { FieldType TYPE_STORED = new FieldType(); TYPE_STORED.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); TYPE_STORED.setTokenized(false); TYPE_STORED.setStored(true); TYPE_STORED.freeze(); System.out.println(TYPE_STORED); }
private void addDoc(IndexWriter writer, String text, String count) throws IOException { Document doc = new Document(); doc.add(new Field("ngram", text, StringField.TYPE_NOT_STORED)); FieldType fieldType = new FieldType(); fieldType.setStored(true); Field countField = new Field("count", count, fieldType); doc.add(countField); writer.addDocument(doc); }
static { // Default: pointValues + docValues FieldType type = new FieldType(); type.setDimensions(1, Double.BYTES); // pointValues (assume Double) type.setDocValuesType(DocValuesType.NUMERIC); // docValues type.setStored(false); type.freeze(); DEFAULT_FIELDTYPE = type; // Legacy default: legacyNumerics type = new FieldType(); type.setIndexOptions(IndexOptions.DOCS); type.setNumericType(FieldType.LegacyNumericType.DOUBLE); type.setNumericPrecisionStep(8); // same as solr default type.setDocValuesType(DocValuesType.NONE); // no docValues! type.setStored(false); type.freeze(); LEGACY_FIELDTYPE = type; }
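// Roughly what DEFAULT_FIELDTYPE amounts to per document value, expressed with the standard
// helper field classes; a sketch with an assumed field name and value.
Document doc = new Document();
doc.add(new DoublePoint("x", 3.25));            // 1-dimensional point values (Double.BYTES per dimension)
doc.add(new DoubleDocValuesField("x", 3.25));   // NUMERIC doc values holding the same number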
static { FIELD_TYPE.setIndexed(true); FIELD_TYPE.setTokenized(false); FIELD_TYPE.setStored(true); FIELD_TYPE.setOmitNorms(true); FIELD_TYPE.setIndexOptions( FieldInfo.IndexOptions .DOCS_AND_FREQS_AND_POSITIONS); // we store payload (otherwise, we really need just // docs) FIELD_TYPE.freeze(); NESTED_FIELD_TYPE.setIndexed(true); NESTED_FIELD_TYPE.setTokenized(false); NESTED_FIELD_TYPE.setStored(false); NESTED_FIELD_TYPE.setOmitNorms(true); // we can set this to another index option when we move away from storing payload.. // NESTED_FIELD_TYPE.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY); NESTED_FIELD_TYPE.freeze(); }
static { UID_FIELD_TYPE.setIndexed(true); UID_FIELD_TYPE.setTokenized(false); UID_FIELD_TYPE.setStored(true); UID_FIELD_TYPE.setOmitNorms(true); UID_FIELD_TYPE.setIndexOptions( FieldInfo.IndexOptions .DOCS_AND_FREQS_AND_POSITIONS); // we store payload (otherwise, we really need just // docs) UID_FIELD_TYPE.freeze(); }
public void testReadSkip() throws IOException { Directory dir = newDirectory(); IndexWriterConfig iwConf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); iwConf.setMaxBufferedDocs(RandomInts.randomIntBetween(random(), 2, 30)); RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwConf); FieldType ft = new FieldType(); ft.setStored(true); ft.freeze(); final String string = _TestUtil.randomSimpleString(random(), 50); final byte[] bytes = string.getBytes("UTF-8"); final long l = random().nextBoolean() ? random().nextInt(42) : random().nextLong(); final int i = random().nextBoolean() ? random().nextInt(42) : random().nextInt(); final float f = random().nextFloat(); final double d = random().nextDouble(); List<Field> fields = Arrays.asList( new Field("bytes", bytes, ft), new Field("string", string, ft), new LongField("long", l, Store.YES), new IntField("int", i, Store.YES), new FloatField("float", f, Store.YES), new DoubleField("double", d, Store.YES)); for (int k = 0; k < 100; ++k) { Document doc = new Document(); for (Field fld : fields) { doc.add(fld); } iw.w.addDocument(doc); } iw.commit(); final DirectoryReader reader = DirectoryReader.open(dir); final int docID = random().nextInt(100); for (Field fld : fields) { String fldName = fld.name(); final Document sDoc = reader.document(docID, Collections.singleton(fldName)); final IndexableField sField = sDoc.getField(fldName); if (Field.class.equals(fld.getClass())) { assertEquals(fld.binaryValue(), sField.binaryValue()); assertEquals(fld.stringValue(), sField.stringValue()); } else { assertEquals(fld.numericValue(), sField.numericValue()); } } reader.close(); iw.close(); dir.close(); }
static { TYPE_NOT_STORED.setOmitNorms(true); TYPE_NOT_STORED.setIndexOptions(IndexOptions.DOCS); TYPE_NOT_STORED.setTokenized(false); TYPE_NOT_STORED.freeze(); TYPE_STORED.setOmitNorms(true); TYPE_STORED.setIndexOptions(IndexOptions.DOCS); TYPE_STORED.setStored(true); TYPE_STORED.setTokenized(false); TYPE_STORED.freeze(); }
/** * Translates the pre-4.0 enums for specifying how a field should be indexed into the 4.0 {@link * FieldType} approach. * * @deprecated This is here only to ease transition from the pre-4.0 APIs. */ @Deprecated public static final FieldType translateFieldType( Store store, Index index, TermVector termVector) { final FieldType ft = new FieldType(); ft.setStored(store == Store.YES); switch (index) { case ANALYZED: ft.setIndexed(true); ft.setTokenized(true); break; case ANALYZED_NO_NORMS: ft.setIndexed(true); ft.setTokenized(true); ft.setOmitNorms(true); break; case NOT_ANALYZED: ft.setIndexed(true); ft.setTokenized(false); break; case NOT_ANALYZED_NO_NORMS: ft.setIndexed(true); ft.setTokenized(false); ft.setOmitNorms(true); break; case NO: break; } switch (termVector) { case NO: break; case YES: ft.setStoreTermVectors(true); break; case WITH_POSITIONS: ft.setStoreTermVectors(true); ft.setStoreTermVectorPositions(true); break; case WITH_OFFSETS: ft.setStoreTermVectors(true); ft.setStoreTermVectorOffsets(true); break; case WITH_POSITIONS_OFFSETS: ft.setStoreTermVectors(true); ft.setStoreTermVectorPositions(true); ft.setStoreTermVectorOffsets(true); break; } ft.freeze(); return ft; }
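// Usage sketch: the pre-4.0 combination Store.YES / Index.NOT_ANALYZED_NO_NORMS / TermVector.NO
// becomes a stored, indexed, untokenized FieldType with norms omitted (field name/value assumed).
FieldType ft = translateFieldType(Store.YES, Index.NOT_ANALYZED_NO_NORMS, TermVector.NO);
Field idField = new Field("id", "42", ft);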
/** * Persists all snapshots information. If the given id and segment are not null, it persists their * information as well. */ private void persistSnapshotInfos(String id, String segment) throws IOException { writer.deleteAll(); Document d = new Document(); FieldType ft = new FieldType(); ft.setStored(true); d.add(new Field(SNAPSHOTS_ID, "", ft)); for (Entry<String, String> e : super.getSnapshots().entrySet()) { d.add(new Field(e.getKey(), e.getValue(), ft)); } if (id != null) { d.add(new Field(id, segment, ft)); } writer.addDocument(d); writer.commit(); }
/** * Tests various combinations of omitNorms=true/false, the field not existing at all, ensuring * that only omitNorms is 'viral'. Internally checks that MultiNorms.norms() is consistent * (returns the same bytes) as the fully merged equivalent. */ public void testOmitNormsCombos() throws IOException { // indexed with norms FieldType customType = new FieldType(TextField.TYPE_STORED); Field norms = new Field("foo", "a", customType); // indexed without norms FieldType customType1 = new FieldType(TextField.TYPE_STORED); customType1.setOmitNorms(true); Field noNorms = new Field("foo", "a", customType1); // not indexed, but stored FieldType customType2 = new FieldType(); customType2.setStored(true); Field noIndex = new Field("foo", "a", customType2); // not indexed but stored, omitNorms is set FieldType customType3 = new FieldType(); customType3.setStored(true); customType3.setOmitNorms(true); Field noNormsNoIndex = new Field("foo", "a", customType3); // not indexed nor stored (doesnt exist at all, we index a different field instead) Field emptyNorms = new Field("bar", "a", customType); assertNotNull(getNorms("foo", norms, norms)); assertNull(getNorms("foo", norms, noNorms)); assertNotNull(getNorms("foo", norms, noIndex)); assertNotNull(getNorms("foo", norms, noNormsNoIndex)); assertNotNull(getNorms("foo", norms, emptyNorms)); assertNull(getNorms("foo", noNorms, noNorms)); assertNull(getNorms("foo", noNorms, noIndex)); assertNull(getNorms("foo", noNorms, noNormsNoIndex)); assertNull(getNorms("foo", noNorms, emptyNorms)); assertNull(getNorms("foo", noIndex, noIndex)); assertNull(getNorms("foo", noIndex, noNormsNoIndex)); assertNull(getNorms("foo", noIndex, emptyNorms)); assertNull(getNorms("foo", noNormsNoIndex, noNormsNoIndex)); assertNull(getNorms("foo", noNormsNoIndex, emptyNorms)); assertNull(getNorms("foo", emptyNorms, emptyNorms)); }
public void testIndexedBit() throws Exception { Directory dir = newDirectory(); RandomIndexWriter w = new RandomIndexWriter(random(), dir); Document doc = new Document(); FieldType onlyStored = new FieldType(); onlyStored.setStored(true); doc.add(new Field("field", "value", onlyStored)); doc.add(new StringField("field2", "value", Field.Store.YES)); w.addDocument(doc); IndexReader r = w.getReader(); w.close(); assertFalse(r.document(0).getField("field").fieldType().indexed()); assertTrue(r.document(0).getField("field2").fieldType().indexed()); r.close(); dir.close(); }
// LUCENE-1270 public void testHangOnClose() throws IOException { Directory dir = newDirectory(); LogByteSizeMergePolicy lmp = new LogByteSizeMergePolicy(); lmp.setNoCFSRatio(0.0); lmp.setMergeFactor(100); IndexWriter writer = new IndexWriter( dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())) .setMaxBufferedDocs(5) .setMergePolicy(lmp)); Document doc = new Document(); FieldType customType = new FieldType(TextField.TYPE_STORED); customType.setStoreTermVectors(true); customType.setStoreTermVectorPositions(true); customType.setStoreTermVectorOffsets(true); doc.add(newField("content", "aaa bbb ccc ddd eee fff ggg hhh iii", customType)); for (int i = 0; i < 60; i++) writer.addDocument(doc); Document doc2 = new Document(); FieldType customType2 = new FieldType(); customType2.setStored(true); doc2.add(newField("content", "aaa bbb ccc ddd eee fff ggg hhh iii", customType2)); doc2.add(newField("content", "aaa bbb ccc ddd eee fff ggg hhh iii", customType2)); doc2.add(newField("content", "aaa bbb ccc ddd eee fff ggg hhh iii", customType2)); doc2.add(newField("content", "aaa bbb ccc ddd eee fff ggg hhh iii", customType2)); for (int i = 0; i < 10; i++) writer.addDocument(doc2); writer.close(); Directory dir2 = newDirectory(); lmp = new LogByteSizeMergePolicy(); lmp.setMinMergeMB(0.0001); lmp.setNoCFSRatio(0.0); lmp.setMergeFactor(4); writer = new IndexWriter( dir2, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())) .setMergeScheduler(new SerialMergeScheduler()) .setMergePolicy(lmp)); writer.addIndexes(dir); writer.close(); dir.close(); dir2.close(); }
private IndexReader createIndex(int docCount, int facetFields, boolean ram) throws CorruptIndexException, LockObtainFailedException, IOException { Directory directory; if (ram) { directory = new RAMDirectory(); } else { File dir = new File("./target/tmp/facet_tmp"); if (dir.exists()) { directory = FSDirectory.open(dir); if (DirectoryReader.indexExists(directory)) { DirectoryReader reader = DirectoryReader.open(directory); if (reader.numDocs() == docCount) { return reader; } reader.close(); directory.close(); } } rmr(dir); directory = FSDirectory.open(dir); } IndexWriterConfig conf = new IndexWriterConfig(LUCENE_VERSION, new KeywordAnalyzer()); IndexWriter writer = new IndexWriter(directory, conf); FieldType fieldType = new FieldType(); fieldType.setStored(true); fieldType.setIndexed(true); fieldType.setOmitNorms(true); long start = System.nanoTime(); for (int i = 0; i < docCount; i++) { long now = System.nanoTime(); if (start + TimeUnit.SECONDS.toNanos(5) < now) { System.out.println("Indexing doc " + i + " of " + docCount); start = System.nanoTime(); } Document document = new Document(); document.add(new Field("f1", "value", fieldType)); document.add(new Field("f2", "v" + i, fieldType)); for (int f = 0; f < facetFields; f++) { document.add(new Field("facet" + f, "value", fieldType)); } writer.addDocument(document); } writer.close(); return DirectoryReader.open(directory); }
/**
 * Indexes a single PDF file.
 *
 * @param f the PDF file
 * @param writer the IndexWriter
 * @throws IOException
 */
public static void indexFile(File f, IndexWriter writer) throws IOException {
  // Load the file with PDFBox
  PDDocument pddDocument = PDDocument.load(f.getAbsolutePath());
  PDFTextStripper textStripper = new PDFTextStripper();
  int numPages = pddDocument.getNumberOfPages();
  String pageContent;
  // Declare a custom FieldType for the page content
  FieldType fieldText = new FieldType();
  fieldText.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  fieldText.setStored(false);
  fieldText.setStoreTermVectorOffsets(true);
  fieldText.setStoreTermVectorPositions(true);
  fieldText.setStoreTermVectors(true);
  // Walk every page of the file, storing the page number and the file title
  // and indexing the page content (PDFTextStripper pages are 1-based)
  for (int i = 1; i <= numPages; i++) {
    textStripper.setStartPage(i);
    textStripper.setEndPage(i); // extract a single page
    pageContent = textStripper.getText(pddDocument);
    if (pageContent != null && !pageContent.isEmpty()) {
      pageContent = pageContent.toLowerCase();
    }
    if (pageContent != null) {
      // Build the document for this page (the field names below are illustrative assumptions)
      Document doc = new Document();
      doc.add(new StoredField("pageNumber", i));              // page number
      doc.add(new Field("content", pageContent, fieldText));  // page content
      doc.add(new StoredField("title", f.getName()));         // file title
      // Add the document to the index
      writer.addDocument(doc);
    }
  }
  // Close the PDF file
  pddDocument.close();
}
public void index(Item item) throws IOException { String id = item.getId(); String text = item.getText(); long publicationTIme = item.getPublicationTime(); Document document = new Document(); Field idField = new StringField("id", id, Store.YES); document.add(idField); FieldType fieldType = new FieldType(); fieldType.setStored(true); fieldType.setIndexed(true); fieldType.setStoreTermVectors(true); document.add(new Field("text", text, fieldType)); document.add(new LongField("publicationTIme", publicationTIme, LongField.TYPE_STORED)); if (iwriter != null) { iwriter.addDocument(document); } }
/** * Construct Indexer * * @param directory the main BlackLab index directory * @param create if true, creates a new index; otherwise, appends to existing index * @param docIndexerClass how to index the files, or null to autodetect * @param indexTemplateFile JSON file to use as template for index structure / metadata (if * creating new index) * @throws DocumentFormatException if no DocIndexer was specified and autodetection failed * @throws IOException */ public Indexer( File directory, boolean create, Class<? extends DocIndexer> docIndexerClass, File indexTemplateFile) throws DocumentFormatException, IOException { this.docIndexerClass = docIndexerClass; searcher = Searcher.openForWriting(directory, create, indexTemplateFile); if (!create) searcher.getIndexStructure().setModified(); if (this.docIndexerClass == null) { // No DocIndexer supplied; try to detect it from the index // metadata. String formatId = searcher.getIndexStructure().getDocumentFormat(); if (formatId != null && formatId.length() > 0) setDocIndexer(DocumentFormats.getIndexerClass(formatId)); else { throw new DocumentFormatException("Cannot detect document format for index!"); } } metadataFieldTypeTokenized = new FieldType(); metadataFieldTypeTokenized.setStored(true); metadataFieldTypeTokenized.setIndexed(true); metadataFieldTypeTokenized.setTokenized(true); metadataFieldTypeTokenized.setOmitNorms(true); // @@@ <-- depending on setting? metadataFieldTypeTokenized.setStoreTermVectors(true); metadataFieldTypeTokenized.setStoreTermVectorPositions(true); metadataFieldTypeTokenized.setStoreTermVectorOffsets(true); metadataFieldTypeTokenized.freeze(); metadataFieldTypeUntokenized = new FieldType(metadataFieldTypeTokenized); metadataFieldTypeUntokenized.setTokenized(false); metadataFieldTypeUntokenized.freeze(); }
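// Illustrative use of the two metadata field types initialized above; a sketch with assumed
// field names and values, added to a Lucene Document elsewhere in the indexing code.
Document doc = new Document();
doc.add(new Field("title", "A Midsummer Night's Dream", metadataFieldTypeTokenized));  // analyzed metadata
doc.add(new Field("docId", "shakespeare-0012", metadataFieldTypeUntokenized));         // single untokenized term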
private boolean isAHit(Query q, String content, Analyzer analyzer) throws IOException {
  Directory ramDir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), ramDir, analyzer);
  Document doc = new Document();
  FieldType fieldType = new FieldType();
  fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
  fieldType.setTokenized(true);
  fieldType.setStored(true);
  Field field = new Field(FIELD, content, fieldType);
  doc.add(field);
  writer.addDocument(doc);
  writer.close();
  DirectoryReader ir = DirectoryReader.open(ramDir);
  IndexSearcher is = new IndexSearcher(ir);
  int hits = is.search(q, 10).totalHits;
  ir.close();
  ramDir.close();
  return hits == 1;
}
// Tests some very basic usages... public void testBasic() throws Exception { final String groupField = "author"; FieldType customType = new FieldType(); customType.setStored(true); Directory dir = newDirectory(); RandomIndexWriter w = new RandomIndexWriter( random(), dir, newIndexWriterConfig(new MockAnalyzer(random())).setMergePolicy(newLogMergePolicy())); boolean canUseIDV = true; List<Document> documents = new ArrayList<>(); // 0 Document doc = new Document(); addGroupField(doc, groupField, "author1", canUseIDV); doc.add(new TextField("content", "random text", Field.Store.YES)); doc.add(new Field("id", "1", customType)); documents.add(doc); // 1 doc = new Document(); addGroupField(doc, groupField, "author1", canUseIDV); doc.add(new TextField("content", "some more random text", Field.Store.YES)); doc.add(new Field("id", "2", customType)); documents.add(doc); // 2 doc = new Document(); addGroupField(doc, groupField, "author1", canUseIDV); doc.add(new TextField("content", "some more random textual data", Field.Store.YES)); doc.add(new Field("id", "3", customType)); doc.add(new StringField("groupend", "x", Field.Store.NO)); documents.add(doc); w.addDocuments(documents); documents.clear(); // 3 doc = new Document(); addGroupField(doc, groupField, "author2", canUseIDV); doc.add(new TextField("content", "some random text", Field.Store.YES)); doc.add(new Field("id", "4", customType)); doc.add(new StringField("groupend", "x", Field.Store.NO)); w.addDocument(doc); // 4 doc = new Document(); addGroupField(doc, groupField, "author3", canUseIDV); doc.add(new TextField("content", "some more random text", Field.Store.YES)); doc.add(new Field("id", "5", customType)); documents.add(doc); // 5 doc = new Document(); addGroupField(doc, groupField, "author3", canUseIDV); doc.add(new TextField("content", "random", Field.Store.YES)); doc.add(new Field("id", "6", customType)); doc.add(new StringField("groupend", "x", Field.Store.NO)); documents.add(doc); w.addDocuments(documents); documents.clear(); // 6 -- no author field doc = new Document(); doc.add(new TextField("content", "random word stuck in alot of other text", Field.Store.YES)); doc.add(new Field("id", "6", customType)); doc.add(new StringField("groupend", "x", Field.Store.NO)); w.addDocument(doc); IndexSearcher indexSearcher = newSearcher(w.getReader()); w.close(); Sort groupSort = Sort.RELEVANCE; GroupingSearch groupingSearch = createRandomGroupingSearch(groupField, groupSort, 5, canUseIDV); TopGroups<?> groups = groupingSearch.search(indexSearcher, new TermQuery(new Term("content", "random")), 0, 10); assertEquals(7, groups.totalHitCount); assertEquals(7, groups.totalGroupedHitCount); assertEquals(4, groups.groups.length); // relevance order: 5, 0, 3, 4, 1, 2, 6 // the later a document is added the higher this docId // value GroupDocs<?> group = groups.groups[0]; compareGroupValue("author3", group); assertEquals(2, group.scoreDocs.length); assertEquals(5, group.scoreDocs[0].doc); assertEquals(4, group.scoreDocs[1].doc); assertTrue(group.scoreDocs[0].score > group.scoreDocs[1].score); group = groups.groups[1]; compareGroupValue("author1", group); assertEquals(3, group.scoreDocs.length); assertEquals(0, group.scoreDocs[0].doc); assertEquals(1, group.scoreDocs[1].doc); assertEquals(2, group.scoreDocs[2].doc); assertTrue(group.scoreDocs[0].score > group.scoreDocs[1].score); assertTrue(group.scoreDocs[1].score > group.scoreDocs[2].score); group = groups.groups[2]; compareGroupValue("author2", group); assertEquals(1, group.scoreDocs.length); 
assertEquals(3, group.scoreDocs[0].doc); group = groups.groups[3]; compareGroupValue(null, group); assertEquals(1, group.scoreDocs.length); assertEquals(6, group.scoreDocs[0].doc); Filter lastDocInBlock = new QueryWrapperFilter(new TermQuery(new Term("groupend", "x"))); groupingSearch = new GroupingSearch(lastDocInBlock); groups = groupingSearch.search(indexSearcher, new TermQuery(new Term("content", "random")), 0, 10); assertEquals(7, groups.totalHitCount); assertEquals(7, groups.totalGroupedHitCount); assertEquals(4, groups.totalGroupCount.longValue()); assertEquals(4, groups.groups.length); indexSearcher.getIndexReader().close(); dir.close(); }
static { FIELD_TYPE.setIndexOptions(IndexOptions.NONE); FIELD_TYPE.setStored(false); FIELD_TYPE.setOmitNorms(true); FIELD_TYPE.freeze(); }
public void testTotalGroupCount() throws Exception { final String groupField = "author"; FieldType customType = new FieldType(); customType.setStored(true); Directory dir = newDirectory(); RandomIndexWriter w = new RandomIndexWriter( random(), dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())) .setMergePolicy(newLogMergePolicy())); boolean canUseIDV = !"Lucene3x".equals(w.w.getConfig().getCodec().getName()); // 0 Document doc = new Document(); addGroupField(doc, groupField, "author1", canUseIDV); doc.add(new Field("content", "random text", TextField.TYPE_STORED)); doc.add(new Field("id", "1", customType)); w.addDocument(doc); // 1 doc = new Document(); addGroupField(doc, groupField, "author1", canUseIDV); doc.add(new Field("content", "some more random text blob", TextField.TYPE_STORED)); doc.add(new Field("id", "2", customType)); w.addDocument(doc); // 2 doc = new Document(); addGroupField(doc, groupField, "author1", canUseIDV); doc.add(new Field("content", "some more random textual data", TextField.TYPE_STORED)); doc.add(new Field("id", "3", customType)); w.addDocument(doc); w.commit(); // To ensure a second segment // 3 doc = new Document(); addGroupField(doc, groupField, "author2", canUseIDV); doc.add(new Field("content", "some random text", TextField.TYPE_STORED)); doc.add(new Field("id", "4", customType)); w.addDocument(doc); // 4 doc = new Document(); addGroupField(doc, groupField, "author3", canUseIDV); doc.add(new Field("content", "some more random text", TextField.TYPE_STORED)); doc.add(new Field("id", "5", customType)); w.addDocument(doc); // 5 doc = new Document(); addGroupField(doc, groupField, "author3", canUseIDV); doc.add(new Field("content", "random blob", TextField.TYPE_STORED)); doc.add(new Field("id", "6", customType)); w.addDocument(doc); // 6 -- no author field doc = new Document(); doc.add(new Field("content", "random word stuck in alot of other text", TextField.TYPE_STORED)); doc.add(new Field("id", "6", customType)); w.addDocument(doc); IndexSearcher indexSearcher = new IndexSearcher(w.getReader()); w.close(); AbstractAllGroupsCollector<?> allGroupsCollector = createRandomCollector(groupField, canUseIDV); indexSearcher.search(new TermQuery(new Term("content", "random")), allGroupsCollector); assertEquals(4, allGroupsCollector.getGroupCount()); allGroupsCollector = createRandomCollector(groupField, canUseIDV); indexSearcher.search(new TermQuery(new Term("content", "some")), allGroupsCollector); assertEquals(3, allGroupsCollector.getGroupCount()); allGroupsCollector = createRandomCollector(groupField, canUseIDV); indexSearcher.search(new TermQuery(new Term("content", "blob")), allGroupsCollector); assertEquals(2, allGroupsCollector.getGroupCount()); indexSearcher.getIndexReader().close(); dir.close(); }
/** * Method to create Lucene Index Keep in mind that always index text value to Lucene for * calculating Cosine Similarity. You have to generate tokens, terms and their frequencies and * store them in the Lucene Index. * * @throws CorruptIndexException * @throws LockObtainFailedException * @throws IOException */ public void index() throws CorruptIndexException, LockObtainFailedException, IOException { System.out.println(">>> Process source directory : " + sourceDirectory.getAbsolutePath()); System.out.println(">>> Create index in directory : " + indexDirectory.getAbsolutePath()); Directory dir = FSDirectory.open(indexDirectory); Analyzer analyzer = new StandardAnalyzer(StandardAnalyzer.STOP_WORDS_SET); // using stop words // Analyzer analyzer = new StandardAnalyzer(CharArraySet.EMPTY_SET); // using stop words // System.out.println(">>> Overwrite the Analyser: DO NOT USE STOPWORDS FILTER !!!"); /** * Source: http://toolongdidntread.com/lucene/using-lucene-to-generate-n-grams/ * * <p>1) Run a document through an Analyzer which filters out the stuff we don’t care about. * SimpleAnalyzer, in this case, applies a lower case filter and a letter tokenizer, which makes * all text lowercase and divides text at non-letters, respectively. * * <p>2) Wrap this analyzer with ShingleAnalyzerWrapper which constructs shingles (token * n-grams) from a stream. This is the main thing we want to accomplish. * * <p>3) We generate a TokenStream which enumerates (a fancy word for establishes) “fields” from * a “document” (what I mentioned earlier). * * <p>4) Given a token stream, we want to extract certain things from it, like just the * characters and not all the other stuff that comes along with the stream. We’ll use * CharTermAttribute which extract just the words from the stream. * * <p>5) Finally, we iterate over the stream by incrementing the tokens, extracting each * CharTermAttribute from the tokens. 
*/ if (useNgramsAsTerms) { analyzer = getNGramAnalyser(analyzer, N); } IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_10_2, analyzer); if (indexDirectory.exists()) { iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); } else { // Add new documents to an existing index: iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); } IndexWriter writer = new IndexWriter(dir, iwc); for (File f : sourceDirectory.listFiles()) { System.out.println("> DOC : " + f.getAbsolutePath()); if (f.isDirectory()) { System.out.println(">>> Indexer processes FILE : " + f.getAbsolutePath()); for (File fileTXT : f.listFiles()) { String at = getAllText(fileTXT); System.out.println("> file : " + fileTXT.getAbsolutePath()); Document doc = new Document(); FieldType fieldType = new FieldType(); fieldType.setIndexed(true); fieldType.setIndexOptions( FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); fieldType.setStored(true); fieldType.setStoreTermVectors(true); fieldType.setTokenized(true); Field contentField = new Field(fieldName, at, fieldType); doc.add(contentField); FieldType fieldType2 = new FieldType(); fieldType2.setIndexed(true); fieldType2.setStored(true); fieldType2.setStoreTermVectors(false); fieldType2.setTokenized(false); Field idField = new Field("id", fileTXT.getAbsolutePath(), fieldType2); doc.add(idField); FieldType fieldType3 = new FieldType(); fieldType3.setIndexed(false); fieldType3.setStored(true); fieldType3.setStoreTermVectors(false); fieldType3.setTokenized(false); Field rawField = new Field("raw", at, fieldType3); doc.add(rawField); writer.addDocument(doc); } } else { if (!f.getName().startsWith(".DS_Store")) { String at = getAllText(f); Document doc = new Document(); FieldType fieldType = new FieldType(); fieldType.setIndexed(true); fieldType.setIndexOptions( FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); fieldType.setStored(true); fieldType.setStoreTermVectors(true); fieldType.setTokenized(true); Field contentField = new Field(fieldName, at, fieldType); doc.add(contentField); FieldType fieldType2 = new FieldType(); fieldType2.setIndexed(true); fieldType2.setStored(true); fieldType2.setStoreTermVectors(false); fieldType2.setTokenized(false); Field idField = new Field("id", f.getAbsolutePath(), fieldType2); doc.add(idField); FieldType fieldType3 = new FieldType(); fieldType3.setIndexed(false); fieldType3.setStored(true); fieldType3.setStoreTermVectors(false); fieldType3.setTokenized(false); Field rawField = new Field("raw", at, fieldType3); doc.add(rawField); writer.addDocument(doc); } } } writer.close(); }
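// A minimal sketch of the five steps quoted above (shingle size, field name, and sample text are
// assumptions): wrap a base analyzer in ShingleAnalyzerWrapper and iterate the resulting terms.
// Assumes it runs inside a method that declares IOException.
Analyzer base = new StandardAnalyzer(CharArraySet.EMPTY_SET);        // 1) base analysis, no stop words
Analyzer shingles = new ShingleAnalyzerWrapper(base, 2);             // 2) emit token bigrams (shingles)
try (TokenStream ts = shingles.tokenStream("contents", "the quick brown fox")) { // 3) field token stream
  CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); // 4) only the term text
  ts.reset();
  while (ts.incrementToken()) {                                      // 5) walk the tokens/shingles
    System.out.println(term.toString());
  }
  ts.end();
}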
@Nightly public void test() throws Exception { MockDirectoryWrapper dir = new MockDirectoryWrapper(random(), new MMapDirectory(createTempDir("4GBStoredFields"))); dir.setThrottling(MockDirectoryWrapper.Throttling.NEVER); IndexWriter w = new IndexWriter( dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())) .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH) .setRAMBufferSizeMB(256.0) .setMergeScheduler(new ConcurrentMergeScheduler()) .setMergePolicy(newLogMergePolicy(false, 10)) .setOpenMode(IndexWriterConfig.OpenMode.CREATE)); MergePolicy mp = w.getConfig().getMergePolicy(); if (mp instanceof LogByteSizeMergePolicy) { // 1 petabyte: ((LogByteSizeMergePolicy) mp).setMaxMergeMB(1024 * 1024 * 1024); } final Document doc = new Document(); final FieldType ft = new FieldType(); ft.setIndexed(false); ft.setStored(true); ft.freeze(); final int valueLength = RandomInts.randomIntBetween(random(), 1 << 13, 1 << 20); final byte[] value = new byte[valueLength]; for (int i = 0; i < valueLength; ++i) { // random so that even compressing codecs can't compress it value[i] = (byte) random().nextInt(256); } final Field f = new Field("fld", value, ft); doc.add(f); final int numDocs = (int) ((1L << 32) / valueLength + 100); for (int i = 0; i < numDocs; ++i) { w.addDocument(doc); if (VERBOSE && i % (numDocs / 10) == 0) { System.out.println(i + " of " + numDocs + "..."); } } w.forceMerge(1); w.close(); if (VERBOSE) { boolean found = false; for (String file : dir.listAll()) { if (file.endsWith(".fdt")) { final long fileLength = dir.fileLength(file); if (fileLength >= 1L << 32) { found = true; } System.out.println("File length of " + file + " : " + fileLength); } } if (!found) { System.out.println("No .fdt file larger than 4GB, test bug?"); } } DirectoryReader rd = DirectoryReader.open(dir); Document sd = rd.document(numDocs - 1); assertNotNull(sd); assertEquals(1, sd.getFields().size()); BytesRef valueRef = sd.getBinaryValue("fld"); assertNotNull(valueRef); assertEquals(new BytesRef(value), valueRef); rd.close(); dir.close(); }
@BeforeClass public static void beforeClass() throws Exception { noDocs = atLeast(4096); distance = (1L << 60) / noDocs; directory = newDirectory(); RandomIndexWriter writer = new RandomIndexWriter( random(), directory, newIndexWriterConfig(new MockAnalyzer(random())) .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000)) .setMergePolicy(newLogMergePolicy())); final FieldType storedLong = new FieldType(LegacyLongField.TYPE_NOT_STORED); storedLong.setStored(true); storedLong.freeze(); final FieldType storedLong8 = new FieldType(storedLong); storedLong8.setNumericPrecisionStep(8); final FieldType storedLong4 = new FieldType(storedLong); storedLong4.setNumericPrecisionStep(4); final FieldType storedLong6 = new FieldType(storedLong); storedLong6.setNumericPrecisionStep(6); final FieldType storedLong2 = new FieldType(storedLong); storedLong2.setNumericPrecisionStep(2); LegacyLongField field8 = new LegacyLongField("field8", 0L, storedLong8), field6 = new LegacyLongField("field6", 0L, storedLong6), field4 = new LegacyLongField("field4", 0L, storedLong4), field2 = new LegacyLongField("field2", 0L, storedLong2); Document doc = new Document(); // add fields, that have a distance to test general functionality doc.add(field8); doc.add(field6); doc.add(field4); doc.add(field2); // Add a series of noDocs docs with increasing long values, by updating the fields for (int l = 0; l < noDocs; l++) { long val = distance * l + startOffset; field8.setLongValue(val); field6.setLongValue(val); field4.setLongValue(val); field2.setLongValue(val); val = l - (noDocs / 2); writer.addDocument(doc); } Map<String, Type> map = new HashMap<>(); map.put("field2", Type.LEGACY_LONG); map.put("field4", Type.LEGACY_LONG); map.put("field6", Type.LEGACY_LONG); map.put("field8", Type.LEGACY_LONG); reader = UninvertingReader.wrap(writer.getReader(), map); searcher = newSearcher(reader); writer.close(); }
@BeforeClass public static void beforeClass() throws Exception { noDocs = atLeast(4096); distance = (1L << 60) / noDocs; directory = newDirectory(); RandomIndexWriter writer = new RandomIndexWriter( random(), directory, newIndexWriterConfig(new MockAnalyzer(random())) .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000)) .setMergePolicy(newLogMergePolicy())); final FieldType storedLong = new FieldType(LegacyLongField.TYPE_NOT_STORED); storedLong.setStored(true); storedLong.freeze(); final FieldType storedLong8 = new FieldType(storedLong); storedLong8.setNumericPrecisionStep(8); final FieldType storedLong4 = new FieldType(storedLong); storedLong4.setNumericPrecisionStep(4); final FieldType storedLong6 = new FieldType(storedLong); storedLong6.setNumericPrecisionStep(6); final FieldType storedLong2 = new FieldType(storedLong); storedLong2.setNumericPrecisionStep(2); final FieldType storedLongNone = new FieldType(storedLong); storedLongNone.setNumericPrecisionStep(Integer.MAX_VALUE); final FieldType unstoredLong = LegacyLongField.TYPE_NOT_STORED; final FieldType unstoredLong8 = new FieldType(unstoredLong); unstoredLong8.setNumericPrecisionStep(8); final FieldType unstoredLong6 = new FieldType(unstoredLong); unstoredLong6.setNumericPrecisionStep(6); final FieldType unstoredLong4 = new FieldType(unstoredLong); unstoredLong4.setNumericPrecisionStep(4); final FieldType unstoredLong2 = new FieldType(unstoredLong); unstoredLong2.setNumericPrecisionStep(2); LegacyLongField field8 = new LegacyLongField("field8", 0L, storedLong8), field6 = new LegacyLongField("field6", 0L, storedLong6), field4 = new LegacyLongField("field4", 0L, storedLong4), field2 = new LegacyLongField("field2", 0L, storedLong2), fieldNoTrie = new LegacyLongField("field" + Integer.MAX_VALUE, 0L, storedLongNone), ascfield8 = new LegacyLongField("ascfield8", 0L, unstoredLong8), ascfield6 = new LegacyLongField("ascfield6", 0L, unstoredLong6), ascfield4 = new LegacyLongField("ascfield4", 0L, unstoredLong4), ascfield2 = new LegacyLongField("ascfield2", 0L, unstoredLong2); Document doc = new Document(); // add fields, that have a distance to test general functionality doc.add(field8); doc.add(field6); doc.add(field4); doc.add(field2); doc.add(fieldNoTrie); // add ascending fields with a distance of 1, beginning at -noDocs/2 to test the correct // splitting of range and inclusive/exclusive doc.add(ascfield8); doc.add(ascfield6); doc.add(ascfield4); doc.add(ascfield2); // Add a series of noDocs docs with increasing long values, by updating the fields for (int l = 0; l < noDocs; l++) { long val = distance * l + startOffset; field8.setLongValue(val); field6.setLongValue(val); field4.setLongValue(val); field2.setLongValue(val); fieldNoTrie.setLongValue(val); val = l - (noDocs / 2); ascfield8.setLongValue(val); ascfield6.setLongValue(val); ascfield4.setLongValue(val); ascfield2.setLongValue(val); writer.addDocument(doc); } reader = writer.getReader(); searcher = newSearcher(reader); writer.close(); }
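// Sketch of a range query over one of the trie-encoded fields built above; the bounds are example
// values derived from the test's "distance", and the precision step (8) matches "field8".
Query q = LegacyNumericRangeQuery.newLongRange("field8", 8, 10L * distance, 20L * distance, true, true);
TopDocs hits = searcher.search(q, noDocs);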