public static Document createDocument(int n, String indexName, int numFields) {
  StringBuilder sb = new StringBuilder();
  FieldType customType = new FieldType(TextField.TYPE_STORED);
  customType.setStoreTermVectors(true);
  customType.setStoreTermVectorPositions(true);
  customType.setStoreTermVectorOffsets(true);

  FieldType customType1 = new FieldType(StringField.TYPE_STORED);
  customType1.setStoreTermVectors(true);
  customType1.setStoreTermVectorPositions(true);
  customType1.setStoreTermVectorOffsets(true);

  final Document doc = new Document();
  doc.add(new Field("id", Integer.toString(n), customType1));
  doc.add(new Field("indexname", indexName, customType1));
  sb.append("a");
  sb.append(n);
  doc.add(new Field("field1", sb.toString(), customType));
  sb.append(" b");
  sb.append(n);
  for (int i = 1; i < numFields; i++) {
    doc.add(new Field("field" + (i + 1), sb.toString(), customType));
  }
  return doc;
}
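// Hypothetical usage sketch (not part of the original; "dir" and the counts are
// illustrative assumptions): every generated document carries "id", "indexname",
// and numFields text fields, all with term vectors, positions, and offsets.
IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));
for (int i = 0; i < 10; i++) {
  w.addDocument(createDocument(i, "index1", 4));
}
w.close();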
/**
 * Translates the pre-4.0 enums for specifying how a field should be indexed into the 4.0 {@link
 * FieldType} approach.
 *
 * @deprecated This is here only to ease transition from the pre-4.0 APIs.
 */
@Deprecated
public static final FieldType translateFieldType(Store store, Index index, TermVector termVector) {
  final FieldType ft = new FieldType();
  ft.setStored(store == Store.YES);

  switch (index) {
    case ANALYZED:
      ft.setIndexed(true);
      ft.setTokenized(true);
      break;
    case ANALYZED_NO_NORMS:
      ft.setIndexed(true);
      ft.setTokenized(true);
      ft.setOmitNorms(true);
      break;
    case NOT_ANALYZED:
      ft.setIndexed(true);
      ft.setTokenized(false);
      break;
    case NOT_ANALYZED_NO_NORMS:
      ft.setIndexed(true);
      ft.setTokenized(false);
      ft.setOmitNorms(true);
      break;
    case NO:
      break;
  }

  switch (termVector) {
    case NO:
      break;
    case YES:
      ft.setStoreTermVectors(true);
      break;
    case WITH_POSITIONS:
      ft.setStoreTermVectors(true);
      ft.setStoreTermVectorPositions(true);
      break;
    case WITH_OFFSETS:
      ft.setStoreTermVectors(true);
      ft.setStoreTermVectorOffsets(true);
      break;
    case WITH_POSITIONS_OFFSETS:
      ft.setStoreTermVectors(true);
      ft.setStoreTermVectorPositions(true);
      ft.setStoreTermVectorOffsets(true);
      break;
  }
  ft.freeze();
  return ft;
}
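// Hedged example of the mapping above (not in the original): the common pre-4.0
// triple (Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS) becomes a
// stored, indexed, tokenized FieldType with full term vectors. The returned type
// is frozen, so any later setter call would throw IllegalStateException.
FieldType legacyEquivalent =
    translateFieldType(Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS);
Field body = new Field("body", "some analyzed, stored text", legacyEquivalent);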
private void indexDocument(IndexWriter iw, ProcessedDocument parsedDoc) throws IOException {
  org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();

  FieldType customType = new FieldType(TextField.TYPE_STORED);
  customType.setStoreTermVectors(true);
  customType.setStoreTermVectorPositions(true);
  customType.setStoreTermVectorOffsets(false);

  doc.add(new Field(INDEX_FIELD_CONTENT, parsedDoc.getText(), customType));
  doc.add(new StringField(INDEX_FIELD_URL, parsedDoc.getDocumentURL(), Field.Store.YES));
  doc.add(new StringField(INDEX_FIELD_DOC_ID, parsedDoc.getDocumentId(), Field.Store.YES));
  doc.add(new TextField(INDEX_FIELD_TITLE, parsedDoc.getDocumentTitle(), Field.Store.YES));
  doc.add(new StringField(INDEX_FIELD_DOC_TYPE, parsedDoc.getDocumentType(), Field.Store.YES));

  /*
   * TODO: 2.2 -- The effect of boosting (Book Section 2.1.2)
   *
   * Uncomment the lines below to demonstrate the effect of boosting
   */
  // if (parsedDoc.getDocumentId().equals("g1-d13")) {
  //   doc.setBoost(2);
  // }
  iw.addDocument(doc);
}
// LUCENE-325: test forceMergeDeletes without waiting, when
// many adjacent merges are required
public void testForceMergeDeletes3() throws IOException {
  Directory dir = newDirectory();
  IndexWriter writer =
      new IndexWriter(
          dir,
          newIndexWriterConfig(new MockAnalyzer(random()))
              .setMaxBufferedDocs(2)
              .setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH)
              .setMergePolicy(newLogMergePolicy(50)));

  FieldType customType = new FieldType();
  customType.setStored(true);

  FieldType customType1 = new FieldType(TextField.TYPE_NOT_STORED);
  customType1.setTokenized(false);
  customType1.setStoreTermVectors(true);
  customType1.setStoreTermVectorPositions(true);
  customType1.setStoreTermVectorOffsets(true);

  Document document = new Document();
  Field storedField = newField("stored", "stored", customType);
  document.add(storedField);
  Field termVectorField = newField("termVector", "termVector", customType1);
  document.add(termVectorField);
  Field idField = newStringField("id", "", Field.Store.NO);
  document.add(idField);
  for (int i = 0; i < 98; i++) {
    idField.setStringValue("" + i);
    writer.addDocument(document);
  }
  writer.close();

  IndexReader ir = DirectoryReader.open(dir);
  assertEquals(98, ir.maxDoc());
  assertEquals(98, ir.numDocs());
  ir.close();

  IndexWriterConfig dontMergeConfig =
      new IndexWriterConfig(new MockAnalyzer(random())).setMergePolicy(NoMergePolicy.INSTANCE);
  writer = new IndexWriter(dir, dontMergeConfig);
  for (int i = 0; i < 98; i += 2) {
    writer.deleteDocuments(new Term("id", "" + i));
  }
  writer.close();

  ir = DirectoryReader.open(dir);
  assertEquals(49, ir.numDocs());
  ir.close();

  writer =
      new IndexWriter(
          dir,
          newIndexWriterConfig(new MockAnalyzer(random())).setMergePolicy(newLogMergePolicy(3)));
  writer.forceMergeDeletes(false);
  writer.close();

  ir = DirectoryReader.open(dir);
  assertEquals(49, ir.maxDoc());
  assertEquals(49, ir.numDocs());
  ir.close();
  dir.close();
}
/**
 * Used for adding a document when a field needs to be created from a type and a string.
 *
 * <p>By default, the indexed value is the same as the stored value (taken from toInternal()).
 * Having a different representation for external, internal, and indexed would present quite a few
 * problems given the current Lucene architecture. An analyzer for adding docs would need to
 * translate internal->indexed while an analyzer for querying would need to translate
 * external->indexed.
 *
 * <p>The only other alternative to having internal==indexed would be to have internal==external.
 * In this case, toInternal should convert to the indexed representation, toExternal() should do
 * nothing, and createField() should *not* call toInternal, but use the external value and set
 * tokenized=true to get Lucene to convert to the internal (indexed) form. :TODO: clean up and
 * clarify this explanation.
 *
 * @see #toInternal
 */
public StorableField createField(SchemaField field, Object value, float boost) {
  if (!field.indexed() && !field.stored()) {
    if (log.isTraceEnabled()) log.trace("Ignoring unindexed/unstored field: " + field);
    return null;
  }

  String val;
  try {
    val = toInternal(value.toString());
  } catch (RuntimeException e) {
    throw new SolrException(
        SolrException.ErrorCode.SERVER_ERROR,
        "Error while creating field '" + field + "' from value '" + value + "'",
        e);
  }
  if (val == null) return null;

  org.apache.lucene.document.FieldType newType = new org.apache.lucene.document.FieldType();
  newType.setTokenized(field.isTokenized());
  newType.setStored(field.stored());
  newType.setOmitNorms(field.omitNorms());
  newType.setIndexOptions(field.indexed() ? getIndexOptions(field, val) : IndexOptions.NONE);
  newType.setStoreTermVectors(field.storeTermVector());
  newType.setStoreTermVectorOffsets(field.storeTermOffsets());
  newType.setStoreTermVectorPositions(field.storeTermPositions());
  newType.setStoreTermVectorPayloads(field.storeTermPayloads());

  return createField(field.getName(), val, newType, boost);
}
static {
  customType2 = new FieldType(TextField.TYPE_STORED);
  customType2.setStoreTermVectors(true);
  customType2.setStoreTermVectorPositions(true);
  customType2.setStoreTermVectorOffsets(true);
  textField2 = new Field(TEXT_FIELD_2_KEY, FIELD_2_TEXT, customType2);
}
public void testChangeGaps() throws Exception {
  // LUCENE-5324: check that it is possible to change the wrapper's gaps
  final int positionGap = random().nextInt(1000);
  final int offsetGap = random().nextInt(1000);
  final Analyzer delegate = new MockAnalyzer(random());
  final Analyzer a =
      new DelegatingAnalyzerWrapper(delegate.getReuseStrategy()) {
        @Override
        protected Analyzer getWrappedAnalyzer(String fieldName) {
          return delegate;
        }

        @Override
        public int getPositionIncrementGap(String fieldName) {
          return positionGap;
        }

        @Override
        public int getOffsetGap(String fieldName) {
          return offsetGap;
        }
      };

  final RandomIndexWriter writer = new RandomIndexWriter(random(), newDirectory(), a);
  final Document doc = new Document();
  final FieldType ft = new FieldType();
  ft.setIndexOptions(IndexOptions.DOCS);
  ft.setTokenized(true);
  ft.setStoreTermVectors(true);
  ft.setStoreTermVectorPositions(true);
  ft.setStoreTermVectorOffsets(true);
  doc.add(new Field("f", "a", ft));
  doc.add(new Field("f", "a", ft));
  writer.addDocument(doc);

  final LeafReader reader = getOnlySegmentReader(writer.getReader());
  final Fields fields = reader.getTermVectors(0);
  final Terms terms = fields.terms("f");
  final TermsEnum te = terms.iterator();
  assertEquals(new BytesRef("a"), te.next());
  final PostingsEnum dpe = te.postings(null, PostingsEnum.ALL);
  assertEquals(0, dpe.nextDoc());
  assertEquals(2, dpe.freq());
  assertEquals(0, dpe.nextPosition());
  assertEquals(0, dpe.startOffset());
  final int endOffset = dpe.endOffset();
  assertEquals(1 + positionGap, dpe.nextPosition());
  assertEquals(1 + endOffset + offsetGap, dpe.endOffset());
  assertEquals(null, te.next());
  reader.close();
  writer.close();
  writer.w.getDirectory().close();
}
protected DirectoryReader indexDocsWithLucene(TestDoc[] testDocs) throws IOException {
  Map<String, Analyzer> mapping = new HashMap<String, Analyzer>();
  for (TestFieldSetting field : testDocs[0].fieldSettings) {
    if (field.storedPayloads) {
      mapping.put(
          field.name,
          new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
              Tokenizer tokenizer = new StandardTokenizer(Version.CURRENT.luceneVersion, reader);
              TokenFilter filter = new LowerCaseFilter(Version.CURRENT.luceneVersion, tokenizer);
              filter = new TypeAsPayloadTokenFilter(filter);
              return new TokenStreamComponents(tokenizer, filter);
            }
          });
    }
  }
  PerFieldAnalyzerWrapper wrapper =
      new PerFieldAnalyzerWrapper(
          new StandardAnalyzer(Version.CURRENT.luceneVersion, CharArraySet.EMPTY_SET), mapping);

  Directory dir = new RAMDirectory();
  IndexWriterConfig conf = new IndexWriterConfig(Version.CURRENT.luceneVersion, wrapper);
  conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
  IndexWriter writer = new IndexWriter(dir, conf);

  for (TestDoc doc : testDocs) {
    Document d = new Document();
    d.add(new Field("id", doc.id, StringField.TYPE_STORED));
    for (int i = 0; i < doc.fieldContent.length; i++) {
      FieldType type = new FieldType(TextField.TYPE_STORED);
      TestFieldSetting fieldSetting = doc.fieldSettings[i];
      type.setStoreTermVectorOffsets(fieldSetting.storedOffset);
      type.setStoreTermVectorPayloads(fieldSetting.storedPayloads);
      type.setStoreTermVectorPositions(
          fieldSetting.storedPositions || fieldSetting.storedPayloads || fieldSetting.storedOffset);
      type.setStoreTermVectors(true);
      type.freeze();
      d.add(new Field(fieldSetting.name, doc.fieldContent[i], type));
    }
    writer.updateDocument(new Term("id", doc.id), d);
    writer.commit();
  }
  writer.close();

  return DirectoryReader.open(dir);
}
@Test
public void testLotsOfPhrases() throws IOException {
  Directory dir = newDirectory();
  IndexWriter writer =
      new IndexWriter(
          dir,
          newIndexWriterConfig(
              TEST_VERSION_CURRENT,
              new MockAnalyzer(
                  random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)));
  FieldType type = new FieldType(TextField.TYPE_STORED);
  type.setStoreTermVectorOffsets(true);
  type.setStoreTermVectorPositions(true);
  type.setStoreTermVectors(true);
  type.freeze();
  String[] terms = {"org", "apache", "lucene"};
  int iters = atLeast(1000);
  StringBuilder builder = new StringBuilder();
  for (int i = 0; i < iters; i++) {
    builder.append(terms[random().nextInt(terms.length)]).append(" ");
    if (random().nextInt(6) == 3) {
      builder.append("elasticsearch").append(" ");
    }
  }
  Document doc = new Document();
  Field field = new Field("field", builder.toString(), type);
  doc.add(field);
  writer.addDocument(doc);

  PhraseQuery query = new PhraseQuery();
  query.add(new Term("field", "org"));
  query.add(new Term("field", "apache"));
  query.add(new Term("field", "lucene"));

  XFastVectorHighlighter highlighter = new XFastVectorHighlighter();
  IndexReader reader = DirectoryReader.open(writer, true);
  IndexSearcher searcher = newSearcher(reader);
  TopDocs hits = searcher.search(query, 10);
  assertEquals(1, hits.totalHits);
  XFieldQuery fieldQuery = highlighter.getFieldQuery(query, reader);
  String[] bestFragments =
      highlighter.getBestFragments(fieldQuery, reader, hits.scoreDocs[0].doc, "field", 1000, 1);
  for (int i = 0; i < bestFragments.length; i++) {
    String result = bestFragments[i].replaceAll("<b>org apache lucene</b>", "FOOBAR");
    assertFalse(result.contains("org apache lucene"));
  }
  reader.close();
  writer.close();
  dir.close();
}
// LUCENE-1270
public void testHangOnClose() throws IOException {
  Directory dir = newDirectory();
  LogByteSizeMergePolicy lmp = new LogByteSizeMergePolicy();
  lmp.setNoCFSRatio(0.0);
  lmp.setMergeFactor(100);
  IndexWriter writer =
      new IndexWriter(
          dir,
          newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))
              .setMaxBufferedDocs(5)
              .setMergePolicy(lmp));

  Document doc = new Document();
  FieldType customType = new FieldType(TextField.TYPE_STORED);
  customType.setStoreTermVectors(true);
  customType.setStoreTermVectorPositions(true);
  customType.setStoreTermVectorOffsets(true);
  doc.add(newField("content", "aaa bbb ccc ddd eee fff ggg hhh iii", customType));
  for (int i = 0; i < 60; i++) {
    writer.addDocument(doc);
  }

  Document doc2 = new Document();
  FieldType customType2 = new FieldType();
  customType2.setStored(true);
  doc2.add(newField("content", "aaa bbb ccc ddd eee fff ggg hhh iii", customType2));
  doc2.add(newField("content", "aaa bbb ccc ddd eee fff ggg hhh iii", customType2));
  doc2.add(newField("content", "aaa bbb ccc ddd eee fff ggg hhh iii", customType2));
  doc2.add(newField("content", "aaa bbb ccc ddd eee fff ggg hhh iii", customType2));
  for (int i = 0; i < 10; i++) {
    writer.addDocument(doc2);
  }
  writer.close();

  Directory dir2 = newDirectory();
  lmp = new LogByteSizeMergePolicy();
  lmp.setMinMergeMB(0.0001);
  lmp.setNoCFSRatio(0.0);
  lmp.setMergeFactor(4);
  writer =
      new IndexWriter(
          dir2,
          newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))
              .setMergeScheduler(new SerialMergeScheduler())
              .setMergePolicy(lmp));
  writer.addIndexes(dir);
  writer.close();
  dir.close();
  dir2.close();
}
// make 1 doc with multi valued field
protected void make1dmfIndex(Analyzer analyzer, String... values) throws Exception {
  IndexWriter writer =
      new IndexWriter(
          dir, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer).setOpenMode(OpenMode.CREATE));
  Document doc = new Document();
  FieldType customType = new FieldType(TextField.TYPE_STORED);
  customType.setStoreTermVectors(true);
  customType.setStoreTermVectorOffsets(true);
  customType.setStoreTermVectorPositions(true);
  for (String value : values) {
    doc.add(new Field(F, value, customType));
  }
  writer.addDocument(doc);
  writer.close();

  if (reader != null) reader.close();
  reader = DirectoryReader.open(dir);
}
protected void makeUnstoredIndex() throws Exception {
  IndexWriter writer =
      new IndexWriter(
          dir,
          new IndexWriterConfig(TEST_VERSION_CURRENT, analyzerW).setOpenMode(OpenMode.CREATE));
  Document doc = new Document();
  FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
  customType.setStoreTermVectors(true);
  customType.setStoreTermVectorOffsets(true);
  customType.setStoreTermVectorPositions(true);
  doc.add(new Field(F, "aaa", customType));
  // doc.add(new Field(F, "aaa", Store.NO, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
  writer.addDocument(doc);
  writer.close();

  if (reader != null) reader.close();
  reader = DirectoryReader.open(dir);
}
/**
 * Indexes a single PDF file.
 *
 * @param f the PDF file
 * @param writer the IndexWriter
 * @throws IOException
 */
public static void indexFile(File f, IndexWriter writer) throws IOException {
  // Load the file with PDFBox
  PDDocument pddDocument = PDDocument.load(f.getAbsolutePath());
  PDFTextStripper textStripper = new PDFTextStripper();
  int numPages = pddDocument.getNumberOfPages();
  String pageContent;

  // Declare a custom Field type: indexed with positions and offsets and with full
  // term vectors, but the raw content itself is not stored
  FieldType fieldText = new FieldType();
  fieldText.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  fieldText.setStored(false);
  fieldText.setStoreTermVectorOffsets(true);
  fieldText.setStoreTermVectorPositions(true);
  fieldText.setStoreTermVectors(true);

  // Walk and index each page of the file, storing the page number and the file
  // title, and indexing the content. PDFTextStripper pages are 1-based, so the loop
  // runs from 1 to numPages inclusive (the original 0-based loop with the i++ hack
  // skipped the last page).
  for (int i = 1; i <= numPages; i++) {
    textStripper.setStartPage(i);
    textStripper.setEndPage(i); // grab a single page
    pageContent = textStripper.getText(pddDocument);
    if (pageContent != null && !pageContent.isEmpty()) {
      pageContent = pageContent.toLowerCase();
    }
    if (pageContent != null) {
      // Build the document to index for this page. The original left this block as
      // placeholder comments; the field names below are illustrative assumptions.
      Document doc = new Document();
      doc.add(new StoredField("page", i)); // page number
      doc.add(new Field("content", pageContent, fieldText)); // page content
      doc.add(new StringField("title", f.getName(), Field.Store.YES)); // file title
      writer.addDocument(doc); // add the document
    }
  }
  // Close the PDF file
  pddDocument.close();
}
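// Hypothetical driver for indexFile (not part of the original; "pdfDir" and the
// already-configured "writer" are illustrative assumptions): index every PDF in a
// directory, then commit once at the end.
File pdfDir = new File("pdfs");
for (File f : pdfDir.listFiles()) {
  if (f.getName().toLowerCase().endsWith(".pdf")) {
    indexFile(f, writer);
  }
}
writer.commit();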
private void indexDocument(IndexWriter iw, NewsStory newsStory) throws IOException {
  org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();

  FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
  customType.setStoreTermVectors(true);
  customType.setStoreTermVectorPositions(true);
  customType.setStoreTermVectorOffsets(false);

  doc.add(new Field(INDEX_FIELD_CONTENT, newsStory.getContent().getText(), customType));
  doc.add(new StringField(INDEX_FIELD_URL, newsStory.getUrl(), Field.Store.YES));
  doc.add(new StringField(INDEX_FIELD_DOC_ID, newsStory.getId(), Field.Store.YES));
  doc.add(new TextField(INDEX_FIELD_TITLE, newsStory.getTitle(), Field.Store.YES));
  iw.addDocument(doc);
}
public void testDuelMemoryIndexCoreDirectoryWithArrayField() throws Exception {
  final String field_name = "text";
  MockAnalyzer mockAnalyzer = new MockAnalyzer(random());
  if (random().nextBoolean()) {
    mockAnalyzer.setOffsetGap(random().nextInt(100));
  }
  // index into a random directory
  FieldType type = new FieldType(TextField.TYPE_STORED);
  type.setStoreTermVectorOffsets(true);
  type.setStoreTermVectorPayloads(false);
  type.setStoreTermVectorPositions(true);
  type.setStoreTermVectors(true);
  type.freeze();

  Document doc = new Document();
  doc.add(new Field(field_name, "la la", type));
  doc.add(new Field(field_name, "foo bar foo bar foo", type));

  Directory dir = newDirectory();
  IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(random(), mockAnalyzer));
  writer.updateDocument(new Term("id", "1"), doc);
  writer.commit();
  writer.close();
  DirectoryReader reader = DirectoryReader.open(dir);

  // Index document in Memory index
  MemoryIndex memIndex = new MemoryIndex(true);
  memIndex.addField(field_name, "la la", mockAnalyzer);
  memIndex.addField(field_name, "foo bar foo bar foo", mockAnalyzer);

  // compare term vectors
  Terms ramTv = reader.getTermVector(0, field_name);
  IndexReader memIndexReader = memIndex.createSearcher().getIndexReader();
  TestUtil.checkReader(memIndexReader);
  Terms memTv = memIndexReader.getTermVector(0, field_name);
  compareTermVectors(ramTv, memTv, field_name);
  memIndexReader.close();
  reader.close();
  dir.close();
}
private void doTestMixedPostings(Codec codec) throws Exception {
  Directory dir = newDirectory();
  IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
  iwc.setCodec(codec);
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
  Document doc = new Document();
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  // turn on vectors for the checkindex cross-check
  ft.setStoreTermVectors(true);
  ft.setStoreTermVectorOffsets(true);
  ft.setStoreTermVectorPositions(true);
  Field idField = new Field("id", "", ft);
  Field dateField = new Field("date", "", ft);
  doc.add(idField);
  doc.add(dateField);
  for (int i = 0; i < 100; i++) {
    idField.setStringValue(Integer.toString(random().nextInt(50)));
    dateField.setStringValue(Integer.toString(random().nextInt(100)));
    iw.addDocument(doc);
  }
  iw.close();
  dir.close(); // checkindex
}
/**
 * Construct Indexer
 *
 * @param directory the main BlackLab index directory
 * @param create if true, creates a new index; otherwise, appends to existing index
 * @param docIndexerClass how to index the files, or null to autodetect
 * @param indexTemplateFile JSON file to use as template for index structure / metadata (if
 *     creating new index)
 * @throws DocumentFormatException if no DocIndexer was specified and autodetection failed
 * @throws IOException
 */
public Indexer(
    File directory,
    boolean create,
    Class<? extends DocIndexer> docIndexerClass,
    File indexTemplateFile)
    throws DocumentFormatException, IOException {
  this.docIndexerClass = docIndexerClass;

  searcher = Searcher.openForWriting(directory, create, indexTemplateFile);
  if (!create) searcher.getIndexStructure().setModified();

  if (this.docIndexerClass == null) {
    // No DocIndexer supplied; try to detect it from the index metadata.
    String formatId = searcher.getIndexStructure().getDocumentFormat();
    if (formatId != null && formatId.length() > 0)
      setDocIndexer(DocumentFormats.getIndexerClass(formatId));
    else {
      throw new DocumentFormatException("Cannot detect document format for index!");
    }
  }

  metadataFieldTypeTokenized = new FieldType();
  metadataFieldTypeTokenized.setStored(true);
  metadataFieldTypeTokenized.setIndexed(true);
  metadataFieldTypeTokenized.setTokenized(true);
  metadataFieldTypeTokenized.setOmitNorms(true); // @@@ <-- depending on setting?
  metadataFieldTypeTokenized.setStoreTermVectors(true);
  metadataFieldTypeTokenized.setStoreTermVectorPositions(true);
  metadataFieldTypeTokenized.setStoreTermVectorOffsets(true);
  metadataFieldTypeTokenized.freeze();

  metadataFieldTypeUntokenized = new FieldType(metadataFieldTypeTokenized);
  metadataFieldTypeUntokenized.setTokenized(false);
  metadataFieldTypeUntokenized.freeze();
}
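// Hypothetical construction (not from the original; the path and MyDocIndexer are
// illustrative assumptions): open a new BlackLab index for writing, supplying the
// DocIndexer class explicitly so format autodetection is skipped.
Indexer indexer = new Indexer(new File("/data/blacklab-index"), true, MyDocIndexer.class, null);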
protected void _add(Topic topic) {
  if (topic == null) return; // unlikely, but guard against it anyway
  // Comments are not indexed for now
  dao.fetchLinks(topic, "replies");
  Document document = new Document();
  Field field;
  FieldType fieldType;

  // First add the id
  fieldType = new FieldType();
  fieldType.setIndexed(true); // index
  fieldType.setStored(true); // store
  fieldType.setStoreTermVectors(true);
  fieldType.setTokenized(true);
  fieldType.setStoreTermVectorPositions(true); // store positions
  fieldType.setStoreTermVectorOffsets(true); // store offsets
  field = new Field("id", topic.getId(), fieldType);
  document.add(field);

  // Add the title
  fieldType = new FieldType();
  fieldType.setIndexed(true); // index
  fieldType.setStored(true); // store
  fieldType.setStoreTermVectors(true);
  fieldType.setTokenized(true);
  fieldType.setStoreTermVectorPositions(true); // store positions
  fieldType.setStoreTermVectorOffsets(true); // store offsets
  field = new Field("title", topic.getTitle(), fieldType);
  document.add(field);

  // Add the post content
  fieldType = new FieldType();
  fieldType.setIndexed(true); // index
  fieldType.setStored(false); // do not store
  fieldType.setStoreTermVectors(true);
  fieldType.setTokenized(true);
  fieldType.setStoreTermVectorPositions(true); // store positions
  fieldType.setStoreTermVectorOffsets(true); // store offsets
  field = new Field("content", topic.getContent(), fieldType);
  document.add(field);

  StringBuilder sb = new StringBuilder();
  if (topic.getReplies() != null) {
    for (TopicReply reply : topic.getReplies()) {
      if (reply == null) continue;
      bigContentService.fill(reply);
      if (reply.getContent() != null) {
        if (sb.length() + reply.getContent().length() > (IndexWriter.MAX_TERM_LENGTH / 3)) {
          break;
        }
        sb.append(reply.getContent());
      }
    }
  }
  fieldType = new FieldType();
  fieldType.setIndexed(true); // index
  fieldType.setStored(false); // do not store
  fieldType.setStoreTermVectors(true);
  fieldType.setTokenized(true);
  fieldType.setStoreTermVectorPositions(true); // store positions
  fieldType.setStoreTermVectorOffsets(true); // store offsets
  field = new Field("reply", sb.toString(), fieldType);
  document.add(field);

  try {
    luceneIndex.writer.addDocument(document);
  } catch (IOException e) {
    log.debug("add to index fail : id=" + topic.getId());
  } catch (Error e) {
    log.debug("add to index fail : id=" + topic.getId());
  }
}
public void indexDoc() throws IOException {
  Document d = new Document();

  FieldType customType1 = new FieldType(TextField.TYPE_STORED);
  customType1.setTokenized(false);
  customType1.setOmitNorms(true);

  ArrayList<Field> fields = new ArrayList<>();
  String idString = getIdString();
  Field idField = newField("id", idString, customType1);
  fields.add(idField);

  int nFields = nextInt(maxFields);
  for (int i = 0; i < nFields; i++) {
    FieldType customType = new FieldType();
    switch (nextInt(4)) {
      case 0:
        break;
      case 1:
        customType.setStoreTermVectors(true);
        break;
      case 2:
        customType.setStoreTermVectors(true);
        customType.setStoreTermVectorPositions(true);
        break;
      case 3:
        customType.setStoreTermVectors(true);
        customType.setStoreTermVectorOffsets(true);
        break;
    }

    switch (nextInt(4)) {
      case 0:
        customType.setStored(true);
        customType.setOmitNorms(true);
        customType.setIndexed(true);
        fields.add(newField("f" + nextInt(100), getString(1), customType));
        break;
      case 1:
        customType.setIndexed(true);
        customType.setTokenized(true);
        fields.add(newField("f" + nextInt(100), getString(0), customType));
        break;
      case 2:
        customType.setStored(true);
        customType.setStoreTermVectors(false);
        customType.setStoreTermVectorOffsets(false);
        customType.setStoreTermVectorPositions(false);
        fields.add(newField("f" + nextInt(100), getString(0), customType));
        break;
      case 3:
        customType.setStored(true);
        customType.setIndexed(true);
        customType.setTokenized(true);
        fields.add(newField("f" + nextInt(100), getString(bigFieldSize), customType));
        break;
    }
  }

  if (sameFieldOrder) {
    Collections.sort(fields, fieldNameComparator);
  } else {
    // random placement of id field also
    Collections.swap(fields, nextInt(fields.size()), 0);
  }

  for (int i = 0; i < fields.size(); i++) {
    d.add(fields.get(i));
  }
  if (VERBOSE) {
    System.out.println(Thread.currentThread().getName() + ": indexing id:" + idString);
  }
  w.updateDocument(new Term("id", idString), d);
  // System.out.println(Thread.currentThread().getName() + ": indexing " + d);
  docs.put(idString, d);
}
public void testRandomDiscreteMultiValueHighlighting() throws Exception {
  String[] randomValues = new String[3 + random().nextInt(10 * RANDOM_MULTIPLIER)];
  for (int i = 0; i < randomValues.length; i++) {
    String randomValue;
    do {
      randomValue = _TestUtil.randomSimpleString(random());
    } while ("".equals(randomValue));
    randomValues[i] = randomValue;
  }

  Directory dir = newDirectory();
  RandomIndexWriter writer =
      new RandomIndexWriter(
          random(),
          dir,
          newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))
              .setMergePolicy(newLogMergePolicy()));
  FieldType customType = new FieldType(TextField.TYPE_STORED);
  customType.setStoreTermVectors(true);
  customType.setStoreTermVectorOffsets(true);
  customType.setStoreTermVectorPositions(true);

  int numDocs = randomValues.length * 5;
  int numFields = 2 + random().nextInt(5);
  int numTerms = 2 + random().nextInt(3);
  List<Doc> docs = new ArrayList<Doc>(numDocs);
  List<Document> documents = new ArrayList<Document>(numDocs);
  Map<String, Set<Integer>> valueToDocId = new HashMap<String, Set<Integer>>();
  for (int i = 0; i < numDocs; i++) {
    Document document = new Document();
    String[][] fields = new String[numFields][numTerms];
    for (int j = 0; j < numFields; j++) {
      String[] fieldValues = new String[numTerms];
      fieldValues[0] = getRandomValue(randomValues, valueToDocId, i);
      StringBuilder builder = new StringBuilder(fieldValues[0]);
      for (int k = 1; k < numTerms; k++) {
        fieldValues[k] = getRandomValue(randomValues, valueToDocId, i);
        builder.append(' ').append(fieldValues[k]);
      }
      document.add(new Field(F, builder.toString(), customType));
      fields[j] = fieldValues;
    }
    docs.add(new Doc(fields));
    documents.add(document);
  }
  writer.addDocuments(documents);
  writer.close();
  IndexReader reader = DirectoryReader.open(dir);

  try {
    int highlightIters = 1 + random().nextInt(120 * RANDOM_MULTIPLIER);
    for (int highlightIter = 0; highlightIter < highlightIters; highlightIter++) {
      String queryTerm = randomValues[random().nextInt(randomValues.length)];
      int randomHit = valueToDocId.get(queryTerm).iterator().next();
      List<StringBuilder> builders = new ArrayList<StringBuilder>();
      for (String[] fieldValues : docs.get(randomHit).fieldValues) {
        StringBuilder builder = new StringBuilder();
        boolean hit = false;
        for (int i = 0; i < fieldValues.length; i++) {
          if (queryTerm.equals(fieldValues[i])) {
            builder.append("<b>").append(queryTerm).append("</b>");
            hit = true;
          } else {
            builder.append(fieldValues[i]);
          }
          if (i != fieldValues.length - 1) {
            builder.append(' ');
          }
        }
        if (hit) {
          builders.add(builder);
        }
      }

      FieldQuery fq = new FieldQuery(tq(queryTerm), true, true);
      FieldTermStack stack = new FieldTermStack(reader, randomHit, F, fq);
      FieldPhraseList fpl = new FieldPhraseList(stack, fq);
      SimpleFragListBuilder sflb = new SimpleFragListBuilder(100);
      FieldFragList ffl = sflb.createFieldFragList(fpl, 300);
      SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
      sfb.setDiscreteMultiValueHighlighting(true);
      String[] actualFragments = sfb.createFragments(reader, randomHit, F, ffl, numFields);
      assertEquals(builders.size(), actualFragments.length);
      for (int i = 0; i < actualFragments.length; i++) {
        assertEquals(builders.get(i).toString(), actualFragments[i]);
      }
    }
  } finally {
    reader.close();
    dir.close();
  }
}
// creates 8 fields with different options and does "duels" of fields against each other
public void test() throws Exception {
  Directory dir = newDirectory();
  Analyzer analyzer =
      new Analyzer(Analyzer.PER_FIELD_REUSE_STRATEGY) {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
          Tokenizer tokenizer = new MockTokenizer(reader);
          if (fieldName.contains("payloadsFixed")) {
            TokenFilter filter = new MockFixedLengthPayloadFilter(new Random(0), tokenizer, 1);
            return new TokenStreamComponents(tokenizer, filter);
          } else if (fieldName.contains("payloadsVariable")) {
            TokenFilter filter = new MockVariableLengthPayloadFilter(new Random(0), tokenizer);
            return new TokenStreamComponents(tokenizer, filter);
          } else {
            return new TokenStreamComponents(tokenizer);
          }
        }
      };
  IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
  iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene41PostingsFormat()));
  // TODO we could actually add more fields implemented with different PFs
  // or, just put this test into the usual rotation?
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc.clone());
  Document doc = new Document();

  FieldType docsOnlyType = new FieldType(TextField.TYPE_NOT_STORED);
  // turn this on for a cross-check
  docsOnlyType.setStoreTermVectors(true);
  docsOnlyType.setIndexOptions(IndexOptions.DOCS_ONLY);

  FieldType docsAndFreqsType = new FieldType(TextField.TYPE_NOT_STORED);
  // turn this on for a cross-check
  docsAndFreqsType.setStoreTermVectors(true);
  docsAndFreqsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);

  FieldType positionsType = new FieldType(TextField.TYPE_NOT_STORED);
  // turn these on for a cross-check
  positionsType.setStoreTermVectors(true);
  positionsType.setStoreTermVectorPositions(true);
  positionsType.setStoreTermVectorOffsets(true);
  positionsType.setStoreTermVectorPayloads(true);

  FieldType offsetsType = new FieldType(positionsType);
  offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);

  Field field1 = new Field("field1docs", "", docsOnlyType);
  Field field2 = new Field("field2freqs", "", docsAndFreqsType);
  Field field3 = new Field("field3positions", "", positionsType);
  Field field4 = new Field("field4offsets", "", offsetsType);
  Field field5 = new Field("field5payloadsFixed", "", positionsType);
  Field field6 = new Field("field6payloadsVariable", "", positionsType);
  Field field7 = new Field("field7payloadsFixedOffsets", "", offsetsType);
  Field field8 = new Field("field8payloadsVariableOffsets", "", offsetsType);
  doc.add(field1);
  doc.add(field2);
  doc.add(field3);
  doc.add(field4);
  doc.add(field5);
  doc.add(field6);
  doc.add(field7);
  doc.add(field8);

  for (int i = 0; i < MAXDOC; i++) {
    String stringValue =
        Integer.toString(i)
            + " verycommon "
            + English.intToEnglish(i).replace('-', ' ')
            + " "
            + _TestUtil.randomSimpleString(random());
    field1.setStringValue(stringValue);
    field2.setStringValue(stringValue);
    field3.setStringValue(stringValue);
    field4.setStringValue(stringValue);
    field5.setStringValue(stringValue);
    field6.setStringValue(stringValue);
    field7.setStringValue(stringValue);
    field8.setStringValue(stringValue);
    iw.addDocument(doc);
  }
  iw.close();
  verify(dir);

  // for some extra coverage, checkIndex before we forceMerge
  _TestUtil.checkIndex(dir);
  iwc.setOpenMode(OpenMode.APPEND);
  IndexWriter iw2 = new IndexWriter(dir, iwc.clone());
  iw2.forceMerge(1);
  iw2.close();
  verify(dir);
  dir.close();
}