/**
 * Builds an index from the given documents.
 *
 * @param documentList documents to add to the index
 * @param path path of a source file; the index is written to an "index"
 *             directory two levels above it
 * @throws IOException if the index cannot be written
 */
public static void createIndex(List<Document> documentList, String path) throws IOException {
    // Resolve an "index" directory two levels above the given path
    File file = new File(path);
    String pathAll = file.getParentFile().getParentFile().toString() + "\\index";
    File indexDir = new File(pathAll);
    // Open the index directory
    Directory directory = FSDirectory.open(indexDir);
    // Create an analyzer
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
    // Create the index writer configuration
    IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_36, analyzer);
    LogMergePolicy mergePolicy = new LogByteSizeMergePolicy();
    // Merge factor: how often segments are merged as documents are added.
    // Lower values slow indexing down; higher values speed it up,
    // and values > 10 suit batch index builds.
    mergePolicy.setMergeFactor(50);
    // Maximum number of documents merged into one segment.
    // Lower values favor incremental-indexing speed; higher values
    // suit batch index builds and faster searches.
    mergePolicy.setMaxMergeDocs(5000);
    // Use the compound file format, which packs each segment's files together
    mergePolicy.setUseCompoundFile(true);
    indexWriterConfig.setMergePolicy(mergePolicy);
    // Set the index open mode
    indexWriterConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
    // Create the index writer
    IndexWriter indexWriter = new IndexWriter(directory, indexWriterConfig);
    for (Document document : documentList) {
        // Add the document to the index
        indexWriter.addDocument(document);
    }
    // Commit the index to disk and close the writer
    indexWriter.close();
}
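A minimal caller for the method above might look like the following. This is a sketch against the same Lucene 3.6 API; the field names ("title", "content") and the file path are invented for illustration.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

public static void main(String[] args) throws IOException {
    List<Document> documentList = new ArrayList<Document>();
    Document document = new Document();
    // Hypothetical fields; ANALYZED fields are tokenized and searchable
    document.add(new Field("title", "Hello Lucene", Field.Store.YES, Field.Index.ANALYZED));
    document.add(new Field("content", "Some text to index.", Field.Store.YES, Field.Index.ANALYZED));
    documentList.add(document);
    // The index ends up in an "index" directory two levels above this path
    createIndex(documentList, "C:\\data\\docs\\source.txt");
}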
public static IndexWriterConfig getIndexWriterConfig(Analyzer analyzer, boolean create) {
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setOpenMode(create ? OpenMode.CREATE : OpenMode.CREATE_OR_APPEND);
    config.setRAMBufferSizeMB(150); // faster indexing
    // Raise the merge factor if the policy is a LogMergePolicy. Since
    // LUCENE_32 the default is TieredMergePolicy, so guard with instanceof.
    MergePolicy mp = config.getMergePolicy();
    if (mp instanceof LogMergePolicy) {
        ((LogMergePolicy) mp).setMergeFactor(40); // faster indexing
    }
    return config;
}
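A hedged usage sketch for the helper above, against post-4.x Lucene APIs (where IndexWriterConfig no longer takes a Version); the index location and field name are hypothetical. Note that each IndexWriterConfig instance can back only one writer, so the helper is called fresh per writer.

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public static void writeOneDoc() throws IOException {
    Analyzer analyzer = new StandardAnalyzer();
    Directory dir = FSDirectory.open(Paths.get("/tmp/index")); // hypothetical location
    IndexWriter writer = new IndexWriter(dir, getIndexWriterConfig(analyzer, true));
    try {
        Document doc = new Document();
        doc.add(new TextField("body", "hello merge policy", Field.Store.NO));
        writer.addDocument(doc);
    } finally {
        writer.close();
    }
}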
private MergePolicy _getMergePolicy() throws Exception {
    if (PropsValues.LUCENE_MERGE_POLICY.equals(NoMergePolicy.class.getName())) {
        return NoMergePolicy.NO_COMPOUND_FILES;
    }

    ClassLoader classLoader = ClassLoaderUtil.getPortalClassLoader();

    MergePolicy mergePolicy = (MergePolicy) InstanceFactory.newInstance(
        classLoader, PropsValues.LUCENE_MERGE_POLICY);

    if (mergePolicy instanceof LogMergePolicy) {
        LogMergePolicy logMergePolicy = (LogMergePolicy) mergePolicy;

        logMergePolicy.setMergeFactor(PropsValues.LUCENE_MERGE_FACTOR);
    }

    return mergePolicy;
}
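ClassLoaderUtil, InstanceFactory, and PropsValues in the snippet above are Liferay portal internals. Stripped of those helpers, the pattern reduces to plain reflection over a configured class name; the class name and merge factor below stand in for the portal properties and are assumptions for illustration.

private MergePolicy getMergePolicy(String mergePolicyClassName, int mergeFactor) throws Exception {
    // Instantiate whatever MergePolicy implementation was configured,
    // e.g. "org.apache.lucene.index.LogByteSizeMergePolicy"
    MergePolicy mergePolicy = (MergePolicy) Class.forName(mergePolicyClassName).newInstance();

    // Only log-based policies expose a merge factor
    if (mergePolicy instanceof LogMergePolicy) {
        ((LogMergePolicy) mergePolicy).setMergeFactor(mergeFactor);
    }

    return mergePolicy;
}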
/**
 * Index the fileset.
 *
 * @exception IOException if Lucene I/O exception
 * TODO: refactor!!!!!
 */
private void indexDocs() throws IOException {
    Date start = new Date();

    boolean create = overwrite;
    // If the index directory doesn't exist,
    // create it and force create mode
    if (indexDir.mkdirs() && !overwrite) {
        create = true;
    }

    FSDirectory dir = FSDirectory.open(indexDir);
    try {
        Searcher searcher = null;
        boolean checkLastModified = false;
        if (!create) {
            try {
                searcher = new IndexSearcher(dir, true);
                checkLastModified = true;
            } catch (IOException ioe) {
                log("IOException: " + ioe.getMessage());
                // Empty - ignore, which indicates to index all documents
            }
        }

        log("checkLastModified = " + checkLastModified, Project.MSG_VERBOSE);

        IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
            .setOpenMode(create ? OpenMode.CREATE : OpenMode.APPEND);
        LogMergePolicy lmp = (LogMergePolicy) conf.getMergePolicy();
        lmp.setUseCompoundFile(useCompoundIndex);
        lmp.setMergeFactor(mergeFactor);
        IndexWriter writer = new IndexWriter(dir, conf);
        int totalFiles = 0;
        int totalIndexed = 0;
        int totalIgnored = 0;
        try {
            for (int i = 0; i < rcs.size(); i++) {
                ResourceCollection rc = rcs.elementAt(i);
                if (rc.isFilesystemOnly()) {
                    Iterator resources = rc.iterator();
                    while (resources.hasNext()) {
                        Resource r = (Resource) resources.next();
                        if (!r.isExists() || !(r instanceof FileResource)) {
                            continue;
                        }

                        totalFiles++;

                        File file = ((FileResource) r).getFile();

                        if (!file.exists() || !file.canRead()) {
                            throw new BuildException(
                                "File \"" + file.getAbsolutePath()
                                + "\" does not exist or is not readable.");
                        }

                        boolean indexIt = true;

                        if (checkLastModified) {
                            Term pathTerm = new Term("path", file.getPath());
                            TermQuery query = new TermQuery(pathTerm);
                            ScoreDoc[] hits = searcher.search(query, null, 1).scoreDocs;

                            // if document is found, compare the
                            // indexed last modified time with the
                            // current file
                            // - don't index if up to date
                            if (hits.length > 0) {
                                Document doc = searcher.doc(hits[0].doc);
                                // null-check before trimming to avoid an NPE
                                // when the stored field is missing
                                String indexModified = doc.get("modified");
                                if (indexModified != null) {
                                    long lastModified = 0;
                                    try {
                                        lastModified = DateTools.stringToTime(indexModified.trim());
                                    } catch (ParseException e) {
                                        // if modified time is not parsable, skip
                                    }
                                    if (lastModified == file.lastModified()) {
                                        // TODO: remove existing document
                                        indexIt = false;
                                    }
                                }
                            }
                        }

                        if (indexIt) {
                            try {
                                log("Indexing " + file.getPath(), Project.MSG_VERBOSE);
                                Document doc = handler.getDocument(file);

                                if (doc == null) {
                                    totalIgnored++;
                                } else {
                                    // Add the path of the file as a field named "path". Use a Keyword field, so
                                    // that the index stores the path, and so that the path is searchable
                                    doc.add(new Field("path", file.getPath(),
                                        Field.Store.YES, Field.Index.NOT_ANALYZED));

                                    // Add the last modified date of the file as a field named "modified". Use a
                                    // Keyword field, so that it's searchable, but so that no attempt is made
                                    // to tokenize the field into words.
                                    doc.add(new Field("modified",
                                        DateTools.timeToString(file.lastModified(),
                                            DateTools.Resolution.MILLISECOND),
                                        Field.Store.YES, Field.Index.NOT_ANALYZED));

                                    writer.addDocument(doc);
                                    totalIndexed++;
                                }
                            } catch (DocumentHandlerException e) {
                                throw new BuildException(e);
                            }
                        }
                    } // while resources
                } // if (rc.isFilesystemOnly())
            } // for i

            writer.optimize();
        } // try
        finally {
            // always make sure everything gets closed,
            // no matter how we exit.
            writer.close();
            if (searcher != null) {
                searcher.close();
            }
        }

        Date end = new Date();

        log(totalIndexed + " out of " + totalFiles + " indexed ("
            + totalIgnored + " ignored) in "
            + (end.getTime() - start.getTime()) + " milliseconds");
    } finally {
        dir.close();
    }
}
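The task's incremental check relies on the stored "path" and "modified" fields, so reading them back takes only a term query. A sketch against the same 3.x-era API; the index directory and file path arguments are hypothetical.

static void printIndexedModified(File indexDir, String filePath) throws IOException, ParseException {
    // Open the index read-only, mirroring how the task checks freshness
    IndexSearcher searcher = new IndexSearcher(FSDirectory.open(indexDir), true);
    try {
        ScoreDoc[] hits = searcher.search(
            new TermQuery(new Term("path", filePath)), null, 1).scoreDocs;
        if (hits.length > 0) {
            Document doc = searcher.doc(hits[0].doc);
            // "modified" holds DateTools.timeToString(...) output, so parse it back
            System.out.println(DateTools.stringToDate(doc.get("modified")));
        }
    } finally {
        searcher.close();
    }
}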