private void doTest(Random random, PrintWriter out, boolean useCompoundFiles, int MAX_DOCS)
    throws Exception {
  Directory directory = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random);
  IndexWriterConfig conf = newIndexWriterConfig(analyzer);
  final MergePolicy mp = conf.getMergePolicy();
  mp.setNoCFSRatio(useCompoundFiles ? 1.0 : 0.0);
  IndexWriter writer = new IndexWriter(directory, conf);
  if (VERBOSE) {
    System.out.println("TEST: now build index MAX_DOCS=" + MAX_DOCS);
  }

  for (int j = 0; j < MAX_DOCS; j++) {
    Document d = new Document();
    d.add(newTextField(PRIORITY_FIELD, HIGH_PRIORITY, Field.Store.YES));
    d.add(newTextField(ID_FIELD, Integer.toString(j), Field.Store.YES));
    writer.addDocument(d);
  }
  writer.close();

  // try a search without OR
  IndexReader reader = DirectoryReader.open(directory);
  IndexSearcher searcher = newSearcher(reader);

  Query query = new TermQuery(new Term(PRIORITY_FIELD, HIGH_PRIORITY));
  out.println("Query: " + query.toString(PRIORITY_FIELD));
  if (VERBOSE) {
    System.out.println("TEST: search query=" + query);
  }

  final Sort sort = new Sort(SortField.FIELD_SCORE, new SortField(ID_FIELD, SortField.Type.INT));

  ScoreDoc[] hits = searcher.search(query, null, MAX_DOCS, sort).scoreDocs;
  printHits(out, hits, searcher);
  checkHits(hits, MAX_DOCS, searcher);

  // try a new search with OR
  searcher = newSearcher(reader);
  hits = null;

  BooleanQuery booleanQuery = new BooleanQuery();
  booleanQuery.add(new TermQuery(new Term(PRIORITY_FIELD, HIGH_PRIORITY)), BooleanClause.Occur.SHOULD);
  booleanQuery.add(new TermQuery(new Term(PRIORITY_FIELD, MED_PRIORITY)), BooleanClause.Occur.SHOULD);
  out.println("Query: " + booleanQuery.toString(PRIORITY_FIELD));

  hits = searcher.search(booleanQuery, null, MAX_DOCS, sort).scoreDocs;
  printHits(out, hits, searcher);
  checkHits(hits, MAX_DOCS, searcher);

  reader.close();
  directory.close();
}
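// The constants and helpers used above (PRIORITY_FIELD, HIGH_PRIORITY, MED_PRIORITY, ID_FIELD,
// printHits, checkHits) are defined elsewhere in the test class. Plausible field/value constants,
// shown only as an illustrative assumption so the snippet reads standalone:
private static final String PRIORITY_FIELD = "priority";
private static final String HIGH_PRIORITY = "high";
private static final String MED_PRIORITY = "medium";
private static final String ID_FIELD = "id";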
public static IndexWriterConfig getIndexWriterConfig(Analyzer analyzer, boolean create) {
  IndexWriterConfig config = new IndexWriterConfig(analyzer);
  config.setOpenMode(create ? OpenMode.CREATE : OpenMode.CREATE_OR_APPEND);
  config.setRAMBufferSizeMB(150); // faster indexing

  // Set merge factor (if using LogMergePolicy, which is the default up to version LUCENE_32,
  // so yes)
  MergePolicy mp = config.getMergePolicy();
  if (mp instanceof LogMergePolicy) {
    ((LogMergePolicy) mp).setMergeFactor(40); // faster indexing
  }
  return config;
}
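// Minimal usage sketch for the helper above. Hedged: the index path, StandardAnalyzer, and the
// single TextField document are illustrative assumptions, not part of the original; it assumes a
// Lucene version where IndexWriterConfig and StandardAnalyzer no longer take a Version argument.
public static void indexOneDoc(Path indexPath) throws IOException {
  Directory dir = FSDirectory.open(indexPath);
  try (IndexWriter writer = new IndexWriter(dir, getIndexWriterConfig(new StandardAnalyzer(), true))) {
    Document doc = new Document();
    doc.add(new TextField("body", "hello world", Field.Store.YES));
    writer.addDocument(doc);
  }
  dir.close();
}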
/** Override this to customize index settings, e.g. which codec to use. */
protected IndexWriterConfig getIndexWriterConfig(
    Version matchVersion, Analyzer indexAnalyzer, IndexWriterConfig.OpenMode openMode) {
  IndexWriterConfig iwc = new IndexWriterConfig(matchVersion, indexAnalyzer);
  iwc.setCodec(new Lucene46Codec());
  iwc.setOpenMode(openMode);

  // This way all merged segments will be sorted at merge time, allowing for per-segment
  // early termination when those segments are searched:
  iwc.setMergePolicy(new SortingMergePolicy(iwc.getMergePolicy(), SORT));

  return iwc;
}
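// The SORT constant passed to SortingMergePolicy above is defined elsewhere in the class.
// A plausible (hypothetical) definition, sorting by a numeric "weight" field in descending
// order so that the highest-weighted entries come first within each merged segment:
private static final Sort SORT = new Sort(new SortField("weight", SortField.Type.LONG, true));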
// Verifies that an index written with the Appending codec can be read back: two copies of the
// same document survive a commit and an optimize, the stored field round-trips, every term can
// be seeked to, and the postings report the expected frequency.
public void testCodec() throws Exception {
  Directory dir = new AppendingRAMDirectory(new RAMDirectory());
  IndexWriterConfig cfg = new IndexWriterConfig(Version.LUCENE_40, new MockAnalyzer());
  cfg.setCodecProvider(new AppendingCodecProvider());
  ((LogMergePolicy) cfg.getMergePolicy()).setUseCompoundFile(false);
  ((LogMergePolicy) cfg.getMergePolicy()).setUseCompoundDocStore(false);
  IndexWriter writer = new IndexWriter(dir, cfg);
  Document doc = new Document();
  doc.add(new Field("f", text, Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
  writer.addDocument(doc);
  writer.commit();
  writer.addDocument(doc);
  writer.optimize();
  writer.close();

  IndexReader reader = IndexReader.open(dir, null, true, 1, new AppendingCodecProvider());
  assertEquals(2, reader.numDocs());
  doc = reader.document(0);
  assertEquals(text, doc.get("f"));
  Fields fields = MultiFields.getFields(reader);
  Terms terms = fields.terms("f");
  assertNotNull(terms);
  TermsEnum te = terms.iterator();
  assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("quick")));
  assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("brown")));
  assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("fox")));
  assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("jumped")));
  assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("over")));
  assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("lazy")));
  assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("dog")));
  assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("the")));
  DocsEnum de = te.docs(null, null);
  assertTrue(de.advance(0) != DocsEnum.NO_MORE_DOCS);
  assertEquals(2, de.freq());
  assertTrue(de.advance(1) != DocsEnum.NO_MORE_DOCS);
  assertTrue(de.advance(2) == DocsEnum.NO_MORE_DOCS);
  reader.close();
}
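// The "text" field used above is defined elsewhere in the test class; judging from the terms the
// test seeks to and the expected frequency of 2 for "the", a consistent (assumed) value would be:
private static final String text = "the quick brown fox jumped over the lazy dog";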
@Override
public void close(TaskAttemptContext context) throws IOException {
  LOG.debug(
      "Task " + context.getTaskAttemptID() + " merging into dstDir: " + workDir
          + ", srcDirs: " + shards);
  writeShardNumberFile(context);
  heartBeater.needHeartBeat();
  try {
    Directory mergedIndex =
        new HdfsDirectory(workDir, NoLockFactory.INSTANCE, context.getConfiguration());

    // TODO: shouldn't we pull the Version from the solrconfig.xml?
    IndexWriterConfig writerConfig =
        new IndexWriterConfig(null)
            .setOpenMode(OpenMode.CREATE)
            .setUseCompoundFile(false)
        // .setMergePolicy(mergePolicy) // TODO: grab tuned MergePolicy from solrconfig.xml?
        // .setMergeScheduler(...) // TODO: grab tuned MergeScheduler from solrconfig.xml?
        ;

    if (LOG.isDebugEnabled()) {
      writerConfig.setInfoStream(System.out);
    }
    // writerConfig.setRAMBufferSizeMB(100); // improve performance
    // writerConfig.setMaxThreadStates(1);

    // disable compound file to improve performance
    // also see http://lucene.472066.n3.nabble.com/Questions-on-compound-file-format-td489105.html
    // also see defaults in SolrIndexConfig
    MergePolicy mergePolicy = writerConfig.getMergePolicy();
    LOG.debug("mergePolicy was: {}", mergePolicy);
    if (mergePolicy instanceof TieredMergePolicy) {
      ((TieredMergePolicy) mergePolicy).setNoCFSRatio(0.0);
      // ((TieredMergePolicy) mergePolicy).setMaxMergeAtOnceExplicit(10000);
      // ((TieredMergePolicy) mergePolicy).setMaxMergeAtOnce(10000);
      // ((TieredMergePolicy) mergePolicy).setSegmentsPerTier(10000);
    } else if (mergePolicy instanceof LogMergePolicy) {
      ((LogMergePolicy) mergePolicy).setNoCFSRatio(0.0);
    }
    LOG.info("Using mergePolicy: {}", mergePolicy);

    IndexWriter writer = new IndexWriter(mergedIndex, writerConfig);

    Directory[] indexes = new Directory[shards.size()];
    for (int i = 0; i < shards.size(); i++) {
      indexes[i] =
          new HdfsDirectory(shards.get(i), NoLockFactory.INSTANCE, context.getConfiguration());
    }

    context.setStatus("Logically merging " + shards.size() + " shards into one shard");
    LOG.info("Logically merging " + shards.size() + " shards into one shard: " + workDir);
    RTimer timer = new RTimer();

    writer.addIndexes(indexes);
    // TODO: avoid intermediate copying of files into dst directory; rename the files into the
    // dir instead (cp -> rename). This can improve performance and turns this phase into a true
    // "logical" merge, completing in constant time.
    // See https://issues.apache.org/jira/browse/LUCENE-4746

    timer.stop();
    if (LOG.isDebugEnabled()) {
      context
          .getCounter(
              SolrCounters.class.getName(), SolrCounters.LOGICAL_TREE_MERGE_TIME.toString())
          .increment((long) timer.getTime());
    }
    LOG.info("Logical merge took {}ms", timer.getTime());

    int maxSegments =
        context
            .getConfiguration()
            .getInt(TreeMergeMapper.MAX_SEGMENTS_ON_TREE_MERGE, Integer.MAX_VALUE);
    context.setStatus(
        "Optimizing Solr: forcing mtree merge down to " + maxSegments + " segments");
    LOG.info("Optimizing Solr: forcing tree merge down to {} segments", maxSegments);
    timer = new RTimer();
    if (maxSegments < Integer.MAX_VALUE) {
      writer.forceMerge(maxSegments);
      // TODO: consider perf enhancement for no-deletes merges: bulk-copy the postings data
      // see http://lucene.472066.n3.nabble.com/Experience-with-large-merge-factors-tp1637832p1647046.html
    }
    timer.stop();
    if (LOG.isDebugEnabled()) {
      context
          .getCounter(
              SolrCounters.class.getName(), SolrCounters.PHYSICAL_TREE_MERGE_TIME.toString())
          .increment((long) timer.getTime());
    }
    LOG.info(
        "Optimizing Solr: done forcing tree merge down to {} segments in {}ms",
        maxSegments,
        timer.getTime());

    timer = new RTimer();
    LOG.info("Optimizing Solr: Closing index writer");
    writer.close();
    LOG.info("Optimizing Solr: Done closing index writer in {}ms", timer.getTime());
    context.setStatus("Done");
  } finally {
    heartBeater.cancelHeartBeat();
    heartBeater.close();
  }
}
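// Hedged usage sketch (not from the original): the segment cap read above via
// TreeMergeMapper.MAX_SEGMENTS_ON_TREE_MERGE is normally set on the job's Hadoop Configuration
// before this output format runs; the Job handle and the value 1 are illustrative assumptions.
private static void configureTreeMergeSegments(Job job) {
  job.getConfiguration().setInt(TreeMergeMapper.MAX_SEGMENTS_ON_TREE_MERGE, 1);
}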
/**
 * Index the fileset.
 *
 * @exception IOException if Lucene I/O exception
 * TODO: refactor!!!!!
 */
private void indexDocs() throws IOException {
  Date start = new Date();

  boolean create = overwrite;
  // If the index directory doesn't exist,
  // create it and force create mode
  if (indexDir.mkdirs() && !overwrite) {
    create = true;
  }

  FSDirectory dir = FSDirectory.open(indexDir);
  try {
    Searcher searcher = null;
    boolean checkLastModified = false;
    if (!create) {
      try {
        searcher = new IndexSearcher(dir, true);
        checkLastModified = true;
      } catch (IOException ioe) {
        log("IOException: " + ioe.getMessage());
        // Empty - ignore, which indicates to index all
        // documents
      }
    }

    log("checkLastModified = " + checkLastModified, Project.MSG_VERBOSE);

    IndexWriterConfig conf =
        new IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
            .setOpenMode(create ? OpenMode.CREATE : OpenMode.APPEND);
    LogMergePolicy lmp = (LogMergePolicy) conf.getMergePolicy();
    lmp.setUseCompoundFile(useCompoundIndex);
    lmp.setMergeFactor(mergeFactor);
    IndexWriter writer = new IndexWriter(dir, conf);
    int totalFiles = 0;
    int totalIndexed = 0;
    int totalIgnored = 0;
    try {
      for (int i = 0; i < rcs.size(); i++) {
        ResourceCollection rc = rcs.elementAt(i);
        if (rc.isFilesystemOnly()) {
          Iterator resources = rc.iterator();
          while (resources.hasNext()) {
            Resource r = (Resource) resources.next();
            if (!r.isExists() || !(r instanceof FileResource)) {
              continue;
            }
            totalFiles++;

            File file = ((FileResource) r).getFile();

            if (!file.exists() || !file.canRead()) {
              throw new BuildException(
                  "File \"" + file.getAbsolutePath() + "\" does not exist or is not readable.");
            }

            boolean indexIt = true;

            if (checkLastModified) {
              Term pathTerm = new Term("path", file.getPath());
              TermQuery query = new TermQuery(pathTerm);
              ScoreDoc[] hits = searcher.search(query, null, 1).scoreDocs;

              // if document is found, compare the
              // indexed last modified time with the
              // current file
              // - don't index if up to date
              if (hits.length > 0) {
                Document doc = searcher.doc(hits[0].doc);
                // null-check before trimming; the stored field may be missing
                String indexModified = doc.get("modified");
                if (indexModified != null) {
                  long lastModified = 0;
                  try {
                    lastModified = DateTools.stringToTime(indexModified.trim());
                  } catch (ParseException e) {
                    // if modified time is not parsable, skip
                  }
                  if (lastModified == file.lastModified()) {
                    // TODO: remove existing document
                    indexIt = false;
                  }
                }
              }
            }

            if (indexIt) {
              try {
                log("Indexing " + file.getPath(), Project.MSG_VERBOSE);
                Document doc = handler.getDocument(file);

                if (doc == null) {
                  totalIgnored++;
                } else {
                  // Add the path of the file as a field named "path". Use a Keyword field, so
                  // that the index stores the path, and so that the path is searchable
                  doc.add(
                      new Field(
                          "path", file.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED));

                  // Add the last modified date of the file as a field named "modified". Use a
                  // Keyword field, so that it's searchable, but so that no attempt is made
                  // to tokenize the field into words.
                  doc.add(
                      new Field(
                          "modified",
                          DateTools.timeToString(
                              file.lastModified(), DateTools.Resolution.MILLISECOND),
                          Field.Store.YES,
                          Field.Index.NOT_ANALYZED));

                  writer.addDocument(doc);
                  totalIndexed++;
                }
              } catch (DocumentHandlerException e) {
                throw new BuildException(e);
              }
            }
          } // for j
        } // if (fs != null)
      } // for i

      writer.optimize();
    } // try
    finally {
      // always make sure everything gets closed,
      // no matter how we exit.
      writer.close();
      if (searcher != null) {
        searcher.close();
      }
    }

    Date end = new Date();

    log(
        totalIndexed + " out of " + totalFiles + " indexed (" + totalIgnored
            + " ignored) in " + (end.getTime() - start.getTime()) + " milliseconds");
  } finally {
    dir.close();
  }
}
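// Small illustration (not from the original) of the DateTools round-trip the incremental check
// above relies on: the "modified" field stores File.lastModified() at millisecond resolution, so
// an unchanged file compares equal on the next run.
private static boolean isUpToDate(File file, String indexModified) throws ParseException {
  long lastModified = DateTools.stringToTime(indexModified.trim());
  return lastModified == file.lastModified();
}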