private void doTest(Random random, PrintWriter out, boolean useCompoundFiles, int MAX_DOCS)
      throws Exception {
    Directory directory = newDirectory();
    Analyzer analyzer = new MockAnalyzer(random);
    IndexWriterConfig conf = newIndexWriterConfig(analyzer);
    final MergePolicy mp = conf.getMergePolicy();
    mp.setNoCFSRatio(useCompoundFiles ? 1.0 : 0.0);
    IndexWriter writer = new IndexWriter(directory, conf);
    if (VERBOSE) {
      System.out.println("TEST: now build index MAX_DOCS=" + MAX_DOCS);
    }

    for (int j = 0; j < MAX_DOCS; j++) {
      Document d = new Document();
      d.add(newTextField(PRIORITY_FIELD, HIGH_PRIORITY, Field.Store.YES));
      d.add(newTextField(ID_FIELD, Integer.toString(j), Field.Store.YES));
      writer.addDocument(d);
    }
    writer.close();

    // try a search without OR
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher searcher = newSearcher(reader);

    Query query = new TermQuery(new Term(PRIORITY_FIELD, HIGH_PRIORITY));
    out.println("Query: " + query.toString(PRIORITY_FIELD));
    if (VERBOSE) {
      System.out.println("TEST: search query=" + query);
    }

    final Sort sort = new Sort(SortField.FIELD_SCORE, new SortField(ID_FIELD, SortField.Type.INT));

    ScoreDoc[] hits = searcher.search(query, null, MAX_DOCS, sort).scoreDocs;
    printHits(out, hits, searcher);
    checkHits(hits, MAX_DOCS, searcher);

    // try a new search with OR
    searcher = newSearcher(reader);
    hits = null;

    BooleanQuery booleanQuery = new BooleanQuery();
    booleanQuery.add(
        new TermQuery(new Term(PRIORITY_FIELD, HIGH_PRIORITY)), BooleanClause.Occur.SHOULD);
    booleanQuery.add(
        new TermQuery(new Term(PRIORITY_FIELD, MED_PRIORITY)), BooleanClause.Occur.SHOULD);
    out.println("Query: " + booleanQuery.toString(PRIORITY_FIELD));

    hits = searcher.search(booleanQuery, null, MAX_DOCS, sort).scoreDocs;
    printHits(out, hits, searcher);
    checkHits(hits, MAX_DOCS, searcher);

    reader.close();
    directory.close();
  }
Beispiel #2
0
  public static IndexWriterConfig getIndexWriterConfig(Analyzer analyzer, boolean create) {
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setOpenMode(create ? OpenMode.CREATE : OpenMode.CREATE_OR_APPEND);
    config.setRAMBufferSizeMB(150); // faster indexing

    // Set merge factor (if using LogMergePolicy, which is the default up to version LUCENE_32,
    // so yes)
    MergePolicy mp = config.getMergePolicy();
    if (mp instanceof LogMergePolicy) {
      ((LogMergePolicy) mp).setMergeFactor(40); // faster indexing
    }
    return config;
  }
  /** Override this to customize index settings, e.g. which codec to use. */
  protected IndexWriterConfig getIndexWriterConfig(
      Version matchVersion, Analyzer indexAnalyzer, IndexWriterConfig.OpenMode openMode) {
    IndexWriterConfig iwc = new IndexWriterConfig(matchVersion, indexAnalyzer);
    iwc.setCodec(new Lucene46Codec());
    iwc.setOpenMode(openMode);

    // This way all merged segments will be sorted at
    // merge time, allow for per-segment early termination
    // when those segments are searched:
    iwc.setMergePolicy(new SortingMergePolicy(iwc.getMergePolicy(), SORT));

    return iwc;
  }
  public void testCodec() throws Exception {
    Directory dir = new AppendingRAMDirectory(new RAMDirectory());
    IndexWriterConfig cfg = new IndexWriterConfig(Version.LUCENE_40, new MockAnalyzer());

    cfg.setCodecProvider(new AppendingCodecProvider());
    ((LogMergePolicy) cfg.getMergePolicy()).setUseCompoundFile(false);
    ((LogMergePolicy) cfg.getMergePolicy()).setUseCompoundDocStore(false);
    IndexWriter writer = new IndexWriter(dir, cfg);
    Document doc = new Document();
    doc.add(new Field("f", text, Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
    writer.addDocument(doc);
    writer.commit();
    writer.addDocument(doc);
    writer.optimize();
    writer.close();
    IndexReader reader = IndexReader.open(dir, null, true, 1, new AppendingCodecProvider());
    assertEquals(2, reader.numDocs());
    doc = reader.document(0);
    assertEquals(text, doc.get("f"));
    Fields fields = MultiFields.getFields(reader);
    Terms terms = fields.terms("f");
    assertNotNull(terms);
    TermsEnum te = terms.iterator();
    assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("quick")));
    assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("brown")));
    assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("fox")));
    assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("jumped")));
    assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("over")));
    assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("lazy")));
    assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("dog")));
    assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("the")));
    DocsEnum de = te.docs(null, null);
    assertTrue(de.advance(0) != DocsEnum.NO_MORE_DOCS);
    assertEquals(2, de.freq());
    assertTrue(de.advance(1) != DocsEnum.NO_MORE_DOCS);
    assertTrue(de.advance(2) == DocsEnum.NO_MORE_DOCS);
    reader.close();
  }
    @Override
    public void close(TaskAttemptContext context) throws IOException {
      LOG.debug(
          "Task "
              + context.getTaskAttemptID()
              + " merging into dstDir: "
              + workDir
              + ", srcDirs: "
              + shards);
      writeShardNumberFile(context);
      heartBeater.needHeartBeat();
      try {
        Directory mergedIndex =
            new HdfsDirectory(workDir, NoLockFactory.INSTANCE, context.getConfiguration());

        // TODO: shouldn't we pull the Version from the solrconfig.xml?
        IndexWriterConfig writerConfig =
            new IndexWriterConfig(null).setOpenMode(OpenMode.CREATE).setUseCompoundFile(false)
            // .setMergePolicy(mergePolicy) // TODO: grab tuned MergePolicy from solrconfig.xml?
            // .setMergeScheduler(...) // TODO: grab tuned MergeScheduler from solrconfig.xml?
            ;

        if (LOG.isDebugEnabled()) {
          writerConfig.setInfoStream(System.out);
        }
        //        writerConfig.setRAMBufferSizeMB(100); // improve performance
        //        writerConfig.setMaxThreadStates(1);

        // disable compound file to improve performance
        // also see
        // http://lucene.472066.n3.nabble.com/Questions-on-compound-file-format-td489105.html
        // also see defaults in SolrIndexConfig
        MergePolicy mergePolicy = writerConfig.getMergePolicy();
        LOG.debug("mergePolicy was: {}", mergePolicy);
        if (mergePolicy instanceof TieredMergePolicy) {
          ((TieredMergePolicy) mergePolicy).setNoCFSRatio(0.0);
          //          ((TieredMergePolicy) mergePolicy).setMaxMergeAtOnceExplicit(10000);
          //          ((TieredMergePolicy) mergePolicy).setMaxMergeAtOnce(10000);
          //          ((TieredMergePolicy) mergePolicy).setSegmentsPerTier(10000);
        } else if (mergePolicy instanceof LogMergePolicy) {
          ((LogMergePolicy) mergePolicy).setNoCFSRatio(0.0);
        }
        LOG.info("Using mergePolicy: {}", mergePolicy);

        IndexWriter writer = new IndexWriter(mergedIndex, writerConfig);

        Directory[] indexes = new Directory[shards.size()];
        for (int i = 0; i < shards.size(); i++) {
          indexes[i] =
              new HdfsDirectory(shards.get(i), NoLockFactory.INSTANCE, context.getConfiguration());
        }

        context.setStatus("Logically merging " + shards.size() + " shards into one shard");
        LOG.info("Logically merging " + shards.size() + " shards into one shard: " + workDir);
        RTimer timer = new RTimer();

        writer.addIndexes(indexes);
        // TODO: avoid intermediate copying of files into dst directory; rename the files into the
        // dir instead (cp -> rename)
        // This can improve performance and turns this phase into a true "logical" merge, completing
        // in constant time.
        // See https://issues.apache.org/jira/browse/LUCENE-4746

        timer.stop();
        if (LOG.isDebugEnabled()) {
          context
              .getCounter(
                  SolrCounters.class.getName(), SolrCounters.LOGICAL_TREE_MERGE_TIME.toString())
              .increment((long) timer.getTime());
        }
        LOG.info("Logical merge took {}ms", timer.getTime());
        int maxSegments =
            context
                .getConfiguration()
                .getInt(TreeMergeMapper.MAX_SEGMENTS_ON_TREE_MERGE, Integer.MAX_VALUE);
        context.setStatus(
            "Optimizing Solr: forcing mtree merge down to " + maxSegments + " segments");
        LOG.info("Optimizing Solr: forcing tree merge down to {} segments", maxSegments);
        timer = new RTimer();
        if (maxSegments < Integer.MAX_VALUE) {
          writer.forceMerge(maxSegments);
          // TODO: consider perf enhancement for no-deletes merges: bulk-copy the postings data
          // see
          // http://lucene.472066.n3.nabble.com/Experience-with-large-merge-factors-tp1637832p1647046.html
        }
        timer.stop();
        if (LOG.isDebugEnabled()) {
          context
              .getCounter(
                  SolrCounters.class.getName(), SolrCounters.PHYSICAL_TREE_MERGE_TIME.toString())
              .increment((long) timer.getTime());
        }
        LOG.info(
            "Optimizing Solr: done forcing tree merge down to {} segments in {}ms",
            maxSegments,
            timer.getTime());

        timer = new RTimer();
        LOG.info("Optimizing Solr: Closing index writer");
        writer.close();
        LOG.info("Optimizing Solr: Done closing index writer in {}ms", timer.getTime());
        context.setStatus("Done");
      } finally {
        heartBeater.cancelHeartBeat();
        heartBeater.close();
      }
    }
  /**
   * Index the fileset.
   *
   * @exception IOException if Lucene I/O exception TODO: refactor!!!!!
   */
  private void indexDocs() throws IOException {
    Date start = new Date();

    boolean create = overwrite;
    // If the index directory doesn't exist,
    // create it and force create mode
    if (indexDir.mkdirs() && !overwrite) {
      create = true;
    }

    FSDirectory dir = FSDirectory.open(indexDir);
    try {
      Searcher searcher = null;
      boolean checkLastModified = false;
      if (!create) {
        try {
          searcher = new IndexSearcher(dir, true);
          checkLastModified = true;
        } catch (IOException ioe) {
          log("IOException: " + ioe.getMessage());
          // Empty - ignore, which indicates to index all
          // documents
        }
      }

      log("checkLastModified = " + checkLastModified, Project.MSG_VERBOSE);

      IndexWriterConfig conf =
          new IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
              .setOpenMode(create ? OpenMode.CREATE : OpenMode.APPEND);
      LogMergePolicy lmp = (LogMergePolicy) conf.getMergePolicy();
      lmp.setUseCompoundFile(useCompoundIndex);
      lmp.setMergeFactor(mergeFactor);
      IndexWriter writer = new IndexWriter(dir, conf);
      int totalFiles = 0;
      int totalIndexed = 0;
      int totalIgnored = 0;
      try {

        for (int i = 0; i < rcs.size(); i++) {
          ResourceCollection rc = rcs.elementAt(i);
          if (rc.isFilesystemOnly()) {
            Iterator resources = rc.iterator();
            while (resources.hasNext()) {
              Resource r = (Resource) resources.next();
              if (!r.isExists() || !(r instanceof FileResource)) {
                continue;
              }

              totalFiles++;

              File file = ((FileResource) r).getFile();

              if (!file.exists() || !file.canRead()) {
                throw new BuildException(
                    "File \"" + file.getAbsolutePath() + "\" does not exist or is not readable.");
              }

              boolean indexIt = true;

              if (checkLastModified) {
                Term pathTerm = new Term("path", file.getPath());
                TermQuery query = new TermQuery(pathTerm);
                ScoreDoc[] hits = searcher.search(query, null, 1).scoreDocs;

                // if document is found, compare the
                // indexed last modified time with the
                // current file
                // - don't index if up to date
                if (hits.length > 0) {
                  Document doc = searcher.doc(hits[0].doc);
                  String indexModified = doc.get("modified").trim();
                  if (indexModified != null) {
                    long lastModified = 0;
                    try {
                      lastModified = DateTools.stringToTime(indexModified);
                    } catch (ParseException e) {
                      // if modified time is not parsable, skip
                    }
                    if (lastModified == file.lastModified()) {
                      // TODO: remove existing document
                      indexIt = false;
                    }
                  }
                }
              }

              if (indexIt) {
                try {
                  log("Indexing " + file.getPath(), Project.MSG_VERBOSE);
                  Document doc = handler.getDocument(file);

                  if (doc == null) {
                    totalIgnored++;
                  } else {
                    // Add the path of the file as a field named "path".  Use a Keyword field, so
                    // that the index stores the path, and so that the path is searchable
                    doc.add(
                        new Field(
                            "path", file.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED));

                    // Add the last modified date of the file a field named "modified".  Use a
                    // Keyword field, so that it's searchable, but so that no attempt is made
                    // to tokenize the field into words.
                    doc.add(
                        new Field(
                            "modified",
                            DateTools.timeToString(
                                file.lastModified(), DateTools.Resolution.MILLISECOND),
                            Field.Store.YES,
                            Field.Index.NOT_ANALYZED));

                    writer.addDocument(doc);
                    totalIndexed++;
                  }
                } catch (DocumentHandlerException e) {
                  throw new BuildException(e);
                }
              }
            }
            // for j
          }
          // if (fs != null)
        }
        // for i

        writer.optimize();
      }
      // try
      finally {
        // always make sure everything gets closed,
        // no matter how we exit.
        writer.close();
        if (searcher != null) {
          searcher.close();
        }
      }

      Date end = new Date();

      log(
          totalIndexed
              + " out of "
              + totalFiles
              + " indexed ("
              + totalIgnored
              + " ignored) in "
              + (end.getTime() - start.getTime())
              + " milliseconds");
    } finally {
      dir.close();
    }
  }