示例#1
0
 private IndexWriter createWriter(boolean create) throws IOException {
   try {
     final IndexWriterConfig iwc = new IndexWriterConfig(engineConfig.getAnalyzer());
     iwc.setCommitOnClose(false); // we by default don't commit on close
     iwc.setOpenMode(
         create ? IndexWriterConfig.OpenMode.CREATE : IndexWriterConfig.OpenMode.APPEND);
     iwc.setIndexDeletionPolicy(deletionPolicy);
     // with tests.verbose, lucene sets this up: plumb to align with filesystem stream
     boolean verbose = false;
     try {
       verbose = Boolean.parseBoolean(System.getProperty("tests.verbose"));
     } catch (Throwable ignore) {
     }
     iwc.setInfoStream(verbose ? InfoStream.getDefault() : new LoggerInfoStream(logger));
     iwc.setMergeScheduler(mergeScheduler);
     MergePolicy mergePolicy = config().getMergePolicy();
     // Give us the opportunity to upgrade old segments while performing
     // background merges
     mergePolicy = new ElasticsearchMergePolicy(mergePolicy);
     iwc.setMergePolicy(mergePolicy);
     iwc.setSimilarity(engineConfig.getSimilarity());
     iwc.setRAMBufferSizeMB(engineConfig.getIndexingBufferSize().mbFrac());
     iwc.setCodec(engineConfig.getCodec());
     iwc.setUseCompoundFile(
         true); // always use compound on flush - reduces # of file-handles on refresh
     return new IndexWriter(store.directory(), iwc);
   } catch (LockObtainFailedException ex) {
     logger.warn("could not lock IndexWriter", ex);
     throw ex;
   }
 }
  /** Override this to customize index settings, e.g. which codec to use. */
  protected IndexWriterConfig getIndexWriterConfig(
      Version matchVersion, Analyzer indexAnalyzer, IndexWriterConfig.OpenMode openMode) {
    IndexWriterConfig iwc = new IndexWriterConfig(matchVersion, indexAnalyzer);
    iwc.setCodec(new Lucene46Codec());
    iwc.setOpenMode(openMode);

    // This way all merged segments will be sorted at
    // merge time, allow for per-segment early termination
    // when those segments are searched:
    iwc.setMergePolicy(new SortingMergePolicy(iwc.getMergePolicy(), SORT));

    return iwc;
  }
  public boolean openIndex() {
    try {
      Directory dir = FSDirectory.open(new File(indexPath).toPath());
      Analyzer analyzer = new StandardAnalyzer();
      IndexWriterConfig iwc = new IndexWriterConfig(analyzer);

      // Always overwrite the directory now changed to append
      iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
      iwc.setCodec(new CompressingCodec2());
      indexWriter = new IndexWriter(dir, iwc);

      return true;
    } catch (Exception e) {
      System.err.println("Error opening the index. " + e.getMessage());
    }
    return false;
  }
  static IndexWriterConfig iwcWithSuggestField(Analyzer analyzer, final Set<String> suggestFields) {
    IndexWriterConfig iwc = newIndexWriterConfig(random(), analyzer);
    iwc.setMergePolicy(newLogMergePolicy());
    Codec filterCodec =
        new Lucene60Codec() {
          PostingsFormat postingsFormat = new Completion50PostingsFormat();

          @Override
          public PostingsFormat getPostingsFormatForField(String field) {
            if (suggestFields.contains(field)) {
              return postingsFormat;
            }
            return super.getPostingsFormatForField(field);
          }
        };
    iwc.setCodec(filterCodec);
    return iwc;
  }
  public Lookup buildAnalyzingLookup(
      final CompletionFieldMapper mapper, String[] terms, String[] surfaces, long[] weights)
      throws IOException {
    RAMDirectory dir = new RAMDirectory();
    FilterCodec filterCodec =
        new FilterCodec("filtered", Codec.getDefault()) {
          @Override
          public PostingsFormat postingsFormat() {
            final PostingsFormat in = super.postingsFormat();
            return mapper.postingsFormat(in);
          }
        };
    IndexWriterConfig indexWriterConfig = new IndexWriterConfig(mapper.indexAnalyzer());

    indexWriterConfig.setCodec(filterCodec);
    IndexWriter writer = new IndexWriter(dir, indexWriterConfig);
    for (int i = 0; i < weights.length; i++) {
      Document doc = new Document();
      BytesRef payload =
          mapper.buildPayload(
              new BytesRef(surfaces[i]), weights[i], new BytesRef(Long.toString(weights[i])));
      doc.add(mapper.getCompletionField(ContextMapping.EMPTY_CONTEXT, terms[i], payload));
      if (randomBoolean()) {
        writer.commit();
      }
      writer.addDocument(doc);
    }
    writer.commit();
    writer.forceMerge(1, true);
    writer.commit();
    DirectoryReader reader = DirectoryReader.open(writer, true);
    assertThat(reader.leaves().size(), equalTo(1));
    assertThat(reader.leaves().get(0).reader().numDocs(), equalTo(weights.length));
    LeafReaderContext atomicReaderContext = reader.leaves().get(0);
    Terms luceneTerms = atomicReaderContext.reader().terms(mapper.name());
    Lookup lookup =
        ((Completion090PostingsFormat.CompletionTerms) luceneTerms)
            .getLookup(mapper, new CompletionSuggestionContext(null));
    reader.close();
    writer.close();
    dir.close();
    return lookup;
  }
 private void doTestMixedPostings(Codec codec) throws Exception {
   Directory dir = newDirectory();
   IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
   iwc.setCodec(codec);
   RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
   Document doc = new Document();
   FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
   // turn on vectors for the checkindex cross-check
   ft.setStoreTermVectors(true);
   ft.setStoreTermVectorOffsets(true);
   ft.setStoreTermVectorPositions(true);
   Field idField = new Field("id", "", ft);
   Field dateField = new Field("date", "", ft);
   doc.add(idField);
   doc.add(dateField);
   for (int i = 0; i < 100; i++) {
     idField.setStringValue(Integer.toString(random().nextInt(50)));
     dateField.setStringValue(Integer.toString(random().nextInt(100)));
     iw.addDocument(doc);
   }
   iw.close();
   dir.close(); // checkindex
 }
示例#7
0
  @Test
  public void testGetThatFieldProbabilityRatioIsReflectedInBoost() throws Exception {

    ArgumentCaptor<Float> normalizeCaptor = ArgumentCaptor.forClass(Float.class);

    DocumentFrequencyCorrection dfc = new DocumentFrequencyCorrection();

    Directory directory = newDirectory();

    Analyzer analyzer =
        new Analyzer() {
          protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new WhitespaceTokenizer();
            TokenStream filter =
                new WordDelimiterFilter(
                    source,
                    WordDelimiterFilter.GENERATE_WORD_PARTS
                        | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE,
                    null);
            filter = new LowerCaseFilter(filter);
            return new TokenStreamComponents(source, filter);
          }
        };

    IndexWriterConfig conf = new IndexWriterConfig(analyzer);
    conf.setCodec(Codec.forName(TestUtil.LUCENE_CODEC));
    IndexWriter indexWriter = new IndexWriter(directory, conf);

    // Both fields f1 and f2 have 10 terms in total.
    // f1: the search terms (abc def) make 100% of all terms in f1
    // f2: the search terms (abc def) make 50% of all terms in f2
    // --> we expect that the sum of the boost factors for terms in bq(+f1:abc, +f1:def)
    // equals 2 * sum of the boost factors for terms in bq(+f2:abc, +f2:def)

    PRMSFieldBoostTest.addNumDocs("f1", "abc def", indexWriter, 2);
    PRMSFieldBoostTest.addNumDocs("f1", "abc", indexWriter, 4);
    PRMSFieldBoostTest.addNumDocs("f1", "def", indexWriter, 2);
    PRMSFieldBoostTest.addNumDocs("f2", "abc def", indexWriter, 1);
    PRMSFieldBoostTest.addNumDocs("f2", "abc", indexWriter, 2);
    PRMSFieldBoostTest.addNumDocs("f2", "def", indexWriter, 1);
    PRMSFieldBoostTest.addNumDocs("f2", "ghi", indexWriter, 5);

    indexWriter.close();

    IndexReader indexReader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(indexReader);
    indexSearcher.setSimilarity(similarity);

    Map<String, Float> fields = new HashMap<>();
    fields.put("f1", 1f);
    fields.put("f2", 1f);
    SearchFieldsAndBoosting searchFieldsAndBoosting =
        new SearchFieldsAndBoosting(FieldBoostModel.PRMS, fields, fields, 0.8f);

    LuceneQueryBuilder queryBuilder =
        new LuceneQueryBuilder(dfc, analyzer, searchFieldsAndBoosting, 0.01f, null);

    WhiteSpaceQuerqyParser parser = new WhiteSpaceQuerqyParser();

    Query query = queryBuilder.createQuery(parser.parse("AbcDef"));
    dfc.finishedUserQuery();

    assertTrue(query instanceof DisjunctionMaxQuery);

    DisjunctionMaxQuery dmq = (DisjunctionMaxQuery) query;
    List<Query> disjuncts = dmq.getDisjuncts();
    assertEquals(2, disjuncts.size());

    Query disjunct1 = disjuncts.get(0);
    if (disjunct1 instanceof BoostQuery) {
      disjunct1 = ((BoostQuery) disjunct1).getQuery();
    }
    assertTrue(disjunct1 instanceof BooleanQuery);

    BooleanQuery bq1 = (BooleanQuery) disjunct1;

    Query disjunct2 = disjuncts.get(1);
    if (disjunct2 instanceof BoostQuery) {
      disjunct2 = ((BoostQuery) disjunct2).getQuery();
    }
    assertTrue(disjunct2 instanceof BooleanQuery);

    BooleanQuery bq2 = (BooleanQuery) disjunct2;

    final Weight weight1 = bq1.createWeight(indexSearcher, true);
    weight1.normalize(0.1f, 4f);

    final Weight weight2 = bq2.createWeight(indexSearcher, true);
    weight2.normalize(0.1f, 4f);

    Mockito.verify(simWeight, times(4)).normalize(eq(0.1f), normalizeCaptor.capture());
    final List<Float> capturedBoosts = normalizeCaptor.getAllValues();

    // capturedBoosts = boosts of [bq1.term1, bq1.term2, bq2.term1, bq2.term2 ]
    assertEquals(capturedBoosts.get(0), capturedBoosts.get(1), 0.00001);
    assertEquals(capturedBoosts.get(2), capturedBoosts.get(3), 0.00001);
    assertEquals(2f, capturedBoosts.get(0) / capturedBoosts.get(3), 0.00001);

    indexReader.close();
    directory.close();
    analyzer.close();
  }
  // creates 8 fields with different options and does "duels" of fields against each other
  public void test() throws Exception {
    Directory dir = newDirectory();
    Analyzer analyzer =
        new Analyzer(Analyzer.PER_FIELD_REUSE_STRATEGY) {
          @Override
          protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new MockTokenizer(reader);
            if (fieldName.contains("payloadsFixed")) {
              TokenFilter filter = new MockFixedLengthPayloadFilter(new Random(0), tokenizer, 1);
              return new TokenStreamComponents(tokenizer, filter);
            } else if (fieldName.contains("payloadsVariable")) {
              TokenFilter filter = new MockVariableLengthPayloadFilter(new Random(0), tokenizer);
              return new TokenStreamComponents(tokenizer, filter);
            } else {
              return new TokenStreamComponents(tokenizer);
            }
          }
        };
    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
    iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene41PostingsFormat()));
    // TODO we could actually add more fields implemented with different PFs
    // or, just put this test into the usual rotation?
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc.clone());
    Document doc = new Document();
    FieldType docsOnlyType = new FieldType(TextField.TYPE_NOT_STORED);
    // turn this on for a cross-check
    docsOnlyType.setStoreTermVectors(true);
    docsOnlyType.setIndexOptions(IndexOptions.DOCS_ONLY);

    FieldType docsAndFreqsType = new FieldType(TextField.TYPE_NOT_STORED);
    // turn this on for a cross-check
    docsAndFreqsType.setStoreTermVectors(true);
    docsAndFreqsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);

    FieldType positionsType = new FieldType(TextField.TYPE_NOT_STORED);
    // turn these on for a cross-check
    positionsType.setStoreTermVectors(true);
    positionsType.setStoreTermVectorPositions(true);
    positionsType.setStoreTermVectorOffsets(true);
    positionsType.setStoreTermVectorPayloads(true);
    FieldType offsetsType = new FieldType(positionsType);
    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    Field field1 = new Field("field1docs", "", docsOnlyType);
    Field field2 = new Field("field2freqs", "", docsAndFreqsType);
    Field field3 = new Field("field3positions", "", positionsType);
    Field field4 = new Field("field4offsets", "", offsetsType);
    Field field5 = new Field("field5payloadsFixed", "", positionsType);
    Field field6 = new Field("field6payloadsVariable", "", positionsType);
    Field field7 = new Field("field7payloadsFixedOffsets", "", offsetsType);
    Field field8 = new Field("field8payloadsVariableOffsets", "", offsetsType);
    doc.add(field1);
    doc.add(field2);
    doc.add(field3);
    doc.add(field4);
    doc.add(field5);
    doc.add(field6);
    doc.add(field7);
    doc.add(field8);
    for (int i = 0; i < MAXDOC; i++) {
      String stringValue =
          Integer.toString(i)
              + " verycommon "
              + English.intToEnglish(i).replace('-', ' ')
              + " "
              + _TestUtil.randomSimpleString(random());
      field1.setStringValue(stringValue);
      field2.setStringValue(stringValue);
      field3.setStringValue(stringValue);
      field4.setStringValue(stringValue);
      field5.setStringValue(stringValue);
      field6.setStringValue(stringValue);
      field7.setStringValue(stringValue);
      field8.setStringValue(stringValue);
      iw.addDocument(doc);
    }
    iw.close();
    verify(dir);
    _TestUtil.checkIndex(dir); // for some extra coverage, checkIndex before we forceMerge
    iwc.setOpenMode(OpenMode.APPEND);
    IndexWriter iw2 = new IndexWriter(dir, iwc.clone());
    iw2.forceMerge(1);
    iw2.close();
    verify(dir);
    dir.close();
  }
  // TODO: not sure this test is that great, we should probably peek inside PerFieldPostingsFormat
  // or something?!
  @Test
  public void testChangeCodecAndMerge() throws IOException {
    Directory dir = newDirectory();
    if (VERBOSE) {
      System.out.println("TEST: make new index");
    }
    IndexWriterConfig iwconf =
        newIndexWriterConfig(new MockAnalyzer(random()))
            .setOpenMode(OpenMode.CREATE)
            .setCodec(new MockCodec());
    iwconf.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH);
    // ((LogMergePolicy) iwconf.getMergePolicy()).setMergeFactor(10);
    IndexWriter writer = newWriter(dir, iwconf);

    addDocs(writer, 10);
    writer.commit();
    assertQuery(new Term("content", "aaa"), dir, 10);
    if (VERBOSE) {
      System.out.println("TEST: addDocs3");
    }
    addDocs3(writer, 10);
    writer.commit();
    writer.close();

    assertQuery(new Term("content", "ccc"), dir, 10);
    assertQuery(new Term("content", "aaa"), dir, 10);
    Codec codec = iwconf.getCodec();

    iwconf =
        newIndexWriterConfig(new MockAnalyzer(random()))
            .setOpenMode(OpenMode.APPEND)
            .setCodec(codec);
    // ((LogMergePolicy) iwconf.getMergePolicy()).setNoCFSRatio(0.0);
    // ((LogMergePolicy) iwconf.getMergePolicy()).setMergeFactor(10);
    iwconf.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH);

    iwconf.setCodec(new MockCodec2()); // uses standard for field content
    writer = newWriter(dir, iwconf);
    // swap in new codec for currently written segments
    if (VERBOSE) {
      System.out.println("TEST: add docs w/ Standard codec for content field");
    }
    addDocs2(writer, 10);
    writer.commit();
    codec = iwconf.getCodec();
    assertEquals(30, writer.maxDoc());
    assertQuery(new Term("content", "bbb"), dir, 10);
    assertQuery(new Term("content", "ccc"), dir, 10); // //
    assertQuery(new Term("content", "aaa"), dir, 10);

    if (VERBOSE) {
      System.out.println("TEST: add more docs w/ new codec");
    }
    addDocs2(writer, 10);
    writer.commit();
    assertQuery(new Term("content", "ccc"), dir, 10);
    assertQuery(new Term("content", "bbb"), dir, 20);
    assertQuery(new Term("content", "aaa"), dir, 10);
    assertEquals(40, writer.maxDoc());

    if (VERBOSE) {
      System.out.println("TEST: now optimize");
    }
    writer.forceMerge(1);
    assertEquals(40, writer.maxDoc());
    writer.close();
    assertQuery(new Term("content", "ccc"), dir, 10);
    assertQuery(new Term("content", "bbb"), dir, 20);
    assertQuery(new Term("content", "aaa"), dir, 10);

    dir.close();
  }