Example #1
 static {
   // Id
   IdFieldType = new FieldType();
   IdFieldType.setStored(true);
   IdFieldType.setTokenized(false);
   IdFieldType.setOmitNorms(true);
   IdFieldType.setIndexOptions(IndexOptions.DOCS);
   IdFieldType.freeze();
   // content
   ContentFieldType = new FieldType();
   ContentFieldType.setStored(false);
   ContentFieldType.setTokenized(true);
   ContentFieldType.setOmitNorms(false);
   ContentFieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
   ContentFieldType.freeze();
   // title
   TitleFieldType = new FieldType();
   TitleFieldType.setStored(true);
   TitleFieldType.setTokenized(true);
   TitleFieldType.setOmitNorms(false);
   TitleFieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
   TitleFieldType.freeze();
   // onlyForStore
   OnlyStoreFieldType = new FieldType();
   OnlyStoreFieldType.setStored(true);
   OnlyStoreFieldType.setTokenized(false);
   OnlyStoreFieldType.setOmitNorms(false);
   OnlyStoreFieldType.setIndexOptions(IndexOptions.NONE);
   OnlyStoreFieldType.freeze();
 }
Example #2
    static {
      LABEL_FIELD_TYPE.setStored(true);
      LABEL_FIELD_TYPE.setTokenized(true);
      LABEL_FIELD_TYPE.freeze();

      URI_FIELD_TYPE.setStored(true);
      URI_FIELD_TYPE.setTokenized(false);
      URI_FIELD_TYPE.freeze();

      FIELD_TYPE.setStored(true);
      FIELD_TYPE.freeze();
    }
  /**
   * Translates the pre-4.0 enums for specifying how a field should be indexed into the 4.0 {@link
   * FieldType} approach.
   *
   * @deprecated This is here only to ease transition from the pre-4.0 APIs.
   */
  @Deprecated
  public static final FieldType translateFieldType(
      Store store, Index index, TermVector termVector) {
    final FieldType ft = new FieldType();

    ft.setStored(store == Store.YES);

    switch (index) {
      case ANALYZED:
        ft.setIndexed(true);
        ft.setTokenized(true);
        break;
      case ANALYZED_NO_NORMS:
        ft.setIndexed(true);
        ft.setTokenized(true);
        ft.setOmitNorms(true);
        break;
      case NOT_ANALYZED:
        ft.setIndexed(true);
        ft.setTokenized(false);
        break;
      case NOT_ANALYZED_NO_NORMS:
        ft.setIndexed(true);
        ft.setTokenized(false);
        ft.setOmitNorms(true);
        break;
      case NO:
        break;
    }

    switch (termVector) {
      case NO:
        break;
      case YES:
        ft.setStoreTermVectors(true);
        break;
      case WITH_POSITIONS:
        ft.setStoreTermVectors(true);
        ft.setStoreTermVectorPositions(true);
        break;
      case WITH_OFFSETS:
        ft.setStoreTermVectors(true);
        ft.setStoreTermVectorOffsets(true);
        break;
      case WITH_POSITIONS_OFFSETS:
        ft.setStoreTermVectors(true);
        ft.setStoreTermVectorPositions(true);
        ft.setStoreTermVectorOffsets(true);
        break;
    }
    ft.freeze();
    return ft;
  }
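As a usage sketch (the field name and text are illustrative, not from the original), the helper lets a pre-4.0 call site keep its Store/Index/TermVector triple while ending up with a frozen FieldType:

  // Pre-4.0 style: new Field("body", text, Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS)
  // 4.0 style via the deprecated translation helper shown above:
  FieldType ft = translateFieldType(Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS);
  Document doc = new Document();
  doc.add(new Field("body", "some analyzed text", ft)); // "body" is an illustrative field name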
  static {
    TYPE_NOT_STORED.setOmitNorms(true);
    TYPE_NOT_STORED.setIndexOptions(IndexOptions.DOCS);
    TYPE_NOT_STORED.setTokenized(false);
    TYPE_NOT_STORED.freeze();

    TYPE_STORED.setOmitNorms(true);
    TYPE_STORED.setIndexOptions(IndexOptions.DOCS);
    TYPE_STORED.setStored(true);
    TYPE_STORED.setTokenized(false);
    TYPE_STORED.freeze();
  }
 /*
  * Test per field codec support - adding fields with random codecs
  */
 @Test
 public void testStressPerFieldCodec() throws IOException {
   Directory dir = newDirectory(random());
   final int docsPerRound = 97;
   int numRounds = atLeast(1);
   for (int i = 0; i < numRounds; i++) {
     int num = TestUtil.nextInt(random(), 30, 60);
     IndexWriterConfig config =
         newIndexWriterConfig(random(), TEST_VERSION_CURRENT, new MockAnalyzer(random()));
     config.setOpenMode(OpenMode.CREATE_OR_APPEND);
     IndexWriter writer = newWriter(dir, config);
     for (int j = 0; j < docsPerRound; j++) {
       final Document doc = new Document();
       for (int k = 0; k < num; k++) {
         FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
         customType.setTokenized(random().nextBoolean());
         customType.setOmitNorms(random().nextBoolean());
         Field field =
             newField("" + k, TestUtil.randomRealisticUnicodeString(random(), 128), customType);
         doc.add(field);
       }
       writer.addDocument(doc);
     }
     if (random().nextBoolean()) {
       writer.forceMerge(1);
     }
     writer.commit();
     assertEquals((i + 1) * docsPerRound, writer.maxDoc());
     writer.close();
   }
   dir.close();
 }
Example #6
 public Builder(
     String index, @Nullable Settings indexSettings, RootObjectMapper.Builder builder) {
   this.index = index;
   this.indexSettings = indexSettings;
   this.builderContext = new Mapper.BuilderContext(indexSettings, new ContentPath(1));
   this.rootObjectMapper = builder.build(builderContext);
   IdFieldMapper idFieldMapper = new IdFieldMapper();
   if (indexSettings != null) {
     String idIndexed = indexSettings.get("index.mapping._id.indexed");
     if (idIndexed != null && Booleans.parseBoolean(idIndexed, false)) {
       FieldType fieldType = new FieldType(IdFieldMapper.Defaults.FIELD_TYPE);
       fieldType.setTokenized(false);
       idFieldMapper = new IdFieldMapper(fieldType);
     }
   }
   this.rootMappers.put(IdFieldMapper.class, idFieldMapper);
   // add default mappers, order is important (for example analyzer should come before the rest
   // to set context.analyzer)
   this.rootMappers.put(SizeFieldMapper.class, new SizeFieldMapper());
   this.rootMappers.put(IndexFieldMapper.class, new IndexFieldMapper());
   this.rootMappers.put(SourceFieldMapper.class, new SourceFieldMapper());
   this.rootMappers.put(TypeFieldMapper.class, new TypeFieldMapper());
   this.rootMappers.put(AnalyzerMapper.class, new AnalyzerMapper());
   this.rootMappers.put(AllFieldMapper.class, new AllFieldMapper());
   this.rootMappers.put(BoostFieldMapper.class, new BoostFieldMapper());
   this.rootMappers.put(RoutingFieldMapper.class, new RoutingFieldMapper());
   this.rootMappers.put(TimestampFieldMapper.class, new TimestampFieldMapper());
   this.rootMappers.put(TTLFieldMapper.class, new TTLFieldMapper());
   this.rootMappers.put(UidFieldMapper.class, new UidFieldMapper());
   // don't add parent field, by default its "null"
 }
  /**
   * Used for adding a document when a field needs to be created from a type and a string.
   *
   * <p>By default, the indexed value is the same as the stored value (taken from toInternal()).
   * Having a different representation for external, internal, and indexed would present quite a few
   * problems given the current Lucene architecture. An analyzer for adding docs would need to
   * translate internal-&gt;indexed while an analyzer for querying would need to translate
   * external-&gt;indexed.
   *
   * <p>The only other alternative to having internal==indexed would be to have internal==external.
   * In this case, toInternal should convert to the indexed representation, toExternal() should do
   * nothing, and createField() should *not* call toInternal, but use the external value and set
   * tokenized=true to get Lucene to convert to the internal(indexed) form. :TODO: clean up and
   * clarify this explanation.
   *
   * @see #toInternal
   */
  public StorableField createField(SchemaField field, Object value, float boost) {
    if (!field.indexed() && !field.stored()) {
      if (log.isTraceEnabled()) log.trace("Ignoring unindexed/unstored field: " + field);
      return null;
    }

    String val;
    try {
      val = toInternal(value.toString());
    } catch (RuntimeException e) {
      throw new SolrException(
          SolrException.ErrorCode.SERVER_ERROR,
          "Error while creating field '" + field + "' from value '" + value + "'",
          e);
    }
    if (val == null) return null;

    org.apache.lucene.document.FieldType newType = new org.apache.lucene.document.FieldType();
    newType.setTokenized(field.isTokenized());
    newType.setStored(field.stored());
    newType.setOmitNorms(field.omitNorms());
    newType.setIndexOptions(field.indexed() ? getIndexOptions(field, val) : IndexOptions.NONE);
    newType.setStoreTermVectors(field.storeTermVector());
    newType.setStoreTermVectorOffsets(field.storeTermOffsets());
    newType.setStoreTermVectorPositions(field.storeTermPositions());
    newType.setStoreTermVectorPayloads(field.storeTermPayloads());

    return createField(field.getName(), val, newType, boost);
  }
  // LUCENE-325: test forceMergeDeletes without waiting, when
  // many adjacent merges are required
  public void testForceMergeDeletes3() throws IOException {
    Directory dir = newDirectory();
    IndexWriter writer =
        new IndexWriter(
            dir,
            newIndexWriterConfig(new MockAnalyzer(random()))
                .setMaxBufferedDocs(2)
                .setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH)
                .setMergePolicy(newLogMergePolicy(50)));

    FieldType customType = new FieldType();
    customType.setStored(true);

    FieldType customType1 = new FieldType(TextField.TYPE_NOT_STORED);
    customType1.setTokenized(false);
    customType1.setStoreTermVectors(true);
    customType1.setStoreTermVectorPositions(true);
    customType1.setStoreTermVectorOffsets(true);

    Document document = new Document();
    Field storedField = newField("stored", "stored", customType);
    document.add(storedField);
    Field termVectorField = newField("termVector", "termVector", customType1);
    document.add(termVectorField);
    Field idField = newStringField("id", "", Field.Store.NO);
    document.add(idField);
    for (int i = 0; i < 98; i++) {
      idField.setStringValue("" + i);
      writer.addDocument(document);
    }
    writer.close();

    IndexReader ir = DirectoryReader.open(dir);
    assertEquals(98, ir.maxDoc());
    assertEquals(98, ir.numDocs());
    ir.close();

    IndexWriterConfig dontMergeConfig =
        new IndexWriterConfig(new MockAnalyzer(random())).setMergePolicy(NoMergePolicy.INSTANCE);
    writer = new IndexWriter(dir, dontMergeConfig);
    for (int i = 0; i < 98; i += 2) {
      writer.deleteDocuments(new Term("id", "" + i));
    }
    writer.close();
    ir = DirectoryReader.open(dir);
    assertEquals(49, ir.numDocs());
    ir.close();

    writer =
        new IndexWriter(
            dir,
            newIndexWriterConfig(new MockAnalyzer(random())).setMergePolicy(newLogMergePolicy(3)));
    writer.forceMergeDeletes(false);
    writer.close();
    ir = DirectoryReader.open(dir);
    assertEquals(49, ir.maxDoc());
    assertEquals(49, ir.numDocs());
    ir.close();
    dir.close();
  }
 static {
   TYPE.setIndexed(true);
   TYPE.setOmitNorms(true);
   TYPE.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS);
   TYPE.setTokenized(true);
   TYPE.setStoreTermVectors(true);
   TYPE.freeze();
 }
 static {
   FIELD_TYPE.setIndexed(true);
   FIELD_TYPE.setTokenized(false);
   FIELD_TYPE.setStored(true);
   FIELD_TYPE.setOmitNorms(true);
   FIELD_TYPE.setIndexOptions(IndexOptions.DOCS_ONLY);
   FIELD_TYPE.freeze();
 }
 private static FieldType idFieldType(Settings indexSettings) {
   FieldType fieldType = new FieldType(Defaults.FIELD_TYPE);
   boolean pre2x = Version.indexCreated(indexSettings).before(Version.V_2_0_0);
   if (pre2x && indexSettings.getAsBoolean("index.mapping._id.indexed", true) == false) {
     fieldType.setTokenized(false);
   }
   return fieldType;
 }
Example #12
 @Test
 public void testName() throws Exception {
   FieldType TYPE_STORED = new FieldType();
   TYPE_STORED.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
   TYPE_STORED.setTokenized(false);
   TYPE_STORED.setStored(true);
   TYPE_STORED.freeze();
   System.out.println(TYPE_STORED);
 }
    static {
      FIELD_TYPE.setIndexed(true);
      FIELD_TYPE.setTokenized(false);
      FIELD_TYPE.setStored(true);
      FIELD_TYPE.setOmitNorms(true);
      FIELD_TYPE.setIndexOptions(
          FieldInfo.IndexOptions
              .DOCS_AND_FREQS_AND_POSITIONS); // we store payload (otherwise, we really need just
                                              // docs)
      FIELD_TYPE.freeze();

      NESTED_FIELD_TYPE.setIndexed(true);
      NESTED_FIELD_TYPE.setTokenized(false);
      NESTED_FIELD_TYPE.setStored(false);
      NESTED_FIELD_TYPE.setOmitNorms(true);
      // we can set this to another index option when we move away from storing payload..
      // NESTED_FIELD_TYPE.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY);
      NESTED_FIELD_TYPE.freeze();
    }
Example #14
 static {
   UID_FIELD_TYPE.setIndexed(true);
   UID_FIELD_TYPE.setTokenized(false);
   UID_FIELD_TYPE.setStored(true);
   UID_FIELD_TYPE.setOmitNorms(true);
   UID_FIELD_TYPE.setIndexOptions(
       FieldInfo.IndexOptions
           .DOCS_AND_FREQS_AND_POSITIONS); // we store payload (otherwise, we really need just
                                           // docs)
   UID_FIELD_TYPE.freeze();
 }
Example #15
  public void testChangeGaps() throws Exception {
    // LUCENE-5324: check that it is possible to change the wrapper's gaps
    final int positionGap = random().nextInt(1000);
    final int offsetGap = random().nextInt(1000);
    final Analyzer delegate = new MockAnalyzer(random());
    final Analyzer a =
        new DelegatingAnalyzerWrapper(delegate.getReuseStrategy()) {
          @Override
          protected Analyzer getWrappedAnalyzer(String fieldName) {
            return delegate;
          }

          @Override
          public int getPositionIncrementGap(String fieldName) {
            return positionGap;
          }

          @Override
          public int getOffsetGap(String fieldName) {
            return offsetGap;
          }
        };

    final RandomIndexWriter writer = new RandomIndexWriter(random(), newDirectory(), a);
    final Document doc = new Document();
    final FieldType ft = new FieldType();
    ft.setIndexOptions(IndexOptions.DOCS);
    ft.setTokenized(true);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPositions(true);
    ft.setStoreTermVectorOffsets(true);
    doc.add(new Field("f", "a", ft));
    doc.add(new Field("f", "a", ft));
    writer.addDocument(doc);
    final LeafReader reader = getOnlySegmentReader(writer.getReader());
    final Fields fields = reader.getTermVectors(0);
    final Terms terms = fields.terms("f");
    final TermsEnum te = terms.iterator();
    assertEquals(new BytesRef("a"), te.next());
    final PostingsEnum dpe = te.postings(null, PostingsEnum.ALL);
    assertEquals(0, dpe.nextDoc());
    assertEquals(2, dpe.freq());
    assertEquals(0, dpe.nextPosition());
    assertEquals(0, dpe.startOffset());
    final int endOffset = dpe.endOffset();
    assertEquals(1 + positionGap, dpe.nextPosition());
    assertEquals(1 + endOffset + offsetGap, dpe.endOffset());
    assertEquals(null, te.next());
    reader.close();
    writer.close();
    writer.w.getDirectory().close();
  }
Example #16
  /**
   * Construct Indexer
   *
   * @param directory the main BlackLab index directory
   * @param create if true, creates a new index; otherwise, appends to existing index
   * @param docIndexerClass how to index the files, or null to autodetect
   * @param indexTemplateFile JSON file to use as template for index structure / metadata (if
   *     creating new index)
   * @throws DocumentFormatException if no DocIndexer was specified and autodetection failed
   * @throws IOException
   */
  public Indexer(
      File directory,
      boolean create,
      Class<? extends DocIndexer> docIndexerClass,
      File indexTemplateFile)
      throws DocumentFormatException, IOException {
    this.docIndexerClass = docIndexerClass;

    searcher = Searcher.openForWriting(directory, create, indexTemplateFile);
    if (!create) searcher.getIndexStructure().setModified();

    if (this.docIndexerClass == null) {
      // No DocIndexer supplied; try to detect it from the index
      // metadata.
      String formatId = searcher.getIndexStructure().getDocumentFormat();
      if (formatId != null && formatId.length() > 0)
        setDocIndexer(DocumentFormats.getIndexerClass(formatId));
      else {
        throw new DocumentFormatException("Cannot detect document format for index!");
      }
    }

    metadataFieldTypeTokenized = new FieldType();
    metadataFieldTypeTokenized.setStored(true);
    metadataFieldTypeTokenized.setIndexed(true);
    metadataFieldTypeTokenized.setTokenized(true);
    metadataFieldTypeTokenized.setOmitNorms(true); // @@@ <-- depending on setting?
    metadataFieldTypeTokenized.setStoreTermVectors(true);
    metadataFieldTypeTokenized.setStoreTermVectorPositions(true);
    metadataFieldTypeTokenized.setStoreTermVectorOffsets(true);
    metadataFieldTypeTokenized.freeze();

    metadataFieldTypeUntokenized = new FieldType(metadataFieldTypeTokenized);
    metadataFieldTypeUntokenized.setTokenized(false);
    metadataFieldTypeUntokenized.freeze();
  }
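For orientation, here is a minimal construction sketch for this constructor (the directory path is hypothetical): passing false for create appends to an existing index, and a null docIndexerClass exercises the autodetection branch above, which reads the document format from the index metadata.

  File indexDir = new File("/data/blacklab-index"); // hypothetical index location
  // Append to an existing index; the format is autodetected because docIndexerClass is null.
  Indexer indexer = new Indexer(indexDir, false, null, null);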
Example #17
  private QueryEngine(File path)
      throws FileNotFoundException, UnsupportedEncodingException, IOException {
    dir = new RAMDirectory();
    StandardAnalyzer sa = new StandardAnalyzer(Version.LUCENE_40);
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_40, sa);
    // Set the similarity before creating the writer; changing the config afterwards has no effect.
    iwc.setSimilarity(new DefaultSimilarity());
    IndexWriter iw = new IndexWriter(dir, iwc);

    File[] files = path.listFiles();
    FileInputStream fis;
    for (File fi : files) {
      String out = "";
      String curline;
      BufferedReader br = new BufferedReader(new FileReader(fi));
      while ((curline = br.readLine()) != null) {
        out += (curline + "\n");
      }
      br.close();
      fis = new FileInputStream(fi);
      Document doc = new Document();
      doc.add(new StringField("path", fi.getAbsolutePath(), Field.Store.YES));
      //           doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis,
      // "UTF-8"))));

      // Use the FieldType we just configured (previously it was built but never used,
      // and the deprecated enum-based Field constructor was called instead).
      FieldType ft = new FieldType();
      ft.setIndexed(true);
      ft.setTokenized(true);
      ft.setStored(true);
      ft.setStoreTermVectors(true);
      doc.add(new Field("contents", out, ft));
      iw.addDocument(doc);
      fis.close();
    }
    iw.close();
    IndexReader ir = DirectoryReader.open(dir);
    ir2 = ir;
    is = new IndexSearcher(ir);
    is.setSimilarity(new DefaultSimilarity());
    qp = new QueryParser(Version.LUCENE_40, "contents", sa);
  }
  private boolean isAHit(Query q, String content, Analyzer analyzer) throws IOException {
    Directory ramDir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), ramDir, analyzer);
    Document doc = new Document();
    FieldType fieldType = new FieldType();
    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    fieldType.setTokenized(true);
    fieldType.setStored(true);
    Field field = new Field(FIELD, content, fieldType);
    doc.add(field);
    writer.addDocument(doc);
    writer.close();
    DirectoryReader ir = DirectoryReader.open(ramDir);
    IndexSearcher is = new IndexSearcher(ir);

    int hits = is.search(q, 10).totalHits;
    ir.close();
    ramDir.close();
    return hits == 1;
  }
Example #19
 static {
   TYPE_STORED_NOT_INDEXED.setIndexed(false);
   TYPE_STORED_NOT_INDEXED.setStored(true);
   TYPE_STORED_NOT_INDEXED.setTokenized(true);
   TYPE_STORED_NOT_INDEXED.freeze();
 }
 static {
   FIELD_TYPE.setIndexed(true);
   FIELD_TYPE.setTokenized(true);
   FIELD_TYPE.freeze();
 }
    public void indexDoc() throws IOException {
      Document d = new Document();

      FieldType customType1 = new FieldType(TextField.TYPE_STORED);
      customType1.setTokenized(false);
      customType1.setOmitNorms(true);

      ArrayList<Field> fields = new ArrayList<>();
      String idString = getIdString();
      Field idField = newField("id", idString, customType1);
      fields.add(idField);

      int nFields = nextInt(maxFields);
      for (int i = 0; i < nFields; i++) {

        FieldType customType = new FieldType();
        switch (nextInt(4)) {
          case 0:
            break;
          case 1:
            customType.setStoreTermVectors(true);
            break;
          case 2:
            customType.setStoreTermVectors(true);
            customType.setStoreTermVectorPositions(true);
            break;
          case 3:
            customType.setStoreTermVectors(true);
            customType.setStoreTermVectorOffsets(true);
            break;
        }

        switch (nextInt(4)) {
          case 0:
            customType.setStored(true);
            customType.setOmitNorms(true);
            customType.setIndexed(true);
            fields.add(newField("f" + nextInt(100), getString(1), customType));
            break;
          case 1:
            customType.setIndexed(true);
            customType.setTokenized(true);
            fields.add(newField("f" + nextInt(100), getString(0), customType));
            break;
          case 2:
            customType.setStored(true);
            customType.setStoreTermVectors(false);
            customType.setStoreTermVectorOffsets(false);
            customType.setStoreTermVectorPositions(false);
            fields.add(newField("f" + nextInt(100), getString(0), customType));
            break;
          case 3:
            customType.setStored(true);
            customType.setIndexed(true);
            customType.setTokenized(true);
            fields.add(newField("f" + nextInt(100), getString(bigFieldSize), customType));
            break;
        }
      }

      if (sameFieldOrder) {
        Collections.sort(fields, fieldNameComparator);
      } else {
        // random placement of id field also
        Collections.swap(fields, nextInt(fields.size()), 0);
      }

      for (int i = 0; i < fields.size(); i++) {
        d.add(fields.get(i));
      }
      if (VERBOSE) {
        System.out.println(Thread.currentThread().getName() + ": indexing id:" + idString);
      }
      w.updateDocument(new Term("id", idString), d);
      // System.out.println(Thread.currentThread().getName() + ": indexing "+d);
      docs.put(idString, d);
    }
Example #22
  /**
   * Method to create the Lucene index. Keep in mind that the text value must always be indexed in
   * Lucene when calculating cosine similarity: you have to generate the tokens, the terms and
   * their frequencies, and store them in the Lucene index.
   *
   * @throws CorruptIndexException
   * @throws LockObtainFailedException
   * @throws IOException
   */
  public void index() throws CorruptIndexException, LockObtainFailedException, IOException {

    System.out.println(">>> Process source directory  : " + sourceDirectory.getAbsolutePath());

    System.out.println(">>> Create index in directory : " + indexDirectory.getAbsolutePath());

    Directory dir = FSDirectory.open(indexDirectory);

    Analyzer analyzer = new StandardAnalyzer(StandardAnalyzer.STOP_WORDS_SET); // using stop words
    // Analyzer analyzer = new StandardAnalyzer(CharArraySet.EMPTY_SET); // no stop words (empty stop-word set)
    // System.out.println(">>> Overwrite the Analyser: DO NOT USE STOPWORDS FILTER !!!");

    /**
     * Source: http://toolongdidntread.com/lucene/using-lucene-to-generate-n-grams/
     *
     * <p>1) Run a document through an Analyzer which filters out the stuff we don’t care about.
     * SimpleAnalyzer, in this case, applies a lower case filter and a letter tokenizer, which makes
     * all text lowercase and divides text at non-letters, respectively.
     *
     * <p>2) Wrap this analyzer with ShingleAnalyzerWrapper which constructs shingles (token
     * n-grams) from a stream. This is the main thing we want to accomplish.
     *
     * <p>3) We generate a TokenStream which enumerates (a fancy word for establishes) “fields” from
     * a “document” (what I mentioned earlier).
     *
     * <p>4) Given a token stream, we want to extract certain things from it, like just the
     * characters and not all the other stuff that comes along with the stream. We’ll use
     * CharTermAttribute which extract just the words from the stream.
     *
     * <p>5) Finally, we iterate over the stream by incrementing the tokens, extracting each
     * CharTermAttribute from the tokens.
     */
    if (useNgramsAsTerms) {
      analyzer = getNGramAnalyser(analyzer, N);
    }

    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_10_2, analyzer);

    if (indexDirectory.exists()) {
      // Add new documents to the existing index:
      iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    } else {
      // Create a new index:
      iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    }

    IndexWriter writer = new IndexWriter(dir, iwc);

    for (File f : sourceDirectory.listFiles()) {

      System.out.println("> DOC  :  " + f.getAbsolutePath());

      if (f.isDirectory()) {

        System.out.println(">>> Indexer processes FILE : " + f.getAbsolutePath());

        for (File fileTXT : f.listFiles()) {

          String at = getAllText(fileTXT);

          System.out.println("> file  :  " + fileTXT.getAbsolutePath());
          Document doc = new Document();

          FieldType fieldType = new FieldType();
          fieldType.setIndexed(true);
          fieldType.setIndexOptions(
              FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
          fieldType.setStored(true);
          fieldType.setStoreTermVectors(true);
          fieldType.setTokenized(true);
          Field contentField = new Field(fieldName, at, fieldType);
          doc.add(contentField);

          FieldType fieldType2 = new FieldType();
          fieldType2.setIndexed(true);
          fieldType2.setStored(true);
          fieldType2.setStoreTermVectors(false);
          fieldType2.setTokenized(false);
          Field idField = new Field("id", fileTXT.getAbsolutePath(), fieldType2);
          doc.add(idField);

          FieldType fieldType3 = new FieldType();
          fieldType3.setIndexed(false);
          fieldType3.setStored(true);
          fieldType3.setStoreTermVectors(false);
          fieldType3.setTokenized(false);
          Field rawField = new Field("raw", at, fieldType3);
          doc.add(rawField);

          writer.addDocument(doc);
        }

      } else {

        if (!f.getName().startsWith(".DS_Store")) {

          String at = getAllText(f);

          Document doc = new Document();
          FieldType fieldType = new FieldType();
          fieldType.setIndexed(true);
          fieldType.setIndexOptions(
              FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
          fieldType.setStored(true);
          fieldType.setStoreTermVectors(true);
          fieldType.setTokenized(true);
          Field contentField = new Field(fieldName, at, fieldType);
          doc.add(contentField);

          FieldType fieldType2 = new FieldType();
          fieldType2.setIndexed(true);
          fieldType2.setStored(true);
          fieldType2.setStoreTermVectors(false);
          fieldType2.setTokenized(false);
          Field idField = new Field("id", f.getAbsolutePath(), fieldType2);
          doc.add(idField);

          FieldType fieldType3 = new FieldType();
          fieldType3.setIndexed(false);
          fieldType3.setStored(true);
          fieldType3.setStoreTermVectors(false);
          fieldType3.setTokenized(false);
          Field rawField = new Field("raw", at, fieldType3);
          doc.add(rawField);

          writer.addDocument(doc);
        }
      }
    }
    writer.close();
  }
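To make steps 1-5 above concrete, here is a small sketch (assuming getNGramAnalyser wraps the base analyzer in a ShingleAnalyzerWrapper, as the quoted source describes; the sample text and shingle size are illustrative) that runs a string through the shingled analyzer and tallies term frequencies, the raw material for a cosine-similarity computation:

  Analyzer base = new StandardAnalyzer(StandardAnalyzer.STOP_WORDS_SET);
  Analyzer shingles = new ShingleAnalyzerWrapper(base, 2, 2); // word bigrams; shingle size is configurable

  Map<String, Integer> termFreqs = new HashMap<>();
  try (TokenStream ts = shingles.tokenStream(fieldName, new StringReader("the quick brown fox"))) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); // step 4: just the characters
    ts.reset();                                        // step 3: enumerate the field's tokens
    while (ts.incrementToken()) {                      // step 5: iterate over the stream
      termFreqs.merge(term.toString(), 1, Integer::sum); // term -> frequency
    }
    ts.end();
  }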
  @Slow
  public void testNoWaitClose() throws Throwable {
    Directory directory = newDirectory();

    if (directory instanceof MockDirectoryWrapper) {
      ((MockDirectoryWrapper) directory).setPreventDoubleWrite(false);
    }

    final Document doc = new Document();
    FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
    customType.setTokenized(false);

    Field idField = newField("id", "", customType);
    doc.add(idField);

    for (int pass = 0; pass < 2; pass++) {
      if (VERBOSE) {
        System.out.println("TEST: pass=" + pass);
      }

      // Writer setup (the exact config values and the iteration count are assumed; the
      // original lines were redacted in this snippet). A LogMergePolicy is required
      // because of the setMergeFactor cast further down.
      IndexWriter writer =
          new IndexWriter(
              directory,
              newIndexWriterConfig(new MockAnalyzer(random()))
                  .setOpenMode(OpenMode.CREATE)
                  .setMaxBufferedDocs(2)
                  .setMergePolicy(newLogMergePolicy())
                  .setCommitOnClose(false));

      for (int iter = 0; iter < 10; iter++) {
        if (VERBOSE) {
          System.out.println("TEST: iter=" + iter);
        }
        for (int j = 0; j < 199; j++) {
          idField.setStringValue(Integer.toString(iter * 201 + j));
          writer.addDocument(doc);
        }

        int delID = iter * 199;
        for (int j = 0; j < 20; j++) {
          writer.deleteDocuments(new Term("id", Integer.toString(delID)));
          delID += 5;
        }

        writer.commit();

        // Force a bunch of merge threads to kick off so we
        // stress out aborting them on close:
        ((LogMergePolicy) writer.getConfig().getMergePolicy()).setMergeFactor(2);

        final IndexWriter finalWriter = writer;
        final AtomicReference<Throwable> failure = new AtomicReference<>();
        Thread t1 =
            new Thread() {
              @Override
              public void run() {
                boolean done = false;
                while (!done) {
                  for (int i = 0; i < 100; i++) {
                    try {
                      finalWriter.addDocument(doc);
                    } catch (AlreadyClosedException e) {
                      done = true;
                      break;
                    } catch (NullPointerException e) {
                      done = true;
                      break;
                    } catch (Throwable e) {
                      e.printStackTrace(System.out);
                      failure.set(e);
                      done = true;
                      break;
                    }
                  }
                  Thread.yield();
                }
              }
            };

        t1.start();

        writer.close();
        t1.join();

        if (failure.get() != null) {
          throw failure.get();
        }

        // Make sure reader can read
        IndexReader reader = DirectoryReader.open(directory);
        reader.close();

        // Reopen
        writer =
            new IndexWriter(
                directory,
                newIndexWriterConfig(new MockAnalyzer(random()))
                    .setOpenMode(OpenMode.APPEND)
                    .setMergePolicy(newLogMergePolicy())
                    .setCommitOnClose(false));
      }
      writer.close();
    }

    directory.close();
  }
Example #24
 static {
   customType5 = new FieldType(TextField.TYPE_STORED);
   customType5.setOmitNorms(true);
   customType5.setTokenized(false);
   noNormsField = new Field(NO_NORMS_KEY, NO_NORMS_TEXT, customType5);
 }
  protected void _add(Topic topic) {
    if (topic == null) return; // unlikely, but guard against it anyway
    // not indexing replies for now
    dao.fetchLinks(topic, "replies");
    Document document;
    document = new Document();
    Field field;
    FieldType fieldType;

    // add the id first
    fieldType = new FieldType();
    fieldType.setIndexed(true); // indexed
    fieldType.setStored(true); // stored
    fieldType.setStoreTermVectors(true);
    fieldType.setTokenized(true);
    fieldType.setStoreTermVectorPositions(true); // store term vector positions
    fieldType.setStoreTermVectorOffsets(true); // store term vector offsets
    field = new Field("id", topic.getId(), fieldType);
    document.add(field);

    // add the title
    fieldType = new FieldType();
    fieldType.setIndexed(true); // indexed
    fieldType.setStored(true); // stored
    fieldType.setStoreTermVectors(true);
    fieldType.setTokenized(true);
    fieldType.setStoreTermVectorPositions(true); // store term vector positions
    fieldType.setStoreTermVectorOffsets(true); // store term vector offsets
    field = new Field("title", topic.getTitle(), fieldType);
    document.add(field);

    // add the article content
    fieldType = new FieldType();
    fieldType.setIndexed(true); // indexed
    fieldType.setStored(false); // not stored
    fieldType.setStoreTermVectors(true);
    fieldType.setTokenized(true);
    fieldType.setStoreTermVectorPositions(true); // store term vector positions
    fieldType.setStoreTermVectorOffsets(true); // store term vector offsets
    field = new Field("content", topic.getContent(), fieldType);
    document.add(field);

    StringBuilder sb = new StringBuilder();
    if (topic.getReplies() != null) {
      for (TopicReply reply : topic.getReplies()) {
        if (reply == null) continue;
        bigContentService.fill(reply);
        if (reply.getContent() != null) {
          if (sb.length() + reply.getContent().length() > (IndexWriter.MAX_TERM_LENGTH / 3)) {
            break;
          }
          sb.append(reply.getContent());
        }
      }
    }
    fieldType = new FieldType();
    fieldType.setIndexed(true); // indexed
    fieldType.setStored(false); // not stored
    fieldType.setStoreTermVectors(true);
    fieldType.setTokenized(true);
    fieldType.setStoreTermVectorPositions(true); // store term vector positions
    fieldType.setStoreTermVectorOffsets(true); // store term vector offsets

    field = new Field("reply", sb.toString(), fieldType);
    document.add(field);

    try {
      luceneIndex.writer.addDocument(document);
    } catch (IOException e) {
      log.debug("failed to add topic to index: id=" + topic.getId());
    } catch (Error e) {
      log.debug("failed to add topic to index: id=" + topic.getId());
    }
  }
  public void testRandomStoredFields() throws IOException {
    Directory dir = newDirectory();
    Random rand = random();
    RandomIndexWriter w =
        new RandomIndexWriter(
            rand,
            dir,
            newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))
                .setMaxBufferedDocs(_TestUtil.nextInt(rand, 5, 20)));
    // w.w.setNoCFSRatio(0.0);
    final int docCount = atLeast(200);
    final int fieldCount = _TestUtil.nextInt(rand, 1, 5);

    final List<Integer> fieldIDs = new ArrayList<Integer>();

    FieldType customType = new FieldType(TextField.TYPE_STORED);
    customType.setTokenized(false);
    Field idField = newField("id", "", customType);

    for (int i = 0; i < fieldCount; i++) {
      fieldIDs.add(i);
    }

    final Map<String, Document> docs = new HashMap<String, Document>();

    if (VERBOSE) {
      System.out.println("TEST: build index docCount=" + docCount);
    }

    FieldType customType2 = new FieldType();
    customType2.setStored(true);
    for (int i = 0; i < docCount; i++) {
      Document doc = new Document();
      doc.add(idField);
      final String id = "" + i;
      idField.setStringValue(id);
      docs.put(id, doc);
      if (VERBOSE) {
        System.out.println("TEST: add doc id=" + id);
      }

      for (int field : fieldIDs) {
        final String s;
        if (rand.nextInt(4) != 3) {
          s = _TestUtil.randomUnicodeString(rand, 1000);
          doc.add(newField("f" + field, s, customType2));
        } else {
          s = null;
        }
      }
      w.addDocument(doc);
      if (rand.nextInt(50) == 17) {
        // mixup binding of field name -> Number every so often
        Collections.shuffle(fieldIDs, rand);
      }
      if (rand.nextInt(5) == 3 && i > 0) {
        final String delID = "" + rand.nextInt(i);
        if (VERBOSE) {
          System.out.println("TEST: delete doc id=" + delID);
        }
        w.deleteDocuments(new Term("id", delID));
        docs.remove(delID);
      }
    }

    if (VERBOSE) {
      System.out.println("TEST: " + docs.size() + " docs in index; now load fields");
    }
    if (docs.size() > 0) {
      String[] idsList = docs.keySet().toArray(new String[docs.size()]);

      for (int x = 0; x < 2; x++) {
        IndexReader r = w.getReader();
        IndexSearcher s = newSearcher(r);

        if (VERBOSE) {
          System.out.println("TEST: cycle x=" + x + " r=" + r);
        }

        int num = atLeast(1000);
        for (int iter = 0; iter < num; iter++) {
          String testID = idsList[rand.nextInt(idsList.length)];
          if (VERBOSE) {
            System.out.println("TEST: test id=" + testID);
          }
          TopDocs hits = s.search(new TermQuery(new Term("id", testID)), 1);
          assertEquals(1, hits.totalHits);
          Document doc = r.document(hits.scoreDocs[0].doc);
          Document docExp = docs.get(testID);
          for (int i = 0; i < fieldCount; i++) {
            assertEquals(
                "doc " + testID + ", field f" + fieldCount + " is wrong",
                docExp.get("f" + i),
                doc.get("f" + i));
          }
        }
        r.close();
        w.forceMerge(1);
      }
    }
    w.close();
    dir.close();
  }