Example #1
 static {
   // Id
   IdFielType = new FieldType();
   IdFielType.setStored(true);
   IdFielType.setTokenized(false);
   IdFielType.setOmitNorms(true);
   IdFielType.setIndexOptions(IndexOptions.DOCS);
   IdFielType.freeze();
   // content
   ContentFielType = new FieldType();
   ContentFielType.setStored(false);
   ContentFielType.setTokenized(true);
   ContentFielType.setOmitNorms(false);
   ContentFielType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
   ContentFielType.freeze();
   // title
   TitleFielType = new FieldType();
   TitleFielType.setStored(true);
   TitleFielType.setTokenized(true);
   TitleFielType.setOmitNorms(false);
   TitleFielType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
   TitleFielType.freeze();
   // onlyForStore
   OnLyStoreFieldType = new FieldType();
   OnLyStoreFieldType.setStored(true);
   OnLyStoreFieldType.setTokenized(false);
   OnLyStoreFieldType.setOmitNorms(false);
   OnLyStoreFieldType.setIndexOptions(IndexOptions.NONE);
   OnLyStoreFieldType.freeze();
 }
Example #2
    static {
      LABEL_FIELD_TYPE.setStored(true);
      LABEL_FIELD_TYPE.setTokenized(true);
      LABEL_FIELD_TYPE.freeze();

      URI_FIELD_TYPE.setStored(true);
      URI_FIELD_TYPE.setTokenized(false);
      URI_FIELD_TYPE.freeze();

      FIELD_TYPE.setStored(true);
      FIELD_TYPE.freeze();
    }
  // LUCENE-1727: make sure doc fields are stored in order
  public void testStoredFieldsOrder() throws Throwable {
    Directory d = newDirectory();
    IndexWriter w =
        new IndexWriter(d, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())));
    Document doc = new Document();

    FieldType customType = new FieldType();
    customType.setStored(true);
    doc.add(newField("zzz", "a b c", customType));
    doc.add(newField("aaa", "a b c", customType));
    doc.add(newField("zzz", "1 2 3", customType));
    w.addDocument(doc);
    IndexReader r = w.getReader();
    Document doc2 = r.document(0);
    Iterator<IndexableField> it = doc2.getFields().iterator();
    assertTrue(it.hasNext());
    Field f = (Field) it.next();
    assertEquals(f.name(), "zzz");
    assertEquals(f.stringValue(), "a b c");

    assertTrue(it.hasNext());
    f = (Field) it.next();
    assertEquals(f.name(), "aaa");
    assertEquals(f.stringValue(), "a b c");

    assertTrue(it.hasNext());
    f = (Field) it.next();
    assertEquals(f.name(), "zzz");
    assertEquals(f.stringValue(), "1 2 3");
    assertFalse(it.hasNext());
    r.close();
    w.close();
    d.close();
  }
  /**
   * Used for adding a document when a field needs to be created from a type and a string.
   *
   * <p>By default, the indexed value is the same as the stored value (taken from toInternal()).
   * Having a different representation for external, internal, and indexed would present quite a few
   * problems given the current Lucene architecture. An analyzer for adding docs would need to
   * translate internal-&gt;indexed while an analyzer for querying would need to translate
   * external-&gt;indexed.
   *
   * <p>The only other alternative to having internal==indexed would be to have internal==external.
   * In this case, toInternal should convert to the indexed representation, toExternal() should do
   * nothing, and createField() should *not* call toInternal, but use the external value and set
   * tokenized=true to get Lucene to convert to the internal(indexed) form. :TODO: clean up and
   * clarify this explanation.
   *
   * @see #toInternal
   */
  public StorableField createField(SchemaField field, Object value, float boost) {
    if (!field.indexed() && !field.stored()) {
      if (log.isTraceEnabled()) log.trace("Ignoring unindexed/unstored field: " + field);
      return null;
    }

    String val;
    try {
      val = toInternal(value.toString());
    } catch (RuntimeException e) {
      throw new SolrException(
          SolrException.ErrorCode.SERVER_ERROR,
          "Error while creating field '" + field + "' from value '" + value + "'",
          e);
    }
    if (val == null) return null;

    org.apache.lucene.document.FieldType newType = new org.apache.lucene.document.FieldType();
    newType.setTokenized(field.isTokenized());
    newType.setStored(field.stored());
    newType.setOmitNorms(field.omitNorms());
    newType.setIndexOptions(field.indexed() ? getIndexOptions(field, val) : IndexOptions.NONE);
    newType.setStoreTermVectors(field.storeTermVector());
    newType.setStoreTermVectorOffsets(field.storeTermOffsets());
    newType.setStoreTermVectorPositions(field.storeTermPositions());
    newType.setStoreTermVectorPayloads(field.storeTermPayloads());

    return createField(field.getName(), val, newType, boost);
  }
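A rough sketch of the internal==indexed convention that the Javadoc above describes: a custom type only overrides toInternal(), and the createField() shown here then stores and indexes that single internal value. The subclass, its name, and the lowercasing rule are illustrative assumptions, not code from this snippet.

  // Hypothetical Solr field type (illustrative only): the external value is normalized once in
  // toInternal(); createField() then uses that same internal form as the stored and indexed value.
  public class LowercasedKeywordField extends StrField {
    @Override
    public String toInternal(String val) {
      // external -> internal (== indexed) representation
      return val == null ? null : val.toLowerCase(java.util.Locale.ROOT);
    }
  }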
  // LUCENE-325: test forceMergeDeletes without waiting, when
  // many adjacent merges are required
  public void testForceMergeDeletes3() throws IOException {
    Directory dir = newDirectory();
    IndexWriter writer =
        new IndexWriter(
            dir,
            newIndexWriterConfig(new MockAnalyzer(random()))
                .setMaxBufferedDocs(2)
                .setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH)
                .setMergePolicy(newLogMergePolicy(50)));

    FieldType customType = new FieldType();
    customType.setStored(true);

    FieldType customType1 = new FieldType(TextField.TYPE_NOT_STORED);
    customType1.setTokenized(false);
    customType1.setStoreTermVectors(true);
    customType1.setStoreTermVectorPositions(true);
    customType1.setStoreTermVectorOffsets(true);

    Document document = new Document();
    Field storedField = newField("stored", "stored", customType);
    document.add(storedField);
    Field termVectorField = newField("termVector", "termVector", customType1);
    document.add(termVectorField);
    Field idField = newStringField("id", "", Field.Store.NO);
    document.add(idField);
    for (int i = 0; i < 98; i++) {
      idField.setStringValue("" + i);
      writer.addDocument(document);
    }
    writer.close();

    IndexReader ir = DirectoryReader.open(dir);
    assertEquals(98, ir.maxDoc());
    assertEquals(98, ir.numDocs());
    ir.close();

    IndexWriterConfig dontMergeConfig =
        new IndexWriterConfig(new MockAnalyzer(random())).setMergePolicy(NoMergePolicy.INSTANCE);
    writer = new IndexWriter(dir, dontMergeConfig);
    for (int i = 0; i < 98; i += 2) {
      writer.deleteDocuments(new Term("id", "" + i));
    }
    writer.close();
    ir = DirectoryReader.open(dir);
    assertEquals(49, ir.numDocs());
    ir.close();

    writer =
        new IndexWriter(
            dir,
            newIndexWriterConfig(new MockAnalyzer(random())).setMergePolicy(newLogMergePolicy(3)));
    writer.forceMergeDeletes(false);
    writer.close();
    ir = DirectoryReader.open(dir);
    assertEquals(49, ir.maxDoc());
    assertEquals(49, ir.numDocs());
    ir.close();
    dir.close();
  }
 static {
   FIELD_TYPE.setIndexed(true);
   FIELD_TYPE.setTokenized(false);
   FIELD_TYPE.setStored(true);
   FIELD_TYPE.setOmitNorms(true);
   FIELD_TYPE.setIndexOptions(IndexOptions.DOCS_ONLY);
   FIELD_TYPE.freeze();
 }
Example #7
 @Test
 public void testName() throws Exception {
   FieldType TYPE_STORED = new FieldType();
   TYPE_STORED.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
   TYPE_STORED.setTokenized(false);
   TYPE_STORED.setStored(true);
   TYPE_STORED.freeze();
   System.out.println(TYPE_STORED);
 }
 private void addDoc(IndexWriter writer, String text, String count) throws IOException {
   Document doc = new Document();
   doc.add(new Field("ngram", text, StringField.TYPE_NOT_STORED));
   FieldType fieldType = new FieldType();
   fieldType.setStored(true);
   Field countField = new Field("count", count, fieldType);
   doc.add(countField);
   writer.addDocument(doc);
 }
 static {
   // Default: pointValues + docValues
   FieldType type = new FieldType();
   type.setDimensions(1, Double.BYTES); // pointValues (assume Double)
   type.setDocValuesType(DocValuesType.NUMERIC); // docValues
   type.setStored(false);
   type.freeze();
   DEFAULT_FIELDTYPE = type;
   // Legacy default: legacyNumerics
   type = new FieldType();
   type.setIndexOptions(IndexOptions.DOCS);
   type.setNumericType(FieldType.LegacyNumericType.DOUBLE);
   type.setNumericPrecisionStep(8); // same as solr default
   type.setDocValuesType(DocValuesType.NONE); // no docValues!
   type.setStored(false);
   type.freeze();
   LEGACY_FIELDTYPE = type;
 }
    static {
      FIELD_TYPE.setIndexed(true);
      FIELD_TYPE.setTokenized(false);
      FIELD_TYPE.setStored(true);
      FIELD_TYPE.setOmitNorms(true);
      FIELD_TYPE.setIndexOptions(
          FieldInfo.IndexOptions
              .DOCS_AND_FREQS_AND_POSITIONS); // we store payload (otherwise, we really need just
                                              // docs)
      FIELD_TYPE.freeze();

      NESTED_FIELD_TYPE.setIndexed(true);
      NESTED_FIELD_TYPE.setTokenized(false);
      NESTED_FIELD_TYPE.setStored(false);
      NESTED_FIELD_TYPE.setOmitNorms(true);
      // we can set this to another index option when we move away from storing payload..
      // NESTED_FIELD_TYPE.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY);
      NESTED_FIELD_TYPE.freeze();
    }
Example #11
 static {
   UID_FIELD_TYPE.setIndexed(true);
   UID_FIELD_TYPE.setTokenized(false);
   UID_FIELD_TYPE.setStored(true);
   UID_FIELD_TYPE.setOmitNorms(true);
   UID_FIELD_TYPE.setIndexOptions(
       FieldInfo.IndexOptions
           .DOCS_AND_FREQS_AND_POSITIONS); // we store payload (otherwise, we really need just
                                           // docs)
   UID_FIELD_TYPE.freeze();
 }
  public void testReadSkip() throws IOException {
    Directory dir = newDirectory();
    IndexWriterConfig iwConf =
        newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
    iwConf.setMaxBufferedDocs(RandomInts.randomIntBetween(random(), 2, 30));
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwConf);

    FieldType ft = new FieldType();
    ft.setStored(true);
    ft.freeze();

    final String string = _TestUtil.randomSimpleString(random(), 50);
    final byte[] bytes = string.getBytes("UTF-8");
    final long l = random().nextBoolean() ? random().nextInt(42) : random().nextLong();
    final int i = random().nextBoolean() ? random().nextInt(42) : random().nextInt();
    final float f = random().nextFloat();
    final double d = random().nextDouble();

    List<Field> fields =
        Arrays.asList(
            new Field("bytes", bytes, ft),
            new Field("string", string, ft),
            new LongField("long", l, Store.YES),
            new IntField("int", i, Store.YES),
            new FloatField("float", f, Store.YES),
            new DoubleField("double", d, Store.YES));

    for (int k = 0; k < 100; ++k) {
      Document doc = new Document();
      for (Field fld : fields) {
        doc.add(fld);
      }
      iw.w.addDocument(doc);
    }
    iw.commit();

    final DirectoryReader reader = DirectoryReader.open(dir);
    final int docID = random().nextInt(100);
    for (Field fld : fields) {
      String fldName = fld.name();
      final Document sDoc = reader.document(docID, Collections.singleton(fldName));
      final IndexableField sField = sDoc.getField(fldName);
      if (Field.class.equals(fld.getClass())) {
        assertEquals(fld.binaryValue(), sField.binaryValue());
        assertEquals(fld.stringValue(), sField.stringValue());
      } else {
        assertEquals(fld.numericValue(), sField.numericValue());
      }
    }
    reader.close();
    iw.close();
    dir.close();
  }
  static {
    TYPE_NOT_STORED.setOmitNorms(true);
    TYPE_NOT_STORED.setIndexOptions(IndexOptions.DOCS);
    TYPE_NOT_STORED.setTokenized(false);
    TYPE_NOT_STORED.freeze();

    TYPE_STORED.setOmitNorms(true);
    TYPE_STORED.setIndexOptions(IndexOptions.DOCS);
    TYPE_STORED.setStored(true);
    TYPE_STORED.setTokenized(false);
    TYPE_STORED.freeze();
  }
  /**
   * Translates the pre-4.0 enums for specifying how a field should be indexed into the 4.0 {@link
   * FieldType} approach.
   *
   * @deprecated This is here only to ease transition from the pre-4.0 APIs.
   */
  @Deprecated
  public static final FieldType translateFieldType(
      Store store, Index index, TermVector termVector) {
    final FieldType ft = new FieldType();

    ft.setStored(store == Store.YES);

    switch (index) {
      case ANALYZED:
        ft.setIndexed(true);
        ft.setTokenized(true);
        break;
      case ANALYZED_NO_NORMS:
        ft.setIndexed(true);
        ft.setTokenized(true);
        ft.setOmitNorms(true);
        break;
      case NOT_ANALYZED:
        ft.setIndexed(true);
        ft.setTokenized(false);
        break;
      case NOT_ANALYZED_NO_NORMS:
        ft.setIndexed(true);
        ft.setTokenized(false);
        ft.setOmitNorms(true);
        break;
      case NO:
        break;
    }

    switch (termVector) {
      case NO:
        break;
      case YES:
        ft.setStoreTermVectors(true);
        break;
      case WITH_POSITIONS:
        ft.setStoreTermVectors(true);
        ft.setStoreTermVectorPositions(true);
        break;
      case WITH_OFFSETS:
        ft.setStoreTermVectors(true);
        ft.setStoreTermVectorOffsets(true);
        break;
      case WITH_POSITIONS_OFFSETS:
        ft.setStoreTermVectors(true);
        ft.setStoreTermVectorPositions(true);
        ft.setStoreTermVectorOffsets(true);
        break;
    }
    ft.freeze();
    return ft;
  }
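A minimal usage sketch of the deprecated helper above. The field names and values are illustrative; Store, Index and TermVector are the pre-4.0 enums handled by the two switch statements.

  Document doc = new Document();

  // Pre-4.0 this field would have been constructed as
  //   new Field("body", "some analyzed text", Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS)
  FieldType bodyType = translateFieldType(Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS);
  doc.add(new Field("body", "some analyzed text", bodyType));

  // A stored-only identifier: Store.YES, Index.NO, TermVector.NO
  FieldType idType = translateFieldType(Store.YES, Index.NO, TermVector.NO);
  doc.add(new Field("id", "42", idType));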
 /**
  * Persists all snapshots information. If the given id and segment are not null, it persists their
  * information as well.
  */
 private void persistSnapshotInfos(String id, String segment) throws IOException {
   writer.deleteAll();
   Document d = new Document();
   FieldType ft = new FieldType();
   ft.setStored(true);
   d.add(new Field(SNAPSHOTS_ID, "", ft));
   for (Entry<String, String> e : super.getSnapshots().entrySet()) {
     d.add(new Field(e.getKey(), e.getValue(), ft));
   }
   if (id != null) {
     d.add(new Field(id, segment, ft));
   }
   writer.addDocument(d);
   writer.commit();
 }
Example #16
  /**
   * Tests various combinations of omitNorms=true/false, the field not existing at all, ensuring
   * that only omitNorms is 'viral'. Internally checks that MultiNorms.norms() is consistent
   * (returns the same bytes) as the fully merged equivalent.
   */
  public void testOmitNormsCombos() throws IOException {
    // indexed with norms
    FieldType customType = new FieldType(TextField.TYPE_STORED);
    Field norms = new Field("foo", "a", customType);
    // indexed without norms
    FieldType customType1 = new FieldType(TextField.TYPE_STORED);
    customType1.setOmitNorms(true);
    Field noNorms = new Field("foo", "a", customType1);
    // not indexed, but stored
    FieldType customType2 = new FieldType();
    customType2.setStored(true);
    Field noIndex = new Field("foo", "a", customType2);
    // not indexed but stored, omitNorms is set
    FieldType customType3 = new FieldType();
    customType3.setStored(true);
    customType3.setOmitNorms(true);
    Field noNormsNoIndex = new Field("foo", "a", customType3);
    // neither indexed nor stored (doesn't exist at all, we index a different field instead)
    Field emptyNorms = new Field("bar", "a", customType);

    assertNotNull(getNorms("foo", norms, norms));
    assertNull(getNorms("foo", norms, noNorms));
    assertNotNull(getNorms("foo", norms, noIndex));
    assertNotNull(getNorms("foo", norms, noNormsNoIndex));
    assertNotNull(getNorms("foo", norms, emptyNorms));
    assertNull(getNorms("foo", noNorms, noNorms));
    assertNull(getNorms("foo", noNorms, noIndex));
    assertNull(getNorms("foo", noNorms, noNormsNoIndex));
    assertNull(getNorms("foo", noNorms, emptyNorms));
    assertNull(getNorms("foo", noIndex, noIndex));
    assertNull(getNorms("foo", noIndex, noNormsNoIndex));
    assertNull(getNorms("foo", noIndex, emptyNorms));
    assertNull(getNorms("foo", noNormsNoIndex, noNormsNoIndex));
    assertNull(getNorms("foo", noNormsNoIndex, emptyNorms));
    assertNull(getNorms("foo", emptyNorms, emptyNorms));
  }
 public void testIndexedBit() throws Exception {
   Directory dir = newDirectory();
   RandomIndexWriter w = new RandomIndexWriter(random(), dir);
   Document doc = new Document();
   FieldType onlyStored = new FieldType();
   onlyStored.setStored(true);
   doc.add(new Field("field", "value", onlyStored));
   doc.add(new StringField("field2", "value", Field.Store.YES));
   w.addDocument(doc);
   IndexReader r = w.getReader();
   w.close();
   assertFalse(r.document(0).getField("field").fieldType().indexed());
   assertTrue(r.document(0).getField("field2").fieldType().indexed());
   r.close();
   dir.close();
 }
Example #18
  // LUCENE-1270
  public void testHangOnClose() throws IOException {

    Directory dir = newDirectory();
    LogByteSizeMergePolicy lmp = new LogByteSizeMergePolicy();
    lmp.setNoCFSRatio(0.0);
    lmp.setMergeFactor(100);
    IndexWriter writer =
        new IndexWriter(
            dir,
            newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))
                .setMaxBufferedDocs(5)
                .setMergePolicy(lmp));

    Document doc = new Document();
    FieldType customType = new FieldType(TextField.TYPE_STORED);
    customType.setStoreTermVectors(true);
    customType.setStoreTermVectorPositions(true);
    customType.setStoreTermVectorOffsets(true);
    doc.add(newField("content", "aaa bbb ccc ddd eee fff ggg hhh iii", customType));
    for (int i = 0; i < 60; i++) writer.addDocument(doc);

    Document doc2 = new Document();
    FieldType customType2 = new FieldType();
    customType2.setStored(true);
    doc2.add(newField("content", "aaa bbb ccc ddd eee fff ggg hhh iii", customType2));
    doc2.add(newField("content", "aaa bbb ccc ddd eee fff ggg hhh iii", customType2));
    doc2.add(newField("content", "aaa bbb ccc ddd eee fff ggg hhh iii", customType2));
    doc2.add(newField("content", "aaa bbb ccc ddd eee fff ggg hhh iii", customType2));
    for (int i = 0; i < 10; i++) writer.addDocument(doc2);
    writer.close();

    Directory dir2 = newDirectory();
    lmp = new LogByteSizeMergePolicy();
    lmp.setMinMergeMB(0.0001);
    lmp.setNoCFSRatio(0.0);
    lmp.setMergeFactor(4);
    writer =
        new IndexWriter(
            dir2,
            newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))
                .setMergeScheduler(new SerialMergeScheduler())
                .setMergePolicy(lmp));
    writer.addIndexes(dir);
    writer.close();
    dir.close();
    dir2.close();
  }
Example #19
 private IndexReader createIndex(int docCount, int facetFields, boolean ram)
     throws CorruptIndexException, LockObtainFailedException, IOException {
   Directory directory;
   if (ram) {
     directory = new RAMDirectory();
   } else {
     File dir = new File("./target/tmp/facet_tmp");
     if (dir.exists()) {
       directory = FSDirectory.open(dir);
       if (DirectoryReader.indexExists(directory)) {
         DirectoryReader reader = DirectoryReader.open(directory);
         if (reader.numDocs() == docCount) {
           return reader;
         }
         reader.close();
         directory.close();
       }
     }
     rmr(dir);
     directory = FSDirectory.open(dir);
   }
   IndexWriterConfig conf = new IndexWriterConfig(LUCENE_VERSION, new KeywordAnalyzer());
   IndexWriter writer = new IndexWriter(directory, conf);
   FieldType fieldType = new FieldType();
   fieldType.setStored(true);
   fieldType.setIndexed(true);
   fieldType.setOmitNorms(true);
   long start = System.nanoTime();
   for (int i = 0; i < docCount; i++) {
     long now = System.nanoTime();
     if (start + TimeUnit.SECONDS.toNanos(5) < now) {
       System.out.println("Indexing doc " + i + " of " + docCount);
       start = System.nanoTime();
     }
     Document document = new Document();
     document.add(new Field("f1", "value", fieldType));
     document.add(new Field("f2", "v" + i, fieldType));
     for (int f = 0; f < facetFields; f++) {
       document.add(new Field("facet" + f, "value", fieldType));
     }
     writer.addDocument(document);
   }
   writer.close();
   return DirectoryReader.open(directory);
 }
  /**
   * Indexes a single PDF file.
   *
   * @param f the PDF file
   * @param writer the IndexWriter
   * @throws IOException
   */
  public static void indexFile(File f, IndexWriter writer) throws IOException {

    // Load the file with PDFBox
    PDDocument pddDocument = PDDocument.load(f.getAbsolutePath());
    PDFTextStripper textStripper = new PDFTextStripper();
    int numPages = pddDocument.getNumberOfPages();
    String pageContent;

    // Declare a custom field type
    FieldType fieldText = new FieldType();
    fieldText.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    fieldText.setStored(false);
    fieldText.setStoreTermVectorOffsets(true);
    fieldText.setStoreTermVectorPositions(true);
    fieldText.setStoreTermVectors(true);

    // Iterate over and index each page of the file, storing the page number and the
    // file title and indexing the page content (PDFTextStripper pages are 1-based)
    for (int i = 1; i <= numPages; i++) {
      textStripper.setStartPage(i);
      textStripper.setEndPage(i);
      // extract a single page
      pageContent = textStripper.getText(pddDocument);
      if (pageContent != null && !pageContent.isEmpty()) {
        pageContent = pageContent.toLowerCase();
      }

      if (pageContent != null) {
        // Build the Lucene document to index for this page
        // (the field names below are illustrative; the original snippet left them unspecified)
        Document doc = new Document();

        // Page number
        doc.add(new StoredField("page", i));
        // Page content
        doc.add(new Field("content", pageContent, fieldText));
        // File title
        doc.add(new StoredField("title", f.getName()));

        // Add the document
        writer.addDocument(doc);
      }
    }

    // Close the PDF file
    pddDocument.close();
  }
  public void index(Item item) throws IOException {
    String id = item.getId();
    String text = item.getText();

    long publicationTIme = item.getPublicationTime();

    Document document = new Document();

    Field idField = new StringField("id", id, Store.YES);
    document.add(idField);

    FieldType fieldType = new FieldType();
    fieldType.setStored(true);
    fieldType.setIndexed(true);
    fieldType.setStoreTermVectors(true);
    document.add(new Field("text", text, fieldType));

    document.add(new LongField("publicationTIme", publicationTIme, LongField.TYPE_STORED));
    if (iwriter != null) {
      iwriter.addDocument(document);
    }
  }
Example #22
  /**
   * Construct Indexer
   *
   * @param directory the main BlackLab index directory
   * @param create if true, creates a new index; otherwise, appends to existing index
   * @param docIndexerClass how to index the files, or null to autodetect
   * @param indexTemplateFile JSON file to use as template for index structure / metadata (if
   *     creating new index)
   * @throws DocumentFormatException if no DocIndexer was specified and autodetection failed
   * @throws IOException
   */
  public Indexer(
      File directory,
      boolean create,
      Class<? extends DocIndexer> docIndexerClass,
      File indexTemplateFile)
      throws DocumentFormatException, IOException {
    this.docIndexerClass = docIndexerClass;

    searcher = Searcher.openForWriting(directory, create, indexTemplateFile);
    if (!create) searcher.getIndexStructure().setModified();

    if (this.docIndexerClass == null) {
      // No DocIndexer supplied; try to detect it from the index
      // metadata.
      String formatId = searcher.getIndexStructure().getDocumentFormat();
      if (formatId != null && formatId.length() > 0)
        setDocIndexer(DocumentFormats.getIndexerClass(formatId));
      else {
        throw new DocumentFormatException("Cannot detect document format for index!");
      }
    }

    metadataFieldTypeTokenized = new FieldType();
    metadataFieldTypeTokenized.setStored(true);
    metadataFieldTypeTokenized.setIndexed(true);
    metadataFieldTypeTokenized.setTokenized(true);
    metadataFieldTypeTokenized.setOmitNorms(true); // @@@ <-- depending on setting?
    metadataFieldTypeTokenized.setStoreTermVectors(true);
    metadataFieldTypeTokenized.setStoreTermVectorPositions(true);
    metadataFieldTypeTokenized.setStoreTermVectorOffsets(true);
    metadataFieldTypeTokenized.freeze();

    metadataFieldTypeUntokenized = new FieldType(metadataFieldTypeTokenized);
    metadataFieldTypeUntokenized.setTokenized(false);
    metadataFieldTypeUntokenized.freeze();
  }
  private boolean isAHit(Query q, String content, Analyzer analyzer) throws IOException {
    Directory ramDir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), ramDir, analyzer);
    Document doc = new Document();
    FieldType fieldType = new FieldType();
    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    fieldType.setTokenized(true);
    fieldType.setStored(true);
    Field field = new Field(FIELD, content, fieldType);
    doc.add(field);
    writer.addDocument(doc);
    writer.close();
    DirectoryReader ir = DirectoryReader.open(ramDir);
    IndexSearcher is = new IndexSearcher(ir);

    int hits = is.search(q, 10).totalHits;
    ir.close();
    ramDir.close();
    if (hits == 1) {
      return true;
    } else {
      return false;
    }
  }
  // Tests some very basic usages...
  public void testBasic() throws Exception {

    final String groupField = "author";

    FieldType customType = new FieldType();
    customType.setStored(true);

    Directory dir = newDirectory();
    RandomIndexWriter w =
        new RandomIndexWriter(
            random(),
            dir,
            newIndexWriterConfig(new MockAnalyzer(random())).setMergePolicy(newLogMergePolicy()));
    boolean canUseIDV = true;
    List<Document> documents = new ArrayList<>();
    // 0
    Document doc = new Document();
    addGroupField(doc, groupField, "author1", canUseIDV);
    doc.add(new TextField("content", "random text", Field.Store.YES));
    doc.add(new Field("id", "1", customType));
    documents.add(doc);

    // 1
    doc = new Document();
    addGroupField(doc, groupField, "author1", canUseIDV);
    doc.add(new TextField("content", "some more random text", Field.Store.YES));
    doc.add(new Field("id", "2", customType));
    documents.add(doc);

    // 2
    doc = new Document();
    addGroupField(doc, groupField, "author1", canUseIDV);
    doc.add(new TextField("content", "some more random textual data", Field.Store.YES));
    doc.add(new Field("id", "3", customType));
    doc.add(new StringField("groupend", "x", Field.Store.NO));
    documents.add(doc);
    w.addDocuments(documents);
    documents.clear();

    // 3
    doc = new Document();
    addGroupField(doc, groupField, "author2", canUseIDV);
    doc.add(new TextField("content", "some random text", Field.Store.YES));
    doc.add(new Field("id", "4", customType));
    doc.add(new StringField("groupend", "x", Field.Store.NO));
    w.addDocument(doc);

    // 4
    doc = new Document();
    addGroupField(doc, groupField, "author3", canUseIDV);
    doc.add(new TextField("content", "some more random text", Field.Store.YES));
    doc.add(new Field("id", "5", customType));
    documents.add(doc);

    // 5
    doc = new Document();
    addGroupField(doc, groupField, "author3", canUseIDV);
    doc.add(new TextField("content", "random", Field.Store.YES));
    doc.add(new Field("id", "6", customType));
    doc.add(new StringField("groupend", "x", Field.Store.NO));
    documents.add(doc);
    w.addDocuments(documents);
    documents.clear();

    // 6 -- no author field
    doc = new Document();
    doc.add(new TextField("content", "random word stuck in alot of other text", Field.Store.YES));
    doc.add(new Field("id", "6", customType));
    doc.add(new StringField("groupend", "x", Field.Store.NO));

    w.addDocument(doc);

    IndexSearcher indexSearcher = newSearcher(w.getReader());
    w.close();

    Sort groupSort = Sort.RELEVANCE;
    GroupingSearch groupingSearch = createRandomGroupingSearch(groupField, groupSort, 5, canUseIDV);

    TopGroups<?> groups =
        groupingSearch.search(indexSearcher, new TermQuery(new Term("content", "random")), 0, 10);

    assertEquals(7, groups.totalHitCount);
    assertEquals(7, groups.totalGroupedHitCount);
    assertEquals(4, groups.groups.length);

    // relevance order: 5, 0, 3, 4, 1, 2, 6

    // the later a document is added the higher this docId
    // value
    GroupDocs<?> group = groups.groups[0];
    compareGroupValue("author3", group);
    assertEquals(2, group.scoreDocs.length);
    assertEquals(5, group.scoreDocs[0].doc);
    assertEquals(4, group.scoreDocs[1].doc);
    assertTrue(group.scoreDocs[0].score > group.scoreDocs[1].score);

    group = groups.groups[1];
    compareGroupValue("author1", group);
    assertEquals(3, group.scoreDocs.length);
    assertEquals(0, group.scoreDocs[0].doc);
    assertEquals(1, group.scoreDocs[1].doc);
    assertEquals(2, group.scoreDocs[2].doc);
    assertTrue(group.scoreDocs[0].score > group.scoreDocs[1].score);
    assertTrue(group.scoreDocs[1].score > group.scoreDocs[2].score);

    group = groups.groups[2];
    compareGroupValue("author2", group);
    assertEquals(1, group.scoreDocs.length);
    assertEquals(3, group.scoreDocs[0].doc);

    group = groups.groups[3];
    compareGroupValue(null, group);
    assertEquals(1, group.scoreDocs.length);
    assertEquals(6, group.scoreDocs[0].doc);

    Filter lastDocInBlock = new QueryWrapperFilter(new TermQuery(new Term("groupend", "x")));
    groupingSearch = new GroupingSearch(lastDocInBlock);
    groups =
        groupingSearch.search(indexSearcher, new TermQuery(new Term("content", "random")), 0, 10);

    assertEquals(7, groups.totalHitCount);
    assertEquals(7, groups.totalGroupedHitCount);
    assertEquals(4, groups.totalGroupCount.longValue());
    assertEquals(4, groups.groups.length);

    indexSearcher.getIndexReader().close();
    dir.close();
  }
Example #25
 static {
   FIELD_TYPE.setIndexOptions(IndexOptions.NONE);
   FIELD_TYPE.setStored(false);
   FIELD_TYPE.setOmitNorms(true);
   FIELD_TYPE.freeze();
 }
  public void testTotalGroupCount() throws Exception {

    final String groupField = "author";
    FieldType customType = new FieldType();
    customType.setStored(true);

    Directory dir = newDirectory();
    RandomIndexWriter w =
        new RandomIndexWriter(
            random(),
            dir,
            newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))
                .setMergePolicy(newLogMergePolicy()));
    boolean canUseIDV = !"Lucene3x".equals(w.w.getConfig().getCodec().getName());

    // 0
    Document doc = new Document();
    addGroupField(doc, groupField, "author1", canUseIDV);
    doc.add(new Field("content", "random text", TextField.TYPE_STORED));
    doc.add(new Field("id", "1", customType));
    w.addDocument(doc);

    // 1
    doc = new Document();
    addGroupField(doc, groupField, "author1", canUseIDV);
    doc.add(new Field("content", "some more random text blob", TextField.TYPE_STORED));
    doc.add(new Field("id", "2", customType));
    w.addDocument(doc);

    // 2
    doc = new Document();
    addGroupField(doc, groupField, "author1", canUseIDV);
    doc.add(new Field("content", "some more random textual data", TextField.TYPE_STORED));
    doc.add(new Field("id", "3", customType));
    w.addDocument(doc);
    w.commit(); // To ensure a second segment

    // 3
    doc = new Document();
    addGroupField(doc, groupField, "author2", canUseIDV);
    doc.add(new Field("content", "some random text", TextField.TYPE_STORED));
    doc.add(new Field("id", "4", customType));
    w.addDocument(doc);

    // 4
    doc = new Document();
    addGroupField(doc, groupField, "author3", canUseIDV);
    doc.add(new Field("content", "some more random text", TextField.TYPE_STORED));
    doc.add(new Field("id", "5", customType));
    w.addDocument(doc);

    // 5
    doc = new Document();
    addGroupField(doc, groupField, "author3", canUseIDV);
    doc.add(new Field("content", "random blob", TextField.TYPE_STORED));
    doc.add(new Field("id", "6", customType));
    w.addDocument(doc);

    // 6 -- no author field
    doc = new Document();
    doc.add(new Field("content", "random word stuck in alot of other text", TextField.TYPE_STORED));
    doc.add(new Field("id", "6", customType));
    w.addDocument(doc);

    IndexSearcher indexSearcher = new IndexSearcher(w.getReader());
    w.close();

    AbstractAllGroupsCollector<?> allGroupsCollector = createRandomCollector(groupField, canUseIDV);
    indexSearcher.search(new TermQuery(new Term("content", "random")), allGroupsCollector);
    assertEquals(4, allGroupsCollector.getGroupCount());

    allGroupsCollector = createRandomCollector(groupField, canUseIDV);
    indexSearcher.search(new TermQuery(new Term("content", "some")), allGroupsCollector);
    assertEquals(3, allGroupsCollector.getGroupCount());

    allGroupsCollector = createRandomCollector(groupField, canUseIDV);
    indexSearcher.search(new TermQuery(new Term("content", "blob")), allGroupsCollector);
    assertEquals(2, allGroupsCollector.getGroupCount());

    indexSearcher.getIndexReader().close();
    dir.close();
  }
Example #27
  /**
   * Method to create the Lucene index. Keep in mind to always index the text value in Lucene when
   * calculating cosine similarity: you have to generate the tokens, terms and their frequencies
   * and store them in the Lucene index.
   *
   * @throws CorruptIndexException
   * @throws LockObtainFailedException
   * @throws IOException
   */
  public void index() throws CorruptIndexException, LockObtainFailedException, IOException {

    System.out.println(">>> Process source directory  : " + sourceDirectory.getAbsolutePath());

    System.out.println(">>> Create index in directory : " + indexDirectory.getAbsolutePath());

    Directory dir = FSDirectory.open(indexDirectory);

    Analyzer analyzer = new StandardAnalyzer(StandardAnalyzer.STOP_WORDS_SET); // using stop words
    // Analyzer analyzer = new StandardAnalyzer(CharArraySet.EMPTY_SET);  // using stop words
    // System.out.println(">>> Overwrite the Analyser: DO NOT USE STOPWORDS FILTER !!!");

    /**
     * Source: http://toolongdidntread.com/lucene/using-lucene-to-generate-n-grams/
     *
     * <p>1) Run a document through an Analyzer which filters out the stuff we don’t care about.
     * SimpleAnalyzer, in this case, applies a lower case filter and a letter tokenizer, which makes
     * all text lowercase and divides text at non-letters, respectively.
     *
     * <p>2) Wrap this analyzer with ShingleAnalyzerWrapper which constructs shingles (token
     * n-grams) from a stream. This is the main thing we want to accomplish.
     *
     * <p>3) We generate a TokenStream which enumerates (a fancy word for establishes) “fields” from
     * a “document” (what I mentioned earlier).
     *
     * <p>4) Given a token stream, we want to extract certain things from it, like just the
     * characters and not all the other stuff that comes along with the stream. We’ll use
     * CharTermAttribute, which extracts just the words from the stream.
     *
     * <p>5) Finally, we iterate over the stream by incrementing the tokens, extracting each
     * CharTermAttribute from the tokens.
     *
     * <p>A standalone sketch of these five steps appears after this method.
     */
    if (useNgramsAsTerms) {
      analyzer = getNGramAnalyser(analyzer, N);
    }

    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_10_2, analyzer);

    if (indexDirectory.exists()) {
      // Add new documents to an existing index:
      iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    } else {
      // Create a new index in the directory:
      iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    }

    IndexWriter writer = new IndexWriter(dir, iwc);

    for (File f : sourceDirectory.listFiles()) {

      System.out.println("> DOC  :  " + f.getAbsolutePath());

      if (f.isDirectory()) {

        System.out.println(">>> Indexer processes FILE : " + f.getAbsolutePath());

        for (File fileTXT : f.listFiles()) {

          String at = getAllText(fileTXT);

          System.out.println("> file  :  " + fileTXT.getAbsolutePath());
          Document doc = new Document();

          FieldType fieldType = new FieldType();
          fieldType.setIndexed(true);
          fieldType.setIndexOptions(
              FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
          fieldType.setStored(true);
          fieldType.setStoreTermVectors(true);
          fieldType.setTokenized(true);
          Field contentField = new Field(fieldName, at, fieldType);
          doc.add(contentField);

          FieldType fieldType2 = new FieldType();
          fieldType2.setIndexed(true);
          fieldType2.setStored(true);
          fieldType2.setStoreTermVectors(false);
          fieldType2.setTokenized(false);
          Field idField = new Field("id", fileTXT.getAbsolutePath(), fieldType2);
          doc.add(idField);

          FieldType fieldType3 = new FieldType();
          fieldType3.setIndexed(false);
          fieldType3.setStored(true);
          fieldType3.setStoreTermVectors(false);
          fieldType3.setTokenized(false);
          Field rawField = new Field("raw", at, fieldType3);
          doc.add(rawField);

          writer.addDocument(doc);
        }

      } else {

        if (!f.getName().startsWith(".DS_Store")) {

          String at = getAllText(f);

          Document doc = new Document();
          FieldType fieldType = new FieldType();
          fieldType.setIndexed(true);
          fieldType.setIndexOptions(
              FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
          fieldType.setStored(true);
          fieldType.setStoreTermVectors(true);
          fieldType.setTokenized(true);
          Field contentField = new Field(fieldName, at, fieldType);
          doc.add(contentField);

          FieldType fieldType2 = new FieldType();
          fieldType2.setIndexed(true);
          fieldType2.setStored(true);
          fieldType2.setStoreTermVectors(false);
          fieldType2.setTokenized(false);
          Field idField = new Field("id", f.getAbsolutePath(), fieldType2);
          doc.add(idField);

          FieldType fieldType3 = new FieldType();
          fieldType3.setIndexed(false);
          fieldType3.setStored(true);
          fieldType3.setStoreTermVectors(false);
          fieldType3.setTokenized(false);
          Field rawField = new Field("raw", at, fieldType3);
          doc.add(rawField);

          writer.addDocument(doc);
        }
      }
    }
    writer.close();
  }
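The five numbered steps in the comment inside index() can also be exercised on their own. The sketch below is illustrative rather than the project's getNGramAnalyser() implementation: the analyzer choice, the shingle size of 2 and the sample text are assumptions. It wraps a base analyzer in ShingleAnalyzerWrapper and iterates the resulting TokenStream through CharTermAttribute (assuming Lucene 4.10+, where the no-argument analyzer constructors exist).

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ShingleDemo {
  public static void main(String[] args) throws IOException {
    // 1) + 2) lower-case letter tokenization, wrapped so the stream also emits token n-grams (shingles)
    Analyzer base = new SimpleAnalyzer();
    Analyzer shingles = new ShingleAnalyzerWrapper(base, 2); // unigrams and bigrams

    // 3) obtain a TokenStream for a "field" of a "document"
    try (TokenStream ts = shingles.tokenStream("contents", "using lucene to generate n-grams")) {
      // 4) ask the stream for just the term text
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      // 5) iterate the stream token by token
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(term.toString());
      }
      ts.end();
    }
  }
}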
  @Nightly
  public void test() throws Exception {
    MockDirectoryWrapper dir =
        new MockDirectoryWrapper(random(), new MMapDirectory(createTempDir("4GBStoredFields")));
    dir.setThrottling(MockDirectoryWrapper.Throttling.NEVER);

    IndexWriter w =
        new IndexWriter(
            dir,
            new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))
                .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
                .setRAMBufferSizeMB(256.0)
                .setMergeScheduler(new ConcurrentMergeScheduler())
                .setMergePolicy(newLogMergePolicy(false, 10))
                .setOpenMode(IndexWriterConfig.OpenMode.CREATE));

    MergePolicy mp = w.getConfig().getMergePolicy();
    if (mp instanceof LogByteSizeMergePolicy) {
      // 1 petabyte:
      ((LogByteSizeMergePolicy) mp).setMaxMergeMB(1024 * 1024 * 1024);
    }

    final Document doc = new Document();
    final FieldType ft = new FieldType();
    ft.setIndexed(false);
    ft.setStored(true);
    ft.freeze();
    final int valueLength = RandomInts.randomIntBetween(random(), 1 << 13, 1 << 20);
    final byte[] value = new byte[valueLength];
    for (int i = 0; i < valueLength; ++i) {
      // random so that even compressing codecs can't compress it
      value[i] = (byte) random().nextInt(256);
    }
    final Field f = new Field("fld", value, ft);
    doc.add(f);

    final int numDocs = (int) ((1L << 32) / valueLength + 100);
    for (int i = 0; i < numDocs; ++i) {
      w.addDocument(doc);
      if (VERBOSE && i % (numDocs / 10) == 0) {
        System.out.println(i + " of " + numDocs + "...");
      }
    }
    w.forceMerge(1);
    w.close();
    if (VERBOSE) {
      boolean found = false;
      for (String file : dir.listAll()) {
        if (file.endsWith(".fdt")) {
          final long fileLength = dir.fileLength(file);
          if (fileLength >= 1L << 32) {
            found = true;
          }
          System.out.println("File length of " + file + " : " + fileLength);
        }
      }
      if (!found) {
        System.out.println("No .fdt file larger than 4GB, test bug?");
      }
    }

    DirectoryReader rd = DirectoryReader.open(dir);
    Document sd = rd.document(numDocs - 1);
    assertNotNull(sd);
    assertEquals(1, sd.getFields().size());
    BytesRef valueRef = sd.getBinaryValue("fld");
    assertNotNull(valueRef);
    assertEquals(new BytesRef(value), valueRef);
    rd.close();

    dir.close();
  }
  @BeforeClass
  public static void beforeClass() throws Exception {
    noDocs = atLeast(4096);
    distance = (1L << 60) / noDocs;
    directory = newDirectory();
    RandomIndexWriter writer =
        new RandomIndexWriter(
            random(),
            directory,
            newIndexWriterConfig(new MockAnalyzer(random()))
                .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000))
                .setMergePolicy(newLogMergePolicy()));

    final FieldType storedLong = new FieldType(LegacyLongField.TYPE_NOT_STORED);
    storedLong.setStored(true);
    storedLong.freeze();

    final FieldType storedLong8 = new FieldType(storedLong);
    storedLong8.setNumericPrecisionStep(8);

    final FieldType storedLong4 = new FieldType(storedLong);
    storedLong4.setNumericPrecisionStep(4);

    final FieldType storedLong6 = new FieldType(storedLong);
    storedLong6.setNumericPrecisionStep(6);

    final FieldType storedLong2 = new FieldType(storedLong);
    storedLong2.setNumericPrecisionStep(2);

    LegacyLongField field8 = new LegacyLongField("field8", 0L, storedLong8),
        field6 = new LegacyLongField("field6", 0L, storedLong6),
        field4 = new LegacyLongField("field4", 0L, storedLong4),
        field2 = new LegacyLongField("field2", 0L, storedLong2);

    Document doc = new Document();
    // add fields, that have a distance to test general functionality
    doc.add(field8);
    doc.add(field6);
    doc.add(field4);
    doc.add(field2);

    // Add a series of noDocs docs with increasing long values, by updating the fields
    for (int l = 0; l < noDocs; l++) {
      long val = distance * l + startOffset;
      field8.setLongValue(val);
      field6.setLongValue(val);
      field4.setLongValue(val);
      field2.setLongValue(val);

      val = l - (noDocs / 2);
      writer.addDocument(doc);
    }
    Map<String, Type> map = new HashMap<>();
    map.put("field2", Type.LEGACY_LONG);
    map.put("field4", Type.LEGACY_LONG);
    map.put("field6", Type.LEGACY_LONG);
    map.put("field8", Type.LEGACY_LONG);
    reader = UninvertingReader.wrap(writer.getReader(), map);
    searcher = newSearcher(reader);
    writer.close();
  }
  @BeforeClass
  public static void beforeClass() throws Exception {
    noDocs = atLeast(4096);
    distance = (1L << 60) / noDocs;
    directory = newDirectory();
    RandomIndexWriter writer =
        new RandomIndexWriter(
            random(),
            directory,
            newIndexWriterConfig(new MockAnalyzer(random()))
                .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000))
                .setMergePolicy(newLogMergePolicy()));

    final FieldType storedLong = new FieldType(LegacyLongField.TYPE_NOT_STORED);
    storedLong.setStored(true);
    storedLong.freeze();

    final FieldType storedLong8 = new FieldType(storedLong);
    storedLong8.setNumericPrecisionStep(8);

    final FieldType storedLong4 = new FieldType(storedLong);
    storedLong4.setNumericPrecisionStep(4);

    final FieldType storedLong6 = new FieldType(storedLong);
    storedLong6.setNumericPrecisionStep(6);

    final FieldType storedLong2 = new FieldType(storedLong);
    storedLong2.setNumericPrecisionStep(2);

    final FieldType storedLongNone = new FieldType(storedLong);
    storedLongNone.setNumericPrecisionStep(Integer.MAX_VALUE);

    final FieldType unstoredLong = LegacyLongField.TYPE_NOT_STORED;

    final FieldType unstoredLong8 = new FieldType(unstoredLong);
    unstoredLong8.setNumericPrecisionStep(8);

    final FieldType unstoredLong6 = new FieldType(unstoredLong);
    unstoredLong6.setNumericPrecisionStep(6);

    final FieldType unstoredLong4 = new FieldType(unstoredLong);
    unstoredLong4.setNumericPrecisionStep(4);

    final FieldType unstoredLong2 = new FieldType(unstoredLong);
    unstoredLong2.setNumericPrecisionStep(2);

    LegacyLongField field8 = new LegacyLongField("field8", 0L, storedLong8),
        field6 = new LegacyLongField("field6", 0L, storedLong6),
        field4 = new LegacyLongField("field4", 0L, storedLong4),
        field2 = new LegacyLongField("field2", 0L, storedLong2),
        fieldNoTrie = new LegacyLongField("field" + Integer.MAX_VALUE, 0L, storedLongNone),
        ascfield8 = new LegacyLongField("ascfield8", 0L, unstoredLong8),
        ascfield6 = new LegacyLongField("ascfield6", 0L, unstoredLong6),
        ascfield4 = new LegacyLongField("ascfield4", 0L, unstoredLong4),
        ascfield2 = new LegacyLongField("ascfield2", 0L, unstoredLong2);

    Document doc = new Document();
    // add fields, that have a distance to test general functionality
    doc.add(field8);
    doc.add(field6);
    doc.add(field4);
    doc.add(field2);
    doc.add(fieldNoTrie);
    // add ascending fields with a distance of 1, beginning at -noDocs/2 to test the correct
    // splitting of range and inclusive/exclusive
    doc.add(ascfield8);
    doc.add(ascfield6);
    doc.add(ascfield4);
    doc.add(ascfield2);

    // Add a series of noDocs docs with increasing long values, by updating the fields
    for (int l = 0; l < noDocs; l++) {
      long val = distance * l + startOffset;
      field8.setLongValue(val);
      field6.setLongValue(val);
      field4.setLongValue(val);
      field2.setLongValue(val);
      fieldNoTrie.setLongValue(val);

      val = l - (noDocs / 2);
      ascfield8.setLongValue(val);
      ascfield6.setLongValue(val);
      ascfield4.setLongValue(val);
      ascfield2.setLongValue(val);
      writer.addDocument(doc);
    }
    reader = writer.getReader();
    searcher = newSearcher(reader);
    writer.close();
  }