/**
   * Note: if you are using a counting {@link Facets} implementation, you can amortize the
   * sampled counts by calling this method. It uses the {@link FacetsConfig} and the
   * {@link IndexSearcher} to compute an upper bound for each facet value count.
   */
  public FacetResult amortizeFacetCounts(
      FacetResult res, FacetsConfig config, IndexSearcher searcher) throws IOException {
    if (res == null || totalHits <= sampleSize) {
      return res;
    }

    LabelAndValue[] fixedLabelValues = new LabelAndValue[res.labelValues.length];
    IndexReader reader = searcher.getIndexReader();
    DimConfig dimConfig = config.getDimConfig(res.dim);

    // +2 to prepend dimension, append child label
    String[] childPath = new String[res.path.length + 2];
    childPath[0] = res.dim;

    System.arraycopy(res.path, 0, childPath, 1, res.path.length); // reuse childPath across iterations; only the last slot changes

    for (int i = 0; i < res.labelValues.length; i++) {
      childPath[res.path.length + 1] = res.labelValues[i].label;
      String fullPath = FacetsConfig.pathToString(childPath, childPath.length);
      int max = reader.docFreq(new Term(dimConfig.indexFieldName, fullPath));
      int correctedCount = (int) (res.labelValues[i].value.doubleValue() / samplingRate);
      correctedCount = Math.min(max, correctedCount);
      fixedLabelValues[i] = new LabelAndValue(res.labelValues[i].label, correctedCount);
    }

    // cap the total count on the total number of non-deleted documents in the reader
    int correctedTotalCount = res.value.intValue();
    if (correctedTotalCount > 0) {
      correctedTotalCount =
          Math.min(reader.numDocs(), (int) (res.value.doubleValue() / samplingRate));
    }

    return new FacetResult(
        res.dim, res.path, correctedTotalCount, fixedLabelValues, res.childCount);
  }
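For context, this helper matches Lucene's RandomSamplingFacetsCollector.amortizeFacetCounts. A minimal usage sketch, assuming an open searcher, taxoReader, and FacetsConfig named config (the "Author" dimension is illustrative); note that getTopChildren may return null for an unseen dimension, which the method above handles:

  RandomSamplingFacetsCollector fc = new RandomSamplingFacetsCollector(10_000); // sample at most 10k hits
  searcher.search(new MatchAllDocsQuery(), fc);
  Facets facets = new FastTaxonomyFacetCounts(taxoReader, config, fc);
  FacetResult sampled = facets.getTopChildren(10, "Author");
  FacetResult corrected = fc.amortizeFacetCounts(sampled, config, searcher);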
  public void testMoreThan32ProhibitedClauses() throws Exception {
    final Directory d = newDirectory();
    final RandomIndexWriter w = new RandomIndexWriter(random(), d);
    Document doc = new Document();
    doc.add(
        new TextField(
            "field",
            "0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33",
            Field.Store.NO));
    w.addDocument(doc);
    doc = new Document();
    doc.add(new TextField("field", "33", Field.Store.NO));
    w.addDocument(doc);
    final IndexReader r = w.getReader();
    w.close();
    final IndexSearcher s = newSearcher(r);

    final BooleanQuery q = new BooleanQuery();
    for (int term = 0; term < 33; term++) {
      q.add(
          new BooleanClause(
              new TermQuery(new Term("field", "" + term)), BooleanClause.Occur.MUST_NOT));
    }
    q.add(new BooleanClause(new TermQuery(new Term("field", "33")), BooleanClause.Occur.SHOULD));

    final int[] count = new int[1];
    s.search(
        q,
        new Collector() {
          private Scorer scorer;

          @Override
          public void setScorer(Scorer scorer) {
            // Make sure we got BooleanScorer:
            this.scorer = scorer;
            assertEquals(
                "Scorer is implemented by wrong class",
                BooleanScorer.class.getName() + "$BucketScorer",
                scorer.getClass().getName());
          }

          @Override
          public void collect(int doc) {
            count[0]++;
          }

          @Override
          public void setNextReader(AtomicReaderContext context) {}

          @Override
          public boolean acceptsDocsOutOfOrder() {
            return true;
          }
        });

    assertEquals(1, count[0]);

    r.close();
    d.close();
  }
Example #3
  private void dumpDocuments() throws IOException {
    outputBanner("Documents");

    int totalDocs = mIndexReader.numDocs();

    outputLn();
    outputLn("There are " + totalDocs + " documents in this index.");

    mConsole.debug("Total number of documents: " + totalDocs);
    for (int i = 0; i < totalDocs; i++) {
      Document doc = null;
      try {
        doc = mIndexReader.document(i, null);
      } catch (IllegalArgumentException e) {
        if ("attempt to access a deleted document".equals(e.getMessage())) {
          mConsole.warn(
              "encountered exception while dumping document " + i + ": " + e.getMessage());
        } else {
          throw e;
        }
      }
      // doc stays null for a deleted document; dumpDocument is expected to handle that
      dumpDocument(i, doc);

      if ((i + 1) % 100 == 0) {
        mConsole.debug("Dumped " + (i + 1) + " documents");
      }
    }
  }
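The try/catch above works around the Lucene 3.x behavior where IndexReader.document(i) throws IllegalArgumentException for a deleted document. On Lucene 4+ the check is explicit via live docs; a minimal sketch of the same loop, assuming a DirectoryReader named reader:

  Bits liveDocs = MultiFields.getLiveDocs(reader);
  for (int i = 0; i < reader.maxDoc(); i++) {
    if (liveDocs != null && !liveDocs.get(i)) {
      continue; // document i has been deleted
    }
    Document doc = reader.document(i);
    // ... dump doc ...
  }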
 private void remove(Class entity, Serializable id) {
   log.trace("remove from Lucene index: " + entity + "#" + id);
   DocumentBuilder builder = workspace.getDocumentBuilder(entity);
   Term term = builder.getTerm(id);
   IndexReader reader = workspace.getIndexReader(entity);
   TermDocs termDocs = null;
   try {
     // TODO is there a faster way?
     // TODO include TermDocs into the workspace?
     termDocs = reader.termDocs(term);
     String entityName = entity.getName();
     while (termDocs.next()) {
       int docIndex = termDocs.doc();
       if (entityName.equals(reader.document(docIndex).get(DocumentBuilder.CLASS_FIELDNAME))) {
         // remove only the one of the right class
         // loop all to remove all the matches (defensive code)
         reader.deleteDocument(docIndex);
       }
     }
   } catch (Exception e) {
     throw new HibernateException("Unable to remove from Lucene index: " + entity + "#" + id, e);
   } finally {
     if (termDocs != null)
       try {
         termDocs.close();
       } catch (IOException e) {
         log.warn("Unable to close termDocs properly", e);
       }
   }
 }
  public void testFarsiRangeFilterCollating(
      Analyzer analyzer, String firstBeg, String firstEnd, String secondBeg, String secondEnd)
      throws Exception {
    Directory dir = newDirectory();
    IndexWriter writer =
        new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
    Document doc = new Document();
    doc.add(new Field("content", "\u0633\u0627\u0628", Field.Store.YES, Field.Index.ANALYZED));
    doc.add(new Field("body", "body", Field.Store.YES, Field.Index.NOT_ANALYZED));
    writer.addDocument(doc);
    writer.close();
    IndexReader reader = IndexReader.open(dir);
    IndexSearcher searcher = new IndexSearcher(reader);
    Query query = new TermQuery(new Term("body", "body"));

    // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
    // orders the U+0698 character before the U+0633 character, so the single
    // index Term below should NOT be returned by a TermRangeFilter with a Farsi
    // Collator (or an Arabic one, for the case where a Farsi Collator is not
    // supported).
    ScoreDoc[] result =
        searcher.search(query, new TermRangeFilter("content", firstBeg, firstEnd, true, true), 1)
            .scoreDocs;
    assertEquals("The index Term should not be included.", 0, result.length);

    result =
        searcher.search(query, new TermRangeFilter("content", secondBeg, secondEnd, true, true), 1)
            .scoreDocs;
    assertEquals("The index Term should be included.", 1, result.length);

    searcher.close();
    reader.close();
    dir.close();
  }
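The ordering the comment relies on can be checked directly with a JDK Collator. A minimal, self-contained sketch (the locale choice mirrors the comment's Farsi/Arabic fallback; a negative result means U+0698 sorts before U+0633, unlike Unicode code-point order):

  import java.text.Collator;
  import java.util.Locale;

  public class FarsiOrderCheck {
    public static void main(String[] args) {
      Collator collator = Collator.getInstance(new Locale("ar")); // or new Locale("fa") where supported
      System.out.println(collator.compare("\u0698", "\u0633"));
    }
  }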
Example #6
  /**
   * Returns the ids of matching sentences from the Lucene index.
   *
   * @param input the input word to search for
   * @param catalogName the catalog (domain) name to search in
   * @param limit how many hits are needed (0 means all)
   */
  public List<String> query(String input, String catalogName, int limit) {

    List<String> res = new ArrayList<String>();
    try {

      catalog c = catalogs.get(catalogName);
      IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(c.indexPath)));
      IndexSearcher searcher = new IndexSearcher(reader);

      QueryParser parser = new QueryParser("contents", analyzer);
      Query query = parser.parse(QueryParser.escape(input));

      int n = limit > 0 ? limit : searcher.count(query);
      if (n == 0) n = 1; // searcher.search requires n >= 1, even when there are no hits
      TopDocs results = searcher.search(query, n);

      int endPos = limit;
      if (limit != 0) endPos = Math.min(results.totalHits, limit); // 1st n hits
      else endPos = results.totalHits; // all hits

      for (int i = 0; i < endPos; i++) {
        int id = results.scoreDocs[i].doc;
        Document doc = searcher.doc(id);
        res.add(doc.get("filename"));
      }
      reader.close();
      return res;

    } catch (ParseException e) {
      log(e.getMessage());
    } catch (IOException e) {
      log(e.getMessage());
    }
    return res;
  }
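A hypothetical call to the helper above (the catalog name and query word are illustrative; the index is assumed to store one "filename" field per sentence):

  List<String> ids = query("economy", "news", 10); // top 10 sentence ids for "economy"
  for (String id : ids) {
    System.out.println(id);
  }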
  public void testSpanNot() throws Exception {
    SpanQuery[] clauses = new SpanQuery[2];
    clauses[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "one"));
    clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "three"));
    SpanQuery spq = new SpanNearQuery(clauses, 5, true);
    SpanNotQuery snq =
        new SpanNotQuery(spq, new SpanTermQuery(new Term(PayloadHelper.FIELD, "two")));

    Directory directory = newDirectory();
    RandomIndexWriter writer =
        new RandomIndexWriter(
            random(),
            directory,
            newIndexWriterConfig(new PayloadAnalyzer()).setSimilarity(similarity));

    Document doc = new Document();
    doc.add(newTextField(PayloadHelper.FIELD, "one two three one four three", Field.Store.YES));
    writer.addDocument(doc);
    IndexReader reader = writer.getReader();
    writer.close();

    checkSpans(MultiSpansWrapper.wrap(reader, snq, SpanWeight.Postings.PAYLOADS), 1, new int[] {2});
    reader.close();
    directory.close();
  }
  public void test() throws Exception {
    BaseDirectoryWrapper d = newDirectory();
    d.setCheckIndexOnClose(false);
    // we nuke files, but verify the reader still works
    RandomIndexWriter w = new RandomIndexWriter(random(), d);
    int numDocs = atLeast(100);
    for (int i = 0; i < numDocs; i++) {
      Document doc = new Document();
      doc.add(newField("foo", "bar", TextField.TYPE_NOT_STORED));
      w.addDocument(doc);
    }

    IndexReader r = w.getReader();
    w.commit();
    w.close();

    for (String fileName : d.listAll()) {
      try {
        d.deleteFile(fileName);
        // may succeed, e.g. if the file is completely read into RAM.
      } catch (IOException ioe) {
        // ignore: this means codec (correctly) is holding
        // the file open
      }
    }

    for (LeafReaderContext cxt : r.leaves()) {
      TestUtil.checkReader(cxt.reader());
    }

    r.close();
    d.close();
  }
  public void testEvilSearcherFactory() throws Exception {
    final Directory dir = newDirectory();
    final RandomIndexWriter w = new RandomIndexWriter(random(), dir);
    w.commit();

    final IndexReader other = DirectoryReader.open(dir);

    final SearcherFactory theEvilOne =
        new SearcherFactory() {
          @Override
          public IndexSearcher newSearcher(IndexReader ignored) {
            return LuceneTestCase.newSearcher(other);
          }
        };

    try {
      new SearcherManager(w.w, false, theEvilOne);
      fail("didn't hit expected exception");
    } catch (IllegalStateException ise) {
      // expected
    }
    w.close();
    other.close();
    dir.close();
  }
Example #10
 public void search01() {
   try {
     IndexReader reader = IndexReader.open(directory);
     IndexSearcher searcher = new IndexSearcher(reader);
     TermQuery query = new TermQuery(new Term("email", "*****@*****.**"));
     TopDocs tds = searcher.search(query, 10);
     for (ScoreDoc sd : tds.scoreDocs) {
       Document doc = searcher.doc(sd.doc);
       System.out.println(
           "("
               + sd.doc
               + "-"
               + doc.getBoost()
               + "-"
               + sd.score
               + ")"
               + doc.get("name")
               + "["
               + doc.get("email")
               + "]-->"
               + doc.get("id")
               + ","
               + doc.get("attach")
               + ","
               + doc.get("date")
               + ","
               + doc.getValues("email")[1]);
     }
     reader.close();
   } catch (CorruptIndexException e) {
     e.printStackTrace();
   } catch (IOException e) {
     e.printStackTrace();
   }
 }
Example #11
  public void doTest(int[] docs) throws Exception {
    Directory dir = makeIndex();
    IndexReader reader = IndexReader.open(dir, true);
    for (int i = 0; i < docs.length; i++) {
      Document d = reader.document(docs[i], SELECTOR);
      d.get(MAGIC_FIELD);

      List<Fieldable> fields = d.getFields();
      for (Iterator<Fieldable> fi = fields.iterator(); fi.hasNext(); ) {
        Fieldable f = null;
        try {
          f = fi.next();
          String fname = f.name();
          String fval = f.stringValue();
          assertNotNull(docs[i] + " FIELD: " + fname, fval);
          String[] vals = fval.split("#");
          if (!dataset.contains(vals[0]) || !dataset.contains(vals[1])) {
            fail("FIELD:" + fname + ",VAL:" + fval);
          }
        } catch (Exception e) {
          throw new Exception(docs[i] + " WTF: " + f.name(), e);
        }
      }
    }
    reader.close();
    dir.close();
  }
Example #12
  public void test() throws IOException {
    assertTrue(dir != null);
    assertTrue(fieldInfos != null);
    IndexReader reader = DirectoryReader.open(dir);
    Document doc = reader.document(0);
    assertTrue(doc != null);
    assertTrue(doc.getField(DocHelper.TEXT_FIELD_1_KEY) != null);

    Field field = (Field) doc.getField(DocHelper.TEXT_FIELD_2_KEY);
    assertTrue(field != null);
    assertTrue(field.fieldType().storeTermVectors());

    assertFalse(field.fieldType().omitNorms());
    assertTrue(field.fieldType().indexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);

    field = (Field) doc.getField(DocHelper.TEXT_FIELD_3_KEY);
    assertTrue(field != null);
    assertFalse(field.fieldType().storeTermVectors());
    assertTrue(field.fieldType().omitNorms());
    assertTrue(field.fieldType().indexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);

    field = (Field) doc.getField(DocHelper.NO_TF_KEY);
    assertTrue(field != null);
    assertFalse(field.fieldType().storeTermVectors());
    assertFalse(field.fieldType().omitNorms());
    assertTrue(field.fieldType().indexOptions() == IndexOptions.DOCS);

    DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor(DocHelper.TEXT_FIELD_3_KEY);
    reader.document(0, visitor);
    final List<IndexableField> fields = visitor.getDocument().getFields();
    assertEquals(1, fields.size());
    assertEquals(DocHelper.TEXT_FIELD_3_KEY, fields.get(0).name());
    reader.close();
  }
Example #13
  // LUCENE-1262
  public void testExceptions() throws Throwable {
    Path indexDir = createTempDir("testfieldswriterexceptions");

    Directory fsDir = newFSDirectory(indexDir);
    FaultyFSDirectory dir = new FaultyFSDirectory(fsDir);
    IndexWriterConfig iwc =
        newIndexWriterConfig(new MockAnalyzer(random())).setOpenMode(OpenMode.CREATE);
    IndexWriter writer = new IndexWriter(dir, iwc);
    for (int i = 0; i < 2; i++) writer.addDocument(testDoc);
    writer.forceMerge(1);
    writer.close();

    IndexReader reader = DirectoryReader.open(dir);
    dir.startFailing();

    boolean exc = false;

    for (int i = 0; i < 2; i++) {
      try {
        reader.document(i);
      } catch (IOException ioe) {
        // expected
        exc = true;
      }
      try {
        reader.document(i);
      } catch (IOException ioe) {
        // expected
        exc = true;
      }
    }
    assertTrue(exc);
    reader.close();
    dir.close();
  }
Example #14
  /*
   * Index all child directories (first-level only) of the parent directory;
   * the indexed data for each is stored under `index` in a directory of the same name.
   */
  private long indexDirectories(String parent, String[] dirs, String index, SetupParameters Pa)
      throws FileHandlerException, IOException {
    long sumDocs = 0;
    // index each directory in parent directory

    for (int i = 0; i < dirs.length; i++) {
      System.out.println("\t-----FOLDER----- :" + dirs[i].toUpperCase());
      String dir_index = index + "/" + dirs[i];
      if ((index.endsWith("\\")) || (index.endsWith("/"))) {
        dir_index = index + dirs[i];
      }
      Directory di = FSDirectory.getDirectory(new File(dir_index), true);
      Pa.setDir(di);
      Pa.setWriter(new IndexWriter(Pa.getDir(), Pa.getAnalyzer(), true));

      //             //get name of directory contains website to index
      //            int begin=dirs[i].lastIndexOf("\\");
      //            if(begin==-1) begin=dirs[i].lastIndexOf("/");
      //            int end=dirs[i].length()-1;
      //            String dir_site=dirs[i].substring(begin, end);
      this.index(dirs[i].toLowerCase(), Pa.getWriter(), new File(parent + "\\" + dirs[i]));

      Pa.getWriter().optimize();
      Pa.getWriter().close();
      IndexReader reader = IndexReader.open(Pa.getDir()); // open is static; calling it through getReader() was misleading
      sumDocs += reader.numDocs();
      reader.close();
    }
    return sumDocs;
  }
 public String getSpecificFreqTermInIndex(
     int KIntopK,
     ArrayList<String> sentQueries,
     int specificFrec,
     boolean allranges,
     boolean versionOld) {
   IndexReader indexReader = null;
   try {
     indexReader = IndexReader.open(indexDirectory);
   } catch (CorruptIndexException e) {
     e.printStackTrace();
   } catch (IOException e) {
     e.printStackTrace();
   }
   String mostFreqTerm = "";
   try {
     mostFreqTerm =
         freqTermsFinderInIndex.SpecificFreqTerms(
             indexDirectory,
             analyzer,
             indexReader,
             KIntopK,
             sentQueries,
             specificFrec,
             allranges,
             versionOld);
     indexReader.close();
   } catch (Exception e) {
     e.printStackTrace();
   }
   return mostFreqTerm;
 }
 public Query percolateQuery(
     String documentType,
     PercolateQuery.QueryStore queryStore,
     BytesReference documentSource,
     IndexSearcher searcher)
     throws IOException {
   IndexReader indexReader = searcher.getIndexReader();
   Query candidateMatchesQuery = createCandidateQuery(indexReader);
   Query verifiedMatchesQuery;
   // We can only skip the MemoryIndex verification when percolating a single document.
   // When the document being percolated contains a nested object field, the MemoryIndex
   // contains multiple documents. In that case the term query that indicates whether
   // MemoryIndex verification can be skipped may incorrectly indicate that non-nested
   // queries would match, while their nested variants would not.
   if (indexReader.maxDoc() == 1) {
     verifiedMatchesQuery =
         new TermQuery(new Term(extractionResultField.name(), EXTRACTION_COMPLETE));
   } else {
     verifiedMatchesQuery = new MatchNoDocsQuery("nested docs, so no verified matches");
   }
   return new PercolateQuery(
       documentType,
       queryStore,
       documentSource,
       candidateMatchesQuery,
       searcher,
       verifiedMatchesQuery);
 }
 @Override
 public void collect(int doc) throws IOException {
   BytesWrap parentId = typeCache.parentIdByDoc(doc);
   if (parentId == null) {
     return;
   }
   for (Tuple<IndexReader, IdReaderTypeCache> tuple : readers) {
     IndexReader indexReader = tuple.v1();
     IdReaderTypeCache idReaderTypeCache = tuple.v2();
      if (idReaderTypeCache == null) {
        // we may not have any doc of this type in this reader
        continue;
      }
     int parentDocId = idReaderTypeCache.docById(parentId);
     if (parentDocId != -1 && !indexReader.isDeleted(parentDocId)) {
        OpenBitSet docIdSet = parentDocs.get(indexReader.getCoreCacheKey());
       if (docIdSet == null) {
         docIdSet = new OpenBitSet(indexReader.maxDoc());
         parentDocs.put(indexReader.getCoreCacheKey(), docIdSet);
       }
       docIdSet.fastSet(parentDocId);
       return;
     }
   }
 }
Example #18
  public void testUpdateSameDoc() throws Exception {
    final Directory dir = newDirectory();

    final LineFileDocs docs = new LineFileDocs(random());
    for (int r = 0; r < 3; r++) {
      final IndexWriter w =
          new IndexWriter(
              dir, newIndexWriterConfig(new MockAnalyzer(random())).setMaxBufferedDocs(2));
      final int numUpdates = atLeast(20);
      int numThreads = TestUtil.nextInt(random(), 2, 6);
      IndexingThread[] threads = new IndexingThread[numThreads];
      for (int i = 0; i < numThreads; i++) {
        threads[i] = new IndexingThread(docs, w, numUpdates);
        threads[i].start();
      }

      for (int i = 0; i < numThreads; i++) {
        threads[i].join();
      }

      w.close();
    }

    IndexReader open = DirectoryReader.open(dir);
    assertEquals(1, open.numDocs());
    open.close();
    docs.close();
    dir.close();
  }
  @Override
  public ScoredDocuments rerank(ScoredDocuments docs, RerankerContext context) {
    IndexReader reader = context.getIndexSearcher().getIndexReader();

    for (int i = 0; i < docs.documents.length; i++) {
      Terms terms = null;
      try {
        terms = reader.getTermVector(docs.ids[i], StatusField.TEXT.name);
      } catch (IOException e) {
        continue;
      }

      String qid = context.getQueryId().replaceFirst("^MB0*", "");
      String docid = docs.documents[i].getField(StatusField.ID.name).stringValue();

      out.print(qrels.getRelevanceGrade(qid, docid));
      out.print(" qid:" + qid);
      out.print(" 1:" + docs.scores[i]);

      float[] features = this.extractorChain.extractAll(docs.documents[i], terms, context);

      for (int j = 0; j < features.length; j++) {
        out.print(" " + (j + 2) + ":" + features[j]);
      }

      out.print(" # docid:" + docid);
      out.print("\n");
    }

    return docs;
  }
  public void testCachingWorks() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
    writer.close();

    IndexReader reader = SlowCompositeReaderWrapper.wrap(DirectoryReader.open(dir));
    AtomicReaderContext context = (AtomicReaderContext) reader.getContext();
    MockFilter filter = new MockFilter();
    CachingWrapperFilter cacher = new CachingWrapperFilter(filter);

    // first time, nested filter is called
    DocIdSet strongRef = cacher.getDocIdSet(context, context.reader().getLiveDocs());
    assertTrue("first time", filter.wasCalled());

    // make sure no exception if cache is holding the wrong docIdSet
    cacher.getDocIdSet(context, context.reader().getLiveDocs());

    // second time, nested filter should not be called
    filter.clear();
    cacher.getDocIdSet(context, context.reader().getLiveDocs());
    assertFalse("second time", filter.wasCalled());

    reader.close();
    dir.close();
  }
Example #21
  private static Map<String, List<String>> generate_result(Directory directory) {
    Map<String, List<String>> result_map = new HashMap<String, List<String>>();

    try {
      IndexReader reader = IndexReader.open(directory);
      TermEnum termEnum = reader.terms();
      while (termEnum.next()) {
        String termEnumString = termEnum.term().toString();
        if (termEnumString.startsWith("content:")) {
          String term = termEnumString.substring(termEnumString.lastIndexOf(":") + 1);
          TermDocs termDocs = reader.termDocs(termEnum.term());
          while (termDocs.next()) {
            Document doc = reader.document(termDocs.doc());
            String relative_path = doc.get("relative_path");

            List<String> terms = result_map.get(relative_path);
            if (terms == null) {
              terms = new ArrayList<String>();
              result_map.put(relative_path, terms);
            }
            // record the term together with its in-document frequency
            terms.add(term + termDocs.freq());
          }
        }
      }
    } catch (IOException e) {
      e.printStackTrace();
    }

    return result_map;
  }
  public void testNullDocIdSetIterator() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
    writer.close();

    IndexReader reader = SlowCompositeReaderWrapper.wrap(DirectoryReader.open(dir));
    AtomicReaderContext context = (AtomicReaderContext) reader.getContext();

    final Filter filter =
        new Filter() {
          @Override
          public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) {
            return new DocIdSet() {
              @Override
              public DocIdSetIterator iterator() {
                return null;
              }
            };
          }
        };
    CachingWrapperFilter cacher = new CachingWrapperFilter(filter);

    // the caching filter should return the empty set constant
    assertNull(cacher.getDocIdSet(context, context.reader().getLiveDocs()));

    reader.close();
    dir.close();
  }
Example #23
  /**
   * Find words for a more-like-this query former.
   *
   * @param docNum the id of the lucene document from which to find terms
   */
  private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException {
    Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>();
    for (String fieldName : fieldNames) {
      final Fields vectors = ir.getTermVectors(docNum);
      final Terms vector;
      if (vectors != null) {
        vector = vectors.terms(fieldName);
      } else {
        vector = null;
      }

      // field does not store term vector info
      if (vector == null) {
        Document d = ir.document(docNum);
        IndexableField[] fields = d.getFields(fieldName);
        for (IndexableField field : fields) {
          final String stringValue = field.stringValue();
          if (stringValue != null) {
            addTermFrequencies(new StringReader(stringValue), field2termFreqMap, fieldName);
          }
        }
      } else {
        addTermFrequencies(field2termFreqMap, vector, fieldName);
      }
    }

    return createQueue(field2termFreqMap);
  }
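Where a term vector is present, the addTermFrequencies(field2termFreqMap, vector, fieldName) call above tallies per-term frequencies from it. A minimal sketch of that walk under Lucene 5+ APIs, assuming vector is the Terms of a single document:

  TermsEnum termsEnum = vector.iterator();
  BytesRef text;
  Map<String, Integer> freqs = new HashMap<>();
  while ((text = termsEnum.next()) != null) {
    // for a one-document term vector, totalTermFreq is the within-document frequency
    freqs.merge(text.utf8ToString(), (int) termsEnum.totalTermFreq(), Integer::sum);
  }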
 private static void assertDocIdSetCacheable(
     IndexReader reader, Filter filter, boolean shouldCacheable) throws IOException {
   assertTrue(reader.getContext() instanceof AtomicReaderContext);
   AtomicReaderContext context = (AtomicReaderContext) reader.getContext();
   final CachingWrapperFilter cacher = new CachingWrapperFilter(filter);
   final DocIdSet originalSet = filter.getDocIdSet(context, context.reader().getLiveDocs());
   final DocIdSet cachedSet = cacher.getDocIdSet(context, context.reader().getLiveDocs());
   if (originalSet == null) {
     assertNull(cachedSet);
   }
   if (cachedSet == null) {
     assertTrue(originalSet == null || originalSet.iterator() == null);
   } else {
     assertTrue(cachedSet.isCacheable());
     assertEquals(shouldCacheable, originalSet.isCacheable());
     // System.out.println("Original: "+originalSet.getClass().getName()+" -- cached:
     // "+cachedSet.getClass().getName());
     if (originalSet.isCacheable()) {
       assertEquals(
           "Cached DocIdSet must be of same class like uncached, if cacheable",
           originalSet.getClass(),
           cachedSet.getClass());
     } else {
       assertTrue(
           "Cached DocIdSet must be an FixedBitSet if the original one was not cacheable",
           cachedSet instanceof FixedBitSet || cachedSet == null);
     }
   }
 }
Example #25
 public static Map<String, Integer> termFrequencies(
     IndexSearcher indexSearcher,
     Query documentFilterQuery,
     String fieldName,
     String propName,
     String altName) {
   try {
     String luceneField = ComplexFieldUtil.propertyField(fieldName, propName, altName);
     Weight weight = indexSearcher.createNormalizedWeight(documentFilterQuery, false);
     Map<String, Integer> freq = new HashMap<>();
     IndexReader indexReader = indexSearcher.getIndexReader();
     for (LeafReaderContext arc : indexReader.leaves()) {
       if (weight == null) throw new RuntimeException("weight == null");
       if (arc == null) throw new RuntimeException("arc == null");
       if (arc.reader() == null) throw new RuntimeException("arc.reader() == null");
       Scorer scorer = weight.scorer(arc, arc.reader().getLiveDocs());
       if (scorer != null) {
         while (scorer.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
           getFrequenciesFromTermVector(
               indexReader, scorer.docID() + arc.docBase, luceneField, freq);
         }
       }
     }
     return freq;
   } catch (IOException e) {
     throw ExUtil.wrapRuntimeException(e);
   }
 }
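A hypothetical call to the helper above (the document filter, field, and property names are illustrative, following BlackLab's ComplexFieldUtil naming scheme):

  Query oneDoc = new TermQuery(new Term("pid", "doc0001")); // select a single document
  Map<String, Integer> freqs = termFrequencies(searcher, oneDoc, "contents", "word", null);
  freqs.forEach((term, n) -> System.out.println(term + "\t" + n));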
Example #26
  /** @return the indexes */
  public List<Index> getIndexes() {
    List<Index> indexes = new ArrayList<Index>();
    // Method[] methods = Index.class.getDeclaredMethods();
    int numDocs = reader.numDocs();
    // System.out.println(numDocs);
    for (int i = 0; i < numDocs; i++) {
      try {
        Document document = reader.document(i);
        List<Fieldable> f = document.getFields();

        Index index = new Index();
        for (Fieldable fieldable : f) {
          Field field = (Field) fieldable;
          Method m =
              Index.class.getDeclaredMethod("set" + field.name(), new Class[] {String.class});
          m.invoke(index, new Object[] {field.stringValue()});
          // Method m2 = Index.class.getDeclaredMethod("get" + field.name(), new Class[]{});
          // Object val = m2.invoke(index, new Object[]{});
          // System.out.println(m2.getName()+" = "+val);
          // System.out.println(m.getName() + " " + field.stringValue());
        }
        // System.out.println("RHAAR-"+i+" = "+index.getRHaarFeature());
        indexes.add(index);
      } catch (Exception e) {
        e.printStackTrace();
      }
    }
    return indexes;
  }
  public void testMethod() throws Exception {
    Directory directory = newDirectory();

    String[] values = new String[] {"1", "2", "3", "4"};

    RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
    for (int i = 0; i < values.length; i++) {
      Document doc = new Document();
      doc.add(newStringField(FIELD, values[i], Field.Store.YES));
      writer.addDocument(doc);
    }
    IndexReader ir = writer.getReader();
    writer.close();

    BooleanQuery booleanQuery1 = new BooleanQuery();
    booleanQuery1.add(new TermQuery(new Term(FIELD, "1")), BooleanClause.Occur.SHOULD);
    booleanQuery1.add(new TermQuery(new Term(FIELD, "2")), BooleanClause.Occur.SHOULD);

    BooleanQuery query = new BooleanQuery();
    query.add(booleanQuery1, BooleanClause.Occur.MUST);
    query.add(new TermQuery(new Term(FIELD, "9")), BooleanClause.Occur.MUST_NOT);

    IndexSearcher indexSearcher = newSearcher(ir);
    ScoreDoc[] hits = indexSearcher.search(query, null, 1000).scoreDocs;
    assertEquals("Number of matched documents", 2, hits.length);
    ir.close();
    directory.close();
  }
  public TermFreqVector searchIndexReturnFreqTerms(String searchString, String termString) {
    System.out.println("Searching for '" + searchString + "'");
    // Directory directory = FSDirectory.getDirectory();
    IndexReader indexReader;
    TermFreqVector termFreqDoc = null;
    try {
      indexReader = IndexReader.open(indexDirectory);
      IndexSearcher indexSearcher = new IndexSearcher(indexReader);
      Term term = new Term(termString, searchString);
      TermQuery query = new TermQuery(term);
      TopDocs topDocs = indexSearcher.search(query, 10);
      if (topDocs.scoreDocs.length > 0) {
        // while(it.hasNext()){
        int docId = topDocs.scoreDocs[0].doc;
        Document doc = indexSearcher.doc(docId);
        //	textOfURL = doc.get("text");
        // sourceCodeOfURL = doc.get("html");
        //	this.docId = docID;
        termFreqDoc = indexReader.getTermFreqVector(docId, "text");
      }
    } catch (CorruptIndexException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    }

    return termFreqDoc;
  }
Example #29
  public static BoboIndexReader getBoboIndexReader(Directory idxDir) throws BrowseException {
    try {
      if (!BoboIndexReader.indexExists(idxDir)) {
        throw new BrowseException("Index does not exist at: " + idxDir);
      }
    } catch (IOException ioe) {
      throw new BrowseException(ioe.getMessage(), ioe);
    }

    IndexReader reader = null;
    try {
      reader = IndexReader.open(idxDir, true);
    } catch (IOException ioe) {
      throw new BrowseException(ioe.getMessage(), ioe);
    }

    BoboIndexReader bReader = null;
    try {
      bReader = BoboIndexReader.getInstance(reader);
    } catch (IOException ioe) {
      if (reader != null) {
        try {
          reader.close();
        } catch (IOException e) {
          logger.error(e.getMessage(), e);
        }
      }
      throw new BrowseException(ioe.getMessage(), ioe);
    }
    return bReader;
  }
  public static void main(String[] args) throws IOException, ParseException {
    String indexDir = "C:/lucenedir";
    Directory directory = FSDirectory.open(Paths.get(indexDir));
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher searcher = new IndexSearcher(reader);

    int day = (int) (new Date().getTime() / Constans.DAY_MILLIS);
    QueryParser parser = new QueryParser("contents", new StandardAnalyzer());
    Query query = parser.parse("java in action");
    Query customScoreQuery =
        new RecencyBoostCustomScoreQuery(query, 2.0, day, 6 * 365, "pubmonthAsDay");
    Sort sort =
        new Sort(
            new SortField[] {
              SortField.FIELD_SCORE, new SortField("title2", SortField.Type.STRING)
            });
    TopDocs hits = searcher.search(customScoreQuery, null, Integer.MAX_VALUE, sort, true, false);

    for (int i = 0; i < hits.scoreDocs.length; i++) {
      // Either way of fetching the Document works; searcher.doc internally
      // delegates to reader.document anyway.
      // Document doc = reader.document(hits.scoreDocs[i].doc);
      Document doc = searcher.doc(hits.scoreDocs[i].doc);
      System.out.println(
          (1 + i)
              + ": "
              + doc.get("title")
              + ": pubmonth="
              + doc.get("pubmonth")
              + " score="
              + hits.scoreDocs[i].score);
    }
    reader.close();
    directory.close();
  }