示例#1
0
  /**
   * Returns the documents whose UID is contained in the given set, sorted by title.
   *
   * <p>The lookup runs a match-all query restricted by a terms filter on the UID field, capped at
   * {@code MAX_RESULTS} hits, while holding the read lock.
   *
   * @param uids UIDs of the documents to look up
   * @return the matching documents, ordered by title using {@code
   *     AlphanumComparator.ignoreCaseInstance}
   * @throws SearchException if an {@link IOException} occurs while searching or loading documents;
   *     an {@link IllegalArgumentException} raised during the search is rethrown through {@code
   *     wrapEmptyIndexException}
   * @throws CheckedOutOfMemoryError if the search runs out of memory
   */
  @ImmutableCopy
  @NotNull
  @ThreadSafe
  public List<ResultDocument> list(@NotNull Set<String> uids)
      throws SearchException, CheckedOutOfMemoryError {
    // Construct a filter that only matches documents with the given UIDs
    TermsFilter uidFilter = new TermsFilter();
    String fieldName = Fields.UID.key();
    for (String uid : uids) {
      uidFilter.addTerm(new Term(fieldName, uid));
    }

    Query query = new MatchAllDocsQuery();

    readLock.lock();
    try {
      checkIndexesExist();

      // Perform search; might throw OutOfMemoryError
      ScoreDoc[] scoreDocs = luceneSearcher.search(query, uidFilter, MAX_RESULTS).scoreDocs;

      // Create result documents
      ResultDocument[] results = new ResultDocument[scoreDocs.length];
      for (int i = 0; i < results.length; i++) {
        int docId = scoreDocs[i].doc;
        Document doc = luceneSearcher.doc(docId);
        float score = scoreDocs[i].score;
        // subSearcher expects a document number, not a position in the result array; passing the
        // loop index would pick the configuration of an unrelated index.
        LuceneIndex index = indexes.get(luceneSearcher.subSearcher(docId));
        IndexingConfig config = index.getConfig();
        results[i] =
            new ResultDocument(doc, score, query, true, config, fileFactory, outlookMailFactory);
      }

      // Sort results by title
      Arrays.sort(
          results,
          new Comparator<ResultDocument>() {
            public int compare(ResultDocument o1, ResultDocument o2) {
              return AlphanumComparator.ignoreCaseInstance.compare(o1.getTitle(), o2.getTitle());
            }
          });

      return Arrays.asList(results);
    } catch (IllegalArgumentException e) {
      throw wrapEmptyIndexException(e);
    } catch (IOException e) {
      throw new SearchException(e.getMessage()); // TODO i18n
    } catch (OutOfMemoryError e) {
      throw new CheckedOutOfMemoryError(e);
    } finally {
      readLock.unlock();
    }
  }
示例#2
0
  /**
   * Restricts this query to elements whose value in the given field has not been seen before.
   *
   * <p>For every distinct value of {@code field}, only the first element of the projected result
   * list is kept; later elements carrying the same value are filtered out.
   *
   * <p><b>Note:</b> For performance reasons this should be the last call made when constructing a
   * query: it runs a projection of the query and installs a filter on the identifiers of the
   * elements to keep.
   *
   * @param field name of the field in which repeated values are skipped
   * @return this query
   */
  public LuceneQuery<T> skipSame(String field) {
    String idProperty =
        getSession().getSessionFactory().getClassMetadata(getType()).getIdentifierPropertyName();

    // Each projected row is an Object[] holding {identifier, field value}.
    List<Object> projectedRows = listProjection(idProperty, field);

    Set<Object> seenValues = new HashSet<Object>();
    TermsFilter keepFilter = new TermsFilter();
    for (Object projectedRow : projectedRows) {
      Object[] columns = (Object[]) projectedRow;
      boolean firstOccurrence = seenValues.add(columns[1]);
      if (!firstOccurrence) {
        continue;
      }
      // Only the first element per field value contributes its identifier to the filter.
      keepFilter.addTerm(new Term(idProperty, columns[0].toString()));
    }

    buildQuery();
    fullTextQuery.setFilter(keepFilter);

    return this;
  }
示例#3
0
    /**
     * Builds the base filter matching the kind of evidence the query asks for.
     *
     * <ul>
     *   <li>places and times: a {@code SerialChainFilter} AND-ing the geo-indexed flag with a
     *       time flag;
     *   <li>places only: a filter on the geo-indexed flag;
     *   <li>times only: a filter on a time flag;
     *   <li>neither: {@code null}.
     * </ul>
     *
     * <p>Every flag field name is the {@code Config} constant followed by {@code suffix}, and the
     * required term value is always {@code "true"}.
     *
     * <p>NOTE(review): when {@code q.isTime_key()} is set, the combined branch filters on {@code
     * S_HAS_TIME_POINTS_KEY} whereas the times-only branch filters on {@code S_HAS_YYYY_KEY} -
     * confirm that this difference is intentional.
     *
     * @param q query processor describing what the query wants
     * @param suffix string appended to each flag field name
     * @return the filter to apply, or {@code null} when neither places nor times are wanted
     */
    protected Filter createBaseFilter(QueryProcessor q, String suffix) {
      boolean placesWanted = q.wantPlaces();
      boolean timesWanted = q.wantTimes();

      if (!placesWanted && !timesWanted) {
        return null;
      }

      if (!timesWanted) {
        // Places only
        TermsFilter geoOnly = new TermsFilter();
        geoOnly.addTerm(new Term(Config.S_GEO_INDEXED + suffix, "true"));
        return geoOnly;
      }

      if (!placesWanted) {
        // Times only (original note on the non-key field: opt for durations)
        String timeField =
            q.isTime_key()
                ? Config.S_HAS_YYYY_KEY + suffix
                : Config.S_HAS_ANY_TIME_POINT + suffix;
        TermsFilter timeOnly = new TermsFilter();
        timeOnly.addTerm(new Term(timeField, "true"));
        return timeOnly;
      }

      // Both places and times (original note on the non-key field: opt for durations)
      String combinedTimeField =
          q.isTime_key()
              ? Config.S_HAS_TIME_POINTS_KEY + suffix
              : Config.S_HAS_ANY_TIME_POINT + suffix;
      TermsFilter timePart = new TermsFilter();
      timePart.addTerm(new Term(combinedTimeField, "true"));

      TermsFilter geoPart = new TermsFilter();
      geoPart.addTerm(new Term(Config.S_GEO_INDEXED + suffix, "true"));

      Filter[] chain = new Filter[] {geoPart, timePart};
      int[] logic = new int[] {SerialChainFilter.AND, SerialChainFilter.AND};
      return new SerialChainFilter(chain, logic);
    }
示例#4
0
  /**
   * For the given query, returns the requested page of results. This method should not be called
   * anymore after {@link #shutdown()} has been called, otherwise an IOException will be thrown.
   *
   * <p>Optional size, parser-type and index-location restrictions from the web query are combined
   * with AND into a single filter. If fewer hits exist than the requested page needs, the last
   * available page is returned instead.
   *
   * @param webQuery the query text together with its optional restrictions and the page index
   * @return the page of result documents along with the page index actually returned, the page
   *     count and the total hit count
   * @throws IOException if a stored I/O failure is pending, or if searching or loading a document
   *     fails
   * @throws SearchException if the query cannot be created; an {@link IllegalArgumentException}
   *     raised during the search is rethrown through {@code wrapEmptyIndexException}
   * @throws CheckedOutOfMemoryError if the search runs out of memory
   */
  @NotNull
  @ThreadSafe
  public ResultPage search(@NotNull WebQuery webQuery)
      throws IOException, SearchException, CheckedOutOfMemoryError {
    Util.checkNotNull(webQuery);

    if (ioException != null) throw ioException;

    List<Filter> filters = new ArrayList<Filter>(3);

    // Add size filter to filter chain
    if (webQuery.minSize != null || webQuery.maxSize != null) {
      filters.add(
          NumericRangeFilter.newLongRange(
              Fields.SIZE.key(), webQuery.minSize, webQuery.maxSize, true, true));
    }

    // Add type filter to filter chain
    if (webQuery.parsers != null) {
      TermsFilter typeFilter = new TermsFilter();
      String fieldName = Fields.PARSER.key();
      // The email parser term is always part of the type filter
      typeFilter.addTerm(new Term(fieldName, Fields.EMAIL_PARSER));
      for (Parser parser : webQuery.parsers) {
        String parserName = parser.getClass().getSimpleName();
        typeFilter.addTerm(new Term(fieldName, parserName));
      }
      filters.add(typeFilter);
    }

    // Add location filter to filter chain
    if (webQuery.indexes != null) {
      Filter[] indexFilters = new Filter[webQuery.indexes.size()];
      int i = 0;
      for (LuceneIndex index : webQuery.indexes) {
        Path path = index.getRootFolder().getPath();
        String uid = index.getDocumentType().createUniqueId(path);
        Term prefix = new Term(Fields.UID.key(), uid + "/");
        indexFilters[i++] = new PrefixFilter(prefix);
      }
      filters.add(new ChainedFilter(indexFilters, ChainedFilter.OR));
    }

    // Construct filter chain
    Filter filter =
        filters.isEmpty()
            ? null
            : new ChainedFilter(filters.toArray(new Filter[filters.size()]), ChainedFilter.AND);

    // Create query
    QueryWrapper queryWrapper = createQuery(webQuery.query);
    Query query = queryWrapper.query;
    boolean isPhraseQuery = queryWrapper.isPhraseQuery;

    readLock.lock();
    try {
      checkIndexesExist();

      // Perform search; might throw OutOfMemoryError
      int maxResults = (webQuery.pageIndex + 1) * PAGE_SIZE;
      TopDocs topDocs = luceneSearcher.search(query, filter, maxResults);
      ScoreDoc[] scoreDocs = topDocs.scoreDocs;

      // Compute start and end indices of returned page: the page is the last (possibly partial)
      // block of PAGE_SIZE hits among the hits that were fetched
      int start;
      int end = scoreDocs.length;
      if (end <= PAGE_SIZE) {
        start = 0;
      } else {
        int r = end % PAGE_SIZE;
        start = end - (r == 0 ? PAGE_SIZE : r);
      }

      // Create and fill list of result documents to return
      ResultDocument[] results = new ResultDocument[end - start];
      for (int i = start; i < end; i++) {
        int docId = scoreDocs[i].doc;
        Document doc = luceneSearcher.doc(docId);
        float score = scoreDocs[i].score;
        // subSearcher expects a document number, not a position in the result array; passing the
        // loop index would pick the configuration of an unrelated index.
        LuceneIndex index = indexes.get(luceneSearcher.subSearcher(docId));
        IndexingConfig config = index.getConfig();
        results[i - start] =
            new ResultDocument(
                doc, score, query, isPhraseQuery, config, fileFactory, outlookMailFactory);
      }

      int hitCount = topDocs.totalHits;
      int newPageIndex = start / PAGE_SIZE;
      // Exact integer ceiling division; going through float loses precision for large hit counts
      int pageCount = hitCount / PAGE_SIZE + (hitCount % PAGE_SIZE == 0 ? 0 : 1);

      return new ResultPage(Arrays.asList(results), newPageIndex, pageCount, hitCount);
    } catch (IllegalArgumentException e) {
      throw wrapEmptyIndexException(e);
    } catch (OutOfMemoryError e) {
      throw new CheckedOutOfMemoryError(e);
    } finally {
      readLock.unlock();
    }
  }
  /**
   * Checks that searches over a combined "Contents"/"Sentences" reader honor filters on the
   * {@code Config.S_HAS_TIMEXES} flag fields, both through a {@code QueryFilter} and through a
   * {@code TermsFilter}.
   *
   * <p>Two indexes are written under {@code path}: a document-level one in which only document 1
   * carries the flag, and a sentence-level one in which only sentence "2_1" carries the
   * sentence-level flag. The same query ({@code sentences:word2}) is then run under each filter.
   * The searcher is closed and both index directories are deleted even when an assertion fails.
   */
  public void testTimeFilter() throws IOException, DocumentException, ParseException {
    // Document-level index: only doc1 is flagged as having timexes
    LgteIndexWriter writer = new LgteIndexWriter(path + "Contents", true);
    LgteDocumentWrapper doc1 = new LgteDocumentWrapper();
    doc1.indexText(Globals.DOCUMENT_ID_FIELD, "1");
    doc1.indexText("contents", "word1 word2 word3");
    doc1.indexStringNoStore(Config.S_HAS_TIMEXES, "true");
    LgteDocumentWrapper doc2 = new LgteDocumentWrapper();
    doc2.indexText(Globals.DOCUMENT_ID_FIELD, "2");
    doc2.indexText("contents", "word2 word3 word4 word55 word96 word2 word54 word33 wordss");
    writer.addDocument(doc1);
    writer.addDocument(doc2);
    writer.close();

    // Sentence-level index: only sentence2 (of document 2) carries the sentence-level flag
    LgteIndexWriter writer2 = new LgteIndexWriter(path + "Sentences", true);
    LgteDocumentWrapper sentence0 = new LgteDocumentWrapper();
    sentence0.indexText(Globals.DOCUMENT_ID_FIELD, "1_0");
    sentence0.indexText("doc_id", "1");
    sentence0.indexText("sentences", "word1 word3");
    LgteDocumentWrapper sentence1 = new LgteDocumentWrapper();
    sentence1.indexText(Globals.DOCUMENT_ID_FIELD, "1_1");
    sentence1.indexText("doc_id", "1");
    sentence1.indexText("sentences", "word1 word2 word3");
    LgteDocumentWrapper sentence2 = new LgteDocumentWrapper();
    sentence2.indexStringNoStore(Config.S_HAS_TIMEXES + "_sentences", "true");
    sentence2.indexText(Globals.DOCUMENT_ID_FIELD, "2_1");
    sentence2.indexText("doc_id", "2");
    sentence2.indexText("sentences", "word2 word3 word4 word55 word96 word2 word54 word33 wordss");
    writer2.addDocument(sentence0);
    writer2.addDocument(sentence1);
    writer2.addDocument(sentence2);
    writer2.close();

    try {
      // Map each field to the reader that serves it
      IndexReader readerContents =
          LgteIndexManager.openReader(path + "Contents", Model.OkapiBM25Model);
      IndexReader readerSentences =
          LgteIndexManager.openReader(path + "Sentences", Model.OkapiBM25Model);
      Map<String, IndexReader> readers = new HashMap<String, IndexReader>();
      readers.put("contents", readerContents);
      readers.put("sentences", readerSentences);
      readers.put(Config.S_HAS_TIMEXES, readerContents);
      readers.put(Config.S_HAS_TIMEXES + "_sentences", readerSentences);
      readers.put("doc_id", readerSentences);
      readers.put("id", readerSentences);
      LgteIsolatedIndexReader lgteIsolatedIndexReader = new LgteIsolatedIndexReader(readers);
      lgteIsolatedIndexReader.addTreeMapping(readerContents, readerSentences, "doc_id");

      LgteIndexSearcherWrapper searcher =
          new LgteIndexSearcherWrapper(Model.OkapiBM25Model, lgteIsolatedIndexReader);
      try {
        QueryConfiguration queryConfiguration = new QueryConfiguration();
        queryConfiguration.setProperty("bm25.idf.policy", "floor_epslon");
        queryConfiguration.setProperty("bm25.idf.epslon", "0.01");
        queryConfiguration.setProperty("bm25.k1", "2.0");
        queryConfiguration.setProperty("bm25.b", "0.75");
        queryConfiguration.setProperty("index.tree", "true");
        QueryFilter queryFilter =
            new QueryFilter(
                org.apache.lucene.queryParser.QueryParser.parse(
                    "true", Config.S_HAS_TIMEXES, new LgteNothingAnalyzer()));
        LgteQuery lgteQuery =
            LgteQueryParser.parseQuery(
                "sentences:word2", new LgteNothingAnalyzer(), searcher, queryConfiguration);

        // Document-level flag applied through a QueryFilter
        LgteHits lgteHits = searcher.search(lgteQuery, queryFilter);
        assertEquals(1, lgteHits.length());
        assertEquals(1, lgteHits.id(0));

        // Same document-level flag applied through a TermsFilter
        TermsFilter termsFilter = new TermsFilter();
        termsFilter.addTerm(new Term(Config.S_HAS_TIMEXES, "true"));
        lgteHits = searcher.search(lgteQuery, termsFilter);
        assertEquals(1, lgteHits.length());
        assertEquals(1, lgteHits.id(0));

        // Sentence-level flag applied through a TermsFilter
        termsFilter = new TermsFilter();
        termsFilter.addTerm(new Term(Config.S_HAS_TIMEXES + "_sentences", "true"));
        lgteHits = searcher.search(lgteQuery, termsFilter);
        assertEquals(1, lgteHits.length());
        assertEquals(2, lgteHits.id(0));
      } finally {
        // Close before deleting the index directories, as the original ordering did
        searcher.close();
      }
    } finally {
      Files.delDirsE(path + "Contents");
      Files.delDirsE(path + "Sentences");
    }
  }