/**
   * Builds the query string that is later handed to Solr to highlight a document.
   *
   * <p>Three cases are handled:
   *
   * <ul>
   *   <li>Literal query: the original query string is returned as is.
   *   <li>Exactly one keyword in the results: that keyword's string form is returned without
   *       any escaping.
   *   <li>Otherwise: every keyword that has at least one hit on {@code content} is
   *       Lucene-escaped, wrapped in double quotes, and the quoted terms are separated by a
   *       space (intended as an OR). Each term after the first is prefixed with
   *       {@code LuceneQuery.HIGHLIGHT_FIELD_REGEX + ":"}; per the original note, the first
   *       term is expected to receive its field prefix from HighlightedMatchesSource.
   * </ul>
   *
   * @param query the keyword search query that produced the results
   * @param literal_query true if the query is a literal (non-regex, non-term-component) query
   * @param queryResults the results of the query, grouped by keyword
   * @param content the content whose hits determine which keywords are included
   * @return the highlight query string; empty if no keyword has a hit on {@code content} in the
   *     multi-keyword case
   */
  private String getHighlightQuery(
      KeywordSearchQuery query, boolean literal_query, QueryResults queryResults, Content content) {
    // A literal query is used verbatim for highlighting.
    if (literal_query) {
      return query.getQueryString();
    }

    // Otherwise assemble a Solr query from the aggregated terms; it is executed later on demand.
    final StringBuilder solrQuery = new StringBuilder();

    // A lone keyword needs neither sub-query handling nor special escaping.
    if (queryResults.getKeywords().size() == 1) {
      solrQuery.append(queryResults.getKeywords().iterator().next().toString());
      return solrQuery.toString();
    }

    // Gather the keywords that have at least one hit on this content.
    final List<String> matchedTerms = new ArrayList<>();
    for (Keyword candidate : queryResults.getKeywords()) {
      boolean hitsContent = false;
      for (KeywordHit candidateHit : queryResults.getResults(candidate)) {
        if (candidateHit.getContent().equals(content)) {
          hitsContent = true;
          break; // one hit is enough, move on to the next keyword
        }
      }
      if (hitsContent) {
        matchedTerms.add(candidate.toString());
      }
    }

    for (int i = 0; i < matchedTerms.size(); i++) {
      if (i > 0) {
        // The space acts as OR; every term after the first is pinned to the highlight field.
        solrQuery.append(" ");
        solrQuery.append(LuceneQuery.HIGHLIGHT_FIELD_REGEX).append(":");
      }
      // Sub-queries are escaped here, so the assembled query must not be escaped again later.
      final String escapedTerm = KeywordSearchUtil.escapeLuceneQuery(matchedTerms.get(i));
      solrQuery.append("\"").append(escapedTerm).append("\"");
    }

    return solrQuery.toString();
  }
 /**
  * Reduces the query results to a single hit per Solr object ID, keeping for each object the hit
  * with the lowest chunk ID. When two hits for the same object share a chunk ID, the one
  * encountered first is kept.
  *
  * @param queryResults the results to reduce, grouped by keyword
  * @return the selected hits, one per Solr object ID (the value view of the internal map)
  */
 Collection<KeywordHit> getOneHitPerObject(QueryResults queryResults) {
   final HashMap<Long, KeywordHit> lowestChunkHitByObject = new HashMap<>();
   for (Keyword keyword : queryResults.getKeywords()) {
     for (KeywordHit candidate : queryResults.getResults(keyword)) {
       final Long objectId = candidate.getSolrObjectId();
       final KeywordHit current = lowestChunkHitByObject.get(objectId);
       // Store the hit if the object is new or this hit has a strictly lower chunk ID.
       if (current == null || candidate.getChunkId() < current.getChunkId()) {
         lowestChunkHitByObject.put(objectId, candidate);
       }
     }
   }
   return lowestChunkHitByObject.values();
 }
예제 #3
0
  /**
   * Executes this regex keyword search in two stages: a Solr terms query finds the indexed terms
   * that match the regex, then one {@code LuceneQuery} per matching term collects the hits for
   * that term.
   *
   * <p>When the keyword type is {@code TSK_CARD_NUMBER}, a term is only searched for if it
   * matches {@code CCN_PATTERN} and its {@code ccn} group (with spaces and dashes removed) passes
   * {@code LUHN_CHECK}; other terms are skipped.
   *
   * <p>If the terms query fails with a {@code KeywordSearchModuleException}, or yields no term
   * list for the search field, the failure is logged (in the exception case) and an empty result
   * set is returned.
   *
   * @return the hits found, added under one {@code Keyword} per matching term; never null
   * @throws NoOpenCoreException declared by the interface; rethrown when a per-term query raises
   *     it
   */
  @Override
  public QueryResults performQuery() throws NoOpenCoreException {
    /*
     * Execute the regex query to get a list of terms that match the regex.
     * Note that the field that is being searched is tokenized based on
     * whitespace.
     */
    // create the query
    final SolrQuery q = new SolrQuery();
    q.setRequestHandler(TERMS_HANDLER);
    q.setTerms(true);
    q.setTermsRegexFlag(CASE_INSENSITIVE);
    q.setTermsRegex(escapedQuery);
    q.addTermsField(TERMS_SEARCH_FIELD);
    q.setTimeAllowed(TERMS_TIMEOUT);
    q.setShowDebugInfo(DEBUG);
    q.setTermsLimit(MAX_TERMS_RESULTS);
    LOGGER.log(Level.INFO, "Query: {0}", q.toString()); // NON-NLS

    // execute the query
    List<Term> terms = null;
    try {
      terms = KeywordSearch.getServer().queryTerms(q).getTerms(TERMS_SEARCH_FIELD);
    } catch (KeywordSearchModuleException ex) {
      LOGGER.log(
          Level.SEVERE,
          "Error executing the regex terms query: " + keyword.getQuery(),
          ex); // NON-NLS
    }

    QueryResults results = new QueryResults(this, keywordList);

    // The terms query failed (already logged) or produced no term list for the field:
    // there is nothing to iterate, so report an empty result set instead of hitting a
    // NullPointerException in the loop below.
    if (terms == null) {
      return results;
    }

    /*
     * For each term that matched the regex, query for full set of document
     * hits for that term.
     */
    int resultSize = 0;

    for (Term term : terms) {
      if (keyword.getType() == ATTRIBUTE_TYPE.TSK_CARD_NUMBER) {
        // If the keyword is a credit card number, pass it through luhn validator
        Matcher matcher = CCN_PATTERN.matcher(term.getTerm());
        if (!matcher.find()) {
          // group("ccn") would throw IllegalStateException without a successful match;
          // a term that cannot be parsed as a card number cannot be validated, so skip it.
          continue;
        }
        final String ccn = CharMatcher.anyOf(" -").removeFrom(matcher.group("ccn"));
        if (!LUHN_CHECK.isValid(ccn)) {
          continue; // if the hit does not pass the luhn check, skip it.
        }
      }

      // Escape only the terms that will actually be queried.
      final String termStr = KeywordSearchUtil.escapeLuceneQuery(term.getTerm());

      /*
       * Note: we can't set filter query on terms query but setting filter
       * query on fileResults query will yield the same result
       */
      LuceneQuery filesQuery = new LuceneQuery(keywordList, new Keyword(termStr, true));
      filters.forEach(filesQuery::addFilter);

      try {
        QueryResults fileQueryResults = filesQuery.performQuery();
        // A set removes duplicate hits reported under more than one keyword.
        Set<KeywordHit> filesResults = new HashSet<>();
        for (Keyword key : fileQueryResults.getKeywords()) { // flatten results into a single list
          List<KeywordHit> keyRes = fileQueryResults.getResults(key);
          resultSize += keyRes.size();
          filesResults.addAll(keyRes);
        }
        // Results are stored under the unescaped term.
        results.addResult(new Keyword(term.getTerm(), false), new ArrayList<>(filesResults));
      } catch (NoOpenCoreException | RuntimeException e) {
        LOGGER.log(Level.WARNING, "Error executing Solr query,", e); // NON-NLS
        throw e;
      }
    }

    // TODO limit how many results we store, not to hit memory limits
    LOGGER.log(Level.INFO, "Regex # results: {0}", resultSize); // NON-NLS

    return results;
  }