/** * Return the string used to later have SOLR highlight the document with. * * @param query * @param literal_query * @param queryResults * @param file * @return */ private String getHighlightQuery( KeywordSearchQuery query, boolean literal_query, QueryResults queryResults, Content content) { String highlightQueryEscaped; if (literal_query) { // literal, treat as non-regex, non-term component query highlightQueryEscaped = query.getQueryString(); } else { // construct a Solr query using aggregated terms to get highlighting // the query is executed later on demand StringBuilder highlightQuery = new StringBuilder(); if (queryResults.getKeywords().size() == 1) { // simple case, no need to process subqueries and do special escaping Keyword term = queryResults.getKeywords().iterator().next(); highlightQuery.append(term.toString()); } else { // find terms for this content hit List<String> hitTerms = new ArrayList<>(); for (Keyword keyword : queryResults.getKeywords()) { for (KeywordHit hit : queryResults.getResults(keyword)) { if (hit.getContent().equals(content)) { hitTerms.add(keyword.toString()); break; // go to next term } } } final int lastTerm = hitTerms.size() - 1; int curTerm = 0; for (String term : hitTerms) { // escape subqueries, they shouldn't be escaped again later final String termS = KeywordSearchUtil.escapeLuceneQuery(term); highlightQuery.append("\""); highlightQuery.append(termS); highlightQuery.append("\""); if (lastTerm != curTerm) { highlightQuery.append(" "); // acts as OR || // force HIGHLIGHT_FIELD_REGEX index and stored content // in each term after first. First term taken care by HighlightedMatchesSource highlightQuery.append(LuceneQuery.HIGHLIGHT_FIELD_REGEX).append(":"); } ++curTerm; } } // String highlightQueryEscaped = // KeywordSearchUtil.escapeLuceneQuery(highlightQuery.toString()); highlightQueryEscaped = highlightQuery.toString(); } return highlightQueryEscaped; }
/** * This method returns a collection of KeywordHits with lowest SolrObjectID- Chunk-ID combination. * The output generated is consistent across multiple runs. * * @param queryResults QueryResult object * @return A consistent collection of keyword hits */ Collection<KeywordHit> getOneHitPerObject(QueryResults queryResults) { HashMap<Long, KeywordHit> hits = new HashMap<Long, KeywordHit>(); for (Keyword keyWord : queryResults.getKeywords()) { for (KeywordHit hit : queryResults.getResults(keyWord)) { // add hit with lowest SolrObjectID-Chunk-ID combination. if (!hits.containsKey(hit.getSolrObjectId())) { hits.put(hit.getSolrObjectId(), hit); } else { if (hit.getChunkId() < hits.get(hit.getSolrObjectId()).getChunkId()) { hits.put(hit.getSolrObjectId(), hit); } } } } return hits.values(); }
@Override public QueryResults performQuery() throws NoOpenCoreException { /* * Execute the regex query to get a list of terms that match the regex. * Note that the field that is being searched is tokenized based on * whitespace. */ // create the query final SolrQuery q = new SolrQuery(); q.setRequestHandler(TERMS_HANDLER); q.setTerms(true); q.setTermsRegexFlag(CASE_INSENSITIVE); q.setTermsRegex(escapedQuery); q.addTermsField(TERMS_SEARCH_FIELD); q.setTimeAllowed(TERMS_TIMEOUT); q.setShowDebugInfo(DEBUG); q.setTermsLimit(MAX_TERMS_RESULTS); LOGGER.log(Level.INFO, "Query: {0}", q.toString()); // NON-NLS // execute the query List<Term> terms = null; try { terms = KeywordSearch.getServer().queryTerms(q).getTerms(TERMS_SEARCH_FIELD); } catch (KeywordSearchModuleException ex) { LOGGER.log( Level.SEVERE, "Error executing the regex terms query: " + keyword.getQuery(), ex); // NON-NLS // TODO: this is almost certainly wrong and guaranteed to throw a NPE at some point!!!! } /* * For each term that matched the regex, query for full set of document * hits for that term. */ QueryResults results = new QueryResults(this, keywordList); int resultSize = 0; for (Term term : terms) { final String termStr = KeywordSearchUtil.escapeLuceneQuery(term.getTerm()); if (keyword.getType() == ATTRIBUTE_TYPE.TSK_CARD_NUMBER) { // If the keyword is a credit card number, pass it through luhn validator Matcher matcher = CCN_PATTERN.matcher(term.getTerm()); matcher.find(); final String ccn = CharMatcher.anyOf(" -").removeFrom(matcher.group("ccn")); if (false == LUHN_CHECK.isValid(ccn)) { continue; // if the hit does not pass the luhn check, skip it. } } /* * Note: we can't set filter query on terms query but setting filter * query on fileResults query will yield the same result */ LuceneQuery filesQuery = new LuceneQuery(keywordList, new Keyword(termStr, true)); filters.forEach(filesQuery::addFilter); try { QueryResults fileQueryResults = filesQuery.performQuery(); Set<KeywordHit> filesResults = new HashSet<>(); for (Keyword key : fileQueryResults.getKeywords()) { // flatten results into a single list List<KeywordHit> keyRes = fileQueryResults.getResults(key); resultSize += keyRes.size(); filesResults.addAll(keyRes); } results.addResult(new Keyword(term.getTerm(), false), new ArrayList<>(filesResults)); } catch (NoOpenCoreException | RuntimeException e) { LOGGER.log(Level.WARNING, "Error executing Solr query,", e); // NON-NLS throw e; } } // TODO limit how many results we store, not to hit memory limits LOGGER.log(Level.INFO, "Regex # results: {0}", resultSize); // NON-NLS return results; }