/**
 * Runs {@code regex} against this record's text and converts every accepted occurrence into
 * highlight patterns.
 *
 * <p>Steps, in order:
 *
 * <ol>
 *   <li>The regex is passed through {@code interpolateTextMarksIntoRegex} and, later,
 *       {@code util.substituteDiacritics}.
 *   <li>The text is loaded with {@code util.loadTextFromId(url.toExternalForm())} and every run
 *       of whitespace is collapsed to a single space.
 *   <li>For each match, a window is cut from the text that is widened by the lengths of the
 *       strings returned by {@code getNegatedPrecedingCharacters} and
 *       {@code getNegatedFollowingChars}, then passed through {@code anchorAssertionAtStart} and
 *       {@code anchorAssertionAtEnd}.
 *   <li>A window that begins with the negated preceding string, or ends with the negated
 *       following string, is discarded.
 *   <li>Each surviving window is split at line-number-like tokens (digits followed by a period)
 *       and every non-empty fragment is recorded in {@code highlightWords} and compiled.
 * </ol>
 *
 * <p>Side effects: overwrites {@code tempRegex} (it looks like diagnostic output - TODO confirm)
 * and appends to {@code highlightWords}.
 *
 * @param regex the regular expression to search for
 * @return one compiled pattern per non-empty fragment of every accepted match; empty if nothing
 *     matched
 */
Pattern[] matchRegexToDocument(String regex) {
  final int flags =
      Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE | Pattern.UNIX_LINES | Pattern.DOTALL;

  regex = interpolateTextMarksIntoRegex(regex);
  ArrayList<Pattern> patterns = new ArrayList<Pattern>();
  String fullText = util.loadTextFromId(url.toExternalForm()).replaceAll("\\s+", " ");
  tempRegex = fullText;

  // The negated strings are extracted before diacritic substitution rewrites the regex.
  String negatedPrecedingString = getNegatedPrecedingCharacters(regex);
  String negatedFollowingString = getNegatedFollowingChars(regex);
  int precedingIndexAdjustment = negatedPrecedingString.length();
  int followingIndexAdjustment = negatedFollowingString.length();
  regex = util.substituteDiacritics(regex);
  tempRegex += "<br><br>" + regex;

  Matcher matcher = Pattern.compile(regex, flags).matcher(fullText);
  while (matcher.find()) {
    // Widen the match by the length of the negated context, clamped to the text bounds.
    int startIndex = Math.max(0, matcher.start() - precedingIndexAdjustment);
    int endIndex = Math.min(fullText.length(), matcher.end() + followingIndexAdjustment);

    String found = fullText.substring(startIndex, endIndex);
    found = anchorAssertionAtStart(found, precedingIndexAdjustment);
    found = anchorAssertionAtEnd(found, followingIndexAdjustment);

    // Previously the preceding context was compared against a substring one character longer
    // than the negated string, so the comparison could never succeed and nothing was ever
    // rejected. startsWith/endsWith compare exactly the negated length and cannot throw when
    // the window is shorter than the negated string.
    boolean precededByNegated =
        precedingIndexAdjustment > 0 && found.startsWith(negatedPrecedingString);
    boolean followedByNegated =
        followingIndexAdjustment > 0 && found.endsWith(negatedFollowingString);
    if (precededByNegated || followedByNegated) {
      continue;
    }

    // Brackets, braces, parentheses and periods become the regex wildcard '.'.
    found = found.replaceAll("([()\\[\\]{}\\.])", ".");
    // Runs of two or more whitespace characters become a word-boundary assertion.
    found = found.replaceAll("\\s{2,}", "\\\\b");

    // Split on a run of digits followed by a period.
    String[] foundBits = found.split("[\\d]+\\.");
    for (int i = 0; i < foundBits.length; i++) {
      String fbit = foundBits[i].trim();
      if (fbit.length() == 0) {
        // split() keeps a leading empty string when the window starts with a line number;
        // an empty pattern would match at every position of the text.
        continue;
      }
      fbit = fbit.replaceAll("\\s", "\\\\s+");
      highlightWords.add(fbit);
      patterns.add(Pattern.compile(fbit, flags));
    }
  }

  return patterns.toArray(new Pattern[patterns.size()]);
}
// used for KWIC highlighting final Pattern[] buildHighlightTerms(ArrayList<SearchClause> searchClauses) { ArrayList<Pattern> hilites = new ArrayList<Pattern>(); Iterator<SearchClause> stit = searchClauses.iterator(); while (stit.hasNext()) { try { SearchClause searchClause = stit.next(); String transformedString = searchClause.buildTransformedString(); if (transformedString == null) { continue; } if ("".equals(transformedString)) { continue; } if (searchClause.getAllClauseRoles().contains(ClauseRole.REGEX)) { String trimmedRegex = trimRegex(transformedString); Pattern[] regexPatterns = matchRegexToDocument(trimmedRegex); hilites.addAll(Arrays.asList(regexPatterns)); } else if (searchClause.parseForSearchType() == StringSearchFacet.SearchType.PROXIMITY) { transformedString = transformedString.replaceAll("(\\d+)w", ""); tempRegex = "Prox"; hilites.addAll(Arrays.asList(util.getPhraseHighlightPatterns(transformedString))); } else if (searchClause.parseForSearchType() == StringSearchFacet.SearchType.SUBSTRING) { Pattern[] patterns = util.getSubstringHighlightPatterns(transformedString); for (int i = 0; i < patterns.length; i++) { tempRegex += "### "; tempRegex += patterns[i].toString(); tempRegex += " ###"; } hilites.addAll(Arrays.asList(patterns)); } else { tempRegex = "Other"; Pattern[] patterns = util.getPhraseHighlightPatterns(transformedString); hilites.addAll(Arrays.asList(patterns)); } } catch (Exception e) { } } Pattern[] patterns = new Pattern[hilites.size()]; return hilites.toArray(patterns); }
String getKWIC() { StringBuilder html = new StringBuilder(); try { List<String> kwix = util.highlightMatches(util.loadTextFromId(url.toExternalForm()), highlightTerms); html.append("<tr class=\"result-text\"><td class=\"kwic\" colspan=\"7\">"); for (String kwic : kwix) { html.append( kwic.replaceAll( "\\s*ⓐ\\s*", "")); // TODO: why is this character sneaking through when the user does a regex // word-boundary (\b) search? html.append("<br/>\n"); } html.append("</td></tr>"); } catch (Exception e) { // TODO: Need to do something sensible here with regard to highlighting logger.error("Highlightling failure", e); } return html.toString(); }