/**
 * Runs {@code regex} against this record's text and converts each accepted match into one or
 * more compiled patterns.
 *
 * <p>Steps, in order:
 *
 * <ol>
 *   <li>The regex is passed through {@code interpolateTextMarksIntoRegex} and, later,
 *       {@code util.substituteDiacritics}.
 *   <li>The text is loaded via {@code util.loadTextFromId} and every whitespace run is collapsed
 *       to a single space.
 *   <li>Each match is widened by the lengths of the strings returned by
 *       {@code getNegatedPrecedingCharacters} / {@code getNegatedFollowingChars} (clamped to the
 *       text bounds) and passed through {@code anchorAssertionAtStart} /
 *       {@code anchorAssertionAtEnd}.
 *   <li>A widened match that starts with the negated preceding string, or ends with the negated
 *       following string, is discarded.
 *   <li>Surviving matches are split into fragments; each fragment is added to
 *       {@code highlightWords} and compiled into a pattern.
 * </ol>
 *
 * <p>Side effects: overwrites the {@code tempRegex} field with the normalised text followed by
 * {@code "<br><br>"} and the final regex, and appends to {@code highlightWords}.
 *
 * @param regex the regular expression to search for
 * @return the compiled fragment patterns, in the order they were found; empty if nothing matched
 */
Pattern[] matchRegexToDocument(String regex) {
  // The same flag set is used for the search regex and for every fragment pattern.
  final int flags =
      Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE | Pattern.UNIX_LINES | Pattern.DOTALL;

  regex = interpolateTextMarksIntoRegex(regex);
  List<Pattern> patterns = new ArrayList<Pattern>();
  String fullText = util.loadTextFromId(url.toExternalForm()).replaceAll("\\s+", " ");
  tempRegex = fullText;

  // These are derived from the regex *before* diacritic substitution.
  String negatedPrecedingString = getNegatedPrecedingCharacters(regex);
  String negatedFollowingString = getNegatedFollowingChars(regex);
  int precedingIndexAdjustment = negatedPrecedingString.length();
  int followingIndexAdjustment = negatedFollowingString.length();

  regex = util.substituteDiacritics(regex);
  tempRegex += "<br><br>" + regex;

  Matcher matcher = Pattern.compile(regex, flags).matcher(fullText);
  while (matcher.find()) {
    // Widen the match window so it includes the context the negated strings are checked against.
    int startIndex = Math.max(0, matcher.start() - precedingIndexAdjustment);
    int endIndex = Math.min(fullText.length(), matcher.end() + followingIndexAdjustment);

    String found = fullText.substring(startIndex, endIndex);
    found = anchorAssertionAtStart(found, precedingIndexAdjustment);
    found = anchorAssertionAtEnd(found, followingIndexAdjustment);

    // Compare exactly as many characters as the negated string contains. The previous code
    // compared against a prefix one character longer than the negated string, so the comparison
    // could never succeed; startsWith/endsWith also cannot throw when the window was clamped and
    // `found` is shorter than the negated string.
    boolean precededByNegated =
        precedingIndexAdjustment != 0 && found.startsWith(negatedPrecedingString);
    boolean followedByNegated =
        followingIndexAdjustment != 0 && found.endsWith(negatedFollowingString);
    if (precededByNegated || followedByNegated) {
      continue;
    }

    // Turn the literal text into a loose pattern: bracket/brace/paren/dot characters become the
    // "any character" wildcard.
    found = found.replaceAll("([()\\[\\]{}\\.])", ".");
    found = found.replaceAll("\\s{2,}", "\\\\b");

    // Split on "<digits>." sequences (presumably line numbers - TODO confirm).
    // NOTE(review): a fragment can be empty after trim(), which yields a pattern matching the
    // empty string; confirm whether callers expect such entries before filtering them out.
    for (String fragment : found.split("[\\d]+\\.")) {
      String fbit = fragment.trim().replaceAll("\\s", "\\\\s+");
      highlightWords.add(fbit);
      patterns.add(Pattern.compile(fbit, flags));
    }
  }

  return patterns.toArray(new Pattern[patterns.size()]);
}
String getKWIC() { StringBuilder html = new StringBuilder(); try { List<String> kwix = util.highlightMatches(util.loadTextFromId(url.toExternalForm()), highlightTerms); html.append("<tr class=\"result-text\"><td class=\"kwic\" colspan=\"7\">"); for (String kwic : kwix) { html.append( kwic.replaceAll( "\\s*ⓐ\\s*", "")); // TODO: why is this character sneaking through when the user does a regex // word-boundary (\b) search? html.append("<br/>\n"); } html.append("</td></tr>"); } catch (Exception e) { // TODO: Need to do something sensible here with regard to highlighting logger.error("Highlightling failure", e); } return html.toString(); }