/** * Splits query into multiple searches, will be used for advanced queries * * @param query search query, can use various different search conventions * @param indexuri uri for one index * @return single Search encompassing everything in the query or null if query is a stop word * @throws InvalidSearchException if search query is invalid */ private static Search splitQuery(String query, String indexuri) throws InvalidSearchException, TaskAbortException { query = query.trim(); if (query.matches("\\A[\\S&&[^-\"]]*\\Z")) { // single search term // return null if stopword if (SearchUtil.isStopWord(query)) return null; Execution<Set<TermEntry>> request = library.getIndex(indexuri).getTermEntries(query); if (request == null) throw new InvalidSearchException( "Something wrong with query=\"" + query + "\" or indexURI=\"" + indexuri + "\", maybe something is wrong with the index or it's uri is wrong."); return new Search(query, indexuri, request); } // Make phrase search (hyphen-separated words are also treated as phrases) if (query.matches("\\A\"[^\"]*\"\\Z") || query.matches("\\A((?:[\\S&&[^-]]+-)+[\\S&&[^-]]+)\\Z")) { ArrayList<Execution<Set<TermEntry>>> phrasesearches = new ArrayList<Execution<Set<TermEntry>>>(); String[] phrase = query.replaceAll("\"(.*)\"", "$1").split("[\\s-]+"); if (logMINOR) Logger.minor(Search.class, "Phrase split: " + query); for (String subquery : phrase) { Search term = startSearch(subquery, indexuri); phrasesearches.add(term); } // Not really sure how stopwords should be handled in phrases // currently i'm thinking that they should be treated as blanks // between other words and ignored in other cases "jesus of nazareth" // is treated as "jesus <blank> nazareth". Whereas "the who" will be // treated as a stop query as just searching for "who" and purporting // that the results are representative of "the who" is misleading. 
// this makes sure there are no trailing nulls at the start while (phrasesearches.size() > 0 && phrasesearches.get(0) == null) phrasesearches.remove(0); // this makes sure there are no trailing nulls at the end while (phrasesearches.size() > 0 && phrasesearches.get(phrasesearches.size() - 1) == null) phrasesearches.remove(phrasesearches.size() - 1); if (phrasesearches.size() > 1) return new Search(query, indexuri, phrasesearches, ResultOperation.PHRASE); else return null; } if (logMINOR) Logger.minor(Search.class, "Splitting " + query); // Remove phrases, place them in arraylist and replace them with references to the arraylist ArrayList<String> phrases = new ArrayList<String>(); Matcher nextPhrase = Pattern.compile("\"([^\"]*?)\"").matcher(query); StringBuffer sb = new StringBuffer(); while (nextPhrase.find()) { String phrase = nextPhrase.group(1); nextPhrase.appendReplacement(sb, "£" + phrases.size() + "€"); phrases.add(phrase); } nextPhrase.appendTail(sb); if (logMINOR) Logger.minor(Search.class, "Phrases removed query: " + sb); // Remove the unmatched \" (if any) String formattedquery = sb.toString().replaceFirst("\"", ""); if (logMINOR) Logger.minor(Search.class, "Removing the unmatched bracket: " + formattedquery); // Treat hyphens as phrases, as they are treated equivalently in spider so this is the most // effective way now nextPhrase = Pattern.compile("((?:[\\S&&[^-]]+-)+[\\S&&[^-]]+)").matcher(formattedquery); sb.setLength(0); while (nextPhrase.find()) { String phrase = nextPhrase.group(1); nextPhrase.appendReplacement(sb, "£" + phrases.size() + "€"); phrases.add(phrase); } nextPhrase.appendTail(sb); formattedquery = sb.toString(); if (logMINOR) Logger.minor(Search.class, "Treat hyphenated words as phrases: " + formattedquery); // Substitute service symbols. Those which are inside phrases should not be seen as "service" // ones. 
formattedquery = formattedquery.replaceAll("\\s+or\\s+", "||"); if (logMINOR) Logger.minor(Search.class, "OR-subst query : " + formattedquery); formattedquery = formattedquery.replaceAll("\\s+(?:not\\s*|-)(\\S+)", "^^($1)"); if (logMINOR) Logger.minor(Search.class, "NOT-subst query : " + formattedquery); formattedquery = formattedquery.replaceAll("\\s+", "&&"); if (logMINOR) Logger.minor(Search.class, "AND-subst query : " + formattedquery); // Put phrases back in String[] phraseparts = formattedquery.split("£"); formattedquery = phraseparts[0]; for (int i = 1; i < phraseparts.length; i++) { String string = phraseparts[i]; if (logMINOR) Logger.minor(Search.class, "replacing phrase " + string.replaceFirst("(\\d+).*", "$1")); formattedquery += "\"" + phrases.get(Integer.parseInt(string.replaceFirst("(\\d+).*", "$1"))) + "\"" + string.replaceFirst("\\d+€(.*)", "$1"); } if (logMINOR) Logger.minor(Search.class, "Phrase back query: " + formattedquery); // Make complement search if (formattedquery.contains("^^(")) { ArrayList<Execution<Set<TermEntry>>> complementsearches = new ArrayList<Execution<Set<TermEntry>>>(); String[] splitup = formattedquery.split("(\\^\\^\\(|\\))", 3); Search add = startSearch(splitup[0] + splitup[2], indexuri); Search subtract = startSearch(splitup[1], indexuri); if (add == null || subtract == null) return null; // If 'and' is not to be searched for 'the -john' is not to be searched for, // also 'john -the' wouldnt have shown many results anyway complementsearches.add(add); complementsearches.add(subtract); return new Search(query, indexuri, complementsearches, ResultOperation.REMOVE); } // Split intersections if (formattedquery.contains("&&")) { ArrayList<Search> intersectsearches = new ArrayList<Search>(); String[] intersects = formattedquery.split("&&"); for (String subquery : intersects) { Search subsearch = startSearch(subquery, indexuri); if (subsearch != null) // We will assume that searching for 'the big apple' will near enough show the 
// same results as 'big apple', so just ignore 'the' in interseaction intersectsearches.add(subsearch); } switch (intersectsearches.size()) { case 0: // eg. 'the that' return null; case 1: // eg. 'cake that' will return a search for 'cake' return intersectsearches.get(0); default: return new Search(query, indexuri, intersectsearches, ResultOperation.INTERSECTION); } } // Split Unions if (formattedquery.contains("||")) { ArrayList<Execution<Set<TermEntry>>> unionsearches = new ArrayList<Execution<Set<TermEntry>>>(); String[] unions = formattedquery.split("\\|\\|"); for (String subquery : unions) { Search add = startSearch(subquery, indexuri); if (add == null) // eg a search for 'the or cake' would be almost the same as a search for 'the' // and so should be treated as such return null; unionsearches.add(add); } return new Search(query, indexuri, unionsearches, ResultOperation.UNION); } Logger.error(Search.class, "No split made, " + formattedquery + query); return null; }
/** * Splits query into multiple searches, will be used for advanced queries * * @param query search query, can use various different search conventions * @param indexuri uri for one index * @return single Search encompassing everything in the query or null if query is a stop word * @throws InvalidSearchException if search query is invalid */ private static Search splitQuery(String query, String indexuri) throws InvalidSearchException, TaskAbortException { if (query.matches("\\A[\\p{L}\\d]*\\Z") || query.matches("\\A[\\p{L}\\d][\\p{L}'\\d]*[\\p{L}\\d]\\Z")) { // single search term // return null if stopword if (SearchUtil.isStopWord(query)) return null; Execution<Set<TermEntry>> request = library.getIndex(indexuri).getTermEntries(query); if (request == null) throw new InvalidSearchException( "Something wrong with query=\"" + query + "\" or indexURI=\"" + indexuri + "\", maybe something is wrong with the index or it's uri is wrong."); return new Search(query, indexuri, request); } // Make phrase search if (query.matches("\\A\"[^\"]*\"\\Z")) { ArrayList<Execution<Set<TermEntry>>> phrasesearches = new ArrayList(); String[] phrase = query.replaceAll("\"(.*)\"", "$1").split("[^\\p{L}\\d]+"); if (logMINOR) Logger.minor(Search.class, "Phrase split" + query); for (String subquery : phrase) { Search term = startSearch(subquery, indexuri); phrasesearches.add(term); } // Not really sure how stopwords should be handled in phrases // currently i'm thinking that they should be treated as blanks // between other words and ignored in other cases "jesus of nazareth" // is treated as "jesus <blank> nazareth". Whereas "the who" will be // treated as a stop query as just searching for "who" and purporting // that the results are representative of "the who" is misleading. 
// this makes sure there are no trailing nulls at the start while (phrasesearches.get(0) == null) phrasesearches.remove(0); // this makes sure there are no trailing nulls at the end while (phrasesearches.get(phrasesearches.size() - 1) == null) phrasesearches.remove(phrasesearches.size() - 1); if (phrasesearches.size() > 1) return new Search(query, indexuri, phrasesearches, ResultOperation.PHRASE); else return null; } if (logMINOR) Logger.minor(Search.class, "Splitting " + query); String formattedquery = ""; // Remove phrases, place them in arraylist and replace tem with references to the arraylist ArrayList<String> phrases = new ArrayList(); String[] phraseparts = query.split("\""); if (phraseparts.length > 1) for (int i = 0; i < phraseparts.length; i++) { String string = phraseparts[i]; formattedquery += string; if (++i < phraseparts.length) { string = phraseparts[i]; formattedquery += "$" + phrases.size() + "£"; phrases.add(string); } } else formattedquery = query; if (logMINOR) Logger.minor(Search.class, "phrases removed query : " + formattedquery); // treat hyphens as phrases, as they are treated equivalently in spider so this is the most // effective way now query = query.replaceAll("((?:[\\d\\p{L}]+-)+[\\d\\p{L}]+)", "\"$1\""); if (logMINOR) Logger.minor(Search.class, "Treat hyphenated words as phrases"); if (!query.contains( "\"")) { // dont do the other splitting operations as we need to put phrases back in and // call self formattedquery = formattedquery.replaceAll("\\s+or\\s+", "||"); formattedquery = formattedquery.replaceAll("\\s+(?:not\\s*|-)(\\S+)", "^^($1)"); if (logMINOR) Logger.minor(Search.class, "not query : " + formattedquery); formattedquery = formattedquery.replaceAll("\\s+", "&&"); if (logMINOR) Logger.minor(Search.class, "and query : " + formattedquery); } // Put phrases back in phraseparts = formattedquery.split("\\$"); formattedquery = phraseparts[0]; for (int i = 1; i < phraseparts.length; i++) { String string = phraseparts[i]; if 
(logMINOR) Logger.minor(Search.class, "replacing phrase " + string.replaceFirst("(\\d+).*", "$1")); formattedquery += "\"" + phrases.get(Integer.parseInt(string.replaceFirst("(\\d+).*", "$1"))) + "\"" + string.replaceFirst("\\d+£(.*)", "$1"); } if (logMINOR) Logger.minor(Search.class, "phrase back query : " + formattedquery); if (query.contains("\"")) // recall self to remove phrases return splitQuery(query, indexuri); // Make complement search if (formattedquery.contains("^^(")) { ArrayList<Execution<Set<TermEntry>>> complementsearches = new ArrayList(); String[] splitup = formattedquery.split("(\\^\\^\\(|\\))", 3); Search add = startSearch(splitup[0] + splitup[2], indexuri); Search subtract = startSearch(splitup[1], indexuri); if (add == null || subtract == null) return null; // If 'and' is not to be searched for 'the -john' is not to be searched for, // also 'john -the' wouldnt have shown many results anyway complementsearches.add(add); complementsearches.add(subtract); return new Search(query, indexuri, complementsearches, ResultOperation.REMOVE); } // Split intersections if (formattedquery.contains("&&")) { ArrayList<Search> intersectsearches = new ArrayList(); String[] intersects = formattedquery.split("&&"); for (String subquery : intersects) { Search subsearch = startSearch(subquery, indexuri); if (subsearch != null) // We will assume that searching for 'the big apple' will near enough show the // same results as 'big apple', so just ignore 'the' in interseaction intersectsearches.add(subsearch); } switch (intersectsearches.size()) { case 0: // eg. 'the that' return null; case 1: // eg. 
'cake that' will return a search for 'cake' return intersectsearches.get(0); default: return new Search(query, indexuri, intersectsearches, ResultOperation.INTERSECTION); } } // Split Unions if (formattedquery.contains("||")) { ArrayList<Execution<Set<TermEntry>>> unionsearches = new ArrayList(); String[] unions = formattedquery.split("\\|\\|"); for (String subquery : unions) { Search add = startSearch(subquery, indexuri); if (add == null) // eg a search for 'the or cake' would be almost the same as a search for 'the' // and so should be treated as such return null; unionsearches.add(add); } return new Search(query, indexuri, unionsearches, ResultOperation.UNION); } Logger.error(Search.class, "No split made, " + formattedquery + query); return null; }
/**
 * Gets the SubIndex object which should hold the keyword.
 *
 * <p>The keyword is hashed with {@code Library.MD5} and the hash is binary-searched in
 * {@code subIndiceList}. On an exact hit that entry is used; otherwise the entry immediately
 * before the insertion point is used. The chosen entry is then looked up in {@code subIndice}.
 *
 * <p>NOTE(review): when the hash sorts before every entry (or the list is empty) the computed
 * position is -1 and the list access throws; presumably the list always begins with a
 * lowest-possible key - confirm against the code that fills it.
 *
 * @param keyword the keyword whose sub-index is wanted
 * @return whatever {@code subIndice} holds for the selected list entry
 */
private SubIndex getSubIndex(String keyword) {
  final String hash = Library.MD5(keyword);
  final int found = Collections.binarySearch(subIndiceList, hash);
  // A miss is reported as -(insertionPoint) - 1, so -found - 2 is insertionPoint - 1.
  final int slot = (found >= 0) ? found : -found - 2;
  return subIndice.get(subIndiceList.get(slot));
}