/**
   * Splits query into multiple searches, will be used for advanced queries.
   *
   * <p>The query is examined in this order:
   *
   * <ol>
   *   <li>a single term (letters/digits, optionally with inner apostrophes) is looked up directly
   *       in the index;
   *   <li>a query that is exactly one quoted phrase becomes a {@code PHRASE} search over its
   *       words;
   *   <li>otherwise quoted phrases are protected by placeholders, hyphenated words are quoted so
   *       they are treated as phrases, and {@code or} / {@code not} / {@code -} / whitespace are
   *       rewritten to {@code ||} / {@code ^^(..)} / {@code &&}; the result is split into
   *       complement, intersection or union sub-searches which are started through {@code
   *       startSearch}.
   * </ol>
   *
   * @param query search query, can use various different search conventions
   * @param indexuri uri for one index
   * @return single Search encompassing everything in the query, or null if the query is a stop
   *     word, reduces to fewer than two phrase terms, or cannot be split at all (an error is
   *     logged in that last case)
   * @throws InvalidSearchException if search query is invalid
   * @throws TaskAbortException declared for the index lookup and nested searches; not raised
   *     directly by this method
   */
  private static Search splitQuery(String query, String indexuri)
      throws InvalidSearchException, TaskAbortException {
    if (query.matches("\\A[\\p{L}\\d]*\\Z")
        || query.matches("\\A[\\p{L}\\d][\\p{L}'\\d]*[\\p{L}\\d]\\Z")) {
      // single search term
      // return null if stopword
      if (SearchUtil.isStopWord(query)) return null;
      Execution<Set<TermEntry>> request = library.getIndex(indexuri).getTermEntries(query);
      if (request == null)
        throw new InvalidSearchException(
            "Something wrong with query=\""
                + query
                + "\" or indexURI=\""
                + indexuri
                + "\", maybe something is wrong with the index or it's uri is wrong.");
      return new Search(query, indexuri, request);
    }

    // Make phrase search
    if (query.matches("\\A\"[^\"]*\"\\Z")) {
      ArrayList<Execution<Set<TermEntry>>> phrasesearches =
          new ArrayList<Execution<Set<TermEntry>>>();
      String[] phrase = query.replaceAll("\"(.*)\"", "$1").split("[^\\p{L}\\d]+");
      if (logMINOR) Logger.minor(Search.class, "Phrase split" + query);
      for (String subquery : phrase) {
        Search term = startSearch(subquery, indexuri);
        phrasesearches.add(term);
      }
      // Not really sure how stopwords should be handled in phrases
      // currently i'm thinking that they should be treated as blanks
      // between other words and ignored in other cases "jesus of nazareth"
      // is treated as "jesus <blank> nazareth". Whereas "the who" will be
      // treated as a stop query as just searching for "who" and purporting
      // that the results are representative of "the who" is misleading.

      // Strip leading nulls; the emptiness check matters when every word was a stop word,
      // otherwise get(0) on the emptied list throws IndexOutOfBoundsException.
      while (!phrasesearches.isEmpty() && phrasesearches.get(0) == null) phrasesearches.remove(0);
      // Strip trailing nulls, same guard.
      while (!phrasesearches.isEmpty() && phrasesearches.get(phrasesearches.size() - 1) == null)
        phrasesearches.remove(phrasesearches.size() - 1);

      if (phrasesearches.size() > 1)
        return new Search(query, indexuri, phrasesearches, ResultOperation.PHRASE);
      else return null;
    }

    if (logMINOR) Logger.minor(Search.class, "Splitting " + query);
    // Remove phrases, place them in arraylist and replace them with references to the arraylist.
    // Splitting on the quote character yields alternating outside/inside parts.
    ArrayList<String> phrases = new ArrayList<String>();
    String formattedquery;
    String[] phraseparts = query.split("\"");
    if (phraseparts.length > 1) {
      StringBuilder stripped = new StringBuilder();
      for (int i = 0; i < phraseparts.length; i += 2) {
        stripped.append(phraseparts[i]);
        if (i + 1 < phraseparts.length) {
          stripped.append('$').append(phrases.size()).append('£');
          phrases.add(phraseparts[i + 1]);
        }
      }
      formattedquery = stripped.toString();
    } else {
      formattedquery = query;
    }

    if (logMINOR) Logger.minor(Search.class, "phrases removed query : " + formattedquery);

    // treat hyphens as phrases, as they are treated equivalently in spider so this is the most
    // effective way now. This is applied to the placeholder form so text inside existing phrases
    // is left alone; the quoted words contain no whitespace, so the operator rewriting below
    // cannot split them.
    formattedquery = formattedquery.replaceAll("((?:[\\d\\p{L}]+-)+[\\d\\p{L}]+)", "\"$1\"");
    if (logMINOR) Logger.minor(Search.class, "Treat hyphenated words as phrases");

    // Operator rewriting is always safe here because quoted phrases are hidden behind
    // placeholders. The lookahead after "not" keeps ordinary words that merely start with
    // "not" (e.g. "notebook") from being parsed as a complement.
    formattedquery = formattedquery.replaceAll("\\s+or\\s+", "||");
    formattedquery =
        formattedquery.replaceAll("\\s+(?:not(?![\\p{L}\\d])\\s*|-)(\\S+)", "^^($1)");
    if (logMINOR) Logger.minor(Search.class, "not query : " + formattedquery);
    formattedquery = formattedquery.replaceAll("\\s+", "&&");
    if (logMINOR) Logger.minor(Search.class, "and query : " + formattedquery);

    // Put phrases back in. Only well-formed placeholders with a known index are substituted, so
    // a '$' typed by the user is passed through instead of crashing the parser.
    java.util.regex.Matcher placeholder =
        java.util.regex.Pattern.compile("\\$(\\d{1,9})£").matcher(formattedquery);
    StringBuffer restored = new StringBuffer();
    while (placeholder.find()) {
      int index = Integer.parseInt(placeholder.group(1));
      String replacement;
      if (index < phrases.size()) {
        if (logMINOR) Logger.minor(Search.class, "replacing phrase " + index);
        replacement = "\"" + phrases.get(index) + "\"";
      } else {
        replacement = placeholder.group();
      }
      placeholder.appendReplacement(
          restored, java.util.regex.Matcher.quoteReplacement(replacement));
    }
    placeholder.appendTail(restored);
    formattedquery = restored.toString();
    if (logMINOR) Logger.minor(Search.class, "phrase back query : " + formattedquery);

    // A lone hyphenated word (or a phrase missing its closing quote) has now become a single
    // quoted phrase. Re-entering is safe: the phrase branch above handles it and returns
    // without coming back here.
    if (formattedquery.matches("\\A\"[^\"]*\"\\Z")) return splitQuery(formattedquery, indexuri);

    // Make complement search
    if (formattedquery.contains("^^(")) {
      ArrayList<Execution<Set<TermEntry>>> complementsearches =
          new ArrayList<Execution<Set<TermEntry>>>();
      String[] splitup = formattedquery.split("(\\^\\^\\(|\\))", 3);
      // A user-typed "^^(" without a closing ")" yields only two pieces.
      String remainder = splitup.length > 2 ? splitup[2] : "";
      Search add = startSearch(splitup[0] + remainder, indexuri);
      Search subtract = startSearch(splitup[1], indexuri);
      if (add == null || subtract == null)
        return null; // If 'and' is not to be searched for 'the -john' is not to be searched for,
                     // also 'john -the' wouldnt have shown many results anyway
      complementsearches.add(add);
      complementsearches.add(subtract);
      return new Search(query, indexuri, complementsearches, ResultOperation.REMOVE);
    }
    // Split intersections
    if (formattedquery.contains("&&")) {
      ArrayList<Search> intersectsearches = new ArrayList<Search>();
      String[] intersects = formattedquery.split("&&");
      for (String subquery : intersects) {
        Search subsearch = startSearch(subquery, indexuri);
        // We will assume that searching for 'the big apple' will near enough show the
        // same results as 'big apple', so just ignore 'the' in intersection
        if (subsearch != null) intersectsearches.add(subsearch);
      }
      switch (intersectsearches.size()) {
        case 0: // eg. 'the that'
          return null;
        case 1: // eg. 'cake that' will return a search for 'cake'
          return intersectsearches.get(0);
        default:
          return new Search(query, indexuri, intersectsearches, ResultOperation.INTERSECTION);
      }
    }
    // Split Unions
    if (formattedquery.contains("||")) {
      ArrayList<Execution<Set<TermEntry>>> unionsearches =
          new ArrayList<Execution<Set<TermEntry>>>();
      String[] unions = formattedquery.split("\\|\\|");
      for (String subquery : unions) {
        Search add = startSearch(subquery, indexuri);
        // eg a search for 'the or cake' would be almost the same as a search for 'the'
        // and so should be treated as such
        if (add == null) return null;
        unionsearches.add(add);
      }
      return new Search(query, indexuri, unionsearches, ResultOperation.UNION);
    }

    Logger.error(Search.class, "No split made, " + formattedquery + query);
    return null;
  }
예제 #2
0
  /**
   * Splits query into multiple searches, will be used for advanced queries.
   *
   * <p>The trimmed query is examined in this order:
   *
   * <ol>
   *   <li>a single term (no whitespace, hyphen or double quote) is looked up directly in the
   *       index;
   *   <li>a query that is exactly one quoted phrase, or one hyphenated word, becomes a {@code
   *       PHRASE} search over its words;
   *   <li>otherwise quoted phrases and hyphenated words are protected by placeholders, {@code or}
   *       / {@code not} / {@code -} / whitespace are rewritten to {@code ||} / {@code ^^(..)} /
   *       {@code &&}, and the result is split into complement, intersection or union
   *       sub-searches which are started through {@code startSearch}.
   * </ol>
   *
   * @param query search query, can use various different search conventions
   * @param indexuri uri for one index
   * @return single Search encompassing everything in the query, or null if the query is a stop
   *     word, reduces to fewer than two phrase terms, or cannot be split at all (an error is
   *     logged in that last case)
   * @throws InvalidSearchException if search query is invalid
   * @throws TaskAbortException declared for the index lookup and nested searches; not raised
   *     directly by this method
   */
  private static Search splitQuery(String query, String indexuri)
      throws InvalidSearchException, TaskAbortException {
    query = query.trim();
    if (query.matches("\\A[\\S&&[^-\"]]*\\Z")) {
      // single search term
      // return null if stopword
      if (SearchUtil.isStopWord(query)) return null;
      Execution<Set<TermEntry>> request = library.getIndex(indexuri).getTermEntries(query);
      if (request == null)
        throw new InvalidSearchException(
            "Something wrong with query=\""
                + query
                + "\" or indexURI=\""
                + indexuri
                + "\", maybe something is wrong with the index or it's uri is wrong.");
      return new Search(query, indexuri, request);
    }

    // Make phrase search (hyphen-separated words are also treated as phrases)
    if (query.matches("\\A\"[^\"]*\"\\Z")
        || query.matches("\\A((?:[\\S&&[^-]]+-)+[\\S&&[^-]]+)\\Z")) {
      ArrayList<Execution<Set<TermEntry>>> phrasesearches =
          new ArrayList<Execution<Set<TermEntry>>>();
      String[] phrase = query.replaceAll("\"(.*)\"", "$1").split("[\\s-]+");
      if (logMINOR) Logger.minor(Search.class, "Phrase split: " + query);
      for (String subquery : phrase) {
        Search term = startSearch(subquery, indexuri);
        phrasesearches.add(term);
      }
      // Not really sure how stopwords should be handled in phrases
      // currently i'm thinking that they should be treated as blanks
      // between other words and ignored in other cases "jesus of nazareth"
      // is treated as "jesus <blank> nazareth". Whereas "the who" will be
      // treated as a stop query as just searching for "who" and purporting
      // that the results are representative of "the who" is misleading.

      // Strip leading nulls (stop words); the list may become empty.
      while (!phrasesearches.isEmpty() && phrasesearches.get(0) == null) phrasesearches.remove(0);
      // Strip trailing nulls.
      while (!phrasesearches.isEmpty() && phrasesearches.get(phrasesearches.size() - 1) == null)
        phrasesearches.remove(phrasesearches.size() - 1);

      if (phrasesearches.size() > 1)
        return new Search(query, indexuri, phrasesearches, ResultOperation.PHRASE);
      else return null;
    }

    if (logMINOR) Logger.minor(Search.class, "Splitting " + query);
    // Remove phrases, place them in arraylist and replace them with references to the arraylist
    ArrayList<String> phrases = new ArrayList<String>();
    Matcher nextPhrase = Pattern.compile("\"([^\"]*?)\"").matcher(query);
    StringBuffer sb = new StringBuffer();
    while (nextPhrase.find()) {
      String phrase = nextPhrase.group(1);
      nextPhrase.appendReplacement(sb, "£" + phrases.size() + "€");
      phrases.add(phrase);
    }
    nextPhrase.appendTail(sb);

    if (logMINOR) Logger.minor(Search.class, "Phrases removed query: " + sb);

    // Remove the unmatched \" (if any); after all pairs are gone at most one can remain.
    String formattedquery = sb.toString().replaceFirst("\"", "");
    if (logMINOR) Logger.minor(Search.class, "Removing the unmatched bracket: " + formattedquery);

    // Treat hyphens as phrases, as they are treated equivalently in spider so this is the most
    // effective way now
    nextPhrase = Pattern.compile("((?:[\\S&&[^-]]+-)+[\\S&&[^-]]+)").matcher(formattedquery);
    sb.setLength(0);
    while (nextPhrase.find()) {
      String phrase = nextPhrase.group(1);
      nextPhrase.appendReplacement(sb, "£" + phrases.size() + "€");
      phrases.add(phrase);
    }
    nextPhrase.appendTail(sb);

    formattedquery = sb.toString();
    if (logMINOR)
      Logger.minor(Search.class, "Treat hyphenated words as phrases: " + formattedquery);

    // Substitute service symbols. Those which are inside phrases should not be seen as "service"
    // ones.
    formattedquery = formattedquery.replaceAll("\\s+or\\s+", "||");
    if (logMINOR) Logger.minor(Search.class, "OR-subst query : " + formattedquery);
    // The lookahead after "not" keeps ordinary words that merely start with "not"
    // (e.g. "notebook") from being parsed as a complement of their remainder.
    formattedquery =
        formattedquery.replaceAll("\\s+(?:not(?![\\p{L}\\d])\\s*|-)(\\S+)", "^^($1)");
    if (logMINOR) Logger.minor(Search.class, "NOT-subst query : " + formattedquery);
    formattedquery = formattedquery.replaceAll("\\s+", "&&");
    if (logMINOR) Logger.minor(Search.class, "AND-subst query : " + formattedquery);

    // Put phrases back in. Only well-formed placeholders with a known index are substituted, so
    // a '£' typed by the user is passed through instead of crashing the parser.
    Matcher placeholder = Pattern.compile("£(\\d{1,9})€").matcher(formattedquery);
    StringBuffer restored = new StringBuffer();
    while (placeholder.find()) {
      int index = Integer.parseInt(placeholder.group(1));
      String replacement;
      if (index < phrases.size()) {
        if (logMINOR) Logger.minor(Search.class, "replacing phrase " + index);
        replacement = "\"" + phrases.get(index) + "\"";
      } else {
        replacement = placeholder.group();
      }
      placeholder.appendReplacement(restored, Matcher.quoteReplacement(replacement));
    }
    placeholder.appendTail(restored);
    formattedquery = restored.toString();
    if (logMINOR) Logger.minor(Search.class, "Phrase back query: " + formattedquery);

    // Make complement search
    if (formattedquery.contains("^^(")) {
      ArrayList<Execution<Set<TermEntry>>> complementsearches =
          new ArrayList<Execution<Set<TermEntry>>>();
      String[] splitup = formattedquery.split("(\\^\\^\\(|\\))", 3);
      // A user-typed "^^(" without a closing ")" yields only two pieces.
      String remainder = splitup.length > 2 ? splitup[2] : "";
      Search add = startSearch(splitup[0] + remainder, indexuri);
      Search subtract = startSearch(splitup[1], indexuri);
      if (add == null || subtract == null)
        return null; // If 'and' is not to be searched for 'the -john' is not to be searched for,
                     // also 'john -the' wouldnt have shown many results anyway
      complementsearches.add(add);
      complementsearches.add(subtract);
      return new Search(query, indexuri, complementsearches, ResultOperation.REMOVE);
    }
    // Split intersections
    if (formattedquery.contains("&&")) {
      ArrayList<Search> intersectsearches = new ArrayList<Search>();
      String[] intersects = formattedquery.split("&&");
      for (String subquery : intersects) {
        Search subsearch = startSearch(subquery, indexuri);
        // We will assume that searching for 'the big apple' will near enough show the
        // same results as 'big apple', so just ignore 'the' in intersection
        if (subsearch != null) intersectsearches.add(subsearch);
      }
      switch (intersectsearches.size()) {
        case 0: // eg. 'the that'
          return null;
        case 1: // eg. 'cake that' will return a search for 'cake'
          return intersectsearches.get(0);
        default:
          return new Search(query, indexuri, intersectsearches, ResultOperation.INTERSECTION);
      }
    }
    // Split Unions
    if (formattedquery.contains("||")) {
      ArrayList<Execution<Set<TermEntry>>> unionsearches =
          new ArrayList<Execution<Set<TermEntry>>>();
      String[] unions = formattedquery.split("\\|\\|");
      for (String subquery : unions) {
        Search add = startSearch(subquery, indexuri);
        // eg a search for 'the or cake' would be almost the same as a search for 'the'
        // and so should be treated as such
        if (add == null) return null;
        unionsearches.add(add);
      }
      return new Search(query, indexuri, unionsearches, ResultOperation.UNION);
    }

    Logger.error(Search.class, "No split made, " + formattedquery + query);
    return null;
  }