/** * Finds the first or last occurrence of a phrase in the suffix array, within a subset of the * suffix array that is bounded by suffixArrayStart and suffixArrayEnd. For efficiency of looking * up all subphrases in a sentence we do not require that multplie int[]s be created for each * subphrase. Instead this method will look for the subphrase within the sentence between * phraseStart and phraseEnd. * * @param sentence the sentence/superphrase in int representation to draw the search phrase from * @param phraseStart the start of the phrase in the sentence (inclusive) * @param phraseEnd the end of the phrase in the sentence (exclusive) * @param suffixArrayStart the point at which to start the search in the suffix array * @param suffixArrayEnd the end point in the suffix array beyond which the search doesn't need to * take place * @param findFirst a flag that indicates whether we should find the first or last occurrence of * the phrase */ private int findPhraseBound( Phrase sentence, int phraseStart, int phraseEnd, int suffixArrayStart, int suffixArrayEnd, boolean findFirst) { int low = suffixArrayStart; int high = suffixArrayEnd; // Do a binary search between the low and high points while (low <= high) { int mid = (low + high) >> 1; int start = suffixes[mid]; int diff = corpus.comparePhrase(start, sentence, phraseStart, phraseEnd); if (diff == 0) { // If the difference between the search phrase and the phrase in the corpus // is 0, then we have found it. However, there might be multiple matches in // the corpus, so we need to continue searching until we find the end point int neighbor = mid; if (findFirst) { neighbor--; } else { neighbor++; } if (neighbor >= suffixArrayStart && neighbor <= suffixArrayEnd) { int nextDiff = corpus.comparePhrase(suffixes[neighbor], sentence, phraseStart, phraseEnd); if (nextDiff == 0) { // There's another equivalent phrase, so we need to specify // in which direction to continue searching if (findFirst) diff = 1; // search lower else diff = -1; // search higher } } } if (diff < 0) { low = mid + 1; } else if (diff > 0) { high = mid - 1; } else { return mid; // this is the edge } } return -1; // key not found. }