/**
 * Creates a string of the semi-infinite strings in the corpus array. For every slot {@code i}
 * of the suffix array, the phrase running from {@code getCorpusIndex(i)} to {@code corpus.size()}
 * is rendered and followed by a newline. Only use this on small suffix arrays!
 *
 * @return the rendered phrases, each terminated by {@code "\n"}; the empty string when the
 *     suffix array has no entries
 */
@Override
public String toString() {
  // Accumulate in a StringBuilder so the text built so far is not re-copied on every iteration.
  StringBuilder rendered = new StringBuilder();
  for (int i = 0; i < suffixes.length; i++) {
    Phrase phrase = corpus.getPhrase(getCorpusIndex(i), corpus.size());
    rendered.append(phrase.toString()).append("\n");
  }
  return rendered.toString();
}
/** Constructor takes a CorpusArray and creates a sorted suffix array from it. */ public SuffixArray(CorpusArray corpusArray) { this.corpus = corpusArray; suffixes = new int[corpusArray.size()]; // Create an array of suffix IDs for (int i = 0; i < corpusArray.size(); i++) { suffixes[i] = i; } // Sort the array of suffixes sort(suffixes); this.hierarchicalPhraseCache = new Cache<Pattern, HierarchicalPhrases>(CACHE_CAPACITY); }
/**
 * Builds the auxiliary longest-common-prefix array for this suffix array.
 *
 * <p>The result has {@code suffixes.length + 1} entries. For {@code 1 <= i < suffixes.length},
 * entry {@code i} is the number of leading word IDs shared by the suffixes stored at slots
 * {@code i - 1} and {@code i}. Counting stops once the count exceeds
 * {@code MAX_COMPARISON_LENGTH}, so an entry never exceeds {@code MAX_COMPARISON_LENGTH + 1}.
 * The first and last entries are always 0.
 *
 * @return the longest-common-prefix array
 */
protected int[] calculateLongestCommonPrefixes() {
  final int count = suffixes.length;
  int[] lcp = new int[count + 1];

  for (int rank = 1; rank < count; rank++) {
    final int current = suffixes[rank];
    final int previous = suffixes[rank - 1];

    int shared = 0;
    for (;;) {
      // Stop when either suffix runs off the end of the corpus.
      if (current + shared >= size()) {
        break;
      }
      if (previous + shared >= size()) {
        break;
      }
      // Stop at the first differing word.
      if (corpus.getWordID(current + shared) != corpus.getWordID(previous + shared)) {
        break;
      }
      // Stop once the comparison cap has been passed.
      if (shared > MAX_COMPARISON_LENGTH) {
        break;
      }
      shared++;
    }
    lcp[rank] = shared;
  }

  // The boundary entries have no neighbouring pair to compare.
  lcp[0] = 0;
  lcp[count] = 0;
  return lcp;
}
/** * Finds the first or last occurrence of a phrase in the suffix array, within a subset of the * suffix array that is bounded by suffixArrayStart and suffixArrayEnd. For efficiency of looking * up all subphrases in a sentence we do not require that multplie int[]s be created for each * subphrase. Instead this method will look for the subphrase within the sentence between * phraseStart and phraseEnd. * * @param sentence the sentence/superphrase in int representation to draw the search phrase from * @param phraseStart the start of the phrase in the sentence (inclusive) * @param phraseEnd the end of the phrase in the sentence (exclusive) * @param suffixArrayStart the point at which to start the search in the suffix array * @param suffixArrayEnd the end point in the suffix array beyond which the search doesn't need to * take place * @param findFirst a flag that indicates whether we should find the first or last occurrence of * the phrase */ private int findPhraseBound( Phrase sentence, int phraseStart, int phraseEnd, int suffixArrayStart, int suffixArrayEnd, boolean findFirst) { int low = suffixArrayStart; int high = suffixArrayEnd; // Do a binary search between the low and high points while (low <= high) { int mid = (low + high) >> 1; int start = suffixes[mid]; int diff = corpus.comparePhrase(start, sentence, phraseStart, phraseEnd); if (diff == 0) { // If the difference between the search phrase and the phrase in the corpus // is 0, then we have found it. 
However, there might be multiple matches in // the corpus, so we need to continue searching until we find the end point int neighbor = mid; if (findFirst) { neighbor--; } else { neighbor++; } if (neighbor >= suffixArrayStart && neighbor <= suffixArrayEnd) { int nextDiff = corpus.comparePhrase(suffixes[neighbor], sentence, phraseStart, phraseEnd); if (nextDiff == 0) { // There's another equivalent phrase, so we need to specify // in which direction to continue searching if (findFirst) diff = 1; // search lower else diff = -1; // search higher } } } if (diff < 0) { low = mid + 1; } else if (diff > 0) { high = mid - 1; } else { return mid; // this is the edge } } return -1; // key not found. }
/**
 * Extracts one fixed-length phrase from the corpus for each supplied start position.
 *
 * @param startPositions start positions in the corpus array
 * @param length length of each phrase to be extracted
 * @return the extracted phrases, in the same order as {@code startPositions}
 */
public List<Phrase> getPhrases(int[] startPositions, int length) {
  final int count = startPositions.length;
  List<Phrase> phrases = new ArrayList<Phrase>(count);
  for (int k = 0; k < count; k++) {
    int from = startPositions[k];
    int to = from + length;
    phrases.add(corpus.getPhrase(from, to));
  }
  return phrases;
}
/**
 * Looks up the sentence numbers of occurrences of the specified phrase.
 *
 * <p>One entry is produced per occurrence, taken in suffix-array order starting at the first
 * matching slot, until either all occurrences or {@code maxSentences} entries have been
 * produced.
 *
 * @param phrase the phrase to look for
 * @param maxSentences the maximum number of sentence numbers to return
 * @return the sentence numbers, or {@code null} when {@code findPhrase} returns {@code null}
 */
public int[] findSentencesContaining(Phrase phrase, int maxSentences) {
  final int[] bounds = findPhrase(phrase);
  if (bounds == null) {
    return null;
  }

  // bounds holds the inclusive first and last matching suffix-array slots.
  final int last = bounds[1];
  final int first = bounds[0];
  final int occurrences = (last - first) + 1;

  int[] sentences = new int[Math.min(maxSentences, occurrences)];
  int filled = 0;
  while (filled < sentences.length) {
    int corpusIndex = getCorpusIndex(first + filled);
    sentences[filled] = corpus.getSentenceIndex(corpusIndex);
    filled++;
  }
  return sentences;
}
/** Quick sort */ private void qsort(int[] array, int begin, int end) { if (end > begin) { int index; // partition(array, begin, end); { index = begin + RAND.nextInt(end - begin + 1); int pivot = array[index]; // swap(array, index, end); { int tmp = array[index]; array[index] = array[end]; array[end] = tmp; } for (int i = index = begin; i < end; ++i) { if (corpus.compareSuffixes(array[i], pivot, MAX_COMPARISON_LENGTH) <= 0) { // swap(array, index++, i); { int tmp = array[index]; array[index] = array[i]; array[i] = tmp; index++; } } } // swap(array, index, end); { int tmp = array[index]; array[index] = array[end]; array[end] = tmp; } } qsort(array, begin, index - 1); qsort(array, index + 1, end); } }
/**
 * Delegates to the underlying corpus to look up where a sentence starts.
 *
 * @param sentenceIndex the sentence number to look up
 * @return the corpus index that corresponds to the start of the sentence
 */
public int getSentencePosition(int sentenceIndex) {
  final int position = corpus.getSentencePosition(sentenceIndex);
  return position;
}
/**
 * Delegates to the underlying corpus to map a corpus index to its sentence.
 *
 * @param corpusIndex a position in the corpus
 * @return the sentence number corresponding to the specified corpus index
 */
public int getSentenceIndex(int corpusIndex) {
  final int sentenceNumber = corpus.getSentenceIndex(corpusIndex);
  return sentenceNumber;
}
/**
 * Returns the word ID that the corpus stores at the corpus index obtained from
 * {@code getCorpusIndex(position)}.
 *
 * @param position the position to resolve through {@code getCorpusIndex}
 * @return the word ID at the resolved corpus index
 */
protected int getWord(int position) {
  return corpus.getWordID(getCorpusIndex(position));
}
/**
 * Delegates to the underlying corpus to extract a phrase.
 *
 * @param startPosition the start index in the corpus, passed through to the corpus
 * @param endPosition the end index in the corpus, passed through to the corpus
 * @return the phrase spanning the specified indices in the corpus
 */
public Phrase getPhrase(int startPosition, int endPosition) {
  final Phrase span = corpus.getPhrase(startPosition, endPosition);
  return span;
}
/**
 * Implemented for the Corpus interface; delegates to the underlying corpus.
 *
 * @param sentenceIndex the sentence number to fetch
 * @return the sentence the underlying corpus returns for that index
 */
public Phrase getSentence(int sentenceIndex) {
  final Phrase sentence = corpus.getSentence(sentenceIndex);
  return sentence;
}
/**
 * Implemented for the Corpus interface.
 *
 * @return the value of {@code corpus.size()}
 */
public int getNumWords() {
  final int wordCount = corpus.size();
  return wordCount;
}
/**
 * Implemented for the Corpus interface.
 *
 * @return the number of sentences reported by the underlying corpus
 */
public int getNumSentences() {
  final int sentenceCount = corpus.getNumSentences();
  return sentenceCount;
}