예제 #1
0
 /**
  * Renders the suffixes of the corpus, one per line, in suffix-array order.
  *
  * <p>For each suffix-array position {@code i}, the phrase running from {@code getCorpusIndex(i)}
  * to {@code corpus.size()} is fetched and its string form is appended, followed by a newline.
  * Every line therefore extends to the end of the corpus, so the result can be very large: only
  * use this on small suffix arrays.
  *
  * @return the concatenated suffixes, each terminated by {@code "\n"}; the empty string when the
  *     suffix array has no entries
  */
 @Override
 public String toString() {
   // A StringBuilder avoids re-copying the whole accumulated text on every iteration,
   // which is what repeated String concatenation in a loop would do.
   StringBuilder text = new StringBuilder();
   for (int i = 0; i < suffixes.length; i++) {
     Phrase phrase = corpus.getPhrase(getCorpusIndex(i), corpus.size());
     text.append(phrase.toString()).append("\n");
   }
   return text.toString();
 }
예제 #2
0
  /** Constructor takes a CorpusArray and creates a sorted suffix array from it. */
  public SuffixArray(CorpusArray corpusArray) {
    this.corpus = corpusArray;
    suffixes = new int[corpusArray.size()];

    // Create an array of suffix IDs
    for (int i = 0; i < corpusArray.size(); i++) {
      suffixes[i] = i;
    }
    // Sort the array of suffixes
    sort(suffixes);

    this.hierarchicalPhraseCache = new Cache<Pattern, HierarchicalPhrases>(CACHE_CAPACITY);
  }
예제 #3
0
 /**
  * Builds the auxiliary array of longest common prefixes between neighbouring suffixes.
  *
  * <p>The result has {@code suffixes.length + 1} entries. For {@code 1 <= i < suffixes.length},
  * entry {@code i} is the number of leading word IDs shared by the suffixes at suffix-array
  * positions {@code i - 1} and {@code i}. Counting stops when either suffix reaches
  * {@code size()}, when the word IDs differ, or once the count has gone past
  * {@code MAX_COMPARISON_LENGTH}; because the bound is tested with {@code <=}, the largest value
  * that can be stored is {@code MAX_COMPARISON_LENGTH + 1}. The first and last entries are 0.
  *
  * @return the longest-common-prefix array
  */
 protected int[] calculateLongestCommonPrefixes() {
   final int suffixCount = suffixes.length;
   // Java zero-fills new int arrays, so slots 0 and suffixCount are 0 without being written.
   final int[] lcp = new int[suffixCount + 1];

   for (int rank = 1; rank < suffixCount; rank++) {
     final int current = suffixes[rank];
     final int previous = suffixes[rank - 1];

     int shared = 0;
     while (current + shared < size()
         && previous + shared < size()
         && corpus.getWordID(current + shared) == corpus.getWordID(previous + shared)
         && shared <= MAX_COMPARISON_LENGTH) {
       shared++;
     }
     lcp[rank] = shared;
   }
   return lcp;
 }
예제 #4
0
  /**
   * Finds the first or last occurrence of a phrase in the suffix array, within a subset of the
   * suffix array that is bounded by suffixArrayStart and suffixArrayEnd. For efficiency of looking
   * up all subphrases in a sentence we do not require that multiple int[]s be created for each
   * subphrase. Instead this method will look for the subphrase within the sentence between
   * phraseStart and phraseEnd.
   *
   * <p>The search is a binary search driven by {@code corpus.comparePhrase}: a negative result
   * moves the search to higher suffix-array positions, a positive result to lower ones. On a
   * match, the neighbour in the requested direction is checked; if it also matches, the search
   * continues in that direction so that the returned position is the edge of the matching run.
   *
   * @param sentence the sentence/superphrase in int representation to draw the search phrase from
   * @param phraseStart the start of the phrase in the sentence (inclusive)
   * @param phraseEnd the end of the phrase in the sentence (exclusive)
   * @param suffixArrayStart the point at which to start the search in the suffix array
   *     (inclusive)
   * @param suffixArrayEnd the end point in the suffix array beyond which the search doesn't need to
   *     take place (inclusive: this position itself is examined)
   * @param findFirst a flag that indicates whether we should find the first or last occurrence of
   *     the phrase
   * @return the suffix-array position of the first (if {@code findFirst}) or last matching suffix
   *     within the given bounds, or -1 if no suffix in that range matches
   */
  private int findPhraseBound(
      Phrase sentence,
      int phraseStart,
      int phraseEnd,
      int suffixArrayStart,
      int suffixArrayEnd,
      boolean findFirst) {
    int low = suffixArrayStart;
    int high = suffixArrayEnd;

    // Do a binary search between the low and high points
    while (low <= high) {
      // Unsigned shift: the midpoint stays correct even when low + high overflows int,
      // whereas a signed shift would yield a negative index in that case.
      int mid = (low + high) >>> 1;
      int diff = corpus.comparePhrase(suffixes[mid], sentence, phraseStart, phraseEnd);
      if (diff == 0) {
        // If the difference between the search phrase and the phrase in the corpus
        // is 0, then we have found it.  However, there might be multiple matches in
        // the corpus, so we need to continue searching until we find the end point
        int neighbor = findFirst ? mid - 1 : mid + 1;
        if (neighbor >= suffixArrayStart && neighbor <= suffixArrayEnd) {
          int nextDiff = corpus.comparePhrase(suffixes[neighbor], sentence, phraseStart, phraseEnd);
          if (nextDiff == 0) {
            // There's another equivalent phrase, so we need to specify
            // in which direction to continue searching:
            // positive -> search lower, negative -> search higher
            diff = findFirst ? 1 : -1;
          }
        }
      }
      if (diff < 0) {
        low = mid + 1;
      } else if (diff > 0) {
        high = mid - 1;
      } else {
        return mid; // this is the edge
      }
    }
    return -1; // key not found.
  }
예제 #5
0
  /**
   * Extracts one fixed-length phrase per start position.
   *
   * <p>Each phrase is obtained from the corpus as the span from {@code start} to
   * {@code start + length}; the result keeps the order of {@code startPositions}.
   *
   * @param startPositions start positions in the corpus array
   * @param length length of each phrase to be extracted
   * @return a new list with one phrase per entry of {@code startPositions}
   */
  public List<Phrase> getPhrases(int[] startPositions, int length) {
    final int count = startPositions.length;
    final List<Phrase> phrases = new ArrayList<Phrase>(count);

    for (int k = 0; k < count; k++) {
      final int from = startPositions[k];
      final int to = from + length;
      phrases.add(corpus.getPhrase(from, to));
    }

    return phrases;
  }
예제 #6
0
  /**
   * Returns the sentence numbers of occurrences of the specified phrase.
   *
   * <p>The phrase is located with {@code findPhrase}, whose result is treated as an inclusive
   * pair of suffix-array bounds. Occurrences are visited in suffix-array order starting at the
   * lower bound, and each one is mapped to its sentence number. No de-duplication is performed,
   * so one sentence number is reported per occurrence.
   *
   * @param phrase the phrase to look for
   * @param maxSentences the maximum number of sentences to return
   * @return an array of at most {@code maxSentences} sentence numbers, or {@code null} when
   *     {@code findPhrase} returns {@code null}
   * @throws NegativeArraySizeException if {@code maxSentences} is negative
   */
  public int[] findSentencesContaining(Phrase phrase, int maxSentences) {
    final int[] bounds = findPhrase(phrase);
    if (bounds == null) {
      return null;
    }

    final int last = bounds[1];
    final int first = bounds[0];
    final int occurrences = (last - first) + 1;
    final int limit = Math.min(maxSentences, occurrences);

    final int[] sentenceNumbers = new int[limit];
    for (int offset = 0; offset < limit; offset++) {
      final int corpusIndex = getCorpusIndex(first + offset);
      sentenceNumbers[offset] = corpus.getSentenceIndex(corpusIndex);
    }
    return sentenceNumbers;
  }
예제 #7
0
  /**
   * Sorts {@code array[begin..end]} (both inclusive) in place with a randomized quicksort.
   *
   * <p>Elements are suffix start positions and are ordered with
   * {@code corpus.compareSuffixes(a, b, MAX_COMPARISON_LENGTH)}. A pivot is drawn at random from
   * the range using {@code RAND}; every element that compares {@code <= 0} against the pivot ends
   * up to its left, and the two sides are then sorted recursively. Swaps are written out inline
   * rather than delegated to a helper.
   *
   * @param array the suffix positions to sort
   * @param begin index of the first element of the range
   * @param end index of the last element of the range
   */
  private void qsort(int[] array, int begin, int end) {
    if (end <= begin) {
      return; // zero or one element: nothing to do
    }

    // Pick a random pivot and park it in the last slot of the range.
    final int chosen = begin + RAND.nextInt(end - begin + 1);
    final int pivot = array[chosen];
    array[chosen] = array[end];
    array[end] = pivot;

    // Sweep the rest of the range, growing the "not greater than pivot" prefix.
    int boundary = begin;
    for (int scan = begin; scan < end; ++scan) {
      if (corpus.compareSuffixes(array[scan], pivot, MAX_COMPARISON_LENGTH) <= 0) {
        final int held = array[boundary];
        array[boundary] = array[scan];
        array[scan] = held;
        boundary++;
      }
    }

    // Drop the pivot into its final place, just past the prefix.
    final int displaced = array[boundary];
    array[boundary] = array[end];
    array[end] = displaced;

    qsort(array, begin, boundary - 1);
    qsort(array, boundary + 1, end);
  }
예제 #8
0
 /**
  * Looks up where a sentence starts, by delegating to the underlying corpus.
  *
  * @param sentenceIndex the number of the sentence
  * @return the corpus index that corresponds to the start of the sentence
  */
 public int getSentencePosition(int sentenceIndex) {
   final int sentenceStart = corpus.getSentencePosition(sentenceIndex);
   return sentenceStart;
 }
예제 #9
0
 /**
  * Maps a corpus index to its sentence, by delegating to the underlying corpus.
  *
  * @param corpusIndex an index into the corpus
  * @return the sentence number corresponding to the specified corpus index
  */
 public int getSentenceIndex(int corpusIndex) {
   final int sentenceNumber = corpus.getSentenceIndex(corpusIndex);
   return sentenceNumber;
 }
예제 #10
0
 /**
  * Returns the word ID found in the corpus at the corpus index that {@code getCorpusIndex}
  * yields for the given position.
  *
  * @param position a position accepted by {@code getCorpusIndex}
  * @return the word ID the corpus reports at the resulting corpus index
  */
 protected int getWord(int position) {
   return corpus.getWordID(getCorpusIndex(position));
 }
예제 #11
0
 /**
  * Fetches a phrase from the underlying corpus.
  *
  * @param startPosition first bound of the span, passed through to the corpus
  * @param endPosition second bound of the span, passed through to the corpus
  * @return the phrase spanning the specified indices in the corpus
  */
 public Phrase getPhrase(int startPosition, int endPosition) {
   final Phrase span = corpus.getPhrase(startPosition, endPosition);
   return span;
 }
예제 #12
0
 /**
  * Implemented for the Corpus interface; delegates to the underlying corpus.
  *
  * @param sentenceIndex the number of the sentence to fetch
  * @return the sentence the corpus returns for {@code sentenceIndex}
  */
 public Phrase getSentence(int sentenceIndex) {
   final Phrase sentence = corpus.getSentence(sentenceIndex);
   return sentence;
 }
예제 #13
0
 /**
  * Implemented for the Corpus interface; reports the size of the underlying corpus.
  *
  * @return the value of {@code corpus.size()}
  */
 public int getNumWords() {
   final int wordCount = corpus.size();
   return wordCount;
 }
예제 #14
0
 /**
  * Implemented for the Corpus interface; delegates to the underlying corpus.
  *
  * @return the number of sentences reported by the corpus
  */
 public int getNumSentences() {
   final int sentenceCount = corpus.getNumSentences();
   return sentenceCount;
 }