예제 #1
0
  /**
   * Deselects every phrase that shares more than {@link #maxPhraseOverlap} of its unique words
   * with another phrase whose coverage is strictly higher.
   *
   * <p>Each unordered pair of candidates is examined once. The overlap ratio is computed
   * separately for both members of the pair (shared words divided by that member's own unique
   * word count), so either one, or neither, may end up with {@code selected} set to
   * {@code false}. Phrases with equal coverage never deselect each other.
   *
   * @param phrases candidate phrases; their {@code selected} flags are updated in place
   */
  private void markOverlappingPhrases(ArrayList<PhraseCandidate> phrases) {
    final int count = phrases.size();

    // Flat storage of the unique words of every candidate, one candidate after another.
    final IntStack uniqueWords = new IntStack(maxDescPhraseLength * count);

    // For candidate k: spans[2k] is the start in uniqueWords, spans[2k + 1] is the length.
    final IntStack spans = new IntStack(count * 2);

    for (int k = 0; k < count; k++) {
      appendUniqueWords(uniqueWords, spans, phrases.get(k));
    }

    for (int first = 0; first < count; first++) {
      final PhraseCandidate a = phrases.get(first);

      for (int second = first + 1; second < count; second++) {
        final PhraseCandidate b = phrases.get(second);

        final int aLength = spans.get(2 * first + 1);
        final int bLength = spans.get(2 * second + 1);

        // Kept as a float so that the divisions below are floating-point divisions.
        final float shared =
            computeIntersection(
                uniqueWords.buffer,
                spans.get(2 * first),
                aLength,
                uniqueWords.buffer,
                spans.get(2 * second),
                bLength);

        // Fraction of each phrase's own unique words that also occur in the other phrase.
        final float overlapOfB = shared / bLength;
        final float overlapOfA = shared / aLength;

        if (overlapOfB > maxPhraseOverlap && a.coverage > b.coverage) {
          b.selected = false;
        }

        if (overlapOfA > maxPhraseOverlap && b.coverage > a.coverage) {
          a.selected = false;
        }
      }
    }
  }
예제 #2
0
  /**
   * Keeps only the most general phrases (no other phrase is a subphrase of them) and the most
   * specific phrases (no other phrase contains them); everything in between is deselected.
   *
   * <p>The method works in three passes:
   *
   * <ol>
   *   <li>For every ordered pair, if one phrase occurs inside the other, the containing phrase
   *       loses its {@code mostGeneral} flag and the contained phrase loses its
   *       {@code mostSpecific} flag.
   *   <li>A phrase that is still most general is deselected when some most specific phrase
   *       contains it and the coverage difference between the two is below
   *       {@code mostGeneralPhraseCoverage}.
   *   <li>Any phrase that is neither most general nor most specific is deselected.
   * </ol>
   *
   * @param phrases candidate phrases; their {@code mostGeneral}, {@code mostSpecific} and
   *     {@code selected} flags are updated in place
   */
  private void markSubSuperPhrases(ArrayList<PhraseCandidate> phrases) {
    final int count = phrases.size();

    // Flat storage of the words of every candidate, one candidate after another.
    final IntStack allWords = new IntStack(maxDescPhraseLength * count);

    // For candidate k: spans[2k] is the start in allWords, spans[2k + 1] is the length.
    final IntStack spans = new IntStack(count * 2);

    for (int k = 0; k < count; k++) {
      appendWords(allWords, spans, phrases.get(k));
    }

    // Pass 1: rule out phrases that cannot be most general or most specific.
    for (int outer = 0; outer < count; outer++) {
      for (int inner = 0; inner < count; inner++) {
        if (outer != inner) {
          final int position =
              indexOf(
                  allWords.buffer,
                  spans.get(2 * outer),
                  spans.get(2 * outer + 1),
                  allWords.buffer,
                  spans.get(2 * inner),
                  spans.get(2 * inner + 1));

          if (position >= 0) {
            // "inner" occurs within "outer": the longer phrase is not most general and the
            // shorter one is not most specific.
            phrases.get(outer).mostGeneral = false;
            phrases.get(inner).mostSpecific = false;
          }
        }
      }
    }

    // Pass 2: hide a most general phrase when a more specific phrase containing it has
    // pretty much the same coverage.
    for (int g = 0; g < count; g++) {
      final PhraseCandidate general = phrases.get(g);
      if (general.mostGeneral) {
        for (int s = 0; s < count; s++) {
          final PhraseCandidate specific = phrases.get(s);
          if (g != s && specific.mostSpecific) {
            final int position =
                indexOf(
                    allWords.buffer,
                    spans.get(2 * s),
                    spans.get(2 * s + 1),
                    allWords.buffer,
                    spans.get(2 * g),
                    spans.get(2 * g + 1));

            if (position >= 0
                && general.coverage - specific.coverage < mostGeneralPhraseCoverage) {
              general.selected = false;
              // One qualifying specific phrase is enough; stop scanning for this phrase.
              break;
            }
          }
        }
      }
    }

    // Pass 3: drop phrases that are neither most general nor most specific.
    for (PhraseCandidate candidate : phrases) {
      if (!(candidate.mostGeneral || candidate.mostSpecific)) {
        candidate.selected = false;
      }
    }
  }