/** * Mark those phrases that overlap with other phrases by more than {@link #maxPhraseOverlap} and * have lower coverage. */ private void markOverlappingPhrases(ArrayList<PhraseCandidate> phrases) { final int max = phrases.size(); // A list of all unique words for each candidate phrase. final IntStack words = new IntStack(maxDescPhraseLength * phrases.size()); // Offset pairs in the words list -- a pair [start, length]. final IntStack offsets = new IntStack(phrases.size() * 2); for (PhraseCandidate p : phrases) { appendUniqueWords(words, offsets, p); } for (int i = 0; i < max; i++) { for (int j = i + 1; j < max; j++) { final PhraseCandidate a = phrases.get(i); final PhraseCandidate b = phrases.get(j); final int a_words = offsets.get(2 * i + 1); final int b_words = offsets.get(2 * j + 1); final float intersection = computeIntersection( words.buffer, offsets.get(2 * i), a_words, words.buffer, offsets.get(2 * j), b_words); if ((intersection / b_words) > maxPhraseOverlap && b.coverage < a.coverage) { b.selected = false; } if ((intersection / a_words) > maxPhraseOverlap && a.coverage < b.coverage) { a.selected = false; } } } }
/** * Leave only most general (no other phrase is a substring of this one) and most specific (no * other phrase is a superstring of this one) phrases. */ private void markSubSuperPhrases(ArrayList<PhraseCandidate> phrases) { final int max = phrases.size(); // A list of all words for each candidate phrase. final IntStack words = new IntStack(maxDescPhraseLength * phrases.size()); // Offset pairs in the words list -- a pair [start, length]. final IntStack offsets = new IntStack(phrases.size() * 2); for (PhraseCandidate p : phrases) { appendWords(words, offsets, p); } /* * Mark phrases that cannot be most specific or most general. */ for (int i = 0; i < max; i++) { for (int j = 0; j < max; j++) { if (i == j) continue; int index = indexOf( words.buffer, offsets.get(2 * i), offsets.get(2 * i + 1), words.buffer, offsets.get(2 * j), offsets.get(2 * j + 1)); if (index >= 0) { // j is a subphrase of i, hence i cannot be mostGeneral and j // cannot be most specific. phrases.get(i).mostGeneral = false; phrases.get(j).mostSpecific = false; } } } /* * For most general phrases, do not display them if a more specific phrase * exists with pretty much the same coverage. */ for (int i = 0; i < max; i++) { final PhraseCandidate a = phrases.get(i); if (!a.mostGeneral) continue; for (int j = 0; j < max; j++) { final PhraseCandidate b = phrases.get(j); if (i == j || !b.mostSpecific) continue; int index = indexOf( words.buffer, offsets.get(2 * j), offsets.get(2 * j + 1), words.buffer, offsets.get(2 * i), offsets.get(2 * i + 1)); if (index >= 0) { if (a.coverage - b.coverage < mostGeneralPhraseCoverage) { a.selected = false; j = max; } } } } /* * Mark phrases that should be removed from the candidate set. */ for (PhraseCandidate p : phrases) { if (!p.mostGeneral && !p.mostSpecific) { p.selected = false; } } }