/** Calculate "effective phrase length", that is the number of non-ignored words in the phrase. */ final int effectivePhraseLength(IntStack path) { final int[] terms = sb.input.buffer; final int lower = ignoreWordIfInFewerDocs; final int upper = (int) (ignoreWordIfInHigherDocsPercent * documents.size()); int effectivePhraseLen = 0; for (int i = 0; i < path.size(); i += 2) { for (int j = path.get(i); j <= path.get(i + 1); j++) { final int termIndex = terms[j]; // If this term is a stop word, don't count it. if (TokenTypeUtils.isCommon(context.allWords.type[termIndex])) { continue; } // If this word occurs in more than a given fraction of the input // collection don't count it. final int docCount = context.allWords.tfByDocument[termIndex].length / 2; if (docCount < lower || docCount > upper) { continue; } effectivePhraseLen++; } } return effectivePhraseLen; }
/** Collect all non-stop words from a phrase. */
private void appendWords(IntStack words, IntStack offsets, PhraseCandidate p) {
  final int start = words.size();

  final int[] phraseIndices = p.cluster.phrases.get(0);
  final short[] tokenTypes = context.allWords.type;
  for (int i = 0; i < phraseIndices.length; i += 2) {
    for (int j = phraseIndices[i]; j <= phraseIndices[i + 1]; j++) {
      final int termIndex = sb.input.get(j);
      if (!TokenTypeUtils.isCommon(tokenTypes[termIndex])) {
        words.push(termIndex);
      }
    }
  }

  offsets.push(start, words.size() - start);
}
/** Collect all unique non-stop words from a phrase. */
private void appendUniqueWords(IntStack words, IntStack offsets, PhraseCandidate p) {
  assert p.cluster.phrases.size() == 1;

  final int start = words.size();

  final int[] phraseIndices = p.cluster.phrases.get(0);
  final short[] tokenTypes = context.allWords.type;
  for (int i = 0; i < phraseIndices.length; i += 2) {
    for (int j = phraseIndices[i]; j <= phraseIndices[i + 1]; j++) {
      final int termIndex = sb.input.get(j);
      if (!TokenTypeUtils.isCommon(tokenTypes[termIndex])) {
        words.push(termIndex);
      }
    }
  }

  // Sort the words; their order does not matter when counting subsets.
  Arrays.sort(words.buffer, start, words.size());

  // Compact the sorted range in place, keeping only unique words.
  int j = start;
  for (int i = start + 1; i < words.size(); i++) {
    if (words.buffer[j] != words.buffer[i]) {
      words.buffer[++j] = words.buffer[i];
    }
  }
  words.elementsCount = j + 1;

  offsets.push(start, words.size() - start);
}
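// Illustration only (hypothetical helper, not part of the algorithm): the same
// sort-then-compact idiom as above, applied to a plain int array. For example,
// {17, 3, 17, 42, 3} compacts to {3, 17, 42}.
private static int[] sortedUniqueDemo(int[] values) {
  final int[] sorted = values.clone();
  if (sorted.length == 0) {
    return sorted;
  }
  Arrays.sort(sorted);
  int j = 0;
  for (int i = 1; i < sorted.length; i++) {
    if (sorted[j] != sorted[i]) {
      sorted[++j] = sorted[i];
    }
  }
  return Arrays.copyOf(sorted, j + 1);
}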
/**
 * Mark those phrases that overlap with other phrases by more than {@link #maxPhraseOverlap}
 * and have lower coverage.
 */
private void markOverlappingPhrases(ArrayList<PhraseCandidate> phrases) {
  final int max = phrases.size();

  // A list of all unique words for each candidate phrase.
  final IntStack words = new IntStack(maxDescPhraseLength * phrases.size());

  // Offset pairs in the words list -- a pair [start, length].
  final IntStack offsets = new IntStack(phrases.size() * 2);

  for (PhraseCandidate p : phrases) {
    appendUniqueWords(words, offsets, p);
  }

  for (int i = 0; i < max; i++) {
    for (int j = i + 1; j < max; j++) {
      final PhraseCandidate a = phrases.get(i);
      final PhraseCandidate b = phrases.get(j);

      final int a_words = offsets.get(2 * i + 1);
      final int b_words = offsets.get(2 * j + 1);

      final float intersection =
          computeIntersection(
              words.buffer, offsets.get(2 * i), a_words,
              words.buffer, offsets.get(2 * j), b_words);

      if ((intersection / b_words) > maxPhraseOverlap && b.coverage < a.coverage) {
        b.selected = false;
      }

      if ((intersection / a_words) > maxPhraseOverlap && a.coverage < b.coverage) {
        a.selected = false;
      }
    }
  }
}
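// The overlap ratio above uses a computeIntersection helper that is not shown in this
// section. A minimal sketch of such a helper (an assumption, not the original
// implementation), counting elements common to two ranges that are sorted the way
// appendUniqueWords leaves them, could look like this:
private static int computeIntersectionSketch(int[] b1, int s1, int l1, int[] b2, int s2, int l2) {
  int i = s1, j = s2, common = 0;
  final int e1 = s1 + l1, e2 = s2 + l2;
  // Linear merge scan over both sorted ranges.
  while (i < e1 && j < e2) {
    if (b1[i] < b2[j]) {
      i++;
    } else if (b1[i] > b2[j]) {
      j++;
    } else {
      common++;
      i++;
      j++;
    }
  }
  return common;
}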
/**
 * Consider certain special cases of internal suffix tree nodes. The suffix tree may contain
 * internal nodes with paths starting or ending with a stop word (common word). We have the
 * following interesting scenarios:
 *
 * <dl>
 *   <dt>IF LEADING STOPWORD: IGNORE THE NODE.
 *   <dd>There MUST be a phrase with this stopword chopped off in the suffix tree (a suffix of
 *       this phrase) and its frequency will be just as high.
 *   <dt>IF TRAILING STOPWORDS:
 *   <dd>Check if the edge leading to the current node is composed entirely of stopwords. If
 *       so, there must be a parent node that contains non-stopwords and we can ignore the
 *       current node. Otherwise we can chop off the trailing stopwords from the current
 *       node's phrase (this phrase cannot be duplicated anywhere in the tree because if it
 *       were, there would have to be a branch somewhere in the suffix tree on the edge).
 * </dl>
 */
final boolean checkAcceptablePhrase(IntStack path) {
  assert path.size() > 0;

  final int[] terms = sb.input.buffer;
  final short[] tokenTypes = context.allWords.type;

  // Ignore nodes that start with a stop word.
  if (TokenTypeUtils.isCommon(tokenTypes[terms[path.get(0)]])) {
    return false;
  }

  // Check the last edge of the current node.
  int i = path.get(path.size() - 2);
  int j = path.get(path.size() - 1);
  final int k = j;
  while (i <= j && TokenTypeUtils.isCommon(tokenTypes[terms[j]])) {
    j--;
  }

  if (j < i) {
    // If the edge contains only stopwords, ignore the node.
    return false;
  } else if (j < k) {
    // There have been trailing stop words on the edge. Chop them off.
    path.buffer[path.size() - 1] = j;
  }

  // Check the total phrase length (in words, including stopwords).
  int termsCount = 0;
  for (j = 0; j < path.size(); j += 2) {
    termsCount += path.get(j + 1) - path.get(j) + 1;
  }
  if (termsCount > maxDescPhraseLength) {
    return false;
  }

  return true;
}
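// Illustration only (hypothetical helper, not part of the algorithm): the trailing-stopword
// handling above, applied to a simplified path representation with a plain boolean stop-word
// flag per token. For isStopWord = {false, false, true, true} and path = {0, 3} (one edge
// covering tokens 0..3), the last pair is trimmed to {0, 1}; an edge made solely of stop
// words yields false ("ignore the node").
private static boolean chopTrailingStopwordsDemo(int[] path, boolean[] isStopWord) {
  int i = path[path.length - 2];
  int j = path[path.length - 1];
  final int k = j;
  while (i <= j && isStopWord[j]) {
    j--;
  }
  if (j < i) {
    return false; // The edge contains only stop words: ignore the node.
  }
  if (j < k) {
    path[path.length - 1] = j; // Chop the trailing stop words off the last edge.
  }
  return true;
}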
/** Merge a list of base clusters into one. */
private ClusterCandidate merge(IntStack mergeList, List<ClusterCandidate> baseClusters) {
  assert mergeList.size() > 0;
  final ClusterCandidate result = new ClusterCandidate();

  /*
   * Merge documents from all base clusters and update the score.
   */
  for (int i = 0; i < mergeList.size(); i++) {
    final ClusterCandidate cc = baseClusters.get(mergeList.get(i));
    result.documents.or(cc.documents);
    result.score += cc.score;
  }
  result.cardinality = (int) result.documents.cardinality();

  /*
   * Combine cluster labels and try to find the best description for the cluster.
   */
  final ArrayList<PhraseCandidate> phrases = new ArrayList<PhraseCandidate>(mergeList.size());
  for (int i = 0; i < mergeList.size(); i++) {
    final ClusterCandidate cc = baseClusters.get(mergeList.get(i));
    final float coverage = cc.cardinality / (float) result.cardinality;
    phrases.add(new PhraseCandidate(cc, coverage));
  }

  markSubSuperPhrases(phrases);
  Collections2.filter(phrases, notSelected).clear();

  markOverlappingPhrases(phrases);
  Collections2.filter(phrases, notSelected).clear();

  // Sort the remaining phrase candidates by descending coverage.
  Collections.sort(
      phrases,
      new Comparator<PhraseCandidate>() {
        public int compare(PhraseCandidate p1, PhraseCandidate p2) {
          if (p1.coverage < p2.coverage) return 1;
          if (p1.coverage > p2.coverage) return -1;
          return 0;
        }
      });

  int max = maxPhrases;
  for (PhraseCandidate p : phrases) {
    if (max-- <= 0) break;
    result.phrases.add(p.cluster.phrases.get(0));
  }

  return result;
}
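// The two Collections2.filter(phrases, notSelected).clear() calls above rely on Guava's live
// filtered view: clearing the view removes every element matching the predicate from the
// backing list. The notSelected predicate itself is not shown in this section; a plausible
// definition (an assumption, not necessarily the original field) is simply the negation of a
// candidate's 'selected' flag:
private final com.google.common.base.Predicate<PhraseCandidate> notSelected =
    new com.google.common.base.Predicate<PhraseCandidate>() {
      public boolean apply(PhraseCandidate p) {
        return !p.selected;
      }
    };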
/**
 * Leave only most general (no other phrase is a substring of this one) and most specific (no
 * other phrase is a superstring of this one) phrases.
 */
private void markSubSuperPhrases(ArrayList<PhraseCandidate> phrases) {
  final int max = phrases.size();

  // A list of all words for each candidate phrase.
  final IntStack words = new IntStack(maxDescPhraseLength * phrases.size());

  // Offset pairs in the words list -- a pair [start, length].
  final IntStack offsets = new IntStack(phrases.size() * 2);

  for (PhraseCandidate p : phrases) {
    appendWords(words, offsets, p);
  }

  /*
   * Mark phrases that cannot be most specific or most general.
   */
  for (int i = 0; i < max; i++) {
    for (int j = 0; j < max; j++) {
      if (i == j) continue;

      int index =
          indexOf(
              words.buffer, offsets.get(2 * i), offsets.get(2 * i + 1),
              words.buffer, offsets.get(2 * j), offsets.get(2 * j + 1));
      if (index >= 0) {
        // j is a subphrase of i, hence i cannot be most general and j cannot be most
        // specific.
        phrases.get(i).mostGeneral = false;
        phrases.get(j).mostSpecific = false;
      }
    }
  }

  /*
   * For most general phrases, do not display them if a more specific phrase
   * exists with pretty much the same coverage.
   */
  for (int i = 0; i < max; i++) {
    final PhraseCandidate a = phrases.get(i);
    if (!a.mostGeneral) continue;

    for (int j = 0; j < max; j++) {
      final PhraseCandidate b = phrases.get(j);
      if (i == j || !b.mostSpecific) continue;

      int index =
          indexOf(
              words.buffer, offsets.get(2 * j), offsets.get(2 * j + 1),
              words.buffer, offsets.get(2 * i), offsets.get(2 * i + 1));
      if (index >= 0) {
        if (a.coverage - b.coverage < mostGeneralPhraseCoverage) {
          a.selected = false;
          j = max;
        }
      }
    }
  }

  /*
   * Mark phrases that should be removed from the candidate set.
   */
  for (PhraseCandidate p : phrases) {
    if (!p.mostGeneral && !p.mostSpecific) {
      p.selected = false;
    }
  }
}
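// The subphrase tests above delegate to an indexOf helper that is not shown in this section.
// A minimal sketch (an assumption, not the original implementation): return the position at
// which the word sequence b2[s2 .. s2+l2) first occurs as a contiguous run inside
// b1[s1 .. s1+l1), or -1 if it does not occur.
private static int indexOfSketch(int[] b1, int s1, int l1, int[] b2, int s2, int l2) {
  if (l2 > l1) {
    return -1;
  }
  for (int i = s1; i <= s1 + l1 - l2; i++) {
    int j = 0;
    while (j < l2 && b1[i + j] == b2[s2 + j]) {
      j++;
    }
    if (j == l2) {
      return i; // Full match of the needle starting at position i.
    }
  }
  return -1;
}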
/**
 * Create final clusters by merging base clusters and pruning their labels. Cluster merging is
 * a greedy process of compacting clusters with document sets that overlap by a certain ratio.
 * In other words, phrases that "cover" nearly identical document sets will be conflated.
 */
private ArrayList<ClusterCandidate> createMergedClusters(
    ArrayList<ClusterCandidate> baseClusters) {
  /*
   * Calculate overlap between base clusters first, saving adjacency lists for
   * each base cluster.
   */

  // Flat linked adjacency list: [i] - index of the next entry or END, [i + 1] - neighbor
  // cluster index.
  final int END = -1;
  final IntStack neighborList = new IntStack();
  neighborList.push(END);
  final int[] neighbors = new int[baseClusters.size()];
  final float m = (float) mergeThreshold;
  for (int i = 0; i < baseClusters.size(); i++) {
    for (int j = i + 1; j < baseClusters.size(); j++) {
      final ClusterCandidate c1 = baseClusters.get(i);
      final ClusterCandidate c2 = baseClusters.get(j);

      final float a = c1.cardinality;
      final float b = c2.cardinality;
      final float c = BitSet.intersectionCount(c1.documents, c2.documents);

      if (c / a > m && c / b > m) {
        neighborList.push(neighbors[i], j);
        neighbors[i] = neighborList.size() - 2;
        neighborList.push(neighbors[j], i);
        neighbors[j] = neighborList.size() - 2;
      }
    }
  }

  /*
   * Find connected components in the similarity graph using a depth-first search
   * (flattened to use an explicit stack instead of recursion).
   */
  final int NO_INDEX = -1;
  final int[] merged = new int[baseClusters.size()];
  Arrays.fill(merged, NO_INDEX);

  final ArrayList<ClusterCandidate> mergedClusters =
      Lists.newArrayListWithCapacity(baseClusters.size());
  final IntStack stack = new IntStack(baseClusters.size());
  final IntStack mergeList = new IntStack(baseClusters.size());
  int mergedIndex = 0;
  for (int v = 0; v < baseClusters.size(); v++) {
    if (merged[v] != NO_INDEX) continue;

    // Mark all clusters connected to this still-unmerged cluster, using the explicit stack.
    stack.push(v);
    while (stack.size() > 0) {
      final int c = stack.pop();
      assert merged[c] == NO_INDEX || merged[c] == mergedIndex;
      if (merged[c] == mergedIndex) continue;

      merged[c] = mergedIndex;
      mergeList.push(c);

      for (int i = neighbors[c]; neighborList.get(i) != END; ) {
        final int neighbor = neighborList.get(i + 1);
        if (merged[neighbor] == NO_INDEX) {
          stack.push(neighbor);
        } else {
          assert merged[neighbor] == mergedIndex;
        }
        i = neighborList.get(i);
      }
    }
    mergedIndex++;

    /*
     * Aggregate documents from each base cluster of the current merge, compute
     * the score and labels.
     */
    mergedClusters.add(merge(mergeList, baseClusters));
    mergeList.clear();
  }

  /*
   * Sort merged clusters by decreasing score, then by decreasing cardinality.
   */
  Collections.sort(
      mergedClusters,
      new Comparator<ClusterCandidate>() {
        public int compare(ClusterCandidate c1, ClusterCandidate c2) {
          if (c1.score < c2.score) return 1;
          if (c1.score > c2.score) return -1;
          if (c1.cardinality < c2.cardinality) return 1;
          if (c1.cardinality > c2.cardinality) return -1;
          return 0;
        }
      });

  if (mergedClusters.size() > maxClusters) {
    mergedClusters.subList(maxClusters, mergedClusters.size()).clear();
  }

  return mergedClusters;
}
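// Illustration only (hypothetical helper, not part of the algorithm): the flat linked
// adjacency-list encoding used in createMergedClusters, shown with ArrayList instead of
// IntStack. Each entry occupies two cells: [index of the next entry, neighbor cluster
// index]; index 0 holds the END sentinel that terminates every list, and neighbors[]
// stores each cluster's list head (0 means an empty list). Linking clusters 0 and 2 and
// then reading cluster 0's neighbors yields [2].
private static List<Integer> adjacencyEncodingDemo() {
  final int END = -1;
  final List<Integer> neighborList = new ArrayList<Integer>();
  neighborList.add(END); // Sentinel terminator.
  final int[] neighbors = new int[3]; // Head pointers, initially 0 (the sentinel).

  // Record the undirected edge 0 -- 2.
  neighborList.add(neighbors[0]); // next pointer (old head of 0's list)
  neighborList.add(2); // neighbor index
  neighbors[0] = neighborList.size() - 2;
  neighborList.add(neighbors[2]);
  neighborList.add(0);
  neighbors[2] = neighborList.size() - 2;

  // Traverse cluster 0's neighbors, following next pointers until the sentinel is reached.
  final List<Integer> result = new ArrayList<Integer>();
  for (int i = neighbors[0]; neighborList.get(i) != END; i = neighborList.get(i)) {
    result.add(neighborList.get(i + 1));
  }
  return result; // [2]
}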