/** Collect all unique non-stop word from a phrase. */ private void appendUniqueWords(IntStack words, IntStack offsets, PhraseCandidate p) { assert p.cluster.phrases.size() == 1; final int start = words.size(); final int[] phraseIndices = p.cluster.phrases.get(0); final short[] tokenTypes = context.allWords.type; for (int i = 0; i < phraseIndices.length; i += 2) { for (int j = phraseIndices[i]; j <= phraseIndices[i + 1]; j++) { final int termIndex = sb.input.get(j); if (!TokenTypeUtils.isCommon(tokenTypes[termIndex])) { words.push(termIndex); } } } // Sort words, we don't care about their order when counting subsets. Arrays.sort(words.buffer, start, words.size()); // Reorder to keep only unique words. int j = start; for (int i = start + 1; i < words.size(); i++) { if (words.buffer[j] != words.buffer[i]) { words.buffer[++j] = words.buffer[i]; } } words.elementsCount = j + 1; offsets.push(start, words.size() - start); }
/**
 * Appends the words of a phrase, in order of occurrence and including repetitions, to
 * {@code words}; terms whose token type {@link TokenTypeUtils#isCommon} reports as common are
 * left out.
 *
 * <p>Only the first entry of {@code p.cluster.phrases} is read. It is interpreted as consecutive
 * pairs of inclusive {@code [from, to]} positions into {@code sb.input}. Once all pairs have been
 * processed, the pair {@code (start, count)} is pushed on {@code offsets}: {@code start} is the
 * size of {@code words} on entry, {@code count} the number of terms appended by this call.
 */
private void appendWords(IntStack words, IntStack offsets, PhraseCandidate p) {
  final int firstSlot = words.size();
  final int[] ranges = p.cluster.phrases.get(0);
  final short[] types = context.allWords.type;

  for (int pair = 0; pair < ranges.length; pair += 2) {
    int position = ranges[pair];
    while (position <= ranges[pair + 1]) {
      final int term = sb.input.get(position);
      position++;
      if (TokenTypeUtils.isCommon(types[term])) {
        // Common terms do not contribute to the phrase's word list.
        continue;
      }
      words.push(term);
    }
  }

  final int appended = words.size() - firstSlot;
  offsets.push(firstSlot, appended);
}
/** * Create final clusters by merging base clusters and pruning their labels. Cluster merging is a * greedy process of compacting clusters with document sets that overlap by a certain ratio. In * other words, phrases that "cover" nearly identical document sets will be conflated. */ private ArrayList<ClusterCandidate> createMergedClusters( ArrayList<ClusterCandidate> baseClusters) { /* * Calculate overlap between base clusters first, saving adjacency lists for * each base cluster. */ // [i] - next neighbor or END, [i + 1] - neighbor cluster index. final int END = -1; final IntStack neighborList = new IntStack(); neighborList.push(END); final int[] neighbors = new int[baseClusters.size()]; final float m = (float) mergeThreshold; for (int i = 0; i < baseClusters.size(); i++) { for (int j = i + 1; j < baseClusters.size(); j++) { final ClusterCandidate c1 = baseClusters.get(i); final ClusterCandidate c2 = baseClusters.get(j); final float a = c1.cardinality; final float b = c2.cardinality; final float c = BitSet.intersectionCount(c1.documents, c2.documents); if (c / a > m && c / b > m) { neighborList.push(neighbors[i], j); neighbors[i] = neighborList.size() - 2; neighborList.push(neighbors[j], i); neighbors[j] = neighborList.size() - 2; } } } /* * Find connected components in the similarity graph using Tarjan's algorithm * (flattened to use the stack instead of recursion). */ final int NO_INDEX = -1; final int[] merged = new int[baseClusters.size()]; Arrays.fill(merged, NO_INDEX); final ArrayList<ClusterCandidate> mergedClusters = Lists.newArrayListWithCapacity(baseClusters.size()); final IntStack stack = new IntStack(baseClusters.size()); final IntStack mergeList = new IntStack(baseClusters.size()); int mergedIndex = 0; for (int v = 0; v < baseClusters.size(); v++) { if (merged[v] != NO_INDEX) continue; // Recursively mark all connected components from an unmerged cluster. 
stack.push(v); while (stack.size() > 0) { final int c = stack.pop(); assert merged[c] == NO_INDEX || merged[c] == mergedIndex; if (merged[c] == mergedIndex) continue; merged[c] = mergedIndex; mergeList.push(c); for (int i = neighbors[c]; neighborList.get(i) != END; ) { final int neighbor = neighborList.get(i + 1); if (merged[neighbor] == NO_INDEX) { stack.push(neighbor); } else { assert merged[neighbor] == mergedIndex; } i = neighborList.get(i); } } mergedIndex++; /* * Aggregate documents from each base cluster of the current merge, compute * the score and labels. */ mergedClusters.add(merge(mergeList, baseClusters)); mergeList.clear(); } /* * Sort merged clusters. */ Collections.sort( mergedClusters, new Comparator<ClusterCandidate>() { public int compare(ClusterCandidate c1, ClusterCandidate c2) { if (c1.score < c2.score) return 1; if (c1.score > c2.score) return -1; if (c1.cardinality < c2.cardinality) return 1; if (c1.cardinality > c2.cardinality) return -1; return 0; }; }); if (mergedClusters.size() > maxClusters) { mergedClusters.subList(maxClusters, mergedClusters.size()).clear(); } return mergedClusters; }