Ejemplo n.º 1
0
  /** Collect all unique non-stop word from a phrase. */
  private void appendUniqueWords(IntStack words, IntStack offsets, PhraseCandidate p) {
    assert p.cluster.phrases.size() == 1;

    final int start = words.size();
    final int[] phraseIndices = p.cluster.phrases.get(0);
    final short[] tokenTypes = context.allWords.type;
    for (int i = 0; i < phraseIndices.length; i += 2) {
      for (int j = phraseIndices[i]; j <= phraseIndices[i + 1]; j++) {
        final int termIndex = sb.input.get(j);
        if (!TokenTypeUtils.isCommon(tokenTypes[termIndex])) {
          words.push(termIndex);
        }
      }
    }

    // Sort words, we don't care about their order when counting subsets.
    Arrays.sort(words.buffer, start, words.size());

    // Reorder to keep only unique words.
    int j = start;
    for (int i = start + 1; i < words.size(); i++) {
      if (words.buffer[j] != words.buffer[i]) {
        words.buffer[++j] = words.buffer[i];
      }
    }
    words.elementsCount = j + 1;

    offsets.push(start, words.size() - start);
  }
Ejemplo n.º 2
0
  /** Collect all words from a phrase. */
  private void appendWords(IntStack words, IntStack offsets, PhraseCandidate p) {
    final int start = words.size();

    final int[] phraseIndices = p.cluster.phrases.get(0);
    final short[] tokenTypes = context.allWords.type;
    for (int i = 0; i < phraseIndices.length; i += 2) {
      for (int j = phraseIndices[i]; j <= phraseIndices[i + 1]; j++) {
        final int termIndex = sb.input.get(j);
        if (!TokenTypeUtils.isCommon(tokenTypes[termIndex])) {
          words.push(termIndex);
        }
      }
    }

    offsets.push(start, words.size() - start);
  }
Ejemplo n.º 3
0
  /**
   * Create final clusters by merging base clusters and pruning their labels. Cluster merging is a
   * greedy process of compacting clusters with document sets that overlap by a certain ratio. In
   * other words, phrases that "cover" nearly identical document sets will be conflated.
   */
  private ArrayList<ClusterCandidate> createMergedClusters(
      ArrayList<ClusterCandidate> baseClusters) {
    /*
     * Calculate overlap between base clusters first, saving adjacency lists for
     * each base cluster.
     */

    // [i] - next neighbor or END, [i + 1] - neighbor cluster index.
    final int END = -1;
    final IntStack neighborList = new IntStack();
    neighborList.push(END);
    final int[] neighbors = new int[baseClusters.size()];
    final float m = (float) mergeThreshold;
    for (int i = 0; i < baseClusters.size(); i++) {
      for (int j = i + 1; j < baseClusters.size(); j++) {
        final ClusterCandidate c1 = baseClusters.get(i);
        final ClusterCandidate c2 = baseClusters.get(j);

        final float a = c1.cardinality;
        final float b = c2.cardinality;
        final float c = BitSet.intersectionCount(c1.documents, c2.documents);

        if (c / a > m && c / b > m) {
          neighborList.push(neighbors[i], j);
          neighbors[i] = neighborList.size() - 2;
          neighborList.push(neighbors[j], i);
          neighbors[j] = neighborList.size() - 2;
        }
      }
    }

    /*
     * Find connected components in the similarity graph using Tarjan's algorithm
     * (flattened to use the stack instead of recursion).
     */

    final int NO_INDEX = -1;
    final int[] merged = new int[baseClusters.size()];
    Arrays.fill(merged, NO_INDEX);

    final ArrayList<ClusterCandidate> mergedClusters =
        Lists.newArrayListWithCapacity(baseClusters.size());
    final IntStack stack = new IntStack(baseClusters.size());
    final IntStack mergeList = new IntStack(baseClusters.size());
    int mergedIndex = 0;
    for (int v = 0; v < baseClusters.size(); v++) {
      if (merged[v] != NO_INDEX) continue;

      // Recursively mark all connected components from an unmerged cluster.
      stack.push(v);
      while (stack.size() > 0) {
        final int c = stack.pop();

        assert merged[c] == NO_INDEX || merged[c] == mergedIndex;
        if (merged[c] == mergedIndex) continue;

        merged[c] = mergedIndex;
        mergeList.push(c);

        for (int i = neighbors[c]; neighborList.get(i) != END; ) {
          final int neighbor = neighborList.get(i + 1);
          if (merged[neighbor] == NO_INDEX) {
            stack.push(neighbor);
          } else {
            assert merged[neighbor] == mergedIndex;
          }
          i = neighborList.get(i);
        }
      }
      mergedIndex++;

      /*
       * Aggregate documents from each base cluster of the current merge, compute
       * the score and labels.
       */
      mergedClusters.add(merge(mergeList, baseClusters));
      mergeList.clear();
    }

    /*
     * Sort merged clusters.
     */
    Collections.sort(
        mergedClusters,
        new Comparator<ClusterCandidate>() {
          public int compare(ClusterCandidate c1, ClusterCandidate c2) {
            if (c1.score < c2.score) return 1;
            if (c1.score > c2.score) return -1;
            if (c1.cardinality < c2.cardinality) return 1;
            if (c1.cardinality > c2.cardinality) return -1;
            return 0;
          };
        });

    if (mergedClusters.size() > maxClusters) {
      mergedClusters.subList(maxClusters, mergedClusters.size()).clear();
    }

    return mergedClusters;
  }