Example #1
  /** Collect all unique non-stop words from a phrase. */
  private void appendUniqueWords(IntStack words, IntStack offsets, PhraseCandidate p) {
    assert p.cluster.phrases.size() == 1;

    final int start = words.size();
    final int[] phraseIndices = p.cluster.phrases.get(0);
    final short[] tokenTypes = context.allWords.type;
    for (int i = 0; i < phraseIndices.length; i += 2) {
      for (int j = phraseIndices[i]; j <= phraseIndices[i + 1]; j++) {
        final int termIndex = sb.input.get(j);
        if (!TokenTypeUtils.isCommon(tokenTypes[termIndex])) {
          words.push(termIndex);
        }
      }
    }

    // Sort the words; their order doesn't matter when counting subsets.
    Arrays.sort(words.buffer, start, words.size());

    // Reorder to keep only unique words.
    int j = start;
    for (int i = start + 1; i < words.size(); i++) {
      if (words.buffer[j] != words.buffer[i]) {
        words.buffer[++j] = words.buffer[i];
      }
    }
    words.elementsCount = j + 1;

    offsets.push(start, words.size() - start);
  }
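The sort-then-compact idiom above is easy to exercise in isolation. A minimal sketch on a plain int[] range (DedupSketch and dedupSortedRange are illustrative names, not part of the Carrot2 API), assuming the range is non-empty, which holds for the phrases handled above since phrases starting with a stop word are rejected in Example #5:

  import java.util.Arrays;

  class DedupSketch {
    /** Sorts buffer[start, end) and compacts duplicates in place; returns the new logical end. */
    static int dedupSortedRange(int[] buffer, int start, int end) {
      assert end > start; // Mirrors the non-empty range assumed above.
      Arrays.sort(buffer, start, end);
      int j = start;
      for (int i = start + 1; i < end; i++) {
        if (buffer[j] != buffer[i]) {
          buffer[++j] = buffer[i]; // Keep the first occurrence of each value.
        }
      }
      return j + 1;
    }
  }

For example, dedupSortedRange(new int[] {3, 1, 3, 2}, 0, 4) reorders the array to {1, 2, 3, 3} and returns 3, the end of the unique prefix.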
Example #2
  /** Merge a list of base clusters into one. */
  private ClusterCandidate merge(IntStack mergeList, List<ClusterCandidate> baseClusters) {
    assert mergeList.size() > 0;
    final ClusterCandidate result = new ClusterCandidate();

    /*
     * Merge documents from all base clusters and update the score.
     */
    for (int i = 0; i < mergeList.size(); i++) {
      final ClusterCandidate cc = baseClusters.get(mergeList.get(i));
      result.documents.or(cc.documents);
      result.score += cc.score;
    }
    result.cardinality = (int) result.documents.cardinality();

    /*
     * Combine cluster labels and try to find the best description for the cluster.
     */
    final ArrayList<PhraseCandidate> phrases = new ArrayList<PhraseCandidate>(mergeList.size());
    for (int i = 0; i < mergeList.size(); i++) {
      final ClusterCandidate cc = baseClusters.get(mergeList.get(i));
      final float coverage = cc.cardinality / (float) result.cardinality;
      phrases.add(new PhraseCandidate(cc, coverage));
    }

    markSubSuperPhrases(phrases);
    // Guava's filtered view is live: clearing it strips the not-selected
    // phrases from the underlying list.
    Collections2.filter(phrases, notSelected).clear();

    markOverlappingPhrases(phrases);
    Collections2.filter(phrases, notSelected).clear();

    Collections.sort(
        phrases,
        new Comparator<PhraseCandidate>() {
          @Override
          public int compare(PhraseCandidate p1, PhraseCandidate p2) {
            // Highest coverage first.
            return Float.compare(p2.coverage, p1.coverage);
          }
        });

    int max = maxPhrases;
    for (PhraseCandidate p : phrases) {
      if (max-- <= 0) break;
      result.phrases.add(p.cluster.phrases.get(0));
    }

    return result;
  }
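For intuition, the document-set union and the coverage ratio computed above can be reproduced with java.util.BitSet standing in for Carrot2's BitSet (a sketch under that substitution, not the library's API):

  import java.util.BitSet;
  import java.util.List;

  class MergeSketch {
    /** Union of the base clusters' document sets. */
    static BitSet unionDocuments(List<BitSet> baseDocumentSets) {
      BitSet merged = new BitSet();
      for (BitSet docs : baseDocumentSets) {
        merged.or(docs); // A document belongs to the merge if any base cluster contains it.
      }
      return merged;
    }

    /** Fraction of the merged cluster's documents covered by one base cluster. */
    static float coverage(BitSet base, BitSet merged) {
      return base.cardinality() / (float) merged.cardinality();
    }
  }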
Example #3
  /** Calculate the "effective phrase length", that is, the number of non-ignored words in the phrase. */
  final int effectivePhraseLength(IntStack path) {
    final int[] terms = sb.input.buffer;
    final int lower = ignoreWordIfInFewerDocs;
    final int upper = (int) (ignoreWordIfInHigherDocsPercent * documents.size());

    int effectivePhraseLen = 0;
    for (int i = 0; i < path.size(); i += 2) {
      for (int j = path.get(i); j <= path.get(i + 1); j++) {
        final int termIndex = terms[j];

        // If this term is a stop word, don't count it.
        if (TokenTypeUtils.isCommon(context.allWords.type[termIndex])) {
          continue;
        }

        // If this word occurs in fewer documents than the lower bound, or in
        // more than a given fraction of the input collection, don't count it.
        final int docCount = context.allWords.tfByDocument[termIndex].length / 2;
        if (docCount < lower || docCount > upper) {
          continue;
        }

        effectivePhraseLen++;
      }
    }

    return effectivePhraseLen;
  }
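The two document-frequency bounds can be read as a standalone predicate; docCount itself is tfByDocument[termIndex].length / 2 because that array interleaves (document index, frequency) pairs, hence the division by two in the code above. A sketch with illustrative names:

  class FrequencyBoundsSketch {
    /** True if a word should be ignored when measuring effective phrase length. */
    static boolean ignoreWord(int docCount, int minDocs, double maxDocsFraction, int totalDocs) {
      final int upper = (int) (maxDocsFraction * totalDocs);
      // Too rare or too common: either way the word carries little descriptive value.
      return docCount < minDocs || docCount > upper;
    }
  }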
Example #4
  /** Collect all non-stop words from a phrase, keeping duplicates. */
  private void appendWords(IntStack words, IntStack offsets, PhraseCandidate p) {
    final int start = words.size();

    final int[] phraseIndices = p.cluster.phrases.get(0);
    final short[] tokenTypes = context.allWords.type;
    for (int i = 0; i < phraseIndices.length; i += 2) {
      for (int j = phraseIndices[i]; j <= phraseIndices[i + 1]; j++) {
        final int termIndex = sb.input.get(j);
        if (!TokenTypeUtils.isCommon(tokenTypes[termIndex])) {
          words.push(termIndex);
        }
      }
    }

    offsets.push(start, words.size() - start);
  }
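Unlike appendUniqueWords in Example #1, duplicates are kept here. Both methods record each phrase as a (start, length) pair on offsets, so a consumer can slice a phrase back out of the flat words buffer; a sketch over plain arrays standing in for the IntStack buffers:

  import java.util.Arrays;

  class PhraseSliceSketch {
    /** Copies the k-th phrase's words out of the flat buffer. */
    static int[] phraseWords(int[] wordsBuffer, int[] offsetsBuffer, int k) {
      final int start = offsetsBuffer[2 * k];
      final int length = offsetsBuffer[2 * k + 1];
      return Arrays.copyOfRange(wordsBuffer, start, start + length);
    }
  }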
Example #5
  /**
   * Consider certain special cases of internal suffix tree nodes. The suffix tree may contain
   * internal nodes with paths starting or ending with a stop word (common word). We have the
   * following interesting scenarios:
   *
   * <dl>
   *   <dt>IF LEADING STOPWORD: IGNORE THE NODE.
   *   <dd>There MUST be a phrase with this stopword chopped off in the suffix tree (a suffix of
   *       this phrase) and its frequency will be just as high.
   *   <dt>IF TRAILING STOPWORDS:
   *   <dd>Check if the edge leading to the current node is composed entirely of stopwords. If so,
   *       there must be a parent node that contains non-stopwords and we can ignore the current
   *       node. Otherwise we can chop off the trailing stopwords from the current node's phrase
   *       (this phrase cannot be duplicated anywhere in the tree because if it were, there would
   *       have to be a branch somewhere in the suffix tree on the edge).
   * </dl>
   */
  final boolean checkAcceptablePhrase(IntStack path) {
    assert path.size() > 0;

    final int[] terms = sb.input.buffer;
    final short[] tokenTypes = context.allWords.type;

    // Ignore nodes that start with a stop word.
    if (TokenTypeUtils.isCommon(tokenTypes[terms[path.get(0)]])) {
      return false;
    }

    // Check the last edge of the current node.
    int i = path.get(path.size() - 2);
    int j = path.get(path.size() - 1);
    final int k = j;
    while (i <= j && TokenTypeUtils.isCommon(tokenTypes[terms[j]])) {
      j--;
    }

    if (j < i) {
      // If the edge contains only stopwords, ignore the node.
      return false;
    } else if (j < k) {
      // There have been trailing stop words on the edge. Chop them off.
      path.buffer[path.size() - 1] = j;
    }

    // Check the total phrase length (in words, including stopwords).
    int termsCount = 0;
    for (j = 0; j < path.size(); j += 2) {
      termsCount += path.get(j + 1) - path.get(j) + 1;
    }

    if (termsCount > maxDescPhraseLength) {
      return false;
    }

    return true;
  }
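The trailing-stopword chop is the subtle step, and it can be tested on its own. A hedged sketch with a boolean stopword mask standing in for the TokenTypeUtils.isCommon check on token types:

  class ChopSketch {
    /**
     * Returns the last index of the inclusive edge [i, j] after dropping trailing
     * stopwords; a result smaller than i means the edge was stopwords only.
     */
    static int chopTrailingStopwords(boolean[] isStopword, int i, int j) {
      while (i <= j && isStopword[j]) {
        j--;
      }
      return j;
    }
  }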
Example #6
  /**
   * Create final clusters by merging base clusters and pruning their labels. Cluster merging is a
   * greedy process of compacting clusters with document sets that overlap by a certain ratio. In
   * other words, phrases that "cover" nearly identical document sets will be conflated.
   */
  private ArrayList<ClusterCandidate> createMergedClusters(
      ArrayList<ClusterCandidate> baseClusters) {
    /*
     * Calculate overlap between base clusters first, saving adjacency lists for
     * each base cluster.
     */

    // Flat singly-linked adjacency lists: [i] holds the index of the next
    // pair (or END), [i + 1] the neighbor's cluster index; neighbors[k] is
    // the head of cluster k's list and initially points at the END sentinel.
    final int END = -1;
    final IntStack neighborList = new IntStack();
    neighborList.push(END);
    final int[] neighbors = new int[baseClusters.size()];
    final float m = (float) mergeThreshold;
    for (int i = 0; i < baseClusters.size(); i++) {
      for (int j = i + 1; j < baseClusters.size(); j++) {
        final ClusterCandidate c1 = baseClusters.get(i);
        final ClusterCandidate c2 = baseClusters.get(j);

        final float a = c1.cardinality;
        final float b = c2.cardinality;
        final float c = BitSet.intersectionCount(c1.documents, c2.documents);

        if (c / a > m && c / b > m) {
          neighborList.push(neighbors[i], j);
          neighbors[i] = neighborList.size() - 2;
          neighborList.push(neighbors[j], i);
          neighbors[j] = neighborList.size() - 2;
        }
      }
    }

    /*
     * Find connected components in the similarity graph using an iterative
     * depth-first search (an explicit stack instead of recursion).
     */

    final int NO_INDEX = -1;
    final int[] merged = new int[baseClusters.size()];
    Arrays.fill(merged, NO_INDEX);

    final ArrayList<ClusterCandidate> mergedClusters =
        Lists.newArrayListWithCapacity(baseClusters.size());
    final IntStack stack = new IntStack(baseClusters.size());
    final IntStack mergeList = new IntStack(baseClusters.size());
    int mergedIndex = 0;
    for (int v = 0; v < baseClusters.size(); v++) {
      if (merged[v] != NO_INDEX) continue;

      // Iteratively mark all clusters connected to an unmerged cluster
      // (depth-first, via the explicit stack).
      stack.push(v);
      while (stack.size() > 0) {
        final int c = stack.pop();

        assert merged[c] == NO_INDEX || merged[c] == mergedIndex;
        if (merged[c] == mergedIndex) continue;

        merged[c] = mergedIndex;
        mergeList.push(c);

        for (int i = neighbors[c]; neighborList.get(i) != END; ) {
          final int neighbor = neighborList.get(i + 1);
          if (merged[neighbor] == NO_INDEX) {
            stack.push(neighbor);
          } else {
            assert merged[neighbor] == mergedIndex;
          }
          i = neighborList.get(i);
        }
      }
      mergedIndex++;

      /*
       * Aggregate documents from each base cluster of the current merge, compute
       * the score and labels.
       */
      mergedClusters.add(merge(mergeList, baseClusters));
      mergeList.clear();
    }

    /*
     * Sort merged clusters.
     */
    Collections.sort(
        mergedClusters,
        new Comparator<ClusterCandidate>() {
          @Override
          public int compare(ClusterCandidate c1, ClusterCandidate c2) {
            // Highest score first; ties broken by cardinality, descending.
            if (c1.score < c2.score) return 1;
            if (c1.score > c2.score) return -1;
            if (c1.cardinality < c2.cardinality) return 1;
            if (c1.cardinality > c2.cardinality) return -1;
            return 0;
          }
        });

    if (mergedClusters.size() > maxClusters) {
      mergedClusters.subList(maxClusters, mergedClusters.size()).clear();
    }

    return mergedClusters;
  }
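The component-marking loop above is a plain iterative depth-first search. The same traversal over an ordinary adjacency list, with ArrayDeque in place of IntStack (a standalone sketch, not the Carrot2 code):

  import java.util.ArrayDeque;
  import java.util.Arrays;
  import java.util.List;

  class ComponentsSketch {
    /** Labels each vertex with the index of its connected component. */
    static int[] connectedComponents(List<List<Integer>> adjacency) {
      final int[] component = new int[adjacency.size()];
      Arrays.fill(component, -1);
      final ArrayDeque<Integer> stack = new ArrayDeque<>();
      int componentIndex = 0;
      for (int v = 0; v < adjacency.size(); v++) {
        if (component[v] != -1) continue;
        stack.push(v);
        while (!stack.isEmpty()) {
          final int c = stack.pop();
          if (component[c] != -1) continue; // Already labeled via another path.
          component[c] = componentIndex;
          for (int neighbor : adjacency.get(c)) {
            if (component[neighbor] == -1) {
              stack.push(neighbor);
            }
          }
        }
        componentIndex++;
      }
      return component;
    }
  }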