Esempio n. 1
  /**
   * Calculate "effective phrase length", that is the number of non-ignored words in the phrase.
   *
   * @param path stack read here as consecutive (start, end) pairs of inclusive positions into
   *     {@code sb.input.buffer}
   * @return the value accumulated in {@code effectivePhraseLen}
   */
  final int effectivePhraseLength(IntStack path) {
    final int[] terms = sb.input.buffer;
    // Document-count bounds used by the range check below.
    final int lower = ignoreWordIfInFewerDocs;
    final int upper = (int) (ignoreWordIfInHigherDocsPercent * documents.size());

    int effectivePhraseLen = 0;
    // Walk the path two entries at a time; each pair delimits an inclusive range [start, end].
    for (int i = 0; i < path.size(); i += 2) {
      for (int j = path.get(i); j <= path.get(i + 1); j++) {
        final int termIndex = terms[j];

        // If this term is a stop word, don't count it.
        // NOTE(review): the body of this branch is not present in this excerpt.
        if (TokenTypeUtils.isCommon(context.allWords.type[termIndex])) {

        // If this word occurs in fewer than `lower` documents or in more than `upper`
        // documents (a given fraction of the input collection), don't count it.
        // Presumably tfByDocument stores (document, frequency) pairs, hence length / 2 --
        // TODO confirm.
        final int docCount = context.allWords.tfByDocument[termIndex].length / 2;
        if (docCount < lower || docCount > upper) {


    // NOTE(review): no visible statement increments effectivePhraseLen; the increment and the
    // closing braces appear to be missing from this excerpt.
    return effectivePhraseLen;
  // Checks that IntStack.toArray() yields {3, 2, 1}.
  // NOTE(review): the statements that populate the stack are not visible in this excerpt.
  public void testToArray() {

    IntStack s = new IntStack();

    assertArrayEquals(new int[] {3, 2, 1}, s.toArray());
 /* Debugging aid: prints the given reason, the current mk and sp values,
 every element of nodes and every element of marks to standard output. */
 public void dump(String reason) {
   // NOTE(review): `a` is not used by any statement visible in this excerpt.
   int a = nodeArity();
   System.out.println("dump:" + reason);
   System.out.println("  mk:" + mk + "  sp:" + sp);
   // One line per entry in nodes.
   for (int i = 0; i < nodes.size(); i++) {
     Node n = (Node) nodes.elementAt(i);
     System.out.println("   " + n);
   // One line per entry in marks.
   for (int i = 0; i < marks.size(); i++) {
     System.out.println("   " + marks.elementAt(i));
Esempio n. 4
  /**
   * Collect all words from a phrase.
   *
   * <p>Only the first entry of {@code p.cluster.phrases} is read; it is interpreted as
   * consecutive (start, end) pairs of inclusive positions into {@code sb.input}.
   *
   * @param words stack the collected words go to
   * @param offsets receives one (start, length) pair describing the range added to {@code words}
   * @param p the phrase candidate whose words are collected
   */
  private void appendWords(IntStack words, IntStack offsets, PhraseCandidate p) {
    // Remember where this phrase's words begin so the length can be computed afterwards.
    final int start = words.size();

    final int[] phraseIndices = p.cluster.phrases.get(0);
    final short[] tokenTypes = context.allWords.type;
    for (int i = 0; i < phraseIndices.length; i += 2) {
      for (int j = phraseIndices[i]; j <= phraseIndices[i + 1]; j++) {
        final int termIndex = sb.input.get(j);
        // NOTE(review): the statement guarded by this test is not present in this excerpt;
        // presumably it pushes termIndex onto `words` for non-stop words -- confirm.
        if (!TokenTypeUtils.isCommon(tokenTypes[termIndex])) {

    // Record [start, length] of the range appended for this phrase.
    offsets.push(start, words.size() - start);
Esempio n. 5
  /**
   * Collect all unique non-stop words from a phrase.
   *
   * <p>The range appended to {@code words} is sorted and then compacted in place so that each
   * value occurs once.
   *
   * @param words stack the collected words go to; its {@code buffer} and {@code elementsCount}
   *     are modified directly
   * @param offsets receives one (start, length) pair describing the range added to {@code words}
   * @param p the phrase candidate; asserted to have exactly one entry in {@code cluster.phrases}
   */
  private void appendUniqueWords(IntStack words, IntStack offsets, PhraseCandidate p) {
    assert p.cluster.phrases.size() == 1;

    final int start = words.size();
    final int[] phraseIndices = p.cluster.phrases.get(0);
    final short[] tokenTypes = context.allWords.type;
    // phraseIndices is read as consecutive (start, end) pairs of inclusive positions.
    for (int i = 0; i < phraseIndices.length; i += 2) {
      for (int j = phraseIndices[i]; j <= phraseIndices[i + 1]; j++) {
        final int termIndex = sb.input.get(j);
        // NOTE(review): the statement guarded by this test is not present in this excerpt;
        // presumably it pushes termIndex onto `words` -- confirm.
        if (!TokenTypeUtils.isCommon(tokenTypes[termIndex])) {

    // Sort words, we don't care about their order when counting subsets.
    Arrays.sort(words.buffer, start, words.size());

    // Reorder to keep only unique words. After sorting, equal values are adjacent;
    // j is the index of the last unique value kept so far.
    int j = start;
    for (int i = start + 1; i < words.size(); i++) {
      if (words.buffer[j] != words.buffer[i]) {
        words.buffer[++j] = words.buffer[i];
    // Truncate the stack right after the last unique value.
    // NOTE(review): if nothing was appended above (words.size() == start) this sets the size
    // to start + 1; confirm that callers guarantee at least one non-stop word per phrase.
    words.elementsCount = j + 1;

    // Record [start, length] of the de-duplicated range.
    offsets.push(start, words.size() - start);
Esempio n. 6
  /**
   * Mark those phrases that overlap with other phrases by more than {@link #maxPhraseOverlap} and
   * have lower coverage.
   */
  private void markOverlappingPhrases(ArrayList<PhraseCandidate> phrases) {
    // Compares every pair of candidates and clears `selected` on the one with lower coverage
    // when the shared-word ratio relative to that candidate exceeds maxPhraseOverlap.
    final int max = phrases.size();

    // A list of all unique words for each candidate phrase.
    final IntStack words = new IntStack(maxDescPhraseLength * phrases.size());

    // Offset pairs in the words list -- a pair [start, length].
    final IntStack offsets = new IntStack(phrases.size() * 2);

    for (PhraseCandidate p : phrases) {
      appendUniqueWords(words, offsets, p);

    // Each unordered pair (i, j) with i < j is visited once.
    for (int i = 0; i < max; i++) {
      for (int j = i + 1; j < max; j++) {
        final PhraseCandidate a = phrases.get(i);
        final PhraseCandidate b = phrases.get(j);

        // Number of unique words of each candidate (the "length" half of its offset pair).
        final int a_words = offsets.get(2 * i + 1);
        final int b_words = offsets.get(2 * j + 1);

        // NOTE(review): the call that computes the intersection is truncated in this excerpt;
        // only two of its arguments (the start offsets of both ranges) are visible.
        final float intersection =
                offsets.get(2 * i),
                offsets.get(2 * j),

        // Overlap measured against b's word count; b loses if its coverage is lower.
        if ((intersection / b_words) > maxPhraseOverlap && b.coverage < a.coverage) {
          b.selected = false;

        // Symmetric check measured against a's word count.
        if ((intersection / a_words) > maxPhraseOverlap && a.coverage < b.coverage) {
          a.selected = false;
 /* A conditional node is constructed if its condition is true.  All
 the nodes that have been pushed since the node was opened are
 made children of the conditional node, which is then pushed
 on to the stack.  If the condition is false the node is not
 constructed and they are left on the stack.
 Throws ParseException if the builder throws, or if it returns null. */
 void closeNodeScope(Node n, boolean condition) throws ParseException {
   SimpleNode sn = (SimpleNode) n;
   if (condition) {
     SimpleNode newNode = null;
     try {
       // Construction is delegated to the builder with the current node arity.
       newNode = builder.closeNode(sn, nodeArity());
     } catch (ParseException exc) {
       // Parse errors pass through unchanged.
       throw exc;
     } catch (Exception exc) {
       // Any other failure is reported as a ParseException carrying its string form.
       throw new ParseException("Internal error:" + exc);
     if (newNode == null) {
       throw new ParseException("Internal AST builder error");
     // Restore the previous mark and record that a node was created.
     mk = marks.pop();
     node_created = true;
   } else {
     // Condition is false: only the mark is restored.
     mk = marks.pop();
     node_created = false;
  // Checks IntStack size bookkeeping and last-in-first-out pop order over 100 elements.
  public void testStack() {

    IntStack s = new IntStack();

    assertEquals(0, s.size());

    // NOTE(review): the loop body is not present in this excerpt; given the assertions below
    // it presumably pushes i -- confirm.
    for (int i = 0; i < 100; i++) {

    assertEquals(100, s.size());

    // Values are expected back in reverse order: 99 down to 0.
    for (int i = 99; i >= 0; i--) {
      assertEquals(i, s.pop());

    assertEquals(0, s.size());
 /* A definite node is constructed from a specified number of
 children.  That number of nodes are popped from the stack and
 made the children of the definite node.  Then the definite node
 is pushed on to the stack.
 Throws ParseException if the builder throws, or if it returns null. */
 void closeNodeScope(Node n, int num) throws ParseException {
   SimpleNode sn = (SimpleNode) n;
   // The previous mark is restored before the builder is invoked.
   mk = marks.pop();
   SimpleNode newNode = null;
   try {
     // Construction is delegated to the builder with the requested child count.
     newNode = builder.closeNode(sn, num);
   } catch (ParseException exc) {
     // Parse errors pass through unchanged.
     throw exc;
   } catch (Exception exc) {
     // Any other failure is reported as a ParseException carrying its string form.
     throw new ParseException("Internal error:" + exc);
   if (newNode == null) {
     throw new ParseException("Internal AST builder error");
   node_created = true;
Esempio n. 10
  /**
   * Consider certain special cases of internal suffix tree nodes. The suffix tree may contain
   * internal nodes with paths starting or ending with a stop word (common word). We have the
   * following interesting scenarios:
   * <dl>
   *   <dd>There MUST be a phrase with this stopword chopped off in the suffix tree (a suffix of
   *       this phrase) and its frequency will be just as high.
   *   <dd>Check if the edge leading to the current node is composed entirely of stopwords. If so,
   *       there must be a parent node that contains non-stopwords and we can ignore the current
   *       node. Otherwise we can chop off the trailing stopwords from the current node's phrase
   *       (this phrase cannot be duplicated anywhere in the tree because if it were, there would
   *       have to be a branch somewhere in the suffix tree on the edge).
   * </dl>
   */
  final boolean checkAcceptablePhrase(IntStack path) {
    // Returns false when the path starts with a stop word, when its last edge consists only of
    // stop words, or when its total length exceeds maxDescPhraseLength; otherwise true.
    // The end of the last edge in `path` may be rewritten in place to drop trailing stop words.
    assert path.size() > 0;

    final int[] terms = sb.input.buffer;
    final short[] tokenTypes = context.allWords.type;

    // Ignore nodes that start with a stop word.
    if (TokenTypeUtils.isCommon(tokenTypes[terms[path.get(0)]])) {
      return false;

    // Check the last edge of the current node.
    // i and j are the inclusive start and end of that edge; k keeps the original end.
    int i = path.get(path.size() - 2);
    int j = path.get(path.size() - 1);
    final int k = j;
    // NOTE(review): the loop body is not present in this excerpt; given the checks below it
    // presumably decrements j past each trailing stop word -- confirm.
    while (i <= j && TokenTypeUtils.isCommon(tokenTypes[terms[j]])) {

    if (j < i) {
      // If the edge contains only stopwords, ignore the node.
      return false;
    } else if (j < k) {
      // There have been trailing stop words on the edge. Chop them off.
      path.buffer[path.size() - 1] = j;

    // Check the total phrase length (in words, including stopwords).
    // Each (start, end) pair contributes end - start + 1 terms.
    int termsCount = 0;
    for (j = 0; j < path.size(); j += 2) {
      termsCount += path.get(j + 1) - path.get(j) + 1;

    if (termsCount > maxDescPhraseLength) {
      return false;

    return true;
Esempio n. 11
  /**
   * Merge a list of base clusters into one.
   *
   * @param mergeList indices into {@code baseClusters} of the clusters to merge; asserted to be
   *     non-empty
   * @param baseClusters all base clusters
   * @return a new candidate whose score is the sum of the merged clusters' scores
   */
  private ClusterCandidate merge(IntStack mergeList, List<ClusterCandidate> baseClusters) {
    assert mergeList.size() > 0;
    final ClusterCandidate result = new ClusterCandidate();

     * Merge documents from all base clusters and update the score.
    // NOTE(review): only the score update is visible in this loop; the statement that merges
    // cc's documents into result.documents appears to be missing from this excerpt.
    for (int i = 0; i < mergeList.size(); i++) {
      final ClusterCandidate cc = baseClusters.get(mergeList.get(i));
      result.score += cc.score;
    result.cardinality = (int) result.documents.cardinality();

     * Combine cluster labels and try to find the best description for the cluster.
    final ArrayList<PhraseCandidate> phrases = new ArrayList<PhraseCandidate>(mergeList.size());
    for (int i = 0; i < mergeList.size(); i++) {
      final ClusterCandidate cc = baseClusters.get(mergeList.get(i));
      // Coverage: the base cluster's cardinality as a fraction of the merged cardinality.
      final float coverage = cc.cardinality / (float) result.cardinality;
      phrases.add(new PhraseCandidate(cc, coverage));

    // NOTE(review): the two identical filter-and-clear statements below are presumably each
    // preceded by a marking pass that is not present in this excerpt -- confirm.
    Collections2.filter(phrases, notSelected).clear();

    Collections2.filter(phrases, notSelected).clear();

        // Orders phrase candidates by descending coverage.
        // NOTE(review): the call this comparator is passed to is not visible here.
        new Comparator<PhraseCandidate>() {
          public int compare(PhraseCandidate p1, PhraseCandidate p2) {
            if (p1.coverage < p2.coverage) return 1;
            if (p1.coverage > p2.coverage) return -1;
            return 0;

    // Visit at most maxPhrases candidates.
    // NOTE(review): the statement executed for each visited phrase is not visible here.
    int max = maxPhrases;
    for (PhraseCandidate p : phrases) {
      if (max-- <= 0) break;

    return result;
 /* Sets the current mark (mk) to the current stack pointer (sp).
 NOTE(review): any further statements of this method are not visible in this excerpt. */
 void openNodeScope(Node n) {
   mk = sp;
 /* Loops while sp is above the current mark, then restores the previous mark.
 NOTE(review): the loop body is not visible in this excerpt. */
 void clearNodeScope(Node n) {
   while (sp > mk) {
   mk = marks.pop();
 /* Assigns the begin line and begin column of the node returned by peekNode()
 from the values popped off the lines and columns stacks. */
 void setNodePos() {
   SimpleNode n = (SimpleNode) peekNode();
   n.beginLine = lines.pop();
   n.beginColumn = columns.pop();
 /* NOTE(review): the body of this method is not visible in this excerpt. */
 void pushNodePos(int line, int col) {
 /* Returns the node on the top of the stack, and removes it from the
 stack.  */
 Node popNode() {
   // Decrement sp first; if it drops below the current mark, restore the previous mark.
   if (--sp < mk) {
     mk = marks.pop();
   return (Node) nodes.pop();
 /* Call this to reinitialize the node stack.  It is called
 automatically by the parser's ReInit() method. */
 void reset() {
   // NOTE(review): only the sp and mk resets are visible in this excerpt.
   sp = 0;
   mk = 0;
Esempio n. 18
  /**
   * Create final clusters by merging base clusters and pruning their labels. Cluster merging is a
   * greedy process of compacting clusters with document sets that overlap by a certain ratio. In
   * other words, phrases that "cover" nearly identical document sets will be conflated.
   */
  private ArrayList<ClusterCandidate> createMergedClusters(
      ArrayList<ClusterCandidate> baseClusters) {
    // Overview of the visible steps: (1) record as neighbors every pair of base clusters whose
    // document overlap exceeds mergeThreshold relative to both clusters, (2) group clusters
    // reachable from one another and merge each group, (3) keep at most maxClusters results.
     * Calculate overlap between base clusters first, saving adjacency lists for
     * each base cluster.

    // [i] - next neighbor or END, [i + 1] - neighbor cluster index.
    final int END = -1;
    final IntStack neighborList = new IntStack();
    // neighbors[x] holds the offset in neighborList of the first list entry for cluster x.
    final int[] neighbors = new int[baseClusters.size()];
    final float m = (float) mergeThreshold;
    for (int i = 0; i < baseClusters.size(); i++) {
      for (int j = i + 1; j < baseClusters.size(); j++) {
        final ClusterCandidate c1 = baseClusters.get(i);
        final ClusterCandidate c2 = baseClusters.get(j);

        // a, b: cluster sizes; c: number of documents the two clusters share.
        final float a = c1.cardinality;
        final float b = c2.cardinality;
        final float c = BitSet.intersectionCount(c1.documents, c2.documents);

        // The shared fraction must exceed the threshold with respect to both clusters.
        if (c / a > m && c / b > m) {
          // Prepend j to i's list and i to j's list; each push stores
          // (previous head offset, neighbor index) and the new entry becomes the head.
          neighborList.push(neighbors[i], j);
          neighbors[i] = neighborList.size() - 2;
          neighborList.push(neighbors[j], i);
          neighbors[j] = neighborList.size() - 2;

     * Find connected components in the similarity graph using Tarjan's algorithm
     * (flattened to use the stack instead of recursion).

    // merged[x] is the index of the merged group cluster x was assigned to, or NO_INDEX.
    final int NO_INDEX = -1;
    final int[] merged = new int[baseClusters.size()];
    Arrays.fill(merged, NO_INDEX);

    // NOTE(review): the initializer of mergedClusters is missing from this excerpt.
    final ArrayList<ClusterCandidate> mergedClusters =
    final IntStack stack = new IntStack(baseClusters.size());
    final IntStack mergeList = new IntStack(baseClusters.size());
    int mergedIndex = 0;
    for (int v = 0; v < baseClusters.size(); v++) {
      if (merged[v] != NO_INDEX) continue;

      // Recursively mark all connected components from an unmerged cluster.
      // NOTE(review): the statement seeding `stack` with v is not visible in this excerpt.
      while (stack.size() > 0) {
        final int c = stack.pop();

        assert merged[c] == NO_INDEX || merged[c] == mergedIndex;
        // Already assigned to the current group: nothing more to do for c.
        if (merged[c] == mergedIndex) continue;

        merged[c] = mergedIndex;

        // Follow c's neighbor list until the entry at offset i equals END.
        // NOTE(review): neighbors[] is zero-initialized, so presumably an END sentinel is
        // stored at offset 0 of neighborList by a statement not visible here -- confirm.
        for (int i = neighbors[c]; neighborList.get(i) != END; ) {
          final int neighbor = neighborList.get(i + 1);
          // NOTE(review): the body of the unassigned-neighbor branch is not visible here.
          if (merged[neighbor] == NO_INDEX) {
          } else {
            assert merged[neighbor] == mergedIndex;
          i = neighborList.get(i);

       * Aggregate documents from each base cluster of the current merge, compute
       * the score and labels.
      mergedClusters.add(merge(mergeList, baseClusters));

     * Sort merged clusters.
        // Orders by descending score, then by descending cardinality.
        // NOTE(review): the call this comparator is passed to is not visible here.
        new Comparator<ClusterCandidate>() {
          public int compare(ClusterCandidate c1, ClusterCandidate c2) {
            if (c1.score < c2.score) return 1;
            if (c1.score > c2.score) return -1;
            if (c1.cardinality < c2.cardinality) return 1;
            if (c1.cardinality > c2.cardinality) return -1;
            return 0;

    // Drop everything past the first maxClusters entries.
    if (mergedClusters.size() > maxClusters) {
      mergedClusters.subList(maxClusters, mergedClusters.size()).clear();

    return mergedClusters;
Esempio n. 19
  /**
   * Leave only most general (no other phrase is a substring of this one) and most specific (no
   * other phrase is a superstring of this one) phrases.
   */
  private void markSubSuperPhrases(ArrayList<PhraseCandidate> phrases) {
    // Three passes over the candidates: clear mostGeneral / mostSpecific flags based on
    // sub-phrase containment, deselect most-general phrases whose coverage is close to that of
    // a more specific one, and deselect phrases that are neither most general nor most specific.
    final int max = phrases.size();

    // A list of all words for each candidate phrase.
    final IntStack words = new IntStack(maxDescPhraseLength * phrases.size());

    // Offset pairs in the words list -- a pair [start, length].
    final IntStack offsets = new IntStack(phrases.size() * 2);

    for (PhraseCandidate p : phrases) {
      appendWords(words, offsets, p);

     * Mark phrases that cannot be most specific or most general.
    // Every ordered pair (i, j) with i != j is examined.
    for (int i = 0; i < max; i++) {
      for (int j = 0; j < max; j++) {
        if (i == j) continue;

        // NOTE(review): the name of the search call is truncated in this excerpt; the visible
        // arguments are the [start, length] of phrase i followed by those of phrase j.
        int index =
                offsets.get(2 * i),
                offsets.get(2 * i + 1),
                offsets.get(2 * j),
                offsets.get(2 * j + 1));
        if (index >= 0) {
          // j is a subphrase of i, hence i cannot be mostGeneral and j
          // cannot be most specific.
          phrases.get(i).mostGeneral = false;
          phrases.get(j).mostSpecific = false;

     * For most general phrases, do not display them if a more specific phrase
     * exists with pretty much the same coverage.
    for (int i = 0; i < max; i++) {
      final PhraseCandidate a = phrases.get(i);
      if (!a.mostGeneral) continue;

      for (int j = 0; j < max; j++) {
        final PhraseCandidate b = phrases.get(j);
        if (i == j || !b.mostSpecific) continue;

        // Same truncated call as above, with the roles swapped: phrase j's range first,
        // then phrase i's.
        int index =
                offsets.get(2 * j),
                offsets.get(2 * j + 1),
                offsets.get(2 * i),
                offsets.get(2 * i + 1));
        if (index >= 0) {
          // Coverage gain of the general phrase is below the configured threshold.
          if (a.coverage - b.coverage < mostGeneralPhraseCoverage) {
            a.selected = false;
            // Setting j to max ends the inner loop after this iteration.
            j = max;

     * Mark phrases that should be removed from the candidate set.
    for (PhraseCandidate p : phrases) {
      if (!p.mostGeneral && !p.mostSpecific) {
        p.selected = false;