Пример #1
0
  /**
   * Builds a cluster label from suffix tree edge indices.
   *
   * @param phraseIndices consecutive pairs of inclusive {@code [first, last]} positions into
   *     {@code sb.input}; each pair is one edge of the phrase
   * @return the label produced by {@link LabelFormatter} for the terms at those positions
   */
  private String buildLabel(int[] phraseIndices) {
    // Terms are joined with a space for every language except simplified Chinese.
    final boolean spaceSeparated =
        context.language.getLanguageCode() != LanguageCode.CHINESE_SIMPLIFIED;

    // First pass: total number of terms over all edges, so the arrays can be sized exactly.
    int totalTerms = 0;
    for (int edge = 0; edge < phraseIndices.length; edge += 2) {
      final int edgeLength = phraseIndices[edge + 1] - phraseIndices[edge] + 1;
      totalTerms += edgeLength;
    }

    final boolean[] commonFlags = new boolean[totalTerms];
    final char[][] termImages = new char[totalTerms][];
    final short[] wordTypes = context.allWords.type;

    // Second pass: resolve every position to its word and record its image and stop-word flag.
    int slot = 0;
    for (int edge = 0; edge < phraseIndices.length; edge += 2) {
      final int first = phraseIndices[edge];
      final int last = phraseIndices[edge + 1];
      for (int position = first; position <= last; position++) {
        final int wordIndex = sb.input.get(position);
        termImages[slot] = context.allWords.image[wordIndex];
        commonFlags[slot] = TokenTypeUtils.isCommon(wordTypes[wordIndex]);
        slot++;
      }
    }

    return LabelFormatter.format(termImages, commonFlags, spaceSeparated);
  }
Пример #2
0
  /**
   * Calculates the "effective phrase length": the number of words on the path that are neither
   * stop words nor outside the accepted document-frequency band.
   *
   * @param path consecutive pairs of inclusive {@code [first, last]} positions into the input
   *     buffer
   * @return the number of counted (non-ignored) words
   */
  final int effectivePhraseLength(IntStack path) {
    final int[] input = sb.input.buffer;
    final int minDocs = ignoreWordIfInFewerDocs;
    final int maxDocs = (int) (ignoreWordIfInHigherDocsPercent * documents.size());

    int counted = 0;
    for (int edge = 0; edge < path.size(); edge += 2) {
      for (int position = path.get(edge); position <= path.get(edge + 1); position++) {
        final int wordIndex = input[position];

        // Stop words never contribute to the effective length.
        final boolean stopWord = TokenTypeUtils.isCommon(context.allWords.type[wordIndex]);
        if (!stopWord) {
          // Half the length of the word's tfByDocument array is used as its document count
          // (presumably the array holds (document, frequency) pairs -- confirm).
          final int docCount = context.allWords.tfByDocument[wordIndex].length / 2;

          // Count only words whose document count lies within [minDocs, maxDocs].
          if (docCount >= minDocs && docCount <= maxDocs) {
            counted++;
          }
        }
      }
    }

    return counted;
  }
Пример #3
0
  /**
   * Collects all unique non-stop words from a phrase and appends them, sorted, to {@code words}.
   *
   * <p>The pair {@code (start, count)} describing the appended slice is pushed onto {@code
   * offsets}. If the phrase contains no non-stop words, nothing is appended and the recorded
   * count is zero.
   *
   * @param words stack receiving the sorted, de-duplicated word indices
   * @param offsets stack receiving the {@code (start, count)} pair of the appended slice
   * @param p phrase candidate whose cluster is expected to hold exactly one phrase
   */
  private void appendUniqueWords(IntStack words, IntStack offsets, PhraseCandidate p) {
    assert p.cluster.phrases.size() == 1;

    final int start = words.size();
    final int[] phraseIndices = p.cluster.phrases.get(0);
    final short[] tokenTypes = context.allWords.type;
    for (int i = 0; i < phraseIndices.length; i += 2) {
      for (int j = phraseIndices[i]; j <= phraseIndices[i + 1]; j++) {
        final int termIndex = sb.input.get(j);
        if (!TokenTypeUtils.isCommon(tokenTypes[termIndex])) {
          words.push(termIndex);
        }
      }
    }

    // Only compact when something was appended. Without this guard the epilogue would set the
    // element count to start + 1 even for an empty slice, exposing a slot that was never written.
    final int end = words.size();
    if (end > start) {
      // Sort words, we don't care about their order when counting subsets.
      Arrays.sort(words.buffer, start, end);

      // Compact in place so that each distinct word is kept exactly once.
      int lastUnique = start;
      for (int i = start + 1; i < end; i++) {
        if (words.buffer[lastUnique] != words.buffer[i]) {
          words.buffer[++lastUnique] = words.buffer[i];
        }
      }
      words.elementsCount = lastUnique + 1;
    }

    offsets.push(start, words.size() - start);
  }
  /**
   * Computes the stem indices needed by the label candidates: the stems of one-word labels and
   * the stems of the non-stop words that make up phrase labels.
   *
   * @param context preprocessing context providing labels, words, phrases, stems and documents
   * @return the collected stem indices as an array
   */
  private int[] computeRequiredStemIndices(PreprocessingContext context) {
    final int[] featureIndices = context.allLabels.featureIndex;
    final int[] stemOfWord = context.allWords.stemIndex;
    final short[] typeOfWord = context.allWords.type;
    final int[][] wordsOfPhrase = context.allPhrases.wordIndices;
    final int wordCount = stemOfWord.length;

    final int[][] stemTfByDocument = context.allStems.tfByDocument;
    final int documentCount = context.documents.size();
    final BitSet required = new BitSet(featureIndices.length);

    for (final int feature : featureIndices) {
      // Feature indices below wordCount denote single words; the rest denote phrases,
      // offset by wordCount.
      if (feature >= wordCount) {
        for (final int wordIndex : wordsOfPhrase[feature - wordCount]) {
          if (TokenTypeUtils.isCommon(typeOfWord[wordIndex])) {
            // Stop words inside a phrase do not contribute a stem.
            continue;
          }
          addStemIndex(stemOfWord, documentCount, stemTfByDocument, required, wordIndex);
        }
      } else {
        addStemIndex(stemOfWord, documentCount, stemTfByDocument, required, feature);
      }
    }

    return required.asIntLookupContainer().toArray();
  }
Пример #5
0
  /**
   * Decides whether the phrase on the given suffix tree path is acceptable, handling paths that
   * start or end with a stop word (common word):
   *
   * <dl>
   *   <dt>Leading stop word
   *   <dd>The node is rejected. A suffix of this phrase with the stop word chopped off must
   *       also be present in the suffix tree, with a frequency that is just as high.
   *   <dt>Trailing stop words
   *   <dd>If the last edge consists entirely of stop words the node is rejected, because a
   *       parent node already covers the non-stop-word part. Otherwise the trailing stop words
   *       are chopped off the last edge; the shortened phrase cannot be duplicated elsewhere in
   *       the tree, since that would require a branch on this edge.
   * </dl>
   *
   * <p>Finally, phrases longer than {@code maxDescPhraseLength} words (stop words included) are
   * rejected.
   *
   * @param path non-empty stack of inclusive {@code [first, last]} position pairs; the end of
   *     its last pair may be modified in place when trailing stop words are trimmed
   * @return {@code true} if the (possibly trimmed) phrase is acceptable
   */
  final boolean checkAcceptablePhrase(IntStack path) {
    assert path.size() > 0;

    final int[] input = sb.input.buffer;
    final short[] wordTypes = context.allWords.type;

    // A phrase starting with a stop word is never accepted.
    final boolean startsWithStopWord = TokenTypeUtils.isCommon(wordTypes[input[path.get(0)]]);
    if (startsWithStopWord) {
      return false;
    }

    // Walk backwards over the last edge, skipping trailing stop words.
    final int edgeFirst = path.get(path.size() - 2);
    final int edgeLast = path.get(path.size() - 1);
    int lastKept = edgeLast;
    while (edgeFirst <= lastKept && TokenTypeUtils.isCommon(wordTypes[input[lastKept]])) {
      lastKept--;
    }

    // The whole edge consisted of stop words: ignore the node.
    if (lastKept < edgeFirst) {
      return false;
    }

    // Some trailing stop words were found: shorten the last edge in place.
    if (lastKept < edgeLast) {
      path.buffer[path.size() - 1] = lastKept;
    }

    // Total phrase length in words, stop words included.
    int totalTerms = 0;
    for (int edge = 0; edge < path.size(); edge += 2) {
      totalTerms += path.get(edge + 1) - path.get(edge) + 1;
    }

    return totalTerms <= maxDescPhraseLength;
  }
Пример #6
0
  /**
   * Appends the non-stop words of a phrase to {@code words}, in order of occurrence and without
   * removing duplicates, and pushes the {@code (start, count)} pair of the appended slice onto
   * {@code offsets}.
   *
   * @param words stack receiving the word indices
   * @param offsets stack receiving the {@code (start, count)} pair of the appended slice
   * @param p phrase candidate; only the first phrase of its cluster is read
   */
  private void appendWords(IntStack words, IntStack offsets, PhraseCandidate p) {
    final int sliceStart = words.size();

    final int[] ranges = p.cluster.phrases.get(0);
    final short[] wordTypes = context.allWords.type;
    for (int edge = 0; edge < ranges.length; edge += 2) {
      final int first = ranges[edge];
      final int last = ranges[edge + 1];
      for (int position = first; position <= last; position++) {
        final int wordIndex = sb.input.get(position);
        if (TokenTypeUtils.isCommon(wordTypes[wordIndex])) {
          // Stop words are left out of the collected slice.
          continue;
        }
        words.push(wordIndex);
      }
    }

    final int sliceLength = words.size() - sliceStart;
    offsets.push(sliceStart, sliceLength);
  }
Пример #7
0
  /**
   * Builds a diagnostic label from suffix tree edge indices. Terms on the same edge are
   * separated by a space, consecutive edges by an underscore, and every stop word is followed
   * by the marker {@code [S]}.
   *
   * @param phraseIndices consecutive pairs of inclusive {@code [first, last]} positions into
   *     {@code sb.input}
   * @return the debug label
   */
  @SuppressWarnings("unused")
  private String buildDebugLabel(int[] phraseIndices) {
    final short[] wordTypes = context.allWords.type;
    final StringBuilder label = new StringBuilder();

    // Separator to emit before the next term; empty before the very first one.
    String separator = "";
    for (int edge = 0; edge < phraseIndices.length; edge += 2) {
      final int first = phraseIndices[edge];
      final int last = phraseIndices[edge + 1];
      for (int position = first; position <= last; position++) {
        final int wordIndex = sb.input.get(position);

        label.append(separator);
        label.append(context.allWords.image[wordIndex]);
        if (TokenTypeUtils.isCommon(wordTypes[wordIndex])) {
          label.append("[S]");
        }

        separator = " ";
      }

      // Crossing to the next edge: mark the boundary with an underscore.
      separator = "_";
    }

    return label.toString();
  }