Java TokenTypeUtils примеры использования

Язык программирования: Java

Пространство имен/Пакет: org.carrot2.text.analysis

Класс/Тип: TokenTypeUtils

Примеров на hotexamples.com: 7

Java TokenTypeUtils - 7 примеров найдено. Это лучшие примеры Java кода для org.carrot2.text.analysis.TokenTypeUtils, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

isCommon(7)

Пример #1

Показать файл

Файл: STCClusteringAlgorithm.java Проект: nkabir/carrot2

  /** Build the cluster's label from suffix tree edge indices. */
  private String buildLabel(int[] phraseIndices) {
    final boolean joinWithSpace =
        context.language.getLanguageCode() != LanguageCode.CHINESE_SIMPLIFIED;

    // Count the number of terms first.
    int termsCount = 0;
    for (int j = 0; j < phraseIndices.length; j += 2) {
      termsCount += phraseIndices[j + 1] - phraseIndices[j] + 1;
    }

    // Extract terms info for the phrase and construct the label.
    final boolean[] stopwords = new boolean[termsCount];
    final char[][] images = new char[termsCount][];
    final short[] tokenTypes = context.allWords.type;

    int k = 0;
    for (int i = 0; i < phraseIndices.length; i += 2) {
      for (int j = phraseIndices[i]; j <= phraseIndices[i + 1]; j++, k++) {
        final int termIndex = sb.input.get(j);
        images[k] = context.allWords.image[termIndex];
        stopwords[k] = TokenTypeUtils.isCommon(tokenTypes[termIndex]);
      }
    }

    return LabelFormatter.format(images, stopwords, joinWithSpace);
  }

Пример #2

Показать файл

Файл: STCClusteringAlgorithm.java Проект: nkabir/carrot2

  /** Calculate "effective phrase length", that is the number of non-ignored words in the phrase. */
  final int effectivePhraseLength(IntStack path) {
    final int[] terms = sb.input.buffer;
    final int lower = ignoreWordIfInFewerDocs;
    final int upper = (int) (ignoreWordIfInHigherDocsPercent * documents.size());

    int effectivePhraseLen = 0;
    for (int i = 0; i < path.size(); i += 2) {
      for (int j = path.get(i); j <= path.get(i + 1); j++) {
        final int termIndex = terms[j];

        // If this term is a stop word, don't count it.
        if (TokenTypeUtils.isCommon(context.allWords.type[termIndex])) {
          continue;
        }

        // If this word occurs in more than a given fraction of the input
        // collection don't count it.
        final int docCount = context.allWords.tfByDocument[termIndex].length / 2;
        if (docCount < lower || docCount > upper) {
          continue;
        }

        effectivePhraseLen++;
      }
    }

    return effectivePhraseLen;
  }

Пример #3

Показать файл

Файл: STCClusteringAlgorithm.java Проект: nkabir/carrot2

  /** Collect all unique non-stop word from a phrase. */
  private void appendUniqueWords(IntStack words, IntStack offsets, PhraseCandidate p) {
    assert p.cluster.phrases.size() == 1;

    final int start = words.size();
    final int[] phraseIndices = p.cluster.phrases.get(0);
    final short[] tokenTypes = context.allWords.type;
    for (int i = 0; i < phraseIndices.length; i += 2) {
      for (int j = phraseIndices[i]; j <= phraseIndices[i + 1]; j++) {
        final int termIndex = sb.input.get(j);
        if (!TokenTypeUtils.isCommon(tokenTypes[termIndex])) {
          words.push(termIndex);
        }
      }
    }

    // Sort words, we don't care about their order when counting subsets.
    Arrays.sort(words.buffer, start, words.size());

    // Reorder to keep only unique words.
    int j = start;
    for (int i = start + 1; i < words.size(); i++) {
      if (words.buffer[j] != words.buffer[i]) {
        words.buffer[++j] = words.buffer[i];
      }
    }
    words.elementsCount = j + 1;

    offsets.push(start, words.size() - start);
  }

Пример #4

Показать файл

Файл: TermDocumentMatrixBuilder.java Проект: renodim/carrot2

  /**
   * Computes stem indices of words that are one-word label candidates or are non-stop words from
   * phrase label candidates.
   */
  private int[] computeRequiredStemIndices(PreprocessingContext context) {
    final int[] labelsFeatureIndex = context.allLabels.featureIndex;
    final int[] wordsStemIndex = context.allWords.stemIndex;
    final short[] wordsTypes = context.allWords.type;
    final int[][] phrasesWordIndices = context.allPhrases.wordIndices;
    final int wordCount = wordsStemIndex.length;

    final int[][] stemsTfByDocument = context.allStems.tfByDocument;
    int documentCount = context.documents.size();
    final BitSet requiredStemIndices = new BitSet(labelsFeatureIndex.length);

    for (int i = 0; i < labelsFeatureIndex.length; i++) {
      final int featureIndex = labelsFeatureIndex[i];
      if (featureIndex < wordCount) {
        addStemIndex(
            wordsStemIndex, documentCount, stemsTfByDocument, requiredStemIndices, featureIndex);
      } else {
        final int[] wordIndices = phrasesWordIndices[featureIndex - wordCount];
        for (int j = 0; j < wordIndices.length; j++) {
          final int wordIndex = wordIndices[j];
          if (!TokenTypeUtils.isCommon(wordsTypes[wordIndex])) {
            addStemIndex(
                wordsStemIndex, documentCount, stemsTfByDocument, requiredStemIndices, wordIndex);
          }
        }
      }
    }

    return requiredStemIndices.asIntLookupContainer().toArray();
  }

Пример #5

Показать файл

Файл: STCClusteringAlgorithm.java Проект: nkabir/carrot2

  /**
   * Consider certain special cases of internal suffix tree nodes. The suffix tree may contain
   * internal nodes with paths starting or ending with a stop word (common word). We have the
   * following interesting scenarios:
   *
   * <dl>
   *   <dt>IF LEADING STOPWORD: IGNORE THE NODE.
   *   <dd>There MUST be a phrase with this stopword chopped off in the suffix tree (a suffix of
   *       this phrase) and its frequency will be just as high.
   *   <dt>IF TRAILING STOPWORDS:
   *   <dd>Check if the edge leading to the current node is composed entirely of stopwords. If so,
   *       there must be a parent node that contains non-stopwords and we can ignore the current
   *       node. Otherwise we can chop off the trailing stopwords from the current node's phrase
   *       (this phrase cannot be duplicated anywhere in the tree because if it were, there would
   *       have to be a branch somewhere in the suffix tree on the edge).
   * </dl>
   */
  final boolean checkAcceptablePhrase(IntStack path) {
    assert path.size() > 0;

    final int[] terms = sb.input.buffer;
    final short[] tokenTypes = context.allWords.type;

    // Ignore nodes that start with a stop word.
    if (TokenTypeUtils.isCommon(tokenTypes[terms[path.get(0)]])) {
      return false;
    }

    // Check the last edge of the current node.
    int i = path.get(path.size() - 2);
    int j = path.get(path.size() - 1);
    final int k = j;
    while (i <= j && TokenTypeUtils.isCommon(tokenTypes[terms[j]])) {
      j--;
    }

    if (j < i) {
      // If the edge contains only stopwords, ignore the node.
      return false;
    } else if (j < k) {
      // There have been trailing stop words on the edge. Chop them off.
      path.buffer[path.size() - 1] = j;
    }

    // Check the total phrase length (in words, including stopwords).
    int termsCount = 0;
    for (j = 0; j < path.size(); j += 2) {
      termsCount += path.get(j + 1) - path.get(j) + 1;
    }

    if (termsCount > maxDescPhraseLength) {
      return false;
    }

    return true;
  }

Пример #6

Показать файл

Файл: STCClusteringAlgorithm.java Проект: nkabir/carrot2

  /** Collect all words from a phrase. */
  private void appendWords(IntStack words, IntStack offsets, PhraseCandidate p) {
    final int start = words.size();

    final int[] phraseIndices = p.cluster.phrases.get(0);
    final short[] tokenTypes = context.allWords.type;
    for (int i = 0; i < phraseIndices.length; i += 2) {
      for (int j = phraseIndices[i]; j <= phraseIndices[i + 1]; j++) {
        final int termIndex = sb.input.get(j);
        if (!TokenTypeUtils.isCommon(tokenTypes[termIndex])) {
          words.push(termIndex);
        }
      }
    }

    offsets.push(start, words.size() - start);
  }

Пример #7

Показать файл

Файл: STCClusteringAlgorithm.java Проект: nkabir/carrot2

  /**
   * Build a cluster's label from suffix tree edge indices, including some debugging and diagnostic
   * information.
   */
  @SuppressWarnings("unused")
  private String buildDebugLabel(int[] phraseIndices) {
    final StringBuilder b = new StringBuilder();

    String sep = "";
    int k = 0;
    final short[] tokenTypes = context.allWords.type;
    for (int i = 0; i < phraseIndices.length; i += 2) {
      for (int j = phraseIndices[i]; j <= phraseIndices[i + 1]; j++, k++) {
        b.append(sep);

        final int termIndex = sb.input.get(j);
        b.append(context.allWords.image[termIndex]);

        if (TokenTypeUtils.isCommon(tokenTypes[termIndex])) b.append("[S]");
        sep = " ";
      }
      sep = "_";
    }

    return b.toString();
  }