コード例 #1
0
  public TopicScores getDistanceFromUniform() {
    int[] tokensPerTopic = model.tokensPerTopic;

    TopicScores scores = new TopicScores("uniform_dist", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    int numTypes = alphabet.size();

    for (int topic = 0; topic < numTopics; topic++) {

      double topicScore = 0.0;
      int position = 0;
      TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

      for (IDSorter info : sortedWords) {
        int type = info.getID();
        double count = info.getWeight();

        double score =
            (count / tokensPerTopic[topic]) * Math.log((count * numTypes) / tokensPerTopic[topic]);

        if (position < numTopWords) {
          scores.setTopicWordScore(topic, position, score);
        }

        topicScore += score;
        position++;
      }

      scores.setTopicScore(topic, topicScore);
    }

    return scores;
  }
コード例 #2
0
  /** Low-quality topics may have words that are also prominent in other topics. */
  public TopicScores getExclusivity() {

    int[] tokensPerTopic = model.tokensPerTopic;

    TopicScores scores = new TopicScores("exclusivity", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    double sumDefaultProbs = 0.0;
    for (int topic = 0; topic < numTopics; topic++) {
      sumDefaultProbs += model.beta / (model.betaSum + tokensPerTopic[topic]);
    }

    for (int topic = 0; topic < numTopics; topic++) {

      double topicScore = 0.0;
      int position = 0;
      TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

      for (IDSorter info : sortedWords) {
        int type = info.getID();
        double count = info.getWeight();

        double sumTypeProbs = sumDefaultProbs;
        int[] topicCounts = model.typeTopicCounts[type];

        int index = 0;
        while (index < topicCounts.length && topicCounts[index] > 0) {

          int otherTopic = topicCounts[index] & model.topicMask;
          int otherCount = topicCounts[index] >> model.topicBits;

          // We've already accounted for the smoothing parameter,
          //  now we need to add the actual count for the non-zero
          //  topics.
          sumTypeProbs += ((double) otherCount) / (model.betaSum + tokensPerTopic[otherTopic]);

          index++;
        }

        double score =
            ((model.beta + count) / (model.betaSum + tokensPerTopic[topic])) / sumTypeProbs;
        scores.setTopicWordScore(topic, position, score);
        topicScore += score;

        position++;
        if (position == numTopWords) {
          break;
        }
      }

      scores.setTopicScore(topic, topicScore / numTopWords);
    }

    return scores;
  }
コード例 #3
0
  public TopicModelDiagnostics(ParallelTopicModel model, int numTopWords) {
    numTopics = model.getNumTopics();
    this.numTopWords = numTopWords;

    this.model = model;

    alphabet = model.getAlphabet();
    topicSortedWords = model.getSortedWords();

    topicTopWords = new String[numTopics][numTopWords];

    numRank1Documents = new int[numTopics];
    numNonZeroDocuments = new int[numTopics];
    numDocumentsAtProportions = new int[numTopics][DEFAULT_DOC_PROPORTIONS.length];
    sumCountTimesLogCount = new double[numTopics];

    diagnostics = new ArrayList<TopicScores>();

    for (int topic = 0; topic < numTopics; topic++) {

      int position = 0;
      TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

      // How many words should we report? Some topics may have fewer than
      //  the default number of words with non-zero weight.
      int limit = numTopWords;
      if (sortedWords.size() < numTopWords) {
        limit = sortedWords.size();
      }

      Iterator<IDSorter> iterator = sortedWords.iterator();
      for (int i = 0; i < limit; i++) {
        IDSorter info = iterator.next();
        topicTopWords[topic][i] = (String) alphabet.lookupObject(info.getID());
      }
    }

    collectDocumentStatistics();

    diagnostics.add(getTokensPerTopic(model.tokensPerTopic));
    diagnostics.add(getDocumentEntropy(model.tokensPerTopic));
    diagnostics.add(getWordLengthScores());
    diagnostics.add(getCoherence());
    diagnostics.add(getDistanceFromUniform());
    diagnostics.add(getDistanceFromCorpus());
    diagnostics.add(getEffectiveNumberOfWords());
    diagnostics.add(getTokenDocumentDiscrepancies());
    diagnostics.add(getRank1Percent());
    diagnostics.add(getDocumentPercentRatio(FIFTY_PERCENT_INDEX, TWO_PERCENT_INDEX));
    diagnostics.add(getDocumentPercent(5));
    diagnostics.add(getExclusivity());
  }
コード例 #4
0
  /** Low-quality topics may be very similar to the global distribution. */
  public TopicScores getDistanceFromCorpus() {

    int[] tokensPerTopic = model.tokensPerTopic;

    TopicScores scores = new TopicScores("corpus_dist", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    for (int topic = 0; topic < numTopics; topic++) {

      double coefficient = (double) numTokens / tokensPerTopic[topic];

      double topicScore = 0.0;
      int position = 0;
      TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

      for (IDSorter info : sortedWords) {
        int type = info.getID();
        double count = info.getWeight();

        double score =
            (count / tokensPerTopic[topic]) * Math.log(coefficient * count / wordTypeCounts[type]);

        if (position < numTopWords) {
          // System.out.println(alphabet.lookupObject(type) + ": " + count + " * " + numTokens + " /
          // " + wordTypeCounts[type] + " * " + tokensPerTopic[topic] + " = " + (coefficient * count
          // / wordTypeCounts[type]));
          scores.setTopicWordScore(topic, position, score);
        }

        topicScore += score;

        position++;
      }

      scores.setTopicScore(topic, topicScore);
    }

    return scores;
  }
コード例 #5
0
  public TopicScores getEffectiveNumberOfWords() {
    int[] tokensPerTopic = model.tokensPerTopic;

    TopicScores scores = new TopicScores("eff_num_words", numTopics, numTopWords);

    int numTypes = alphabet.size();

    for (int topic = 0; topic < numTopics; topic++) {

      double sumSquaredProbabilities = 0.0;
      TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

      for (IDSorter info : sortedWords) {
        int type = info.getID();
        double probability = info.getWeight() / tokensPerTopic[topic];

        sumSquaredProbabilities += probability * probability;
      }

      scores.setTopicScore(topic, 1.0 / sumSquaredProbabilities);
    }

    return scores;
  }