  /**
   * Low-quality topics may stay close to the uniform distribution over words. This score is the
   * KL divergence of each topic's word distribution from uniform; larger values mean more
   * specific topics.
   */
  public TopicScores getDistanceFromUniform() {
    int[] tokensPerTopic = model.tokensPerTopic;

    TopicScores scores = new TopicScores("uniform_dist", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    int numTypes = alphabet.size();

    for (int topic = 0; topic < numTopics; topic++) {

      double topicScore = 0.0;
      int position = 0;
      TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

      for (IDSorter info : sortedWords) {
        double count = info.getWeight();

        double score =
            (count / tokensPerTopic[topic]) * Math.log((count * numTypes) / tokensPerTopic[topic]);

        if (position < numTopWords) {
          scores.setTopicWordScore(topic, position, score);
        }

        topicScore += score;
        position++;
      }

      scores.setTopicScore(topic, topicScore);
    }

    return scores;
  }
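
  /**
   * Low-quality topics tend to have top words that rarely appear together in documents. For each
   * pair of top words, this adds the log ratio of their co-document frequency to the document
   * frequency of the higher-ranked word, smoothed by beta.
   */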
  public TopicScores getCoherence() {
    TopicScores scores = new TopicScores("coherence", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    for (int topic = 0; topic < numTopics; topic++) {
      int[][] matrix = topicCodocumentMatrices[topic];

      double topicScore = 0.0;

      for (int row = 0; row < numTopWords; row++) {
        double rowScore = 0.0;
        // Pairwise log ratios are non-positive, so 0.0 is a safe initial maximum;
        // minScore records this word's worst pairing with any higher-ranked word.
        double minScore = 0.0;
        for (int col = 0; col < row; col++) {
          double score =
              Math.log((matrix[row][col] + model.beta) / (matrix[col][col] + model.beta));
          rowScore += score;
          if (score < minScore) {
            minScore = score;
          }
        }
        topicScore += rowScore;
        scores.setTopicWordScore(topic, row, minScore);
      }

      scores.setTopicScore(topic, topicScore);
    }

    return scores;
  }
  /** Low-quality topics often have lots of unusually short words. */
  public TopicScores getWordLengthStandardDeviation() {

    TopicScores scores = new TopicScores("word-length-sd", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    // Get the mean length

    double meanLength = 0.0;
    int totalWords = 0;

    for (int topic = 0; topic < numTopics; topic++) {
      for (int position = 0; position < topicTopWords[topic].length; position++) {
        // Some topics may not have all N words
        if (topicTopWords[topic][position] == null) {
          break;
        }
        meanLength += topicTopWords[topic][position].length();
        totalWords++;
      }
    }

    meanLength /= totalWords;

    // Now calculate the standard deviation

    double lengthVariance = 0.0;

    for (int topic = 0; topic < numTopics; topic++) {
      for (int position = 0; position < topicTopWords[topic].length; position++) {
        if (topicTopWords[topic][position] == null) {
          break;
        }

        int length = topicTopWords[topic][position].length();

        lengthVariance += (length - meanLength) * (length - meanLength);
      }
    }
    lengthVariance /= (totalWords - 1);

    // Finally produce an overall topic score

    double lengthSD = Math.sqrt(lengthVariance);
    for (int topic = 0; topic < numTopics; topic++) {
      for (int position = 0; position < topicTopWords[topic].length; position++) {
        if (topicTopWords[topic][position] == null) {
          break;
        }

        int length = topicTopWords[topic][position].length();

        scores.addToTopicScore(topic, (length - meanLength) / lengthSD);
        scores.setTopicWordScore(topic, position, (length - meanLength) / lengthSD);
      }
    }

    return scores;
  }
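
  /**
   * Returns, for each topic, the fraction of its documents in which it is the single most
   * prominent (rank-1) topic; low values suggest a topic that is never dominant.
   */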
  public TopicScores getRank1Percent() {
    TopicScores scores = new TopicScores("rank_1_docs", numTopics, numTopWords);

    for (int topic = 0; topic < numTopics; topic++) {
      scores.setTopicScore(topic, (double) numRank1Documents[topic] / numNonZeroDocuments[topic]);
    }

    return scores;
  }
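
  /** Reports the total number of tokens assigned to each topic. */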
  public TopicScores getTokensPerTopic(int[] tokensPerTopic) {
    TopicScores scores = new TopicScores("tokens", numTopics, numTopWords);

    for (int topic = 0; topic < numTopics; topic++) {
      scores.setTopicScore(topic, tokensPerTopic[topic]);
    }

    return scores;
  }
  /** Low-quality topics may have words that are also prominent in other topics. */
  public TopicScores getExclusivity() {

    int[] tokensPerTopic = model.tokensPerTopic;

    TopicScores scores = new TopicScores("exclusivity", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    double sumDefaultProbs = 0.0;
    for (int topic = 0; topic < numTopics; topic++) {
      sumDefaultProbs += model.beta / (model.betaSum + tokensPerTopic[topic]);
    }

    for (int topic = 0; topic < numTopics; topic++) {

      double topicScore = 0.0;
      int position = 0;
      TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

      for (IDSorter info : sortedWords) {
        int type = info.getID();
        double count = info.getWeight();

        double sumTypeProbs = sumDefaultProbs;
        int[] topicCounts = model.typeTopicCounts[type];

        int index = 0;
        while (index < topicCounts.length && topicCounts[index] > 0) {

          // Each entry packs a topic and its count into one int: the low topicBits
          // hold the topic ID and the remaining high bits hold the count.
          int otherTopic = topicCounts[index] & model.topicMask;
          int otherCount = topicCounts[index] >> model.topicBits;

          // We've already accounted for the smoothing parameter,
          //  now we need to add the actual count for the non-zero
          //  topics.
          sumTypeProbs += ((double) otherCount) / (model.betaSum + tokensPerTopic[otherTopic]);

          index++;
        }

        double score =
            ((model.beta + count) / (model.betaSum + tokensPerTopic[topic])) / sumTypeProbs;
        scores.setTopicWordScore(topic, position, score);
        topicScore += score;

        position++;
        if (position == numTopWords) {
          break;
        }
      }

      scores.setTopicScore(topic, topicScore / numTopWords);
    }

    return scores;
  }
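
  /**
   * Returns the entropy of each topic's distribution over documents, computed from the
   * precomputed sum of count * log(count) terms as log(N) - (1/N) * sum(c * log c).
   */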
  public TopicScores getDocumentEntropy(int[] tokensPerTopic) {
    TopicScores scores = new TopicScores("document_entropy", numTopics, numTopWords);

    for (int topic = 0; topic < numTopics; topic++) {
      scores.setTopicScore(
          topic,
          -sumCountTimesLogCount[topic] / tokensPerTopic[topic] + Math.log(tokensPerTopic[topic]));
    }

    return scores;
  }
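
  /**
   * Measures, for each topic's top words, the Jensen-Shannon divergence between the distribution
   * of token counts and the distribution of per-word document counts; large values flag words
   * whose token and document usage disagree.
   */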
  public TopicScores getTokenDocumentDiscrepancies() {
    TopicScores scores = new TopicScores("token-doc-diff", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    for (int topic = 0; topic < numTopics; topic++) {
      int[][] matrix = topicCodocumentMatrices[topic];
      TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

      double topicScore = 0.0;

      double[] wordDistribution = new double[numTopWords];
      double[] docDistribution = new double[numTopWords];

      double wordSum = 0.0;
      double docSum = 0.0;

      int position = 0;
      Iterator<IDSorter> iterator = sortedWords.iterator();
      while (iterator.hasNext() && position < numTopWords) {
        IDSorter info = iterator.next();

        wordDistribution[position] = info.getWeight();
        docDistribution[position] = matrix[position][position];

        wordSum += wordDistribution[position];
        docSum += docDistribution[position];

        position++;
      }

      for (position = 0; position < numTopWords; position++) {
        double p = wordDistribution[position] / wordSum;
        double q = docDistribution[position] / docSum;
        double meanProb = 0.5 * (p + q);

        double score = 0.0;
        if (p > 0) {
          score += 0.5 * p * Math.log(p / meanProb);
        }
        if (q > 0) {
          score += 0.5 * q * Math.log(q / meanProb);
        }

        scores.setTopicWordScore(topic, position, score);
        topicScore += score;
      }

      scores.setTopicScore(topic, topicScore);
    }

    return scores;
  }
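
  /**
   * Returns, for each topic, the fraction of its documents that reach the i-th proportion
   * threshold tracked in numDocumentsAtProportions.
   */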
  public TopicScores getDocumentPercent(int i) {
    TopicScores scores = new TopicScores("allocation_count", numTopics, numTopWords);

    if (i >= numDocumentsAtProportions[0].length) {
      System.err.println(
          "Invalid proportion indices (max "
              + (numDocumentsAtProportions[0].length - 1)
              + ") : "
              + i);
      return scores;
    }

    for (int topic = 0; topic < numTopics; topic++) {
      scores.setTopicScore(
          topic, (double) numDocumentsAtProportions[topic][i] / numNonZeroDocuments[topic]);
    }

    return scores;
  }
  /** Low-quality topics often have lots of unusually short words. */
  public TopicScores getWordLengthScores() {

    TopicScores scores = new TopicScores("word-length", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    for (int topic = 0; topic < numTopics; topic++) {
      int total = 0;
      int wordCount = 0;
      for (int position = 0; position < topicTopWords[topic].length; position++) {
        if (topicTopWords[topic][position] == null) {
          break;
        }

        int length = topicTopWords[topic][position].length();
        total += length;
        wordCount++;

        scores.setTopicWordScore(topic, position, length);
      }
      // Average over the words actually present; some topics may have fewer than numTopWords.
      scores.setTopicScore(topic, wordCount > 0 ? (double) total / wordCount : 0.0);
    }

    return scores;
  }
  /** Low-quality topics may be very similar to the global distribution. */
  public TopicScores getDistanceFromCorpus() {

    int[] tokensPerTopic = model.tokensPerTopic;

    TopicScores scores = new TopicScores("corpus_dist", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    for (int topic = 0; topic < numTopics; topic++) {

      double coefficient = (double) numTokens / tokensPerTopic[topic];

      double topicScore = 0.0;
      int position = 0;
      TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

      for (IDSorter info : sortedWords) {
        int type = info.getID();
        double count = info.getWeight();

        double score =
            (count / tokensPerTopic[topic]) * Math.log(coefficient * count / wordTypeCounts[type]);

        if (position < numTopWords) {
          // System.out.println(alphabet.lookupObject(type) + ": " + count + " * " + numTokens
          //     + " / " + wordTypeCounts[type] + " * " + tokensPerTopic[topic] + " = "
          //     + (coefficient * count / wordTypeCounts[type]));
          scores.setTopicWordScore(topic, position, score);
        }

        topicScore += score;

        position++;
      }

      scores.setTopicScore(topic, topicScore);
    }

    return scores;
  }
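
  /**
   * Returns, for each topic, the ratio between the document counts at two proportion thresholds
   * tracked in numDocumentsAtProportions.
   */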
  public TopicScores getDocumentPercentRatio(int numeratorIndex, int denominatorIndex) {
    TopicScores scores = new TopicScores("allocation_ratio", numTopics, numTopWords);

    if (numeratorIndex >= numDocumentsAtProportions[0].length
        || denominatorIndex >= numDocumentsAtProportions[0].length) {
      System.err.println(
          "Invalid proportion indices (max "
              + (numDocumentsAtProportions[0].length - 1)
              + ") : "
              + numeratorIndex
              + ", "
              + denominatorIndex);
      return scores;
    }

    for (int topic = 0; topic < numTopics; topic++) {
      scores.setTopicScore(
          topic,
          (double) numDocumentsAtProportions[topic][numeratorIndex]
              / numDocumentsAtProportions[topic][denominatorIndex]);
    }

    return scores;
  }
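
  /**
   * Computes the effective number of words per topic as the inverse Simpson index,
   * 1 / sum of squared word probabilities; topics dominated by a few words score low.
   */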
  public TopicScores getEffectiveNumberOfWords() {
    int[] tokensPerTopic = model.tokensPerTopic;

    TopicScores scores = new TopicScores("eff_num_words", numTopics, numTopWords);

    for (int topic = 0; topic < numTopics; topic++) {

      double sumSquaredProbabilities = 0.0;
      TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

      for (IDSorter info : sortedWords) {
        double probability = info.getWeight() / tokensPerTopic[topic];

        sumSquaredProbabilities += probability * probability;
      }

      scores.setTopicScore(topic, 1.0 / sumSquaredProbabilities);
    }

    return scores;
  }
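
  /**
   * Illustrative sketch, not part of the original class: gathers several of the scores above into
   * a plain-text summary. It assumes that TopicScores exposes public "name" and per-topic
   * "scores" fields, as in MALLET's TopicModelDiagnostics; adjust the field access if your
   * TopicScores differs.
   */
  public String summarizeTopicScores() {
    TopicScores[] diagnostics = {
      getCoherence(), getExclusivity(), getDistanceFromUniform(), getEffectiveNumberOfWords()
    };

    StringBuilder out = new StringBuilder();
    for (int topic = 0; topic < numTopics; topic++) {
      out.append("topic ").append(topic);
      for (TopicScores score : diagnostics) {
        // Each TopicScores holds one double per topic under a short metric name.
        out.append(' ').append(score.name).append('=').append(score.scores[topic]);
      }
      out.append('\n');
    }
    return out.toString();
  }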