/** Low-quality topics may be almost uniform over the vocabulary: this is the
    KL divergence between each topic's word distribution and the uniform
    distribution over all word types. */
public TopicScores getDistanceFromUniform() {
    int[] tokensPerTopic = model.tokensPerTopic;

    TopicScores scores = new TopicScores("uniform_dist", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    int numTypes = alphabet.size();

    for (int topic = 0; topic < numTopics; topic++) {
        double topicScore = 0.0;
        int position = 0;
        TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

        for (IDSorter info : sortedWords) {
            double count = info.getWeight();

            // p(w|t) * log( p(w|t) / (1/V) ), with p(w|t) = count / tokensPerTopic[topic]
            double score =
                (count / tokensPerTopic[topic])
                    * Math.log((count * numTypes) / tokensPerTopic[topic]);

            if (position < numTopWords) {
                scores.setTopicWordScore(topic, position, score);
            }

            topicScore += score;
            position++;
        }

        scores.setTopicScore(topic, topicScore);
    }

    return scores;
}
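// A minimal standalone sketch of the score above, not part of the original
// class: given raw word counts for one topic and the vocabulary size, it
// computes KL(topic || uniform) = sum_w p(w) * log(p(w) * V). The method name
// and parameters here are illustrative, not MALLET API.
private static double klFromUniform(double[] wordCounts, int vocabularySize) {
    double total = 0.0;
    for (double count : wordCounts) { total += count; }

    double divergence = 0.0;
    for (double count : wordCounts) {
        if (count > 0) {
            double p = count / total;
            divergence += p * Math.log(p * vocabularySize); // log( p / (1/V) )
        }
    }
    return divergence;
}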
/** Low-quality topics have top words that rarely appear together in documents.
    For every pair of top words, add the log of the smoothed conditional
    probability that a document containing the higher-ranked word also contains
    the lower-ranked one. */
public TopicScores getCoherence() {
    TopicScores scores = new TopicScores("coherence", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    for (int topic = 0; topic < numTopics; topic++) {
        int[][] matrix = topicCodocumentMatrices[topic];

        double topicScore = 0.0;

        for (int row = 0; row < numTopWords; row++) {
            double rowScore = 0.0;
            double minScore = 0.0;
            for (int col = 0; col < row; col++) {
                // matrix[row][col] is the co-document count of words row and col;
                // matrix[col][col] is the document frequency of word col.
                double score =
                    Math.log((matrix[row][col] + model.beta) / (matrix[col][col] + model.beta));

                rowScore += score;
                if (score < minScore) {
                    minScore = score;
                }
            }
            topicScore += rowScore;
            // Record each word's weakest association with a higher-ranked word.
            scores.setTopicWordScore(topic, row, minScore);
        }

        scores.setTopicScore(topic, topicScore);
    }

    return scores;
}
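// A minimal sketch of the coherence sum above, with illustrative names rather
// than MALLET API: codocCounts[i][j] (j < i) holds the number of documents
// containing both top word i and top word j; codocCounts[j][j] is word j's
// document frequency; smoothing plays the role of model.beta above.
private static double coherence(int[][] codocCounts, double smoothing) {
    double score = 0.0;
    for (int i = 0; i < codocCounts.length; i++) {
        for (int j = 0; j < i; j++) {
            score += Math.log((codocCounts[i][j] + smoothing)
                / (codocCounts[j][j] + smoothing));
        }
    }
    return score;
}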
/** Low-quality topics often have lots of unusually short words. */
public TopicScores getWordLengthStandardDeviation() {
    TopicScores scores = new TopicScores("word-length-sd", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    // Get the mean length
    double meanLength = 0.0;
    int totalWords = 0;

    for (int topic = 0; topic < numTopics; topic++) {
        for (int position = 0; position < topicTopWords[topic].length; position++) {
            // Some topics may not have all N words
            if (topicTopWords[topic][position] == null) {
                break;
            }
            meanLength += topicTopWords[topic][position].length();
            totalWords++;
        }
    }

    meanLength /= totalWords;

    // Now calculate the standard deviation
    double lengthVariance = 0.0;

    for (int topic = 0; topic < numTopics; topic++) {
        for (int position = 0; position < topicTopWords[topic].length; position++) {
            if (topicTopWords[topic][position] == null) {
                break;
            }

            int length = topicTopWords[topic][position].length();
            lengthVariance += (length - meanLength) * (length - meanLength);
        }
    }
    lengthVariance /= (totalWords - 1);

    // Finally produce an overall topic score
    double lengthSD = Math.sqrt(lengthVariance);

    for (int topic = 0; topic < numTopics; topic++) {
        for (int position = 0; position < topicTopWords[topic].length; position++) {
            if (topicTopWords[topic][position] == null) {
                break;
            }

            int length = topicTopWords[topic][position].length();
            scores.addToTopicScore(topic, (length - meanLength) / lengthSD);
            scores.setTopicWordScore(topic, position, (length - meanLength) / lengthSD);
        }
    }

    return scores;
}
/** Fraction of the documents containing a topic in which that topic is the
    single most prominent topic. */
public TopicScores getRank1Percent() {
    TopicScores scores = new TopicScores("rank_1_docs", numTopics, numTopWords);

    for (int topic = 0; topic < numTopics; topic++) {
        scores.setTopicScore(topic, (double) numRank1Documents[topic] / numNonZeroDocuments[topic]);
    }

    return scores;
}
public TopicScores getTokensPerTopic(int[] tokensPerTopic) {
    TopicScores scores = new TopicScores("tokens", numTopics, numTopWords);

    for (int topic = 0; topic < numTopics; topic++) {
        scores.setTopicScore(topic, tokensPerTopic[topic]);
    }

    return scores;
}
/** Low-quality topics may have words that are also prominent in other topics. */
public TopicScores getExclusivity() {
    int[] tokensPerTopic = model.tokensPerTopic;

    TopicScores scores = new TopicScores("exclusivity", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    double sumDefaultProbs = 0.0;
    for (int topic = 0; topic < numTopics; topic++) {
        sumDefaultProbs += model.beta / (model.betaSum + tokensPerTopic[topic]);
    }

    for (int topic = 0; topic < numTopics; topic++) {
        double topicScore = 0.0;
        int position = 0;
        TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

        for (IDSorter info : sortedWords) {
            int type = info.getID();
            double count = info.getWeight();

            double sumTypeProbs = sumDefaultProbs;

            // Each entry packs a topic ID in the low bits and a count in the
            // high bits; entries with zero counts come last, so stop at the
            // first non-positive value.
            int[] topicCounts = model.typeTopicCounts[type];

            int index = 0;
            while (index < topicCounts.length && topicCounts[index] > 0) {
                int otherTopic = topicCounts[index] & model.topicMask;
                int otherCount = topicCounts[index] >> model.topicBits;

                // We've already accounted for the smoothing parameter,
                // now we need to add the actual count for the non-zero
                // topics.
                sumTypeProbs += ((double) otherCount) / (model.betaSum + tokensPerTopic[otherTopic]);

                index++;
            }

            double score = ((model.beta + count) / (model.betaSum + tokensPerTopic[topic])) / sumTypeProbs;
            scores.setTopicWordScore(topic, position, score);
            topicScore += score;

            position++;
            if (position == numTopWords) {
                break;
            }
        }

        scores.setTopicScore(topic, topicScore / numTopWords);
    }

    return scores;
}
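// A minimal sketch of one word's exclusivity score above: the word's smoothed
// probability in this topic divided by the sum of its smoothed probabilities
// across all topics. All parameters are hypothetical stand-ins for the model
// fields used above (countInTopic[t] is this word's count in topic t).
private static double exclusivity(int topic, double[] countInTopic,
        int[] tokensPerTopic, double beta, double betaSum) {
    double sumProbs = 0.0;
    for (int t = 0; t < tokensPerTopic.length; t++) {
        sumProbs += (beta + countInTopic[t]) / (betaSum + tokensPerTopic[t]);
    }
    return ((beta + countInTopic[topic]) / (betaSum + tokensPerTopic[topic])) / sumProbs;
}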
/** Entropy of a topic's token allocation across documents. Low entropy means
    the topic is concentrated in a few documents. */
public TopicScores getDocumentEntropy(int[] tokensPerTopic) {
    TopicScores scores = new TopicScores("document_entropy", numTopics, numTopWords);

    for (int topic = 0; topic < numTopics; topic++) {
        scores.setTopicScore(
            topic,
            -sumCountTimesLogCount[topic] / tokensPerTopic[topic] + Math.log(tokensPerTopic[topic]));
    }

    return scores;
}
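// The expression above is the standard entropy identity applied to the
// per-document counts: with c_d = tokens of this topic in document d and
// N = sum_d c_d,
//   H = -sum_d (c_d / N) * log(c_d / N) = log(N) - (1/N) * sum_d c_d * log(c_d),
// so only sum_d c_d * log(c_d) needs to be precomputed. A standalone sketch
// with a hypothetical counts array, not MALLET API:
private static double documentEntropy(int[] countsPerDocument) {
    double total = 0.0;
    double sumCountLogCount = 0.0;
    for (int count : countsPerDocument) {
        if (count > 0) {
            total += count;
            sumCountLogCount += count * Math.log(count);
        }
    }
    return Math.log(total) - sumCountLogCount / total;
}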
/** Measures how far the distribution of a topic's top words by token count is
    from their distribution by document count, as a Jensen-Shannon divergence.
    A large gap suggests the topic is driven by a few unusual documents. */
public TopicScores getTokenDocumentDiscrepancies() {
    TopicScores scores = new TopicScores("token-doc-diff", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    for (int topic = 0; topic < numTopics; topic++) {
        int[][] matrix = topicCodocumentMatrices[topic];
        TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

        double topicScore = 0.0;

        double[] wordDistribution = new double[numTopWords];
        double[] docDistribution = new double[numTopWords];

        double wordSum = 0.0;
        double docSum = 0.0;

        int position = 0;
        Iterator<IDSorter> iterator = sortedWords.iterator();
        while (iterator.hasNext() && position < numTopWords) {
            IDSorter info = iterator.next();

            wordDistribution[position] = info.getWeight();
            // The diagonal of the co-document matrix is the word's document frequency.
            docDistribution[position] = matrix[position][position];

            wordSum += wordDistribution[position];
            docSum += docDistribution[position];

            position++;
        }

        for (position = 0; position < numTopWords; position++) {
            double p = wordDistribution[position] / wordSum;
            double q = docDistribution[position] / docSum;
            double meanProb = 0.5 * (p + q);

            double score = 0.0;
            if (p > 0) {
                score += 0.5 * p * Math.log(p / meanProb);
            }
            if (q > 0) {
                score += 0.5 * q * Math.log(q / meanProb);
            }

            scores.setTopicWordScore(topic, position, score);
            topicScore += score;
        }

        scores.setTopicScore(topic, topicScore);
    }

    return scores;
}
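// Each per-word score above is one term of the Jensen-Shannon divergence
// JS(P, Q) = 0.5 * KL(P || M) + 0.5 * KL(Q || M) with M = (P + Q) / 2. A
// standalone sketch over two unnormalized weight vectors of the same length;
// the name and parameters are illustrative, not MALLET API:
private static double jensenShannon(double[] weightsP, double[] weightsQ) {
    double sumP = 0.0;
    double sumQ = 0.0;
    for (double w : weightsP) { sumP += w; }
    for (double w : weightsQ) { sumQ += w; }

    double divergence = 0.0;
    for (int i = 0; i < weightsP.length; i++) {
        double p = weightsP[i] / sumP;
        double q = weightsQ[i] / sumQ;
        double mean = 0.5 * (p + q);
        if (p > 0) { divergence += 0.5 * p * Math.log(p / mean); }
        if (q > 0) { divergence += 0.5 * q * Math.log(q / mean); }
    }
    return divergence;
}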
public TopicScores getDocumentPercent(int i) {
    TopicScores scores = new TopicScores("allocation_count", numTopics, numTopWords);

    // Valid proportion indices run from 0 to length - 1.
    if (i >= numDocumentsAtProportions[0].length) {
        System.err.println(
            "Invalid proportion index (max " + (numDocumentsAtProportions[0].length - 1) + "): " + i);
        return scores;
    }

    for (int topic = 0; topic < numTopics; topic++) {
        scores.setTopicScore(
            topic, (double) numDocumentsAtProportions[topic][i] / numNonZeroDocuments[topic]);
    }

    return scores;
}
/** Low-quality topics often have lots of unusually short words. */
public TopicScores getWordLengthScores() {
    TopicScores scores = new TopicScores("word-length", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    for (int topic = 0; topic < numTopics; topic++) {
        int total = 0;
        int realWords = 0;

        for (int position = 0; position < topicTopWords[topic].length; position++) {
            // Some topics may not have all N words
            if (topicTopWords[topic][position] == null) {
                break;
            }

            int length = topicTopWords[topic][position].length();
            total += length;
            realWords++;
            scores.setTopicWordScore(topic, position, length);
        }

        // Average over the words actually present, not the array capacity.
        scores.setTopicScore(topic, (double) total / realWords);
    }

    return scores;
}
/** Low-quality topics may be very similar to the global distribution. */
public TopicScores getDistanceFromCorpus() {
    int[] tokensPerTopic = model.tokensPerTopic;

    TopicScores scores = new TopicScores("corpus_dist", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    for (int topic = 0; topic < numTopics; topic++) {
        double coefficient = (double) numTokens / tokensPerTopic[topic];

        double topicScore = 0.0;
        int position = 0;
        TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

        for (IDSorter info : sortedWords) {
            int type = info.getID();
            double count = info.getWeight();

            // p(w|t) * log( p(w|t) / p(w) ), where p(w) is the corpus-wide frequency
            double score =
                (count / tokensPerTopic[topic]) * Math.log(coefficient * count / wordTypeCounts[type]);

            if (position < numTopWords) {
                scores.setTopicWordScore(topic, position, score);
            }

            topicScore += score;
            position++;
        }

        scores.setTopicScore(topic, topicScore);
    }

    return scores;
}
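// A standalone sketch of the corpus-distance score above, with hypothetical
// parameters rather than the model fields: KL(topic || corpus) =
// sum_w p(w|t) * log( p(w|t) / p(w) ), where p(w) is the word's relative
// frequency in the whole corpus.
private static double klFromCorpus(double[] topicCounts, int[] corpusCounts) {
    double topicTotal = 0.0;
    double corpusTotal = 0.0;
    for (double c : topicCounts) { topicTotal += c; }
    for (int c : corpusCounts) { corpusTotal += c; }

    double divergence = 0.0;
    for (int w = 0; w < topicCounts.length; w++) {
        if (topicCounts[w] > 0) {
            double p = topicCounts[w] / topicTotal;
            double q = corpusCounts[w] / corpusTotal;
            divergence += p * Math.log(p / q);
        }
    }
    return divergence;
}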
public TopicScores getDocumentPercentRatio(int numeratorIndex, int denominatorIndex) {
    TopicScores scores = new TopicScores("allocation_ratio", numTopics, numTopWords);

    // Valid proportion indices run from 0 to length - 1.
    if (numeratorIndex >= numDocumentsAtProportions[0].length
        || denominatorIndex >= numDocumentsAtProportions[0].length) {
        System.err.println(
            "Invalid proportion indices (max "
                + (numDocumentsAtProportions[0].length - 1)
                + "): "
                + numeratorIndex
                + ", "
                + denominatorIndex);
        return scores;
    }

    for (int topic = 0; topic < numTopics; topic++) {
        scores.setTopicScore(
            topic,
            (double) numDocumentsAtProportions[topic][numeratorIndex]
                / numDocumentsAtProportions[topic][denominatorIndex]);
    }

    return scores;
}
/** The effective number of word types in a topic, measured as the inverse
    Simpson concentration 1 / sum_w p(w|t)^2. */
public TopicScores getEffectiveNumberOfWords() {
    int[] tokensPerTopic = model.tokensPerTopic;

    TopicScores scores = new TopicScores("eff_num_words", numTopics, numTopWords);

    for (int topic = 0; topic < numTopics; topic++) {
        double sumSquaredProbabilities = 0.0;

        TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

        for (IDSorter info : sortedWords) {
            double probability = info.getWeight() / tokensPerTopic[topic];
            sumSquaredProbabilities += probability * probability;
        }

        scores.setTopicScore(topic, 1.0 / sumSquaredProbabilities);
    }

    return scores;
}
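// Worked example for the score above: the measure equals the vocabulary size
// for a uniform topic and approaches 1 as the topic concentrates on a single
// word. A topic with probabilities {0.5, 0.25, 0.25} scores
//   1 / (0.5^2 + 0.25^2 + 0.25^2) = 1 / 0.375 ≈ 2.67 effective words.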