/** Low-quality topics may be close to the uniform distribution over words. */
public TopicScores getDistanceFromUniform() {
    int[] tokensPerTopic = model.tokensPerTopic;

    TopicScores scores = new TopicScores("uniform_dist", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    int numTypes = alphabet.size();

    for (int topic = 0; topic < numTopics; topic++) {
        double topicScore = 0.0;
        int position = 0;
        TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

        for (IDSorter info : sortedWords) {
            int type = info.getID();
            double count = info.getWeight();

            // KL divergence term: p(w|t) * log(p(w|t) / (1/numTypes))
            double score = (count / tokensPerTopic[topic])
                    * Math.log((count * numTypes) / tokensPerTopic[topic]);

            if (position < numTopWords) {
                scores.setTopicWordScore(topic, position, score);
            }

            topicScore += score;
            position++;
        }

        scores.setTopicScore(topic, topicScore);
    }

    return scores;
}
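// A minimal standalone sketch of the same measure on a raw count vector,
// independent of the model internals above: KL(p(w|t) || uniform) =
// sum_w p(w|t) * log(p(w|t) * V). The helper name and signature are
// illustrative, not part of MALLET's API.
private static double klFromUniform(int[] wordCounts, int numTypes) {
    int total = 0;
    for (int count : wordCounts) { total += count; }
    double kl = 0.0;
    for (int count : wordCounts) {
        if (count > 0) {
            double p = (double) count / total;
            // Term for one word: p * log(p / (1/V)) = p * log(p * V)
            kl += p * Math.log(p * numTypes);
        }
    }
    return kl;
}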
/** Low-quality topics may have words that are also prominent in other topics. */
public TopicScores getExclusivity() {
    int[] tokensPerTopic = model.tokensPerTopic;

    TopicScores scores = new TopicScores("exclusivity", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    // Sum of the smoothing-only probabilities beta / (betaSum + N_t) over all
    // topics; this is the baseline for a word with zero count in every topic.
    double sumDefaultProbs = 0.0;
    for (int topic = 0; topic < numTopics; topic++) {
        sumDefaultProbs += model.beta / (model.betaSum + tokensPerTopic[topic]);
    }

    for (int topic = 0; topic < numTopics; topic++) {
        double topicScore = 0.0;
        int position = 0;
        TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

        for (IDSorter info : sortedWords) {
            int type = info.getID();
            double count = info.getWeight();
            double sumTypeProbs = sumDefaultProbs;

            // Each entry packs a topic id in the low topicBits bits and that
            // topic's count for this word type in the remaining high bits.
            int[] topicCounts = model.typeTopicCounts[type];

            int index = 0;
            while (index < topicCounts.length && topicCounts[index] > 0) {
                int otherTopic = topicCounts[index] & model.topicMask;
                int otherCount = topicCounts[index] >> model.topicBits;

                // We've already accounted for the smoothing parameter,
                // now we need to add the actual count for the non-zero
                // topics.
                sumTypeProbs += ((double) otherCount) / (model.betaSum + tokensPerTopic[otherTopic]);

                index++;
            }

            double score = ((model.beta + count) / (model.betaSum + tokensPerTopic[topic]))
                    / sumTypeProbs;
            scores.setTopicWordScore(topic, position, score);
            topicScore += score;

            position++;
            if (position == numTopWords) {
                break;
            }
        }

        // Report the average exclusivity over the top words.
        scores.setTopicScore(topic, topicScore / numTopWords);
    }

    return scores;
}
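// A minimal sketch of the per-word exclusivity ratio computed above: a word's
// smoothed probability under one topic divided by the sum of its smoothed
// probabilities across all topics. A word used by only one topic scores near
// 1.0; a word spread evenly over k topics scores near 1/k. The helper name
// and signature are illustrative, not MALLET API.
private static double wordExclusivity(double[] smoothedProbPerTopic, int topic) {
    double sum = 0.0;
    for (double p : smoothedProbPerTopic) { sum += p; }
    return smoothedProbPerTopic[topic] / sum;
}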
public TopicModelDiagnostics(ParallelTopicModel model, int numTopWords) {
    numTopics = model.getNumTopics();
    this.numTopWords = numTopWords;
    this.model = model;

    alphabet = model.getAlphabet();
    topicSortedWords = model.getSortedWords();

    topicTopWords = new String[numTopics][numTopWords];
    numRank1Documents = new int[numTopics];
    numNonZeroDocuments = new int[numTopics];
    numDocumentsAtProportions = new int[numTopics][DEFAULT_DOC_PROPORTIONS.length];
    sumCountTimesLogCount = new double[numTopics];

    diagnostics = new ArrayList<TopicScores>();

    for (int topic = 0; topic < numTopics; topic++) {
        TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

        // How many words should we report? Some topics may have fewer than
        // the default number of words with non-zero weight.
        int limit = numTopWords;
        if (sortedWords.size() < numTopWords) {
            limit = sortedWords.size();
        }

        Iterator<IDSorter> iterator = sortedWords.iterator();
        for (int i = 0; i < limit; i++) {
            IDSorter info = iterator.next();
            topicTopWords[topic][i] = (String) alphabet.lookupObject(info.getID());
        }
    }

    collectDocumentStatistics();

    diagnostics.add(getTokensPerTopic(model.tokensPerTopic));
    diagnostics.add(getDocumentEntropy(model.tokensPerTopic));
    diagnostics.add(getWordLengthScores());
    diagnostics.add(getCoherence());
    diagnostics.add(getDistanceFromUniform());
    diagnostics.add(getDistanceFromCorpus());
    diagnostics.add(getEffectiveNumberOfWords());
    diagnostics.add(getTokenDocumentDiscrepancies());
    diagnostics.add(getRank1Percent());
    diagnostics.add(getDocumentPercentRatio(FIFTY_PERCENT_INDEX, TWO_PERCENT_INDEX));
    diagnostics.add(getDocumentPercent(5));
    diagnostics.add(getExclusivity());
}
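// Usage sketch for this constructor. The training sequence below reflects the
// usual ParallelTopicModel workflow, but treat the specifics (topic and
// iteration counts, the toXML() report) as assumptions rather than a
// canonical recipe.
public static String exampleDiagnosticsReport(cc.mallet.types.InstanceList instances) throws Exception {
    ParallelTopicModel model = new ParallelTopicModel(50);   // 50 topics, arbitrary
    model.addInstances(instances);
    model.setNumIterations(1000);
    model.estimate();

    // Score each topic using its 20 most probable words.
    TopicModelDiagnostics diagnostics = new TopicModelDiagnostics(model, 20);
    return diagnostics.toXML();
}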
/** Low-quality topics may be very similar to the global distribution. */
public TopicScores getDistanceFromCorpus() {
    int[] tokensPerTopic = model.tokensPerTopic;

    TopicScores scores = new TopicScores("corpus_dist", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    for (int topic = 0; topic < numTopics; topic++) {
        double coefficient = (double) numTokens / tokensPerTopic[topic];

        double topicScore = 0.0;
        int position = 0;
        TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

        for (IDSorter info : sortedWords) {
            int type = info.getID();
            double count = info.getWeight();

            // KL divergence term: p(w|t) * log(p(w|t) / p(w|corpus))
            double score = (count / tokensPerTopic[topic])
                    * Math.log(coefficient * count / wordTypeCounts[type]);

            if (position < numTopWords) {
                // System.out.println(alphabet.lookupObject(type) + ": " + count + " * " + numTokens + " / " + wordTypeCounts[type] + " * " + tokensPerTopic[topic] + " = " + (coefficient * count / wordTypeCounts[type]));
                scores.setTopicWordScore(topic, position, score);
            }

            topicScore += score;
            position++;
        }

        scores.setTopicScore(topic, topicScore);
    }

    return scores;
}
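// A minimal sketch of the same corpus-distance measure on raw count arrays:
// KL(p(w|topic) || p(w|corpus)), summed over the words the topic actually
// uses. Names and signature are illustrative, not MALLET API.
private static double klFromCorpus(int[] topicCounts, int[] corpusCounts,
        int topicTotal, int corpusTotal) {
    double kl = 0.0;
    for (int type = 0; type < topicCounts.length; type++) {
        if (topicCounts[type] > 0) {
            double pTopic = (double) topicCounts[type] / topicTotal;
            double pCorpus = (double) corpusCounts[type] / corpusTotal;
            kl += pTopic * Math.log(pTopic / pCorpus);
        }
    }
    return kl;
}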
public TopicScores getEffectiveNumberOfWords() {
    int[] tokensPerTopic = model.tokensPerTopic;

    TopicScores scores = new TopicScores("eff_num_words", numTopics, numTopWords);

    for (int topic = 0; topic < numTopics; topic++) {
        double sumSquaredProbabilities = 0.0;
        TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

        for (IDSorter info : sortedWords) {
            double probability = info.getWeight() / tokensPerTopic[topic];
            sumSquaredProbabilities += probability * probability;
        }

        // Inverse Simpson concentration: 1 / sum_w p(w|t)^2.
        scores.setTopicScore(topic, 1.0 / sumSquaredProbabilities);
    }

    return scores;
}
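// Sketch of the score on a bare probability vector. The "effective number of
// words" is the inverse Simpson concentration, 1 / sum_w p(w|t)^2: a uniform
// distribution over k words scores exactly k, while a topic dominated by a
// single word scores close to 1. Helper name is illustrative, not MALLET API.
private static double effectiveNumberOfWords(double[] probabilities) {
    double sumSquared = 0.0;
    for (double p : probabilities) {
        sumSquared += p * p;
    }
    return 1.0 / sumSquared;
}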