/**
 * Measures, for each topic, how much the token-level distribution over its top words
 * disagrees with the document-level distribution over the same words.
 *
 * <p>The token-level distribution is the normalized weight of each top word; the
 * document-level distribution is the normalized diagonal of the topic's codocument matrix.
 * Each word's score is its contribution to the Jensen-Shannon divergence between the two
 * distributions, and the topic score is the sum of those contributions.
 *
 * @return scores named "token-doc-diff" with both topic-level and word-level values set
 */
public TopicScores getTokenDocumentDiscrepancies() {
    TopicScores result = new TopicScores("token-doc-diff", numTopics, numTopWords);
    result.wordScoresDefined = true;

    for (int topic = 0; topic < numTopics; topic++) {
        int[][] codocCounts = topicCodocumentMatrices[topic];
        TreeSet<IDSorter> rankedWords = topicSortedWords.get(topic);

        double[] tokenWeights = new double[numTopWords];
        double[] docCounts = new double[numTopWords];
        double tokenTotal = 0.0;
        double docTotal = 0.0;

        // Fill the two unnormalized distributions in rank order; ranks beyond the
        // end of the sorted set keep their initial value of zero.
        Iterator<IDSorter> words = rankedWords.iterator();
        for (int rank = 0; words.hasNext() && rank < numTopWords; rank++) {
            tokenWeights[rank] = words.next().getWeight();
            docCounts[rank] = codocCounts[rank][rank];
            tokenTotal += tokenWeights[rank];
            docTotal += docCounts[rank];
        }

        double divergence = 0.0;
        for (int rank = 0; rank < numTopWords; rank++) {
            double tokenProb = tokenWeights[rank] / tokenTotal;
            double docProb = docCounts[rank] / docTotal;
            double midpoint = 0.5 * (tokenProb + docProb);

            // A term is only added when its probability is positive, which avoids log(0).
            double contribution = 0.0;
            if (tokenProb > 0) {
                contribution += 0.5 * tokenProb * Math.log(tokenProb / midpoint);
            }
            if (docProb > 0) {
                contribution += 0.5 * docProb * Math.log(docProb / midpoint);
            }

            result.setTopicWordScore(topic, rank, contribution);
            divergence += contribution;
        }

        result.setTopicScore(topic, divergence);
    }

    return result;
}
/**
 * Computes a co-document coherence score for every topic.
 *
 * <p>For each top word, the smoothed log ratio between its co-document count with every
 * higher-ranked word and that higher-ranked word's own document count is accumulated.
 * The topic score is the sum of all such log ratios; the word score is the smallest
 * (most negative) log ratio seen for that word, or 0.0 if none was below zero.
 *
 * @return scores named "coherence" with both topic-level and word-level values set
 */
public TopicScores getCoherence() {
    TopicScores result = new TopicScores("coherence", numTopics, numTopWords);
    result.wordScoresDefined = true;

    for (int topic = 0; topic < numTopics; topic++) {
        int[][] codocCounts = topicCodocumentMatrices[topic];
        double coherence = 0.0;

        for (int word = 0; word < numTopWords; word++) {
            double wordTotal = 0.0;
            double lowest = 0.0;

            // Only pair this word with the words ranked above it.
            for (int earlier = 0; earlier < word; earlier++) {
                double logRatio =
                    Math.log(
                        (codocCounts[word][earlier] + model.beta)
                            / (codocCounts[earlier][earlier] + model.beta));
                wordTotal += logRatio;
                if (logRatio < lowest) {
                    lowest = logRatio;
                }
            }

            coherence += wordTotal;
            result.setTopicWordScore(topic, word, lowest);
        }

        result.setTopicScore(topic, coherence);
    }

    return result;
}
/**
 * Computes, for every topic, the Kullback-Leibler divergence of the topic's word
 * distribution from the uniform distribution over all word types in the alphabet.
 *
 * <p>Each word contributes {@code p * log(p * numTypes)} where {@code p} is the word's
 * weight divided by the number of tokens in the topic. The topic score sums the
 * contribution of every word in the topic's sorted set; word-level scores are recorded
 * only for the first {@code numTopWords} positions.
 *
 * @return scores named "uniform_dist" with both topic-level and word-level values set
 */
public TopicScores getDistanceFromUniform() {
    int[] tokensPerTopic = model.tokensPerTopic;

    TopicScores scores = new TopicScores("uniform_dist", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    int numTypes = alphabet.size();

    for (int topic = 0; topic < numTopics; topic++) {
        double topicScore = 0.0;
        int position = 0;
        TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

        for (IDSorter info : sortedWords) {
            double count = info.getWeight();

            // By the usual convention 0 * log(0) = 0. Without this guard a zero-weight
            // entry would evaluate to 0 * -Infinity = NaN and poison the topic score.
            double score = 0.0;
            if (count > 0) {
                score =
                    (count / tokensPerTopic[topic])
                        * Math.log((count * numTypes) / tokensPerTopic[topic]);
            }

            if (position < numTopWords) {
                scores.setTopicWordScore(topic, position, score);
            }

            topicScore += score;
            position++;
        }

        scores.setTopicScore(topic, topicScore);
    }

    return scores;
}
/**
 * Computes a per-topic entropy score from the accumulated count-times-log-count sums.
 *
 * <p>For each topic the score is {@code log(N) - S / N}, where {@code N} is the topic's
 * entry in {@code tokensPerTopic} and {@code S} is its entry in
 * {@code sumCountTimesLogCount}. No word-level scores are set.
 *
 * @param tokensPerTopic number of tokens assigned to each topic, indexed by topic
 * @return scores named "document_entropy" with topic-level values set
 */
public TopicScores getDocumentEntropy(int[] tokensPerTopic) {
    TopicScores result = new TopicScores("document_entropy", numTopics, numTopWords);

    for (int topic = 0; topic < numTopics; topic++) {
        int tokenCount = tokensPerTopic[topic];
        double negativeMean = -sumCountTimesLogCount[topic] / tokenCount;
        double entropy = negativeMean + Math.log(tokenCount);
        result.setTopicScore(topic, entropy);
    }

    return result;
}
public double labelLogLikelihood(InstanceList ilist) { double logLikelihood = 0; for (int ii = 0; ii < ilist.size(); ii++) { double instanceWeight = ilist.getInstanceWeight(ii); Instance inst = ilist.get(ii); Labeling labeling = inst.getLabeling(); if (labeling == null) continue; Labeling predicted = this.classify(inst).getLabeling(); // System.err.println ("label = \n"+labeling); // System.err.println ("predicted = \n"+predicted); if (labeling.numLocations() == 1) { logLikelihood += instanceWeight * Math.log(predicted.value(labeling.getBestIndex())); } else { for (int lpos = 0; lpos < labeling.numLocations(); lpos++) { int li = labeling.indexAtLocation(lpos); double labelWeight = labeling.valueAtLocation(lpos); // System.err.print (", "+labelWeight); if (labelWeight == 0) continue; logLikelihood += instanceWeight * labelWeight * Math.log(predicted.value(li)); } } } return logLikelihood; }
/** Low-quality topics may be very similar to the global distribution. */ public TopicScores getDistanceFromCorpus() { int[] tokensPerTopic = model.tokensPerTopic; TopicScores scores = new TopicScores("corpus_dist", numTopics, numTopWords); scores.wordScoresDefined = true; for (int topic = 0; topic < numTopics; topic++) { double coefficient = (double) numTokens / tokensPerTopic[topic]; double topicScore = 0.0; int position = 0; TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic); for (IDSorter info : sortedWords) { int type = info.getID(); double count = info.getWeight(); double score = (count / tokensPerTopic[topic]) * Math.log(coefficient * count / wordTypeCounts[type]); if (position < numTopWords) { // System.out.println(alphabet.lookupObject(type) + ": " + count + " * " + numTokens + " / // " + wordTypeCounts[type] + " * " + tokensPerTopic[topic] + " = " + (coefficient * count // / wordTypeCounts[type])); scores.setTopicWordScore(topic, position, score); } topicScore += score; position++; } scores.setTopicScore(topic, topicScore); } return scores; }
public void collectDocumentStatistics() { topicCodocumentMatrices = new int[numTopics][numTopWords][numTopWords]; wordTypeCounts = new int[alphabet.size()]; numTokens = 0; // This is an array of hash sets containing the words-of-interest for each topic, // used for checking if the word at some position is one of those words. IntHashSet[] topicTopWordIndices = new IntHashSet[numTopics]; // The same as the topic top words, but with int indices instead of strings, // used for iterating over positions. int[][] topicWordIndicesInOrder = new int[numTopics][numTopWords]; // This is an array of hash sets that will hold the words-of-interest present in a document, // which will be cleared after every document. IntHashSet[] docTopicWordIndices = new IntHashSet[numTopics]; int numDocs = model.getData().size(); // The count of each topic, again cleared after every document. int[] topicCounts = new int[numTopics]; for (int topic = 0; topic < numTopics; topic++) { IntHashSet wordIndices = new IntHashSet(); for (int i = 0; i < numTopWords; i++) { if (topicTopWords[topic][i] != null) { int type = alphabet.lookupIndex(topicTopWords[topic][i]); topicWordIndicesInOrder[topic][i] = type; wordIndices.add(type); } } topicTopWordIndices[topic] = wordIndices; docTopicWordIndices[topic] = new IntHashSet(); } int doc = 0; for (TopicAssignment document : model.getData()) { FeatureSequence tokens = (FeatureSequence) document.instance.getData(); FeatureSequence topics = (FeatureSequence) document.topicSequence; for (int position = 0; position < tokens.size(); position++) { int type = tokens.getIndexAtPosition(position); int topic = topics.getIndexAtPosition(position); numTokens++; wordTypeCounts[type]++; topicCounts[topic]++; if (topicTopWordIndices[topic].contains(type)) { docTopicWordIndices[topic].add(type); } } int docLength = tokens.size(); if (docLength > 0) { int maxTopic = -1; int maxCount = -1; for (int topic = 0; topic < numTopics; topic++) { if (topicCounts[topic] > 0) { 
numNonZeroDocuments[topic]++; if (topicCounts[topic] > maxCount) { maxTopic = topic; maxCount = topicCounts[topic]; } sumCountTimesLogCount[topic] += topicCounts[topic] * Math.log(topicCounts[topic]); double proportion = (model.alpha[topic] + topicCounts[topic]) / (model.alphaSum + docLength); for (int i = 0; i < DEFAULT_DOC_PROPORTIONS.length; i++) { if (proportion < DEFAULT_DOC_PROPORTIONS[i]) { break; } numDocumentsAtProportions[topic][i]++; } IntHashSet supportedWords = docTopicWordIndices[topic]; int[] indices = topicWordIndicesInOrder[topic]; for (int i = 0; i < numTopWords; i++) { if (supportedWords.contains(indices[i])) { for (int j = i; j < numTopWords; j++) { if (i == j) { // Diagonals are total number of documents with word W in topic T topicCodocumentMatrices[topic][i][i]++; } else if (supportedWords.contains(indices[j])) { topicCodocumentMatrices[topic][i][j]++; topicCodocumentMatrices[topic][j][i]++; } } } } docTopicWordIndices[topic].clear(); topicCounts[topic] = 0; } } if (maxTopic > -1) { numRank1Documents[maxTopic]++; } } doc++; } }