/**
 * Builds the "token-doc-diff" diagnostic.
 *
 * <p>For every topic, two distributions over the top-word positions are
 * formed: one from the sorted word weights ({@code IDSorter.getWeight()}),
 * and one from the diagonal of the topic's codocument matrix. Each is
 * normalized by its own sum. The per-position score is
 * {@code 0.5*p*ln(p/m) + 0.5*q*ln(q/m)} with {@code m = 0.5*(p+q)}, i.e. that
 * position's contribution to the Jensen-Shannon divergence between the two
 * distributions; the topic score is the sum over positions.
 *
 * <p>Positions beyond the number of words actually available in a topic keep
 * weight 0 in both distributions and therefore contribute a score of 0.
 *
 * @return scores with both per-topic and per-word values filled in
 */
public TopicScores getTokenDocumentDiscrepancies() {
    TopicScores scores = new TopicScores("token-doc-diff", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    for (int topic = 0; topic < numTopics; topic++) {
        int[][] codocuments = topicCodocumentMatrices[topic];
        Iterator<IDSorter> words = topicSortedWords.get(topic).iterator();

        double[] tokenWeights = new double[numTopWords];
        double[] docWeights = new double[numTopWords];
        double tokenTotal = 0.0;
        double docTotal = 0.0;

        // Gather the unnormalized weights for as many top words as exist.
        for (int rank = 0; words.hasNext() && rank < numTopWords; rank++) {
            double weight = words.next().getWeight();
            int docs = codocuments[rank][rank];

            tokenWeights[rank] = weight;
            docWeights[rank] = docs;
            tokenTotal += weight;
            docTotal += docs;
        }

        // Accumulate the divergence contribution of every position.
        double divergence = 0.0;
        for (int rank = 0; rank < numTopWords; rank++) {
            double p = tokenWeights[rank] / tokenTotal;
            double q = docWeights[rank] / docTotal;
            double mid = 0.5 * (p + q);

            // A zero (or NaN) probability contributes nothing to the sum.
            double contribution = 0.0;
            if (p > 0) {
                contribution += 0.5 * p * Math.log(p / mid);
            }
            if (q > 0) {
                contribution += 0.5 * q * Math.log(q / mid);
            }

            scores.setTopicWordScore(topic, rank, contribution);
            divergence += contribution;
        }

        scores.setTopicScore(topic, divergence);
    }

    return scores;
}
public TopicModelDiagnostics(ParallelTopicModel model, int numTopWords) { numTopics = model.getNumTopics(); this.numTopWords = numTopWords; this.model = model; alphabet = model.getAlphabet(); topicSortedWords = model.getSortedWords(); topicTopWords = new String[numTopics][numTopWords]; numRank1Documents = new int[numTopics]; numNonZeroDocuments = new int[numTopics]; numDocumentsAtProportions = new int[numTopics][DEFAULT_DOC_PROPORTIONS.length]; sumCountTimesLogCount = new double[numTopics]; diagnostics = new ArrayList<TopicScores>(); for (int topic = 0; topic < numTopics; topic++) { int position = 0; TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic); // How many words should we report? Some topics may have fewer than // the default number of words with non-zero weight. int limit = numTopWords; if (sortedWords.size() < numTopWords) { limit = sortedWords.size(); } Iterator<IDSorter> iterator = sortedWords.iterator(); for (int i = 0; i < limit; i++) { IDSorter info = iterator.next(); topicTopWords[topic][i] = (String) alphabet.lookupObject(info.getID()); } } collectDocumentStatistics(); diagnostics.add(getTokensPerTopic(model.tokensPerTopic)); diagnostics.add(getDocumentEntropy(model.tokensPerTopic)); diagnostics.add(getWordLengthScores()); diagnostics.add(getCoherence()); diagnostics.add(getDistanceFromUniform()); diagnostics.add(getDistanceFromCorpus()); diagnostics.add(getEffectiveNumberOfWords()); diagnostics.add(getTokenDocumentDiscrepancies()); diagnostics.add(getRank1Percent()); diagnostics.add(getDocumentPercentRatio(FIFTY_PERCENT_INDEX, TWO_PERCENT_INDEX)); diagnostics.add(getDocumentPercent(5)); diagnostics.add(getExclusivity()); }
// Derives QFunctions for the given value function and simulates the // greedy policy for the given number of trials and steps per trial. // Returns final value of every trial. public ArrayList simulate(int trials, int steps, long rand_seed) { ArrayList values = new ArrayList(); _r = new Random(rand_seed); for (int trial = 1; trial <= trials; trial++) { System.out.println("\n -----------\n Trial " + trial + "\n -----------"); // Initialize state _state = new ArrayList(); _nVars = _mdp._alVars.size(); for (int c = 0; c < (_nVars << 1); c++) { _state.add("-"); } Iterator i = _mdp._alVars.iterator(); _vars = new TreeSet(); while (i.hasNext()) { String s = (String) i.next(); if (!s.endsWith("\'")) { Integer gid = (Integer) _mdp._tmVar2ID.get(s); _vars.add(gid); // Note: assign level (level is gid-1 b/c gids in order) _state.set(gid.intValue() - 1, _r.nextBoolean() ? TRUE : FALSE); } } // System.out.println(_mdp._context.printNode(_mdp._valueDD) + "\n" + _state); double reward = _mdp._context.evaluate(_mdp._rewardDD, _state); System.out.print(" " + PrintState(_state) + " " + MDP._df.format(reward)); // Run steps for (int step = 1; step <= steps; step++) { // Get action Action a; if (_bUseBasis) { a = getBasisAction(); } else { a = getAction(); } // Execute action executeAction(a); // Update reward reward = (_mdp._bdDiscount.doubleValue() * reward) + _mdp._context.evaluate(_mdp._rewardDD, _state); System.out.println(", a=" + a._sName); System.out.print( " " + PrintState(_state) + " " + MDP._df.format(reward) + ": " + "Step " + step); } values.add(new Double(reward)); System.out.println(); } return values; }
public String toXML() { int[] tokensPerTopic = model.tokensPerTopic; StringBuilder out = new StringBuilder(); Formatter formatter = new Formatter(out, Locale.US); out.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"); out.append("<model>\n"); for (int topic = 0; topic < numTopics; topic++) { int[][] matrix = topicCodocumentMatrices[topic]; formatter.format("<topic id='%d'", topic); for (TopicScores scores : diagnostics) { formatter.format(" %s='%.4f'", scores.name, scores.scores[topic]); } out.append(">\n"); TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic); // How many words should we report? Some topics may have fewer than // the default number of words with non-zero weight. int limit = numTopWords; if (sortedWords.size() < numTopWords) { limit = sortedWords.size(); } double cumulativeProbability = 0.0; Iterator<IDSorter> iterator = sortedWords.iterator(); for (int position = 0; position < limit; position++) { IDSorter info = iterator.next(); double probability = info.getWeight() / tokensPerTopic[topic]; cumulativeProbability += probability; formatter.format( "<word rank='%d' count='%.0f' prob='%.5f' cumulative='%.5f' docs='%d'", position + 1, info.getWeight(), probability, cumulativeProbability, matrix[position][position]); for (TopicScores scores : diagnostics) { if (scores.wordScoresDefined) { formatter.format(" %s='%.4f'", scores.name, scores.topicWordScores[topic][position]); } } formatter.format( ">%s</word>\n", topicTopWords[topic][position].replaceAll("&", "&").replaceAll("<", ">")); } out.append("</topic>\n"); } out.append("</model>\n"); return out.toString(); }