/** This is (mostly) copied from CRF4.java */ public boolean[][] labelConnectionsIn( Alphabet outputAlphabet, InstanceList trainingSet, String start) { int numLabels = outputAlphabet.size(); boolean[][] connections = new boolean[numLabels][numLabels]; for (int i = 0; i < trainingSet.size(); i++) { Instance instance = trainingSet.getInstance(i); FeatureSequence output = (FeatureSequence) instance.getTarget(); for (int j = 1; j < output.size(); j++) { int sourceIndex = outputAlphabet.lookupIndex(output.get(j - 1)); int destIndex = outputAlphabet.lookupIndex(output.get(j)); assert (sourceIndex >= 0 && destIndex >= 0); connections[sourceIndex][destIndex] = true; } } // Handle start state if (start != null) { int startIndex = outputAlphabet.lookupIndex(start); for (int j = 0; j < outputAlphabet.size(); j++) { connections[startIndex][j] = true; } } return connections; }
public void collectDocumentStatistics() { topicCodocumentMatrices = new int[numTopics][numTopWords][numTopWords]; wordTypeCounts = new int[alphabet.size()]; numTokens = 0; // This is an array of hash sets containing the words-of-interest for each topic, // used for checking if the word at some position is one of those words. IntHashSet[] topicTopWordIndices = new IntHashSet[numTopics]; // The same as the topic top words, but with int indices instead of strings, // used for iterating over positions. int[][] topicWordIndicesInOrder = new int[numTopics][numTopWords]; // This is an array of hash sets that will hold the words-of-interest present in a document, // which will be cleared after every document. IntHashSet[] docTopicWordIndices = new IntHashSet[numTopics]; int numDocs = model.getData().size(); // The count of each topic, again cleared after every document. int[] topicCounts = new int[numTopics]; for (int topic = 0; topic < numTopics; topic++) { IntHashSet wordIndices = new IntHashSet(); for (int i = 0; i < numTopWords; i++) { if (topicTopWords[topic][i] != null) { int type = alphabet.lookupIndex(topicTopWords[topic][i]); topicWordIndicesInOrder[topic][i] = type; wordIndices.add(type); } } topicTopWordIndices[topic] = wordIndices; docTopicWordIndices[topic] = new IntHashSet(); } int doc = 0; for (TopicAssignment document : model.getData()) { FeatureSequence tokens = (FeatureSequence) document.instance.getData(); FeatureSequence topics = (FeatureSequence) document.topicSequence; for (int position = 0; position < tokens.size(); position++) { int type = tokens.getIndexAtPosition(position); int topic = topics.getIndexAtPosition(position); numTokens++; wordTypeCounts[type]++; topicCounts[topic]++; if (topicTopWordIndices[topic].contains(type)) { docTopicWordIndices[topic].add(type); } } int docLength = tokens.size(); if (docLength > 0) { int maxTopic = -1; int maxCount = -1; for (int topic = 0; topic < numTopics; topic++) { if (topicCounts[topic] > 0) { numNonZeroDocuments[topic]++; if (topicCounts[topic] > maxCount) { maxTopic = topic; maxCount = topicCounts[topic]; } sumCountTimesLogCount[topic] += topicCounts[topic] * Math.log(topicCounts[topic]); double proportion = (model.alpha[topic] + topicCounts[topic]) / (model.alphaSum + docLength); for (int i = 0; i < DEFAULT_DOC_PROPORTIONS.length; i++) { if (proportion < DEFAULT_DOC_PROPORTIONS[i]) { break; } numDocumentsAtProportions[topic][i]++; } IntHashSet supportedWords = docTopicWordIndices[topic]; int[] indices = topicWordIndicesInOrder[topic]; for (int i = 0; i < numTopWords; i++) { if (supportedWords.contains(indices[i])) { for (int j = i; j < numTopWords; j++) { if (i == j) { // Diagonals are total number of documents with word W in topic T topicCodocumentMatrices[topic][i][i]++; } else if (supportedWords.contains(indices[j])) { topicCodocumentMatrices[topic][i][j]++; topicCodocumentMatrices[topic][j][i]++; } } } } docTopicWordIndices[topic].clear(); topicCounts[topic] = 0; } } if (maxTopic > -1) { numRank1Documents[maxTopic]++; } } doc++; } }