/** This is (mostly) copied from CRF4.java */
public boolean[][] labelConnectionsIn(
    Alphabet outputAlphabet, InstanceList trainingSet, String start) {
  int numLabels = outputAlphabet.size();
  boolean[][] connections = new boolean[numLabels][numLabels];
  for (int i = 0; i < trainingSet.size(); i++) {
    Instance instance = trainingSet.getInstance(i);
    FeatureSequence output = (FeatureSequence) instance.getTarget();
    for (int j = 1; j < output.size(); j++) {
      int sourceIndex = outputAlphabet.lookupIndex(output.get(j - 1));
      int destIndex = outputAlphabet.lookupIndex(output.get(j));
      assert (sourceIndex >= 0 && destIndex >= 0);
      connections[sourceIndex][destIndex] = true;
    }
  }

  // Handle start state
  if (start != null) {
    int startIndex = outputAlphabet.lookupIndex(start);
    for (int j = 0; j < outputAlphabet.size(); j++) {
      connections[startIndex][j] = true;
    }
  }

  return connections;
}
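// Illustrative example (added, not in the original source): with outputAlphabet = {O, B, I}
// and a single training label sequence O B I I O, the observed bigrams mark
//
//   connections[O][B] = connections[B][I] = connections[I][I] = connections[I][O] = true;
//
// and, if start == "O", the start-state pass additionally sets the whole row
// connections[O][*] = true, so every label may follow the start state.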
public TopicScores getDistanceFromUniform() {
  int[] tokensPerTopic = model.tokensPerTopic;

  TopicScores scores = new TopicScores("uniform_dist", numTopics, numTopWords);
  scores.wordScoresDefined = true;

  int numTypes = alphabet.size();

  for (int topic = 0; topic < numTopics; topic++) {
    double topicScore = 0.0;
    int position = 0;
    TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

    for (IDSorter info : sortedWords) {
      int type = info.getID();
      double count = info.getWeight();

      double score =
          (count / tokensPerTopic[topic]) * Math.log((count * numTypes) / tokensPerTopic[topic]);

      if (position < numTopWords) {
        scores.setTopicWordScore(topic, position, score);
      }

      topicScore += score;
      position++;
    }

    scores.setTopicScore(topic, topicScore);
  }

  return scores;
}
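// Added note (not in the original source): the per-topic score computed above is
//
//   sum over words w with non-zero count:  p(w|t) * log( p(w|t) * V )
//
// where p(w|t) = count(w,t) / tokensPerTopic[t] and V = alphabet.size(), i.e. the
// KL divergence between the topic's word distribution and the uniform distribution 1/V.
// A minimal, self-contained sketch of the same quantity over a plain probability array
// (the method name and signature are illustrative, not part of this class):
static double distanceFromUniform(double[] wordProbabilities) {
  int numTypes = wordProbabilities.length;
  double kl = 0.0;
  for (double p : wordProbabilities) {
    if (p > 0.0) {
      kl += p * Math.log(p * numTypes); // p * log(p / (1/V))
    }
  }
  return kl;
}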
public void printState(PrintWriter pw) {
  Alphabet a = ilist.getDataAlphabet();
  pw.println("#doc pos typeindex type topic");
  for (int di = 0; di < topics.length; di++) {
    FeatureSequence fs = (FeatureSequence) ilist.get(di).getData();
    for (int si = 0; si < topics[di].length; si++) {
      int type = fs.getIndexAtPosition(si);
      pw.print(di);
      pw.print(' ');
      pw.print(si);
      pw.print(' ');
      pw.print(type);
      pw.print(' ');
      pw.print(a.lookupObject(type));
      pw.print(' ');
      pw.print(topics[di][si]);
      pw.println();
    }
  }
}
public void add(Object key) {
  int fi = dictionary.lookupIndex(key);
  if (fi >= 0) {
    add(fi);
  } else {
    // fi < 0 happens when the dictionary is frozen and key is not already in it.
    // xxx Should we raise an exception if the appending doesn't happen? "yes" -akm, added 1/2008
    throw new IllegalStateException(
        "Object cannot be added to FeatureSequence because its Alphabet is frozen.");
  }
}
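// Usage sketch (added, not in the original source): rough illustration of the frozen-alphabet
// behavior, assuming the FeatureSequence(Alphabet) constructor and Alphabet.stopGrowth()
// are available (both hedged as assumptions here; variable names are illustrative).
static void frozenAlphabetExample() {
  Alphabet dict = new Alphabet();
  dict.lookupIndex("known"); // populate the alphabet
  dict.stopGrowth();         // freeze it: unseen keys now look up to -1

  FeatureSequence fs = new FeatureSequence(dict);
  fs.add("known");   // fine: the index already exists
  fs.add("unknown"); // throws IllegalStateException, since the alphabet cannot grow
}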
public TopicModelDiagnostics(ParallelTopicModel model, int numTopWords) {
  numTopics = model.getNumTopics();
  this.numTopWords = numTopWords;
  this.model = model;

  alphabet = model.getAlphabet();
  topicSortedWords = model.getSortedWords();

  topicTopWords = new String[numTopics][numTopWords];

  numRank1Documents = new int[numTopics];
  numNonZeroDocuments = new int[numTopics];
  numDocumentsAtProportions = new int[numTopics][DEFAULT_DOC_PROPORTIONS.length];
  sumCountTimesLogCount = new double[numTopics];

  diagnostics = new ArrayList<TopicScores>();

  for (int topic = 0; topic < numTopics; topic++) {
    int position = 0;
    TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

    // How many words should we report? Some topics may have fewer than
    // the default number of words with non-zero weight.
    int limit = numTopWords;
    if (sortedWords.size() < numTopWords) {
      limit = sortedWords.size();
    }

    Iterator<IDSorter> iterator = sortedWords.iterator();
    for (int i = 0; i < limit; i++) {
      IDSorter info = iterator.next();
      topicTopWords[topic][i] = (String) alphabet.lookupObject(info.getID());
    }
  }

  collectDocumentStatistics();

  diagnostics.add(getTokensPerTopic(model.tokensPerTopic));
  diagnostics.add(getDocumentEntropy(model.tokensPerTopic));
  diagnostics.add(getWordLengthScores());
  diagnostics.add(getCoherence());
  diagnostics.add(getDistanceFromUniform());
  diagnostics.add(getDistanceFromCorpus());
  diagnostics.add(getEffectiveNumberOfWords());
  diagnostics.add(getTokenDocumentDiscrepancies());
  diagnostics.add(getRank1Percent());
  diagnostics.add(getDocumentPercentRatio(FIFTY_PERCENT_INDEX, TWO_PERCENT_INDEX));
  diagnostics.add(getDocumentPercent(5));
  diagnostics.add(getExclusivity());
}
public void printCounts() {
  Alphabet alphabet = instances.getDataAlphabet();

  NumberFormat nf = NumberFormat.getInstance();
  nf.setMinimumFractionDigits(0);
  nf.setMaximumFractionDigits(6);
  nf.setGroupingUsed(false);

  for (int feature = 0; feature < numFeatures; feature++) {
    Formatter formatter = new Formatter(new StringBuilder(), Locale.US);

    formatter.format(
        "%s\t%s\t%d",
        alphabet.lookupObject(feature).toString(),
        nf.format(featureCounts[feature]),
        documentFrequencies[feature]);

    System.out.println(formatter);
  }
}
public String toString() {
  StringBuffer sb = new StringBuffer();
  for (int fsi = 0; fsi < length; fsi++) {
    Object o = dictionary.lookupObject(features[fsi]);
    sb.append(fsi);
    sb.append(": ");
    sb.append(o.toString());
    sb.append(" (");
    sb.append(features[fsi]);
    sb.append(")\n");
  }
  return sb.toString();
}
/**
 * Remove features from the sequence that occur fewer than <code>cutoff</code> times in the
 * corpus, as indicated by the provided counts. Also swap in the new, reduced alphabet. This
 * method alters the instance in place; it is not appropriate if the original instance will be
 * needed.
 */
public void prune(double[] counts, Alphabet newAlphabet, int cutoff) {
  // The goal is to replace the sequence of features in place, by
  // creating a new array and then swapping it in.

  // First: figure out how long the new array will have to be
  int newLength = 0;
  for (int i = 0; i < length; i++) {
    if (counts[features[i]] >= cutoff) {
      newLength++;
    }
  }

  // Second: allocate a new features array
  int[] newFeatures = new int[newLength];

  // Third: fill the new array
  int newIndex = 0;
  for (int i = 0; i < length; i++) {
    if (counts[features[i]] >= cutoff) {
      Object feature = dictionary.lookupObject(features[i]);
      newFeatures[newIndex] = newAlphabet.lookupIndex(feature);
      newIndex++;
    }
  }

  // Fourth: swap out the arrays
  features = newFeatures;
  length = newLength;
  dictionary = newAlphabet;
}
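// Usage sketch (added, not in the original source): one plausible way to drive prune(...),
// assuming corpus-wide counts indexed by the old alphabet have already been gathered
// (e.g. by a feature-counting pass over the corpus). The helper name, and the way the
// reduced Alphabet is built here, are illustrative assumptions, not an API defined by
// this class.
static void pruneRareFeatures(FeatureSequence sequence, double[] corpusCounts, int cutoff) {
  Alphabet oldAlphabet = sequence.getAlphabet();
  // Keep only the feature types that meet the cutoff; lookupIndex adds each survivor
  // to the new, reduced alphabet in its original order.
  Alphabet newAlphabet = new Alphabet();
  for (int i = 0; i < oldAlphabet.size(); i++) {
    if (corpusCounts[i] >= cutoff) {
      newAlphabet.lookupIndex(oldAlphabet.lookupObject(i));
    }
  }
  // Rewrites this sequence in place against the reduced alphabet.
  sequence.prune(corpusCounts, newAlphabet, cutoff);
}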
public void printState(PrintWriter pw) {
  pw.println("#doc pos typeindex type bigrampossible? topic bigram");
  for (int di = 0; di < topics.length; di++) {
    FeatureSequenceWithBigrams fs = (FeatureSequenceWithBigrams) ilist.get(di).getData();
    for (int si = 0; si < topics[di].length; si++) {
      int type = fs.getIndexAtPosition(si);
      pw.print(di);
      pw.print(' ');
      pw.print(si);
      pw.print(' ');
      pw.print(type);
      pw.print(' ');
      pw.print(uniAlphabet.lookupObject(type));
      pw.print(' ');
      pw.print(fs.getBiIndexAtPosition(si) == -1 ? 0 : 1);
      pw.print(' ');
      pw.print(topics[di][si]);
      pw.print(' ');
      pw.print(grams[di][si]);
      pw.println();
    }
  }
}
public TopicScores getEffectiveNumberOfWords() {
  int[] tokensPerTopic = model.tokensPerTopic;

  TopicScores scores = new TopicScores("eff_num_words", numTopics, numTopWords);

  int numTypes = alphabet.size();

  for (int topic = 0; topic < numTopics; topic++) {
    double sumSquaredProbabilities = 0.0;
    TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

    for (IDSorter info : sortedWords) {
      int type = info.getID();
      double probability = info.getWeight() / tokensPerTopic[topic];
      sumSquaredProbabilities += probability * probability;
    }

    scores.setTopicScore(topic, 1.0 / sumSquaredProbabilities);
  }

  return scores;
}
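// Added note (not in the original source): the score above is 1 / sum_w p(w|t)^2, an
// inverse-Simpson "effective number of words": it equals the vocabulary size for a
// perfectly uniform topic and 1 for a topic concentrated on a single word. Minimal
// sketch over a plain probability array (name and signature are illustrative):
static double effectiveNumberOfWords(double[] wordProbabilities) {
  double sumSquared = 0.0;
  for (double p : wordProbabilities) {
    sumSquared += p * p;
  }
  return 1.0 / sumSquared;
}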
public void collectDocumentStatistics() {

  topicCodocumentMatrices = new int[numTopics][numTopWords][numTopWords];
  wordTypeCounts = new int[alphabet.size()];
  numTokens = 0;

  // This is an array of hash sets containing the words-of-interest for each topic,
  // used for checking if the word at some position is one of those words.
  IntHashSet[] topicTopWordIndices = new IntHashSet[numTopics];

  // The same as the topic top words, but with int indices instead of strings,
  // used for iterating over positions.
  int[][] topicWordIndicesInOrder = new int[numTopics][numTopWords];

  // This is an array of hash sets that will hold the words-of-interest present in a document,
  // which will be cleared after every document.
  IntHashSet[] docTopicWordIndices = new IntHashSet[numTopics];

  int numDocs = model.getData().size();

  // The count of each topic, again cleared after every document.
  int[] topicCounts = new int[numTopics];

  for (int topic = 0; topic < numTopics; topic++) {
    IntHashSet wordIndices = new IntHashSet();

    for (int i = 0; i < numTopWords; i++) {
      if (topicTopWords[topic][i] != null) {
        int type = alphabet.lookupIndex(topicTopWords[topic][i]);
        topicWordIndicesInOrder[topic][i] = type;
        wordIndices.add(type);
      }
    }

    topicTopWordIndices[topic] = wordIndices;
    docTopicWordIndices[topic] = new IntHashSet();
  }

  int doc = 0;

  for (TopicAssignment document : model.getData()) {

    FeatureSequence tokens = (FeatureSequence) document.instance.getData();
    FeatureSequence topics = (FeatureSequence) document.topicSequence;

    for (int position = 0; position < tokens.size(); position++) {
      int type = tokens.getIndexAtPosition(position);
      int topic = topics.getIndexAtPosition(position);

      numTokens++;
      wordTypeCounts[type]++;

      topicCounts[topic]++;

      if (topicTopWordIndices[topic].contains(type)) {
        docTopicWordIndices[topic].add(type);
      }
    }

    int docLength = tokens.size();

    if (docLength > 0) {
      int maxTopic = -1;
      int maxCount = -1;

      for (int topic = 0; topic < numTopics; topic++) {

        if (topicCounts[topic] > 0) {
          numNonZeroDocuments[topic]++;

          if (topicCounts[topic] > maxCount) {
            maxTopic = topic;
            maxCount = topicCounts[topic];
          }

          sumCountTimesLogCount[topic] += topicCounts[topic] * Math.log(topicCounts[topic]);

          double proportion =
              (model.alpha[topic] + topicCounts[topic]) / (model.alphaSum + docLength);
          for (int i = 0; i < DEFAULT_DOC_PROPORTIONS.length; i++) {
            if (proportion < DEFAULT_DOC_PROPORTIONS[i]) {
              break;
            }
            numDocumentsAtProportions[topic][i]++;
          }

          IntHashSet supportedWords = docTopicWordIndices[topic];
          int[] indices = topicWordIndicesInOrder[topic];

          for (int i = 0; i < numTopWords; i++) {
            if (supportedWords.contains(indices[i])) {
              for (int j = i; j < numTopWords; j++) {
                if (i == j) {
                  // Diagonals are total number of documents with word W in topic T
                  topicCodocumentMatrices[topic][i][i]++;
                } else if (supportedWords.contains(indices[j])) {
                  topicCodocumentMatrices[topic][i][j]++;
                  topicCodocumentMatrices[topic][j][i]++;
                }
              }
            }
          }

          docTopicWordIndices[topic].clear();
          topicCounts[topic] = 0;
        }
      }

      if (maxTopic > -1) {
        numRank1Documents[maxTopic]++;
      }
    }

    doc++;
  }
}
// xxx This method name seems a bit ambiguous?
public Object get(int pos) {
  return dictionary.lookupObject(features[pos]);
}
public void add(int featureIndex) {
  growIfNecessary();
  assert (featureIndex < dictionary.size());
  features[length++] = featureIndex;
}
public void estimate(
    InstanceList documents,
    int numIterations,
    int showTopicsInterval,
    int outputModelInterval,
    String outputModelFilename,
    Randoms r) {
  ilist = documents;
  uniAlphabet = ilist.getDataAlphabet();
  biAlphabet = ((FeatureSequenceWithBigrams) ilist.get(0).getData()).getBiAlphabet();
  numTypes = uniAlphabet.size();
  numBitypes = biAlphabet.size();
  int numDocs = ilist.size();
  topics = new int[numDocs][];
  grams = new int[numDocs][];
  docTopicCounts = new int[numDocs][numTopics];
  typeNgramTopicCounts = new int[numTypes][2][numTopics];
  unitypeTopicCounts = new int[numTypes][numTopics];
  bitypeTopicCounts = new int[numBitypes][numTopics];
  tokensPerTopic = new int[numTopics];
  bitokensPerTopic = new int[numTypes][numTopics];
  tAlpha = alpha * numTopics;
  vBeta = beta * numTypes;
  vGamma = gamma * numTypes;

  long startTime = System.currentTimeMillis();

  // Initialize with random assignments of tokens to topics
  // and finish allocating this.topics and this.tokens
  int topic, gram, seqLen, fi;
  for (int di = 0; di < numDocs; di++) {
    FeatureSequenceWithBigrams fs = (FeatureSequenceWithBigrams) ilist.get(di).getData();
    seqLen = fs.getLength();
    numTokens += seqLen;
    topics[di] = new int[seqLen];
    grams[di] = new int[seqLen];

    // Randomly assign tokens to topics
    int prevFi = -1, prevTopic = -1;
    for (int si = 0; si < seqLen; si++) {
      // randomly sample a topic for the word at position si
      topic = r.nextInt(numTopics);
      // if a bigram is allowed at position si, then sample a gram status for it.
      gram = (fs.getBiIndexAtPosition(si) == -1 ? 0 : r.nextInt(2));
      if (gram != 0) biTokens++;
      topics[di][si] = topic;
      grams[di][si] = gram;
      docTopicCounts[di][topic]++;
      fi = fs.getIndexAtPosition(si);
      if (prevFi != -1) typeNgramTopicCounts[prevFi][gram][prevTopic]++;
      if (gram == 0) {
        unitypeTopicCounts[fi][topic]++;
        tokensPerTopic[topic]++;
      } else {
        bitypeTopicCounts[fs.getBiIndexAtPosition(si)][topic]++;
        bitokensPerTopic[prevFi][topic]++;
      }
      prevFi = fi;
      prevTopic = topic;
    }
  }

  for (int iterations = 0; iterations < numIterations; iterations++) {
    sampleTopicsForAllDocs(r);
    if (iterations % 10 == 0) System.out.print(iterations);
    else System.out.print(".");
    System.out.flush();
    if (showTopicsInterval != 0 && iterations % showTopicsInterval == 0 && iterations > 0) {
      System.out.println();
      printTopWords(5, false);
    }
    if (outputModelInterval != 0 && iterations % outputModelInterval == 0 && iterations > 0) {
      this.write(new File(outputModelFilename + '.' + iterations));
    }
  }

  System.out.println(
      "\nTotal time (sec): " + ((System.currentTimeMillis() - startTime) / 1000.0));
}
public void printTopWords(int numWords, boolean useNewLines) {
  class WordProb implements Comparable {
    int wi;
    double p;

    public WordProb(int wi, double p) {
      this.wi = wi;
      this.p = p;
    }

    public final int compareTo(Object o2) {
      if (p > ((WordProb) o2).p) return -1;
      else if (p == ((WordProb) o2).p) return 0;
      else return 1;
    }
  }

  for (int ti = 0; ti < numTopics; ti++) {
    // Unigrams
    WordProb[] wp = new WordProb[numTypes];
    for (int wi = 0; wi < numTypes; wi++)
      wp[wi] = new WordProb(wi, (double) unitypeTopicCounts[wi][ti]);
    Arrays.sort(wp);
    int numToPrint = Math.min(wp.length, numWords);
    if (useNewLines) {
      System.out.println("\nTopic " + ti + " unigrams");
      for (int i = 0; i < numToPrint; i++)
        System.out.println(
            uniAlphabet.lookupObject(wp[i].wi).toString() + " " + wp[i].p / tokensPerTopic[ti]);
    } else {
      System.out.print("Topic " + ti + ": ");
      for (int i = 0; i < numToPrint; i++)
        System.out.print(uniAlphabet.lookupObject(wp[i].wi).toString() + " ");
    }

    // Bigrams
    /*
    wp = new WordProb[numBitypes];
    int bisum = 0;
    for (int wi = 0; wi < numBitypes; wi++) {
      wp[wi] = new WordProb(wi, ((double) bitypeTopicCounts[wi][ti]));
      bisum += bitypeTopicCounts[wi][ti];
    }
    Arrays.sort(wp);
    numToPrint = Math.min(wp.length, numWords);
    if (useNewLines) {
      System.out.println("\nTopic " + ti + " bigrams");
      for (int i = 0; i < numToPrint; i++)
        System.out.println(biAlphabet.lookupObject(wp[i].wi).toString() + " " + wp[i].p / bisum);
    } else {
      System.out.print(" ");
      for (int i = 0; i < numToPrint; i++)
        System.out.print(biAlphabet.lookupObject(wp[i].wi).toString() + " ");
      System.out.println();
    }
    */

    // Ngrams
    AugmentableFeatureVector afv = new AugmentableFeatureVector(new Alphabet(), 10000, false);
    for (int di = 0; di < topics.length; di++) {
      FeatureSequenceWithBigrams fs = (FeatureSequenceWithBigrams) ilist.get(di).getData();
      for (int si = topics[di].length - 1; si >= 0; si--) {
        if (topics[di][si] == ti && grams[di][si] == 1) {
          String gramString = uniAlphabet.lookupObject(fs.getIndexAtPosition(si)).toString();
          while (grams[di][si] == 1 && --si >= 0)
            gramString =
                uniAlphabet.lookupObject(fs.getIndexAtPosition(si)).toString() + "_" + gramString;
          afv.add(gramString, 1.0);
        }
      }
    }
    // System.out.println ("pre-sorting");
    int numNgrams = afv.numLocations();
    // System.out.println ("post-sorting "+numNgrams);
    wp = new WordProb[numNgrams];
    int ngramSum = 0;
    for (int loc = 0; loc < numNgrams; loc++) {
      wp[loc] = new WordProb(afv.indexAtLocation(loc), afv.valueAtLocation(loc));
      ngramSum += wp[loc].p;
    }
    Arrays.sort(wp);

    int numUnitypeTokens = 0, numBitypeTokens = 0, numUnitypeTypes = 0, numBitypeTypes = 0;
    for (int fi = 0; fi < numTypes; fi++) {
      numUnitypeTokens += unitypeTopicCounts[fi][ti];
      if (unitypeTopicCounts[fi][ti] != 0) numUnitypeTypes++;
    }
    for (int fi = 0; fi < numBitypes; fi++) {
      numBitypeTokens += bitypeTopicCounts[fi][ti];
      if (bitypeTopicCounts[fi][ti] != 0) numBitypeTypes++;
    }

    if (useNewLines) {
      System.out.println(
          "\nTopic " + ti
              + " unigrams " + numUnitypeTokens + "/" + numUnitypeTypes
              + " bigrams " + numBitypeTokens + "/" + numBitypeTypes
              + " phrases " + Math.round(afv.oneNorm()) + "/" + numNgrams);
      for (int i = 0; i < Math.min(numNgrams, numWords); i++)
        System.out.println(
            afv.getAlphabet().lookupObject(wp[i].wi).toString() + " " + wp[i].p / ngramSum);
    } else {
      System.out.print(
          " (unigrams " + numUnitypeTokens + "/" + numUnitypeTypes
              + " bigrams " + numBitypeTokens + "/" + numBitypeTypes
              + " phrases " + Math.round(afv.oneNorm()) + "/" + numNgrams + ")\n ");
      // System.out.print (" (unique-ngrams=" + numNgrams
      //     + " ngram-count=" + Math.round(afv.oneNorm()) + ")\n ");
      for (int i = 0; i < Math.min(numNgrams, numWords); i++)
        System.out.print(afv.getAlphabet().lookupObject(wp[i].wi).toString() + " ");
      System.out.println();
    }
  }
}