public void estimate (InstanceList documents, int numIterations, int showTopicsInterval,
                      int outputModelInterval, String outputModelFilename, Randoms r)
{
    ilist = documents;
    uniAlphabet = ilist.getDataAlphabet();
    biAlphabet = ((FeatureSequenceWithBigrams) ilist.get(0).getData()).getBiAlphabet();
    numTypes = uniAlphabet.size();
    numBitypes = biAlphabet.size();
    int numDocs = ilist.size();
    topics = new int[numDocs][];
    grams = new int[numDocs][];
    docTopicCounts = new int[numDocs][numTopics];
    typeNgramTopicCounts = new int[numTypes][2][numTopics];
    unitypeTopicCounts = new int[numTypes][numTopics];
    bitypeTopicCounts = new int[numBitypes][numTopics];
    tokensPerTopic = new int[numTopics];
    bitokensPerTopic = new int[numTypes][numTopics];
    tAlpha = alpha * numTopics;
    vBeta = beta * numTypes;
    vGamma = gamma * numTypes;

    long startTime = System.currentTimeMillis();

    // Initialize with random assignments of tokens to topics
    // and finish allocating this.topics and this.grams
    int topic, gram, seqLen, fi;
    for (int di = 0; di < numDocs; di++) {
        FeatureSequenceWithBigrams fs = (FeatureSequenceWithBigrams) ilist.get(di).getData();
        seqLen = fs.getLength();
        numTokens += seqLen;
        topics[di] = new int[seqLen];
        grams[di] = new int[seqLen];
        // Randomly assign tokens to topics
        int prevFi = -1, prevTopic = -1;
        for (int si = 0; si < seqLen; si++) {
            // Randomly sample a topic for the word at position si
            topic = r.nextInt(numTopics);
            // If a bigram is allowed at position si, also sample a gram status for it
            gram = (fs.getBiIndexAtPosition(si) == -1 ? 0 : r.nextInt(2));
            if (gram != 0) biTokens++;
            topics[di][si] = topic;
            grams[di][si] = gram;
            docTopicCounts[di][topic]++;
            fi = fs.getIndexAtPosition(si);
            if (prevFi != -1)
                typeNgramTopicCounts[prevFi][gram][prevTopic]++;
            if (gram == 0) {
                unitypeTopicCounts[fi][topic]++;
                tokensPerTopic[topic]++;
            } else {
                bitypeTopicCounts[fs.getBiIndexAtPosition(si)][topic]++;
                bitokensPerTopic[prevFi][topic]++;
            }
            prevFi = fi;
            prevTopic = topic;
        }
    }

    for (int iterations = 0; iterations < numIterations; iterations++) {
        sampleTopicsForAllDocs (r);
        if (iterations % 10 == 0) System.out.print (iterations);
        else System.out.print (".");
        System.out.flush();
        if (showTopicsInterval != 0 && iterations % showTopicsInterval == 0 && iterations > 0) {
            System.out.println ();
            printTopWords (5, false);
        }
        if (outputModelInterval != 0 && iterations % outputModelInterval == 0 && iterations > 0) {
            this.write (new File(outputModelFilename + '.' + iterations));
        }
    }

    System.out.println ("\nTotal time (sec): "
                        + ((System.currentTimeMillis() - startTime) / 1000.0));
}
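// estimate() above calls sampleTopicsForAllDocs(), which is not included in this excerpt.
// The following is a minimal sketch of such a dispatcher, assuming it only allocates the
// reusable weight buffers and delegates to sampleTopicsForOneDoc() (below) for each
// document; the actual method in the source may differ.
private void sampleTopicsForAllDocs (Randoms r)
{
    double[] uniTopicWeights = new double[numTopics];
    double[] biTopicWeights = new double[numTopics * 2];
    // Resample the topic (and gram status) of every token in every document
    for (int di = 0; di < topics.length; di++) {
        sampleTopicsForOneDoc ((FeatureSequenceWithBigrams) ilist.get(di).getData(),
                               topics[di], grams[di], docTopicCounts[di],
                               uniTopicWeights, biTopicWeights, r);
    }
}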
private void sampleTopicsForOneDoc (FeatureSequenceWithBigrams oneDocTokens,
                                    int[] oneDocTopics, int[] oneDocGrams,
                                    int[] oneDocTopicCounts, // indexed by topic index
                                    double[] uniTopicWeights, // length == numTopics
                                    double[] biTopicWeights,  // length == numTopics*2: joint topic/gram sampling
                                    Randoms r)
{
    int[] currentTypeTopicCounts;
    int[] currentBitypeTopicCounts;
    int[] previousBitokensPerTopic;
    int type, bitype, oldGram, nextGram, newGram, oldTopic, newTopic;
    double topicWeightsSum, tw;
    int docLen = oneDocTokens.getLength();

    // Iterate over the positions (words) in the document
    for (int si = 0; si < docLen; si++) {
        type = oneDocTokens.getIndexAtPosition(si);
        bitype = oneDocTokens.getBiIndexAtPosition(si);
        oldTopic = oneDocTopics[si];
        oldGram = oneDocGrams[si];
        nextGram = (si == docLen-1) ? -1 : oneDocGrams[si+1];
        boolean bigramPossible = (bitype != -1);
        assert (!(!bigramPossible && oldGram == 1));

        if (!bigramPossible) {

            // Remove this token from all counts
            oneDocTopicCounts[oldTopic]--;
            tokensPerTopic[oldTopic]--;
            unitypeTopicCounts[type][oldTopic]--;
            if (si != docLen-1) {
                typeNgramTopicCounts[type][nextGram][oldTopic]--;
                assert (typeNgramTopicCounts[type][nextGram][oldTopic] >= 0);
            }
            assert (oneDocTopicCounts[oldTopic] >= 0);
            assert (tokensPerTopic[oldTopic] >= 0);
            assert (unitypeTopicCounts[type][oldTopic] >= 0);

            // Build a distribution over topics for this token
            Arrays.fill (uniTopicWeights, 0.0);
            topicWeightsSum = 0;
            currentTypeTopicCounts = unitypeTopicCounts[type];
            for (int ti = 0; ti < numTopics; ti++) {
                tw = ((currentTypeTopicCounts[ti] + beta) / (tokensPerTopic[ti] + vBeta))
                    * ((oneDocTopicCounts[ti] + alpha)); // remaining term is constant across all topics
                topicWeightsSum += tw;
                uniTopicWeights[ti] = tw;
            }
            // Sample a topic assignment from this distribution
            newTopic = r.nextDiscrete (uniTopicWeights, topicWeightsSum);

            // Put that new topic into the counts
            oneDocTopics[si] = newTopic;
            oneDocTopicCounts[newTopic]++;
            unitypeTopicCounts[type][newTopic]++;
            tokensPerTopic[newTopic]++;
            if (si != docLen-1)
                typeNgramTopicCounts[type][nextGram][newTopic]++;

        } else {
            // Bigram is possible
            int prevType = oneDocTokens.getIndexAtPosition(si-1);
            int prevTopic = oneDocTopics[si-1];

            // Remove this token from all counts
            oneDocTopicCounts[oldTopic]--;
            typeNgramTopicCounts[prevType][oldGram][prevTopic]--;
            if (si != docLen-1)
                typeNgramTopicCounts[type][nextGram][oldTopic]--;
            if (oldGram == 0) {
                unitypeTopicCounts[type][oldTopic]--;
                tokensPerTopic[oldTopic]--;
            } else {
                bitypeTopicCounts[bitype][oldTopic]--;
                bitokensPerTopic[prevType][oldTopic]--;
                biTokens--;
            }
            assert (oneDocTopicCounts[oldTopic] >= 0);
            assert (typeNgramTopicCounts[prevType][oldGram][prevTopic] >= 0);
            assert (si == docLen-1 || typeNgramTopicCounts[type][nextGram][oldTopic] >= 0);
            assert (unitypeTopicCounts[type][oldTopic] >= 0);
            assert (tokensPerTopic[oldTopic] >= 0);
            assert (bitypeTopicCounts[bitype][oldTopic] >= 0);
            assert (bitokensPerTopic[prevType][oldTopic] >= 0);
            assert (biTokens >= 0);

            // Build a joint distribution over topics and ngram-status for this token
            Arrays.fill (biTopicWeights, 0.0);
            topicWeightsSum = 0;
            currentTypeTopicCounts = unitypeTopicCounts[type];
            currentBitypeTopicCounts = bitypeTopicCounts[bitype];
            previousBitokensPerTopic = bitokensPerTopic[prevType];
            for (int ti = 0; ti < numTopics; ti++) {
                newTopic = ti << 1; // using newTopic as an index into [ti*2 + gram]
                // The unigram outcome
                tw = (currentTypeTopicCounts[ti] + beta) / (tokensPerTopic[ti] + vBeta)
                    * (oneDocTopicCounts[ti] + alpha)
                    * (typeNgramTopicCounts[prevType][0][prevTopic] + delta1);
                topicWeightsSum += tw;
                biTopicWeights[newTopic] = tw;
                // The bigram outcome
                newTopic++;
                tw = (currentBitypeTopicCounts[ti] + gamma) / (previousBitokensPerTopic[ti] + vGamma)
                    * (oneDocTopicCounts[ti] + alpha)
                    * (typeNgramTopicCounts[prevType][1][prevTopic] + delta2);
                topicWeightsSum += tw;
                biTopicWeights[newTopic] = tw;
            }
            // Sample a joint topic/gram assignment from this distribution
            newTopic = r.nextDiscrete (biTopicWeights, topicWeightsSum);
            newGram = newTopic % 2;
            newTopic /= 2;

            // Put that new topic into the counts
            oneDocTopics[si] = newTopic;
            oneDocGrams[si] = newGram;
            oneDocTopicCounts[newTopic]++;
            typeNgramTopicCounts[prevType][newGram][prevTopic]++;
            if (si != docLen-1)
                typeNgramTopicCounts[type][nextGram][newTopic]++;
            if (newGram == 0) {
                unitypeTopicCounts[type][newTopic]++;
                tokensPerTopic[newTopic]++;
            } else {
                bitypeTopicCounts[bitype][newTopic]++;
                bitokensPerTopic[prevType][newTopic]++;
                biTokens++;
            }
        }
    }
}
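// A minimal usage sketch for estimate(), assuming the enclosing class is MALLET's
// TopicalNGrams with a constructor taking the number of topics, and that the
// serialized InstanceList carries FeatureSequenceWithBigrams data. The file name
// and parameter values below are illustrative only.
public static void main (String[] args) throws Exception
{
    InstanceList documents = InstanceList.load (new File (args[0])); // serialized InstanceList
    TopicalNGrams tng = new TopicalNGrams (10);                      // 10 topics (assumed constructor)
    // 200 Gibbs iterations, print top words every 50 iterations, no intermediate model files
    tng.estimate (documents, 200, 50, 0, null, new Randoms());
    tng.printTopWords (10, true);
}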