Exemplo n.º 1
0
  /**
   * Fits the model to {@code documents} by random initialization followed by
   * {@code numIterations} calls to {@code sampleTopicsForAllDocs}.
   *
   * <p>All assignment arrays, count tables and the {@code numTokens}/{@code biTokens} totals are
   * rebuilt from scratch on every call, so the method may be invoked more than once on the same
   * object without carrying counts over from a previous run.
   *
   * <p>Progress is written to {@code System.out}: the iteration number every tenth iteration and
   * a dot otherwise, followed by the total elapsed time in seconds.
   *
   * @param documents instances whose data are cast to {@code FeatureSequenceWithBigrams}; must
   *     be non-empty because the bigram alphabet is read from the first instance
   * @param numIterations number of sampling sweeps to run
   * @param showTopicsInterval if non-zero, {@code printTopWords(5, false)} is called on every
   *     iteration that is a positive multiple of this value
   * @param outputModelInterval if non-zero, the model is written on every iteration that is a
   *     positive multiple of this value
   * @param outputModelFilename prefix of the written file; {@code '.'} and the iteration number
   *     are appended
   * @param r random source used for the initial assignments and passed on to the sampler
   */
  public void estimate(
      InstanceList documents,
      int numIterations,
      int showTopicsInterval,
      int outputModelInterval,
      String outputModelFilename,
      Randoms r) {
    ilist = documents;
    uniAlphabet = ilist.getDataAlphabet();
    biAlphabet = ((FeatureSequenceWithBigrams) ilist.get(0).getData()).getBiAlphabet();
    numTypes = uniAlphabet.size();
    numBitypes = biAlphabet.size();
    int numDocs = ilist.size();
    topics = new int[numDocs][];
    grams = new int[numDocs][];
    docTopicCounts = new int[numDocs][numTopics];
    typeNgramTopicCounts = new int[numTypes][2][numTopics];
    unitypeTopicCounts = new int[numTypes][numTopics];
    bitypeTopicCounts = new int[numBitypes][numTopics];
    tokensPerTopic = new int[numTopics];
    bitokensPerTopic = new int[numTypes][numTopics];
    tAlpha = alpha * numTopics;
    vBeta = beta * numTypes;
    vGamma = gamma * numTypes;
    // Every count table above starts from zero again, so the running totals must restart too;
    // otherwise a repeated call would add this corpus on top of the previous run's totals.
    numTokens = 0;
    biTokens = 0;

    long startTime = System.currentTimeMillis();

    // Initialize with random assignments of tokens to topics
    // and finish allocating this.topics and this.grams
    for (int di = 0; di < numDocs; di++) {
      FeatureSequenceWithBigrams fs = (FeatureSequenceWithBigrams) ilist.get(di).getData();
      int seqLen = fs.getLength();
      numTokens += seqLen;
      topics[di] = new int[seqLen];
      grams[di] = new int[seqLen];
      // Type and topic of the preceding position; -1 means "no preceding position yet".
      int prevFi = -1;
      int prevTopic = -1;
      for (int si = 0; si < seqLen; si++) {
        // randomly sample a topic for the word at position si
        int topic = r.nextInt(numTopics);
        // if a bigram is allowed at position si, then sample a gram status for it.
        int biIndex = fs.getBiIndexAtPosition(si);
        int gram = (biIndex == -1) ? 0 : r.nextInt(2);
        if (gram != 0) {
          biTokens++;
        }
        topics[di][si] = topic;
        grams[di][si] = gram;
        docTopicCounts[di][topic]++;
        int fi = fs.getIndexAtPosition(si);
        // The gram status of this position is counted against the previous word and its topic.
        if (prevFi != -1) {
          typeNgramTopicCounts[prevFi][gram][prevTopic]++;
        }
        if (gram == 0) {
          unitypeTopicCounts[fi][topic]++;
          tokensPerTopic[topic]++;
        } else {
          bitypeTopicCounts[biIndex][topic]++;
          bitokensPerTopic[prevFi][topic]++;
        }
        prevFi = fi;
        prevTopic = topic;
      }
    }

    for (int iterations = 0; iterations < numIterations; iterations++) {
      sampleTopicsForAllDocs(r);
      if (iterations % 10 == 0) {
        System.out.print(iterations);
      } else {
        System.out.print(".");
      }
      System.out.flush();
      if (showTopicsInterval != 0 && iterations % showTopicsInterval == 0 && iterations > 0) {
        System.out.println();
        printTopWords(5, false);
      }
      if (outputModelInterval != 0 && iterations % outputModelInterval == 0 && iterations > 0) {
        this.write(new File(outputModelFilename + '.' + iterations));
      }
    }

    System.out.println(
        "\nTotal time (sec): " + ((System.currentTimeMillis() - startTime) / 1000.0));
  }
Exemplo n.º 2
0
 /**
  * Makes one sweep over a single document, resampling the topic of every position and, where
  * the position has a bigram index, its gram status as well. The per-document arrays and the
  * shared count tables are updated in place.
  *
  * @param oneDocTokens the document's token sequence
  * @param oneDocTopics current topic of each position; overwritten with the new samples
  * @param oneDocGrams current gram status (0 or 1) of each position; overwritten where a bigram
  *     is possible
  * @param oneDocTopicCounts this document's counts, indexed by topic
  * @param uniTopicWeights scratch buffer for topic-only sampling, length == numTopics
  * @param biTopicWeights scratch buffer for joint topic/gram sampling, length == numTopics*2,
  *     laid out as [topic*2 + gram]
  * @param r random source used to draw from the weight buffers
  */
 private void sampleTopicsForOneDoc(
     FeatureSequenceWithBigrams oneDocTokens,
     int[] oneDocTopics,
     int[] oneDocGrams,
     int[] oneDocTopicCounts,
     double[] uniTopicWeights,
     double[] biTopicWeights,
     Randoms r) {
   final int docLen = oneDocTokens.getLength();

   for (int pos = 0; pos < docLen; pos++) {
     final int wordType = oneDocTokens.getIndexAtPosition(pos);
     final int bigramType = oneDocTokens.getBiIndexAtPosition(pos);
     final int oldTopic = oneDocTopics[pos];
     final int oldGram = oneDocGrams[pos];
     final boolean hasNext = (pos != docLen - 1);
     // Gram status of the following position; it is counted against this word and its topic.
     final int nextGram = hasNext ? oneDocGrams[pos + 1] : -1;

     // A position without a bigram index can never carry gram status 1.
     assert (bigramType != -1 || oldGram != 1);

     if (bigramType == -1) {
       // ---- No bigram possible here: only the topic is resampled. ----

       // Withdraw the current assignment from every table it contributes to.
       oneDocTopicCounts[oldTopic]--;
       tokensPerTopic[oldTopic]--;
       unitypeTopicCounts[wordType][oldTopic]--;
       if (hasNext) {
         typeNgramTopicCounts[wordType][nextGram][oldTopic]--;
         assert (typeNgramTopicCounts[wordType][nextGram][oldTopic] >= 0);
       }
       assert (oneDocTopicCounts[oldTopic] >= 0);
       assert (tokensPerTopic[oldTopic] >= 0);
       assert (unitypeTopicCounts[wordType][oldTopic] >= 0);

       // Unnormalized weight of each topic for this token.
       Arrays.fill(uniTopicWeights, 0.0);
       final int[] wordTopicCounts = unitypeTopicCounts[wordType];
       double totalWeight = 0;
       for (int t = 0; t < numTopics; t++) {
         final double weight =
             ((wordTopicCounts[t] + beta) / (tokensPerTopic[t] + vBeta))
                 * ((oneDocTopicCounts[t] + alpha));
         totalWeight += weight;
         uniTopicWeights[t] = weight;
       }

       // Draw the replacement topic and record it.
       final int sampledTopic = r.nextDiscrete(uniTopicWeights, totalWeight);
       oneDocTopics[pos] = sampledTopic;
       oneDocTopicCounts[sampledTopic]++;
       unitypeTopicCounts[wordType][sampledTopic]++;
       tokensPerTopic[sampledTopic]++;
       if (hasNext) {
         typeNgramTopicCounts[wordType][nextGram][sampledTopic]++;
       }
       continue;
     }

     // ---- Bigram possible: topic and gram status are resampled jointly. ----
     final int prevType = oneDocTokens.getIndexAtPosition(pos - 1);
     final int prevTopic = oneDocTopics[pos - 1];

     // Withdraw the current assignment from every table it contributes to.
     oneDocTopicCounts[oldTopic]--;
     typeNgramTopicCounts[prevType][oldGram][prevTopic]--;
     if (hasNext) {
       typeNgramTopicCounts[wordType][nextGram][oldTopic]--;
     }
     if (oldGram != 0) {
       bitypeTopicCounts[bigramType][oldTopic]--;
       bitokensPerTopic[prevType][oldTopic]--;
       biTokens--;
     } else {
       unitypeTopicCounts[wordType][oldTopic]--;
       tokensPerTopic[oldTopic]--;
     }
     assert (oneDocTopicCounts[oldTopic] >= 0);
     assert (typeNgramTopicCounts[prevType][oldGram][prevTopic] >= 0);
     assert (!hasNext || typeNgramTopicCounts[wordType][nextGram][oldTopic] >= 0);
     assert (unitypeTopicCounts[wordType][oldTopic] >= 0);
     assert (tokensPerTopic[oldTopic] >= 0);
     assert (bitypeTopicCounts[bigramType][oldTopic] >= 0);
     assert (bitokensPerTopic[prevType][oldTopic] >= 0);
     assert (biTokens >= 0);

     // Unnormalized weight of every (topic, gram) pair, stored at index topic*2 + gram.
     Arrays.fill(biTopicWeights, 0.0);
     final int[] wordTopicCounts = unitypeTopicCounts[wordType];
     final int[] bigramTopicCounts = bitypeTopicCounts[bigramType];
     final int[] prevWordBitokens = bitokensPerTopic[prevType];
     // These two counts do not depend on the candidate topic, so read them once.
     final int prevAsUnigramCount = typeNgramTopicCounts[prevType][0][prevTopic];
     final int prevAsBigramCount = typeNgramTopicCounts[prevType][1][prevTopic];
     double totalWeight = 0;
     for (int t = 0; t < numTopics; t++) {
       final int slot = t << 1;

       // gram == 0: the word is generated on its own.
       final double unigramWeight =
           (wordTopicCounts[t] + beta)
               / (tokensPerTopic[t] + vBeta)
               * (oneDocTopicCounts[t] + alpha)
               * (prevAsUnigramCount + delta1);
       totalWeight += unigramWeight;
       biTopicWeights[slot] = unigramWeight;

       // gram == 1: the word is generated together with the previous word.
       final double bigramWeight =
           (bigramTopicCounts[t] + gamma)
               / (prevWordBitokens[t] + vGamma)
               * (oneDocTopicCounts[t] + alpha)
               * (prevAsBigramCount + delta2);
       totalWeight += bigramWeight;
       biTopicWeights[slot + 1] = bigramWeight;
     }

     // Draw one slot and split it back into its gram status and topic.
     final int drawn = r.nextDiscrete(biTopicWeights, totalWeight);
     final int sampledGram = drawn % 2;
     final int sampledTopic = drawn / 2;

     // Record the new assignment.
     oneDocTopics[pos] = sampledTopic;
     oneDocGrams[pos] = sampledGram;
     oneDocTopicCounts[sampledTopic]++;
     typeNgramTopicCounts[prevType][sampledGram][prevTopic]++;
     if (hasNext) {
       typeNgramTopicCounts[wordType][nextGram][sampledTopic]++;
     }
     if (sampledGram != 0) {
       bitypeTopicCounts[bigramType][sampledTopic]++;
       bitokensPerTopic[prevType][sampledTopic]++;
       biTokens++;
     } else {
       unitypeTopicCounts[wordType][sampledTopic]++;
       tokensPerTopic[sampledTopic]++;
     }
   }
 }