Example #1
 public void printState(PrintWriter pw) {
   pw.println("#doc pos typeindex type bigrampossible? topic bigram");
   for (int di = 0; di < topics.length; di++) {
     FeatureSequenceWithBigrams fs = (FeatureSequenceWithBigrams) ilist.get(di).getData();
     for (int si = 0; si < topics[di].length; si++) {
       int type = fs.getIndexAtPosition(si);
       pw.print(di);
       pw.print(' ');
       pw.print(si);
       pw.print(' ');
       pw.print(type);
       pw.print(' ');
       pw.print(uniAlphabet.lookupObject(type));
       pw.print(' ');
       pw.print(fs.getBiIndexAtPosition(si) == -1 ? 0 : 1);
       pw.print(' ');
       pw.print(topics[di][si]);
       pw.print(' ');
       pw.print(grams[di][si]);
       pw.println();
     }
   }
 }
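A minimal usage sketch for printState, writing the per-token state to a file after training; the model variable name is illustrative and only standard java.io classes are used:

 // Sketch: dump one line per token (doc, pos, type index, word, bigram-possible, topic, gram).
 try (PrintWriter pw = new PrintWriter(new java.io.File("ngrams-state.txt"))) {
   model.printState(pw);
 } catch (java.io.FileNotFoundException e) {
   e.printStackTrace();
 }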
Example #2
  public void estimate(
      InstanceList documents,
      int numIterations,
      int showTopicsInterval,
      int outputModelInterval,
      String outputModelFilename,
      Randoms r) {
    ilist = documents;
    uniAlphabet = ilist.getDataAlphabet();
    biAlphabet = ((FeatureSequenceWithBigrams) ilist.get(0).getData()).getBiAlphabet();
    numTypes = uniAlphabet.size();
    numBitypes = biAlphabet.size();
    int numDocs = ilist.size();
    topics = new int[numDocs][];
    grams = new int[numDocs][];
    docTopicCounts = new int[numDocs][numTopics];
    typeNgramTopicCounts = new int[numTypes][2][numTopics];
    unitypeTopicCounts = new int[numTypes][numTopics];
    bitypeTopicCounts = new int[numBitypes][numTopics];
    tokensPerTopic = new int[numTopics];
    bitokensPerTopic = new int[numTypes][numTopics];
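     // Index conventions for the count arrays above, as derived from the increments below:
     //   docTopicCounts[doc][topic]        tokens in doc assigned to topic
     //   typeNgramTopicCounts[prevType][gram][prevTopic]
     //                                     times a token of type prevType with topic prevTopic
     //                                     is followed by a token whose gram status is gram
     //                                     (0 = unigram, 1 = bigram)
     //   unitypeTopicCounts[type][topic]   unigram-status tokens of this word type per topic
     //   bitypeTopicCounts[bitype][topic]  bigram-status tokens of this bigram type per topic
     //   tokensPerTopic[topic]             unigram-status tokens per topic
     //   bitokensPerTopic[prevType][topic] bigram-status tokens per previous word type and topic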
    tAlpha = alpha * numTopics;
    vBeta = beta * numTypes;
    vGamma = gamma * numTypes;

    long startTime = System.currentTimeMillis();

    // Initialize with random assignments of tokens to topics
    // and finish allocating this.topics and this.grams
    int topic, gram, seqLen, fi;
    for (int di = 0; di < numDocs; di++) {
      FeatureSequenceWithBigrams fs = (FeatureSequenceWithBigrams) ilist.get(di).getData();
      seqLen = fs.getLength();
      numTokens += seqLen;
      topics[di] = new int[seqLen];
      grams[di] = new int[seqLen];
      // Randomly assign tokens to topics
      int prevFi = -1, prevTopic = -1;
      for (int si = 0; si < seqLen; si++) {
        // randomly sample a topic for the word at position si
        topic = r.nextInt(numTopics);
        // if a bigram is allowed at position si, then sample a gram status for it.
        gram = (fs.getBiIndexAtPosition(si) == -1 ? 0 : r.nextInt(2));
        if (gram != 0) biTokens++;
        topics[di][si] = topic;
        grams[di][si] = gram;
        docTopicCounts[di][topic]++;
        fi = fs.getIndexAtPosition(si);
        if (prevFi != -1) typeNgramTopicCounts[prevFi][gram][prevTopic]++;
        if (gram == 0) {
          unitypeTopicCounts[fi][topic]++;
          tokensPerTopic[topic]++;
        } else {
          bitypeTopicCounts[fs.getBiIndexAtPosition(si)][topic]++;
          bitokensPerTopic[prevFi][topic]++;
        }
        prevFi = fi;
        prevTopic = topic;
      }
    }

    for (int iterations = 0; iterations < numIterations; iterations++) {
      sampleTopicsForAllDocs(r);
      if (iterations % 10 == 0) System.out.print(iterations);
      else System.out.print(".");
      System.out.flush();
      if (showTopicsInterval != 0 && iterations % showTopicsInterval == 0 && iterations > 0) {
        System.out.println();
        printTopWords(5, false);
      }
      if (outputModelInterval != 0 && iterations % outputModelInterval == 0 && iterations > 0) {
        this.write(new File(outputModelFilename + '.' + iterations));
      }
    }

    System.out.println(
        "\nTotal time (sec): " + ((System.currentTimeMillis() - startTime) / 1000.0));
  }
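A minimal sketch of driving estimate, assuming these excerpts belong to a topical n-grams model class in the style of MALLET's cc.mallet.topics.TopicalNGrams; the class name, its single-int constructor, and the Randoms class/package are assumptions, and `documents` must be an InstanceList whose instances carry FeatureSequenceWithBigrams data:

  // Sketch only; names outside the excerpt above are assumptions.
  TopicalNGrams model = new TopicalNGrams(50);   // assumed numTopics constructor
  model.estimate(documents,
                 1000,                           // Gibbs sampling iterations
                 50,                             // show top words every 50 iterations
                 0,                              // 0 = never write intermediate models
                 null,                           // filename unused when the interval is 0
                 new cc.mallet.util.Randoms());  // assumed MALLET RNG class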
Example #3
  public void printTopWords(int numWords, boolean useNewLines) {
    class WordProb implements Comparable {
      int wi;
      double p;

      public WordProb(int wi, double p) {
        this.wi = wi;
        this.p = p;
      }

      public final int compareTo(Object o2) {
        if (p > ((WordProb) o2).p) return -1;
        else if (p == ((WordProb) o2).p) return 0;
        else return 1;
      }
    }

    for (int ti = 0; ti < numTopics; ti++) {
      // Unigrams
      WordProb[] wp = new WordProb[numTypes];
      for (int wi = 0; wi < numTypes; wi++)
        wp[wi] = new WordProb(wi, (double) unitypeTopicCounts[wi][ti]);
      Arrays.sort(wp);
      int numToPrint = Math.min(wp.length, numWords);
      if (useNewLines) {
        System.out.println("\nTopic " + ti + " unigrams");
        for (int i = 0; i < numToPrint; i++)
          System.out.println(
              uniAlphabet.lookupObject(wp[i].wi).toString() + " " + wp[i].p / tokensPerTopic[ti]);
      } else {
        System.out.print("Topic " + ti + ": ");
        for (int i = 0; i < numToPrint; i++)
          System.out.print(uniAlphabet.lookupObject(wp[i].wi).toString() + " ");
      }

      // Bigrams
      /*
      wp = new WordProb[numBitypes];
      int bisum = 0;
      for (int wi = 0; wi < numBitypes; wi++) {
      	wp[wi] = new WordProb (wi, ((double)bitypeTopicCounts[wi][ti]));
      	bisum += bitypeTopicCounts[wi][ti];
      }
      Arrays.sort (wp);
      numToPrint = Math.min(wp.length, numWords);
      if (useNewLines) {
      	System.out.println ("\nTopic "+ti+" bigrams");
      	for (int i = 0; i < numToPrint; i++)
      		System.out.println (biAlphabet.lookupObject(wp[i].wi).toString() + " " + wp[i].p/bisum);
      } else {
      	System.out.print ("          ");
      	for (int i = 0; i < numToPrint; i++)
      		System.out.print (biAlphabet.lookupObject(wp[i].wi).toString() + " ");
      	System.out.println();
      }
      */

      // Ngrams
      AugmentableFeatureVector afv = new AugmentableFeatureVector(new Alphabet(), 10000, false);
      for (int di = 0; di < topics.length; di++) {
        FeatureSequenceWithBigrams fs = (FeatureSequenceWithBigrams) ilist.get(di).getData();
        for (int si = topics[di].length - 1; si >= 0; si--) {
          if (topics[di][si] == ti && grams[di][si] == 1) {
            String gramString = uniAlphabet.lookupObject(fs.getIndexAtPosition(si)).toString();
            while (grams[di][si] == 1 && --si >= 0)
              gramString =
                  uniAlphabet.lookupObject(fs.getIndexAtPosition(si)).toString() + "_" + gramString;
            afv.add(gramString, 1.0);
          }
        }
      }
      // System.out.println ("pre-sorting");
      int numNgrams = afv.numLocations();
      // System.out.println ("post-sorting "+numNgrams);
      wp = new WordProb[numNgrams];
      int ngramSum = 0;
      for (int loc = 0; loc < numNgrams; loc++) {
        wp[loc] = new WordProb(afv.indexAtLocation(loc), afv.valueAtLocation(loc));
        ngramSum += wp[loc].p;
      }
      Arrays.sort(wp);
      int numUnitypeTokens = 0, numBitypeTokens = 0, numUnitypeTypes = 0, numBitypeTypes = 0;
      for (int fi = 0; fi < numTypes; fi++) {
        numUnitypeTokens += unitypeTopicCounts[fi][ti];
        if (unitypeTopicCounts[fi][ti] != 0) numUnitypeTypes++;
      }
      for (int fi = 0; fi < numBitypes; fi++) {
        numBitypeTokens += bitypeTopicCounts[fi][ti];
        if (bitypeTopicCounts[fi][ti] != 0) numBitypeTypes++;
      }

      if (useNewLines) {
        System.out.println(
            "\nTopic "
                + ti
                + " unigrams "
                + numUnitypeTokens
                + "/"
                + numUnitypeTypes
                + " bigrams "
                + numBitypeTokens
                + "/"
                + numBitypeTypes
                + " phrases "
                + Math.round(afv.oneNorm())
                + "/"
                + numNgrams);
        for (int i = 0; i < Math.min(numNgrams, numWords); i++)
          System.out.println(
              afv.getAlphabet().lookupObject(wp[i].wi).toString() + " " + wp[i].p / ngramSum);
      } else {
        System.out.print(
            " (unigrams "
                + numUnitypeTokens
                + "/"
                + numUnitypeTypes
                + " bigrams "
                + numBitypeTokens
                + "/"
                + numBitypeTypes
                + " phrases "
                + Math.round(afv.oneNorm())
                + "/"
                + numNgrams
                + ")\n         ");
        // System.out.print (" (unique-ngrams="+numNgrams+"
        // ngram-count="+Math.round(afv.oneNorm())+")\n         ");
        for (int i = 0; i < Math.min(numNgrams, numWords); i++)
          System.out.print(afv.getAlphabet().lookupObject(wp[i].wi).toString() + " ");
        System.out.println();
      }
    }
  }
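printTopWords writes its report to System.out; a small sketch for capturing it in a file by temporarily redirecting the stream (standard java.io and System calls; the model variable is illustrative):

  java.io.PrintStream console = System.out;
  try (java.io.PrintStream fileOut = new java.io.PrintStream("topic-report.txt")) {
    System.setOut(fileOut);
    model.printTopWords(10, true);   // ten words per topic, one entry per line
  } catch (java.io.FileNotFoundException e) {
    e.printStackTrace();
  } finally {
    System.setOut(console);          // always restore the original stream
  }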
Example #4
 private void sampleTopicsForOneDoc(
     FeatureSequenceWithBigrams oneDocTokens,
     int[] oneDocTopics,
     int[] oneDocGrams,
     int[] oneDocTopicCounts, // indexed by topic index
     double[] uniTopicWeights, // length==numTopics
     double[] biTopicWeights, // length==numTopics*2: joint topic/gram sampling
     Randoms r) {
   int[] currentTypeTopicCounts;
   int[] currentBitypeTopicCounts;
   int[] previousBitokensPerTopic;
   int type, bitype, oldGram, nextGram, newGram, oldTopic, newTopic;
   double topicWeightsSum, tw;
   // xxx int docLen = oneDocTokens.length;
   int docLen = oneDocTokens.getLength();
   // Iterate over the positions (words) in the document
   for (int si = 0; si < docLen; si++) {
     type = oneDocTokens.getIndexAtPosition(si);
     bitype = oneDocTokens.getBiIndexAtPosition(si);
     // if (bitype == -1) System.out.println ("biblock "+si+" at "+uniAlphabet.lookupObject(type));
     oldTopic = oneDocTopics[si];
     oldGram = oneDocGrams[si];
     nextGram = (si == docLen - 1) ? -1 : oneDocGrams[si + 1];
     // nextGram = (si == docLen-1) ? -1 : (oneDocTokens.getBiIndexAtPosition(si+1) == -1 ? 0 : 1);
     boolean bigramPossible = (bitype != -1);
     assert (!(!bigramPossible && oldGram == 1));
     if (!bigramPossible) {
       // Remove this token from all counts
       oneDocTopicCounts[oldTopic]--;
       tokensPerTopic[oldTopic]--;
       unitypeTopicCounts[type][oldTopic]--;
       if (si != docLen - 1) {
         typeNgramTopicCounts[type][nextGram][oldTopic]--;
         assert (typeNgramTopicCounts[type][nextGram][oldTopic] >= 0);
       }
       assert (oneDocTopicCounts[oldTopic] >= 0);
       assert (tokensPerTopic[oldTopic] >= 0);
       assert (unitypeTopicCounts[type][oldTopic] >= 0);
       // Build a distribution over topics for this token
       Arrays.fill(uniTopicWeights, 0.0);
       topicWeightsSum = 0;
       currentTypeTopicCounts = unitypeTopicCounts[type];
       for (int ti = 0; ti < numTopics; ti++) {
         tw =
             ((currentTypeTopicCounts[ti] + beta) / (tokensPerTopic[ti] + vBeta))
                 * ((oneDocTopicCounts[ti]
                      + alpha)); // additional term is constant across all topics
         topicWeightsSum += tw;
         uniTopicWeights[ti] = tw;
       }
       // Sample a topic assignment from this distribution
       newTopic = r.nextDiscrete(uniTopicWeights, topicWeightsSum);
       // Put that new topic into the counts
       oneDocTopics[si] = newTopic;
       oneDocTopicCounts[newTopic]++;
       unitypeTopicCounts[type][newTopic]++;
       tokensPerTopic[newTopic]++;
       if (si != docLen - 1) typeNgramTopicCounts[type][nextGram][newTopic]++;
     } else {
       // Bigram is possible
       int prevType = oneDocTokens.getIndexAtPosition(si - 1);
       int prevTopic = oneDocTopics[si - 1];
       // Remove this token from all counts
       oneDocTopicCounts[oldTopic]--;
       typeNgramTopicCounts[prevType][oldGram][prevTopic]--;
       if (si != docLen - 1) typeNgramTopicCounts[type][nextGram][oldTopic]--;
       if (oldGram == 0) {
         unitypeTopicCounts[type][oldTopic]--;
         tokensPerTopic[oldTopic]--;
       } else {
         bitypeTopicCounts[bitype][oldTopic]--;
         bitokensPerTopic[prevType][oldTopic]--;
         biTokens--;
       }
       assert (oneDocTopicCounts[oldTopic] >= 0);
       assert (typeNgramTopicCounts[prevType][oldGram][prevTopic] >= 0);
       assert (si == docLen - 1 || typeNgramTopicCounts[type][nextGram][oldTopic] >= 0);
       assert (unitypeTopicCounts[type][oldTopic] >= 0);
       assert (tokensPerTopic[oldTopic] >= 0);
       assert (bitypeTopicCounts[bitype][oldTopic] >= 0);
       assert (bitokensPerTopic[prevType][oldTopic] >= 0);
       assert (biTokens >= 0);
       // Build a joint distribution over topics and ngram-status for this token
       Arrays.fill(biTopicWeights, 0.0);
       topicWeightsSum = 0;
       currentTypeTopicCounts = unitypeTopicCounts[type];
       currentBitypeTopicCounts = bitypeTopicCounts[bitype];
       previousBitokensPerTopic = bitokensPerTopic[prevType];
       for (int ti = 0; ti < numTopics; ti++) {
         newTopic = ti << 1; // just using this variable as an index into [ti*2+gram]
         // The unigram outcome
         tw =
             (currentTypeTopicCounts[ti] + beta)
                 / (tokensPerTopic[ti] + vBeta)
                 * (oneDocTopicCounts[ti] + alpha)
                 * (typeNgramTopicCounts[prevType][0][prevTopic] + delta1);
         topicWeightsSum += tw;
         biTopicWeights[newTopic] = tw;
         // The bigram outcome
         newTopic++;
         tw =
             (currentBitypeTopicCounts[ti] + gamma)
                 / (previousBitokensPerTopic[ti] + vGamma)
                 * (oneDocTopicCounts[ti] + alpha)
                 * (typeNgramTopicCounts[prevType][1][prevTopic] + delta2);
         topicWeightsSum += tw;
         biTopicWeights[newTopic] = tw;
       }
       // Sample a topic assignment from this distribution
       newTopic = r.nextDiscrete(biTopicWeights, topicWeightsSum);
       // Put that new topic into the counts
       newGram = newTopic % 2;
       newTopic /= 2;
       // Put that new topic into the counts
       oneDocTopics[si] = newTopic;
       oneDocGrams[si] = newGram;
       oneDocTopicCounts[newTopic]++;
       typeNgramTopicCounts[prevType][newGram][prevTopic]++;
       if (si != docLen - 1) typeNgramTopicCounts[type][nextGram][newTopic]++;
       if (newGram == 0) {
         unitypeTopicCounts[type][newTopic]++;
         tokensPerTopic[newTopic]++;
       } else {
         bitypeTopicCounts[bitype][newTopic]++;
         bitokensPerTopic[prevType][newTopic]++;
         biTokens++;
       }
     }
   }
 }
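In symbols, the joint weights built in the bigram-possible branch above are proportional to the two expressions below, a direct transcription of the loop (counts taken after the current token has been removed; w_i and z_i are word types and topics, d the document, V = numTypes):

  P(z_i = t, x_i = 0 \mid \cdot) \propto \frac{n^{uni}_{w_i t} + \beta}{n_t + V\beta}\,(n^{doc}_{d t} + \alpha)\,(m_{w_{i-1},0,z_{i-1}} + \delta_1)
  P(z_i = t, x_i = 1 \mid \cdot) \propto \frac{n^{bi}_{(w_{i-1} w_i) t} + \gamma}{p_{w_{i-1} t} + V\gamma}\,(n^{doc}_{d t} + \alpha)\,(m_{w_{i-1},1,z_{i-1}} + \delta_2)

where n^{uni} = unitypeTopicCounts, n_t = tokensPerTopic, n^{doc} = oneDocTopicCounts, n^{bi} = bitypeTopicCounts, p = bitokensPerTopic, m = typeNgramTopicCounts.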