public TopicScores getTokenDocumentDiscrepancies() {
    TopicScores scores = new TopicScores("token-doc-diff", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    for (int topic = 0; topic < numTopics; topic++) {
      int[][] matrix = topicCodocumentMatrices[topic];
      TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

      double topicScore = 0.0;

      double[] wordDistribution = new double[numTopWords];
      double[] docDistribution = new double[numTopWords];

      double wordSum = 0.0;
      double docSum = 0.0;

      int position = 0;
      Iterator<IDSorter> iterator = sortedWords.iterator();
      while (iterator.hasNext() && position < numTopWords) {
        IDSorter info = iterator.next();

        wordDistribution[position] = info.getWeight();
        docDistribution[position] = matrix[position][position];

        wordSum += wordDistribution[position];
        docSum += docDistribution[position];

        position++;
      }

      for (position = 0; position < numTopWords; position++) {
        double p = wordDistribution[position] / wordSum;
        double q = docDistribution[position] / docSum;
        double meanProb = 0.5 * (p + q);

        double score = 0.0;
        if (p > 0) {
          score += 0.5 * p * Math.log(p / meanProb);
        }
        if (q > 0) {
          score += 0.5 * q * Math.log(q / meanProb);
        }

        scores.setTopicWordScore(topic, position, score);
        topicScore += score;
      }

      scores.setTopicScore(topic, topicScore);
    }

    return scores;
  }
  public TopicScores getCoherence() {
    TopicScores scores = new TopicScores("coherence", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    for (int topic = 0; topic < numTopics; topic++) {
      int[][] matrix = topicCodocumentMatrices[topic];

      double topicScore = 0.0;

      for (int row = 0; row < numTopWords; row++) {
        double rowScore = 0.0;
        double minScore = 0.0;
        for (int col = 0; col < row; col++) {
          double score =
              Math.log((matrix[row][col] + model.beta) / (matrix[col][col] + model.beta));
          rowScore += score;
          if (score < minScore) {
            minScore = score;
          }
        }
        topicScore += rowScore;
        scores.setTopicWordScore(topic, row, minScore);
      }

      scores.setTopicScore(topic, topicScore);
    }

    return scores;
  }
  public TopicScores getDistanceFromUniform() {
    int[] tokensPerTopic = model.tokensPerTopic;

    TopicScores scores = new TopicScores("uniform_dist", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    int numTypes = alphabet.size();

    for (int topic = 0; topic < numTopics; topic++) {

      double topicScore = 0.0;
      int position = 0;
      TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

      for (IDSorter info : sortedWords) {
        int type = info.getID();
        double count = info.getWeight();

        double score =
            (count / tokensPerTopic[topic]) * Math.log((count * numTypes) / tokensPerTopic[topic]);

        if (position < numTopWords) {
          scores.setTopicWordScore(topic, position, score);
        }

        topicScore += score;
        position++;
      }

      scores.setTopicScore(topic, topicScore);
    }

    return scores;
  }
  // frequency & exclusivity weight
  double calcFEW(int[] currentTypeTopicCounts, int currentTotalTypeCount, int maxTypeCount) {

    int index = 0;
    // int currentTopic, currentValue;
    double skewIndex = 0;
    // distinctivines, exclusivity calculation
    while (index < currentTypeTopicCounts.length && currentTypeTopicCounts[index] > 0) {
      // currentTopic = currentTypeTopicCounts[index] & topicMask;
      // currentValue = currentTypeTopicCounts[index] >> topicBits;
      skewIndex += Math.pow(currentTypeTopicCounts[index] >> topicBits, 2);
    }
    skewIndex = skewIndex / Math.pow(currentTotalTypeCount, 2);
    // frequency consideration
    skewIndex = currentTotalTypeCount / maxTypeCount * skewIndex;

    return skewIndex;
  }
  /** Low-quality topics often have lots of unusually short words. */
  public TopicScores getWordLengthStandardDeviation() {

    TopicScores scores = new TopicScores("word-length-sd", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    // Get the mean length

    double meanLength = 0.0;
    int totalWords = 0;

    for (int topic = 0; topic < numTopics; topic++) {
      for (int position = 0; position < topicTopWords[topic].length; position++) {
        // Some topics may not have all N words
        if (topicTopWords[topic][position] == null) {
          break;
        }
        meanLength += topicTopWords[topic][position].length();
        totalWords++;
      }
    }

    meanLength /= totalWords;

    // Now calculate the standard deviation

    double lengthVariance = 0.0;

    for (int topic = 0; topic < numTopics; topic++) {
      for (int position = 0; position < topicTopWords[topic].length; position++) {
        if (topicTopWords[topic][position] == null) {
          break;
        }

        int length = topicTopWords[topic][position].length();

        lengthVariance += (length - meanLength) * (length - meanLength);
      }
    }
    lengthVariance /= (totalWords - 1);

    // Finally produce an overall topic score

    double lengthSD = Math.sqrt(lengthVariance);
    for (int topic = 0; topic < numTopics; topic++) {
      for (int position = 0; position < topicTopWords[topic].length; position++) {
        if (topicTopWords[topic][position] == null) {
          break;
        }

        int length = topicTopWords[topic][position].length();

        scores.addToTopicScore(topic, (length - meanLength) / lengthSD);
        scores.setTopicWordScore(topic, position, (length - meanLength) / lengthSD);
      }
    }

    return scores;
  }
  public TopicScores getDocumentEntropy(int[] tokensPerTopic) {
    TopicScores scores = new TopicScores("document_entropy", numTopics, numTopWords);

    for (int topic = 0; topic < numTopics; topic++) {
      scores.setTopicScore(
          topic,
          -sumCountTimesLogCount[topic] / tokensPerTopic[topic] + Math.log(tokensPerTopic[topic]));
    }

    return scores;
  }
示例#7
0
 public double labelLogLikelihood(InstanceList ilist) {
   double logLikelihood = 0;
   for (int ii = 0; ii < ilist.size(); ii++) {
     double instanceWeight = ilist.getInstanceWeight(ii);
     Instance inst = ilist.get(ii);
     Labeling labeling = inst.getLabeling();
     if (labeling == null) continue;
     Labeling predicted = this.classify(inst).getLabeling();
     // System.err.println ("label = \n"+labeling);
     // System.err.println ("predicted = \n"+predicted);
     if (labeling.numLocations() == 1) {
       logLikelihood += instanceWeight * Math.log(predicted.value(labeling.getBestIndex()));
     } else {
       for (int lpos = 0; lpos < labeling.numLocations(); lpos++) {
         int li = labeling.indexAtLocation(lpos);
         double labelWeight = labeling.valueAtLocation(lpos);
         // System.err.print (", "+labelWeight);
         if (labelWeight == 0) continue;
         logLikelihood += instanceWeight * labelWeight * Math.log(predicted.value(li));
       }
     }
   }
   return logLikelihood;
 }
示例#8
0
 public void printWords(int numToPrint) {
   Alphabet alphabet = instancePipe.getDataAlphabet();
   int numFeatures = alphabet.size();
   int numLabels = instancePipe.getTargetAlphabet().size();
   double[] probs = new double[numFeatures];
   numToPrint = Math.min(numToPrint, numFeatures);
   for (int li = 0; li < numLabels; li++) {
     Arrays.fill(probs, 0.0);
     p[li].addProbabilities(probs);
     RankedFeatureVector rfv = new RankedFeatureVector(alphabet, probs);
     System.out.println(
         "\nFeature probabilities " + instancePipe.getTargetAlphabet().lookupObject(li));
     for (int i = 0; i < numToPrint; i++)
       System.out.println(rfv.getObjectAtRank(i) + " " + rfv.getValueAtRank(i));
   }
 }
示例#9
0
  /* Perform several rounds of Gibbs sampling on the documents in the given range. */
  public void estimate(
      int docIndexStart,
      int docIndexLength,
      int numIterations,
      int showTopicsInterval,
      int outputModelInterval,
      String outputModelFilename,
      Randoms r) {
    long startTime = System.currentTimeMillis();
    for (int iterations = 0; iterations < numIterations; iterations++) {
      if (iterations % 10 == 0) System.out.print(iterations);
      else System.out.print(".");
      System.out.flush();
      if (showTopicsInterval != 0 && iterations % showTopicsInterval == 0 && iterations > 0) {
        System.out.println();
        printTopWords(5, false);
      }
      if (outputModelInterval != 0 && iterations % outputModelInterval == 0 && iterations > 0) {
        this.write(new File(outputModelFilename + '.' + iterations));
      }
      sampleTopicsForDocs(docIndexStart, docIndexLength, r);
    }

    long seconds = Math.round((System.currentTimeMillis() - startTime) / 1000.0);
    long minutes = seconds / 60;
    seconds %= 60;
    long hours = minutes / 60;
    minutes %= 60;
    long days = hours / 24;
    hours %= 24;
    System.out.print("\nTotal time: ");
    if (days != 0) {
      System.out.print(days);
      System.out.print(" days ");
    }
    if (hours != 0) {
      System.out.print(hours);
      System.out.print(" hours ");
    }
    if (minutes != 0) {
      System.out.print(minutes);
      System.out.print(" minutes ");
    }
    System.out.print(seconds);
    System.out.println(" seconds");
  }
示例#10
0
  /**
   * Classify an instance using NaiveBayes according to the trained data. The alphabet of the
   * featureVector of the instance must match the alphabe of the pipe used to train the classifier.
   *
   * @param instance to be classified. Data field must be a FeatureVector
   * @return Classification containing the labeling of the instance
   */
  public Classification classify(Instance instance) {
    // Note that the current size of the label alphabet can be larger
    // than it was at the time of training.  We are careful here
    // to correctly handle those labels here. For example,
    // we assume the log prior probability of those classes is
    // minus infinity.
    int numClasses = getLabelAlphabet().size();
    double[] scores = new double[numClasses];
    FeatureVector fv = (FeatureVector) instance.getData();
    // Make sure the feature vector's feature dictionary matches
    // what we are expecting from our data pipe (and thus our notion
    // of feature probabilities.
    assert (instancePipe == null || fv.getAlphabet() == instancePipe.getDataAlphabet());
    int fvisize = fv.numLocations();

    prior.addLogProbabilities(scores);

    // Set the scores according to the feature weights and per-class probabilities
    for (int fvi = 0; fvi < fvisize; fvi++) {
      int fi = fv.indexAtLocation(fvi);
      for (int ci = 0; ci < numClasses; ci++) {
        // guard against dataAlphabet or target alphabet growing; can happen if classifying
        // a never before seen feature.  Ignore these.
        if (ci >= p.length || fi >= p[ci].size()) continue;

        scores[ci] += fv.valueAtLocation(fvi) * p[ci].logProbability(fi);
      }
    }

    // Get the scores in the range near zero, where exp() is more accurate
    double maxScore = Double.NEGATIVE_INFINITY;
    for (int ci = 0; ci < numClasses; ci++) if (scores[ci] > maxScore) maxScore = scores[ci];
    for (int ci = 0; ci < numClasses; ci++) scores[ci] -= maxScore;

    // Exponentiate and normalize
    double sum = 0;
    for (int ci = 0; ci < numClasses; ci++) sum += (scores[ci] = Math.exp(scores[ci]));
    for (int ci = 0; ci < numClasses; ci++) scores[ci] /= sum;

    // Create and return a Classification object
    return new Classification(instance, this, new LabelVector(getLabelAlphabet(), scores));
  }
  /** Low-quality topics may be very similar to the global distribution. */
  public TopicScores getDistanceFromCorpus() {

    int[] tokensPerTopic = model.tokensPerTopic;

    TopicScores scores = new TopicScores("corpus_dist", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    for (int topic = 0; topic < numTopics; topic++) {

      double coefficient = (double) numTokens / tokensPerTopic[topic];

      double topicScore = 0.0;
      int position = 0;
      TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

      for (IDSorter info : sortedWords) {
        int type = info.getID();
        double count = info.getWeight();

        double score =
            (count / tokensPerTopic[topic]) * Math.log(coefficient * count / wordTypeCounts[type]);

        if (position < numTopWords) {
          // System.out.println(alphabet.lookupObject(type) + ": " + count + " * " + numTokens + " /
          // " + wordTypeCounts[type] + " * " + tokensPerTopic[topic] + " = " + (coefficient * count
          // / wordTypeCounts[type]));
          scores.setTopicWordScore(topic, position, score);
        }

        topicScore += score;

        position++;
      }

      scores.setTopicScore(topic, topicScore);
    }

    return scores;
  }
  public void collectDocumentStatistics() {

    topicCodocumentMatrices = new int[numTopics][numTopWords][numTopWords];
    wordTypeCounts = new int[alphabet.size()];
    numTokens = 0;

    // This is an array of hash sets containing the words-of-interest for each topic,
    //  used for checking if the word at some position is one of those words.
    IntHashSet[] topicTopWordIndices = new IntHashSet[numTopics];

    // The same as the topic top words, but with int indices instead of strings,
    //  used for iterating over positions.
    int[][] topicWordIndicesInOrder = new int[numTopics][numTopWords];

    // This is an array of hash sets that will hold the words-of-interest present in a document,
    //  which will be cleared after every document.
    IntHashSet[] docTopicWordIndices = new IntHashSet[numTopics];

    int numDocs = model.getData().size();

    // The count of each topic, again cleared after every document.
    int[] topicCounts = new int[numTopics];

    for (int topic = 0; topic < numTopics; topic++) {
      IntHashSet wordIndices = new IntHashSet();

      for (int i = 0; i < numTopWords; i++) {
        if (topicTopWords[topic][i] != null) {
          int type = alphabet.lookupIndex(topicTopWords[topic][i]);
          topicWordIndicesInOrder[topic][i] = type;
          wordIndices.add(type);
        }
      }

      topicTopWordIndices[topic] = wordIndices;
      docTopicWordIndices[topic] = new IntHashSet();
    }

    int doc = 0;

    for (TopicAssignment document : model.getData()) {

      FeatureSequence tokens = (FeatureSequence) document.instance.getData();
      FeatureSequence topics = (FeatureSequence) document.topicSequence;

      for (int position = 0; position < tokens.size(); position++) {
        int type = tokens.getIndexAtPosition(position);
        int topic = topics.getIndexAtPosition(position);

        numTokens++;
        wordTypeCounts[type]++;

        topicCounts[topic]++;

        if (topicTopWordIndices[topic].contains(type)) {
          docTopicWordIndices[topic].add(type);
        }
      }

      int docLength = tokens.size();

      if (docLength > 0) {
        int maxTopic = -1;
        int maxCount = -1;

        for (int topic = 0; topic < numTopics; topic++) {

          if (topicCounts[topic] > 0) {
            numNonZeroDocuments[topic]++;

            if (topicCounts[topic] > maxCount) {
              maxTopic = topic;
              maxCount = topicCounts[topic];
            }

            sumCountTimesLogCount[topic] += topicCounts[topic] * Math.log(topicCounts[topic]);

            double proportion =
                (model.alpha[topic] + topicCounts[topic]) / (model.alphaSum + docLength);
            for (int i = 0; i < DEFAULT_DOC_PROPORTIONS.length; i++) {
              if (proportion < DEFAULT_DOC_PROPORTIONS[i]) {
                break;
              }
              numDocumentsAtProportions[topic][i]++;
            }

            IntHashSet supportedWords = docTopicWordIndices[topic];
            int[] indices = topicWordIndicesInOrder[topic];

            for (int i = 0; i < numTopWords; i++) {
              if (supportedWords.contains(indices[i])) {
                for (int j = i; j < numTopWords; j++) {
                  if (i == j) {
                    // Diagonals are total number of documents with word W in topic T
                    topicCodocumentMatrices[topic][i][i]++;
                  } else if (supportedWords.contains(indices[j])) {
                    topicCodocumentMatrices[topic][i][j]++;
                    topicCodocumentMatrices[topic][j][i]++;
                  }
                }
              }
            }

            docTopicWordIndices[topic].clear();
            topicCounts[topic] = 0;
          }
        }

        if (maxTopic > -1) {
          numRank1Documents[maxTopic]++;
        }
      }

      doc++;
    }
  }
示例#13
0
  public void printTopWords(int numWords, boolean useNewLines) {
    class WordProb implements Comparable {
      int wi;
      double p;

      public WordProb(int wi, double p) {
        this.wi = wi;
        this.p = p;
      }

      public final int compareTo(Object o2) {
        if (p > ((WordProb) o2).p) return -1;
        else if (p == ((WordProb) o2).p) return 0;
        else return 1;
      }
    }

    for (int ti = 0; ti < numTopics; ti++) {
      // Unigrams
      WordProb[] wp = new WordProb[numTypes];
      for (int wi = 0; wi < numTypes; wi++)
        wp[wi] = new WordProb(wi, (double) unitypeTopicCounts[wi][ti]);
      Arrays.sort(wp);
      int numToPrint = Math.min(wp.length, numWords);
      if (useNewLines) {
        System.out.println("\nTopic " + ti + " unigrams");
        for (int i = 0; i < numToPrint; i++)
          System.out.println(
              uniAlphabet.lookupObject(wp[i].wi).toString() + " " + wp[i].p / tokensPerTopic[ti]);
      } else {
        System.out.print("Topic " + ti + ": ");
        for (int i = 0; i < numToPrint; i++)
          System.out.print(uniAlphabet.lookupObject(wp[i].wi).toString() + " ");
      }

      // Bigrams
      /*
      wp = new WordProb[numBitypes];
      int bisum = 0;
      for (int wi = 0; wi < numBitypes; wi++) {
      	wp[wi] = new WordProb (wi, ((double)bitypeTopicCounts[wi][ti]));
      	bisum += bitypeTopicCounts[wi][ti];
      }
      Arrays.sort (wp);
      numToPrint = Math.min(wp.length, numWords);
      if (useNewLines) {
      	System.out.println ("\nTopic "+ti+" bigrams");
      	for (int i = 0; i < numToPrint; i++)
      		System.out.println (biAlphabet.lookupObject(wp[i].wi).toString() + " " + wp[i].p/bisum);
      } else {
      	System.out.print ("          ");
      	for (int i = 0; i < numToPrint; i++)
      		System.out.print (biAlphabet.lookupObject(wp[i].wi).toString() + " ");
      	System.out.println();
      }
      */

      // Ngrams
      AugmentableFeatureVector afv = new AugmentableFeatureVector(new Alphabet(), 10000, false);
      for (int di = 0; di < topics.length; di++) {
        FeatureSequenceWithBigrams fs = (FeatureSequenceWithBigrams) ilist.get(di).getData();
        for (int si = topics[di].length - 1; si >= 0; si--) {
          if (topics[di][si] == ti && grams[di][si] == 1) {
            String gramString = uniAlphabet.lookupObject(fs.getIndexAtPosition(si)).toString();
            while (grams[di][si] == 1 && --si >= 0)
              gramString =
                  uniAlphabet.lookupObject(fs.getIndexAtPosition(si)).toString() + "_" + gramString;
            afv.add(gramString, 1.0);
          }
        }
      }
      // System.out.println ("pre-sorting");
      int numNgrams = afv.numLocations();
      // System.out.println ("post-sorting "+numNgrams);
      wp = new WordProb[numNgrams];
      int ngramSum = 0;
      for (int loc = 0; loc < numNgrams; loc++) {
        wp[loc] = new WordProb(afv.indexAtLocation(loc), afv.valueAtLocation(loc));
        ngramSum += wp[loc].p;
      }
      Arrays.sort(wp);
      int numUnitypeTokens = 0, numBitypeTokens = 0, numUnitypeTypes = 0, numBitypeTypes = 0;
      for (int fi = 0; fi < numTypes; fi++) {
        numUnitypeTokens += unitypeTopicCounts[fi][ti];
        if (unitypeTopicCounts[fi][ti] != 0) numUnitypeTypes++;
      }
      for (int fi = 0; fi < numBitypes; fi++) {
        numBitypeTokens += bitypeTopicCounts[fi][ti];
        if (bitypeTopicCounts[fi][ti] != 0) numBitypeTypes++;
      }

      if (useNewLines) {
        System.out.println(
            "\nTopic "
                + ti
                + " unigrams "
                + numUnitypeTokens
                + "/"
                + numUnitypeTypes
                + " bigrams "
                + numBitypeTokens
                + "/"
                + numBitypeTypes
                + " phrases "
                + Math.round(afv.oneNorm())
                + "/"
                + numNgrams);
        for (int i = 0; i < Math.min(numNgrams, numWords); i++)
          System.out.println(
              afv.getAlphabet().lookupObject(wp[i].wi).toString() + " " + wp[i].p / ngramSum);
      } else {
        System.out.print(
            " (unigrams "
                + numUnitypeTokens
                + "/"
                + numUnitypeTypes
                + " bigrams "
                + numBitypeTokens
                + "/"
                + numBitypeTypes
                + " phrases "
                + Math.round(afv.oneNorm())
                + "/"
                + numNgrams
                + ")\n         ");
        // System.out.print (" (unique-ngrams="+numNgrams+"
        // ngram-count="+Math.round(afv.oneNorm())+")\n         ");
        for (int i = 0; i < Math.min(numNgrams, numWords); i++)
          System.out.print(afv.getAlphabet().lookupObject(wp[i].wi).toString() + " ");
        System.out.println();
      }
    }
  }