public TopicScores getTokenDocumentDiscrepancies() {
    TopicScores scores = new TopicScores("token-doc-diff", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    for (int topic = 0; topic < numTopics; topic++) {
      int[][] matrix = topicCodocumentMatrices[topic];
      TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

      double topicScore = 0.0;

      double[] wordDistribution = new double[numTopWords];
      double[] docDistribution = new double[numTopWords];

      double wordSum = 0.0;
      double docSum = 0.0;

      int position = 0;
      Iterator<IDSorter> iterator = sortedWords.iterator();
      while (iterator.hasNext() && position < numTopWords) {
        IDSorter info = iterator.next();

        wordDistribution[position] = info.getWeight();
        docDistribution[position] = matrix[position][position];

        wordSum += wordDistribution[position];
        docSum += docDistribution[position];

        position++;
      }

      for (position = 0; position < numTopWords; position++) {
        double p = wordDistribution[position] / wordSum;
        double q = docDistribution[position] / docSum;
        double meanProb = 0.5 * (p + q);

        double score = 0.0;
        if (p > 0) {
          score += 0.5 * p * Math.log(p / meanProb);
        }
        if (q > 0) {
          score += 0.5 * q * Math.log(q / meanProb);
        }

        scores.setTopicWordScore(topic, position, score);
        topicScore += score;
      }

      scores.setTopicScore(topic, topicScore);
    }

    return scores;
  }
  public TopicScores getCoherence() {
    TopicScores scores = new TopicScores("coherence", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    for (int topic = 0; topic < numTopics; topic++) {
      int[][] matrix = topicCodocumentMatrices[topic];

      double topicScore = 0.0;

      for (int row = 0; row < numTopWords; row++) {
        double rowScore = 0.0;
        double minScore = 0.0;
        for (int col = 0; col < row; col++) {
          double score =
              Math.log((matrix[row][col] + model.beta) / (matrix[col][col] + model.beta));
          rowScore += score;
          if (score < minScore) {
            minScore = score;
          }
        }
        topicScore += rowScore;
        scores.setTopicWordScore(topic, row, minScore);
      }

      scores.setTopicScore(topic, topicScore);
    }

    return scores;
  }
  public TopicScores getDistanceFromUniform() {
    int[] tokensPerTopic = model.tokensPerTopic;

    TopicScores scores = new TopicScores("uniform_dist", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    int numTypes = alphabet.size();

    for (int topic = 0; topic < numTopics; topic++) {

      double topicScore = 0.0;
      int position = 0;
      TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

      for (IDSorter info : sortedWords) {
        int type = info.getID();
        double count = info.getWeight();

        double score =
            (count / tokensPerTopic[topic]) * Math.log((count * numTypes) / tokensPerTopic[topic]);

        if (position < numTopWords) {
          scores.setTopicWordScore(topic, position, score);
        }

        topicScore += score;
        position++;
      }

      scores.setTopicScore(topic, topicScore);
    }

    return scores;
  }
  public TopicScores getDocumentEntropy(int[] tokensPerTopic) {
    TopicScores scores = new TopicScores("document_entropy", numTopics, numTopWords);

    for (int topic = 0; topic < numTopics; topic++) {
      scores.setTopicScore(
          topic,
          -sumCountTimesLogCount[topic] / tokensPerTopic[topic] + Math.log(tokensPerTopic[topic]));
    }

    return scores;
  }
示例#5
0
 public double labelLogLikelihood(InstanceList ilist) {
   double logLikelihood = 0;
   for (int ii = 0; ii < ilist.size(); ii++) {
     double instanceWeight = ilist.getInstanceWeight(ii);
     Instance inst = ilist.get(ii);
     Labeling labeling = inst.getLabeling();
     if (labeling == null) continue;
     Labeling predicted = this.classify(inst).getLabeling();
     // System.err.println ("label = \n"+labeling);
     // System.err.println ("predicted = \n"+predicted);
     if (labeling.numLocations() == 1) {
       logLikelihood += instanceWeight * Math.log(predicted.value(labeling.getBestIndex()));
     } else {
       for (int lpos = 0; lpos < labeling.numLocations(); lpos++) {
         int li = labeling.indexAtLocation(lpos);
         double labelWeight = labeling.valueAtLocation(lpos);
         // System.err.print (", "+labelWeight);
         if (labelWeight == 0) continue;
         logLikelihood += instanceWeight * labelWeight * Math.log(predicted.value(li));
       }
     }
   }
   return logLikelihood;
 }
  /** Low-quality topics may be very similar to the global distribution. */
  public TopicScores getDistanceFromCorpus() {

    int[] tokensPerTopic = model.tokensPerTopic;

    TopicScores scores = new TopicScores("corpus_dist", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    for (int topic = 0; topic < numTopics; topic++) {

      double coefficient = (double) numTokens / tokensPerTopic[topic];

      double topicScore = 0.0;
      int position = 0;
      TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

      for (IDSorter info : sortedWords) {
        int type = info.getID();
        double count = info.getWeight();

        double score =
            (count / tokensPerTopic[topic]) * Math.log(coefficient * count / wordTypeCounts[type]);

        if (position < numTopWords) {
          // System.out.println(alphabet.lookupObject(type) + ": " + count + " * " + numTokens + " /
          // " + wordTypeCounts[type] + " * " + tokensPerTopic[topic] + " = " + (coefficient * count
          // / wordTypeCounts[type]));
          scores.setTopicWordScore(topic, position, score);
        }

        topicScore += score;

        position++;
      }

      scores.setTopicScore(topic, topicScore);
    }

    return scores;
  }
  public void collectDocumentStatistics() {

    topicCodocumentMatrices = new int[numTopics][numTopWords][numTopWords];
    wordTypeCounts = new int[alphabet.size()];
    numTokens = 0;

    // This is an array of hash sets containing the words-of-interest for each topic,
    //  used for checking if the word at some position is one of those words.
    IntHashSet[] topicTopWordIndices = new IntHashSet[numTopics];

    // The same as the topic top words, but with int indices instead of strings,
    //  used for iterating over positions.
    int[][] topicWordIndicesInOrder = new int[numTopics][numTopWords];

    // This is an array of hash sets that will hold the words-of-interest present in a document,
    //  which will be cleared after every document.
    IntHashSet[] docTopicWordIndices = new IntHashSet[numTopics];

    int numDocs = model.getData().size();

    // The count of each topic, again cleared after every document.
    int[] topicCounts = new int[numTopics];

    for (int topic = 0; topic < numTopics; topic++) {
      IntHashSet wordIndices = new IntHashSet();

      for (int i = 0; i < numTopWords; i++) {
        if (topicTopWords[topic][i] != null) {
          int type = alphabet.lookupIndex(topicTopWords[topic][i]);
          topicWordIndicesInOrder[topic][i] = type;
          wordIndices.add(type);
        }
      }

      topicTopWordIndices[topic] = wordIndices;
      docTopicWordIndices[topic] = new IntHashSet();
    }

    int doc = 0;

    for (TopicAssignment document : model.getData()) {

      FeatureSequence tokens = (FeatureSequence) document.instance.getData();
      FeatureSequence topics = (FeatureSequence) document.topicSequence;

      for (int position = 0; position < tokens.size(); position++) {
        int type = tokens.getIndexAtPosition(position);
        int topic = topics.getIndexAtPosition(position);

        numTokens++;
        wordTypeCounts[type]++;

        topicCounts[topic]++;

        if (topicTopWordIndices[topic].contains(type)) {
          docTopicWordIndices[topic].add(type);
        }
      }

      int docLength = tokens.size();

      if (docLength > 0) {
        int maxTopic = -1;
        int maxCount = -1;

        for (int topic = 0; topic < numTopics; topic++) {

          if (topicCounts[topic] > 0) {
            numNonZeroDocuments[topic]++;

            if (topicCounts[topic] > maxCount) {
              maxTopic = topic;
              maxCount = topicCounts[topic];
            }

            sumCountTimesLogCount[topic] += topicCounts[topic] * Math.log(topicCounts[topic]);

            double proportion =
                (model.alpha[topic] + topicCounts[topic]) / (model.alphaSum + docLength);
            for (int i = 0; i < DEFAULT_DOC_PROPORTIONS.length; i++) {
              if (proportion < DEFAULT_DOC_PROPORTIONS[i]) {
                break;
              }
              numDocumentsAtProportions[topic][i]++;
            }

            IntHashSet supportedWords = docTopicWordIndices[topic];
            int[] indices = topicWordIndicesInOrder[topic];

            for (int i = 0; i < numTopWords; i++) {
              if (supportedWords.contains(indices[i])) {
                for (int j = i; j < numTopWords; j++) {
                  if (i == j) {
                    // Diagonals are total number of documents with word W in topic T
                    topicCodocumentMatrices[topic][i][i]++;
                  } else if (supportedWords.contains(indices[j])) {
                    topicCodocumentMatrices[topic][i][j]++;
                    topicCodocumentMatrices[topic][j][i]++;
                  }
                }
              }
            }

            docTopicWordIndices[topic].clear();
            topicCounts[topic] = 0;
          }
        }

        if (maxTopic > -1) {
          numRank1Documents[maxTopic]++;
        }
      }

      doc++;
    }
  }