  /**
   * Low-quality topics may spread probability almost evenly over many words, staying
   * close to the uniform distribution.
   *
   * <p>The score is the KL divergence from the uniform distribution over the
   * vocabulary: the sum over words of p(w|t) log(p(w|t) * V), where p(w|t) is
   * count / tokensPerTopic[topic] and V is the vocabulary size.
   */
  public TopicScores getDistanceFromUniform() {
    int[] tokensPerTopic = model.tokensPerTopic;

    TopicScores scores = new TopicScores("uniform_dist", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    int numTypes = alphabet.size();

    for (int topic = 0; topic < numTopics; topic++) {

      double topicScore = 0.0;
      int position = 0;
      TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

      for (IDSorter info : sortedWords) {
        double count = info.getWeight();

        double score =
            (count / tokensPerTopic[topic]) * Math.log((count * numTypes) / tokensPerTopic[topic]);

        if (position < numTopWords) {
          scores.setTopicWordScore(topic, position, score);
        }

        topicScore += score;
        position++;
      }

      scores.setTopicScore(topic, topicScore);
    }

    return scores;
  }

  /**
   * Low-quality topics may have words that are also prominent in other topics.
   *
   * <p>The exclusivity of a word w in topic t is its smoothed probability in t divided
   * by the sum of its smoothed probabilities across all topics; the topic score is the
   * average exclusivity of its top words.
   */
  public TopicScores getExclusivity() {

    int[] tokensPerTopic = model.tokensPerTopic;

    TopicScores scores = new TopicScores("exclusivity", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    // Precompute the smoothing-only portion of each word's total probability mass:
    //  every topic contributes beta / (betaSum + tokensPerTopic[topic]), even for
    //  words it never assigns.
    double sumDefaultProbs = 0.0;
    for (int topic = 0; topic < numTopics; topic++) {
      sumDefaultProbs += model.beta / (model.betaSum + tokensPerTopic[topic]);
    }

    for (int topic = 0; topic < numTopics; topic++) {

      double topicScore = 0.0;
      int position = 0;
      TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

      for (IDSorter info : sortedWords) {
        int type = info.getID();
        double count = info.getWeight();

        double sumTypeProbs = sumDefaultProbs;
        int[] topicCounts = model.typeTopicCounts[type];
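        // Each entry of typeTopicCounts packs a count (high bits) and a topic ID (low
        //  bits) and is sorted by decreasing count, so the first zero entry marks the
        //  end of the topics with non-zero counts for this word type.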

        int index = 0;
        while (index < topicCounts.length && topicCounts[index] > 0) {

          int otherTopic = topicCounts[index] & model.topicMask;
          int otherCount = topicCounts[index] >> model.topicBits;

          // We've already accounted for the smoothing parameter,
          //  now we need to add the actual count for the non-zero
          //  topics.
          sumTypeProbs += ((double) otherCount) / (model.betaSum + tokensPerTopic[otherTopic]);

          index++;
        }

        // Exclusivity: the word's smoothed probability in this topic divided by its
        //  summed smoothed probability across all topics.
        double score =
            ((model.beta + count) / (model.betaSum + tokensPerTopic[topic])) / sumTypeProbs;
        scores.setTopicWordScore(topic, position, score);
        topicScore += score;

        position++;
        if (position == numTopWords) {
          break;
        }
      }

      // Report the mean exclusivity of the top words. Note that this divides by
      //  numTopWords even if the topic has fewer non-zero words, slightly deflating
      //  the score for very sparse topics.
      scores.setTopicScore(topic, topicScore / numTopWords);
    }

    return scores;
  }
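
  /**
   * Collects diagnostics for each topic of a trained model.
   *
   * <p>A minimal usage sketch (assuming a trained ParallelTopicModel and this class's
   * toXML() report method):
   *
   * <pre>{@code
   * TopicModelDiagnostics diagnostics = new TopicModelDiagnostics(model, 20);
   * System.out.println(diagnostics.toXML());
   * }</pre>
   *
   * @param model a trained ParallelTopicModel
   * @param numTopWords the number of top-ranked words to analyze per topic
   */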
  public TopicModelDiagnostics(ParallelTopicModel model, int numTopWords) {
    numTopics = model.getNumTopics();
    this.numTopWords = numTopWords;

    this.model = model;

    alphabet = model.getAlphabet();
    topicSortedWords = model.getSortedWords();

    topicTopWords = new String[numTopics][numTopWords];

    numRank1Documents = new int[numTopics];
    numNonZeroDocuments = new int[numTopics];
    numDocumentsAtProportions = new int[numTopics][DEFAULT_DOC_PROPORTIONS.length];
    sumCountTimesLogCount = new double[numTopics];

    diagnostics = new ArrayList<TopicScores>();

    for (int topic = 0; topic < numTopics; topic++) {

      int position = 0;
      TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

      // How many words should we report? Some topics may have fewer than
      //  the default number of words with non-zero weight.
      int limit = numTopWords;
      if (sortedWords.size() < numTopWords) {
        limit = sortedWords.size();
      }

      Iterator<IDSorter> iterator = sortedWords.iterator();
      for (int i = 0; i < limit; i++) {
        IDSorter info = iterator.next();
        topicTopWords[topic][i] = (String) alphabet.lookupObject(info.getID());
      }
    }

    collectDocumentStatistics();

    diagnostics.add(getTokensPerTopic(model.tokensPerTopic));
    diagnostics.add(getDocumentEntropy(model.tokensPerTopic));
    diagnostics.add(getWordLengthScores());
    diagnostics.add(getCoherence());
    diagnostics.add(getDistanceFromUniform());
    diagnostics.add(getDistanceFromCorpus());
    diagnostics.add(getEffectiveNumberOfWords());
    diagnostics.add(getTokenDocumentDiscrepancies());
    diagnostics.add(getRank1Percent());
    diagnostics.add(getDocumentPercentRatio(FIFTY_PERCENT_INDEX, TWO_PERCENT_INDEX));
    diagnostics.add(getDocumentPercent(5));
    diagnostics.add(getExclusivity());
  }

  /**
   * Low-quality topics may be very similar to the global distribution.
   *
   * <p>The score is the KL divergence from the corpus word distribution: the sum over
   * words of p(w|t) log(p(w|t) / p(w)), where p(w) is the empirical corpus frequency
   * wordTypeCounts[type] / numTokens.
   */
  public TopicScores getDistanceFromCorpus() {

    int[] tokensPerTopic = model.tokensPerTopic;

    TopicScores scores = new TopicScores("corpus_dist", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    for (int topic = 0; topic < numTopics; topic++) {

      // coefficient * count / wordTypeCounts[type] equals p(w|t) / p(w): the ratio of
      //  the word's probability in the topic to its probability in the whole corpus.
      double coefficient = (double) numTokens / tokensPerTopic[topic];

      double topicScore = 0.0;
      int position = 0;
      TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

      for (IDSorter info : sortedWords) {
        int type = info.getID();
        double count = info.getWeight();

        double score =
            (count / tokensPerTopic[topic]) * Math.log(coefficient * count / wordTypeCounts[type]);

        if (position < numTopWords) {
          // System.out.println(alphabet.lookupObject(type) + ": " + count
          //     + " * " + numTokens + " / " + wordTypeCounts[type]
          //     + " * " + tokensPerTopic[topic]
          //     + " = " + (coefficient * count / wordTypeCounts[type]));
          scores.setTopicWordScore(topic, position, score);
        }

        topicScore += score;

        position++;
      }

      scores.setTopicScore(topic, topicScore);
    }

    return scores;
  }
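
  /**
   * The effective number of words is the inverse Simpson index of the topic's word
   * distribution, 1 / (sum over words of p(w|t) squared): it equals the vocabulary
   * size for a uniform topic and approaches 1 as the topic concentrates on one word.
   */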
  public TopicScores getEffectiveNumberOfWords() {
    int[] tokensPerTopic = model.tokensPerTopic;

    TopicScores scores = new TopicScores("eff_num_words", numTopics, numTopWords);

    for (int topic = 0; topic < numTopics; topic++) {

      double sumSquaredProbabilities = 0.0;
      TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

      for (IDSorter info : sortedWords) {
        double probability = info.getWeight() / tokensPerTopic[topic];

        sumSquaredProbabilities += probability * probability;
      }

      scores.setTopicScore(topic, 1.0 / sumSquaredProbabilities);
    }

    return scores;
  }