Java Alphabet примеры использования

Язык программирования: Java

Пространство имен/Пакет: cc.mallet.util

Класс/Тип: Alphabet

Примеров на hotexamples.com: 4

Java Alphabet - 4 примера найдено. Это лучшие примеры Java кода для cc.mallet.util.Alphabet, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

size(3)

lookupIndex(1)

lookupObject(1)

Пример #1

Показать файл

Файл: TopicModelDiagnostics.java Проект: luolanfeixue/Mallet

  public TopicScores getDistanceFromUniform() {
    int[] tokensPerTopic = model.tokensPerTopic;

    TopicScores scores = new TopicScores("uniform_dist", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    int numTypes = alphabet.size();

    for (int topic = 0; topic < numTopics; topic++) {

      double topicScore = 0.0;
      int position = 0;
      TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

      for (IDSorter info : sortedWords) {
        int type = info.getID();
        double count = info.getWeight();

        double score =
            (count / tokensPerTopic[topic]) * Math.log((count * numTypes) / tokensPerTopic[topic]);

        if (position < numTopWords) {
          scores.setTopicWordScore(topic, position, score);
        }

        topicScore += score;
        position++;
      }

      scores.setTopicScore(topic, topicScore);
    }

    return scores;
  }

Пример #2

Показать файл

Файл: TopicModelDiagnostics.java Проект: luolanfeixue/Mallet

  public TopicModelDiagnostics(ParallelTopicModel model, int numTopWords) {
    numTopics = model.getNumTopics();
    this.numTopWords = numTopWords;

    this.model = model;

    alphabet = model.getAlphabet();
    topicSortedWords = model.getSortedWords();

    topicTopWords = new String[numTopics][numTopWords];

    numRank1Documents = new int[numTopics];
    numNonZeroDocuments = new int[numTopics];
    numDocumentsAtProportions = new int[numTopics][DEFAULT_DOC_PROPORTIONS.length];
    sumCountTimesLogCount = new double[numTopics];

    diagnostics = new ArrayList<TopicScores>();

    for (int topic = 0; topic < numTopics; topic++) {

      int position = 0;
      TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

      // How many words should we report? Some topics may have fewer than
      //  the default number of words with non-zero weight.
      int limit = numTopWords;
      if (sortedWords.size() < numTopWords) {
        limit = sortedWords.size();
      }

      Iterator<IDSorter> iterator = sortedWords.iterator();
      for (int i = 0; i < limit; i++) {
        IDSorter info = iterator.next();
        topicTopWords[topic][i] = (String) alphabet.lookupObject(info.getID());
      }
    }

    collectDocumentStatistics();

    diagnostics.add(getTokensPerTopic(model.tokensPerTopic));
    diagnostics.add(getDocumentEntropy(model.tokensPerTopic));
    diagnostics.add(getWordLengthScores());
    diagnostics.add(getCoherence());
    diagnostics.add(getDistanceFromUniform());
    diagnostics.add(getDistanceFromCorpus());
    diagnostics.add(getEffectiveNumberOfWords());
    diagnostics.add(getTokenDocumentDiscrepancies());
    diagnostics.add(getRank1Percent());
    diagnostics.add(getDocumentPercentRatio(FIFTY_PERCENT_INDEX, TWO_PERCENT_INDEX));
    diagnostics.add(getDocumentPercent(5));
    diagnostics.add(getExclusivity());
  }

Пример #3

Показать файл

Файл: TopicModelDiagnostics.java Проект: luolanfeixue/Mallet

  public TopicScores getEffectiveNumberOfWords() {
    int[] tokensPerTopic = model.tokensPerTopic;

    TopicScores scores = new TopicScores("eff_num_words", numTopics, numTopWords);

    int numTypes = alphabet.size();

    for (int topic = 0; topic < numTopics; topic++) {

      double sumSquaredProbabilities = 0.0;
      TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

      for (IDSorter info : sortedWords) {
        int type = info.getID();
        double probability = info.getWeight() / tokensPerTopic[topic];

        sumSquaredProbabilities += probability * probability;
      }

      scores.setTopicScore(topic, 1.0 / sumSquaredProbabilities);
    }

    return scores;
  }

Пример #4

Показать файл

Файл: TopicModelDiagnostics.java Проект: luolanfeixue/Mallet

  public void collectDocumentStatistics() {

    topicCodocumentMatrices = new int[numTopics][numTopWords][numTopWords];
    wordTypeCounts = new int[alphabet.size()];
    numTokens = 0;

    // This is an array of hash sets containing the words-of-interest for each topic,
    //  used for checking if the word at some position is one of those words.
    IntHashSet[] topicTopWordIndices = new IntHashSet[numTopics];

    // The same as the topic top words, but with int indices instead of strings,
    //  used for iterating over positions.
    int[][] topicWordIndicesInOrder = new int[numTopics][numTopWords];

    // This is an array of hash sets that will hold the words-of-interest present in a document,
    //  which will be cleared after every document.
    IntHashSet[] docTopicWordIndices = new IntHashSet[numTopics];

    int numDocs = model.getData().size();

    // The count of each topic, again cleared after every document.
    int[] topicCounts = new int[numTopics];

    for (int topic = 0; topic < numTopics; topic++) {
      IntHashSet wordIndices = new IntHashSet();

      for (int i = 0; i < numTopWords; i++) {
        if (topicTopWords[topic][i] != null) {
          int type = alphabet.lookupIndex(topicTopWords[topic][i]);
          topicWordIndicesInOrder[topic][i] = type;
          wordIndices.add(type);
        }
      }

      topicTopWordIndices[topic] = wordIndices;
      docTopicWordIndices[topic] = new IntHashSet();
    }

    int doc = 0;

    for (TopicAssignment document : model.getData()) {

      FeatureSequence tokens = (FeatureSequence) document.instance.getData();
      FeatureSequence topics = (FeatureSequence) document.topicSequence;

      for (int position = 0; position < tokens.size(); position++) {
        int type = tokens.getIndexAtPosition(position);
        int topic = topics.getIndexAtPosition(position);

        numTokens++;
        wordTypeCounts[type]++;

        topicCounts[topic]++;

        if (topicTopWordIndices[topic].contains(type)) {
          docTopicWordIndices[topic].add(type);
        }
      }

      int docLength = tokens.size();

      if (docLength > 0) {
        int maxTopic = -1;
        int maxCount = -1;

        for (int topic = 0; topic < numTopics; topic++) {

          if (topicCounts[topic] > 0) {
            numNonZeroDocuments[topic]++;

            if (topicCounts[topic] > maxCount) {
              maxTopic = topic;
              maxCount = topicCounts[topic];
            }

            sumCountTimesLogCount[topic] += topicCounts[topic] * Math.log(topicCounts[topic]);

            double proportion =
                (model.alpha[topic] + topicCounts[topic]) / (model.alphaSum + docLength);
            for (int i = 0; i < DEFAULT_DOC_PROPORTIONS.length; i++) {
              if (proportion < DEFAULT_DOC_PROPORTIONS[i]) {
                break;
              }
              numDocumentsAtProportions[topic][i]++;
            }

            IntHashSet supportedWords = docTopicWordIndices[topic];
            int[] indices = topicWordIndicesInOrder[topic];

            for (int i = 0; i < numTopWords; i++) {
              if (supportedWords.contains(indices[i])) {
                for (int j = i; j < numTopWords; j++) {
                  if (i == j) {
                    // Diagonals are total number of documents with word W in topic T
                    topicCodocumentMatrices[topic][i][i]++;
                  } else if (supportedWords.contains(indices[j])) {
                    topicCodocumentMatrices[topic][i][j]++;
                    topicCodocumentMatrices[topic][j][i]++;
                  }
                }
              }
            }

            docTopicWordIndices[topic].clear();
            topicCounts[topic] = 0;
          }
        }

        if (maxTopic > -1) {
          numRank1Documents[maxTopic]++;
        }
      }

      doc++;
    }
  }