Esempio n. 1
0
  /** This is (mostly) copied from CRF4.java */
  public boolean[][] labelConnectionsIn(
      Alphabet outputAlphabet, InstanceList trainingSet, String start) {
    int numLabels = outputAlphabet.size();
    boolean[][] connections = new boolean[numLabels][numLabels];
    for (int i = 0; i < trainingSet.size(); i++) {
      Instance instance = trainingSet.getInstance(i);
      FeatureSequence output = (FeatureSequence) instance.getTarget();
      for (int j = 1; j < output.size(); j++) {
        int sourceIndex = outputAlphabet.lookupIndex(output.get(j - 1));
        int destIndex = outputAlphabet.lookupIndex(output.get(j));
        assert (sourceIndex >= 0 && destIndex >= 0);
        connections[sourceIndex][destIndex] = true;
      }
    }

    // Handle start state
    if (start != null) {
      int startIndex = outputAlphabet.lookupIndex(start);
      for (int j = 0; j < outputAlphabet.size(); j++) {
        connections[startIndex][j] = true;
      }
    }

    return connections;
  }
  public TopicScores getDistanceFromUniform() {
    int[] tokensPerTopic = model.tokensPerTopic;

    TopicScores scores = new TopicScores("uniform_dist", numTopics, numTopWords);
    scores.wordScoresDefined = true;

    int numTypes = alphabet.size();

    for (int topic = 0; topic < numTopics; topic++) {

      double topicScore = 0.0;
      int position = 0;
      TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

      for (IDSorter info : sortedWords) {
        int type = info.getID();
        double count = info.getWeight();

        double score =
            (count / tokensPerTopic[topic]) * Math.log((count * numTypes) / tokensPerTopic[topic]);

        if (position < numTopWords) {
          scores.setTopicWordScore(topic, position, score);
        }

        topicScore += score;
        position++;
      }

      scores.setTopicScore(topic, topicScore);
    }

    return scores;
  }
  public TopicScores getEffectiveNumberOfWords() {
    int[] tokensPerTopic = model.tokensPerTopic;

    TopicScores scores = new TopicScores("eff_num_words", numTopics, numTopWords);

    int numTypes = alphabet.size();

    for (int topic = 0; topic < numTopics; topic++) {

      double sumSquaredProbabilities = 0.0;
      TreeSet<IDSorter> sortedWords = topicSortedWords.get(topic);

      for (IDSorter info : sortedWords) {
        int type = info.getID();
        double probability = info.getWeight() / tokensPerTopic[topic];

        sumSquaredProbabilities += probability * probability;
      }

      scores.setTopicScore(topic, 1.0 / sumSquaredProbabilities);
    }

    return scores;
  }
  public void collectDocumentStatistics() {

    topicCodocumentMatrices = new int[numTopics][numTopWords][numTopWords];
    wordTypeCounts = new int[alphabet.size()];
    numTokens = 0;

    // This is an array of hash sets containing the words-of-interest for each topic,
    //  used for checking if the word at some position is one of those words.
    IntHashSet[] topicTopWordIndices = new IntHashSet[numTopics];

    // The same as the topic top words, but with int indices instead of strings,
    //  used for iterating over positions.
    int[][] topicWordIndicesInOrder = new int[numTopics][numTopWords];

    // This is an array of hash sets that will hold the words-of-interest present in a document,
    //  which will be cleared after every document.
    IntHashSet[] docTopicWordIndices = new IntHashSet[numTopics];

    int numDocs = model.getData().size();

    // The count of each topic, again cleared after every document.
    int[] topicCounts = new int[numTopics];

    for (int topic = 0; topic < numTopics; topic++) {
      IntHashSet wordIndices = new IntHashSet();

      for (int i = 0; i < numTopWords; i++) {
        if (topicTopWords[topic][i] != null) {
          int type = alphabet.lookupIndex(topicTopWords[topic][i]);
          topicWordIndicesInOrder[topic][i] = type;
          wordIndices.add(type);
        }
      }

      topicTopWordIndices[topic] = wordIndices;
      docTopicWordIndices[topic] = new IntHashSet();
    }

    int doc = 0;

    for (TopicAssignment document : model.getData()) {

      FeatureSequence tokens = (FeatureSequence) document.instance.getData();
      FeatureSequence topics = (FeatureSequence) document.topicSequence;

      for (int position = 0; position < tokens.size(); position++) {
        int type = tokens.getIndexAtPosition(position);
        int topic = topics.getIndexAtPosition(position);

        numTokens++;
        wordTypeCounts[type]++;

        topicCounts[topic]++;

        if (topicTopWordIndices[topic].contains(type)) {
          docTopicWordIndices[topic].add(type);
        }
      }

      int docLength = tokens.size();

      if (docLength > 0) {
        int maxTopic = -1;
        int maxCount = -1;

        for (int topic = 0; topic < numTopics; topic++) {

          if (topicCounts[topic] > 0) {
            numNonZeroDocuments[topic]++;

            if (topicCounts[topic] > maxCount) {
              maxTopic = topic;
              maxCount = topicCounts[topic];
            }

            sumCountTimesLogCount[topic] += topicCounts[topic] * Math.log(topicCounts[topic]);

            double proportion =
                (model.alpha[topic] + topicCounts[topic]) / (model.alphaSum + docLength);
            for (int i = 0; i < DEFAULT_DOC_PROPORTIONS.length; i++) {
              if (proportion < DEFAULT_DOC_PROPORTIONS[i]) {
                break;
              }
              numDocumentsAtProportions[topic][i]++;
            }

            IntHashSet supportedWords = docTopicWordIndices[topic];
            int[] indices = topicWordIndicesInOrder[topic];

            for (int i = 0; i < numTopWords; i++) {
              if (supportedWords.contains(indices[i])) {
                for (int j = i; j < numTopWords; j++) {
                  if (i == j) {
                    // Diagonals are total number of documents with word W in topic T
                    topicCodocumentMatrices[topic][i][i]++;
                  } else if (supportedWords.contains(indices[j])) {
                    topicCodocumentMatrices[topic][i][j]++;
                    topicCodocumentMatrices[topic][j][i]++;
                  }
                }
              }
            }

            docTopicWordIndices[topic].clear();
            topicCounts[topic] = 0;
          }
        }

        if (maxTopic > -1) {
          numRank1Documents[maxTopic]++;
        }
      }

      doc++;
    }
  }
Esempio n. 5
0
 public void add(int featureIndex) {
   growIfNecessary();
   assert (featureIndex < dictionary.size());
   features[length++] = featureIndex;
 }
Esempio n. 6
0
  public void estimate(
      InstanceList documents,
      int numIterations,
      int showTopicsInterval,
      int outputModelInterval,
      String outputModelFilename,
      Randoms r) {
    ilist = documents;
    uniAlphabet = ilist.getDataAlphabet();
    biAlphabet = ((FeatureSequenceWithBigrams) ilist.get(0).getData()).getBiAlphabet();
    numTypes = uniAlphabet.size();
    numBitypes = biAlphabet.size();
    int numDocs = ilist.size();
    topics = new int[numDocs][];
    grams = new int[numDocs][];
    docTopicCounts = new int[numDocs][numTopics];
    typeNgramTopicCounts = new int[numTypes][2][numTopics];
    unitypeTopicCounts = new int[numTypes][numTopics];
    bitypeTopicCounts = new int[numBitypes][numTopics];
    tokensPerTopic = new int[numTopics];
    bitokensPerTopic = new int[numTypes][numTopics];
    tAlpha = alpha * numTopics;
    vBeta = beta * numTypes;
    vGamma = gamma * numTypes;

    long startTime = System.currentTimeMillis();

    // Initialize with random assignments of tokens to topics
    // and finish allocating this.topics and this.tokens
    int topic, gram, seqLen, fi;
    for (int di = 0; di < numDocs; di++) {
      FeatureSequenceWithBigrams fs = (FeatureSequenceWithBigrams) ilist.get(di).getData();
      seqLen = fs.getLength();
      numTokens += seqLen;
      topics[di] = new int[seqLen];
      grams[di] = new int[seqLen];
      // Randomly assign tokens to topics
      int prevFi = -1, prevTopic = -1;
      for (int si = 0; si < seqLen; si++) {
        // randomly sample a topic for the word at position si
        topic = r.nextInt(numTopics);
        // if a bigram is allowed at position si, then sample a gram status for it.
        gram = (fs.getBiIndexAtPosition(si) == -1 ? 0 : r.nextInt(2));
        if (gram != 0) biTokens++;
        topics[di][si] = topic;
        grams[di][si] = gram;
        docTopicCounts[di][topic]++;
        fi = fs.getIndexAtPosition(si);
        if (prevFi != -1) typeNgramTopicCounts[prevFi][gram][prevTopic]++;
        if (gram == 0) {
          unitypeTopicCounts[fi][topic]++;
          tokensPerTopic[topic]++;
        } else {
          bitypeTopicCounts[fs.getBiIndexAtPosition(si)][topic]++;
          bitokensPerTopic[prevFi][topic]++;
        }
        prevFi = fi;
        prevTopic = topic;
      }
    }

    for (int iterations = 0; iterations < numIterations; iterations++) {
      sampleTopicsForAllDocs(r);
      if (iterations % 10 == 0) System.out.print(iterations);
      else System.out.print(".");
      System.out.flush();
      if (showTopicsInterval != 0 && iterations % showTopicsInterval == 0 && iterations > 0) {
        System.out.println();
        printTopWords(5, false);
      }
      if (outputModelInterval != 0 && iterations % outputModelInterval == 0 && iterations > 0) {
        this.write(new File(outputModelFilename + '.' + iterations));
      }
    }

    System.out.println(
        "\nTotal time (sec): " + ((System.currentTimeMillis() - startTime) / 1000.0));
  }