예제 #1
0
파일: LDA.java 프로젝트: shamsa-abid/habiba
  public void printTopWords(int numWords, boolean useNewLines) {
    class WordProb implements Comparable {
      int wi;
      double p;

      public WordProb(int wi, double p) {
        this.wi = wi;
        this.p = p;
      }

      public final int compareTo(Object o2) {
        if (p > ((WordProb) o2).p) return -1;
        else if (p == ((WordProb) o2).p) return 0;
        else return 1;
      }
    }

    WordProb[] wp = new WordProb[numTypes];
    for (int ti = 0; ti < numTopics; ti++) {
      for (int wi = 0; wi < numTypes; wi++)
        wp[wi] = new WordProb(wi, ((double) typeTopicCounts[wi][ti]) / tokensPerTopic[ti]);
      Arrays.sort(wp);
      if (useNewLines) {
        System.out.println("\nTopic " + ti);
        for (int i = 0; i < numWords; i++)
          System.out.println(
              ilist.getDataAlphabet().lookupObject(wp[i].wi).toString() + " " + wp[i].p);
      } else {
        System.out.print("Topic " + ti + ": ");
        for (int i = 0; i < numWords; i++)
          System.out.print(ilist.getDataAlphabet().lookupObject(wp[i].wi).toString() + " ");
        System.out.println();
      }
    }
  }
예제 #2
0
파일: LDA.java 프로젝트: shamsa-abid/habiba
  public void addDocuments(
      InstanceList additionalDocuments,
      int numIterations,
      int showTopicsInterval,
      int outputModelInterval,
      String outputModelFilename,
      Randoms r) {
    if (ilist == null) throw new IllegalStateException("Must already have some documents first.");
    for (Instance inst : additionalDocuments) ilist.add(inst);
    assert (ilist.getDataAlphabet() == additionalDocuments.getDataAlphabet());
    assert (additionalDocuments.getDataAlphabet().size() >= numTypes);
    numTypes = additionalDocuments.getDataAlphabet().size();
    int numNewDocs = additionalDocuments.size();
    int numOldDocs = topics.length;
    int numDocs = numOldDocs + numNewDocs;
    // Expand various arrays to make space for the new data.
    int[][] newTopics = new int[numDocs][];
    for (int i = 0; i < topics.length; i++) newTopics[i] = topics[i];

    topics = newTopics; // The rest of this array will be initialized below.
    int[][] newDocTopicCounts = new int[numDocs][numTopics];
    for (int i = 0; i < docTopicCounts.length; i++) newDocTopicCounts[i] = docTopicCounts[i];
    docTopicCounts = newDocTopicCounts; // The rest of this array will be initialized below.
    int[][] newTypeTopicCounts = new int[numTypes][numTopics];
    for (int i = 0; i < typeTopicCounts.length; i++)
      for (int j = 0; j < numTopics; j++)
        newTypeTopicCounts[i][j] = typeTopicCounts[i][j]; // This array further populated below

    FeatureSequence fs;
    for (int di = numOldDocs; di < numDocs; di++) {
      try {
        fs = (FeatureSequence) additionalDocuments.get(di - numOldDocs).getData();
      } catch (ClassCastException e) {
        System.err.println(
            "LDA and other topic models expect FeatureSequence data, not FeatureVector data.  "
                + "With text2vectors, you can obtain such data with --keep-sequence or --keep-bisequence.");
        throw e;
      }
      int seqLen = fs.getLength();
      numTokens += seqLen;
      topics[di] = new int[seqLen];
      // Randomly assign tokens to topics
      for (int si = 0; si < seqLen; si++) {
        int topic = r.nextInt(numTopics);
        topics[di][si] = topic;
        docTopicCounts[di][topic]++;
        typeTopicCounts[fs.getIndexAtPosition(si)][topic]++;
        tokensPerTopic[topic]++;
      }
    }
  }
예제 #3
0
파일: LDA.java 프로젝트: shamsa-abid/habiba
 private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
   int featuresLength;
   int version = in.readInt();
   ilist = (InstanceList) in.readObject();
   numTopics = in.readInt();
   alpha = in.readDouble();
   beta = in.readDouble();
   tAlpha = in.readDouble();
   vBeta = in.readDouble();
   int numDocs = ilist.size();
   topics = new int[numDocs][];
   for (int di = 0; di < ilist.size(); di++) {
     int docLen = ((FeatureSequence) ilist.get(di).getData()).getLength();
     topics[di] = new int[docLen];
     for (int si = 0; si < docLen; si++) topics[di][si] = in.readInt();
   }
   docTopicCounts = new int[numDocs][numTopics];
   for (int di = 0; di < ilist.size(); di++)
     for (int ti = 0; ti < numTopics; ti++) docTopicCounts[di][ti] = in.readInt();
   int numTypes = ilist.getDataAlphabet().size();
   typeTopicCounts = new int[numTypes][numTopics];
   for (int fi = 0; fi < numTypes; fi++)
     for (int ti = 0; ti < numTopics; ti++) typeTopicCounts[fi][ti] = in.readInt();
   tokensPerTopic = new int[numTopics];
   for (int ti = 0; ti < numTopics; ti++) tokensPerTopic[ti] = in.readInt();
 }
예제 #4
0
  public FeatureCountTool(InstanceList instances) {
    this.instances = instances;
    numFeatures = instances.getDataAlphabet().size();

    featureCounts = new double[numFeatures];
    documentFrequencies = new int[numFeatures];
  }
예제 #5
0
파일: LDA.java 프로젝트: shamsa-abid/habiba
  public void estimate(
      InstanceList documents,
      int numIterations,
      int showTopicsInterval,
      int outputModelInterval,
      String outputModelFilename,
      Randoms r) {
    ilist = documents.shallowClone();
    numTypes = ilist.getDataAlphabet().size();
    int numDocs = ilist.size();
    topics = new int[numDocs][];
    docTopicCounts = new int[numDocs][numTopics];
    typeTopicCounts = new int[numTypes][numTopics];
    tokensPerTopic = new int[numTopics];
    tAlpha = alpha * numTopics;
    vBeta = beta * numTypes;

    long startTime = System.currentTimeMillis();

    // Initialize with random assignments of tokens to topics
    // and finish allocating this.topics and this.tokens
    int topic, seqLen;
    FeatureSequence fs;
    for (int di = 0; di < numDocs; di++) {
      try {
        fs = (FeatureSequence) ilist.get(di).getData();
      } catch (ClassCastException e) {
        System.err.println(
            "LDA and other topic models expect FeatureSequence data, not FeatureVector data.  "
                + "With text2vectors, you can obtain such data with --keep-sequence or --keep-bisequence.");
        throw e;
      }
      seqLen = fs.getLength();
      numTokens += seqLen;
      topics[di] = new int[seqLen];
      // Randomly assign tokens to topics
      for (int si = 0; si < seqLen; si++) {
        topic = r.nextInt(numTopics);
        topics[di][si] = topic;
        docTopicCounts[di][topic]++;
        typeTopicCounts[fs.getIndexAtPosition(si)][topic]++;
        tokensPerTopic[topic]++;
      }
    }

    this.estimate(
        0, numDocs, numIterations, showTopicsInterval, outputModelInterval, outputModelFilename, r);
    // 124.5 seconds
    // 144.8 seconds after using FeatureSequence instead of tokens[][] array
    // 121.6 seconds after putting "final" on FeatureSequence.getIndexAtPosition()
    // 106.3 seconds after avoiding array lookup in inner loop with a temporary variable

  }
  /**
   * Initialize this separate model using a complete list.
   *
   * @param documents
   * @param testStartIndex
   */
  public void divideDocuments(InstanceList documents, int testStartIndex) {
    Alphabet dataAlpha = documents.getDataAlphabet();
    Alphabet targetAlpha = documents.getTargetAlphabet();

    this.training = new InstanceList(dataAlpha, targetAlpha);
    this.test = new InstanceList(dataAlpha, targetAlpha);
    int di = 0;
    for (di = 0; di < testStartIndex; di++) {
      training.add(documents.get(di));
    }
    for (di = testStartIndex; di < documents.size(); di++) {
      test.add(documents.get(di));
    }
  }
예제 #7
0
파일: LDA.java 프로젝트: shamsa-abid/habiba
 public void printState(PrintWriter pw) {
   Alphabet a = ilist.getDataAlphabet();
   pw.println("#doc pos typeindex type topic");
   for (int di = 0; di < topics.length; di++) {
     FeatureSequence fs = (FeatureSequence) ilist.get(di).getData();
     for (int si = 0; si < topics[di].length; si++) {
       int type = fs.getIndexAtPosition(si);
       pw.print(di);
       pw.print(' ');
       pw.print(si);
       pw.print(' ');
       pw.print(type);
       pw.print(' ');
       pw.print(a.lookupObject(type));
       pw.print(' ');
       pw.print(topics[di][si]);
       pw.println();
     }
   }
 }
예제 #8
0
  public void printCounts() {

    Alphabet alphabet = instances.getDataAlphabet();

    NumberFormat nf = NumberFormat.getInstance();
    nf.setMinimumFractionDigits(0);
    nf.setMaximumFractionDigits(6);
    nf.setGroupingUsed(false);

    for (int feature = 0; feature < numFeatures; feature++) {

      Formatter formatter = new Formatter(new StringBuilder(), Locale.US);

      formatter.format(
          "%s\t%s\t%d",
          alphabet.lookupObject(feature).toString(),
          nf.format(featureCounts[feature]),
          documentFrequencies[feature]);

      System.out.println(formatter);
    }
  }
예제 #9
0
  public void estimate(
      InstanceList documents,
      int numIterations,
      int showTopicsInterval,
      int outputModelInterval,
      String outputModelFilename,
      Randoms r) {
    ilist = documents;
    uniAlphabet = ilist.getDataAlphabet();
    biAlphabet = ((FeatureSequenceWithBigrams) ilist.get(0).getData()).getBiAlphabet();
    numTypes = uniAlphabet.size();
    numBitypes = biAlphabet.size();
    int numDocs = ilist.size();
    topics = new int[numDocs][];
    grams = new int[numDocs][];
    docTopicCounts = new int[numDocs][numTopics];
    typeNgramTopicCounts = new int[numTypes][2][numTopics];
    unitypeTopicCounts = new int[numTypes][numTopics];
    bitypeTopicCounts = new int[numBitypes][numTopics];
    tokensPerTopic = new int[numTopics];
    bitokensPerTopic = new int[numTypes][numTopics];
    tAlpha = alpha * numTopics;
    vBeta = beta * numTypes;
    vGamma = gamma * numTypes;

    long startTime = System.currentTimeMillis();

    // Initialize with random assignments of tokens to topics
    // and finish allocating this.topics and this.tokens
    int topic, gram, seqLen, fi;
    for (int di = 0; di < numDocs; di++) {
      FeatureSequenceWithBigrams fs = (FeatureSequenceWithBigrams) ilist.get(di).getData();
      seqLen = fs.getLength();
      numTokens += seqLen;
      topics[di] = new int[seqLen];
      grams[di] = new int[seqLen];
      // Randomly assign tokens to topics
      int prevFi = -1, prevTopic = -1;
      for (int si = 0; si < seqLen; si++) {
        // randomly sample a topic for the word at position si
        topic = r.nextInt(numTopics);
        // if a bigram is allowed at position si, then sample a gram status for it.
        gram = (fs.getBiIndexAtPosition(si) == -1 ? 0 : r.nextInt(2));
        if (gram != 0) biTokens++;
        topics[di][si] = topic;
        grams[di][si] = gram;
        docTopicCounts[di][topic]++;
        fi = fs.getIndexAtPosition(si);
        if (prevFi != -1) typeNgramTopicCounts[prevFi][gram][prevTopic]++;
        if (gram == 0) {
          unitypeTopicCounts[fi][topic]++;
          tokensPerTopic[topic]++;
        } else {
          bitypeTopicCounts[fs.getBiIndexAtPosition(si)][topic]++;
          bitokensPerTopic[prevFi][topic]++;
        }
        prevFi = fi;
        prevTopic = topic;
      }
    }

    for (int iterations = 0; iterations < numIterations; iterations++) {
      sampleTopicsForAllDocs(r);
      if (iterations % 10 == 0) System.out.print(iterations);
      else System.out.print(".");
      System.out.flush();
      if (showTopicsInterval != 0 && iterations % showTopicsInterval == 0 && iterations > 0) {
        System.out.println();
        printTopWords(5, false);
      }
      if (outputModelInterval != 0 && iterations % outputModelInterval == 0 && iterations > 0) {
        this.write(new File(outputModelFilename + '.' + iterations));
      }
    }

    System.out.println(
        "\nTotal time (sec): " + ((System.currentTimeMillis() - startTime) / 1000.0));
  }