Example #1
  public void printTopWords(int numWords, boolean useNewLines) {
    class WordProb implements Comparable<WordProb> {
      int wi;
      double p;

      public WordProb(int wi, double p) {
        this.wi = wi;
        this.p = p;
      }

      public final int compareTo(WordProb o2) {
        if (p > o2.p) return -1;
        else if (p == o2.p) return 0;
        else return 1;
      }
    }

    WordProb[] wp = new WordProb[numTypes];
    for (int ti = 0; ti < numTopics; ti++) {
      for (int wi = 0; wi < numTypes; wi++)
        wp[wi] = new WordProb(wi, ((double) typeTopicCounts[wi][ti]) / tokensPerTopic[ti]);
      Arrays.sort(wp);
      if (useNewLines) {
        System.out.println("\nTopic " + ti);
        for (int i = 0; i < numWords; i++)
          System.out.println(
              ilist.getDataAlphabet().lookupObject(wp[i].wi).toString() + " " + wp[i].p);
      } else {
        System.out.print("Topic " + ti + ": ");
        for (int i = 0; i < numWords; i++)
          System.out.print(ilist.getDataAlphabet().lookupObject(wp[i].wi).toString() + " ");
        System.out.println();
      }
    }
  }
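These snippets look like MALLET's older Gibbs-sampling topic model (the fields typeTopicCounts and tokensPerTopic and the estimate signature in Example #8 match the deprecated cc.mallet.topics.LDA class). A minimal usage sketch under that assumption; the class name, input file name, and parameter values are assumptions, not part of the examples:

  import java.io.File;
  import cc.mallet.topics.LDA;          // assumed host class for these snippets
  import cc.mallet.types.InstanceList;
  import cc.mallet.util.Randoms;

  public class TopWordsDemo {
    public static void main(String[] args) throws Exception {
      // Hypothetical file produced by MALLET's importer with --keep-sequence.
      InstanceList documents = InstanceList.load(new File("topic-input.mallet"));
      LDA lda = new LDA(20);                                      // 20 topics
      // Same signature as estimate(...) in Example #8 below.
      lda.estimate(documents, 1000, 100, 0, null, new Randoms());
      lda.printTopWords(10, true);   // 10 most probable words per topic, one per line
    }
  }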
Example #2
  public void addDocuments(
      InstanceList additionalDocuments,
      int numIterations,
      int showTopicsInterval,
      int outputModelInterval,
      String outputModelFilename,
      Randoms r) {
    if (ilist == null) throw new IllegalStateException("Must already have some documents first.");
    for (Instance inst : additionalDocuments) ilist.add(inst);
    assert (ilist.getDataAlphabet() == additionalDocuments.getDataAlphabet());
    assert (additionalDocuments.getDataAlphabet().size() >= numTypes);
    numTypes = additionalDocuments.getDataAlphabet().size();
    int numNewDocs = additionalDocuments.size();
    int numOldDocs = topics.length;
    int numDocs = numOldDocs + numNewDocs;
    // Expand various arrays to make space for the new data.
    int[][] newTopics = new int[numDocs][];
    for (int i = 0; i < topics.length; i++) newTopics[i] = topics[i];

    topics = newTopics; // The rest of this array will be initialized below.
    int[][] newDocTopicCounts = new int[numDocs][numTopics];
    for (int i = 0; i < docTopicCounts.length; i++) newDocTopicCounts[i] = docTopicCounts[i];
    docTopicCounts = newDocTopicCounts; // The rest of this array will be initialized below.
    int[][] newTypeTopicCounts = new int[numTypes][numTopics];
    for (int i = 0; i < typeTopicCounts.length; i++)
      for (int j = 0; j < numTopics; j++)
        newTypeTopicCounts[i][j] = typeTopicCounts[i][j]; // This array further populated below

    FeatureSequence fs;
    for (int di = numOldDocs; di < numDocs; di++) {
      try {
        fs = (FeatureSequence) additionalDocuments.get(di - numOldDocs).getData();
      } catch (ClassCastException e) {
        System.err.println(
            "LDA and other topic models expect FeatureSequence data, not FeatureVector data.  "
                + "With text2vectors, you can obtain such data with --keep-sequence or --keep-bisequence.");
        throw e;
      }
      int seqLen = fs.getLength();
      numTokens += seqLen;
      topics[di] = new int[seqLen];
      // Randomly assign tokens to topics
      for (int si = 0; si < seqLen; si++) {
        int topic = r.nextInt(numTopics);
        topics[di][si] = topic;
        docTopicCounts[di][topic]++;
        typeTopicCounts[fs.getIndexAtPosition(si)][topic]++;
        tokensPerTopic[topic]++;
      }
    }
  }
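Continuing the sketch after Example #1: once a model has been estimated, addDocuments folds new instances (which must share the same data alphabet, as the asserts above check) into the existing count arrays. The file name and host class remain assumptions:

  InstanceList more = InstanceList.load(new File("more-topic-input.mallet"));
  lda.addDocuments(more, 200, 100, 0, null, new Randoms());   // signature as declared above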
Example #3
 private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
   int featuresLength;
   int version = in.readInt();
   ilist = (InstanceList) in.readObject();
   numTopics = in.readInt();
   alpha = in.readDouble();
   beta = in.readDouble();
   tAlpha = in.readDouble();
   vBeta = in.readDouble();
   int numDocs = ilist.size();
   topics = new int[numDocs][];
   for (int di = 0; di < ilist.size(); di++) {
     int docLen = ((FeatureSequence) ilist.get(di).getData()).getLength();
     topics[di] = new int[docLen];
     for (int si = 0; si < docLen; si++) topics[di][si] = in.readInt();
   }
   docTopicCounts = new int[numDocs][numTopics];
   for (int di = 0; di < ilist.size(); di++)
     for (int ti = 0; ti < numTopics; ti++) docTopicCounts[di][ti] = in.readInt();
   int numTypes = ilist.getDataAlphabet().size();
   typeTopicCounts = new int[numTypes][numTopics];
   for (int fi = 0; fi < numTypes; fi++)
     for (int ti = 0; ti < numTopics; ti++) typeTopicCounts[fi][ti] = in.readInt();
   tokensPerTopic = new int[numTopics];
   for (int ti = 0; ti < numTopics; ti++) tokensPerTopic[ti] = in.readInt();
 }
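readObject is the private hook that Java serialization calls, so a model written with ObjectOutputStream can be restored with a plain ObjectInputStream. A sketch, continuing the assumption that the enclosing class is LDA and is Serializable (implied by this method but not shown); checked IOException/ClassNotFoundException handling is left to the enclosing method:

  import java.io.FileInputStream;
  import java.io.FileOutputStream;
  import java.io.ObjectInputStream;
  import java.io.ObjectOutputStream;

  // Save a trained model...
  try (ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream("lda.model"))) {
    out.writeObject(lda);
  }
  // ...and restore it later; readObject(...) above rebuilds the count arrays.
  LDA restored;
  try (ObjectInputStream in = new ObjectInputStream(new FileInputStream("lda.model"))) {
    restored = (LDA) in.readObject();
  }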
Example #4
  public FeatureCountTool(InstanceList instances) {
    this.instances = instances;
    numFeatures = instances.getDataAlphabet().size();

    featureCounts = new double[numFeatures];
    documentFrequencies = new int[numFeatures];
  }
Example #5
  public static InstanceList copy(InstanceList instances) {
    InstanceList ret = (InstanceList) instances.clone();
    // LabelAlphabet labelDict = (LabelAlphabet) ret.getTargetAlphabet();
    Alphabet featDict = ret.getDataAlphabet();

    for (int i = 0; i < ret.size(); i++) {
      Instance instance = ret.get(i);
      Instance clone = (Instance) instance.clone();
      FeatureVector fv = (FeatureVector) clone.getData();

      int[] indices = fv.getIndices();
      double[] values = fv.getValues();

      int[] newIndices = new int[indices.length];
      System.arraycopy(indices, 0, newIndices, 0, indices.length);

      double[] newValues = new double[indices.length];
      System.arraycopy(values, 0, newValues, 0, indices.length);

      FeatureVector newFv = new FeatureVector(featDict, newIndices, newValues);
      Instance newInstance =
          new Instance(newFv, instance.getTarget(), instance.getName(), instance.getSource());
      ret.set(i, newInstance);
    }

    return ret;
  }
Example #6
  public static InstanceList scale(InstanceList trainingList, double lower, double upper) {
    InstanceList ret = copy(trainingList);
    Alphabet featDict = ret.getDataAlphabet();

    double[] feat_max = new double[featDict.size()];
    double[] feat_min = new double[featDict.size()];

    for (int i = 0; i < feat_max.length; i++) {
      feat_max[i] = -Double.MAX_VALUE;
      feat_min[i] = Double.MAX_VALUE;
    }

    for (int i = 0; i < ret.size(); i++) {
      Instance inst = ret.get(i);
      FeatureVector fv = (FeatureVector) inst.getData();

      for (int loc = 0; loc < fv.numLocations(); loc++) {
        int featId = fv.indexAtLocation(loc);
        double value = fv.valueAtLocation(loc);
        double maxValue = feat_max[featId];
        double minValue = feat_min[featId];

        double newMaxValue = Math.max(value, maxValue);
        double newMinValue = Math.min(value, minValue);

        feat_max[featId] = newMaxValue;
        feat_min[featId] = newMinValue;
      }
    }

    // double lower = -1;
    // double upper = 1;

    for (int i = 0; i < ret.size(); i++) {
      Instance inst = ret.get(i);
      FeatureVector fv = (FeatureVector) inst.getData();

      for (int loc = 0; loc < fv.numLocations(); loc++) {
        int featId = fv.indexAtLocation(loc);
        double value = fv.valueAtLocation(loc);
        double maxValue = feat_max[featId];
        double minValue = feat_min[featId];
        double newValue = Double.NaN;
        if (maxValue == minValue) {
          newValue = value;
        } else if (value == minValue) {
          newValue = lower;
        } else if (value == maxValue) {
          newValue = upper;
        } else {
          newValue = lower + (upper - lower) * (value - minValue) / (maxValue - minValue);
        }

        fv.setValueAtLocation(loc, newValue);
      }
    }

    return ret;
  }
Example #7
  public SVM train(InstanceList trainingList) {
    svm_problem problem = new svm_problem();
    problem.l = trainingList.size();
    problem.x = new svm_node[problem.l][];
    problem.y = new double[problem.l];

    for (int i = 0; i < trainingList.size(); i++) {
      Instance instance = trainingList.get(i);
      svm_node[] input = SVM.getSvmNodes(instance);
      if (input == null) {
        continue;
      }
      int labelIndex = ((Label) instance.getTarget()).getIndex();
      problem.x[i] = input;
      problem.y[i] = labelIndex;
    }

    int max_index = trainingList.getDataAlphabet().size();

    if (param.gamma == 0 && max_index > 0) {
      param.gamma = 1.0 / max_index;
    }

    // int numLabels = trainingList.getTargetAlphabet().size();
    // int[] weight_label = new int[numLabels];
    // double[] weight = trainingList.targetLabelDistribution().getValues();
    // double minValue = Double.MAX_VALUE;
    //
    // for (int i = 0; i < weight.length; i++) {
    // if (minValue > weight[i]) {
    // minValue = weight[i];
    // }
    // }
    //
    // for (int i = 0; i < weight.length; i++) {
    // weight_label[i] = i;
    // weight[i] = weight[i] / minValue;
    // }
    //
    // param.weight_label = weight_label;
    // param.weight = weight;

    String error_msg = svm.svm_check_parameter(problem, param);

    if (error_msg != null) {
      System.err.print("Error: " + error_msg + "\n");
      System.exit(1);
    }

    svm_model model = svm.svm_train(problem, param);

    classifier = new SVM(model, trainingList.getPipe());

    return classifier;
  }
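Putting the last three methods together: libsvm generally expects features rescaled to a fixed range, so scale(...) is applied first and train(...) then builds the svm_model and wraps it in the SVM classifier returned above. A sketch in which the wrapper class name (SVMClassifierTrainer) and its no-argument constructor are assumptions; only scale(...) and train(...) are taken from the code above:

  SVMClassifierTrainer trainer = new SVMClassifierTrainer();            // assumed constructor, default svm_parameter
  InstanceList scaled = SVMClassifierTrainer.scale(trainingList, -1.0, 1.0);
  SVM svmClassifier = trainer.train(scaled);                            // returns the trained SVM shown above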
Example #8
  public void estimate(
      InstanceList documents,
      int numIterations,
      int showTopicsInterval,
      int outputModelInterval,
      String outputModelFilename,
      Randoms r) {
    ilist = documents.shallowClone();
    numTypes = ilist.getDataAlphabet().size();
    int numDocs = ilist.size();
    topics = new int[numDocs][];
    docTopicCounts = new int[numDocs][numTopics];
    typeTopicCounts = new int[numTypes][numTopics];
    tokensPerTopic = new int[numTopics];
    tAlpha = alpha * numTopics;
    vBeta = beta * numTypes;

    long startTime = System.currentTimeMillis();

    // Initialize with random assignments of tokens to topics
    // and finish allocating this.topics and this.tokens
    int topic, seqLen;
    FeatureSequence fs;
    for (int di = 0; di < numDocs; di++) {
      try {
        fs = (FeatureSequence) ilist.get(di).getData();
      } catch (ClassCastException e) {
        System.err.println(
            "LDA and other topic models expect FeatureSequence data, not FeatureVector data.  "
                + "With text2vectors, you can obtain such data with --keep-sequence or --keep-bisequence.");
        throw e;
      }
      seqLen = fs.getLength();
      numTokens += seqLen;
      topics[di] = new int[seqLen];
      // Randomly assign tokens to topics
      for (int si = 0; si < seqLen; si++) {
        topic = r.nextInt(numTopics);
        topics[di][si] = topic;
        docTopicCounts[di][topic]++;
        typeTopicCounts[fs.getIndexAtPosition(si)][topic]++;
        tokensPerTopic[topic]++;
      }
    }

    this.estimate(
        0, numDocs, numIterations, showTopicsInterval, outputModelInterval, outputModelFilename, r);
    // 124.5 seconds
    // 144.8 seconds after using FeatureSequence instead of tokens[][] array
    // 121.6 seconds after putting "final" on FeatureSequence.getIndexAtPosition()
    // 106.3 seconds after avoiding array lookup in inner loop with a temporary variable

  }
Example #9
 public Node(InstanceList ilist, Node parent, int minNumInsts, int[] instIndices) {
   if (instIndices == null) {
     instIndices = new int[ilist.size()];
     for (int ii = 0; ii < instIndices.length; ii++) instIndices[ii] = ii;
   }
   m_gainRatio = GainRatio.createGainRatio(ilist, instIndices, minNumInsts);
   m_ilist = ilist;
   m_instIndices = instIndices;
   m_dataDict = m_ilist.getDataAlphabet();
   m_minNumInsts = minNumInsts;
   m_parent = parent;
   m_leftChild = m_rightChild = null;
 }
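From the constructor itself: passing null for instIndices makes the node cover every instance in ilist, and a null parent makes it a root. A one-line sketch (the enclosing decision-tree class is not shown here, and minNumInsts = 2 is just an example value):

  Node root = new Node(trainingInstances, null, 2, null);   // root node over the whole list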
Example #10
  @Test
  public void testLoadRareWords() throws UnsupportedEncodingException, FileNotFoundException {
    String dataset_fn = "src/main/resources/datasets/SmallTexts.txt";
    InstanceList nonPrunedInstances = LDAUtils.loadInstances(dataset_fn, "stoplist.txt", 0);
    System.out.println(LDAUtils.instancesToString(nonPrunedInstances));
    System.out.println("Non pruned Alphabet size: " + nonPrunedInstances.getDataAlphabet().size());
    System.out.println("No. instances: " + nonPrunedInstances.size());

    InstanceList originalInstances = LDAUtils.loadInstances(dataset_fn, "stoplist.txt", 2);
    System.out.println("Alphabet size: " + originalInstances.getDataAlphabet().size());
    System.out.println(LDAUtils.instancesToString(originalInstances));
    System.out.println("No. instances: " + originalInstances.size());

    int[] wordCounts = {0, 3, 3, 0, 0};
    int idx = 0;
    for (Instance instance : originalInstances) {
      FeatureSequence fs = (FeatureSequence) instance.getData();
      // This assertion would fail: even though the feature sequence
      // is "empty", the underlying array has length 2.
      // assertEquals(wordCounts[idx++], fs.getFeatures().length);
      assertEquals(wordCounts[idx++], fs.size());
    }
  }
Example #11
  /**
   * Initialize this separate model by dividing a complete list into training and test sets.
   *
   * @param documents the full list of instances to divide
   * @param testStartIndex index of the first instance to place in the test set
   */
  public void divideDocuments(InstanceList documents, int testStartIndex) {
    Alphabet dataAlpha = documents.getDataAlphabet();
    Alphabet targetAlpha = documents.getTargetAlphabet();

    this.training = new InstanceList(dataAlpha, targetAlpha);
    this.test = new InstanceList(dataAlpha, targetAlpha);
    int di = 0;
    for (di = 0; di < testStartIndex; di++) {
      training.add(documents.get(di));
    }
    for (di = testStartIndex; di < documents.size(); di++) {
      test.add(documents.get(di));
    }
  }
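A quick way to use this split: derive the boundary index from the desired training fraction. The object holding divideDocuments is not shown in the example, so splitter below is hypothetical:

  int testStartIndex = (int) (documents.size() * 0.8);   // first 80% for training
  splitter.divideDocuments(documents, testStartIndex);    // the rest goes into the test list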
Example #12
 public void printState(PrintWriter pw) {
   Alphabet a = ilist.getDataAlphabet();
   pw.println("#doc pos typeindex type topic");
   for (int di = 0; di < topics.length; di++) {
     FeatureSequence fs = (FeatureSequence) ilist.get(di).getData();
     for (int si = 0; si < topics[di].length; si++) {
       int type = fs.getIndexAtPosition(si);
       pw.print(di);
       pw.print(' ');
       pw.print(si);
       pw.print(' ');
       pw.print(type);
       pw.print(' ');
       pw.print(a.lookupObject(type));
       pw.print(' ');
       pw.print(topics[di][si]);
       pw.println();
     }
   }
 }
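A small follow-up, continuing the LDA sketch: printState takes any PrintWriter, so the per-token topic assignments can be dumped straight to a file. The file name is an assumption, and PrintWriter(String) throws FileNotFoundException, handled here by the enclosing method:

  try (java.io.PrintWriter pw = new java.io.PrintWriter("lda-state.txt")) {
    lda.printState(pw);   // one "doc pos typeindex type topic" line per token
  }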
Example #13
  public void printCounts() {

    Alphabet alphabet = instances.getDataAlphabet();

    NumberFormat nf = NumberFormat.getInstance();
    nf.setMinimumFractionDigits(0);
    nf.setMaximumFractionDigits(6);
    nf.setGroupingUsed(false);

    for (int feature = 0; feature < numFeatures; feature++) {

      Formatter formatter = new Formatter(new StringBuilder(), Locale.US);

      formatter.format(
          "%s\t%s\t%d",
          alphabet.lookupObject(feature).toString(),
          nf.format(featureCounts[feature]),
          documentFrequencies[feature]);

      System.out.println(formatter);
    }
  }
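The constructor in Example #4 only allocates the count arrays, so something has to fill featureCounts and documentFrequencies before printCounts is useful. Current MALLET's cc.mallet.util.FeatureCountTool has a count() method for that, but since that method is not shown here, treat it as an assumption:

  FeatureCountTool counter = new FeatureCountTool(instances);
  counter.count();        // assumed: populates featureCounts and documentFrequencies
  counter.printCounts();  // prints "feature <TAB> count <TAB> document frequency" per feature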
Example #14
  public void estimate(
      InstanceList documents,
      int numIterations,
      int showTopicsInterval,
      int outputModelInterval,
      String outputModelFilename,
      Randoms r) {
    ilist = documents;
    uniAlphabet = ilist.getDataAlphabet();
    biAlphabet = ((FeatureSequenceWithBigrams) ilist.get(0).getData()).getBiAlphabet();
    numTypes = uniAlphabet.size();
    numBitypes = biAlphabet.size();
    int numDocs = ilist.size();
    topics = new int[numDocs][];
    grams = new int[numDocs][];
    docTopicCounts = new int[numDocs][numTopics];
    typeNgramTopicCounts = new int[numTypes][2][numTopics];
    unitypeTopicCounts = new int[numTypes][numTopics];
    bitypeTopicCounts = new int[numBitypes][numTopics];
    tokensPerTopic = new int[numTopics];
    bitokensPerTopic = new int[numTypes][numTopics];
    tAlpha = alpha * numTopics;
    vBeta = beta * numTypes;
    vGamma = gamma * numTypes;

    long startTime = System.currentTimeMillis();

    // Initialize with random assignments of tokens to topics
    // and finish allocating this.topics and this.tokens
    int topic, gram, seqLen, fi;
    for (int di = 0; di < numDocs; di++) {
      FeatureSequenceWithBigrams fs = (FeatureSequenceWithBigrams) ilist.get(di).getData();
      seqLen = fs.getLength();
      numTokens += seqLen;
      topics[di] = new int[seqLen];
      grams[di] = new int[seqLen];
      // Randomly assign tokens to topics
      int prevFi = -1, prevTopic = -1;
      for (int si = 0; si < seqLen; si++) {
        // randomly sample a topic for the word at position si
        topic = r.nextInt(numTopics);
        // if a bigram is allowed at position si, then sample a gram status for it.
        gram = (fs.getBiIndexAtPosition(si) == -1 ? 0 : r.nextInt(2));
        if (gram != 0) biTokens++;
        topics[di][si] = topic;
        grams[di][si] = gram;
        docTopicCounts[di][topic]++;
        fi = fs.getIndexAtPosition(si);
        if (prevFi != -1) typeNgramTopicCounts[prevFi][gram][prevTopic]++;
        if (gram == 0) {
          unitypeTopicCounts[fi][topic]++;
          tokensPerTopic[topic]++;
        } else {
          bitypeTopicCounts[fs.getBiIndexAtPosition(si)][topic]++;
          bitokensPerTopic[prevFi][topic]++;
        }
        prevFi = fi;
        prevTopic = topic;
      }
    }

    for (int iterations = 0; iterations < numIterations; iterations++) {
      sampleTopicsForAllDocs(r);
      if (iterations % 10 == 0) System.out.print(iterations);
      else System.out.print(".");
      System.out.flush();
      if (showTopicsInterval != 0 && iterations % showTopicsInterval == 0 && iterations > 0) {
        System.out.println();
        printTopWords(5, false);
      }
      if (outputModelInterval != 0 && iterations % outputModelInterval == 0 && iterations > 0) {
        this.write(new File(outputModelFilename + '.' + iterations));
      }
    }

    System.out.println(
        "\nTotal time (sec): " + ((System.currentTimeMillis() - startTime) / 1000.0));
  }
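The field names in this last example (bitypeTopicCounts, bitokensPerTopic, FeatureSequenceWithBigrams) match MALLET's bigram topic model, TopicalNGrams. A closing sketch under that assumption, reusing the imports from the first sketch; the constructor argument and file name are assumptions, printTopWords(int, boolean) is assumed to exist as in Example #1, and the input must be imported with --keep-bisequence so that getData() returns FeatureSequenceWithBigrams:

  import cc.mallet.topics.TopicalNGrams;   // assumed host class for this example

  InstanceList docs = InstanceList.load(new File("bigram-input.mallet"));
  TopicalNGrams tng = new TopicalNGrams(20);                 // assumed: number of topics
  tng.estimate(docs, 1000, 100, 0, null, new Randoms());     // signature as declared above
  tng.printTopWords(10, true);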