Beispiel #1
0
  /**
   * Infers a topic distribution for every test instance and prints the leading topics of each one
   * to standard output.
   *
   * <p>The topic model is read from {@code inferencerFile} and its inferencer is applied to every
   * instance returned by {@code generateInstanceList()}. For each instance the (topic index,
   * probability) pairs are ordered with {@code CustomComparator} and at most {@code topN} of them
   * are printed on one line in the form {@code docId,topic,prob topic,prob ...}.
   *
   * <p>Any exception is reported on standard error and otherwise swallowed, so a failure never
   * propagates to the caller.
   */
  public void doInference() {

    // Arguments for TopicInferencer.getSampledDistribution(instance, numIterations, thinning,
    // burnIn).
    final int samplingIterations = 10;
    final int samplingThinning = 1;
    final int samplingBurnIn = 5;

    try {

      ParallelTopicModel model = ParallelTopicModel.read(new File(inferencerFile));
      TopicInferencer inferencer = model.getInferencer();

      // The result of readFile() is not used here.
      // NOTE(review): presumably called for its side effects (e.g. filling docIds) - confirm.
      readFile();
      InstanceList testing = generateInstanceList();

      for (int i = 0; i < testing.size(); i++) {

        double[] testProbabilities =
            inferencer.getSampledDistribution(
                testing.get(i), samplingIterations, samplingThinning, samplingBurnIn);

        // Pair every probability with its topic index so the index survives the sort.
        ArrayList<Pair<Integer, Double>> probabilityList =
            new ArrayList<Pair<Integer, Double>>(testProbabilities.length);
        for (int j = 0; j < testProbabilities.length; j++) {
          probabilityList.add(new Pair<Integer, Double>(j, testProbabilities[j]));
        }

        // NOTE(review): CustomComparator presumably orders by descending probability - confirm.
        Collections.sort(probabilityList, new CustomComparator());

        StringBuilder probabilities = new StringBuilder();
        for (int j = 0; j < probabilityList.size() && j < topN; j++) {
          if (j > 0) {
            probabilities.append(" ");
          }
          Pair<Integer, Double> entry = probabilityList.get(j);
          probabilities.append(entry.getFirst()).append(",").append(entry.getSecond());
        }

        System.out.println(docIds.get(i) + "," + probabilities);
      }

    } catch (Exception e) {
      e.printStackTrace();
      System.err.println(e.getMessage());
    }
  }
  /**
   * Trains a {@code SeparateParallelLda} model on fold 0 of the VLC data set, runs the Task 1
   * solver on it and prints the resulting precision.
   *
   * <p>Steps: load the training and test instance lists, train with the hard-coded
   * hyper-parameters below, generate the test inference, print the top words to standard output,
   * call the solver with the query and solution files, and evaluate the solution file against the
   * target file. An exception raised by the solver or the evaluation is printed as a stack trace
   * and the program ends without a precision line.
   *
   * @param args command-line arguments; not used
   */
  public static void main(String[] args) {
    String trainMalletFile = "dataset/vlc/folds/training.0.mallet";
    String testMalletFile = "dataset/vlc/folds/test.0.mallet";
    String queryFile = "dataset/vlc/folds/query.0.csv";
    String targetFile = "dataset/vlc/folds/target.0.csv";
    String solutionFile = "dataset/vlc/task1_solution.en.f8.lm.txt";

    int numTopics = 160;
    int numIterations = 200;
    double alpha = 0.0016;
    double beta = 0.0001;

    InstanceList train = InstanceList.load(new File(trainMalletFile));
    InstanceList test = InstanceList.load(new File(testMalletFile));
    SeparateParallelLda spl = new SeparateParallelLda(train, test);
    spl.trainDocuments(numTopics, numIterations, alpha, beta);
    spl.generateTestInference();
    spl.lda.printTopWords(System.out, 10, true);
    BasicTask1Solution solver = new Task1SolutionWithSeparateData(spl);

    try {
      // NOTE(review): solutionFile is presumably written by the solver and then read back by
      // evaluateResult - confirm.
      solver.retrieveTask1Solution(queryFile, solutionFile);
      double precision = Task1Solution.evaluateResult(targetFile, solutionFile);
      System.out.println(
          String.format(
              "SeparateParallelLda: iteration: %d, precision: %f", numIterations, precision));
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
  /**
   * Runs one sampling pass over this worker's range of documents.
   *
   * <p>If the previous pass has not finished ({@code isFinished} is false) the call prints
   * "already running!" and returns without doing anything. Otherwise it
   *
   * <ol>
   *   <li>recomputes the smoothing-only masses and the cached per-topic coefficients for both the
   *       token side and the label side,
   *   <li>calls {@code sampleTopicsForOneDoc} for every document in {@code [startDoc, startDoc +
   *       numDocs)} that exists in {@code data},
   *   <li>rebuilds the local type-topic counts when {@code shouldBuildLocalCounts} is set, and
   *   <li>clears {@code shouldSaveState} and sets {@code isFinished} back to true.
   * </ol>
   *
   * <p>Exceptions are caught and printed as a stack trace. An exception thrown after the pass has
   * started therefore leaves {@code isFinished} false.
   */
  public void run() {

    try {

      // Refuse to start while an earlier pass is still marked as in progress.
      if (!isFinished) {
        System.out.println("already running!");
        return;
      }

      isFinished = false;

      // Reset the smoothing-only sampling buckets for tokens and for labels.
      smoothingOnlyMass = 0;
      smoothingOnlyLabelMass = 0;

      // Seed the cached coefficients from smoothing alone. With ignoreLabels set the plain
      // alpha-based value is used; otherwise it is scaled by a factor derived from lblWeight.
      int t = 0;
      while (t < numTopics) {
        smoothingOnlyMass += alpha[t] * beta / (tokensPerTopic[t] + betaSum);
        cachedCoefficients[t] =
            ignoreLabels
                ? alpha[t] / (tokensPerTopic[t] + betaSum)
                : (1 + lblWeight) * alpha[t] / (tokensPerTopic[t] + betaSum);

        smoothingOnlyLabelMass += alpha[t] * gamma / (labelsPerTopic[t] + gammaSum);
        // NOTE(review): if lblWeight has an integer type, 1 / lblWeight is integer division -
        // confirm the declared type.
        cachedLabelCoefficients[t] =
            ignoreLabels
                ? alpha[t] / (labelsPerTopic[t] + gammaSum)
                : (1 + 1 / lblWeight) * alpha[t] / (labelsPerTopic[t] + gammaSum);

        t++;
      }

      // Sample every document of this worker's slice, stopping early at the end of the data.
      int docIndex = startDoc;
      while (docIndex < data.size() && docIndex < startDoc + numDocs) {
        sampleTopicsForOneDoc(data.get(docIndex));
        docIndex++;
      }

      if (shouldBuildLocalCounts) {
        buildLocalTypeTopicCounts();
      }

      // Mark the pass as complete.
      shouldSaveState = false;
      isFinished = true;

    } catch (Exception e) {
      e.printStackTrace();
    }
  }