public void doInference() {
    try {
        ParallelTopicModel model = ParallelTopicModel.read(new File(inferencerFile));
        TopicInferencer inferencer = model.getInferencer();

        // readFile() populates docIds (used below for output); the test
        // instances themselves come from generateInstanceList().
        readFile();
        InstanceList testing = generateInstanceList();

        for (int i = 0; i < testing.size(); i++) {
            // Sample a topic distribution for this document:
            // 10 iterations, thinning 1, burn-in 5.
            double[] testProbabilities =
                    inferencer.getSampledDistribution(testing.get(i), 10, 1, 5);

            // Pair each topic id with its probability, then sort.
            ArrayList<Pair<Integer, Double>> probabilityList =
                    new ArrayList<Pair<Integer, Double>>();
            for (int j = 0; j < testProbabilities.length; j++) {
                probabilityList.add(new Pair<Integer, Double>(j, testProbabilities[j]));
            }
            Collections.sort(probabilityList, new CustomComparator());

            // Emit the top-N topics as "topic,probability" pairs.
            StringBuilder probabilities = new StringBuilder();
            for (int j = 0; j < testProbabilities.length && j < topN; j++) {
                if (j > 0) probabilities.append(" ");
                probabilities.append(probabilityList.get(j).getFirst())
                        .append(",")
                        .append(probabilityList.get(j).getSecond());
            }
            System.out.println(docIds.get(i) + "," + probabilities);
        }
    } catch (Exception e) {
        e.printStackTrace();
        System.err.println(e.getMessage());
    }
}
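// CustomComparator is referenced in doInference() but not shown in this
// section. A minimal sketch of what it presumably does, assuming the Pair
// class used above exposes getFirst()/getSecond() and that
// java.util.Comparator is imported: order (topic, probability) pairs by
// descending probability so the most likely topics sort first.
class CustomComparator implements Comparator<Pair<Integer, Double>> {
    @Override
    public int compare(Pair<Integer, Double> a, Pair<Integer, Double> b) {
        // Descending by probability (the second element of the pair).
        return Double.compare(b.getSecond(), a.getSecond());
    }
}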
public static void main(String[] args) {
    String trainMalletFile = "dataset/vlc/folds/training.0.mallet";
    String testMalletFile = "dataset/vlc/folds/test.0.mallet";
    String queryFile = "dataset/vlc/folds/query.0.csv";
    String targetFile = "dataset/vlc/folds/target.0.csv";
    String solutionFile = "dataset/vlc/task1_solution.en.f8.lm.txt";

    int numTopics = 160;
    int numIterations = 200;
    double alpha = 0.0016;
    double beta = 0.0001;

    // Train on one fold and infer topic distributions for the held-out fold.
    InstanceList train = InstanceList.load(new File(trainMalletFile));
    InstanceList test = InstanceList.load(new File(testMalletFile));
    SeparateParallelLda spl = new SeparateParallelLda(train, test);
    spl.trainDocuments(numTopics, numIterations, alpha, beta);
    spl.generateTestInference();
    spl.lda.printTopWords(System.out, 10, true);

    // Solve task 1 with the trained model and score against the target file.
    BasicTask1Solution solver = new Task1SolutionWithSeparateData(spl);
    try {
        solver.retrieveTask1Solution(queryFile, solutionFile);
        double precision = Task1Solution.evaluateResult(targetFile, solutionFile);
        System.out.println(String.format(
                "SeparateParallelLda: iteration: %d, precision: %f",
                numIterations, precision));
    } catch (Exception e) {
        e.printStackTrace();
    }
}
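// main() hard-codes fold 0 of the cross-validation split. A hypothetical
// sketch of running every fold with the same file-naming scheme; the helper
// name runAllFolds and the numFolds parameter are assumptions, not part of
// the original code, and evaluation per fold would mirror main() above.
public static void runAllFolds(int numFolds) {
    for (int fold = 0; fold < numFolds; fold++) {
        InstanceList train = InstanceList.load(
                new File(String.format("dataset/vlc/folds/training.%d.mallet", fold)));
        InstanceList test = InstanceList.load(
                new File(String.format("dataset/vlc/folds/test.%d.mallet", fold)));
        SeparateParallelLda spl = new SeparateParallelLda(train, test);
        spl.trainDocuments(160, 200, 0.0016, 0.0001);
        spl.generateTestInference();
        // ... retrieve and evaluate the task 1 solution for this fold,
        // using query.<fold>.csv and target.<fold>.csv as in main().
    }
}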
public void run() {
    try {
        // isFinished doubles as a "not currently running" flag: it is true
        // between sweeps, so a false value here means a sweep is in progress.
        if (!isFinished) {
            System.out.println("already running!");
            return;
        }
        isFinished = false;

        // Initialize the smoothing-only sampling buckets for tokens and labels.
        smoothingOnlyMass = 0;
        smoothingOnlyLabelMass = 0;

        // Initialize the cached coefficients, using only smoothing.
        // These values will be selectively replaced in documents with
        // non-zero counts in particular topics.
        for (int topic = 0; topic < numTopics; topic++) {
            smoothingOnlyMass += alpha[topic] * beta / (tokensPerTopic[topic] + betaSum);
            if (ignoreLabels) {
                cachedCoefficients[topic] =
                        alpha[topic] / (tokensPerTopic[topic] + betaSum);
            } else {
                // Labels contribute to the token coefficients via lblWeight.
                cachedCoefficients[topic] =
                        (1 + lblWeight) * alpha[topic] / (tokensPerTopic[topic] + betaSum);
            }

            smoothingOnlyLabelMass +=
                    alpha[topic] * gamma / (labelsPerTopic[topic] + gammaSum);
            if (ignoreLabels) {
                cachedLabelCoefficients[topic] =
                        alpha[topic] / (labelsPerTopic[topic] + gammaSum);
            } else {
                cachedLabelCoefficients[topic] =
                        (1 + 1 / lblWeight) * alpha[topic] / (labelsPerTopic[topic] + gammaSum);
            }
        }

        // Sample topics (and labels) for this worker's slice of the corpus.
        for (int doc = startDoc; doc < data.size() && doc < startDoc + numDocs; doc++) {
            sampleTopicsForOneDoc(data.get(doc));
        }

        if (shouldBuildLocalCounts) {
            buildLocalTypeTopicCounts();
        }

        shouldSaveState = false;
        isFinished = true;
    } catch (Exception e) {
        e.printStackTrace();
    }
}
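// A note on the buckets initialized in run(), as I read them: this follows
// the SparseLDA decomposition used by MALLET's ParallelTopicModel (Yao,
// Mimno & McCallum, 2009), extended here with a parallel label vocabulary.
// For a token of type w in document d, the sampling mass for topic t factors
// into three buckets:
//
//   p(t | w, d) ∝ alpha[t] * beta / (tokensPerTopic[t] + betaSum)      // smoothing-only
//              + n(t|d) * beta / (tokensPerTopic[t] + betaSum)         // document-topic
//              + n(w|t) * (alpha[t] + n(t|d)) / (tokensPerTopic[t] + betaSum)  // topic-word
//
// smoothingOnlyMass is the first bucket summed over all topics, computed once
// per sweep, and cachedCoefficients[t] holds
// (alpha[t] + n(t|d)) / (tokensPerTopic[t] + betaSum) with n(t|d) = 0, so the
// per-document sampler only has to patch the entries for topics that actually
// occur in the current document. The (1 + lblWeight) and (1 + 1/lblWeight)
// factors are this code's label-weighted variant of the same scheme; the
// analogous label buckets use gamma, gammaSum, and labelsPerTopic.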