/**
 * Trains an LDA topic model on the stored training instances and writes the topic output files.
 *
 * <p>The three MALLET instance files (all, training, testing) are read from
 * {@code TopicConstants.INPUTDIRPATH}; the model is then built from the training instances with
 * the parameters held in {@code TopicConstants}, and its output is generated into
 * {@code TopicConstants.OUTPUTDIRPATH}.
 *
 * @throws SQLException declared by the signature
 * @throws FileNotFoundException declared by the signature
 * @throws IOException declared by the signature
 * @throws ClassNotFoundException declared by the signature
 */
private static void generateTopicFromAbstractsInDB()
    throws SQLException, FileNotFoundException, IOException, ClassNotFoundException {
  final String inputPrefix = TopicConstants.INPUTDIRPATH + File.separator;

  // NOTE(review): the "all" and "testing" lists are loaded but not used further down in this
  // method; the loads are kept so the I/O and failure behaviour stay exactly as before.
  InstanceList allInstances =
      Utilities.readInstancesFromMalletFile(inputPrefix + TopicConstants.ALLMALLETFILENAME);
  InstanceList trainingInstances =
      Utilities.readInstancesFromMalletFile(inputPrefix + TopicConstants.TRAININGMALLETFILENAME);
  InstanceList testingInstances =
      Utilities.readInstancesFromMalletFile(inputPrefix + TopicConstants.TESTINGMALLETFILENAME);

  // Extract topics from the training data using the configured (optimal) parameters.
  // The third argument is the per-topic ALPHA scaled by the number of topics.
  TopicTrainingModel trainer = new TopicTrainingModel();
  trainer.buildTopicModelUsingLDA(
      trainingInstances,
      TopicConstants.NUMTOPICS,
      TopicConstants.NUMTOPICS * TopicConstants.ALPHA,
      TopicConstants.BETA,
      TopicConstants.NUMITERATIONS);

  ParallelTopicModel trainedModel = trainer.getTopicModel();
  trainer.generateTopicOutputFiles(trainedModel, TopicConstants.OUTPUTDIRPATH);
}
/** Best values for SE: 80 topics, 20000 iterations, alpha= 0.001, beta= 0.01 */ private static void optimizeLDAParameters() throws IOException, ClassNotFoundException { /** ******** Selection of optimal Parameters ********** */ // double[] alphasToBeConsidered= {0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1}; // double[] betasToBeConsidered= {0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1}; // int[] topicSizesToBeConsidered= {3, 5, 10, 15, 20, 30, 40, 50, 60, 80, 100}; // int[] iterationsToBeConsidered= {5, 10, 50, 100, 500, 1000, 2000, 2500, 3000}; double[] alphasToBeConsidered = {0.001}; double[] betasToBeConsidered = {0.01}; int[] topicSizesToBeConsidered = {90}; int[] iterationsToBeConsidered = {20000}; InstanceList trainingInstances = Utilities.readInstancesFromMalletFile( TopicConstants.INPUTDIRPATH + File.separator + TopicConstants.ALLMALLETFILENAME); TopicTrainingModel ldaTrainModel = new TopicTrainingModel(); String res = ldaTrainModel.selectOptimalParameters( trainingInstances, TopicConstants.OUTPUTDIRPATH, alphasToBeConsidered, betasToBeConsidered, iterationsToBeConsidered, topicSizesToBeConsidered); System.out.println(res); /* ldaTrainModel.selectOptimalAlpha(trainingInstances, inputDir,outputDir, alphasToBeConsidered); ldaTrainModel.selectOptimalBeta(trainingInstances, inputDir, outputDir, betasToBeConsidered); ldaTrainModel.selectOptimalIterationsSize(trainingInstances, inputDir, outputDir, iterationsToBeConsidered); ldaTrainModel.selectOptimalTopicSize(trainingInstances, inputDir, outputDir, topicSizesToBeConsidered); */ }
private static int[] createMalletInstances(DataManager mgr, int venueId) throws SQLException, FileNotFoundException, IOException { PaperAbstractReader instanceGenerator = new PaperAbstractReader(); instanceGenerator.setDomainId(venueId); String inputPath = TopicConstants.INPUTDIRPATH + File.separator + "papers"; // instanceGenerator.readAbstractsFromDirectory(MalletConstants.commentsFolderPath,true,true); int[] counts = instanceGenerator.readAbstractsFromDatabase(mgr, inputPath, true, true); instanceGenerator.splitInstances(0.8, 0.0, TopicConstants.INPUTDIRPATH); Utilities.writeInstancesToMalletFile( instanceGenerator.getAllInstances(), TopicConstants.INPUTDIRPATH + File.separator + TopicConstants.ALLMALLETFILENAME); Utilities.writeInstancesToMalletFile( instanceGenerator.getTrainingInstances(), TopicConstants.INPUTDIRPATH + File.separator + TopicConstants.TRAININGMALLETFILENAME); Utilities.writeInstancesToMalletFile( instanceGenerator.getTestingInstances(), TopicConstants.INPUTDIRPATH + File.separator + TopicConstants.TESTINGMALLETFILENAME); return counts; }