/**
 * Writes the training documents to a file as a dense, row-normalized matrix.
 *
 * <p>The matrix has one row per paper and {@code terms.size()} columns. Each word id returned
 * by {@code paper.getTrainingWords()} is used directly as a column index, and every entry is
 * divided by the sum of the paper's training term frequencies, so entry (i, j) is the fraction
 * of document i that is word j. Each value is written followed by a single space, and each row
 * is terminated by a newline.
 *
 * <p>A paper whose term frequencies sum to zero (for example, one with no training words) is
 * written as a row of zeros rather than being divided by zero.
 *
 * <p>TODO move this to Utils.java because other classes have a similar method.
 *
 * @param filename The name of the file to be created
 * @param papers The list of documents to be used for training
 */
private void createKmeansInput(String filename, List<TrainingPaper> papers) {
    System.out.print("creating kmeans input: " + filename + " ... ");

    double[][] fullRepresentation = new double[papers.size()][terms.size()];
    int paperIndex = 0;
    for (TrainingPaper paper : papers) {
        double[] row = fullRepresentation[paperIndex];
        double sum = 0;
        for (int word : paper.getTrainingWords()) {
            double wordFrequency = paper.getTrainingTf(word);
            row[word] = wordFrequency;
            sum += wordFrequency;
        }
        // With sum == 0 the division below would be 0.0 / 0.0 and fill the
        // row with NaN; keep the all-zero row instead.
        if (sum != 0) {
            for (int col = 0; col < row.length; col++) {
                row[col] /= sum;
            }
        }
        paperIndex++;
    }

    PlusoneFileWriter fileWriter = new PlusoneFileWriter(filename);
    try {
        for (double[] row : fullRepresentation) {
            for (double value : row) {
                fileWriter.write(value + " ");
            }
            fileWriter.write("\n");
        }
    } finally {
        // Release the writer even if a write fails part-way through.
        fileWriter.close();
    }
    System.out.println("done.");
}
/**
 * Writes a list of prediction papers to a file, one paper per line, in the format specified by
 * lda-c-dist.
 *
 * <p>Each line starts with the number of entries in {@code paper.getTrainingWords()}, followed
 * by one {@code word:tf} pair per training word, where {@code tf} is
 * {@code paper.getTrainingTf(word)}. Every field is followed by a single space and every line
 * is terminated by a newline.
 *
 * <p>TODO This method exists in several classes--move to Utils?
 *
 * @param filename name of the file to be created (will be overwritten if it already exists)
 * @param papers list of papers to be written to file
 */
private void createLdaInputTest(String filename, List<PredictionPaper> papers) {
    System.out.print("creating lda test input in file: " + filename + " ... ");

    PlusoneFileWriter fileWriter = new PlusoneFileWriter(filename);
    try {
        for (PredictionPaper paper : papers) {
            fileWriter.write(paper.getTrainingWords().size() + " ");
            for (int word : paper.getTrainingWords()) {
                fileWriter.write(word + ":" + paper.getTrainingTf(word) + " ");
            }
            fileWriter.write("\n");
        }
    } finally {
        // Release the writer even if a write fails part-way through.
        fileWriter.close();
    }
    System.out.println("done.");
}