/**
 * Builds a bigram count matrix from the tokens of a file.
 *
 * <p>The file is read with the given charset and tokenized with the supplied factory. Each token
 * accepted by {@code Strings.allLetters} is mapped to a symbol id via
 * {@code symbolTable.getOrAddSymbol}; every other token is marked with {@code -1}. Rejected
 * tokens stay in the sequence, so a pair is counted only when both neighbouring tokens were
 * accepted (two words separated by a rejected token are NOT treated as adjacent).
 *
 * <p>Progress information is printed to {@code System.out}.
 *
 * @param file the text file to read
 * @param symbolTable table used to assign symbol ids; it is mutated (new tokens are added)
 * @param tokenizerFactory factory producing the tokenizer applied to the file contents
 * @param charset name of the character set used to decode the file
 * @return a square matrix of size {@code symbolTable.numSymbols()} (measured after all tokens
 *     were added) where {@code values[a][b]} is the number of times symbol {@code b} directly
 *     followed symbol {@code a}
 * @throws Exception if reading or tokenizing the file fails
 */
public static double[][] extractBigrams(
    File file, MapSymbolTable symbolTable, TokenizerFactory tokenizerFactory, String charset)
    throws Exception {
  char[] cs = Files.readCharsFromFile(file, charset);
  String[] tokens = tokenizerFactory.tokenizer(cs, 0, cs.length).tokenize();
  System.out.println(" Number of tokens=" + tokens.length);

  // Map each token to its symbol id, or -1 when the token is rejected by allLetters.
  int[] symbols = new int[tokens.length];
  for (int i = 0; i < tokens.length; ++i) {
    symbols[i] =
        Strings.allLetters(tokens[i].toCharArray())
            ? symbolTable.getOrAddSymbol(tokens[i])
            : -1;
  }

  int numSymbols = symbolTable.numSymbols();
  System.out.println(" Number of distinct tokens=" + numSymbols);
  // Multiply as long: the int product overflows once numSymbols exceeds 46340.
  System.out.println(" #Matrix entries=" + (long) numSymbols * numSymbols);

  // Freshly allocated double arrays are already zero-filled, so no explicit fill is needed.
  // NOTE(review): assumes every id returned by getOrAddSymbol is < numSymbols() - confirm.
  double[][] values = new double[numSymbols][numSymbols];
  for (int i = 1; i < symbols.length; ++i) {
    int left = symbols[i - 1];
    int right = symbols[i];
    if (left >= 0 && right >= 0) {
      values[left][right] += 1.0;
    }
  }
  return values;
}
public static void main(String[] args) throws ClassNotFoundException, IOException { if (args.length != 3) { System.out.println("No Arguments. No Model Trainer."); return; } TESTING_DIR = new File(args[0]); TESTING_RESULTS_DIR = new File(args[1]); MODELS_DIR = new File(args[2]); CATEGORIES = TESTING_DIR.list(); NGRAM_SIZE = CATEGORIES.length; File[] models = MODELS_DIR.listFiles(); PartsOfSpeech = new String[models.length]; compiledClassifiers = new HashMap<String, LMClassifier>(); for (int i = 0; i < models.length; i++) { LMClassifier classifier = loadModel(models[i].getAbsolutePath()); PartsOfSpeech[i] = models[i].getName().substring(0, models[i].getName().length() - 6); compiledClassifiers.put(PartsOfSpeech[i], classifier); } standardClassifier = compiledClassifiers.get("all"); // testing // ConfusionMatrix confMatrix = new ConfusionMatrix(CATEGORIES); ClassifierEvaluator evaluator = new ClassifierEvaluator(standardClassifier, CATEGORIES); int hits = 0, misses = 0, hits_standard = 0, misses_standard = 0; for (int i = 0; i < CATEGORIES.length; i++) { File classDir = new File(TESTING_DIR, CATEGORIES[i]); String[] testingFiles = classDir.list(); for (int j = 0; j < testingFiles.length; j++) { // String text = Files.readFromFile(new File(classDir,testingFiles[j])); System.out.print("Testing on " + CATEGORIES[i] + "/" + testingFiles[j] + " "); // evaluator.addCase(CATEGORIES[i],text); Document d = new Document(new File(classDir, testingFiles[j])); Set<Document> s = new HashSet<Document>(); s.add(d); POSFinder p = new POSFinder(s); p.process(); JointClassification jc = jointJointClassify(d); String bestCategory = jc.bestCategory(); JointClassification jc_standard = standardClassifier.classify(d.mText_str); // evaluator.addClassification(CATEGORIES[i], new // com.aliasi.classify.Classification(bestCategory)); evaluator.addCase(CATEGORIES[i], d.mText_str); System.out.println("Got best category of: " + bestCategory); System.out.println("(standard) Got best category of: " 
+ jc_standard.bestCategory()); System.out.println("---------------"); File bestCategoryDirectory = new File(TESTING_RESULTS_DIR, bestCategory); File bestCategoryDirectory_inTest = new File(TESTING_DIR, bestCategory); File bestCategoryDirectory_inTest_standard = new File(TESTING_DIR, jc_standard.bestCategory()); if (!bestCategoryDirectory.exists()) bestCategoryDirectory.mkdir(); boolean hit = false; for (String ss : bestCategoryDirectory_inTest.list()) if (ss.equals(d.mFile.getName())) hit = true; if (hit) { hits++; System.err.println( "-------------HIT! P()=" + (int) (jc.conditionalProbability(0) * 100) + "% (" + hits + " hits)------------"); } else { misses++; System.err.println( "-------------MISS! P()=" + (int) (jc.conditionalProbability(0) * 100) + "% (" + misses + " misses)------------"); } hit = false; for (String ss : bestCategoryDirectory_inTest_standard.list()) if (ss.equals(d.mFile.getName())) hit = true; if (hit) { hits_standard++; System.err.println( "-----(standard)---HIT! P()=" + (int) (jc_standard.conditionalProbability(0) * 100) + "% (" + hits_standard + " hits)------------"); } else { misses_standard++; System.err.println( "----(standard)---MISS! 
P()=" + (int) (jc_standard.conditionalProbability(0) * 100) + "% (" + misses_standard + " misses)------------"); } if (hits_standard < hits) System.err.println( "Akshat's Classifier is WINNING <" + ((double) hits / (double) (hits + misses)) + "> to <" + (double) ((double) hits_standard / ((double) hits_standard + (double) misses_standard)) + ">"); else System.err.println( "Akshat's Classifier is LOSING <" + ((double) hits / (double) (hits + misses)) + "> to <" + (double) ((double) hits_standard / ((double) hits_standard + (double) misses_standard)) + ">"); // DocTreeWidget.copyFile(d.mFile, new File(bestCategoryDirectory,d.mFile.getName())); // //this is just a file copy function } } ConfusionMatrix confMatrix = evaluator.confusionMatrix(); String myresults = "Hits: " + hits + ", Misses: " + misses + ", Total Accuracy: " + ((double) hits / (double) (hits + misses)); myresults += "\n(standard) Hits :" + hits_standard + ", Misses: " + misses_standard + " Total accuracy = " + (double) ((double) hits_standard / ((double) hits_standard + (double) misses_standard)); System.out.println(myresults); com.aliasi.util.Files.writeStringToFile( myresults, new File(TESTING_RESULTS_DIR, "results.txt")); }