Ejemplo n.º 1
0
  /**
   * Counts adjacent-token bigrams in a file and returns them as a dense square matrix.
   *
   * <p>The file is read with the given charset and tokenized with {@code tokenizerFactory}. Each
   * token for which {@code Strings.allLetters} returns true is mapped to an integer id via
   * {@code symbolTable.getOrAddSymbol}; every other token is marked with {@code -1} and never
   * takes part in a bigram. Progress statistics are printed to standard output.
   *
   * @param file the text file to read
   * @param symbolTable symbol table used to assign token ids; it is mutated by this call, and the
   *     matrix dimension equals {@code symbolTable.numSymbols()} after all tokens were processed
   *     (so symbols already present in the table before the call are included)
   * @param tokenizerFactory factory producing the tokenizer applied to the file contents
   * @param charset name of the character set used to decode the file
   * @return matrix {@code m} where {@code m[a][b]} is the number of times the token with id
   *     {@code a} is immediately followed by the token with id {@code b}
   * @throws Exception if reading or tokenizing the file fails
   */
  public static double[][] extractBigrams(
      File file, MapSymbolTable symbolTable, TokenizerFactory tokenizerFactory, String charset)
      throws Exception {

    char[] cs = Files.readCharsFromFile(file, charset);
    String[] tokens = tokenizerFactory.tokenizer(cs, 0, cs.length).tokenize();
    System.out.println("    Number of tokens=" + tokens.length);

    // Translate tokens to symbol ids; -1 marks a token that is excluded from bigram counting.
    int[] symbols = new int[tokens.length];
    for (int i = 0; i < tokens.length; ++i) {
      String token = tokens[i];
      if (Strings.allLetters(token.toCharArray())) {
        symbols[i] = symbolTable.getOrAddSymbol(token);
      } else {
        symbols[i] = -1;
      }
    }

    int numSymbols = symbolTable.numSymbols();
    System.out.println("    Number of distinct tokens=" + numSymbols);
    // Widen before multiplying: the int product wraps once numSymbols exceeds 46340.
    System.out.println("    #Matrix entries=" + (long) numSymbols * numSymbols);

    // Freshly allocated double arrays are already zero-filled, so no explicit clearing is needed.
    double[][] values = new double[numSymbols][numSymbols];

    for (int i = 1; i < symbols.length; ++i) {
      int left = symbols[i - 1];
      int right = symbols[i];
      // A pair is counted only when both neighbours received a valid symbol id.
      if (left >= 0 && right >= 0) {
        values[left][right] += 1.0;
      }
    }

    return values;
  }
Ejemplo n.º 2
0
  /**
   * Runs every document found under the testing directory through two classifiers and reports
   * how often each one picks a category directory that contains the document.
   *
   * <p>Arguments, in order: the testing directory (one sub-directory per category), the results
   * directory, and the models directory. Each file in the models directory is loaded with
   * {@code loadModel} and registered under its file name minus the last six characters
   * (presumably a six-character extension such as ".model" - confirm against the trainer). The
   * model registered as {@code "all"} serves as the baseline ("standard") classifier; the other
   * result comes from {@code jointJointClassify}.
   *
   * <p>Per-document progress goes to standard output and standard error. The final summary is
   * printed and also written to {@code results.txt} inside the results directory.
   *
   * @param args exactly three paths: testing directory, results directory, models directory
   * @throws ClassNotFoundException declared for model loading via {@code loadModel}
   * @throws IOException if the testing or models directory cannot be listed, if no model named
   *     {@code "all"} was loaded, or if reading a model or writing the results fails
   */
  public static void main(String[] args) throws ClassNotFoundException, IOException {

    if (args.length != 3) {
      System.out.println("No Arguments. No Model Trainer.");
      return;
    }
    TESTING_DIR = new File(args[0]);
    TESTING_RESULTS_DIR = new File(args[1]);
    MODELS_DIR = new File(args[2]);

    // File.list()/listFiles() return null (not an empty array) when the path is not a directory
    // or cannot be read, so fail with a clear message instead of a NullPointerException.
    CATEGORIES = TESTING_DIR.list();
    if (CATEGORIES == null) {
      throw new IOException("Testing directory cannot be listed: " + TESTING_DIR);
    }
    NGRAM_SIZE = CATEGORIES.length;

    File[] models = MODELS_DIR.listFiles();
    if (models == null) {
      throw new IOException("Models directory cannot be listed: " + MODELS_DIR);
    }
    PartsOfSpeech = new String[models.length];
    compiledClassifiers = new HashMap<String, LMClassifier>();

    for (int i = 0; i < models.length; i++) {
      LMClassifier classifier = loadModel(models[i].getAbsolutePath());
      // The registration key is the file name with its last six characters removed.
      String modelFileName = models[i].getName();
      PartsOfSpeech[i] = modelFileName.substring(0, modelFileName.length() - 6);
      compiledClassifiers.put(PartsOfSpeech[i], classifier);
    }
    standardClassifier = compiledClassifiers.get("all");
    if (standardClassifier == null) {
      throw new IOException("No model registered under the name \"all\" in " + MODELS_DIR);
    }

    // testing
    ClassifierEvaluator evaluator = new ClassifierEvaluator(standardClassifier, CATEGORIES);
    int hits = 0, misses = 0, hits_standard = 0, misses_standard = 0;
    for (int i = 0; i < CATEGORIES.length; i++) {
      File classDir = new File(TESTING_DIR, CATEGORIES[i]);
      String[] testingFiles = classDir.list();
      if (testingFiles == null) {
        // A plain file (or unreadable entry) sitting next to the category directories.
        System.err.println("Skipping " + classDir + ": not a readable directory");
        continue;
      }
      for (int j = 0; j < testingFiles.length; j++) {
        System.out.print("Testing on " + CATEGORIES[i] + "/" + testingFiles[j] + " ");
        Document d = new Document(new File(classDir, testingFiles[j]));
        Set<Document> s = new HashSet<Document>();
        s.add(d);
        // Run POSFinder over the single document before it is handed to jointJointClassify.
        POSFinder p = new POSFinder(s);
        p.process();
        JointClassification jc = jointJointClassify(d);
        String bestCategory = jc.bestCategory();
        JointClassification jc_standard = standardClassifier.classify(d.mText_str);
        String bestCategoryStandard = jc_standard.bestCategory();
        evaluator.addCase(CATEGORIES[i], d.mText_str);

        System.out.println("Got best category of: " + bestCategory);
        System.out.println("(standard) Got best category of: " + bestCategoryStandard);

        System.out.println("---------------");
        File bestCategoryDirectory = new File(TESTING_RESULTS_DIR, bestCategory);
        if (!bestCategoryDirectory.exists()) {
          bestCategoryDirectory.mkdir();
        }

        // A prediction counts as a hit when the predicted category's testing directory
        // contains a file with this document's name.
        String documentName = d.mFile.getName();
        if (directoryContainsName(new File(TESTING_DIR, bestCategory), documentName)) {
          hits++;
          System.err.println(
              "-------------HIT! P()="
                  + (int) (jc.conditionalProbability(0) * 100)
                  + "% ("
                  + hits
                  + " hits)------------");
        } else {
          misses++;
          System.err.println(
              "-------------MISS! P()="
                  + (int) (jc.conditionalProbability(0) * 100)
                  + "% ("
                  + misses
                  + " misses)------------");
        }

        if (directoryContainsName(new File(TESTING_DIR, bestCategoryStandard), documentName)) {
          hits_standard++;
          System.err.println(
              "-----(standard)---HIT! P()="
                  + (int) (jc_standard.conditionalProbability(0) * 100)
                  + "% ("
                  + hits_standard
                  + " hits)------------");
        } else {
          misses_standard++;
          System.err.println(
              "----(standard)---MISS! P()="
                  + (int) (jc_standard.conditionalProbability(0) * 100)
                  + "% ("
                  + misses_standard
                  + " misses)------------");
        }

        // Note: a tie in hit counts is reported as LOSING.
        String standing = hits_standard < hits ? "WINNING" : "LOSING";
        System.err.println(
            "Akshat's Classifier is "
                + standing
                + " <"
                + accuracyOf(hits, misses)
                + "> to <"
                + accuracyOf(hits_standard, misses_standard)
                + ">");
      }
    }
    // Retrieved from the evaluator but not used by the summary below.
    ConfusionMatrix confMatrix = evaluator.confusionMatrix();

    String myresults =
        "Hits: "
            + hits
            + ", Misses: "
            + misses
            + ", Total Accuracy: "
            + accuracyOf(hits, misses);
    myresults +=
        "\n(standard) Hits :"
            + hits_standard
            + ", Misses: "
            + misses_standard
            + " Total accuracy = "
            + accuracyOf(hits_standard, misses_standard);

    System.out.println(myresults);
    com.aliasi.util.Files.writeStringToFile(
        myresults, new File(TESTING_RESULTS_DIR, "results.txt"));
  }

  /**
   * Reports whether {@code directory} has an entry whose name equals {@code name}; returns
   * {@code false} when the directory does not exist or cannot be listed.
   */
  private static boolean directoryContainsName(File directory, String name) {
    String[] entries = directory.list();
    if (entries == null) {
      return false;
    }
    for (String entry : entries) {
      if (entry.equals(name)) {
        return true;
      }
    }
    return false;
  }

  /** Returns hits / (hits + misses) as a double; the result is NaN when both counts are zero. */
  private static double accuracyOf(int hitCount, int missCount) {
    return (double) hitCount / (double) (hitCount + missCount);
  }