コード例 #1
0
  /**
   * @param file is the file to be classified
   * @return returns true if correctly classified otherwise false
   */
  public boolean classify(String file) {
    String category = "";
    double category_prob = 0;

    // Read document;
    DocumentI document = new Document(vocab);
    document.read_file(file, false);
    // Mapping for document
    Map<String, Integer> document_dictionary = document.get_words();
    // Distinct words in vocabulary
    Set<String> vocab_distinct_words = vocab.distinct_words();

    for (Entry<String, TextI> entry : categories.entrySet()) {
      double pv = (double) entry.getValue().num_docs() / (double) total_docs;
      double pwv = 1;
      double prevpwv = 1;
      for (String word : document_dictionary.keySet()) {
        if (vocab_distinct_words.contains(word)) {
          if ((pwv *= entry.getValue().pwv(word)) == 0) {
            pwv = prevpwv;
            break;
          }
          prevpwv = pwv;
        }
      }
      if (pwv * pv > category_prob) {
        category = entry.getKey();
        category_prob = pwv * pv;
      }
    }
    String predicted_category = category.replace(this.directory + "\\", "");
    String actual_category = document.getCategory();
    if (actual_category.contains(predicted_category)) {
      return true;
    } else {
      return false;
    }
  }
コード例 #2
0
  /**
   * @param dir is 20_newsgroups directory
   * @param vocab is the vocabulary of the naive bayes classifier Currently hard coded to accept
   *     20_newsgroups folder where each sub-folder is treated as a category
   */
  public void learn(String dir, VocabI vocab) {
    this.directory = dir;
    this.vocab = vocab;

    File folder = new File(dir);
    if (!folder.isDirectory()) {
      System.out.println("Please provide directory");
      return;
    }
    File[] subdirectories = folder.listFiles();
    for (File subdirectory : subdirectories) {
      TextI category_text = new Text(vocab);
      if (subdirectory.isDirectory()) {
        File[] files = subdirectory.listFiles();
        for (File file : files) {
          category_text.read_file(file.toString(), true);
          total_docs++;
        }
      }
      categories.put(subdirectory.toString(), category_text);
    }
    vocab.refine();
  }