/**
 * Scores how well the given document fits {@code category}.
 *
 * <p>The score starts from {@link #getCategoryProbability(String)} and adds
 * {@link #getFeatureProbability(String, String)} for every word of the document
 * that {@code VARIABLE.containFeature} accepts; other words are skipped. Both
 * helpers return natural logarithms, so the result is a log-score, not a
 * probability in [0, 1].
 *
 * @param category the category to score the document against
 * @param doc the document whose words are scored
 * @return the accumulated log-score
 */
public double getProbability(String category, Instance doc) {
    double logScore = getCategoryProbability(category);
    for (String word : doc.getWords()) {
        if (!VARIABLE.containFeature(word)) {
            // Words that are not known features contribute nothing.
            continue;
        }
        logScore += getFeatureProbability(word, category);
    }
    return logScore;
}
/**
 * Picks the category whose {@link #getProbability(String, Instance)} score is
 * highest for the given document.
 *
 * <p>The collection of categories and each per-category score are printed to
 * standard output while scanning. On equal scores the category encountered
 * first wins, because a later one must be strictly greater to replace it.
 *
 * @param doc the document to classify
 * @return the best-scoring category name, or {@code null} if no category
 *         scores strictly above negative infinity (for example when there are
 *         no categories at all)
 */
public String getCategory(Instance doc) {
    Collection<String> categories = VARIABLE.getCategories();
    System.out.println(categories);

    String winner = null;
    double winnerScore = Double.NEGATIVE_INFINITY;
    for (String candidate : categories) {
        double score = getProbability(candidate, doc);
        System.out.println(candidate + ":" + score);
        if (score > winnerScore) {
            winnerScore = score;
            winner = candidate;
        }
    }
    return winner;
}
/**
 * Trains the classifier on a single document by handing it to
 * {@code VARIABLE.addInstance}.
 *
 * @param doc the training document to add
 */
public void training(Instance doc) {
    VARIABLE.addInstance(doc);
}
/**
 * Saves the training result by writing {@code VARIABLE} to the given file.
 *
 * <p>The output stream is closed before this method returns, whether the write
 * succeeds or throws, so the file handle is not leaked.
 *
 * @param file the file to write the training result to
 * @throws IOException if the file cannot be opened, written, or closed
 */
void save(File file) throws IOException {
    try (DataOutputStream stream = new DataOutputStream(new FileOutputStream(file))) {
        // Pass the stream through the same DataOutput type the writer has always received.
        DataOutput out = stream;
        VARIABLE.write(out);
    }
}
/**
 * Loads a previously saved training result from the given file and replaces
 * {@code VARIABLE} with it.
 *
 * <p>The input stream is closed before this method returns, whether the read
 * succeeds or throws, so the file handle is not leaked.
 *
 * @param file the file to read the training result from
 * @throws IOException if the file cannot be opened, read, or closed
 */
public void load(File file) throws IOException {
    try (DataInputStream in = new DataInputStream(new FileInputStream(file))) {
        VARIABLE = Variable.read(in);
    }
}
/**
 * Computes the natural logarithm of a smoothed estimate of
 * P(feature | category).
 *
 * <p>The estimate is
 * {@code (docCount(feature, category) + 1) / (docCount(category) + featureCount)}.
 * Adding one to the numerator keeps the ratio above zero, so the logarithm is
 * finite even for a feature never seen in the category.
 *
 * @param feature the feature (word) to look up
 * @param category the category the feature is conditioned on
 * @return the natural log of the smoothed estimate
 */
public double getFeatureProbability(String feature, String category) {
    int featureCount = VARIABLE.getFeatureCount();
    double smoothedHits = VARIABLE.getDocCount(feature, category) + 1.0;
    double smoothedTotal = VARIABLE.getDocCount(category) + featureCount;
    return Math.log(smoothedHits / smoothedTotal);
}
/**
 * Computes the natural logarithm of the category prior
 * P(C) = (documents in the category) / (all documents).
 *
 * <p>The ratio is evaluated in double precision. If the category has no
 * documents while the total is positive, the ratio is zero and the result is
 * negative infinity; if the total is zero as well, the result is NaN.
 *
 * @param category the category whose prior is requested
 * @return the natural log of the category's share of all documents
 */
public double getCategoryProbability(String category) {
    // Widen both counts to double before dividing: multiplying by a float
    // literal would carry out the division in single precision and discard
    // digits before Math.log ever sees the value.
    double categoryDocs = VARIABLE.getDocCount(category);
    double totalDocs = VARIABLE.getDocCount();
    return Math.log(categoryDocs / totalDocs);
}