public void init() { HashMap<String, Object> config = new HashMap<String, Object>(); config.put("DF_MIN", Constants.DF_MIN); config.put("DF_MAX", Constants.DF_MAX); config.put("THERSAUS", ""); // 同义词库 config.put("WEIGHT", "Lucene"); config.put("CORPUS", root1.getPath()); config.put("ENCODE_TRAIN", "utf-8"); config.put("ENCODE_RECOGNIZE", "utf-8"); VSMService.build(config, new FileTokenizer()); service = VSMService.getInstance(); // _dimension(); // config.put("CORPUS", root.getPath()); // VSMService.build(config, new FileTokenizer()); // service = VSMService.getInstance(); }
/**
 * Reads {@code file}, vectorizes its complete text via {@code service}, and
 * wraps the result in a {@link Doc}.
 *
 * <p>The text is split on {@code '\n'} and its first three lines are handed to
 * the {@code Doc} constructor in the order: third line, first line, second line.
 * When the {@code filter} field is set, every entry of the vector's value array
 * that is below the minimum score is overwritten with zero in place.
 *
 * @param x    value passed through unchanged to the {@code Doc} constructor
 * @param file file whose content is read and vectorized
 * @return a new {@code Doc} built from the vector, the file, {@code x} and the
 *         first three lines of the file
 * @throws IllegalArgumentException if the file content has fewer than three
 *         lines after splitting (trailing empty lines are not counted)
 */
private Doc buildDoc(int x, File file) {
    final double minScore = 0.05;

    String content = FileUtil.read(file);
    String[] lines = content.split("\n");

    // Fail early with a descriptive message instead of an opaque
    // ArrayIndexOutOfBoundsException when the header lines are missing.
    if (lines.length < 3) {
        throw new IllegalArgumentException(
                "Expected at least 3 lines but found " + lines.length + " in " + file);
    }

    // The vector is computed from the whole text, header lines included.
    WVTWordVector vector = service.getNativeVector(content);

    if (filter) {
        // NOTE(review): only the value array is thresholded; the map returned by
        // vector.getTFIDFValues() is left untouched here - confirm that is intended.
        double[] values = vector.getValues();
        for (int i = 0; i < values.length; i++) {
            if (values[i] < minScore) {
                values[i] = 0D;
            }
        }
    }

    return new Doc(vector, file, x, lines[2], lines[0], lines[1]);
}