Beispiel #1
0
 /**
  * Builds the {@code VSMService} from a freshly assembled configuration map and stores the
  * resulting instance in {@code service}.
  *
  * <p>The corpus path is taken from {@code root1.getPath()}, and both encoding entries are set to
  * {@code "utf-8"}.
  */
 public void init() {
   final String encoding = "utf-8";
   HashMap<String, Object> settings = new HashMap<String, Object>();

   // Bounds supplied by Constants.
   settings.put("DF_MIN", Constants.DF_MIN);
   settings.put("DF_MAX", Constants.DF_MAX);

   // Synonym dictionary; left empty here.
   settings.put("THERSAUS", "");
   settings.put("WEIGHT", "Lucene");
   settings.put("CORPUS", root1.getPath());

   // The same encoding is used for both entries.
   settings.put("ENCODE_TRAIN", encoding);
   settings.put("ENCODE_RECOGNIZE", encoding);

   VSMService.build(settings, new FileTokenizer());
   service = VSMService.getInstance();

   // Disabled follow-up step, kept for reference: call _dimension(), point "CORPUS" at
   // root.getPath(), then build and fetch the service a second time.
 }
Beispiel #2
0
  /**
   * Creates a {@code Doc} for a single file.
   *
   * <p>The file's full text is read with {@code FileUtil.read} and passed to
   * {@code service.getNativeVector} to obtain its word vector. When {@code filter} is set, every
   * element of the array returned by {@code vector.getValues()} that is below 0.05 is overwritten
   * with zero.
   *
   * <p>The text is also split on {@code "\n"}; the first three resulting lines are handed to the
   * {@code Doc} constructor in the order third, first, second. Note that {@code String.split}
   * drops trailing empty strings, so trailing blank lines do not count towards the three.
   *
   * @param x passed through unchanged to the {@code Doc} constructor
   * @param file file to read; also passed to the {@code Doc} constructor
   * @return the newly constructed {@code Doc}
   * @throws ArrayIndexOutOfBoundsException if the text splits into fewer than three lines
   */
  private Doc buildDoc(int x, File file) {
    final double minScore = 0.05;
    String s = FileUtil.read(file);
    String[] arr = s.split("\n");
    if (arr.length < 3) {
      // Same exception type an out-of-range arr[2] would raise, but with a usable message.
      throw new ArrayIndexOutOfBoundsException(
          "Expected at least 3 lines in " + file + " but found " + arr.length);
    }

    WVTWordVector vector = service.getNativeVector(s);
    if (filter) {
      // NOTE(review): zeroing in place only affects the vector if getValues() returns its
      // backing array rather than a copy -- confirm against WVTWordVector.
      double[] values = vector.getValues();
      for (int i = 0; i < values.length; i++) {
        if (values[i] < minScore) {
          values[i] = 0D;
        }
      }
    }

    return new Doc(vector, file, x, arr[2], arr[0], arr[1]);
  }