示例#1
0
  /**
   * Recomputes the per-key IDF table ({@code _hmKey2IDF}), the average document
   * length ({@code _dAvgDocLength}) and the default IDF ({@code _dDefaultIDF})
   * for the supplied set of documents.
   *
   * <p>The call is a no-op when {@code docs_topn} is the very same object that
   * was passed on the previous call (reference comparison, not {@code equals}).
   *
   * @param docs_topn identifiers of the documents to process; each one is
   *     looked up in {@code _docs_all} to obtain the content to convert
   */
  public void initTonNDocs(Set<String> docs_topn) {
    // Same Set instance as last time: statistics are already in place.
    if (_mPrevInit == docs_topn) {
      return;
    }

    _mPrevInit = docs_topn;
    _hmKey2IDF = new HashMap<Object, Double>();

    // Pass 1: visit every document, accumulating its L1 norm into the running
    // length total and adding its boolean-converted feature map into _hmKey2IDF.
    // NOTE(review): after this pass each value is presumably the number of
    // documents containing the key -- confirm against VectorUtils.ConvertToBoolean.
    double lengthSum = 0d;
    for (String docName : docs_topn) {
      Map<Object, Double> termWeights = DocUtils.ConvertToFeatureMap(_docs_all.get(docName));
      lengthSum += VectorUtils.L1Norm(termWeights);

      Map<Object, Double> termPresence = VectorUtils.ConvertToBoolean(termWeights);
      _hmKey2IDF = VectorUtils.Sum(_hmKey2IDF, termPresence);
    }

    final int docCount = docs_topn.size();
    _dAvgDocLength = lengthSum / (double) docCount;

    // Pass 2: replace each accumulated value v with log(docCount / (v + 1)).
    for (Map.Entry<Object, Double> entry : _hmKey2IDF.entrySet()) {
      double ratio = docCount / (entry.getValue() + 1d);
      entry.setValue(Math.log(ratio));
    }

    // Default IDF is log(docCount), i.e. the formula above with v = 0.
    _dDefaultIDF = Math.log((double) docCount);

    if (DEBUG) {
      System.out.println("Avg doc length: " + _dAvgDocLength);
      System.out.println("Default IDF   : " + _dDefaultIDF);
      System.out.println("IDF after log:  " + _hmKey2IDF);
    }
  }
示例#2
0
 /**
  * Produces the object representation of {@code content} by delegating
  * directly to {@code DocUtils.ConvertToFeatureMap}.
  *
  * @param content the text to convert
  * @return the feature map returned by {@code DocUtils.ConvertToFeatureMap},
  *     exposed as a plain {@code Object}
  */
 public Object getNoncachedObjectRepresentation(String content) {
   return DocUtils.ConvertToFeatureMap(content);
 }