public void initTonNDocs(Set<String> docs_topn) { if (docs_topn == _mPrevInit) return; // Already initialized _mPrevInit = docs_topn; _hmKey2IDF = new HashMap<Object, Double>(); double total_doc_length = 0d; // Have to go through all documents to compute term IDF for (String doc : docs_topn) { // Map<Object,Double> features = (Map<Object,Double>)getObjectRepresentation(doc); Map<Object, Double> features = DocUtils.ConvertToFeatureMap(_docs_all.get(doc)); // total_doc_length += VectorUtils.L1Norm(features); // features = VectorUtils.ConvertToBoolean(features); // _hmKey2IDF = VectorUtils.Sum(_hmKey2IDF, features); /* if (false && ++i % 100 == 0) { System.out.println("- Converted " + i + " documents to internal representation"); System.out.flush(); } */ } _dAvgDocLength = total_doc_length / (double) docs_topn.size(); for (Object key : _hmKey2IDF.keySet()) { Double idf = docs_topn.size() / (_hmKey2IDF.get(key) + 1d); _hmKey2IDF.put(key, Math.log(idf)); } _dDefaultIDF = Math.log(docs_topn.size() / 1d); if (DEBUG) { System.out.println("Avg doc length: " + _dAvgDocLength); System.out.println("Default IDF : " + _dDefaultIDF); System.out.println("IDF after log: " + _hmKey2IDF); } }
/**
 * Converts raw content into its feature-map representation.
 *
 * <p>The conversion is delegated entirely to
 * {@code DocUtils.ConvertToFeatureMap}; this method neither reads nor
 * stores any instance state.
 *
 * @param content the text to convert
 * @return the value produced by {@code DocUtils.ConvertToFeatureMap(content)}
 */
public Object getNoncachedObjectRepresentation(String content) {
    return DocUtils.ConvertToFeatureMap(content);
}