/** * Note: the BM25 kernel places interpretations on the two objects compared... * * @return BM25 similarity between query and document, and query should always come first. */ public double sim(Object objQuery, Object objDoc) { // Will modify in place, so need to copy Map<Object, Double> query = VectorUtils.Copy((Map<Object, Double>) objQuery); Map<Object, Double> doc = VectorUtils.Copy((Map<Object, Double>) objDoc); // Modify query according to BM25 kernel for (Object key : query.keySet()) { double freq = query.get(key); freq = ((_k3 + 1) * freq) / (_k3 + freq); Double idf = _hmKey2IDF.get(key); if (idf == null) idf = _dDefaultIDF; query.put(key, freq * idf); } // Modify doc according to BM25 kernel double doc_length_ratio = VectorUtils.L1Norm(doc) / _dAvgDocLength; for (Object key : doc.keySet()) { double freq = doc.get(key); freq = ((_k1 + 1) * freq) / (_k1 * (1d - _b + _b * doc_length_ratio) + freq); doc.put(key, freq); } return VectorUtils.DotProduct(query, doc); }
/** * */ /// * public double sim(Object o1, Object o2, Object ow) { System.out.println("ERROR: Cannot do BM25 query-reweighted similarity"); System.exit(1); return -1d; } // */ public double sim_(Object o1, Object o2, Object ow) { Map<Object, Double> map1 = (Map<Object, Double>) o1; Map<Object, Double> map2 = (Map<Object, Double>) o2; Map<Object, Double> query; Map<Object, Double> doc; if (map1.size() < map2.size()) { query = VectorUtils.Copy(map1); doc = VectorUtils.Copy(map2); } else { query = VectorUtils.Copy(map2); doc = VectorUtils.Copy(map1); } // Modify query according to BM25 kernel for (Object key : query.keySet()) { double freq = query.get(key); freq = ((_k3 + 1) * freq) / (_k3 + freq); Double idf = _hmKey2IDF.get(key); if (idf == null) idf = _dDefaultIDF; query.put(key, freq * idf); } // Modify doc according to BM25 kernel double doc_length_ratio = VectorUtils.L1Norm(doc) / _dAvgDocLength; for (Object key : doc.keySet()) { double freq = doc.get(key); freq = ((_k1 + 1) * freq) / (_k1 * (1d - _b + _b * doc_length_ratio) + freq); doc.put(key, freq); } return VectorUtils.DotProduct(query, doc); }
public void initTonNDocs(Set<String> docs_topn) { if (docs_topn == _mPrevInit) return; // Already initialized _mPrevInit = docs_topn; _hmKey2IDF = new HashMap<Object, Double>(); double total_doc_length = 0d; // Have to go through all documents to compute term IDF for (String doc : docs_topn) { // Map<Object,Double> features = (Map<Object,Double>)getObjectRepresentation(doc); Map<Object, Double> features = DocUtils.ConvertToFeatureMap(_docs_all.get(doc)); // total_doc_length += VectorUtils.L1Norm(features); // features = VectorUtils.ConvertToBoolean(features); // _hmKey2IDF = VectorUtils.Sum(_hmKey2IDF, features); /* if (false && ++i % 100 == 0) { System.out.println("- Converted " + i + " documents to internal representation"); System.out.flush(); } */ } _dAvgDocLength = total_doc_length / (double) docs_topn.size(); for (Object key : _hmKey2IDF.keySet()) { Double idf = docs_topn.size() / (_hmKey2IDF.get(key) + 1d); _hmKey2IDF.put(key, Math.log(idf)); } _dDefaultIDF = Math.log(docs_topn.size() / 1d); if (DEBUG) { System.out.println("Avg doc length: " + _dAvgDocLength); System.out.println("Default IDF : " + _dDefaultIDF); System.out.println("IDF after log: " + _hmKey2IDF); } }