/** Compute the tf-idf vector for each document using Sahami-Heilman's similarity Algorithm */ public String getNormalizedAugment() { // Build the TFIDF term vectors for the relevant documents ArrayList<TermVector> docVectors = new ArrayList<TermVector>(); for (ResultNode rn : result.getRelevantResultNodes()) { TermVector tv = new TermVector(rn, invIdx); // term vector for a document docVectors.add(tv); } // Original algorithm truncates each vector to 50 highest scoring terms // Our vectors are short, no need to truncate // Find the Centroid of the L2 normalized term vectors // Centroid is a vector of all distinct terms in the relevant documents // The term weight is the sum of the normalized weight of the term in all relevant docs TermVector centroid = new TermVector(); for (TermVector tv : docVectors) { for (TermNode tn : tv.getTerms()) { String term = tn.getTerm(); double weight = tn.getNormalizedWeight(); centroid.addTerm(term, weight); } } // Normalize the Centroid centroid.l2normalize(); // Sort descending order by normalized weight ArrayList<TermNode> termList = centroid.getTerms(); Collections.sort(termList); // quick way to index all the previously used search terms // check the new found terms against previous query to make sure there are no duplicates HashSet<String> lastQueryWords = new HashSet<String>(Arrays.asList(lastQuery.split(" "))); // augment with up to 2 top tf-idf score terms double weightDiffThreshold = 0.2; // weight is less than 20% difference String augment = null; double augmentWeight = 0; for (int i = 0; i < termList.size(); i++) { TermNode t = termList.get(i); String term = t.getTerm(); double weight = t.getWeight(); // do not re-query terms that are in the previous query if (lastQueryWords.contains(term)) { System.out.println("term " + term + " (" + weight + ") is in previous query. Skipping."); continue; } // System.out.println("DEBUG: " + term + " weight=" + t.getWeight()); // if the top 2 terms are close together in the score // use both terms in the next search if ((augment != null) && (weight > 0)) { // tie break results that are close double weightDiff = (augmentWeight - weight) / augmentWeight; if (weightDiff < weightDiffThreshold) { augment += " " + term; } break; } else { augment = term; augmentWeight = weight; } } return augment; }