/** * combine weights according to expansion formula * * @param queryTerms * @param relevantDocsTerms * @param irrelevantDocsTerms * @return */ public List<TermQuery> combine( Map<String, TermQuery> queryTerms, Map<String, TermQuery> relevantDocsTerms, Map<String, TermQuery> irrelevantDocsTerms) { // Add Terms of the relevant documents for (Map.Entry<String, TermQuery> e : queryTerms.entrySet()) { if (relevantDocsTerms.containsKey(e.getKey())) { TermQuery tq = relevantDocsTerms.get(e.getKey()); tq.setBoost(tq.getBoost() + e.getValue().getBoost()); } else { relevantDocsTerms.put(e.getKey(), e.getValue()); } } // Substract terms of irrelevant documents for (Map.Entry<String, TermQuery> e : irrelevantDocsTerms.entrySet()) { if (relevantDocsTerms.containsKey(e.getKey())) { TermQuery tq = relevantDocsTerms.get(e.getKey()); tq.setBoost(tq.getBoost() - e.getValue().getBoost()); } else { TermQuery tq = e.getValue(); tq.setBoost(-tq.getBoost()); relevantDocsTerms.put(e.getKey(), tq); } } return new ArrayList<>(relevantDocsTerms.values()); }
/** * Sets boost of termClaimsDescriptionAbstractTitles. boost = weight = factor(tf*idf) * * @param vecsTerms * @param currentField * @param factor - adjustment factor ( ex. alpha or beta ) * @param decayFactor * @return * @throws java.io.IOException */ public Map<String, TermQuery> setBoost( Map<TermFreqVector, String> vecsTerms, String currentField, float factor, float decayFactor) throws IOException { Map<String, TermQuery> terms = new HashMap<>(); // setBoost for each of the terms of each of the docs int i = 0; float norm = (float) 1 / vecsTerms.size(); // System.out.println("--------------------------"); for (Map.Entry<TermFreqVector, String> e : vecsTerms.entrySet()) { // Increase decay String field = e.getValue(); TermFreqVector docTerms = e.getKey(); float decay = decayFactor * i; // Populate terms: with TermQuries and set boost for (String termTxt : docTerms.getTerms()) { // Create Term Term term = new Term(currentField, termTxt); // Calculate weight float tf = docTerms.getFreq(termTxt); // float idf = ir.docFreq(termTitle); int docs; float idf; if (sourceField.equals(PatentQuery.all)) { docs = ir.getDocCount(field); idf = (float) Math.log10((double) docs / (ir.docFreq(new Term(field, termTxt)) + 1)); } else { docs = ir.getDocCount(sourceField); idf = (float) Math.log10((double) docs / (ir.docFreq(new Term(sourceField, termTxt)) + 1)); } float weight = tf * idf; // System.out.println(term.text() + " -> tf= " + tf + " idf= " + idf + " // tfidf= " + weight); // Adjust weight by decay factor weight = weight - (weight * decay); // Create TermQuery and add it to the collection TermQuery termQuery = new TermQuery(term); // Calculate and set boost float boost; if (vecsTerms.size() == 1) { boost = factor * tf; } else { boost = factor; } if (boost != 0) { termQuery.setBoost(boost * norm); if (terms.containsKey(termTxt)) { TermQuery tq = terms.get(termTxt); tq.setBoost(tq.getBoost() + termQuery.getBoost()); } else { terms.put(termTxt, termQuery); } } } i++; } return 
terms; }
/**
 * Computes the expansion vector for the given field as a map from term text to boost.
 *
 * <p>The beta, gamma and decay parameters are read from {@code parameters}; boosted terms are
 * built for the relevant and for the irrelevant feedback documents via {@code setBoost}, and the
 * relevant terms are passed through {@code combine} together with an empty query-term map and an
 * empty irrelevant-term map.
 *
 * <p>NOTE(review): the irrelevant-document terms are computed but an empty map is handed to
 * {@code combine} in their place, so gamma does not influence the returned vector. Confirm
 * whether this is intentional.
 *
 * @param currentField field name the terms are generated for
 * @return term text mapped to the boost of the corresponding combined term query
 * @throws IOException if building the boosted terms fails
 */
public Map<String, Float> getRocchioVector(String currentField) throws IOException {
  float beta = parameters.get(RocchioQueryExpansion.ROCCHIO_BETA_FLD);
  float gamma = parameters.get(RocchioQueryExpansion.ROCCHIO_GAMMA_FLD);
  float decay = parameters.get(RocchioQueryExpansion.DECAY_FLD);

  Map<String, TermQuery> relevantTerms =
      setBoost(docsTermVectorReldocs, currentField, beta, decay);
  // Computed as in the original flow; the result is not forwarded to combine (see note above).
  Map<String, TermQuery> irrelevantTerms =
      setBoost(docsTermVectorIrreldocs, currentField, gamma, decay);

  Map<String, TermQuery> noQueryTerms = new HashMap<>();
  Map<String, TermQuery> noIrrelevantTerms = new HashMap<>();
  List<TermQuery> combinedTerms = combine(noQueryTerms, relevantTerms, noIrrelevantTerms);

  Map<String, Float> vector = new HashMap<>();
  for (TermQuery weightedTerm : combinedTerms) {
    String text = weightedTerm.getTerm().text();
    vector.put(text, weightedTerm.getBoost());
  }
  return vector;
}