/** * combine weights according to expansion formula * * @param queryTerms * @param relevantDocsTerms * @param irrelevantDocsTerms * @return */ public List<TermQuery> combine( Map<String, TermQuery> queryTerms, Map<String, TermQuery> relevantDocsTerms, Map<String, TermQuery> irrelevantDocsTerms) { // Add Terms of the relevant documents for (Map.Entry<String, TermQuery> e : queryTerms.entrySet()) { if (relevantDocsTerms.containsKey(e.getKey())) { TermQuery tq = relevantDocsTerms.get(e.getKey()); tq.setBoost(tq.getBoost() + e.getValue().getBoost()); } else { relevantDocsTerms.put(e.getKey(), e.getValue()); } } // Substract terms of irrelevant documents for (Map.Entry<String, TermQuery> e : irrelevantDocsTerms.entrySet()) { if (relevantDocsTerms.containsKey(e.getKey())) { TermQuery tq = relevantDocsTerms.get(e.getKey()); tq.setBoost(tq.getBoost() - e.getValue().getBoost()); } else { TermQuery tq = e.getValue(); tq.setBoost(-tq.getBoost()); relevantDocsTerms.put(e.getKey(), tq); } } return new ArrayList<>(relevantDocsTerms.values()); }
/** * Sets boost of termClaimsDescriptionAbstractTitles. boost = weight = factor(tf*idf) * * @param vecsTerms * @param currentField * @param factor - adjustment factor ( ex. alpha or beta ) * @param decayFactor * @return * @throws java.io.IOException */ public Map<String, TermQuery> setBoost( Map<TermFreqVector, String> vecsTerms, String currentField, float factor, float decayFactor) throws IOException { Map<String, TermQuery> terms = new HashMap<>(); // setBoost for each of the terms of each of the docs int i = 0; float norm = (float) 1 / vecsTerms.size(); // System.out.println("--------------------------"); for (Map.Entry<TermFreqVector, String> e : vecsTerms.entrySet()) { // Increase decay String field = e.getValue(); TermFreqVector docTerms = e.getKey(); float decay = decayFactor * i; // Populate terms: with TermQuries and set boost for (String termTxt : docTerms.getTerms()) { // Create Term Term term = new Term(currentField, termTxt); // Calculate weight float tf = docTerms.getFreq(termTxt); // float idf = ir.docFreq(termTitle); int docs; float idf; if (sourceField.equals(PatentQuery.all)) { docs = ir.getDocCount(field); idf = (float) Math.log10((double) docs / (ir.docFreq(new Term(field, termTxt)) + 1)); } else { docs = ir.getDocCount(sourceField); idf = (float) Math.log10((double) docs / (ir.docFreq(new Term(sourceField, termTxt)) + 1)); } float weight = tf * idf; // System.out.println(term.text() + " -> tf= " + tf + " idf= " + idf + " // tfidf= " + weight); // Adjust weight by decay factor weight = weight - (weight * decay); // Create TermQuery and add it to the collection TermQuery termQuery = new TermQuery(term); // Calculate and set boost float boost; if (vecsTerms.size() == 1) { boost = factor * tf; } else { boost = factor; } if (boost != 0) { termQuery.setBoost(boost * norm); if (terms.containsKey(termTxt)) { TermQuery tq = terms.get(termTxt); tq.setBoost(tq.getBoost() + termQuery.getBoost()); } else { terms.put(termTxt, termQuery); } } } i++; } return 
terms; }
/**
 * Converts a {@link TermQuery} into a JSON object of the form
 * {@code {"term": {<field>: {"value": <term text>, "boost": <boost>}}}}.
 *
 * <p>The built object is handed, together with the field name, to
 * {@code wrapQueryForNestedIfRequired}, and that call's result is returned.
 *
 * @param termQuery the query whose field, term text and boost are serialized
 * @return the result of {@code wrapQueryForNestedIfRequired} for the built JSON object
 */
private static JsonObject convertTermQuery(TermQuery termQuery) {
  final String field = termQuery.getTerm().field();
  final String text = termQuery.getTerm().text();
  final float boost = termQuery.getBoost();

  JsonObject termClause =
      JsonBuilder.object()
          .add(
              "term",
              JsonBuilder.object()
                  .add(
                      field,
                      JsonBuilder.object()
                          .addProperty("value", text)
                          .addProperty("boost", boost)))
          .build();

  return wrapQueryForNestedIfRequired(field, termClause);
}
/**
 * Computes the expansion term vector for the given field from the relevant and irrelevant
 * feedback documents.
 *
 * <p>Terms of the relevant documents are boosted with the beta parameter and terms of the
 * irrelevant documents with the gamma parameter (both via {@code setBoost}); {@code combine}
 * then subtracts the irrelevant boosts from the relevant ones. No original query terms are
 * contributed here: an empty map is passed in their place.
 *
 * @param currentField field on which the expansion terms are created
 * @return combined boost per term text; a term seen only in irrelevant documents carries the
 *     negated boost
 * @throws IOException if {@code setBoost} fails while reading term statistics
 */
public Map<String, Float> getRocchioVector(String currentField) throws IOException {
  float beta = parameters.get(RocchioQueryExpansion.ROCCHIO_BETA_FLD);
  float gamma = parameters.get(RocchioQueryExpansion.ROCCHIO_GAMMA_FLD);
  float decay = parameters.get(RocchioQueryExpansion.DECAY_FLD);

  Map<String, TermQuery> relevantDocsTerms =
      setBoost(docsTermVectorReldocs, currentField, beta, decay);
  Map<String, TermQuery> irrelevantDocsTerms =
      setBoost(docsTermVectorIrreldocs, currentField, gamma, decay);

  // The irrelevant-document terms have to reach combine(); passing an empty map instead would
  // silently drop the gamma component from the result.
  List<TermQuery> expandedQueryTerms =
      combine(new HashMap<String, TermQuery>(), relevantDocsTerms, irrelevantDocsTerms);

  Map<String, Float> out = new HashMap<>();
  for (TermQuery tq : expandedQueryTerms) {
    out.put(tq.getTerm().text(), tq.getBoost());
  }
  return out;
}
/**
 * Adds {@code query} to {@code booleanQuery} unless an equal query is already one of its
 * clauses.
 *
 * <ul>
 *   <li>A {@link TermQuery} is rebuilt from its extracted terms. When {@code like} is set, the
 *       term text is lower-cased with the parser's locale and wrapped in
 *       {@code StringPool.STAR} on both sides to form a {@link WildcardQuery}; otherwise a plain
 *       {@link TermQuery} is created. The original boost is carried over.
 *   <li>A {@link BooleanQuery} is processed recursively, clause by clause, into a fresh
 *       container, which is added only if it ended up with at least one clause. The container
 *       itself is not checked against the existing clauses.
 *   <li>Any other query is added as is, subject to the uniqueness check.
 * </ul>
 *
 * @param booleanQuery the query receiving the clauses
 * @param like whether term queries are turned into wildcard queries
 * @param queryParser supplies the locale used for lower-casing
 * @param query the query to include
 * @param occur the occurrence flag used when adding
 */
private void _includeIfUnique(
    BooleanQuery booleanQuery,
    boolean like,
    QueryParser queryParser,
    Query query,
    BooleanClause.Occur occur) {

  if (query instanceof TermQuery) {
    Set<Term> terms = new HashSet<Term>();

    TermQuery termQuery = (TermQuery) query;

    termQuery.extractTerms(terms);

    float boost = termQuery.getBoost();

    for (Term term : terms) {
      Query candidate;

      if (like) {
        String termValue = term.text().toLowerCase(queryParser.getLocale());

        Term wildcardTerm =
            term.createTerm(StringPool.STAR.concat(termValue).concat(StringPool.STAR));

        candidate = new WildcardQuery(wildcardTerm);
      } else {
        candidate = new TermQuery(term);
      }

      candidate.setBoost(boost);

      _addIfAbsent(booleanQuery, candidate, occur);
    }
  } else if (query instanceof BooleanQuery) {
    BooleanQuery curBooleanQuery = (BooleanQuery) query;

    BooleanQuery containerBooleanQuery = new BooleanQuery();

    for (BooleanClause booleanClause : curBooleanQuery.getClauses()) {
      _includeIfUnique(
          containerBooleanQuery,
          like,
          queryParser,
          booleanClause.getQuery(),
          booleanClause.getOccur());
    }

    if (containerBooleanQuery.getClauses().length > 0) {
      booleanQuery.add(containerBooleanQuery, occur);
    }
  } else {
    _addIfAbsent(booleanQuery, query, occur);
  }
}

/**
 * Adds {@code query} to {@code booleanQuery} with the given occurrence unless one of the
 * existing clauses already holds an equal query. The scan stops at the first match.
 */
private void _addIfAbsent(BooleanQuery booleanQuery, Query query, BooleanClause.Occur occur) {
  for (BooleanClause booleanClause : booleanQuery.getClauses()) {
    if (query.equals(booleanClause.getQuery())) {
      return;
    }
  }

  booleanQuery.add(query, occur);
}
public void testStarParsing() throws Exception { final int[] type = new int[1]; QueryParser qp = new QueryParser( TEST_VERSION_CURRENT, "field", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)) { @Override protected Query getWildcardQuery(String field, String termStr) throws ParseException { // override error checking of superclass type[0] = 1; return new TermQuery(new Term(field, termStr)); } @Override protected Query getPrefixQuery(String field, String termStr) throws ParseException { // override error checking of superclass type[0] = 2; return new TermQuery(new Term(field, termStr)); } @Override protected Query getFieldQuery(String field, String queryText, boolean quoted) throws ParseException { type[0] = 3; return super.getFieldQuery(field, queryText, quoted); } }; TermQuery tq; tq = (TermQuery) qp.parse("foo:zoo*"); assertEquals("zoo", tq.getTerm().text()); assertEquals(2, type[0]); tq = (TermQuery) qp.parse("foo:zoo*^2"); assertEquals("zoo", tq.getTerm().text()); assertEquals(2, type[0]); assertEquals(tq.getBoost(), 2, 0); tq = (TermQuery) qp.parse("foo:*"); assertEquals("*", tq.getTerm().text()); assertEquals(1, type[0]); // could be a valid prefix query in the future too tq = (TermQuery) qp.parse("foo:*^2"); assertEquals("*", tq.getTerm().text()); assertEquals(1, type[0]); assertEquals(tq.getBoost(), 2, 0); tq = (TermQuery) qp.parse("*:foo"); assertEquals("*", tq.getTerm().field()); assertEquals("foo", tq.getTerm().text()); assertEquals(3, type[0]); tq = (TermQuery) qp.parse("*:*"); assertEquals("*", tq.getTerm().field()); assertEquals("*", tq.getTerm().text()); assertEquals(1, type[0]); // could be handled as a prefix query in the future tq = (TermQuery) qp.parse("(*:*)"); assertEquals("*", tq.getTerm().field()); assertEquals("*", tq.getTerm().text()); assertEquals(1, type[0]); }