@Override
public void createWeight(QueryContext context, IndexSearcher searcher) throws IOException {
  TFIDFSimilarity sim = asTFIDF(searcher.getSimilarity(), field);
  if (sim == null) {
    throw new UnsupportedOperationException(
        "requires a TFIDFSimilarity (such as DefaultSimilarity)");
  }
  int docfreq = searcher.getIndexReader().docFreq(new Term(indexedField, indexedBytes));
  float idf = sim.idf(docfreq, searcher.getIndexReader().maxDoc());
  FuncValues result = new ConstDoubleDocValues(idf, this);
  context.put(this, result);
}
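// For reference, a minimal sketch (not the shipped Lucene source) of the classic idf
// formula a DefaultSimilarity-style TFIDFSimilarity computes for the docfreq/maxDoc
// pair above; the resulting constant is then served for every document via the
// cached ConstDoubleDocValues.
static float classicIdfSketch(long docFreq, long numDocs) {
  // idf = 1 + ln(numDocs / (docFreq + 1)), per the classic Lucene scoring formula
  return (float) (Math.log(numDocs / (double) (docFreq + 1)) + 1.0);
}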
/**
 * Create a PriorityQueue from a word->tf map.
 *
 * @param words a map of words keyed on the word(String) with Int objects as the values.
 * @param fieldNames an array of field names to override defaults.
 */
private PriorityQueue<ScoreTerm> createQueue(Map<String, Int> words, String... fieldNames)
    throws IOException {
  // have collected all words in doc and their freqs
  int numDocs = ir.numDocs();
  final int limit = Math.min(maxQueryTerms, words.size());
  FreqQ queue = new FreqQ(limit); // will order words by score

  for (String word : words.keySet()) { // for every word
    int tf = words.get(word).x; // term freq in the source doc
    if (minTermFreq > 0 && tf < minTermFreq) {
      continue; // filter out words that don't occur enough times in the source
    }

    // go through all the fields and find the largest document frequency
    String topField = fieldNames[0];
    int docFreq = 0;
    for (String fieldName : fieldNames) {
      int freq = ir.docFreq(new Term(fieldName, word));
      topField = (freq > docFreq) ? fieldName : topField;
      docFreq = (freq > docFreq) ? freq : docFreq;
    }

    if (minDocFreq > 0 && docFreq < minDocFreq) {
      continue; // filter out words that don't occur in enough docs
    }

    if (docFreq > maxDocFreq) {
      continue; // filter out words that occur in too many docs
    }

    if (docFreq == 0) {
      continue; // index update problem?
    }

    float idf = similarity.idf(docFreq, numDocs);
    float score = tf * idf;

    if (queue.size() < limit) {
      // there is still space in the queue
      queue.add(new ScoreTerm(word, topField, score, idf, docFreq, tf));
    } else {
      ScoreTerm term = queue.top();
      if (term.score < score) { // update the smallest in the queue in place and update the queue.
        term.update(word, topField, score, idf, docFreq, tf);
        queue.updateTop();
      }
    }
  }
  return queue;
}
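// A minimal sketch (assumed, not copied from the Lucene source) of the ordering FreqQ
// would need for the logic above to work: lessThan() compares by score, so queue.top()
// is always the lowest-scoring term and can be updated in place once the queue is full.
private static class FreqQSketch extends org.apache.lucene.util.PriorityQueue<ScoreTerm> {
  FreqQSketch(int maxSize) {
    super(maxSize);
  }

  @Override
  protected boolean lessThan(ScoreTerm a, ScoreTerm b) {
    return a.score < b.score;
  }
}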
/**
 * Create a PriorityQueue from a word->tf map.
 *
 * @param perFieldTermFrequencies a per field map of words keyed on the word(String) with Int
 *     objects as the values.
 */
private PriorityQueue<ScoreTerm> createQueue(
    Map<String, Map<String, Int>> perFieldTermFrequencies) throws IOException {
  // have collected all words in doc and their freqs
  int numDocs = ir.numDocs();
  final int limit = Math.min(maxQueryTerms, this.getTermsCount(perFieldTermFrequencies));
  FreqQ queue = new FreqQ(limit); // will order words by score
  for (Map.Entry<String, Map<String, Int>> entry : perFieldTermFrequencies.entrySet()) {
    Map<String, Int> perWordTermFrequencies = entry.getValue();
    String fieldName = entry.getKey();

    for (Map.Entry<String, Int> tfEntry : perWordTermFrequencies.entrySet()) { // for every word
      String word = tfEntry.getKey();
      int tf = tfEntry.getValue().x; // term freq in the source doc
      if (minTermFreq > 0 && tf < minTermFreq) {
        continue; // filter out words that don't occur enough times in the source
      }

      int docFreq = ir.docFreq(new Term(fieldName, word));

      if (minDocFreq > 0 && docFreq < minDocFreq) {
        continue; // filter out words that don't occur in enough docs
      }

      if (docFreq > maxDocFreq) {
        continue; // filter out words that occur in too many docs
      }

      if (docFreq == 0) {
        continue; // index update problem?
      }

      float idf = similarity.idf(docFreq, numDocs);
      float score = tf * idf;

      if (queue.size() < limit) {
        // there is still space in the queue
        queue.add(new ScoreTerm(word, fieldName, score, idf, docFreq, tf));
      } else {
        ScoreTerm term = queue.top();
        if (term.score < score) { // update the smallest in the queue in place and update the queue.
          term.update(word, fieldName, score, idf, docFreq, tf);
          queue.updateTop();
        }
      }
    }
  }
  return queue;
}
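// A hypothetical usage sketch: callers never invoke createQueue() directly; they
// configure MoreLikeThis and ask for a query, and the queue built above decides which
// terms end up in it. The field names and thresholds below are illustrative values,
// not defaults; 'reader' is an open IndexReader and 'docNum' an existing doc id.
MoreLikeThis mlt = new MoreLikeThis(reader);
mlt.setFieldNames(new String[] {"title", "body"});
mlt.setMinTermFreq(2);    // term must appear at least twice in the source doc
mlt.setMinDocFreq(5);     // and in at least five docs in the index
mlt.setMaxQueryTerms(25); // cap on the number of terms kept from the queue
Query likeThis = mlt.like(docNum);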
public void testHyperbolicSweetSpot() {

  SweetSpotSimilarity ss =
      new SweetSpotSimilarity() {
        @Override
        public float tf(int freq) {
          return hyperbolicTf(freq);
        }
      };
  ss.setHyperbolicTfFactors(3.3f, 7.7f, Math.E, 5.0f);

  TFIDFSimilarity s = ss;

  for (int i = 1; i <= 1000; i++) {
    assertTrue("MIN tf: i=" + i + " : s=" + s.tf(i), 3.3f <= s.tf(i));
    assertTrue("MAX tf: i=" + i + " : s=" + s.tf(i), s.tf(i) <= 7.7f);
  }
  assertEquals("MID tf", 3.3f + (7.7f - 3.3f) / 2.0f, s.tf(5), 0.00001f);

  // stupidity
  assertEquals("tf zero", 0.0f, s.tf(0), 0.0f);
}
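// A minimal sketch (assumed from the SweetSpotSimilarity javadoc, not the shipped
// implementation) of the curve the assertions above exercise. With base = e this is
// min + (max - min)/2 * (tanh(freq - xoffset) + 1), so factors (3.3, 7.7, e, 5.0)
// pin tf(5) at the midpoint and bound every other value between 3.3 and 7.7.
static float hyperbolicTfSketch(int freq, float min, float max, double base, float xoffset) {
  if (freq == 0) {
    return 0.0f; // the test's "tf zero" case
  }
  double x = freq - xoffset;
  double tanh =
      (Math.pow(base, x) - Math.pow(base, -x)) / (Math.pow(base, x) + Math.pow(base, -x));
  return min + (float) ((max - min) / 2.0 * (tanh + 1.0));
}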
public void testSweetSpotTf() {

  SweetSpotSimilarity ss = new SweetSpotSimilarity();

  TFIDFSimilarity d = new DefaultSimilarity();
  TFIDFSimilarity s = ss;

  // tf equal
  ss.setBaselineTfFactors(0.0f, 0.0f);

  for (int i = 1; i < 1000; i++) {
    assertEquals("tf: i=" + i, d.tf(i), s.tf(i), 0.0f);
  }

  // tf higher
  ss.setBaselineTfFactors(1.0f, 0.0f);

  for (int i = 1; i < 1000; i++) {
    assertTrue("tf: i=" + i + " : d=" + d.tf(i) + " < s=" + s.tf(i), d.tf(i) < s.tf(i));
  }

  // tf flat
  ss.setBaselineTfFactors(1.0f, 6.0f);
  for (int i = 1; i <= 6; i++) {
    assertEquals("tf flat1: i=" + i, 1.0f, s.tf(i), 0.0f);
  }
  ss.setBaselineTfFactors(2.0f, 6.0f);
  for (int i = 1; i <= 6; i++) {
    assertEquals("tf flat2: i=" + i, 2.0f, s.tf(i), 0.0f);
  }
  for (int i = 6; i <= 1000; i++) {
    assertTrue("tf: i=" + i + " : s=" + s.tf(i) + " < d=" + d.tf(i), s.tf(i) < d.tf(i));
  }

  // stupidity
  assertEquals("tf zero", 0.0f, s.tf(0), 0.0f);
}
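// A minimal sketch (assumed from the SweetSpotSimilarity javadoc, not the shipped
// implementation) of the baseline curve the assertions above rely on:
// tf(freq) = base for freq <= min, and sqrt(freq + base*base - min) above it.
// With factors (0, 0) this collapses to sqrt(freq), matching DefaultSimilarity; with
// (1, 0) it is sqrt(freq + 1), always higher; with (base, 6) it stays flat up to 6.
static float baselineTfSketch(float freq, float base, float min) {
  if (freq == 0.0f) {
    return 0.0f; // the test's "tf zero" case
  }
  return (freq <= min) ? base : (float) Math.sqrt(freq + (base * base) - min);
}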
private void addTerms(IndexReader reader, FieldVals f, ScoreTermQueue q) throws IOException {
  if (f.queryString == null) return;
  final Terms terms = MultiFields.getTerms(reader, f.fieldName);
  if (terms == null) {
    return;
  }
  try (TokenStream ts = analyzer.tokenStream(f.fieldName, f.queryString)) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

    int corpusNumDocs = reader.numDocs();
    HashSet<String> processedTerms = new HashSet<>();
    ts.reset();
    while (ts.incrementToken()) {
      String term = termAtt.toString();
      if (!processedTerms.contains(term)) {
        processedTerms.add(term);
        ScoreTermQueue variantsQ =
            new ScoreTermQueue(MAX_VARIANTS_PER_TERM); // maxNum variants considered for any one term
        float minScore = 0;
        Term startTerm = new Term(f.fieldName, term);
        AttributeSource atts = new AttributeSource();
        MaxNonCompetitiveBoostAttribute maxBoostAtt =
            atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
        SlowFuzzyTermsEnum fe =
            new SlowFuzzyTermsEnum(terms, atts, startTerm, f.minSimilarity, f.prefixLength);
        // store the df so all variants use same idf
        int df = reader.docFreq(startTerm);
        int numVariants = 0;
        int totalVariantDocFreqs = 0;
        BytesRef possibleMatch;
        BoostAttribute boostAtt = fe.attributes().addAttribute(BoostAttribute.class);
        while ((possibleMatch = fe.next()) != null) {
          numVariants++;
          totalVariantDocFreqs += fe.docFreq();
          float score = boostAtt.getBoost();
          if (variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore) {
            ScoreTerm st =
                new ScoreTerm(
                    new Term(startTerm.field(), BytesRef.deepCopyOf(possibleMatch)),
                    score,
                    startTerm);
            variantsQ.insertWithOverflow(st);
            minScore = variantsQ.top().score; // maintain minScore
          }
          maxBoostAtt.setMaxNonCompetitiveBoost(
              variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY);
        }

        if (numVariants > 0) {
          int avgDf = totalVariantDocFreqs / numVariants;
          if (df == 0) // no direct match we can use as df for all variants
          {
            df = avgDf; // use avg df of all variants
          }

          // take the top variants (scored by edit distance) and reset the score
          // to include an IDF factor then add to the global queue for ranking
          // overall top query terms
          int size = variantsQ.size();
          for (int i = 0; i < size; i++) {
            ScoreTerm st = variantsQ.pop();
            st.score = (st.score * st.score) * sim.idf(df, corpusNumDocs);
            q.insertWithOverflow(st);
          }
        }
      }
    }
    ts.end();
  }
}
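// A hypothetical usage sketch: the private addTerms() above runs when the query is
// rewritten, expanding each analyzed token into its fuzzy variants and scoring them by
// edit distance times idf. Callers only see the public API; the field name, similarity,
// and prefix length below are illustrative, and 'analyzer'/'searcher' are assumed to exist.
FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(32, analyzer); // keep at most 32 query terms overall
flt.addTerms("Smithe", "surname", 0.6f, 1); // fuzzy-match "Smithe" in the surname field
TopDocs hits = searcher.search(flt, 10);    // variant expansion happens during rewrite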