/** * Create a PriorityQueue from a word->tf map. * * @param words a map of words keyed on the word(String) with Int objects as the values. * @param fieldNames an array of field names to override defaults. */ private PriorityQueue<ScoreTerm> createQueue(Map<String, Int> words, String... fieldNames) throws IOException { // have collected all words in doc and their freqs int numDocs = ir.numDocs(); final int limit = Math.min(maxQueryTerms, words.size()); FreqQ queue = new FreqQ(limit); // will order words by score for (String word : words.keySet()) { // for every word int tf = words.get(word).x; // term freq in the source doc if (minTermFreq > 0 && tf < minTermFreq) { continue; // filter out words that don't occur enough times in the source } // go through all the fields and find the largest document frequency String topField = fieldNames[0]; int docFreq = 0; for (String fieldName : fieldNames) { int freq = ir.docFreq(new Term(fieldName, word)); topField = (freq > docFreq) ? fieldName : topField; docFreq = (freq > docFreq) ? freq : docFreq; } if (minDocFreq > 0 && docFreq < minDocFreq) { continue; // filter out words that don't occur in enough docs } if (docFreq > maxDocFreq) { continue; // filter out words that occur in too many docs } if (docFreq == 0) { continue; // index update problem? } float idf = similarity.idf(docFreq, numDocs); float score = tf * idf; if (queue.size() < limit) { // there is still space in the queue queue.add(new ScoreTerm(word, topField, score, idf, docFreq, tf)); } else { ScoreTerm term = queue.top(); if (term.score < score) { // update the smallest in the queue in place and update the queue. term.update(word, topField, score, idf, docFreq, tf); queue.updateTop(); } } } return queue; }
/** * Create a PriorityQueue from a word->tf map. * * @param perFieldTermFrequencies a per field map of words keyed on the word(String) with Int * objects as the values. */ private PriorityQueue<ScoreTerm> createQueue( Map<String, Map<String, Int>> perFieldTermFrequencies) throws IOException { // have collected all words in doc and their freqs int numDocs = ir.numDocs(); final int limit = Math.min(maxQueryTerms, this.getTermsCount(perFieldTermFrequencies)); FreqQ queue = new FreqQ(limit); // will order words by score for (Map.Entry<String, Map<String, Int>> entry : perFieldTermFrequencies.entrySet()) { Map<String, Int> perWordTermFrequencies = entry.getValue(); String fieldName = entry.getKey(); for (Map.Entry<String, Int> tfEntry : perWordTermFrequencies.entrySet()) { // for every word String word = tfEntry.getKey(); int tf = tfEntry.getValue().x; // term freq in the source doc if (minTermFreq > 0 && tf < minTermFreq) { continue; // filter out words that don't occur enough times in the source } int docFreq = ir.docFreq(new Term(fieldName, word)); if (minDocFreq > 0 && docFreq < minDocFreq) { continue; // filter out words that don't occur in enough docs } if (docFreq > maxDocFreq) { continue; // filter out words that occur in too many docs } if (docFreq == 0) { continue; // index update problem? } float idf = similarity.idf(docFreq, numDocs); float score = tf * idf; if (queue.size() < limit) { // there is still space in the queue queue.add(new ScoreTerm(word, fieldName, score, idf, docFreq, tf)); } else { ScoreTerm term = queue.top(); if (term.score < score) { // update the smallest in the queue in place and update the queue. term.update(word, fieldName, score, idf, docFreq, tf); queue.updateTop(); } } } } return queue; }