/** * Sets boost of termClaimsDescriptionAbstractTitles. boost = weight = factor(tf*idf) * * @param vecsTerms * @param currentField * @param factor - adjustment factor ( ex. alpha or beta ) * @param decayFactor * @return * @throws java.io.IOException */ public Map<String, TermQuery> setBoost( Map<TermFreqVector, String> vecsTerms, String currentField, float factor, float decayFactor) throws IOException { Map<String, TermQuery> terms = new HashMap<>(); // setBoost for each of the terms of each of the docs int i = 0; float norm = (float) 1 / vecsTerms.size(); // System.out.println("--------------------------"); for (Map.Entry<TermFreqVector, String> e : vecsTerms.entrySet()) { // Increase decay String field = e.getValue(); TermFreqVector docTerms = e.getKey(); float decay = decayFactor * i; // Populate terms: with TermQuries and set boost for (String termTxt : docTerms.getTerms()) { // Create Term Term term = new Term(currentField, termTxt); // Calculate weight float tf = docTerms.getFreq(termTxt); // float idf = ir.docFreq(termTitle); int docs; float idf; if (sourceField.equals(PatentQuery.all)) { docs = ir.getDocCount(field); idf = (float) Math.log10((double) docs / (ir.docFreq(new Term(field, termTxt)) + 1)); } else { docs = ir.getDocCount(sourceField); idf = (float) Math.log10((double) docs / (ir.docFreq(new Term(sourceField, termTxt)) + 1)); } float weight = tf * idf; // System.out.println(term.text() + " -> tf= " + tf + " idf= " + idf + " // tfidf= " + weight); // Adjust weight by decay factor weight = weight - (weight * decay); // Create TermQuery and add it to the collection TermQuery termQuery = new TermQuery(term); // Calculate and set boost float boost; if (vecsTerms.size() == 1) { boost = factor * tf; } else { boost = factor; } if (boost != 0) { termQuery.setBoost(boost * norm); if (terms.containsKey(termTxt)) { TermQuery tq = terms.get(termTxt); tq.setBoost(tq.getBoost() + termQuery.getBoost()); } else { terms.put(termTxt, termQuery); } } } i++; } return terms; }
/* * listPostings displays the first n postings for a term in a * field in an index (specified by reader). Set n to MAX_VALUE * to display all postings. */ static void listPostings(IndexReader reader, String termString, String field, Integer n) throws IOException { System.out.println("\nPostings: " + termString + " " + field); /* * Prepare to access the index. */ BytesRef termBytes = new BytesRef(termString); Term term = new Term(field, termBytes); Bits liveDocs = MultiFields.getLiveDocs(reader); /* * Lookup the collection term frequency (ctf). */ long df = reader.docFreq(term); System.out.println("\tdf: " + df); long ctf = reader.totalTermFreq(term); System.out.println("\tctf: " + ctf); if (df < 1) return; /* * Lookup the inverted list. */ DocsAndPositionsEnum postings = MultiFields.getTermPositionsEnum(reader, liveDocs, field, termBytes); /* * Iterate through the first n postings. */ long count = 0; while ((count < n) && (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS)) { System.out.println("\tdocid: " + postings.docID()); int tf = postings.freq(); System.out.println("\ttf: " + tf); System.out.print("\tPositions: "); for (int j = 0; j < tf; j++) { int pos = postings.nextPosition(); System.out.print(pos + " "); } System.out.println(""); count++; } ; return; }
/** * Create a PriorityQueue from a word->tf map. * * @param words a map of words keyed on the word(String) with Int objects as the values. * @param fieldNames an array of field names to override defaults. */ private PriorityQueue<ScoreTerm> createQueue(Map<String, Int> words, String... fieldNames) throws IOException { // have collected all words in doc and their freqs int numDocs = ir.numDocs(); final int limit = Math.min(maxQueryTerms, words.size()); FreqQ queue = new FreqQ(limit); // will order words by score for (String word : words.keySet()) { // for every word int tf = words.get(word).x; // term freq in the source doc if (minTermFreq > 0 && tf < minTermFreq) { continue; // filter out words that don't occur enough times in the source } // go through all the fields and find the largest document frequency String topField = fieldNames[0]; int docFreq = 0; for (String fieldName : fieldNames) { int freq = ir.docFreq(new Term(fieldName, word)); topField = (freq > docFreq) ? fieldName : topField; docFreq = (freq > docFreq) ? freq : docFreq; } if (minDocFreq > 0 && docFreq < minDocFreq) { continue; // filter out words that don't occur in enough docs } if (docFreq > maxDocFreq) { continue; // filter out words that occur in too many docs } if (docFreq == 0) { continue; // index update problem? } float idf = similarity.idf(docFreq, numDocs); float score = tf * idf; if (queue.size() < limit) { // there is still space in the queue queue.add(new ScoreTerm(word, topField, score, idf, docFreq, tf)); } else { ScoreTerm term = queue.top(); if (term.score < score) { // update the smallest in the queue in place and update the queue. term.update(word, topField, score, idf, docFreq, tf); queue.updateTop(); } } } return queue; }