/** * Sets boost of termClaimsDescriptionAbstractTitles. boost = weight = factor(tf*idf) * * @param vecsTerms * @param currentField * @param factor - adjustment factor ( ex. alpha or beta ) * @param decayFactor * @return * @throws java.io.IOException */ public Map<String, TermQuery> setBoost( Map<TermFreqVector, String> vecsTerms, String currentField, float factor, float decayFactor) throws IOException { Map<String, TermQuery> terms = new HashMap<>(); // setBoost for each of the terms of each of the docs int i = 0; float norm = (float) 1 / vecsTerms.size(); // System.out.println("--------------------------"); for (Map.Entry<TermFreqVector, String> e : vecsTerms.entrySet()) { // Increase decay String field = e.getValue(); TermFreqVector docTerms = e.getKey(); float decay = decayFactor * i; // Populate terms: with TermQuries and set boost for (String termTxt : docTerms.getTerms()) { // Create Term Term term = new Term(currentField, termTxt); // Calculate weight float tf = docTerms.getFreq(termTxt); // float idf = ir.docFreq(termTitle); int docs; float idf; if (sourceField.equals(PatentQuery.all)) { docs = ir.getDocCount(field); idf = (float) Math.log10((double) docs / (ir.docFreq(new Term(field, termTxt)) + 1)); } else { docs = ir.getDocCount(sourceField); idf = (float) Math.log10((double) docs / (ir.docFreq(new Term(sourceField, termTxt)) + 1)); } float weight = tf * idf; // System.out.println(term.text() + " -> tf= " + tf + " idf= " + idf + " // tfidf= " + weight); // Adjust weight by decay factor weight = weight - (weight * decay); // Create TermQuery and add it to the collection TermQuery termQuery = new TermQuery(term); // Calculate and set boost float boost; if (vecsTerms.size() == 1) { boost = factor * tf; } else { boost = factor; } if (boost != 0) { termQuery.setBoost(boost * norm); if (terms.containsKey(termTxt)) { TermQuery tq = terms.get(termTxt); tq.setBoost(tq.getBoost() + termQuery.getBoost()); } else { terms.put(termTxt, termQuery); } } } i++; } return 
terms; }
public LuceneSearch_queryparser() throws IOException { if (searcher == null) { Directory dir = FSDirectory.open( new File("/Users/chanvain/Documents/workspace/NGPODCollector/index")); // 打开索引文件 IndexReader reader = DirectoryReader.open(dir); System.out.println(reader.getDocCount("title")); // 统计数量 searcher = new IndexSearcher(reader); String[] fields = {"title", "description"}; // 定义要搜索的两个field parser = new MultiFieldQueryParser(fields, new StandardAnalyzer()); // 定义parser parser.setDefaultOperator(QueryParserBase.AND_OPERATOR); // 定义两个field出现的情况,这里是都要出现 } }
/**
 * Command-line entry point: opens the index named by {@code -index <path>} and then executes
 * the remaining arguments in order.
 *
 * <p>If no {@code -index} argument with a path is present, the usage text is printed to
 * standard error and the JVM exits with status 1. A command that lacks its required
 * parameters prints the usage text and stops processing. Unknown arguments produce a warning
 * and are skipped. The index reader is closed when processing ends, including when a command
 * throws.
 *
 * @param args command-line arguments
 * @throws IOException if the index cannot be opened or read
 */
public static void main(String[] args) throws IOException {
  IndexReader reader = null;

  /*
   * Opening the index first simplifies the processing of the
   * rest of the command line arguments.
   */
  for (int i = 0; i < args.length; i++) {
    if ("-index".equals(args[i]) && (i + 1) < args.length) {
      // DirectoryReader.open reports failure by throwing, so there is no null result to check.
      reader = DirectoryReader.open(FSDirectory.open(new File(args[i + 1])));
      break;
    }
  }

  if (reader == null) {
    System.err.println(usage);
    System.exit(1);
  }

  try {
    /*
     * Process the command line arguments sequentially.
     */
    for (int i = 0; i < args.length; i++) {
      if ("-index".equals(args[i])) {
        /*
         * Handled in the previous loop, so just skip the argument.
         */
        i++;
      } else if ("-list-edocid".equals(args[i])) {
        System.out.println("-list-edocid:");

        if ((i + 1) >= args.length) {
          System.out.println(usage);
          break;
        }

        // Validate the docid up front instead of failing with an uncaught exception.
        int docid;
        try {
          docid = Integer.parseInt(args[i + 1]);
        } catch (NumberFormatException e) {
          System.err.println("Error: Invalid internal docid " + args[i + 1]);
          break;
        }
        if (docid < 0 || docid >= reader.maxDoc()) {
          System.err.println("Error: Internal docid " + args[i + 1] + " is out of range.");
          break;
        }

        Document d = reader.document(docid);
        System.out.println(
            "Internal docid --> External docid: " + args[i + 1] + " --> " + d.get("externalId"));
        i += 1;
      } else if ("-list-docids".equals(args[i])) {
        System.out.println("-list-docids:");

        // Internal ids range over [0, maxDoc); numDocs() excludes deleted documents and is
        // therefore not a valid upper bound. A null liveDocs means nothing is deleted.
        org.apache.lucene.util.Bits liveDocs = MultiFields.getLiveDocs(reader);
        int maxDoc = reader.maxDoc();
        for (int j = 0; j < maxDoc; j++) {
          if (liveDocs != null && !liveDocs.get(j)) {
            continue;
          }
          Document d = reader.document(j);
          System.out.println("Internal --> external docid: " + j + " --> " + d.get("externalId"));
        }
      } else if ("-list-fields".equals(args[i])) {
        Fields fields = MultiFields.getFields(reader);

        System.out.print("\nNumber of fields: ");
        if (fields == null) {
          System.out.println("0");
        } else {
          System.out.println(fields.size());
          Iterator<String> is = fields.iterator();
          while (is.hasNext()) {
            System.out.println("\t" + is.next());
          }
        }
      } else if ("-list-postings".equals(args[i])) {
        if ((i + 2) >= args.length) {
          System.out.println(usage);
          break;
        }
        listPostings(reader, args[i + 1], args[i + 2], Integer.MAX_VALUE);
        i += 2;
      } else if ("-list-postings-sample".equals(args[i])) {
        if ((i + 2) >= args.length) {
          System.out.println(usage);
          break;
        }
        listPostings(reader, args[i + 1], args[i + 2], 5);
        i += 2;
      } else if ("-list-stats".equals(args[i])) {
        System.out.println("Corpus statistics:");
        System.out.println("\tnumdocs\t\t" + reader.numDocs());

        // {printed label, field name}; the labels carry the padding used for column alignment.
        String[][] statFields = {
          {"url:\t", "url"},
          {"keywords:", "keywords"},
          {"title:\t", "title"},
          {"body:\t", "body"},
          {"inlink:\t", "inlink"}
        };
        for (String[] statField : statFields) {
          String label = statField[0];
          String field = statField[1];
          int docCount = reader.getDocCount(field);
          long sumTotalTF = reader.getSumTotalTermFreq(field);
          System.out.println(
              "\t" + label
                  + "\tnumdocs=" + docCount
                  + "\tsumTotalTF=" + sumTotalTF
                  + "\tavglen=" + sumTotalTF / (float) docCount);
        }
      } else if ("-list-terms".equals(args[i])) {
        if ((i + 1) >= args.length) {
          System.out.println(usage);
          break;
        }
        listTermDictionary(reader, args[i + 1]);
        i += 1;
      } else if ("-list-termvector".equals(args[i])) {
        if ((i + 1) >= args.length) {
          System.out.println(usage);
          break;
        }
        listTermVectors(reader, args[i + 1]);
        i += 1;
      } else if ("-list-termvector-field".equals(args[i])) {
        if ((i + 2) >= args.length) {
          System.out.println(usage);
          break;
        }
        listTermVectorField(reader, args[i + 1], args[i + 2]);
        i += 2;
      } else {
        System.err.println("\nWarning: Unknown argument " + args[i] + " ignored.");
      }
    }
  } finally {
    /*
     * Close the index and exit gracefully, even if a command failed.
     */
    reader.close();
  }
}