/**
 * Find words for a more-like-this query former.
 *
 * @param docNum the id of the lucene document from which to find terms
 */
private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException {
  Map<String, Int> termFreqMap = new HashMap<>();
  for (String fieldName : fieldNames) {
    final Fields vectors = ir.getTermVectors(docNum);
    final Terms vector;
    if (vectors != null) {
      vector = vectors.terms(fieldName);
    } else {
      vector = null;
    }

    // field does not store term vector info
    if (vector == null) {
      Document d = ir.document(docNum);
      IndexableField[] fields = d.getFields(fieldName);
      for (IndexableField field : fields) {
        final String stringValue = field.stringValue();
        if (stringValue != null) {
          addTermFrequencies(new FastStringReader(stringValue), termFreqMap, fieldName);
        }
      }
    } else {
      addTermFrequencies(termFreqMap, vector, fieldName);
    }
  }
  return createQueue(termFreqMap);
}
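/*
 * Sketch only -- the addTermFrequencies(..) overloads called above are not shown in this
 * section. Based on how termFreqMap is consumed here and in createQueue(..), the Reader
 * variant presumably tokenizes the text with the configured analyzer and counts per-term
 * frequencies. The "analyzer" field and the assumption that Int() starts its counter x at 1
 * are not taken from the source; the method name is deliberately different from the original.
 */
private void addTermFrequenciesSketch(Reader r, Map<String, Int> termFreqMap, String fieldName)
    throws IOException {
  try (TokenStream ts = analyzer.tokenStream(fieldName, r)) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      String word = termAtt.toString();
      Int cnt = termFreqMap.get(word);
      if (cnt == null) {
        termFreqMap.put(word, new Int()); // assumed to start at x == 1
      } else {
        cnt.x++; // increment the term frequency counter
      }
    }
    ts.end();
  }
}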
/**
 * Extracts the terms (claims, description, abstract, title) of the documents and adds them to
 * the result map in the same order.
 *
 * @param hits the search results whose documents are processed
 * @param i index of the first hit to process (inclusive)
 * @param j index of the last hit to process (exclusive)
 * @return the term vectors of the processed documents, keyed by term vector with the source
 *     field name as value; docs must be in order
 * @throws java.io.IOException if the index cannot be read
 */
public Map<TermFreqVector, String> getDocsTerms(TopDocs hits, int i, int j) throws IOException {
  Map<TermFreqVector, String> docsTerms = new HashMap<>();
  // Process each of the documents
  while (i < j && i < hits.totalHits && i >= 0) {
    ScoreDoc scoreDoc = hits.scoreDocs[i];
    if (sourceField.equals(PatentQuery.all)) {
      Terms termTitle = ir.getTermVector(scoreDoc.doc, PatentQuery.getFields()[1]);
      TermFreqVector docTermsTitle = new TermFreqVector(termTitle);
      docsTerms.put(docTermsTitle, PatentQuery.getFields()[1]);

      Terms termAbstract = ir.getTermVector(scoreDoc.doc, PatentQuery.getFields()[2]);
      TermFreqVector docTermsAbstract = new TermFreqVector(termAbstract);
      docsTerms.put(docTermsAbstract, PatentQuery.getFields()[2]);

      Terms termDescription = ir.getTermVector(scoreDoc.doc, PatentQuery.getFields()[3]);
      TermFreqVector docTermsDescription = new TermFreqVector(termDescription);
      docsTerms.put(docTermsDescription, PatentQuery.getFields()[3]);

      Terms termClaims = ir.getTermVector(scoreDoc.doc, PatentQuery.getFields()[5]);
      TermFreqVector docTermsClaims = new TermFreqVector(termClaims);
      docsTerms.put(docTermsClaims, PatentQuery.getFields()[5]);
    } else {
      // get the term vector for the document, wrap it and add it to the result
      Terms term = ir.getTermVector(scoreDoc.doc, sourceField);
      TermFreqVector docTerms = new TermFreqVector(term);
      docsTerms.put(docTerms, sourceField);
    }
    i++;
  }
  return docsTerms;
}
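/*
 * TermFreqVector is used above and in setBoost(..) via getTerms() and getFreq(..), but its
 * definition does not appear in this section. The class below is only a minimal sketch that is
 * consistent with that usage, assuming the same Lucene 4.x API as the surrounding code
 * (Terms.iterator(TermsEnum), totalTermFreq() of a single-document term vector being the
 * in-document frequency). It is not the original implementation.
 */
class TermFreqVector {
  private final Map<String, Long> freqs = new HashMap<>();

  TermFreqVector(Terms terms) throws IOException {
    if (terms == null) {
      return;
    }
    TermsEnum termsEnum = terms.iterator(null);
    BytesRef text;
    while ((text = termsEnum.next()) != null) {
      freqs.put(text.utf8ToString(), termsEnum.totalTermFreq());
    }
  }

  Set<String> getTerms() {
    return freqs.keySet();
  }

  long getFreq(String term) {
    Long freq = freqs.get(term);
    return freq == null ? 0 : freq;
  }
}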
/**
 * Sets the boost of each term. boost = weight = factor * (tf * idf), reduced by the decay:
 * weight_i = weight * (1 - decayFactor * i) for the i-th document.
 *
 * @param vecsTerms term vectors of the documents (as returned by getDocsTerms), keyed by term
 *     vector with the field name as value
 * @param currentField the field the generated term queries are built against
 * @param factor adjustment factor (e.g. alpha or beta)
 * @param decayFactor decay applied per document position
 * @return the boosted term queries, keyed by term text
 * @throws java.io.IOException if the index cannot be read
 */
public Map<String, TermQuery> setBoost(
    Map<TermFreqVector, String> vecsTerms, String currentField, float factor, float decayFactor)
    throws IOException {
  Map<String, TermQuery> terms = new HashMap<>();
  // setBoost for each of the terms of each of the docs
  int i = 0;
  float norm = (float) 1 / vecsTerms.size();
  // System.out.println("--------------------------");
  for (Map.Entry<TermFreqVector, String> e : vecsTerms.entrySet()) {
    // Increase decay
    String field = e.getValue();
    TermFreqVector docTerms = e.getKey();
    float decay = decayFactor * i;
    // Populate terms with TermQueries and set boost
    for (String termTxt : docTerms.getTerms()) {
      // Create Term
      Term term = new Term(currentField, termTxt);
      // Calculate weight
      float tf = docTerms.getFreq(termTxt);
      // float idf = ir.docFreq(termTitle);
      int docs;
      float idf;
      if (sourceField.equals(PatentQuery.all)) {
        docs = ir.getDocCount(field);
        idf = (float) Math.log10((double) docs / (ir.docFreq(new Term(field, termTxt)) + 1));
      } else {
        docs = ir.getDocCount(sourceField);
        idf = (float) Math.log10((double) docs / (ir.docFreq(new Term(sourceField, termTxt)) + 1));
      }
      float weight = tf * idf;
      // System.out.println(term.text() + " -> tf= " + tf + " idf= " + idf + " tfidf= " + weight);

      // Adjust weight by decay factor
      weight = weight - (weight * decay);

      // Create TermQuery and add it to the collection
      TermQuery termQuery = new TermQuery(term);
      // Calculate and set boost
      float boost;
      if (vecsTerms.size() == 1) {
        boost = factor * tf;
      } else {
        boost = factor;
      }
      if (boost != 0) {
        termQuery.setBoost(boost * norm);
        if (terms.containsKey(termTxt)) {
          TermQuery tq = terms.get(termTxt);
          tq.setBoost(tq.getBoost() + termQuery.getBoost());
        } else {
          terms.put(termTxt, termQuery);
        }
      }
    }
    i++;
  }
  return terms;
}
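/*
 * Minimal usage sketch (not part of the original class): the boosted TermQuerys returned by
 * setBoost(..) are naturally combined into a single OR query for query expansion. The method
 * name "buildExpansionQuery" is illustrative; the sketch assumes the same pre-5.0 Lucene API
 * used above, where BooleanQuery is mutable and queries carry their own boost.
 */
private Query buildExpansionQuery(Map<String, TermQuery> boostedTerms) {
  BooleanQuery expanded = new BooleanQuery(); // hypothetical holder query
  for (TermQuery tq : boostedTerms.values()) {
    expanded.add(tq, BooleanClause.Occur.SHOULD); // each expansion term is optional
  }
  return expanded;
}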
/*
 * listTermVectors displays the term vectors for all of the fields
 * in a document in an index (specified by reader).
 */
static void listTermVectors(IndexReader reader, String docidString) throws IOException {
  System.out.println("\nTermVector: docid " + docidString);

  int docid = Integer.parseInt(docidString);
  if ((docid < 0) || (docid >= reader.numDocs())) {
    System.out.println("ERROR: " + docidString + " is a bad document id.");
    return;
  }

  /*
   * Iterate over the fields in this document.
   */
  Fields fields = reader.getTermVectors(docid);
  Iterator<String> fieldIterator = fields.iterator();

  while (fieldIterator.hasNext()) {
    String fieldName = fieldIterator.next();
    System.out.println(" Field: " + fieldName);

    Terms terms = fields.terms(fieldName);
    termVectorDisplay(terms);
  }
}
private void doTest(Random random, PrintWriter out, boolean useCompoundFiles, int MAX_DOCS) throws Exception { Directory directory = newDirectory(); Analyzer analyzer = new MockAnalyzer(random); IndexWriterConfig conf = newIndexWriterConfig(analyzer); final MergePolicy mp = conf.getMergePolicy(); mp.setNoCFSRatio(useCompoundFiles ? 1.0 : 0.0); IndexWriter writer = new IndexWriter(directory, conf); if (VERBOSE) { System.out.println("TEST: now build index MAX_DOCS=" + MAX_DOCS); } for (int j = 0; j < MAX_DOCS; j++) { Document d = new Document(); d.add(newTextField(PRIORITY_FIELD, HIGH_PRIORITY, Field.Store.YES)); d.add(newTextField(ID_FIELD, Integer.toString(j), Field.Store.YES)); writer.addDocument(d); } writer.close(); // try a search without OR IndexReader reader = DirectoryReader.open(directory); IndexSearcher searcher = newSearcher(reader); Query query = new TermQuery(new Term(PRIORITY_FIELD, HIGH_PRIORITY)); out.println("Query: " + query.toString(PRIORITY_FIELD)); if (VERBOSE) { System.out.println("TEST: search query=" + query); } final Sort sort = new Sort(SortField.FIELD_SCORE, new SortField(ID_FIELD, SortField.Type.INT)); ScoreDoc[] hits = searcher.search(query, null, MAX_DOCS, sort).scoreDocs; printHits(out, hits, searcher); checkHits(hits, MAX_DOCS, searcher); // try a new search with OR searcher = newSearcher(reader); hits = null; BooleanQuery booleanQuery = new BooleanQuery(); booleanQuery.add( new TermQuery(new Term(PRIORITY_FIELD, HIGH_PRIORITY)), BooleanClause.Occur.SHOULD); booleanQuery.add( new TermQuery(new Term(PRIORITY_FIELD, MED_PRIORITY)), BooleanClause.Occur.SHOULD); out.println("Query: " + booleanQuery.toString(PRIORITY_FIELD)); hits = searcher.search(booleanQuery, null, MAX_DOCS, sort).scoreDocs; printHits(out, hits, searcher); checkHits(hits, MAX_DOCS, searcher); reader.close(); directory.close(); }
/*
 * listPostings displays the first n postings for a term in a
 * field in an index (specified by reader). Set n to MAX_VALUE
 * to display all postings.
 */
static void listPostings(IndexReader reader, String termString, String field, Integer n)
    throws IOException {
  System.out.println("\nPostings: " + termString + " " + field);

  /*
   * Prepare to access the index.
   */
  BytesRef termBytes = new BytesRef(termString);
  Term term = new Term(field, termBytes);
  Bits liveDocs = MultiFields.getLiveDocs(reader);

  /*
   * Look up the document frequency (df) and the collection term frequency (ctf).
   */
  long df = reader.docFreq(term);
  System.out.println("\tdf: " + df);

  long ctf = reader.totalTermFreq(term);
  System.out.println("\tctf: " + ctf);

  if (df < 1) return;

  /*
   * Look up the inverted list.
   */
  DocsAndPositionsEnum postings =
      MultiFields.getTermPositionsEnum(reader, liveDocs, field, termBytes);

  /*
   * Iterate through the first n postings.
   */
  long count = 0;
  while ((count < n) && (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS)) {
    System.out.println("\tdocid: " + postings.docID());
    int tf = postings.freq();
    System.out.println("\ttf: " + tf);
    System.out.print("\tPositions: ");

    for (int j = 0; j < tf; j++) {
      int pos = postings.nextPosition();
      System.out.print(pos + " ");
    }

    System.out.println();
    count++;
  }
}
@Test public void testSimple() throws Exception { Directory directory = newDirectory(); RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory); for (int parent = 1; parent <= 5; parent++) { Document document = new Document(); document.add( new StringField( UidFieldMapper.NAME, Uid.createUid("parent", Integer.toString(parent)), Field.Store.NO)); document.add(new StringField(TypeFieldMapper.NAME, "parent", Field.Store.NO)); indexWriter.addDocument(document); for (int child = 1; child <= 3; child++) { document = new Document(); document.add( new StringField( UidFieldMapper.NAME, Uid.createUid("child", Integer.toString(parent * 3 + child)), Field.Store.NO)); document.add(new StringField(TypeFieldMapper.NAME, "child", Field.Store.NO)); document.add( new StringField( ParentFieldMapper.NAME, Uid.createUid("parent", Integer.toString(parent)), Field.Store.NO)); document.add(new StringField("field1", "value" + child, Field.Store.NO)); indexWriter.addDocument(document); } } IndexReader indexReader = DirectoryReader.open(indexWriter.w, false); IndexSearcher searcher = new IndexSearcher(indexReader); TermQuery childQuery = new TermQuery(new Term("field1", "value" + (1 + random().nextInt(3)))); TermFilter parentFilter = new TermFilter(new Term(TypeFieldMapper.NAME, "parent")); int shortCircuitParentDocSet = random().nextInt(5); ChildrenConstantScoreQuery query = new ChildrenConstantScoreQuery( childQuery, "parent", "child", parentFilter, shortCircuitParentDocSet, null); BitSetCollector collector = new BitSetCollector(indexReader.maxDoc()); searcher.search(query, collector); FixedBitSet actualResult = collector.getResult(); assertThat(actualResult.cardinality(), equalTo(5)); indexWriter.close(); indexReader.close(); directory.close(); }
/**
 * Create a PriorityQueue from a word->tf map.
 *
 * @param words a map of words keyed on the word(String) with Int objects as the values.
 * @param fieldNames an array of field names to override defaults.
 */
private PriorityQueue<ScoreTerm> createQueue(Map<String, Int> words, String... fieldNames)
    throws IOException {
  // have collected all words in doc and their freqs
  int numDocs = ir.numDocs();
  final int limit = Math.min(maxQueryTerms, words.size());
  FreqQ queue = new FreqQ(limit); // will order words by score

  for (String word : words.keySet()) { // for every word
    int tf = words.get(word).x; // term freq in the source doc
    if (minTermFreq > 0 && tf < minTermFreq) {
      continue; // filter out words that don't occur enough times in the source
    }

    // go through all the fields and find the largest document frequency
    String topField = fieldNames[0];
    int docFreq = 0;
    for (String fieldName : fieldNames) {
      int freq = ir.docFreq(new Term(fieldName, word));
      topField = (freq > docFreq) ? fieldName : topField;
      docFreq = (freq > docFreq) ? freq : docFreq;
    }

    if (minDocFreq > 0 && docFreq < minDocFreq) {
      continue; // filter out words that don't occur in enough docs
    }

    if (docFreq > maxDocFreq) {
      continue; // filter out words that occur in too many docs
    }

    if (docFreq == 0) {
      continue; // index update problem?
    }

    float idf = similarity.idf(docFreq, numDocs);
    float score = tf * idf;

    if (queue.size() < limit) {
      // there is still space in the queue
      queue.add(new ScoreTerm(word, topField, score, idf, docFreq, tf));
    } else {
      ScoreTerm term = queue.top();
      if (term.score < score) { // update the smallest in the queue in place and update the queue.
        term.update(word, topField, score, idf, docFreq, tf);
        queue.updateTop();
      }
    }
  }
  return queue;
}
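/*
 * Usage sketch only: how a queue built by createQueue(..) is typically drained into a query.
 * The score field of ScoreTerm is read in the code above; the assumption that ScoreTerm also
 * exposes the word and topField it was constructed with, the method name, the SHOULD clauses
 * and the use of the raw score as boost are illustrative, not the original implementation.
 * Note that pop() returns the lowest-scoring term first (the queue keeps the smallest on top).
 */
private Query createQueryFromQueueSketch(PriorityQueue<ScoreTerm> q) {
  BooleanQuery query = new BooleanQuery();
  ScoreTerm scoreTerm;
  while ((scoreTerm = q.pop()) != null) {
    TermQuery tq = new TermQuery(new Term(scoreTerm.topField, scoreTerm.word));
    tq.setBoost(scoreTerm.score); // weight more interesting terms higher
    query.add(tq, BooleanClause.Occur.SHOULD); // each term is optional
  }
  return query;
}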
/*
 * listTermVectorField displays the term vector for a field in
 * a document in an index (specified by reader).
 */
static void listTermVectorField(IndexReader reader, String docidString, String field)
    throws IOException {
  System.out.println("\nTermVector: docid " + docidString + ", field " + field);

  int docid = Integer.parseInt(docidString);
  if ((docid < 0) || (docid >= reader.numDocs())) {
    System.out.println("ERROR: " + docidString + " is a bad document id.");
    return;
  }

  Terms terms = reader.getTermVector(docid, field);
  termVectorDisplay(terms);
}
/** Test that when freqs are omitted, totalTermFreq and sumTotalTermFreq are -1. */
public void testStats() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter iw =
      new RandomIndexWriter(
          random(), dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())));
  Document doc = new Document();
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setIndexOptions(IndexOptions.DOCS_ONLY);
  ft.freeze();
  Field f = newField("foo", "bar", ft);
  doc.add(f);
  iw.addDocument(doc);
  IndexReader ir = iw.getReader();
  iw.close();
  assertEquals(-1, ir.totalTermFreq(new Term("foo", new BytesRef("bar"))));
  assertEquals(-1, ir.getSumTotalTermFreq("foo"));
  ir.close();
  dir.close();
}
public static int docId(IndexReader reader, Term term) throws IOException {
  TermDocs termDocs = reader.termDocs(term);
  try {
    if (termDocs.next()) {
      return termDocs.doc();
    }
    return NO_DOC;
  } finally {
    termDocs.close();
  }
}
// public SortingReader(IndexReader oldReader, int[] oldToNew) { // TODO MC
public SortingReader(IndexReader oldReader, DocScore[] newToOld) { // TODO MC
  super(oldReader);
  this.newToOld = newToOld;

  this.oldToNew = new int[oldReader.maxDoc()];
  int newDoc = 0;
  while (newDoc < newToOld.length) {
    int oldDoc = newToOld[newDoc].doc;
    oldToNew[oldDoc] = newDoc;
    newDoc++;
  }
}
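/*
 * Standalone illustration (not from the original source) of what the constructor above does:
 * oldToNew is simply the inverse of the newToOld permutation. DocScore is reduced to its doc
 * field here for brevity.
 */
static int[] invertPermutationExample() {
  int[] newToOldDoc = {2, 0, 1}; // new doc 0 was old doc 2, new doc 1 was old doc 0, ...
  int[] oldToNew = new int[newToOldDoc.length];
  for (int newDoc = 0; newDoc < newToOldDoc.length; newDoc++) {
    oldToNew[newToOldDoc[newDoc]] = newDoc;
  }
  return oldToNew; // {1, 2, 0}
}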
public void sort(File directory) throws IOException {
  LOG.info("IndexSorter: starting.");
  Date start = new Date();
  int termIndexInterval = getConf().getInt("indexer.termIndexInterval", 128);
  IndexReader reader = IndexReader.open(new File(directory, "index"));
  Searcher searcher = new IndexSearcher(new File(directory, "index").getAbsolutePath()); // TODO MC

  SortingReader sorter = new SortingReader(reader, newToOld(reader, searcher)); // TODO MC
  IndexWriter writer = new IndexWriter(new File(directory, "index-sorted"), null, true);
  writer.setTermIndexInterval(termIndexInterval);
  writer.setUseCompoundFile(false);
  writer.addIndexes(new IndexReader[] {sorter});
  writer.close();
  Date end = new Date();
  LOG.info("IndexSorter: done, " + (end.getTime() - start.getTime()) + " total milliseconds");
}
@Test public void testRandom() throws Exception { Directory directory = newDirectory(); RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory); int numUniqueChildValues = 1 + random().nextInt(TEST_NIGHTLY ? 10000 : 1000); String[] childValues = new String[numUniqueChildValues]; for (int i = 0; i < numUniqueChildValues; i++) { childValues[i] = Integer.toString(i); } IntOpenHashSet filteredOrDeletedDocs = new IntOpenHashSet(); int childDocId = 0; int numParentDocs = 1 + random().nextInt(TEST_NIGHTLY ? 20000 : 1000); ObjectObjectOpenHashMap<String, NavigableSet<String>> childValueToParentIds = new ObjectObjectOpenHashMap<String, NavigableSet<String>>(); for (int parentDocId = 0; parentDocId < numParentDocs; parentDocId++) { boolean markParentAsDeleted = rarely(); boolean filterMe = rarely(); String parent = Integer.toString(parentDocId); Document document = new Document(); document.add( new StringField(UidFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.YES)); document.add(new StringField(TypeFieldMapper.NAME, "parent", Field.Store.NO)); if (markParentAsDeleted) { filteredOrDeletedDocs.add(parentDocId); document.add(new StringField("delete", "me", Field.Store.NO)); } if (filterMe) { filteredOrDeletedDocs.add(parentDocId); document.add(new StringField("filter", "me", Field.Store.NO)); } indexWriter.addDocument(document); int numChildDocs; if (rarely()) { numChildDocs = random().nextInt(TEST_NIGHTLY ? 100 : 25); } else { numChildDocs = random().nextInt(TEST_NIGHTLY ? 40 : 10); } for (int i = 0; i < numChildDocs; i++) { boolean markChildAsDeleted = rarely(); String childValue = childValues[random().nextInt(childValues.length)]; document = new Document(); document.add( new StringField( UidFieldMapper.NAME, Uid.createUid("child", Integer.toString(childDocId)), Field.Store.NO)); document.add(new StringField(TypeFieldMapper.NAME, "child", Field.Store.NO)); document.add( new StringField( ParentFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.NO)); document.add(new StringField("field1", childValue, Field.Store.NO)); if (markChildAsDeleted) { document.add(new StringField("delete", "me", Field.Store.NO)); } indexWriter.addDocument(document); if (!markChildAsDeleted) { NavigableSet<String> parentIds; if (childValueToParentIds.containsKey(childValue)) { parentIds = childValueToParentIds.lget(); } else { childValueToParentIds.put(childValue, parentIds = new TreeSet<String>()); } if (!markParentAsDeleted && !filterMe) { parentIds.add(parent); } } } } // Delete docs that are marked to be deleted. indexWriter.deleteDocuments(new Term("delete", "me")); indexWriter.commit(); IndexReader indexReader = DirectoryReader.open(directory); IndexSearcher searcher = new IndexSearcher(indexReader); Engine.Searcher engineSearcher = new Engine.SimpleSearcher(ChildrenConstantScoreQueryTests.class.getSimpleName(), searcher); ((TestSearchContext) SearchContext.current()) .setSearcher(new ContextIndexSearcher(SearchContext.current(), engineSearcher)); Filter rawParentFilter = new TermFilter(new Term(TypeFieldMapper.NAME, "parent")); Filter rawFilterMe = new NotFilter(new TermFilter(new Term("filter", "me"))); int max = numUniqueChildValues / 4; for (int i = 0; i < max; i++) { // Randomly pick a cached version: there is specific logic inside ChildrenQuery that deals // with the fact // that deletes are applied at the top level when filters are cached. 
Filter parentFilter; if (random().nextBoolean()) { parentFilter = SearchContext.current().filterCache().cache(rawParentFilter); } else { parentFilter = rawParentFilter; } // Using this in FQ, will invoke / test the Scorer#advance(..) and also let the Weight#scorer // not get live docs as acceptedDocs Filter filterMe; if (random().nextBoolean()) { filterMe = SearchContext.current().filterCache().cache(rawFilterMe); } else { filterMe = rawFilterMe; } // Simulate a parent update if (random().nextBoolean()) { int numberOfUpdates = 1 + random().nextInt(TEST_NIGHTLY ? 25 : 5); for (int j = 0; j < numberOfUpdates; j++) { int parentId; do { parentId = random().nextInt(numParentDocs); } while (filteredOrDeletedDocs.contains(parentId)); String parentUid = Uid.createUid("parent", Integer.toString(parentId)); indexWriter.deleteDocuments(new Term(UidFieldMapper.NAME, parentUid)); Document document = new Document(); document.add(new StringField(UidFieldMapper.NAME, parentUid, Field.Store.YES)); document.add(new StringField(TypeFieldMapper.NAME, "parent", Field.Store.NO)); indexWriter.addDocument(document); } indexReader.close(); indexReader = DirectoryReader.open(indexWriter.w, true); searcher = new IndexSearcher(indexReader); engineSearcher = new Engine.SimpleSearcher( ChildrenConstantScoreQueryTests.class.getSimpleName(), searcher); ((TestSearchContext) SearchContext.current()) .setSearcher(new ContextIndexSearcher(SearchContext.current(), engineSearcher)); } String childValue = childValues[random().nextInt(numUniqueChildValues)]; TermQuery childQuery = new TermQuery(new Term("field1", childValue)); int shortCircuitParentDocSet = random().nextInt(numParentDocs); Filter nonNestedDocsFilter = random().nextBoolean() ? NonNestedDocsFilter.INSTANCE : null; Query query; if (random().nextBoolean()) { // Usage in HasChildQueryParser query = new ChildrenConstantScoreQuery( childQuery, "parent", "child", parentFilter, shortCircuitParentDocSet, nonNestedDocsFilter); } else { // Usage in HasChildFilterParser query = new XConstantScoreQuery( new CustomQueryWrappingFilter( new ChildrenConstantScoreQuery( childQuery, "parent", "child", parentFilter, shortCircuitParentDocSet, nonNestedDocsFilter))); } query = new XFilteredQuery(query, filterMe); BitSetCollector collector = new BitSetCollector(indexReader.maxDoc()); searcher.search(query, collector); FixedBitSet actualResult = collector.getResult(); FixedBitSet expectedResult = new FixedBitSet(indexReader.maxDoc()); if (childValueToParentIds.containsKey(childValue)) { AtomicReader slowAtomicReader = SlowCompositeReaderWrapper.wrap(indexReader); Terms terms = slowAtomicReader.terms(UidFieldMapper.NAME); if (terms != null) { NavigableSet<String> parentIds = childValueToParentIds.lget(); TermsEnum termsEnum = terms.iterator(null); DocsEnum docsEnum = null; for (String id : parentIds) { TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(Uid.createUidAsBytes("parent", id)); if (seekStatus == TermsEnum.SeekStatus.FOUND) { docsEnum = termsEnum.docs(slowAtomicReader.getLiveDocs(), docsEnum, DocsEnum.FLAG_NONE); expectedResult.set(docsEnum.nextDoc()); } else if (seekStatus == TermsEnum.SeekStatus.END) { break; } } } } assertBitSet(actualResult, expectedResult, searcher); } indexWriter.close(); indexReader.close(); directory.close(); }
/** * Assert that the {@code scoreType} operates as expected and parents are found in the expected * order. * * <p>This will use the test index's parent/child types to create parents with multiple children. * Each child will have a randomly generated scored stored in {@link #CHILD_SCORE_NAME}, which is * used to score based on the {@code scoreType} by using a {@link MockScorer} to determine the * expected scores. * * @param scoreType The score type to use within the query to score parents relative to their * children. * @throws IOException if any unexpected error occurs */ private void assertScoreType(ScoreType scoreType) throws IOException { SearchContext context = SearchContext.current(); Directory directory = newDirectory(); IndexWriter writer = new IndexWriter(directory, newIndexWriterConfig(new MockAnalyzer(random()))); // calculates the expected score per parent MockScorer scorer = new MockScorer(scoreType); scorer.scores = new FloatArrayList(10); // number of parents to generate int parentDocs = scaledRandomIntBetween(2, 10); // unique child ID int childDocId = 0; // Parent ID to expected score Map<String, Float> parentScores = new TreeMap<>(); // Add a few random parents to ensure that the children's score is appropriately taken into // account for (int parentDocId = 0; parentDocId < parentDocs; ++parentDocId) { String parent = Integer.toString(parentDocId); // Create the parent Document parentDocument = new Document(); parentDocument.add( new StringField(UidFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.YES)); parentDocument.add(new StringField(IdFieldMapper.NAME, parent, Field.Store.YES)); parentDocument.add(new StringField(TypeFieldMapper.NAME, "parent", Field.Store.NO)); // add the parent to the index writer.addDocument(parentDocument); int numChildDocs = scaledRandomIntBetween(1, 10); // forget any parent's previous scores scorer.scores.clear(); // associate children with the parent for (int i = 0; i < numChildDocs; ++i) { int childScore = random().nextInt(128); Document childDocument = new Document(); childDocument.add( new StringField( UidFieldMapper.NAME, Uid.createUid("child", Integer.toString(childDocId++)), Field.Store.NO)); childDocument.add(new StringField(TypeFieldMapper.NAME, "child", Field.Store.NO)); // parent association: childDocument.add( new StringField( ParentFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.NO)); childDocument.add(new DoubleField(CHILD_SCORE_NAME, childScore, Field.Store.NO)); // remember the score to be calculated scorer.scores.add(childScore); // add the associated child to the index writer.addDocument(childDocument); } // this score that should be returned for this parent parentScores.put(parent, scorer.score()); } writer.commit(); IndexReader reader = DirectoryReader.open(writer, true); IndexSearcher searcher = new IndexSearcher(reader); // setup to read the parent/child map Engine.Searcher engineSearcher = new Engine.Searcher(ChildrenQueryTests.class.getSimpleName(), searcher); ((TestSearchContext) context).setSearcher(new ContextIndexSearcher(context, engineSearcher)); // child query that returns the score as the value of "childScore" for each child document, with // the parent's score determined by the score type QueryBuilder childQueryBuilder = functionScoreQuery(typeFilter("child")) .add(new FieldValueFactorFunctionBuilder(CHILD_SCORE_NAME)); QueryBuilder queryBuilder = hasChildQuery("child", childQueryBuilder) .scoreType(scoreType.name().toLowerCase(Locale.ENGLISH)) .setShortCircuitCutoff(parentDocs); // 
// Perform the search for the documents using the selected score type
TopDocs docs = searcher.search(parseQuery(queryBuilder), parentDocs);
assertThat("Expected all parents", docs.totalHits, is(parentDocs));

// score should be descending (just a sanity check)
float topScore = docs.scoreDocs[0].score;

// ensure each score is returned as expected
for (int i = 0; i < parentDocs; ++i) {
  ScoreDoc scoreDoc = docs.scoreDocs[i];
  // get the ID from the document to get its expected score; remove it so we cannot double-count it
  float score = parentScores.remove(reader.document(scoreDoc.doc).get(IdFieldMapper.NAME));

  // expect exact match
  assertThat("Unexpected score", scoreDoc.score, is(score));
  assertThat("Not descending", score, lessThanOrEqualTo(topScore)); // it had better keep descending
  topScore = score;
}

reader.close();
writer.close();
directory.close();
}
@Test public void testRandom() throws Exception { Directory directory = newDirectory(); final Random r = random(); final IndexWriterConfig iwc = LuceneTestCase.newIndexWriterConfig(r, new MockAnalyzer(r)) .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH) .setRAMBufferSizeMB( scaledRandomIntBetween(16, 64)); // we might index a lot - don't go crazy here RandomIndexWriter indexWriter = new RandomIndexWriter(r, directory, iwc); int numUniqueChildValues = scaledRandomIntBetween(100, 2000); String[] childValues = new String[numUniqueChildValues]; for (int i = 0; i < numUniqueChildValues; i++) { childValues[i] = Integer.toString(i); } IntOpenHashSet filteredOrDeletedDocs = new IntOpenHashSet(); int childDocId = 0; int numParentDocs = scaledRandomIntBetween(1, numUniqueChildValues); ObjectObjectOpenHashMap<String, NavigableMap<String, FloatArrayList>> childValueToParentIds = new ObjectObjectOpenHashMap<>(); for (int parentDocId = 0; parentDocId < numParentDocs; parentDocId++) { boolean markParentAsDeleted = rarely(); boolean filterMe = rarely(); String parent = Integer.toString(parentDocId); Document document = new Document(); document.add( new StringField(UidFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.YES)); document.add(new StringField(TypeFieldMapper.NAME, "parent", Field.Store.NO)); if (markParentAsDeleted) { filteredOrDeletedDocs.add(parentDocId); document.add(new StringField("delete", "me", Field.Store.NO)); } if (filterMe) { filteredOrDeletedDocs.add(parentDocId); document.add(new StringField("filter", "me", Field.Store.NO)); } indexWriter.addDocument(document); int numChildDocs = scaledRandomIntBetween(0, 100); for (int i = 0; i < numChildDocs; i++) { boolean markChildAsDeleted = rarely(); String childValue = childValues[random().nextInt(childValues.length)]; document = new Document(); document.add( new StringField( UidFieldMapper.NAME, Uid.createUid("child", Integer.toString(childDocId++)), Field.Store.NO)); document.add(new StringField(TypeFieldMapper.NAME, "child", Field.Store.NO)); document.add( new StringField( ParentFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.NO)); document.add(new StringField("field1", childValue, Field.Store.NO)); if (markChildAsDeleted) { document.add(new StringField("delete", "me", Field.Store.NO)); } indexWriter.addDocument(document); if (!markChildAsDeleted) { NavigableMap<String, FloatArrayList> parentIdToChildScores; if (childValueToParentIds.containsKey(childValue)) { parentIdToChildScores = childValueToParentIds.lget(); } else { childValueToParentIds.put(childValue, parentIdToChildScores = new TreeMap<>()); } if (!markParentAsDeleted && !filterMe) { FloatArrayList childScores = parentIdToChildScores.get(parent); if (childScores == null) { parentIdToChildScores.put(parent, childScores = new FloatArrayList()); } childScores.add(1f); } } } } // Delete docs that are marked to be deleted. 
indexWriter.deleteDocuments(new Term("delete", "me")); indexWriter.commit(); IndexReader indexReader = DirectoryReader.open(directory); IndexSearcher searcher = new IndexSearcher(indexReader); Engine.Searcher engineSearcher = new Engine.Searcher(ChildrenQueryTests.class.getSimpleName(), searcher); ((TestSearchContext) SearchContext.current()) .setSearcher(new ContextIndexSearcher(SearchContext.current(), engineSearcher)); int max = numUniqueChildValues / 4; for (int i = 0; i < max; i++) { // Simulate a parent update if (random().nextBoolean()) { final int numberOfUpdatableParents = numParentDocs - filteredOrDeletedDocs.size(); int numberOfUpdates = RandomInts.randomIntBetween( random(), 0, Math.min(numberOfUpdatableParents, TEST_NIGHTLY ? 25 : 5)); for (int j = 0; j < numberOfUpdates; j++) { int parentId; do { parentId = random().nextInt(numParentDocs); } while (filteredOrDeletedDocs.contains(parentId)); String parentUid = Uid.createUid("parent", Integer.toString(parentId)); indexWriter.deleteDocuments(new Term(UidFieldMapper.NAME, parentUid)); Document document = new Document(); document.add(new StringField(UidFieldMapper.NAME, parentUid, Field.Store.YES)); document.add(new StringField(TypeFieldMapper.NAME, "parent", Field.Store.NO)); indexWriter.addDocument(document); } indexReader.close(); indexReader = DirectoryReader.open(indexWriter.w, true); searcher = new IndexSearcher(indexReader); engineSearcher = new Engine.Searcher(ChildrenConstantScoreQueryTests.class.getSimpleName(), searcher); ((TestSearchContext) SearchContext.current()) .setSearcher(new ContextIndexSearcher(SearchContext.current(), engineSearcher)); } String childValue = childValues[random().nextInt(numUniqueChildValues)]; int shortCircuitParentDocSet = random().nextInt(numParentDocs); ScoreType scoreType = ScoreType.values()[random().nextInt(ScoreType.values().length)]; // leave min/max set to 0 half the time int minChildren = random().nextInt(2) * scaledRandomIntBetween(0, 110); int maxChildren = random().nextInt(2) * scaledRandomIntBetween(minChildren, 110); QueryBuilder queryBuilder = hasChildQuery("child", constantScoreQuery(termQuery("field1", childValue))) .scoreType(scoreType.name().toLowerCase(Locale.ENGLISH)) .minChildren(minChildren) .maxChildren(maxChildren) .setShortCircuitCutoff(shortCircuitParentDocSet); // Using a FQ, will invoke / test the Scorer#advance(..) 
// and also let the Weight#scorer not get live docs as acceptedDocs
queryBuilder = filteredQuery(queryBuilder, notFilter(termFilter("filter", "me")));
Query query = parseQuery(queryBuilder);
BitSetCollector collector = new BitSetCollector(indexReader.maxDoc());
int numHits = 1 + random().nextInt(25);
TopScoreDocCollector actualTopDocsCollector = TopScoreDocCollector.create(numHits);
searcher.search(query, MultiCollector.wrap(collector, actualTopDocsCollector));
FixedBitSet actualResult = collector.getResult();

FixedBitSet expectedResult = new FixedBitSet(indexReader.maxDoc());
TopScoreDocCollector expectedTopDocsCollector = TopScoreDocCollector.create(numHits);
if (childValueToParentIds.containsKey(childValue)) {
  LeafReader slowLeafReader = SlowCompositeReaderWrapper.wrap(indexReader);
  final FloatArrayList[] scores = new FloatArrayList[slowLeafReader.maxDoc()];
  Terms terms = slowLeafReader.terms(UidFieldMapper.NAME);
  if (terms != null) {
    NavigableMap<String, FloatArrayList> parentIdToChildScores = childValueToParentIds.lget();
    TermsEnum termsEnum = terms.iterator(null);
    DocsEnum docsEnum = null;
    for (Map.Entry<String, FloatArrayList> entry : parentIdToChildScores.entrySet()) {
      int count = entry.getValue().elementsCount;
      if (count >= minChildren && (maxChildren == 0 || count <= maxChildren)) {
        TermsEnum.SeekStatus seekStatus =
            termsEnum.seekCeil(Uid.createUidAsBytes("parent", entry.getKey()));
        if (seekStatus == TermsEnum.SeekStatus.FOUND) {
          docsEnum = termsEnum.docs(slowLeafReader.getLiveDocs(), docsEnum, DocsEnum.FLAG_NONE);
          expectedResult.set(docsEnum.nextDoc());
          scores[docsEnum.docID()] = new FloatArrayList(entry.getValue());
        } else if (seekStatus == TermsEnum.SeekStatus.END) {
          break;
        }
      }
    }
  }

  MockScorer mockScorer = new MockScorer(scoreType);
  final LeafCollector leafCollector =
      expectedTopDocsCollector.getLeafCollector(slowLeafReader.getContext());
  leafCollector.setScorer(mockScorer);
  for (int doc = expectedResult.nextSetBit(0);
      doc < slowLeafReader.maxDoc();
      doc =
          doc + 1 >= expectedResult.length()
              ? DocIdSetIterator.NO_MORE_DOCS
              : expectedResult.nextSetBit(doc + 1)) {
    mockScorer.scores = scores[doc];
    leafCollector.collect(doc);
  }
}

assertBitSet(actualResult, expectedResult, searcher);
assertTopDocs(actualTopDocsCollector.topDocs(), expectedTopDocsCollector.topDocs());
}

indexWriter.close();
indexReader.close();
directory.close();
}
// Test scores with one field with Term Freqs and one without, otherwise with equal content public void testBasic() throws Exception { Directory dir = newDirectory(); Analyzer analyzer = new MockAnalyzer(random()); IndexWriter writer = new IndexWriter( dir, newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer) .setMaxBufferedDocs(2) .setSimilarity(new SimpleSimilarity()) .setMergePolicy(newLogMergePolicy(2))); StringBuilder sb = new StringBuilder(265); String term = "term"; for (int i = 0; i < 30; i++) { Document d = new Document(); sb.append(term).append(" "); String content = sb.toString(); Field noTf = newField("noTf", content + (i % 2 == 0 ? "" : " notf"), omitType); d.add(noTf); Field tf = newField("tf", content + (i % 2 == 0 ? " tf" : ""), normalType); d.add(tf); writer.addDocument(d); // System.out.println(d); } writer.forceMerge(1); // flush writer.close(); /* * Verify the index */ IndexReader reader = DirectoryReader.open(dir); IndexSearcher searcher = newSearcher(reader); searcher.setSimilarity(new SimpleSimilarity()); Term a = new Term("noTf", term); Term b = new Term("tf", term); Term c = new Term("noTf", "notf"); Term d = new Term("tf", "tf"); TermQuery q1 = new TermQuery(a); TermQuery q2 = new TermQuery(b); TermQuery q3 = new TermQuery(c); TermQuery q4 = new TermQuery(d); PhraseQuery pq = new PhraseQuery(); pq.add(a); pq.add(c); try { searcher.search(pq, 10); fail("did not hit expected exception"); } catch (Exception e) { Throwable cause = e; // If the searcher uses an executor service, the IAE is wrapped into other exceptions while (cause.getCause() != null) { cause = cause.getCause(); } assertTrue("Expected an IAE, got " + cause, cause instanceof IllegalStateException); } searcher.search( q1, new CountingHitCollector() { private Scorer scorer; @Override public final void setScorer(Scorer scorer) { this.scorer = scorer; } @Override public final void collect(int doc) throws IOException { // System.out.println("Q1: Doc=" + doc + " score=" + score); float score = scorer.score(); assertTrue("got score=" + score, score == 1.0f); super.collect(doc); } }); // System.out.println(CountingHitCollector.getCount()); searcher.search( q2, new CountingHitCollector() { private Scorer scorer; @Override public final void setScorer(Scorer scorer) { this.scorer = scorer; } @Override public final void collect(int doc) throws IOException { // System.out.println("Q2: Doc=" + doc + " score=" + score); float score = scorer.score(); assertEquals(1.0f + doc, score, 0.00001f); super.collect(doc); } }); // System.out.println(CountingHitCollector.getCount()); searcher.search( q3, new CountingHitCollector() { private Scorer scorer; @Override public final void setScorer(Scorer scorer) { this.scorer = scorer; } @Override public final void collect(int doc) throws IOException { // System.out.println("Q1: Doc=" + doc + " score=" + score); float score = scorer.score(); assertTrue(score == 1.0f); assertFalse(doc % 2 == 0); super.collect(doc); } }); // System.out.println(CountingHitCollector.getCount()); searcher.search( q4, new CountingHitCollector() { private Scorer scorer; @Override public final void setScorer(Scorer scorer) { this.scorer = scorer; } @Override public final void collect(int doc) throws IOException { float score = scorer.score(); // System.out.println("Q1: Doc=" + doc + " score=" + score); assertTrue(score == 1.0f); assertTrue(doc % 2 == 0); super.collect(doc); } }); // System.out.println(CountingHitCollector.getCount()); BooleanQuery bq = new BooleanQuery(); bq.add(q1, Occur.MUST); bq.add(q4, 
Occur.MUST);

searcher.search(
    bq,
    new CountingHitCollector() {
      @Override
      public final void collect(int doc) throws IOException {
        // System.out.println("BQ: Doc=" + doc + " score=" + score);
        super.collect(doc);
      }
    });
assertEquals(15, CountingHitCollector.getCount());

reader.close();
dir.close();
}
/**
 * Set the maximum percentage of documents in which a word may still appear. Words that appear
 * in more than this percentage of all docs will be ignored.
 *
 * @param maxPercentage the maximum percentage of documents (0-100) that a term may appear in to
 *     be still considered relevant
 */
public void setMaxDocFreqPct(int maxPercentage) {
  this.maxDocFreq = maxPercentage * ir.numDocs() / 100;
}
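/*
 * Worked example of the percentage-to-count conversion performed above (values are
 * illustrative, not from the source): for an index with numDocs == 10_000, a 5% threshold
 * yields maxDocFreq == 500, so createQueue(..) will skip any term appearing in more than 500
 * documents as too common to be interesting.
 */
static int maxDocFreqForPct(int numDocs, int maxPercentage) {
  return maxPercentage * numDocs / 100; // e.g. 5% of 10_000 docs -> 500
}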
public static void main(String[] args) throws IOException { IndexReader reader = null; /* * Opening the index first simplifies the processing of the * rest of the command line arguments. */ for (int i = 0; i < args.length; i++) { if (("-index".equals(args[i])) && ((i + 1) < args.length)) { reader = DirectoryReader.open(FSDirectory.open(new File(args[i + 1]))); if (reader == null) { System.err.println("Error: Can't open index " + args[i + 1]); System.exit(1); } ; break; } ; } ; if (reader == null) { System.err.println(usage); System.exit(1); } ; /* * Process the command line arguments sequentially. */ for (int i = 0; i < args.length; i++) { if ("-index".equals(args[i])) { /* * Handled in the previous loop, so just skip the argument. */ i++; } else if ("-list-edocid".equals(args[i])) { System.out.println("-list-edocid:"); if ((i + 1) >= args.length) { System.out.println(usage); break; } ; Document d = reader.document(Integer.parseInt(args[i + 1])); System.out.println( "Internal docid --> External docid: " + args[i + 1] + " --> " + d.get("externalId")); i += 1; } else if ("-list-docids".equals(args[i])) { System.out.println("-list-docids:"); for (int j = 0; j < reader.numDocs(); j++) { Document d = reader.document(j); System.out.println("Internal --> external docid: " + j + " --> " + d.get("externalId")); } ; } else if ("-list-fields".equals(args[i])) { Fields fields = MultiFields.getFields(reader); System.out.print("\nNumber of fields: "); if (fields == null) System.out.println("0"); else { System.out.println(fields.size()); Iterator<String> is = fields.iterator(); while (is.hasNext()) { System.out.println("\t" + is.next()); } ; } ; } else if ("-list-postings".equals(args[i])) { if ((i + 2) >= args.length) { System.out.println(usage); break; } ; listPostings(reader, args[i + 1], args[i + 2], Integer.MAX_VALUE); i += 2; } else if ("-list-postings-sample".equals(args[i])) { if ((i + 2) >= args.length) { System.out.println(usage); break; } ; listPostings(reader, args[i + 1], args[i + 2], 5); i += 2; } else if ("-list-stats".equals(args[i])) { System.out.println("Corpus statistics:"); System.out.println("\tnumdocs\t\t" + reader.numDocs()); System.out.println( "\turl:\t" + "\tnumdocs=" + reader.getDocCount("url") + "\tsumTotalTF=" + reader.getSumTotalTermFreq("url") + "\tavglen=" + reader.getSumTotalTermFreq("url") / (float) reader.getDocCount("url")); System.out.println( "\tkeywords:" + "\tnumdocs=" + reader.getDocCount("keywords") + "\tsumTotalTF=" + reader.getSumTotalTermFreq("keywords") + "\tavglen=" + reader.getSumTotalTermFreq("keywords") / (float) reader.getDocCount("keywords")); System.out.println( "\ttitle:\t" + "\tnumdocs=" + reader.getDocCount("title") + "\tsumTotalTF=" + reader.getSumTotalTermFreq("title") + "\tavglen=" + reader.getSumTotalTermFreq("title") / (float) reader.getDocCount("title")); System.out.println( "\tbody:\t" + "\tnumdocs=" + reader.getDocCount("body") + "\tsumTotalTF=" + reader.getSumTotalTermFreq("body") + "\tavglen=" + reader.getSumTotalTermFreq("body") / (float) reader.getDocCount("body")); System.out.println( "\tinlink:\t" + "\tnumdocs=" + reader.getDocCount("inlink") + "\tsumTotalTF=" + reader.getSumTotalTermFreq("inlink") + "\tavglen=" + reader.getSumTotalTermFreq("inlink") / (float) reader.getDocCount("inlink")); } else if ("-list-terms".equals(args[i])) { if ((i + 1) >= args.length) { System.out.println(usage); break; } ; listTermDictionary(reader, args[i + 1]); i += 1; } else if ("-list-termvector".equals(args[i])) { if ((i + 1) >= args.length) { 
System.out.println(usage);
  break;
}

listTermVectors(reader, args[i + 1]);
i += 1;
} else if ("-list-termvector-field".equals(args[i])) {
  if ((i + 2) >= args.length) {
    System.out.println(usage);
    break;
  }

  listTermVectorField(reader, args[i + 1], args[i + 2]);
  i += 2;
} else {
  System.err.println("\nWarning: Unknown argument " + args[i] + " ignored.");
}
}

/*
 * Close the index and exit gracefully.
 */
reader.close();
}
@Test public void testRandom() throws Exception { Directory directory = newDirectory(); RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory); int numUniqueChildValues = 1 + random().nextInt(TEST_NIGHTLY ? 6000 : 600); String[] childValues = new String[numUniqueChildValues]; for (int i = 0; i < numUniqueChildValues; i++) { childValues[i] = Integer.toString(i); } int childDocId = 0; int numParentDocs = 1 + random().nextInt(TEST_NIGHTLY ? 20000 : 1000); ObjectObjectOpenHashMap<String, NavigableMap<String, FloatArrayList>> childValueToParentIds = new ObjectObjectOpenHashMap<String, NavigableMap<String, FloatArrayList>>(); for (int parentDocId = 0; parentDocId < numParentDocs; parentDocId++) { boolean markParentAsDeleted = rarely(); String parent = Integer.toString(parentDocId); Document document = new Document(); document.add( new StringField(UidFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.YES)); document.add(new StringField(TypeFieldMapper.NAME, "parent", Field.Store.NO)); if (markParentAsDeleted) { document.add(new StringField("delete", "me", Field.Store.NO)); } indexWriter.addDocument(document); int numChildDocs = random().nextInt(TEST_NIGHTLY ? 100 : 25); for (int i = 0; i < numChildDocs; i++) { boolean markChildAsDeleted = rarely(); String childValue = childValues[random().nextInt(childValues.length)]; document = new Document(); document.add( new StringField( UidFieldMapper.NAME, Uid.createUid("child", Integer.toString(childDocId)), Field.Store.NO)); document.add(new StringField(TypeFieldMapper.NAME, "child", Field.Store.NO)); document.add( new StringField( ParentFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.NO)); document.add(new StringField("field1", childValue, Field.Store.NO)); if (markChildAsDeleted) { document.add(new StringField("delete", "me", Field.Store.NO)); } indexWriter.addDocument(document); if (!markChildAsDeleted) { NavigableMap<String, FloatArrayList> parentIdToChildScores; if (childValueToParentIds.containsKey(childValue)) { parentIdToChildScores = childValueToParentIds.lget(); } else { childValueToParentIds.put( childValue, parentIdToChildScores = new TreeMap<String, FloatArrayList>()); } if (!markParentAsDeleted) { FloatArrayList childScores = parentIdToChildScores.get(parent); if (childScores == null) { parentIdToChildScores.put(parent, childScores = new FloatArrayList()); } childScores.add(1f); } } } } // Delete docs that are marked to be deleted. 
indexWriter.deleteDocuments(new Term("delete", "me")); indexWriter.close(); IndexReader indexReader = DirectoryReader.open(directory); IndexSearcher searcher = new IndexSearcher(indexReader); Engine.Searcher engineSearcher = new Engine.SimpleSearcher(ChildrenQueryTests.class.getSimpleName(), searcher); ((TestSearchContext) SearchContext.current()) .setSearcher(new ContextIndexSearcher(SearchContext.current(), engineSearcher)); TermFilter parentFilter = new TermFilter(new Term(TypeFieldMapper.NAME, "parent")); for (String childValue : childValues) { Query childQuery = new ConstantScoreQuery(new TermQuery(new Term("field1", childValue))); int shortCircuitParentDocSet = random().nextInt(numParentDocs); ScoreType scoreType = ScoreType.values()[random().nextInt(ScoreType.values().length)]; Query query = new ChildrenQuery( "parent", "child", parentFilter, childQuery, scoreType, shortCircuitParentDocSet); BitSetCollector collector = new BitSetCollector(indexReader.maxDoc()); int numHits = 1 + random().nextInt(25); TopScoreDocCollector actualTopDocsCollector = TopScoreDocCollector.create(numHits, false); searcher.search(query, MultiCollector.wrap(collector, actualTopDocsCollector)); FixedBitSet actualResult = collector.getResult(); FixedBitSet expectedResult = new FixedBitSet(indexReader.maxDoc()); MockScorer mockScorer = new MockScorer(scoreType); TopScoreDocCollector expectedTopDocsCollector = TopScoreDocCollector.create(numHits, false); expectedTopDocsCollector.setScorer(mockScorer); if (childValueToParentIds.containsKey(childValue)) { AtomicReader slowAtomicReader = SlowCompositeReaderWrapper.wrap(indexReader); Terms terms = slowAtomicReader.terms(UidFieldMapper.NAME); if (terms != null) { NavigableMap<String, FloatArrayList> parentIdToChildScores = childValueToParentIds.lget(); TermsEnum termsEnum = terms.iterator(null); DocsEnum docsEnum = null; for (Map.Entry<String, FloatArrayList> entry : parentIdToChildScores.entrySet()) { TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(Uid.createUidAsBytes("parent", entry.getKey())); if (seekStatus == TermsEnum.SeekStatus.FOUND) { docsEnum = termsEnum.docs(slowAtomicReader.getLiveDocs(), docsEnum, DocsEnum.FLAG_NONE); expectedResult.set(docsEnum.nextDoc()); mockScorer.scores = entry.getValue(); expectedTopDocsCollector.collect(docsEnum.docID()); } else if (seekStatus == TermsEnum.SeekStatus.END) { break; } } } } assertBitSet(actualResult, expectedResult, searcher); assertTopDocs(actualTopDocsCollector.topDocs(), expectedTopDocsCollector.topDocs()); } indexReader.close(); directory.close(); }
/** * Searches pages using a particular combination of flags. * * @param query The query to perform in Lucene query language * @param flags A set of flags * @return A Collection of SearchResult instances * @throws ProviderException if there is a problem with the backend */ public Collection findPages(String query, int flags) throws ProviderException { IndexSearcher searcher = null; ArrayList<SearchResult> list = null; Highlighter highlighter = null; try { String[] queryfields = { LUCENE_PAGE_CONTENTS, LUCENE_PAGE_NAME, LUCENE_AUTHOR, LUCENE_ATTACHMENTS }; QueryParser qp = new MultiFieldQueryParser(Version.LUCENE_36, queryfields, getLuceneAnalyzer()); // QueryParser qp = new QueryParser( LUCENE_PAGE_CONTENTS, getLuceneAnalyzer() ); Query luceneQuery = qp.parse(query); if ((flags & FLAG_CONTEXTS) != 0) { highlighter = new Highlighter( new SimpleHTMLFormatter("<span class=\"searchmatch\">", "</span>"), new SimpleHTMLEncoder(), new QueryScorer(luceneQuery)); } try { File dir = new File(m_luceneDirectory); Directory luceneDir = new SimpleFSDirectory(dir, null); IndexReader reader = IndexReader.open(luceneDir); searcher = new IndexSearcher(reader); } catch (Exception ex) { log.info("Lucene not yet ready; indexing not started", ex); return null; } ScoreDoc[] hits = searcher.search(luceneQuery, MAX_SEARCH_HITS).scoreDocs; list = new ArrayList<SearchResult>(hits.length); for (int curr = 0; curr < hits.length; curr++) { int docID = hits[curr].doc; Document doc = searcher.doc(docID); String pageName = doc.get(LUCENE_ID); WikiPage page = m_engine.getPage(pageName, WikiPageProvider.LATEST_VERSION); if (page != null) { if (page instanceof Attachment) { // Currently attachments don't look nice on the search-results page // When the search-results are cleaned up this can be enabled again. } int score = (int) (hits[curr].score * 100); // Get highlighted search contexts String text = doc.get(LUCENE_PAGE_CONTENTS); String[] fragments = new String[0]; if (text != null && highlighter != null) { TokenStream tokenStream = getLuceneAnalyzer().tokenStream(LUCENE_PAGE_CONTENTS, new StringReader(text)); fragments = highlighter.getBestFragments(tokenStream, text, MAX_FRAGMENTS); } SearchResult result = new SearchResultImpl(page, score, fragments); list.add(result); } else { log.error( "Lucene found a result page '" + pageName + "' that could not be loaded, removing from Lucene cache"); pageRemoved(new WikiPage(m_engine, pageName)); } } } catch (IOException e) { log.error("Failed during lucene search", e); } catch (ParseException e) { log.info("Broken query; cannot parse query ", e); throw new ProviderException( "You have entered a query Lucene cannot process: " + e.getMessage()); } catch (InvalidTokenOffsetsException e) { log.error("Tokens are incompatible with provided text ", e); } finally { if (searcher != null) { try { searcher.close(); } catch (IOException e) { log.error(e); } } } return list; }
// private static int[] oldToNew(IndexReader reader, Searcher searcher) throws IOException {
private static DocScore[] newToOld(IndexReader reader, Searcher searcher) throws IOException {
  int readerMax = reader.maxDoc();
  DocScore[] newToOld = new DocScore[readerMax];

  // use site, an indexed, un-tokenized field to get boost
  // byte[] boosts = reader.norms("site"); TODO MC

  /* TODO MC */
  Document docMeta;
  Pattern includes = Pattern.compile("\\|");
  String value = NutchConfiguration.create().get(INCLUDE_EXTENSIONS_KEY, "");
  String[] includeExtensions = includes.split(value);
  Hashtable<String, Boolean> validExtensions = new Hashtable<String, Boolean>();
  for (int i = 0; i < includeExtensions.length; i++) {
    validExtensions.put(includeExtensions[i], true);
    System.out.println("extension boosted " + includeExtensions[i]);
  }
  /* TODO MC */

  for (int oldDoc = 0; oldDoc < readerMax; oldDoc++) {
    float score;
    if (reader.isDeleted(oldDoc)) {
      // score = 0.0f;
      score = -1f; // TODO MC
    } else {
      // score = Similarity.decodeNorm(boosts[oldDoc]); TODO MC
      /* TODO MC */
      docMeta = searcher.doc(oldDoc);
      if (validExtensions.get(docMeta.get("subType")) == null) {
        // searched extensions will have higher scores
        score = -0.5f;
      } else {
        score = Integer.parseInt(docMeta.get("inlinks"));
        /*
        if (score == 0) {
          score = 0.001f; // TODO MC - to not erase
        }
        */
      }
      /* TODO MC */
      // System.out.println("Score for old document " + oldDoc + " is " + score + " and type "
      //     + docMeta.get("subType")); // TODO MC debug remove
    }
    DocScore docScore = new DocScore();
    docScore.doc = oldDoc;
    docScore.score = score;
    newToOld[oldDoc] = docScore;
  }

  System.out.println("Sorting " + newToOld.length + " documents.");
  Arrays.sort(newToOld);
  // HeapSorter.sort(newToOld); // TODO MC - due to the lack of space

  /* TODO MC
  int[] oldToNew = new int[readerMax];
  for (int newDoc = 0; newDoc < readerMax; newDoc++) {
    DocScore docScore = newToOld[newDoc];
    // oldToNew[docScore.oldDoc] = docScore.score > 0.0f ? newDoc : -1; // TODO MC
    oldToNew[docScore.oldDoc] = newDoc; // TODO MC
  }
  */

  /* TODO MC
  for (int newDoc = 0; newDoc < readerMax; newDoc++) {
    DocScore docScore = newToOld[newDoc];
    System.out.println("Score for new document " + newDoc + " is " + docScore.score); // TODO MC debug remove
  }
  TODO MC */

  // return oldToNew; TODO MC
  return newToOld; // TODO MC
}
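/*
 * Arrays.sort(newToOld) above only works because DocScore is Comparable; the class itself is
 * not shown in this section. Below is a minimal sketch consistent with the usage above (doc
 * and score fields, set directly). The descending order by score, so that the best-scoring
 * documents come first in the sorted index, is an assumption, not taken from the source.
 */
class DocScore implements Comparable<DocScore> {
  int doc; // the old (pre-sort) Lucene document id
  float score; // boost score derived from the inlink count / extension rules above

  @Override
  public int compareTo(DocScore other) {
    // higher scores sort first (descending)
    return Float.compare(other.score, this.score);
  }
}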