public TIntDoubleHashMap getConceptVector(String phrase, TIntSet validIds) throws IOException {
  // return the cached concept vector if we have already computed one for this phrase
  synchronized (phraseCache) {
    if (phraseCache.containsKey(phrase)) {
      return phraseCache.get(phrase);
    }
  }
  QueryParser parser = new QueryParser(Version.LUCENE_42, "text", analyzer);
  TopDocs docs = null;
  try {
    // search the ESA index for articles matching the phrase, restricted to validIds
    docs = searcher.search(parser.parse(phrase), esaHelper.getWpIdFilter(validIds), 5000);
  } catch (org.apache.lucene.queryparser.classic.ParseException e) {
    LOG.log(Level.WARNING, "parsing of phrase " + phrase + " failed", e);
    return null;
  }
  pruneSimilar(docs);
  TIntDoubleHashMap result = expandScores(docs.scoreDocs);
  synchronized (phraseCache) {
    phraseCache.put(phrase, result);
  }
  return result;
  // System.out.println("top docs for " + phrase + " are:");
  // for (int i = 0; i < 50 && i < docs.scoreDocs.length; i++) {
  //   ScoreDoc sd = docs.scoreDocs[i];
  //   Document d = reader.document(sd.doc);
  //   System.out.println("\t" + sd.score + ": " + d.get("title")
  //       + ", " + d.get("text").split("\\s+").length
  //       + ", " + d.get("inlinks"));
  // }
}
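// A minimal sketch, not part of the original class: two concept vectors returned by
// getConceptVector(...) are sparse Wikipedia-id -> score maps (Trove TIntDoubleHashMap), so they
// can be compared with cosine similarity. The helper name "cosineSimilarity" is my own.
private static double cosineSimilarity(TIntDoubleHashMap v1, TIntDoubleHashMap v2) {
  if (v1 == null || v2 == null || v1.size() == 0 || v2.size() == 0) {
    return 0.0;
  }
  double dot = 0.0, norm1 = 0.0, norm2 = 0.0;
  for (int id : v1.keys()) {
    double x = v1.get(id);
    norm1 += x * x;
    if (v2.containsKey(id)) {
      dot += x * v2.get(id); // only overlapping concepts contribute to the dot product
    }
  }
  for (int id : v2.keys()) {
    double y = v2.get(id);
    norm2 += y * y;
  }
  return dot / (Math.sqrt(norm1) * Math.sqrt(norm2));
}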
@Override
public DocScoreList mostSimilar(int wpId, int maxResults, TIntSet validIds) throws IOException {
  if (hasCachedMostSimilar(wpId)) {
    return getCachedMostSimilar(wpId, maxResults, validIds);
  }
  MoreLikeThis mlt = getMoreLikeThis();
  int luceneId = esaHelper.wpIdToLuceneId(wpId);
  Query query;
  if (luceneId >= 0) {
    // the article is in the ESA index, so build the query directly from its Lucene document
    query = mlt.like(luceneId);
  } else if (textHelper != null && textHelper.wpIdToLuceneId(wpId) >= 0) {
    // fall back to the text index and build the query from the article's raw text
    Document d = textHelper.wpIdToLuceneDoc(wpId);
    String text = d.get(Page.FIELD_TEXT);
    query = mlt.like(new StringReader(text), Page.FIELD_TEXT);
  } else {
    return null;
  }
  TopDocs similarDocs = searcher.search(query, esaHelper.getWpIdFilter(validIds), maxResults);
  pruneSimilar(similarDocs);
  DocScoreList scores = new DocScoreList(similarDocs.scoreDocs.length);
  for (int i = 0; i < similarDocs.scoreDocs.length; i++) {
    ScoreDoc sd = similarDocs.scoreDocs[i];
    scores.set(i, esaHelper.luceneIdToWpId(sd.doc), sd.score);
  }
  return normalize(scores);
}
public double singleSearch(int docNum)
    throws IOException, InstantiationException, IllegalAccessException {
  IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));

  // Read the hash-based query string and the CEDD feature of the query document.
  String query = reader.document(docNum).getValues("hash")[0];
  CEDD ceddQuery = new CEDD();
  ceddQuery.setByteArrayRepresentation(
      reader.document(docNum).getField(DocumentBuilder.FIELD_NAME_CEDD).binaryValue().bytes,
      reader.document(docNum).getField(DocumentBuilder.FIELD_NAME_CEDD).binaryValue().offset,
      reader.document(docNum).getField(DocumentBuilder.FIELD_NAME_CEDD).binaryValue().length);

  // Build the gold standard: the top 10 results of an exhaustive CEDD search.
  HashSet<String> gold = new HashSet<String>(numImagesEval);
  ImageSearcher cis = ImageSearcherFactory.createCEDDImageSearcher(100);
  ImageSearchHits hits = cis.search(reader.document(docNum), reader);
  for (int i = 0; i < 10; i++) {
    gold.add(hits.doc(i).getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]);
  }

  // Hash-based search with a constant similarity, then re-rank the candidates by CEDD distance.
  IndexSearcher searcher = new IndexSearcher(reader);
  searcher.setSimilarity(
      new SimilarityBase() {
        @Override
        protected float score(BasicStats basicStats, float freq, float v2) {
          return 1;
        }

        @Override
        public String toString() {
          return null;
        }
      });
  TopDocs topDocs = searcher.search(createQuery(query), 500);
  topDocs = rerank(topDocs, ceddQuery, reader);
  // System.out.println("topDocs.scoreDocs.length = " + topDocs.scoreDocs.length);

  // Count how many of the gold results also appear in the re-ranked hash-based results.
  double numMatches = 0;
  for (int i = 0; i < topDocs.scoreDocs.length; i++) {
    ScoreDoc scoreDoc = topDocs.scoreDocs[i];
    // System.out.print(scoreDoc.score + ": ");
    String file =
        reader.document(scoreDoc.doc).getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0];
    // System.out.println(file.substring(file.lastIndexOf('/') + 1)
    //     + (gold.contains(file) ? " x" : " o"));
    if (gold.contains(file)) numMatches++;
  }
  return numMatches;
}
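// A hedged sketch, not in the original benchmark: singleSearch(docNum) returns how many of the
// 10 exhaustive-CEDD results were recovered by the hash-based search, so averaging it over a
// number of query documents gives a mean recall@10 figure. The method name and the "numQueries"
// parameter are assumptions for illustration.
public double meanRecallAtTen(int numQueries)
    throws IOException, InstantiationException, IllegalAccessException {
  double sum = 0;
  for (int docNum = 0; docNum < numQueries; docNum++) {
    sum += singleSearch(docNum) / 10.0; // 10 = size of the gold set built in singleSearch
  }
  return sum / numQueries;
}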
@Override
public double similarity(int wpId1, int wpId2) throws IOException {
  int doc1 = esaHelper.wpIdToLuceneId(wpId1);
  int doc2 = esaHelper.wpIdToLuceneId(wpId2);
  if (doc1 < 0 || doc2 < 0) {
    return normalize(0.0);
  }
  MoreLikeThis mlt = getMoreLikeThis();
  // restrict the MoreLikeThis query for doc1 to the single document whose "id" field is wpId2
  TopDocs similarDocs =
      searcher.search(mlt.like(doc1), new FieldCacheTermsFilter("id", "" + wpId2), 1);
  if (similarDocs.scoreDocs.length == 0) {
    return normalize(0);
  } else {
    assert (similarDocs.scoreDocs.length == 1);
    assert (similarDocs.scoreDocs[0].doc == doc2);
    return normalize(similarDocs.scoreDocs[0].score);
  }
}
public void testOutputSearchResults()
    throws IOException, InstantiationException, IllegalAccessException {
  IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));
  int docNum = 0; // doc to search for.

  // Read the hash-based query string and the CEDD feature of the query document.
  String query = reader.document(docNum).getValues("hash")[0];
  CEDD ceddQuery = new CEDD();
  ceddQuery.setByteArrayRepresentation(
      reader.document(docNum).getField(DocumentBuilder.FIELD_NAME_CEDD).binaryValue().bytes,
      reader.document(docNum).getField(DocumentBuilder.FIELD_NAME_CEDD).binaryValue().offset,
      reader.document(docNum).getField(DocumentBuilder.FIELD_NAME_CEDD).binaryValue().length);

  // Run the hash-based query and save the result images to a PNG for visual inspection.
  IndexSearcher searcher = new IndexSearcher(reader);
  TopDocs topDocs = searcher.search(createQuery(query), numImagesEval);
  FileUtils.saveImageResultsToPng(
      "result_lsh",
      topDocs,
      reader.document(docNum).getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0],
      reader);
}
/**
 * Assert that the {@code scoreType} operates as expected and parents are found in the expected
 * order.
 *
 * <p>This will use the test index's parent/child types to create parents with multiple children.
 * Each child will have a randomly generated score stored in {@link #CHILD_SCORE_NAME}, which is
 * used to score based on the {@code scoreType} by using a {@link MockScorer} to determine the
 * expected scores.
 *
 * @param scoreType The score type to use within the query to score parents relative to their
 *     children.
 * @throws IOException if any unexpected error occurs
 */
private void assertScoreType(ScoreType scoreType) throws IOException {
  SearchContext context = SearchContext.current();
  Directory directory = newDirectory();
  IndexWriter writer =
      new IndexWriter(directory, newIndexWriterConfig(new MockAnalyzer(random())));

  // calculates the expected score per parent
  MockScorer scorer = new MockScorer(scoreType);
  scorer.scores = new FloatArrayList(10);

  // number of parents to generate
  int parentDocs = scaledRandomIntBetween(2, 10);
  // unique child ID
  int childDocId = 0;

  // parent ID to expected score
  Map<String, Float> parentScores = new TreeMap<>();

  // Add a few random parents to ensure that the children's score is appropriately taken into
  // account
  for (int parentDocId = 0; parentDocId < parentDocs; ++parentDocId) {
    String parent = Integer.toString(parentDocId);

    // create the parent
    Document parentDocument = new Document();
    parentDocument.add(
        new StringField(UidFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.YES));
    parentDocument.add(new StringField(IdFieldMapper.NAME, parent, Field.Store.YES));
    parentDocument.add(new StringField(TypeFieldMapper.NAME, "parent", Field.Store.NO));

    // add the parent to the index
    writer.addDocument(parentDocument);

    int numChildDocs = scaledRandomIntBetween(1, 10);

    // forget any previous parent's scores
    scorer.scores.clear();

    // associate children with the parent
    for (int i = 0; i < numChildDocs; ++i) {
      int childScore = random().nextInt(128);

      Document childDocument = new Document();
      childDocument.add(
          new StringField(
              UidFieldMapper.NAME,
              Uid.createUid("child", Integer.toString(childDocId++)),
              Field.Store.NO));
      childDocument.add(new StringField(TypeFieldMapper.NAME, "child", Field.Store.NO));
      // parent association:
      childDocument.add(
          new StringField(
              ParentFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.NO));
      childDocument.add(new DoubleField(CHILD_SCORE_NAME, childScore, Field.Store.NO));

      // remember the score to be calculated
      scorer.scores.add(childScore);

      // add the associated child to the index
      writer.addDocument(childDocument);
    }

    // the score that should be returned for this parent
    parentScores.put(parent, scorer.score());
  }

  writer.commit();

  IndexReader reader = DirectoryReader.open(writer, true);
  IndexSearcher searcher = new IndexSearcher(reader);

  // set up to read the parent/child map
  Engine.Searcher engineSearcher =
      new Engine.Searcher(ChildrenQueryTests.class.getSimpleName(), searcher);
  ((TestSearchContext) context).setSearcher(new ContextIndexSearcher(context, engineSearcher));

  // child query that returns the score as the value of "childScore" for each child document,
  // with the parent's score determined by the score type
  QueryBuilder childQueryBuilder =
      functionScoreQuery(typeFilter("child"))
          .add(new FieldValueFactorFunctionBuilder(CHILD_SCORE_NAME));
  QueryBuilder queryBuilder =
      hasChildQuery("child", childQueryBuilder)
          .scoreType(scoreType.name().toLowerCase(Locale.ENGLISH))
          .setShortCircuitCutoff(parentDocs);
  // perform the search for the documents using the selected score type
  TopDocs docs = searcher.search(parseQuery(queryBuilder), parentDocs);
  assertThat("Expected all parents", docs.totalHits, is(parentDocs));

  // score should be descending (just a sanity check)
  float topScore = docs.scoreDocs[0].score;

  // ensure each score is returned as expected
  for (int i = 0; i < parentDocs; ++i) {
    ScoreDoc scoreDoc = docs.scoreDocs[i];
    // get the ID from the document to get its expected score; remove it so we cannot
    // double-count it
    float score = parentScores.remove(reader.document(scoreDoc.doc).get(IdFieldMapper.NAME));

    // expect an exact match
    assertThat("Unexpected score", scoreDoc.score, is(score));
    assertThat("Not descending", score, lessThanOrEqualTo(topScore));

    // it had better keep descending
    topScore = score;
  }

  reader.close();
  writer.close();
  directory.close();
}
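// A hedged sketch, not the MockScorer used above: it only illustrates the aggregation that the
// expected parent score relies on. Elasticsearch's ScoreType enum includes at least MAX, SUM and
// AVG; the helper name and its String-based dispatch are assumptions for illustration.
static float expectedParentScore(String scoreType, float[] childScores) {
  float max = Float.NEGATIVE_INFINITY;
  float sum = 0f;
  for (float s : childScores) {
    max = Math.max(max, s);
    sum += s;
  }
  switch (scoreType) {
    case "MAX":
      return max;
    case "SUM":
      return sum;
    case "AVG":
      return sum / childScores.length;
    default:
      throw new IllegalArgumentException("unsupported score type: " + scoreType);
  }
}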
@Test
public void testRandom() throws Exception {
  Directory directory = newDirectory();
  final Random r = random();
  final IndexWriterConfig iwc =
      LuceneTestCase.newIndexWriterConfig(r, new MockAnalyzer(r))
          .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
          .setRAMBufferSizeMB(
              scaledRandomIntBetween(16, 64)); // we might index a lot - don't go crazy here
  RandomIndexWriter indexWriter = new RandomIndexWriter(r, directory, iwc);
  int numUniqueChildValues = scaledRandomIntBetween(100, 2000);
  String[] childValues = new String[numUniqueChildValues];
  for (int i = 0; i < numUniqueChildValues; i++) {
    childValues[i] = Integer.toString(i);
  }

  IntOpenHashSet filteredOrDeletedDocs = new IntOpenHashSet();

  int childDocId = 0;
  int numParentDocs = scaledRandomIntBetween(1, numUniqueChildValues);
  ObjectObjectOpenHashMap<String, NavigableMap<String, FloatArrayList>> childValueToParentIds =
      new ObjectObjectOpenHashMap<>();
  for (int parentDocId = 0; parentDocId < numParentDocs; parentDocId++) {
    boolean markParentAsDeleted = rarely();
    boolean filterMe = rarely();
    String parent = Integer.toString(parentDocId);
    Document document = new Document();
    document.add(
        new StringField(UidFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.YES));
    document.add(new StringField(TypeFieldMapper.NAME, "parent", Field.Store.NO));
    if (markParentAsDeleted) {
      filteredOrDeletedDocs.add(parentDocId);
      document.add(new StringField("delete", "me", Field.Store.NO));
    }
    if (filterMe) {
      filteredOrDeletedDocs.add(parentDocId);
      document.add(new StringField("filter", "me", Field.Store.NO));
    }
    indexWriter.addDocument(document);

    int numChildDocs = scaledRandomIntBetween(0, 100);
    for (int i = 0; i < numChildDocs; i++) {
      boolean markChildAsDeleted = rarely();
      String childValue = childValues[random().nextInt(childValues.length)];

      document = new Document();
      document.add(
          new StringField(
              UidFieldMapper.NAME,
              Uid.createUid("child", Integer.toString(childDocId++)),
              Field.Store.NO));
      document.add(new StringField(TypeFieldMapper.NAME, "child", Field.Store.NO));
      document.add(
          new StringField(
              ParentFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.NO));
      document.add(new StringField("field1", childValue, Field.Store.NO));
      if (markChildAsDeleted) {
        document.add(new StringField("delete", "me", Field.Store.NO));
      }
      indexWriter.addDocument(document);

      if (!markChildAsDeleted) {
        NavigableMap<String, FloatArrayList> parentIdToChildScores;
        if (childValueToParentIds.containsKey(childValue)) {
          parentIdToChildScores = childValueToParentIds.lget();
        } else {
          childValueToParentIds.put(childValue, parentIdToChildScores = new TreeMap<>());
        }
        if (!markParentAsDeleted && !filterMe) {
          FloatArrayList childScores = parentIdToChildScores.get(parent);
          if (childScores == null) {
            parentIdToChildScores.put(parent, childScores = new FloatArrayList());
          }
          childScores.add(1f);
        }
      }
    }
  }

  // Delete docs that are marked to be deleted.
  indexWriter.deleteDocuments(new Term("delete", "me"));
  indexWriter.commit();

  IndexReader indexReader = DirectoryReader.open(directory);
  IndexSearcher searcher = new IndexSearcher(indexReader);
  Engine.Searcher engineSearcher =
      new Engine.Searcher(ChildrenQueryTests.class.getSimpleName(), searcher);
  ((TestSearchContext) SearchContext.current())
      .setSearcher(new ContextIndexSearcher(SearchContext.current(), engineSearcher));

  int max = numUniqueChildValues / 4;
  for (int i = 0; i < max; i++) {
    // Simulate a parent update
    if (random().nextBoolean()) {
      final int numberOfUpdatableParents = numParentDocs - filteredOrDeletedDocs.size();
      int numberOfUpdates =
          RandomInts.randomIntBetween(
              random(), 0, Math.min(numberOfUpdatableParents, TEST_NIGHTLY ? 25 : 5));
      for (int j = 0; j < numberOfUpdates; j++) {
        int parentId;
        do {
          parentId = random().nextInt(numParentDocs);
        } while (filteredOrDeletedDocs.contains(parentId));

        String parentUid = Uid.createUid("parent", Integer.toString(parentId));
        indexWriter.deleteDocuments(new Term(UidFieldMapper.NAME, parentUid));

        Document document = new Document();
        document.add(new StringField(UidFieldMapper.NAME, parentUid, Field.Store.YES));
        document.add(new StringField(TypeFieldMapper.NAME, "parent", Field.Store.NO));
        indexWriter.addDocument(document);
      }

      indexReader.close();
      indexReader = DirectoryReader.open(indexWriter.w, true);
      searcher = new IndexSearcher(indexReader);
      engineSearcher =
          new Engine.Searcher(ChildrenConstantScoreQueryTests.class.getSimpleName(), searcher);
      ((TestSearchContext) SearchContext.current())
          .setSearcher(new ContextIndexSearcher(SearchContext.current(), engineSearcher));
    }

    String childValue = childValues[random().nextInt(numUniqueChildValues)];
    int shortCircuitParentDocSet = random().nextInt(numParentDocs);
    ScoreType scoreType = ScoreType.values()[random().nextInt(ScoreType.values().length)];
    // leave min/max set to 0 half the time
    int minChildren = random().nextInt(2) * scaledRandomIntBetween(0, 110);
    int maxChildren = random().nextInt(2) * scaledRandomIntBetween(minChildren, 110);

    QueryBuilder queryBuilder =
        hasChildQuery("child", constantScoreQuery(termQuery("field1", childValue)))
            .scoreType(scoreType.name().toLowerCase(Locale.ENGLISH))
            .minChildren(minChildren)
            .maxChildren(maxChildren)
            .setShortCircuitCutoff(shortCircuitParentDocSet);
    // Using a FQ will invoke / test the Scorer#advance(..) and also let the Weight#scorer
    // not get live docs as acceptedDocs
    queryBuilder = filteredQuery(queryBuilder, notFilter(termFilter("filter", "me")));
    Query query = parseQuery(queryBuilder);
    BitSetCollector collector = new BitSetCollector(indexReader.maxDoc());
    int numHits = 1 + random().nextInt(25);
    TopScoreDocCollector actualTopDocsCollector = TopScoreDocCollector.create(numHits);
    searcher.search(query, MultiCollector.wrap(collector, actualTopDocsCollector));
    FixedBitSet actualResult = collector.getResult();

    FixedBitSet expectedResult = new FixedBitSet(indexReader.maxDoc());
    TopScoreDocCollector expectedTopDocsCollector = TopScoreDocCollector.create(numHits);
    if (childValueToParentIds.containsKey(childValue)) {
      LeafReader slowLeafReader = SlowCompositeReaderWrapper.wrap(indexReader);
      final FloatArrayList[] scores = new FloatArrayList[slowLeafReader.maxDoc()];
      Terms terms = slowLeafReader.terms(UidFieldMapper.NAME);
      if (terms != null) {
        NavigableMap<String, FloatArrayList> parentIdToChildScores = childValueToParentIds.lget();
        TermsEnum termsEnum = terms.iterator(null);
        DocsEnum docsEnum = null;
        for (Map.Entry<String, FloatArrayList> entry : parentIdToChildScores.entrySet()) {
          int count = entry.getValue().elementsCount;
          if (count >= minChildren && (maxChildren == 0 || count <= maxChildren)) {
            TermsEnum.SeekStatus seekStatus =
                termsEnum.seekCeil(Uid.createUidAsBytes("parent", entry.getKey()));
            if (seekStatus == TermsEnum.SeekStatus.FOUND) {
              docsEnum =
                  termsEnum.docs(slowLeafReader.getLiveDocs(), docsEnum, DocsEnum.FLAG_NONE);
              expectedResult.set(docsEnum.nextDoc());
              scores[docsEnum.docID()] = new FloatArrayList(entry.getValue());
            } else if (seekStatus == TermsEnum.SeekStatus.END) {
              break;
            }
          }
        }
      }
      MockScorer mockScorer = new MockScorer(scoreType);
      final LeafCollector leafCollector =
          expectedTopDocsCollector.getLeafCollector(slowLeafReader.getContext());
      leafCollector.setScorer(mockScorer);
      for (int doc = expectedResult.nextSetBit(0);
          doc < slowLeafReader.maxDoc();
          doc =
              doc + 1 >= expectedResult.length()
                  ? DocIdSetIterator.NO_MORE_DOCS
                  : expectedResult.nextSetBit(doc + 1)) {
        mockScorer.scores = scores[doc];
        leafCollector.collect(doc);
      }
    }

    assertBitSet(actualResult, expectedResult, searcher);
    assertTopDocs(actualTopDocsCollector.topDocs(), expectedTopDocsCollector.topDocs());
  }

  indexWriter.close();
  indexReader.close();
  directory.close();
}
/**
 * Searches pages using a particular combination of flags.
 *
 * @param query The query to perform in Lucene query language
 * @param flags A set of flags
 * @return A Collection of SearchResult instances
 * @throws ProviderException if there is a problem with the backend
 */
public Collection findPages(String query, int flags) throws ProviderException {
  IndexSearcher searcher = null;
  ArrayList<SearchResult> list = null;
  Highlighter highlighter = null;

  try {
    String[] queryfields = {
      LUCENE_PAGE_CONTENTS, LUCENE_PAGE_NAME, LUCENE_AUTHOR, LUCENE_ATTACHMENTS
    };
    QueryParser qp =
        new MultiFieldQueryParser(Version.LUCENE_36, queryfields, getLuceneAnalyzer());
    // QueryParser qp = new QueryParser( LUCENE_PAGE_CONTENTS, getLuceneAnalyzer() );
    Query luceneQuery = qp.parse(query);

    if ((flags & FLAG_CONTEXTS) != 0) {
      highlighter =
          new Highlighter(
              new SimpleHTMLFormatter("<span class=\"searchmatch\">", "</span>"),
              new SimpleHTMLEncoder(),
              new QueryScorer(luceneQuery));
    }

    try {
      File dir = new File(m_luceneDirectory);
      Directory luceneDir = new SimpleFSDirectory(dir, null);
      IndexReader reader = IndexReader.open(luceneDir);
      searcher = new IndexSearcher(reader);
    } catch (Exception ex) {
      log.info("Lucene not yet ready; indexing not started", ex);
      return null;
    }

    ScoreDoc[] hits = searcher.search(luceneQuery, MAX_SEARCH_HITS).scoreDocs;

    list = new ArrayList<SearchResult>(hits.length);
    for (int curr = 0; curr < hits.length; curr++) {
      int docID = hits[curr].doc;
      Document doc = searcher.doc(docID);
      String pageName = doc.get(LUCENE_ID);
      WikiPage page = m_engine.getPage(pageName, WikiPageProvider.LATEST_VERSION);

      if (page != null) {
        if (page instanceof Attachment) {
          // Currently attachments don't look nice on the search-results page
          // When the search-results are cleaned up this can be enabled again.
        }

        int score = (int) (hits[curr].score * 100);

        // Get highlighted search contexts
        String text = doc.get(LUCENE_PAGE_CONTENTS);

        String[] fragments = new String[0];
        if (text != null && highlighter != null) {
          TokenStream tokenStream =
              getLuceneAnalyzer().tokenStream(LUCENE_PAGE_CONTENTS, new StringReader(text));
          fragments = highlighter.getBestFragments(tokenStream, text, MAX_FRAGMENTS);
        }

        SearchResult result = new SearchResultImpl(page, score, fragments);
        list.add(result);
      } else {
        log.error(
            "Lucene found a result page '"
                + pageName
                + "' that could not be loaded, removing from Lucene cache");
        pageRemoved(new WikiPage(m_engine, pageName));
      }
    }
  } catch (IOException e) {
    log.error("Failed during lucene search", e);
  } catch (ParseException e) {
    log.info("Broken query; cannot parse query ", e);
    throw new ProviderException(
        "You have entered a query Lucene cannot process: " + e.getMessage());
  } catch (InvalidTokenOffsetsException e) {
    log.error("Tokens are incompatible with provided text ", e);
  } finally {
    if (searcher != null) {
      try {
        searcher.close();
      } catch (IOException e) {
        log.error(e);
      }
    }
  }

  return list;
}
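// A minimal usage sketch, not part of the provider: iterate the Collection returned by
// findPages(...). In JSPWiki a SearchResult exposes getPage(), getScore() and getContexts(); the
// printSearchResults name is my own, and FLAG_CONTEXTS is the flag checked above to enable
// highlighted fragments.
public void printSearchResults(String query) throws ProviderException {
  Collection results = findPages(query, FLAG_CONTEXTS);
  if (results == null) {
    return; // index not ready yet
  }
  for (Object o : results) {
    SearchResult r = (SearchResult) o;
    System.out.println(r.getPage().getName() + " (score " + r.getScore() + ")");
    for (String context : r.getContexts()) {
      System.out.println("  ..." + context + "...");
    }
  }
}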