private void pruneSimilar(TopDocs docs) throws IOException { if (docs.scoreDocs.length == 0) { return; } int cutoff = docs.scoreDocs.length; double threshold = 0.005 * docs.scoreDocs[0].score; for (int i = 0, j = 100; j < docs.scoreDocs.length; i++, j++) { float delta = docs.scoreDocs[i].score - docs.scoreDocs[j].score; if (delta < threshold) { cutoff = j; break; } } if (cutoff < docs.scoreDocs.length) { // LOG.info("pruned results from " + docs.scoreDocs.length + " to " + cutoff); docs.scoreDocs = ArrayUtils.subarray(docs.scoreDocs, 0, cutoff); } }
/** * Accumulates groups for the BlockJoinQuery specified by its slot. * * @param slot Search query's slot * @param offset Parent docs offset * @param maxDocsPerGroup Upper bound of documents per group number * @param withinGroupOffset Offset within each group of child docs * @param withinGroupSort Sort criteria within groups * @param fillSortFields Specifies whether to add sort fields or not * @return TopGroups for the query specified by slot * @throws IOException if there is a low-level I/O error */ @SuppressWarnings({"unchecked", "rawtypes"}) private TopGroups<Integer> accumulateGroups( int slot, int offset, int maxDocsPerGroup, int withinGroupOffset, Sort withinGroupSort, boolean fillSortFields) throws IOException { final GroupDocs<Integer>[] groups = new GroupDocs[sortedGroups.length - offset]; final FakeScorer fakeScorer = new FakeScorer(); int totalGroupedHitCount = 0; // System.out.println("slot=" + slot); for (int groupIDX = offset; groupIDX < sortedGroups.length; groupIDX++) { final OneGroup og = sortedGroups[groupIDX]; final int numChildDocs; if (slot == -1 || slot >= og.counts.length) { numChildDocs = 0; } else { numChildDocs = og.counts[slot]; } // Number of documents in group should be bounded to prevent redundant memory allocation final int numDocsInGroup = Math.max(1, Math.min(numChildDocs, maxDocsPerGroup)); // System.out.println("parent doc=" + og.doc + " numChildDocs=" + numChildDocs + " maxDocsPG=" // + maxDocsPerGroup); // At this point we hold all docs w/ in each group, // unsorted; we now sort them: final TopDocsCollector<?> collector; if (withinGroupSort == null) { // System.out.println("sort by score"); // Sort by score if (!trackScores) { throw new IllegalArgumentException( "cannot sort by relevance within group: trackScores=false"); } collector = TopScoreDocCollector.create(numDocsInGroup, true); } else { // Sort by fields collector = TopFieldCollector.create( withinGroupSort, numDocsInGroup, fillSortFields, trackScores, trackMaxScore, true); } collector.setScorer(fakeScorer); collector.setNextReader(og.readerContext); for (int docIDX = 0; docIDX < numChildDocs; docIDX++) { // System.out.println("docIDX=" + docIDX + " vs " + og.docs[slot].length); final int doc = og.docs[slot][docIDX]; fakeScorer.doc = doc; if (trackScores) { fakeScorer.score = og.scores[slot][docIDX]; } collector.collect(doc); } totalGroupedHitCount += numChildDocs; final Object[] groupSortValues; if (fillSortFields) { groupSortValues = new Object[comparators.length]; for (int sortFieldIDX = 0; sortFieldIDX < comparators.length; sortFieldIDX++) { groupSortValues[sortFieldIDX] = comparators[sortFieldIDX].value(og.slot); } } else { groupSortValues = null; } final TopDocs topDocs = collector.topDocs(withinGroupOffset, numDocsInGroup); groups[groupIDX - offset] = new GroupDocs<>( og.score, topDocs.getMaxScore(), numChildDocs, topDocs.scoreDocs, og.doc, groupSortValues); } return new TopGroups<>( new TopGroups<>( sort.getSort(), withinGroupSort == null ? null : withinGroupSort.getSort(), 0, totalGroupedHitCount, groups, maxScore), totalHitCount); }
/** * @param scrollSort Whether to ignore the from and sort all hits in each shard result. Only used * for scroll search * @param resultsArr Shard result holder */ public ScoreDoc[] sortDocs( boolean scrollSort, AtomicArray<? extends QuerySearchResultProvider> resultsArr) throws IOException { List<? extends AtomicArray.Entry<? extends QuerySearchResultProvider>> results = resultsArr.asList(); if (results.isEmpty()) { return EMPTY_DOCS; } if (optimizeSingleShard) { boolean canOptimize = false; QuerySearchResult result = null; int shardIndex = -1; if (results.size() == 1) { canOptimize = true; result = results.get(0).value.queryResult(); shardIndex = results.get(0).index; } else { // lets see if we only got hits from a single shard, if so, we can optimize... for (AtomicArray.Entry<? extends QuerySearchResultProvider> entry : results) { if (entry.value.queryResult().topDocs().scoreDocs.length > 0) { if (result != null) { // we already have one, can't really optimize canOptimize = false; break; } canOptimize = true; result = entry.value.queryResult(); shardIndex = entry.index; } } } if (canOptimize) { int offset = result.from(); if (scrollSort) { offset = 0; } ScoreDoc[] scoreDocs = result.topDocs().scoreDocs; if (scoreDocs.length == 0 || scoreDocs.length < offset) { return EMPTY_DOCS; } int resultDocsSize = result.size(); if ((scoreDocs.length - offset) < resultDocsSize) { resultDocsSize = scoreDocs.length - offset; } ScoreDoc[] docs = new ScoreDoc[resultDocsSize]; for (int i = 0; i < resultDocsSize; i++) { ScoreDoc scoreDoc = scoreDocs[offset + i]; scoreDoc.shardIndex = shardIndex; docs[i] = scoreDoc; } return docs; } } @SuppressWarnings("unchecked") AtomicArray.Entry<? extends QuerySearchResultProvider>[] sortedResults = results.toArray(new AtomicArray.Entry[results.size()]); Arrays.sort(sortedResults, QUERY_RESULT_ORDERING); QuerySearchResultProvider firstResult = sortedResults[0].value; final Sort sort; if (firstResult.queryResult().topDocs() instanceof TopFieldDocs) { TopFieldDocs firstTopDocs = (TopFieldDocs) firstResult.queryResult().topDocs(); sort = new Sort(firstTopDocs.fields); } else { sort = null; } int topN = firstResult.queryResult().size(); // Need to use the length of the resultsArr array, since the slots will be based on the position // in the resultsArr array TopDocs[] shardTopDocs = new TopDocs[resultsArr.length()]; if (firstResult.includeFetch()) { // if we did both query and fetch on the same go, we have fetched all the docs from each // shards already, use them... // this is also important since we shortcut and fetch only docs from "from" and up to "size" topN *= sortedResults.length; } for (AtomicArray.Entry<? extends QuerySearchResultProvider> sortedResult : sortedResults) { TopDocs topDocs = sortedResult.value.queryResult().topDocs(); // the 'index' field is the position in the resultsArr atomic array shardTopDocs[sortedResult.index] = topDocs; } int from = firstResult.queryResult().from(); if (scrollSort) { from = 0; } // TopDocs#merge can't deal with null shard TopDocs for (int i = 0; i < shardTopDocs.length; i++) { if (shardTopDocs[i] == null) { shardTopDocs[i] = Lucene.EMPTY_TOP_DOCS; } } TopDocs mergedTopDocs = TopDocs.merge(sort, from, topN, shardTopDocs); return mergedTopDocs.scoreDocs; }