/**
 * Sets the boost of each termClaimsDescriptionAbstractTitle: boost = weight = factor * (tf*idf).
 *
 * @param vecsTerms    term vectors of the documents, keyed by vector with the source field as value
 * @param currentField field the generated term queries will target
 * @param factor       adjustment factor (e.g. alpha or beta)
 * @param decayFactor  decay applied per document position in the iteration
 * @return map from term text to its boosted TermQuery
 * @throws java.io.IOException
 */
public Map<String, TermQuery> setBoost(
    Map<TermFreqVector, String> vecsTerms, String currentField, float factor, float decayFactor)
    throws IOException {
  Map<String, TermQuery> terms = new HashMap<>();
  // setBoost for each of the terms of each of the docs
  int i = 0;
  float norm = (float) 1 / vecsTerms.size();
  for (Map.Entry<TermFreqVector, String> e : vecsTerms.entrySet()) {
    // Increase decay with each document
    String field = e.getValue();
    TermFreqVector docTerms = e.getKey();
    float decay = decayFactor * i;
    // Populate terms: with TermQueries and set boost
    for (String termTxt : docTerms.getTerms()) {
      // Create Term
      Term term = new Term(currentField, termTxt);
      // Calculate weight
      float tf = docTerms.getFreq(termTxt);
      int docs;
      float idf;
      if (sourceField.equals(PatentQuery.all)) {
        docs = ir.getDocCount(field);
        idf = (float) Math.log10((double) docs / (ir.docFreq(new Term(field, termTxt)) + 1));
      } else {
        docs = ir.getDocCount(sourceField);
        idf = (float) Math.log10((double) docs / (ir.docFreq(new Term(sourceField, termTxt)) + 1));
      }
      float weight = tf * idf;
      // Adjust weight by decay factor
      weight = weight - (weight * decay);
      // Create TermQuery and add it to the collection
      TermQuery termQuery = new TermQuery(term);
      // Calculate and set boost. Note: the decayed tf*idf weight computed above is not folded
      // into the boost below; only factor (and tf when a single vector is passed) is used.
      float boost;
      if (vecsTerms.size() == 1) {
        boost = factor * tf;
      } else {
        boost = factor;
      }
      if (boost != 0) {
        termQuery.setBoost(boost * norm);
        if (terms.containsKey(termTxt)) {
          TermQuery tq = terms.get(termTxt);
          tq.setBoost(tq.getBoost() + termQuery.getBoost());
        } else {
          terms.put(termTxt, termQuery);
        }
      }
    }
    i++;
  }
  return terms;
}
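In formula form, the per-term weight computed inside the loop above (a sketch of the code's own arithmetic, where i is the document's position in the iteration order, N the document count of the field, and df_t the term's document frequency) is:

\[
\mathrm{tf}_{t,d} = f_{t,d}, \qquad
\mathrm{idf}_t = \log_{10}\frac{N}{\mathrm{df}_t + 1}, \qquad
w_{t,d} = \mathrm{tf}_{t,d}\,\mathrm{idf}_t\,(1 - \text{decayFactor}\cdot i)
\]

The boost actually assigned is \(\text{factor}\cdot\mathrm{tf}\cdot\frac{1}{|V|}\) when a single term vector is passed and \(\text{factor}\cdot\frac{1}{|V|}\) otherwise, accumulated over vectors containing the term; the decayed \(w_{t,d}\) is computed but not applied.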
public void handleRequest(SolrQueryRequest req, SolrQueryResponse rsp) {
  numRequests++;
  Query query = null;
  Filter filter = null;
  List<String> commands = StrUtils.splitSmart(req.getQueryString(), ';');
  String qs = commands.size() >= 1 ? commands.get(0) : "";
  query = QueryParsing.parseQuery(qs, req.getSchema());
  // If the first non-query, non-filter command is a simple sort on an indexed field, then
  // we can use the Lucene sort ability.
  Sort sort = null;
  if (commands.size() >= 2) {
    sort = QueryParsing.parseSort(commands.get(1), req);
  }
  try {
    int numHits;
    ScoreDoc[] scoreDocs;
    if (sort != null) {
      TopFieldDocs hits =
          req.getSearcher().search(query, filter, req.getStart() + req.getLimit(), sort);
      scoreDocs = hits.scoreDocs;
      numHits = hits.totalHits;
    } else {
      TopDocs hits = req.getSearcher().search(query, filter, req.getStart() + req.getLimit());
      scoreDocs = hits.scoreDocs;
      numHits = hits.totalHits;
    }
    int startRow = Math.min(numHits, req.getStart());
    int endRow = Math.min(numHits, req.getStart() + req.getLimit());
    int numRows = endRow - startRow;
    int[] ids = new int[numRows];
    Document[] data = new Document[numRows];
    for (int i = startRow; i < endRow; i++) {
      // The arrays hold only the requested page, so re-base the index at startRow
      ids[i - startRow] = scoreDocs[i].doc;
      data[i - startRow] = req.getSearcher().doc(ids[i - startRow]);
    }
    rsp.add(null, new DocSlice(0, numRows, ids, null, numHits, 0.0f));
    // rsp.setResults(new DocSlice(0, numRows, ids, null, numHits));
    // Setting the actual document objects is optional:
    // rsp.setResults(data);
  } catch (IOException e) {
    rsp.setException(e);
    numErrors++;
    return;
  }
}
public ArrayList<String> filterSearchQueryString(ArrayList<String> qstring) throws IOException {
  Term vword = new Term("");
  int docFreq = indexReader.docFreq(vword);
  int numDocs = indexReader.maxDoc();
  // The idf is computed here but not yet used; the query strings are currently returned unfiltered.
  float idf = (float) Math.log(numDocs / (double) (docFreq + 1) + 1.0);
  return qstring;
}
/** Builds a message pinpointing the first position at which two strings differ. */
private String msg(String left, String right) {
  int size = Math.min(left.length(), right.length());
  StringBuilder builder = new StringBuilder("size: " + left.length() + " vs. " + right.length());
  builder.append(" content: <<");
  for (int i = 0; i < size; i++) {
    if (left.charAt(i) == right.charAt(i)) {
      builder.append(left.charAt(i));
    } else {
      builder
          .append(">> ")
          .append("until offset: ")
          .append(i)
          .append(" [")
          .append(left.charAt(i))
          .append(" vs.")
          .append(right.charAt(i))
          .append("] [")
          .append((int) left.charAt(i))
          .append(" vs.")
          .append((int) right.charAt(i))
          .append(']');
      return builder.toString();
    }
  }
  if (left.length() != right.length()) {
    int leftEnd = Math.max(size, left.length()) - 1;
    int rightEnd = Math.max(size, right.length()) - 1;
    builder
        .append(">> ")
        .append("until offset: ")
        .append(size)
        .append(" [")
        .append(left.charAt(leftEnd))
        .append(" vs.")
        .append(right.charAt(rightEnd))
        .append("] [")
        .append((int) left.charAt(leftEnd))
        .append(" vs.")
        .append((int) right.charAt(rightEnd))
        .append(']');
    return builder.toString();
  }
  return "";
}
/**
 * Merges <code>termClaimsDescriptionAbstractTitleQueries</code> into a single query. In the
 * future this method should probably live in the <code>Query</code> class. The old approach was
 * awkward: the only available merge method was mergeBooleanQueries, which meant building a string
 * of the form termClaimsDescriptionAbstractTitle1^boost1, termClaimsDescriptionAbstractTitle2^boost2
 * and parsing it back into a query. Here the term queries are simply added as optional clauses of
 * a BooleanQuery.
 *
 * @param termQueries term queries to merge
 * @param maxTerms    maximum number of term queries to include
 * @return query created from termClaimsDescriptionAbstractTitleQueries including boost parameters
 * @throws org.apache.lucene.queryparser.classic.ParseException
 */
public Query mergeQueries(List<TermQuery> termQueries, int maxTerms) throws ParseException {
  BooleanQuery query = new BooleanQuery();
  // Select only the first maxTerms terms
  int termCount = Math.min(termQueries.size(), maxTerms);
  for (int i = 0; i < termCount; i++) {
    TermQuery termQuery = termQueries.get(i);
    query.add(termQuery, BooleanClause.Occur.SHOULD);
  }
  return query;
}
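A minimal standalone sketch of the same merge, assuming the Lucene 4.x line of APIs used throughout these snippets (concrete BooleanQuery, Query.setBoost); the field and term names are made up for illustration:

import java.util.Arrays;
import java.util.List;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.TermQuery;

public class MergeQueriesDemo {
  public static void main(String[] args) {
    // Hypothetical boosted term queries, e.g. produced by setBoost()
    TermQuery engine = new TermQuery(new Term("title", "engine"));
    engine.setBoost(2.0f);
    TermQuery turbine = new TermQuery(new Term("title", "turbine"));
    turbine.setBoost(1.5f);
    List<TermQuery> termQueries = Arrays.asList(engine, turbine);

    // Same merge as mergeQueries(): each term query becomes an optional (SHOULD) clause
    BooleanQuery merged = new BooleanQuery();
    for (TermQuery tq : termQueries) {
      merged.add(tq, BooleanClause.Occur.SHOULD);
    }
    // Prints something like: title:engine^2.0 title:turbine^1.5
    System.out.println(merged.toString());
  }
}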
/**
 * Adjusts the termClaimsDescriptionAbstractTitle features of the docs using alpha (original query
 * terms), beta (relevant-document terms) and gamma (irrelevant-document terms), and assigns
 * weights/boosts to the termClaimsDescriptionAbstractTitles (tf*idf).
 *
 * @param query                 original query
 * @param currentField          field the expanded terms will target
 * @param alpha                 weight of the original query terms
 * @param beta                  weight of the relevant-document terms
 * @param gamma                 weight of the irrelevant-document terms
 * @param decay                 per-document decay factor
 * @param maxExpandedQueryTerms maximum number of termClaimsDescriptionAbstractTitles in the
 *                              expanded query
 * @return expandedQuery with boost factors adjusted using Rocchio's algorithm
 * @throws IOException
 * @throws ParseException
 */
public Query adjust(
    Query query,
    String currentField,
    float alpha,
    float beta,
    float gamma,
    float decay,
    int maxExpandedQueryTerms)
    throws IOException, ParseException {
  Query expandedQuery;
  // setBoost of docs terms
  Map<String, TermQuery> relevantDocsTerms =
      setBoost(docsTermVectorReldocs, currentField, beta, decay);
  Map<String, TermQuery> irrelevantDocsTerms =
      setBoost(docsTermVectorIrreldocs, currentField, gamma, decay);
  // Combine document-term weights according to the expansion formula
  List<TermQuery> expandedQueryTerms =
      combine(new HashMap<String, TermQuery>(), relevantDocsTerms, irrelevantDocsTerms);
  // Sort by boost=weight
  Comparator comparator = new QueryBoostComparator();
  Collections.sort(expandedQueryTerms, comparator);
  // Keep only the top maxExpandedQueryTerms expansion terms
  relevantDocsTerms.clear();
  int termCount = Math.min(expandedQueryTerms.size(), maxExpandedQueryTerms);
  for (int i = 0; i < termCount; i++) {
    TermQuery tq = expandedQueryTerms.get(i);
    relevantDocsTerms.put(tq.getTerm().text(), tq);
    System.out.print(tq.getTerm().text() + ", ");
  }
  // setBoost of query terms
  TermFreqVector queryTermsVector = new TermFreqVector(query);
  Map<String, TermQuery> queryTerms = setBoost(queryTermsVector, currentField, alpha);
  // Combine the original query terms with the selected expansion terms
  expandedQueryTerms = combine(queryTerms, relevantDocsTerms, new HashMap<String, TermQuery>());
  Collections.sort(expandedQueryTerms, comparator);
  // Create expanded query
  expandedQuery = mergeQueries(expandedQueryTerms, Integer.MAX_VALUE);
  return expandedQuery;
}
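For reference, the classical Rocchio expansion that this method follows (a sketch; in the code the per-document tf*idf weighting and decay are supplied by setBoost, and the top-ranked expansion terms are then recombined with the alpha-weighted query terms):

\[
\vec{q}_{\text{new}} \;=\; \alpha\,\vec{q} \;+\; \beta\,\frac{1}{|D_r|}\sum_{\vec{d}_j \in D_r}\vec{d}_j \;-\; \gamma\,\frac{1}{|D_{nr}|}\sum_{\vec{d}_j \in D_{nr}}\vec{d}_j
\]

where \(D_r\) is the set of relevant documents and \(D_{nr}\) the set of irrelevant documents.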
/**
 * Create a PriorityQueue from a word->tf map.
 *
 * @param words a map of words keyed on the word(String) with Int objects as the values.
 * @param fieldNames an array of field names to override defaults.
 */
private PriorityQueue<ScoreTerm> createQueue(Map<String, Int> words, String... fieldNames)
    throws IOException {
  // have collected all words in doc and their freqs
  int numDocs = ir.numDocs();
  final int limit = Math.min(maxQueryTerms, words.size());
  FreqQ queue = new FreqQ(limit); // will order words by score
  for (String word : words.keySet()) { // for every word
    int tf = words.get(word).x; // term freq in the source doc
    if (minTermFreq > 0 && tf < minTermFreq) {
      continue; // filter out words that don't occur enough times in the source
    }
    // go through all the fields and find the largest document frequency
    String topField = fieldNames[0];
    int docFreq = 0;
    for (String fieldName : fieldNames) {
      int freq = ir.docFreq(new Term(fieldName, word));
      topField = (freq > docFreq) ? fieldName : topField;
      docFreq = (freq > docFreq) ? freq : docFreq;
    }
    if (minDocFreq > 0 && docFreq < minDocFreq) {
      continue; // filter out words that don't occur in enough docs
    }
    if (docFreq > maxDocFreq) {
      continue; // filter out words that occur in too many docs
    }
    if (docFreq == 0) {
      continue; // index update problem?
    }
    float idf = similarity.idf(docFreq, numDocs);
    float score = tf * idf;
    if (queue.size() < limit) {
      // there is still space in the queue
      queue.add(new ScoreTerm(word, topField, score, idf, docFreq, tf));
    } else {
      ScoreTerm term = queue.top();
      if (term.score < score) {
        // update the smallest in the queue in place and update the queue.
        term.update(word, topField, score, idf, docFreq, tf);
        queue.updateTop();
      }
    }
  }
  return queue;
}
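The per-term ranking score is plain tf·idf. As a worked example, assuming the configured similarity is Lucene's classic TF-IDF (whose idf is \(1 + \ln\frac{N}{\mathrm{df}+1}\)), with \(N = 1000\), \(\mathrm{df} = 10\) and \(\mathrm{tf} = 3\):

\[
\mathrm{idf} = 1 + \ln\frac{1000}{11} \approx 5.51, \qquad \text{score} = 3 \times 5.51 \approx 16.5
\]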
@Override
public DocScoreList mostSimilar(String phrase, int maxResults, TIntSet validIds)
    throws IOException {
  final TIntDoubleHashMap scores = getConceptVector(phrase, validIds);
  Integer[] luceneIds = ArrayUtils.toObject(scores.keys());
  // Sort Lucene ids by descending concept score
  Arrays.sort(
      luceneIds,
      new Comparator<Integer>() {
        @Override
        public int compare(Integer id1, Integer id2) {
          return -1 * new Double(scores.get(id1)).compareTo(scores.get(id2));
        }
      });
  DocScoreList result = new DocScoreList(Math.min(luceneIds.length, maxResults));
  for (int i = 0; i < result.numDocs(); i++) {
    result.set(i, esaHelper.luceneIdToWpId(luceneIds[i]), scores.get(luceneIds[i]));
  }
  return normalize(result);
}
private TopDocs rerank(TopDocs docs, LireFeature feature, IndexReader reader)
    throws IOException, IllegalAccessException, InstantiationException {
  LireFeature tmp = new CEDD();
  ArrayList<ScoreDoc> res = new ArrayList<ScoreDoc>(docs.scoreDocs.length);
  float maxScore = 0f;
  for (int i = 0; i < docs.scoreDocs.length; i++) {
    // Re-read the stored CEDD feature of each hit and score it as 1 / distance to the query feature
    BytesRef featureBytes =
        reader
            .document(docs.scoreDocs[i].doc)
            .getField(DocumentBuilder.FIELD_NAME_CEDD)
            .binaryValue();
    tmp.setByteArrayRepresentation(featureBytes.bytes, featureBytes.offset, featureBytes.length);
    float score = 1 / tmp.getDistance(feature);
    maxScore = Math.max(score, maxScore);
    res.add(new ScoreDoc(docs.scoreDocs[i].doc, score));
  }
  // sort res by descending score
  Collections.sort(
      res,
      new Comparator<ScoreDoc>() {
        @Override
        public int compare(ScoreDoc o1, ScoreDoc o2) {
          return (int) Math.signum(o2.score - o1.score);
        }
      });
  return new TopDocs(numImagesEval, res.toArray(new ScoreDoc[res.size()]), maxScore);
}
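In other words, each hit is rescored as the inverse of its CEDD feature distance to the query feature (a restatement of the arithmetic above):

\[
s_i = \frac{1}{d(f_{\text{query}}, f_{\text{doc}_i})}, \qquad \text{maxScore} = \max_i s_i
\]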
@Override
public void collect(int parentDoc) throws IOException {
  totalHitCount++;
  float score = Float.NaN;
  if (trackMaxScore) {
    score = scorer.score();
    maxScore = Math.max(maxScore, score);
  }
  // TODO: we could sweep all joinScorers here and
  // aggregate total child hit count, so we can fill this
  // in getTopGroups (we wire it to 0 now)
  if (queueFull) {
    // Fastmatch: return if this hit is not competitive
    for (int i = 0; ; i++) {
      final int c = reverseMul[i] * comparators[i].compareBottom(parentDoc);
      if (c < 0) {
        // Definitely not competitive.
        return;
      } else if (c > 0) {
        // Definitely competitive.
        break;
      } else if (i == compEnd) {
        // Here c=0. If we're at the last comparator, this doc is not
        // competitive, since docs are visited in doc Id order, which means
        // this doc cannot compete with any other document in the queue.
        return;
      }
    }
    // This hit is competitive - replace bottom element in queue & adjustTop
    for (int i = 0; i < comparators.length; i++) {
      comparators[i].copy(bottom.slot, parentDoc);
    }
    if (!trackMaxScore && trackScores) {
      score = scorer.score();
    }
    bottom.doc = docBase + parentDoc;
    bottom.readerContext = currentReaderContext;
    bottom.score = score;
    copyGroups(bottom);
    bottom = queue.updateTop();
    for (int i = 0; i < comparators.length; i++) {
      comparators[i].setBottom(bottom.slot);
    }
  } else {
    // Startup transient: queue is not yet full:
    final int comparatorSlot = totalHitCount - 1;
    // Copy hit into queue
    for (int i = 0; i < comparators.length; i++) {
      comparators[i].copy(comparatorSlot, parentDoc);
    }
    if (!trackMaxScore && trackScores) {
      score = scorer.score();
    }
    final OneGroup og =
        new OneGroup(comparatorSlot, docBase + parentDoc, score, joinScorers.length, trackScores);
    og.readerContext = currentReaderContext;
    copyGroups(og);
    bottom = queue.add(og);
    queueFull = totalHitCount == numParentHits;
    if (queueFull) {
      // End of startup transient: queue just filled up:
      for (int i = 0; i < comparators.length; i++) {
        comparators[i].setBottom(bottom.slot);
      }
    }
  }
}
@Test
public void testRandom() throws Exception {
  Directory directory = newDirectory();
  final Random r = random();
  final IndexWriterConfig iwc =
      LuceneTestCase.newIndexWriterConfig(r, new MockAnalyzer(r))
          .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
          .setRAMBufferSizeMB(
              scaledRandomIntBetween(16, 64)); // we might index a lot - don't go crazy here
  RandomIndexWriter indexWriter = new RandomIndexWriter(r, directory, iwc);
  int numUniqueChildValues = scaledRandomIntBetween(100, 2000);
  String[] childValues = new String[numUniqueChildValues];
  for (int i = 0; i < numUniqueChildValues; i++) {
    childValues[i] = Integer.toString(i);
  }
  IntOpenHashSet filteredOrDeletedDocs = new IntOpenHashSet();
  int childDocId = 0;
  int numParentDocs = scaledRandomIntBetween(1, numUniqueChildValues);
  ObjectObjectOpenHashMap<String, NavigableMap<String, FloatArrayList>> childValueToParentIds =
      new ObjectObjectOpenHashMap<>();
  for (int parentDocId = 0; parentDocId < numParentDocs; parentDocId++) {
    boolean markParentAsDeleted = rarely();
    boolean filterMe = rarely();
    String parent = Integer.toString(parentDocId);
    Document document = new Document();
    document.add(
        new StringField(UidFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.YES));
    document.add(new StringField(TypeFieldMapper.NAME, "parent", Field.Store.NO));
    if (markParentAsDeleted) {
      filteredOrDeletedDocs.add(parentDocId);
      document.add(new StringField("delete", "me", Field.Store.NO));
    }
    if (filterMe) {
      filteredOrDeletedDocs.add(parentDocId);
      document.add(new StringField("filter", "me", Field.Store.NO));
    }
    indexWriter.addDocument(document);
    int numChildDocs = scaledRandomIntBetween(0, 100);
    for (int i = 0; i < numChildDocs; i++) {
      boolean markChildAsDeleted = rarely();
      String childValue = childValues[random().nextInt(childValues.length)];
      document = new Document();
      document.add(
          new StringField(
              UidFieldMapper.NAME,
              Uid.createUid("child", Integer.toString(childDocId++)),
              Field.Store.NO));
      document.add(new StringField(TypeFieldMapper.NAME, "child", Field.Store.NO));
      document.add(
          new StringField(
              ParentFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.NO));
      document.add(new StringField("field1", childValue, Field.Store.NO));
      if (markChildAsDeleted) {
        document.add(new StringField("delete", "me", Field.Store.NO));
      }
      indexWriter.addDocument(document);
      if (!markChildAsDeleted) {
        NavigableMap<String, FloatArrayList> parentIdToChildScores;
        if (childValueToParentIds.containsKey(childValue)) {
          parentIdToChildScores = childValueToParentIds.lget();
        } else {
          childValueToParentIds.put(childValue, parentIdToChildScores = new TreeMap<>());
        }
        if (!markParentAsDeleted && !filterMe) {
          FloatArrayList childScores = parentIdToChildScores.get(parent);
          if (childScores == null) {
            parentIdToChildScores.put(parent, childScores = new FloatArrayList());
          }
          childScores.add(1f);
        }
      }
    }
  }
  // Delete docs that are marked to be deleted.
  indexWriter.deleteDocuments(new Term("delete", "me"));
  indexWriter.commit();
  IndexReader indexReader = DirectoryReader.open(directory);
  IndexSearcher searcher = new IndexSearcher(indexReader);
  Engine.Searcher engineSearcher =
      new Engine.Searcher(ChildrenQueryTests.class.getSimpleName(), searcher);
  ((TestSearchContext) SearchContext.current())
      .setSearcher(new ContextIndexSearcher(SearchContext.current(), engineSearcher));
  int max = numUniqueChildValues / 4;
  for (int i = 0; i < max; i++) {
    // Simulate a parent update
    if (random().nextBoolean()) {
      final int numberOfUpdatableParents = numParentDocs - filteredOrDeletedDocs.size();
      int numberOfUpdates =
          RandomInts.randomIntBetween(
              random(), 0, Math.min(numberOfUpdatableParents, TEST_NIGHTLY ? 25 : 5));
      for (int j = 0; j < numberOfUpdates; j++) {
        int parentId;
        do {
          parentId = random().nextInt(numParentDocs);
        } while (filteredOrDeletedDocs.contains(parentId));
        String parentUid = Uid.createUid("parent", Integer.toString(parentId));
        indexWriter.deleteDocuments(new Term(UidFieldMapper.NAME, parentUid));
        Document document = new Document();
        document.add(new StringField(UidFieldMapper.NAME, parentUid, Field.Store.YES));
        document.add(new StringField(TypeFieldMapper.NAME, "parent", Field.Store.NO));
        indexWriter.addDocument(document);
      }
      indexReader.close();
      indexReader = DirectoryReader.open(indexWriter.w, true);
      searcher = new IndexSearcher(indexReader);
      engineSearcher =
          new Engine.Searcher(ChildrenConstantScoreQueryTests.class.getSimpleName(), searcher);
      ((TestSearchContext) SearchContext.current())
          .setSearcher(new ContextIndexSearcher(SearchContext.current(), engineSearcher));
    }
    String childValue = childValues[random().nextInt(numUniqueChildValues)];
    int shortCircuitParentDocSet = random().nextInt(numParentDocs);
    ScoreType scoreType = ScoreType.values()[random().nextInt(ScoreType.values().length)];
    // leave min/max set to 0 half the time
    int minChildren = random().nextInt(2) * scaledRandomIntBetween(0, 110);
    int maxChildren = random().nextInt(2) * scaledRandomIntBetween(minChildren, 110);
    QueryBuilder queryBuilder =
        hasChildQuery("child", constantScoreQuery(termQuery("field1", childValue)))
            .scoreType(scoreType.name().toLowerCase(Locale.ENGLISH))
            .minChildren(minChildren)
            .maxChildren(maxChildren)
            .setShortCircuitCutoff(shortCircuitParentDocSet);
    // Using a FQ, will invoke / test the Scorer#advance(..) and also let the Weight#scorer not
    // get live docs as acceptedDocs
    queryBuilder = filteredQuery(queryBuilder, notFilter(termFilter("filter", "me")));
    Query query = parseQuery(queryBuilder);
    BitSetCollector collector = new BitSetCollector(indexReader.maxDoc());
    int numHits = 1 + random().nextInt(25);
    TopScoreDocCollector actualTopDocsCollector = TopScoreDocCollector.create(numHits);
    searcher.search(query, MultiCollector.wrap(collector, actualTopDocsCollector));
    FixedBitSet actualResult = collector.getResult();
    FixedBitSet expectedResult = new FixedBitSet(indexReader.maxDoc());
    TopScoreDocCollector expectedTopDocsCollector = TopScoreDocCollector.create(numHits);
    if (childValueToParentIds.containsKey(childValue)) {
      LeafReader slowLeafReader = SlowCompositeReaderWrapper.wrap(indexReader);
      final FloatArrayList[] scores = new FloatArrayList[slowLeafReader.maxDoc()];
      Terms terms = slowLeafReader.terms(UidFieldMapper.NAME);
      if (terms != null) {
        NavigableMap<String, FloatArrayList> parentIdToChildScores = childValueToParentIds.lget();
        TermsEnum termsEnum = terms.iterator(null);
        DocsEnum docsEnum = null;
        for (Map.Entry<String, FloatArrayList> entry : parentIdToChildScores.entrySet()) {
          int count = entry.getValue().elementsCount;
          if (count >= minChildren && (maxChildren == 0 || count <= maxChildren)) {
            TermsEnum.SeekStatus seekStatus =
                termsEnum.seekCeil(Uid.createUidAsBytes("parent", entry.getKey()));
            if (seekStatus == TermsEnum.SeekStatus.FOUND) {
              docsEnum = termsEnum.docs(slowLeafReader.getLiveDocs(), docsEnum, DocsEnum.FLAG_NONE);
              expectedResult.set(docsEnum.nextDoc());
              scores[docsEnum.docID()] = new FloatArrayList(entry.getValue());
            } else if (seekStatus == TermsEnum.SeekStatus.END) {
              break;
            }
          }
        }
      }
      MockScorer mockScorer = new MockScorer(scoreType);
      final LeafCollector leafCollector =
          expectedTopDocsCollector.getLeafCollector(slowLeafReader.getContext());
      leafCollector.setScorer(mockScorer);
      for (int doc = expectedResult.nextSetBit(0);
          doc < slowLeafReader.maxDoc();
          doc =
              doc + 1 >= expectedResult.length()
                  ? DocIdSetIterator.NO_MORE_DOCS
                  : expectedResult.nextSetBit(doc + 1)) {
        mockScorer.scores = scores[doc];
        leafCollector.collect(doc);
      }
    }
    assertBitSet(actualResult, expectedResult, searcher);
    assertTopDocs(actualTopDocsCollector.topDocs(), expectedTopDocsCollector.topDocs());
  }
  indexWriter.close();
  indexReader.close();
  directory.close();
}
@Override
public float tf(float freq) {
  return (float) (1.0 + Math.log(freq));
}
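A few reference values for this sublinear tf scaling (natural logarithm, matching Math.log above): \(tf(1) = 1\), \(tf(10) = 1 + \ln 10 \approx 3.30\), \(tf(100) = 1 + \ln 100 \approx 5.61\).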
private static long optionalSum(long left, long right) {
  // -1 acts as a "not available" sentinel: if either operand is -1, the result is -1.
  return Math.min(left, right) == -1 ? -1 : left + right;
}
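A minimal, self-contained sketch of how the sentinel propagates; the class name and values are made up for illustration:

public class OptionalSumDemo {
  private static long optionalSum(long left, long right) {
    return Math.min(left, right) == -1 ? -1 : left + right;
  }

  public static void main(String[] args) {
    System.out.println(optionalSum(5, 7));   // 12
    System.out.println(optionalSum(-1, 7));  // -1: left operand not available
    System.out.println(optionalSum(5, -1));  // -1: right operand not available
  }
}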
public InternalSearchResponse merge(
    ScoreDoc[] sortedDocs,
    AtomicArray<? extends QuerySearchResultProvider> queryResultsArr,
    AtomicArray<? extends FetchSearchResultProvider> fetchResultsArr) {
  List<? extends AtomicArray.Entry<? extends QuerySearchResultProvider>> queryResults =
      queryResultsArr.asList();
  List<? extends AtomicArray.Entry<? extends FetchSearchResultProvider>> fetchResults =
      fetchResultsArr.asList();
  if (queryResults.isEmpty()) {
    return InternalSearchResponse.empty();
  }
  QuerySearchResult firstResult = queryResults.get(0).value.queryResult();
  boolean sorted = false;
  int sortScoreIndex = -1;
  if (firstResult.topDocs() instanceof TopFieldDocs) {
    sorted = true;
    TopFieldDocs fieldDocs = (TopFieldDocs) firstResult.queryResult().topDocs();
    for (int i = 0; i < fieldDocs.fields.length; i++) {
      if (fieldDocs.fields[i].getType() == SortField.Type.SCORE) {
        sortScoreIndex = i;
      }
    }
  }
  // merge facets
  InternalFacets facets = null;
  if (!queryResults.isEmpty()) {
    // we rely on the fact that the order of facets is the same on all query results
    if (firstResult.facets() != null
        && firstResult.facets().facets() != null
        && !firstResult.facets().facets().isEmpty()) {
      List<Facet> aggregatedFacets = Lists.newArrayList();
      List<Facet> namedFacets = Lists.newArrayList();
      for (Facet facet : firstResult.facets()) {
        // aggregate each facet name into a single list, and aggregate it
        namedFacets.clear();
        for (AtomicArray.Entry<? extends QuerySearchResultProvider> entry : queryResults) {
          for (Facet facet1 : entry.value.queryResult().facets()) {
            if (facet.getName().equals(facet1.getName())) {
              namedFacets.add(facet1);
            }
          }
        }
        if (!namedFacets.isEmpty()) {
          Facet aggregatedFacet =
              ((InternalFacet) namedFacets.get(0))
                  .reduce(new InternalFacet.ReduceContext(cacheRecycler, namedFacets));
          aggregatedFacets.add(aggregatedFacet);
        }
      }
      facets = new InternalFacets(aggregatedFacets);
    }
  }
  // count the total (we use the query result provider here, since we might not get any hits (we
  // scrolled past them))
  long totalHits = 0;
  float maxScore = Float.NEGATIVE_INFINITY;
  boolean timedOut = false;
  Boolean terminatedEarly = null;
  for (AtomicArray.Entry<? extends QuerySearchResultProvider> entry : queryResults) {
    QuerySearchResult result = entry.value.queryResult();
    if (result.searchTimedOut()) {
      timedOut = true;
    }
    if (result.terminatedEarly() != null) {
      if (terminatedEarly == null) {
        terminatedEarly = result.terminatedEarly();
      } else if (result.terminatedEarly()) {
        terminatedEarly = true;
      }
    }
    totalHits += result.topDocs().totalHits;
    if (!Float.isNaN(result.topDocs().getMaxScore())) {
      maxScore = Math.max(maxScore, result.topDocs().getMaxScore());
    }
  }
  if (Float.isInfinite(maxScore)) {
    maxScore = Float.NaN;
  }
  // clean the fetch counter
  for (AtomicArray.Entry<? extends FetchSearchResultProvider> entry : fetchResults) {
    entry.value.fetchResult().initCounter();
  }
  // merge hits
  List<InternalSearchHit> hits = new ArrayList<>();
  if (!fetchResults.isEmpty()) {
    for (ScoreDoc shardDoc : sortedDocs) {
      FetchSearchResultProvider fetchResultProvider = fetchResultsArr.get(shardDoc.shardIndex);
      if (fetchResultProvider == null) {
        continue;
      }
      FetchSearchResult fetchResult = fetchResultProvider.fetchResult();
      int index = fetchResult.counterGetAndIncrement();
      if (index < fetchResult.hits().internalHits().length) {
        InternalSearchHit searchHit = fetchResult.hits().internalHits()[index];
        searchHit.score(shardDoc.score);
        searchHit.shard(fetchResult.shardTarget());
        if (sorted) {
          FieldDoc fieldDoc = (FieldDoc) shardDoc;
          searchHit.sortValues(fieldDoc.fields);
          if (sortScoreIndex != -1) {
            searchHit.score(((Number) fieldDoc.fields[sortScoreIndex]).floatValue());
          }
        }
        hits.add(searchHit);
      }
    }
  }
  // merge suggest results
  Suggest suggest = null;
  if (!queryResults.isEmpty()) {
    final Map<String, List<Suggest.Suggestion>> groupedSuggestions = new HashMap<>();
    boolean hasSuggestions = false;
    for (AtomicArray.Entry<? extends QuerySearchResultProvider> entry : queryResults) {
      Suggest shardResult = entry.value.queryResult().queryResult().suggest();
      if (shardResult == null) {
        continue;
      }
      hasSuggestions = true;
      Suggest.group(groupedSuggestions, shardResult);
    }
    suggest =
        hasSuggestions
            ? new Suggest(Suggest.Fields.SUGGEST, Suggest.reduce(groupedSuggestions))
            : null;
  }
  // merge addAggregation
  InternalAggregations aggregations = null;
  if (!queryResults.isEmpty()) {
    if (firstResult.aggregations() != null && firstResult.aggregations().asList() != null) {
      List<InternalAggregations> aggregationsList = new ArrayList<>(queryResults.size());
      for (AtomicArray.Entry<? extends QuerySearchResultProvider> entry : queryResults) {
        aggregationsList.add((InternalAggregations) entry.value.queryResult().aggregations());
      }
      aggregations =
          InternalAggregations.reduce(
              aggregationsList, new ReduceContext(null, bigArrays, scriptService));
    }
  }
  InternalSearchHits searchHits =
      new InternalSearchHits(hits.toArray(new InternalSearchHit[hits.size()]), totalHits, maxScore);
  return new InternalSearchResponse(
      searchHits, facets, aggregations, suggest, timedOut, terminatedEarly);
}
private void mergeIds(ResponseBuilder rb, ShardRequest sreq) {
  SortSpec ss = rb.getSortSpec();
  Sort sort = ss.getSort();
  SortField[] sortFields = null;
  if (sort != null) {
    sortFields = sort.getSort();
  } else {
    sortFields = new SortField[] {SortField.FIELD_SCORE};
  }
  SchemaField uniqueKeyField = rb.req.getSchema().getUniqueKeyField();
  // id to shard mapping, to eliminate any accidental dups
  HashMap<Object, String> uniqueDoc = new HashMap<Object, String>();
  // Merge the docs via a priority queue so we don't have to sort *all* of the
  // documents... we only need to order the top (rows+start)
  ShardFieldSortedHitQueue queue;
  queue = new ShardFieldSortedHitQueue(sortFields, ss.getOffset() + ss.getCount());
  long numFound = 0;
  Float maxScore = null;
  for (ShardResponse srsp : sreq.responses) {
    SolrDocumentList docs =
        (SolrDocumentList) srsp.getSolrResponse().getResponse().get("response");
    // calculate global maxScore and numDocsFound
    if (docs.getMaxScore() != null) {
      maxScore = maxScore == null ? docs.getMaxScore() : Math.max(maxScore, docs.getMaxScore());
    }
    numFound += docs.getNumFound();
    NamedList sortFieldValues =
        (NamedList) (srsp.getSolrResponse().getResponse().get("sort_values"));
    // go through every doc in this response, construct a ShardDoc, and
    // put it in the priority queue so it can be ordered.
    for (int i = 0; i < docs.size(); i++) {
      SolrDocument doc = docs.get(i);
      Object id = doc.getFieldValue(uniqueKeyField.getName());
      String prevShard = uniqueDoc.put(id, srsp.getShard());
      if (prevShard != null) {
        // duplicate detected
        numFound--;
        // For now, just always use the first encountered since we can't currently
        // remove the previous one added to the priority queue. If we switched
        // to the Java5 PriorityQueue, this would be easier.
        continue;
        // make which duplicate is used deterministic based on shard
        // if (prevShard.compareTo(srsp.shard) >= 0) {
        //   TODO: remove previous from priority queue
        //   continue;
        // }
      }
      ShardDoc shardDoc = new ShardDoc();
      shardDoc.id = id;
      shardDoc.shard = srsp.getShard();
      shardDoc.orderInShard = i;
      Object scoreObj = doc.getFieldValue("score");
      if (scoreObj != null) {
        if (scoreObj instanceof String) {
          shardDoc.score = Float.parseFloat((String) scoreObj);
        } else {
          shardDoc.score = (Float) scoreObj;
        }
      }
      shardDoc.sortFieldValues = sortFieldValues;
      queue.insertWithOverflow(shardDoc);
    } // end for-each-doc-in-response
  } // end for-each-response
  // The queue now has 0 -> queuesize docs, where queuesize <= start + rows
  // So we want to pop the last documents off the queue to get
  // the docs offset -> queuesize
  int resultSize = queue.size() - ss.getOffset();
  resultSize = Math.max(0, resultSize); // there may not be any docs in range
  Map<Object, ShardDoc> resultIds = new HashMap<Object, ShardDoc>();
  for (int i = resultSize - 1; i >= 0; i--) {
    ShardDoc shardDoc = (ShardDoc) queue.pop();
    shardDoc.positionInResponse = i;
    // Need the toString() for correlation with other lists that must
    // be strings (like keys in highlighting, explain, etc)
    resultIds.put(shardDoc.id.toString(), shardDoc);
  }
  SolrDocumentList responseDocs = new SolrDocumentList();
  if (maxScore != null) responseDocs.setMaxScore(maxScore);
  responseDocs.setNumFound(numFound);
  responseDocs.setStart(ss.getOffset());
  // size appropriately
  for (int i = 0; i < resultSize; i++) responseDocs.add(null);
  // save these results in a private area so we can access them
  // again when retrieving stored fields.
  // TODO: use ResponseBuilder (w/ comments) or the request context?
  rb.resultIds = resultIds;
  rb._responseDocs = responseDocs;
}
/**
 * Accumulates groups for the BlockJoinQuery specified by its slot.
 *
 * @param slot Search query's slot
 * @param offset Parent docs offset
 * @param maxDocsPerGroup Upper bound of documents per group number
 * @param withinGroupOffset Offset within each group of child docs
 * @param withinGroupSort Sort criteria within groups
 * @param fillSortFields Specifies whether to add sort fields or not
 * @return TopGroups for the query specified by slot
 * @throws IOException if there is a low-level I/O error
 */
@SuppressWarnings({"unchecked", "rawtypes"})
private TopGroups<Integer> accumulateGroups(
    int slot,
    int offset,
    int maxDocsPerGroup,
    int withinGroupOffset,
    Sort withinGroupSort,
    boolean fillSortFields)
    throws IOException {
  final GroupDocs<Integer>[] groups = new GroupDocs[sortedGroups.length - offset];
  final FakeScorer fakeScorer = new FakeScorer();
  int totalGroupedHitCount = 0;
  for (int groupIDX = offset; groupIDX < sortedGroups.length; groupIDX++) {
    final OneGroup og = sortedGroups[groupIDX];
    final int numChildDocs;
    if (slot == -1 || slot >= og.counts.length) {
      numChildDocs = 0;
    } else {
      numChildDocs = og.counts[slot];
    }
    // Number of documents in group should be bounded to prevent redundant memory allocation
    final int numDocsInGroup = Math.max(1, Math.min(numChildDocs, maxDocsPerGroup));
    // At this point we hold all docs w/ in each group, unsorted; we now sort them:
    final TopDocsCollector<?> collector;
    if (withinGroupSort == null) {
      // Sort by score
      if (!trackScores) {
        throw new IllegalArgumentException(
            "cannot sort by relevance within group: trackScores=false");
      }
      collector = TopScoreDocCollector.create(numDocsInGroup, true);
    } else {
      // Sort by fields
      collector =
          TopFieldCollector.create(
              withinGroupSort, numDocsInGroup, fillSortFields, trackScores, trackMaxScore, true);
    }
    collector.setScorer(fakeScorer);
    collector.setNextReader(og.readerContext);
    for (int docIDX = 0; docIDX < numChildDocs; docIDX++) {
      final int doc = og.docs[slot][docIDX];
      fakeScorer.doc = doc;
      if (trackScores) {
        fakeScorer.score = og.scores[slot][docIDX];
      }
      collector.collect(doc);
    }
    totalGroupedHitCount += numChildDocs;
    final Object[] groupSortValues;
    if (fillSortFields) {
      groupSortValues = new Object[comparators.length];
      for (int sortFieldIDX = 0; sortFieldIDX < comparators.length; sortFieldIDX++) {
        groupSortValues[sortFieldIDX] = comparators[sortFieldIDX].value(og.slot);
      }
    } else {
      groupSortValues = null;
    }
    final TopDocs topDocs = collector.topDocs(withinGroupOffset, numDocsInGroup);
    groups[groupIDX - offset] =
        new GroupDocs<>(
            og.score,
            topDocs.getMaxScore(),
            numChildDocs,
            topDocs.scoreDocs,
            og.doc,
            groupSortValues);
  }
  return new TopGroups<>(
      new TopGroups<>(
          sort.getSort(),
          withinGroupSort == null ? null : withinGroupSort.getSort(),
          0,
          totalGroupedHitCount,
          groups,
          maxScore),
      totalHitCount);
}
@Override
public float idf(long docFreq, long numDocs) {
  return (float) Math.log(numDocs / (double) docFreq);
}
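This is the raw \(\mathrm{idf} = \ln\frac{N}{\mathrm{df}}\) without the usual +1 smoothing, so a term occurring in every document gets idf = 0, and a document frequency of 0 yields an infinite idf. For example, with \(N = 10^6\) and \(\mathrm{df} = 100\), \(\mathrm{idf} = \ln 10^4 \approx 9.21\).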