/**
 * Used when the base query is highly constraining vs the drill-downs, or when the docs must be
 * scored at once (i.e., like BooleanScorer2, not BooleanScorer). In this case we just .next() on
 * base and .advance() on the dim filters.
 */
private void doQueryFirstScoring(Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims)
    throws IOException {
  int docID = baseScorer.docID();

  nextDoc:
  while (docID != PostingsEnum.NO_MORE_DOCS) {
    if (acceptDocs != null && acceptDocs.get(docID) == false) {
      docID = baseScorer.nextDoc();
      continue;
    }
    LeafCollector failedCollector = null;
    for (DocsAndCost dim : dims) {
      // TODO: should we sort this 2nd dimension of
      // docsEnums from most frequent to least?
      if (dim.approximation.docID() < docID) {
        dim.approximation.advance(docID);
      }

      boolean matches = false;
      if (dim.approximation.docID() == docID) {
        if (dim.twoPhase == null) {
          matches = true;
        } else {
          matches = dim.twoPhase.matches();
        }
      }

      if (matches == false) {
        if (failedCollector != null) {
          // More than one dim fails on this document, so
          // it's neither a hit nor a near-miss; move to
          // next doc:
          docID = baseScorer.nextDoc();
          continue nextDoc;
        } else {
          failedCollector = dim.sidewaysLeafCollector;
        }
      }
    }

    collectDocID = docID;

    // TODO: we could score on demand instead since we are
    // doc-at-a-time (DAAT) here:
    collectScore = baseScorer.score();

    if (failedCollector == null) {
      // Hit passed all filters, so it's "real":
      collectHit(collector, dims);
    } else {
      // Hit missed exactly one filter:
      collectNearMiss(failedCollector);
    }

    docID = baseScorer.nextDoc();
  }
}
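// A minimal sketch of the leapfrog check the method above relies on, assuming
// only the DocIdSetIterator contract: advance(target) positions the iterator
// on the first doc >= target, so equality afterwards means the filter matches
// the base doc. The class and method names below are hypothetical, not part
// of the original source.
import java.io.IOException;
import org.apache.lucene.search.DocIdSetIterator;

final class LeapfrogSketch {
  static boolean matchesCurrentDoc(DocIdSetIterator filter, int baseDocID) throws IOException {
    if (filter.docID() < baseDocID) {
      filter.advance(baseDocID); // may overshoot past baseDocID
    }
    return filter.docID() == baseDocID; // overshoot means this filter misses the doc
  }
}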
public static Map<String, Integer> termFrequencies(
    IndexSearcher indexSearcher,
    Query documentFilterQuery,
    String fieldName,
    String propName,
    String altName) {
  try {
    String luceneField = ComplexFieldUtil.propertyField(fieldName, propName, altName);
    Weight weight = indexSearcher.createNormalizedWeight(documentFilterQuery, false);
    Map<String, Integer> freq = new HashMap<>();
    IndexReader indexReader = indexSearcher.getIndexReader();
    for (LeafReaderContext arc : indexReader.leaves()) {
      if (weight == null) {
        throw new RuntimeException("weight == null");
      }
      if (arc == null) {
        throw new RuntimeException("arc == null");
      }
      if (arc.reader() == null) {
        throw new RuntimeException("arc.reader() == null");
      }
      Scorer scorer = weight.scorer(arc, arc.reader().getLiveDocs());
      if (scorer != null) {
        while (scorer.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          getFrequenciesFromTermVector(
              indexReader, scorer.docID() + arc.docBase, luceneField, freq);
        }
      }
    }
    return freq;
  } catch (IOException e) {
    throw ExUtil.wrapRuntimeException(e);
  }
}
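// Hedged usage sketch for termFrequencies() above, assuming a BlackLab-style
// index where the "contents" field has a "word" property; the field, property,
// and author values are hypothetical examples.
import java.io.IOException;
import java.util.Map;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;

static void printWordFrequencies(Directory directory) throws IOException {
  IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(directory));
  // Count term frequencies only within documents matching this filter:
  Query filter = new TermQuery(new Term("author", "melville"));
  Map<String, Integer> freq = termFrequencies(searcher, filter, "contents", "word", null);
  freq.forEach((term, count) -> System.out.println(term + "\t" + count));
}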
/* (non-Javadoc)
 * @see org.apache.lucene.search.Scorer#score()
 */
@Override
public float score() throws IOException {
  for (int i = 0; i < valSrcScorers.length; i++) {
    vScores[i] = valSrcScorers[i].score();
  }
  return qWeight
      * provider.customScore(subQueryScorer.docID(), subQueryScorer.score(), vScores);
}
@Override
public float score() throws IOException {
  float score = qWeight * scorer.score() * vals.floatVal(scorer.docID());

  // Current Lucene priority queues can't handle NaN and -Infinity, so
  // map to -Float.MAX_VALUE. This conditional handles both -infinity
  // and NaN since comparisons with NaN are always false.
  return score > Float.NEGATIVE_INFINITY ? score : -Float.MAX_VALUE;
}
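// The comment above is worth seeing concretely: every comparison involving
// NaN evaluates to false, so the single > test filters out both NaN and
// negative infinity. A self-contained demonstration (hypothetical class name):
public class NaNGuardDemo {
  public static void main(String[] args) {
    float nan = 0f / 0f; // NaN

    // Comparisons with NaN are always false, so the guard rejects it:
    System.out.println(nan > Float.NEGATIVE_INFINITY);                      // false
    // -Infinity is not strictly greater than itself, so it is rejected too:
    System.out.println(Float.NEGATIVE_INFINITY > Float.NEGATIVE_INFINITY);  // false
    // An ordinary score passes through unchanged:
    System.out.println(1.5f > Float.NEGATIVE_INFINITY);                     // true
  }
}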
/**
 * Collect all Spans extracted from a Scorer using a SpanCollector.
 *
 * @param scorer the scorer to extract Spans from
 * @param collector the SpanCollector
 * @param errorOnNoSpans if true, throw an error if no Spans can be extracted from the Scorer or
 *     any of its children
 * @throws IOException on error
 */
public static void collect(Scorer scorer, SpanCollector collector, boolean errorOnNoSpans)
    throws IOException {
  List<Spans> allSpans = getSpans(scorer, errorOnNoSpans);
  int doc = scorer.docID();
  for (Spans spans : allSpans) {
    int spanDoc = spans.docID();
    // If the Scorer advances lazily, then not all of its subspans may be on
    // the correct document:
    if (spanDoc == doc || (spanDoc < doc && spans.advance(doc) == doc)) {
      while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
        spans.collect(collector);
      }
    }
  }
}
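// Hedged usage sketch: a SpanCollector only needs collectLeaf() and reset().
// This position-recording collector is hypothetical, but shows the shape of a
// caller for collect() above (package names assume the Lucene 5.x-8.x
// org.apache.lucene.search.spans layout).
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spans.SpanCollector;

final class PositionRecordingCollector implements SpanCollector {
  final List<Integer> positions = new ArrayList<>();

  @Override
  public void collectLeaf(PostingsEnum postings, int position, Term term) throws IOException {
    positions.add(position); // record every span leaf position on the current doc
  }

  @Override
  public void reset() {
    positions.clear();
  }
}
// Usage: collect(spanScorer, new PositionRecordingCollector(), true);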
@Override
public int docID() {
  return subQueryScorer.docID();
}
@Override
public int docID() {
  return scorer.docID();
}
@Override
public float score() throws IOException {
  return _func.useInnerScore()
      ? _func.newScore(_innerScorer.score(), _innerScorer.docID())
      : _func.newScore(_innerScorer.docID());
}
@Override
public int docID() {
  return _innerScorer.docID();
}
private void doUnionScoring(Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims)
    throws IOException {
  final int maxDoc = context.reader().maxDoc();
  final int numDims = dims.length;

  // TODO: maybe a class like BS, instead of parallel arrays
  int[] filledSlots = new int[CHUNK];
  int[] docIDs = new int[CHUNK];
  float[] scores = new float[CHUNK];
  int[] missingDims = new int[CHUNK];
  int[] counts = new int[CHUNK];

  docIDs[0] = -1;

  // NOTE: this is basically a specialized version of
  // BooleanScorer, for the minShouldMatch=N-1 case, but
  // carefully tracking which dimension failed to match

  int nextChunkStart = CHUNK;

  while (true) {
    int filledCount = 0;
    int docID = baseScorer.docID();
    while (docID < nextChunkStart) {
      if (acceptDocs == null || acceptDocs.get(docID)) {
        int slot = docID & MASK;

        // Mark slot as valid:
        assert docIDs[slot] != docID : "slot=" + slot + " docID=" + docID;
        docIDs[slot] = docID;
        scores[slot] = baseScorer.score();
        filledSlots[filledCount++] = slot;
        missingDims[slot] = 0;
        counts[slot] = 1;
      }
      docID = baseScorer.nextDoc();
    }

    if (filledCount == 0) {
      if (nextChunkStart >= maxDoc) {
        break;
      }
      nextChunkStart += CHUNK;
      continue;
    }

    // First drill-down dim, basically adds SHOULD onto
    // the baseQuery:
    {
      DocsAndCost dc = dims[0];
      docID = dc.approximation.docID();
      while (docID < nextChunkStart) {
        int slot = docID & MASK;
        if (docIDs[slot] == docID // this also checks that the doc is not deleted
            && (dc.twoPhase == null || dc.twoPhase.matches())) {
          missingDims[slot] = 1;
          counts[slot] = 2;
        }
        docID = dc.approximation.nextDoc();
      }
    }

    for (int dim = 1; dim < numDims; dim++) {
      DocsAndCost dc = dims[dim];
      docID = dc.approximation.docID();
      while (docID < nextChunkStart) {
        int slot = docID & MASK;
        if (docIDs[slot] == docID // also means that the doc is not deleted
            && counts[slot] >= dim
            && (dc.twoPhase == null || dc.twoPhase.matches())) {
          // This doc is still in the running...
          // TODO: single-valued dims will always be true
          // below; we could somehow specialize
          if (missingDims[slot] >= dim) {
            missingDims[slot] = dim + 1;
            counts[slot] = dim + 2;
          } else {
            counts[slot] = dim + 1;
          }
        }
        docID = dc.approximation.nextDoc();
      }
    }

    // Collect:
    for (int i = 0; i < filledCount; i++) {
      // NOTE: This is actually in-order collection,
      // because we only accept docs originally returned by
      // the baseScorer (i.e. that Scorer is AND'd)
      int slot = filledSlots[i];
      collectDocID = docIDs[slot];
      collectScore = scores[slot];

      if (counts[slot] == 1 + numDims) {
        collectHit(collector, dims);
      } else if (counts[slot] == numDims) {
        collectNearMiss(dims[missingDims[slot]].sidewaysLeafCollector);
      }
    }

    if (nextChunkStart >= maxDoc) {
      break;
    }

    nextChunkStart += CHUNK;
  }
}
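// The counts/missingDims bookkeeping above is compact enough to deserve a
// worked trace. This standalone sketch is hypothetical (not part of the
// scorer) and re-derives only the per-slot arithmetic: the base match sets
// counts to 1, each matching dim either advances a clean doc or trails one
// behind after a single miss, and missingDims ends up naming the missed dim.
public class UnionBookkeepingTrace {
  static String classify(boolean... matched) {
    int numDims = matched.length;
    int count = 1;      // base query matched: counts[slot] = 1
    int missingDim = 0; // missingDims[slot]
    for (int dim = 0; dim < numDims; dim++) {
      if (matched[dim] && count >= dim) { // same counts guard as the loop above
        if (missingDim >= dim) {          // no miss so far
          missingDim = dim + 1;
          count = dim + 2;
        } else {
          count = dim + 1;                // one earlier miss; keep trailing by one
        }
      }
    }
    if (count == 1 + numDims) return "hit";
    if (count == numDims) return "near miss on dim " + missingDim;
    return "discarded";
  }

  public static void main(String[] args) {
    System.out.println(classify(true, true, true));   // hit
    System.out.println(classify(true, false, true));  // near miss on dim 1
    System.out.println(classify(false, false, true)); // discarded: two misses
  }
}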
/** Used when the drill-downs are highly constraining vs the baseQuery. */
private void doDrillDownAdvanceScoring(
    Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims) throws IOException {
  final int maxDoc = context.reader().maxDoc();
  final int numDims = dims.length;

  // TODO: maybe a class like BS, instead of parallel arrays
  int[] filledSlots = new int[CHUNK];
  int[] docIDs = new int[CHUNK];
  float[] scores = new float[CHUNK];
  int[] missingDims = new int[CHUNK];
  int[] counts = new int[CHUNK];

  docIDs[0] = -1;
  int nextChunkStart = CHUNK;

  final FixedBitSet seen = new FixedBitSet(CHUNK);

  while (true) {
    // First dim:
    DocsAndCost dc = dims[0];
    int docID = dc.approximation.docID();
    while (docID < nextChunkStart) {
      if (acceptDocs == null || acceptDocs.get(docID)) {
        int slot = docID & MASK;
        if (docIDs[slot] != docID && (dc.twoPhase == null || dc.twoPhase.matches())) {
          seen.set(slot);
          // Mark slot as valid:
          docIDs[slot] = docID;
          missingDims[slot] = 1;
          counts[slot] = 1;
        }
      }
      docID = dc.approximation.nextDoc();
    }

    // Second dim:
    dc = dims[1];
    docID = dc.approximation.docID();
    while (docID < nextChunkStart) {
      // NOTE: parenthesized so the two-phase check also runs when acceptDocs
      // is null; && otherwise binds tighter than || and skips it:
      if ((acceptDocs == null || acceptDocs.get(docID))
          && (dc.twoPhase == null || dc.twoPhase.matches())) {
        int slot = docID & MASK;
        if (docIDs[slot] != docID) {
          // Mark slot as valid:
          seen.set(slot);
          docIDs[slot] = docID;
          missingDims[slot] = 0;
          counts[slot] = 1;
        } else {
          // TODO: single-valued dims will always be true
          // below; we could somehow specialize
          if (missingDims[slot] >= 1) {
            missingDims[slot] = 2;
            counts[slot] = 2;
          } else {
            counts[slot] = 1;
          }
        }
      }
      docID = dc.approximation.nextDoc();
    }

    // After this we can "upgrade" to conjunction, because
    // any doc not seen by either dim 0 or dim 1 cannot be
    // a hit or a near miss:

    // Fold in baseScorer, using advance:
    int filledCount = 0;
    int slot0 = 0;
    while (slot0 < CHUNK && (slot0 = seen.nextSetBit(slot0)) != DocIdSetIterator.NO_MORE_DOCS) {
      int ddDocID = docIDs[slot0];
      assert ddDocID != -1;

      int baseDocID = baseScorer.docID();
      if (baseDocID < ddDocID) {
        baseDocID = baseScorer.advance(ddDocID);
      }
      if (baseDocID == ddDocID) {
        scores[slot0] = baseScorer.score();
        filledSlots[filledCount++] = slot0;
        counts[slot0]++;
      } else {
        docIDs[slot0] = -1;

        // TODO: we could jump slot0 forward to the
        // baseDocID ... but we'd need to set docIDs for
        // intervening slots to -1
      }
      slot0++;
    }
    seen.clear(0, CHUNK);

    if (filledCount == 0) {
      if (nextChunkStart >= maxDoc) {
        break;
      }
      nextChunkStart += CHUNK;
      continue;
    }

    // TODO: factor this out & share w/ union scorer,
    // except we start from dim=2 instead:
    for (int dim = 2; dim < numDims; dim++) {
      dc = dims[dim];
      docID = dc.approximation.docID();
      while (docID < nextChunkStart) {
        int slot = docID & MASK;
        if (docIDs[slot] == docID
            && counts[slot] >= dim
            && (dc.twoPhase == null || dc.twoPhase.matches())) {
          // TODO: single-valued dims will always be true
          // below; we could somehow specialize
          if (missingDims[slot] >= dim) {
            missingDims[slot] = dim + 1;
            counts[slot] = dim + 2;
          } else {
            counts[slot] = dim + 1;
          }
        }

        // TODO: sometimes use advance?
        docID = dc.approximation.nextDoc();
      }
    }

    // Collect:
    for (int i = 0; i < filledCount; i++) {
      int slot = filledSlots[i];
      collectDocID = docIDs[slot];
      collectScore = scores[slot];

      if (counts[slot] == 1 + numDims) {
        collectHit(collector, dims);
      } else if (counts[slot] == numDims) {
        collectNearMiss(dims[missingDims[slot]].sidewaysLeafCollector);
      }
    }

    if (nextChunkStart >= maxDoc) {
      break;
    }

    nextChunkStart += CHUNK;
  }
}
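// Both chunked methods above map doc IDs into slots with docID & MASK, which
// only works because CHUNK is assumed to be a power of two with
// MASK == CHUNK - 1 (as in BooleanScorer). A small illustration of that
// arithmetic (hypothetical class; the constants mirror that assumption):
public class ChunkSlotDemo {
  public static void main(String[] args) {
    final int CHUNK = 2048; // assumed power of two
    final int MASK = CHUNK - 1;

    int docID = 100_000;
    int slot = docID & MASK;        // == docID % CHUNK when CHUNK is a power of two
    int chunkStart = docID & ~MASK; // start of the window holding this doc

    System.out.println(slot);       // 1696
    System.out.println(chunkStart); // 98304
    // Within one CHUNK-sized window each live docID gets a unique slot,
    // so docIDs[slot] == docID suffices as the slot-validity test.
  }
}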