// algorithm: treat sentence snippets as miniature documents // we can intersect these with the postings lists via BreakIterator.preceding(offset),s // score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq)) private Passage[] highlightDoc( String field, BytesRef terms[], int contentLength, BreakIterator bi, int doc, TermsEnum termsEnum, DocsAndPositionsEnum[] postings, int n) throws IOException { PassageScorer scorer = getScorer(field); if (scorer == null) { throw new NullPointerException("PassageScorer cannot be null"); } PriorityQueue<OffsetsEnum> pq = new PriorityQueue<>(); float weights[] = new float[terms.length]; // initialize postings for (int i = 0; i < terms.length; i++) { DocsAndPositionsEnum de = postings[i]; int pDoc; if (de == EMPTY) { continue; } else if (de == null) { postings[i] = EMPTY; // initially if (!termsEnum.seekExact(terms[i])) { continue; // term not found } de = postings[i] = termsEnum.docsAndPositions(null, null, DocsAndPositionsEnum.FLAG_OFFSETS); if (de == null) { // no positions available throw new IllegalArgumentException( "field '" + field + "' was indexed without offsets, cannot highlight"); } pDoc = de.advance(doc); } else { pDoc = de.docID(); if (pDoc < doc) { pDoc = de.advance(doc); } } if (doc == pDoc) { weights[i] = scorer.weight(contentLength, de.freq()); de.nextPosition(); pq.add(new OffsetsEnum(de, i)); } } pq.add(new OffsetsEnum(EMPTY, Integer.MAX_VALUE)); // a sentinel for termination PriorityQueue<Passage> passageQueue = new PriorityQueue<>( n, new Comparator<Passage>() { @Override public int compare(Passage left, Passage right) { if (left.score < right.score) { return -1; } else if (left.score > right.score) { return 1; } else { return left.startOffset - right.startOffset; } } }); Passage current = new Passage(); OffsetsEnum off; while ((off = pq.poll()) != null) { final DocsAndPositionsEnum dp = off.dp; int start = dp.startOffset(); if (start == -1) { throw new IllegalArgumentException( "field '" + field + "' was indexed without offsets, cannot highlight"); } int end = dp.endOffset(); // LUCENE-5166: this hit would span the content limit... however more valid // hits may exist (they are sorted by start). so we pretend like we never // saw this term, it won't cause a passage to be added to passageQueue or anything. assert EMPTY.startOffset() == Integer.MAX_VALUE; if (start < contentLength && end > contentLength) { continue; } if (start >= current.endOffset) { if (current.startOffset >= 0) { // finalize current current.score *= scorer.norm(current.startOffset); // new sentence: first add 'current' to queue if (passageQueue.size() == n && current.score < passageQueue.peek().score) { current.reset(); // can't compete, just reset it } else { passageQueue.offer(current); if (passageQueue.size() > n) { current = passageQueue.poll(); current.reset(); } else { current = new Passage(); } } } // if we exceed limit, we are done if (start >= contentLength) { Passage passages[] = new Passage[passageQueue.size()]; passageQueue.toArray(passages); for (Passage p : passages) { p.sort(); } // sort in ascending order Arrays.sort( passages, new Comparator<Passage>() { @Override public int compare(Passage left, Passage right) { return left.startOffset - right.startOffset; } }); return passages; } // advance breakiterator assert BreakIterator.DONE < 0; current.startOffset = Math.max(bi.preceding(start + 1), 0); current.endOffset = Math.min(bi.next(), contentLength); } int tf = 0; while (true) { tf++; BytesRef term = terms[off.id]; if (term == null) { // multitermquery match, pull from payload term = off.dp.getPayload(); assert term != null; } current.addMatch(start, end, term); if (off.pos == dp.freq()) { break; // removed from pq } else { off.pos++; dp.nextPosition(); start = dp.startOffset(); end = dp.endOffset(); } if (start >= current.endOffset || end > contentLength) { pq.offer(off); break; } } current.score += weights[off.id] * scorer.tf(tf, current.endOffset - current.startOffset); } // Dead code but compiler disagrees: assert false; return null; }