public void testThreeBlocks() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(random(), dir);
  List<String> terms = new ArrayList<>();
  for (int i = 0; i < 36; i++) {
    Document doc = new Document();
    String term = "" + (char) (97 + i);
    terms.add(term);
    if (VERBOSE) {
      System.out.println("i=" + i + " term=" + term);
    }
    doc.add(newTextField("field", term, Field.Store.NO));
    w.addDocument(doc);
  }
  for (int i = 0; i < 36; i++) {
    Document doc = new Document();
    String term = "m" + (char) (97 + i);
    terms.add(term);
    if (VERBOSE) {
      System.out.println("i=" + i + " term=" + term);
    }
    doc.add(newTextField("field", term, Field.Store.NO));
    w.addDocument(doc);
  }
  for (int i = 0; i < 36; i++) {
    Document doc = new Document();
    String term = "mo" + (char) (97 + i);
    terms.add(term);
    if (VERBOSE) {
      System.out.println("i=" + i + " term=" + term);
    }
    doc.add(newTextField("field", term, Field.Store.NO));
    w.addDocument(doc);
  }
  w.forceMerge(1);
  IndexReader r = w.getReader();
  TermsEnum te = MultiFields.getTerms(r, "field").iterator(null);
  if (VERBOSE) {
    while (te.next() != null) {
      System.out.println("TERM: " + te.ord() + " " + te.term().utf8ToString());
    }
  }
  assertTrue(te.seekExact(new BytesRef("mo")));
  assertEquals(27, te.ord());
  te.seekExact(90);
  assertEquals(new BytesRef("s"), te.term());
  testEnum(te, terms);
  r.close();
  w.close();
  dir.close();
}
public TermInfo collect(String term) throws IOException {
  TermInfo info = new TermInfo();
  // encode as UTF-8; term.getBytes() would use the platform default charset
  BytesRef luceneTerm = new BytesRef(term);
  // this gives documents in which the term is found, but no offset information can be retrieved
  PostingsEnum postings = MultiFields.getTermDocsEnum(indexReader, ngramInfoFieldname, luceneTerm);
  // now go through each document
  int docId = postings.nextDoc();
  while (docId != PostingsEnum.NO_MORE_DOCS) {
    // get the term vector for that document.
    TermsEnum it = indexReader.getTermVector(docId, ngramInfoFieldname).iterator();
    // find the term of interest
    it.seekExact(luceneTerm);
    // get its posting info. this will contain offset info
    PostingsEnum postingsInDoc = it.postings(null, PostingsEnum.OFFSETS);
    postingsInDoc.nextDoc();
    Document doc = indexReader.document(docId);
    String id = doc.get(idFieldname);
    JATEDocument jd = new JATEDocument(id);
    Set<int[]> offsets = new HashSet<>();
    int totalFreq = postingsInDoc.freq();
    for (int i = 0; i < totalFreq; i++) {
      postingsInDoc.nextPosition();
      offsets.add(new int[] {postingsInDoc.startOffset(), postingsInDoc.endOffset()});
    }
    info.getOffsets().put(jd, offsets);
    docId = postings.nextDoc();
  }
  return info;
}
public void testDocsAndPositionsEnumStart() throws Exception {
  Analyzer analyzer = new MockAnalyzer(random());
  int numIters = atLeast(3);
  MemoryIndex memory = new MemoryIndex(true, false, random().nextInt(50) * 1024 * 1024);
  for (int i = 0; i < numIters; i++) { // check reuse
    memory.addField("foo", "bar", analyzer);
    LeafReader reader = (LeafReader) memory.createSearcher().getIndexReader();
    TestUtil.checkReader(reader);
    assertEquals(1, reader.terms("foo").getSumTotalTermFreq());
    PostingsEnum disi = reader.postings(new Term("foo", "bar"), PostingsEnum.ALL);
    int docid = disi.docID();
    assertEquals(-1, docid);
    assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertEquals(0, disi.nextPosition());
    assertEquals(0, disi.startOffset());
    assertEquals(3, disi.endOffset());
    // now reuse and check again
    TermsEnum te = reader.terms("foo").iterator();
    assertTrue(te.seekExact(new BytesRef("bar")));
    disi = te.postings(disi);
    docid = disi.docID();
    assertEquals(-1, docid);
    assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    reader.close();
    memory.reset();
  }
}
@Override
protected void fillDocsAndScores(FixedBitSet matchingDocs, TermsEnum termsEnum)
    throws IOException {
  BytesRef spare = new BytesRef();
  PostingsEnum postingsEnum = null;
  for (int i = 0; i < terms.size(); i++) {
    if (termsEnum.seekExact(terms.get(ords[i], spare))) {
      postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
      float score = TermsIncludingScoreQuery.this.scores[ords[i]];
      for (int doc = postingsEnum.nextDoc();
          doc != DocIdSetIterator.NO_MORE_DOCS;
          doc = postingsEnum.nextDoc()) {
        // I prefer this:
        /*if (scores[doc] < score) {
          scores[doc] = score;
          matchingDocs.set(doc);
        }*/
        // But this matches the behavior of MVInnerScorer, and only then do the tests pass:
        if (!matchingDocs.get(doc)) {
          scores[doc] = score;
          matchingDocs.set(doc);
        }
      }
    }
  }
}
/* Copied from lucene 4.2.x core */
private static long totalTermFreq(IndexReader r, String field, BytesRef text) throws IOException {
  final Terms terms = MultiFields.getTerms(r, field);
  if (terms != null) {
    final TermsEnum termsEnum = terms.iterator(null);
    if (termsEnum.seekExact(text, true)) {
      return termsEnum.totalTermFreq();
    }
  }
  return 0;
}
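// A minimal usage sketch for the helper above (not from the original source).
// The index path, field name, and term are placeholders for illustration, and the
// Lucene 4.x API (FSDirectory.open(File)) is assumed to match the snippet's version.
static void printTotalTermFreq() throws IOException {
  try (Directory dir = FSDirectory.open(new File("/path/to/index")); // placeholder path
      IndexReader r = DirectoryReader.open(dir)) {
    // prints 0 when the field or term is absent, otherwise the number of
    // occurrences of "lucene" in field "body" summed over all documents
    System.out.println(totalTermFreq(r, "body", new BytesRef("lucene")));
  }
}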
SegmentResult(int[] counts, int total, TermsEnum tenum, int startFacetOrd, int endFacetOrd)
    throws IOException {
  super(counts, total - counts[0], counts[0], endFacetOrd + 1);
  this.tenum = tenum;
  this.mergePos = startFacetOrd == -1 ? 1 : startFacetOrd + 1;
  if (mergePos < maxTermPos) {
    assert tenum != null;
    tenum.seekExact(startFacetOrd == -1 ? 0 : startFacetOrd);
    mergeTerm = tenum.term();
  }
}
private void testEnum(TermsEnum te, List<String> terms) throws IOException {
  Collections.sort(terms);
  for (int i = terms.size() - 1; i >= 0; i--) {
    if (VERBOSE) {
      System.out.println("TEST: seek to ord=" + i);
    }
    te.seekExact(i);
    assertEquals(i, te.ord());
    assertEquals(terms.get(i), te.term().utf8ToString());
  }
  int iters = atLeast(1000);
  for (int iter = 0; iter < iters; iter++) {
    int ord = random().nextInt(terms.size());
    if (random().nextBoolean()) {
      te.seekExact(ord);
      assertEquals(terms.get(ord), te.term().utf8ToString());
    } else {
      te.seekExact(new BytesRef(terms.get(ord)));
      assertEquals(ord, te.ord());
    }
  }
}
public void testFloorBlocks() throws Exception {
  Directory dir = newDirectory();
  IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
  IndexWriter w = new IndexWriter(dir, iwc);
  for (int i = 0; i < 128; i++) {
    Document doc = new Document();
    String term = "" + (char) i;
    if (VERBOSE) {
      System.out.println("i=" + i + " term=" + term + " bytes=" + new BytesRef(term));
    }
    doc.add(newStringField("field", term, Field.Store.NO));
    w.addDocument(doc);
  }
  w.forceMerge(1);
  IndexReader r = DirectoryReader.open(w, true);
  TermsEnum te = MultiFields.getTerms(r, "field").iterator(null);
  if (VERBOSE) {
    BytesRef term;
    while ((term = te.next()) != null) {
      System.out.println("  " + te.ord() + ": " + term.utf8ToString());
    }
  }
  assertTrue(te.seekExact(new BytesRef("a")));
  assertEquals(97, te.ord());
  te.seekExact(98);
  assertEquals(new BytesRef("b"), te.term());
  assertTrue(te.seekExact(new BytesRef("z")));
  assertEquals(122, te.ord());
  r.close();
  w.close();
  dir.close();
}
public PostingsEnum randomDocsEnum(
    String field, BytesRef term, List<LeafReaderContext> readers, Bits bits) throws IOException {
  if (random().nextInt(10) == 0) {
    return null;
  }
  LeafReader indexReader = readers.get(random().nextInt(readers.size())).reader();
  Terms terms = indexReader.terms(field);
  if (terms == null) {
    return null;
  }
  TermsEnum iterator = terms.iterator();
  if (iterator.seekExact(term)) {
    return iterator.postings(
        bits, null, random().nextBoolean() ? PostingsEnum.FREQS : PostingsEnum.NONE);
  }
  return null;
}
SegmentResult(
    int[] counts,
    int total,
    int missingCountIndex,
    TermsEnum tenum,
    int startFacetOrd,
    int endFacetOrd)
    throws IOException {
  super(
      counts,
      total - counts[missingCountIndex],
      counts[missingCountIndex],
      endFacetOrd == missingCountIndex + 1 ? missingCountIndex : endFacetOrd);
  this.tenum = tenum;
  this.mergePos = startFacetOrd;
  if (tenum != null) {
    tenum.seekExact(mergePos);
    mergeTerm = tenum.term();
  }
}
private Query newTermQuery(IndexReader reader, Term term) throws IOException {
  if (ignoreTF) {
    return new ConstantScoreQuery(new TermQuery(term));
  } else {
    // we build an artificial TermContext that will give an overall df and ttf equal to 1
    TermContext context = new TermContext(reader.getContext());
    for (LeafReaderContext leafContext : reader.leaves()) {
      Terms terms = leafContext.reader().terms(term.field());
      if (terms != null) {
        TermsEnum termsEnum = terms.iterator();
        if (termsEnum.seekExact(term.bytes())) {
          int freq = 1 - context.docFreq(); // we want the total df and ttf to be 1
          context.register(termsEnum.termState(), leafContext.ord, freq, freq);
        }
      }
    }
    return new TermQuery(term, context);
  }
}
protected void fillDocsAndScores(FixedBitSet matchingDocs, TermsEnum termsEnum)
    throws IOException {
  BytesRef spare = new BytesRef();
  PostingsEnum postingsEnum = null;
  for (int i = 0; i < terms.size(); i++) {
    if (termsEnum.seekExact(terms.get(ords[i], spare))) {
      postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
      float score = TermsIncludingScoreQuery.this.scores[ords[i]];
      for (int doc = postingsEnum.nextDoc();
          doc != DocIdSetIterator.NO_MORE_DOCS;
          doc = postingsEnum.nextDoc()) {
        matchingDocs.set(doc);
        // In the case the same doc is also related to another doc, a score might be
        // overwritten. I think this can only happen in a many-to-many relation.
        scores[doc] = score;
      }
    }
  }
}
public void testSeveralNonRootBlocks() throws Exception {
  Directory dir = newDirectory();
  IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
  IndexWriter w = new IndexWriter(dir, iwc);
  List<String> terms = new ArrayList<>();
  for (int i = 0; i < 30; i++) {
    for (int j = 0; j < 30; j++) {
      Document doc = new Document();
      String term = "" + (char) (97 + i) + (char) (97 + j);
      terms.add(term);
      if (VERBOSE) {
        System.out.println("term=" + term);
      }
      doc.add(newTextField("body", term, Field.Store.NO));
      w.addDocument(doc);
    }
  }
  w.forceMerge(1);
  IndexReader r = DirectoryReader.open(w, true);
  TermsEnum te = MultiFields.getTerms(r, "body").iterator(null);
  for (int i = 0; i < 30; i++) {
    for (int j = 0; j < 30; j++) {
      String term = "" + (char) (97 + i) + (char) (97 + j);
      if (VERBOSE) {
        System.out.println("TEST: check term=" + term);
      }
      assertEquals(term, te.next().utf8ToString());
      assertEquals(30 * i + j, te.ord());
    }
  }
  testEnum(te, terms);
  te.seekExact(0);
  assertEquals("aa", te.term().utf8ToString());
  r.close();
  w.close();
  dir.close();
}
public static DocSet createDocSet(SolrIndexSearcher searcher, Term term) throws IOException {
  DirectoryReader reader = searcher.getRawReader(); // raw reader to avoid extra wrapping overhead
  int maxDoc = searcher.getIndexReader().maxDoc();
  int smallSetSize = smallSetSize(maxDoc);
  String field = term.field();
  BytesRef termVal = term.bytes();
  int maxCount = 0;
  int firstReader = -1;
  List<LeafReaderContext> leaves = reader.leaves();
  // use an array for slightly higher scanning cost, but fewer memory allocations
  PostingsEnum[] postList = new PostingsEnum[leaves.size()];
  for (LeafReaderContext ctx : leaves) {
    assert leaves.get(ctx.ord) == ctx;
    LeafReader r = ctx.reader();
    Fields f = r.fields();
    Terms t = f.terms(field);
    if (t == null) continue; // field is missing
    TermsEnum te = t.iterator();
    if (te.seekExact(termVal)) {
      maxCount += te.docFreq();
      postList[ctx.ord] = te.postings(null, PostingsEnum.NONE);
      if (firstReader < 0) firstReader = ctx.ord;
    }
  }
  if (maxCount == 0) {
    return DocSet.EMPTY;
  }
  if (maxCount <= smallSetSize) {
    return createSmallSet(leaves, postList, maxCount, firstReader);
  }
  return createBigSet(leaves, postList, maxDoc, firstReader);
}
protected List<OffsetsEnum> createOffsetsEnumsFromReader(LeafReader atomicReader, int doc)
    throws IOException {
  // For strict positions, get a Map of term to Spans:
  // note: ScriptPhraseHelper.NONE does the right thing for these method calls
  final Map<BytesRef, Spans> strictPhrasesTermToSpans =
      strictPhrases.getTermToSpans(atomicReader, doc);
  // Usually simply wraps terms in a List; but if willRewrite() then can be expanded
  final List<BytesRef> sourceTerms =
      strictPhrases.expandTermsIfRewrite(terms, strictPhrasesTermToSpans);
  final List<OffsetsEnum> offsetsEnums = new ArrayList<>(sourceTerms.size() + 1);
  Terms termsIndex =
      atomicReader == null || sourceTerms.isEmpty() ? null : atomicReader.terms(field);
  if (termsIndex != null) {
    TermsEnum termsEnum = termsIndex.iterator(); // does not return null
    for (BytesRef term : sourceTerms) {
      if (!termsEnum.seekExact(term)) {
        continue; // term not found
      }
      PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
      if (postingsEnum == null) {
        // no offsets or positions available
        throw new IllegalArgumentException(
            "field '" + field + "' was indexed without offsets, cannot highlight");
      }
      if (doc != postingsEnum.advance(doc)) { // now it's positioned, although may be exhausted
        continue;
      }
      postingsEnum =
          strictPhrases.filterPostings(term, postingsEnum, strictPhrasesTermToSpans.get(term));
      if (postingsEnum == null) {
        continue; // completely filtered out
      }
      offsetsEnums.add(new OffsetsEnum(term, postingsEnum));
    }
  }
  return offsetsEnums;
}
public void testDocsEnumStart() throws Exception {
  Analyzer analyzer = new MockAnalyzer(random());
  MemoryIndex memory =
      new MemoryIndex(random().nextBoolean(), false, random().nextInt(50) * 1024 * 1024);
  memory.addField("foo", "bar", analyzer);
  LeafReader reader = (LeafReader) memory.createSearcher().getIndexReader();
  TestUtil.checkReader(reader);
  PostingsEnum disi =
      TestUtil.docs(random(), reader, "foo", new BytesRef("bar"), null, PostingsEnum.NONE);
  int docid = disi.docID();
  assertEquals(-1, docid);
  assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  // now reuse and check again
  TermsEnum te = reader.terms("foo").iterator();
  assertTrue(te.seekExact(new BytesRef("bar")));
  disi = te.postings(disi, PostingsEnum.NONE);
  docid = disi.docID();
  assertEquals(-1, docid);
  assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  reader.close();
}
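// Hedged sketch of the reuse pattern the test above exercises (not from the
// original source): passing the previous PostingsEnum back into
// TermsEnum.postings(...) lets the codec recycle it instead of allocating a
// fresh iterator per term. The reader and the field name "field" are assumptions.
static void walkAllPostings(LeafReader reader) throws IOException {
  Terms terms = reader.terms("field"); // hypothetical field
  if (terms == null) {
    return;
  }
  TermsEnum te = terms.iterator();
  PostingsEnum postings = null;
  BytesRef term;
  while ((term = te.next()) != null) {
    postings = te.postings(postings, PostingsEnum.NONE); // may reuse the previous instance
    for (int doc = postings.nextDoc();
        doc != DocIdSetIterator.NO_MORE_DOCS;
        doc = postings.nextDoc()) {
      // process (term, doc) here
    }
  }
}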
protected int[] lookupDocIdByPK(final IndexSearcher searcher, final String... ids)
    throws IOException {
  final List<AtomicReaderContext> subReaders = searcher.getIndexReader().leaves();
  final TermsEnum[] termsEnums = new TermsEnum[subReaders.size()];
  final DocsEnum[] docsEnums = new DocsEnum[subReaders.size()];
  for (int subIDX = 0; subIDX < subReaders.size(); subIDX++) {
    termsEnums[subIDX] = subReaders.get(subIDX).reader().fields().terms("id").iterator(null);
  }
  int[] results = new int[ids.length];
  for (int i = 0; i < results.length; i++) {
    results[i] = -1;
  }
  for (int idx = 0; idx < ids.length; idx++) {
    int base = 0;
    final BytesRef id = new BytesRef(ids[idx]);
    for (int subIDX = 0; subIDX < subReaders.size(); subIDX++) {
      final AtomicReader sub = subReaders.get(subIDX).reader();
      final TermsEnum termsEnum = termsEnums[subIDX];
      if (termsEnum.seekExact(id, false)) {
        final DocsEnum docs =
            docsEnums[subIDX] = termsEnum.docs(sub.getLiveDocs(), docsEnums[subIDX], 0);
        if (docs != null) {
          final int docID = docs.nextDoc();
          if (docID != DocIdSetIterator.NO_MORE_DOCS) {
            results[idx] = base + docID;
            break;
          }
        }
      }
      base += sub.maxDoc();
    }
  }
  return results;
}
public void collectTermContext(
    IndexReader reader,
    List<LeafReaderContext> leaves,
    TermContext[] contextArray,
    Term[] queryTerms)
    throws IOException {
  TermsEnum termsEnum = null;
  for (LeafReaderContext context : leaves) {
    final Fields fields = context.reader().fields();
    for (int i = 0; i < queryTerms.length; i++) {
      Term term = queryTerms[i];
      TermContext termContext = contextArray[i];
      final Terms terms = fields.terms(term.field());
      if (terms == null) {
        // field does not exist
        continue;
      }
      termsEnum = terms.iterator();
      assert termsEnum != null;
      if (termsEnum == TermsEnum.EMPTY) continue;
      if (termsEnum.seekExact(term.bytes())) {
        if (termContext == null) {
          contextArray[i] =
              new TermContext(
                  reader.getContext(),
                  termsEnum.termState(),
                  context.ord,
                  termsEnum.docFreq(),
                  termsEnum.totalTermFreq());
        } else {
          termContext.register(
              termsEnum.termState(), context.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
        }
      }
    }
  }
}
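// Hypothetical follow-up (not from the original source) showing how the collected
// per-term statistics might be consumed: TermQuery has a constructor that accepts a
// pre-built TermContext, so the query is scored with the df/ttf gathered above
// instead of re-reading them. 'toTermQueries' is an illustrative name; the arguments
// are assumed to be the same ones passed to collectTermContext.
Query[] toTermQueries(IndexReader reader, Term[] queryTerms) throws IOException {
  TermContext[] contexts = new TermContext[queryTerms.length];
  collectTermContext(reader, reader.leaves(), contexts, queryTerms);
  Query[] queries = new Query[queryTerms.length];
  for (int i = 0; i < queryTerms.length; i++) {
    // a term that never occurred has no registered states; fall back to a plain TermQuery
    queries[i] =
        contexts[i] == null
            ? new TermQuery(queryTerms[i])
            : new TermQuery(queryTerms[i], contexts[i]);
  }
  return queries;
}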
public void testBasic() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(random(), dir);
  Document doc = new Document();
  doc.add(newTextField("field", "a b c", Field.Store.NO));
  w.addDocument(doc);
  IndexReader r = w.getReader();
  TermsEnum te = MultiFields.getTerms(r, "field").iterator(null);

  // Test next()
  assertEquals(new BytesRef("a"), te.next());
  assertEquals(0L, te.ord());
  assertEquals(new BytesRef("b"), te.next());
  assertEquals(1L, te.ord());
  assertEquals(new BytesRef("c"), te.next());
  assertEquals(2L, te.ord());
  assertNull(te.next());

  // Test seekExact by term
  assertTrue(te.seekExact(new BytesRef("b")));
  assertEquals(1, te.ord());
  assertTrue(te.seekExact(new BytesRef("a")));
  assertEquals(0, te.ord());
  assertTrue(te.seekExact(new BytesRef("c")));
  assertEquals(2, te.ord());

  // Test seekExact by ord
  te.seekExact(1);
  assertEquals(new BytesRef("b"), te.term());
  te.seekExact(0);
  assertEquals(new BytesRef("a"), te.term());
  te.seekExact(2);
  assertEquals(new BytesRef("c"), te.term());

  r.close();
  w.close();
  dir.close();
}
/** Returns the term ({@link BytesRef}) corresponding to the provided ordinal. */
public BytesRef lookupTerm(TermsEnum termsEnum, int ord) throws IOException {
  termsEnum.seekExact(ord);
  return termsEnum.term();
}
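// Sketch of how the helper above might be driven (not from the original source).
// Note that ord-based seeking is optional in Lucene: many postings formats throw
// UnsupportedOperationException from seekExact(long), while the enum returned by
// SortedSetDocValues.termsEnum() always supports it. The reader and the field
// name "facet" are assumptions.
void dumpFacetTerms(LeafReader leafReader) throws IOException {
  SortedSetDocValues dv = DocValues.getSortedSet(leafReader, "facet"); // hypothetical field
  TermsEnum te = dv.termsEnum();
  for (int ord = 0; ord < dv.getValueCount(); ord++) {
    System.out.println(ord + " -> " + lookupTerm(te, ord).utf8ToString());
  }
}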
public void testTwoBlocks() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(random(), dir);
  List<String> terms = new ArrayList<>();
  for (int i = 0; i < 36; i++) {
    Document doc = new Document();
    String term = "" + (char) (97 + i);
    terms.add(term);
    if (VERBOSE) {
      System.out.println("i=" + i + " term=" + term);
    }
    doc.add(newTextField("field", term, Field.Store.NO));
    w.addDocument(doc);
  }
  for (int i = 0; i < 36; i++) {
    Document doc = new Document();
    String term = "m" + (char) (97 + i);
    terms.add(term);
    if (VERBOSE) {
      System.out.println("i=" + i + " term=" + term);
    }
    doc.add(newTextField("field", term, Field.Store.NO));
    w.addDocument(doc);
  }
  if (VERBOSE) {
    System.out.println("TEST: now forceMerge");
  }
  w.forceMerge(1);
  IndexReader r = w.getReader();
  TermsEnum te = MultiFields.getTerms(r, "field").iterator(null);
  assertTrue(te.seekExact(new BytesRef("mo")));
  assertEquals(27, te.ord());
  te.seekExact(54);
  assertEquals(new BytesRef("s"), te.term());
  Collections.sort(terms);
  for (int i = terms.size() - 1; i >= 0; i--) {
    te.seekExact(i);
    assertEquals(i, te.ord());
    assertEquals(terms.get(i), te.term().utf8ToString());
  }
  int iters = atLeast(1000);
  for (int iter = 0; iter < iters; iter++) {
    int ord = random().nextInt(terms.size());
    BytesRef term = new BytesRef(terms.get(ord));
    if (random().nextBoolean()) {
      if (VERBOSE) {
        System.out.println("TEST: iter=" + iter + " seek to ord=" + ord + " of " + terms.size());
      }
      te.seekExact(ord);
    } else {
      if (VERBOSE) {
        System.out.println(
            "TEST: iter=" + iter + " seek to term=" + terms.get(ord) + " ord=" + ord + " of "
                + terms.size());
      }
      te.seekExact(term);
    }
    assertEquals(ord, te.ord());
    assertEquals(term, te.term());
  }
  r.close();
  w.close();
  dir.close();
}
@Override
public void execute(String[] args, PrintStream out) throws Exception {
  String field = args.length > 0 ? args[0] : null;
  String termVal = null;
  if (field != null) {
    String[] parts = field.split(":");
    if (parts.length > 1) {
      field = parts[0];
      termVal = parts[1];
    }
  }
  if (field == null || termVal == null) {
    out.println("usage: field:term");
    out.flush();
    return;
  }
  IndexReader reader = ctx.getIndexReader();
  List<AtomicReaderContext> leaves = reader.leaves();
  int docBase = 0;
  int numPerPage = 20;
  for (AtomicReaderContext leaf : leaves) {
    AtomicReader atomicReader = leaf.reader();
    Terms terms = atomicReader.terms(field);
    if (terms == null) {
      continue;
    }
    boolean hasPositions = terms.hasPositions();
    TermsEnum te = terms.iterator(null);
    int count = 0;
    if (te.seekExact(new BytesRef(termVal), true)) {
      if (hasPositions) {
        DocsAndPositionsEnum iter = te.docsAndPositions(atomicReader.getLiveDocs(), null);
        int docid;
        while ((docid = iter.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          count++;
          out.print("docid: " + (docid + docBase) + ", freq: " + iter.freq() + ", ");
          for (int i = 0; i < iter.freq(); ++i) {
            out.print("pos " + i + ": " + iter.nextPosition());
            BytesRef payload = iter.getPayload();
            if (payload != null) {
              out.print(",payload: " + payload);
            }
            out.print(";");
          }
          out.println();
          if (ctx.isInteractiveMode()) {
            if (count % numPerPage == 0) {
              out.println("Ctrl-D to break");
              int ch = System.in.read();
              if (ch == -1) {
                out.flush();
                return;
              }
            }
          }
        }
      } else {
        DocsEnum iter = te.docs(atomicReader.getLiveDocs(), null);
        int docid;
        while ((docid = iter.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          count++;
          out.println("docid: " + (docid + docBase));
          if (ctx.isInteractiveMode()) {
            if (count % numPerPage == 0) {
              out.println("Ctrl-D to break");
              int ch = System.in.read();
              if (ch == -1) {
                out.flush();
                return;
              }
            }
          }
        }
      }
    }
    docBase += atomicReader.maxDoc();
  }
}
private void assertTermsSeeking(Terms leftTerms, Terms rightTerms) throws Exception {
  TermsEnum leftEnum = null;
  TermsEnum rightEnum = null;
  // just an upper bound
  int numTests = atLeast(20);
  Random random = random();
  // collect this number of terms from the left side
  HashSet<BytesRef> tests = new HashSet<BytesRef>();
  int numPasses = 0;
  while (numPasses < 10 && tests.size() < numTests) {
    leftEnum = leftTerms.iterator(leftEnum);
    BytesRef term = null;
    while ((term = leftEnum.next()) != null) {
      int code = random.nextInt(10);
      if (code == 0) {
        // the term
        tests.add(BytesRef.deepCopyOf(term));
      } else if (code == 1) {
        // truncated subsequence of term
        term = BytesRef.deepCopyOf(term);
        if (term.length > 0) {
          // truncate it
          term.length = random.nextInt(term.length);
        }
        tests.add(term); // without this the truncated term would be computed but never tested
      } else if (code == 2) {
        // term, but ensure a non-zero offset
        byte[] newbytes = new byte[term.length + 5];
        System.arraycopy(term.bytes, term.offset, newbytes, 5, term.length);
        tests.add(new BytesRef(newbytes, 5, term.length));
      }
    }
    numPasses++;
  }
  ArrayList<BytesRef> shuffledTests = new ArrayList<BytesRef>(tests);
  Collections.shuffle(shuffledTests, random);
  for (BytesRef b : shuffledTests) {
    leftEnum = leftTerms.iterator(leftEnum);
    rightEnum = rightTerms.iterator(rightEnum);
    assertEquals(leftEnum.seekExact(b), rightEnum.seekExact(b));
    assertEquals(leftEnum.seekExact(b), rightEnum.seekExact(b));
    SeekStatus leftStatus;
    SeekStatus rightStatus;
    leftStatus = leftEnum.seekCeil(b);
    rightStatus = rightEnum.seekCeil(b);
    assertEquals(leftStatus, rightStatus);
    if (leftStatus != SeekStatus.END) {
      assertEquals(leftEnum.term(), rightEnum.term());
    }
    leftStatus = leftEnum.seekCeil(b);
    rightStatus = rightEnum.seekCeil(b);
    assertEquals(leftStatus, rightStatus);
    if (leftStatus != SeekStatus.END) {
      assertEquals(leftEnum.term(), rightEnum.term());
    }
  }
}
@Override
protected void doSetNextReader(LeafReaderContext context) throws IOException {
  if (segmentFacetCounts != null) {
    segmentResults.add(createSegmentResult());
  }
  groupFieldTermsIndex = DocValues.getSorted(context.reader(), groupField);
  facetFieldDocTermOrds = DocValues.getSortedSet(context.reader(), facetField);
  facetFieldNumTerms = (int) facetFieldDocTermOrds.getValueCount();
  if (facetFieldNumTerms == 0) {
    facetOrdTermsEnum = null;
  } else {
    facetOrdTermsEnum = facetFieldDocTermOrds.termsEnum();
  }
  // [facetFieldNumTerms() + 1] for all possible facet values and docs not containing facet field
  segmentFacetCounts = new int[facetFieldNumTerms + 1];
  segmentTotalCount = 0;
  segmentGroupedFacetHits.clear();
  for (GroupedFacetHit groupedFacetHit : groupedFacetHits) {
    int groupOrd =
        groupedFacetHit.groupValue == null
            ? -1
            : groupFieldTermsIndex.lookupTerm(groupedFacetHit.groupValue);
    if (groupedFacetHit.groupValue != null && groupOrd < 0) {
      continue;
    }
    int facetOrd;
    if (groupedFacetHit.facetValue != null) {
      if (facetOrdTermsEnum == null || !facetOrdTermsEnum.seekExact(groupedFacetHit.facetValue)) {
        continue;
      }
      facetOrd = (int) facetOrdTermsEnum.ord();
    } else {
      facetOrd = facetFieldNumTerms;
    }
    // (facetFieldDocTermOrds.numTerms() + 1) for all possible facet values and docs not
    // containing facet field
    int segmentGroupedFacetsIndex = groupOrd * (facetFieldNumTerms + 1) + facetOrd;
    segmentGroupedFacetHits.put(segmentGroupedFacetsIndex);
  }
  if (facetPrefix != null) {
    TermsEnum.SeekStatus seekStatus;
    if (facetOrdTermsEnum != null) {
      seekStatus = facetOrdTermsEnum.seekCeil(facetPrefix);
    } else {
      seekStatus = TermsEnum.SeekStatus.END;
    }
    if (seekStatus != TermsEnum.SeekStatus.END) {
      startFacetOrd = (int) facetOrdTermsEnum.ord();
    } else {
      startFacetOrd = 0;
      endFacetOrd = 0;
      return;
    }
    BytesRefBuilder facetEndPrefix = new BytesRefBuilder();
    facetEndPrefix.append(facetPrefix);
    facetEndPrefix.append(UnicodeUtil.BIG_TERM);
    seekStatus = facetOrdTermsEnum.seekCeil(facetEndPrefix.get());
    if (seekStatus != TermsEnum.SeekStatus.END) {
      endFacetOrd = (int) facetOrdTermsEnum.ord();
    } else {
      endFacetOrd = facetFieldNumTerms; // Don't include null...
    }
  } else {
    startFacetOrd = 0;
    endFacetOrd = facetFieldNumTerms + 1;
  }
}
private IndexIterationContext createContext(
    int nDocs,
    RandomIndexWriter fromWriter,
    RandomIndexWriter toWriter,
    boolean multipleValuesPerDocument,
    boolean scoreDocsInOrder)
    throws IOException {
  IndexIterationContext context = new IndexIterationContext();
  int numRandomValues = nDocs / 2;
  context.randomUniqueValues = new String[numRandomValues];
  Set<String> trackSet = new HashSet<String>();
  context.randomFrom = new boolean[numRandomValues];
  for (int i = 0; i < numRandomValues; i++) {
    String uniqueRandomValue;
    do {
      uniqueRandomValue = _TestUtil.randomRealisticUnicodeString(random());
      // uniqueRandomValue = _TestUtil.randomSimpleString(random);
    } while ("".equals(uniqueRandomValue) || trackSet.contains(uniqueRandomValue));
    // generated values must be unique; empty strings aren't allowed
    trackSet.add(uniqueRandomValue);
    context.randomFrom[i] = random().nextBoolean();
    context.randomUniqueValues[i] = uniqueRandomValue;
  }
  RandomDoc[] docs = new RandomDoc[nDocs];
  for (int i = 0; i < nDocs; i++) {
    String id = Integer.toString(i);
    int randomI = random().nextInt(context.randomUniqueValues.length);
    String value = context.randomUniqueValues[randomI];
    Document document = new Document();
    document.add(newTextField(random(), "id", id, Field.Store.NO));
    document.add(newTextField(random(), "value", value, Field.Store.NO));
    boolean from = context.randomFrom[randomI];
    int numberOfLinkValues = multipleValuesPerDocument ? 2 + random().nextInt(10) : 1;
    docs[i] = new RandomDoc(id, numberOfLinkValues, value, from);
    for (int j = 0; j < numberOfLinkValues; j++) {
      String linkValue =
          context.randomUniqueValues[random().nextInt(context.randomUniqueValues.length)];
      docs[i].linkValues.add(linkValue);
      if (from) {
        if (!context.fromDocuments.containsKey(linkValue)) {
          context.fromDocuments.put(linkValue, new ArrayList<RandomDoc>());
        }
        if (!context.randomValueFromDocs.containsKey(value)) {
          context.randomValueFromDocs.put(value, new ArrayList<RandomDoc>());
        }
        context.fromDocuments.get(linkValue).add(docs[i]);
        context.randomValueFromDocs.get(value).add(docs[i]);
        document.add(newTextField(random(), "from", linkValue, Field.Store.NO));
      } else {
        if (!context.toDocuments.containsKey(linkValue)) {
          context.toDocuments.put(linkValue, new ArrayList<RandomDoc>());
        }
        if (!context.randomValueToDocs.containsKey(value)) {
          context.randomValueToDocs.put(value, new ArrayList<RandomDoc>());
        }
        context.toDocuments.get(linkValue).add(docs[i]);
        context.randomValueToDocs.get(value).add(docs[i]);
        document.add(newTextField(random(), "to", linkValue, Field.Store.NO));
      }
    }
    final RandomIndexWriter w;
    if (from) {
      w = fromWriter;
    } else {
      w = toWriter;
    }
    w.addDocument(document);
    if (random().nextInt(10) == 4) {
      w.commit();
    }
    if (VERBOSE) {
      System.out.println("Added document[" + docs[i].id + "]: " + document);
    }
  }
  // Pre-compute all possible hits for all unique random values. On top of this also compute
  // all possible scores for any ScoreMode.
  IndexSearcher fromSearcher = newSearcher(fromWriter.getReader());
  IndexSearcher toSearcher = newSearcher(toWriter.getReader());
  for (int i = 0; i < context.randomUniqueValues.length; i++) {
    String uniqueRandomValue = context.randomUniqueValues[i];
    final String fromField;
    final String toField;
    final Map<String, Map<Integer, JoinScore>> queryVals;
    if (context.randomFrom[i]) {
      fromField = "from";
      toField = "to";
      queryVals = context.fromHitsToJoinScore;
    } else {
      fromField = "to";
      toField = "from";
      queryVals = context.toHitsToJoinScore;
    }
    final Map<BytesRef, JoinScore> joinValueToJoinScores = new HashMap<BytesRef, JoinScore>();
    if (multipleValuesPerDocument) {
      fromSearcher.search(
          new TermQuery(new Term("value", uniqueRandomValue)),
          new Collector() {
            private Scorer scorer;
            private SortedSetDocValues docTermOrds;
            final BytesRef joinValue = new BytesRef();

            @Override
            public void collect(int doc) throws IOException {
              docTermOrds.setDocument(doc);
              long ord;
              while ((ord = docTermOrds.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
                docTermOrds.lookupOrd(ord, joinValue);
                JoinScore joinScore = joinValueToJoinScores.get(joinValue);
                if (joinScore == null) {
                  joinValueToJoinScores.put(
                      BytesRef.deepCopyOf(joinValue), joinScore = new JoinScore());
                }
                joinScore.addScore(scorer.score());
              }
            }

            @Override
            public void setNextReader(AtomicReaderContext context) throws IOException {
              docTermOrds = FieldCache.DEFAULT.getDocTermOrds(context.reader(), fromField);
            }

            @Override
            public void setScorer(Scorer scorer) {
              this.scorer = scorer;
            }

            @Override
            public boolean acceptsDocsOutOfOrder() {
              return false;
            }
          });
    } else {
      fromSearcher.search(
          new TermQuery(new Term("value", uniqueRandomValue)),
          new Collector() {
            private Scorer scorer;
            private BinaryDocValues terms;
            private Bits docsWithField;
            private final BytesRef spare = new BytesRef();

            @Override
            public void collect(int doc) throws IOException {
              terms.get(doc, spare);
              BytesRef joinValue = spare;
              if (joinValue.length == 0 && !docsWithField.get(doc)) {
                return;
              }
              JoinScore joinScore = joinValueToJoinScores.get(joinValue);
              if (joinScore == null) {
                joinValueToJoinScores.put(
                    BytesRef.deepCopyOf(joinValue), joinScore = new JoinScore());
              }
              joinScore.addScore(scorer.score());
            }

            @Override
            public void setNextReader(AtomicReaderContext context) throws IOException {
              terms = FieldCache.DEFAULT.getTerms(context.reader(), fromField, true);
              docsWithField = FieldCache.DEFAULT.getDocsWithField(context.reader(), fromField);
            }

            @Override
            public void setScorer(Scorer scorer) {
              this.scorer = scorer;
            }

            @Override
            public boolean acceptsDocsOutOfOrder() {
              return false;
            }
          });
    }
    final Map<Integer, JoinScore> docToJoinScore = new HashMap<Integer, JoinScore>();
    if (multipleValuesPerDocument) {
      if (scoreDocsInOrder) {
        AtomicReader slowCompositeReader =
            SlowCompositeReaderWrapper.wrap(toSearcher.getIndexReader());
        Terms terms = slowCompositeReader.terms(toField);
        if (terms != null) {
          DocsEnum docsEnum = null;
          TermsEnum termsEnum = null;
          SortedSet<BytesRef> joinValues =
              new TreeSet<BytesRef>(BytesRef.getUTF8SortedAsUnicodeComparator());
          joinValues.addAll(joinValueToJoinScores.keySet());
          for (BytesRef joinValue : joinValues) {
            termsEnum = terms.iterator(termsEnum);
            if (termsEnum.seekExact(joinValue)) {
              docsEnum =
                  termsEnum.docs(slowCompositeReader.getLiveDocs(), docsEnum, DocsEnum.FLAG_NONE);
              JoinScore joinScore = joinValueToJoinScores.get(joinValue);
              for (int doc = docsEnum.nextDoc();
                  doc != DocIdSetIterator.NO_MORE_DOCS;
                  doc = docsEnum.nextDoc()) {
                // First encountered join value determines the score.
                // Something to keep in mind for many-to-many relations.
                if (!docToJoinScore.containsKey(doc)) {
                  docToJoinScore.put(doc, joinScore);
                }
              }
            }
          }
        }
      } else {
        toSearcher.search(
            new MatchAllDocsQuery(),
            new Collector() {
              private SortedSetDocValues docTermOrds;
              private final BytesRef scratch = new BytesRef();
              private int docBase;

              @Override
              public void collect(int doc) throws IOException {
                docTermOrds.setDocument(doc);
                long ord;
                while ((ord = docTermOrds.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
                  docTermOrds.lookupOrd(ord, scratch);
                  JoinScore joinScore = joinValueToJoinScores.get(scratch);
                  if (joinScore == null) {
                    continue;
                  }
                  Integer basedDoc = docBase + doc;
                  // First encountered join value determines the score.
                  // Something to keep in mind for many-to-many relations.
                  if (!docToJoinScore.containsKey(basedDoc)) {
                    docToJoinScore.put(basedDoc, joinScore);
                  }
                }
              }

              @Override
              public void setNextReader(AtomicReaderContext context) throws IOException {
                docBase = context.docBase;
                docTermOrds = FieldCache.DEFAULT.getDocTermOrds(context.reader(), toField);
              }

              @Override
              public boolean acceptsDocsOutOfOrder() {
                return false;
              }

              @Override
              public void setScorer(Scorer scorer) {}
            });
      }
    } else {
      toSearcher.search(
          new MatchAllDocsQuery(),
          new Collector() {
            private BinaryDocValues terms;
            private int docBase;
            private final BytesRef spare = new BytesRef();

            @Override
            public void collect(int doc) {
              terms.get(doc, spare);
              JoinScore joinScore = joinValueToJoinScores.get(spare);
              if (joinScore == null) {
                return;
              }
              docToJoinScore.put(docBase + doc, joinScore);
            }

            @Override
            public void setNextReader(AtomicReaderContext context) throws IOException {
              terms = FieldCache.DEFAULT.getTerms(context.reader(), toField, false);
              docBase = context.docBase;
            }

            @Override
            public boolean acceptsDocsOutOfOrder() {
              return false;
            }

            @Override
            public void setScorer(Scorer scorer) {}
          });
    }
    queryVals.put(uniqueRandomValue, docToJoinScore);
  }
  fromSearcher.getIndexReader().close();
  toSearcher.getIndexReader().close();
  return context;
}
// algorithm: treat sentence snippets as miniature documents
// we can intersect these with the postings lists via BreakIterator.preceding(offset),
// score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq))
private Passage[] highlightDoc(
    String field,
    BytesRef terms[],
    int contentLength,
    BreakIterator bi,
    int doc,
    TermsEnum termsEnum,
    DocsAndPositionsEnum[] postings,
    int n)
    throws IOException {
  PassageScorer scorer = getScorer(field);
  if (scorer == null) {
    throw new NullPointerException("PassageScorer cannot be null");
  }
  PriorityQueue<OffsetsEnum> pq = new PriorityQueue<>();
  float weights[] = new float[terms.length];
  // initialize postings
  for (int i = 0; i < terms.length; i++) {
    DocsAndPositionsEnum de = postings[i];
    int pDoc;
    if (de == EMPTY) {
      continue;
    } else if (de == null) {
      postings[i] = EMPTY; // initially
      if (!termsEnum.seekExact(terms[i])) {
        continue; // term not found
      }
      de = postings[i] = termsEnum.docsAndPositions(null, null, DocsAndPositionsEnum.FLAG_OFFSETS);
      if (de == null) {
        // no positions available
        throw new IllegalArgumentException(
            "field '" + field + "' was indexed without offsets, cannot highlight");
      }
      pDoc = de.advance(doc);
    } else {
      pDoc = de.docID();
      if (pDoc < doc) {
        pDoc = de.advance(doc);
      }
    }
    if (doc == pDoc) {
      weights[i] = scorer.weight(contentLength, de.freq());
      de.nextPosition();
      pq.add(new OffsetsEnum(de, i));
    }
  }
  pq.add(new OffsetsEnum(EMPTY, Integer.MAX_VALUE)); // a sentinel for termination
  PriorityQueue<Passage> passageQueue =
      new PriorityQueue<>(
          n,
          new Comparator<Passage>() {
            @Override
            public int compare(Passage left, Passage right) {
              if (left.score < right.score) {
                return -1;
              } else if (left.score > right.score) {
                return 1;
              } else {
                return left.startOffset - right.startOffset;
              }
            }
          });
  Passage current = new Passage();
  OffsetsEnum off;
  while ((off = pq.poll()) != null) {
    final DocsAndPositionsEnum dp = off.dp;
    int start = dp.startOffset();
    if (start == -1) {
      throw new IllegalArgumentException(
          "field '" + field + "' was indexed without offsets, cannot highlight");
    }
    int end = dp.endOffset();
    // LUCENE-5166: this hit would span the content limit... however more valid
    // hits may exist (they are sorted by start). so we pretend like we never
    // saw this term, it won't cause a passage to be added to passageQueue or anything.
    assert EMPTY.startOffset() == Integer.MAX_VALUE;
    if (start < contentLength && end > contentLength) {
      continue;
    }
    if (start >= current.endOffset) {
      if (current.startOffset >= 0) {
        // finalize current
        current.score *= scorer.norm(current.startOffset);
        // new sentence: first add 'current' to queue
        if (passageQueue.size() == n && current.score < passageQueue.peek().score) {
          current.reset(); // can't compete, just reset it
        } else {
          passageQueue.offer(current);
          if (passageQueue.size() > n) {
            current = passageQueue.poll();
            current.reset();
          } else {
            current = new Passage();
          }
        }
      }
      // if we exceed limit, we are done
      if (start >= contentLength) {
        Passage passages[] = new Passage[passageQueue.size()];
        passageQueue.toArray(passages);
        for (Passage p : passages) {
          p.sort();
        }
        // sort in ascending order
        Arrays.sort(
            passages,
            new Comparator<Passage>() {
              @Override
              public int compare(Passage left, Passage right) {
                return left.startOffset - right.startOffset;
              }
            });
        return passages;
      }
      // advance breakiterator
      assert BreakIterator.DONE < 0;
      current.startOffset = Math.max(bi.preceding(start + 1), 0);
      current.endOffset = Math.min(bi.next(), contentLength);
    }
    int tf = 0;
    while (true) {
      tf++;
      BytesRef term = terms[off.id];
      if (term == null) {
        // multitermquery match, pull from payload
        term = off.dp.getPayload();
        assert term != null;
      }
      current.addMatch(start, end, term);
      if (off.pos == dp.freq()) {
        break; // removed from pq
      } else {
        off.pos++;
        dp.nextPosition();
        start = dp.startOffset();
        end = dp.endOffset();
      }
      if (start >= current.endOffset || end > contentLength) {
        pq.offer(off);
        break;
      }
    }
    current.score += weights[off.id] * scorer.tf(tf, current.endOffset - current.startOffset);
  }
  // Dead code but compiler disagrees:
  assert false;
  return null;
}