@Override protected void fillDocsAndScores(FixedBitSet matchingDocs, TermsEnum termsEnum) throws IOException { BytesRef spare = new BytesRef(); PostingsEnum postingsEnum = null; for (int i = 0; i < terms.size(); i++) { if (termsEnum.seekExact(terms.get(ords[i], spare))) { postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE); float score = TermsIncludingScoreQuery.this.scores[ords[i]]; for (int doc = postingsEnum.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = postingsEnum.nextDoc()) { // I prefer this: /*if (scores[doc] < score) { scores[doc] = score; matchingDocs.set(doc); }*/ // But this behaves the same as MVInnerScorer and only then the tests will pass: if (!matchingDocs.get(doc)) { scores[doc] = score; matchingDocs.set(doc); } } } } }
private static DocSet createBigSet( List<LeafReaderContext> leaves, PostingsEnum[] postList, int maxDoc, int firstReader) throws IOException { long[] bits = new long[FixedBitSet.bits2words(maxDoc)]; int sz = 0; for (int i = firstReader; i < postList.length; i++) { PostingsEnum postings = postList[i]; if (postings == null) continue; LeafReaderContext ctx = leaves.get(i); Bits liveDocs = ctx.reader().getLiveDocs(); int base = ctx.docBase; for (; ; ) { int subId = postings.nextDoc(); if (subId == DocIdSetIterator.NO_MORE_DOCS) break; if (liveDocs != null && !liveDocs.get(subId)) continue; int globalId = subId + base; bits[globalId >> 6] |= (1L << globalId); sz++; } } BitDocSet docSet = new BitDocSet(new FixedBitSet(bits, maxDoc), sz); int smallSetSize = smallSetSize(maxDoc); if (sz < smallSetSize) { // make this optional? DocSet smallSet = toSmallSet(docSet); // assert equals(docSet, smallSet); return smallSet; } return docSet; }
/** * Adds terms and frequencies found in vector into the Map termFreqMap * * @param termFreqMap a Map of terms and their frequencies * @param vector List of terms and their frequencies for a doc/field * @param fieldName Optional field name of the terms for skip terms */ private void addTermFrequencies( Map<String, Int> termFreqMap, Terms vector, @Nullable String fieldName) throws IOException { final TermsEnum termsEnum = vector.iterator(); final CharsRefBuilder spare = new CharsRefBuilder(); BytesRef text; while ((text = termsEnum.next()) != null) { spare.copyUTF8Bytes(text); final String term = spare.toString(); if (isNoiseWord(term)) { continue; } if (isSkipTerm(fieldName, term)) { continue; } final PostingsEnum docs = termsEnum.postings(null, null); int freq = 0; while (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { freq += docs.freq(); } // increment frequency Int cnt = termFreqMap.get(term); if (cnt == null) { cnt = new Int(); termFreqMap.put(term, cnt); cnt.x = freq; } else { cnt.x += freq; } } }
/** * Add term frequencies for a single document to a frequency map. * * @param reader the index * @param doc doc id * @param luceneName the index field from which to use the term vector * @param freq where to add to the token frequencies */ public static void getFrequenciesFromTermVector( IndexReader reader, int doc, String luceneName, Map<String, Integer> freq) { try { org.apache.lucene.index.Terms terms = reader.getTermVector(doc, luceneName); if (terms == null) { throw new IllegalArgumentException("Field " + luceneName + " has no Terms"); } TermsEnum termsEnum = terms.iterator(); // Verzamel concordantiewoorden uit term vector PostingsEnum postingsEnum = null; while (termsEnum.next() != null) { postingsEnum = termsEnum.postings(null, postingsEnum, PostingsEnum.FREQS); String term = termsEnum.term().utf8ToString(); Integer n = freq.get(term); if (n == null) { n = 0; } while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { n += termsEnum.docFreq(); } freq.put(term, n); } } catch (Exception e) { throw ExUtil.wrapRuntimeException(e); } }
public TermInfo collect(String term) throws IOException { TermInfo info = new TermInfo(); BytesRef luceneTerm = new BytesRef(term.getBytes()); // this gives documents in which the term is found, but no offset information can be retrieved PostingsEnum postings = MultiFields.getTermDocsEnum(indexReader, ngramInfoFieldname, luceneTerm); // now go through each document int docId = postings.nextDoc(); while (docId != PostingsEnum.NO_MORE_DOCS) { // get the term vector for that document. TermsEnum it = indexReader.getTermVector(docId, ngramInfoFieldname).iterator(); // find the term of interest it.seekExact(luceneTerm); // get its posting info. this will contain offset info PostingsEnum postingsInDoc = it.postings(null, PostingsEnum.OFFSETS); postingsInDoc.nextDoc(); Document doc = indexReader.document(docId); String id = doc.get(idFieldname); JATEDocument jd = new JATEDocument(id); Set<int[]> offsets = new HashSet<>(); int totalFreq = postingsInDoc.freq(); for (int i = 0; i < totalFreq; i++) { postingsInDoc.nextPosition(); offsets.add(new int[] {postingsInDoc.startOffset(), postingsInDoc.endOffset()}); } info.getOffsets().put(jd, offsets); docId = postings.nextDoc(); } return info; }
protected OffsetsEnum createOffsetsEnumFromTokenStream(int doc, TokenStream tokenStream) throws IOException { // if there are automata (MTQ), we have to initialize the "fake" enum wrapping them. assert tokenStream != null; // TODO Opt: we sometimes evaluate the automata twice when this TS isn't the original; can we // avoid? PostingsEnum mtqPostingsEnum = MultiTermHighlighting.getDocsEnum(tokenStream, automata); assert mtqPostingsEnum instanceof Closeable; // FYI we propagate close() later. mtqPostingsEnum.advance(doc); return new OffsetsEnum(null, mtqPostingsEnum); }
public void testDocsAndPositionsEnumStart() throws Exception { Analyzer analyzer = new MockAnalyzer(random()); int numIters = atLeast(3); MemoryIndex memory = new MemoryIndex(true, false, random().nextInt(50) * 1024 * 1024); for (int i = 0; i < numIters; i++) { // check reuse memory.addField("foo", "bar", analyzer); LeafReader reader = (LeafReader) memory.createSearcher().getIndexReader(); TestUtil.checkReader(reader); assertEquals(1, reader.terms("foo").getSumTotalTermFreq()); PostingsEnum disi = reader.postings(new Term("foo", "bar"), PostingsEnum.ALL); int docid = disi.docID(); assertEquals(-1, docid); assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); assertEquals(0, disi.nextPosition()); assertEquals(0, disi.startOffset()); assertEquals(3, disi.endOffset()); // now reuse and check again TermsEnum te = reader.terms("foo").iterator(); assertTrue(te.seekExact(new BytesRef("bar"))); disi = te.postings(disi); docid = disi.docID(); assertEquals(-1, docid); assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); reader.close(); memory.reset(); } }
@Override public PostingsEnum postings( FieldInfo fieldInfo, BlockTermState termState, PostingsEnum reuse, int flags) throws IOException { SingleDocsEnum docsEnum; if (PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS)) { SinglePostingsEnum posEnum; if (reuse instanceof SinglePostingsEnum) { posEnum = (SinglePostingsEnum) reuse; } else { posEnum = new SinglePostingsEnum(); } IDVersionTermState _termState = (IDVersionTermState) termState; posEnum.reset(_termState.docID, _termState.idVersion); return posEnum; } if (reuse instanceof SingleDocsEnum) { docsEnum = (SingleDocsEnum) reuse; } else { docsEnum = new SingleDocsEnum(); } docsEnum.reset(((IDVersionTermState) termState).docID); return docsEnum; }
private void buildTerm( XContentBuilder builder, final CharsRefBuilder spare, Terms curTerms, TermsEnum termIter) throws IOException { // start term, optimized writing BytesRef term = termIter.next(); spare.copyUTF8Bytes(term); builder.startObject(spare.toString()); buildTermStatistics(builder, termIter); // finally write the term vectors PostingsEnum posEnum = termIter.postings(null, null, PostingsEnum.ALL); int termFreq = posEnum.freq(); builder.field(FieldStrings.TERM_FREQ, termFreq); initMemory(curTerms, termFreq); initValues(curTerms, posEnum, termFreq); buildValues(builder, curTerms, termFreq); builder.endObject(); }
public void testChangeGaps() throws Exception { // LUCENE-5324: check that it is possible to change the wrapper's gaps final int positionGap = random().nextInt(1000); final int offsetGap = random().nextInt(1000); final Analyzer delegate = new MockAnalyzer(random()); final Analyzer a = new DelegatingAnalyzerWrapper(delegate.getReuseStrategy()) { @Override protected Analyzer getWrappedAnalyzer(String fieldName) { return delegate; } @Override public int getPositionIncrementGap(String fieldName) { return positionGap; } @Override public int getOffsetGap(String fieldName) { return offsetGap; } }; final RandomIndexWriter writer = new RandomIndexWriter(random(), newDirectory(), a); final Document doc = new Document(); final FieldType ft = new FieldType(); ft.setIndexOptions(IndexOptions.DOCS); ft.setTokenized(true); ft.setStoreTermVectors(true); ft.setStoreTermVectorPositions(true); ft.setStoreTermVectorOffsets(true); doc.add(new Field("f", "a", ft)); doc.add(new Field("f", "a", ft)); writer.addDocument(doc); final LeafReader reader = getOnlySegmentReader(writer.getReader()); final Fields fields = reader.getTermVectors(0); final Terms terms = fields.terms("f"); final TermsEnum te = terms.iterator(); assertEquals(new BytesRef("a"), te.next()); final PostingsEnum dpe = te.postings(null, PostingsEnum.ALL); assertEquals(0, dpe.nextDoc()); assertEquals(2, dpe.freq()); assertEquals(0, dpe.nextPosition()); assertEquals(0, dpe.startOffset()); final int endOffset = dpe.endOffset(); assertEquals(1 + positionGap, dpe.nextPosition()); assertEquals(1 + endOffset + offsetGap, dpe.endOffset()); assertEquals(null, te.next()); reader.close(); writer.close(); writer.w.getDirectory().close(); }
private void initValues(Terms curTerms, PostingsEnum posEnum, int termFreq) throws IOException { for (int j = 0; j < termFreq; j++) { int nextPos = posEnum.nextPosition(); if (curTerms.hasPositions()) { currentPositions[j] = nextPos; } if (curTerms.hasOffsets()) { currentStartOffset[j] = posEnum.startOffset(); currentEndOffset[j] = posEnum.endOffset(); } if (curTerms.hasPayloads()) { BytesRef curPayload = posEnum.getPayload(); if (curPayload != null) { currentPayloads[j] = new BytesArray(curPayload.bytes, 0, curPayload.length); } else { currentPayloads[j] = null; } } } }
protected void fillDocsAndScores(FixedBitSet matchingDocs, TermsEnum termsEnum) throws IOException { BytesRef spare = new BytesRef(); PostingsEnum postingsEnum = null; for (int i = 0; i < terms.size(); i++) { if (termsEnum.seekExact(terms.get(ords[i], spare))) { postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE); float score = TermsIncludingScoreQuery.this.scores[ords[i]]; for (int doc = postingsEnum.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = postingsEnum.nextDoc()) { matchingDocs.set(doc); // In the case the same doc is also related to a another doc, a score might be // overwritten. I think this // can only happen in a many-to-many relation scores[doc] = score; } } } }
protected List<OffsetsEnum> createOffsetsEnumsFromReader(LeafReader atomicReader, int doc) throws IOException { // For strict positions, get a Map of term to Spans: // note: ScriptPhraseHelper.NONE does the right thing for these method calls final Map<BytesRef, Spans> strictPhrasesTermToSpans = strictPhrases.getTermToSpans(atomicReader, doc); // Usually simply wraps terms in a List; but if willRewrite() then can be expanded final List<BytesRef> sourceTerms = strictPhrases.expandTermsIfRewrite(terms, strictPhrasesTermToSpans); final List<OffsetsEnum> offsetsEnums = new ArrayList<>(sourceTerms.size() + 1); Terms termsIndex = atomicReader == null || sourceTerms.isEmpty() ? null : atomicReader.terms(field); if (termsIndex != null) { TermsEnum termsEnum = termsIndex.iterator(); // does not return null for (BytesRef term : sourceTerms) { if (!termsEnum.seekExact(term)) { continue; // term not found } PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS); if (postingsEnum == null) { // no offsets or positions available throw new IllegalArgumentException( "field '" + field + "' was indexed without offsets, cannot highlight"); } if (doc != postingsEnum.advance(doc)) { // now it's positioned, although may be exhausted continue; } postingsEnum = strictPhrases.filterPostings(term, postingsEnum, strictPhrasesTermToSpans.get(term)); if (postingsEnum == null) { continue; // completely filtered out } offsetsEnums.add(new OffsetsEnum(term, postingsEnum)); } } return offsetsEnums; }
@Override public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { if (PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS)) { SimpleTVPostings postings = current.getValue(); if (postings.positions != null || postings.startOffsets != null) { // TODO: reuse SimpleTVPostingsEnum e = new SimpleTVPostingsEnum(); e.reset( postings.positions, postings.startOffsets, postings.endOffsets, postings.payloads); return e; } } // TODO: reuse SimpleTVDocsEnum e = new SimpleTVDocsEnum(); e.reset( PostingsEnum.featureRequested(flags, PostingsEnum.FREQS) == false ? 1 : current.getValue().freq); return e; }
public void testDocsEnumStart() throws Exception { Analyzer analyzer = new MockAnalyzer(random()); MemoryIndex memory = new MemoryIndex(random().nextBoolean(), false, random().nextInt(50) * 1024 * 1024); memory.addField("foo", "bar", analyzer); LeafReader reader = (LeafReader) memory.createSearcher().getIndexReader(); TestUtil.checkReader(reader); PostingsEnum disi = TestUtil.docs(random(), reader, "foo", new BytesRef("bar"), null, PostingsEnum.NONE); int docid = disi.docID(); assertEquals(-1, docid); assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); // now reuse and check again TermsEnum te = reader.terms("foo").iterator(); assertTrue(te.seekExact(new BytesRef("bar"))); disi = te.postings(disi, PostingsEnum.NONE); docid = disi.docID(); assertEquals(-1, docid); assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); reader.close(); }
private static DocSet createSmallSet( List<LeafReaderContext> leaves, PostingsEnum[] postList, int maxPossible, int firstReader) throws IOException { int[] docs = new int[maxPossible]; int sz = 0; for (int i = firstReader; i < postList.length; i++) { PostingsEnum postings = postList[i]; if (postings == null) continue; LeafReaderContext ctx = leaves.get(i); Bits liveDocs = ctx.reader().getLiveDocs(); int base = ctx.docBase; for (; ; ) { int subId = postings.nextDoc(); if (subId == DocIdSetIterator.NO_MORE_DOCS) break; if (liveDocs != null && !liveDocs.get(subId)) continue; int globalId = subId + base; docs[sz++] = globalId; } } return new SortedIntDocSet(docs, sz); }
@Override public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { if (PostingsEnum.featureRequested(flags, DocsAndPositionsEnum.OLD_NULL_SEMANTICS)) { if (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) { // Positions were not indexed: return null; } } // System.out.println("BTR.docs this=" + this); decodeMetaData(); // System.out.println("BTR.docs: state.docFreq=" + state.docFreq); return postingsReader.postings(fieldInfo, state, reuse, flags); }
@Test public void testRandom() throws Exception { Directory directory = newDirectory(); final Random r = random(); final IndexWriterConfig iwc = LuceneTestCase.newIndexWriterConfig(r, new MockAnalyzer(r)) .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH) .setRAMBufferSizeMB( scaledRandomIntBetween(16, 64)); // we might index a lot - don't go crazy here RandomIndexWriter indexWriter = new RandomIndexWriter(r, directory, iwc); int numUniqueChildValues = scaledRandomIntBetween(100, 2000); String[] childValues = new String[numUniqueChildValues]; for (int i = 0; i < numUniqueChildValues; i++) { childValues[i] = Integer.toString(i); } IntOpenHashSet filteredOrDeletedDocs = new IntOpenHashSet(); int childDocId = 0; int numParentDocs = scaledRandomIntBetween(1, numUniqueChildValues); ObjectObjectOpenHashMap<String, NavigableSet<String>> childValueToParentIds = new ObjectObjectOpenHashMap<>(); for (int parentDocId = 0; parentDocId < numParentDocs; parentDocId++) { boolean markParentAsDeleted = rarely(); boolean filterMe = rarely(); String parent = Integer.toString(parentDocId); Document document = new Document(); document.add( new StringField(UidFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.YES)); document.add(new StringField(TypeFieldMapper.NAME, "parent", Field.Store.NO)); if (markParentAsDeleted) { filteredOrDeletedDocs.add(parentDocId); document.add(new StringField("delete", "me", Field.Store.NO)); } if (filterMe) { filteredOrDeletedDocs.add(parentDocId); document.add(new StringField("filter", "me", Field.Store.NO)); } indexWriter.addDocument(document); final int numChildDocs = scaledRandomIntBetween(0, 100); for (int i = 0; i < numChildDocs; i++) { boolean markChildAsDeleted = rarely(); String childValue = childValues[random().nextInt(childValues.length)]; document = new Document(); document.add( new StringField( UidFieldMapper.NAME, Uid.createUid("child", Integer.toString(childDocId++)), Field.Store.NO)); document.add(new StringField(TypeFieldMapper.NAME, "child", Field.Store.NO)); document.add( new StringField( ParentFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.NO)); document.add(new StringField("field1", childValue, Field.Store.NO)); if (markChildAsDeleted) { document.add(new StringField("delete", "me", Field.Store.NO)); } indexWriter.addDocument(document); if (!markChildAsDeleted) { NavigableSet<String> parentIds; if (childValueToParentIds.containsKey(childValue)) { parentIds = childValueToParentIds.lget(); } else { childValueToParentIds.put(childValue, parentIds = new TreeSet<>()); } if (!markParentAsDeleted && !filterMe) { parentIds.add(parent); } } } } // Delete docs that are marked to be deleted. indexWriter.deleteDocuments(new Term("delete", "me")); indexWriter.commit(); IndexReader indexReader = DirectoryReader.open(directory); IndexSearcher searcher = new IndexSearcher(indexReader); Engine.Searcher engineSearcher = new Engine.Searcher(ChildrenConstantScoreQueryTests.class.getSimpleName(), searcher); ((TestSearchContext) SearchContext.current()) .setSearcher(new ContextIndexSearcher(SearchContext.current(), engineSearcher)); int max = numUniqueChildValues / 4; for (int i = 0; i < max; i++) { // Simulate a parent update if (random().nextBoolean()) { final int numberOfUpdatableParents = numParentDocs - filteredOrDeletedDocs.size(); int numberOfUpdates = scaledRandomIntBetween(0, numberOfUpdatableParents); for (int j = 0; j < numberOfUpdates; j++) { int parentId; do { parentId = random().nextInt(numParentDocs); } while (filteredOrDeletedDocs.contains(parentId)); String parentUid = Uid.createUid("parent", Integer.toString(parentId)); indexWriter.deleteDocuments(new Term(UidFieldMapper.NAME, parentUid)); Document document = new Document(); document.add(new StringField(UidFieldMapper.NAME, parentUid, Field.Store.YES)); document.add(new StringField(TypeFieldMapper.NAME, "parent", Field.Store.NO)); indexWriter.addDocument(document); } indexReader.close(); indexReader = DirectoryReader.open(indexWriter.w, true); searcher = new IndexSearcher(indexReader); engineSearcher = new Engine.Searcher(ChildrenConstantScoreQueryTests.class.getSimpleName(), searcher); ((TestSearchContext) SearchContext.current()) .setSearcher(new ContextIndexSearcher(SearchContext.current(), engineSearcher)); } String childValue = childValues[random().nextInt(numUniqueChildValues)]; int shortCircuitParentDocSet = random().nextInt(numParentDocs); QueryBuilder queryBuilder; if (random().nextBoolean()) { queryBuilder = hasChildQuery("child", termQuery("field1", childValue)) .setShortCircuitCutoff(shortCircuitParentDocSet); } else { queryBuilder = constantScoreQuery( hasChildFilter("child", termQuery("field1", childValue)) .setShortCircuitCutoff(shortCircuitParentDocSet)); } // Using a FQ, will invoke / test the Scorer#advance(..) and also let the Weight#scorer not // get live docs as acceptedDocs queryBuilder = filteredQuery(queryBuilder, notFilter(termFilter("filter", "me"))); Query query = parseQuery(queryBuilder); BitSetCollector collector = new BitSetCollector(indexReader.maxDoc()); searcher.search(query, collector); FixedBitSet actualResult = collector.getResult(); FixedBitSet expectedResult = new FixedBitSet(indexReader.maxDoc()); if (childValueToParentIds.containsKey(childValue)) { LeafReader slowLeafReader = SlowCompositeReaderWrapper.wrap(indexReader); Terms terms = slowLeafReader.terms(UidFieldMapper.NAME); if (terms != null) { NavigableSet<String> parentIds = childValueToParentIds.lget(); TermsEnum termsEnum = terms.iterator(null); PostingsEnum docsEnum = null; for (String id : parentIds) { TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(Uid.createUidAsBytes("parent", id)); if (seekStatus == TermsEnum.SeekStatus.FOUND) { docsEnum = termsEnum.postings(slowLeafReader.getLiveDocs(), docsEnum, PostingsEnum.NONE); expectedResult.set(docsEnum.nextDoc()); } else if (seekStatus == TermsEnum.SeekStatus.END) { break; } } } } assertBitSet(actualResult, expectedResult, searcher); } indexWriter.close(); indexReader.close(); directory.close(); }
@Override public void collectLeaf(PostingsEnum postings, int position, Term term) throws IOException { if (postings.getPayload() != null) { payloads.add(BytesRef.deepCopyOf(postings.getPayload())); } }
private void duellReaders(CompositeReader other, LeafReader memIndexReader) throws IOException { Fields memFields = memIndexReader.fields(); for (String field : MultiFields.getFields(other)) { Terms memTerms = memFields.terms(field); Terms iwTerms = memIndexReader.terms(field); if (iwTerms == null) { assertNull(memTerms); } else { NumericDocValues normValues = MultiDocValues.getNormValues(other, field); NumericDocValues memNormValues = memIndexReader.getNormValues(field); if (normValues != null) { // mem idx always computes norms on the fly assertNotNull(memNormValues); assertEquals(normValues.get(0), memNormValues.get(0)); } assertNotNull(memTerms); assertEquals(iwTerms.getDocCount(), memTerms.getDocCount()); assertEquals(iwTerms.getSumDocFreq(), memTerms.getSumDocFreq()); assertEquals(iwTerms.getSumTotalTermFreq(), memTerms.getSumTotalTermFreq()); TermsEnum iwTermsIter = iwTerms.iterator(); TermsEnum memTermsIter = memTerms.iterator(); if (iwTerms.hasPositions()) { final boolean offsets = iwTerms.hasOffsets() && memTerms.hasOffsets(); while (iwTermsIter.next() != null) { assertNotNull(memTermsIter.next()); assertEquals(iwTermsIter.term(), memTermsIter.term()); PostingsEnum iwDocsAndPos = iwTermsIter.postings(null, PostingsEnum.ALL); PostingsEnum memDocsAndPos = memTermsIter.postings(null, PostingsEnum.ALL); while (iwDocsAndPos.nextDoc() != PostingsEnum.NO_MORE_DOCS) { assertEquals(iwDocsAndPos.docID(), memDocsAndPos.nextDoc()); assertEquals(iwDocsAndPos.freq(), memDocsAndPos.freq()); for (int i = 0; i < iwDocsAndPos.freq(); i++) { assertEquals( "term: " + iwTermsIter.term().utf8ToString(), iwDocsAndPos.nextPosition(), memDocsAndPos.nextPosition()); if (offsets) { assertEquals(iwDocsAndPos.startOffset(), memDocsAndPos.startOffset()); assertEquals(iwDocsAndPos.endOffset(), memDocsAndPos.endOffset()); } if (iwTerms.hasPayloads()) { assertEquals(iwDocsAndPos.getPayload(), memDocsAndPos.getPayload()); } } } } } else { while (iwTermsIter.next() != null) { assertEquals(iwTermsIter.term(), memTermsIter.term()); PostingsEnum iwDocsAndPos = iwTermsIter.postings(null); PostingsEnum memDocsAndPos = memTermsIter.postings(null); while (iwDocsAndPos.nextDoc() != PostingsEnum.NO_MORE_DOCS) { assertEquals(iwDocsAndPos.docID(), memDocsAndPos.nextDoc()); assertEquals(iwDocsAndPos.freq(), memDocsAndPos.freq()); } } } } } }
/** * Returns a list of terms in the specified field along with the corresponding count of documents * in the set that match that constraint. This method uses the FilterCache to get the intersection * count between <code>docs</code> and the DocSet for each term in the filter. * * @see FacetParams#FACET_LIMIT * @see FacetParams#FACET_ZEROS * @see FacetParams#FACET_MISSING */ public NamedList<Integer> getFacetTermEnumCounts( SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing, String sort, String prefix, String contains, boolean ignoreCase, SolrParams params) throws IOException { /* :TODO: potential optimization... * cache the Terms with the highest docFreq and try them first * don't enum if we get our max from them */ // Minimum term docFreq in order to use the filterCache for that term. int minDfFilterCache = global.getFieldInt(field, FacetParams.FACET_ENUM_CACHE_MINDF, 0); // make sure we have a set that is fast for random access, if we will use it for that DocSet fastForRandomSet = docs; if (minDfFilterCache > 0 && docs instanceof SortedIntDocSet) { SortedIntDocSet sset = (SortedIntDocSet) docs; fastForRandomSet = new HashDocSet(sset.getDocs(), 0, sset.size()); } IndexSchema schema = searcher.getSchema(); LeafReader r = searcher.getLeafReader(); FieldType ft = schema.getFieldType(field); boolean sortByCount = sort.equals("count") || sort.equals("true"); final int maxsize = limit >= 0 ? offset + limit : Integer.MAX_VALUE - 1; final BoundedTreeSet<CountPair<BytesRef, Integer>> queue = sortByCount ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(maxsize) : null; final NamedList<Integer> res = new NamedList<>(); int min = mincount - 1; // the smallest value in the top 'N' values int off = offset; int lim = limit >= 0 ? limit : Integer.MAX_VALUE; BytesRef prefixTermBytes = null; if (prefix != null) { String indexedPrefix = ft.toInternal(prefix); prefixTermBytes = new BytesRef(indexedPrefix); } Fields fields = r.fields(); Terms terms = fields == null ? null : fields.terms(field); TermsEnum termsEnum = null; SolrIndexSearcher.DocsEnumState deState = null; BytesRef term = null; if (terms != null) { termsEnum = terms.iterator(); // TODO: OPT: if seek(ord) is supported for this termsEnum, then we could use it for // facet.offset when sorting by index order. if (prefixTermBytes != null) { if (termsEnum.seekCeil(prefixTermBytes) == TermsEnum.SeekStatus.END) { termsEnum = null; } else { term = termsEnum.term(); } } else { // position termsEnum on first term term = termsEnum.next(); } } PostingsEnum postingsEnum = null; CharsRefBuilder charsRef = new CharsRefBuilder(); if (docs.size() >= mincount) { while (term != null) { if (prefixTermBytes != null && !StringHelper.startsWith(term, prefixTermBytes)) break; if (contains == null || contains(term.utf8ToString(), contains, ignoreCase)) { int df = termsEnum.docFreq(); // If we are sorting, we can use df>min (rather than >=) since we // are going in index order. For certain term distributions this can // make a large difference (for example, many terms with df=1). if (df > 0 && df > min) { int c; if (df >= minDfFilterCache) { // use the filter cache if (deState == null) { deState = new SolrIndexSearcher.DocsEnumState(); deState.fieldName = field; deState.liveDocs = r.getLiveDocs(); deState.termsEnum = termsEnum; deState.postingsEnum = postingsEnum; } c = searcher.numDocs(docs, deState); postingsEnum = deState.postingsEnum; } else { // iterate over TermDocs to calculate the intersection // TODO: specialize when base docset is a bitset or hash set (skipDocs)? or does it // matter for this? // TODO: do this per-segment for better efficiency (MultiDocsEnum just uses base class // impl) // TODO: would passing deleted docs lead to better efficiency over checking the // fastForRandomSet? postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE); c = 0; if (postingsEnum instanceof MultiPostingsEnum) { MultiPostingsEnum.EnumWithSlice[] subs = ((MultiPostingsEnum) postingsEnum).getSubs(); int numSubs = ((MultiPostingsEnum) postingsEnum).getNumSubs(); for (int subindex = 0; subindex < numSubs; subindex++) { MultiPostingsEnum.EnumWithSlice sub = subs[subindex]; if (sub.postingsEnum == null) continue; int base = sub.slice.start; int docid; while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { if (fastForRandomSet.exists(docid + base)) c++; } } } else { int docid; while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { if (fastForRandomSet.exists(docid)) c++; } } } if (sortByCount) { if (c > min) { BytesRef termCopy = BytesRef.deepCopyOf(term); queue.add(new CountPair<>(termCopy, c)); if (queue.size() >= maxsize) min = queue.last().val; } } else { if (c >= mincount && --off < 0) { if (--lim < 0) break; ft.indexedToReadable(term, charsRef); res.add(charsRef.toString(), c); } } } } term = termsEnum.next(); } } if (sortByCount) { for (CountPair<BytesRef, Integer> p : queue) { if (--off >= 0) continue; if (--lim < 0) break; ft.indexedToReadable(p.key, charsRef); res.add(charsRef.toString(), p.val); } } if (missing) { res.add(null, getFieldMissingCount(searcher, docs, field)); } return res; }
protected void compareTermVectors(Terms terms, Terms memTerms, String field_name) throws IOException { TermsEnum termEnum = terms.iterator(); TermsEnum memTermEnum = memTerms.iterator(); while (termEnum.next() != null) { assertNotNull(memTermEnum.next()); assertThat(termEnum.totalTermFreq(), equalTo(memTermEnum.totalTermFreq())); PostingsEnum docsPosEnum = termEnum.postings(null, PostingsEnum.POSITIONS); PostingsEnum memDocsPosEnum = memTermEnum.postings(null, PostingsEnum.POSITIONS); String currentTerm = termEnum.term().utf8ToString(); assertThat( "Token mismatch for field: " + field_name, currentTerm, equalTo(memTermEnum.term().utf8ToString())); docsPosEnum.nextDoc(); memDocsPosEnum.nextDoc(); int freq = docsPosEnum.freq(); assertThat(freq, equalTo(memDocsPosEnum.freq())); for (int i = 0; i < freq; i++) { String failDesc = " (field:" + field_name + " term:" + currentTerm + ")"; int memPos = memDocsPosEnum.nextPosition(); int pos = docsPosEnum.nextPosition(); assertThat("Position test failed" + failDesc, memPos, equalTo(pos)); assertThat( "Start offset test failed" + failDesc, memDocsPosEnum.startOffset(), equalTo(docsPosEnum.startOffset())); assertThat( "End offset test failed" + failDesc, memDocsPosEnum.endOffset(), equalTo(docsPosEnum.endOffset())); assertThat( "Missing payload test failed" + failDesc, docsPosEnum.getPayload(), equalTo(docsPosEnum.getPayload())); } } assertNull("Still some tokens not processed", memTermEnum.next()); }
public void testSetPosition() throws Exception { Analyzer analyzer = new Analyzer() { @Override public TokenStreamComponents createComponents(String fieldName) { return new TokenStreamComponents( new Tokenizer() { // TODO: use CannedTokenStream private final String[] TOKENS = {"1", "2", "3", "4", "5"}; private final int[] INCREMENTS = {1, 2, 1, 0, 1}; private int i = 0; PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); @Override public boolean incrementToken() { if (i == TOKENS.length) return false; clearAttributes(); termAtt.append(TOKENS[i]); offsetAtt.setOffset(i, i); posIncrAtt.setPositionIncrement(INCREMENTS[i]); i++; return true; } @Override public void reset() throws IOException { super.reset(); this.i = 0; } }); } }; Directory store = newDirectory(); RandomIndexWriter writer = new RandomIndexWriter(random(), store, analyzer); Document d = new Document(); d.add(newTextField("field", "bogus", Field.Store.YES)); writer.addDocument(d); IndexReader reader = writer.getReader(); writer.close(); IndexSearcher searcher = newSearcher(reader); PostingsEnum pos = MultiFields.getTermPositionsEnum(searcher.getIndexReader(), "field", new BytesRef("1")); pos.nextDoc(); // first token should be at position 0 assertEquals(0, pos.nextPosition()); pos = MultiFields.getTermPositionsEnum(searcher.getIndexReader(), "field", new BytesRef("2")); pos.nextDoc(); // second token should be at position 2 assertEquals(2, pos.nextPosition()); PhraseQuery q; ScoreDoc[] hits; q = new PhraseQuery("field", "1", "2"); hits = searcher.search(q, 1000).scoreDocs; assertEquals(0, hits.length); // same as previous, using the builder with implicit positions PhraseQuery.Builder builder = new PhraseQuery.Builder(); builder.add(new Term("field", "1")); builder.add(new Term("field", "2")); q = builder.build(); hits = searcher.search(q, 1000).scoreDocs; assertEquals(0, hits.length); // same as previous, just specify positions explicitely. builder = new PhraseQuery.Builder(); builder.add(new Term("field", "1"), 0); builder.add(new Term("field", "2"), 1); q = builder.build(); hits = searcher.search(q, 1000).scoreDocs; assertEquals(0, hits.length); // specifying correct positions should find the phrase. builder = new PhraseQuery.Builder(); builder.add(new Term("field", "1"), 0); builder.add(new Term("field", "2"), 2); q = builder.build(); hits = searcher.search(q, 1000).scoreDocs; assertEquals(1, hits.length); q = new PhraseQuery("field", "2", "3"); hits = searcher.search(q, 1000).scoreDocs; assertEquals(1, hits.length); q = new PhraseQuery("field", "3", "4"); hits = searcher.search(q, 1000).scoreDocs; assertEquals(0, hits.length); // phrase query would find it when correct positions are specified. builder = new PhraseQuery.Builder(); builder.add(new Term("field", "3"), 0); builder.add(new Term("field", "4"), 0); q = builder.build(); hits = searcher.search(q, 1000).scoreDocs; assertEquals(1, hits.length); // phrase query should fail for non existing searched term // even if there exist another searched terms in the same searched position. builder = new PhraseQuery.Builder(); builder.add(new Term("field", "3"), 0); builder.add(new Term("field", "9"), 0); q = builder.build(); hits = searcher.search(q, 1000).scoreDocs; assertEquals(0, hits.length); // multi-phrase query should succed for non existing searched term // because there exist another searched terms in the same searched position. MultiPhraseQuery mq = new MultiPhraseQuery(); mq.add(new Term[] {new Term("field", "3"), new Term("field", "9")}, 0); hits = searcher.search(mq, 1000).scoreDocs; assertEquals(1, hits.length); q = new PhraseQuery("field", "2", "4"); hits = searcher.search(q, 1000).scoreDocs; assertEquals(1, hits.length); q = new PhraseQuery("field", "3", "5"); hits = searcher.search(q, 1000).scoreDocs; assertEquals(1, hits.length); q = new PhraseQuery("field", "4", "5"); hits = searcher.search(q, 1000).scoreDocs; assertEquals(1, hits.length); q = new PhraseQuery("field", "2", "5"); hits = searcher.search(q, 1000).scoreDocs; assertEquals(0, hits.length); reader.close(); store.close(); }
/** * Get all words between the specified start and end positions from the term vector. * * <p>NOTE: this may return an array of less than the size requested, if the document ends before * the requested end position. * * @param reader the index * @param doc doc id * @param luceneName the index field from which to use the term vector * @param start start position (first word we want to request) * @param end end position (last word we want to request) * @param partialOk is it okay if we're missing words in the middle, or do we need them all? * (debug) * @return the words found, in order */ public static String[] getWordsFromTermVector( IndexReader reader, int doc, String luceneName, int start, int end, boolean partialOk) { // Retrieve the term position vector of the contents of this document. // NOTE: might be faster to retrieve all term vectors at once try { org.apache.lucene.index.Terms terms = reader.getTermVector(doc, luceneName); if (terms == null) { throw new IllegalArgumentException("Field " + luceneName + " has no Terms"); } if (!terms.hasPositions()) throw new IllegalArgumentException( "Field " + luceneName + " has no character postion information"); // String[] docTerms = new String[(int) terms.size()]; // final List<BytesRef> termsList = new ArrayList<BytesRef>(); TermsEnum termsEnum = terms.iterator(); // Verzamel concordantiewoorden uit term vector PostingsEnum docPosEnum = null; int numFound = 0; String[] concordanceWords = new String[end - start + 1]; while (termsEnum.next() != null) { docPosEnum = termsEnum.postings(null, docPosEnum, PostingsEnum.POSITIONS); while (docPosEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { // NOTE: .docId() will always return 0 in this case // if (docPosEnum.docID() != doc) // throw new RuntimeException("Wrong doc id: " + docPosEnum.docID() + " (expected " + doc // + ")"); for (int i = 0; i < docPosEnum.freq(); i++) { int position = docPosEnum.nextPosition(); if (position == -1) throw new RuntimeException( "Unexpected missing position (i=" + i + ", docPosEnum.freq() = " + docPosEnum.freq() + ")"); if (position >= start && position <= end) { if (concordanceWords[position - start] == null) concordanceWords[position - start] = termsEnum.term().utf8ToString(); else concordanceWords[position - start] += "|" + termsEnum.term().utf8ToString(); numFound++; } } if (numFound == concordanceWords.length) return concordanceWords; } } if (numFound < concordanceWords.length && !partialOk) { // If we simply ran into the end of the document, that's okay; // but if words are missing in the middle, that's not. String[] partial = new String[numFound]; for (int i = 0; i < numFound; i++) { partial[i] = concordanceWords[i]; if (partial[i] == null) { throw new RuntimeException( "Not all words found (" + numFound + " out of " + concordanceWords.length + "); missing words in the middle of concordance!"); } } return partial; } return concordanceWords; } catch (Exception e) { throw ExUtil.wrapRuntimeException(e); } }
private void checkAllInfo( int numDocs, String[] values, int[] freq, int[][] pos, int[][] startOffset, int[][] endOffset, int i) throws IOException { TermVectorsRequestBuilder resp = client() .prepareTermVectors("test", "type1", Integer.toString(i)) .setPayloads(true) .setOffsets(true) .setPositions(true) .setFieldStatistics(true) .setTermStatistics(true) .setSelectedFields(); assertThat(resp.request().fieldStatistics(), equalTo(true)); TermVectorsResponse response = resp.execute().actionGet(); assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true)); Fields fields = response.getFields(); assertThat(fields.size(), equalTo(1)); Terms terms = fields.terms("field"); assertThat(terms.size(), equalTo(8l)); assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) (9 * numDocs))); assertThat(terms.getDocCount(), Matchers.equalTo(numDocs)); assertThat(terms.getSumDocFreq(), equalTo((long) numDocs * values.length)); TermsEnum iterator = terms.iterator(); for (int j = 0; j < values.length; j++) { String string = values[j]; BytesRef next = iterator.next(); assertThat(next, Matchers.notNullValue()); assertThat("expected " + string, string, equalTo(next.utf8ToString())); assertThat(next, Matchers.notNullValue()); if (string.equals("the")) { assertThat( "expected ttf of " + string, numDocs * 2, equalTo((int) iterator.totalTermFreq())); } else { assertThat("expected ttf of " + string, numDocs, equalTo((int) iterator.totalTermFreq())); } PostingsEnum docsAndPositions = iterator.postings(null, null, PostingsEnum.ALL); assertThat(docsAndPositions.nextDoc(), equalTo(0)); assertThat(freq[j], equalTo(docsAndPositions.freq())); assertThat(iterator.docFreq(), equalTo(numDocs)); int[] termPos = pos[j]; int[] termStartOffset = startOffset[j]; int[] termEndOffset = endOffset[j]; assertThat(termPos.length, equalTo(freq[j])); assertThat(termStartOffset.length, equalTo(freq[j])); assertThat(termEndOffset.length, equalTo(freq[j])); for (int k = 0; k < freq[j]; k++) { int nextPosition = docsAndPositions.nextPosition(); assertThat("term: " + string, nextPosition, equalTo(termPos[k])); assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k])); assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k])); assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word"))); } } assertThat(iterator.next(), Matchers.nullValue()); XContentBuilder xBuilder = XContentFactory.jsonBuilder(); xBuilder.startObject(); response.toXContent(xBuilder, ToXContent.EMPTY_PARAMS); xBuilder.endObject(); BytesStream bytesStream = xBuilder.bytesStream(); String utf8 = bytesStream.bytes().toUtf8().replaceFirst("\"took\":\\d+,", ""); ; String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\"" + i + "\",\"_version\":1,\"found\":true,\"term_vectors\":{\"field\":{\"field_statistics\":{\"sum_doc_freq\":120,\"doc_count\":15,\"sum_ttf\":135},\"terms\":{\"brown\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"doc_freq\":15,\"ttf\":30,\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}"; assertThat(utf8, equalTo(expectedString)); }
/** Call this only once (if you subclass!) */ protected void uninvert(final LeafReader reader, Bits liveDocs, final BytesRef termPrefix) throws IOException { final FieldInfo info = reader.getFieldInfos().fieldInfo(field); if (checkForDocValues && info != null && info.getDocValuesType() != DocValuesType.NONE) { throw new IllegalStateException( "Type mismatch: " + field + " was indexed as " + info.getDocValuesType()); } // System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix); final long startTime = System.nanoTime(); prefix = termPrefix == null ? null : BytesRef.deepCopyOf(termPrefix); final int maxDoc = reader.maxDoc(); final int[] index = new int [maxDoc]; // immediate term numbers, or the index into the byte[] representing the last // number final int[] lastTerm = new int[maxDoc]; // last term we saw for this document final byte[][] bytes = new byte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts) final Terms terms = reader.terms(field); if (terms == null) { // No terms return; } final TermsEnum te = terms.iterator(); final BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef(); // System.out.println("seekStart=" + seekStart.utf8ToString()); if (te.seekCeil(seekStart) == TermsEnum.SeekStatus.END) { // No terms match return; } // For our "term index wrapper" final List<BytesRef> indexedTerms = new ArrayList<>(); final PagedBytes indexedTermsBytes = new PagedBytes(15); // we need a minimum of 9 bytes, but round up to 12 since the space would // be wasted with most allocators anyway. byte[] tempArr = new byte[12]; // // enumerate all terms, and build an intermediate form of the un-inverted field. // // During this intermediate form, every document has a (potential) byte[] // and the int[maxDoc()] array either contains the termNumber list directly // or the *end* offset of the termNumber list in its byte array (for faster // appending and faster creation of the final form). // // idea... if things are too large while building, we could do a range of docs // at a time (but it would be a fair amount slower to build) // could also do ranges in parallel to take advantage of multiple CPUs // OPTIONAL: remap the largest df terms to the lowest 128 (single byte) // values. This requires going over the field first to find the most // frequent terms ahead of time. int termNum = 0; postingsEnum = null; // Loop begins with te positioned to first term (we call // seek above): for (; ; ) { final BytesRef t = te.term(); if (t == null || (termPrefix != null && !StringHelper.startsWith(t, termPrefix))) { break; } // System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum); visitTerm(te, termNum); if ((termNum & indexIntervalMask) == 0) { // Index this term sizeOfIndexedStrings += t.length; BytesRef indexedTerm = new BytesRef(); indexedTermsBytes.copy(t, indexedTerm); // TODO: really should 1) strip off useless suffix, // and 2) use FST not array/PagedBytes indexedTerms.add(indexedTerm); } final int df = te.docFreq(); if (df <= maxTermDocFreq) { postingsEnum = te.postings(postingsEnum, PostingsEnum.NONE); // dF, but takes deletions into account int actualDF = 0; for (; ; ) { int doc = postingsEnum.nextDoc(); if (doc == DocIdSetIterator.NO_MORE_DOCS) { break; } // System.out.println(" chunk=" + chunk + " docs"); actualDF++; termInstances++; // System.out.println(" docID=" + doc); // add TNUM_OFFSET to the term number to make room for special reserved values: // 0 (end term) and 1 (index into byte array follows) int delta = termNum - lastTerm[doc] + TNUM_OFFSET; lastTerm[doc] = termNum; int val = index[doc]; if ((val & 0xff) == 1) { // index into byte array (actually the end of // the doc-specific byte[] when building) int pos = val >>> 8; int ilen = vIntSize(delta); byte[] arr = bytes[doc]; int newend = pos + ilen; if (newend > arr.length) { // We avoid a doubling strategy to lower memory usage. // this faceting method isn't for docs with many terms. // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit // boundary. // TODO: figure out what array lengths we can round up to w/o actually using more // memory // (how much space does a byte[] take up? Is data preceded by a 32 bit length only? // It should be safe to round up to the nearest 32 bits in any case. int newLen = (newend + 3) & 0xfffffffc; // 4 byte alignment byte[] newarr = new byte[newLen]; System.arraycopy(arr, 0, newarr, 0, pos); arr = newarr; bytes[doc] = newarr; } pos = writeInt(delta, arr, pos); index[doc] = (pos << 8) | 1; // update pointer to end index in byte[] } else { // OK, this int has data in it... find the end (a zero starting byte - not // part of another number, hence not following a byte with the high bit set). int ipos; if (val == 0) { ipos = 0; } else if ((val & 0x0000ff80) == 0) { ipos = 1; } else if ((val & 0x00ff8000) == 0) { ipos = 2; } else if ((val & 0xff800000) == 0) { ipos = 3; } else { ipos = 4; } // System.out.println(" ipos=" + ipos); int endPos = writeInt(delta, tempArr, ipos); // System.out.println(" endpos=" + endPos); if (endPos <= 4) { // System.out.println(" fits!"); // value will fit in the integer... move bytes back for (int j = ipos; j < endPos; j++) { val |= (tempArr[j] & 0xff) << (j << 3); } index[doc] = val; } else { // value won't fit... move integer into byte[] for (int j = 0; j < ipos; j++) { tempArr[j] = (byte) val; val >>>= 8; } // point at the end index in the byte[] index[doc] = (endPos << 8) | 1; bytes[doc] = tempArr; tempArr = new byte[12]; } } } setActualDocFreq(termNum, actualDF); } termNum++; if (te.next() == null) { break; } } numTermsInField = termNum; long midPoint = System.nanoTime(); if (termInstances == 0) { // we didn't invert anything // lower memory consumption. tnums = null; } else { this.index = index; // // transform intermediate form into the final form, building a single byte[] // at a time, and releasing the intermediate byte[]s as we go to avoid // increasing the memory footprint. // for (int pass = 0; pass < 256; pass++) { byte[] target = tnums[pass]; int pos = 0; // end in target; if (target != null) { pos = target.length; } else { target = new byte[4096]; } // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx // where pp is the pass (which array we are building), and xx is all values. // each pass shares the same byte[] for termNumber lists. for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24)) { int lim = Math.min(docbase + (1 << 16), maxDoc); for (int doc = docbase; doc < lim; doc++) { // System.out.println(" pass="******" process docID=" + doc); int val = index[doc]; if ((val & 0xff) == 1) { int len = val >>> 8; // System.out.println(" ptr pos=" + pos); index[doc] = (pos << 8) | 1; // change index to point to start of array if ((pos & 0xff000000) != 0) { // we only have 24 bits for the array index throw new IllegalStateException( "Too many values for UnInvertedField faceting on field " + field); } byte[] arr = bytes[doc]; /* for(byte b : arr) { //System.out.println(" b=" + Integer.toHexString((int) b)); } */ bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM if (target.length <= pos + len) { int newlen = target.length; /** * * we don't have to worry about the array getting too large since the "pos" param * will overflow first (only 24 bits available) if ((newlen<<1) <= 0) { // * overflow... newlen = Integer.MAX_VALUE; if (newlen <= pos + len) { throw new * SolrException(400,"Too many terms to uninvert field!"); } } else { while (newlen * <= pos + len) newlen<<=1; // doubling strategy } ** */ while (newlen <= pos + len) newlen <<= 1; // doubling strategy byte[] newtarget = new byte[newlen]; System.arraycopy(target, 0, newtarget, 0, pos); target = newtarget; } System.arraycopy(arr, 0, target, pos, len); pos += len + 1; // skip single byte at end and leave it 0 for terminator } } } // shrink array if (pos < target.length) { byte[] newtarget = new byte[pos]; System.arraycopy(target, 0, newtarget, 0, pos); target = newtarget; } tnums[pass] = target; if ((pass << 16) > maxDoc) break; } } indexedTermsArray = indexedTerms.toArray(new BytesRef[indexedTerms.size()]); long endTime = System.nanoTime(); total_time = (int) TimeUnit.MILLISECONDS.convert(endTime - startTime, TimeUnit.NANOSECONDS); phase1_time = (int) TimeUnit.MILLISECONDS.convert(midPoint - startTime, TimeUnit.NANOSECONDS); }
public void testPayloadsPos0() throws Exception { Directory dir = newDirectory(); RandomIndexWriter writer = new RandomIndexWriter(random(), dir, new MockPayloadAnalyzer()); Document doc = new Document(); doc.add(new TextField("content", new StringReader("a a b c d e a f g h i j a b k k"))); writer.addDocument(doc); final IndexReader readerFromWriter = writer.getReader(); LeafReader r = SlowCompositeReaderWrapper.wrap(readerFromWriter); PostingsEnum tp = r.postings(new Term("content", "a"), PostingsEnum.ALL); int count = 0; assertTrue(tp.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); // "a" occurs 4 times assertEquals(4, tp.freq()); assertEquals(0, tp.nextPosition()); assertEquals(1, tp.nextPosition()); assertEquals(3, tp.nextPosition()); assertEquals(6, tp.nextPosition()); // only one doc has "a" assertEquals(DocIdSetIterator.NO_MORE_DOCS, tp.nextDoc()); IndexSearcher is = newSearcher(readerFromWriter); SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a")); SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k")); SpanQuery[] sqs = {stq1, stq2}; SpanNearQuery snq = new SpanNearQuery(sqs, 30, false); count = 0; boolean sawZero = false; if (VERBOSE) { System.out.println("\ngetPayloadSpans test"); } PayloadSpanCollector collector = new PayloadSpanCollector(); Spans pspans = MultiSpansWrapper.wrap(is.getIndexReader(), snq, SpanWeight.Postings.PAYLOADS); while (pspans.nextDoc() != Spans.NO_MORE_DOCS) { while (pspans.nextStartPosition() != Spans.NO_MORE_POSITIONS) { if (VERBOSE) { System.out.println( "doc " + pspans.docID() + ": span " + pspans.startPosition() + " to " + pspans.endPosition()); } collector.reset(); pspans.collect(collector); sawZero |= pspans.startPosition() == 0; for (BytesRef payload : collector.payloads) { count++; if (VERBOSE) { System.out.println(" payload: " + Term.toString(payload)); } } } } assertTrue(sawZero); assertEquals(8, count); // System.out.println("\ngetSpans test"); Spans spans = MultiSpansWrapper.wrap(is.getIndexReader(), snq); count = 0; sawZero = false; while (spans.nextDoc() != Spans.NO_MORE_DOCS) { while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) { count++; sawZero |= spans.startPosition() == 0; // System.out.println(spans.doc() + " - " + spans.start() + " - " + // spans.end()); } } assertEquals(4, count); assertTrue(sawZero); writer.close(); is.getIndexReader().close(); dir.close(); }