@Test public void testBytes() throws Exception { List<Integer> docs = Arrays.asList(1, 5, 7); ObjectOpenHashSet<BytesRef> hTerms = new ObjectOpenHashSet<BytesRef>(); List<BytesRef> cTerms = new ArrayList<BytesRef>(docs.size()); for (int i = 0; i < docs.size(); i++) { BytesRef term = new BytesRef("str" + docs.get(i)); hTerms.add(term); cTerms.add(term); } FieldDataTermsFilter hFilter = FieldDataTermsFilter.newBytes(getFieldData(strMapper), hTerms); int size = reader.maxDoc(); FixedBitSet result = new FixedBitSet(size); result.clear(0, size); assertThat(result.cardinality(), equalTo(0)); result.or(hFilter.getDocIdSet(reader.getContext(), reader.getLiveDocs()).iterator()); assertThat(result.cardinality(), equalTo(docs.size())); for (int i = 0; i < reader.maxDoc(); i++) { assertThat(result.get(i), equalTo(docs.contains(i))); } // filter from mapper result.clear(0, size); assertThat(result.cardinality(), equalTo(0)); result.or( strMapper .termsFilter(ifdService, cTerms, null) .getDocIdSet(reader.getContext(), reader.getLiveDocs()) .iterator()); assertThat(result.cardinality(), equalTo(docs.size())); for (int i = 0; i < reader.maxDoc(); i++) { assertThat(result.get(i), equalTo(docs.contains(i))); } result.clear(0, size); assertThat(result.cardinality(), equalTo(0)); // filter on a numeric field using BytesRef terms // should not match any docs hFilter = FieldDataTermsFilter.newBytes(getFieldData(lngMapper), hTerms); result.or(hFilter.getDocIdSet(reader.getContext(), reader.getLiveDocs()).iterator()); assertThat(result.cardinality(), equalTo(0)); // filter on a numeric field using BytesRef terms // should not match any docs hFilter = FieldDataTermsFilter.newBytes(getFieldData(dblMapper), hTerms); result.or(hFilter.getDocIdSet(reader.getContext(), reader.getLiveDocs()).iterator()); assertThat(result.cardinality(), equalTo(0)); }
/** Creates a {@link DocMap} instance appropriate for this reader. */ public static DocMap build(AtomicReader reader) { final int maxDoc = reader.maxDoc(); if (!reader.hasDeletions()) { return new NoDelDocMap(maxDoc); } final Bits liveDocs = reader.getLiveDocs(); return build(maxDoc, liveDocs); }
// indexes Integer.MAX_VALUE docs with a fixed binary field public void testFixedSorted() throws Exception { BaseDirectoryWrapper dir = newFSDirectory(_TestUtil.getTempDir("2BFixedSorted")); if (dir instanceof MockDirectoryWrapper) { ((MockDirectoryWrapper) dir).setThrottling(MockDirectoryWrapper.Throttling.NEVER); } IndexWriter w = new IndexWriter( dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())) .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH) .setRAMBufferSizeMB(256.0) .setMergeScheduler(new ConcurrentMergeScheduler()) .setMergePolicy(newLogMergePolicy(false, 10)) .setOpenMode(IndexWriterConfig.OpenMode.CREATE)); Document doc = new Document(); byte bytes[] = new byte[2]; BytesRef data = new BytesRef(bytes); SortedDocValuesField dvField = new SortedDocValuesField("dv", data); doc.add(dvField); for (int i = 0; i < Integer.MAX_VALUE; i++) { bytes[0] = (byte) (i >> 8); bytes[1] = (byte) i; w.addDocument(doc); if (i % 100000 == 0) { System.out.println("indexed: " + i); System.out.flush(); } } w.forceMerge(1); w.close(); System.out.println("verifying..."); System.out.flush(); DirectoryReader r = DirectoryReader.open(dir); int expectedValue = 0; for (AtomicReaderContext context : r.leaves()) { AtomicReader reader = context.reader(); BytesRef scratch = new BytesRef(); BinaryDocValues dv = reader.getSortedDocValues("dv"); for (int i = 0; i < reader.maxDoc(); i++) { bytes[0] = (byte) (expectedValue >> 8); bytes[1] = (byte) expectedValue; dv.get(i, scratch); assertEquals(data, scratch); expectedValue++; } } r.close(); dir.close(); }
@Test public void testDoubles() throws Exception { List<Integer> docs = Arrays.asList(1, 5, 7); DoubleOpenHashSet hTerms = new DoubleOpenHashSet(); List<Double> cTerms = new ArrayList<Double>(docs.size()); for (int i = 0; i < docs.size(); i++) { double term = Double.valueOf(docs.get(i)); hTerms.add(term); cTerms.add(term); } FieldDataTermsFilter hFilter = FieldDataTermsFilter.newDoubles(getFieldData(dblMapper), hTerms); int size = reader.maxDoc(); FixedBitSet result = new FixedBitSet(size); result.clear(0, size); assertThat(result.cardinality(), equalTo(0)); result.or(hFilter.getDocIdSet(reader.getContext(), reader.getLiveDocs()).iterator()); assertThat(result.cardinality(), equalTo(docs.size())); for (int i = 0; i < reader.maxDoc(); i++) { assertThat(result.get(i), equalTo(docs.contains(i))); } // filter from mapper result.clear(0, size); assertThat(result.cardinality(), equalTo(0)); result.or( dblMapper .termsFilter(ifdService, cTerms, null) .getDocIdSet(reader.getContext(), reader.getLiveDocs()) .iterator()); assertThat(result.cardinality(), equalTo(docs.size())); for (int i = 0; i < reader.maxDoc(); i++) { assertThat(result.get(i), equalTo(docs.contains(i))); } hFilter = FieldDataTermsFilter.newDoubles(getFieldData(lngMapper), hTerms); assertNull(hFilter.getDocIdSet(reader.getContext(), reader.getLiveDocs())); }
private static void printDocs(DirectoryReader r) throws Throwable { for (AtomicReaderContext ctx : r.leaves()) { // TODO: improve this AtomicReader sub = ctx.reader(); Bits liveDocs = sub.getLiveDocs(); System.out.println(" " + ((SegmentReader) sub).getSegmentInfo()); for (int docID = 0; docID < sub.maxDoc(); docID++) { StoredDocument doc = sub.document(docID); if (liveDocs == null || liveDocs.get(docID)) { System.out.println(" docID=" + docID + " id:" + doc.get("id")); } else { System.out.println(" DEL docID=" + docID + " id:" + doc.get("id")); } } } }
public static TermsEnum filter( TermsEnum toFilter, Terms terms, AtomicReader reader, Settings settings) throws IOException { int docCount = terms.getDocCount(); if (docCount == -1) { docCount = reader.maxDoc(); } final double minFrequency = settings.getAsDouble("min", 0d); final double maxFrequency = settings.getAsDouble("max", docCount + 1d); final double minSegmentSize = settings.getAsInt("min_segment_size", 0); if (minSegmentSize < docCount) { final int minFreq = minFrequency >= 1.0 ? (int) minFrequency : (int) (docCount * minFrequency); final int maxFreq = maxFrequency >= 1.0 ? (int) maxFrequency : (int) (docCount * maxFrequency); assert minFreq < maxFreq; return new FrequencyFilter(toFilter, minFreq, maxFreq); } return toFilter; }
@Override public DoubleArrayAtomicFieldData loadDirect(AtomicReaderContext context) throws Exception { AtomicReader reader = context.reader(); Terms terms = reader.terms(getFieldNames().indexName()); if (terms == null) { return DoubleArrayAtomicFieldData.EMPTY; } // TODO: how can we guess the number of terms? numerics end up creating more terms per value... final TDoubleArrayList values = new TDoubleArrayList(); ArrayList<int[]> ordinals = new ArrayList<int[]>(); int[] idx = new int[reader.maxDoc()]; ordinals.add(new int[reader.maxDoc()]); values.add(0); // first "t" indicates null value int termOrd = 1; // current term number TermsEnum termsEnum = terms.iterator(null); try { DocsEnum docsEnum = null; for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) { values.add(FieldCache.NUMERIC_UTILS_DOUBLE_PARSER.parseDouble(term)); docsEnum = termsEnum.docs(reader.getLiveDocs(), docsEnum, 0); for (int docId = docsEnum.nextDoc(); docId != DocsEnum.NO_MORE_DOCS; docId = docsEnum.nextDoc()) { int[] ordinal; if (idx[docId] >= ordinals.size()) { ordinal = new int[reader.maxDoc()]; ordinals.add(ordinal); } else { ordinal = ordinals.get(idx[docId]); } ordinal[docId] = termOrd; idx[docId]++; } termOrd++; } } catch (RuntimeException e) { if (e.getClass().getName().endsWith("StopFillCacheException")) { // all is well, in case numeric parsers are used. } else { throw e; } } if (ordinals.size() == 1) { int[] nativeOrdinals = ordinals.get(0); FixedBitSet set = new FixedBitSet(reader.maxDoc()); double[] sValues = new double[reader.maxDoc()]; boolean allHaveValue = true; for (int i = 0; i < nativeOrdinals.length; i++) { int nativeOrdinal = nativeOrdinals[i]; if (nativeOrdinal == 0) { allHaveValue = false; } else { set.set(i); sValues[i] = values.get(nativeOrdinal); } } if (allHaveValue) { return new DoubleArrayAtomicFieldData.Single(sValues, reader.maxDoc()); } else { return new DoubleArrayAtomicFieldData.SingleFixedSet(sValues, reader.maxDoc(), set); } } else { int[][] nativeOrdinals = new int[ordinals.size()][]; for (int i = 0; i < nativeOrdinals.length; i++) { nativeOrdinals[i] = ordinals.get(i); } return new DoubleArrayAtomicFieldData.WithOrdinals( values.toArray(new double[values.size()]), reader.maxDoc(), Ordinals.Factories.createFromFlatOrdinals( nativeOrdinals, termOrd, fieldDataType.getSettings())); } }
int resolveParentDocuments( TopDocs topDocs, SearchContext context, Recycler.V<ObjectObjectOpenHashMap<Object, ParentDoc[]>> parentDocs) { int parentHitsResolved = 0; Recycler.V<ObjectObjectOpenHashMap<Object, Recycler.V<IntObjectOpenHashMap<ParentDoc>>>> parentDocsPerReader = cacheRecycler.hashMap(context.searcher().getIndexReader().leaves().size()); for (ScoreDoc scoreDoc : topDocs.scoreDocs) { int readerIndex = ReaderUtil.subIndex(scoreDoc.doc, context.searcher().getIndexReader().leaves()); AtomicReaderContext subContext = context.searcher().getIndexReader().leaves().get(readerIndex); int subDoc = scoreDoc.doc - subContext.docBase; // find the parent id HashedBytesArray parentId = context.idCache().reader(subContext.reader()).parentIdByDoc(parentType, subDoc); if (parentId == null) { // no parent found continue; } // now go over and find the parent doc Id and reader tuple for (AtomicReaderContext atomicReaderContext : context.searcher().getIndexReader().leaves()) { AtomicReader indexReader = atomicReaderContext.reader(); int parentDocId = context.idCache().reader(indexReader).docById(parentType, parentId); Bits liveDocs = indexReader.getLiveDocs(); if (parentDocId != -1 && (liveDocs == null || liveDocs.get(parentDocId))) { // we found a match, add it and break Recycler.V<IntObjectOpenHashMap<ParentDoc>> readerParentDocs = parentDocsPerReader.v().get(indexReader.getCoreCacheKey()); if (readerParentDocs == null) { readerParentDocs = cacheRecycler.intObjectMap(indexReader.maxDoc()); parentDocsPerReader.v().put(indexReader.getCoreCacheKey(), readerParentDocs); } ParentDoc parentDoc = readerParentDocs.v().get(parentDocId); if (parentDoc == null) { parentHitsResolved++; // we have a hit on a parent parentDoc = new ParentDoc(); parentDoc.docId = parentDocId; parentDoc.count = 1; parentDoc.maxScore = scoreDoc.score; parentDoc.sumScores = scoreDoc.score; readerParentDocs.v().put(parentDocId, parentDoc); } else { parentDoc.count++; parentDoc.sumScores += scoreDoc.score; if (scoreDoc.score > parentDoc.maxScore) { parentDoc.maxScore = scoreDoc.score; } } } } } boolean[] states = parentDocsPerReader.v().allocated; Object[] keys = parentDocsPerReader.v().keys; Object[] values = parentDocsPerReader.v().values; for (int i = 0; i < states.length; i++) { if (states[i]) { Recycler.V<IntObjectOpenHashMap<ParentDoc>> value = (Recycler.V<IntObjectOpenHashMap<ParentDoc>>) values[i]; ParentDoc[] _parentDocs = value.v().values().toArray(ParentDoc.class); Arrays.sort(_parentDocs, PARENT_DOC_COMP); parentDocs.v().put(keys[i], _parentDocs); Releasables.release(value); } } Releasables.release(parentDocsPerReader); return parentHitsResolved; }
private void verify(AtomicReader r, int[][] idToOrds, BytesRef[] termsArray, BytesRef prefixRef) throws Exception { final DocTermOrds dto = new DocTermOrds( r, r.getLiveDocs(), "field", prefixRef, Integer.MAX_VALUE, _TestUtil.nextInt(random(), 2, 10)); final FieldCache.Ints docIDToID = FieldCache.DEFAULT.getInts(r, "id", false); /* for(int docID=0;docID<subR.maxDoc();docID++) { System.out.println(" docID=" + docID + " id=" + docIDToID[docID]); } */ if (VERBOSE) { System.out.println( "TEST: verify prefix=" + (prefixRef == null ? "null" : prefixRef.utf8ToString())); System.out.println("TEST: all TERMS:"); TermsEnum allTE = MultiFields.getTerms(r, "field").iterator(null); int ord = 0; while (allTE.next() != null) { System.out.println(" ord=" + (ord++) + " term=" + allTE.term().utf8ToString()); } } // final TermsEnum te = subR.fields().terms("field").iterator(); final TermsEnum te = dto.getOrdTermsEnum(r); if (dto.numTerms() == 0) { if (prefixRef == null) { assertNull(MultiFields.getTerms(r, "field")); } else { Terms terms = MultiFields.getTerms(r, "field"); if (terms != null) { TermsEnum termsEnum = terms.iterator(null); TermsEnum.SeekStatus result = termsEnum.seekCeil(prefixRef, false); if (result != TermsEnum.SeekStatus.END) { assertFalse( "term=" + termsEnum.term().utf8ToString() + " matches prefix=" + prefixRef.utf8ToString(), StringHelper.startsWith(termsEnum.term(), prefixRef)); } else { // ok } } else { // ok } } return; } if (VERBOSE) { System.out.println("TEST: TERMS:"); te.seekExact(0); while (true) { System.out.println(" ord=" + te.ord() + " term=" + te.term().utf8ToString()); if (te.next() == null) { break; } } } SortedSetDocValues iter = dto.iterator(r); for (int docID = 0; docID < r.maxDoc(); docID++) { if (VERBOSE) { System.out.println( "TEST: docID=" + docID + " of " + r.maxDoc() + " (id=" + docIDToID.get(docID) + ")"); } iter.setDocument(docID); final int[] answers = idToOrds[docIDToID.get(docID)]; int upto = 0; long ord; while ((ord = iter.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) { te.seekExact(ord); final BytesRef expected = termsArray[answers[upto++]]; if (VERBOSE) { System.out.println( " exp=" + expected.utf8ToString() + " actual=" + te.term().utf8ToString()); } assertEquals( "expected=" + expected.utf8ToString() + " actual=" + te.term().utf8ToString() + " ord=" + ord, expected, te.term()); } assertEquals(answers.length, upto); } }