public void testDocsWithField() throws Exception { Directory dir = newDirectory(); IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); IndexWriter writer = new IndexWriter(dir, conf); Document doc = new Document(); doc.add(new NumericDocValuesField("dv", 0L)); writer.addDocument(doc); doc = new Document(); doc.add(new TextField("dv", "some text", Field.Store.NO)); doc.add(new NumericDocValuesField("dv", 0L)); writer.addDocument(doc); DirectoryReader r = writer.getReader(); writer.close(); AtomicReader subR = r.leaves().get(0).reader(); assertEquals(2, subR.numDocs()); Bits bits = FieldCache.DEFAULT.getDocsWithField(subR, "dv"); assertTrue(bits.get(0)); assertTrue(bits.get(1)); r.close(); dir.close(); }
private static DocSet createBigSet( List<LeafReaderContext> leaves, PostingsEnum[] postList, int maxDoc, int firstReader) throws IOException { long[] bits = new long[FixedBitSet.bits2words(maxDoc)]; int sz = 0; for (int i = firstReader; i < postList.length; i++) { PostingsEnum postings = postList[i]; if (postings == null) continue; LeafReaderContext ctx = leaves.get(i); Bits liveDocs = ctx.reader().getLiveDocs(); int base = ctx.docBase; for (; ; ) { int subId = postings.nextDoc(); if (subId == DocIdSetIterator.NO_MORE_DOCS) break; if (liveDocs != null && !liveDocs.get(subId)) continue; int globalId = subId + base; bits[globalId >> 6] |= (1L << globalId); sz++; } } BitDocSet docSet = new BitDocSet(new FixedBitSet(bits, maxDoc), sz); int smallSetSize = smallSetSize(maxDoc); if (sz < smallSetSize) { // make this optional? DocSet smallSet = toSmallSet(docSet); // assert equals(docSet, smallSet); return smallSet; } return docSet; }
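// A minimal standalone sketch of the bit trick used in createBigSet above (helper name is an
// assumption, not from the original source): Java masks a long shift count to its low six bits,
// so (1L << globalId) equals (1L << (globalId & 63)); together with the word index (globalId >> 6)
// this addresses bit (globalId % 64) of word (globalId / 64), which is the layout FixedBitSet expects.
static void setGlobalBit(long[] words, int globalId) {
  words[globalId >> 6] |= 1L << globalId; // same idiom as in createBigSet
}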
public DocumentFilteredLeafIndexReader( LeafReaderContext context, Filter preserveFilter, boolean negateFilter) throws IOException { super(context.reader()); final int maxDoc = in.maxDoc(); final FixedBitSet bits = new FixedBitSet(maxDoc); // ignore livedocs here, as we filter them later: final DocIdSet docs = preserveFilter.getDocIdSet(context, null); if (docs != null) { final DocIdSetIterator it = docs.iterator(); if (it != null) { bits.or(it); } } if (negateFilter) { bits.flip(0, maxDoc); } if (in.hasDeletions()) { final Bits oldLiveDocs = in.getLiveDocs(); assert oldLiveDocs != null; final DocIdSetIterator it = new BitSetIterator(bits, 0L); // the cost is not useful here for (int i = it.nextDoc(); i < maxDoc; i = it.nextDoc()) { if (!oldLiveDocs.get(i)) { // we can safely modify the current bit, as the iterator already stepped over it: bits.clear(i); } } } this.liveDocs = bits; this.numDocs = bits.cardinality(); }
@Override public boolean get(int index) { for (Bits bit : bits) { if (!bit.get(index)) { return false; } } return true; }
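// The get(int) override above ANDs several Bits instances together; a minimal self-contained
// sketch of such a wrapper (the class name ChainedBits, its constructor and length() are
// assumptions; only the get(int) body comes from the snippet above; assumes
// org.apache.lucene.util.Bits is imported):
final class ChainedBits implements Bits {
  private final Bits[] bits;

  ChainedBits(Bits... bits) {
    this.bits = bits; // e.g. a segment's live docs plus a filter's acceptDocs
  }

  @Override
  public boolean get(int index) {
    for (Bits bit : bits) {
      if (!bit.get(index)) {
        return false; // any delegate can veto the document
      }
    }
    return true;
  }

  @Override
  public int length() {
    return bits.length == 0 ? 0 : bits[0].length(); // delegates are assumed to share maxDoc
  }
}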
@Override public int nextDoc() throws IOException { if (docID == NO_MORE_DOCS) { return docID; } boolean first = true; int termFreq = 0; while (true) { final long lineStart = in.getFilePointer(); SimpleTextUtil.readLine(in, scratch); if (StringHelper.startsWith(scratch, DOC)) { if (!first && (liveDocs == null || liveDocs.get(docID))) { in.seek(lineStart); if (!omitTF) { tf = termFreq; } return docID; } UnicodeUtil.UTF8toUTF16( scratch.bytes, scratch.offset + DOC.length, scratch.length - DOC.length, scratchUTF16); docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length); termFreq = 0; first = false; } else if (StringHelper.startsWith(scratch, FREQ)) { UnicodeUtil.UTF8toUTF16( scratch.bytes, scratch.offset + FREQ.length, scratch.length - FREQ.length, scratchUTF16); termFreq = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length); } else if (StringHelper.startsWith(scratch, POS)) { // skip termFreq++; } else if (StringHelper.startsWith(scratch, START_OFFSET)) { // skip } else if (StringHelper.startsWith(scratch, END_OFFSET)) { // skip } else if (StringHelper.startsWith(scratch, PAYLOAD)) { // skip } else { assert StringHelper.startsWith(scratch, TERM) || StringHelper.startsWith(scratch, FIELD) || StringHelper.startsWith(scratch, END) : "scratch=" + scratch.utf8ToString(); if (!first && (liveDocs == null || liveDocs.get(docID))) { in.seek(lineStart); if (!omitTF) { tf = termFreq; } return docID; } return docID = NO_MORE_DOCS; } } }
/** * @param reader the index reader used to access the documents * @param lireFeature the query features to compare against * @return the maximum distance found for normalizing. * @throws java.io.IOException */ @SuppressWarnings("unchecked") private float[] findSimilar(IndexReader reader, LireFeature[] lireFeature) throws IOException { float[] maxDistance = new float[lireFeature.length]; float[] overallMaxDistance = new float[lireFeature.length]; for (int i = 0; i < overallMaxDistance.length; i++) { overallMaxDistance[i] = -1f; maxDistance[i] = -1f; } parDocs = new TreeSet[lireFeature.length]; for (int i = 0; i < parDocs.length; i++) { parDocs[i] = new TreeSet<SimpleResult>(); } // Needed to check whether the document is deleted. Bits liveDocs = MultiFields.getLiveDocs(reader); // clear result set ... int docs = reader.numDocs(); for (int i = 0; i < docs; i++) { if (reader.hasDeletions() && !liveDocs.get(i)) continue; // if it is deleted, just ignore it. Document d = reader.document(i); float[] distance = getDistance(d, lireFeature); // calculate the overall max distance to normalize score afterwards for (int j = 0; j < distance.length; j++) { float f = distance[j]; if (overallMaxDistance[j] < f) { overallMaxDistance[j] = f; } // if it is the first document: if (maxDistance[j] < 0) { maxDistance[j] = f; } // if the array is not full yet: if (this.parDocs[j].size() < maxHits) { this.parDocs[j].add(new SimpleResult(f, d)); if (f > maxDistance[j]) { maxDistance[j] = f; } } else if (f < maxDistance[j]) { // if it is nearer to the sample than at least one of the current set: // remove the last one ... this.parDocs[j].remove(this.parDocs[j].last()); // add the new one ... this.parDocs[j].add(new SimpleResult(f, d)); // and set our new distance border ... maxDistance[j] = this.parDocs[j].last().getDistance(); } } } return maxDistance; }
@Override public Explanation explain(LeafReaderContext context, int doc) throws IOException { Explanation expl = subQueryWeight.explain(context, doc); if (!expl.isMatch()) { return expl; } // First: Gather explanations for all filters List<Explanation> filterExplanations = new ArrayList<>(); for (int i = 0; i < filterFunctions.length; ++i) { Bits docSet = Lucene.asSequentialAccessBits( context.reader().maxDoc(), filterWeights[i].scorer(context)); if (docSet.get(doc)) { FilterFunction filterFunction = filterFunctions[i]; Explanation functionExplanation = filterFunction.function.getLeafScoreFunction(context).explainScore(doc, expl); double factor = functionExplanation.getValue(); float sc = CombineFunction.toFloat(factor); Explanation filterExplanation = Explanation.match( sc, "function score, product of:", Explanation.match(1.0f, "match filter: " + filterFunction.filter.toString()), functionExplanation); filterExplanations.add(filterExplanation); } } if (filterExplanations.size() > 0) { FiltersFunctionFactorScorer scorer = functionScorer(context); int actualDoc = scorer.iterator().advance(doc); assert (actualDoc == doc); double score = scorer.computeScore(doc, expl.getValue()); Explanation factorExplanation = Explanation.match( CombineFunction.toFloat(score), "function score, score mode [" + scoreMode.toString().toLowerCase(Locale.ROOT) + "]", filterExplanations); expl = combineFunction.explain(expl, factorExplanation, maxBoost); } if (minScore != null && minScore > expl.getValue()) { expl = Explanation.noMatch( "Score value is too low, expected at least " + minScore + " but got " + expl.getValue(), expl); } return expl; }
@Override public Bits getLiveDocs() { Bits liveDocs = super.getLiveDocs(); if (liveDocs != null) { assert maxDoc() == liveDocs.length(); liveDocs = new AssertingBits(liveDocs); } else { assert maxDoc() == numDocs(); assert !hasDeletions(); } return liveDocs; }
/** * We assume that the initial indexing has been done and a set of reference objects has been found * and indexed in the separate directory. However, further documents were added and they now need * to get a ranked list of reference objects. So we (i) get all these new documents missing the * field "ro-order" and (ii) add this field. * * @param indexPath the index to update * @throws IOException */ public void updateIndex(String indexPath) throws IOException { IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath))); int numDocs = reader.numDocs(); boolean hasDeletions = reader.hasDeletions(); int countUpdated = 0; IndexReader readerRo = DirectoryReader.open(FSDirectory.open(new File(indexPath + "-ro"))); ImageSearcher searcher = new GenericImageSearcher(numReferenceObjectsUsed, featureClass, featureFieldName); Map<String, Analyzer> perField = new HashMap<String, Analyzer>(1); perField.put("ro-order", new WhitespaceAnalyzer(LuceneUtils.LUCENE_VERSION)); PerFieldAnalyzerWrapper aWrapper = new PerFieldAnalyzerWrapper(new SimpleAnalyzer(LuceneUtils.LUCENE_VERSION), perField); IndexWriter iw = new IndexWriter( FSDirectory.open(new File(indexPath)), new IndexWriterConfig(LuceneUtils.LUCENE_VERSION, aWrapper) .setOpenMode(IndexWriterConfig.OpenMode.CREATE)); StringBuilder sb = new StringBuilder(256); // Needed to check whether the document is deleted. Bits liveDocs = MultiFields.getLiveDocs(reader); for (int i = 0; i < numDocs; i++) { if (reader.hasDeletions() && !liveDocs.get(i)) continue; // if it is deleted, just ignore it. Document document = reader.document(i); if (document.getField("ro-order") == null) { // if the field is not here we create it. ImageSearchHits hits = searcher.search(document, readerRo); sb.delete(0, sb.length()); for (int j = 0; j < numReferenceObjectsUsed; j++) { sb.append(hits.doc(j).getValues("ro-id")[0]); sb.append(' '); } // System.out.println(sb.toString()); document.add(new TextField("ro-order", sb.toString(), Field.Store.YES)); iw.updateDocument( new Term( DocumentBuilder.FIELD_NAME_IDENTIFIER, document.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), document); countUpdated++; } // progress report progress.setNumDocsProcessed(progress.getNumDocsProcessed() + 1); // debug: System.out.println("countUpdated = " + countUpdated); } iw.commit(); iw.close(); }
@Override public int nextDoc() throws IOException { boolean first = true; in.seek(nextDocStart); long posStart = 0; while (true) { final long lineStart = in.getFilePointer(); SimpleTextUtil.readLine(in, scratch); // System.out.println("NEXT DOC: " + scratch.utf8ToString()); if (StringHelper.startsWith(scratch, DOC)) { if (!first && (liveDocs == null || liveDocs.get(docID))) { nextDocStart = lineStart; in.seek(posStart); return docID; } UnicodeUtil.UTF8toUTF16( scratch.bytes, scratch.offset + DOC.length, scratch.length - DOC.length, scratchUTF16); docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length); tf = 0; first = false; } else if (StringHelper.startsWith(scratch, FREQ)) { UnicodeUtil.UTF8toUTF16( scratch.bytes, scratch.offset + FREQ.length, scratch.length - FREQ.length, scratchUTF16); tf = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length); posStart = in.getFilePointer(); } else if (StringHelper.startsWith(scratch, POS)) { // skip } else if (StringHelper.startsWith(scratch, START_OFFSET)) { // skip } else if (StringHelper.startsWith(scratch, END_OFFSET)) { // skip } else if (StringHelper.startsWith(scratch, PAYLOAD)) { // skip } else { assert StringHelper.startsWith(scratch, TERM) || StringHelper.startsWith(scratch, FIELD) || StringHelper.startsWith(scratch, END); if (!first && (liveDocs == null || liveDocs.get(docID))) { nextDocStart = lineStart; in.seek(posStart); return docID; } return docID = NO_MORE_DOCS; } } }
public void tttestGetDistribution() throws IOException { BufferedWriter bw = new BufferedWriter(new FileWriter("data.csv")); IndexReader reader = IndexReader.open(FSDirectory.open(new File(indexPath))); // get the first document: // if (!IndexReader.indexExists(reader.directory())) // throw new FileNotFoundException("No index found at this specific location."); CEDD cedd1 = new CEDD(); FCTH fcth1 = new FCTH(); CEDD cedd2 = new CEDD(); FCTH fcth2 = new FCTH(); JCD jcd1 = new JCD(); JCD jcd2 = new JCD(); String[] cls; // Needed to check whether the document is deleted. Bits liveDocs = MultiFields.getLiveDocs(reader); int docs = reader.numDocs(); for (int i = 0; i < docs; i++) { if (reader.hasDeletions() && !liveDocs.get(i)) continue; // if it is deleted, just ignore it. Document doc = reader.document(i); cls = doc.getValues(DocumentBuilder.FIELD_NAME_CEDD); if (cls != null && cls.length > 0) cedd1.setStringRepresentation(cls[0]); cls = doc.getValues(DocumentBuilder.FIELD_NAME_FCTH); if (cls != null && cls.length > 0) fcth1.setStringRepresentation(cls[0]); for (int j = i + 1; j < docs; j++) { if (reader.hasDeletions() && !liveDocs.get(j)) continue; // if it is deleted, just ignore it. Document doc2 = reader.document(j); cls = doc2.getValues(DocumentBuilder.FIELD_NAME_CEDD); if (cls != null && cls.length > 0) cedd2.setStringRepresentation(cls[0]); cls = doc2.getValues(DocumentBuilder.FIELD_NAME_FCTH); if (cls != null && cls.length > 0) fcth2.setStringRepresentation(cls[0]); jcd1.init(cedd1, fcth1); jcd2.init(cedd2, fcth2); bw.write( cedd1.getDistance(cedd2) + ";" + fcth1.getDistance(fcth2) + ";" + jcd1.getDistance(jcd2) + "\n"); } if (i % 100 == 0) System.out.println(i + " entries processed ... "); } bw.close(); }
public void writeResponse() throws IOException { Boolean omitHeader = req.getParams().getBool(CommonParams.OMIT_HEADER); if (omitHeader != null && omitHeader) rsp.getValues().remove("responseHeader"); SolrIndexSearcher searcher = req.getSearcher(); if (liveDocs == null) { liveDocs = searcher.getLeafReader().getLiveDocs(); } int maxDoc = searcher.maxDoc(); try { // responseWriter.write(sw,req,rsp); ReturnFields fields = rsp.getReturnFields(); // return everything Set<String> fnames = fields.getLuceneFieldNames(); int docCounter = 0; for (int i = 0; i < maxDoc; i++) { if (liveDocs != null && !liveDocs.get(i)) { continue; } Document doc = searcher.doc(i); SolrDocument sdoc = toSolrDocument(doc, schema); writeSolrDocument(null, sdoc, fields, docCounter++); getWriter().write("\n"); } } finally { close(); writer.write('\n'); // ending with a newline looks much better from the command line writer.close(); } }
@Override public DocIdSet getDocIdSet(LeafReaderContext context, Bits acceptDocs) throws IOException { FixedBitSet bits = new FixedBitSet(context.reader().maxDoc()); bits.set(doc); if (acceptDocs != null && !acceptDocs.get(doc)) bits.clear(doc); return new BitDocIdSet(bits); }
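// Hedged equivalent of the getDocIdSet override above (the standalone method form and its name
// are assumptions): only set the bit when acceptDocs allows the target doc, instead of setting it
// and then clearing it; assumes the usual org.apache.lucene.util / org.apache.lucene.search imports
// for Bits, FixedBitSet, BitDocIdSet and DocIdSet.
static DocIdSet singleDocSet(int doc, int maxDoc, Bits acceptDocs) {
  FixedBitSet bits = new FixedBitSet(maxDoc);
  if (acceptDocs == null || acceptDocs.get(doc)) {
    bits.set(doc); // at most one bit ends up set
  }
  return new BitDocIdSet(bits);
}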
/** * Used when base query is highly constraining vs the drilldowns, or when the docs must be scored * at once (i.e., like BooleanScorer2, not BooleanScorer). In this case we just .next() on base * and .advance() on the dim filters. */ private void doQueryFirstScoring(Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims) throws IOException { // if (DEBUG) { // System.out.println(" doQueryFirstScoring"); // } int docID = baseScorer.docID(); nextDoc: while (docID != PostingsEnum.NO_MORE_DOCS) { if (acceptDocs != null && acceptDocs.get(docID) == false) { docID = baseScorer.nextDoc(); continue; } LeafCollector failedCollector = null; for (DocsAndCost dim : dims) { // TODO: should we sort this 2nd dimension of // docsEnums from most frequent to least? if (dim.approximation.docID() < docID) { dim.approximation.advance(docID); } boolean matches = false; if (dim.approximation.docID() == docID) { if (dim.twoPhase == null) { matches = true; } else { matches = dim.twoPhase.matches(); } } if (matches == false) { if (failedCollector != null) { // More than one dim fails on this document, so // it's neither a hit nor a near-miss; move to // next doc: docID = baseScorer.nextDoc(); continue nextDoc; } else { failedCollector = dim.sidewaysLeafCollector; } } } collectDocID = docID; // TODO: we could score on demand instead since we are // daat here: collectScore = baseScorer.score(); if (failedCollector == null) { // Hit passed all filters, so it's "real": collectHit(collector, dims); } else { // Hit missed exactly one filter: collectNearMiss(failedCollector); } docID = baseScorer.nextDoc(); } }
/** * Returns doc ids by searching the index for documents having the correct spatial hash cell id at * the given grid level. * * @param context context holding the reader to the index */ @Override public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException { if (spatialHashCellsIds.size() == 0) { return null; } final AtomicReader atomicReader = context.reader(); OpenBitSet matchedDocumentsIds = new OpenBitSet(atomicReader.maxDoc()); boolean found = false; for (int i = 0; i < spatialHashCellsIds.size(); i++) { Term spatialHashCellTerm = new Term(fieldName, spatialHashCellsIds.get(i)); DocsEnum spatialHashCellsDocs = atomicReader.termDocsEnum(spatialHashCellTerm); if (spatialHashCellsDocs != null) { while (true) { final int docId = spatialHashCellsDocs.nextDoc(); if (docId == DocIdSetIterator.NO_MORE_DOCS) { break; } else { if (acceptDocs == null || acceptDocs.get(docId)) { matchedDocumentsIds.fastSet(docId); found = true; } } } } } if (found) { return matchedDocumentsIds; } else { return null; } }
@Override public int nextDoc() throws IOException { while (true) { if (count == docFreq) { return doc = NO_MORE_DOCS; } count++; // TODO: maybe we should do the 1-bit trick for encoding // freq=1 case? // Decode next doc // System.out.println(" sep d&p read doc"); accum += docReader.next(); // System.out.println(" sep d&p read freq"); freq = freqReader.next(); pendingPosCount += freq; if (liveDocs == null || liveDocs.get(accum)) { break; } } position = 0; return (doc = accum); }
static DocMap build(final int maxDoc, final Bits liveDocs) { assert liveDocs != null; final MonotonicAppendingLongBuffer docMap = new MonotonicAppendingLongBuffer(); int del = 0; for (int i = 0; i < maxDoc; ++i) { docMap.add(i - del); if (!liveDocs.get(i)) { ++del; } } final int numDeletedDocs = del; assert docMap.size() == maxDoc; return new DocMap() { @Override public int get(int docID) { if (!liveDocs.get(docID)) { return -1; } return (int) docMap.get(docID); } @Override public int maxDoc() { return maxDoc; } @Override public int numDeletedDocs() { return numDeletedDocs; } }; }
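// Standalone sketch of the mapping built above (the helper name and the boolean[] form are
// assumptions; the original appends i - del for every doc and relies on the liveDocs check at
// lookup time, while this sketch stores -1 for deleted docs directly): newID is oldID minus the
// number of deletions before it, or -1 if the doc itself is deleted.
static int[] buildDocMap(boolean[] liveDocs) {
  int[] map = new int[liveDocs.length];
  int del = 0;
  for (int i = 0; i < liveDocs.length; i++) {
    map[i] = liveDocs[i] ? i - del : -1;
    if (!liveDocs[i]) {
      del++;
    }
  }
  return map; // e.g. {true, false, true, true} -> {0, -1, 1, 2}
}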
@Override public int nextDoc() throws IOException { while (true) { if (count == docFreq) { return doc = NO_MORE_DOCS; } count++; // Decode next doc // System.out.println("decode docDelta:"); accum += docReader.next(); if (!omitTF) { // System.out.println("decode freq:"); freq = freqReader.next(); } if (liveDocs == null || liveDocs.get(accum)) { break; } } return (doc = accum); }
private static void printDocs(DirectoryReader r) throws Throwable { for (AtomicReaderContext ctx : r.leaves()) { // TODO: improve this AtomicReader sub = ctx.reader(); Bits liveDocs = sub.getLiveDocs(); System.out.println(" " + ((SegmentReader) sub).getSegmentInfo()); for (int docID = 0; docID < sub.maxDoc(); docID++) { StoredDocument doc = sub.document(docID); if (liveDocs == null || liveDocs.get(docID)) { System.out.println(" docID=" + docID + " id:" + doc.get("id")); } else { System.out.println(" DEL docID=" + docID + " id:" + doc.get("id")); } } } }
private Set<Uid> getShardDocUIDs(final IndexShard shard) throws IOException { shard.refresh("get_uids"); try (Engine.Searcher searcher = shard.acquireSearcher("test")) { Set<Uid> ids = new HashSet<>(); for (LeafReaderContext leafContext : searcher.reader().leaves()) { LeafReader reader = leafContext.reader(); Bits liveDocs = reader.getLiveDocs(); for (int i = 0; i < reader.maxDoc(); i++) { if (liveDocs == null || liveDocs.get(i)) { Document uuid = reader.document(i, Collections.singleton(UidFieldMapper.NAME)); ids.add(Uid.createUid(uuid.get(UidFieldMapper.NAME))); } } } return ids; } }
public void testDocsWithField() throws Exception { Directory dir = newDirectory(); IndexWriterConfig iwc = newIndexWriterConfig(random(), null); iwc.setMergePolicy(newLogMergePolicy()); RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); int numDocs = TEST_NIGHTLY ? atLeast(500) : atLeast(50); for (int i = 0; i < numDocs; i++) { Document doc = new Document(); if (random().nextInt(4) >= 0) { doc.add(new NumericDocValuesField("numbers", random().nextLong())); } doc.add(new NumericDocValuesField("numbersAlways", random().nextLong())); iw.addDocument(doc); if (random().nextInt(17) == 0) { iw.commit(); } } DirectoryReader ir = iw.getReader(); iw.forceMerge(1); DirectoryReader ir2 = iw.getReader(); LeafReader merged = getOnlyLeafReader(ir2); iw.close(); Bits multi = MultiDocValues.getDocsWithField(ir, "numbers"); Bits single = merged.getDocsWithField("numbers"); if (multi == null) { assertNull(single); } else { assertEquals(single.length(), multi.length()); for (int i = 0; i < numDocs; i++) { assertEquals(single.get(i), multi.get(i)); } } multi = MultiDocValues.getDocsWithField(ir, "numbersAlways"); single = merged.getDocsWithField("numbersAlways"); assertEquals(single.length(), multi.length()); for (int i = 0; i < numDocs; i++) { assertEquals(single.get(i), multi.get(i)); } ir.close(); ir2.close(); dir.close(); }
@Override protected boolean useRandomAccess(Bits bits, long filterCost) { int multiplier = threshold; if (threshold == -1) { // default multiplier = 100; } return filterCost * multiplier > bits.length(); }
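// Reading of the heuristic above as a standalone sketch (method and parameter names are
// assumptions): with the default multiplier of 100, random access is preferred once the filter's
// estimated cost exceeds roughly 1% of bits.length(), i.e. once iterating the filter would visit
// a large share of the segment.
static boolean preferRandomAccess(long filterCost, long bitsLength, int threshold) {
  int multiplier = threshold == -1 ? 100 : threshold; // -1 selects the default
  // e.g. bitsLength = 1_000_000 and multiplier = 100: random access once filterCost > 10_000
  return filterCost * multiplier > bitsLength;
}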
@Override public int nextDoc() { pos = -1; if (hasNext && (liveDocs == null || liveDocs.get(0))) { hasNext = false; return doc = 0; } else { return doc = NO_MORE_DOCS; } }
/** * Merges in the term vectors from the readers in <code>mergeState</code>. The default * implementation skips over deleted documents, and uses {@link #startDocument(int)}, {@link * #startField(FieldInfo, int, boolean, boolean)}, {@link #startTerm(BytesRef, int)}, {@link * #addPosition(int, int, int)}, and {@link #finish(FieldInfos, int)}, returning the number of * documents that were written. Implementations can override this method for more sophisticated * merging (bulk-byte copying, etc). */ public int merge(MergeState mergeState) throws IOException { int docCount = 0; for (MergeState.IndexReaderAndLiveDocs reader : mergeState.readers) { final int maxDoc = reader.reader.maxDoc(); final Bits liveDocs = reader.liveDocs; for (int docID = 0; docID < maxDoc; docID++) { if (liveDocs != null && !liveDocs.get(docID)) { // skip deleted docs continue; } // NOTE: it's very important to first assign to vectors then pass it to // termVectorsWriter.addAllDocVectors; see LUCENE-1282 Fields vectors = reader.reader.getTermVectors(docID); addAllDocVectors(vectors, mergeState.fieldInfos); docCount++; mergeState.checkAbort.work(300); } } finish(mergeState.fieldInfos, docCount); return docCount; }
@Override protected void search(List<LeafReaderContext> leaves, Weight weight, Collector collector) throws IOException { for (LeafReaderContext ctx : leaves) { // search each subreader // we force the use of Scorer (not BulkScorer) to make sure // that the scorer passed to LeafCollector.setScorer supports // Scorer.getChildren Scorer scorer = weight.scorer(ctx); if (scorer != null) { final LeafCollector leafCollector = collector.getLeafCollector(ctx); leafCollector.setScorer(scorer); final Bits liveDocs = ctx.reader().getLiveDocs(); for (int doc = scorer.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = scorer.nextDoc()) { if (liveDocs == null || liveDocs.get(doc)) { leafCollector.collect(doc); } } } } }
private static DocSet createSmallSet( List<LeafReaderContext> leaves, PostingsEnum[] postList, int maxPossible, int firstReader) throws IOException { int[] docs = new int[maxPossible]; int sz = 0; for (int i = firstReader; i < postList.length; i++) { PostingsEnum postings = postList[i]; if (postings == null) continue; LeafReaderContext ctx = leaves.get(i); Bits liveDocs = ctx.reader().getLiveDocs(); int base = ctx.docBase; for (; ; ) { int subId = postings.nextDoc(); if (subId == DocIdSetIterator.NO_MORE_DOCS) break; if (liveDocs != null && !liveDocs.get(subId)) continue; int globalId = subId + base; docs[sz++] = globalId; } } return new SortedIntDocSet(docs, sz); }
@Override public int nextDoc() throws IOException { int d; do { d = bitSet.nextSetBit(docBase + doc + 1); if (d == -1 || d >= maxDoc + docBase) { doc = NO_MORE_DOCS; } else { doc = d - docBase; } } while (doc != NO_MORE_DOCS && d < maxDoc + docBase && liveDocs != null && !liveDocs.get(doc)); return doc; }
private ConciseDocument readNextDocument() throws Exception { if (reader != null) { ConciseDocument cd = null; while (docID < reader.maxDoc()) { // check if document is deleted // see LUCENE-2600 at https://lucene.apache.org/core/4_0_0/MIGRATE.html if (liveDocs != null && !liveDocs.get(docID)) { docID++; continue; } Document doc = reader.document(docID); cd = new ConciseDocument(doc); cd.docID = docID; cd.documentFile = new File(originalFolder, cd.filename); docID++; return cd; } } return null; }
/** * Merges the sortedset docvalues from <code>toMerge</code>. * * <p>The default implementation calls {@link #addSortedSetField}, passing an Iterable that merges * ordinals and values and filters deleted documents. */ public void mergeSortedSetField( FieldInfo fieldInfo, final MergeState mergeState, List<SortedSetDocValues> toMerge) throws IOException { mergeState.checkAbort.work(mergeState.segmentInfo.getDocCount()); final AtomicReader readers[] = mergeState.readers.toArray(new AtomicReader[toMerge.size()]); final SortedSetDocValues dvs[] = toMerge.toArray(new SortedSetDocValues[toMerge.size()]); // step 1: iterate thru each sub and mark terms still in use TermsEnum liveTerms[] = new TermsEnum[dvs.length]; long[] weights = new long[liveTerms.length]; for (int sub = 0; sub < liveTerms.length; sub++) { AtomicReader reader = readers[sub]; SortedSetDocValues dv = dvs[sub]; Bits liveDocs = reader.getLiveDocs(); if (liveDocs == null) { liveTerms[sub] = dv.termsEnum(); weights[sub] = dv.getValueCount(); } else { LongBitSet bitset = new LongBitSet(dv.getValueCount()); for (int i = 0; i < reader.maxDoc(); i++) { if (liveDocs.get(i)) { dv.setDocument(i); long ord; while ((ord = dv.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) { bitset.set(ord); } } } liveTerms[sub] = new BitsFilteredTermsEnum(dv.termsEnum(), bitset); weights[sub] = bitset.cardinality(); } } // step 2: create ordinal map (this conceptually does the "merging") final OrdinalMap map = OrdinalMap.build(this, liveTerms, weights, PackedInts.COMPACT); // step 3: add field addSortedSetField( fieldInfo, // ord -> value new Iterable<BytesRef>() { @Override public Iterator<BytesRef> iterator() { return new Iterator<BytesRef>() { long currentOrd; @Override public boolean hasNext() { return currentOrd < map.getValueCount(); } @Override public BytesRef next() { if (!hasNext()) { throw new NoSuchElementException(); } int segmentNumber = map.getFirstSegmentNumber(currentOrd); long segmentOrd = map.getFirstSegmentOrd(currentOrd); final BytesRef term = dvs[segmentNumber].lookupOrd(segmentOrd); currentOrd++; return term; } @Override public void remove() { throw new UnsupportedOperationException(); } }; } }, // doc -> ord count new Iterable<Number>() { @Override public Iterator<Number> iterator() { return new Iterator<Number>() { int readerUpto = -1; int docIDUpto; int nextValue; AtomicReader currentReader; Bits currentLiveDocs; boolean nextIsSet; @Override public boolean hasNext() { return nextIsSet || setNext(); } @Override public void remove() { throw new UnsupportedOperationException(); } @Override public Number next() { if (!hasNext()) { throw new NoSuchElementException(); } assert nextIsSet; nextIsSet = false; // TODO make a mutable number return nextValue; } private boolean setNext() { while (true) { if (readerUpto == readers.length) { return false; } if (currentReader == null || docIDUpto == currentReader.maxDoc()) { readerUpto++; if (readerUpto < readers.length) { currentReader = readers[readerUpto]; currentLiveDocs = currentReader.getLiveDocs(); } docIDUpto = 0; continue; } if (currentLiveDocs == null || currentLiveDocs.get(docIDUpto)) { nextIsSet = true; SortedSetDocValues dv = dvs[readerUpto]; dv.setDocument(docIDUpto); nextValue = 0; while (dv.nextOrd() != SortedSetDocValues.NO_MORE_ORDS) { nextValue++; } docIDUpto++; return true; } docIDUpto++; } } }; } }, // ords new Iterable<Number>() { @Override public Iterator<Number> iterator() { return new Iterator<Number>() { int readerUpto = -1; int docIDUpto; long nextValue; AtomicReader currentReader; Bits currentLiveDocs; LongValues currentMap; boolean nextIsSet; long ords[] = new long[8]; int ordUpto; int ordLength; @Override public boolean hasNext() { return nextIsSet || setNext(); } @Override public void remove() { throw new UnsupportedOperationException(); } @Override public Number next() { if (!hasNext()) { throw new NoSuchElementException(); } assert nextIsSet; nextIsSet = false; // TODO make a mutable number return nextValue; } private boolean setNext() { while (true) { if (readerUpto == readers.length) { return false; } if (ordUpto < ordLength) { nextValue = ords[ordUpto]; ordUpto++; nextIsSet = true; return true; } if (currentReader == null || docIDUpto == currentReader.maxDoc()) { readerUpto++; if (readerUpto < readers.length) { currentReader = readers[readerUpto]; currentLiveDocs = currentReader.getLiveDocs(); currentMap = map.getGlobalOrds(readerUpto); } docIDUpto = 0; continue; } if (currentLiveDocs == null || currentLiveDocs.get(docIDUpto)) { assert docIDUpto < currentReader.maxDoc(); SortedSetDocValues dv = dvs[readerUpto]; dv.setDocument(docIDUpto); ordUpto = ordLength = 0; long ord; while ((ord = dv.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) { if (ordLength == ords.length) { ords = ArrayUtil.grow(ords, ordLength + 1); } ords[ordLength] = currentMap.get(ord); ordLength++; } docIDUpto++; continue; } docIDUpto++; } } }; } }); }
/** * Reconstruct document fields. * * @param docNum document number. If this document is deleted, but the index is not optimized yet, * the reconstruction process may still yield the reconstructed field content even from * deleted documents. * @return reconstructed document * @throws Exception */ public Reconstructed reconstruct(int docNum) throws Exception { if (docNum < 0 || docNum >= reader.maxDoc()) { throw new Exception("Document number outside of valid range."); } Reconstructed res = new Reconstructed(); if (deleted != null && deleted.get(docNum)) { throw new Exception("Document is deleted."); } else { Document doc = reader.document(docNum); for (int i = 0; i < fieldNames.length; i++) { Field[] fs = doc.getFields(fieldNames[i]); if (fs != null && fs.length > 0) { res.getStoredFields().put(fieldNames[i], fs); } } } // collect values from unstored fields HashSet<String> fields = new HashSet<String>(Arrays.asList(fieldNames)); // try to use term vectors if available progress.maxValue = fieldNames.length; progress.curValue = 0; progress.minValue = 0; for (int i = 0; i < fieldNames.length; i++) { TermFreqVector tvf = reader.getTermFreqVector(docNum, fieldNames[i]); if (tvf != null && tvf.size() > 0 && (tvf instanceof TermPositionVector)) { TermPositionVector tpv = (TermPositionVector) tvf; progress.message = "Reading term vectors ..."; progress.curValue = i; setChanged(); notifyObservers(progress); BytesRef[] tv = tpv.getTerms(); for (int k = 0; k < tv.length; k++) { // do we have positions? int[] posArr = tpv.getTermPositions(k); if (posArr == null) { // only offsets TermVectorOffsetInfo[] offsets = tpv.getOffsets(k); if (offsets.length == 0) { continue; } // convert offsets into positions posArr = convertOffsets(offsets); } GrowableStringArray gsa = res.getReconstructedFields().get(fieldNames[i]); if (gsa == null) { gsa = new GrowableStringArray(); res.getReconstructedFields().put(fieldNames[i], gsa); } for (int m = 0; m < posArr.length; m++) { gsa.append(posArr[m], "|", tv[k].utf8ToString()); } } fields.remove(fieldNames[i]); // got what we wanted } } // this loop collects data only from left-over fields // not yet collected through term vectors progress.maxValue = fields.size(); progress.curValue = 0; progress.minValue = 0; for (String fld : fields) { progress.message = "Collecting terms in " + fld + " ..."; progress.curValue++; setChanged(); notifyObservers(progress); Terms terms = MultiFields.getTerms(reader, fld); if (terms == null) { // no terms in this field continue; } TermsEnum te = terms.iterator(); while (te.next() != null) { DocsAndPositionsEnum dpe = te.docsAndPositions(deleted, null); if (dpe == null) { // no position info for this field break; } int num = dpe.advance(docNum); if (num != docNum) { // either greater than or NO_MORE_DOCS continue; // no data for this term in this doc } String term = te.term().utf8ToString(); GrowableStringArray gsa = (GrowableStringArray) res.getReconstructedFields().get(fld); if (gsa == null) { gsa = new GrowableStringArray(); res.getReconstructedFields().put(fld, gsa); } for (int k = 0; k < dpe.freq(); k++) { int pos = dpe.nextPosition(); gsa.append(pos, "|", term); } } } progress.message = "Done."; progress.curValue = 100; setChanged(); notifyObservers(progress); return res; }