/** * List all of the files in this index database * * @throws IOException If an IO error occurs while reading from the database */ public void listFiles() throws IOException { IndexReader ireader = null; TermsEnum iter; Terms terms = null; try { ireader = DirectoryReader.open(indexDirectory); // open existing index int numDocs = ireader.numDocs(); if (numDocs > 0) { Fields uFields = MultiFields.getFields(ireader); // reader.getTermVectors(0); terms = uFields.terms(QueryBuilder.U); } iter = terms.iterator(null); // init uid iterator while (iter.term() != null) { log.fine(Util.uid2url(iter.term().utf8ToString())); iter.next(); } } finally { if (ireader != null) { try { ireader.close(); } catch (IOException e) { log.log(Level.WARNING, "An error occured while closing index reader", e); } } } }
/** * Adds terms and frequencies found in vector into the Map termFreqMap * * @param termFreqMap a Map of terms and their frequencies * @param vector List of terms and their frequencies for a doc/field * @param fieldName Optional field name of the terms for skip terms */ private void addTermFrequencies( Map<String, Int> termFreqMap, Terms vector, @Nullable String fieldName) throws IOException { final TermsEnum termsEnum = vector.iterator(); final CharsRefBuilder spare = new CharsRefBuilder(); BytesRef text; while ((text = termsEnum.next()) != null) { spare.copyUTF8Bytes(text); final String term = spare.toString(); if (isNoiseWord(term)) { continue; } if (isSkipTerm(fieldName, term)) { continue; } final PostingsEnum docs = termsEnum.postings(null, null); int freq = 0; while (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { freq += docs.freq(); } // increment frequency Int cnt = termFreqMap.get(term); if (cnt == null) { cnt = new Int(); termFreqMap.put(term, cnt); cnt.x = freq; } else { cnt.x += freq; } } }
/** * Prepare a document reconstructor. * * @param reader IndexReader to read from. * @param fieldNames if non-null or not empty, data will be collected only from these fields, * otherwise data will be collected from all fields * @param numTerms total number of terms in the index, or -1 if unknown (will be calculated) * @throws Exception */ public DocReconstructor(IndexReader reader, String[] fieldNames, int numTerms) throws Exception { if (reader == null) { throw new Exception("IndexReader cannot be null."); } this.reader = reader; if (fieldNames == null || fieldNames.length == 0) { // collect fieldNames this.fieldNames = (String[]) reader.getFieldNames(FieldOption.ALL).toArray(new String[0]); } else { this.fieldNames = fieldNames; } if (numTerms == -1) { Fields fields = MultiFields.getFields(reader); numTerms = 0; FieldsEnum fe = fields.iterator(); String fld = null; while ((fld = fe.next()) != null) { TermsEnum te = fe.terms(); while (te.next() != null) { numTerms++; } } this.numTerms = numTerms; } deleted = MultiFields.getDeletedDocs(reader); }
private void printSegment(PrintWriter out, SegmentCommitInfo si) throws Exception { SegmentReader reader = new SegmentReader(si, DirectoryReader.DEFAULT_TERMS_INDEX_DIVISOR, newIOContext(random())); for (int i = 0; i < reader.numDocs(); i++) out.println(reader.document(i)); Fields fields = reader.fields(); for (String field : fields) { Terms terms = fields.terms(field); assertNotNull(terms); TermsEnum tis = terms.iterator(null); while (tis.next() != null) { out.print(" term=" + field + ":" + tis.term()); out.println(" DF=" + tis.docFreq()); DocsAndPositionsEnum positions = tis.docsAndPositions(reader.getLiveDocs(), null); while (positions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { out.print(" doc=" + positions.docID()); out.print(" TF=" + positions.freq()); out.print(" pos="); out.print(positions.nextPosition()); for (int j = 1; j < positions.freq(); j++) out.print("," + positions.nextPosition()); out.println(""); } } } reader.close(); }
/* * listTermDictionary displays the term dictionary for a field. */ static void listTermDictionary(IndexReader reader, String fieldName) throws IOException { System.out.println("\nTerm Dictionary: field " + fieldName); /* Grant says: MultiFields.getTerms(IndexReader, fieldName) */ Terms terms = MultiFields.getTerms(reader, fieldName); if ((terms == null) || (terms.size() == -1)) System.out.println(" The term dictionary is empty."); else { System.out.println(" Vocabulary size: " + terms.size() + " terms"); TermsEnum ithTerm = terms.iterator(null); /* * Iterate over the terms in this document. * Information about a term's occurrences (tf and * positions) is accessed via the indexing API, which * returns inverted lists that describe (only) the * current document. */ while (ithTerm.next() != null) { System.out.format( " %-30s %d %d\n", ithTerm.term().utf8ToString(), ithTerm.docFreq(), ithTerm.totalTermFreq()); } ; } ; }
public TermInfo collect(String term) throws IOException { TermInfo info = new TermInfo(); BytesRef luceneTerm = new BytesRef(term.getBytes()); // this gives documents in which the term is found, but no offset information can be retrieved PostingsEnum postings = MultiFields.getTermDocsEnum(indexReader, ngramInfoFieldname, luceneTerm); // now go through each document int docId = postings.nextDoc(); while (docId != PostingsEnum.NO_MORE_DOCS) { // get the term vector for that document. TermsEnum it = indexReader.getTermVector(docId, ngramInfoFieldname).iterator(); // find the term of interest it.seekExact(luceneTerm); // get its posting info. this will contain offset info PostingsEnum postingsInDoc = it.postings(null, PostingsEnum.OFFSETS); postingsInDoc.nextDoc(); Document doc = indexReader.document(docId); String id = doc.get(idFieldname); JATEDocument jd = new JATEDocument(id); Set<int[]> offsets = new HashSet<>(); int totalFreq = postingsInDoc.freq(); for (int i = 0; i < totalFreq; i++) { postingsInDoc.nextPosition(); offsets.add(new int[] {postingsInDoc.startOffset(), postingsInDoc.endOffset()}); } info.getOffsets().put(jd, offsets); docId = postings.nextDoc(); } return info; }
/** * Remove a stale file (uidIter.term().text()) from the index database (and the xref file) * * @throws java.io.IOException if an error occurs */ private void removeFile() throws IOException { String path = Util.uid2url(uidIter.term().utf8ToString()); for (IndexChangedListener listener : listeners) { listener.fileRemove(path); } writer.deleteDocuments(new Term(QueryBuilder.U, uidIter.term())); writer.prepareCommit(); writer.commit(); File xrefFile; if (RuntimeEnvironment.getInstance().isCompressXref()) { xrefFile = new File(xrefDir, path + ".gz"); } else { xrefFile = new File(xrefDir, path); } File parent = xrefFile.getParentFile(); if (!xrefFile.delete() && xrefFile.exists()) { log.log(Level.INFO, "Failed to remove obsolete xref-file: {0}", xrefFile.getAbsolutePath()); } // Remove the parent directory if it's empty if (parent.delete()) { log.log(Level.FINE, "Removed empty xref dir:{0}", parent.getAbsolutePath()); } setDirty(); for (IndexChangedListener listener : listeners) { listener.fileRemoved(path); } }
public synchronized ShapeFieldCache<T> getCache(LeafReader reader) throws IOException { ShapeFieldCache<T> idx = sidx.get(reader); if (idx != null) { return idx; } long startTime = System.currentTimeMillis(); log.fine("Building Cache [" + reader.maxDoc() + "]"); idx = new ShapeFieldCache<>(reader.maxDoc(), defaultSize); int count = 0; DocsEnum docs = null; Terms terms = reader.terms(shapeField); TermsEnum te = null; if (terms != null) { te = terms.iterator(te); BytesRef term = te.next(); while (term != null) { T shape = readShape(term); if (shape != null) { docs = te.docs(null, docs, DocsEnum.FLAG_NONE); Integer docid = docs.nextDoc(); while (docid != DocIdSetIterator.NO_MORE_DOCS) { idx.add(docid, shape); docid = docs.nextDoc(); count++; } } term = te.next(); } } sidx.put(reader, idx); long elapsed = System.currentTimeMillis() - startTime; log.fine("Cached: [" + count + " in " + elapsed + "ms] " + idx); return idx; }
public static void fillQueue(TermsEnum termsEnum, TermStatsQueue tiq, String field) throws Exception { BytesRef term; while ((term = termsEnum.next()) != null) { BytesRef r = new BytesRef(); r.copyBytes(term); tiq.insertWithOverflow(new TermStats(field, r, termsEnum.docFreq())); } }
/* Copied from lucene 4.2.x core */ private static long totalTermFreq(IndexReader r, String field, BytesRef text) throws IOException { final Terms terms = MultiFields.getTerms(r, field); if (terms != null) { final TermsEnum termsEnum = terms.iterator(null); if (termsEnum.seekExact(text, true)) { return termsEnum.totalTermFreq(); } } return 0; }
public int[] toDocsArray(Term term, Bits bits, IndexReader reader) throws IOException { Fields fields = MultiFields.getFields(reader); Terms cterms = fields.terms(term.field); TermsEnum ctermsEnum = cterms.iterator(); if (ctermsEnum.seekExact(new BytesRef(term.text()))) { PostingsEnum postingsEnum = TestUtil.docs(random(), ctermsEnum, bits, null, PostingsEnum.NONE); return toArray(postingsEnum); } return null; }
/** * Returns {@link PostingsEnum} for the specified term. This will return null if either the field * or term does not exist. * * <p><b>NOTE:</b> The returned {@link PostingsEnum} may contain deleted docs. * * @see TermsEnum#postings(PostingsEnum) */ public final PostingsEnum postings(Term term, int flags) throws IOException { assert term.field() != null; assert term.bytes() != null; final Terms terms = terms(term.field()); if (terms != null) { final TermsEnum termsEnum = terms.iterator(); if (termsEnum.seekExact(term.bytes())) { return termsEnum.postings(null, flags); } } return null; }
/** * Returns {@link DocsAndPositionsEnum} for the specified term. This will return null if the field * or term does not exist or positions weren't indexed. * * @deprecated use {@link #postings(Term, int)} instead */ @Deprecated public final DocsAndPositionsEnum termPositionsEnum(Term term) throws IOException { assert term.field() != null; assert term.bytes() != null; final Terms terms = terms(term.field()); if (terms != null) { final TermsEnum termsEnum = terms.iterator(); if (termsEnum.seekExact(term.bytes())) { return termsEnum.docsAndPositions(getLiveDocs(), null); } } return null; }
/** * Returns the number of documents containing the term <code>t</code>. This method returns 0 if * the term or field does not exists. This method does not take into account deleted documents * that have not yet been merged away. */ @Override public final long totalTermFreq(Term term) throws IOException { final Terms terms = terms(term.field()); if (terms == null) { return 0; } final TermsEnum termsEnum = terms.iterator(); if (termsEnum.seekExact(term.bytes())) { return termsEnum.totalTermFreq(); } else { return 0; } }
/** tests intersect: TODO start at a random term! */ public void testIntersect() throws Exception { for (int i = 0; i < numIterations; i++) { String reg = AutomatonTestUtil.randomRegexp(random()); Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton(); CompiledAutomaton ca = new CompiledAutomaton(automaton, SpecialOperations.isFinite(automaton), false); TermsEnum te = MultiFields.getTerms(reader, "field").intersect(ca, null); Automaton expected = BasicOperations.intersection(termsAutomaton, automaton); TreeSet<BytesRef> found = new TreeSet<BytesRef>(); while (te.next() != null) { found.add(BytesRef.deepCopyOf(te.term())); } Automaton actual = BasicAutomata.makeStringUnion(found); assertTrue(BasicOperations.sameLanguage(expected, actual)); } }
/** mixes up seek and next for all terms */ public void testSeekingAndNexting() throws Exception { for (int i = 0; i < numIterations; i++) { TermsEnum te = MultiFields.getTerms(reader, "field").iterator(null); for (BytesRef term : terms) { int c = random().nextInt(3); if (c == 0) { assertEquals(term, te.next()); } else if (c == 1) { assertEquals(SeekStatus.FOUND, te.seekCeil(term, random().nextBoolean())); assertEquals(term, te.term()); } else { assertTrue(te.seekExact(term, random().nextBoolean())); } } } }
/* * Utility function to display a term vector. */ static void termVectorDisplay(Terms terms) throws IOException { if ((terms == null) || (terms.size() == -1)) System.out.println(" The field is not stored."); else { /* * The terms for this field are stored. */ System.out.println(" Vocabulary size: " + terms.size() + " terms"); TermsEnum ithTerm = terms.iterator(null); /* * Iterate over the terms in this document. * Information about a term's occurrences (tf and * positions) is accessed via the indexing API, which * returns inverted lists that describe (only) the * current document. */ while (ithTerm.next() != null) { System.out.format( " %10d %-20s %d ", ithTerm.ord(), ithTerm.term().utf8ToString(), ithTerm.totalTermFreq()); DocsAndPositionsEnum currDoc = ithTerm.docsAndPositions(null, null); currDoc.nextDoc(); for (int jthPosition = 0; jthPosition < ithTerm.totalTermFreq(); jthPosition++) System.out.print(currDoc.nextPosition() + " "); System.out.println(); } ; } ; }
private void verifyCount(IndexReader ir) throws Exception { Fields fields = MultiFields.getFields(ir); for (String field : fields) { Terms terms = fields.terms(field); if (terms == null) { continue; } int docCount = terms.getDocCount(); FixedBitSet visited = new FixedBitSet(ir.maxDoc()); TermsEnum te = terms.iterator(); while (te.next() != null) { PostingsEnum de = TestUtil.docs(random(), te, null, PostingsEnum.NONE); while (de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { visited.set(de.docID()); } } assertEquals(visited.cardinality(), docCount); } }
public void listTokens(int freq) throws IOException { IndexReader ireader = null; TermsEnum iter = null; Terms terms = null; try { ireader = DirectoryReader.open(indexDirectory); int numDocs = ireader.numDocs(); if (numDocs > 0) { Fields uFields = MultiFields.getFields(ireader); // reader.getTermVectors(0); terms = uFields.terms(QueryBuilder.DEFS); } iter = terms.iterator(null); // init uid iterator while (iter.term() != null) { // if (iter.term().field().startsWith("f")) { if (iter.docFreq() > 16 && iter.term().utf8ToString().length() > freq) { log.warning(iter.term().utf8ToString()); } iter.next(); /*} else { break; }*/ } } finally { if (ireader != null) { try { ireader.close(); } catch (IOException e) { log.log(Level.WARNING, "An error occured while closing index reader", e); } } } }
/** seeks to every term accepted by some automata */ public void testSeeking() throws Exception { for (int i = 0; i < numIterations; i++) { String reg = AutomatonTestUtil.randomRegexp(random()); Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton(); TermsEnum te = MultiFields.getTerms(reader, "field").iterator(null); ArrayList<BytesRef> unsortedTerms = new ArrayList<BytesRef>(terms); Collections.shuffle(unsortedTerms, random()); for (BytesRef term : unsortedTerms) { if (BasicOperations.run(automaton, term.utf8ToString())) { // term is accepted if (random().nextBoolean()) { // seek exact assertTrue(te.seekExact(term, random().nextBoolean())); } else { // seek ceil assertEquals(SeekStatus.FOUND, te.seekCeil(term, random().nextBoolean())); assertEquals(term, te.term()); } } } } }
@Override public DoubleArrayAtomicFieldData loadDirect(AtomicReaderContext context) throws Exception { AtomicReader reader = context.reader(); Terms terms = reader.terms(getFieldNames().indexName()); if (terms == null) { return DoubleArrayAtomicFieldData.EMPTY; } // TODO: how can we guess the number of terms? numerics end up creating more terms per value... final TDoubleArrayList values = new TDoubleArrayList(); ArrayList<int[]> ordinals = new ArrayList<int[]>(); int[] idx = new int[reader.maxDoc()]; ordinals.add(new int[reader.maxDoc()]); values.add(0); // first "t" indicates null value int termOrd = 1; // current term number TermsEnum termsEnum = terms.iterator(null); try { DocsEnum docsEnum = null; for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) { values.add(FieldCache.NUMERIC_UTILS_DOUBLE_PARSER.parseDouble(term)); docsEnum = termsEnum.docs(reader.getLiveDocs(), docsEnum, 0); for (int docId = docsEnum.nextDoc(); docId != DocsEnum.NO_MORE_DOCS; docId = docsEnum.nextDoc()) { int[] ordinal; if (idx[docId] >= ordinals.size()) { ordinal = new int[reader.maxDoc()]; ordinals.add(ordinal); } else { ordinal = ordinals.get(idx[docId]); } ordinal[docId] = termOrd; idx[docId]++; } termOrd++; } } catch (RuntimeException e) { if (e.getClass().getName().endsWith("StopFillCacheException")) { // all is well, in case numeric parsers are used. } else { throw e; } } if (ordinals.size() == 1) { int[] nativeOrdinals = ordinals.get(0); FixedBitSet set = new FixedBitSet(reader.maxDoc()); double[] sValues = new double[reader.maxDoc()]; boolean allHaveValue = true; for (int i = 0; i < nativeOrdinals.length; i++) { int nativeOrdinal = nativeOrdinals[i]; if (nativeOrdinal == 0) { allHaveValue = false; } else { set.set(i); sValues[i] = values.get(nativeOrdinal); } } if (allHaveValue) { return new DoubleArrayAtomicFieldData.Single(sValues, reader.maxDoc()); } else { return new DoubleArrayAtomicFieldData.SingleFixedSet(sValues, reader.maxDoc(), set); } } else { int[][] nativeOrdinals = new int[ordinals.size()][]; for (int i = 0; i < nativeOrdinals.length; i++) { nativeOrdinals[i] = ordinals.get(i); } return new DoubleArrayAtomicFieldData.WithOrdinals( values.toArray(new double[values.size()]), reader.maxDoc(), Ordinals.Factories.createFromFlatOrdinals( nativeOrdinals, termOrd, fieldDataType.getSettings())); } }
// NumericDocValues Updates // If otherFieldUpdates != null, we need to merge the updates into them private synchronized Map<String, NumericFieldUpdates> applyNumericDocValuesUpdates( Iterable<NumericUpdate> updates, ReadersAndUpdates rld, SegmentReader reader, Map<String, NumericFieldUpdates> otherFieldUpdates) throws IOException { Fields fields = reader.fields(); if (fields == null) { // This reader has no postings return Collections.emptyMap(); } // TODO: we can process the updates per DV field, from last to first so that // if multiple terms affect same document for the same field, we add an update // only once (that of the last term). To do that, we can keep a bitset which // marks which documents have already been updated. So e.g. if term T1 // updates doc 7, and then we process term T2 and it updates doc 7 as well, // we don't apply the update since we know T1 came last and therefore wins // the update. // We can also use that bitset as 'liveDocs' to pass to TermEnum.docs(), so // that these documents aren't even returned. String currentField = null; TermsEnum termsEnum = null; DocsEnum docs = null; final Map<String, NumericFieldUpdates> result = otherFieldUpdates == null ? new HashMap<String, NumericFieldUpdates>() : otherFieldUpdates; // System.out.println(Thread.currentThread().getName() + " numericDVUpdate reader=" + reader); for (NumericUpdate update : updates) { Term term = update.term; int limit = update.docIDUpto; // TODO: we traverse the terms in update order (not term order) so that we // apply the updates in the correct order, i.e. if two terms udpate the // same document, the last one that came in wins, irrespective of the // terms lexical order. // we can apply the updates in terms order if we keep an updatesGen (and // increment it with every update) and attach it to each NumericUpdate. Note // that we cannot rely only on docIDUpto because an app may send two updates // which will get same docIDUpto, yet will still need to respect the order // those updates arrived. if (!term.field().equals(currentField)) { // if we change the code to process updates in terms order, enable this assert // assert currentField == null || currentField.compareTo(term.field()) < 0; currentField = term.field(); Terms terms = fields.terms(currentField); if (terms != null) { termsEnum = terms.iterator(termsEnum); } else { termsEnum = null; continue; // no terms in that field } } if (termsEnum == null) { continue; } // System.out.println(" term=" + term); if (termsEnum.seekExact(term.bytes())) { // we don't need term frequencies for this DocsEnum docsEnum = termsEnum.docs(rld.getLiveDocs(), docs, DocsEnum.FLAG_NONE); // System.out.println("BDS: got docsEnum=" + docsEnum); NumericFieldUpdates fieldUpdates = result.get(update.field); if (fieldUpdates == null) { fieldUpdates = new NumericFieldUpdates.PackedNumericFieldUpdates(reader.maxDoc()); result.put(update.field, fieldUpdates); } int doc; while ((doc = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { // System.out.println(Thread.currentThread().getName() + " numericDVUpdate term=" + term + // " doc=" + docID); if (doc >= limit) { break; // no more docs that can be updated for this term } fieldUpdates.add(doc, update.value); } } } return result; }
// Delete by Term private synchronized long applyTermDeletes( Iterable<Term> termsIter, ReadersAndUpdates rld, SegmentReader reader) throws IOException { long delCount = 0; Fields fields = reader.fields(); if (fields == null) { // This reader has no postings return 0; } TermsEnum termsEnum = null; String currentField = null; DocsEnum docs = null; assert checkDeleteTerm(null); boolean any = false; // System.out.println(Thread.currentThread().getName() + " del terms reader=" + reader); for (Term term : termsIter) { // Since we visit terms sorted, we gain performance // by re-using the same TermsEnum and seeking only // forwards if (!term.field().equals(currentField)) { assert currentField == null || currentField.compareTo(term.field()) < 0; currentField = term.field(); Terms terms = fields.terms(currentField); if (terms != null) { termsEnum = terms.iterator(termsEnum); } else { termsEnum = null; } } if (termsEnum == null) { continue; } assert checkDeleteTerm(term); // System.out.println(" term=" + term); if (termsEnum.seekExact(term.bytes())) { // we don't need term frequencies for this DocsEnum docsEnum = termsEnum.docs(rld.getLiveDocs(), docs, DocsEnum.FLAG_NONE); // System.out.println("BDS: got docsEnum=" + docsEnum); if (docsEnum != null) { while (true) { final int docID = docsEnum.nextDoc(); // System.out.println(Thread.currentThread().getName() + " del term=" + term + " doc=" + // docID); if (docID == DocIdSetIterator.NO_MORE_DOCS) { break; } if (!any) { rld.initWritableLiveDocs(); any = true; } // NOTE: there is no limit check on the docID // when deleting by Term (unlike by Query) // because on flush we apply all Term deletes to // each segment. So all Term deleting here is // against prior segments: if (rld.delete(docID)) { delCount++; } } } } } return delCount; }
public SparseInstances readIndex(String indexPath, String destFile, int threshold) throws Exception { if (indexPath == null || destFile == null) { System.out.println("error: indexPath or destFile is null\n"); return null; } DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexPath))); Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms(reviewKey); int capacity = (int) terms.size(); HashMap<String, Integer> wordDict = new HashMap<>(capacity); capacity = capacity > 65535 ? 65535 : capacity; SparseInstances instData = new SparseInstances(capacity, reader.numDocs()); TermsEnum termsEnum = terms.iterator(); int index = 0; BytesRef term = null; String strTerm = null; while ((term = termsEnum.next()) != null) { strTerm = term.toString(); if (termsEnum.totalTermFreq() < threshold) { continue; } if (strTerm.isEmpty()) { continue; } if (wordDict.get(strTerm) != null) { continue; } instData.addAttribute(strTerm); index++; } int numAtt = instData.numAttributes(); int numInst = instData.numInstances(); Integer attIndex = null; String id = null; int termIndex = 0; for (int docIndex = 0; docIndex < numInst; docIndex++) { id = reader.document(docIndex).getField(idKey).stringValue(); Terms docTerms = reader.getTermVector(docIndex, reviewKey); if (docTerms == null) { continue; } int[] indices = new int[(int) docTerms.size()]; double[] attValues = new double[(int) docTerms.size()]; termsEnum = docTerms.iterator(); termIndex = 0; while ((term = termsEnum.next()) != null) { strTerm = term.toString(); attIndex = wordDict.get(strTerm); if (attIndex == null) { continue; } indices[termIndex] = attIndex.intValue(); attValues[termIndex] = termsEnum.totalTermFreq(); } ESparseInstance instance = new ESparseInstance(id, 1.0, attValues, indices, numAtt); instData.addInstance(instance); } return null; }
/** * Generate indexes recursively * * @param dir the root indexDirectory to generate indexes for * @param path the path * @param count_only if true will just traverse the source root and count files * @param cur_count current count during the traversal of the tree * @param est_total estimate total files to process */ private int indexDown(File dir, String parent, boolean count_only, int cur_count, int est_total) throws IOException { int lcur_count = cur_count; if (isInterrupted()) { return lcur_count; } if (!accept(dir)) { return lcur_count; } File[] files = dir.listFiles(); if (files == null) { log.log(Level.SEVERE, "Failed to get file listing for: {0}", dir.getAbsolutePath()); return lcur_count; } Arrays.sort( files, new Comparator<File>() { @Override public int compare(File p1, File p2) { return p1.getName().compareTo(p2.getName()); } }); for (File file : files) { if (accept(dir, file)) { String path = parent + '/' + file.getName(); if (file.isDirectory()) { lcur_count = indexDown(file, path, count_only, lcur_count, est_total); } else { lcur_count++; if (count_only) { continue; } if (RuntimeEnvironment.getInstance().isPrintProgress() && est_total > 0 && log.isLoggable(Level.INFO)) { log.log( Level.INFO, "Progress: {0} ({1}%)", new Object[] {lcur_count, (lcur_count * 100.0f / est_total)}); } if (uidIter != null) { String uid = Util.path2uid( path, DateTools.timeToString( file.lastModified(), DateTools.Resolution.MILLISECOND)); // construct uid for doc BytesRef buid = new BytesRef(uid); while (uidIter.term() != null && uidIter.term().compareTo(emptyBR) != 0 && uidIter.term().compareTo(buid) < 0) { removeFile(); uidIter.next(); } if (uidIter.term() != null && uidIter.term().bytesEquals(buid)) { uidIter.next(); // keep matching docs continue; } } try { addFile(file, path); } catch (Exception e) { log.log(Level.WARNING, "Failed to add file " + file.getAbsolutePath(), e); } } } } return lcur_count; }
@Override public void process(ResponseBuilder rb) throws IOException { SolrParams params = rb.req.getParams(); if (!params.getBool(TermsParams.TERMS, false)) return; String[] fields = params.getParams(TermsParams.TERMS_FIELD); NamedList<Object> termsResult = new SimpleOrderedMap<>(); rb.rsp.add("terms", termsResult); if (fields == null || fields.length == 0) return; int limit = params.getInt(TermsParams.TERMS_LIMIT, 10); if (limit < 0) { limit = Integer.MAX_VALUE; } String lowerStr = params.get(TermsParams.TERMS_LOWER); String upperStr = params.get(TermsParams.TERMS_UPPER); boolean upperIncl = params.getBool(TermsParams.TERMS_UPPER_INCLUSIVE, false); boolean lowerIncl = params.getBool(TermsParams.TERMS_LOWER_INCLUSIVE, true); boolean sort = !TermsParams.TERMS_SORT_INDEX.equals( params.get(TermsParams.TERMS_SORT, TermsParams.TERMS_SORT_COUNT)); int freqmin = params.getInt(TermsParams.TERMS_MINCOUNT, 1); int freqmax = params.getInt(TermsParams.TERMS_MAXCOUNT, UNLIMITED_MAX_COUNT); if (freqmax < 0) { freqmax = Integer.MAX_VALUE; } String prefix = params.get(TermsParams.TERMS_PREFIX_STR); String regexp = params.get(TermsParams.TERMS_REGEXP_STR); Pattern pattern = regexp != null ? Pattern.compile(regexp, resolveRegexpFlags(params)) : null; boolean raw = params.getBool(TermsParams.TERMS_RAW, false); final AtomicReader indexReader = rb.req.getSearcher().getAtomicReader(); Fields lfields = indexReader.fields(); for (String field : fields) { NamedList<Integer> fieldTerms = new NamedList<>(); termsResult.add(field, fieldTerms); Terms terms = lfields == null ? null : lfields.terms(field); if (terms == null) { // no terms for this field continue; } FieldType ft = raw ? null : rb.req.getSchema().getFieldTypeNoEx(field); if (ft == null) ft = new StrField(); // prefix must currently be text BytesRef prefixBytes = prefix == null ? null : new BytesRef(prefix); BytesRef upperBytes = null; if (upperStr != null) { upperBytes = new BytesRef(); ft.readableToIndexed(upperStr, upperBytes); } BytesRef lowerBytes; if (lowerStr == null) { // If no lower bound was specified, use the prefix lowerBytes = prefixBytes; } else { lowerBytes = new BytesRef(); if (raw) { // TODO: how to handle binary? perhaps we don't for "raw"... or if the field exists // perhaps we detect if the FieldType is non-character and expect hex if so? lowerBytes = new BytesRef(lowerStr); } else { lowerBytes = new BytesRef(); ft.readableToIndexed(lowerStr, lowerBytes); } } TermsEnum termsEnum = terms.iterator(null); BytesRef term = null; if (lowerBytes != null) { if (termsEnum.seekCeil(lowerBytes) == TermsEnum.SeekStatus.END) { termsEnum = null; } else { term = termsEnum.term(); // Only advance the enum if we are excluding the lower bound and the lower Term actually // matches if (lowerIncl == false && term.equals(lowerBytes)) { term = termsEnum.next(); } } } else { // position termsEnum on first term term = termsEnum.next(); } int i = 0; BoundedTreeSet<CountPair<BytesRef, Integer>> queue = (sort ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(limit) : null); CharsRef external = new CharsRef(); while (term != null && (i < limit || sort)) { boolean externalized = false; // did we fill in "external" yet for this term? // stop if the prefix doesn't match if (prefixBytes != null && !StringHelper.startsWith(term, prefixBytes)) break; if (pattern != null) { // indexed text or external text? // TODO: support "raw" mode? ft.indexedToReadable(term, external); externalized = true; if (!pattern.matcher(external).matches()) { term = termsEnum.next(); continue; } } if (upperBytes != null) { int upperCmp = term.compareTo(upperBytes); // if we are past the upper term, or equal to it (when don't include upper) then stop. if (upperCmp > 0 || (upperCmp == 0 && !upperIncl)) break; } // This is a good term in the range. Check if mincount/maxcount conditions are satisfied. int docFreq = termsEnum.docFreq(); if (docFreq >= freqmin && docFreq <= freqmax) { // add the term to the list if (sort) { queue.add(new CountPair<>(BytesRef.deepCopyOf(term), docFreq)); } else { // TODO: handle raw somehow if (!externalized) { ft.indexedToReadable(term, external); } fieldTerms.add(external.toString(), docFreq); i++; } } term = termsEnum.next(); } if (sort) { for (CountPair<BytesRef, Integer> item : queue) { if (i >= limit) break; ft.indexedToReadable(item.key, external); fieldTerms.add(external.toString(), item.val); i++; } } } }
@Test public void testRandom() throws Exception { Directory directory = newDirectory(); RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory); int numUniqueChildValues = 1 + random().nextInt(TEST_NIGHTLY ? 10000 : 1000); String[] childValues = new String[numUniqueChildValues]; for (int i = 0; i < numUniqueChildValues; i++) { childValues[i] = Integer.toString(i); } IntOpenHashSet filteredOrDeletedDocs = new IntOpenHashSet(); int childDocId = 0; int numParentDocs = 1 + random().nextInt(TEST_NIGHTLY ? 20000 : 1000); ObjectObjectOpenHashMap<String, NavigableSet<String>> childValueToParentIds = new ObjectObjectOpenHashMap<String, NavigableSet<String>>(); for (int parentDocId = 0; parentDocId < numParentDocs; parentDocId++) { boolean markParentAsDeleted = rarely(); boolean filterMe = rarely(); String parent = Integer.toString(parentDocId); Document document = new Document(); document.add( new StringField(UidFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.YES)); document.add(new StringField(TypeFieldMapper.NAME, "parent", Field.Store.NO)); if (markParentAsDeleted) { filteredOrDeletedDocs.add(parentDocId); document.add(new StringField("delete", "me", Field.Store.NO)); } if (filterMe) { filteredOrDeletedDocs.add(parentDocId); document.add(new StringField("filter", "me", Field.Store.NO)); } indexWriter.addDocument(document); int numChildDocs; if (rarely()) { numChildDocs = random().nextInt(TEST_NIGHTLY ? 100 : 25); } else { numChildDocs = random().nextInt(TEST_NIGHTLY ? 40 : 10); } for (int i = 0; i < numChildDocs; i++) { boolean markChildAsDeleted = rarely(); String childValue = childValues[random().nextInt(childValues.length)]; document = new Document(); document.add( new StringField( UidFieldMapper.NAME, Uid.createUid("child", Integer.toString(childDocId)), Field.Store.NO)); document.add(new StringField(TypeFieldMapper.NAME, "child", Field.Store.NO)); document.add( new StringField( ParentFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.NO)); document.add(new StringField("field1", childValue, Field.Store.NO)); if (markChildAsDeleted) { document.add(new StringField("delete", "me", Field.Store.NO)); } indexWriter.addDocument(document); if (!markChildAsDeleted) { NavigableSet<String> parentIds; if (childValueToParentIds.containsKey(childValue)) { parentIds = childValueToParentIds.lget(); } else { childValueToParentIds.put(childValue, parentIds = new TreeSet<String>()); } if (!markParentAsDeleted && !filterMe) { parentIds.add(parent); } } } } // Delete docs that are marked to be deleted. indexWriter.deleteDocuments(new Term("delete", "me")); indexWriter.commit(); IndexReader indexReader = DirectoryReader.open(directory); IndexSearcher searcher = new IndexSearcher(indexReader); Engine.Searcher engineSearcher = new Engine.SimpleSearcher(ChildrenConstantScoreQueryTests.class.getSimpleName(), searcher); ((TestSearchContext) SearchContext.current()) .setSearcher(new ContextIndexSearcher(SearchContext.current(), engineSearcher)); Filter rawParentFilter = new TermFilter(new Term(TypeFieldMapper.NAME, "parent")); Filter rawFilterMe = new NotFilter(new TermFilter(new Term("filter", "me"))); int max = numUniqueChildValues / 4; for (int i = 0; i < max; i++) { // Randomly pick a cached version: there is specific logic inside ChildrenQuery that deals // with the fact // that deletes are applied at the top level when filters are cached. Filter parentFilter; if (random().nextBoolean()) { parentFilter = SearchContext.current().filterCache().cache(rawParentFilter); } else { parentFilter = rawParentFilter; } // Using this in FQ, will invoke / test the Scorer#advance(..) and also let the Weight#scorer // not get live docs as acceptedDocs Filter filterMe; if (random().nextBoolean()) { filterMe = SearchContext.current().filterCache().cache(rawFilterMe); } else { filterMe = rawFilterMe; } // Simulate a parent update if (random().nextBoolean()) { int numberOfUpdates = 1 + random().nextInt(TEST_NIGHTLY ? 25 : 5); for (int j = 0; j < numberOfUpdates; j++) { int parentId; do { parentId = random().nextInt(numParentDocs); } while (filteredOrDeletedDocs.contains(parentId)); String parentUid = Uid.createUid("parent", Integer.toString(parentId)); indexWriter.deleteDocuments(new Term(UidFieldMapper.NAME, parentUid)); Document document = new Document(); document.add(new StringField(UidFieldMapper.NAME, parentUid, Field.Store.YES)); document.add(new StringField(TypeFieldMapper.NAME, "parent", Field.Store.NO)); indexWriter.addDocument(document); } indexReader.close(); indexReader = DirectoryReader.open(indexWriter.w, true); searcher = new IndexSearcher(indexReader); engineSearcher = new Engine.SimpleSearcher( ChildrenConstantScoreQueryTests.class.getSimpleName(), searcher); ((TestSearchContext) SearchContext.current()) .setSearcher(new ContextIndexSearcher(SearchContext.current(), engineSearcher)); } String childValue = childValues[random().nextInt(numUniqueChildValues)]; TermQuery childQuery = new TermQuery(new Term("field1", childValue)); int shortCircuitParentDocSet = random().nextInt(numParentDocs); Filter nonNestedDocsFilter = random().nextBoolean() ? NonNestedDocsFilter.INSTANCE : null; Query query; if (random().nextBoolean()) { // Usage in HasChildQueryParser query = new ChildrenConstantScoreQuery( childQuery, "parent", "child", parentFilter, shortCircuitParentDocSet, nonNestedDocsFilter); } else { // Usage in HasChildFilterParser query = new XConstantScoreQuery( new CustomQueryWrappingFilter( new ChildrenConstantScoreQuery( childQuery, "parent", "child", parentFilter, shortCircuitParentDocSet, nonNestedDocsFilter))); } query = new XFilteredQuery(query, filterMe); BitSetCollector collector = new BitSetCollector(indexReader.maxDoc()); searcher.search(query, collector); FixedBitSet actualResult = collector.getResult(); FixedBitSet expectedResult = new FixedBitSet(indexReader.maxDoc()); if (childValueToParentIds.containsKey(childValue)) { AtomicReader slowAtomicReader = SlowCompositeReaderWrapper.wrap(indexReader); Terms terms = slowAtomicReader.terms(UidFieldMapper.NAME); if (terms != null) { NavigableSet<String> parentIds = childValueToParentIds.lget(); TermsEnum termsEnum = terms.iterator(null); DocsEnum docsEnum = null; for (String id : parentIds) { TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(Uid.createUidAsBytes("parent", id)); if (seekStatus == TermsEnum.SeekStatus.FOUND) { docsEnum = termsEnum.docs(slowAtomicReader.getLiveDocs(), docsEnum, DocsEnum.FLAG_NONE); expectedResult.set(docsEnum.nextDoc()); } else if (seekStatus == TermsEnum.SeekStatus.END) { break; } } } } assertBitSet(actualResult, expectedResult, searcher); } indexWriter.close(); indexReader.close(); directory.close(); }
/** * Update the content of this index database * * @throws IOException if an error occurs * @throws HistoryException if an error occurs when accessing the history */ public void update() throws IOException, HistoryException { synchronized (lock) { if (running) { throw new IOException("Indexer already running!"); } running = true; interrupted = false; } String ctgs = RuntimeEnvironment.getInstance().getCtags(); if (ctgs != null) { ctags = new Ctags(); ctags.setBinary(ctgs); } if (ctags == null) { log.severe("Unable to run ctags! searching definitions will not work!"); } if (ctags != null) { String filename = RuntimeEnvironment.getInstance().getCTagsExtraOptionsFile(); if (filename != null) { ctags.setCTagsExtraOptionsFile(filename); } } try { Analyzer analyzer = AnalyzerGuru.getAnalyzer(); IndexWriterConfig iwc = new IndexWriterConfig(SearchEngine.LUCENE_VERSION, analyzer); iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); // iwc.setRAMBufferSizeMB(256.0); //TODO check what is the sweet spot writer = new IndexWriter(indexDirectory, iwc); writer.commit(); // to make sure index exists on the disk // writer.setMaxFieldLength(RuntimeEnvironment.getInstance().getIndexWordLimit()); if (directories.isEmpty()) { if (project == null) { directories.add(""); } else { directories.add(project.getPath()); } } for (String dir : directories) { File sourceRoot; if ("".equals(dir)) { sourceRoot = RuntimeEnvironment.getInstance().getSourceRootFile(); } else { sourceRoot = new File(RuntimeEnvironment.getInstance().getSourceRootFile(), dir); } HistoryGuru.getInstance().ensureHistoryCacheExists(sourceRoot); String startuid = Util.path2uid(dir, ""); IndexReader reader = DirectoryReader.open(indexDirectory); // open existing index Terms terms = null; int numDocs = reader.numDocs(); if (numDocs > 0) { Fields uFields = MultiFields.getFields(reader); // reader.getTermVectors(0); terms = uFields.terms(QueryBuilder.U); } try { if (numDocs > 0) { uidIter = terms.iterator(null); TermsEnum.SeekStatus stat = uidIter.seekCeil(new BytesRef(startuid), true); // init uid if (stat == TermsEnum.SeekStatus.END || stat == TermsEnum.SeekStatus.NOT_FOUND) { uidIter = null; } } // TODO below should be optional, since it traverses the tree once more to get total // count! :( int file_cnt = 0; if (RuntimeEnvironment.getInstance().isPrintProgress()) { log.log(Level.INFO, "Counting files in {0} ...", dir); file_cnt = indexDown(sourceRoot, dir, true, 0, 0); if (log.isLoggable(Level.INFO)) { log.log( Level.INFO, "Need to process: {0} files for {1}", new Object[] {file_cnt, dir}); } } indexDown(sourceRoot, dir, false, 0, file_cnt); while (uidIter != null && uidIter.term() != null && uidIter.term().utf8ToString().startsWith(startuid)) { removeFile(); uidIter.next(); } } finally { reader.close(); } } } finally { if (writer != null) { try { writer.prepareCommit(); writer.commit(); writer.close(); } catch (IOException e) { log.log(Level.WARNING, "An error occured while closing writer", e); } } if (ctags != null) { try { ctags.close(); } catch (IOException e) { log.log(Level.WARNING, "An error occured while closing ctags process", e); } } synchronized (lock) { running = false; } } if (!isInterrupted() && isDirty()) { if (RuntimeEnvironment.getInstance().isOptimizeDatabase()) { optimize(); } createSpellingSuggestions(); RuntimeEnvironment env = RuntimeEnvironment.getInstance(); File timestamp = new File(env.getDataRootFile(), "timestamp"); if (timestamp.exists()) { if (!timestamp.setLastModified(System.currentTimeMillis())) { log.log( Level.WARNING, "Failed to set last modified time on ''{0}'', used for timestamping the index database.", timestamp.getAbsolutePath()); } } else { if (!timestamp.createNewFile()) { log.log( Level.WARNING, "Failed to create file ''{0}'', used for timestamping the index database.", timestamp.getAbsolutePath()); } } } }
/** * Reconstruct document fields. * * @param docNum document number. If this document is deleted, but the index is not optimized yet, * the reconstruction process may still yield the reconstructed field content even from * deleted documents. * @return reconstructed document * @throws Exception */ public Reconstructed reconstruct(int docNum) throws Exception { if (docNum < 0 || docNum > reader.maxDoc()) { throw new Exception("Document number outside of valid range."); } Reconstructed res = new Reconstructed(); if (deleted != null && deleted.get(docNum)) { throw new Exception("Document is deleted."); } else { Document doc = reader.document(docNum); for (int i = 0; i < fieldNames.length; i++) { Field[] fs = doc.getFields(fieldNames[i]); if (fs != null && fs.length > 0) { res.getStoredFields().put(fieldNames[i], fs); } } } // collect values from unstored fields HashSet<String> fields = new HashSet<String>(Arrays.asList(fieldNames)); // try to use term vectors if available progress.maxValue = fieldNames.length; progress.curValue = 0; progress.minValue = 0; for (int i = 0; i < fieldNames.length; i++) { TermFreqVector tvf = reader.getTermFreqVector(docNum, fieldNames[i]); if (tvf != null && tvf.size() > 0 && (tvf instanceof TermPositionVector)) { TermPositionVector tpv = (TermPositionVector) tvf; progress.message = "Reading term vectors ..."; progress.curValue = i; setChanged(); notifyObservers(progress); BytesRef[] tv = tpv.getTerms(); for (int k = 0; k < tv.length; k++) { // do we have positions? int[] posArr = tpv.getTermPositions(k); if (posArr == null) { // only offsets TermVectorOffsetInfo[] offsets = tpv.getOffsets(k); if (offsets.length == 0) { continue; } // convert offsets into positions posArr = convertOffsets(offsets); } GrowableStringArray gsa = res.getReconstructedFields().get(fieldNames[i]); if (gsa == null) { gsa = new GrowableStringArray(); res.getReconstructedFields().put(fieldNames[i], gsa); } for (int m = 0; m < posArr.length; m++) { gsa.append(posArr[m], "|", tv[k].utf8ToString()); } } fields.remove(fieldNames[i]); // got what we wanted } } // this loop collects data only from left-over fields // not yet collected through term vectors progress.maxValue = fields.size(); progress.curValue = 0; progress.minValue = 0; for (String fld : fields) { progress.message = "Collecting terms in " + fld + " ..."; progress.curValue++; setChanged(); notifyObservers(progress); Terms terms = MultiFields.getTerms(reader, fld); if (terms == null) { // no terms in this field continue; } TermsEnum te = terms.iterator(); while (te.next() != null) { DocsAndPositionsEnum dpe = te.docsAndPositions(deleted, null); if (dpe == null) { // no position info for this field break; } int num = dpe.advance(docNum); if (num != docNum) { // either greater than or NO_MORE_DOCS continue; // no data for this term in this doc } String term = te.term().utf8ToString(); GrowableStringArray gsa = (GrowableStringArray) res.getReconstructedFields().get(fld); if (gsa == null) { gsa = new GrowableStringArray(); res.getReconstructedFields().put(fld, gsa); } for (int k = 0; k < dpe.freq(); k++) { int pos = dpe.nextPosition(); gsa.append(pos, "|", term); } } } progress.message = "Done."; progress.curValue = 100; setChanged(); notifyObservers(progress); return res; }
@Test public void testRandom() throws Exception { Directory directory = newDirectory(); final Random r = random(); final IndexWriterConfig iwc = LuceneTestCase.newIndexWriterConfig(r, new MockAnalyzer(r)) .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH) .setRAMBufferSizeMB( scaledRandomIntBetween(16, 64)); // we might index a lot - don't go crazy here RandomIndexWriter indexWriter = new RandomIndexWriter(r, directory, iwc); int numUniqueChildValues = scaledRandomIntBetween(100, 2000); String[] childValues = new String[numUniqueChildValues]; for (int i = 0; i < numUniqueChildValues; i++) { childValues[i] = Integer.toString(i); } IntOpenHashSet filteredOrDeletedDocs = new IntOpenHashSet(); int childDocId = 0; int numParentDocs = scaledRandomIntBetween(1, numUniqueChildValues); ObjectObjectOpenHashMap<String, NavigableMap<String, FloatArrayList>> childValueToParentIds = new ObjectObjectOpenHashMap<>(); for (int parentDocId = 0; parentDocId < numParentDocs; parentDocId++) { boolean markParentAsDeleted = rarely(); boolean filterMe = rarely(); String parent = Integer.toString(parentDocId); Document document = new Document(); document.add( new StringField(UidFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.YES)); document.add(new StringField(TypeFieldMapper.NAME, "parent", Field.Store.NO)); if (markParentAsDeleted) { filteredOrDeletedDocs.add(parentDocId); document.add(new StringField("delete", "me", Field.Store.NO)); } if (filterMe) { filteredOrDeletedDocs.add(parentDocId); document.add(new StringField("filter", "me", Field.Store.NO)); } indexWriter.addDocument(document); int numChildDocs = scaledRandomIntBetween(0, 100); for (int i = 0; i < numChildDocs; i++) { boolean markChildAsDeleted = rarely(); String childValue = childValues[random().nextInt(childValues.length)]; document = new Document(); document.add( new StringField( UidFieldMapper.NAME, Uid.createUid("child", Integer.toString(childDocId++)), Field.Store.NO)); document.add(new StringField(TypeFieldMapper.NAME, "child", Field.Store.NO)); document.add( new StringField( ParentFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.NO)); document.add(new StringField("field1", childValue, Field.Store.NO)); if (markChildAsDeleted) { document.add(new StringField("delete", "me", Field.Store.NO)); } indexWriter.addDocument(document); if (!markChildAsDeleted) { NavigableMap<String, FloatArrayList> parentIdToChildScores; if (childValueToParentIds.containsKey(childValue)) { parentIdToChildScores = childValueToParentIds.lget(); } else { childValueToParentIds.put(childValue, parentIdToChildScores = new TreeMap<>()); } if (!markParentAsDeleted && !filterMe) { FloatArrayList childScores = parentIdToChildScores.get(parent); if (childScores == null) { parentIdToChildScores.put(parent, childScores = new FloatArrayList()); } childScores.add(1f); } } } } // Delete docs that are marked to be deleted. indexWriter.deleteDocuments(new Term("delete", "me")); indexWriter.commit(); IndexReader indexReader = DirectoryReader.open(directory); IndexSearcher searcher = new IndexSearcher(indexReader); Engine.Searcher engineSearcher = new Engine.Searcher(ChildrenQueryTests.class.getSimpleName(), searcher); ((TestSearchContext) SearchContext.current()) .setSearcher(new ContextIndexSearcher(SearchContext.current(), engineSearcher)); int max = numUniqueChildValues / 4; for (int i = 0; i < max; i++) { // Simulate a parent update if (random().nextBoolean()) { final int numberOfUpdatableParents = numParentDocs - filteredOrDeletedDocs.size(); int numberOfUpdates = RandomInts.randomIntBetween( random(), 0, Math.min(numberOfUpdatableParents, TEST_NIGHTLY ? 25 : 5)); for (int j = 0; j < numberOfUpdates; j++) { int parentId; do { parentId = random().nextInt(numParentDocs); } while (filteredOrDeletedDocs.contains(parentId)); String parentUid = Uid.createUid("parent", Integer.toString(parentId)); indexWriter.deleteDocuments(new Term(UidFieldMapper.NAME, parentUid)); Document document = new Document(); document.add(new StringField(UidFieldMapper.NAME, parentUid, Field.Store.YES)); document.add(new StringField(TypeFieldMapper.NAME, "parent", Field.Store.NO)); indexWriter.addDocument(document); } indexReader.close(); indexReader = DirectoryReader.open(indexWriter.w, true); searcher = new IndexSearcher(indexReader); engineSearcher = new Engine.Searcher(ChildrenConstantScoreQueryTests.class.getSimpleName(), searcher); ((TestSearchContext) SearchContext.current()) .setSearcher(new ContextIndexSearcher(SearchContext.current(), engineSearcher)); } String childValue = childValues[random().nextInt(numUniqueChildValues)]; int shortCircuitParentDocSet = random().nextInt(numParentDocs); ScoreType scoreType = ScoreType.values()[random().nextInt(ScoreType.values().length)]; // leave min/max set to 0 half the time int minChildren = random().nextInt(2) * scaledRandomIntBetween(0, 110); int maxChildren = random().nextInt(2) * scaledRandomIntBetween(minChildren, 110); QueryBuilder queryBuilder = hasChildQuery("child", constantScoreQuery(termQuery("field1", childValue))) .scoreType(scoreType.name().toLowerCase(Locale.ENGLISH)) .minChildren(minChildren) .maxChildren(maxChildren) .setShortCircuitCutoff(shortCircuitParentDocSet); // Using a FQ, will invoke / test the Scorer#advance(..) and also let the Weight#scorer not // get live docs as acceptedDocs queryBuilder = filteredQuery(queryBuilder, notFilter(termFilter("filter", "me"))); Query query = parseQuery(queryBuilder); BitSetCollector collector = new BitSetCollector(indexReader.maxDoc()); int numHits = 1 + random().nextInt(25); TopScoreDocCollector actualTopDocsCollector = TopScoreDocCollector.create(numHits); searcher.search(query, MultiCollector.wrap(collector, actualTopDocsCollector)); FixedBitSet actualResult = collector.getResult(); FixedBitSet expectedResult = new FixedBitSet(indexReader.maxDoc()); TopScoreDocCollector expectedTopDocsCollector = TopScoreDocCollector.create(numHits); if (childValueToParentIds.containsKey(childValue)) { LeafReader slowLeafReader = SlowCompositeReaderWrapper.wrap(indexReader); final FloatArrayList[] scores = new FloatArrayList[slowLeafReader.maxDoc()]; Terms terms = slowLeafReader.terms(UidFieldMapper.NAME); if (terms != null) { NavigableMap<String, FloatArrayList> parentIdToChildScores = childValueToParentIds.lget(); TermsEnum termsEnum = terms.iterator(null); DocsEnum docsEnum = null; for (Map.Entry<String, FloatArrayList> entry : parentIdToChildScores.entrySet()) { int count = entry.getValue().elementsCount; if (count >= minChildren && (maxChildren == 0 || count <= maxChildren)) { TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(Uid.createUidAsBytes("parent", entry.getKey())); if (seekStatus == TermsEnum.SeekStatus.FOUND) { docsEnum = termsEnum.docs(slowLeafReader.getLiveDocs(), docsEnum, DocsEnum.FLAG_NONE); expectedResult.set(docsEnum.nextDoc()); scores[docsEnum.docID()] = new FloatArrayList(entry.getValue()); } else if (seekStatus == TermsEnum.SeekStatus.END) { break; } } } } MockScorer mockScorer = new MockScorer(scoreType); final LeafCollector leafCollector = expectedTopDocsCollector.getLeafCollector(slowLeafReader.getContext()); leafCollector.setScorer(mockScorer); for (int doc = expectedResult.nextSetBit(0); doc < slowLeafReader.maxDoc(); doc = doc + 1 >= expectedResult.length() ? DocIdSetIterator.NO_MORE_DOCS : expectedResult.nextSetBit(doc + 1)) { mockScorer.scores = scores[doc]; leafCollector.collect(doc); } } assertBitSet(actualResult, expectedResult, searcher); assertTopDocs(actualTopDocsCollector.topDocs(), expectedTopDocsCollector.topDocs()); } indexWriter.close(); indexReader.close(); directory.close(); }