private void getPrefixTerms(ObjectHashSet<Term> terms, final Term prefix, final IndexReader reader)
    throws IOException {
  // SlowCompositeReaderWrapper could be used... but this would merge all terms from each segment
  // into one terms instance, which is very expensive. Therefore I think it is better to iterate
  // over each leaf individually.
  List<LeafReaderContext> leaves = reader.leaves();
  for (LeafReaderContext leaf : leaves) {
    Terms _terms = leaf.reader().terms(field);
    if (_terms == null) {
      continue;
    }

    TermsEnum termsEnum = _terms.iterator();
    TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefix.bytes());
    if (TermsEnum.SeekStatus.END == seekStatus) {
      continue;
    }

    for (BytesRef term = termsEnum.term(); term != null; term = termsEnum.next()) {
      if (!StringHelper.startsWith(term, prefix.bytes())) {
        break;
      }

      terms.add(new Term(field, BytesRef.deepCopyOf(term)));
      if (terms.size() >= maxExpansions) {
        return;
      }
    }
  }
}
/**
 * Returns {@link PostingsEnum} for the specified term. This will return null if either the field
 * or term does not exist.
 *
 * <p><b>NOTE:</b> The returned {@link PostingsEnum} may contain deleted docs.
 *
 * @see TermsEnum#postings(PostingsEnum)
 */
public final PostingsEnum postings(Term term, int flags) throws IOException {
  assert term.field() != null;
  assert term.bytes() != null;
  final Terms terms = terms(term.field());
  if (terms != null) {
    final TermsEnum termsEnum = terms.iterator();
    if (termsEnum.seekExact(term.bytes())) {
      return termsEnum.postings(null, flags);
    }
  }
  return null;
}
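// Usage sketch (not part of the original source): a minimal, hedged example of how the
// postings(Term, int) helper above is typically consumed from a LeafReader. The field name
// "body" and the term text are placeholders; assumes the usual Lucene imports
// (org.apache.lucene.index.*, org.apache.lucene.search.DocIdSetIterator, java.io.IOException).
static long sumTermFreqs(LeafReader leafReader) throws IOException {
  PostingsEnum postings = leafReader.postings(new Term("body", "lucene"), PostingsEnum.FREQS);
  if (postings == null) {
    return 0; // field or term does not exist in this leaf
  }
  long sum = 0;
  for (int doc = postings.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = postings.nextDoc()) {
    sum += postings.freq(); // NOTE: may count deleted docs, as the javadoc above warns
  }
  return sum;
}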
/**
 * Returns {@link DocsAndPositionsEnum} for the specified term. This will return null if the field
 * or term does not exist or positions weren't indexed.
 *
 * @deprecated use {@link #postings(Term, int)} instead
 */
@Deprecated
public final DocsAndPositionsEnum termPositionsEnum(Term term) throws IOException {
  assert term.field() != null;
  assert term.bytes() != null;
  final Terms terms = terms(term.field());
  if (terms != null) {
    final TermsEnum termsEnum = terms.iterator();
    if (termsEnum.seekExact(term.bytes())) {
      return termsEnum.docsAndPositions(getLiveDocs(), null);
    }
  }
  return null;
}
void processQuery(Query query, ParseContext context) {
  ParseContext.Document doc = context.doc();
  FieldType pft = (FieldType) this.fieldType();
  QueryAnalyzer.Result result;
  try {
    result = QueryAnalyzer.analyze(query);
  } catch (QueryAnalyzer.UnsupportedQueryException e) {
    doc.add(
        new Field(
            pft.extractionResultField.name(), EXTRACTION_FAILED, extractionResultField.fieldType()));
    return;
  }
  for (Term term : result.terms) {
    BytesRefBuilder builder = new BytesRefBuilder();
    builder.append(new BytesRef(term.field()));
    builder.append(FIELD_VALUE_SEPARATOR);
    builder.append(term.bytes());
    doc.add(new Field(queryTermsField.name(), builder.toBytesRef(), queryTermsField.fieldType()));
  }
  if (result.verified) {
    doc.add(
        new Field(
            extractionResultField.name(), EXTRACTION_COMPLETE, extractionResultField.fieldType()));
  } else {
    doc.add(
        new Field(
            extractionResultField.name(), EXTRACTION_PARTIAL, extractionResultField.fieldType()));
  }
}
/**
 * Split an index based on a given primary key term and a 'middle' term. If the middle term is
 * present, it's sent to dir2.
 */
public PKIndexSplitter(Directory input, Directory dir1, Directory dir2, Term midTerm) {
  this(
      input, dir1, dir2, new TermRangeFilter(midTerm.field(), null, midTerm.bytes(), true, false));
}
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
  if (maxEdits == 0 || prefixLength >= term.text().length()) { // can only match if it's exact
    return new SingleTermsEnum(terms.iterator(), term.bytes());
  }
  return new FuzzyTermsEnum(terms, atts, getTerm(), maxEdits, prefixLength, transpositions);
}
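// Usage sketch (assumption, not from the original source): constructing FuzzyQuery instances that
// exercise both branches of getTermsEnum above. The (term, maxEdits, prefixLength) constructor is
// assumed; "body"/"lucene" are placeholder field and term values.
static Query[] fuzzyExamples() {
  Query exact = new FuzzyQuery(new Term("body", "lucene"), 0, 0); // maxEdits == 0: SingleTermsEnum path
  Query fuzzy = new FuzzyQuery(new Term("body", "lucene"), 2, 1); // up to 2 edits, 1-char constant prefix
  return new Query[] {exact, fuzzy};
}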
/**
 * Returns the total number of occurrences of <code>term</code> across all documents (the sum of
 * the freq() for each doc that has this term). This method returns 0 if the term or field does
 * not exist. This method does not take into account deleted documents that have not yet been
 * merged away.
 */
@Override
public final long totalTermFreq(Term term) throws IOException {
  final Terms terms = terms(term.field());
  if (terms == null) {
    return 0;
  }
  final TermsEnum termsEnum = terms.iterator();
  if (termsEnum.seekExact(term.bytes())) {
    return termsEnum.totalTermFreq();
  } else {
    return 0;
  }
}
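// Usage sketch (assumption, not from the original source): contrasting docFreq and totalTermFreq
// on an IndexReader. "body"/"lucene" are placeholder field and term values.
static void printTermStats(IndexReader reader) throws IOException {
  Term t = new Term("body", "lucene");
  int docsContaining = reader.docFreq(t);          // number of documents containing the term
  long totalOccurrences = reader.totalTermFreq(t); // sum of per-document frequencies
  System.out.println(docsContaining + " docs, " + totalOccurrences + " occurrences");
}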
public PKIndexSplitter(
    Directory input,
    Directory dir1,
    Directory dir2,
    Term midTerm,
    IndexWriterConfig config1,
    IndexWriterConfig config2) {
  this(
      input,
      dir1,
      dir2,
      new TermRangeFilter(midTerm.field(), null, midTerm.bytes(), true, false),
      config1,
      config2);
}
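// Usage sketch (assumption, not from the original source): splitting an index on a primary-key
// field named "id" at a hypothetical middle value. Per the javadoc above, documents whose id is
// below the middle term match the range filter; the middle term itself and everything above it
// end up in dir2. Assumes PKIndexSplitter's split() method.
static void splitOnPrimaryKey(Directory input, Directory dir1, Directory dir2) throws IOException {
  PKIndexSplitter splitter = new PKIndexSplitter(input, dir1, dir2, new Term("id", "m"));
  splitter.split();
}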
protected Query blendTermQuery(Term term, MappedFieldType fieldType) {
  if (fuzziness != null) {
    if (fieldType != null) {
      try {
        Query query =
            fieldType.fuzzyQuery(
                term.text(), fuzziness, fuzzyPrefixLength, maxExpansions, transpositions);
        if (query instanceof FuzzyQuery) {
          QueryParsers.setRewriteMethod((FuzzyQuery) query, fuzzyRewriteMethod);
        }
        return query;
      } catch (RuntimeException e) {
        return new TermQuery(term); // See long comment below about why we're lenient here.
      }
    }
    int edits = fuzziness.asDistance(term.text());
    FuzzyQuery query =
        new FuzzyQuery(term, edits, fuzzyPrefixLength, maxExpansions, transpositions);
    QueryParsers.setRewriteMethod(query, fuzzyRewriteMethod);
    return query;
  }
  if (fieldType != null) {
    /*
     * It's a bit weird to default to lenient here, but it's backwards compatible. It makes some
     * sense when you think about what we are doing here: at this point the user has forced an
     * analyzer and passed some string to the match query. We cut it up using the analyzer and
     * then tried to cram whatever we get into the field. lenient=true here means that we try the
     * terms in the query and on the off chance that they are actually valid terms then we
     * actually try them. lenient=false would mean that we blow up the query if they aren't valid
     * terms. "valid" in this context means "parses properly to something of the type being
     * queried." So "1" is a valid number, etc.
     *
     * We use the text form here because we've received the term from an analyzer that cut some
     * string into text.
     */
    Query query = termQuery(fieldType, term.bytes(), true);
    if (query != null) {
      return query;
    }
  }
  return new TermQuery(term);
}
private Query newTermQuery(IndexReader reader, Term term) throws IOException {
  if (ignoreTF) {
    return new ConstantScoreQuery(new TermQuery(term));
  } else {
    // we build an artificial TermContext that will give an overall df and ttf
    // equal to 1
    TermContext context = new TermContext(reader.getContext());
    for (LeafReaderContext leafContext : reader.leaves()) {
      Terms terms = leafContext.reader().terms(term.field());
      if (terms != null) {
        TermsEnum termsEnum = terms.iterator();
        if (termsEnum.seekExact(term.bytes())) {
          int freq = 1 - context.docFreq(); // we want the total df and ttf to be 1
          context.register(termsEnum.termState(), leafContext.ord, freq, freq);
        }
      }
    }
    return new TermQuery(term, context);
  }
}
public static DocSet createDocSet(SolrIndexSearcher searcher, Term term) throws IOException {
  DirectoryReader reader = searcher.getRawReader(); // raw reader to avoid extra wrapping overhead
  int maxDoc = searcher.getIndexReader().maxDoc();
  int smallSetSize = smallSetSize(maxDoc);

  String field = term.field();
  BytesRef termVal = term.bytes();

  int maxCount = 0;
  int firstReader = -1;
  List<LeafReaderContext> leaves = reader.leaves();
  // use array for slightly higher scanning cost, but fewer memory allocations
  PostingsEnum[] postList = new PostingsEnum[leaves.size()];
  for (LeafReaderContext ctx : leaves) {
    assert leaves.get(ctx.ord) == ctx;
    LeafReader r = ctx.reader();
    Fields f = r.fields();
    Terms t = f.terms(field);
    if (t == null) continue; // field is missing
    TermsEnum te = t.iterator();
    if (te.seekExact(termVal)) {
      maxCount += te.docFreq();
      postList[ctx.ord] = te.postings(null, PostingsEnum.NONE);
      if (firstReader < 0) firstReader = ctx.ord;
    }
  }

  if (maxCount == 0) {
    return DocSet.EMPTY;
  }

  if (maxCount <= smallSetSize) {
    return createSmallSet(leaves, postList, maxCount, firstReader);
  }

  return createBigSet(leaves, postList, maxDoc, firstReader);
}
public void collectTermContext(
    IndexReader reader,
    List<LeafReaderContext> leaves,
    TermContext[] contextArray,
    Term[] queryTerms)
    throws IOException {
  TermsEnum termsEnum = null;
  for (LeafReaderContext context : leaves) {
    final Fields fields = context.reader().fields();
    for (int i = 0; i < queryTerms.length; i++) {
      Term term = queryTerms[i];
      TermContext termContext = contextArray[i];
      final Terms terms = fields.terms(term.field());
      if (terms == null) {
        // field does not exist
        continue;
      }
      termsEnum = terms.iterator();
      assert termsEnum != null;

      if (termsEnum == TermsEnum.EMPTY) continue;
      if (termsEnum.seekExact(term.bytes())) {
        if (termContext == null) {
          contextArray[i] =
              new TermContext(
                  reader.getContext(),
                  termsEnum.termState(),
                  context.ord,
                  termsEnum.docFreq(),
                  termsEnum.totalTermFreq());
        } else {
          termContext.register(
              termsEnum.termState(), context.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
        }
      }
    }
  }
}
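// Usage sketch (assumption, not from the original source): once collectTermContext has filled
// contextArray, the per-term statistics can be reused to build TermQuery clauses without
// re-seeking the term dictionary for every segment. Assumes java.util.ArrayList/List and the
// expert TermQuery(Term, TermContext) constructor available in this Lucene version.
static List<Query> buildTermQueries(Term[] queryTerms, TermContext[] contextArray) {
  List<Query> queries = new ArrayList<>();
  for (int i = 0; i < queryTerms.length; i++) {
    if (contextArray[i] != null) { // term exists somewhere in the index
      queries.add(new TermQuery(queryTerms[i], contextArray[i]));
    }
  }
  return queries;
}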
/** @see #toString(Query,IndexSchema) */
public static void toString(Query query, IndexSchema schema, Appendable out, int flags)
    throws IOException {
  boolean writeBoost = true;

  if (query instanceof TermQuery) {
    TermQuery q = (TermQuery) query;
    Term t = q.getTerm();
    FieldType ft = writeFieldName(t.field(), schema, out, flags);
    writeFieldVal(t.bytes(), ft, out, flags);
  } else if (query instanceof TermRangeQuery) {
    TermRangeQuery q = (TermRangeQuery) query;
    String fname = q.getField();
    FieldType ft = writeFieldName(fname, schema, out, flags);
    out.append(q.includesLower() ? '[' : '{');
    BytesRef lt = q.getLowerTerm();
    BytesRef ut = q.getUpperTerm();
    if (lt == null) {
      out.append('*');
    } else {
      writeFieldVal(lt, ft, out, flags);
    }

    out.append(" TO ");

    if (ut == null) {
      out.append('*');
    } else {
      writeFieldVal(ut, ft, out, flags);
    }

    out.append(q.includesUpper() ? ']' : '}');
  } else if (query instanceof NumericRangeQuery) {
    NumericRangeQuery q = (NumericRangeQuery) query;
    String fname = q.getField();
    FieldType ft = writeFieldName(fname, schema, out, flags);
    out.append(q.includesMin() ? '[' : '{');
    Number lt = q.getMin();
    Number ut = q.getMax();
    if (lt == null) {
      out.append('*');
    } else {
      out.append(lt.toString());
    }

    out.append(" TO ");

    if (ut == null) {
      out.append('*');
    } else {
      out.append(ut.toString());
    }

    out.append(q.includesMax() ? ']' : '}');
  } else if (query instanceof BooleanQuery) {
    BooleanQuery q = (BooleanQuery) query;
    boolean needParens = false;

    if (q.getBoost() != 1.0 || q.getMinimumNumberShouldMatch() != 0 || q.isCoordDisabled()) {
      needParens = true;
    }
    if (needParens) {
      out.append('(');
    }
    boolean first = true;
    for (BooleanClause c : q.clauses()) {
      if (!first) {
        out.append(' ');
      } else {
        first = false;
      }

      if (c.isProhibited()) {
        out.append('-');
      } else if (c.isRequired()) {
        out.append('+');
      }
      Query subQuery = c.getQuery();
      boolean wrapQuery = false;

      // TODO: may need to put parens around other types
      // of queries too, depending on future syntax.
      if (subQuery instanceof BooleanQuery) {
        wrapQuery = true;
      }

      if (wrapQuery) {
        out.append('(');
      }

      toString(subQuery, schema, out, flags);

      if (wrapQuery) {
        out.append(')');
      }
    }

    if (needParens) {
      out.append(')');
    }
    if (q.getMinimumNumberShouldMatch() > 0) {
      out.append('~');
      out.append(Integer.toString(q.getMinimumNumberShouldMatch()));
    }
    if (q.isCoordDisabled()) {
      out.append("/no_coord");
    }
  } else if (query instanceof PrefixQuery) {
    PrefixQuery q = (PrefixQuery) query;
    Term prefix = q.getPrefix();
    FieldType ft = writeFieldName(prefix.field(), schema, out, flags);
    out.append(prefix.text());
    out.append('*');
  } else if (query instanceof WildcardQuery) {
    out.append(query.toString());
    writeBoost = false;
  } else if (query instanceof FuzzyQuery) {
    out.append(query.toString());
    writeBoost = false;
  } else if (query instanceof ConstantScoreQuery) {
    out.append(query.toString());
    writeBoost = false;
  } else if (query instanceof WrappedQuery) {
    WrappedQuery q = (WrappedQuery) query;
    out.append(q.getOptions());
    toString(q.getWrappedQuery(), schema, out, flags);
    writeBoost = false; // we don't use the boost on wrapped queries
  } else {
    out.append(query.getClass().getSimpleName() + '(' + query.toString() + ')');
    writeBoost = false;
  }

  if (writeBoost && query.getBoost() != 1.0f) {
    out.append("^");
    out.append(Float.toString(query.getBoost()));
  }
}
/**
 * Expert: highlights the top-N passages from multiple fields, for the provided int[] docids, to
 * custom Object as returned by the {@link PassageFormatter}. Use this API to render to something
 * other than String.
 *
 * @param fieldsIn field names to highlight. Must have a stored string value and also be indexed
 *     with offsets.
 * @param query query to highlight.
 * @param searcher searcher that was previously used to execute the query.
 * @param docidsIn containing the document IDs to highlight.
 * @param maxPassagesIn The maximum number of top-N ranked passages per-field used to form the
 *     highlighted snippets.
 * @return Map keyed on field name, containing the array of formatted snippets corresponding to
 *     the documents in <code>docidsIn</code>. If no highlights were found for a document, the
 *     first {@code maxPassages} from the field will be returned.
 * @throws IOException if an I/O error occurred during processing
 * @throws IllegalArgumentException if <code>field</code> was indexed without {@link
 *     IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
 */
protected Map<String, Object[]> highlightFieldsAsObjects(
    String fieldsIn[], Query query, IndexSearcher searcher, int[] docidsIn, int maxPassagesIn[])
    throws IOException {
  if (fieldsIn.length < 1) {
    throw new IllegalArgumentException("fieldsIn must not be empty");
  }
  if (fieldsIn.length != maxPassagesIn.length) {
    throw new IllegalArgumentException("invalid number of maxPassagesIn");
  }
  final IndexReader reader = searcher.getIndexReader();
  Query rewritten = rewrite(query);
  SortedSet<Term> queryTerms = new TreeSet<>();
  rewritten.extractTerms(queryTerms);

  IndexReaderContext readerContext = reader.getContext();
  List<AtomicReaderContext> leaves = readerContext.leaves();

  // Make our own copies because we sort in-place:
  int[] docids = new int[docidsIn.length];
  System.arraycopy(docidsIn, 0, docids, 0, docidsIn.length);
  final String fields[] = new String[fieldsIn.length];
  System.arraycopy(fieldsIn, 0, fields, 0, fieldsIn.length);
  final int maxPassages[] = new int[maxPassagesIn.length];
  System.arraycopy(maxPassagesIn, 0, maxPassages, 0, maxPassagesIn.length);

  // sort for sequential io
  Arrays.sort(docids);
  new InPlaceMergeSorter() {
    @Override
    protected void swap(int i, int j) {
      String tmp = fields[i];
      fields[i] = fields[j];
      fields[j] = tmp;
      int tmp2 = maxPassages[i];
      maxPassages[i] = maxPassages[j];
      maxPassages[j] = tmp2;
    }

    @Override
    protected int compare(int i, int j) {
      return fields[i].compareTo(fields[j]);
    }
  }.sort(0, fields.length);

  // pull stored data:
  String[][] contents = loadFieldValues(searcher, fields, docids, maxLength);

  Map<String, Object[]> highlights = new HashMap<>();
  for (int i = 0; i < fields.length; i++) {
    String field = fields[i];
    int numPassages = maxPassages[i];
    Term floor = new Term(field, "");
    Term ceiling = new Term(field, UnicodeUtil.BIG_TERM);
    SortedSet<Term> fieldTerms = queryTerms.subSet(floor, ceiling);
    // TODO: should we have some reasonable defaults for term pruning? (e.g. stopwords)

    // Strip off the redundant field:
    BytesRef terms[] = new BytesRef[fieldTerms.size()];
    int termUpto = 0;
    for (Term term : fieldTerms) {
      terms[termUpto++] = term.bytes();
    }

    Map<Integer, Object> fieldHighlights =
        highlightField(
            field, contents[i], getBreakIterator(field), terms, docids, leaves, numPassages, query);

    Object[] result = new Object[docids.length];
    for (int j = 0; j < docidsIn.length; j++) {
      result[j] = fieldHighlights.get(docidsIn[j]);
    }
    highlights.put(field, result);
  }
  return highlights;
}
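// Usage sketch (assumption, not from the original source): the public highlightFields(...) entry
// points delegate to highlightFieldsAsObjects(...) above; this shows a typical call for a single
// field named "body" (a placeholder), highlighting the top hits of a search with one passage per
// document.
static Map<String, String[]> highlightTopHits(IndexSearcher searcher, Query query)
    throws IOException {
  PostingsHighlighter highlighter = new PostingsHighlighter();
  TopDocs topDocs = searcher.search(query, 10);
  int[] docids = new int[topDocs.scoreDocs.length];
  for (int i = 0; i < docids.length; i++) {
    docids[i] = topDocs.scoreDocs[i].doc;
  }
  return highlighter.highlightFields(new String[] {"body"}, query, searcher, docids, new int[] {1});
}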
// NumericDocValues Updates
// If otherFieldUpdates != null, we need to merge the updates into them
private synchronized Map<String, NumericFieldUpdates> applyNumericDocValuesUpdates(
    Iterable<NumericUpdate> updates,
    ReadersAndUpdates rld,
    SegmentReader reader,
    Map<String, NumericFieldUpdates> otherFieldUpdates)
    throws IOException {
  Fields fields = reader.fields();
  if (fields == null) {
    // This reader has no postings
    return Collections.emptyMap();
  }

  // TODO: we can process the updates per DV field, from last to first so that
  // if multiple terms affect same document for the same field, we add an update
  // only once (that of the last term). To do that, we can keep a bitset which
  // marks which documents have already been updated. So e.g. if term T1
  // updates doc 7, and then we process term T2 and it updates doc 7 as well,
  // we don't apply the update since we know T1 came last and therefore wins
  // the update.
  // We can also use that bitset as 'liveDocs' to pass to TermEnum.docs(), so
  // that these documents aren't even returned.

  String currentField = null;
  TermsEnum termsEnum = null;
  DocsEnum docs = null;
  final Map<String, NumericFieldUpdates> result =
      otherFieldUpdates == null ? new HashMap<String, NumericFieldUpdates>() : otherFieldUpdates;
  // System.out.println(Thread.currentThread().getName() + " numericDVUpdate reader=" + reader);
  for (NumericUpdate update : updates) {
    Term term = update.term;
    int limit = update.docIDUpto;

    // TODO: we traverse the terms in update order (not term order) so that we
    // apply the updates in the correct order, i.e. if two terms update the
    // same document, the last one that came in wins, irrespective of the
    // terms lexical order.
    // we can apply the updates in terms order if we keep an updatesGen (and
    // increment it with every update) and attach it to each NumericUpdate. Note
    // that we cannot rely only on docIDUpto because an app may send two updates
    // which will get same docIDUpto, yet will still need to respect the order
    // those updates arrived.

    if (!term.field().equals(currentField)) {
      // if we change the code to process updates in terms order, enable this assert
      // assert currentField == null || currentField.compareTo(term.field()) < 0;
      currentField = term.field();
      Terms terms = fields.terms(currentField);
      if (terms != null) {
        termsEnum = terms.iterator(termsEnum);
      } else {
        termsEnum = null;
        continue; // no terms in that field
      }
    }

    if (termsEnum == null) {
      continue;
    }
    // System.out.println("  term=" + term);

    if (termsEnum.seekExact(term.bytes())) {
      // we don't need term frequencies for this
      DocsEnum docsEnum = termsEnum.docs(rld.getLiveDocs(), docs, DocsEnum.FLAG_NONE);
      // System.out.println("BDS: got docsEnum=" + docsEnum);

      NumericFieldUpdates fieldUpdates = result.get(update.field);
      if (fieldUpdates == null) {
        fieldUpdates = new NumericFieldUpdates.PackedNumericFieldUpdates(reader.maxDoc());
        result.put(update.field, fieldUpdates);
      }
      int doc;
      while ((doc = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
        // System.out.println(Thread.currentThread().getName() + " numericDVUpdate term=" + term
        //     + " doc=" + docID);
        if (doc >= limit) {
          break; // no more docs that can be updated for this term
        }
        fieldUpdates.add(doc, update.value);
      }
    }
  }
  return result;
}
// Delete by Term
private synchronized long applyTermDeletes(
    Iterable<Term> termsIter, ReadersAndUpdates rld, SegmentReader reader) throws IOException {
  long delCount = 0;
  Fields fields = reader.fields();
  if (fields == null) {
    // This reader has no postings
    return 0;
  }

  TermsEnum termsEnum = null;

  String currentField = null;
  DocsEnum docs = null;

  assert checkDeleteTerm(null);

  boolean any = false;

  // System.out.println(Thread.currentThread().getName() + " del terms reader=" + reader);
  for (Term term : termsIter) {
    // Since we visit terms sorted, we gain performance
    // by re-using the same TermsEnum and seeking only
    // forwards
    if (!term.field().equals(currentField)) {
      assert currentField == null || currentField.compareTo(term.field()) < 0;
      currentField = term.field();
      Terms terms = fields.terms(currentField);
      if (terms != null) {
        termsEnum = terms.iterator(termsEnum);
      } else {
        termsEnum = null;
      }
    }

    if (termsEnum == null) {
      continue;
    }
    assert checkDeleteTerm(term);

    // System.out.println("  term=" + term);

    if (termsEnum.seekExact(term.bytes())) {
      // we don't need term frequencies for this
      DocsEnum docsEnum = termsEnum.docs(rld.getLiveDocs(), docs, DocsEnum.FLAG_NONE);
      // System.out.println("BDS: got docsEnum=" + docsEnum);
      if (docsEnum != null) {
        while (true) {
          final int docID = docsEnum.nextDoc();
          // System.out.println(Thread.currentThread().getName() + " del term=" + term
          //     + " doc=" + docID);
          if (docID == DocIdSetIterator.NO_MORE_DOCS) {
            break;
          }
          if (!any) {
            rld.initWritableLiveDocs();
            any = true;
          }
          // NOTE: there is no limit check on the docID
          // when deleting by Term (unlike by Query)
          // because on flush we apply all Term deletes to
          // each segment. So all Term deleting here is
          // against prior segments:
          if (rld.delete(docID)) {
            delCount++;
          }
        }
      }
    }
  }

  return delCount;
}
@Override
public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
  if (termArrays.size() == 0) { // optimize zero-term case
    return null;
  }

  final IndexReader reader = context.reader;
  final Bits delDocs = reader.getDeletedDocs();

  PhraseQuery.PostingsAndFreq[] postingsFreqs =
      new PhraseQuery.PostingsAndFreq[termArrays.size()];

  for (int pos = 0; pos < postingsFreqs.length; pos++) {
    Term[] terms = termArrays.get(pos);

    final DocsAndPositionsEnum postingsEnum;
    int docFreq;

    if (terms.length > 1) {
      postingsEnum = new UnionDocsAndPositionsEnum(reader, terms);

      // coarse -- this overcounts since a given doc can
      // have more than one terms:
      docFreq = 0;
      for (int termIdx = 0; termIdx < terms.length; termIdx++) {
        docFreq += reader.docFreq(terms[termIdx]);
      }
    } else {
      final Term term = terms[0];
      postingsEnum = reader.termPositionsEnum(delDocs, term.field(), term.bytes());

      if (postingsEnum == null) {
        if (reader.termDocsEnum(delDocs, term.field(), term.bytes()) != null) {
          // term does exist, but has no positions
          throw new IllegalStateException(
              "field \""
                  + term.field()
                  + "\" was indexed with Field.omitTermFreqAndPositions=true; cannot run PhraseQuery (term="
                  + term.text()
                  + ")");
        } else {
          // term does not exist
          return null;
        }
      }

      docFreq = reader.docFreq(term.field(), term.bytes());
    }

    postingsFreqs[pos] =
        new PhraseQuery.PostingsAndFreq(postingsEnum, docFreq, positions.get(pos).intValue());
  }

  // sort by increasing docFreq order
  if (slop == 0) {
    ArrayUtil.quickSort(postingsFreqs);
  }

  if (slop == 0) {
    ExactPhraseScorer s =
        new ExactPhraseScorer(this, postingsFreqs, similarity, reader.norms(field));
    if (s.noDocs) {
      return null;
    } else {
      return s;
    }
  } else {
    return new SloppyPhraseScorer(this, postingsFreqs, similarity, slop, reader.norms(field));
  }
}
private Object dirtyLock(Term uid) {
  return dirtyLock(uid.bytes());
}