/**
 * This is best effort only: the PhraseQuery may contain multiple terms at the same position
 * (think synonyms) or gaps (think stopwords), and in that case it is impossible to translate it
 * into a correct Elasticsearch query.
 */
private static JsonObject convertPhraseQuery(PhraseQuery query) {
  Term[] terms = query.getTerms();
  if (terms.length == 0) {
    throw LOG.cannotQueryOnEmptyPhraseQuery();
  }
  String field = terms[0].field(); // phrase queries only support one field
  StringBuilder phrase = new StringBuilder();
  for (Term term : terms) {
    phrase.append(" ").append(term.text());
  }
  JsonObject phraseQuery = JsonBuilder.object()
      .add("match_phrase",
          JsonBuilder.object().add(field,
              JsonBuilder.object()
                  .addProperty("query", phrase.toString().trim())
                  .addProperty("slop", query.getSlop())
                  .addProperty("boost", query.getBoost())))
      .build();
  return wrapQueryForNestedIfRequired(field, phraseQuery);
}
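// Usage sketch (hypothetical, not from the original source; assumes the pre-Lucene-6 mutable
// PhraseQuery API implied by the getBoost() call above): a two-term phrase on "title" with
// slop 1 would be rendered as a match_phrase clause.
private static JsonObject examplePhraseConversion() {
  PhraseQuery query = new PhraseQuery();
  query.add(new Term("title", "quick"));
  query.add(new Term("title", "fox"));
  query.setSlop(1);
  // Expected output, roughly:
  // { "match_phrase": { "title": { "query": "quick fox", "slop": 1, "boost": 1.0 } } }
  return convertPhraseQuery(query);
}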
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
  if (maxEdits == 0 || prefixLength >= term.text().length()) {
    // can only match if it's exact
    return new SingleTermsEnum(terms.iterator(), term.bytes());
  }
  return new FuzzyTermsEnum(terms, atts, getTerm(), maxEdits, prefixLength, transpositions);
}
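// Illustrative sketch (hypothetical, standard Lucene API): with maxEdits = 0 the rewrite above
// degenerates to a SingleTermsEnum, so the query behaves like a plain TermQuery; with a
// positive edit distance the full FuzzyTermsEnum is used.
private static void fuzzyRewriteExample() {
  FuzzyQuery exactOnly = new FuzzyQuery(new Term("name", "lucene"), 0); // exact matches only
  FuzzyQuery fuzzy = new FuzzyQuery(new Term("name", "lucene"), 2, 0, 50, true); // up to 2 edits
}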
@Override
public void writeTo(StreamOutput out) throws IOException {
  out.writeVInt(1); // version
  out.writeUTF(uid.field());
  out.writeUTF(uid.text());
  out.writeLong(version);
}
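// A matching reader would consume the fields in the same order (a hypothetical sketch assuming
// the symmetric StreamInput methods; not from the original source):
public void readFrom(StreamInput in) throws IOException {
  int format = in.readVInt(); // wire-format version written above
  assert format == 1;
  uid = new Term(in.readUTF(), in.readUTF());
  version = in.readLong();
}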
void processQuery(Query query, ParseContext context) {
  ParseContext.Document doc = context.doc();
  FieldType pft = (FieldType) this.fieldType();
  QueryAnalyzer.Result result;
  try {
    result = QueryAnalyzer.analyze(query);
  } catch (QueryAnalyzer.UnsupportedQueryException e) {
    doc.add(new Field(pft.extractionResultField.name(), EXTRACTION_FAILED,
        extractionResultField.fieldType()));
    return;
  }
  for (Term term : result.terms) {
    BytesRefBuilder builder = new BytesRefBuilder();
    builder.append(new BytesRef(term.field()));
    builder.append(FIELD_VALUE_SEPARATOR);
    builder.append(term.bytes());
    doc.add(new Field(queryTermsField.name(), builder.toBytesRef(), queryTermsField.fieldType()));
  }
  if (result.verified) {
    doc.add(new Field(extractionResultField.name(), EXTRACTION_COMPLETE,
        extractionResultField.fieldType()));
  } else {
    doc.add(new Field(extractionResultField.name(), EXTRACTION_PARTIAL,
        extractionResultField.fieldType()));
  }
}
private static TInfo parseTerm(FunctionQParser fp) throws SyntaxError {
  TInfo tinfo = new TInfo();
  tinfo.indexedField = tinfo.field = fp.parseArg();
  tinfo.val = fp.parseArg();
  tinfo.indexedBytes = new BytesRef();
  FieldType ft = fp.getReq().getSchema().getFieldTypeNoEx(tinfo.field);
  if (ft == null) ft = new StrField();
  if (ft instanceof TextField) {
    // need to do analysis on the term
    String indexedVal = tinfo.val;
    Query q = ft.getFieldQuery(fp, fp.getReq().getSchema().getFieldOrNull(tinfo.field), tinfo.val);
    if (q instanceof TermQuery) {
      Term term = ((TermQuery) q).getTerm();
      tinfo.indexedField = term.field();
      indexedVal = term.text();
    }
    UnicodeUtil.UTF16toUTF8(indexedVal, 0, indexedVal.length(), tinfo.indexedBytes);
  } else {
    ft.readableToIndexed(tinfo.val, tinfo.indexedBytes);
  }
  return tinfo;
}
/**
 * Split an index based on a given primary key term and a 'middle' term. If the middle term is
 * present, it's sent to dir2.
 */
public PKIndexSplitter(Directory input, Directory dir1, Directory dir2, Term midTerm) {
  this(input, dir1, dir2,
      new TermRangeFilter(midTerm.field(), null, midTerm.bytes(), true, false));
}
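// Usage sketch (hypothetical; the field name and middle term are illustrative). Because the
// TermRangeFilter above excludes the middle term (includeUpper = false), documents whose
// primary key sorts below the middle term go to dir1 and the rest, including the middle term
// itself, go to dir2:
public static void splitExample(Directory source, Directory below, Directory above)
    throws IOException {
  PKIndexSplitter splitter = new PKIndexSplitter(source, below, above, new Term("id", "5000"));
  splitter.split(); // ids < "5000" -> below, ids >= "5000" -> above
}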
private void getPrefixTerms(ObjectHashSet<Term> terms, final Term prefix, final IndexReader reader)
    throws IOException {
  // SlowCompositeReaderWrapper could be used... but this would merge all terms from each segment
  // into one terms instance, which is very expensive. Therefore I think it is better to iterate
  // over each leaf individually.
  List<LeafReaderContext> leaves = reader.leaves();
  for (LeafReaderContext leaf : leaves) {
    Terms _terms = leaf.reader().terms(field);
    if (_terms == null) {
      continue;
    }
    TermsEnum termsEnum = _terms.iterator();
    TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefix.bytes());
    if (TermsEnum.SeekStatus.END == seekStatus) {
      continue;
    }
    for (BytesRef term = termsEnum.term(); term != null; term = termsEnum.next()) {
      if (!StringHelper.startsWith(term, prefix.bytes())) {
        break;
      }
      terms.add(new Term(field, BytesRef.deepCopyOf(term)));
      if (terms.size() >= maxExpansions) {
        return;
      }
    }
  }
}
@Override
public void visitMatchingTerms(IndexReader reader, String fieldName, MatchingTermVisitor mtv)
    throws IOException {
  boolean expanded = false;
  int prefixLength = prefix.length();
  TermEnum enumerator = reader.terms(new Term(fieldName, prefix));
  Matcher matcher = pattern.matcher("");
  try {
    do {
      Term term = enumerator.term();
      if (term != null) {
        String text = term.text();
        if ((!text.startsWith(prefix)) || (!term.field().equals(fieldName))) {
          break;
        } else {
          matcher.reset(text.substring(prefixLength));
          if (matcher.matches()) {
            mtv.visitMatchingTerm(term);
            expanded = true;
          }
        }
      }
    } while (enumerator.next());
  } finally {
    enumerator.close();
    matcher.reset();
  }
  if (!expanded) {
    System.out.println("No terms in " + fieldName + " field for: " + toString());
  }
}
public void testSimpleSkip() throws IOException {
  Directory dir = new CountingRAMDirectory(new RAMDirectory());
  IndexWriter writer = new IndexWriter(dir,
      newIndexWriterConfig(TEST_VERSION_CURRENT, new PayloadAnalyzer())
          .setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat()))
          .setMergePolicy(newLogMergePolicy()));
  Term term = new Term("test", "a");
  for (int i = 0; i < 5000; i++) {
    Document d1 = new Document();
    d1.add(newTextField(term.field(), term.text(), Field.Store.NO));
    writer.addDocument(d1);
  }
  writer.commit();
  writer.forceMerge(1);
  writer.close();

  AtomicReader reader = getOnlySegmentReader(DirectoryReader.open(dir));

  for (int i = 0; i < 2; i++) {
    counter = 0;
    DocsAndPositionsEnum tp = reader.termPositionsEnum(term);
    checkSkipTo(tp, 14, 185); // no skips
    checkSkipTo(tp, 17, 190); // one skip on level 0
    checkSkipTo(tp, 287, 200); // one skip on level 1, two on level 0

    // this test would fail if we had only one skip level,
    // because then more bytes would be read from the freqStream
    checkSkipTo(tp, 4800, 250); // one skip on level 2
  }
}
private static final int[] computeMultivaluedTD(ReaderAbstract reader, String fieldName,
    FieldCacheIndex stringIndex, DocIdInterface docIdInterface)
    throws IOException, SearchLibException {
  int[] countIndex = new int[stringIndex.lookup.length];
  int indexPos = 0;
  if (docIdInterface.getSize() == 0) return countIndex;
  int[] docs = new int[100];
  int[] freqs = new int[100];
  BitSetInterface bitset = docIdInterface.getBitSet();
  Term oTerm = new Term(fieldName);
  for (String term : stringIndex.lookup) {
    if (term != null) {
      Term t = oTerm.createTerm(term);
      TermDocs termDocs = reader.getTermDocs(t);
      int l;
      while ((l = termDocs.read(docs, freqs)) > 0) {
        for (int i = 0; i < l; i++) {
          if (freqs[i] > 0 && bitset.get(docs[i])) {
            countIndex[indexPos]++;
          }
        }
      }
      termDocs.close();
    }
    indexPos++;
  }
  return countIndex;
}
protected boolean setTerm() {
  t = tenum.term();
  if (t == null
      || t.field() != tindex.fterm.field() // intern'd compare
      || (tindex.prefix != null && !t.text().startsWith(tindex.prefix, 0))) {
    t = null;
    return false;
  }
  return true;
}
/* (non-Javadoc)
 * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput)
 */
public void write(DataOutput out) throws IOException {
  out.writeInt(deleteList.size());
  for (Term term : deleteList) {
    Text.writeString(out, term.field());
    Text.writeString(out, term.text());
  }
  String[] files = dir.list();
  RAMDirectoryUtil.writeRAMFiles(out, dir, files);
}
// used only by assert
private boolean checkDeleteTerm(Term term) {
  if (term != null) {
    assert lastDeleteTerm == null || term.compareTo(lastDeleteTerm) > 0
        : "lastTerm=" + lastDeleteTerm + " vs term=" + term;
  }
  // TODO: we re-use term now in our merged iterable, but we shouldn't
  // clone, instead copy for this assert
  lastDeleteTerm = term == null ? null : new Term(term.field(), BytesRef.deepCopyOf(term.bytes));
  return true;
}
@Override
public String toString(String field) {
  StringBuilder buffer = new StringBuilder();
  if (!term.field().equals(field)) {
    buffer.append(term.field());
    buffer.append(":");
  }
  buffer.append(term.text());
  buffer.append(ToStringUtils.boost(getBoost()));
  return buffer.toString();
}
private static SimpleOrderedMap<Object> getDocumentFieldsInfo(Document doc, int docId,
    IndexReader reader, IndexSchema schema) throws IOException {
  SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<Object>();
  for (Object o : doc.getFields()) {
    Fieldable fieldable = (Fieldable) o;
    SimpleOrderedMap<Object> f = new SimpleOrderedMap<Object>();
    SchemaField sfield = schema.getFieldOrNull(fieldable.name());
    FieldType ftype = (sfield == null) ? null : sfield.getType();

    f.add("type", (ftype == null) ? null : ftype.getTypeName());
    f.add("schema", getFieldFlags(sfield));
    f.add("flags", getFieldFlags(fieldable));

    Term t = new Term(fieldable.name(),
        ftype != null ? ftype.storedToIndexed(fieldable) : fieldable.stringValue());

    f.add("value", (ftype == null) ? null : ftype.toExternal(fieldable));

    // TODO: this really should be "stored"
    f.add("internal", fieldable.stringValue()); // may be a binary number

    byte[] arr = fieldable.getBinaryValue();
    if (arr != null) {
      f.add("binary", Base64.byteArrayToBase64(arr, 0, arr.length));
    }
    f.add("boost", fieldable.getBoost());
    f.add("docFreq", t.text() == null ? 0 : reader.docFreq(t)); // this can be 0 for non-indexed fields

    // If we have a term vector, return that
    if (fieldable.isTermVectorStored()) {
      try {
        TermFreqVector v = reader.getTermFreqVector(docId, fieldable.name());
        if (v != null) {
          SimpleOrderedMap<Integer> tfv = new SimpleOrderedMap<Integer>();
          for (int i = 0; i < v.size(); i++) {
            tfv.add(v.getTerms()[i], v.getTermFrequencies()[i]);
          }
          f.add("termVector", tfv);
        }
      } catch (Exception ex) {
        log.warn("error writing term vector", ex);
      }
    }

    finfo.add(fieldable.name(), f);
  }
  return finfo;
}
/**
 * Returns {@link PostingsEnum} for the specified term. This will return null if either the field
 * or term does not exist.
 *
 * <p><b>NOTE:</b> The returned {@link PostingsEnum} may contain deleted docs.
 *
 * @see TermsEnum#postings(PostingsEnum)
 */
public final PostingsEnum postings(Term term, int flags) throws IOException {
  assert term.field() != null;
  assert term.bytes() != null;
  final Terms terms = terms(term.field());
  if (terms != null) {
    final TermsEnum termsEnum = terms.iterator();
    if (termsEnum.seekExact(term.bytes())) {
      return termsEnum.postings(null, flags);
    }
  }
  return null;
}
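// Usage sketch (standard Lucene API; the field and term are illustrative): iterate the
// documents containing a term. Per the NOTE above, the enum may include deleted docs, so
// filter against live docs yourself if that matters.
public static void printPostings(LeafReader reader) throws IOException {
  PostingsEnum postings = reader.postings(new Term("body", "lucene"), PostingsEnum.FREQS);
  if (postings != null) {
    int doc;
    while ((doc = postings.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      System.out.println("doc=" + doc + " freq=" + postings.freq());
    }
  }
}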
/**
 * Returns the total number of occurrences of the term <code>t</code> across all documents (the
 * sum of the freq() for each doc that has this term). This method returns 0 if the term or
 * field does not exist. This method does not take into account deleted documents that have not
 * yet been merged away.
 */
@Override
public final long totalTermFreq(Term term) throws IOException {
  final Terms terms = terms(term.field());
  if (terms == null) {
    return 0;
  }
  final TermsEnum termsEnum = terms.iterator();
  if (termsEnum.seekExact(term.bytes())) {
    return termsEnum.totalTermFreq();
  } else {
    return 0;
  }
}
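// The distinction from docFreq matters: docFreq counts documents, totalTermFreq counts
// occurrences. A hypothetical sketch: if doc1 contains "foo foo foo" and doc2 contains
// "foo bar" in field "body", then:
public static void freqExample(IndexReader reader) throws IOException {
  int docs = reader.docFreq(new Term("body", "foo"));               // => 2
  long occurrences = reader.totalTermFreq(new Term("body", "foo")); // => 4 (3 + 1)
}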
protected Term getAnalyzedTerm(TokenType tokenType, String termString) throws IOException {
  Term term = getTerm(termString, tokenType); // first ensure that we've stripped any prefixes
  TokenStream tokenStream = analyzer.tokenStream(term.field(), new StringReader(term.text()));
  tokenStream.reset();
  CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
  StringBuilder sb = new StringBuilder();
  while (tokenStream.incrementToken()) {
    sb.append(termAtt.toString());
  }
  tokenStream.end();
  tokenStream.close();
  return new Term(term.field(), sb.toString());
}
/**
 * Returns {@link DocsAndPositionsEnum} for the specified term. This will return null if the
 * field or term does not exist or positions weren't indexed.
 *
 * @deprecated use {@link #postings(Term, int)} instead
 */
@Deprecated
public final DocsAndPositionsEnum termPositionsEnum(Term term) throws IOException {
  assert term.field() != null;
  assert term.bytes() != null;
  final Terms terms = terms(term.field());
  if (terms != null) {
    final TermsEnum termsEnum = terms.iterator();
    if (termsEnum.seekExact(term.bytes())) {
      return termsEnum.docsAndPositions(getLiveDocs(), null);
    }
  }
  return null;
}
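// Migration sketch per the deprecation note: request positions through the flags argument of
// the unified postings API. Note that, unlike this deprecated method, postings(Term, int)
// shown earlier does not filter deleted documents through getLiveDocs().
public static PostingsEnum positionsViaPostings(LeafReader reader, Term term) throws IOException {
  return reader.postings(term, PostingsEnum.POSITIONS);
}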
/**
 * Computes a term frequency map for the index at the specified location. "Most frequent" is
 * defined as the terms whose frequencies are greater than or equal to topTermCutoff * the
 * frequency of the top term, where topTermCutoff is a number between 0 and 1.
 *
 * @return the most frequent terms, ordered by descending frequency
 * @throws CorruptIndexException
 * @throws IOException
 */
protected ArrayList<String> retrieveTopTerms() throws CorruptIndexException, IOException {
  final Map<String, Integer> frequencyMap = new HashMap<String, Integer>();
  List<String> termlist = new ArrayList<String>();
  IndexReader reader = IndexReader.open(ramdir);
  TermEnum terms = reader.terms();
  while (terms.next()) {
    Term term = terms.term();
    String termText = term.text();
    int frequency = reader.docFreq(term);
    frequencyMap.put(termText, frequency);
    termlist.add(termText);
  }
  reader.close();

  // sort the term list by frequency, descending
  Collections.sort(termlist, new Comparator<String>() {
    @Override
    public int compare(String term1, String term2) {
      int term1Freq = frequencyMap.get(term1);
      int term2Freq = frequencyMap.get(term2);
      if (term1Freq < term2Freq) return 1;
      if (term1Freq > term2Freq) return -1;
      return 0;
    }
  });

  // retrieve the top terms based on topTermCutoff
  ArrayList<String> topTerms = new ArrayList<String>();
  double topFreq = -1.0;
  for (String term : termlist) {
    if (topFreq < 0.0) {
      // first term: capture the top frequency
      topFreq = frequencyMap.get(term);
      topTerms.add(term);
    } else {
      // subsequent terms: keep while the ratio stays at or above the cutoff
      double ratio = frequencyMap.get(term) / topFreq;
      if (ratio >= topTermCutoff) {
        topTerms.add(term);
      } else {
        break;
      }
    }
  }
  return topTerms;
}
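// Worked example of the cutoff (hypothetical numbers): suppose the docFreq pass produced
//   "the" -> 40, "fox" -> 25, "dog" -> 18
// and topTermCutoff = 0.5. Then topFreq = 40, so the effective threshold is 0.5 * 40 = 20:
//   "the": 40 / 40 = 1.000 >= 0.5 -> kept
//   "fox": 25 / 40 = 0.625 >= 0.5 -> kept
//   "dog": 18 / 40 = 0.450 <  0.5 -> loop breaks; the descending sort makes the early break safe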
private void initBiggerDiagonal(IndexReader reader) throws IOException {
  logger.info("Initializing Spatial Indexes for Queries Strategies");
  if (biggerDiagonal == null) {
    biggerDiagonal = (Double) IndexReaderPersistentCache.get(reader, biggerDiagonalCacheKey);
    twiceBiggerDiagonal =
        (Double) IndexReaderPersistentCache.get(reader, twiceBiggerDiagonalCacheKey);
    if (biggerDiagonal == null || twiceBiggerDiagonal == null) {
      biggerDiagonal = 0.0;
      Term last = null;
      TermEnum termEnum = reader.terms(new Term(Globals.LUCENE_DIAGONAL_INDEX, ""));
      if (termEnum.term() != null
          && termEnum.term().field().equals(Globals.LUCENE_DIAGONAL_INDEX)) {
        last = termEnum.term();
      }
      if (termEnum.term() != null) {
        while (termEnum.next()) {
          if (termEnum.term().field().equals(Globals.LUCENE_DIAGONAL_INDEX)) {
            last = termEnum.term();
          }
        }
      }
      if (last != null) {
        biggerDiagonal = NumberUtils.SortableStr2double(last.text());
        logger.info("Found bigger spatial width:" + biggerDiagonal);
      }
      twiceBiggerDiagonal = 2 * biggerDiagonal;
      halfBiggerDiagonal = biggerDiagonal / 2.0;
      logger.info("defining twice bigger spatial width:" + twiceBiggerDiagonal);
      termEnum.close();
      IndexReaderPersistentCache.put(biggerDiagonalCacheKey, biggerDiagonal, reader);
      IndexReaderPersistentCache.put(twiceBiggerDiagonalCacheKey, twiceBiggerDiagonal, reader);
    }
  }
  if (biggerInternalCircleRadium == null) {
    biggerInternalCircleRadium =
        (Double) IndexReaderPersistentCache.get(reader, biggerRadiumCacheKey);
    if (biggerInternalCircleRadium == null) {
      biggerInternalCircleRadium = 0.0;
      Term last = null;
      TermEnum termEnum = reader.terms(new Term(Globals.LUCENE_RADIUM_INDEX, ""));
      if (termEnum.term() != null
          && termEnum.term().field().equals(Globals.LUCENE_RADIUM_INDEX)) {
        last = termEnum.term();
      }
      if (termEnum.term() != null) {
        while (termEnum.next()) {
          if (termEnum.term().field().equals(Globals.LUCENE_RADIUM_INDEX)) {
            last = termEnum.term();
          }
        }
      }
      if (last != null) {
        biggerInternalCircleRadium = NumberUtils.SortableStr2double(last.text());
        logger.info("Found bigger spatial radium:" + biggerInternalCircleRadium);
      }
      termEnum.close();
      IndexReaderPersistentCache.put(biggerRadiumCacheKey, biggerInternalCircleRadium, reader);
    }
  }
}
@SuppressWarnings({"StringEquality"}) @Override public void run() { TermDocs termDocs = null; TermEnum termEnum = null; try { BloomFilter filter = BloomFilterFactory.getFilter(reader.numDocs(), 15); termDocs = reader.termDocs(); termEnum = reader.terms(new Term(field)); do { Term term = termEnum.term(); if (term == null || term.field() != field) break; // LUCENE MONITOR: 4.0, move to use bytes! UnicodeUtil.UTF8Result utf8Result = Unicode.fromStringAsUtf8(term.text()); termDocs.seek(termEnum); while (termDocs.next()) { // when traversing, make sure to ignore deleted docs, so the key->docId will be correct if (!reader.isDeleted(termDocs.doc())) { filter.add(utf8Result.result, 0, utf8Result.length); } } } while (termEnum.next()); ConcurrentMap<String, BloomFilterEntry> fieldCache = cache.get(reader.getFieldCacheKey()); if (fieldCache != null) { if (fieldCache.containsKey(field)) { BloomFilterEntry filterEntry = new BloomFilterEntry(reader.numDocs(), filter); filterEntry.loading.set(false); fieldCache.put(field, filterEntry); } } } catch (Exception e) { logger.warn("failed to load bloom filter for [{}]", e, field); } finally { try { if (termDocs != null) { termDocs.close(); } } catch (IOException e) { // ignore } try { if (termEnum != null) { termEnum.close(); } } catch (IOException e) { // ignore } } }
public PKIndexSplitter(Directory input, Directory dir1, Directory dir2, Term midTerm,
    IndexWriterConfig config1, IndexWriterConfig config2) {
  this(input, dir1, dir2,
      new TermRangeFilter(midTerm.field(), null, midTerm.bytes(), true, false),
      config1, config2);
}
@Override
public boolean next() throws IOException {
  for (int i = 0; i < matchingSegments.length; i++) {
    SegmentMergeInfo smi = matchingSegments[i];
    if (smi == null) break;
    if (smi.next()) queue.add(smi);
    else smi.close(); // done with segment
  }

  int numMatchingSegments = 0;
  matchingSegments[0] = null;

  SegmentMergeInfo top = queue.top();
  if (top == null) {
    term = null;
    return false;
  }

  term = top.term;
  docFreq = 0;

  while (top != null && term.compareTo(top.term) == 0) {
    matchingSegments[numMatchingSegments++] = top;
    queue.pop();
    docFreq += top.termEnum.docFreq(); // increment freq
    top = queue.top();
  }

  matchingSegments[numMatchingSegments] = null;
  return true;
}
@Override
public String toString() {
  return "spans(" + term.toString() + ")@"
      + (doc == -1 ? "START" : (doc == Integer.MAX_VALUE) ? "END" : doc + "-" + position);
}
public float queryScore(float idf) {
  return (float) Math.log(1 + term.text().length())
      * dictidf * dictidf
      * (fromfreq + boost(fromfield))
      * idf;
}
/**
 * Gets the global term frequencies and writes them to the index directory.
 *
 * @throws Exception the exception
 */
public void getGlobalTermFrequencies() throws Exception {
  String parentDir =
      Flags.rootDir + (Flags.positional ? "/positional-" : "/") + "lucene/" + Flags.suffix;
  File file = new File(parentDir);
  indexReader = IndexReader.open(FSDirectory.open(file));
  TermEnum terms = indexReader.terms();
  BufferedWriter out =
      new BufferedWriter(new FileWriter(new File(parentDir + "/globalTermFreq.txt")));
  while (terms.next()) {
    org.apache.lucene.index.Term term = terms.term();
    out.write(term.text() + " " + getGlobalTermFreq(term) + "\n");
  }
  out.close();
  indexReader.close();
}
/**
 * @param clause the boolean clause the query came from (may be null)
 * @param clauseQuery the query to extract terms from
 * @param ands accumulator for terms that must match
 * @param ors accumulator for terms that should match
 */
private void extractTerms(BooleanClause clause, org.apache.lucene.search.Query clauseQuery,
    Map<String, Object> ands, Map<String, Object> ors) {
  Set<Term> terms = Sets.newHashSet();
  clauseQuery.extractTerms(terms);
  for (Term term : terms) {
    if (clause != null && clause.getOccur() == Occur.SHOULD) {
      accumulateValue(ors, term.field(), term.text());
    } else {
      accumulateValue(ands, term.field(), term.text());
    }
  }
}
/**
 * Gets the global term frequency of a term, i.e. how many times it occurs in the whole corpus.
 *
 * @param term the term whose frequency you want
 * @return the global term frequency of the term, or 1 if unavailable
 */
private int getGlobalTermFreq(Term term) {
  int tf = 0;
  try {
    TermDocs tDocs = this.indexReader.termDocs(term);
    if (tDocs == null) {
      logger.info("Couldn't get term frequency for term " + term.text());
      return 1;
    }
    while (tDocs.next()) {
      tf += tDocs.freq();
    }
  } catch (IOException e) {
    logger.info("Couldn't get term frequency for term " + term.text());
    return 1;
  }
  return tf;
}
protected Query blendTermQuery(Term term, MappedFieldType fieldType) {
  if (fuzziness != null) {
    if (fieldType != null) {
      try {
        Query query = fieldType.fuzzyQuery(
            term.text(), fuzziness, fuzzyPrefixLength, maxExpansions, transpositions);
        if (query instanceof FuzzyQuery) {
          QueryParsers.setRewriteMethod((FuzzyQuery) query, fuzzyRewriteMethod);
        }
        return query;
      } catch (RuntimeException e) {
        // See the long comment below about why we're lenient here.
        return new TermQuery(term);
      }
    }
    int edits = fuzziness.asDistance(term.text());
    FuzzyQuery query =
        new FuzzyQuery(term, edits, fuzzyPrefixLength, maxExpansions, transpositions);
    QueryParsers.setRewriteMethod(query, fuzzyRewriteMethod);
    return query;
  }
  if (fieldType != null) {
    /*
     * It's a bit weird to default to lenient here, but it's the backwards
     * compatible behavior. It makes some sense when you think about what we
     * are doing here: at this point the user has forced an analyzer and
     * passed some string to the match query. We cut it up using the
     * analyzer and then tried to cram whatever we got into the field.
     * lenient=true here means that we try the terms in the query and, on
     * the off chance that they are actually valid terms, we actually try
     * them. lenient=false would mean that we blow up the query if they
     * aren't valid terms. "valid" in this context means "parses properly
     * to something of the type being queried." So "1" is a valid number,
     * etc.
     *
     * We use the text form here because we've received the term from an
     * analyzer that cut some string into text.
     */
    Query query = termQuery(fieldType, term.bytes(), true);
    if (query != null) {
      return query;
    }
  }
  return new TermQuery(term);
}