/**
 * Find terms in the index based on a prefix. Useful for autocomplete.
 *
 * @param index the index
 * @param fieldName the field
 * @param prefix the prefix we're looking for (null or empty string for all terms)
 * @param sensitive match case-/accent-sensitively or not?
 * @param maxResults max. number of results to return (or -1 for all)
 * @return the matching terms
 */
public static List<String> findTermsByPrefix(
    LeafReader index, String fieldName, String prefix, boolean sensitive, int maxResults) {
  boolean allTerms = prefix == null || prefix.length() == 0;
  if (allTerms) {
    prefix = "";
    sensitive = true; // don't do unnecessary work in this case
  }
  try {
    if (!sensitive)
      prefix = StringUtil.removeAccents(prefix).toLowerCase();
    List<String> results = new ArrayList<>();
    org.apache.lucene.index.Terms terms = index.terms(fieldName);
    if (terms == null)
      return results; // field doesn't exist or has no indexed terms
    TermsEnum termsEnum = terms.iterator();
    BytesRef brPrefix = new BytesRef(prefix.getBytes(LUCENE_DEFAULT_CHARSET));
    // seekCeil() positions the enum *on* the first term >= prefix, so read term()
    // before calling next(); otherwise the first matching term is skipped.
    TermsEnum.SeekStatus status = termsEnum.seekCeil(brPrefix);
    BytesRef term = status == TermsEnum.SeekStatus.END ? null : termsEnum.term();
    while (term != null && (maxResults < 0 || results.size() < maxResults)) {
      String termText = term.utf8ToString();
      String optDesensitized = termText;
      if (!sensitive)
        optDesensitized = StringUtil.removeAccents(termText).toLowerCase();
      // regionMatches() behaves like the substring/equalsIgnoreCase check but doesn't
      // throw when the term is shorter than the prefix.
      if (!allTerms && !optDesensitized.regionMatches(true, 0, prefix, 0, prefix.length())) {
        // Doesn't match the prefix (any more); no more matches
        break;
      }
      // Match, add term
      results.add(termText);
      term = termsEnum.next();
    }
    return results;
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
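// Sketch of a possible caller for findTermsByPrefix() above; not part of the original code.
// The index path argument, the field name "contents", and the number of suggestions are
// illustrative assumptions only.
public static void printAutocompleteSuggestions(Path indexPath, String userInput) throws IOException {
  try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(indexPath))) {
    for (LeafReaderContext leaf : reader.leaves()) {
      // Collect up to 10 case-/accent-insensitive completions per segment.
      List<String> suggestions =
          findTermsByPrefix(leaf.reader(), "contents", userInput, false, 10);
      suggestions.forEach(System.out::println);
    }
  }
}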
public synchronized ShapeFieldCache<T> getCache(LeafReader reader) throws IOException {
  ShapeFieldCache<T> idx = sidx.get(reader);
  if (idx != null) {
    return idx;
  }
  long startTime = System.currentTimeMillis();

  log.fine("Building Cache [" + reader.maxDoc() + "]");
  idx = new ShapeFieldCache<>(reader.maxDoc(), defaultSize);
  int count = 0;
  DocsEnum docs = null;
  Terms terms = reader.terms(shapeField);
  TermsEnum te = null;
  if (terms != null) {
    te = terms.iterator(te);
    BytesRef term = te.next();
    while (term != null) {
      T shape = readShape(term);
      if (shape != null) {
        docs = te.docs(null, docs, DocsEnum.FLAG_NONE);
        int docid = docs.nextDoc();
        while (docid != DocIdSetIterator.NO_MORE_DOCS) {
          idx.add(docid, shape);
          docid = docs.nextDoc();
          count++;
        }
      }
      term = te.next();
    }
  }
  sidx.put(reader, idx);
  long elapsed = System.currentTimeMillis() - startTime;
  log.fine("Cached: [" + count + " in " + elapsed + "ms] " + idx);
  return idx;
}
public String[] getTerms() {
  IndexReader reader = null;
  int maxSize = 100;
  Set<String> searchResults = new HashSet<String>();

  try {
    reader = DirectoryReader.open(dir);
    Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms("contents");
    if (terms != null) {
      TermsEnum termsEnum = terms.iterator(TermsEnum.EMPTY);
      BytesRef byteRef = null;
      while ((byteRef = termsEnum.next()) != null) {
        // Terms are stored as UTF-8; decode them as such rather than with the
        // platform default charset.
        String term = byteRef.utf8ToString();
        searchResults.add(term);
        if (searchResults.size() >= maxSize) {
          break;
        }
      }
    }
  } catch (IOException e) {
    e.printStackTrace();
  } finally {
    try {
      if (reader != null) {
        reader.close();
      }
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
  return searchResults.toArray(new String[searchResults.size()]);
}
private void getPrefixTerms(
    ObjectHashSet<Term> terms, final Term prefix, final IndexReader reader) throws IOException {
  // SlowCompositeReaderWrapper could be used... but this would merge all terms from each segment
  // into one terms instance, which is very expensive. Therefore I think it is better to iterate
  // over each leaf individually.
  List<LeafReaderContext> leaves = reader.leaves();
  for (LeafReaderContext leaf : leaves) {
    Terms _terms = leaf.reader().terms(field);
    if (_terms == null) {
      continue;
    }

    TermsEnum termsEnum = _terms.iterator();
    TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefix.bytes());
    if (TermsEnum.SeekStatus.END == seekStatus) {
      continue;
    }

    for (BytesRef term = termsEnum.term(); term != null; term = termsEnum.next()) {
      if (!StringHelper.startsWith(term, prefix.bytes())) {
        break;
      }

      terms.add(new Term(field, BytesRef.deepCopyOf(term)));
      if (terms.size() >= maxExpansions) {
        return;
      }
    }
  }
}
Query createCandidateQuery(IndexReader indexReader) throws IOException {
  List<Term> extractedTerms = new ArrayList<>();
  // include extractionResultField:failed, because docs with this term have no extractedTermsField
  // and otherwise we would fail to return these docs. Docs that failed query term extraction
  // always need to be verified by MemoryIndex:
  extractedTerms.add(new Term(extractionResultField.name(), EXTRACTION_FAILED));

  LeafReader reader = indexReader.leaves().get(0).reader();
  Fields fields = reader.fields();
  for (String field : fields) {
    Terms terms = fields.terms(field);
    if (terms == null) {
      continue;
    }

    BytesRef fieldBr = new BytesRef(field);
    TermsEnum tenum = terms.iterator();
    for (BytesRef term = tenum.next(); term != null; term = tenum.next()) {
      BytesRefBuilder builder = new BytesRefBuilder();
      builder.append(fieldBr);
      builder.append(FIELD_VALUE_SEPARATOR);
      builder.append(term);
      extractedTerms.add(new Term(queryTermsField.name(), builder.toBytesRef()));
    }
  }
  return new TermsQuery(extractedTerms);
}
/**
 * Add term frequencies for a single document to a frequency map.
 *
 * @param reader the index
 * @param doc doc id
 * @param luceneName the index field from which to use the term vector
 * @param freq where to add to the token frequencies
 */
public static void getFrequenciesFromTermVector(
    IndexReader reader, int doc, String luceneName, Map<String, Integer> freq) {
  try {
    org.apache.lucene.index.Terms terms = reader.getTermVector(doc, luceneName);
    if (terms == null) {
      throw new IllegalArgumentException("Field " + luceneName + " has no Terms");
    }
    TermsEnum termsEnum = terms.iterator();

    // Collect the term frequencies from the term vector
    PostingsEnum postingsEnum = null;
    while (termsEnum.next() != null) {
      postingsEnum = termsEnum.postings(null, postingsEnum, PostingsEnum.FREQS);
      String term = termsEnum.term().utf8ToString();
      Integer n = freq.get(term);
      if (n == null) {
        n = 0;
      }
      while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        // Add the within-document frequency from the postings; docFreq() would add the
        // collection-wide document frequency, which is not what this map holds.
        n += postingsEnum.freq();
      }
      freq.put(term, n);
    }
  } catch (Exception e) {
    throw ExUtil.wrapRuntimeException(e);
  }
}
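// Usage sketch for getFrequenciesFromTermVector() above; not part of the original code.
// Assumes an already-open IndexReader and that the given document stores a term vector
// for the field; doc id 0 and the field argument are placeholders.
public static Map<String, Integer> termVectorFrequencies(IndexReader reader, String field) {
  Map<String, Integer> frequencies = new HashMap<>();
  getFrequenciesFromTermVector(reader, 0, field, frequencies);
  return frequencies;
}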
public void testReuseDocsEnumNoReuse() throws IOException {
  Directory dir = newDirectory();
  Codec cp = TestUtil.alwaysPostingsFormat(new Lucene40RWPostingsFormat());
  RandomIndexWriter writer =
      new RandomIndexWriter(
          random(), dir, newIndexWriterConfig(new MockAnalyzer(random())).setCodec(cp));
  int numdocs = atLeast(20);
  createRandomIndex(numdocs, writer, random());
  writer.commit();

  DirectoryReader open = DirectoryReader.open(dir);
  for (LeafReaderContext ctx : open.leaves()) {
    LeafReader indexReader = ctx.reader();
    Terms terms = indexReader.terms("body");
    TermsEnum iterator = terms.iterator();
    IdentityHashMap<PostingsEnum, Boolean> enums = new IdentityHashMap<>();
    MatchNoBits bits = new Bits.MatchNoBits(indexReader.maxDoc());
    while ((iterator.next()) != null) {
      PostingsEnum docs =
          iterator.postings(
              random().nextBoolean() ? bits : new Bits.MatchNoBits(indexReader.maxDoc()),
              null,
              random().nextBoolean() ? PostingsEnum.FREQS : PostingsEnum.NONE);
      enums.put(docs, true);
    }
    assertEquals(terms.size(), enums.size());
  }
  writer.commit();
  IOUtils.close(writer, open, dir);
}
public void listTokens(int freq) throws IOException {
  IndexReader ireader = null;
  TermsEnum iter = null;
  Terms terms = null;

  try {
    ireader = DirectoryReader.open(indexDirectory);
    int numDocs = ireader.numDocs();

    if (numDocs > 0) {
      Fields uFields = MultiFields.getFields(ireader); // reader.getTermVectors(0);
      terms = uFields.terms(QueryBuilder.DEFS);
    }
    if (terms == null) {
      // empty index, or the DEFS field is missing; nothing to list
      return;
    }

    iter = terms.iterator(null); // init uid iterator
    // Advance to the first term before reading it; term() is undefined until next()
    // has been called.
    BytesRef term = iter.next();
    while (term != null) {
      // if (iter.term().field().startsWith("f")) {
      if (iter.docFreq() > 16 && iter.term().utf8ToString().length() > freq) {
        log.warning(iter.term().utf8ToString());
      }
      term = iter.next();
      /*} else {
        break;
      }*/
    }
  } finally {
    if (ireader != null) {
      try {
        ireader.close();
      } catch (IOException e) {
        log.log(Level.WARNING, "An error occurred while closing index reader", e);
      }
    }
  }
}
/**
 * List all of the files in this index database
 *
 * @throws IOException If an IO error occurs while reading from the database
 */
public void listFiles() throws IOException {
  IndexReader ireader = null;
  TermsEnum iter;
  Terms terms = null;

  try {
    ireader = DirectoryReader.open(indexDirectory); // open existing index
    int numDocs = ireader.numDocs();

    if (numDocs > 0) {
      Fields uFields = MultiFields.getFields(ireader); // reader.getTermVectors(0);
      terms = uFields.terms(QueryBuilder.U);
    }
    if (terms == null) {
      // empty index; nothing to list
      return;
    }

    iter = terms.iterator(null); // init uid iterator
    // Advance to the first term before reading it; term() is undefined until next()
    // has been called.
    BytesRef term = iter.next();
    while (term != null) {
      log.fine(Util.uid2url(iter.term().utf8ToString()));
      term = iter.next();
    }
  } finally {
    if (ireader != null) {
      try {
        ireader.close();
      } catch (IOException e) {
        log.log(Level.WARNING, "An error occurred while closing index reader", e);
      }
    }
  }
}
public void assertTerms(Terms leftTerms, Terms rightTerms, boolean deep) throws Exception {
  if (leftTerms == null || rightTerms == null) {
    assertNull(leftTerms);
    assertNull(rightTerms);
    return;
  }
  assertTermsStatistics(leftTerms, rightTerms);

  // NOTE: we don't assert hasOffsets/hasPositions/hasPayloads because they are allowed to be
  // different
  TermsEnum leftTermsEnum = leftTerms.iterator(null);
  TermsEnum rightTermsEnum = rightTerms.iterator(null);
  assertTermsEnum(leftTermsEnum, rightTermsEnum, true);

  assertTermsSeeking(leftTerms, rightTerms);

  if (deep) {
    int numIntersections = atLeast(3);
    for (int i = 0; i < numIntersections; i++) {
      String re = AutomatonTestUtil.randomRegexp(random());
      CompiledAutomaton automaton =
          new CompiledAutomaton(new RegExp(re, RegExp.NONE).toAutomaton());
      if (automaton.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
        // TODO: test start term too
        TermsEnum leftIntersection = leftTerms.intersect(automaton, null);
        TermsEnum rightIntersection = rightTerms.intersect(automaton, null);
        assertTermsEnum(leftIntersection, rightIntersection, rarely());
      }
    }
  }
}
/*
 * Utility function to display a term vector.
 */
static void termVectorDisplay(Terms terms) throws IOException {

  if ((terms == null) || (terms.size() == -1)) {
    System.out.println(" The field is not stored.");
  } else {
    /*
     * The terms for this field are stored.
     */
    System.out.println(" Vocabulary size: " + terms.size() + " terms");

    TermsEnum ithTerm = terms.iterator(null);

    /*
     * Iterate over the terms in this document.
     * Information about a term's occurrences (tf and
     * positions) is accessed via the indexing API, which
     * returns inverted lists that describe (only) the
     * current document.
     */
    while (ithTerm.next() != null) {
      System.out.format(
          " %10d %-20s %d ",
          ithTerm.ord(), ithTerm.term().utf8ToString(), ithTerm.totalTermFreq());

      DocsAndPositionsEnum currDoc = ithTerm.docsAndPositions(null, null);
      currDoc.nextDoc();
      for (int jthPosition = 0; jthPosition < ithTerm.totalTermFreq(); jthPosition++) {
        System.out.print(currDoc.nextPosition() + " ");
      }
      System.out.println();
    }
  }
}
/*
 * listTermDictionary displays the term dictionary for a field.
 */
static void listTermDictionary(IndexReader reader, String fieldName) throws IOException {

  System.out.println("\nTerm Dictionary: field " + fieldName);

  /* Grant says:
   * MultiFields.getTerms(IndexReader, fieldName)
   */
  Terms terms = MultiFields.getTerms(reader, fieldName);

  if ((terms == null) || (terms.size() == -1)) {
    System.out.println(" The term dictionary is empty.");
  } else {
    System.out.println(" Vocabulary size: " + terms.size() + " terms");

    TermsEnum ithTerm = terms.iterator(null);

    /*
     * Iterate over the terms in this field's dictionary.
     * For each term, the indexing API gives access to its
     * document frequency and collection frequency.
     */
    while (ithTerm.next() != null) {
      System.out.format(
          " %-30s %d %d\n",
          ithTerm.term().utf8ToString(), ithTerm.docFreq(), ithTerm.totalTermFreq());
    }
  }
}
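// Sketch of a driver for the two display helpers above (termVectorDisplay and
// listTermDictionary); not part of the original code. The index path and the field
// name "contents" are placeholder assumptions.
static void dumpIndex(Path indexPath) throws IOException {
  IndexReader reader = DirectoryReader.open(FSDirectory.open(indexPath));
  listTermDictionary(reader, "contents");                 // whole term dictionary of the field
  termVectorDisplay(reader.getTermVector(0, "contents")); // term vector of a single document
  reader.close();
}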
private void buildFieldStatistics(XContentBuilder builder, Terms curTerms) throws IOException {
  long sumDocFreq = curTerms.getSumDocFreq();
  int docCount = curTerms.getDocCount();
  long sumTotalTermFrequencies = curTerms.getSumTotalTermFreq();
  if (docCount > 0) {
    assert ((sumDocFreq > 0)) : "docCount >= 0 but sumDocFreq ain't!";
    assert ((sumTotalTermFrequencies > 0)) : "docCount >= 0 but sumTotalTermFrequencies ain't!";
    builder.startObject(FieldStrings.FIELD_STATISTICS);
    builder.field(FieldStrings.SUM_DOC_FREQ, sumDocFreq);
    builder.field(FieldStrings.DOC_COUNT, docCount);
    builder.field(FieldStrings.SUM_TTF, sumTotalTermFrequencies);
    builder.endObject();
  } else if (docCount == -1) {
    // this should only be -1 if the field statistics were not requested at all.
    // In this case all 3 values should be -1
    assert ((sumDocFreq == -1)) : "docCount was -1 but sumDocFreq ain't!";
    assert ((sumTotalTermFrequencies == -1)) : "docCount was -1 but sumTotalTermFrequencies ain't!";
  } else {
    throw new ElasticsearchIllegalStateException(
        "Something is wrong with the field statistics of the term vector request: Values are "
            + "\n"
            + FieldStrings.SUM_DOC_FREQ
            + " "
            + sumDocFreq
            + "\n"
            + FieldStrings.DOC_COUNT
            + " "
            + docCount
            + "\n"
            + FieldStrings.SUM_TTF
            + " "
            + sumTotalTermFrequencies);
  }
}
/* Copied from lucene 4.2.x core */
private static long totalTermFreq(IndexReader r, String field, BytesRef text) throws IOException {
  final Terms terms = MultiFields.getTerms(r, field);
  if (terms != null) {
    final TermsEnum termsEnum = terms.iterator(null);
    if (termsEnum.seekExact(text, true)) {
      return termsEnum.totalTermFreq();
    }
  }
  return 0;
}
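// For comparison only; this sketch is an addition, not part of the original code. On Lucene
// 5.x and later the same lookup drops the reuse argument to iterator() and the boolean flag
// to seekExact(), which then simply returns whether the term exists.
private static long totalTermFreqCurrentApi(IndexReader r, String field, BytesRef text)
    throws IOException {
  final Terms terms = MultiFields.getTerms(r, field);
  if (terms != null) {
    final TermsEnum termsEnum = terms.iterator();
    if (termsEnum.seekExact(text)) {
      return termsEnum.totalTermFreq();
    }
  }
  return 0;
}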
private long getSumTermFrequency(IndexReader reader, String fieldName) {
  try {
    Terms collectionTermVector = MultiFields.getTerms(reader, fieldName);
    if (collectionTermVector == null) {
      // field is missing or has no indexed terms
      return 0;
    }
    return collectionTermVector.getSumTotalTermFreq();
  } catch (IOException e) {
    LOG.warn("Unable to get total term frequency, it might not be indexed");
  }
  return 0;
}
@Override
public FieldStats stats(Terms terms, int maxDoc) throws IOException {
  float minValue = NumericUtils.sortableIntToFloat(NumericUtils.getMinInt(terms));
  float maxValue = NumericUtils.sortableIntToFloat(NumericUtils.getMaxInt(terms));
  return new FieldStats.Float(
      maxDoc,
      terms.getDocCount(),
      terms.getSumDocFreq(),
      terms.getSumTotalTermFreq(),
      minValue,
      maxValue);
}
@Override
public FieldStats stats(Terms terms, int maxDoc) throws IOException {
  long minValue = NumericUtils.getMinInt(terms);
  long maxValue = NumericUtils.getMaxInt(terms);
  return new FieldStats.Long(
      maxDoc,
      terms.getDocCount(),
      terms.getSumDocFreq(),
      terms.getSumTotalTermFreq(),
      minValue,
      maxValue);
}
public void testChangeGaps() throws Exception {
  // LUCENE-5324: check that it is possible to change the wrapper's gaps
  final int positionGap = random().nextInt(1000);
  final int offsetGap = random().nextInt(1000);
  final Analyzer delegate = new MockAnalyzer(random());
  final Analyzer a =
      new DelegatingAnalyzerWrapper(delegate.getReuseStrategy()) {
        @Override
        protected Analyzer getWrappedAnalyzer(String fieldName) {
          return delegate;
        }

        @Override
        public int getPositionIncrementGap(String fieldName) {
          return positionGap;
        }

        @Override
        public int getOffsetGap(String fieldName) {
          return offsetGap;
        }
      };

  final RandomIndexWriter writer = new RandomIndexWriter(random(), newDirectory(), a);
  final Document doc = new Document();
  final FieldType ft = new FieldType();
  ft.setIndexOptions(IndexOptions.DOCS);
  ft.setTokenized(true);
  ft.setStoreTermVectors(true);
  ft.setStoreTermVectorPositions(true);
  ft.setStoreTermVectorOffsets(true);
  doc.add(new Field("f", "a", ft));
  doc.add(new Field("f", "a", ft));
  writer.addDocument(doc);

  final LeafReader reader = getOnlySegmentReader(writer.getReader());
  final Fields fields = reader.getTermVectors(0);
  final Terms terms = fields.terms("f");
  final TermsEnum te = terms.iterator();
  assertEquals(new BytesRef("a"), te.next());
  final PostingsEnum dpe = te.postings(null, PostingsEnum.ALL);
  assertEquals(0, dpe.nextDoc());
  assertEquals(2, dpe.freq());
  assertEquals(0, dpe.nextPosition());
  assertEquals(0, dpe.startOffset());
  final int endOffset = dpe.endOffset();
  assertEquals(1 + positionGap, dpe.nextPosition());
  assertEquals(1 + endOffset + offsetGap, dpe.endOffset());
  assertEquals(null, te.next());
  reader.close();
  writer.close();
  writer.w.getDirectory().close();
}
private void initMemory(Terms curTerms, int termFreq) {
  // init memory for performance reasons
  if (curTerms.hasPositions()) {
    currentPositions = ArrayUtil.grow(currentPositions, termFreq);
  }
  if (curTerms.hasOffsets()) {
    currentStartOffset = ArrayUtil.grow(currentStartOffset, termFreq);
    currentEndOffset = ArrayUtil.grow(currentEndOffset, termFreq);
  }
  if (curTerms.hasPayloads()) {
    currentPayloads = new BytesArray[termFreq];
  }
}
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
  if (maxEdits == 0 || prefixLength >= term.text().length()) {
    // can only match if it's exact
    return new SingleTermsEnum(terms.iterator(), term.bytes());
  }
  return new FuzzyTermsEnum(terms, atts, getTerm(), maxEdits, prefixLength, transpositions);
}
/**
 * Adds terms and frequencies found in vector into the Map termFreqMap
 *
 * @param termFreqMap a Map of terms and their frequencies
 * @param vector List of terms and their frequencies for a doc/field
 * @param fieldName Optional field name of the terms for skip terms
 */
private void addTermFrequencies(
    Map<String, Int> termFreqMap, Terms vector, @Nullable String fieldName) throws IOException {
  final TermsEnum termsEnum = vector.iterator();
  final CharsRefBuilder spare = new CharsRefBuilder();
  BytesRef text;
  while ((text = termsEnum.next()) != null) {
    spare.copyUTF8Bytes(text);
    final String term = spare.toString();
    if (isNoiseWord(term)) {
      continue;
    }
    if (isSkipTerm(fieldName, term)) {
      continue;
    }

    final PostingsEnum docs = termsEnum.postings(null, null);
    int freq = 0;
    while (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      freq += docs.freq();
    }

    // increment frequency
    Int cnt = termFreqMap.get(term);
    if (cnt == null) {
      cnt = new Int();
      termFreqMap.put(term, cnt);
      cnt.x = freq;
    } else {
      cnt.x += freq;
    }
  }
}
/**
 * Adds terms and frequencies found in vector into the Map termFreqMap
 *
 * @param field2termFreqMap a Map of terms and their frequencies per field
 * @param vector List of terms and their frequencies for a doc/field
 * @param fieldName the field the term frequencies belong to
 */
private void addTermFrequencies(
    Map<String, Map<String, Int>> field2termFreqMap, Terms vector, String fieldName)
    throws IOException {
  Map<String, Int> termFreqMap = field2termFreqMap.get(fieldName);
  if (termFreqMap == null) {
    termFreqMap = new HashMap<>();
    field2termFreqMap.put(fieldName, termFreqMap);
  }

  final TermsEnum termsEnum = vector.iterator();
  final CharsRefBuilder spare = new CharsRefBuilder();
  BytesRef text;
  while ((text = termsEnum.next()) != null) {
    spare.copyUTF8Bytes(text);
    final String term = spare.toString();
    if (isNoiseWord(term)) {
      continue;
    }

    final int freq = (int) termsEnum.totalTermFreq();

    // increment frequency
    Int cnt = termFreqMap.get(term);
    if (cnt == null) {
      cnt = new Int();
      termFreqMap.put(term, cnt);
      cnt.x = freq;
    } else {
      cnt.x += freq;
    }
  }
}
/**
 * @param reader the index reader
 * @param numTerms maximum number of terms to collect
 * @param fieldNames the fields to consider, or null for all fields
 * @return TermStats[] ordered by terms with highest docFreq first.
 * @throws Exception
 */
public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String[] fieldNames)
    throws Exception {
  TermStatsQueue tiq = null;
  TermsEnum te = null;

  if (fieldNames != null) {
    Fields fields = MultiFields.getFields(reader);
    if (fields == null) {
      LOG.info("Index with no fields - probably empty or corrupted");
      return EMPTY_STATS;
    }
    tiq = new TermStatsQueue(numTerms);
    for (String field : fieldNames) {
      Terms terms = fields.terms(field);
      if (terms != null) {
        te = terms.iterator(te);
        fillQueue(te, tiq, field);
      }
    }
  } else {
    Fields fields = MultiFields.getFields(reader);
    if (fields == null) {
      LOG.info("Index with no fields - probably empty or corrupted");
      return EMPTY_STATS;
    }
    tiq = new TermStatsQueue(numTerms);
    Iterator<String> fieldIterator = fields.iterator();
    while (fieldIterator.hasNext()) {
      String field = fieldIterator.next();
      Terms terms = fields.terms(field);
      if (terms != null) {
        te = terms.iterator(te);
        fillQueue(te, tiq, field);
      }
    }
  }

  TermStats[] result = new TermStats[tiq.size()];
  // we want highest first so we read the queue and populate the array
  // starting at the end and work backwards
  int count = tiq.size() - 1;
  while (tiq.size() != 0) {
    result[count] = tiq.pop();
    count--;
  }
  return result;
}
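// Hypothetical caller for getHighFreqTerms() above; not part of the original code. The index
// path and the field names are placeholders, and the output relies on whatever the local
// TermStats.toString() prints.
public static void printTopTerms(Path indexPath) throws Exception {
  IndexReader reader = DirectoryReader.open(FSDirectory.open(indexPath));
  TermStats[] top = getHighFreqTerms(reader, 10, new String[] {"body", "title"});
  for (TermStats stats : top) {
    System.out.println(stats);
  }
  reader.close();
}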
public PostingsEnum randomDocsEnum(
    String field, BytesRef term, List<LeafReaderContext> readers, Bits bits) throws IOException {
  if (random().nextInt(10) == 0) {
    return null;
  }
  LeafReader indexReader = readers.get(random().nextInt(readers.size())).reader();
  Terms terms = indexReader.terms(field);
  if (terms == null) {
    return null;
  }
  TermsEnum iterator = terms.iterator();
  if (iterator.seekExact(term)) {
    return iterator.postings(
        bits, null, random().nextBoolean() ? PostingsEnum.FREQS : PostingsEnum.NONE);
  }
  return null;
}
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
  if (this.terms.size() == 0) {
    return TermsEnum.EMPTY;
  }
  return new SeekingTermSetTermsEnum(terms.iterator(), this.terms, ords);
}
protected void compareTermVectors(Terms terms, Terms memTerms, String field_name)
    throws IOException {
  TermsEnum termEnum = terms.iterator();
  TermsEnum memTermEnum = memTerms.iterator();

  while (termEnum.next() != null) {
    assertNotNull(memTermEnum.next());
    assertThat(termEnum.totalTermFreq(), equalTo(memTermEnum.totalTermFreq()));

    PostingsEnum docsPosEnum = termEnum.postings(null, PostingsEnum.POSITIONS);
    PostingsEnum memDocsPosEnum = memTermEnum.postings(null, PostingsEnum.POSITIONS);
    String currentTerm = termEnum.term().utf8ToString();

    assertThat(
        "Token mismatch for field: " + field_name,
        currentTerm,
        equalTo(memTermEnum.term().utf8ToString()));

    docsPosEnum.nextDoc();
    memDocsPosEnum.nextDoc();

    int freq = docsPosEnum.freq();
    assertThat(freq, equalTo(memDocsPosEnum.freq()));
    for (int i = 0; i < freq; i++) {
      String failDesc = " (field:" + field_name + " term:" + currentTerm + ")";
      int memPos = memDocsPosEnum.nextPosition();
      int pos = docsPosEnum.nextPosition();
      assertThat("Position test failed" + failDesc, memPos, equalTo(pos));
      assertThat(
          "Start offset test failed" + failDesc,
          memDocsPosEnum.startOffset(),
          equalTo(docsPosEnum.startOffset()));
      assertThat(
          "End offset test failed" + failDesc,
          memDocsPosEnum.endOffset(),
          equalTo(docsPosEnum.endOffset()));
      // Compare the payload from the in-memory postings against the indexed postings;
      // comparing an enum's payload against itself can never fail.
      assertThat(
          "Missing payload test failed" + failDesc,
          memDocsPosEnum.getPayload(),
          equalTo(docsPosEnum.getPayload()));
    }
  }
  assertNull("Still some tokens not processed", memTermEnum.next());
}
/**
 * We will implement this according to the Lucene specification. The formula used:
 * sum over query terms qi of
 *   IDF(qi) * (tf(qi,D) * (k1+1)) / (tf(qi,D) + k1 * (1 - b + b * |D| / avgFL))
 * IDF and avgFL computation are described above.
 *
 * @param doc the document being scored
 * @param terms the (partial) term vector of the document
 * @param context reranker context holding the query tokens and the index searcher
 * @return the BM25 score of the document for the query
 */
@Override
public float extract(Document doc, Terms terms, RerankerContext context) {
  Set<String> queryTokens = new HashSet<>(context.getQueryTokens());

  TermsEnum termsEnum = null;
  try {
    termsEnum = terms.iterator();
  } catch (IOException e) {
    LOG.warn("Error computing BM25, unable to retrieve terms enum");
    return 0.0f;
  }

  IndexReader reader = context.getIndexSearcher().getIndexReader();
  long maxDocs = reader.numDocs();
  long sumTotalTermFreq = getSumTermFrequency(reader, context.getField());
  // Compute by iterating
  long docSize = 0L;

  // NOTE df cannot be retrieved just from the term vector,
  // the term vector here is only a partial term vector that treats this as if we only have 1
  // document in the index
  Map<String, Integer> docFreqMap = null;
  try {
    docFreqMap = getDocFreqs(reader, context.getQueryTokens(), context.getField());
  } catch (IOException e) {
    LOG.warn("Unable to retrieve document frequencies.");
    docFreqMap = new HashMap<>();
  }

  Map<String, Long> termFreqMap = new HashMap<>();
  try {
    while (termsEnum.next() != null) {
      String termString = termsEnum.term().utf8ToString();
      docSize += termsEnum.totalTermFreq();
      if (queryTokens.contains(termString)) {
        termFreqMap.put(termString, termsEnum.totalTermFreq());
      }
    }
  } catch (IOException e) {
    LOG.warn("Unable to retrieve termsEnum, treating as 0");
  }

  float score = 0.0f;
  // Iterate over the query tokens
  double avgFL = computeAvgFL(sumTotalTermFreq, maxDocs);
  for (String token : queryTokens) {
    long docFreq = docFreqMap.containsKey(token) ? docFreqMap.get(token) : 0;
    double termFreq = termFreqMap.containsKey(token) ? termFreqMap.get(token) : 0;
    double numerator = (this.k1 + 1) * termFreq;
    double docLengthFactor = this.b * (docSize / avgFL);
    double denominator = termFreq + (this.k1) * (1 - this.b + docLengthFactor);
    score += computeIDF(docFreq, maxDocs) * numerator / denominator;
  }
  return score;
}
private void buildField(
    XContentBuilder builder,
    final CharsRefBuilder spare,
    Fields theFields,
    Iterator<String> fieldIter)
    throws IOException {
  String fieldName = fieldIter.next();
  builder.startObject(fieldName);
  Terms curTerms = theFields.terms(fieldName);
  // write field statistics
  buildFieldStatistics(builder, curTerms);
  builder.startObject(FieldStrings.TERMS);
  TermsEnum termIter = curTerms.iterator(null);
  for (int i = 0; i < curTerms.size(); i++) {
    buildTerm(builder, spare, curTerms, termIter);
  }
  builder.endObject();
  builder.endObject();
}
private final TermsEnum delegate() throws IOException {
  if (delegateTermsEnum == null) {
    /* pull the iterator only if we really need it -
     * this can be a relatively heavy operation depending on the
     * delegate postings format and the underlying directory
     * (clone IndexInput) */
    delegateTermsEnum = delegateTerms.iterator(reuseDelegate);
  }
  return delegateTermsEnum;
}
@Override
public FieldStats.Long stats(IndexReader reader) throws IOException {
  int maxDoc = reader.maxDoc();
  Terms terms = org.apache.lucene.index.MultiFields.getTerms(reader, name());
  if (terms == null) {
    return null;
  }
  long minValue = LegacyNumericUtils.getMinInt(terms);
  long maxValue = LegacyNumericUtils.getMaxInt(terms);
  return new FieldStats.Long(
      maxDoc,
      terms.getDocCount(),
      terms.getSumDocFreq(),
      terms.getSumTotalTermFreq(),
      isSearchable(),
      isAggregatable(),
      minValue,
      maxValue);
}