public void testDocsAndPositionsEnumStart() throws Exception { Analyzer analyzer = new MockAnalyzer(random()); int numIters = atLeast(3); MemoryIndex memory = new MemoryIndex(true, false, random().nextInt(50) * 1024 * 1024); for (int i = 0; i < numIters; i++) { // check reuse memory.addField("foo", "bar", analyzer); LeafReader reader = (LeafReader) memory.createSearcher().getIndexReader(); TestUtil.checkReader(reader); assertEquals(1, reader.terms("foo").getSumTotalTermFreq()); PostingsEnum disi = reader.postings(new Term("foo", "bar"), PostingsEnum.ALL); int docid = disi.docID(); assertEquals(-1, docid); assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); assertEquals(0, disi.nextPosition()); assertEquals(0, disi.startOffset()); assertEquals(3, disi.endOffset()); // now reuse and check again TermsEnum te = reader.terms("foo").iterator(); assertTrue(te.seekExact(new BytesRef("bar"))); disi = te.postings(disi); docid = disi.docID(); assertEquals(-1, docid); assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); reader.close(); memory.reset(); } }
public synchronized ShapeFieldCache<T> getCache(LeafReader reader) throws IOException { ShapeFieldCache<T> idx = sidx.get(reader); if (idx != null) { return idx; } long startTime = System.currentTimeMillis(); log.fine("Building Cache [" + reader.maxDoc() + "]"); idx = new ShapeFieldCache<>(reader.maxDoc(), defaultSize); int count = 0; DocsEnum docs = null; Terms terms = reader.terms(shapeField); TermsEnum te = null; if (terms != null) { te = terms.iterator(te); BytesRef term = te.next(); while (term != null) { T shape = readShape(term); if (shape != null) { docs = te.docs(null, docs, DocsEnum.FLAG_NONE); int docid = docs.nextDoc(); while (docid != DocIdSetIterator.NO_MORE_DOCS) { idx.add(docid, shape); docid = docs.nextDoc(); count++; } } term = te.next(); } } sidx.put(reader, idx); long elapsed = System.currentTimeMillis() - startTime; log.fine("Cached: [" + count + " in " + elapsed + "ms] " + idx); return idx; }
@Override public void hitExecute(SearchContext context, HitContext hitContext) { if (context.getFetchSubPhaseContext(CONTEXT_FACTORY).hitExecutionNeeded() == false) { return; } String field = context.getFetchSubPhaseContext(CONTEXT_FACTORY).getField(); if (hitContext.hit().fieldsOrNull() == null) { hitContext.hit().fields(new HashMap<>()); } SearchHitField hitField = hitContext.hit().fields().get(NAMES[0]); if (hitField == null) { hitField = new InternalSearchHitField(NAMES[0], new ArrayList<>(1)); hitContext.hit().fields().put(NAMES[0], hitField); } TermVectorsResponse termVector = TermVectorsService.getTermVectors( context.indexShard(), new TermVectorsRequest( context.indexShard().shardId().getIndex().getName(), hitContext.hit().type(), hitContext.hit().id())); try { Map<String, Integer> tv = new HashMap<>(); TermsEnum terms = termVector.getFields().terms(field).iterator(); BytesRef term; while ((term = terms.next()) != null) { tv.put(term.utf8ToString(), terms.postings(null, PostingsEnum.ALL).freq()); } hitField.values().add(tv); } catch (IOException e) { ESLoggerFactory.getLogger(FetchSubPhasePluginIT.class.getName()) .info("Swallowed exception", e); } }
public void testSeekCeilNotFound() throws Exception { Directory dir = newDirectory(); RandomIndexWriter w = new RandomIndexWriter(random(), dir); Document doc = new Document(); // Get empty string in there! doc.add(newStringField("field", "", Field.Store.NO)); w.addDocument(doc); for (int i = 0; i < 36; i++) { doc = new Document(); String term = "" + (char) (97 + i); String term2 = "a" + (char) (97 + i); doc.add(newTextField("field", term + " " + term2, Field.Store.NO)); w.addDocument(doc); } w.forceMerge(1); IndexReader r = w.getReader(); TermsEnum te = MultiFields.getTerms(r, "field").iterator(null); assertEquals(TermsEnum.SeekStatus.NOT_FOUND, te.seekCeil(new BytesRef(new byte[] {0x22}))); assertEquals("a", te.term().utf8ToString()); assertEquals(1L, te.ord()); r.close(); w.close(); dir.close(); }
/** * Adds terms and frequencies found in vector into the Map termFreqMap * * @param termFreqMap a Map of terms and their frequencies * @param vector List of terms and their frequencies for a doc/field * @param fieldName Optional name of the field the terms come from, used when checking whether a term should be skipped */ private void addTermFrequencies( Map<String, Int> termFreqMap, Terms vector, @Nullable String fieldName) throws IOException { final TermsEnum termsEnum = vector.iterator(); final CharsRefBuilder spare = new CharsRefBuilder(); BytesRef text; while ((text = termsEnum.next()) != null) { spare.copyUTF8Bytes(text); final String term = spare.toString(); if (isNoiseWord(term)) { continue; } if (isSkipTerm(fieldName, term)) { continue; } final PostingsEnum docs = termsEnum.postings(null, null); int freq = 0; while (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { freq += docs.freq(); } // increment frequency Int cnt = termFreqMap.get(term); if (cnt == null) { cnt = new Int(); termFreqMap.put(term, cnt); cnt.x = freq; } else { cnt.x += freq; } } }
/* * listTermDictionary displays the term dictionary for a field. */ static void listTermDictionary(IndexReader reader, String fieldName) throws IOException { System.out.println("\nTerm Dictionary: field " + fieldName); /* Grant says: MultiFields.getTerms(IndexReader, fieldName) */ Terms terms = MultiFields.getTerms(reader, fieldName); if ((terms == null) || (terms.size() == -1)) System.out.println(" The term dictionary is empty (or its size is not reported by the codec)."); else { System.out.println(" Vocabulary size: " + terms.size() + " terms"); TermsEnum ithTerm = terms.iterator(null); /* * Iterate over the terms in this field's dictionary. For each term, * print the term text, its document frequency and its total term * frequency across the index. */ while (ithTerm.next() != null) { System.out.format( " %-30s %d %d\n", ithTerm.term().utf8ToString(), ithTerm.docFreq(), ithTerm.totalTermFreq()); } } }
public void testReuseDocsEnumNoReuse() throws IOException { Directory dir = newDirectory(); Codec cp = TestUtil.alwaysPostingsFormat(new Lucene40RWPostingsFormat()); RandomIndexWriter writer = new RandomIndexWriter( random(), dir, newIndexWriterConfig(new MockAnalyzer(random())).setCodec(cp)); int numdocs = atLeast(20); createRandomIndex(numdocs, writer, random()); writer.commit(); DirectoryReader open = DirectoryReader.open(dir); for (LeafReaderContext ctx : open.leaves()) { LeafReader indexReader = ctx.reader(); Terms terms = indexReader.terms("body"); TermsEnum iterator = terms.iterator(); IdentityHashMap<PostingsEnum, Boolean> enums = new IdentityHashMap<>(); MatchNoBits bits = new Bits.MatchNoBits(indexReader.maxDoc()); while ((iterator.next()) != null) { PostingsEnum docs = iterator.postings( random().nextBoolean() ? bits : new Bits.MatchNoBits(indexReader.maxDoc()), null, random().nextBoolean() ? PostingsEnum.FREQS : PostingsEnum.NONE); enums.put(docs, true); } assertEquals(terms.size(), enums.size()); } writer.commit(); IOUtils.close(writer, open, dir); }
private void getPrefixTerms( ObjectHashSet<Term> terms, final Term prefix, final IndexReader reader) throws IOException { // SlowCompositeReaderWrapper could be used... but this would merge all terms from each segment // into one terms // instance, which is very expensive. Therefore I think it is better to iterate over each leaf // individually. List<LeafReaderContext> leaves = reader.leaves(); for (LeafReaderContext leaf : leaves) { Terms _terms = leaf.reader().terms(field); if (_terms == null) { continue; } TermsEnum termsEnum = _terms.iterator(); TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefix.bytes()); if (TermsEnum.SeekStatus.END == seekStatus) { continue; } for (BytesRef term = termsEnum.term(); term != null; term = termsEnum.next()) { if (!StringHelper.startsWith(term, prefix.bytes())) { break; } terms.add(new Term(field, BytesRef.deepCopyOf(term))); if (terms.size() >= maxExpansions) { return; } } } }
/** * Adds terms and frequencies found in vector into the Map termFreqMap * * @param field2termFreqMap a Map of terms and their frequencies per field * @param vector List of terms and their frequencies for a doc/field */ private void addTermFrequencies( Map<String, Map<String, Int>> field2termFreqMap, Terms vector, String fieldName) throws IOException { Map<String, Int> termFreqMap = field2termFreqMap.get(fieldName); if (termFreqMap == null) { termFreqMap = new HashMap<>(); field2termFreqMap.put(fieldName, termFreqMap); } final TermsEnum termsEnum = vector.iterator(); final CharsRefBuilder spare = new CharsRefBuilder(); BytesRef text; while ((text = termsEnum.next()) != null) { spare.copyUTF8Bytes(text); final String term = spare.toString(); if (isNoiseWord(term)) { continue; } final int freq = (int) termsEnum.totalTermFreq(); // increment frequency Int cnt = termFreqMap.get(term); if (cnt == null) { cnt = new Int(); termFreqMap.put(term, cnt); cnt.x = freq; } else { cnt.x += freq; } } }
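// Illustrative usage sketch (not part of the original source; "reader", "docNum" and the
// field name are assumptions): the per-field frequency map above is typically filled from a
// stored term vector for one document, one field at a time.
Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>();
Fields vectors = reader.getTermVectors(docNum);
if (vectors != null) {
  Terms vector = vectors.terms(fieldName);
  if (vector != null) {
    addTermFrequencies(field2termFreqMap, vector, fieldName);
  }
}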
public String[] getTerms() { IndexReader reader = null; int maxSize = 100; Set<String> searchResults = new HashSet<String>(); try { reader = DirectoryReader.open(dir); Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms("contents"); TermsEnum termsEnum = terms.iterator(TermsEnum.EMPTY); BytesRef byteRef = null; while ((byteRef = termsEnum.next()) != null) { String term = byteRef.utf8ToString(); // decode as UTF-8; new String(bytes, offset, length) would use the platform default charset searchResults.add(term); if (searchResults.size() >= maxSize) { break; } } } catch (IOException e) { e.printStackTrace(); } finally { try { if (reader != null) { reader.close(); } } catch (IOException e) { e.printStackTrace(); } } return searchResults.toArray(new String[searchResults.size()]); }
/** * Prepare a document reconstructor. * * @param reader IndexReader to read from. * @param fieldNames if non-null and non-empty, data will be collected only from these fields, * otherwise data will be collected from all fields * @param numTerms total number of terms in the index, or -1 if unknown (will be calculated) * @throws Exception */ public DocReconstructor(IndexReader reader, String[] fieldNames, int numTerms) throws Exception { if (reader == null) { throw new Exception("IndexReader cannot be null."); } this.reader = reader; if (fieldNames == null || fieldNames.length == 0) { // collect fieldNames this.fieldNames = (String[]) reader.getFieldNames(FieldOption.ALL).toArray(new String[0]); } else { this.fieldNames = fieldNames; } if (numTerms == -1) { Fields fields = MultiFields.getFields(reader); numTerms = 0; FieldsEnum fe = fields.iterator(); String fld = null; while ((fld = fe.next()) != null) { TermsEnum te = fe.terms(); while (te.next() != null) { numTerms++; } } this.numTerms = numTerms; } deleted = MultiFields.getDeletedDocs(reader); }
/** * Add term frequencies for a single document to a frequency map. * * @param reader the index * @param doc doc id * @param luceneName the index field from which to use the term vector * @param freq where to add to the token frequencies */ public static void getFrequenciesFromTermVector( IndexReader reader, int doc, String luceneName, Map<String, Integer> freq) { try { org.apache.lucene.index.Terms terms = reader.getTermVector(doc, luceneName); if (terms == null) { throw new IllegalArgumentException("Field " + luceneName + " has no Terms"); } TermsEnum termsEnum = terms.iterator(); // Collect word frequencies from the term vector PostingsEnum postingsEnum = null; while (termsEnum.next() != null) { postingsEnum = termsEnum.postings(null, postingsEnum, PostingsEnum.FREQS); String term = termsEnum.term().utf8ToString(); Integer n = freq.get(term); if (n == null) { n = 0; } while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { n += postingsEnum.freq(); } freq.put(term, n); } } catch (Exception e) { throw ExUtil.wrapRuntimeException(e); } }
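// Illustrative usage sketch (not part of the original source; "reader" and the field name are
// assumptions): frequencies are added into the supplied map, so reusing the same map across
// documents accumulates counts over the whole set of documents.
Map<String, Integer> freq = new HashMap<>();
getFrequenciesFromTermVector(reader, 0, "contents", freq);
getFrequenciesFromTermVector(reader, 1, "contents", freq); // counts from doc 1 are added to those from doc 0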
Query createCandidateQuery(IndexReader indexReader) throws IOException { List<Term> extractedTerms = new ArrayList<>(); // include extractionResultField:failed, because docs with this term have no // extractedTermsField // and otherwise we would fail to return these docs. Docs that failed query term extraction // always need to be verified by MemoryIndex: extractedTerms.add(new Term(extractionResultField.name(), EXTRACTION_FAILED)); LeafReader reader = indexReader.leaves().get(0).reader(); Fields fields = reader.fields(); for (String field : fields) { Terms terms = fields.terms(field); if (terms == null) { continue; } BytesRef fieldBr = new BytesRef(field); TermsEnum tenum = terms.iterator(); for (BytesRef term = tenum.next(); term != null; term = tenum.next()) { BytesRefBuilder builder = new BytesRefBuilder(); builder.append(fieldBr); builder.append(FIELD_VALUE_SEPARATOR); builder.append(term); extractedTerms.add(new Term(queryTermsField.name(), builder.toBytesRef())); } } return new TermsQuery(extractedTerms); }
/** * Find terms in the index based on a prefix. Useful for autocomplete. * * @param index the index * @param fieldName the field * @param prefix the prefix we're looking for (null or empty string for all terms) * @param sensitive match case-sensitively or not? * @param maxResults max. number of results to return (or -1 for all) * @return the matching terms */ public static List<String> findTermsByPrefix( LeafReader index, String fieldName, String prefix, boolean sensitive, int maxResults) { boolean allTerms = prefix == null || prefix.length() == 0; if (allTerms) { prefix = ""; sensitive = true; // don't do unnecessary work in this case } try { if (!sensitive) prefix = StringUtil.removeAccents(prefix).toLowerCase(); List<String> results = new ArrayList<>(); org.apache.lucene.index.Terms terms = index.terms(fieldName); if (terms == null) { return results; // field doesn't exist or has no indexed terms } TermsEnum termsEnum = terms.iterator(); BytesRef brPrefix = new BytesRef(prefix.getBytes(LUCENE_DEFAULT_CHARSET)); if (termsEnum.seekCeil(brPrefix) == TermsEnum.SeekStatus.END) { return results; // no terms at or after the prefix } // seekCeil leaves the enum positioned on the first candidate, so start from term(), not next() for (BytesRef term = termsEnum.term(); term != null && (maxResults < 0 || results.size() < maxResults); term = termsEnum.next()) { String termText = term.utf8ToString(); String optDesensitized = termText; if (!sensitive) optDesensitized = StringUtil.removeAccents(termText).toLowerCase(); if (!allTerms && !optDesensitized.startsWith(prefix)) { // Doesn't match the prefix; no more matches break; } // Match, add term results.add(termText); } return results; } catch (IOException e) { throw new RuntimeException(e); } }
public TermInfo collect(String term) throws IOException { TermInfo info = new TermInfo(); BytesRef luceneTerm = new BytesRef(term); // BytesRef(CharSequence) encodes as UTF-8; term.getBytes() would use the platform default charset // this gives documents in which the term is found, but no offset information can be retrieved PostingsEnum postings = MultiFields.getTermDocsEnum(indexReader, ngramInfoFieldname, luceneTerm); if (postings == null) { return info; // term does not occur in this field } // now go through each document int docId = postings.nextDoc(); while (docId != PostingsEnum.NO_MORE_DOCS) { // get the term vector for that document. TermsEnum it = indexReader.getTermVector(docId, ngramInfoFieldname).iterator(); // find the term of interest it.seekExact(luceneTerm); // get its posting info. this will contain offset info PostingsEnum postingsInDoc = it.postings(null, PostingsEnum.OFFSETS); postingsInDoc.nextDoc(); Document doc = indexReader.document(docId); String id = doc.get(idFieldname); JATEDocument jd = new JATEDocument(id); Set<int[]> offsets = new HashSet<>(); int totalFreq = postingsInDoc.freq(); for (int i = 0; i < totalFreq; i++) { postingsInDoc.nextPosition(); offsets.add(new int[] {postingsInDoc.startOffset(), postingsInDoc.endOffset()}); } info.getOffsets().put(jd, offsets); docId = postings.nextDoc(); } return info; }
@Override public void seekExact(long targetOrd) throws IOException { int delta = (int) (targetOrd - ordBase - ord); // System.out.println(" seek(ord) targetOrd=" + targetOrd + " delta=" + delta + " ord=" + ord // + " ii=" + indexInterval); if (delta < 0 || delta > indexInterval) { final int idx = (int) (targetOrd >>> indexIntervalBits); final BytesRef base = indexedTermsArray[idx]; // System.out.println(" do seek term=" + base.utf8ToString()); ord = idx << indexIntervalBits; delta = (int) (targetOrd - ord); final TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(base); assert seekStatus == TermsEnum.SeekStatus.FOUND; } else { // System.out.println("seek w/in block"); } while (--delta >= 0) { BytesRef br = termsEnum.next(); if (br == null) { assert false; return; } ord++; } setTerm(); assert term != null; }
@Override public SeekStatus seekCeil(BytesRef target) throws IOException { // already here if (term != null && term.equals(target)) { return SeekStatus.FOUND; } int startIdx = Arrays.binarySearch(indexedTermsArray, target); if (startIdx >= 0) { // we hit the term exactly... lucky us! TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(target); assert seekStatus == TermsEnum.SeekStatus.FOUND; ord = startIdx << indexIntervalBits; setTerm(); assert term != null; return SeekStatus.FOUND; } // we didn't hit the term exactly startIdx = -startIdx - 1; if (startIdx == 0) { // our target occurs *before* the first term TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(target); assert seekStatus == TermsEnum.SeekStatus.NOT_FOUND; ord = 0; setTerm(); assert term != null; return SeekStatus.NOT_FOUND; } // back up to the start of the block startIdx--; if ((ord >> indexIntervalBits) == startIdx && term != null && term.compareTo(target) <= 0) { // we are already in the right block and the current term is before the term we want, // so we don't need to seek. } else { // seek to the right block TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(indexedTermsArray[startIdx]); assert seekStatus == TermsEnum.SeekStatus.FOUND; ord = startIdx << indexIntervalBits; setTerm(); assert term != null; // should be non-null since it's in the index } while (term != null && term.compareTo(target) < 0) { next(); } if (term == null) { return SeekStatus.END; } else if (term.compareTo(target) == 0) { return SeekStatus.FOUND; } else { return SeekStatus.NOT_FOUND; } }
/** * List all of the files in this index database * * @throws IOException If an IO error occurs while reading from the database */ public void listFiles() throws IOException { IndexReader ireader = null; TermsEnum iter; Terms terms = null; try { ireader = DirectoryReader.open(indexDirectory); // open existing index int numDocs = ireader.numDocs(); if (numDocs > 0) { Fields uFields = MultiFields.getFields(ireader); // reader.getTermVectors(0); terms = uFields.terms(QueryBuilder.U); } if (terms != null) { iter = terms.iterator(null); // init uid iterator; it is unpositioned until next() is called while (iter.next() != null) { log.fine(Util.uid2url(iter.term().utf8ToString())); } } } finally { if (ireader != null) { try { ireader.close(); } catch (IOException e) { log.log(Level.WARNING, "An error occurred while closing index reader", e); } } } }
/** * Remove a stale file (uidIter.term().text()) from the index database (and the xref file) * * @throws java.io.IOException if an error occurs */ private void removeFile() throws IOException { String path = Util.uid2url(uidIter.term().utf8ToString()); for (IndexChangedListener listener : listeners) { listener.fileRemove(path); } writer.deleteDocuments(new Term(QueryBuilder.U, uidIter.term())); writer.prepareCommit(); writer.commit(); File xrefFile; if (RuntimeEnvironment.getInstance().isCompressXref()) { xrefFile = new File(xrefDir, path + ".gz"); } else { xrefFile = new File(xrefDir, path); } File parent = xrefFile.getParentFile(); if (!xrefFile.delete() && xrefFile.exists()) { log.log(Level.INFO, "Failed to remove obsolete xref-file: {0}", xrefFile.getAbsolutePath()); } // Remove the parent directory if it's empty if (parent.delete()) { log.log(Level.FINE, "Removed empty xref dir:{0}", parent.getAbsolutePath()); } setDirty(); for (IndexChangedListener listener : listeners) { listener.fileRemoved(path); } }
@Override protected void fillDocsAndScores(FixedBitSet matchingDocs, TermsEnum termsEnum) throws IOException { BytesRef spare = new BytesRef(); PostingsEnum postingsEnum = null; for (int i = 0; i < terms.size(); i++) { if (termsEnum.seekExact(terms.get(ords[i], spare))) { postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE); float score = TermsIncludingScoreQuery.this.scores[ords[i]]; for (int doc = postingsEnum.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = postingsEnum.nextDoc()) { // I prefer this: /*if (scores[doc] < score) { scores[doc] = score; matchingDocs.set(doc); }*/ // But this behaves the same as MVInnerScorer and only then the tests will pass: if (!matchingDocs.get(doc)) { scores[doc] = score; matchingDocs.set(doc); } } } } }
public static void fillQueue(TermsEnum termsEnum, TermStatsQueue tiq, String field) throws Exception { BytesRef term; while ((term = termsEnum.next()) != null) { BytesRef r = new BytesRef(); r.copyBytes(term); tiq.insertWithOverflow(new TermStats(field, r, termsEnum.docFreq())); } }
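// Illustrative usage sketch (not part of the original source): filling the queue for a single
// field and draining it. "reader" is an assumed open IndexReader; the TermStatsQueue constructor
// and the TermStats field names (field, termtext, docFreq) follow Lucene's HighFreqTerms utility
// and are assumptions here, since they vary slightly between Lucene versions.
Terms terms = MultiFields.getTerms(reader, "body");
if (terms != null) {
  TermStatsQueue tiq = new TermStatsQueue(100); // keep the top 100 terms
  fillQueue(terms.iterator(), tiq, "body");
  TermStats st;
  while ((st = tiq.pop()) != null) { // pops in the queue's comparator order
    System.out.println(st.field + ":" + st.termtext.utf8ToString() + " docFreq=" + st.docFreq);
  }
}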
/** * We implement this according to the Lucene formulation of BM25. The formula used is: score = sum over query terms qi of IDF(qi) * (tf(qi,D) * (k1+1)) / (tf(qi,D) + k1 * (1 - b + b * |D| / avgFL)), where tf(qi,D) is the frequency of qi in document D. IDF and avgFL computation are described above. * * @param doc * @param terms * @param context * @return */ @Override public float extract(Document doc, Terms terms, RerankerContext context) { Set<String> queryTokens = new HashSet<>(context.getQueryTokens()); TermsEnum termsEnum = null; try { termsEnum = terms.iterator(); } catch (IOException e) { LOG.warn("Error computing BM25, unable to retrieve terms enum"); return 0.0f; } IndexReader reader = context.getIndexSearcher().getIndexReader(); long maxDocs = reader.numDocs(); long sumTotalTermFreq = getSumTermFrequency(reader, context.getField()); // Compute by iterating long docSize = 0L; // NOTE df cannot be retrieved just from the term vector, // the term vector here is only a partial term vector that treats this as if we only have 1 // document in the index Map<String, Integer> docFreqMap = null; try { docFreqMap = getDocFreqs(reader, context.getQueryTokens(), context.getField()); } catch (IOException e) { LOG.warn("Unable to retrieve document frequencies."); docFreqMap = new HashMap<>(); } Map<String, Long> termFreqMap = new HashMap<>(); try { while (termsEnum.next() != null) { String termString = termsEnum.term().utf8ToString(); docSize += termsEnum.totalTermFreq(); if (queryTokens.contains(termString)) { termFreqMap.put(termString, termsEnum.totalTermFreq()); } } } catch (IOException e) { LOG.warn("Unable to retrieve termsEnum, treating as 0"); } float score = 0.0f; // Iterate over the query tokens double avgFL = computeAvgFL(sumTotalTermFreq, maxDocs); for (String token : queryTokens) { long docFreq = docFreqMap.containsKey(token) ? docFreqMap.get(token) : 0; double termFreq = termFreqMap.containsKey(token) ? termFreqMap.get(token) : 0; double numerator = (this.k1 + 1) * termFreq; double docLengthFactor = this.b * (docSize / avgFL); double denominator = termFreq + (this.k1) * (1 - this.b + docLengthFactor); score += computeIDF(docFreq, maxDocs) * numerator / denominator; } return score; }
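// Illustrative worked example (not part of the original source): one term's BM25 contribution
// using the formula from the javadoc above. The IDF shown is Lucene's default,
// log(1 + (N - df + 0.5) / (df + 0.5)); the original computeIDF() is not shown here and may
// differ, so treat these numbers as an illustration of the arithmetic only.
double k1 = 0.9, b = 0.4;                                    // assumed parameter values
double tf = 3, docLen = 100, avgFL = 50, df = 10, N = 1000;  // assumed statistics
double idf = Math.log(1 + (N - df + 0.5) / (df + 0.5));      // ≈ 4.56
double numerator = tf * (k1 + 1);                            // 5.7
double denominator = tf + k1 * (1 - b + b * docLen / avgFL); // 3 + 0.9 * 1.4 = 4.26
double contribution = idf * numerator / denominator;         // ≈ 6.1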
public void test10kPulsed() throws Exception { // we always run this test with pulsing codec. Codec cp = _TestUtil.alwaysPostingsFormat(new Pulsing41PostingsFormat(1)); File f = _TestUtil.getTempDir("10kpulsed"); BaseDirectoryWrapper dir = newFSDirectory(f); dir.setCheckIndexOnClose(false); // we do this ourselves explicitly RandomIndexWriter iw = new RandomIndexWriter( random(), dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())).setCodec(cp)); Document document = new Document(); FieldType ft = new FieldType(TextField.TYPE_STORED); switch (_TestUtil.nextInt(random(), 0, 2)) { case 0: ft.setIndexOptions(IndexOptions.DOCS_ONLY); break; case 1: ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS); break; default: ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); break; } Field field = newField("field", "", ft); document.add(field); NumberFormat df = new DecimalFormat("00000", new DecimalFormatSymbols(Locale.ROOT)); for (int i = 0; i < 10050; i++) { field.setStringValue(df.format(i)); iw.addDocument(document); } IndexReader ir = iw.getReader(); iw.close(); TermsEnum te = MultiFields.getTerms(ir, "field").iterator(null); DocsEnum de = null; for (int i = 0; i < 10050; i++) { String expected = df.format(i); assertEquals(expected, te.next().utf8ToString()); de = _TestUtil.docs(random(), te, null, de, 0); assertTrue(de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); assertEquals(DocIdSetIterator.NO_MORE_DOCS, de.nextDoc()); } ir.close(); _TestUtil.checkIndex(dir); dir.close(); }
/* Copied from lucene 4.2.x core */ private static long totalTermFreq(IndexReader r, String field, BytesRef text) throws IOException { final Terms terms = MultiFields.getTerms(r, field); if (terms != null) { final TermsEnum termsEnum = terms.iterator(null); if (termsEnum.seekExact(text, true)) { return termsEnum.totalTermFreq(); } } return 0; }
protected void fill(String field, TermsEnum termsEnum) throws IOException { while (true) { BytesRef term = termsEnum.next(); if (term != null) { insertWithOverflow(new TermStats(field, term, termsEnum.docFreq())); } else { break; } } }
SegmentResult(int[] counts, int total, TermsEnum tenum, int startFacetOrd, int endFacetOrd) throws IOException { super(counts, total - counts[0], counts[0], endFacetOrd + 1); this.tenum = tenum; this.mergePos = startFacetOrd == -1 ? 1 : startFacetOrd + 1; if (mergePos < maxTermPos) { assert tenum != null; tenum.seekExact(startFacetOrd == -1 ? 0 : startFacetOrd); mergeTerm = tenum.term(); } }
public void testChangeGaps() throws Exception { // LUCENE-5324: check that it is possible to change the wrapper's gaps final int positionGap = random().nextInt(1000); final int offsetGap = random().nextInt(1000); final Analyzer delegate = new MockAnalyzer(random()); final Analyzer a = new DelegatingAnalyzerWrapper(delegate.getReuseStrategy()) { @Override protected Analyzer getWrappedAnalyzer(String fieldName) { return delegate; } @Override public int getPositionIncrementGap(String fieldName) { return positionGap; } @Override public int getOffsetGap(String fieldName) { return offsetGap; } }; final RandomIndexWriter writer = new RandomIndexWriter(random(), newDirectory(), a); final Document doc = new Document(); final FieldType ft = new FieldType(); ft.setIndexOptions(IndexOptions.DOCS); ft.setTokenized(true); ft.setStoreTermVectors(true); ft.setStoreTermVectorPositions(true); ft.setStoreTermVectorOffsets(true); doc.add(new Field("f", "a", ft)); doc.add(new Field("f", "a", ft)); writer.addDocument(doc); final LeafReader reader = getOnlySegmentReader(writer.getReader()); final Fields fields = reader.getTermVectors(0); final Terms terms = fields.terms("f"); final TermsEnum te = terms.iterator(); assertEquals(new BytesRef("a"), te.next()); final PostingsEnum dpe = te.postings(null, PostingsEnum.ALL); assertEquals(0, dpe.nextDoc()); assertEquals(2, dpe.freq()); assertEquals(0, dpe.nextPosition()); assertEquals(0, dpe.startOffset()); final int endOffset = dpe.endOffset(); assertEquals(1 + positionGap, dpe.nextPosition()); assertEquals(1 + endOffset + offsetGap, dpe.endOffset()); assertEquals(null, te.next()); reader.close(); writer.close(); writer.w.getDirectory().close(); }
/** Computes which global ordinals are accepted by this IncludeExclude instance. */ @Override public LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals) throws IOException { LongBitSet acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount()); TermsEnum globalTermsEnum; Terms globalTerms = new DocValuesTerms(globalOrdinals); // TODO: specialize based on compiled.type: for ALL and prefixes (sinkState >= 0 ) we can // avoid i/o and just set bits. globalTermsEnum = compiled.getTermsEnum(globalTerms); for (BytesRef term = globalTermsEnum.next(); term != null; term = globalTermsEnum.next()) { acceptedGlobalOrdinals.set(globalTermsEnum.ord()); } return acceptedGlobalOrdinals; }
// make sure we never reuse from another reader even if it is the same field & codec etc public void testReuseDocsEnumDifferentReader() throws IOException { Directory dir = newDirectory(); Codec cp = TestUtil.alwaysPostingsFormat(new Lucene40RWPostingsFormat()); MockAnalyzer analyzer = new MockAnalyzer(random()); analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH)); RandomIndexWriter writer = new RandomIndexWriter(random(), dir, newIndexWriterConfig(analyzer).setCodec(cp)); int numdocs = atLeast(20); createRandomIndex(numdocs, writer, random()); writer.commit(); DirectoryReader firstReader = DirectoryReader.open(dir); DirectoryReader secondReader = DirectoryReader.open(dir); List<LeafReaderContext> leaves = firstReader.leaves(); List<LeafReaderContext> leaves2 = secondReader.leaves(); for (LeafReaderContext ctx : leaves) { Terms terms = ctx.reader().terms("body"); TermsEnum iterator = terms.iterator(); IdentityHashMap<PostingsEnum, Boolean> enums = new IdentityHashMap<>(); MatchNoBits bits = new Bits.MatchNoBits(firstReader.maxDoc()); iterator = terms.iterator(); PostingsEnum docs = null; BytesRef term = null; while ((term = iterator.next()) != null) { docs = iterator.postings( null, randomDocsEnum("body", term, leaves2, bits), random().nextBoolean() ? PostingsEnum.FREQS : PostingsEnum.NONE); enums.put(docs, true); } assertEquals(terms.size(), enums.size()); iterator = terms.iterator(); enums.clear(); docs = null; while ((term = iterator.next()) != null) { docs = iterator.postings( bits, randomDocsEnum("body", term, leaves2, bits), random().nextBoolean() ? PostingsEnum.FREQS : PostingsEnum.NONE); enums.put(docs, true); } assertEquals(terms.size(), enums.size()); } writer.close(); IOUtils.close(firstReader, secondReader, dir); }
@Override public long lookupTerm(BytesRef key) { try { switch (te.seekCeil(key)) { case FOUND: assert te.ord() >= 0; return te.ord(); case NOT_FOUND: assert te.ord() >= 0; return -te.ord() - 1; default: /* END */ return -numTerms() - 1; } } catch (IOException e) { throw new RuntimeException(e); } }
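// Illustrative usage sketch (not part of the original source; "dv" stands for an instance of the
// enclosing doc-values wrapper and is an assumption): lookupTerm() follows the
// Arrays.binarySearch convention, returning the term's ordinal when it exists and
// -insertionPoint - 1 when it does not.
long result = dv.lookupTerm(new BytesRef("foo"));
if (result >= 0) {
  long ordOfFoo = result;          // "foo" is in the dictionary
} else {
  long firstGreater = -result - 1; // ordinal of the first term that sorts after "foo"
}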