/**
 * Prepare a document reconstructor.
 *
 * @param reader IndexReader to read from.
 * @param fieldNames if non-null and non-empty, data will be collected only from these fields,
 *     otherwise data will be collected from all fields
 * @param numTerms total number of terms in the index, or -1 if unknown (will be calculated)
 * @throws Exception if the reader is null
 */
public DocReconstructor(IndexReader reader, String[] fieldNames, int numTerms) throws Exception {
  if (reader == null) {
    throw new Exception("IndexReader cannot be null.");
  }
  this.reader = reader;
  if (fieldNames == null || fieldNames.length == 0) {
    // collect fieldNames
    this.fieldNames = (String[]) reader.getFieldNames(FieldOption.ALL).toArray(new String[0]);
  } else {
    this.fieldNames = fieldNames;
  }
  if (numTerms == -1) {
    // count terms across all fields
    Fields fields = MultiFields.getFields(reader);
    numTerms = 0;
    FieldsEnum fe = fields.iterator();
    String fld = null;
    while ((fld = fe.next()) != null) {
      TermsEnum te = fe.terms();
      while (te.next() != null) {
        numTerms++;
      }
    }
  }
  // record the (possibly just computed) term count
  this.numTerms = numTerms;
  deleted = MultiFields.getDeletedDocs(reader);
}
/*
 * listPostings displays the first n postings for a term in a
 * field in an index (specified by reader). Set n to MAX_VALUE
 * to display all postings.
 */
static void listPostings(IndexReader reader, String termString, String field, Integer n)
    throws IOException {
  System.out.println("\nPostings: " + termString + " " + field);

  /*
   * Prepare to access the index.
   */
  BytesRef termBytes = new BytesRef(termString);
  Term term = new Term(field, termBytes);
  Bits liveDocs = MultiFields.getLiveDocs(reader);

  /*
   * Lookup the collection term frequency (ctf).
   */
  long df = reader.docFreq(term);
  System.out.println("\tdf: " + df);
  long ctf = reader.totalTermFreq(term);
  System.out.println("\tctf: " + ctf);

  if (df < 1) return;

  /*
   * Lookup the inverted list.
   */
  DocsAndPositionsEnum postings =
      MultiFields.getTermPositionsEnum(reader, liveDocs, field, termBytes);

  /*
   * Iterate through the first n postings.
   */
  long count = 0;
  while ((count < n) && (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS)) {
    System.out.println("\tdocid: " + postings.docID());
    int tf = postings.freq();
    System.out.println("\ttf: " + tf);
    System.out.print("\tPositions: ");
    for (int j = 0; j < tf; j++) {
      int pos = postings.nextPosition();
      System.out.print(pos + " ");
    }
    System.out.println("");
    count++;
  }
}
public void testKeepsLastFilter() throws Throwable {
  DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
  df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_LAST_OCCURRENCE);
  ScoreDoc[] hits = searcher.search(tq, df, 1000).scoreDocs;
  assertTrue("Filtered searching should have found some matches", hits.length > 0);
  for (ScoreDoc hit : hits) {
    StoredDocument d = searcher.doc(hit.doc);
    String url = d.get(KEY_FIELD);
    DocsEnum td =
        _TestUtil.docs(
            random(),
            reader,
            KEY_FIELD,
            new BytesRef(url),
            MultiFields.getLiveDocs(reader),
            null,
            0);
    int lastDoc = 0;
    while (td.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      lastDoc = td.docID();
    }
    assertEquals("Duplicate urls should return last doc", lastDoc, hit.doc);
  }
}
public void testSeekCeilNotFound() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(random(), dir);
  Document doc = new Document();
  // Get empty string in there!
  doc.add(newStringField("field", "", Field.Store.NO));
  w.addDocument(doc);

  for (int i = 0; i < 36; i++) {
    doc = new Document();
    String term = "" + (char) (97 + i);
    String term2 = "a" + (char) (97 + i);
    doc.add(newTextField("field", term + " " + term2, Field.Store.NO));
    w.addDocument(doc);
  }

  w.forceMerge(1);
  IndexReader r = w.getReader();
  TermsEnum te = MultiFields.getTerms(r, "field").iterator(null);
  assertEquals(TermsEnum.SeekStatus.NOT_FOUND, te.seekCeil(new BytesRef(new byte[] {0x22})));
  assertEquals("a", te.term().utf8ToString());
  assertEquals(1L, te.ord());
  r.close();
  w.close();
  dir.close();
}
public void listTokens(int freq) throws IOException {
  IndexReader ireader = null;
  TermsEnum iter = null;
  Terms terms = null;
  try {
    ireader = DirectoryReader.open(indexDirectory);
    int numDocs = ireader.numDocs();
    if (numDocs > 0) {
      Fields uFields = MultiFields.getFields(ireader); // reader.getTermVectors(0);
      terms = uFields.terms(QueryBuilder.DEFS);
    }
    if (terms != null) { // guard against an empty index
      iter = terms.iterator(null); // init uid iterator
      while (iter.term() != null) {
        // if (iter.term().field().startsWith("f")) {
        if (iter.docFreq() > 16 && iter.term().utf8ToString().length() > freq) {
          log.warning(iter.term().utf8ToString());
        }
        iter.next();
        /*} else {
          break;
        }*/
      }
    }
  } finally {
    if (ireader != null) {
      try {
        ireader.close();
      } catch (IOException e) {
        log.log(Level.WARNING, "An error occurred while closing index reader", e);
      }
    }
  }
}
public TermInfo collect(String term) throws IOException {
  TermInfo info = new TermInfo();
  // BytesRef(CharSequence) encodes as UTF-8, matching how the terms were indexed
  BytesRef luceneTerm = new BytesRef(term);

  // this gives documents in which the term is found, but no offset information can be retrieved
  PostingsEnum postings =
      MultiFields.getTermDocsEnum(indexReader, ngramInfoFieldname, luceneTerm);
  if (postings == null) {
    // term does not occur in this field
    return info;
  }

  // now go through each document
  int docId = postings.nextDoc();
  while (docId != PostingsEnum.NO_MORE_DOCS) {
    // get the term vector for that document.
    TermsEnum it = indexReader.getTermVector(docId, ngramInfoFieldname).iterator();

    // find the term of interest
    it.seekExact(luceneTerm);

    // get its posting info. this will contain offset info
    PostingsEnum postingsInDoc = it.postings(null, PostingsEnum.OFFSETS);
    postingsInDoc.nextDoc();

    Document doc = indexReader.document(docId);
    String id = doc.get(idFieldname);
    JATEDocument jd = new JATEDocument(id);
    Set<int[]> offsets = new HashSet<>();
    int totalFreq = postingsInDoc.freq();
    for (int i = 0; i < totalFreq; i++) {
      postingsInDoc.nextPosition();
      offsets.add(new int[] {postingsInDoc.startOffset(), postingsInDoc.endOffset()});
    }
    info.getOffsets().put(jd, offsets);

    docId = postings.nextDoc();
  }

  return info;
}
/**
 * List all of the files in this index database
 *
 * @throws IOException If an IO error occurs while reading from the database
 */
public void listFiles() throws IOException {
  IndexReader ireader = null;
  TermsEnum iter;
  Terms terms = null;
  try {
    ireader = DirectoryReader.open(indexDirectory); // open existing index
    int numDocs = ireader.numDocs();
    if (numDocs > 0) {
      Fields uFields = MultiFields.getFields(ireader); // reader.getTermVectors(0);
      terms = uFields.terms(QueryBuilder.U);
    }
    if (terms != null) { // guard against an empty index
      iter = terms.iterator(null); // init uid iterator
      while (iter.term() != null) {
        log.fine(Util.uid2url(iter.term().utf8ToString()));
        iter.next();
      }
    }
  } finally {
    if (ireader != null) {
      try {
        ireader.close();
      } catch (IOException e) {
        log.log(Level.WARNING, "An error occurred while closing index reader", e);
      }
    }
  }
}
/*
 * listTermDictionary displays the term dictionary for a field.
 */
static void listTermDictionary(IndexReader reader, String fieldName) throws IOException {
  System.out.println("\nTerm Dictionary: field " + fieldName);

  /* Grant says: MultiFields.getTerms(IndexReader, fieldName) */
  Terms terms = MultiFields.getTerms(reader, fieldName);

  if ((terms == null) || (terms.size() == -1)) {
    System.out.println(" The term dictionary is empty.");
  } else {
    System.out.println(" Vocabulary size: " + terms.size() + " terms");

    TermsEnum ithTerm = terms.iterator(null);

    /*
     * Iterate over the terms in this document.
     * Information about a term's occurrences (tf and
     * positions) is accessed via the indexing API, which
     * returns inverted lists that describe (only) the
     * current document.
     */
    while (ithTerm.next() != null) {
      System.out.format(
          " %-30s %d %d\n",
          ithTerm.term().utf8ToString(), ithTerm.docFreq(), ithTerm.totalTermFreq());
    }
  }
}
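A minimal usage sketch for the two dump helpers above (listTermDictionary and listPostings); the index path, field name, and sample term are illustrative assumptions, not taken from the original sources:

// Open an existing index (path is a placeholder) and dump term statistics for one field.
IndexReader reader = DirectoryReader.open(FSDirectory.open(new File("/path/to/index")));
try {
  listTermDictionary(reader, "body");
  listPostings(reader, "apple", "body", 10); // first 10 postings of a sample term
} finally {
  reader.close();
}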
/**
 * @param reader the index reader
 * @param numTerms maximum number of terms to collect
 * @param fieldNames if non-null, only these fields are examined, otherwise all fields
 * @return TermStats[] ordered by terms with highest docFreq first.
 * @throws Exception
 */
public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String[] fieldNames)
    throws Exception {
  TermStatsQueue tiq = null;
  TermsEnum te = null;

  if (fieldNames != null) {
    Fields fields = MultiFields.getFields(reader);
    if (fields == null) {
      LOG.info("Index with no fields - probably empty or corrupted");
      return EMPTY_STATS;
    }
    tiq = new TermStatsQueue(numTerms);
    for (String field : fieldNames) {
      Terms terms = fields.terms(field);
      if (terms != null) {
        te = terms.iterator(te);
        fillQueue(te, tiq, field);
      }
    }
  } else {
    Fields fields = MultiFields.getFields(reader);
    if (fields == null) {
      LOG.info("Index with no fields - probably empty or corrupted");
      return EMPTY_STATS;
    }
    tiq = new TermStatsQueue(numTerms);
    Iterator<String> fieldIterator = fields.iterator();
    while (fieldIterator.hasNext()) {
      String field = fieldIterator.next();
      Terms terms = fields.terms(field);
      if (terms != null) {
        te = terms.iterator(te);
        fillQueue(te, tiq, field);
      }
    }
  }

  TermStats[] result = new TermStats[tiq.size()];
  // we want highest first so we read the queue and populate the array
  // starting at the end and work backwards
  int count = tiq.size() - 1;
  while (tiq.size() != 0) {
    result[count] = tiq.pop();
    count--;
  }
  return result;
}
/**
 * @param filteredDocument Document with field values extracted for selected fields.
 * @return More Like This query for the passed document.
 */
public Query like(Map<String, Collection<Object>> filteredDocument) throws IOException {
  if (fieldNames == null) {
    // gather list of valid fields from lucene
    Collection<String> fields = MultiFields.getIndexedFields(ir);
    fieldNames = fields.toArray(new String[fields.size()]);
  }
  return createQuery(retrieveTerms(filteredDocument));
}
protected ValueSourceScorer(IndexReader reader, DocValues values) {
  super(null);
  this.reader = reader;
  this.maxDoc = reader.maxDoc();
  this.values = values;
  setCheckDeletes(true);
  this.delDocs = MultiFields.getDeletedDocs(reader);
}
/**
 * Return a query that will return docs like the passed lucene document ID.
 *
 * @param docNum the documentID of the lucene doc to generate the "More Like This" query for.
 * @return a query that will return docs like the passed lucene document ID.
 */
public Query like(int docNum) throws IOException {
  if (fieldNames == null) {
    // gather list of valid fields from lucene
    Collection<String> fields = MultiFields.getIndexedFields(ir);
    fieldNames = fields.toArray(new String[fields.size()]);
  }
  return createQuery(retrieveTerms(docNum));
}
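A hedged usage sketch for the like(...) methods above; the reader variable ir, the field names, and the document number are assumptions for illustration only:

// Build a MoreLikeThis query for an existing document and run it against the same reader.
MoreLikeThis mlt = new MoreLikeThis(ir);            // ir: an already-open IndexReader
mlt.setFieldNames(new String[] {"title", "body"});  // illustrative field names
Query query = mlt.like(42);                         // 42: an illustrative document id
TopDocs hits = new IndexSearcher(ir).search(query, 10);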
public void testThreeBlocks() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(random(), dir);
  List<String> terms = new ArrayList<>();
  for (int i = 0; i < 36; i++) {
    Document doc = new Document();
    String term = "" + (char) (97 + i);
    terms.add(term);
    if (VERBOSE) {
      System.out.println("i=" + i + " term=" + term);
    }
    doc.add(newTextField("field", term, Field.Store.NO));
    w.addDocument(doc);
  }
  for (int i = 0; i < 36; i++) {
    Document doc = new Document();
    String term = "m" + (char) (97 + i);
    terms.add(term);
    if (VERBOSE) {
      System.out.println("i=" + i + " term=" + term);
    }
    doc.add(newTextField("field", term, Field.Store.NO));
    w.addDocument(doc);
  }
  for (int i = 0; i < 36; i++) {
    Document doc = new Document();
    String term = "mo" + (char) (97 + i);
    terms.add(term);
    if (VERBOSE) {
      System.out.println("i=" + i + " term=" + term);
    }
    doc.add(newTextField("field", term, Field.Store.NO));
    w.addDocument(doc);
  }
  w.forceMerge(1);
  IndexReader r = w.getReader();
  TermsEnum te = MultiFields.getTerms(r, "field").iterator(null);

  if (VERBOSE) {
    while (te.next() != null) {
      System.out.println("TERM: " + te.ord() + " " + te.term().utf8ToString());
    }
  }

  assertTrue(te.seekExact(new BytesRef("mo")));
  assertEquals(27, te.ord());

  te.seekExact(90);
  assertEquals(new BytesRef("s"), te.term());

  testEnum(te, terms);

  r.close();
  w.close();
  dir.close();
}
public void test10kPulsed() throws Exception {
  // we always run this test with pulsing codec.
  Codec cp = _TestUtil.alwaysPostingsFormat(new Pulsing41PostingsFormat(1));

  File f = _TestUtil.getTempDir("10kpulsed");
  BaseDirectoryWrapper dir = newFSDirectory(f);
  dir.setCheckIndexOnClose(false); // we do this ourselves explicitly
  RandomIndexWriter iw =
      new RandomIndexWriter(
          random(),
          dir,
          newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())).setCodec(cp));

  Document document = new Document();
  FieldType ft = new FieldType(TextField.TYPE_STORED);

  switch (_TestUtil.nextInt(random(), 0, 2)) {
    case 0:
      ft.setIndexOptions(IndexOptions.DOCS_ONLY);
      break;
    case 1:
      ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
      break;
    default:
      ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
      break;
  }

  Field field = newField("field", "", ft);
  document.add(field);

  NumberFormat df = new DecimalFormat("00000", new DecimalFormatSymbols(Locale.ROOT));

  for (int i = 0; i < 10050; i++) {
    field.setStringValue(df.format(i));
    iw.addDocument(document);
  }

  IndexReader ir = iw.getReader();
  iw.close();

  TermsEnum te = MultiFields.getTerms(ir, "field").iterator(null);
  DocsEnum de = null;

  for (int i = 0; i < 10050; i++) {
    String expected = df.format(i);
    assertEquals(expected, te.next().utf8ToString());
    de = _TestUtil.docs(random(), te, null, de, 0);
    assertTrue(de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, de.nextDoc());
  }
  ir.close();

  _TestUtil.checkIndex(dir);
  dir.close();
}
/* Copied from lucene 4.2.x core */
private static long totalTermFreq(IndexReader r, String field, BytesRef text) throws IOException {
  final Terms terms = MultiFields.getTerms(r, field);
  if (terms != null) {
    final TermsEnum termsEnum = terms.iterator(null);
    if (termsEnum.seekExact(text, true)) {
      return termsEnum.totalTermFreq();
    }
  }
  return 0;
}
/**
 * @param reader the index reader to scan
 * @param lireFeature the features to compute distances against
 * @return the maximum distance found for normalizing.
 * @throws java.io.IOException
 */
@SuppressWarnings("unchecked")
private float[] findSimilar(IndexReader reader, LireFeature[] lireFeature) throws IOException {
  float[] maxDistance = new float[lireFeature.length];
  float[] overallMaxDistance = new float[lireFeature.length];

  for (int i = 0; i < overallMaxDistance.length; i++) {
    overallMaxDistance[i] = -1f;
    maxDistance[i] = -1f;
  }

  parDocs = new TreeSet[lireFeature.length];
  for (int i = 0; i < parDocs.length; i++) {
    parDocs[i] = new TreeSet<SimpleResult>();
  }

  // Needed for check whether the document is deleted.
  Bits liveDocs = MultiFields.getLiveDocs(reader);

  // clear result set ...
  int docs = reader.numDocs();
  for (int i = 0; i < docs; i++) {
    if (reader.hasDeletions() && !liveDocs.get(i)) continue; // if it is deleted, just ignore it.

    Document d = reader.document(i);
    float[] distance = getDistance(d, lireFeature);
    // calculate the overall max distance to normalize score afterwards
    for (int j = 0; j < distance.length; j++) {
      float f = distance[j];
      if (overallMaxDistance[j] < f) {
        overallMaxDistance[j] = f;
      }
      // if it is the first document:
      if (maxDistance[j] < 0) {
        maxDistance[j] = f;
      }
      // if the array is not full yet:
      if (this.parDocs[j].size() < maxHits) {
        this.parDocs[j].add(new SimpleResult(f, d));
        if (f > maxDistance[j]) {
          maxDistance[j] = f;
        }
      } else if (f < maxDistance[j]) {
        // if it is nearer to the sample than at least one of the current set:
        // remove the last one ...
        this.parDocs[j].remove(this.parDocs[j].last());
        // add the new one ...
        this.parDocs[j].add(new SimpleResult(f, d));
        // and set our new distance border ...
        maxDistance[j] = this.parDocs[j].last().getDistance();
      }
    }
  }
  return maxDistance;
}
private FixedBitSet createExpectedResult(
    String queryValue, boolean from, IndexReader topLevelReader, IndexIterationContext context)
    throws IOException {
  final Map<String, List<RandomDoc>> randomValueDocs;
  final Map<String, List<RandomDoc>> linkValueDocuments;
  if (from) {
    randomValueDocs = context.randomValueFromDocs;
    linkValueDocuments = context.toDocuments;
  } else {
    randomValueDocs = context.randomValueToDocs;
    linkValueDocuments = context.fromDocuments;
  }

  FixedBitSet expectedResult = new FixedBitSet(topLevelReader.maxDoc());
  List<RandomDoc> matchingDocs = randomValueDocs.get(queryValue);
  if (matchingDocs == null) {
    return new FixedBitSet(topLevelReader.maxDoc());
  }

  for (RandomDoc matchingDoc : matchingDocs) {
    for (String linkValue : matchingDoc.linkValues) {
      List<RandomDoc> otherMatchingDocs = linkValueDocuments.get(linkValue);
      if (otherMatchingDocs == null) {
        continue;
      }

      for (RandomDoc otherSideDoc : otherMatchingDocs) {
        DocsEnum docsEnum =
            MultiFields.getTermDocsEnum(
                topLevelReader,
                MultiFields.getLiveDocs(topLevelReader),
                "id",
                new BytesRef(otherSideDoc.id),
                0);
        assert docsEnum != null;
        int doc = docsEnum.nextDoc();
        expectedResult.set(doc);
      }
    }
  }
  return expectedResult;
}
private long getSumTermFrequency(IndexReader reader, String fieldName) {
  Terms collectionTermVector = null;
  try {
    collectionTermVector = MultiFields.getTerms(reader, fieldName);
    long totalTermFreq = collectionTermVector.getSumTotalTermFreq();
    return totalTermFreq;
  } catch (IOException e) {
    LOG.warn("Unable to get total term frequency, it might not be indexed");
  }
  return 0;
}
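The field-level sum above and a per-term total frequency (as in the helper copied from Lucene 4.2.x earlier) are typically combined into a collection-level term probability, for example for language-model smoothing. A minimal sketch of that combination, assuming a 4.x-style reader and illustrative names, not code from the original sources:

// P(term | collection) = totalTermFreq(term) / sumTotalTermFreq(field)
static double collectionProbability(IndexReader reader, String field, BytesRef term)
    throws IOException {
  Terms terms = MultiFields.getTerms(reader, field);
  if (terms == null || terms.getSumTotalTermFreq() <= 0) {
    return 0d; // field missing or frequencies not indexed
  }
  TermsEnum te = terms.iterator(null);
  if (!te.seekExact(term)) {
    return 0d; // term does not occur in this field
  }
  return (double) te.totalTermFreq() / (double) terms.getSumTotalTermFreq();
}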
public static long getTotalTermFreq(IndexReader reader, String field, BytesRef termtext)
    throws Exception {
  long totalTF = 0;
  try {
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    // walk the postings of this term and sum its frequency over all live documents
    DocsEnum de = MultiFields.getTermDocsEnum(reader, liveDocs, field, termtext);
    if (de != null) {
      while (de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        totalTF += de.freq();
      }
    }
    return totalTF;
  } catch (Exception e) {
    return 0;
  }
}
public void tttestGetDistribution() throws IOException {
  BufferedWriter bw = new BufferedWriter(new FileWriter("data.csv"));
  IndexReader reader = IndexReader.open(FSDirectory.open(new File(indexPath)));
  // get the first document:
  // if (!IndexReader.indexExists(reader.directory()))
  //   throw new FileNotFoundException("No index found at this specific location.");

  CEDD cedd1 = new CEDD();
  FCTH fcth1 = new FCTH();

  CEDD cedd2 = new CEDD();
  FCTH fcth2 = new FCTH();

  JCD jcd1 = new JCD();
  JCD jcd2 = new JCD();
  String[] cls;

  // Needed for check whether the document is deleted.
  Bits liveDocs = MultiFields.getLiveDocs(reader);

  int docs = reader.numDocs();
  for (int i = 0; i < docs; i++) {
    if (reader.hasDeletions() && !liveDocs.get(i)) continue; // if it is deleted, just ignore it.

    Document doc = reader.document(i);
    cls = doc.getValues(DocumentBuilder.FIELD_NAME_CEDD);
    if (cls != null && cls.length > 0) cedd1.setStringRepresentation(cls[0]);
    cls = doc.getValues(DocumentBuilder.FIELD_NAME_FCTH);
    if (cls != null && cls.length > 0) fcth1.setStringRepresentation(cls[0]);

    for (int j = i + 1; j < docs; j++) {
      if (reader.hasDeletions() && !liveDocs.get(j)) continue; // if it is deleted, just ignore it.

      Document doc2 = reader.document(j);
      cls = doc2.getValues(DocumentBuilder.FIELD_NAME_CEDD);
      if (cls != null && cls.length > 0) cedd2.setStringRepresentation(cls[0]);
      cls = doc2.getValues(DocumentBuilder.FIELD_NAME_FCTH);
      if (cls != null && cls.length > 0) fcth2.setStringRepresentation(cls[0]);

      jcd1.init(cedd1, fcth1);
      jcd2.init(cedd2, fcth2);
      bw.write(
          cedd1.getDistance(cedd2)
              + ";"
              + fcth1.getDistance(fcth2)
              + ";"
              + jcd1.getDistance(jcd2)
              + "\n");
    }
    if (i % 100 == 0) System.out.println(i + " entries processed ... ");
  }
  bw.close();
}
/**
 * We assume that the initial indexing has been done and a set of reference objects has been found
 * and indexed in the separate directory. However further documents were added and they now need
 * to get a ranked list of reference objects. So we (i) get all these new documents missing the
 * field "ro-order" and (ii) add this field.
 *
 * @param indexPath the index to update
 * @throws IOException
 */
public void updateIndex(String indexPath) throws IOException {
  IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));
  int numDocs = reader.numDocs();
  boolean hasDeletions = reader.hasDeletions();
  int countUpdated = 0;

  IndexReader readerRo = DirectoryReader.open(FSDirectory.open(new File(indexPath + "-ro")));
  ImageSearcher searcher =
      new GenericImageSearcher(numReferenceObjectsUsed, featureClass, featureFieldName);
  Map<String, Analyzer> perField = new HashMap<String, Analyzer>(1);
  perField.put("ro-order", new WhitespaceAnalyzer(LuceneUtils.LUCENE_VERSION));
  PerFieldAnalyzerWrapper aWrapper =
      new PerFieldAnalyzerWrapper(new SimpleAnalyzer(LuceneUtils.LUCENE_VERSION), perField);

  IndexWriter iw =
      new IndexWriter(
          FSDirectory.open(new File(indexPath)),
          new IndexWriterConfig(LuceneUtils.LUCENE_VERSION, aWrapper)
              .setOpenMode(IndexWriterConfig.OpenMode.CREATE));
  StringBuilder sb = new StringBuilder(256);
  // Needed for check whether the document is deleted.
  Bits liveDocs = MultiFields.getLiveDocs(reader);

  for (int i = 0; i < numDocs; i++) {
    if (reader.hasDeletions() && !liveDocs.get(i)) continue; // if it is deleted, just ignore it.

    Document document = reader.document(i);
    if (document.getField("ro-order") == null) { // if the field is not here we create it.
      ImageSearchHits hits = searcher.search(document, readerRo);
      sb.delete(0, sb.length());
      for (int j = 0; j < numReferenceObjectsUsed; j++) {
        sb.append(hits.doc(j).getValues("ro-id")[0]);
        sb.append(' ');
      }
      // System.out.println(sb.toString());
      document.add(new TextField("ro-order", sb.toString(), Field.Store.YES));
      iw.updateDocument(
          new Term(
              DocumentBuilder.FIELD_NAME_IDENTIFIER,
              document.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]),
          document);
      countUpdated++;
    }

    // progress report
    progress.setNumDocsProcessed(progress.getNumDocsProcessed() + 1);

    // debug: System.out.println("countUpdated = " + countUpdated);
  }
  iw.commit();
  iw.close();
}
/** Returns TermStats[] ordered by terms with highest docFreq first. */
public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String field)
    throws Exception {
  TermStatsQueue tiq = null;

  if (field != null) {
    Fields fields = MultiFields.getFields(reader);
    if (fields == null) {
      throw new RuntimeException("field " + field + " not found");
    }
    Terms terms = fields.terms(field);
    if (terms != null) {
      TermsEnum termsEnum = terms.iterator(null);
      tiq = new TermStatsQueue(numTerms);
      tiq.fill(field, termsEnum);
    }
  } else {
    Fields fields = MultiFields.getFields(reader);
    if (fields == null) {
      throw new RuntimeException("no fields found for this index");
    }
    tiq = new TermStatsQueue(numTerms);
    for (String fieldName : fields) {
      Terms terms = fields.terms(fieldName);
      if (terms != null) {
        tiq.fill(fieldName, terms.iterator(null));
      }
    }
  }

  TermStats[] result = new TermStats[tiq.size()];
  // we want highest first so we read the queue and populate the array
  // starting at the end and work backwards
  int count = tiq.size() - 1;
  while (tiq.size() != 0) {
    result[count] = tiq.pop();
    count--;
  }
  return result;
}
public static Terms getTermVector(String fieldname, SolrIndexSearcher solrIndexSearcher)
    throws JATEException {
  try {
    Fields fields = MultiFields.getFields(solrIndexSearcher.getLeafReader());
    Terms vector = fields.terms(fieldname);
    if (vector == null) {
      throw new JATEException(String.format("Cannot find expected field: %s", fieldname));
    }
    return vector;
  } catch (IOException ioe) {
    StringBuilder sb =
        new StringBuilder(
            String.format("Cannot find expected field: %s. Error stacktrace: \n", fieldname));
    sb.append(org.apache.commons.lang.exception.ExceptionUtils.getFullStackTrace(ioe));
    throw new JATEException(sb.toString());
  }
}
private Fields generateTermVectors(
    Collection<GetField> getFields,
    boolean withOffsets,
    @Nullable Map<String, String> perFieldAnalyzer)
    throws IOException {
  /* store document in memory index */
  MemoryIndex index = new MemoryIndex(withOffsets);
  for (GetField getField : getFields) {
    String field = getField.getName();
    Analyzer analyzer = getAnalyzerAtField(field, perFieldAnalyzer);
    for (Object text : getField.getValues()) {
      index.addField(field, text.toString(), analyzer);
    }
  }
  /* and read vectors from it */
  return MultiFields.getFields(index.createSearcher().getIndexReader());
}
public void testNonRootFloorBlocks() throws Exception {
  Directory dir = newDirectory();
  IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
  IndexWriter w = new IndexWriter(dir, iwc);
  List<String> terms = new ArrayList<>();
  for (int i = 0; i < 36; i++) {
    Document doc = new Document();
    String term = "" + (char) (97 + i);
    terms.add(term);
    if (VERBOSE) {
      System.out.println("i=" + i + " term=" + term);
    }
    doc.add(newTextField("field", term, Field.Store.NO));
    w.addDocument(doc);
  }
  for (int i = 0; i < 128; i++) {
    Document doc = new Document();
    String term = "m" + (char) i;
    terms.add(term);
    if (VERBOSE) {
      System.out.println("i=" + i + " term=" + term + " bytes=" + new BytesRef(term));
    }
    doc.add(newStringField("field", term, Field.Store.NO));
    w.addDocument(doc);
  }
  w.forceMerge(1);
  IndexReader r = DirectoryReader.open(w, true);
  TermsEnum te = MultiFields.getTerms(r, "field").iterator(null);

  BytesRef term;
  int ord = 0;
  while ((term = te.next()) != null) {
    if (VERBOSE) {
      System.out.println("TEST: " + te.ord() + ": " + term.utf8ToString());
    }
    assertEquals(ord, te.ord());
    ord++;
  }

  testEnum(te, terms);

  r.close();
  w.close();
  dir.close();
}
private int countTerms(MultiTermQuery q) throws Exception {
  final Terms terms = MultiFields.getTerms(reader, q.getField());
  if (terms == null) return 0;
  final TermsEnum termEnum = q.getTermsEnum(terms);
  assertNotNull(termEnum);
  int count = 0;
  BytesRef cur, last = null;
  while ((cur = termEnum.next()) != null) {
    count++;
    if (last != null) {
      assertTrue(last.compareTo(cur) < 0);
    }
    last = BytesRef.deepCopyOf(cur);
  }
  // LUCENE-3314: the results after next() already returned null are undefined,
  // assertNull(termEnum.next());
  return count;
}
@Override
public FieldStats.Long stats(IndexReader reader) throws IOException {
  int maxDoc = reader.maxDoc();
  Terms terms = org.apache.lucene.index.MultiFields.getTerms(reader, name());
  if (terms == null) {
    return null;
  }
  long minValue = LegacyNumericUtils.getMinInt(terms);
  long maxValue = LegacyNumericUtils.getMaxInt(terms);
  return new FieldStats.Long(
      maxDoc,
      terms.getDocCount(),
      terms.getSumDocFreq(),
      terms.getSumTotalTermFreq(),
      isSearchable(),
      isAggregatable(),
      minValue,
      maxValue);
}
/** Test the WordScorer emitted by the smoothing model */
public void testBuildWordScorer() throws IOException {
  SmoothingModel testModel = createTestModel();

  Map<String, Analyzer> mapping = new HashMap<>();
  mapping.put("field", new WhitespaceAnalyzer());
  PerFieldAnalyzerWrapper wrapper =
      new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(), mapping);
  IndexWriter writer = new IndexWriter(new RAMDirectory(), new IndexWriterConfig(wrapper));
  Document doc = new Document();
  doc.add(new Field("field", "someText", TextField.TYPE_NOT_STORED));
  writer.addDocument(doc);
  DirectoryReader ir = DirectoryReader.open(writer);

  WordScorer wordScorer =
      testModel
          .buildWordScorerFactory()
          .newScorer(
              ir, MultiFields.getTerms(ir, "field"), "field", 0.9d, BytesRefs.toBytesRef(" "));
  assertWordScorer(wordScorer, testModel);
}
public void testSeveralNonRootBlocks() throws Exception {
  Directory dir = newDirectory();
  IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
  IndexWriter w = new IndexWriter(dir, iwc);
  List<String> terms = new ArrayList<>();
  for (int i = 0; i < 30; i++) {
    for (int j = 0; j < 30; j++) {
      Document doc = new Document();
      String term = "" + (char) (97 + i) + (char) (97 + j);
      terms.add(term);
      if (VERBOSE) {
        System.out.println("term=" + term);
      }
      doc.add(newTextField("body", term, Field.Store.NO));
      w.addDocument(doc);
    }
  }
  w.forceMerge(1);
  IndexReader r = DirectoryReader.open(w, true);
  TermsEnum te = MultiFields.getTerms(r, "body").iterator(null);

  for (int i = 0; i < 30; i++) {
    for (int j = 0; j < 30; j++) {
      String term = "" + (char) (97 + i) + (char) (97 + j);
      if (VERBOSE) {
        System.out.println("TEST: check term=" + term);
      }
      assertEquals(term, te.next().utf8ToString());
      assertEquals(30 * i + j, te.ord());
    }
  }

  testEnum(te, terms);

  te.seekExact(0);
  assertEquals("aa", te.term().utf8ToString());

  r.close();
  w.close();
  dir.close();
}
@Override
protected ShardTermlistResponse shardOperation(ShardTermlistRequest request)
    throws ElasticSearchException {
  synchronized (termlistMutex) {
    InternalIndexShard indexShard =
        (InternalIndexShard)
            indicesService.indexServiceSafe(request.index()).shardSafe(request.shardId());
    indexShard.store().directory();
    Engine.Searcher searcher = indexShard.searcher();
    try {
      Set<String> set = new CompactHashSet();
      Fields fields = MultiFields.getFields(searcher.reader());
      if (fields != null) {
        for (Iterator<String> it = fields.iterator(); it.hasNext(); ) {
          String field = it.next();
          if (field.charAt(0) == '_') {
            continue;
          }
          if (request.getField() == null || field.equals(request.getField())) {
            Terms terms = fields.terms(field);
            if (terms != null) {
              TermsEnum termsEnum = terms.iterator(null);
              BytesRef text;
              while ((text = termsEnum.next()) != null) {
                set.add(text.utf8ToString());
                System.out.println("field=" + field + "; text=" + text.utf8ToString());
              }
            }
          }
        }
      }
      return new ShardTermlistResponse(request.index(), request.shardId(), set);
    } catch (IOException ex) {
      throw new ElasticSearchException(ex.getMessage(), ex);
    }
  }
}