private void remove(Class entity, Serializable id) { log.trace("remove from Lucene index: " + entity + "#" + id); DocumentBuilder builder = workspace.getDocumentBuilder(entity); Term term = builder.getTerm(id); IndexReader reader = workspace.getIndexReader(entity); TermDocs termDocs = null; try { // TODO is there a faster way? // TODO include TermDocs into the workspace? termDocs = reader.termDocs(term); String entityName = entity.getName(); while (termDocs.next()) { int docIndex = termDocs.doc(); if (entityName.equals(reader.document(docIndex).get(DocumentBuilder.CLASS_FIELDNAME))) { // remove only the one of the right class // loop all to remove all the matches (defensive code) reader.deleteDocument(docIndex); } } } catch (Exception e) { throw new HibernateException("Unable to remove from Lucene index: " + entity + "#" + id, e); } finally { if (termDocs != null) try { termDocs.close(); } catch (IOException e) { log.warn("Unable to close termDocs properly", e); } } }
/**
 * Creates the scorer, capturing the weight's value as the score and opening a term-less
 * {@code TermDocs} on the reader.
 *
 * @param reader the reader the document enumeration is opened on
 * @param similarity passed through to the superclass
 * @param w the weight; passed to the superclass and queried once for its value
 * @param norms stored as-is in the {@code norms} field
 * @throws IOException if the reader cannot open the enumeration
 */
MatchNoneScorer(IndexReader reader, Similarity similarity, Weight w, byte[] norms)
    throws IOException {
  super(similarity, w);
  this.norms = norms;
  score = w.getValue();
  // NOTE(review): per Lucene's IndexReader.termDocs(Term) contract a null term enumerates all
  // non-deleted documents - confirm this is what a "match none" scorer intends.
  this.termDocs = reader.termDocs(null);
}
/**
 * Builds a {@code MyTermScorer} over the postings of this weight's term.
 *
 * @param reader the reader to pull postings and norms from
 * @return the scorer, or {@code null} when the reader returns no {@code TermDocs} for the term
 * @throws IOException if the reader fails
 */
public Scorer scorer(IndexReader reader) throws IOException {
  final TermDocs postings = reader.termDocs(term);
  if (postings != null) {
    final byte[] fieldNorms = reader.norms(term.field());
    return new MyTermScorer(this, postings, similarity, fieldNorms);
  }
  return null;
}
@Override public DocIdSet getDocIdSet(IndexReader reader) throws IOException { StockAction action = getStockAction(); // retrieve the service long lastUpdateTime = action.geLastUpdateTime(); if (lastUpdateTime != this.lastUpdateTime) { cache.clear(); // clear outdated cache } DocIdSet cached = cache.get(reader); // check if in cache already if (cached != null) return cached; // not in cache, build info final BitSet bitSet = getAllPositiveBitSet(reader.maxDoc()); // by default, all documents pass Term clazzTerm = new Term(DocumentBuilder.CLASS_FIELDNAME, Item.class.getName()); if (reader.docFreq(clazzTerm) == 0) { // no need to filter // index does not contain Item objects // no-op } else { // for each item out of stock, find the corresponding document id by item id // and switch off the corresponding bit for (String ean : action.getEanOfItemsOutOfStock()) { // invoke external service Term term = new Term("ean", ean); TermDocs termDocs = reader.termDocs(term); // find document by ean while (termDocs.next()) { bitSet.clear(termDocs.doc()); } } } DocIdSet docIdSet = new DocIdBitSet(bitSet); // build DocIdSet from BitSet cache.put(reader, docIdSet); // put results in the cache this.lastUpdateTime = lastUpdateTime; // update timestamp return docIdSet; }
/**
 * Collects, for every document in the index, the terms of its {@code content} field together with
 * their in-document frequency.
 *
 * <p>Each list entry is the term text immediately followed by its frequency (for example
 * {@code "lucene3"}). Documents are keyed by their stored {@code relative_path} field.
 *
 * <p>I/O failures are printed to standard error and whatever was collected so far is returned.
 *
 * @param directory the index directory to read
 * @return map from relative path to the list of term+frequency strings; never {@code null}
 */
private static Map<String, List<String>> generate_result(Directory directory) {
  Map<String, List<String>> result_map = new HashMap<String, List<String>>();
  try {
    IndexReader reader = IndexReader.open(directory);
    try {
      TermEnum termEnum = reader.terms();
      try {
        while (termEnum.next()) {
          Term current = termEnum.term();
          // Ask the term for its field and text directly; parsing Term.toString() would
          // truncate any term text that itself contains a ':'.
          if (!"content".equals(current.field())) {
            continue;
          }
          String term = current.text();
          TermDocs termDocs = reader.termDocs(current);
          try {
            while (termDocs.next()) {
              Document doc = reader.document(termDocs.doc());
              String relative_path = doc.get("relative_path");
              List<String> entries = result_map.get(relative_path);
              if (entries == null) {
                entries = new ArrayList<String>();
                result_map.put(relative_path, entries);
              }
              // always record the term, including the first one seen for a path
              entries.add(term + termDocs.freq());
            }
          } finally {
            termDocs.close();
          }
        }
      } finally {
        termEnum.close();
      }
    } finally {
      reader.close();
    }
  } catch (IOException e) {
    e.printStackTrace();
  }
  return result_map;
}
/**
 * Looks up the first document the reader reports for the given term.
 *
 * @param reader the reader to query
 * @param term the term to look up
 * @return the first matching document number, or {@code NO_DOC} when the term has no postings
 * @throws IOException if the reader fails
 */
public static int docId(IndexReader reader, Term term) throws IOException {
  final TermDocs postings = reader.termDocs(term);
  try {
    return postings.next() ? postings.doc() : NO_DOC;
  } finally {
    postings.close();
  }
}
public void getIndexInfo(String indexdir, int freqThreshold) { IndexReader reader = null; try { Directory dir = FSDirectory.open(new File(indexdir)); System.out.println(dir); reader = IndexReader.open(dir); System.out.println("document num:" + reader.numDocs()); System.out.println("======================"); TermEnum terms = reader.terms(); sortedTermQueue.clear(); maxDocNum = reader.maxDoc(); linkMap.clear(); termList.clear(); while (terms.next()) { // System.out.print(terms.term() + "\tDocFreq:" + TermDocs termDocs = reader.termDocs(terms.term()); MyTerm temp = new MyTerm(terms.term(), termDocs, maxDocNum); if (temp.totalFreq < freqThreshold) { continue; } /* * if(temp.originTrem.text().length()==1){ continue; } */ linkMap.put(temp.originTrem.text(), temp); sortedTermQueue.add(temp); termList.add(temp); } System.out.println("total Size:" + sortedTermQueue.size()); System.out.println("mapsize:" + linkMap.keySet().size()); // System.exit(0); int num = 0; this.maxFreq = sortedTermQueue.peek().totalFreq; while (!sortedTermQueue.isEmpty()) { num++; System.out.println(num + ":" + sortedTermQueue.poll()); } System.out.println("read index info done"); } catch (IOException e) { e.printStackTrace(); } finally { try { reader.close(); } catch (IOException e) { e.printStackTrace(); } } }
@SuppressWarnings({"StringEquality"}) @Override public void run() { TermDocs termDocs = null; TermEnum termEnum = null; try { BloomFilter filter = BloomFilterFactory.getFilter(reader.numDocs(), 15); termDocs = reader.termDocs(); termEnum = reader.terms(new Term(field)); do { Term term = termEnum.term(); if (term == null || term.field() != field) break; // LUCENE MONITOR: 4.0, move to use bytes! UnicodeUtil.UTF8Result utf8Result = Unicode.fromStringAsUtf8(term.text()); termDocs.seek(termEnum); while (termDocs.next()) { // when traversing, make sure to ignore deleted docs, so the key->docId will be correct if (!reader.isDeleted(termDocs.doc())) { filter.add(utf8Result.result, 0, utf8Result.length); } } } while (termEnum.next()); ConcurrentMap<String, BloomFilterEntry> fieldCache = cache.get(reader.getFieldCacheKey()); if (fieldCache != null) { if (fieldCache.containsKey(field)) { BloomFilterEntry filterEntry = new BloomFilterEntry(reader.numDocs(), filter); filterEntry.loading.set(false); fieldCache.put(field, filterEntry); } } } catch (Exception e) { logger.warn("failed to load bloom filter for [{}]", e, field); } finally { try { if (termDocs != null) { termDocs.close(); } } catch (IOException e) { // ignore } try { if (termEnum != null) { termEnum.close(); } } catch (IOException e) { // ignore } } }
/**
 * Tests a filtering reader ({@code TestReader}) wrapped around a three-document index: the term
 * enumeration must only expose terms containing an 'e', the positions enumeration must only
 * expose odd document numbers, and {@code termDocs(null)} must still enumerate every document
 * with a frequency of 1.
 *
 * @throws Exception on error
 */
public void testFilterIndexReader() throws Exception {
  Directory directory = newDirectory();
  IndexWriter writer =
      new IndexWriter(
          directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)));

  Document d1 = new Document();
  d1.add(newField("default", "one two", Field.Store.YES, Field.Index.ANALYZED));
  writer.addDocument(d1);

  Document d2 = new Document();
  d2.add(newField("default", "one three", Field.Store.YES, Field.Index.ANALYZED));
  writer.addDocument(d2);

  Document d3 = new Document();
  d3.add(newField("default", "two four", Field.Store.YES, Field.Index.ANALYZED));
  writer.addDocument(d3);

  writer.close();

  IndexReader reader = new TestReader(IndexReader.open(directory, true));

  TermEnum terms = reader.terms();
  while (terms.next()) {
    assertTrue(terms.term().text().indexOf('e') != -1);
  }
  terms.close();

  TermPositions positions = reader.termPositions(new Term("default", "one"));
  while (positions.next()) {
    assertTrue((positions.doc() % 2) == 1);
  }
  // release the enumeration like the other two used in this test
  positions.close();

  int NUM_DOCS = 3;

  TermDocs td = reader.termDocs(null);
  for (int i = 0; i < NUM_DOCS; i++) {
    assertTrue(td.next());
    assertEquals(i, td.doc());
    assertEquals(1, td.freq());
  }
  td.close();
  reader.close();
  directory.close();
}
/**
 * Returns the set of documents that contain at least one of the terms enumerated by the wrapped
 * query for the given reader.
 *
 * @param reader the reader to enumerate terms and postings from
 * @return {@code DocIdSet.EMPTY_DOCIDSET} when the enumeration starts with no term, otherwise a
 *     bit set with one bit per matching document
 * @throws IOException if the reader fails
 */
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
  final TermEnum matchingTerms = this.query.getEnum(reader);
  try {
    if (matchingTerms.term() == null) {
      return DocIdSet.EMPTY_DOCIDSET;
    }
    final OpenBitSet result = new OpenBitSet(reader.maxDoc());
    // postings are read in bulk, 32 entries at a time; the frequencies are not used
    final int[] docBuffer = new int[32];
    final int[] freqBuffer = new int[32];
    final TermDocs postings = reader.termDocs();
    try {
      int visitedTerms = 0;
      Term current = matchingTerms.term();
      while (current != null) {
        visitedTerms++;
        postings.seek(current);
        int filled;
        while ((filled = postings.read(docBuffer, freqBuffer)) != 0) {
          for (int slot = 0; slot < filled; slot++) {
            result.set(docBuffer[slot]);
          }
        }
        current = matchingTerms.next() ? matchingTerms.term() : null;
      }
      this.query.incTotalNumberOfTerms(visitedTerms);
    } finally {
      postings.close();
    }
    return result;
  } finally {
    matchingTerms.close();
  }
}
/**
 * Returns an iterator over the documents the reader reports for this set's term.
 *
 * <p>The underlying {@code TermDocs} is closed as soon as {@code nextDoc()} or {@code advance()}
 * runs out of documents.
 *
 * @return an iterator over the matching documents, or the empty iterator when the reader returns
 *     no {@code TermDocs}
 * @throws IOException if the reader fails
 */
@Override
public DocIdSetIterator iterator() throws IOException {
  final TermDocs td = reader.termDocs(term);
  if (td == null) {
    return EmptyDocIdSet.getInstance().iterator();
  }
  return new DocIdSetIterator() {
    private int current = -1;

    /** Records the new position, or closes the postings and marks exhaustion. */
    private int settle(boolean positioned) throws IOException {
      if (positioned) {
        current = td.doc();
      } else {
        td.close();
        current = DocIdSetIterator.NO_MORE_DOCS;
      }
      return current;
    }

    @Override
    public int docID() {
      return current;
    }

    @Override
    public int nextDoc() throws IOException {
      return settle(td.next());
    }

    @Override
    public int advance(int target) throws IOException {
      return settle(td.skipTo(target));
    }
  };
}
/** * Returns a BitSet with true for documents which should be permitted in searchCallback results, * and false for those that should not. */ public BitSet bits(IndexReader reader) throws IOException { long start = System.currentTimeMillis(); BitSet bits = new BitSet(reader.maxDoc()); // TermEnum enumerator = // (null == lowerTerm ? reader.terms(new Term(fieldName, "")) : reader.terms(new // Term(fieldName, lowerTerm))); TermEnum enumerator = (null != lowerTerm ? reader.terms(new Term(fieldName, lowerTerm)) : reader.terms(new Term(fieldName, ""))); // coords = new HashMap(enumerator.docFreq()); try { if (enumerator.term() == null) { return bits; } boolean checkLower = false; if (!includeLower) // make adjustments to set to exclusive checkLower = true; TermDocs termDocs = reader.termDocs(); try { do { Term term = enumerator.term(); if (term != null && term.field().equals(fieldName)) { if (!checkLower || null == lowerTerm || term.text().compareTo(lowerTerm) > 0) { checkLower = false; if (upperTerm != null) { int compare = upperTerm.compareTo(term.text()); /* if beyond the upper term, or is exclusive and * this is equal to the upper term, break out */ if ((compare < 0) || (!includeUpper && compare == 0)) { break; } } /* we have a good term, find the docs */ termDocs.seek(enumerator.term()); while (termDocs.next()) { bits.set(termDocs.doc()); } } } else { break; } } while (enumerator.next()); } finally { termDocs.close(); } } finally { enumerator.close(); } long end = System.currentTimeMillis(); log.info("BoundaryBox Time Taken: " + (end - start)); return bits; }
/**
 * Creates an iterator positioned before the first document.
 *
 * @param reader the reader to open a term-less {@code TermDocs} on
 * @throws IOException if the reader cannot open the enumeration
 */
public MatchAllDocIdSetIterator(IndexReader reader) throws IOException {
  _docid = -1;
  // a null term asks the reader for the enumeration that is not bound to any term
  _termDocs = reader.termDocs(null);
}
public void testSkipTo(int indexDivisor) throws IOException { Directory dir = new RAMDirectory(); IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED); Term ta = new Term("content", "aaa"); for (int i = 0; i < 10; i++) addDoc(writer, "aaa aaa aaa aaa"); Term tb = new Term("content", "bbb"); for (int i = 0; i < 16; i++) addDoc(writer, "bbb bbb bbb bbb"); Term tc = new Term("content", "ccc"); for (int i = 0; i < 50; i++) addDoc(writer, "ccc ccc ccc ccc"); // assure that we deal with a single segment writer.optimize(); writer.close(); IndexReader reader = IndexReader.open(dir); reader.setTermInfosIndexDivisor(indexDivisor); assertEquals(indexDivisor, reader.getTermInfosIndexDivisor()); TermDocs tdocs = reader.termDocs(); // without optimization (assumption skipInterval == 16) // with next tdocs.seek(ta); assertTrue(tdocs.next()); assertEquals(0, tdocs.doc()); assertEquals(4, tdocs.freq()); assertTrue(tdocs.next()); assertEquals(1, tdocs.doc()); assertEquals(4, tdocs.freq()); assertTrue(tdocs.skipTo(0)); assertEquals(2, tdocs.doc()); assertTrue(tdocs.skipTo(4)); assertEquals(4, tdocs.doc()); assertTrue(tdocs.skipTo(9)); assertEquals(9, tdocs.doc()); assertFalse(tdocs.skipTo(10)); // without next tdocs.seek(ta); assertTrue(tdocs.skipTo(0)); assertEquals(0, tdocs.doc()); assertTrue(tdocs.skipTo(4)); assertEquals(4, tdocs.doc()); assertTrue(tdocs.skipTo(9)); assertEquals(9, tdocs.doc()); assertFalse(tdocs.skipTo(10)); // exactly skipInterval documents and therefore with optimization // with next tdocs.seek(tb); assertTrue(tdocs.next()); assertEquals(10, tdocs.doc()); assertEquals(4, tdocs.freq()); assertTrue(tdocs.next()); assertEquals(11, tdocs.doc()); assertEquals(4, tdocs.freq()); assertTrue(tdocs.skipTo(5)); assertEquals(12, tdocs.doc()); assertTrue(tdocs.skipTo(15)); assertEquals(15, tdocs.doc()); assertTrue(tdocs.skipTo(24)); assertEquals(24, tdocs.doc()); assertTrue(tdocs.skipTo(25)); assertEquals(25, 
tdocs.doc()); assertFalse(tdocs.skipTo(26)); // without next tdocs.seek(tb); assertTrue(tdocs.skipTo(5)); assertEquals(10, tdocs.doc()); assertTrue(tdocs.skipTo(15)); assertEquals(15, tdocs.doc()); assertTrue(tdocs.skipTo(24)); assertEquals(24, tdocs.doc()); assertTrue(tdocs.skipTo(25)); assertEquals(25, tdocs.doc()); assertFalse(tdocs.skipTo(26)); // much more than skipInterval documents and therefore with optimization // with next tdocs.seek(tc); assertTrue(tdocs.next()); assertEquals(26, tdocs.doc()); assertEquals(4, tdocs.freq()); assertTrue(tdocs.next()); assertEquals(27, tdocs.doc()); assertEquals(4, tdocs.freq()); assertTrue(tdocs.skipTo(5)); assertEquals(28, tdocs.doc()); assertTrue(tdocs.skipTo(40)); assertEquals(40, tdocs.doc()); assertTrue(tdocs.skipTo(57)); assertEquals(57, tdocs.doc()); assertTrue(tdocs.skipTo(74)); assertEquals(74, tdocs.doc()); assertTrue(tdocs.skipTo(75)); assertEquals(75, tdocs.doc()); assertFalse(tdocs.skipTo(76)); // without next tdocs.seek(tc); assertTrue(tdocs.skipTo(5)); assertEquals(26, tdocs.doc()); assertTrue(tdocs.skipTo(40)); assertEquals(40, tdocs.doc()); assertTrue(tdocs.skipTo(57)); assertEquals(57, tdocs.doc()); assertTrue(tdocs.skipTo(74)); assertEquals(74, tdocs.doc()); assertTrue(tdocs.skipTo(75)); assertEquals(75, tdocs.doc()); assertFalse(tdocs.skipTo(76)); tdocs.close(); reader.close(); dir.close(); }
/**
 * Command-line entry point: dumps the terms of one field from one or more indexes.
 *
 * <p>Options, which must precede the field name:
 *
 * <ul>
 *   <li>{@code -c}, {@code --count}: prefix each term with the number of postings read for it
 *   <li>{@code -v value}, {@code --value value}: instead of listing terms, list the documents
 *       containing exactly this term ({@code --vaue} is still accepted for backward
 *       compatibility)
 *   <li>{@code -a}, {@code --all}: with {@code -v}, print every stored field of each document
 *       instead of only its {@code url}
 *   <li>{@code -h}, {@code --help}: print usage and exit
 * </ul>
 *
 * <p>Indexes that cannot be opened are reported on standard error and skipped.
 *
 * @param args options, then the field name, then one or more index directories
 * @throws Exception if reading an opened index fails
 */
public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("TermDumper [-c|-v value] field <index...>");
    System.exit(1);
  }

  boolean count = false;
  String value = null;
  boolean all = false;

  int i = 0;
  for (; i < args.length; i++) {
    String arg = args[i];
    if ("-h".equals(arg) || "--help".equals(arg)) {
      System.err.println("TermDumper [-c|-v value] field <index...>");
      System.exit(1);
    } else if ("-c".equals(arg) || "--count".equals(arg)) {
      count = true;
    } else if ("-v".equals(arg) || "--value".equals(arg) || "--vaue".equals(arg)) {
      // the option needs an argument; bail out with usage instead of running off the array
      if (i + 1 >= args.length) {
        System.err.println("TermDumper [-c|-v value] field <index...>");
        System.exit(1);
      }
      value = args[++i];
    } else if ("-a".equals(arg) || "--all".equals(arg)) {
      all = true;
    } else {
      break;
    }
  }

  // every argument was consumed as an option: there is no field name left
  if (i >= args.length) {
    System.err.println("TermDumper [-c|-v value] field <index...>");
    System.exit(1);
  }
  String field = args[i++];

  java.util.ArrayList<IndexReader> readers =
      new java.util.ArrayList<IndexReader>(args.length - 1);
  try {
    for (; i < args.length; i++) {
      String arg = args[i];
      try {
        IndexReader reader = IndexReader.open(new MMapDirectory(new File(arg)), true);
        readers.add(reader);
      } catch (IOException ioe) {
        System.err.println("Error reading: " + arg);
      }
    }

    for (IndexReader reader : readers) {
      TermDocs termDocs = reader.termDocs();
      try {
        TermEnum termEnum = reader.terms(new Term(field));
        try {
          do {
            Term term = termEnum.term();
            if (term == null || !field.equals(term.field())) {
              break;
            }

            if (value == null) {
              if (count) {
                termDocs.seek(termEnum);
                int c = 0;
                while (termDocs.next()) {
                  c++;
                }
                System.out.print(c + " ");
              }
              System.out.println(term.text());
            } else if (value.equals(term.text())) {
              termDocs.seek(termEnum);
              while (termDocs.next()) {
                if (all) {
                  Document d = reader.document(termDocs.doc());
                  System.out.println(termDocs.doc());
                  for (Object o : d.getFields()) {
                    Field f = (Field) o;
                    System.out.println(f.name() + " " + d.get(f.name()));
                  }
                } else {
                  System.out.println(
                      termDocs.doc() + " " + reader.document(termDocs.doc()).get("url"));
                }
              }
            }
          } while (termEnum.next());
        } finally {
          termEnum.close();
        }
      } finally {
        // nested finally blocks guarantee both enumerations are closed even if one close fails
        termDocs.close();
      }
    }
  } finally {
    for (IndexReader reader : readers) {
      try {
        reader.close();
      } catch (IOException ioe) {
        System.err.println("Error closing reader: " + ioe);
      }
    }
  }
}
/**
 * Re-reads index statistics for a collection and stores them in {@code result}.
 *
 * <p>Collected data: number of terms overall and per field, document count, deletion / optimize
 * flags, index version, last-modified date, and the top-ranking terms of
 * {@code topRankingField} by in-document frequency.
 *
 * @param collectionName name of the collection to inspect; {@code null} yields {@code false}
 * @param topRankingField field whose terms are ranked; when {@code null} the collection's first
 *     default search field is used
 * @return {@code true} when the statistics were loaded; {@code false} when the collection or a
 *     usable ranking field is unknown, or the index could not be read
 */
@Override
public boolean reload(String collectionName, String topRankingField) {
  if (collectionName == null) {
    return false;
  }

  CrescentCollectionHandler collectionHandler =
      SpringApplicationContext.getBean(
          "crescentCollectionHandler", CrescentCollectionHandler.class);
  CrescentCollection collection =
      collectionHandler.getCrescentCollections().getCrescentCollection(collectionName);

  if (collection == null) {
    logger.debug("doesn't Collection Info => {}", collectionName);
    init(View.Overview);
    return false;
  }

  if (topRankingField == null) {
    // guard against an empty list as well as a null first element
    if (!collection.getDefaultSearchFields().isEmpty()
        && collection.getDefaultSearchFields().get(0) != null) {
      topRankingField = collection.getDefaultSearchFields().get(0).getName();
    } else {
      logger.debug("doesn't defaultSearchField => {}", collectionName);
      init(View.Overview);
      return false;
    }
  }

  List<String> fieldName = new ArrayList<String>();
  for (CrescentCollectionField field : collection.getFields()) {
    fieldName.add(field.getName());
  }

  TopRankingQueue topRankingQueue =
      new TopRankingQueue(DEFAULT_TOPRANKING_TERM, new RankingTermComparator());

  IndexReader reader = null;
  TermEnum terms = null;
  try {
    Directory directory = FSDirectory.open(new File(collection.getIndexingDirectory()));
    reader = IndexReader.open(directory);
    terms = reader.terms();

    int termFreq = 0; // total number of terms seen, across all fields
    int termCount = 0; // number of terms seen so far in the current field
    Term beforeTerm = null;

    // init term count
    fieldTermCount.clear();
    for (CrescentCollectionField field : collection.getFields()) {
      fieldTermCount.put(field.getName(), 0);
    }
    topRankingQueue.clear();

    while (terms.next()) {
      Term currTerm = terms.term();
      if (beforeTerm == null) {
        beforeTerm = currTerm;
      }

      if (beforeTerm.field().equals(currTerm.field())) {
        termCount++;
      } else {
        // the enumeration moved on to another field: flush the finished field's count
        fieldTermCount.put(beforeTerm.field(), termCount);
        termCount = 1;
        beforeTerm = currTerm;
      }

      // postings are only needed for the field being ranked
      if (currTerm.field().equals(topRankingField)) {
        TermDocs termDocs = reader.termDocs(currTerm);
        try {
          while (termDocs.next()) {
            RankingTerm e = new RankingTerm(currTerm.text(), currTerm.field(), termDocs.freq());
            topRankingQueue.add(e);
          }
        } finally {
          termDocs.close();
        }
      }
      termFreq++;
    }
    if (beforeTerm != null) {
      fieldTermCount.put(beforeTerm.field(), termCount);
    }

    result.put("numOfTerm", termFreq);
    result.put("numOfDoc", reader.numDocs());
    result.put("hasDel", reader.hasDeletions());
    result.put("isOptimize", reader.isOptimized());
    result.put("indexVersion", reader.getVersion());
    result.put("lastModify", new Date(IndexReader.lastModified(directory)));
  } catch (IOException e) {
    logger.error("failed to read index info for collection " + collectionName, e);
    return false;
  } finally {
    if (terms != null) {
      try {
        terms.close();
      } catch (IOException e) {
        logger.warn("failed to close term enumeration for collection " + collectionName, e);
      }
    }
    if (reader != null) {
      try {
        reader.close();
      } catch (IOException e) {
        logger.warn("failed to close index reader for collection " + collectionName, e);
      }
    }
  }

  if (topRankingQueue.size() != 0) {
    topRankingTerms = topRankingQueue.toArray();
    Arrays.sort(topRankingTerms);
  }

  result.put("collectionName", collectionName);
  result.put("indexName", collection.getIndexingDirectory());
  result.put("numOfField", collection.getFields().size());
  result.put("termCount", fieldTermCount);
  result.put("topRanking", topRankingTerms);
  result.put("fieldName", fieldName);

  return true;
}
/** * loads multi-value facet data. This method uses a workarea to prepare loading. * * @param fieldName * @param reader * @param listFactory * @param workArea * @throws IOException */ public void load( String fieldName, IndexReader reader, TermListFactory<T> listFactory, WorkArea workArea) throws IOException { long t0 = System.currentTimeMillis(); int maxdoc = reader.maxDoc(); BufferedLoader loader = getBufferedLoader(maxdoc, workArea); TermEnum tenum = null; TermDocs tdoc = null; TermValueList<T> list = (listFactory == null ? (TermValueList<T>) new TermStringList() : listFactory.createTermList()); IntArrayList minIDList = new IntArrayList(); IntArrayList maxIDList = new IntArrayList(); IntArrayList freqList = new IntArrayList(); OpenBitSet bitset = new OpenBitSet(maxdoc + 1); int negativeValueCount = getNegativeValueCount(reader, fieldName.intern()); int t = 0; // current term number list.add(null); minIDList.add(-1); maxIDList.add(-1); freqList.add(0); t++; _overflow = false; try { tdoc = reader.termDocs(); tenum = reader.terms(new Term(fieldName, "")); if (tenum != null) { do { Term term = tenum.term(); if (term == null || !fieldName.equals(term.field())) break; String val = term.text(); if (val != null) { list.add(val); tdoc.seek(tenum); // freqList.add(tenum.docFreq()); // removed because the df doesn't take into account // the num of deletedDocs int df = 0; int minID = -1; int maxID = -1; int valId = (t - 1 < negativeValueCount) ? 
(negativeValueCount - t + 1) : t; if (tdoc.next()) { df++; int docid = tdoc.doc(); if (!loader.add(docid, valId)) logOverflow(fieldName); minID = docid; bitset.fastSet(docid); while (tdoc.next()) { df++; docid = tdoc.doc(); if (!loader.add(docid, valId)) logOverflow(fieldName); bitset.fastSet(docid); } maxID = docid; } freqList.add(df); minIDList.add(minID); maxIDList.add(maxID); } t++; } while (tenum.next()); } } finally { try { if (tdoc != null) { tdoc.close(); } } finally { if (tenum != null) { tenum.close(); } } } list.seal(); try { _nestedArray.load(maxdoc + 1, loader); } catch (IOException e) { throw e; } catch (Exception e) { throw new RuntimeException("failed to load due to " + e.toString(), e); } this.valArray = list; this.freqs = freqList.toIntArray(); this.minIDs = minIDList.toIntArray(); this.maxIDs = maxIDList.toIntArray(); int doc = 0; while (doc <= maxdoc && !_nestedArray.contains(doc, 0, true)) { ++doc; } if (doc <= maxdoc) { this.minIDs[0] = doc; doc = maxdoc; while (doc > 0 && !_nestedArray.contains(doc, 0, true)) { --doc; } if (doc > 0) { this.maxIDs[0] = doc; } } this.freqs[0] = maxdoc + 1 - (int) bitset.cardinality(); }
/**
 * Delegates to the wrapped {@code indexReader}.
 *
 * @param term the term to enumerate documents for
 * @return whatever the wrapped reader returns for the term
 * @throws IOException if the wrapped reader fails
 * @see LuceneIndexReader#termDocs()
 */
public TermDocs termDocs(Term term) throws IOException {
  final TermDocs delegated = indexReader.termDocs(term);
  return delegated;
}
/**
 * Tutorial entry point: opens the index in the {@code index} directory, prints a few facts about
 * it, then answers interactive queries read from standard input until {@code quit} is entered or
 * the input ends.
 *
 * @param args unused
 * @throws Exception if the index cannot be opened or read
 */
public static void main(String[] args) throws Exception {
  // the IndexReader object is the main handle that will give you
  // all the documents, terms and inverted index
  IndexReader r = IndexReader.open(FSDirectory.open(new File("index")));
  try {
    // You can figure out the number of documents using the maxDoc() function
    System.out.println("The number of documents in this index is: " + r.maxDoc());

    int i = 0;
    // You can find out all the terms that have been indexed using the terms() function
    TermEnum t = r.terms();
    try {
      while (t.next()) {
        // Since there are so many terms, let us try printing only term #100000-#100010
        if (i > 100000) {
          System.out.println("[" + i + "] " + t.term().text());
        }
        if (++i > 100010) {
          break;
        }
      }
    } finally {
      t.close();
    }

    // You can create your own query terms by calling the Term constructor, with the field
    // 'contents'
    // In the following example, the query term is 'brute'
    Term te = new Term("contents", "brute");

    // You can also quickly find out the number of documents that have term t
    System.out.println("Number of documents with the word 'brute' is: " + r.docFreq(te));

    // You can use the inverted index to find out all the documents that contain the term 'brute'
    // by using the termDocs function
    TermDocs td = r.termDocs(te);
    try {
      while (td.next()) {
        System.out.println(
            "Document number ["
                + td.doc()
                + "] contains the term 'brute' "
                + td.freq()
                + " time(s).");
      }
    } finally {
      td.close();
    }

    // You can find the URL of the a specific document number using the document() function
    // For example, the URL for document number 14191 is:
    Document d = r.document(14191);
    String url =
        d.getFieldable("path")
            .stringValue(); // the 'path' field of the Document object holds the URL
    System.out.println(url.replace("%%", "/"));

    // -------- Now let us use all of the functions above to make something useful --------
    // The following bit of code is a worked out example of how to get a bunch of documents
    // in response to a query and show them (without ranking them according to TF/IDF)
    Scanner sc = new Scanner(System.in);
    String str = "";
    System.out.print("query> ");
    // hasNextLine() ends the loop cleanly on end of input instead of throwing
    while (sc.hasNextLine() && !(str = sc.nextLine()).equals("quit")) {
      String[] terms = str.split("\\s+");
      for (String word : terms) {
        Term term = new Term("contents", word);
        TermDocs tdocs = r.termDocs(term);
        try {
          while (tdocs.next()) {
            String d_url =
                r.document(tdocs.doc()).getFieldable("path").stringValue().replace("%%", "/");
            System.out.println("[" + tdocs.doc() + "] " + d_url);
          }
        } finally {
          tdocs.close();
        }
      }
      System.out.print("query> ");
    }
  } finally {
    r.close();
  }
}
private static float[] getFloats(FileFloatSource ffs, IndexReader reader) { float[] vals = new float[reader.maxDoc()]; if (ffs.defVal != 0) { Arrays.fill(vals, ffs.defVal); } InputStream is; String fname = "external_" + ffs.field.getName(); try { is = VersionedFile.getLatestFile(ffs.dataDir, fname); } catch (IOException e) { // log, use defaults SolrCore.log.error("Error opening external value source file: " + e); return vals; } BufferedReader r = new BufferedReader(new InputStreamReader(is)); String idName = StringHelper.intern(ffs.keyField.getName()); FieldType idType = ffs.keyField.getType(); boolean sorted = true; // assume sorted until we discover it's not // warning: lucene's termEnum.skipTo() is not optimized... it simply does a next() // because of this, simply ask the reader for a new termEnum rather than // trying to use skipTo() List<String> notFound = new ArrayList<String>(); int notFoundCount = 0; int otherErrors = 0; TermDocs termDocs = null; Term protoTerm = new Term(idName, ""); TermEnum termEnum = null; // Number of times to try termEnum.next() before resorting to skip int numTimesNext = 10; char delimiter = '='; String termVal; boolean hasNext = true; String prevKey = ""; String lastVal = "\uFFFF\uFFFF\uFFFF\uFFFF\uFFFF\uFFFF\uFFFF\uFFFF"; try { termDocs = reader.termDocs(); termEnum = reader.terms(protoTerm); Term t = termEnum.term(); if (t != null && t.field() == idName) { // intern'd comparison termVal = t.text(); } else { termVal = lastVal; } for (String line; (line = r.readLine()) != null; ) { int delimIndex = line.indexOf(delimiter); if (delimIndex < 0) continue; int endIndex = line.length(); /* EOLs should already be removed for BufferedReader.readLine() for(int endIndex = line.length();endIndex>delimIndex+1; endIndex--) { char ch = line.charAt(endIndex-1); if (ch!='\n' && ch!='\r') break; } */ String key = line.substring(0, delimIndex); String val = line.substring(delimIndex + 1, endIndex); String internalKey = idType.toInternal(key); 
float fval; try { fval = Float.parseFloat(val); } catch (Exception e) { if (++otherErrors <= 10) { SolrCore.log.error( "Error loading external value source + fileName + " + e + (otherErrors < 10 ? "" : "\tSkipping future errors for this file.")); } continue; // go to next line in file.. leave values as default. } if (sorted) { // make sure this key is greater than the previous key sorted = internalKey.compareTo(prevKey) >= 0; prevKey = internalKey; if (sorted) { int countNext = 0; for (; ; ) { int cmp = internalKey.compareTo(termVal); if (cmp == 0) { termDocs.seek(termEnum); while (termDocs.next()) { vals[termDocs.doc()] = fval; } break; } else if (cmp < 0) { // term enum has already advanced past current key... we didn't find it. if (notFoundCount < 10) { // collect first 10 not found for logging notFound.add(key); } notFoundCount++; break; } else { // termEnum is less than our current key, so skip ahead // try next() a few times to see if we hit or pass the target. // Lucene's termEnum.skipTo() is currently unoptimized (it just does next()) // so the best thing is to simply ask the reader for a new termEnum(target) // if we really need to skip. if (++countNext > numTimesNext) { termEnum = reader.terms(protoTerm.createTerm(internalKey)); t = termEnum.term(); } else { hasNext = termEnum.next(); t = hasNext ? 
termEnum.term() : null; } if (t != null && t.field() == idName) { // intern'd comparison termVal = t.text(); } else { termVal = lastVal; } } } // end for(;;) } } if (!sorted) { termEnum = reader.terms(protoTerm.createTerm(internalKey)); t = termEnum.term(); if (t != null && t.field() == idName // intern'd comparison && internalKey.equals(t.text())) { termDocs.seek(termEnum); while (termDocs.next()) { vals[termDocs.doc()] = fval; } } else { if (notFoundCount < 10) { // collect first 10 not found for logging notFound.add(key); } notFoundCount++; } } } } catch (IOException e) { // log, use defaults SolrCore.log.error("Error loading external value source: " + e); } finally { // swallow exceptions on close so we don't override any // exceptions that happened in the loop if (termDocs != null) try { termDocs.close(); } catch (Exception e) { } if (termEnum != null) try { termEnum.close(); } catch (Exception e) { } try { r.close(); } catch (Exception e) { } } SolrCore.log.info( "Loaded external value source " + fname + (notFoundCount == 0 ? "" : " :" + notFoundCount + " missing keys " + notFound)); return vals; }
/**
 * Opens a {@code TermDocs} on the reader using this object's {@code buffer}.
 *
 * <p>When {@code term} is {@code null} the two-argument overload is called with an explicit
 * {@code null} term; otherwise the buffer-only overload is called.
 *
 * <p>NOTE(review): these overloads are not part of the stock Lucene {@code IndexReader};
 * confirm their contract against the reader class actually used here.
 *
 * @param reader the reader to open the enumeration on
 * @return the enumeration returned by the reader
 * @throws IOException if the reader fails
 */
protected TermDocs termDocs(IndexReader reader) throws IOException {
  if (term == null) {
    return reader.termDocs(null, this.buffer);
  }
  return reader.termDocs(this.buffer);
}
/**
 * Returns the {@code TermDocs} for term {@code t}, creating it on first use and re-seeking the
 * existing one on every later call.
 *
 * @return the shared enumeration, positioned for {@code t}
 * @throws IOException if the reader or the seek fails
 */
public TermDocs getTermDocs() throws IOException {
  if (termDocs != null) {
    // already opened once: reposition it rather than opening a second enumeration
    termDocs.seek(t);
    return termDocs;
  }
  termDocs = reader.termDocs(t, 102400);
  return termDocs;
}