private static final int[] computeMultivaluedTD(ReaderAbstract reader, String fieldName,
    FieldCacheIndex stringIndex, DocIdInterface docIdInterface) throws IOException, SearchLibException {
  int[] countIndex = new int[stringIndex.lookup.length];
  int indexPos = 0;
  if (docIdInterface.getSize() == 0)
    return countIndex;
  int[] docs = new int[100];
  int[] freqs = new int[100];
  BitSetInterface bitset = docIdInterface.getBitSet();
  Term oTerm = new Term(fieldName);
  for (String term : stringIndex.lookup) {
    if (term != null) {
      Term t = oTerm.createTerm(term);
      TermDocs termDocs = reader.getTermDocs(t);
      int l;
      while ((l = termDocs.read(docs, freqs)) > 0)
        for (int i = 0; i < l; i++)
          if (freqs[i] > 0)
            if (bitset.get(docs[i]))
              countIndex[indexPos]++;
      termDocs.close();
    }
    indexPos++;
  }
  return countIndex;
}
@Override
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
  StockAction action = getStockAction(); // retrieve the service
  long lastUpdateTime = action.geLastUpdateTime();
  if (lastUpdateTime != this.lastUpdateTime) {
    cache.clear(); // clear outdated cache
  }
  DocIdSet cached = cache.get(reader); // check if in cache already
  if (cached != null)
    return cached;
  // not in cache, build info
  final BitSet bitSet = getAllPositiveBitSet(reader.maxDoc()); // by default, all documents pass
  Term clazzTerm = new Term(DocumentBuilder.CLASS_FIELDNAME, Item.class.getName());
  if (reader.docFreq(clazzTerm) == 0) {
    // index does not contain Item objects: no need to filter
  } else {
    // for each item out of stock, find the corresponding document id by item id
    // and switch off the corresponding bit
    for (String ean : action.getEanOfItemsOutOfStock()) { // invoke external service
      Term term = new Term("ean", ean);
      TermDocs termDocs = reader.termDocs(term); // find document by ean
      while (termDocs.next()) {
        bitSet.clear(termDocs.doc());
      }
    }
  }
  DocIdSet docIdSet = new DocIdBitSet(bitSet); // build DocIdSet from BitSet
  cache.put(reader, docIdSet); // put results in the cache
  this.lastUpdateTime = lastUpdateTime; // update timestamp
  return docIdSet;
}
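At search time a filter like this is typically passed alongside the main query. A minimal usage sketch, assuming Lucene 3.x and that the class containing the getDocIdSet above is called StockFilter (a hypothetical name):

// hypothetical usage of the stock filter above (classes from org.apache.lucene.search)
IndexSearcher searcher = new IndexSearcher(reader);        // reader: an open IndexReader
Query query = new TermQuery(new Term("title", "shoes"));   // user query; field name is an assumption
Filter stockFilter = new StockFilter();                    // hypothetical constructor for the filter above
TopDocs hits = searcher.search(query, stockFilter, 10);    // out-of-stock items are filtered out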
private void remove(Class entity, Serializable id) {
  log.trace("remove from Lucene index: " + entity + "#" + id);
  DocumentBuilder builder = workspace.getDocumentBuilder(entity);
  Term term = builder.getTerm(id);
  IndexReader reader = workspace.getIndexReader(entity);
  TermDocs termDocs = null;
  try {
    // TODO is there a faster way?
    // TODO include TermDocs into the workspace?
    termDocs = reader.termDocs(term);
    String entityName = entity.getName();
    while (termDocs.next()) {
      int docIndex = termDocs.doc();
      if (entityName.equals(reader.document(docIndex).get(DocumentBuilder.CLASS_FIELDNAME))) {
        // remove only the one of the right class
        // loop all to remove all the matches (defensive code)
        reader.deleteDocument(docIndex);
      }
    }
  } catch (Exception e) {
    throw new HibernateException("Unable to remove from Lucene index: " + entity + "#" + id, e);
  } finally {
    if (termDocs != null)
      try {
        termDocs.close();
      } catch (IOException e) {
        log.warn("Unable to close termDocs properly", e);
      }
  }
}
private static Map<String, List<String>> generate_result(Directory directory) {
  Map<String, List<String>> result_map = new HashMap<String, List<String>>();
  IndexReader reader = null;
  try {
    reader = IndexReader.open(directory);
    TermEnum termEnum = reader.terms();
    while (termEnum.next()) {
      String termEnumString = termEnum.term().toString();
      if (termEnumString.startsWith("content:")) {
        String term = termEnumString.substring(termEnumString.lastIndexOf(":") + 1);
        TermDocs termDocs = reader.termDocs(termEnum.term());
        while (termDocs.next()) {
          Document doc = reader.document(termDocs.doc());
          String relative_path = doc.get("relative_path");
          List<String> terms = result_map.get(relative_path);
          if (terms == null) {
            // first term for this path: create the list before adding,
            // so the current term is not silently dropped
            terms = new ArrayList<String>();
            result_map.put(relative_path, terms);
          }
          terms.add(term + termDocs.freq());
        }
      }
    }
  } catch (IOException e) {
    e.printStackTrace();
  } finally {
    if (reader != null)
      try {
        reader.close();
      } catch (IOException e) {
        // ignore
      }
  }
  return result_map;
}
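A rough usage sketch for the method above, assuming Lucene 3.x and a throwaway in-memory index; the field names match the snippet, everything else is illustrative:

// index one small document in RAM, then dump "term + frequency" entries per path
Directory dir = new RAMDirectory();
IndexWriter writer = new IndexWriter(dir,
    new IndexWriterConfig(Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36)));
Document doc = new Document();
doc.add(new Field("content", "hello hello world", Field.Store.NO, Field.Index.ANALYZED));
doc.add(new Field("relative_path", "docs/a.txt", Field.Store.YES, Field.Index.NOT_ANALYZED));
writer.addDocument(doc);
writer.close();
Map<String, List<String>> result = generate_result(dir);
System.out.println(result); // roughly {docs/a.txt=[hello2, world1]}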
private final DocumentTable more(Term term) throws IOException {
  final TermDocs docs = state.getIndexReader().termDocs(term);
  if (!docs.next())
    throw new EvalException("no such document");
  final MoreLikeThis mlt = new MoreLikeThis(state.getIndexReader());
  mlt.setFieldNames(FIELDS);
  mlt.setMinDocFreq(1);
  mlt.setMinTermFreq(1);
  final Query query = mlt.like(docs.doc());
  return new SearchTable(view, state, query);
}
public static int docId(IndexReader reader, Term term) throws IOException {
  TermDocs termDocs = reader.termDocs(term);
  try {
    if (termDocs.next()) {
      return termDocs.doc();
    }
    return NO_DOC;
  } finally {
    termDocs.close();
  }
}
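A short usage sketch for the helper above; the "id" field name is an assumption, NO_DOC is the sentinel from the snippet:

// look up a single document by a unique-key term
int doc = docId(reader, new Term("id", "42"));   // "id" is an assumed unique-key field
if (doc != NO_DOC) {
  Document document = reader.document(doc);
  System.out.println("found: " + document);
} else {
  System.out.println("no document for that id");
}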
@SuppressWarnings({"StringEquality"}) @Override public void run() { TermDocs termDocs = null; TermEnum termEnum = null; try { BloomFilter filter = BloomFilterFactory.getFilter(reader.numDocs(), 15); termDocs = reader.termDocs(); termEnum = reader.terms(new Term(field)); do { Term term = termEnum.term(); if (term == null || term.field() != field) break; // LUCENE MONITOR: 4.0, move to use bytes! UnicodeUtil.UTF8Result utf8Result = Unicode.fromStringAsUtf8(term.text()); termDocs.seek(termEnum); while (termDocs.next()) { // when traversing, make sure to ignore deleted docs, so the key->docId will be correct if (!reader.isDeleted(termDocs.doc())) { filter.add(utf8Result.result, 0, utf8Result.length); } } } while (termEnum.next()); ConcurrentMap<String, BloomFilterEntry> fieldCache = cache.get(reader.getFieldCacheKey()); if (fieldCache != null) { if (fieldCache.containsKey(field)) { BloomFilterEntry filterEntry = new BloomFilterEntry(reader.numDocs(), filter); filterEntry.loading.set(false); fieldCache.put(field, filterEntry); } } } catch (Exception e) { logger.warn("failed to load bloom filter for [{}]", e, field); } finally { try { if (termDocs != null) { termDocs.close(); } } catch (IOException e) { // ignore } try { if (termEnum != null) { termEnum.close(); } } catch (IOException e) { // ignore } } }
/**
 * Gets the global term frequency of a term, i.e. how many times it occurs in the whole corpus.
 *
 * @param term the term whose frequency you want
 * @return global term frequency of the term, or 1 if unavailable
 */
private int getGlobalTermFreq(Term term) {
  int tf = 0;
  try {
    TermDocs tDocs = this.indexReader.termDocs(term);
    if (tDocs == null) {
      logger.info("Couldn't get term frequency for term " + term.text());
      return 1;
    }
    while (tDocs.next()) {
      tf += tDocs.freq();
    }
  } catch (IOException e) {
    logger.info("Couldn't get term frequency for term " + term.text());
    return 1;
  }
  return tf;
}
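As an illustration of how the collection frequency returned above might be used, here is a hedged sketch that down-weights very common terms; the field name and the weighting formula are assumptions, not part of the original class:

// hypothetical: weight candidate terms by inverse collection frequency
for (String token : new String[] {"lucene", "the"}) {
  Term term = new Term("contents", token);        // field name is an assumption
  int ctf = getGlobalTermFreq(term);              // total occurrences in the corpus, or 1 if unknown
  double weight = 1.0 / Math.log(1.0 + ctf);      // rarer terms get larger weights
  System.out.println(token + " -> ctf=" + ctf + ", weight=" + weight);
}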
public MyTerm(Term originTrem, TermDocs termDocs, int maxDocNum) throws IOException {
  super();
  this.originTrem = originTrem;
  this.termDocs = termDocs;
  this.totalFreq = 0;
  // record per-document frequencies and the total frequency of this term
  while (this.termDocs.next()) {
    int docNum = termDocs.doc();
    int freq = termDocs.freq();
    this.termMap.put(docNum, freq);
    this.totalFreq += freq;
  }
  // build a dense document vector: frequency per docId, 0 where the term is absent
  this.vector = new int[maxDocNum];
  for (int i = 0; i < maxDocNum; i++) {
    this.vector[i] = 0;
  }
  for (int k : this.termMap.keySet()) {
    this.vector[k] = (int) this.termMap.get(k);
  }
}
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
  TermEnum enumerator = this.query.getEnum(reader);
  try {
    if (enumerator.term() == null) {
      return DocIdSet.EMPTY_DOCIDSET;
    }
    OpenBitSet bitSet = new OpenBitSet(reader.maxDoc());
    int[] docs = new int[32];
    int[] freqs = new int[32];
    TermDocs termDocs = reader.termDocs();
    int termCount;
    try {
      termCount = 0;
      do {
        Term term = enumerator.term();
        if (term == null)
          break;
        termCount++;
        termDocs.seek(term);
        while (true) {
          int count = termDocs.read(docs, freqs);
          if (count == 0)
            break;
          for (int i = 0; i < count; i++) {
            bitSet.set(docs[i]);
          }
        }
      } while (enumerator.next());
      this.query.incTotalNumberOfTerms(termCount);
    } finally {
      termDocs.close();
    }
    return bitSet;
  } finally {
    enumerator.close();
  }
}
private static final int[] toDocArray(ReaderLocal reader, DocumentsRequest request) throws IOException {
  SchemaField schemaField = null;
  Schema schema = request.getConfig().getSchema();
  String field = request.getField();
  if (!StringUtils.isEmpty(field)) {
    schemaField = schema.getField(field);
    if (schemaField == null)
      throw new IOException("Field not found: " + field);
  } else {
    schemaField = schema.getFieldList().getUniqueField();
    if (schemaField == null)
      throw new IOException("No unique field");
  }
  int higher = -1;
  RoaringBitmap bitSet = new RoaringBitmap();
  String fieldName = schemaField.getName();
  for (String uniqueKey : request.getUniqueKeyList()) {
    TermDocs termDocs = reader.getTermDocs(new Term(fieldName, uniqueKey));
    if (termDocs != null) {
      while (termDocs.next()) {
        int doc = termDocs.doc();
        if (doc > higher)
          higher = doc;
        bitSet.add(doc);
      }
      // close inside the null check, otherwise a missing TermDocs would trigger a NullPointerException
      termDocs.close();
    }
  }
  if (request.isReverse())
    bitSet.flip(0, higher + 1);
  IntBufferedArrayInterface intBufferArray =
      IntBufferedArrayFactory.INSTANCE.newInstance(bitSet.getCardinality());
  IntIterator iterator = bitSet.getIntIterator();
  while (iterator.hasNext()) {
    int docId = iterator.next();
    if (!reader.isDeletedNoLock(docId))
      intBufferArray.add(docId);
  }
  return intBufferArray.getFinalArray();
}
/**
 * Returns a BitSet with true for documents which should be permitted in searchCallback results,
 * and false for those that should not.
 */
public BitSet bits(IndexReader reader) throws IOException {
  long start = System.currentTimeMillis();
  BitSet bits = new BitSet(reader.maxDoc());
  // TermEnum enumerator =
  //     (null == lowerTerm ? reader.terms(new Term(fieldName, "")) : reader.terms(new Term(fieldName, lowerTerm)));
  TermEnum enumerator =
      (null != lowerTerm
          ? reader.terms(new Term(fieldName, lowerTerm))
          : reader.terms(new Term(fieldName, "")));
  // coords = new HashMap(enumerator.docFreq());
  try {
    if (enumerator.term() == null) {
      return bits;
    }
    boolean checkLower = false;
    if (!includeLower) // make adjustments to set to exclusive
      checkLower = true;
    TermDocs termDocs = reader.termDocs();
    try {
      do {
        Term term = enumerator.term();
        if (term != null && term.field().equals(fieldName)) {
          if (!checkLower || null == lowerTerm || term.text().compareTo(lowerTerm) > 0) {
            checkLower = false;
            if (upperTerm != null) {
              int compare = upperTerm.compareTo(term.text());
              /* if beyond the upper term, or is exclusive and
               * this is equal to the upper term, break out */
              if ((compare < 0) || (!includeUpper && compare == 0)) {
                break;
              }
            }
            /* we have a good term, find the docs */
            termDocs.seek(enumerator.term());
            while (termDocs.next()) {
              bits.set(termDocs.doc());
            }
          }
        } else {
          break;
        }
      } while (enumerator.next());
    } finally {
      termDocs.close();
    }
  } finally {
    enumerator.close();
  }
  long end = System.currentTimeMillis();
  log.info("BoundaryBox Time Taken: " + (end - start));
  return bits;
}
@Override
public int nextDoc() throws IOException {
  return _docid = _termDocs.next() ? _termDocs.doc() : NO_MORE_DOCS;
}
public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("TermDumper [-c|-v value] field <index...>");
    System.exit(1);
  }
  boolean count = false;
  String value = null;
  boolean all = false;
  int i = 0;
  for (; i < args.length; i++) {
    String arg = args[i];
    if ("-h".equals(arg) || "--help".equals(arg)) {
      System.err.println("TermDumper [-c|-v value] field <index...>");
      System.exit(1);
    } else if ("-c".equals(arg) || "--count".equals(arg)) {
      count = true;
    } else if ("-v".equals(arg) || "--value".equals(arg)) {
      value = args[++i];
    } else if ("-a".equals(arg) || "--all".equals(arg)) {
      all = true;
    } else {
      break;
    }
  }
  String field = args[i++];
  java.util.ArrayList<IndexReader> readers = new java.util.ArrayList<IndexReader>(args.length - 1);
  for (; i < args.length; i++) {
    String arg = args[i];
    try {
      IndexReader reader = IndexReader.open(new MMapDirectory(new File(arg)), true);
      readers.add(reader);
    } catch (IOException ioe) {
      System.err.println("Error reading: " + arg);
    }
  }
  for (IndexReader reader : readers) {
    TermDocs termDocs = reader.termDocs();
    TermEnum termEnum = reader.terms(new Term(field));
    try {
      do {
        Term term = termEnum.term();
        if (term == null || !field.equals(term.field()))
          break;
        if (value == null) {
          if (count) {
            termDocs.seek(termEnum);
            int c = 0;
            for (; termDocs.next(); c++)
              ;
            System.out.print(c + " ");
          }
          System.out.println(term.text());
        } else if (value.equals(term.text())) {
          termDocs.seek(termEnum);
          while (termDocs.next()) {
            if (all) {
              Document d = reader.document(termDocs.doc());
              System.out.println(termDocs.doc());
              for (Object o : d.getFields()) {
                Field f = (Field) o;
                System.out.println(f.name() + " " + d.get(f.name()));
              }
            } else {
              System.out.println(termDocs.doc() + " " + reader.document(termDocs.doc()).get("url"));
            }
          }
        }
      } while (termEnum.next());
    } finally {
      termDocs.close();
      termEnum.close();
    }
  }
}
private static float[] getFloats(FileFloatSource ffs, IndexReader reader) {
  float[] vals = new float[reader.maxDoc()];
  if (ffs.defVal != 0) {
    Arrays.fill(vals, ffs.defVal);
  }
  InputStream is;
  String fname = "external_" + ffs.field.getName();
  try {
    is = VersionedFile.getLatestFile(ffs.dataDir, fname);
  } catch (IOException e) {
    // log, use defaults
    SolrCore.log.error("Error opening external value source file: " + e);
    return vals;
  }
  BufferedReader r = new BufferedReader(new InputStreamReader(is));
  String idName = StringHelper.intern(ffs.keyField.getName());
  FieldType idType = ffs.keyField.getType();
  boolean sorted = true; // assume sorted until we discover it's not
  // warning: lucene's termEnum.skipTo() is not optimized... it simply does a next()
  // because of this, simply ask the reader for a new termEnum rather than
  // trying to use skipTo()
  List<String> notFound = new ArrayList<String>();
  int notFoundCount = 0;
  int otherErrors = 0;
  TermDocs termDocs = null;
  Term protoTerm = new Term(idName, "");
  TermEnum termEnum = null;
  // Number of times to try termEnum.next() before resorting to skip
  int numTimesNext = 10;
  char delimiter = '=';
  String termVal;
  boolean hasNext = true;
  String prevKey = "";
  String lastVal = "\uFFFF\uFFFF\uFFFF\uFFFF\uFFFF\uFFFF\uFFFF\uFFFF";
  try {
    termDocs = reader.termDocs();
    termEnum = reader.terms(protoTerm);
    Term t = termEnum.term();
    if (t != null && t.field() == idName) { // intern'd comparison
      termVal = t.text();
    } else {
      termVal = lastVal;
    }
    for (String line; (line = r.readLine()) != null; ) {
      int delimIndex = line.indexOf(delimiter);
      if (delimIndex < 0)
        continue;
      int endIndex = line.length();
      /* EOLs should already be removed for BufferedReader.readLine()
      for (int endIndex = line.length(); endIndex > delimIndex + 1; endIndex--) {
        char ch = line.charAt(endIndex - 1);
        if (ch != '\n' && ch != '\r') break;
      }
      */
      String key = line.substring(0, delimIndex);
      String val = line.substring(delimIndex + 1, endIndex);
      String internalKey = idType.toInternal(key);
      float fval;
      try {
        fval = Float.parseFloat(val);
      } catch (Exception e) {
        if (++otherErrors <= 10) {
          SolrCore.log.error("Error loading external value source " + fname + ": " + e
              + (otherErrors < 10 ? "" : "\tSkipping future errors for this file."));
        }
        continue; // go to next line in file.. leave values as default.
      }
      if (sorted) {
        // make sure this key is greater than the previous key
        sorted = internalKey.compareTo(prevKey) >= 0;
        prevKey = internalKey;
        if (sorted) {
          int countNext = 0;
          for (;;) {
            int cmp = internalKey.compareTo(termVal);
            if (cmp == 0) {
              termDocs.seek(termEnum);
              while (termDocs.next()) {
                vals[termDocs.doc()] = fval;
              }
              break;
            } else if (cmp < 0) {
              // term enum has already advanced past current key... we didn't find it.
              if (notFoundCount < 10) { // collect first 10 not found for logging
                notFound.add(key);
              }
              notFoundCount++;
              break;
            } else {
              // termEnum is less than our current key, so skip ahead
              // try next() a few times to see if we hit or pass the target.
              // Lucene's termEnum.skipTo() is currently unoptimized (it just does next())
              // so the best thing is to simply ask the reader for a new termEnum(target)
              // if we really need to skip.
              if (++countNext > numTimesNext) {
                termEnum = reader.terms(protoTerm.createTerm(internalKey));
                t = termEnum.term();
              } else {
                hasNext = termEnum.next();
                t = hasNext ? termEnum.term() : null;
              }
              if (t != null && t.field() == idName) { // intern'd comparison
                termVal = t.text();
              } else {
                termVal = lastVal;
              }
            }
          } // end for(;;)
        }
      }
      if (!sorted) {
        termEnum = reader.terms(protoTerm.createTerm(internalKey));
        t = termEnum.term();
        if (t != null
            && t.field() == idName // intern'd comparison
            && internalKey.equals(t.text())) {
          termDocs.seek(termEnum);
          while (termDocs.next()) {
            vals[termDocs.doc()] = fval;
          }
        } else {
          if (notFoundCount < 10) { // collect first 10 not found for logging
            notFound.add(key);
          }
          notFoundCount++;
        }
      }
    }
  } catch (IOException e) {
    // log, use defaults
    SolrCore.log.error("Error loading external value source: " + e);
  } finally {
    // swallow exceptions on close so we don't override any
    // exceptions that happened in the loop
    if (termDocs != null)
      try {
        termDocs.close();
      } catch (Exception e) {
      }
    if (termEnum != null)
      try {
        termEnum.close();
      } catch (Exception e) {
      }
    try {
      r.close();
    } catch (Exception e) {
    }
  }
  SolrCore.log.info("Loaded external value source " + fname
      + (notFoundCount == 0 ? "" : " :" + notFoundCount + " missing keys " + notFound));
  return vals;
}
@Override
public int advance(int target) throws IOException {
  return doc = termDocs.skipTo(target) ? termDocs.doc() : NO_MORE_DOCS;
}
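The two overrides above (nextDoc and advance) read like fragments of a DocIdSetIterator built on top of a TermDocs. A minimal self-contained sketch of such an adapter, with class and field names assumed rather than taken from the original source:

// sketch: expose a TermDocs enumeration through Lucene 3.x's DocIdSetIterator contract
class TermDocsIdSetIterator extends DocIdSetIterator {
  private final TermDocs termDocs;
  private int doc = -1;

  TermDocsIdSetIterator(TermDocs termDocs) {
    this.termDocs = termDocs;
  }

  @Override
  public int docID() {
    return doc; // last doc returned, or -1 before iteration starts
  }

  @Override
  public int nextDoc() throws IOException {
    return doc = termDocs.next() ? termDocs.doc() : NO_MORE_DOCS;
  }

  @Override
  public int advance(int target) throws IOException {
    return doc = termDocs.skipTo(target) ? termDocs.doc() : NO_MORE_DOCS;
  }
}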
/**
 * Loads multi-value facet data. This method uses a workarea to prepare loading.
 *
 * @param fieldName
 * @param reader
 * @param listFactory
 * @param workArea
 * @throws IOException
 */
public void load(String fieldName, IndexReader reader, TermListFactory<T> listFactory, WorkArea workArea)
    throws IOException {
  long t0 = System.currentTimeMillis();
  int maxdoc = reader.maxDoc();
  BufferedLoader loader = getBufferedLoader(maxdoc, workArea);
  TermEnum tenum = null;
  TermDocs tdoc = null;
  TermValueList<T> list =
      (listFactory == null ? (TermValueList<T>) new TermStringList() : listFactory.createTermList());
  IntArrayList minIDList = new IntArrayList();
  IntArrayList maxIDList = new IntArrayList();
  IntArrayList freqList = new IntArrayList();
  OpenBitSet bitset = new OpenBitSet(maxdoc + 1);
  int negativeValueCount = getNegativeValueCount(reader, fieldName.intern());
  int t = 0; // current term number
  list.add(null);
  minIDList.add(-1);
  maxIDList.add(-1);
  freqList.add(0);
  t++;
  _overflow = false;
  try {
    tdoc = reader.termDocs();
    tenum = reader.terms(new Term(fieldName, ""));
    if (tenum != null) {
      do {
        Term term = tenum.term();
        if (term == null || !fieldName.equals(term.field()))
          break;
        String val = term.text();
        if (val != null) {
          list.add(val);
          tdoc.seek(tenum);
          // freqList.add(tenum.docFreq()); // removed because the df doesn't take into account
          // the num of deletedDocs
          int df = 0;
          int minID = -1;
          int maxID = -1;
          int valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t;
          if (tdoc.next()) {
            df++;
            int docid = tdoc.doc();
            if (!loader.add(docid, valId))
              logOverflow(fieldName);
            minID = docid;
            bitset.fastSet(docid);
            while (tdoc.next()) {
              df++;
              docid = tdoc.doc();
              if (!loader.add(docid, valId))
                logOverflow(fieldName);
              bitset.fastSet(docid);
            }
            maxID = docid;
          }
          freqList.add(df);
          minIDList.add(minID);
          maxIDList.add(maxID);
        }
        t++;
      } while (tenum.next());
    }
  } finally {
    try {
      if (tdoc != null) {
        tdoc.close();
      }
    } finally {
      if (tenum != null) {
        tenum.close();
      }
    }
  }
  list.seal();
  try {
    _nestedArray.load(maxdoc + 1, loader);
  } catch (IOException e) {
    throw e;
  } catch (Exception e) {
    throw new RuntimeException("failed to load due to " + e.toString(), e);
  }
  this.valArray = list;
  this.freqs = freqList.toIntArray();
  this.minIDs = minIDList.toIntArray();
  this.maxIDs = maxIDList.toIntArray();
  int doc = 0;
  while (doc <= maxdoc && !_nestedArray.contains(doc, 0, true)) {
    ++doc;
  }
  if (doc <= maxdoc) {
    this.minIDs[0] = doc;
    doc = maxdoc;
    while (doc > 0 && !_nestedArray.contains(doc, 0, true)) {
      --doc;
    }
    if (doc > 0) {
      this.maxIDs[0] = doc;
    }
  }
  this.freqs[0] = maxdoc + 1 - (int) bitset.cardinality();
}
@Override
public boolean reload(String collectionName, String topRankingField) {
  if (collectionName == null) {
    return false;
  }
  CrescentCollectionHandler collectionHandler =
      SpringApplicationContext.getBean("crescentCollectionHandler", CrescentCollectionHandler.class);
  CrescentCollection collection =
      collectionHandler.getCrescentCollections().getCrescentCollection(collectionName);
  if (collection == null) {
    logger.debug("no collection info for => {}", collectionName);
    init(View.Overview);
    return false;
  }
  if (topRankingField == null) {
    if (collection.getDefaultSearchFields().get(0) != null) {
      topRankingField = collection.getDefaultSearchFields().get(0).getName();
    } else {
      logger.debug("no default search field for => {}", collectionName);
      init(View.Overview);
      return false;
    }
  }
  List<String> fieldName = new ArrayList<String>();
  for (CrescentCollectionField field : collection.getFields())
    fieldName.add(field.getName());
  TopRankingQueue topRankingQueue =
      new TopRankingQueue(DEFAULT_TOPRANKING_TERM, new RankingTermComparator());
  try {
    Directory directory = FSDirectory.open(new File(collection.getIndexingDirectory()));
    IndexReader reader = IndexReader.open(directory);
    TermEnum terms = reader.terms();
    int termFreq = 0;
    int termCount = 0;
    Term beforeTerm = null;
    // init term count
    fieldTermCount.clear();
    for (CrescentCollectionField field : collection.getFields())
      fieldTermCount.put(field.getName(), 0);
    topRankingQueue.clear();
    while (terms.next()) {
      Term currTerm = terms.term();
      if (beforeTerm == null) {
        beforeTerm = currTerm;
      }
      if (beforeTerm.field() == currTerm.field()) {
        termCount++;
      } else {
        fieldTermCount.put(beforeTerm.field(), termCount);
        termCount = 1;
        beforeTerm = currTerm;
      }
      TermDocs termDocs = reader.termDocs(currTerm);
      while (termDocs.next()) {
        if (currTerm.field().equals(topRankingField)) {
          RankingTerm e = new RankingTerm(currTerm.text(), currTerm.field(), termDocs.freq());
          topRankingQueue.add(e);
        }
      }
      termFreq++;
    }
    if (beforeTerm != null)
      fieldTermCount.put(beforeTerm.field(), termCount);
    terms.close();
    result.put("numOfTerm", termFreq);
    result.put("numOfDoc", reader.numDocs());
    result.put("hasDel", reader.hasDeletions());
    result.put("isOptimize", reader.isOptimized());
    result.put("indexVersion", reader.getVersion());
    result.put("lastModify", new Date(IndexReader.lastModified(directory)));
  } catch (IOException e) {
    e.printStackTrace();
    return false;
  }
  if (topRankingQueue.size() != 0) {
    topRankingTerms = topRankingQueue.toArray();
    Arrays.sort(topRankingTerms);
  }
  result.put("collectionName", collectionName);
  result.put("indexName", collection.getIndexingDirectory());
  result.put("numOfField", collection.getFields().size());
  result.put("termCount", fieldTermCount);
  result.put("topRanking", topRankingTerms);
  result.put("fieldName", fieldName);
  return true;
}
public static void main(String[] args) throws Exception {
  // the IndexReader object is the main handle that will give you
  // all the documents, terms and inverted index
  IndexReader r = IndexReader.open(FSDirectory.open(new File("index")));
  // You can figure out the number of documents using the maxDoc() function
  System.out.println("The number of documents in this index is: " + r.maxDoc());
  int i = 0;
  // You can find out all the terms that have been indexed using the terms() function
  TermEnum t = r.terms();
  while (t.next()) {
    // Since there are so many terms, let us try printing only terms #100000-#100010
    if (i > 100000)
      System.out.println("[" + i + "] " + t.term().text());
    if (++i > 100010)
      break;
  }
  // You can create your own query terms by calling the Term constructor, with the field 'contents'
  // In the following example, the query term is 'brute'
  Term te = new Term("contents", "brute");
  // You can also quickly find out the number of documents that have term t
  System.out.println("Number of documents with the word 'brute' is: " + r.docFreq(te));
  // You can use the inverted index to find out all the documents that contain the term 'brute'
  // by using the termDocs function
  TermDocs td = r.termDocs(te);
  while (td.next()) {
    System.out.println("Document number [" + td.doc() + "] contains the term 'brute' "
        + td.freq() + " time(s).");
  }
  // You can find the URL of a specific document number using the document() function
  // For example, the URL for document number 14191 is:
  Document d = r.document(14191);
  String url = d.getFieldable("path").stringValue(); // the 'path' field of the Document object holds the URL
  System.out.println(url.replace("%%", "/"));
  // -------- Now let us use all of the functions above to make something useful --------
  // The following bit of code is a worked out example of how to get a bunch of documents
  // in response to a query and show them (without ranking them according to TF/IDF)
  Scanner sc = new Scanner(System.in);
  String str = "";
  System.out.print("query> ");
  while (!(str = sc.nextLine()).equals("quit")) {
    String[] terms = str.split("\\s+");
    for (String word : terms) {
      Term term = new Term("contents", word);
      TermDocs tdocs = r.termDocs(term);
      while (tdocs.next()) {
        String d_url = r.document(tdocs.doc()).getFieldable("path").stringValue().replace("%%", "/");
        System.out.println("[" + tdocs.doc() + "] " + d_url);
      }
    }
    System.out.print("query> ");
  }
}
public void load(String latFieldName, String lonFieldName, BoboIndexReader reader) throws IOException {
  if (reader == null)
    throw new NullPointerException("reader object is null");
  if (latFieldName == null)
    throw new NullPointerException("latitude Field Name is null");
  if (lonFieldName == null)
    throw new NullPointerException("longitude Field Name is null");
  String latField = latFieldName.intern();
  String lonField = lonFieldName.intern();
  int maxDoc = reader.maxDoc();
  BigFloatArray xVals = this._xValArray;
  BigFloatArray yVals = this._yValArray;
  BigFloatArray zVals = this._zValArray;
  if (xVals == null)
    xVals = newInstance(maxDoc);
  else
    xVals.ensureCapacity(maxDoc);
  if (yVals == null)
    yVals = newInstance(maxDoc);
  else
    yVals.ensureCapacity(maxDoc);
  if (zVals == null)
    zVals = newInstance(maxDoc);
  else
    zVals.ensureCapacity(maxDoc);
  this._xValArray = xVals;
  this._yValArray = yVals;
  this._zValArray = zVals;
  Term latTerm = new Term(latFieldName, "");
  TermDocs termDocs = reader.termDocs(latTerm);
  TermEnum termEnum = reader.terms(latTerm);
  float docLat, docLon;
  int termCount = 1;
  String lonValue = null;
  int length = maxDoc + 1;
  int doc;
  termDocs.next();
  try {
    do {
      Term term = termEnum.term();
      if (term == null || term.field() != latFieldName)
        continue;
      if (termCount > xVals.capacity())
        throw new IOException("Maximum number of values cannot exceed: " + xVals.capacity());
      if (termCount >= length)
        throw new RuntimeException("There are more terms than documents in field " + latFieldName
            + " or " + lonFieldName + ", but it's impossible to sort on tokenized fields");
      // pull the termDocs to point to the document for the current term in the termEnum
      termDocs.seek(termEnum);
      while (termDocs.next()) {
        doc = termDocs.doc();
        // read the latitude value in the current document
        docLat = Float.parseFloat(term.text().trim());
        // read the longitude value in the current document
        Document docVal = reader.document(doc, null);
        lonValue = docVal.get(lonFieldName);
        if (lonValue == null)
          continue;
        else
          docLon = Float.parseFloat(lonValue);
        // convert the lat, lon values to x,y,z coordinates
        float[] coords = GeoMatchUtil.geoMatchCoordsFromDegrees(docLat, docLon);
        _xValArray.add(doc, coords[0]);
        _yValArray.add(doc, coords[1]);
        _zValArray.add(doc, coords[2]);
      }
    } while (termEnum.next());
  } catch (Exception e) {
    // TODO: get rid of this catch phrase
    e.printStackTrace();
  } finally {
    if (termDocs != null)
      termDocs.close();
    if (termEnum != null)
      termEnum.close();
  }
}
public TermDocs getTermDocs() throws IOException {
  if (termDocs == null)
    termDocs = reader.termDocs(t, 102400);
  else
    termDocs.seek(t);
  return termDocs;
}
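Because the method above caches a single TermDocs and repositions it with seek(t), callers can iterate it repeatedly without reallocating. A rough usage sketch; only the method and the field t come from the snippet, the rest is illustrative:

// iterate the postings behind the cached TermDocs
TermDocs docs = getTermDocs();
while (docs.next()) {
  System.out.println("doc=" + docs.doc() + " freq=" + docs.freq());
}
// a later call to getTermDocs() reuses the same instance, re-seeked to term t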