private void initBiggerDiagonal(IndexReader reader) throws IOException {
  logger.info("Initializing spatial indexes for query strategies");
  if (biggerDiagonal == null) {
    biggerDiagonal = (Double) IndexReaderPersistentCache.get(reader, biggerDiagonalCacheKey);
    twiceBiggerDiagonal = (Double) IndexReaderPersistentCache.get(reader, twiceBiggerDiagonalCacheKey);
    if (biggerDiagonal == null || twiceBiggerDiagonal == null) {
      biggerDiagonal = 0.0;
      Term last = null;
      // walk every term of the diagonal field; terms are sorted, so the last one is the largest
      TermEnum termEnum = reader.terms(new Term(Globals.LUCENE_DIAGONAL_INDEX, ""));
      if (termEnum.term() != null && termEnum.term().field().equals(Globals.LUCENE_DIAGONAL_INDEX))
        last = termEnum.term();
      if (termEnum.term() != null)
        while (termEnum.next())
          if (termEnum.term().field().equals(Globals.LUCENE_DIAGONAL_INDEX))
            last = termEnum.term();
      if (last != null) {
        biggerDiagonal = NumberUtils.SortableStr2double(last.text());
        logger.info("Found bigger spatial width:" + biggerDiagonal);
      }
      twiceBiggerDiagonal = 2 * biggerDiagonal;
      halfBiggerDiagonal = biggerDiagonal / 2.0;
      logger.info("Defining twice bigger spatial width:" + twiceBiggerDiagonal);
      termEnum.close();
      IndexReaderPersistentCache.put(biggerDiagonalCacheKey, biggerDiagonal, reader);
      IndexReaderPersistentCache.put(twiceBiggerDiagonalCacheKey, twiceBiggerDiagonal, reader);
    }
  }
  if (biggerInternalCircleRadium == null) {
    biggerInternalCircleRadium = (Double) IndexReaderPersistentCache.get(reader, biggerRadiumCacheKey);
    if (biggerInternalCircleRadium == null) {
      biggerInternalCircleRadium = 0.0;
      Term last = null;
      TermEnum termEnum = reader.terms(new Term(Globals.LUCENE_RADIUM_INDEX, ""));
      if (termEnum.term() != null && termEnum.term().field().equals(Globals.LUCENE_RADIUM_INDEX))
        last = termEnum.term();
      if (termEnum.term() != null)
        while (termEnum.next())
          if (termEnum.term().field().equals(Globals.LUCENE_RADIUM_INDEX))
            last = termEnum.term();
      if (last != null) {
        biggerInternalCircleRadium = NumberUtils.SortableStr2double(last.text());
        logger.info("Found bigger internal circle radium:" + biggerInternalCircleRadium);
      }
      termEnum.close();
      IndexReaderPersistentCache.put(biggerRadiumCacheKey, biggerInternalCircleRadium, reader);
    }
  }
}
public static SimpleOrderedMap<Object> getIndexInfo(IndexReader reader, boolean countTerms) throws IOException {
  Directory dir = reader.directory();
  SimpleOrderedMap<Object> indexInfo = new SimpleOrderedMap<Object>();
  indexInfo.add("numDocs", reader.numDocs());
  indexInfo.add("maxDoc", reader.maxDoc());
  if (countTerms) {
    TermEnum te = null;
    try {
      te = reader.terms();
      int numTerms = 0;
      while (te.next()) {
        numTerms++;
      }
      indexInfo.add("numTerms", numTerms);
    } finally {
      if (te != null) te.close();
    }
  }
  indexInfo.add("version", reader.getVersion()); // TODO? Is this different than IndexReader.getCurrentVersion(dir)?
  indexInfo.add("optimized", reader.isOptimized());
  indexInfo.add("current", reader.isCurrent());
  indexInfo.add("hasDeletions", reader.hasDeletions());
  indexInfo.add("directory", dir);
  indexInfo.add("lastModified", new Date(IndexReader.lastModified(dir)));
  return indexInfo;
}
private static Map<String, List<String>> generate_result(Directory directory) {
  Map<String, List<String>> result_map = new HashMap<String, List<String>>();
  IndexReader reader = null;
  try {
    reader = IndexReader.open(directory);
    TermEnum termEnum = reader.terms();
    while (termEnum.next()) {
      String termEnumString = termEnum.term().toString();
      if (termEnumString.startsWith("content:")) {
        String term = termEnumString.substring(termEnumString.lastIndexOf(":") + 1);
        TermDocs termDocs = reader.termDocs(termEnum.term());
        while (termDocs.next()) {
          Document doc = reader.document(termDocs.doc());
          String relative_path = doc.get("relative_path");
          // create the list on the first sight of this path, then record the term either way
          if (!result_map.containsKey(relative_path)) {
            result_map.put(relative_path, new ArrayList<String>());
          }
          result_map.get(relative_path).add(term + termDocs.freq());
        }
        termDocs.close();
      }
    }
    termEnum.close();
  } catch (IOException e) {
    e.printStackTrace();
  } finally {
    if (reader != null) {
      try {
        reader.close();
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
  }
  return result_map;
}
@Override public void visitMatchingTerms(IndexReader reader, String fieldName, MatchingTermVisitor mtv) throws IOException { boolean expanded = false; int prefixLength = prefix.length(); TermEnum enumerator = reader.terms(new Term(fieldName, prefix)); Matcher matcher = pattern.matcher(""); try { do { Term term = enumerator.term(); if (term != null) { String text = term.text(); if ((!text.startsWith(prefix)) || (!term.field().equals(fieldName))) { break; } else { matcher.reset(text.substring(prefixLength)); if (matcher.matches()) { mtv.visitMatchingTerm(term); expanded = true; } } } } while (enumerator.next()); } finally { enumerator.close(); matcher.reset(); } if (!expanded) { System.out.println("No terms in " + fieldName + " field for: " + toString()); } }
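// Note on the iteration idiom used in the snippet above (and in several snippets below):
// IndexReader.terms() with no argument starts *before* the first term, so callers drive it
// with while (te.next()); IndexReader.terms(Term) is positioned *at* the first term >= the
// argument, so term() must be read before the first next(), hence the do/while loop.
// A minimal sketch of the positioned idiom, assuming an open IndexReader named reader;
// the field name "title" is a placeholder.
TermEnum te = reader.terms(new Term("title", ""));
try {
  do {
    Term term = te.term();                                       // may already point at a term
    if (term == null || !"title".equals(term.field())) break;    // left the field: stop
    // ... use term ...
  } while (te.next());
} finally {
  te.close();
}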
public boolean skipTo(Term target) throws IOException { // already here if (t != null && t.equals(target)) return true; int startIdx = tindex.index.search(target.text()); if (startIdx >= 0) { // we hit the term exactly... lucky us! if (tenum != null) tenum.close(); tenum = reader.terms(target); pos = startIdx << tindex.intervalBits; return setTerm(); } // we didn't hit the term exactly startIdx = -startIdx - 1; if (startIdx == 0) { // our target occurs *before* the first term if (tenum != null) tenum.close(); tenum = reader.terms(target); pos = 0; return setTerm(); } // back up to the start of the block startIdx--; if ((pos >> tindex.intervalBits) == startIdx && t != null && t.text().compareTo(target.text()) <= 0) { // we are already in the right block and the current term is before the term we want, // so we don't need to seek. } else { // seek to the right block if (tenum != null) tenum.close(); tenum = reader.terms(target.createTerm(tindex.index.get(startIdx))); pos = startIdx << tindex.intervalBits; setTerm(); // should be true since it's in the index } while (t != null && t.text().compareTo(target.text()) < 0) { next(); } return t != null; }
TermNumEnumerator(IndexReader reader, TermIndex tindex, String termValue, int pos) throws IOException {
  this.reader = reader;
  this.tindex = tindex;
  this.pos = pos;
  tenum = reader.terms(tindex.createTerm(termValue));
  setTerm();
}
/** * words in each records in input is sorted by document frequency, if ceil(prefix*length)-prefix * share at least one token, block them, * * @param input * @param lines number of lines to block * @param prefix prefix parameter * @param maxDocFreq max document frequency for a token to be considered a rare feature * @param indexFolder temporary index folder * @param output * @param report * @throws Exception */ public static void prefixBlockingWithLucene( String input, int lines, float prefix, int maxPrefixLength, int maxDocFreq, String indexFolder, String output, String report) throws Exception { long startTime = new Date().getTime(); Common.indexPrefix(input, lines, prefix, maxPrefixLength, indexFolder); IndexReader ireader = IndexReader.open(indexFolder); IndexSearcher isearcher = new IndexSearcher(ireader); TermEnum te = ireader.terms(); PrintWriter pw = IOFactory.getPrintWriter(output); int maxBlockSize = 0; int totalBlockSize = 0; int blockCount = 0; while (te.next()) { TopDocs td = isearcher.search(new TermQuery(te.term()), maxDocFreq + 1); // discard blocks with only one individual or of too frequent tokens if (td.scoreDocs.length <= 1 || td.scoreDocs.length > maxDocFreq) continue; if (td.scoreDocs.length > maxBlockSize) maxBlockSize = td.scoreDocs.length; totalBlockSize += td.scoreDocs.length; blockCount++; pw.print(ireader.document(td.scoreDocs[0].doc).get("id")); for (int i = 1; i < td.scoreDocs.length; i++) { pw.print(" " + ireader.document(td.scoreDocs[i].doc).get("id")); } pw.println(); if (blockCount % 1000 == 0) System.out.println(new Date().toString() + " : " + blockCount + " blocks"); } pw.close(); ireader.close(); long time = new Date().getTime() - startTime; pw = IOFactory.getPrintWriter(report, true); pw.println(new Date().toString()); pw.println("#individual: " + lines); pw.println("blocking parameter: " + prefix); pw.println("time: " + time); pw.println("#block: " + blockCount); pw.println("max block size: " + maxBlockSize); pw.println("avg block size: " + (totalBlockSize + 0.0) / blockCount); pw.close(); Common.deleteFolder(new File(indexFolder)); System.out.println(prefix + "\t" + lines + "\t" + time); // for speed test }
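// A possible invocation of prefixBlockingWithLucene; every path and parameter value below is
// an assumption for illustration, not taken from the method above.
prefixBlockingWithLucene(
    "records.txt",  // input: one record per line, tokens sorted by document frequency
    100000,         // lines: number of records to block
    0.2f,           // prefix: fraction of each record used as its blocking key
    5,              // maxPrefixLength
    1000,           // maxDocFreq: tokens appearing in more records than this are discarded
    "tmp-index",    // indexFolder: temporary Lucene index, deleted afterwards
    "blocks.txt",   // output: one block of ids per line
    "report.txt");  // report: appended blocking statistics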
private int getDocFreq(String term) {
  int result = 1;
  currentTerm = currentTerm.createTerm(term);
  TermEnum termEnum = null;
  try {
    termEnum = reader.terms(currentTerm);
    // terms(Term) positions at the first term >= currentTerm, so check for an exact match
    if (termEnum.term() != null && termEnum.term().equals(currentTerm)) {
      result = termEnum.docFreq();
    }
  } catch (IOException e) {
    throw new RuntimeException(e);
  } finally {
    if (termEnum != null) {
      try {
        termEnum.close();
      } catch (IOException e) {
        // ignore
      }
    }
  }
  return result;
}
/** * Computes a term frequency map for the index at the specified location. "Most Frequent" is * defined as the terms whose frequencies are greater than or equal to the topTermCutoff * the * frequency of the top term, where the topTermCutoff is number between 0 and 1. * * @return * @throws CorruptIndexException * @throws IOException */ protected ArrayList<String> retrieveTopTerms() throws CorruptIndexException, IOException { final Map<String, Integer> frequencyMap = new HashMap<String, Integer>(); List<String> termlist = new ArrayList<String>(); IndexReader reader = IndexReader.open(ramdir); TermEnum terms = reader.terms(); while (terms.next()) { Term term = terms.term(); String termText = term.text(); int frequency = reader.docFreq(term); frequencyMap.put(termText, frequency); termlist.add(termText); } reader.close(); // sort the term map by frequency descending Collections.sort( termlist, new Comparator<String>() { @Override public int compare(String term1, String term2) { int term1Freq = frequencyMap.get(term1); int term2Freq = frequencyMap.get(term2); if (term1Freq < term2Freq) return 1; if (term1Freq > term2Freq) return -1; return 0; } }); // retrieve the top terms based on topTermCutoff ArrayList<String> topTerms = new ArrayList<String>(); double topFreq = -1.0F; for (String term : termlist) { if (topFreq < 0.0F) { // first term, capture the value topFreq = (double) frequencyMap.get(term); topTerms.add(term); } else { // not the first term, compute the ratio and discard if below // topTermCutoff score double ratio = (double) ((double) frequencyMap.get(term) / topFreq); if (ratio >= topTermCutoff) { topTerms.add(term); } else { break; } } } return topTerms; }
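// The cutoff in retrieveTopTerms keeps a term when its document frequency is at least
// topTermCutoff times the frequency of the most frequent term. A toy illustration of that
// rule in isolation (all numbers are made up):
double topTermCutoff = 0.25;  // keep terms with at least 25% of the top frequency
double topFreq = 120.0;       // document frequency of the most frequent term
int candidateFreq = 30;       // document frequency of a later term in the sorted list
boolean keep = (candidateFreq / topFreq) >= topTermCutoff;  // 30 / 120 = 0.25 -> kept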
public void getIndexInfo(String indexdir, int freqThreshold) { IndexReader reader = null; try { Directory dir = FSDirectory.open(new File(indexdir)); System.out.println(dir); reader = IndexReader.open(dir); System.out.println("document num:" + reader.numDocs()); System.out.println("======================"); TermEnum terms = reader.terms(); sortedTermQueue.clear(); maxDocNum = reader.maxDoc(); linkMap.clear(); termList.clear(); while (terms.next()) { // System.out.print(terms.term() + "\tDocFreq:" + TermDocs termDocs = reader.termDocs(terms.term()); MyTerm temp = new MyTerm(terms.term(), termDocs, maxDocNum); if (temp.totalFreq < freqThreshold) { continue; } /* * if(temp.originTrem.text().length()==1){ continue; } */ linkMap.put(temp.originTrem.text(), temp); sortedTermQueue.add(temp); termList.add(temp); } System.out.println("total Size:" + sortedTermQueue.size()); System.out.println("mapsize:" + linkMap.keySet().size()); // System.exit(0); int num = 0; this.maxFreq = sortedTermQueue.peek().totalFreq; while (!sortedTermQueue.isEmpty()) { num++; System.out.println(num + ":" + sortedTermQueue.poll()); } System.out.println("read index info done"); } catch (IOException e) { e.printStackTrace(); } finally { try { reader.close(); } catch (IOException e) { e.printStackTrace(); } } }
@SuppressWarnings({"StringEquality"}) @Override public void run() { TermDocs termDocs = null; TermEnum termEnum = null; try { BloomFilter filter = BloomFilterFactory.getFilter(reader.numDocs(), 15); termDocs = reader.termDocs(); termEnum = reader.terms(new Term(field)); do { Term term = termEnum.term(); if (term == null || term.field() != field) break; // LUCENE MONITOR: 4.0, move to use bytes! UnicodeUtil.UTF8Result utf8Result = Unicode.fromStringAsUtf8(term.text()); termDocs.seek(termEnum); while (termDocs.next()) { // when traversing, make sure to ignore deleted docs, so the key->docId will be correct if (!reader.isDeleted(termDocs.doc())) { filter.add(utf8Result.result, 0, utf8Result.length); } } } while (termEnum.next()); ConcurrentMap<String, BloomFilterEntry> fieldCache = cache.get(reader.getFieldCacheKey()); if (fieldCache != null) { if (fieldCache.containsKey(field)) { BloomFilterEntry filterEntry = new BloomFilterEntry(reader.numDocs(), filter); filterEntry.loading.set(false); fieldCache.put(field, filterEntry); } } } catch (Exception e) { logger.warn("failed to load bloom filter for [{}]", e, field); } finally { try { if (termDocs != null) { termDocs.close(); } } catch (IOException e) { // ignore } try { if (termEnum != null) { termEnum.close(); } } catch (IOException e) { // ignore } } }
public MultiTermEnum(IndexReader topReader, IndexReader[] readers, int[] starts, Term t) throws IOException { this.topReader = topReader; queue = new SegmentMergeQueue(readers.length); matchingSegments = new SegmentMergeInfo[readers.length + 1]; for (int i = 0; i < readers.length; i++) { IndexReader reader = readers[i]; TermEnum termEnum; if (t != null) { termEnum = reader.terms(t); } else termEnum = reader.terms(); SegmentMergeInfo smi = new SegmentMergeInfo(starts[i], termEnum, reader); smi.ord = i; if (t == null ? smi.next() : termEnum.term() != null) queue.add(smi); // initialize queue else smi.close(); } if (t != null && queue.size() > 0) { next(); } }
/**
 * Gets the global term frequencies and writes them in the index directory.
 *
 * @throws Exception the exception
 */
public void getGlobalTermFrequencies() throws Exception {
  String parentDir = Flags.rootDir + (Flags.positional ? "/positional-" : "/") + "lucene/" + Flags.suffix;
  File file = new File(parentDir);
  indexReader = IndexReader.open(FSDirectory.open(file));
  TermEnum terms = indexReader.terms();
  BufferedWriter out = new BufferedWriter(new FileWriter(new File(parentDir + "/globalTermFreq.txt")));
  while (terms.next()) {
    org.apache.lucene.index.Term term = terms.term();
    out.write(term.text() + " " + getGlobalTermFreq(term) + "\n");
  }
  terms.close();
  out.close();
  indexReader.close();
}
/** * Tests the IndexReader.getFieldNames implementation * * @throws Exception on error */ public void testFilterIndexReader() throws Exception { Directory directory = newDirectory(); IndexWriter writer = new IndexWriter( directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random))); Document d1 = new Document(); d1.add(newField("default", "one two", Field.Store.YES, Field.Index.ANALYZED)); writer.addDocument(d1); Document d2 = new Document(); d2.add(newField("default", "one three", Field.Store.YES, Field.Index.ANALYZED)); writer.addDocument(d2); Document d3 = new Document(); d3.add(newField("default", "two four", Field.Store.YES, Field.Index.ANALYZED)); writer.addDocument(d3); writer.close(); IndexReader reader = new TestReader(IndexReader.open(directory, true)); TermEnum terms = reader.terms(); while (terms.next()) { assertTrue(terms.term().text().indexOf('e') != -1); } terms.close(); TermPositions positions = reader.termPositions(new Term("default", "one")); while (positions.next()) { assertTrue((positions.doc() % 2) == 1); } int NUM_DOCS = 3; TermDocs td = reader.termDocs(null); for (int i = 0; i < NUM_DOCS; i++) { assertTrue(td.next()); assertEquals(i, td.doc()); assertEquals(1, td.freq()); } td.close(); reader.close(); directory.close(); }
private static Map<String, TopTermQueue> getTopTerms( IndexReader reader, Set<String> fields, int numTerms, Set<String> junkWords) throws Exception { Map<String, TopTermQueue> info = new HashMap<String, TopTermQueue>(); TermEnum terms = null; try { terms = reader.terms(); while (terms.next()) { String field = terms.term().field(); String t = terms.term().text(); // Compute distinct terms for every field TopTermQueue tiq = info.get(field); if (tiq == null) { tiq = new TopTermQueue(numTerms + 1); info.put(field, tiq); } tiq.distinctTerms++; tiq.histogram.add(terms.docFreq()); // add the term to the histogram // Only save the distinct terms for fields we worry about if (fields != null && fields.size() > 0) { if (!fields.contains(field)) { continue; } } if (junkWords != null && junkWords.contains(t)) { continue; } if (terms.docFreq() > tiq.minFreq) { tiq.add(new TopTermQueue.TermInfo(terms.term(), terms.docFreq())); if (tiq.size() > numTerms) { // if tiq full tiq.pop(); // remove lowest in tiq tiq.minFreq = ((TopTermQueue.TermInfo) tiq.top()).docFreq; // reset minFreq } } } } finally { if (terms != null) terms.close(); } return info; }
public boolean skipTo(int termNumber) throws IOException { int delta = termNumber - pos; if (delta < 0 || delta > tindex.interval || tenum == null) { int idx = termNumber >>> tindex.intervalBits; String base = tindex.index.get(idx); pos = idx << tindex.intervalBits; delta = termNumber - pos; if (tenum != null) { tenum.close(); } tenum = reader.terms(tindex.createTerm(base)); } while (--delta >= 0) { boolean b = tenum.next(); if (b == false) { t = null; return false; } ++pos; } return setTerm(); }
private void dumpTerms() throws IOException { outputBanner("Terms (in Term.compareTo() order)"); TermEnum terms = mIndexReader.terms(); int order = 0; while (terms.next()) { order++; Term term = terms.term(); String field = term.field(); String text = term.text(); if (!wantThisTerm(field, text)) { continue; } outputLn(order + " " + field + ": " + text); /* * for each term, print the * <document, frequency, <position>* > tuples for a term. * * document: document in which the Term appears * frequency: number of time the Term appears in the document * position: position for each appearance in the document * * e.g. doc.add(new Field("field", "one two three two four five", Field.Store.YES, Field.Index.ANALYZED)); * then the tuple for Term("field", "two") in this document would be like: * 88, 2, <2, 4> * where * 88 is the document number * 2 is the frequency this term appear in the document * <2, 4> are the positions for each appearance in the document */ // by TermPositions outputLn(" document, frequency, <position>*"); // keep track of docs that appear in all terms that are filtered in. Set<Integer> docNums = null; if (hasFilters()) { docNums = new HashSet<Integer>(); } TermPositions termPos = mIndexReader.termPositions(term); while (termPos.next()) { int docNum = termPos.doc(); int freq = termPos.freq(); if (docNums != null) { docNums.add(docNum); } output(" " + docNum + ", " + freq + ", <"); boolean first = true; for (int f = 0; f < freq; f++) { int positionInDoc = termPos.nextPosition(); if (!first) { output(" "); } else { first = false; } output(positionInDoc + ""); } outputLn(">"); } termPos.close(); if (docNums != null) { computeDocsIntersection(docNums); } outputLn(); if (order % 1000 == 0) { mConsole.debug("Dumped " + order + " terms"); } } terms.close(); }
private static float[] getFloats(FileFloatSource ffs, IndexReader reader) { float[] vals = new float[reader.maxDoc()]; if (ffs.defVal != 0) { Arrays.fill(vals, ffs.defVal); } InputStream is; String fname = "external_" + ffs.field.getName(); try { is = VersionedFile.getLatestFile(ffs.dataDir, fname); } catch (IOException e) { // log, use defaults SolrCore.log.error("Error opening external value source file: " + e); return vals; } BufferedReader r = new BufferedReader(new InputStreamReader(is)); String idName = StringHelper.intern(ffs.keyField.getName()); FieldType idType = ffs.keyField.getType(); boolean sorted = true; // assume sorted until we discover it's not // warning: lucene's termEnum.skipTo() is not optimized... it simply does a next() // because of this, simply ask the reader for a new termEnum rather than // trying to use skipTo() List<String> notFound = new ArrayList<String>(); int notFoundCount = 0; int otherErrors = 0; TermDocs termDocs = null; Term protoTerm = new Term(idName, ""); TermEnum termEnum = null; // Number of times to try termEnum.next() before resorting to skip int numTimesNext = 10; char delimiter = '='; String termVal; boolean hasNext = true; String prevKey = ""; String lastVal = "\uFFFF\uFFFF\uFFFF\uFFFF\uFFFF\uFFFF\uFFFF\uFFFF"; try { termDocs = reader.termDocs(); termEnum = reader.terms(protoTerm); Term t = termEnum.term(); if (t != null && t.field() == idName) { // intern'd comparison termVal = t.text(); } else { termVal = lastVal; } for (String line; (line = r.readLine()) != null; ) { int delimIndex = line.indexOf(delimiter); if (delimIndex < 0) continue; int endIndex = line.length(); /* EOLs should already be removed for BufferedReader.readLine() for(int endIndex = line.length();endIndex>delimIndex+1; endIndex--) { char ch = line.charAt(endIndex-1); if (ch!='\n' && ch!='\r') break; } */ String key = line.substring(0, delimIndex); String val = line.substring(delimIndex + 1, endIndex); String internalKey = idType.toInternal(key); float fval; try { fval = Float.parseFloat(val); } catch (Exception e) { if (++otherErrors <= 10) { SolrCore.log.error( "Error loading external value source + fileName + " + e + (otherErrors < 10 ? "" : "\tSkipping future errors for this file.")); } continue; // go to next line in file.. leave values as default. } if (sorted) { // make sure this key is greater than the previous key sorted = internalKey.compareTo(prevKey) >= 0; prevKey = internalKey; if (sorted) { int countNext = 0; for (; ; ) { int cmp = internalKey.compareTo(termVal); if (cmp == 0) { termDocs.seek(termEnum); while (termDocs.next()) { vals[termDocs.doc()] = fval; } break; } else if (cmp < 0) { // term enum has already advanced past current key... we didn't find it. if (notFoundCount < 10) { // collect first 10 not found for logging notFound.add(key); } notFoundCount++; break; } else { // termEnum is less than our current key, so skip ahead // try next() a few times to see if we hit or pass the target. // Lucene's termEnum.skipTo() is currently unoptimized (it just does next()) // so the best thing is to simply ask the reader for a new termEnum(target) // if we really need to skip. if (++countNext > numTimesNext) { termEnum = reader.terms(protoTerm.createTerm(internalKey)); t = termEnum.term(); } else { hasNext = termEnum.next(); t = hasNext ? 
termEnum.term() : null; } if (t != null && t.field() == idName) { // intern'd comparison termVal = t.text(); } else { termVal = lastVal; } } } // end for(;;) } } if (!sorted) { termEnum = reader.terms(protoTerm.createTerm(internalKey)); t = termEnum.term(); if (t != null && t.field() == idName // intern'd comparison && internalKey.equals(t.text())) { termDocs.seek(termEnum); while (termDocs.next()) { vals[termDocs.doc()] = fval; } } else { if (notFoundCount < 10) { // collect first 10 not found for logging notFound.add(key); } notFoundCount++; } } } } catch (IOException e) { // log, use defaults SolrCore.log.error("Error loading external value source: " + e); } finally { // swallow exceptions on close so we don't override any // exceptions that happened in the loop if (termDocs != null) try { termDocs.close(); } catch (Exception e) { } if (termEnum != null) try { termEnum.close(); } catch (Exception e) { } try { r.close(); } catch (Exception e) { } } SolrCore.log.info( "Loaded external value source " + fname + (notFoundCount == 0 ? "" : " :" + notFoundCount + " missing keys " + notFound)); return vals; }
@Before
public void setup() throws IOException {
  when(searcher.getIndexReader()).thenReturn(reader);
  when(reader.terms()).thenReturn(terms);
}
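// A hypothetical test built on the @Before setup above; the mock names (searcher, reader,
// terms) come from that setup, but the stubbed term and the assertions are assumptions.
@Test
public void iteratesStubbedTermEnum() throws IOException {
  when(terms.next()).thenReturn(true, false);                 // one term, then exhaustion
  when(terms.term()).thenReturn(new Term("field", "value"));
  TermEnum te = searcher.getIndexReader().terms();
  assertTrue(te.next());
  assertEquals("value", te.term().text());
  assertFalse(te.next());
}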
@Override public boolean reload(String collectionName, String topRankingField) { if (collectionName == null) { return false; } CrescentCollectionHandler collectionHandler = SpringApplicationContext.getBean( "crescentCollectionHandler", CrescentCollectionHandler.class); CrescentCollection collection = collectionHandler.getCrescentCollections().getCrescentCollection(collectionName); if (collection == null) { logger.debug("doesn't Collection Info => {}", collectionName); init(View.Overview); return false; } if (topRankingField == null) { if (collection.getDefaultSearchFields().get(0) != null) { topRankingField = collection.getDefaultSearchFields().get(0).getName(); } else { logger.debug("doesn't defaultSearchField => {}", collectionName); init(View.Overview); return false; } } List<String> fieldName = new ArrayList<String>(); for (CrescentCollectionField field : collection.getFields()) fieldName.add(field.getName()); TopRankingQueue topRankingQueue = new TopRankingQueue(DEFAULT_TOPRANKING_TERM, new RankingTermComparator()); try { Directory directory = FSDirectory.open(new File(collection.getIndexingDirectory())); IndexReader reader = IndexReader.open(directory); TermEnum terms = reader.terms(); int termFreq = 0; int termCount = 0; Term beforeTerm = null; // init term count fieldTermCount.clear(); for (CrescentCollectionField field : collection.getFields()) fieldTermCount.put(field.getName(), 0); topRankingQueue.clear(); while (terms.next()) { Term currTerm = terms.term(); if (beforeTerm == null) { beforeTerm = currTerm; } if (beforeTerm.field() == currTerm.field()) { termCount++; } else { fieldTermCount.put(beforeTerm.field(), termCount); termCount = 1; beforeTerm = currTerm; } TermDocs termDocs = reader.termDocs(currTerm); while (termDocs.next()) { if (currTerm.field().equals(topRankingField)) { RankingTerm e = new RankingTerm(currTerm.text(), currTerm.field(), termDocs.freq()); topRankingQueue.add(e); } } termFreq++; } if (beforeTerm != null) fieldTermCount.put(beforeTerm.field(), termCount); terms.close(); result.put("numOfTerm", termFreq); result.put("numOfDoc", reader.numDocs()); result.put("hasDel", reader.hasDeletions()); result.put("isOptimize", reader.isOptimized()); result.put("indexVersion", reader.getVersion()); result.put("lastModify", new Date(IndexReader.lastModified(directory))); } catch (IOException e) { e.printStackTrace(); return false; } if (topRankingQueue.size() != 0) { topRankingTerms = topRankingQueue.toArray(); Arrays.sort(topRankingTerms); } result.put("collectionName", collectionName); result.put("indexName", collection.getIndexingDirectory()); result.put("numOfField", collection.getFields().size()); result.put("termCount", fieldTermCount); result.put("topRanking", topRankingTerms); result.put("fieldName", fieldName); return true; }
public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("TermDumper [-c|-v value] field <index...>");
    System.exit(1);
  }
  boolean count = false;
  String value = null;
  boolean all = false;
  int i = 0;
  for (; i < args.length; i++) {
    String arg = args[i];
    if ("-h".equals(arg) || "--help".equals(arg)) {
      System.err.println("TermDumper [-c|-v value] field <index...>");
      System.exit(1);
    } else if ("-c".equals(arg) || "--count".equals(arg)) {
      count = true;
    } else if ("-v".equals(arg) || "--value".equals(arg)) {
      value = args[++i];
    } else if ("-a".equals(arg) || "--all".equals(arg)) {
      all = true;
    } else {
      break;
    }
  }
  String field = args[i++];
  java.util.ArrayList<IndexReader> readers = new java.util.ArrayList<IndexReader>(args.length - 1);
  for (; i < args.length; i++) {
    String arg = args[i];
    try {
      IndexReader reader = IndexReader.open(new MMapDirectory(new File(arg)), true);
      readers.add(reader);
    } catch (IOException ioe) {
      System.err.println("Error reading: " + arg);
    }
  }
  for (IndexReader reader : readers) {
    TermDocs termDocs = reader.termDocs();
    TermEnum termEnum = reader.terms(new Term(field));
    try {
      do {
        Term term = termEnum.term();
        if (term == null || !field.equals(term.field())) break;
        if (value == null) {
          if (count) {
            termDocs.seek(termEnum);
            int c = 0;
            for (; termDocs.next(); c++) ;
            System.out.print(c + " ");
          }
          System.out.println(term.text());
        } else if (value.equals(term.text())) {
          termDocs.seek(termEnum);
          while (termDocs.next()) {
            if (all) {
              Document d = reader.document(termDocs.doc());
              System.out.println(termDocs.doc());
              for (Object o : d.getFields()) {
                Field f = (Field) o;
                System.out.println(f.name() + " " + d.get(f.name()));
              }
            } else {
              System.out.println(termDocs.doc() + " " + reader.document(termDocs.doc()).get("url"));
            }
          }
        }
      } while (termEnum.next());
    } finally {
      termDocs.close();
      termEnum.close();
    }
  }
}
/** * Returns a BitSet with true for documents which should be permitted in searchCallback results, * and false for those that should not. */ public BitSet bits(IndexReader reader) throws IOException { long start = System.currentTimeMillis(); BitSet bits = new BitSet(reader.maxDoc()); // TermEnum enumerator = // (null == lowerTerm ? reader.terms(new Term(fieldName, "")) : reader.terms(new // Term(fieldName, lowerTerm))); TermEnum enumerator = (null != lowerTerm ? reader.terms(new Term(fieldName, lowerTerm)) : reader.terms(new Term(fieldName, ""))); // coords = new HashMap(enumerator.docFreq()); try { if (enumerator.term() == null) { return bits; } boolean checkLower = false; if (!includeLower) // make adjustments to set to exclusive checkLower = true; TermDocs termDocs = reader.termDocs(); try { do { Term term = enumerator.term(); if (term != null && term.field().equals(fieldName)) { if (!checkLower || null == lowerTerm || term.text().compareTo(lowerTerm) > 0) { checkLower = false; if (upperTerm != null) { int compare = upperTerm.compareTo(term.text()); /* if beyond the upper term, or is exclusive and * this is equal to the upper term, break out */ if ((compare < 0) || (!includeUpper && compare == 0)) { break; } } /* we have a good term, find the docs */ termDocs.seek(enumerator.term()); while (termDocs.next()) { bits.set(termDocs.doc()); } } } else { break; } } while (enumerator.next()); } finally { termDocs.close(); } } finally { enumerator.close(); } long end = System.currentTimeMillis(); log.info("BoundaryBox Time Taken: " + (end - start)); return bits; }
/** @see LuceneIndexReader#terms(Term) */
public TermEnum terms(Term term) throws IOException {
  return indexReader.terms(term);
}
public List<Pair<String, Integer>> getTopTerms(String field, int count) { ClosingIndexSearcher searcher = null; try { LinkedList<Pair<String, Integer>> answer = new LinkedList<Pair<String, Integer>>(); searcher = getSearcher(indexer); IndexReader reader = searcher.getIndexReader(); TermEnum terms = reader.terms(new Term(field, "")); do { Term term = terms.term(); if (term != null) { if (!term.field().equals(field)) { break; } int freq = terms.docFreq(); Pair<String, Integer> pair = new Pair<String, Integer>(term.text(), Integer.valueOf(freq)); if (answer.size() < count) { if (answer.size() == 0) { answer.add(pair); } else if (answer.get(answer.size() - 1).getSecond().compareTo(pair.getSecond()) >= 0) { answer.add(pair); } else { for (ListIterator<Pair<String, Integer>> it = answer.listIterator(); it.hasNext(); /**/ ) { Pair<String, Integer> test = it.next(); if (test.getSecond().compareTo(pair.getSecond()) < 0) { it.previous(); it.add(pair); break; } } } } else if (answer.get(count - 1).getSecond().compareTo(pair.getSecond()) < 0) { for (ListIterator<Pair<String, Integer>> it = answer.listIterator(); it.hasNext(); /**/ ) { Pair<String, Integer> test = it.next(); if (test.getSecond().compareTo(pair.getSecond()) < 0) { it.previous(); it.add(pair); break; } } answer.removeLast(); } else { // off the end } } } while (terms.next()); terms.close(); return answer; } catch (IOException e) { throw new SearcherException(e); } finally { if (searcher != null) { try { searcher.close(); } catch (IOException e) { throw new SearcherException(e); } } } }
/** * loads multi-value facet data. This method uses a workarea to prepare loading. * * @param fieldName * @param reader * @param listFactory * @param workArea * @throws IOException */ public void load( String fieldName, IndexReader reader, TermListFactory<T> listFactory, WorkArea workArea) throws IOException { long t0 = System.currentTimeMillis(); int maxdoc = reader.maxDoc(); BufferedLoader loader = getBufferedLoader(maxdoc, workArea); TermEnum tenum = null; TermDocs tdoc = null; TermValueList<T> list = (listFactory == null ? (TermValueList<T>) new TermStringList() : listFactory.createTermList()); IntArrayList minIDList = new IntArrayList(); IntArrayList maxIDList = new IntArrayList(); IntArrayList freqList = new IntArrayList(); OpenBitSet bitset = new OpenBitSet(maxdoc + 1); int negativeValueCount = getNegativeValueCount(reader, fieldName.intern()); int t = 0; // current term number list.add(null); minIDList.add(-1); maxIDList.add(-1); freqList.add(0); t++; _overflow = false; try { tdoc = reader.termDocs(); tenum = reader.terms(new Term(fieldName, "")); if (tenum != null) { do { Term term = tenum.term(); if (term == null || !fieldName.equals(term.field())) break; String val = term.text(); if (val != null) { list.add(val); tdoc.seek(tenum); // freqList.add(tenum.docFreq()); // removed because the df doesn't take into account // the num of deletedDocs int df = 0; int minID = -1; int maxID = -1; int valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t; if (tdoc.next()) { df++; int docid = tdoc.doc(); if (!loader.add(docid, valId)) logOverflow(fieldName); minID = docid; bitset.fastSet(docid); while (tdoc.next()) { df++; docid = tdoc.doc(); if (!loader.add(docid, valId)) logOverflow(fieldName); bitset.fastSet(docid); } maxID = docid; } freqList.add(df); minIDList.add(minID); maxIDList.add(maxID); } t++; } while (tenum.next()); } } finally { try { if (tdoc != null) { tdoc.close(); } } finally { if (tenum != null) { tenum.close(); } } } list.seal(); try { _nestedArray.load(maxdoc + 1, loader); } catch (IOException e) { throw e; } catch (Exception e) { throw new RuntimeException("failed to load due to " + e.toString(), e); } this.valArray = list; this.freqs = freqList.toIntArray(); this.minIDs = minIDList.toIntArray(); this.maxIDs = maxIDList.toIntArray(); int doc = 0; while (doc <= maxdoc && !_nestedArray.contains(doc, 0, true)) { ++doc; } if (doc <= maxdoc) { this.minIDs[0] = doc; doc = maxdoc; while (doc > 0 && !_nestedArray.contains(doc, 0, true)) { --doc; } if (doc > 0) { this.maxIDs[0] = doc; } } this.freqs[0] = maxdoc + 1 - (int) bitset.cardinality(); }
public static DocVector[] getCosineSimilarityMatrix(List<String> fileSentences) throws IOException {
  RAMDirectory ramDir = new RAMDirectory();
  FileReader fr = new FileReader(new File("lib/stoplists/en.txt"));
  Analyzer analyzer = new StopAnalyzer(Version.LUCENE_36, fr);
  // Index the full text of every sentence
  IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_36, analyzer));
  for (String s : fileSentences) {
    Document doc1 = new Document();
    StringReader d1reader = new StringReader(s);
    doc1.add(new Field("contents", d1reader, TermVector.YES));
    writer.addDocument(doc1);
  }
  writer.close();
  DocVector[] docs = new DocVector[fileSentences.size()];
  // Build a term vector for each document
  IndexReader RAMreader = IndexReader.open(ramDir);
  Map<String, Integer> terms = new HashMap<String, Integer>();
  // terms(Term) is already positioned at the first "contents" term, so read term() before next()
  TermEnum termEnum = RAMreader.terms(new Term("contents"));
  int pos = 0;
  do {
    Term term = termEnum.term();
    if (term == null || !"contents".equals(term.field())) break;
    terms.put(term.text(), pos++);
  } while (termEnum.next());
  termEnum.close();
  for (int i = 0; i < fileSentences.size(); i++) {
    TermFreqVector[] tfvs = RAMreader.getTermFreqVectors(i);
    docs[i] = new DocVector(terms);
    if (tfvs == null) continue;
    for (TermFreqVector tfv : tfvs) {
      String[] termTexts = tfv.getTerms();
      int[] termFreqs = tfv.getTermFrequencies();
      for (int j = 0; j < termTexts.length; j++) {
        double idfValue = getIDF(RAMreader, termTexts[j]);
        double tfIdfValue = termFreqs[j] * idfValue;
        docs[i].setEntry(termTexts[j], tfIdfValue);
      }
    }
    docs[i].normalize();
  }
  RAMreader.close();
  ramDir.close();
  return docs;
}
/** * Calculates the cosine similarity between two documents. * * @param d1 the first document * @param d2 the second document * @return the cosine similarity * @throws IOException */ public double getCosineSimilarity(String d1, String d2) throws IOException { RAMDirectory ramDir = new RAMDirectory(); FileReader fr = new FileReader(new File(WikiHelper.getSpecificProperty("stopwordFile"))); // Set<String> stopWords = new HashSet<String>(FileUtils.readLines(new File("stop-words.txt"))); Analyzer analyzer = new StopAnalyzer(Version.LUCENE_36, fr); // Index the full text of both documents @SuppressWarnings("deprecation") // IndexWriter writer = new IndexWriter(ramDir, new StandardAnalyzer(Version.LUCENE_36), true, // IndexWriter.MaxFieldLength.UNLIMITED); IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_36, analyzer)); Document doc1 = new Document(); StringReader d1reader = new StringReader(d1); doc1.add(new Field("contents", d1reader, TermVector.YES)); writer.addDocument(doc1); Document doc2 = new Document(); StringReader d2reader = new StringReader(d2); doc2.add(new Field("contents", d2reader, TermVector.YES)); writer.addDocument(doc2); // writer.commit(); writer.close(); DocVector[] docs = new DocVector[2]; // Build a term vector for each document IndexReader RAMreader = IndexReader.open(ramDir); Map<String, Integer> terms = new HashMap<String, Integer>(); TermEnum termEnum = RAMreader.terms(new Term("contents")); // System.out.println(RAMreader.numDocs()); TermFreqVector tfvs1 = RAMreader.getTermFreqVector(0, "contents"); TermFreqVector tfvs2 = RAMreader.getTermFreqVector(1, "contents"); // System.out.println(tfvs1.toString()); if (tfvs1 == null || tfvs2 == null) { return 0.0; } String[] termTexts1 = tfvs1.getTerms(); String[] termTexts2 = tfvs2.getTerms(); // Store the terms and their positions in a hashmap - this represents the vocabulary int pos = 0; for (String term : termTexts1) { terms.put(term, pos++); } for (String term : termTexts2) { if (!terms.containsKey(term)) { terms.put(term, pos++); } } docs[0] = new DocVector(terms); docs[1] = new DocVector(terms); int[] termFreqs1 = tfvs1.getTermFrequencies(); for (int j = 0; j < termTexts1.length; j++) { // System.out.println("termtext:"+termTexts1[j]); double idfValue = getIDF(RAMreader, termTexts1[j]); // System.out.println("idf:"+idfValue); double tfIdfValue = termFreqs1[j] * idfValue; // docs[i].setEntry(termTexts[j], termFreqs[j]); // System.out.println("TF IDF value "+termFreqs[j]+" "+termTexts[j]+" // "+idfValue+"\t"+tfIdfValue); docs[0].setEntry(termTexts1[j], tfIdfValue); } int[] termFreqs2 = tfvs2.getTermFrequencies(); for (int j = 0; j < termTexts2.length; j++) { double idfValue = getIDF(RAMreader, termTexts2[j]); double tfIdfValue = termFreqs2[j] * idfValue; // docs[i].setEntry(termTexts[j], termFreqs[j]); // System.out.println("TF IDF value "+termFreqs[j]+" "+termTexts[j]+" // "+idfValue+"\t"+tfIdfValue); docs[1].setEntry(termTexts2[j], tfIdfValue); } // // // // System.out.println(terms.toString()); // System.out.println(docs[0]); // System.out.println(docs[1]); RAMreader.close(); ramDir.close(); // docs[0].normalize(); // docs[1].normalize(); // Return the cosine similarity of the term vectors return calcCosineSimilarity(docs[0], docs[1]); }
public static void main(String[] args) throws Exception { // the IndexReader object is the main handle that will give you // all the documents, terms and inverted index IndexReader r = IndexReader.open(FSDirectory.open(new File("index"))); // You can figure out the number of documents using the maxDoc() function System.out.println("The number of documents in this index is: " + r.maxDoc()); int i = 0; // You can find out all the terms that have been indexed using the terms() function TermEnum t = r.terms(); while (t.next()) { // Since there are so many terms, let us try printing only term #100000-#100010 if (i > 100000) System.out.println("[" + i + "] " + t.term().text()); if (++i > 100010) break; } // You can create your own query terms by calling the Term constructor, with the field // 'contents' // In the following example, the query term is 'brute' Term te = new Term("contents", "brute"); // You can also quickly find out the number of documents that have term t System.out.println("Number of documents with the word 'brute' is: " + r.docFreq(te)); // You can use the inverted index to find out all the documents that contain the term 'brute' // by using the termDocs function TermDocs td = r.termDocs(te); while (td.next()) { System.out.println( "Document number [" + td.doc() + "] contains the term 'brute' " + td.freq() + " time(s)."); } // You can find the URL of the a specific document number using the document() function // For example, the URL for document number 14191 is: Document d = r.document(14191); String url = d.getFieldable("path") .stringValue(); // the 'path' field of the Document object holds the URL System.out.println(url.replace("%%", "/")); // -------- Now let us use all of the functions above to make something useful -------- // The following bit of code is a worked out example of how to get a bunch of documents // in response to a query and show them (without ranking them according to TF/IDF) Scanner sc = new Scanner(System.in); String str = ""; System.out.print("query> "); while (!(str = sc.nextLine()).equals("quit")) { String[] terms = str.split("\\s+"); for (String word : terms) { Term term = new Term("contents", word); TermDocs tdocs = r.termDocs(term); while (tdocs.next()) { String d_url = r.document(tdocs.doc()).getFieldable("path").stringValue().replace("%%", "/"); System.out.println("[" + tdocs.doc() + "] " + d_url); } } System.out.print("query> "); } }