/**
 * Builds, for every document in the index, the list of terms of the {@code content} field that
 * occur in it.
 *
 * <p>The result is keyed by the stored {@code relative_path} field of each document. Every list
 * entry is the term text immediately followed by its in-document frequency (no separator), e.g.
 * {@code "lucene3"}.
 *
 * <p>I/O failures are reported on stderr and whatever was collected up to that point is returned
 * (best effort), matching the method's original contract of never throwing.
 *
 * @param directory the index directory to read
 * @return map from relative path to "term + frequency" strings; never {@code null}
 */
private static Map<String, List<String>> generate_result(Directory directory) {
  Map<String, List<String>> result_map = new HashMap<String, List<String>>();
  try {
    IndexReader reader = IndexReader.open(directory);
    try {
      TermEnum termEnum = reader.terms();
      try {
        while (termEnum.next()) {
          // Compare the field directly instead of parsing Term.toString(); this also keeps
          // term texts that themselves contain ':' intact.
          if (!"content".equals(termEnum.term().field())) {
            continue;
          }
          String term = termEnum.term().text();
          TermDocs termDocs = reader.termDocs(termEnum.term());
          try {
            while (termDocs.next()) {
              Document doc = reader.document(termDocs.doc());
              String relative_path = doc.get("relative_path");
              List<String> terms = result_map.get(relative_path);
              if (terms == null) {
                terms = new ArrayList<String>();
                result_map.put(relative_path, terms);
              }
              // Always record the term, including the first one seen for this path
              // (the previous version created the list but dropped that first entry).
              terms.add(term + termDocs.freq());
            }
          } finally {
            termDocs.close();
          }
        }
      } finally {
        termEnum.close();
      }
    } finally {
      reader.close();
    }
  } catch (IOException e) {
    // Best effort: report the failure and return the partial result.
    e.printStackTrace();
  }
  return result_map;
}
/**
 * Looks up the first document the reader reports for the given term.
 *
 * @param reader the index reader to query
 * @param term the term whose postings are inspected
 * @return the id of the first matching document, or {@code NO_DOC} if the term has no postings
 * @throws IOException if reading the postings fails
 */
public static int docId(IndexReader reader, Term term) throws IOException {
  final TermDocs postings = reader.termDocs(term);
  try {
    // Only the first posting matters; the enumerator is released in all cases.
    return postings.next() ? postings.doc() : NO_DOC;
  } finally {
    postings.close();
  }
}
/**
 * Tests enumeration through a filtering reader ({@code TestReader}) wrapped around a
 * three-document index.
 *
 * <p>Asserts that every enumerated term contains the letter 'e', that positions for the term
 * "one" are only reported for odd document numbers, and that {@code termDocs(null)} walks
 * documents 0..2 in order with a frequency of 1 each.
 *
 * @throws Exception on error
 */
public void testFilterIndexReader() throws Exception {
  Directory directory = newDirectory();
  IndexWriter writer =
      new IndexWriter(
          directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)));

  Document d1 = new Document();
  d1.add(newField("default", "one two", Field.Store.YES, Field.Index.ANALYZED));
  writer.addDocument(d1);

  Document d2 = new Document();
  d2.add(newField("default", "one three", Field.Store.YES, Field.Index.ANALYZED));
  writer.addDocument(d2);

  Document d3 = new Document();
  d3.add(newField("default", "two four", Field.Store.YES, Field.Index.ANALYZED));
  writer.addDocument(d3);

  writer.close();

  IndexReader reader = new TestReader(IndexReader.open(directory, true));

  TermEnum terms = reader.terms();
  while (terms.next()) {
    assertTrue(terms.term().text().indexOf('e') != -1);
  }
  terms.close();

  TermPositions positions = reader.termPositions(new Term("default", "one"));
  while (positions.next()) {
    assertTrue((positions.doc() % 2) == 1);
  }
  // Release the enumerator like the other two in this test (it was previously left open).
  positions.close();

  int NUM_DOCS = 3;

  // A null term asks the reader to enumerate all documents.
  TermDocs td = reader.termDocs(null);
  for (int i = 0; i < NUM_DOCS; i++) {
    assertTrue(td.next());
    assertEquals(i, td.doc());
    assertEquals(1, td.freq());
  }
  td.close();
  reader.close();
  directory.close();
}
/**
 * Creates a term record by draining the given postings enumerator.
 *
 * <p>For every posting the document number and frequency are stored in {@code termMap}, the
 * frequency is added to {@code totalFreq}, and the frequency is written to {@code vector} at
 * the index of the document number. Documents without the term keep a frequency of 0.
 *
 * @param originTrem the term this record describes
 * @param termDocs postings enumerator for the term; it is consumed but not closed here
 * @param maxDocNum length of the frequency vector; every document number returned by
 *     {@code termDocs} must be smaller than this value
 * @throws IOException if reading the postings fails
 * @throws ArrayIndexOutOfBoundsException if a posting's document number is not below
 *     {@code maxDocNum}
 */
public MyTerm(Term originTrem, TermDocs termDocs, int maxDocNum) throws IOException {
  super();
  this.originTrem = originTrem;
  this.termDocs = termDocs;
  this.totalFreq = 0;
  // A new int[] is already zero-filled, so no explicit clearing pass is needed.
  this.vector = new int[maxDocNum];
  while (termDocs.next()) {
    int docNum = termDocs.doc();
    int freq = termDocs.freq();
    this.termMap.put(docNum, freq);
    this.totalFreq += freq;
    // Fill the dense vector in the same pass instead of re-reading the map afterwards.
    this.vector[docNum] = freq;
  }
}
public void testSkipTo(int indexDivisor) throws IOException { Directory dir = new RAMDirectory(); IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED); Term ta = new Term("content", "aaa"); for (int i = 0; i < 10; i++) addDoc(writer, "aaa aaa aaa aaa"); Term tb = new Term("content", "bbb"); for (int i = 0; i < 16; i++) addDoc(writer, "bbb bbb bbb bbb"); Term tc = new Term("content", "ccc"); for (int i = 0; i < 50; i++) addDoc(writer, "ccc ccc ccc ccc"); // assure that we deal with a single segment writer.optimize(); writer.close(); IndexReader reader = IndexReader.open(dir); reader.setTermInfosIndexDivisor(indexDivisor); assertEquals(indexDivisor, reader.getTermInfosIndexDivisor()); TermDocs tdocs = reader.termDocs(); // without optimization (assumption skipInterval == 16) // with next tdocs.seek(ta); assertTrue(tdocs.next()); assertEquals(0, tdocs.doc()); assertEquals(4, tdocs.freq()); assertTrue(tdocs.next()); assertEquals(1, tdocs.doc()); assertEquals(4, tdocs.freq()); assertTrue(tdocs.skipTo(0)); assertEquals(2, tdocs.doc()); assertTrue(tdocs.skipTo(4)); assertEquals(4, tdocs.doc()); assertTrue(tdocs.skipTo(9)); assertEquals(9, tdocs.doc()); assertFalse(tdocs.skipTo(10)); // without next tdocs.seek(ta); assertTrue(tdocs.skipTo(0)); assertEquals(0, tdocs.doc()); assertTrue(tdocs.skipTo(4)); assertEquals(4, tdocs.doc()); assertTrue(tdocs.skipTo(9)); assertEquals(9, tdocs.doc()); assertFalse(tdocs.skipTo(10)); // exactly skipInterval documents and therefore with optimization // with next tdocs.seek(tb); assertTrue(tdocs.next()); assertEquals(10, tdocs.doc()); assertEquals(4, tdocs.freq()); assertTrue(tdocs.next()); assertEquals(11, tdocs.doc()); assertEquals(4, tdocs.freq()); assertTrue(tdocs.skipTo(5)); assertEquals(12, tdocs.doc()); assertTrue(tdocs.skipTo(15)); assertEquals(15, tdocs.doc()); assertTrue(tdocs.skipTo(24)); assertEquals(24, tdocs.doc()); assertTrue(tdocs.skipTo(25)); assertEquals(25, 
tdocs.doc()); assertFalse(tdocs.skipTo(26)); // without next tdocs.seek(tb); assertTrue(tdocs.skipTo(5)); assertEquals(10, tdocs.doc()); assertTrue(tdocs.skipTo(15)); assertEquals(15, tdocs.doc()); assertTrue(tdocs.skipTo(24)); assertEquals(24, tdocs.doc()); assertTrue(tdocs.skipTo(25)); assertEquals(25, tdocs.doc()); assertFalse(tdocs.skipTo(26)); // much more than skipInterval documents and therefore with optimization // with next tdocs.seek(tc); assertTrue(tdocs.next()); assertEquals(26, tdocs.doc()); assertEquals(4, tdocs.freq()); assertTrue(tdocs.next()); assertEquals(27, tdocs.doc()); assertEquals(4, tdocs.freq()); assertTrue(tdocs.skipTo(5)); assertEquals(28, tdocs.doc()); assertTrue(tdocs.skipTo(40)); assertEquals(40, tdocs.doc()); assertTrue(tdocs.skipTo(57)); assertEquals(57, tdocs.doc()); assertTrue(tdocs.skipTo(74)); assertEquals(74, tdocs.doc()); assertTrue(tdocs.skipTo(75)); assertEquals(75, tdocs.doc()); assertFalse(tdocs.skipTo(76)); // without next tdocs.seek(tc); assertTrue(tdocs.skipTo(5)); assertEquals(26, tdocs.doc()); assertTrue(tdocs.skipTo(40)); assertEquals(40, tdocs.doc()); assertTrue(tdocs.skipTo(57)); assertEquals(57, tdocs.doc()); assertTrue(tdocs.skipTo(74)); assertEquals(74, tdocs.doc()); assertTrue(tdocs.skipTo(75)); assertEquals(75, tdocs.doc()); assertFalse(tdocs.skipTo(76)); tdocs.close(); reader.close(); dir.close(); }
/**
 * Returns the current document number: the value reported by {@code current}, shifted by
 * {@code base}.
 *
 * @return {@code base} plus {@code current.doc()}
 */
public int doc() {
  final int offset = base;
  return offset + current.doc();
}
/**
 * Walk-through of the basic index-inspection calls, followed by a tiny interactive lookup loop.
 *
 * <p>Opens the index in the local directory {@code index}, prints the document count, a slice
 * of the term dictionary, the postings of the term 'brute' in field {@code contents}, and the
 * {@code path} of document 14191. It then reads queries from standard input and lists, for
 * every whitespace-separated word, the documents containing it (unranked). The loop ends on the
 * line {@code quit} or when standard input is exhausted.
 *
 * @param args unused
 * @throws Exception if the index cannot be opened or read
 */
public static void main(String[] args) throws Exception {
  // the IndexReader object is the main handle that will give you
  // all the documents, terms and inverted index
  IndexReader r = IndexReader.open(FSDirectory.open(new File("index")));
  try {
    // You can figure out the number of documents using the maxDoc() function
    System.out.println("The number of documents in this index is: " + r.maxDoc());

    int i = 0;
    // You can find out all the terms that have been indexed using the terms() function
    TermEnum t = r.terms();
    try {
      while (t.next()) {
        // Since there are so many terms, let us try printing only term #100001-#100010
        if (i > 100000) {
          System.out.println("[" + i + "] " + t.term().text());
        }
        if (++i > 100010) {
          break;
        }
      }
    } finally {
      t.close();
    }

    // You can create your own query terms by calling the Term constructor, with the field
    // 'contents'
    // In the following example, the query term is 'brute'
    Term te = new Term("contents", "brute");

    // You can also quickly find out the number of documents that have term t
    System.out.println("Number of documents with the word 'brute' is: " + r.docFreq(te));

    // You can use the inverted index to find out all the documents that contain the term
    // 'brute' by using the termDocs function
    TermDocs td = r.termDocs(te);
    try {
      while (td.next()) {
        System.out.println(
            "Document number ["
                + td.doc()
                + "] contains the term 'brute' "
                + td.freq()
                + " time(s).");
      }
    } finally {
      td.close();
    }

    // You can find the URL of the a specific document number using the document() function
    // For example, the URL for document number 14191 is:
    Document d = r.document(14191);

    // the 'path' field of the Document object holds the URL
    String url = d.getFieldable("path").stringValue();
    System.out.println(url.replace("%%", "/"));

    // -------- Now let us use all of the functions above to make something useful --------
    // The following bit of code is a worked out example of how to get a bunch of documents
    // in response to a query and show them (without ranking them according to TF/IDF)

    // The scanner wraps System.in and is deliberately not closed, so stdin stays usable.
    Scanner sc = new Scanner(System.in);
    System.out.print("query> ");
    // hasNextLine() ends the loop cleanly on EOF instead of failing inside nextLine().
    while (sc.hasNextLine()) {
      String str = sc.nextLine();
      if (str.equals("quit")) {
        break;
      }
      String[] terms = str.split("\\s+");
      for (String word : terms) {
        Term term = new Term("contents", word);
        TermDocs tdocs = r.termDocs(term);
        try {
          while (tdocs.next()) {
            String d_url =
                r.document(tdocs.doc()).getFieldable("path").stringValue().replace("%%", "/");
            System.out.println("[" + tdocs.doc() + "] " + d_url);
          }
        } finally {
          tdocs.close();
        }
      }
      System.out.print("query> ");
    }
  } finally {
    r.close();
  }
}
/**
 * Command-line term dumper: {@code TermDumper [-c|-v value] field <index...>}.
 *
 * <p>Options (parsed until the first non-option argument, which is taken as the field name):
 *
 * <ul>
 *   <li>{@code -h}, {@code --help}: print usage and exit with status 1
 *   <li>{@code -c}, {@code --count}: prefix every term with the number of documents containing it
 *   <li>{@code -v}, {@code --value} <i>value</i>: instead of listing terms, list the documents
 *       containing exactly this term (the misspelt {@code --vaue} is still accepted)
 *   <li>{@code -a}, {@code --all}: with {@code -v}, print every field of each matching document
 *       instead of only its {@code url} field
 * </ul>
 *
 * <p>Index directories that cannot be opened are reported on stderr and skipped.
 *
 * @param args command-line arguments as described above
 * @throws Exception if reading an opened index fails
 */
public static void main(String[] args) throws Exception {
  final String usage = "TermDumper [-c|-v value] field <index...>";
  if (args.length < 2) {
    System.err.println(usage);
    System.exit(1);
  }

  boolean count = false;
  String value = null;
  boolean all = false;

  int i = 0;
  for (; i < args.length; i++) {
    String arg = args[i];
    if ("-h".equals(arg) || "--help".equals(arg)) {
      System.err.println(usage);
      System.exit(1);
    } else if ("-c".equals(arg) || "--count".equals(arg)) {
      count = true;
    } else if ("-v".equals(arg) || "--value".equals(arg) || "--vaue".equals(arg)) {
      // "--vaue" was the historical (misspelt) long option; keep it for compatibility.
      if (i + 1 >= args.length) {
        // Option given without its value: report usage instead of indexing past the array.
        System.err.println(usage);
        System.exit(1);
      }
      value = args[++i];
    } else if ("-a".equals(arg) || "--all".equals(arg)) {
      all = true;
    } else {
      break;
    }
  }

  if (i >= args.length) {
    // Only options were supplied; there is no field name to read.
    System.err.println(usage);
    System.exit(1);
  }
  String field = args[i++];

  java.util.ArrayList<IndexReader> readers =
      new java.util.ArrayList<IndexReader>(args.length - 1);
  try {
    for (; i < args.length; i++) {
      String arg = args[i];
      try {
        IndexReader reader = IndexReader.open(new MMapDirectory(new File(arg)), true);
        readers.add(reader);
      } catch (IOException ioe) {
        System.err.println("Error reading: " + arg);
      }
    }

    for (IndexReader reader : readers) {
      TermDocs termDocs = reader.termDocs();
      try {
        // Positions the enumeration at the first term of the requested field (if any).
        TermEnum termEnum = reader.terms(new Term(field));
        try {
          do {
            Term term = termEnum.term();
            if (term == null || !field.equals(term.field())) {
              break; // ran past the last term of the field
            }

            if (value == null) {
              if (count) {
                termDocs.seek(termEnum);
                int c = 0;
                while (termDocs.next()) {
                  c++;
                }
                System.out.print(c + " ");
              }
              System.out.println(term.text());
            } else if (value.equals(term.text())) {
              termDocs.seek(termEnum);
              while (termDocs.next()) {
                if (all) {
                  Document d = reader.document(termDocs.doc());
                  System.out.println(termDocs.doc());
                  for (Object o : d.getFields()) {
                    Field f = (Field) o;
                    System.out.println(f.name() + " " + d.get(f.name()));
                  }
                } else {
                  System.out.println(
                      termDocs.doc() + " " + reader.document(termDocs.doc()).get("url"));
                }
              }
            }
          } while (termEnum.next());
        } finally {
          termEnum.close();
        }
      } finally {
        termDocs.close();
      }
    }
  } finally {
    // Release every index that was opened, even if dumping one of them failed.
    for (IndexReader reader : readers) {
      try {
        reader.close();
      } catch (IOException ioe) {
        System.err.println("Error closing reader: " + ioe.getMessage());
      }
    }
  }
}