public void testUpdateSameDoc() throws Exception {
  final Directory dir = newDirectory();

  final LineFileDocs docs = new LineFileDocs(random());
  for (int r = 0; r < 3; r++) {
    final IndexWriter w =
        new IndexWriter(
            dir, newIndexWriterConfig(new MockAnalyzer(random())).setMaxBufferedDocs(2));
    final int numUpdates = atLeast(20);
    int numThreads = TestUtil.nextInt(random(), 2, 6);
    IndexingThread[] threads = new IndexingThread[numThreads];
    for (int i = 0; i < numThreads; i++) {
      threads[i] = new IndexingThread(docs, w, numUpdates);
      threads[i].start();
    }

    for (int i = 0; i < numThreads; i++) {
      threads[i].join();
    }

    w.close();
  }

  IndexReader open = DirectoryReader.open(dir);
  assertEquals(1, open.numDocs());
  open.close();
  docs.close();
  dir.close();
}
private void doTest(Random random, PrintWriter out, boolean useCompoundFiles, int MAX_DOCS)
    throws Exception {
  Directory directory = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random);
  IndexWriterConfig conf = newIndexWriterConfig(analyzer);
  final MergePolicy mp = conf.getMergePolicy();
  mp.setNoCFSRatio(useCompoundFiles ? 1.0 : 0.0);
  IndexWriter writer = new IndexWriter(directory, conf);
  if (VERBOSE) {
    System.out.println("TEST: now build index MAX_DOCS=" + MAX_DOCS);
  }

  for (int j = 0; j < MAX_DOCS; j++) {
    Document d = new Document();
    d.add(newTextField(PRIORITY_FIELD, HIGH_PRIORITY, Field.Store.YES));
    d.add(newTextField(ID_FIELD, Integer.toString(j), Field.Store.YES));
    writer.addDocument(d);
  }
  writer.close();

  // try a search without OR
  IndexReader reader = DirectoryReader.open(directory);
  IndexSearcher searcher = newSearcher(reader);

  Query query = new TermQuery(new Term(PRIORITY_FIELD, HIGH_PRIORITY));
  out.println("Query: " + query.toString(PRIORITY_FIELD));
  if (VERBOSE) {
    System.out.println("TEST: search query=" + query);
  }

  final Sort sort = new Sort(SortField.FIELD_SCORE, new SortField(ID_FIELD, SortField.Type.INT));

  ScoreDoc[] hits = searcher.search(query, null, MAX_DOCS, sort).scoreDocs;
  printHits(out, hits, searcher);
  checkHits(hits, MAX_DOCS, searcher);

  // try a new search with OR
  searcher = newSearcher(reader);
  hits = null;

  BooleanQuery booleanQuery = new BooleanQuery();
  booleanQuery.add(
      new TermQuery(new Term(PRIORITY_FIELD, HIGH_PRIORITY)), BooleanClause.Occur.SHOULD);
  booleanQuery.add(
      new TermQuery(new Term(PRIORITY_FIELD, MED_PRIORITY)), BooleanClause.Occur.SHOULD);
  out.println("Query: " + booleanQuery.toString(PRIORITY_FIELD));

  hits = searcher.search(booleanQuery, null, MAX_DOCS, sort).scoreDocs;
  printHits(out, hits, searcher);
  checkHits(hits, MAX_DOCS, searcher);

  reader.close();
  directory.close();
}
public void testTermDocsEnum() throws Exception {
  Directory dir = newDirectory();
  IndexWriter w =
      new IndexWriter(
          dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())));
  Document d = new Document();
  d.add(newStringField("f", "j", Field.Store.NO));
  w.addDocument(d);
  w.commit();
  w.addDocument(d);

  IndexReader r = w.getReader();
  w.close();

  DocsEnum de = MultiFields.getTermDocsEnum(r, null, "f", new BytesRef("j"));
  assertEquals(0, de.nextDoc());
  assertEquals(1, de.nextDoc());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, de.nextDoc());

  r.close();
  dir.close();
}
public void testSeparateEnums() throws Exception {
  Directory dir = newDirectory();
  IndexWriter w =
      new IndexWriter(
          dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())));
  Document d = new Document();
  d.add(newStringField("f", "j", Field.Store.NO));
  w.addDocument(d);
  w.commit();
  w.addDocument(d);

  IndexReader r = w.getReader();
  w.close();

  DocsEnum d1 = _TestUtil.docs(random(), r, "f", new BytesRef("j"), null, null, 0);
  DocsEnum d2 = _TestUtil.docs(random(), r, "f", new BytesRef("j"), null, null, 0);
  assertEquals(0, d1.nextDoc());
  assertEquals(0, d2.nextDoc());

  r.close();
  dir.close();
}
// public SortingReader(IndexReader oldReader, int[] oldToNew) { // TODO MC
public SortingReader(IndexReader oldReader, DocScore[] newToOld) { // TODO MC
  super(oldReader);
  this.newToOld = newToOld;

  this.oldToNew = new int[oldReader.maxDoc()];
  int newDoc = 0;
  while (newDoc < newToOld.length) {
    int oldDoc = newToOld[newDoc].doc;
    oldToNew[oldDoc] = newDoc;
    newDoc++;
  }
}
public void sort(File directory) throws IOException {
  LOG.info("IndexSorter: starting.");
  Date start = new Date();
  int termIndexInterval = getConf().getInt("indexer.termIndexInterval", 128);
  IndexReader reader = IndexReader.open(new File(directory, "index"));
  Searcher searcher =
      new IndexSearcher(new File(directory, "index").getAbsolutePath()); // TODO MC

  SortingReader sorter = new SortingReader(reader, newToOld(reader, searcher)); // TODO MC
  IndexWriter writer = new IndexWriter(new File(directory, "index-sorted"), null, true);
  writer.setTermIndexInterval(termIndexInterval);
  writer.setUseCompoundFile(false);
  writer.addIndexes(new IndexReader[] {sorter});
  writer.close();
  Date end = new Date();
  LOG.info("IndexSorter: done, " + (end.getTime() - start.getTime()) + " total milliseconds");
}
public void testRandom() throws Exception {
  int num = atLeast(2);
  for (int iter = 0; iter < num; iter++) {
    if (VERBOSE) {
      System.out.println("TEST: iter=" + iter);
    }

    Directory dir = newDirectory();

    IndexWriter w =
        new IndexWriter(
            dir,
            newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))
                .setMergePolicy(NoMergePolicy.COMPOUND_FILES));
    _TestUtil.keepFullyDeletedSegments(w);

    Map<BytesRef, List<Integer>> docs = new HashMap<BytesRef, List<Integer>>();
    Set<Integer> deleted = new HashSet<Integer>();
    List<BytesRef> terms = new ArrayList<BytesRef>();

    int numDocs = _TestUtil.nextInt(random(), 1, 100 * RANDOM_MULTIPLIER);
    Document doc = new Document();
    Field f = newStringField("field", "", Field.Store.NO);
    doc.add(f);
    Field id = newStringField("id", "", Field.Store.NO);
    doc.add(id);

    boolean onlyUniqueTerms = random().nextBoolean();
    if (VERBOSE) {
      System.out.println("TEST: onlyUniqueTerms=" + onlyUniqueTerms + " numDocs=" + numDocs);
    }
    Set<BytesRef> uniqueTerms = new HashSet<BytesRef>();
    for (int i = 0; i < numDocs; i++) {
      if (!onlyUniqueTerms && random().nextBoolean() && terms.size() > 0) {
        // re-use existing term
        BytesRef term = terms.get(random().nextInt(terms.size()));
        docs.get(term).add(i);
        f.setStringValue(term.utf8ToString());
      } else {
        String s = _TestUtil.randomUnicodeString(random(), 10);
        BytesRef term = new BytesRef(s);
        if (!docs.containsKey(term)) {
          docs.put(term, new ArrayList<Integer>());
        }
        docs.get(term).add(i);
        terms.add(term);
        uniqueTerms.add(term);
        f.setStringValue(s);
      }
      id.setStringValue("" + i);
      w.addDocument(doc);
      if (random().nextInt(4) == 1) {
        w.commit();
      }
      if (i > 0 && random().nextInt(20) == 1) {
        int delID = random().nextInt(i);
        deleted.add(delID);
        w.deleteDocuments(new Term("id", "" + delID));
        if (VERBOSE) {
          System.out.println("TEST: delete " + delID);
        }
      }
    }

    if (VERBOSE) {
      List<BytesRef> termsList = new ArrayList<BytesRef>(uniqueTerms);
      Collections.sort(termsList, BytesRef.getUTF8SortedAsUTF16Comparator());
      System.out.println("TEST: terms in UTF16 order:");
      for (BytesRef b : termsList) {
        System.out.println(" " + UnicodeUtil.toHexString(b.utf8ToString()) + " " + b);
        for (int docID : docs.get(b)) {
          if (deleted.contains(docID)) {
            System.out.println(" " + docID + " (deleted)");
          } else {
            System.out.println(" " + docID);
          }
        }
      }
    }

    IndexReader reader = w.getReader();
    w.close();
    if (VERBOSE) {
      System.out.println("TEST: reader=" + reader);
    }

    Bits liveDocs = MultiFields.getLiveDocs(reader);
    for (int delDoc : deleted) {
      assertFalse(liveDocs.get(delDoc));
    }

    for (int i = 0; i < 100; i++) {
      BytesRef term = terms.get(random().nextInt(terms.size()));
      if (VERBOSE) {
        System.out.println(
            "TEST: seek term=" + UnicodeUtil.toHexString(term.utf8ToString()) + " " + term);
      }

      DocsEnum docsEnum = _TestUtil.docs(random(), reader, "field", term, liveDocs, null, 0);
      assertNotNull(docsEnum);

      for (int docID : docs.get(term)) {
        if (!deleted.contains(docID)) {
          assertEquals(docID, docsEnum.nextDoc());
        }
      }
      assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsEnum.nextDoc());
    }

    reader.close();
    dir.close();
  }
}
@Test
public void testRollingUpdates() throws Exception {
  Random random = new Random(random().nextLong());
  final BaseDirectoryWrapper dir = newDirectory();

  // test checks for no unref'ed files with the IW helper method, which isn't aware of
  // "tried to delete files"
  if (dir instanceof MockDirectoryWrapper) {
    ((MockDirectoryWrapper) dir).setEnableVirusScanner(false);
  }

  final LineFileDocs docs = new LineFileDocs(random, true);

  // provider.register(new MemoryCodec());
  if (random().nextBoolean()) {
    Codec.setDefault(
        TestUtil.alwaysPostingsFormat(
            new MemoryPostingsFormat(random().nextBoolean(), random.nextFloat())));
  }

  MockAnalyzer analyzer = new MockAnalyzer(random());
  analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));

  final IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(analyzer));
  final int SIZE = atLeast(20);
  int id = 0;
  IndexReader r = null;
  IndexSearcher s = null;
  final int numUpdates =
      (int)
          (SIZE * (2 + (TEST_NIGHTLY ? 200 * random().nextDouble() : 5 * random().nextDouble())));
  if (VERBOSE) {
    System.out.println("TEST: numUpdates=" + numUpdates);
  }
  int updateCount = 0;

  // TODO: sometimes update ids not in order...
  for (int docIter = 0; docIter < numUpdates; docIter++) {
    final Document doc = docs.nextDoc();
    final String myID = Integer.toString(id);
    if (id == SIZE - 1) {
      id = 0;
    } else {
      id++;
    }
    if (VERBOSE) {
      System.out.println(" docIter=" + docIter + " id=" + id);
    }
    ((Field) doc.getField("docid")).setStringValue(myID);

    Term idTerm = new Term("docid", myID);

    final boolean doUpdate;
    if (s != null && updateCount < SIZE) {
      TopDocs hits = s.search(new TermQuery(idTerm), 1);
      assertEquals(1, hits.totalHits);
      doUpdate = !w.tryDeleteDocument(r, hits.scoreDocs[0].doc);
      if (VERBOSE) {
        if (doUpdate) {
          System.out.println(" tryDeleteDocument failed");
        } else {
          System.out.println(" tryDeleteDocument succeeded");
        }
      }
    } else {
      doUpdate = true;
      if (VERBOSE) {
        System.out.println(" no searcher: doUpdate=true");
      }
    }

    updateCount++;

    if (doUpdate) {
      if (random().nextBoolean()) {
        w.updateDocument(idTerm, doc);
      } else {
        // It's OK to not be atomic for this test (no separate thread reopening readers):
        w.deleteDocuments(new TermQuery(idTerm));
        w.addDocument(doc);
      }
    } else {
      w.addDocument(doc);
    }

    if (docIter >= SIZE && random().nextInt(50) == 17) {
      if (r != null) {
        r.close();
      }

      final boolean applyDeletions = random().nextBoolean();

      if (VERBOSE) {
        System.out.println("TEST: reopen applyDeletions=" + applyDeletions);
      }

      r = w.getReader(applyDeletions);
      if (applyDeletions) {
        s = newSearcher(r);
      } else {
        s = null;
      }
      assertTrue(
          "applyDeletions=" + applyDeletions + " r.numDocs()=" + r.numDocs() + " vs SIZE=" + SIZE,
          !applyDeletions || r.numDocs() == SIZE);
      updateCount = 0;
    }
  }

  if (r != null) {
    r.close();
  }

  w.commit();
  assertEquals(SIZE, w.numDocs());

  w.close();

  TestIndexWriter.assertNoUnreferencedFiles(dir, "leftover files after rolling updates");

  docs.close();

  // LUCENE-4455:
  SegmentInfos infos = SegmentInfos.readLatestCommit(dir);
  long totalBytes = 0;
  for (SegmentCommitInfo sipc : infos) {
    totalBytes += sipc.sizeInBytes();
  }
  long totalBytes2 = 0;
  for (String fileName : dir.listAll()) {
    if (IndexFileNames.CODEC_FILE_PATTERN.matcher(fileName).matches()) {
      totalBytes2 += dir.fileLength(fileName);
    }
  }
  assertEquals(totalBytes2, totalBytes);
  dir.close();
}
public static void main(String[] args) throws Exception {
  // the IndexReader object is the main handle that will give you
  // all the documents, terms and inverted index
  IndexReader r = IndexReader.open(FSDirectory.open(new File("index")));

  // You can figure out the number of documents using the maxDoc() function
  System.out.println("The number of documents in this index is: " + r.maxDoc());

  int i = 0;
  // You can find out all the terms that have been indexed using the terms() function
  TermEnum t = r.terms();
  while (t.next()) {
    // Since there are so many terms, let us try printing only term #100000-#100010
    if (i > 100000) System.out.println("[" + i + "] " + t.term().text());
    if (++i > 100010) break;
  }

  // You can create your own query terms by calling the Term constructor, with the field
  // 'contents'
  // In the following example, the query term is 'brute'
  Term te = new Term("contents", "brute");

  // You can also quickly find out the number of documents that have term t
  System.out.println("Number of documents with the word 'brute' is: " + r.docFreq(te));

  // You can use the inverted index to find out all the documents that contain the term 'brute'
  // by using the termDocs function
  TermDocs td = r.termDocs(te);
  while (td.next()) {
    System.out.println(
        "Document number ["
            + td.doc()
            + "] contains the term 'brute' "
            + td.freq()
            + " time(s).");
  }

  // You can find the URL of a specific document number using the document() function
  // For example, the URL for document number 14191 is:
  Document d = r.document(14191);
  String url =
      d.getFieldable("path").stringValue(); // the 'path' field of the Document object holds the URL
  System.out.println(url.replace("%%", "/"));

  // -------- Now let us use all of the functions above to make something useful --------
  // The following bit of code is a worked out example of how to get a bunch of documents
  // in response to a query and show them (without ranking them according to TF/IDF)
  Scanner sc = new Scanner(System.in);
  String str = "";
  System.out.print("query> ");
  while (!(str = sc.nextLine()).equals("quit")) {
    String[] terms = str.split("\\s+");
    for (String word : terms) {
      Term term = new Term("contents", word);
      TermDocs tdocs = r.termDocs(term);
      while (tdocs.next()) {
        String d_url =
            r.document(tdocs.doc()).getFieldable("path").stringValue().replace("%%", "/");
        System.out.println("[" + tdocs.doc() + "] " + d_url);
      }
    }
    System.out.print("query> ");
  }
}
// private static int[] oldToNew(IndexReader reader, Searcher searcher) throws IOException {
private static DocScore[] newToOld(IndexReader reader, Searcher searcher) throws IOException {
  int readerMax = reader.maxDoc();

  DocScore[] newToOld = new DocScore[readerMax];

  // use site, an indexed, un-tokenized field to get boost
  // byte[] boosts = reader.norms("site"); TODO MC
  /* TODO MC */
  Document docMeta;
  Pattern includes = Pattern.compile("\\|");
  String value = NutchConfiguration.create().get(INCLUDE_EXTENSIONS_KEY, "");
  String[] includeExtensions = includes.split(value);
  Hashtable<String, Boolean> validExtensions = new Hashtable<String, Boolean>();
  for (int i = 0; i < includeExtensions.length; i++) {
    validExtensions.put(includeExtensions[i], true);
    System.out.println("extension boosted " + includeExtensions[i]);
  }
  /* TODO MC */

  for (int oldDoc = 0; oldDoc < readerMax; oldDoc++) {
    float score;
    if (reader.isDeleted(oldDoc)) {
      // score = 0.0f;
      score = -1f; // TODO MC
    } else {
      // score = Similarity.decodeNorm(boosts[oldDoc]); TODO MC
      /* TODO MC */
      docMeta = searcher.doc(oldDoc);
      if (validExtensions.get(docMeta.get("subType")) == null) {
        // searched extensions will have higher scores
        score = -0.5f;
      } else {
        score = Integer.parseInt(docMeta.get("inlinks"));
        /*
        if (score == 0) {
          score = 0.001f; // TODO MC - to not erase
        }
        */
      }
      /* TODO MC */
      // System.out.println("Score for old document " + oldDoc + " is " + score
      //     + " and type " + docMeta.get("subType")); // TODO MC debug remove
    }
    DocScore docScore = new DocScore();
    docScore.doc = oldDoc;
    docScore.score = score;
    newToOld[oldDoc] = docScore;
  }

  System.out.println("Sorting " + newToOld.length + " documents.");
  Arrays.sort(newToOld);
  // HeapSorter.sort(newToOld); // TODO MC - due to the lack of space

  /* TODO MC
  int[] oldToNew = new int[readerMax];
  for (int newDoc = 0; newDoc < readerMax; newDoc++) {
    DocScore docScore = newToOld[newDoc];
    // oldToNew[docScore.oldDoc] = docScore.score > 0.0f ? newDoc : -1; // TODO MC
    oldToNew[docScore.oldDoc] = newDoc; // TODO MC
  }
  */

  /* TODO MC *
  for (int newDoc = 0; newDoc < readerMax; newDoc++) {
    DocScore docScore = newToOld[newDoc];
    System.out.println("Score for new document " + newDoc + " is " + docScore.score); // TODO MC debug remove
  }
  * TODO MC */

  // return oldToNew; TODO MC
  return newToOld; // TODO MC
}
public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("TermDumper [-c|-v value] field <index...>");
    System.exit(1);
  }

  boolean count = false;
  String value = null;
  boolean all = false;

  int i = 0;
  for (; i < args.length; i++) {
    String arg = args[i];
    if ("-h".equals(arg) || "--help".equals(arg)) {
      System.err.println("TermDumper [-c|-v value] field <index...>");
      System.exit(1);
    } else if ("-c".equals(arg) || "--count".equals(arg)) {
      count = true;
    } else if ("-v".equals(arg) || "--value".equals(arg)) {
      value = args[++i];
    } else if ("-a".equals(arg) || "--all".equals(arg)) {
      all = true;
    } else {
      break;
    }
  }

  String field = args[i++];

  java.util.ArrayList<IndexReader> readers =
      new java.util.ArrayList<IndexReader>(args.length - 1);
  for (; i < args.length; i++) {
    String arg = args[i];
    try {
      IndexReader reader = IndexReader.open(new MMapDirectory(new File(arg)), true);
      readers.add(reader);
    } catch (IOException ioe) {
      System.err.println("Error reading: " + arg);
    }
  }

  for (IndexReader reader : readers) {
    TermDocs termDocs = reader.termDocs();
    TermEnum termEnum = reader.terms(new Term(field));

    try {
      do {
        Term term = termEnum.term();

        if (term == null || !field.equals(term.field())) break;

        if (value == null) {
          if (count) {
            termDocs.seek(termEnum);

            int c = 0;
            for (; termDocs.next(); c++) ;

            System.out.print(c + " ");
          }

          System.out.println(term.text());
        } else if (value.equals(term.text())) {
          termDocs.seek(termEnum);

          while (termDocs.next()) {
            if (all) {
              Document d = reader.document(termDocs.doc());
              System.out.println(termDocs.doc());
              for (Object o : d.getFields()) {
                Field f = (Field) o;
                System.out.println(f.name() + " " + d.get(f.name()));
              }
            } else {
              System.out.println(
                  termDocs.doc() + " " + reader.document(termDocs.doc()).get("url"));
            }
          }
        }
      } while (termEnum.next());
    } finally {
      termDocs.close();
      termEnum.close();
    }
  }
}