public void testUpdateSameDoc() throws Exception { final Directory dir = newDirectory(); final LineFileDocs docs = new LineFileDocs(random()); for (int r = 0; r < 3; r++) { final IndexWriter w = new IndexWriter( dir, newIndexWriterConfig(new MockAnalyzer(random())).setMaxBufferedDocs(2)); final int numUpdates = atLeast(20); int numThreads = TestUtil.nextInt(random(), 2, 6); IndexingThread[] threads = new IndexingThread[numThreads]; for (int i = 0; i < numThreads; i++) { threads[i] = new IndexingThread(docs, w, numUpdates); threads[i].start(); } for (int i = 0; i < numThreads; i++) { threads[i].join(); } w.close(); } IndexReader open = DirectoryReader.open(dir); assertEquals(1, open.numDocs()); open.close(); docs.close(); dir.close(); }
/* * listTermVectors displays the term vectors for all of the fields * in a document in an index (specified by reader). */ static void listTermVectors(IndexReader reader, String docidString) throws IOException { System.out.println("\nTermVector: docid " + docidString); int docid = Integer.parseInt(docidString); if ((docid < 0) || (docid >= reader.numDocs())) { System.out.println("ERROR: " + docidString + " is a bad document id."); return; } ; /* * Iterate over the fields in this document. */ Fields fields = reader.getTermVectors(docid); Iterator<String> fieldIterator = fields.iterator(); while (fieldIterator.hasNext()) { String fieldName = fieldIterator.next(); System.out.println(" Field: " + fieldName); Terms terms = fields.terms(fieldName); termVectorDisplay(terms); } ; }
/* * listPostings displays the first n postings for a term in a * field in an index (specified by reader). Set n to MAX_VALUE * to display all postings. */ static void listPostings(IndexReader reader, String termString, String field, Integer n) throws IOException { System.out.println("\nPostings: " + termString + " " + field); /* * Prepare to access the index. */ BytesRef termBytes = new BytesRef(termString); Term term = new Term(field, termBytes); Bits liveDocs = MultiFields.getLiveDocs(reader); /* * Lookup the collection term frequency (ctf). */ long df = reader.docFreq(term); System.out.println("\tdf: " + df); long ctf = reader.totalTermFreq(term); System.out.println("\tctf: " + ctf); if (df < 1) return; /* * Lookup the inverted list. */ DocsAndPositionsEnum postings = MultiFields.getTermPositionsEnum(reader, liveDocs, field, termBytes); /* * Iterate through the first n postings. */ long count = 0; while ((count < n) && (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS)) { System.out.println("\tdocid: " + postings.docID()); int tf = postings.freq(); System.out.println("\ttf: " + tf); System.out.print("\tPositions: "); for (int j = 0; j < tf; j++) { int pos = postings.nextPosition(); System.out.print(pos + " "); } System.out.println(""); count++; } ; return; }
private void doTest(Random random, PrintWriter out, boolean useCompoundFiles, int MAX_DOCS) throws Exception { Directory directory = newDirectory(); Analyzer analyzer = new MockAnalyzer(random); IndexWriterConfig conf = newIndexWriterConfig(analyzer); final MergePolicy mp = conf.getMergePolicy(); mp.setNoCFSRatio(useCompoundFiles ? 1.0 : 0.0); IndexWriter writer = new IndexWriter(directory, conf); if (VERBOSE) { System.out.println("TEST: now build index MAX_DOCS=" + MAX_DOCS); } for (int j = 0; j < MAX_DOCS; j++) { Document d = new Document(); d.add(newTextField(PRIORITY_FIELD, HIGH_PRIORITY, Field.Store.YES)); d.add(newTextField(ID_FIELD, Integer.toString(j), Field.Store.YES)); writer.addDocument(d); } writer.close(); // try a search without OR IndexReader reader = DirectoryReader.open(directory); IndexSearcher searcher = newSearcher(reader); Query query = new TermQuery(new Term(PRIORITY_FIELD, HIGH_PRIORITY)); out.println("Query: " + query.toString(PRIORITY_FIELD)); if (VERBOSE) { System.out.println("TEST: search query=" + query); } final Sort sort = new Sort(SortField.FIELD_SCORE, new SortField(ID_FIELD, SortField.Type.INT)); ScoreDoc[] hits = searcher.search(query, null, MAX_DOCS, sort).scoreDocs; printHits(out, hits, searcher); checkHits(hits, MAX_DOCS, searcher); // try a new search with OR searcher = newSearcher(reader); hits = null; BooleanQuery booleanQuery = new BooleanQuery(); booleanQuery.add( new TermQuery(new Term(PRIORITY_FIELD, HIGH_PRIORITY)), BooleanClause.Occur.SHOULD); booleanQuery.add( new TermQuery(new Term(PRIORITY_FIELD, MED_PRIORITY)), BooleanClause.Occur.SHOULD); out.println("Query: " + booleanQuery.toString(PRIORITY_FIELD)); hits = searcher.search(booleanQuery, null, MAX_DOCS, sort).scoreDocs; printHits(out, hits, searcher); checkHits(hits, MAX_DOCS, searcher); reader.close(); directory.close(); }
/* * listTermVectorField displays the term vector for a field in * a document in an index (specified by reader). */ static void listTermVectorField(IndexReader reader, String docidString, String field) throws IOException { System.out.println("\nTermVector: docid " + docidString + ", field " + field); int docid = Integer.parseInt(docidString); if ((docid < 0) || (docid >= reader.numDocs())) { System.out.println("ERROR: " + docidString + " is a bad document id."); return; } ; Terms terms = reader.getTermVector(docid, field); termVectorDisplay(terms); }
public void testTermDocsEnum() throws Exception { Directory dir = newDirectory(); IndexWriter w = new IndexWriter( dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))); Document d = new Document(); d.add(newStringField("f", "j", Field.Store.NO)); w.addDocument(d); w.commit(); w.addDocument(d); IndexReader r = w.getReader(); w.close(); DocsEnum de = MultiFields.getTermDocsEnum(r, null, "f", new BytesRef("j")); assertEquals(0, de.nextDoc()); assertEquals(1, de.nextDoc()); assertEquals(DocIdSetIterator.NO_MORE_DOCS, de.nextDoc()); r.close(); dir.close(); }
public void testSeparateEnums() throws Exception { Directory dir = newDirectory(); IndexWriter w = new IndexWriter( dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))); Document d = new Document(); d.add(newStringField("f", "j", Field.Store.NO)); w.addDocument(d); w.commit(); w.addDocument(d); IndexReader r = w.getReader(); w.close(); DocsEnum d1 = _TestUtil.docs(random(), r, "f", new BytesRef("j"), null, null, 0); DocsEnum d2 = _TestUtil.docs(random(), r, "f", new BytesRef("j"), null, null, 0); assertEquals(0, d1.nextDoc()); assertEquals(0, d2.nextDoc()); r.close(); dir.close(); }
// public SortingReader(IndexReader oldReader, int[] oldToNew) { // TODO MC public SortingReader(IndexReader oldReader, DocScore[] newToOld) { // TODO MC super(oldReader); this.newToOld = newToOld; this.oldToNew = new int[oldReader.maxDoc()]; int newDoc = 0; while (newDoc < newToOld.length) { int oldDoc = newToOld[newDoc].doc; oldToNew[oldDoc] = newDoc; newDoc++; } }
public void sort(File directory) throws IOException { LOG.info("IndexSorter: starting."); Date start = new Date(); int termIndexInterval = getConf().getInt("indexer.termIndexInterval", 128); IndexReader reader = IndexReader.open(new File(directory, "index")); Searcher searcher = new IndexSearcher(new File(directory, "index").getAbsolutePath()); // TODO MC SortingReader sorter = new SortingReader(reader, newToOld(reader, searcher)); // TODO MC IndexWriter writer = new IndexWriter(new File(directory, "index-sorted"), null, true); writer.setTermIndexInterval(termIndexInterval); writer.setUseCompoundFile(false); writer.addIndexes(new IndexReader[] {sorter}); writer.close(); Date end = new Date(); LOG.info("IndexSorter: done, " + (end.getTime() - start.getTime()) + " total milliseconds"); }
public void testRandom() throws Exception { int num = atLeast(2); for (int iter = 0; iter < num; iter++) { if (VERBOSE) { System.out.println("TEST: iter=" + iter); } Directory dir = newDirectory(); IndexWriter w = new IndexWriter( dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())) .setMergePolicy(NoMergePolicy.COMPOUND_FILES)); _TestUtil.keepFullyDeletedSegments(w); Map<BytesRef, List<Integer>> docs = new HashMap<BytesRef, List<Integer>>(); Set<Integer> deleted = new HashSet<Integer>(); List<BytesRef> terms = new ArrayList<BytesRef>(); int numDocs = _TestUtil.nextInt(random(), 1, 100 * RANDOM_MULTIPLIER); Document doc = new Document(); Field f = newStringField("field", "", Field.Store.NO); doc.add(f); Field id = newStringField("id", "", Field.Store.NO); doc.add(id); boolean onlyUniqueTerms = random().nextBoolean(); if (VERBOSE) { System.out.println("TEST: onlyUniqueTerms=" + onlyUniqueTerms + " numDocs=" + numDocs); } Set<BytesRef> uniqueTerms = new HashSet<BytesRef>(); for (int i = 0; i < numDocs; i++) { if (!onlyUniqueTerms && random().nextBoolean() && terms.size() > 0) { // re-use existing term BytesRef term = terms.get(random().nextInt(terms.size())); docs.get(term).add(i); f.setStringValue(term.utf8ToString()); } else { String s = _TestUtil.randomUnicodeString(random(), 10); BytesRef term = new BytesRef(s); if (!docs.containsKey(term)) { docs.put(term, new ArrayList<Integer>()); } docs.get(term).add(i); terms.add(term); uniqueTerms.add(term); f.setStringValue(s); } id.setStringValue("" + i); w.addDocument(doc); if (random().nextInt(4) == 1) { w.commit(); } if (i > 0 && random().nextInt(20) == 1) { int delID = random().nextInt(i); deleted.add(delID); w.deleteDocuments(new Term("id", "" + delID)); if (VERBOSE) { System.out.println("TEST: delete " + delID); } } } if (VERBOSE) { List<BytesRef> termsList = new ArrayList<BytesRef>(uniqueTerms); Collections.sort(termsList, BytesRef.getUTF8SortedAsUTF16Comparator()); System.out.println("TEST: terms in UTF16 order:"); for (BytesRef b : termsList) { System.out.println(" " + UnicodeUtil.toHexString(b.utf8ToString()) + " " + b); for (int docID : docs.get(b)) { if (deleted.contains(docID)) { System.out.println(" " + docID + " (deleted)"); } else { System.out.println(" " + docID); } } } } IndexReader reader = w.getReader(); w.close(); if (VERBOSE) { System.out.println("TEST: reader=" + reader); } Bits liveDocs = MultiFields.getLiveDocs(reader); for (int delDoc : deleted) { assertFalse(liveDocs.get(delDoc)); } for (int i = 0; i < 100; i++) { BytesRef term = terms.get(random().nextInt(terms.size())); if (VERBOSE) { System.out.println( "TEST: seek term=" + UnicodeUtil.toHexString(term.utf8ToString()) + " " + term); } DocsEnum docsEnum = _TestUtil.docs(random(), reader, "field", term, liveDocs, null, 0); assertNotNull(docsEnum); for (int docID : docs.get(term)) { if (!deleted.contains(docID)) { assertEquals(docID, docsEnum.nextDoc()); } } assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsEnum.nextDoc()); } reader.close(); dir.close(); } }
@Test public void testRollingUpdates() throws Exception { Random random = new Random(random().nextLong()); final BaseDirectoryWrapper dir = newDirectory(); // test checks for no unref'ed files with the IW helper method, which isn't aware of "tried to // delete files" if (dir instanceof MockDirectoryWrapper) { ((MockDirectoryWrapper) dir).setEnableVirusScanner(false); } final LineFileDocs docs = new LineFileDocs(random, true); // provider.register(new MemoryCodec()); if (random().nextBoolean()) { Codec.setDefault( TestUtil.alwaysPostingsFormat( new MemoryPostingsFormat(random().nextBoolean(), random.nextFloat()))); } MockAnalyzer analyzer = new MockAnalyzer(random()); analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH)); final IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(analyzer)); final int SIZE = atLeast(20); int id = 0; IndexReader r = null; IndexSearcher s = null; final int numUpdates = (int) (SIZE * (2 + (TEST_NIGHTLY ? 200 * random().nextDouble() : 5 * random().nextDouble()))); if (VERBOSE) { System.out.println("TEST: numUpdates=" + numUpdates); } int updateCount = 0; // TODO: sometimes update ids not in order... for (int docIter = 0; docIter < numUpdates; docIter++) { final Document doc = docs.nextDoc(); final String myID = Integer.toString(id); if (id == SIZE - 1) { id = 0; } else { id++; } if (VERBOSE) { System.out.println(" docIter=" + docIter + " id=" + id); } ((Field) doc.getField("docid")).setStringValue(myID); Term idTerm = new Term("docid", myID); final boolean doUpdate; if (s != null && updateCount < SIZE) { TopDocs hits = s.search(new TermQuery(idTerm), 1); assertEquals(1, hits.totalHits); doUpdate = !w.tryDeleteDocument(r, hits.scoreDocs[0].doc); if (VERBOSE) { if (doUpdate) { System.out.println(" tryDeleteDocument failed"); } else { System.out.println(" tryDeleteDocument succeeded"); } } } else { doUpdate = true; if (VERBOSE) { System.out.println(" no searcher: doUpdate=true"); } } updateCount++; if (doUpdate) { if (random().nextBoolean()) { w.updateDocument(idTerm, doc); } else { // It's OK to not be atomic for this test (no separate thread reopening readers): w.deleteDocuments(new TermQuery(idTerm)); w.addDocument(doc); } } else { w.addDocument(doc); } if (docIter >= SIZE && random().nextInt(50) == 17) { if (r != null) { r.close(); } final boolean applyDeletions = random().nextBoolean(); if (VERBOSE) { System.out.println("TEST: reopen applyDeletions=" + applyDeletions); } r = w.getReader(applyDeletions); if (applyDeletions) { s = newSearcher(r); } else { s = null; } assertTrue( "applyDeletions=" + applyDeletions + " r.numDocs()=" + r.numDocs() + " vs SIZE=" + SIZE, !applyDeletions || r.numDocs() == SIZE); updateCount = 0; } } if (r != null) { r.close(); } w.commit(); assertEquals(SIZE, w.numDocs()); w.close(); TestIndexWriter.assertNoUnreferencedFiles(dir, "leftover files after rolling updates"); docs.close(); // LUCENE-4455: SegmentInfos infos = SegmentInfos.readLatestCommit(dir); long totalBytes = 0; for (SegmentCommitInfo sipc : infos) { totalBytes += sipc.sizeInBytes(); } long totalBytes2 = 0; for (String fileName : dir.listAll()) { if (IndexFileNames.CODEC_FILE_PATTERN.matcher(fileName).matches()) { totalBytes2 += dir.fileLength(fileName); } } assertEquals(totalBytes2, totalBytes); dir.close(); }
public static void main(String[] args) throws IOException { IndexReader reader = null; /* * Opening the index first simplifies the processing of the * rest of the command line arguments. */ for (int i = 0; i < args.length; i++) { if (("-index".equals(args[i])) && ((i + 1) < args.length)) { reader = DirectoryReader.open(FSDirectory.open(new File(args[i + 1]))); if (reader == null) { System.err.println("Error: Can't open index " + args[i + 1]); System.exit(1); } ; break; } ; } ; if (reader == null) { System.err.println(usage); System.exit(1); } ; /* * Process the command line arguments sequentially. */ for (int i = 0; i < args.length; i++) { if ("-index".equals(args[i])) { /* * Handled in the previous loop, so just skip the argument. */ i++; } else if ("-list-edocid".equals(args[i])) { System.out.println("-list-edocid:"); if ((i + 1) >= args.length) { System.out.println(usage); break; } ; Document d = reader.document(Integer.parseInt(args[i + 1])); System.out.println( "Internal docid --> External docid: " + args[i + 1] + " --> " + d.get("externalId")); i += 1; } else if ("-list-docids".equals(args[i])) { System.out.println("-list-docids:"); for (int j = 0; j < reader.numDocs(); j++) { Document d = reader.document(j); System.out.println("Internal --> external docid: " + j + " --> " + d.get("externalId")); } ; } else if ("-list-fields".equals(args[i])) { Fields fields = MultiFields.getFields(reader); System.out.print("\nNumber of fields: "); if (fields == null) System.out.println("0"); else { System.out.println(fields.size()); Iterator<String> is = fields.iterator(); while (is.hasNext()) { System.out.println("\t" + is.next()); } ; } ; } else if ("-list-postings".equals(args[i])) { if ((i + 2) >= args.length) { System.out.println(usage); break; } ; listPostings(reader, args[i + 1], args[i + 2], Integer.MAX_VALUE); i += 2; } else if ("-list-postings-sample".equals(args[i])) { if ((i + 2) >= args.length) { System.out.println(usage); break; } ; listPostings(reader, args[i + 1], args[i + 2], 5); i += 2; } else if ("-list-stats".equals(args[i])) { System.out.println("Corpus statistics:"); System.out.println("\tnumdocs\t\t" + reader.numDocs()); System.out.println( "\turl:\t" + "\tnumdocs=" + reader.getDocCount("url") + "\tsumTotalTF=" + reader.getSumTotalTermFreq("url") + "\tavglen=" + reader.getSumTotalTermFreq("url") / (float) reader.getDocCount("url")); System.out.println( "\tkeywords:" + "\tnumdocs=" + reader.getDocCount("keywords") + "\tsumTotalTF=" + reader.getSumTotalTermFreq("keywords") + "\tavglen=" + reader.getSumTotalTermFreq("keywords") / (float) reader.getDocCount("keywords")); System.out.println( "\ttitle:\t" + "\tnumdocs=" + reader.getDocCount("title") + "\tsumTotalTF=" + reader.getSumTotalTermFreq("title") + "\tavglen=" + reader.getSumTotalTermFreq("title") / (float) reader.getDocCount("title")); System.out.println( "\tbody:\t" + "\tnumdocs=" + reader.getDocCount("body") + "\tsumTotalTF=" + reader.getSumTotalTermFreq("body") + "\tavglen=" + reader.getSumTotalTermFreq("body") / (float) reader.getDocCount("body")); System.out.println( "\tinlink:\t" + "\tnumdocs=" + reader.getDocCount("inlink") + "\tsumTotalTF=" + reader.getSumTotalTermFreq("inlink") + "\tavglen=" + reader.getSumTotalTermFreq("inlink") / (float) reader.getDocCount("inlink")); } else if ("-list-terms".equals(args[i])) { if ((i + 1) >= args.length) { System.out.println(usage); break; } ; listTermDictionary(reader, args[i + 1]); i += 1; } else if ("-list-termvector".equals(args[i])) { if ((i + 1) >= args.length) { System.out.println(usage); break; } ; listTermVectors(reader, args[i + 1]); i += 1; } else if ("-list-termvector-field".equals(args[i])) { if ((i + 2) >= args.length) { System.out.println(usage); break; } ; listTermVectorField(reader, args[i + 1], args[i + 2]); i += 2; } else System.err.println("\nWarning: Unknown argument " + args[i] + " ignored."); } ; /* * Close the index and exit gracefully. */ reader.close(); }
public static void main(String[] args) throws Exception { // the IndexReader object is the main handle that will give you // all the documents, terms and inverted index IndexReader r = IndexReader.open(FSDirectory.open(new File("index"))); // You can figure out the number of documents using the maxDoc() function System.out.println("The number of documents in this index is: " + r.maxDoc()); int i = 0; // You can find out all the terms that have been indexed using the terms() function TermEnum t = r.terms(); while (t.next()) { // Since there are so many terms, let us try printing only term #100000-#100010 if (i > 100000) System.out.println("[" + i + "] " + t.term().text()); if (++i > 100010) break; } // You can create your own query terms by calling the Term constructor, with the field // 'contents' // In the following example, the query term is 'brute' Term te = new Term("contents", "brute"); // You can also quickly find out the number of documents that have term t System.out.println("Number of documents with the word 'brute' is: " + r.docFreq(te)); // You can use the inverted index to find out all the documents that contain the term 'brute' // by using the termDocs function TermDocs td = r.termDocs(te); while (td.next()) { System.out.println( "Document number [" + td.doc() + "] contains the term 'brute' " + td.freq() + " time(s)."); } // You can find the URL of the a specific document number using the document() function // For example, the URL for document number 14191 is: Document d = r.document(14191); String url = d.getFieldable("path") .stringValue(); // the 'path' field of the Document object holds the URL System.out.println(url.replace("%%", "/")); // -------- Now let us use all of the functions above to make something useful -------- // The following bit of code is a worked out example of how to get a bunch of documents // in response to a query and show them (without ranking them according to TF/IDF) Scanner sc = new Scanner(System.in); String str = ""; System.out.print("query> "); while (!(str = sc.nextLine()).equals("quit")) { String[] terms = str.split("\\s+"); for (String word : terms) { Term term = new Term("contents", word); TermDocs tdocs = r.termDocs(term); while (tdocs.next()) { String d_url = r.document(tdocs.doc()).getFieldable("path").stringValue().replace("%%", "/"); System.out.println("[" + tdocs.doc() + "] " + d_url); } } System.out.print("query> "); } }
// private static int[] oldToNew(IndexReader reader, Searcher searcher) throws IOException { private static DocScore[] newToOld(IndexReader reader, Searcher searcher) throws IOException { int readerMax = reader.maxDoc(); DocScore[] newToOld = new DocScore[readerMax]; // use site, an indexed, un-tokenized field to get boost // byte[] boosts = reader.norms("site"); TODO MC /* TODO MC */ Document docMeta; Pattern includes = Pattern.compile("\\|"); String value = NutchConfiguration.create().get(INCLUDE_EXTENSIONS_KEY, ""); String includeExtensions[] = includes.split(value); Hashtable<String, Boolean> validExtensions = new Hashtable<String, Boolean>(); for (int i = 0; i < includeExtensions.length; i++) { validExtensions.put(includeExtensions[i], true); System.out.println("extension boosted " + includeExtensions[i]); } /* TODO MC */ for (int oldDoc = 0; oldDoc < readerMax; oldDoc++) { float score; if (reader.isDeleted(oldDoc)) { // score = 0.0f; score = -1f; // TODO MC } else { // score = Similarity.decodeNorm(boosts[oldDoc]); TODO MC /* TODO MC */ docMeta = searcher.doc(oldDoc); if (validExtensions.get(docMeta.get("subType")) == null) { // searched extensions will have higher scores score = -0.5f; } else { score = Integer.parseInt(docMeta.get("inlinks")); /* if (score==0) { score=0.001f; // TODO MC - to not erase } */ } /* TODO MC */ // System.out.println("Score for old document "+oldDoc+" is "+score+" and type // "+docMeta.get("subType")); // TODO MC debug remove } DocScore docScore = new DocScore(); docScore.doc = oldDoc; docScore.score = score; newToOld[oldDoc] = docScore; } System.out.println("Sorting " + newToOld.length + " documents."); Arrays.sort(newToOld); // HeapSorter.sort(newToOld); // TODO MC - due to the lack of space /* TODO MC int[] oldToNew = new int[readerMax]; for (int newDoc = 0; newDoc < readerMax; newDoc++) { DocScore docScore = newToOld[newDoc]; //oldToNew[docScore.oldDoc] = docScore.score > 0.0f ? newDoc : -1; // TODO MC oldToNew[docScore.oldDoc] = newDoc; // TODO MC } */ /* TODO MC * for (int newDoc = 0; newDoc < readerMax; newDoc++) { DocScore docScore = newToOld[newDoc]; System.out.println("Score for new document "+newDoc+" is "+docScore.score); // TODO MC debug remove } * TODO MC */ // return oldToNew; TODO MC return newToOld; // TODO MC }