/**
 * Command-line entry point for inspecting a Lucene index.
 *
 * <p>The index location is taken from the first {@code -index <path>} pair found anywhere on the
 * command line. All other arguments are then processed left to right; each recognised option
 * prints its report to standard output:
 *
 * <ul>
 *   <li>{@code -list-edocid <internal-docid>} - the {@code externalId} field of one document
 *   <li>{@code -list-docids} - the {@code externalId} field of documents 0..numDocs()-1
 *   <li>{@code -list-fields} - the number and names of the indexed fields
 *   <li>{@code -list-postings <arg1> <arg2>} - delegates to {@code listPostings} with no limit
 *   <li>{@code -list-postings-sample <arg1> <arg2>} - delegates to {@code listPostings}, limit 5
 *   <li>{@code -list-stats} - document counts and term-frequency totals for a fixed set of fields
 *   <li>{@code -list-terms <arg>} - delegates to {@code listTermDictionary}
 *   <li>{@code -list-termvector <arg>} - delegates to {@code listTermVectors}
 *   <li>{@code -list-termvector-field <arg1> <arg2>} - delegates to {@code listTermVectorField}
 * </ul>
 *
 * <p>If no index is given the usage text is written to standard error and the JVM exits with
 * status 1. If an option is missing its operands the usage text is written to standard output
 * and processing of the remaining arguments stops. Unknown arguments produce a warning on
 * standard error and are skipped.
 *
 * @param args the command-line arguments described above
 * @throws IOException if the index cannot be opened or read
 */
public static void main(String[] args) throws IOException {

  IndexReader reader = null;

  /*
   * Opening the index first simplifies the processing of the
   * rest of the command line arguments.
   *
   * DirectoryReader.open reports failure by throwing IOException, not by
   * returning null, so no null check is needed directly after the call.
   */
  for (int i = 0; i < args.length; i++) {
    if ("-index".equals(args[i]) && (i + 1) < args.length) {
      reader = DirectoryReader.open(FSDirectory.open(new File(args[i + 1])));
      break;
    }
  }

  if (reader == null) {
    System.err.println(usage);
    System.exit(1);
  }

  /*
   * Process the command line arguments sequentially. The try/finally
   * guarantees that the index is closed even when one of the commands
   * throws (for example on a malformed docid or an I/O error).
   */
  try {
    for (int i = 0; i < args.length; i++) {

      if ("-index".equals(args[i])) {

        /*
         * Handled in the previous loop, so just skip the argument.
         */
        i++;

      } else if ("-list-edocid".equals(args[i])) {

        System.out.println("-list-edocid:");

        if ((i + 1) >= args.length) {
          System.out.println(usage);
          break;
        }

        Document d = reader.document(Integer.parseInt(args[i + 1]));

        System.out.println(
            "Internal docid --> External docid: "
                + args[i + 1]
                + " --> "
                + d.get("externalId"));

        i += 1;

      } else if ("-list-docids".equals(args[i])) {

        System.out.println("-list-docids:");

        // NOTE(review): the upper bound is numDocs(); if the index contains
        // deleted documents this may not cover every internal docid - confirm.
        for (int j = 0; j < reader.numDocs(); j++) {
          Document d = reader.document(j);

          System.out.println(
              "Internal --> external docid: " + j + " --> " + d.get("externalId"));
        }

      } else if ("-list-fields".equals(args[i])) {

        Fields fields = MultiFields.getFields(reader);

        System.out.print("\nNumber of fields: ");

        if (fields == null) {
          System.out.println("0");
        } else {
          System.out.println(fields.size());

          for (String fieldName : fields) {
            System.out.println("\t" + fieldName);
          }
        }

      } else if ("-list-postings".equals(args[i])) {

        if ((i + 2) >= args.length) {
          System.out.println(usage);
          break;
        }

        listPostings(reader, args[i + 1], args[i + 2], Integer.MAX_VALUE);
        i += 2;

      } else if ("-list-postings-sample".equals(args[i])) {

        if ((i + 2) >= args.length) {
          System.out.println(usage);
          break;
        }

        listPostings(reader, args[i + 1], args[i + 2], 5);
        i += 2;

      } else if ("-list-stats".equals(args[i])) {

        System.out.println("Corpus statistics:");
        System.out.println("\tnumdocs\t\t" + reader.numDocs());

        /*
         * The labels are kept separate from the field names because the
         * padding differs per field ("keywords:" has no trailing tab).
         */
        String[] statFields = {"url", "keywords", "title", "body", "inlink"};
        String[] statLabels = {
          "\turl:\t", "\tkeywords:", "\ttitle:\t", "\tbody:\t", "\tinlink:\t"
        };

        for (int f = 0; f < statFields.length; f++) {
          int docCount = reader.getDocCount(statFields[f]);
          long sumTotalTf = reader.getSumTotalTermFreq(statFields[f]);

          System.out.println(
              statLabels[f]
                  + "\tnumdocs="
                  + docCount
                  + "\tsumTotalTF="
                  + sumTotalTf
                  + "\tavglen="
                  + sumTotalTf / (float) docCount);
        }

      } else if ("-list-terms".equals(args[i])) {

        if ((i + 1) >= args.length) {
          System.out.println(usage);
          break;
        }

        listTermDictionary(reader, args[i + 1]);
        i += 1;

      } else if ("-list-termvector".equals(args[i])) {

        if ((i + 1) >= args.length) {
          System.out.println(usage);
          break;
        }

        listTermVectors(reader, args[i + 1]);
        i += 1;

      } else if ("-list-termvector-field".equals(args[i])) {

        if ((i + 2) >= args.length) {
          System.out.println(usage);
          break;
        }

        listTermVectorField(reader, args[i + 1], args[i + 2]);
        i += 2;

      } else {
        System.err.println("\nWarning: Unknown argument " + args[i] + " ignored.");
      }
    }
  } finally {
    /*
     * Close the index and exit gracefully.
     */
    reader.close();
  }
}
/**
 * Walks through the basic read-only operations of a Lucene index stored in the directory
 * {@code index}, then runs a simple interactive search loop.
 *
 * <p>The demo prints {@code maxDoc()}, a small window of the term dictionary, the document
 * frequency and postings of the term {@code brute} in the {@code contents} field, and the
 * {@code path} field of document 14191. It then reads queries from standard input, one per
 * line, and for every whitespace-separated word prints each matching document number with its
 * {@code path} (unranked). The loop ends when the line {@code quit} is entered or when standard
 * input reaches end-of-file.
 *
 * @param args unused
 * @throws Exception if the index cannot be opened or read
 */
public static void main(String[] args) throws Exception {
  // the IndexReader object is the main handle that will give you
  // all the documents, terms and inverted index
  IndexReader r = IndexReader.open(FSDirectory.open(new File("index")));

  try {
    // You can figure out the number of documents using the maxDoc() function
    System.out.println("The number of documents in this index is: " + r.maxDoc());

    int i = 0;
    // You can find out all the terms that have been indexed using the terms() function
    TermEnum t = r.terms();
    try {
      while (t.next()) {
        // Since there are so many terms, let us try printing only the terms
        // whose running index is 100001 through 100010
        if (i > 100000) {
          System.out.println("[" + i + "] " + t.term().text());
        }
        if (++i > 100010) {
          break;
        }
      }
    } finally {
      t.close();
    }

    // You can create your own query terms by calling the Term constructor, with the field
    // 'contents'
    // In the following example, the query term is 'brute'
    Term te = new Term("contents", "brute");

    // You can also quickly find out the number of documents that have term t
    System.out.println("Number of documents with the word 'brute' is: " + r.docFreq(te));

    // You can use the inverted index to find out all the documents that contain the term
    // 'brute' by using the termDocs function
    TermDocs td = r.termDocs(te);
    try {
      while (td.next()) {
        System.out.println(
            "Document number ["
                + td.doc()
                + "] contains the term 'brute' "
                + td.freq()
                + " time(s).");
      }
    } finally {
      td.close();
    }

    // You can find the URL of a specific document number using the document() function
    // For example, the URL for document number 14191 is:
    Document d = r.document(14191);
    // the 'path' field of the Document object holds the URL
    String url = d.getFieldable("path").stringValue();
    System.out.println(url.replace("%%", "/"));

    // -------- Now let us use all of the functions above to make something useful --------
    // The following bit of code is a worked out example of how to get a bunch of documents
    // in response to a query and show them (without ranking them according to TF/IDF)
    Scanner sc = new Scanner(System.in);
    System.out.print("query> ");

    // hasNextLine() makes end-of-input (Ctrl-D, or a piped file without "quit")
    // end the loop cleanly instead of throwing NoSuchElementException.
    while (sc.hasNextLine()) {
      String str = sc.nextLine();
      if (str.equals("quit")) {
        break;
      }

      String[] terms = str.split("\\s+");
      for (String word : terms) {
        Term term = new Term("contents", word);
        TermDocs tdocs = r.termDocs(term);
        try {
          while (tdocs.next()) {
            String d_url =
                r.document(tdocs.doc()).getFieldable("path").stringValue().replace("%%", "/");
            System.out.println("[" + tdocs.doc() + "] " + d_url);
          }
        } finally {
          tdocs.close();
        }
      }
      System.out.print("query> ");
    }
  } finally {
    // Release the index files even if one of the steps above fails.
    r.close();
  }
}