/**
 * Walks every term in the "content" field of the given index and builds a map
 * from each document's "relative_path" field to the list of terms it contains,
 * each entry formatted as term text immediately followed by its in-document
 * frequency (e.g. "foo3").
 *
 * @param directory the Lucene directory holding the index to read
 * @return map of relative_path -> list of "term+freq" strings; empty (never
 *         null) if the index cannot be read
 */
private static Map<String, List<String>> generate_result(Directory directory) {
    Map<String, List<String>> result_map = new HashMap<String, List<String>>();
    IndexReader reader = null;
    try {
        reader = IndexReader.open(directory);
        TermEnum termEnum = reader.terms();
        while (termEnum.next()) {
            String termEnumString = termEnum.term().toString();
            // Only terms indexed under the "content" field are of interest.
            if (termEnumString.startsWith("content:")) {
                String term = termEnumString.substring(termEnumString.lastIndexOf(":") + 1);
                TermDocs termDocs = reader.termDocs(termEnum.term());
                while (termDocs.next()) {
                    Document doc = reader.document(termDocs.doc());
                    String relative_path = doc.get("relative_path");
                    // BUGFIX: the original put an empty list on first sight of a path
                    // and dropped that posting entirely; now the entry is always added.
                    List<String> entries = result_map.get(relative_path);
                    if (entries == null) {
                        entries = new ArrayList<String>();
                        result_map.put(relative_path, entries);
                    }
                    entries.add(term + termDocs.freq());
                }
            }
        }
        termEnum.close();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Always release the reader; the original leaked it.
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException ignored) {
                // best-effort close; nothing useful to do here
            }
        }
    }
    return result_map;
}
/**
 * Gets the global term frequency of a term, i.e. how many times it occurs in
 * the whole corpus (sum of per-document frequencies over all documents that
 * contain it).
 *
 * @param term term whose frequency you want
 * @return global term frequency of the term, or 1 if it could not be read
 *         (note: a term absent from the index yields 0, not 1)
 */
private int getGlobalTermFreq(Term term) {
    int tf = 0;
    TermDocs tDocs = null;
    try {
        tDocs = this.indexReader.termDocs(term);
        if (tDocs == null) {
            logger.info("Couldn't get term frequency for term " + term.text());
            return 1;
        }
        // Sum the frequency of the term across every document that contains it.
        while (tDocs.next()) {
            tf += tDocs.freq();
        }
    } catch (IOException e) {
        logger.info("Couldn't get term frequency for term " + term.text());
        return 1;
    } finally {
        // Release the postings enumerator; the original leaked it.
        if (tDocs != null) {
            try {
                tDocs.close();
            } catch (IOException ignored) {
                // best-effort close
            }
        }
    }
    return tf;
}
/**
 * Builds a term wrapper: consumes the given postings enumerator, recording the
 * frequency of the term in each document and the corpus-wide total, then
 * materializes a dense frequency vector indexed by document number.
 *
 * @param originTrem the term this object represents
 * @param termDocs postings enumerator for the term (fully consumed here)
 * @param maxDocNum size of the frequency vector (one slot per document)
 * @throws IOException if reading the postings fails
 */
public MyTerm(Term originTrem, TermDocs termDocs, int maxDocNum) throws IOException {
    super();
    this.originTrem = originTrem;
    this.termDocs = termDocs;
    this.totalFreq = 0;

    // Drain the postings: one (docId, frequency) pair per matching document.
    while (this.termDocs.next()) {
        int docId = termDocs.doc();
        int docFreq = termDocs.freq();
        this.termMap.put(docId, docFreq);
        this.totalFreq += docFreq;
    }

    // A freshly allocated int[] is already zero-filled by the JVM, so only the
    // documents that actually contain the term need to be written.
    this.vector = new int[maxDocNum];
    for (int docId : this.termMap.keySet()) {
        this.vector[docId] = (int) this.termMap.get(docId);
    }
}
/**
 * Reloads index statistics for the given collection: per-field term counts,
 * top-ranking terms of {@code topRankingField}, and general index metadata,
 * all published into {@code result}.
 *
 * @param collectionName name of the collection to inspect; null returns false
 * @param topRankingField field whose terms are ranked; if null, the
 *        collection's first default search field is used
 * @return true on success, false if the collection/field is unknown or the
 *         index cannot be read
 */
@Override
public boolean reload(String collectionName, String topRankingField) {
    if (collectionName == null) {
        return false;
    }

    CrescentCollectionHandler collectionHandler =
            SpringApplicationContext.getBean("crescentCollectionHandler", CrescentCollectionHandler.class);
    CrescentCollection collection =
            collectionHandler.getCrescentCollections().getCrescentCollection(collectionName);

    if (collection == null) {
        logger.debug("doesn't Collection Info => {}", collectionName);
        init(View.Overview);
        return false;
    }

    // Fall back to the collection's first default search field.
    if (topRankingField == null) {
        if (collection.getDefaultSearchFields().get(0) != null) {
            topRankingField = collection.getDefaultSearchFields().get(0).getName();
        } else {
            logger.debug("doesn't defaultSearchField => {}", collectionName);
            init(View.Overview);
            return false;
        }
    }

    List<String> fieldName = new ArrayList<String>();
    for (CrescentCollectionField field : collection.getFields()) {
        fieldName.add(field.getName());
    }

    TopRankingQueue topRankingQueue =
            new TopRankingQueue(DEFAULT_TOPRANKING_TERM, new RankingTermComparator());

    IndexReader reader = null;
    try {
        Directory directory = FSDirectory.open(new File(collection.getIndexingDirectory()));
        reader = IndexReader.open(directory);

        TermEnum terms = reader.terms();
        int termFreq = 0;
        int termCount = 0;
        Term beforeTerm = null;

        // init term count
        fieldTermCount.clear();
        for (CrescentCollectionField field : collection.getFields()) {
            fieldTermCount.put(field.getName(), 0);
        }
        topRankingQueue.clear();

        while (terms.next()) {
            Term currTerm = terms.term();
            if (beforeTerm == null) {
                beforeTerm = currTerm;
            }

            // BUGFIX: compare field names with equals(), not ==. The original
            // relied on Lucene interning field-name strings to make == work.
            if (beforeTerm.field().equals(currTerm.field())) {
                termCount++;
            } else {
                fieldTermCount.put(beforeTerm.field(), termCount);
                termCount = 1;
                beforeTerm = currTerm;
            }

            // Only rank terms of the requested field; the original iterated the
            // postings of every term and tested the field inside the loop.
            if (currTerm.field().equals(topRankingField)) {
                TermDocs termDocs = reader.termDocs(currTerm);
                try {
                    while (termDocs.next()) {
                        RankingTerm e =
                                new RankingTerm(currTerm.text(), currTerm.field(), termDocs.freq());
                        topRankingQueue.add(e);
                    }
                } finally {
                    termDocs.close(); // the original leaked every TermDocs
                }
            }

            termFreq++;
        }
        if (beforeTerm != null) {
            fieldTermCount.put(beforeTerm.field(), termCount);
        }
        terms.close();

        result.put("numOfTerm", termFreq);
        result.put("numOfDoc", reader.numDocs());
        result.put("hasDel", reader.hasDeletions());
        result.put("isOptimize", reader.isOptimized());
        result.put("indexVersion", reader.getVersion());
        result.put("lastModify", new Date(IndexReader.lastModified(directory)));
    } catch (IOException e) {
        e.printStackTrace();
        return false;
    } finally {
        // Release the reader; the original never closed it.
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException ignored) {
                // best-effort close
            }
        }
    }

    if (topRankingQueue.size() != 0) {
        topRankingTerms = topRankingQueue.toArray();
        Arrays.sort(topRankingTerms);
    }

    result.put("collectionName", collectionName);
    result.put("indexName", collection.getIndexingDirectory());
    result.put("numOfField", collection.getFields().size());
    result.put("termCount", fieldTermCount);
    result.put("topRanking", topRankingTerms);
    result.put("fieldName", fieldName);

    return true;
}
/**
 * Demonstration driver for the Lucene 3.x inverted-index API: dumps a slice of
 * the term dictionary, shows document frequency and postings for one term,
 * resolves a document's URL, and then runs an interactive boolean-OR query
 * loop (no ranking) until the user types "quit".
 *
 * @param args unused
 * @throws Exception on any index-access failure (demo code; not handled)
 */
public static void main(String[] args) throws Exception {
    // the IndexReader object is the main handle that will give you
    // all the documents, terms and inverted index
    IndexReader r = IndexReader.open(FSDirectory.open(new File("index")));
    try {
        // You can figure out the number of documents using the maxDoc() function
        System.out.println("The number of documents in this index is: " + r.maxDoc());

        int i = 0;
        // You can find out all the terms that have been indexed using the terms() function
        TermEnum t = r.terms();
        while (t.next()) {
            // Print only terms #100000-#100010.
            // BUGFIX: the original used (i > 100000), which skipped term #100000
            // and printed #100001-#100010, contradicting its own comment.
            if (i >= 100000) System.out.println("[" + i + "] " + t.term().text());
            if (++i > 100010) break;
        }

        // You can create your own query terms by calling the Term constructor,
        // with the field 'contents'. In the following example, the query term is 'brute'
        Term te = new Term("contents", "brute");

        // You can also quickly find out the number of documents that have term t
        System.out.println("Number of documents with the word 'brute' is: " + r.docFreq(te));

        // You can use the inverted index to find out all the documents that
        // contain the term 'brute' by using the termDocs function
        TermDocs td = r.termDocs(te);
        while (td.next()) {
            System.out.println(
                    "Document number [" + td.doc() + "] contains the term 'brute' "
                            + td.freq() + " time(s).");
        }

        // You can find the URL of a specific document number using the document() function
        // For example, the URL for document number 14191 is:
        Document d = r.document(14191);
        // the 'path' field of the Document object holds the URL
        String url = d.getFieldable("path").stringValue();
        System.out.println(url.replace("%%", "/"));

        // -------- Now let us use all of the functions above to make something useful --------
        // The following bit of code is a worked out example of how to get a bunch of
        // documents in response to a query and show them (without ranking them
        // according to TF/IDF)
        Scanner sc = new Scanner(System.in);
        String str = "";
        System.out.print("query> ");
        while (!(str = sc.nextLine()).equals("quit")) {
            String[] terms = str.split("\\s+");
            for (String word : terms) {
                Term term = new Term("contents", word);
                TermDocs tdocs = r.termDocs(term);
                while (tdocs.next()) {
                    String d_url =
                            r.document(tdocs.doc()).getFieldable("path").stringValue().replace("%%", "/");
                    System.out.println("[" + tdocs.doc() + "] " + d_url);
                }
            }
            System.out.print("query> ");
        }
        sc.close();
    } finally {
        // Release the index reader; the original never closed it.
        r.close();
    }
}