/** Delete documents from the index. */
public void delete() {
  IndexWriter writer = null;
  try {
    writer = new IndexWriter(
        directory,
        new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
    // The argument can be either a Query or a Term (an exact-match value).
    // Deleted documents are not physically removed; they are kept in a
    // "recycle bin" and can still be recovered.
    // Delete the document whose id is 1.
    writer.deleteDocuments(new Term("id", "1"));
  } catch (CorruptIndexException e) {
    e.printStackTrace();
  } catch (LockObtainFailedException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  } finally {
    if (writer != null) {
      try {
        writer.close();
      } catch (CorruptIndexException e) {
        e.printStackTrace();
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
  }
}
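// A hedged follow-up sketch (the helper name is hypothetical; it assumes the
// same `directory` field as above). deleteDocuments() only marks documents,
// so a reader still reports them via numDeletedDocs() until a merge expunges
// them.
public void showRecycleBinState() throws IOException {
  IndexReader reader = IndexReader.open(directory);
  System.out.println("live docs:    " + reader.numDocs());
  System.out.println("deleted docs: " + reader.numDeletedDocs()); // recoverable
  reader.close();
}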
/** Force-merge the index by hand. */
public void merge() {
  IndexWriter writer = null;
  try {
    writer = new IndexWriter(
        directory,
        new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
    // Forces the index to be merged down to two segments; documents marked
    // as deleted are expunged in the process. Discouraged since Lucene 3.5
    // because it is very expensive; Lucene handles merging on its own.
    writer.forceMerge(2);
  } catch (CorruptIndexException e) {
    e.printStackTrace();
  } catch (LockObtainFailedException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  } finally {
    if (writer != null) {
      try {
        writer.close();
      } catch (CorruptIndexException e) {
        e.printStackTrace();
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
  }
}
public void index(Author author) {
  Document doc = new Document();
  doc.add(new Field("author.name", author.getName(), Store.YES, Index.ANALYZED));
  for (Book book : author.getBooks()) {
    doc.add(new Field("book.name", book.getTitle(), Store.YES, Index.ANALYZED));
  }
  IndexWriter writer = getWriter();
  try {
    writer.addDocument(doc);
  } catch (CorruptIndexException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  } finally {
    try {
      if (writer != null) {
        writer.close();
      }
    } catch (CorruptIndexException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
}
private static Directory index(Analyzer analyzer, String processingPath) {
  RAMDirectory directory = null;
  IndexWriter indexWriter = null;
  try {
    directory = new RAMDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_35, analyzer);
    indexWriter = new IndexWriter(directory, iwc);
    File file = new File(processingPath);
    index_h("", file, indexWriter);
  } catch (IOException e) {
    e.printStackTrace();
  } finally {
    if (indexWriter != null) {
      try {
        indexWriter.close();
      } catch (CorruptIndexException e) {
        e.printStackTrace();
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
  }
  return directory;
}
public String searchIndexDocTextReturn(String searchString, String termString) {
  System.out.println("Searching for '" + searchString + "'");
  IndexReader indexReader;
  try {
    indexReader = IndexReader.open(indexDirectory);
    IndexSearcher indexSearcher = new IndexSearcher(indexReader);
    Term term = new Term(termString, searchString);
    TermQuery query = new TermQuery(term);
    TopDocs topDocs = indexSearcher.search(query, 10);
    if (topDocs.scoreDocs.length > 0) {
      int docID = topDocs.scoreDocs[0].doc;
      Document doc = indexSearcher.doc(docID);
      textOfURL = doc.get("text");
      this.docId = docID;
    }
    indexSearcher.close();
    indexReader.close();
  } catch (CorruptIndexException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  }
  return textOfURL;
}
public void update() {
  IndexWriter writer = null;
  try {
    writer = new IndexWriter(
        directory,
        new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
    /*
     * Lucene has no in-place update; updateDocument() is really two
     * operations rolled into one: delete the old document, then add the
     * new one.
     */
    Document doc = new Document();
    doc.add(new Field("id", "11", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
    doc.add(new Field("email", emails[0], Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(new Field("content", contents[0], Field.Store.NO, Field.Index.ANALYZED));
    doc.add(new Field("name", names[0], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
    writer.updateDocument(new Term("id", "1"), doc);
  } catch (CorruptIndexException e) {
    e.printStackTrace();
  } catch (LockObtainFailedException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  } finally {
    try {
      if (writer != null) {
        writer.close();
      }
    } catch (CorruptIndexException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
}
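// Hypothetical helper (a sketch, not part of the original class) verifying
// updateDocument()'s delete-then-add semantics against the same `directory`:
// the old id no longer matches, while the new one does.
public void verifyUpdate() throws IOException {
  IndexReader reader = IndexReader.open(directory);
  IndexSearcher searcher = new IndexSearcher(reader);
  System.out.println("id=1 hits:  "
      + searcher.search(new TermQuery(new Term("id", "1")), 1).totalHits);  // 0
  System.out.println("id=11 hits: "
      + searcher.search(new TermQuery(new Term("id", "11")), 1).totalHits); // 1
  searcher.close();
  reader.close();
}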
public void search01() {
  try {
    IndexReader reader = IndexReader.open(directory);
    IndexSearcher searcher = new IndexSearcher(reader);
    TermQuery query = new TermQuery(new Term("email", "*****@*****.**"));
    TopDocs tds = searcher.search(query, 10);
    for (ScoreDoc sd : tds.scoreDocs) {
      Document doc = searcher.doc(sd.doc);
      System.out.println(
          "(" + sd.doc + "-" + doc.getBoost() + "-" + sd.score + ")"
              + doc.get("name") + "[" + doc.get("email") + "]-->"
              + doc.get("id") + "," + doc.get("attach") + ","
              + doc.get("date") + "," + doc.getValues("email")[1]);
    }
    reader.close();
  } catch (CorruptIndexException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  }
}
public void merge() {
  IndexWriter writer = null;
  try {
    writer = new IndexWriter(
        directory,
        new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
    // Merges the index down to two segments; documents marked as deleted
    // are expunged in the process.
    // Note: discouraged since Lucene 3.5 because of the heavy cost.
    // Lucene decides for itself when merging is worthwhile.
    writer.forceMerge(2);
  } catch (CorruptIndexException e) {
    e.printStackTrace();
  } catch (LockObtainFailedException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  } finally {
    try {
      if (writer != null) {
        writer.close();
      }
    } catch (CorruptIndexException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
}
public void search02() {
  try {
    IndexSearcher searcher = getSearcher();
    TermQuery query = new TermQuery(new Term("content", "like"));
    TopDocs tds = searcher.search(query, 10);
    for (ScoreDoc sd : tds.scoreDocs) {
      Document doc = searcher.doc(sd.doc);
      System.out.println(
          doc.get("id") + "---->" + doc.get("name") + "[" + doc.get("email") + "]-->"
              + doc.get("id") + "," + doc.get("attach") + "," + doc.get("date") + ","
              + doc.getValues("email")[1]);
    }
    searcher.close();
  } catch (CorruptIndexException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  }
}
public void forceDelete() {
  IndexWriter writer = null;
  try {
    writer = new IndexWriter(
        directory,
        new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
    // Physically expunges documents that were marked as deleted.
    writer.forceMergeDeletes();
  } catch (CorruptIndexException e) {
    e.printStackTrace();
  } catch (LockObtainFailedException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  } finally {
    try {
      if (writer != null) {
        writer.close();
      }
    } catch (CorruptIndexException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
}
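// Hypothetical helper (a sketch assuming the same `directory`): after
// forceMergeDeletes() the "recycle bin" is empty, so the count below should
// print 0 and the deletions can no longer be undone.
public void verifyExpunged() throws IOException {
  IndexReader reader = IndexReader.open(directory);
  System.out.println("deleted docs: " + reader.numDeletedDocs()); // 0 after expunge
  reader.close();
}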
@Override
public void index(List<AgeObject> aol) {
  try {
    IndexWriter iWriter = new IndexWriter(
        index, analyzer, objectList == null, IndexWriter.MaxFieldLength.UNLIMITED);
    if (objectList == null) {
      objectList = aol;
    } else {
      objectList.addAll(aol);
    }
    for (AgeObject ao : aol) {
      Document doc = new Document();
      for (TextFieldExtractor tfe : extractors) {
        doc.add(new Field(
            tfe.getName(),
            tfe.getExtractor().getValue(ao),
            Field.Store.NO,
            Field.Index.ANALYZED));
      }
      iWriter.addDocument(doc);
    }
    iWriter.close();
    defaultFieldName = extractors.iterator().next().getName();
  } catch (CorruptIndexException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  }
}
public TermFreqVector searchIndexReturnFreqTerms(String searchString, String termString) {
  System.out.println("Searching for '" + searchString + "'");
  IndexReader indexReader;
  TermFreqVector termFreqDoc = null;
  try {
    indexReader = IndexReader.open(indexDirectory);
    IndexSearcher indexSearcher = new IndexSearcher(indexReader);
    Term term = new Term(termString, searchString);
    TermQuery query = new TermQuery(term);
    TopDocs topDocs = indexSearcher.search(query, 10);
    if (topDocs.scoreDocs.length > 0) {
      int docId = topDocs.scoreDocs[0].doc;
      termFreqDoc = indexReader.getTermFreqVector(docId, "text");
    }
  } catch (CorruptIndexException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  }
  return termFreqDoc;
}
public void queryIndex() {
  try {
    Query q = new MultiFieldQueryParser(new String[] {"title", "name"}, analyzer).parse("s*");
    // Search ...
    int hitsPerPage = 10;
    IndexSearcher searcher = new IndexSearcher(indexDirectory);
    TopDocCollector collector = new TopDocCollector(hitsPerPage);
    searcher.search(q, collector);
    ScoreDoc[] hits = collector.topDocs().scoreDocs;
    // Output the results.
    System.out.println("Found " + hits.length + " hits.");
    for (int i = 0; i < hits.length; ++i) {
      int docId = hits[i].doc;
      Document d = searcher.doc(docId);
      System.out.println((i + 1) + ". " + d.get("name") + ": " + d.get("title"));
    }
  } catch (ParseException e) {
    e.printStackTrace();
  } catch (CorruptIndexException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  }
}
protected void indexList(List<AgeObject> aol, boolean append) {
  try {
    if (searcher != null) {
      searcher.getIndexReader().close();
      searcher.close();
    }
    IndexWriterConfig idxCfg = new IndexWriterConfig(Version.LUCENE_36, analyzer);
    idxCfg.setRAMBufferSizeMB(50);
    idxCfg.setOpenMode(append ? OpenMode.APPEND : OpenMode.CREATE);
    IndexWriter iWriter = new IndexWriter(index, idxCfg);
    for (Document d : new DocCollection(aol, extractors)) {
      iWriter.addDocument(d);
    }
    iWriter.close();
    searcher = new IndexSearcher(IndexReader.open(index));
  } catch (CorruptIndexException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  }
}
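// Hypothetical driver (a sketch): OpenMode.CREATE replaces whatever is in
// `index`, while OpenMode.APPEND extends it, so the first batch must not
// append. `firstBatch` and `nextBatch` are illustrative names.
public void rebuildThenExtend(List<AgeObject> firstBatch, List<AgeObject> nextBatch) {
  indexList(firstBatch, false); // OpenMode.CREATE: start a fresh index
  indexList(nextBatch, true);   // OpenMode.APPEND: add to the existing one
}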
public String getSpecificFreqTermInIndex(
    int KIntopK,
    ArrayList<String> sentQueries,
    int specificFrec,
    boolean allranges,
    boolean versionOld) {
  IndexReader indexReader = null;
  try {
    indexReader = IndexReader.open(indexDirectory);
  } catch (CorruptIndexException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  }
  String mostFreqTerm = "";
  if (indexReader == null) {
    return mostFreqTerm; // the index could not be opened
  }
  try {
    mostFreqTerm = freqTermsFinderInIndex.SpecificFreqTerms(
        indexDirectory,
        analyzer,
        indexReader,
        KIntopK,
        sentQueries,
        specificFrec,
        allranges,
        versionOld);
    indexReader.close();
  } catch (Exception e) {
    e.printStackTrace();
  }
  return mostFreqTerm;
}
public void delete() {
  IndexWriter writer = null;
  try {
    writer = new IndexWriter(
        directory,
        new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
    // The argument can be either a Query or a Term (an exact-match value).
    // Deleted documents are not physically removed; they are kept in a
    // "recycle bin" and can still be recovered.
    writer.deleteAll(); // delete everything
    // writer.deleteDocuments(new Term("id", "1"));
    writer.commit(); // make the deletion effective
  } catch (CorruptIndexException e) {
    e.printStackTrace();
  } catch (LockObtainFailedException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  } finally {
    try {
      if (writer != null) {
        writer.close();
      }
    } catch (CorruptIndexException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
}
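// A sketch of the rollback alternative (hypothetical helper, assuming the
// same `directory`): deletions only become permanent on commit(), and
// rollback() discards all pending changes and closes the writer.
public void deleteAllButRollBack() throws IOException {
  IndexWriter writer = new IndexWriter(
      directory,
      new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
  writer.deleteAll();
  writer.rollback(); // nothing was committed, so the index is unchanged
}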
public void buildIndex(JSONObject indexData) {
  try {
    Directory dir = FSDirectory.open(new File(indexDir));
    IKAnalyzer analyzer = new IKAnalyzer();
    analyzer.setUseSmart(true);
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_35, analyzer);
    indexWriter = new IndexWriter(dir, iwc);
    indexWriter.deleteAll();
    JSONArray statusData = indexData.getJSONArray("statusData");
    for (int i = 0; i < statusData.length(); i++) {
      String text = statusData.getString(i);
      Document doc = new Document();
      doc.add(new Field(
          "text", text, Field.Store.YES, Field.Index.ANALYZED,
          Field.TermVector.WITH_POSITIONS_OFFSETS));
      indexWriter.addDocument(doc);
    }
    JSONArray userData = indexData.getJSONArray("userData");
    for (int i = 0; i < userData.length(); i++) {
      String text = userData.getString(i);
      Document doc = new Document();
      doc.add(new Field(
          "text", text, Field.Store.YES, Field.Index.ANALYZED,
          Field.TermVector.WITH_POSITIONS_OFFSETS));
      indexWriter.addDocument(doc);
    }
    System.out.println("Index is done");
  } catch (IOException e) {
    e.printStackTrace();
  } catch (JSONException e) {
    e.printStackTrace();
  } finally {
    // Guard against the writer never having been created, e.g. when
    // FSDirectory.open() throws before the assignment.
    if (indexWriter != null) {
      try {
        indexWriter.close();
      } catch (CorruptIndexException e) {
        e.printStackTrace();
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
  }
}
/**
 * Vectorize a piece of text.
 *
 * @param analyzer the analyzer to tokenize with
 * @param field the Lucene field name
 * @param content the text to vectorize
 */
public Map<Long, Integer> vectorize(Analyzer analyzer, String field, String content) {
  Map<Long, Integer> map = new TreeMap<Long, Integer>();
  DocWordHashMap wordHash = DocWordHashMap.getInstance();
  TokenStream ts = null;
  try {
    ts = analyzer.tokenStream(field, content);
    // Reset the TokenStream, then iterate over the tokens.
    ts.reset();
    while (ts.incrementToken()) {
      String word = ts.addAttribute(CharTermAttribute.class).toString();
      // MurmurHash each token.
      long hash = MurmurHash.hash64(word);
      if (!wordHash.isContainKey(hash)) {
        wordHash.setWordStringHash(hash, word);
      }
      if (!map.containsKey(hash)) {
        map.put(hash, 1);
      } else {
        map.put(hash, map.get(hash) + 1);
      }
    }
    // Perform end-of-stream operations, e.g. set the final offset.
    ts.end();
  } catch (CorruptIndexException e) {
    e.printStackTrace();
    map.clear();
  } catch (LockObtainFailedException e) {
    e.printStackTrace();
    map.clear();
  } catch (IOException e) {
    e.printStackTrace();
    map.clear();
  } finally {
    // Release all of the TokenStream's resources.
    if (ts != null) {
      try {
        ts.close();
      } catch (IOException e) {
        e.printStackTrace();
        map.clear();
      }
    }
  }
  return map;
}
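// Hypothetical usage of vectorize() (a sketch; "content" is just an
// illustrative field name): count term frequencies for a short sentence.
public void vectorizeExample() {
  Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_35);
  Map<Long, Integer> tf =
      vectorize(analyzer, "content", "lucene is fast and lucene is simple");
  // Each key is the MurmurHash64 of a token, each value its frequency;
  // e.g. the hash of "lucene" maps to 2 here (stopwords are dropped).
  System.out.println(tf);
}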
public void delete02() {
  try {
    reader.deleteDocuments(new Term("id", "1"));
  } catch (CorruptIndexException e) {
    e.printStackTrace();
  } catch (LockObtainFailedException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  }
}
private IndexSearcher getSearcher() {
  IndexSearcher searcher = null;
  try {
    searcher = new IndexSearcher(ramDir, true);
  } catch (CorruptIndexException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  }
  return searcher;
}
protected void add(List<Document> docs) {
  try {
    this.writer.addDocuments(docs);
    this.writer.commit();
    jlog.info("" + docs.size() + " docs added");
  } catch (CorruptIndexException cie) {
    jlog.severe("Corrupt index!");
    cie.printStackTrace();
  } catch (IOException ioe) {
    jlog.severe("Couldn't write docs:\n\t" + ioe);
    ioe.printStackTrace();
  }
}
public void query() {
  try {
    IndexReader reader = IndexReader.open(directory);
    // The reader gives cheap access to the document counts.
    System.out.println("numDocs:" + reader.numDocs());
    System.out.println("maxDocs:" + reader.maxDoc());
    System.out.println("deleteDocs:" + reader.numDeletedDocs());
    reader.close();
  } catch (CorruptIndexException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  }
}
public void printAnalyzerWords(Analyzer analyzer, String field) {
  // Get Lucene's TokenStream for the content.
  TokenStream ts = null;
  try {
    ts = analyzer.tokenStream(field, this.content);
    // Token offset attribute.
    OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
    // Token text attribute.
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    // Token type attribute.
    TypeAttribute type = ts.addAttribute(TypeAttribute.class);
    // Reset the TokenStream (resets the underlying StringReader).
    ts.reset();
    // Iterate over the tokens.
    while (ts.incrementToken()) {
      System.out.println("documents[" + this.id + "]");
      System.out.println(
          offset.startOffset() + " - " + offset.endOffset() + " : "
              + term.toString() + " | " + type.type());
    }
    // Perform end-of-stream operations, e.g. set the final offset.
    ts.end();
  } catch (CorruptIndexException e) {
    e.printStackTrace();
  } catch (LockObtainFailedException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  } finally {
    // Release all of the TokenStream's resources (closes the StringReader).
    if (ts != null) {
      try {
        ts.close();
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
  }
}
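// Hypothetical usage (a sketch; `this.content` and `this.id` come from the
// surrounding class): dump every token StandardAnalyzer produces.
public void printAnalyzerWordsExample() {
  printAnalyzerWords(new StandardAnalyzer(Version.LUCENE_35), "content");
  // Sample output line: 0 - 6 : lucene | <ALPHANUM>
}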
private IndexWriter getWriter() {
  log.debug("Creating writer.");
  Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_29);
  IndexWriter writer = null;
  try {
    writer = new IndexWriter(ramDir, analyzer, IndexWriter.MaxFieldLength.UNLIMITED);
  } catch (CorruptIndexException e) {
    e.printStackTrace();
  } catch (LockObtainFailedException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  }
  return writer;
}
/** Build the index. */
public void index() {
  IndexWriter writer = null;
  try {
    writer = new IndexWriter(
        directory,
        new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
    writer.deleteAll();
    Document doc = null;
    for (int i = 0; i < ids.length; i++) {
      doc = new Document();
      doc.add(new Field("id", ids[i], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
      doc.add(new Field("email", emails[i], Field.Store.YES, Field.Index.NOT_ANALYZED));
      doc.add(new Field(
          "email", "test" + i + "@test.com", Field.Store.YES, Field.Index.NOT_ANALYZED));
      doc.add(new Field("content", contents[i], Field.Store.NO, Field.Index.ANALYZED));
      doc.add(new Field("name", names[i], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
      // Store a number.
      doc.add(new NumericField("attach", Field.Store.YES, true).setIntValue(attachs[i]));
      // Store a date.
      doc.add(new NumericField("date", Field.Store.YES, true).setLongValue(dates[i].getTime()));
      String et = emails[i].substring(emails[i].lastIndexOf("@") + 1);
      System.out.println(et);
      if (scores.containsKey(et)) {
        doc.setBoost(scores.get(et)); // the default boost is 1
      } else {
        doc.setBoost(0.5f);
      }
      writer.addDocument(doc);
    }
  } catch (CorruptIndexException e) {
    e.printStackTrace();
  } catch (LockObtainFailedException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  } finally {
    try {
      if (writer != null) {
        writer.close();
      }
    } catch (CorruptIndexException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
}
public boolean IsInSearchIndex(String searchString, String termString) {
  System.out.println("Searching for '" + searchString + "'");
  IndexReader indexReader;
  try {
    indexReader = IndexReader.open(indexDirectory);
    IndexSearcher indexSearcher = new IndexSearcher(indexReader);
    Term term = new Term(termString, searchString);
    TermQuery query = new TermQuery(term);
    TopDocs topDocs = indexSearcher.search(query, 10);
    if (topDocs.scoreDocs.length > 0) {
      int docID = topDocs.scoreDocs[0].doc;
      Document doc = indexSearcher.doc(docID);
      textOfURL = doc.get("text");
      sourceCodeOfURL = doc.get("html");
      this.docId = docID;
      return true;
    } else {
      return false;
    }
  } catch (CorruptIndexException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  }
  return false;
}
public void undelete() {
  // Recover deleted documents through an IndexReader. The reader must be
  // opened with readOnly = false for the recovery to work.
  try {
    IndexReader reader = IndexReader.open(directory, false);
    reader.undeleteAll();
    reader.close();
  } catch (CorruptIndexException e) {
    e.printStackTrace();
  } catch (StaleReaderException e) {
    e.printStackTrace();
  } catch (LockObtainFailedException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  }
}
private void del(String queryString) {
  jlog.info("Deleting " + queryString);
  try {
    Query q = this.queryParser().parse(queryString);
    this.writer.deleteDocuments(q);
    this.writer.commit();
    jlog.info("Several docs deleted");
  } catch (CorruptIndexException cie) {
    jlog.severe("Corrupt index!");
    cie.printStackTrace();
  } catch (IOException ioe) {
    jlog.severe("Couldn't delete docs:\n\t" + ioe);
    ioe.printStackTrace();
  } catch (ParseException pe) {
    jlog.severe("Couldn't parse delete query:\n\t" + pe);
    pe.printStackTrace();
  }
}
@Override
public void reset() {
  try {
    // Opening the writer with create = true and closing it immediately
    // truncates the index.
    IndexWriter iWriter =
        new IndexWriter(index, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
    iWriter.close();
    objectList = null;
  } catch (CorruptIndexException e) {
    e.printStackTrace();
  } catch (LockObtainFailedException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  }
}
public IndexSearcher getSearcher() {
  try {
    if (reader == null) {
      reader = IndexReader.open(directory, false);
    } else {
      // Reopen only if the index has changed since the reader was opened;
      // openIfChanged() returns null when the reader is still current.
      IndexReader tr = IndexReader.openIfChanged(reader);
      if (tr != null) {
        reader.close();
        reader = tr;
      }
    }
    return new IndexSearcher(reader);
  } catch (CorruptIndexException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  }
  return null;
}
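// Hypothetical usage of getSearcher() (a sketch): fetch a fresh searcher
// before each query so openIfChanged() picks up commits made in between.
public int countHits(String id) throws IOException {
  IndexSearcher searcher = getSearcher();
  if (searcher == null) {
    return 0; // the index could not be opened
  }
  return searcher.search(new TermQuery(new Term("id", id)), 1).totalHits;
}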