@Test public void indexTest() throws CorruptIndexException, LockObtainFailedException, IOException, ParseException { HashSet<String> hs = new HashSet<String>(); BufferedReader reader2 = IOUtil.getReader(ResourceBundle.getBundle("library").getString("stopLibrary"), "UTF-8"); String word = null; while ((word = reader2.readLine()) != null) { hs.add(word); } Analyzer analyzer = new AnsjAnalysis(hs, false); Directory directory = null; IndexWriter iwriter = null; BufferedReader reader = IOUtil.getReader("/Users/ansj/Desktop/未命名文件夹/indextest.txt", "UTF-8"); String temp = null; StringBuilder sb = new StringBuilder(); while ((temp = reader.readLine()) != null) { sb.append(temp); sb.append("\n"); } reader.close(); String text = sb.toString(); text = "lucene"; IndexWriterConfig ic = new IndexWriterConfig(Version.LUCENE_32, analyzer); // 建立内存索引对象 directory = new RAMDirectory(); iwriter = new IndexWriter(directory, ic); // BufferedReader reader = // IOUtil.getReader("/Users/ansj/Documents/快盘/分词/语料/1998年人民日报分词语料_未区分.txt", // "GBK"); // String temp = null; // while ((temp = reader.readLine()) != null) { // addContent(iwriter, temp); // } addContent(iwriter, text); addContent(iwriter, text); addContent(iwriter, text); addContent(iwriter, text); iwriter.commit(); iwriter.close(); System.out.println("索引建立完毕"); search(analyzer, directory, "lucene"); }
static { try { long start = System.currentTimeMillis(); FOREST = new Forest(); // 先加载系统内置补充词典 BufferedReader br = MyStaticValue.getUserDefineReader(); String temp = null; while ((temp = br.readLine()) != null) { if (StringUtil.isBlank(temp) || InitDictionary.isInSystemDic(temp.split("\t")[0])) { continue; } else { Library.insertWord(FOREST, temp); } } // 如果系统设置了用户词典.那么..呵呵 temp = MyStaticValue.userDefinePath; // 加载用户自定义词典 Value value = null; String[] strs = null; if ((temp != null || (temp = MyStaticValue.rb.getString("userLibrary")) != null) && new File(temp).isFile()) { br = IOUtil.getReader(temp, "UTF-8"); while ((temp = br.readLine()) != null) { if (StringUtil.isBlank(temp)) { continue; } else { strs = temp.split("\t"); if (strs.length != 3) { value = new Value(strs[0], PARAMER); } else { value = new Value(strs[0], strs[1], strs[2]); } if (!InitDictionary.isInSystemDic(value.getKeyword())) { Library.insertWord(FOREST, value); } } } } else { System.err.println("用户自定义词典:" + temp + ", 没有这个文件!"); } System.out.println("加载用户自定义词典完成用时:" + (System.currentTimeMillis() - start)); } catch (Exception e) { // TODO Auto-generated catch block System.out.println("加载用户自定义词典加载失败:"); } }
public static void main(String[] args) throws IOException { // 学习机器是有状态的 long start = System.currentTimeMillis(); LearnTool learn = new LearnTool(); BufferedReader materialsReader = IOUtil.getReader("/Users/ansj/Downloads/红楼梦.txt", "GBK"); String temp = null; while ((temp = materialsReader.readLine()) != null) { List<Term> paser = NlpAnalysis.paser(temp, learn); // System.out.println(paser); } System.out.println("这次训练已经学到了: " + learn.count + " 个词!"); System.out.println(System.currentTimeMillis() - start); System.out.println(learn.getTopTree(100)); }