/**
 * Builds an in-memory Lucene index using {@link AnsjAnalysis} and then searches it.
 *
 * <p>Steps: load the stop words (one per line) from the file named by the {@code stopLibrary}
 * key of the {@code library} resource bundle, create the analyzer, add the same text to a
 * {@link RAMDirectory}-backed index four times through {@code addContent}, and finally call
 * {@code search} for the term {@code "lucene"}.
 *
 * <p>The method itself makes no assertions; it fails only if one of the steps throws.
 *
 * @throws CorruptIndexException propagated from the index writer
 * @throws LockObtainFailedException propagated from the index writer
 * @throws IOException if a stop-word or sample file cannot be read, or the index cannot be written
 * @throws ParseException propagated from {@code search}
 */
@Test
  public void indexTest()
      throws CorruptIndexException, LockObtainFailedException, IOException, ParseException {
    // Collect the stop words; the reader is closed even if reading fails part-way.
    HashSet<String> hs = new HashSet<String>();
    BufferedReader reader2 =
        IOUtil.getReader(ResourceBundle.getBundle("library").getString("stopLibrary"), "UTF-8");
    try {
      String word = null;
      while ((word = reader2.readLine()) != null) {
        hs.add(word);
      }
    } finally {
      reader2.close();
    }
    Analyzer analyzer = new AnsjAnalysis(hs, false);

    // Read the sample document, joining its lines with '\n'.
    StringBuilder sb = new StringBuilder();
    BufferedReader reader = IOUtil.getReader("/Users/ansj/Desktop/未命名文件夹/indextest.txt", "UTF-8");
    try {
      String temp = null;
      while ((temp = reader.readLine()) != null) {
        sb.append(temp);
        sb.append("\n");
      }
    } finally {
      reader.close();
    }
    String text = sb.toString();

    // NOTE(review): the file content read above is immediately replaced by a fixed string, so
    // the read currently only has the effect of failing when the file is missing. Confirm
    // whether indexing the file content was intended.
    text = "lucene";

    IndexWriterConfig ic = new IndexWriterConfig(Version.LUCENE_32, analyzer);
    // Create the in-memory index.
    Directory directory = new RAMDirectory();
    IndexWriter iwriter = new IndexWriter(directory, ic);
    try {
      addContent(iwriter, text);
      addContent(iwriter, text);
      addContent(iwriter, text);
      addContent(iwriter, text);
      iwriter.commit();
    } finally {
      // Always release the writer, even when adding a document fails.
      iwriter.close();
    }

    System.out.println("索引建立完毕");

    search(analyzer, directory, "lucene");
  }
  // Class initializer: creates FOREST and fills it from two sources, in order:
  //   1. the reader returned by MyStaticValue.getUserDefineReader();
  //   2. the user dictionary file, whose path is MyStaticValue.userDefinePath or, when that is
  //      null, the "userLibrary" entry of MyStaticValue.rb.
  // Words already present in the system dictionary are skipped in both passes.
  // Any failure is reported and otherwise ignored, so class loading itself does not fail.
  static {
    try {

      long start = System.currentTimeMillis();
      FOREST = new Forest();

      // Load the built-in supplementary dictionary first.
      BufferedReader br = MyStaticValue.getUserDefineReader();
      try {
        String builtInLine = null;
        while ((builtInLine = br.readLine()) != null) {
          // Skip blank lines and entries whose first tab-separated field is a system word.
          if (StringUtil.isBlank(builtInLine)
              || InitDictionary.isInSystemDic(builtInLine.split("\t")[0])) {
            continue;
          }
          Library.insertWord(FOREST, builtInLine);
        }
      } finally {
        if (br != null) {
          br.close();
        }
      }

      // Resolve the user dictionary path: explicit setting first, resource bundle as fallback.
      String userPath = MyStaticValue.userDefinePath;
      if ((userPath != null || (userPath = MyStaticValue.rb.getString("userLibrary")) != null)
          && new File(userPath).isFile()) {
        // Load the user-defined dictionary.
        BufferedReader userReader = IOUtil.getReader(userPath, "UTF-8");
        try {
          String userLine = null;
          while ((userLine = userReader.readLine()) != null) {
            if (StringUtil.isBlank(userLine)) {
              continue;
            }
            String[] strs = userLine.split("\t");
            Value value = null;
            if (strs.length != 3) {
              // Lines without exactly three fields get the default parameters.
              value = new Value(strs[0], PARAMER);
            } else {
              value = new Value(strs[0], strs[1], strs[2]);
            }

            if (!InitDictionary.isInSystemDic(value.getKeyword())) {
              Library.insertWord(FOREST, value);
            }
          }
        } finally {
          if (userReader != null) {
            userReader.close();
          }
        }
      } else {
        System.err.println("用户自定义词典:" + userPath + ", 没有这个文件!");
      }
      System.out.println("加载用户自定义词典完成用时:" + (System.currentTimeMillis() - start));
    } catch (Exception e) {
      // Report the cause instead of silently dropping it; initialization continues best-effort.
      System.out.println("加载用户自定义词典加载失败:" + e);
      e.printStackTrace();
    }
  }
// Example #3
// 0
  /**
   * Feeds a text file line by line through {@code NlpAnalysis.paser} with a shared
   * {@link LearnTool}, then prints the number of words learned, the elapsed milliseconds and
   * the result of {@code learn.getTopTree(100)}.
   *
   * @param args optional; {@code args[0]} is the path of the text file (default
   *     {@code /Users/ansj/Downloads/红楼梦.txt}) and {@code args[1]} is its charset name
   *     (default {@code GBK})
   * @throws IOException if the file cannot be opened or read
   */
  public static void main(String[] args) throws IOException {
    long start = System.currentTimeMillis();
    // The learning tool is stateful: it accumulates results across all parsed lines.
    LearnTool learn = new LearnTool();

    String path = (args != null && args.length > 0) ? args[0] : "/Users/ansj/Downloads/红楼梦.txt";
    String charset = (args != null && args.length > 1) ? args[1] : "GBK";

    BufferedReader materialsReader = IOUtil.getReader(path, charset);
    try {
      String temp = null;
      while ((temp = materialsReader.readLine()) != null) {
        // The returned terms are not needed here; the call is made for its effect on `learn`.
        NlpAnalysis.paser(temp, learn);
      }
    } finally {
      materialsReader.close();
    }

    System.out.println("这次训练已经学到了: " + learn.count + " 个词!");
    System.out.println(System.currentTimeMillis() - start);
    System.out.println(learn.getTopTree(100));
  }