示例#1
0
 /**
  * Builds a nature dictionary from a corpus directory and writes it out as text.
  *
  * <p>Every document found by {@code CorpusLoader.walk} is fed to a {@code NatureDictionaryMaker}
  * twice: once with the sentence list obtained via {@code getSimpleSentenceList(false)} and once
  * with the list obtained via {@code getSimpleSentenceList(true)}. After the walk finishes the
  * accumulated result is saved with {@code saveTxtTo}.
  *
  * @param args optional overrides: {@code args[0]} is the corpus directory and {@code args[1]} is
  *     the output path; when an argument is absent the historical default location is used
  */
 public static void main(String[] args) {
   // Defaults keep the previously hard-coded locations so running without arguments is unchanged.
   final String corpusPath =
       (args != null && args.length > 0) ? args[0] : "D:\\JavaProjects\\CorpusToolBox\\data\\2014";
   final String outputPath =
       (args != null && args.length > 1) ? args[1] : "data/test/CoreNatureDictionary";

   final NatureDictionaryMaker dictionaryMaker = new NatureDictionaryMaker();
   CorpusLoader.walk(
       corpusPath,
       new CorpusLoader.Handler() {
         @Override
         public void handle(Document document) {
           // First pass: the sentence list requested with `false` (the unsplit form, per the
           // original author's note).
           dictionaryMaker.compute(
               CorpusUtil.convert2CompatibleList(document.getSimpleSentenceList(false)));
           // Second pass: the sentence list requested with `true` (the split form, per the
           // original author's note).
           dictionaryMaker.compute(
               CorpusUtil.convert2CompatibleList(document.getSimpleSentenceList(true)));
         }
       });
   dictionaryMaker.saveTxtTo(outputPath);
 }
示例#2
0
  /**
   * Collects words from a corpus into a {@code DictionaryMaker} and optionally saves them as text.
   *
   * <p>Each document found by {@code CorpusLoader.walk} contributes the words of its
   * {@code getSimpleSentenceList(true)} sentences, filtered by {@code shouldInclude}. The corpus
   * is walked even when {@code outPath} is {@code null}; in that case nothing is written.
   *
   * @param inPath corpus location handed to {@code CorpusLoader.walk}
   * @param outPath destination passed to {@code saveTxtTo}, or {@code null} to skip saving
   * @return the result of {@code saveTxtTo(outPath)}, or {@code false} when {@code outPath} is
   *     {@code null}
   */
  static boolean makeCoreDictionary(String inPath, String outPath) {
    final DictionaryMaker dictionaryMaker = new DictionaryMaker();

    CorpusLoader.walk(
        inPath,
        new CorpusLoader.Handler() {
          @Override
          public void handle(Document document) {
            for (List<Word> sentence : document.getSimpleSentenceList(true)) {
              for (Word word : sentence) {
                if (shouldInclude(word)) {
                  dictionaryMaker.add(word);
                }
              }
            }
          }

          /**
           * Decides whether a word is added to the dictionary.
           *
           * <p>Words labelled {@code nr} are always rejected. Words labelled {@code m},
           * {@code mq}, {@code w} or {@code t} are kept only when
           * {@code TextUtility.isAllChinese} accepts their value. Everything else is kept.
           * NOTE(review): the labels look like part-of-speech tags (presumably numeral,
           * numeral-quantifier, punctuation, time and person name) - confirm against the tag set.
           */
          boolean shouldInclude(Word word) {
            final String label = word.label;
            if ("nr".equals(label)) {
              return false;
            }
            final boolean needsChineseCheck =
                "m".equals(label)
                    || "mq".equals(label)
                    || "w".equals(label)
                    || "t".equals(label);
            // Short-circuit keeps isAllChinese from being called for other labels.
            return !needsChineseCheck || TextUtility.isAllChinese(word.value);
          }
        });

    if (outPath == null) {
      return false;
    }
    return dictionaryMaker.saveTxtTo(outPath);
  }