public static void main(String[] args) { // makeCoreDictionary("D:\\JavaProjects\\CorpusToolBox\\data\\2014", // "data/dictionary/CoreNatureDictionary.txt"); // EasyDictionary dictionary = // EasyDictionary.create("data/dictionary/CoreNatureDictionary.txt"); final NatureDictionaryMaker dictionaryMaker = new NatureDictionaryMaker(); CorpusLoader.walk( "D:\\JavaProjects\\CorpusToolBox\\data\\2014", new CorpusLoader.Handler() { @Override public void handle(Document document) { dictionaryMaker.compute( CorpusUtil.convert2CompatibleList( document.getSimpleSentenceList(false))); // 再打一遍不拆分的 dictionaryMaker.compute( CorpusUtil.convert2CompatibleList(document.getSimpleSentenceList(true))); // 先打一遍拆分的 } }); dictionaryMaker.saveTxtTo("data/test/CoreNatureDictionary"); }
static boolean makeCoreDictionary(String inPath, String outPath) { final DictionaryMaker dictionaryMaker = new DictionaryMaker(); final TreeSet<String> labelSet = new TreeSet<String>(); CorpusLoader.walk( inPath, new CorpusLoader.Handler() { @Override public void handle(Document document) { for (List<Word> sentence : document.getSimpleSentenceList(true)) { for (Word word : sentence) { if (shouldInclude(word)) dictionaryMaker.add(word); } } // for (List<Word> sentence : document.getSimpleSentenceList(false)) // { // for (Word word : sentence) // { // if (shouldInclude(word)) // dictionaryMaker.add(word); // } // } } boolean shouldInclude(Word word) { if ("m".equals(word.label) || "mq".equals(word.label) || "w".equals(word.label) || "t".equals(word.label)) { if (!TextUtility.isAllChinese(word.value)) return false; } else if ("nr".equals(word.label)) { return false; } return true; } }); if (outPath != null) return dictionaryMaker.saveTxtTo(outPath); return false; }