static { try { long start = System.currentTimeMillis(); FOREST = new Forest(); // 先加载系统内置补充词典 BufferedReader br = MyStaticValue.getUserDefineReader(); String temp = null; while ((temp = br.readLine()) != null) { if (StringUtil.isBlank(temp) || InitDictionary.isInSystemDic(temp.split("\t")[0])) { continue; } else { Library.insertWord(FOREST, temp); } } // 如果系统设置了用户词典.那么..呵呵 temp = MyStaticValue.userDefinePath; // 加载用户自定义词典 Value value = null; String[] strs = null; if ((temp != null || (temp = MyStaticValue.rb.getString("userLibrary")) != null) && new File(temp).isFile()) { br = IOUtil.getReader(temp, "UTF-8"); while ((temp = br.readLine()) != null) { if (StringUtil.isBlank(temp)) { continue; } else { strs = temp.split("\t"); if (strs.length != 3) { value = new Value(strs[0], PARAMER); } else { value = new Value(strs[0], strs[1], strs[2]); } if (!InitDictionary.isInSystemDic(value.getKeyword())) { Library.insertWord(FOREST, value); } } } } else { System.err.println("用户自定义词典:" + temp + ", 没有这个文件!"); } System.out.println("加载用户自定义词典完成用时:" + (System.currentTimeMillis() - start)); } catch (Exception e) { // TODO Auto-generated catch block System.out.println("加载用户自定义词典加载失败:"); } }
/**
 * Returns the id of a word in the dictionary.
 *
 * The id is the array index reached by starting at the code of the first
 * character and, for each following character, moving to
 * {@code base[current] + nextChar}. A step is accepted when the
 * {@code check} entry at the new index is {@code -1} or equals the index
 * the step came from.
 *
 * @param str
 *            the word to look up
 * @return the index reached after the last character, or 0 when
 *         {@code str} is blank, a step lands beyond the end of
 *         {@code check}, or a step is rejected
 */
public static int getWordId(String str) {
    if (StringUtil.isBlank(str)) {
        return 0;
    }

    final int length = str.length();
    int current = str.charAt(0);
    int index = 1;
    while (index < length) {
        final int parent = current;
        current = base[parent] + str.charAt(index);
        if (current >= check.length) {
            return 0;
        }
        final int owner = check[current];
        if (owner != -1 && owner != parent) {
            return 0;
        }
        index++;
    }
    return current;
}
/**
 * Tells whether a word exists in the system dictionary.
 *
 * The word is traced through the {@code base}/{@code check} arrays one
 * character at a time; it counts as present when every step is accepted
 * and the {@code status} entry at the final index is greater than 1.
 *
 * @param str
 *            the word to test
 * @return {@code true} when {@code str} is blank or is found as described
 *         above; {@code false} otherwise
 */
public static boolean isInSystemDic(String str) {
    if (StringUtil.isBlank(str)) {
        // Blank input is reported as present.
        return true;
    }

    int state = str.charAt(0);
    for (int pos = 1, len = str.length(); pos < len; pos++) {
        final int previous = state;
        state = base[previous] + str.charAt(pos);
        if (state >= check.length) {
            return false;
        }
        final int expected = check[state];
        final boolean accepted = expected == -1 || expected == previous;
        if (!accepted) {
            return false;
        }
    }
    return status[state] > 1;
}
public static void initArrays(BufferedReader reader) throws Exception { /** 人名识别必备的 */ HashMap<String, PersonNatureAttr> personMap = new PersonAttrLibrary().getPersonMap(); PersonNatureAttr personAttr = null; /** 机构名识别必备的 */ HashMap<String, CompanyNatureAttr> companyMap = new CompanyAttrLibrary().getCompanyMap(); CompanyNatureAttr companyAttr = null; HashMap<String, NewWordNatureAttr> newWordMap = new NewWordAttrLibrary().getNewWordMap(); NewWordNatureAttr newWordAttr = null; /** 下面开始加载词典 */ String temp = null; String[] strs = null; int num = 0; while ((temp = reader.readLine()) != null) { strs = temp.split(" "); num = Integer.parseInt(strs[0]); base[num] = Integer.parseInt(strs[2]); check[num] = Integer.parseInt(strs[3]); status[num] = Byte.parseByte(strs[4]); if (!"null".equals(strs[5])) { words[num] = strs[1]; if (status[num] < 4) { for (int i = 0; i < strs[1].length(); i++) { IN_SYSTEM[strs[1].charAt(i)] = strs[1].charAt(i); } } // 加载词性 TermNatures tn = new TermNatures(TermNature.setNatureStrToArray(strs[5]), num); // 判断是否是人名属性 if ((personAttr = personMap.get(strs[1])) != null) { tn.setPersonNatureAttr(personAttr); } // 判断是否是地名属性 if ((companyAttr = companyMap.get(strs[1])) != null) { tn.setCompanyAttr(companyAttr); } // 判断是否是新词属性 if ((newWordAttr = newWordMap.get(strs[1])) != null) { // 更新成词的概率 newWordAttr.updateAll(tn.allFreq); tn.setNewWordAttr(newWordAttr); } termNatures[num] = tn; } } // 人名词性补录 Set<Entry<String, PersonNatureAttr>> entrySet = personMap.entrySet(); char c = 0; TermNatures tn = null; for (Entry<String, PersonNatureAttr> entry : entrySet) { if (entry.getKey().length() == 1) { c = entry.getKey().charAt(0); if (status[c] > 1) { continue; } if (status[c] == 0) { base[c] = c; check[c] = -1; status[c] = 3; words[c] = entry.getKey(); } if ((tn = termNatures[c]) == null) { tn = new TermNatures(TermNature.NR); } tn.setPersonNatureAttr(entry.getValue()); termNatures[c] = tn; } } // 机构词性补录 Set<Entry<String, CompanyNatureAttr>> cnSet = 
companyMap.entrySet(); for (Entry<String, CompanyNatureAttr> entry : cnSet) { if (entry.getKey().length() == 1) { c = entry.getKey().charAt(0); if (status[c] > 1) { continue; } if (status[c] == 0) { base[c] = c; check[c] = -1; status[c] = 3; words[c] = entry.getKey(); } if ((tn = termNatures[c]) == null) { tn = new TermNatures(TermNature.NULL); } tn.setCompanyAttr(entry.getValue()); termNatures[c] = tn; } } // 简繁体字体转换 BufferedReader reader2 = DicReader.getReader("jianFan.dic"); while ((temp = reader2.readLine()) != null) { temp = temp.trim(); if (StringUtil.isBlank(temp)) { continue; } if (IN_SYSTEM[temp.charAt(0)] == 0) { IN_SYSTEM[temp.charAt(0)] = temp.charAt(2); } } reader.close(); reader2.close(); }