/**
 * Loads the word-to-word association (bigram) table from {@code bigramdict.dic}.
 *
 * <p>Each non-blank line is expected to look like {@code from@to<TAB>freq}. Both words are
 * looked up through {@code DATDictionary.getItem}. When a lookup yields {@code AnsjItem.NULL}
 * and the word contains {@code '#'}, the item is replaced by {@code AnsjItem.BEGIN} (left
 * side) or {@code AnsjItem.END} (right side). Pairs whose items are still
 * {@code AnsjItem.NULL} are ignored; otherwise {@code freq} is stored in the left item's
 * {@code bigramEntryMap} under the right item's index.
 *
 * <p>Lines that lack the tab or the {@code '@'} separator, or whose frequency is not an
 * integer, are skipped so that a single malformed line does not abort the whole load.
 * I/O failures are printed and end the load.
 */
public static void initBigramTables() {
    BufferedReader reader = null;
    try {
        reader = IOUtil.getReader(DicReader.getInputStream("bigramdict.dic"), "UTF-8");
        String line = null;
        while ((line = reader.readLine()) != null) {
            if (StringUtil.isBlank(line)) {
                continue;
            }

            // fields[0] = "from@to", fields[1] = frequency
            String[] fields = line.split("\t");
            if (fields.length < 2) {
                continue;
            }
            // String.split drops trailing empty strings, so "word@" yields a single element.
            String[] pair = fields[0].split("@");
            if (pair.length < 2) {
                continue;
            }

            int freq;
            try {
                freq = Integer.parseInt(fields[1]);
            } catch (NumberFormatException e) {
                // Report the bad line but keep loading the remaining entries.
                e.printStackTrace();
                continue;
            }

            AnsjItem fromItem = DATDictionary.getItem(pair[0]);
            AnsjItem toItem = DATDictionary.getItem(pair[1]);

            // Words not found in the dictionary that contain '#' stand for the
            // BEGIN / END items.
            if (fromItem == AnsjItem.NULL && pair[0].contains("#")) {
                fromItem = AnsjItem.BEGIN;
            }
            if (toItem == AnsjItem.NULL && pair[1].contains("#")) {
                toItem = AnsjItem.END;
            }

            if (fromItem == AnsjItem.NULL || toItem == AnsjItem.NULL) {
                continue;
            }

            // The per-item map is created on first use.
            if (fromItem.bigramEntryMap == null) {
                fromItem.bigramEntryMap = new HashMap<Integer, Integer>();
            }
            fromItem.bigramEntryMap.put(toItem.getIndex(), freq);
        }
    } catch (UnsupportedEncodingException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        IOUtil.close(reader);
    }
}
/**
 * Deserializes the name part-of-speech frequency object stored in the
 * {@code person/asian_name_freq.data} resource.
 *
 * <p>An {@code IOException} or {@code ClassNotFoundException} raised while opening or
 * reading the stream is printed, and the initially created empty map is returned instead.
 *
 * @return the object read from the resource, cast to {@code Map<String, int[][]>}, or an
 *         empty map when one of the exceptions above occurred
 */
@SuppressWarnings("unchecked")
public static Map<String, int[][]> getPersonFreqMap() {
    Map<String, int[][]> personFreq = new HashMap<String, int[][]>(0);
    InputStream rawStream = null;
    ObjectInputStream objectStream = null;
    try {
        rawStream = DicReader.getInputStream("person/asian_name_freq.data");
        objectStream = new ObjectInputStream(rawStream);
        Object decoded = objectStream.readObject();
        personFreq = (Map<String, int[][]>) decoded;
    } catch (IOException e) {
        e.printStackTrace();
    } catch (ClassNotFoundException e) {
        e.printStackTrace();
    } finally {
        try {
            if (objectStream != null) {
                objectStream.close();
            }
            if (rawStream != null) {
                rawStream.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return personFreq;
}
public static void initArrays(BufferedReader reader) throws Exception { /** 人名识别必备的 */ HashMap<String, PersonNatureAttr> personMap = new PersonAttrLibrary().getPersonMap(); PersonNatureAttr personAttr = null; /** 机构名识别必备的 */ HashMap<String, CompanyNatureAttr> companyMap = new CompanyAttrLibrary().getCompanyMap(); CompanyNatureAttr companyAttr = null; HashMap<String, NewWordNatureAttr> newWordMap = new NewWordAttrLibrary().getNewWordMap(); NewWordNatureAttr newWordAttr = null; /** 下面开始加载词典 */ String temp = null; String[] strs = null; int num = 0; while ((temp = reader.readLine()) != null) { strs = temp.split(" "); num = Integer.parseInt(strs[0]); base[num] = Integer.parseInt(strs[2]); check[num] = Integer.parseInt(strs[3]); status[num] = Byte.parseByte(strs[4]); if (!"null".equals(strs[5])) { words[num] = strs[1]; if (status[num] < 4) { for (int i = 0; i < strs[1].length(); i++) { IN_SYSTEM[strs[1].charAt(i)] = strs[1].charAt(i); } } // 加载词性 TermNatures tn = new TermNatures(TermNature.setNatureStrToArray(strs[5]), num); // 判断是否是人名属性 if ((personAttr = personMap.get(strs[1])) != null) { tn.setPersonNatureAttr(personAttr); } // 判断是否是地名属性 if ((companyAttr = companyMap.get(strs[1])) != null) { tn.setCompanyAttr(companyAttr); } // 判断是否是新词属性 if ((newWordAttr = newWordMap.get(strs[1])) != null) { // 更新成词的概率 newWordAttr.updateAll(tn.allFreq); tn.setNewWordAttr(newWordAttr); } termNatures[num] = tn; } } // 人名词性补录 Set<Entry<String, PersonNatureAttr>> entrySet = personMap.entrySet(); char c = 0; TermNatures tn = null; for (Entry<String, PersonNatureAttr> entry : entrySet) { if (entry.getKey().length() == 1) { c = entry.getKey().charAt(0); if (status[c] > 1) { continue; } if (status[c] == 0) { base[c] = c; check[c] = -1; status[c] = 3; words[c] = entry.getKey(); } if ((tn = termNatures[c]) == null) { tn = new TermNatures(TermNature.NR); } tn.setPersonNatureAttr(entry.getValue()); termNatures[c] = tn; } } // 机构词性补录 Set<Entry<String, CompanyNatureAttr>> cnSet = 
companyMap.entrySet(); for (Entry<String, CompanyNatureAttr> entry : cnSet) { if (entry.getKey().length() == 1) { c = entry.getKey().charAt(0); if (status[c] > 1) { continue; } if (status[c] == 0) { base[c] = c; check[c] = -1; status[c] = 3; words[c] = entry.getKey(); } if ((tn = termNatures[c]) == null) { tn = new TermNatures(TermNature.NULL); } tn.setCompanyAttr(entry.getValue()); termNatures[c] = tn; } } // 简繁体字体转换 BufferedReader reader2 = DicReader.getReader("jianFan.dic"); while ((temp = reader2.readLine()) != null) { temp = temp.trim(); if (StringUtil.isBlank(temp)) { continue; } if (IN_SYSTEM[temp.charAt(0)] == 0) { IN_SYSTEM[temp.charAt(0)] = temp.charAt(2); } } reader.close(); reader2.close(); }
/**
 * Opens the part-of-speech association table, {@code nature/nature.table}.
 *
 * @return the reader that {@code DicReader.getReader} yields for that resource
 */
public static BufferedReader getNatureTableReader() {
    final String resource = "nature/nature.table";
    return DicReader.getReader(resource);
}
/**
 * Opens the frequency dictionary of single characters used in names,
 * {@code person/name_freq.dic}.
 *
 * @return the reader that {@code DicReader.getReader} yields for that resource
 */
public static BufferedReader getPersonFreqReader() {
    final String resource = "person/name_freq.dic";
    return DicReader.getReader(resource);
}
/**
 * Opens the English dictionary, {@code englishLibrary.dic}.
 *
 * @return the reader that {@code DicReader.getReader} yields for that resource
 */
public static BufferedReader getEnglishReader() {
    final String resource = "englishLibrary.dic";
    return DicReader.getReader(resource);
}
/**
 * Opens the part-of-speech table, {@code nature/nature.map}.
 *
 * @return the reader that {@code DicReader.getReader} yields for that resource
 */
public static BufferedReader getNatureMapReader() {
    final String resource = "nature/nature.map";
    return DicReader.getReader(resource);
}
/**
 * Opens the number dictionary, {@code numberLibrary.dic}.
 *
 * @return the reader that {@code DicReader.getReader} yields for that resource
 */
public static BufferedReader getNumberReader() {
    final String resource = "numberLibrary.dic";
    return DicReader.getReader(resource);
}
/**
 * Opens the core dictionary, {@code arrays.dic}.
 *
 * @return the reader that {@code DicReader.getReader} yields for that resource
 */
public static BufferedReader getArraysReader() {
    final String resource = "arrays.dic";
    return DicReader.getReader(resource);
}
/**
 * Opens the new-word frequency dictionary, {@code newWord/new_word_freq.dic}.
 *
 * @return the reader that {@code DicReader.getReader} yields for that resource
 */
public static BufferedReader getNewWordReader() {
    final String resource = "newWord/new_word_freq.dic";
    return DicReader.getReader(resource);
}
/**
 * Opens the organization (company) name dictionary, {@code company/company.data}.
 *
 * @return the reader that {@code DicReader.getReader} yields for that resource
 */
public static BufferedReader getCompanReader() {
    final String resource = "company/company.data";
    return DicReader.getReader(resource);
}
/**
 * Opens the person-name dictionary, {@code person/person.dic}.
 *
 * @return the reader that {@code DicReader.getReader} yields for that resource
 */
public static BufferedReader getPersonReader() {
    final String resource = "person/person.dic";
    return DicReader.getReader(resource);
}