/** * 从磁盘加载双数组 * * @param path * @return */ static boolean loadDat(String path) { try { ByteArray byteArray = ByteArray.createByteArray(path + Predefine.BIN_EXT); if (byteArray == null) return false; int size = byteArray.nextInt(); CoreDictionary.Attribute[] attributes = new CoreDictionary.Attribute[size]; final Nature[] natureIndexArray = Nature.values(); for (int i = 0; i < size; ++i) { // 第一个是全部频次,第二个是词性个数 int currentTotalFrequency = byteArray.nextInt(); int length = byteArray.nextInt(); attributes[i] = new CoreDictionary.Attribute(length); attributes[i].totalFrequency = currentTotalFrequency; for (int j = 0; j < length; ++j) { attributes[i].nature[j] = natureIndexArray[byteArray.nextInt()]; attributes[i].frequency[j] = byteArray.nextInt(); } } if (!trie.load(byteArray, attributes) || byteArray.hasMore()) return false; } catch (Exception e) { logger.warning("读取失败,问题发生在" + e); return false; } return true; }
/**
 * Predicts the outcome distribution for a context given as feature strings.
 *
 * <p>Each context string is translated to its integer id through {@code pmap}; strings that
 * are not present in {@code pmap} are mapped to {@code -1}. {@code outsums} is passed to
 * {@code prior.logPrior} before the id-based {@code eval} overload is invoked.
 *
 * @param context the context (feature strings)
 * @param outsums the prior distribution; also passed on to the id-based overload
 * @return the probability array produced by the id-based {@code eval} overload
 */
public final double[] eval(String[] context, double[] outsums) {
    int[] featureIds = new int[context.length];
    int position = 0;
    for (String feature : context) {
        Integer id = pmap.get(feature);
        if (id == null) {
            featureIds[position] = -1;
        } else {
            featureIds[position] = id;
        }
        position++;
    }
    prior.logPrior(outsums);
    return eval(featureIds, outsums, evalParams);
}
// 自动加载词典 static { long start = System.currentTimeMillis(); if (!load(path)) { System.err.printf("核心词典%s加载失败\n", path); System.exit(-1); } else { logger.info( path + "加载成功," + trie.size() + "个词条,耗时" + (System.currentTimeMillis() - start) + "ms"); } }
/**
 * Merges vertices of a coarse segmentation according to the custom dictionary.
 *
 * <p>Starting at every vertex, the {@code realWord} values of consecutive vertices are fed
 * into the custom dictionary trie. Whenever a span of two or more vertices reaches a state
 * that has an output, the span is remembered; the last (longest) such span wins. The vertices
 * of that span are replaced by a single new vertex whose word is the concatenation of their
 * {@code realWord} values and whose attribute is the dictionary output. A single vertex that
 * matches on its own is left untouched.
 *
 * <p>The given list is modified in place: it is cleared and refilled with the merged result.
 *
 * @param vertexList the coarse segmentation result
 * @return the same list instance, holding the merged result
 */
protected static List<Vertex> combineByCustomDictionary(List<Vertex> vertexList) {
    Vertex[] wordNet = new Vertex[vertexList.size()];
    vertexList.toArray(wordNet);
    DoubleArrayTrie<CoreDictionary.Attribute> dat = CustomDictionary.dat;
    for (int i = 0; i < wordNet.length; ++i) {
        // Every match attempt starts from state 1.
        int state = dat.transition(wordNet[i].realWord, 1);
        if (state > 0) {
            int start = i;
            int to = i + 1;
            int end = -1;
            CoreDictionary.Attribute value = null;
            for (; to < wordNet.length; ++to) {
                state = dat.transition(wordNet[to].realWord, state);
                if (state < 0) break;
                CoreDictionary.Attribute output = dat.output(state);
                if (output != null) {
                    // Keep extending: a later output overrides an earlier one (longest match).
                    value = output;
                    end = to + 1;
                }
            }
            if (value != null) {
                StringBuilder sbTerm = new StringBuilder();
                for (int j = start; j < end; ++j) {
                    // Concatenate the same realWord text that was matched against the trie,
                    // rather than relying on whatever Vertex.toString() happens to return.
                    sbTerm.append(wordNet[j].realWord);
                    wordNet[j] = null;
                }
                wordNet[i] = new Vertex(sbTerm.toString(), value);
                // Continue after the merged span (the loop increment moves on to index end).
                i = end - 1;
            }
        }
    }
    vertexList.clear();
    for (Vertex vertex : wordNet) {
        // Slots emptied by a merge are skipped.
        if (vertex != null) vertexList.add(vertex);
    }
    return vertexList;
}
/**
 * Loads the core dictionary, preferring the binary cache and falling back to the text file.
 *
 * <p>If {@code loadDat(path)} succeeds nothing else is done. Otherwise the UTF-8 text file at
 * {@code path} is read line by line. Each line is split on single whitespace characters: the
 * first field is the word, followed by (nature name, frequency) pairs. The trie is built from
 * the resulting map and then written, together with the attributes, to
 * {@code path + Predefine.BIN_EXT} as a cache.
 *
 * <p>Both the reader and the cache output stream are closed even when reading, parsing or
 * writing fails. Unchecked exceptions from parsing a line (unknown nature name, malformed
 * frequency) are not caught here and propagate to the caller.
 *
 * @param path path of the dictionary text file
 * @return {@code true} if the dictionary was loaded from the cache, or was loaded from the text
 *     file and the cache was written; {@code false} if the text file is missing or unreadable,
 *     or if writing the cache failed
 */
private static boolean load(String path) {
    logger.info("核心词典开始加载:" + path);
    if (loadDat(path)) return true;
    TreeMap<String, CoreDictionary.Attribute> map = new TreeMap<String, Attribute>();
    try {
        BufferedReader br =
            new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));
        try {
            String line;
            int frequencySum = 0;
            long start = System.currentTimeMillis();
            while ((line = br.readLine()) != null) {
                String[] param = line.split("\\s");
                // Fields after the word come in (nature, frequency) pairs.
                int natureCount = (param.length - 1) / 2;
                CoreDictionary.Attribute attribute = new CoreDictionary.Attribute(natureCount);
                for (int i = 0; i < natureCount; ++i) {
                    attribute.nature[i] = Enum.valueOf(Nature.class, param[1 + 2 * i]);
                    attribute.frequency[i] = Integer.parseInt(param[2 + 2 * i]);
                    attribute.totalFrequency += attribute.frequency[i];
                }
                map.put(param[0], attribute);
                frequencySum += attribute.totalFrequency;
            }
            logger.info(
                "核心词典读入词条" + map.size() + " 全部频次" + frequencySum + ",耗时"
                    + (System.currentTimeMillis() - start) + "ms");
        } finally {
            // Release the file handle on every path, including parse failures.
            br.close();
        }
        trie.build(map);
        logger.info("核心词典加载成功:" + trie.size() + "个词条,下面将写入缓存……");
        try {
            DataOutputStream out =
                new DataOutputStream(new FileOutputStream(path + Predefine.BIN_EXT));
            try {
                Collection<CoreDictionary.Attribute> attributeList = map.values();
                out.writeInt(attributeList.size());
                for (CoreDictionary.Attribute attribute : attributeList) {
                    out.writeInt(attribute.totalFrequency);
                    out.writeInt(attribute.nature.length);
                    for (int i = 0; i < attribute.nature.length; ++i) {
                        out.writeInt(attribute.nature[i].ordinal());
                        out.writeInt(attribute.frequency[i]);
                    }
                }
                trie.save(out);
            } finally {
                // Close the cache file even when writing fails part-way.
                out.close();
            }
        } catch (Exception e) {
            logger.warning("保存失败" + e);
            return false;
        }
    } catch (FileNotFoundException e) {
        logger.warning("核心词典" + path + "不存在!" + e);
        return false;
    } catch (IOException e) {
        logger.warning("核心词典" + path + "读取错误!" + e);
        return false;
    }
    return true;
}
/**
 * Tells whether the dictionary contains a word.
 *
 * @param key the word to look up
 * @return {@code true} if {@code trie.get(key)} yields a non-null entry
 */
public static boolean contains(String key) {
    Attribute attribute = trie.get(key);
    return attribute != null;
}
/**
 * Gets a dictionary entry by its integer word ID.
 *
 * @param wordID the ID passed on to {@code trie.get(int)}
 * @return the entry returned by the trie for that ID
 */
public static Attribute get(int wordID) {
    final Attribute attribute = trie.get(wordID);
    return attribute;
}
/**
 * Gets a dictionary entry by word.
 *
 * @param key the word to look up
 * @return the entry returned by {@code trie.get(key)}; {@link #contains(String)} treats a
 *     {@code null} result as "not present"
 */
public static Attribute get(String key) {
    final Attribute attribute = trie.get(key);
    return attribute;
}