// @Bean public boolean doconfig() { WordConfTools.set("dic.path", "classpath:dic.txt,classpath:config/mydic.txt"); DictionaryFactory.reload(); // 更改词典路径之后,重新加载词典 // String path = WordConfTools.get("dic.path"); // log.info("dic.path {}" ,path); return true; }
/**
 * Registers the quantifier resource with {@code AutoDetector.loadAndWatch}.
 *
 * <p>The resource location is taken from the {@code quantifier.path} setting, falling back to
 * {@code classpath:quantifier.txt}. A line is accepted as a quantifier only when it is exactly
 * one character long; any other line is logged and skipped.
 */
public static void reload() {
    ResourceLoader quantifierLoader =
        new ResourceLoader() {

            /** Drops every quantifier currently held. */
            @Override
            public void clear() {
                quantifiers.clear();
            }

            /** Adds each valid single-character line, logging duplicates and invalid lines. */
            @Override
            public void load(List<String> lines) {
                LOGGER.info("初始化数量词");
                for (String entry : lines) {
                    if (entry.length() != 1) {
                        LOGGER.info("忽略不合法数量词:" + entry);
                        continue;
                    }
                    char symbol = entry.charAt(0);
                    if (!quantifiers.contains(symbol)) {
                        quantifiers.add(symbol);
                    } else {
                        LOGGER.info("配置文件有重复项:" + entry);
                    }
                }
                LOGGER.info("数量词初始化完毕,数量词个数:" + quantifiers.size());
            }

            /** Adds one quantifier; lines that are not exactly one character are ignored. */
            @Override
            public void add(String line) {
                if (line.length() != 1) {
                    LOGGER.info("忽略不合法数量词:" + line);
                    return;
                }
                quantifiers.add(line.charAt(0));
            }

            /** Removes one quantifier; lines that are not exactly one character are ignored. */
            @Override
            public void remove(String line) {
                if (line.length() != 1) {
                    LOGGER.info("忽略不合法数量词:" + line);
                    return;
                }
                quantifiers.remove(line.charAt(0));
            }
        };

    AutoDetector.loadAndWatch(
        quantifierLoader, WordConfTools.get("quantifier.path", "classpath:quantifier.txt"));
}
/**
 * Registers the trigram resource with {@code AutoDetector.loadAndWatch}.
 *
 * <p>The resource location is taken from the {@code trigram.path} setting, falling back to
 * {@code classpath:trigram.txt}. Each line is split on whitespace: the first field is the key
 * stored in the trie and the second field is its integer frequency. Lines that cannot be parsed
 * are logged (with the cause) and skipped. Incremental {@code add}/{@code remove} are not
 * supported by this loader.
 */
public static void reload() {
    AutoDetector.loadAndWatch(
        new ResourceLoader() {

            /** Empties the trie. */
            @Override
            public void clear() {
                DOUBLE_ARRAY_GENERIC_TRIE.clear();
                // NOTE(review): maxFrequency is not reset here, so after a reload it can still
                // hold the maximum of the previous data set. Resetting it is only safe if the
                // code that divides by maxFrequency tolerates 0 - confirm before changing.
            }

            /** Parses all lines into a map and bulk-inserts the result into the trie. */
            @Override
            public void load(List<String> lines) {
                LOGGER.info("初始化trigram");
                Map<String, Integer> map = new HashMap<>();
                for (String line : lines) {
                    try {
                        addLine(line, map);
                    } catch (Exception e) {
                        // Best effort: one malformed line must not abort the whole load.
                        // The exception is passed along so the log shows why it was rejected.
                        LOGGER.error("错误的trigram数据:" + line, e);
                    }
                }
                int size = map.size();
                DOUBLE_ARRAY_GENERIC_TRIE.putAll(map);
                LOGGER.info("trigram初始化完毕,trigram数据条数:" + size);
            }

            /**
             * Incremental addition is not implemented.
             *
             * @throws UnsupportedOperationException always
             */
            @Override
            public void add(String line) {
                throw new UnsupportedOperationException("not yet support menthod!");
            }

            /**
             * Parses one line and records it in {@code map}, raising {@code maxFrequency} when
             * the line's frequency exceeds it.
             *
             * @throws IllegalArgumentException if the frequency field is missing or not an integer
             */
            private void addLine(String line, Map<String, Integer> map) {
                String[] attr = line.split("\\s+");
                if (attr.length < 2) {
                    // Explicit check instead of relying on ArrayIndexOutOfBoundsException.
                    throw new IllegalArgumentException("missing frequency field: " + line);
                }
                int frequency = Integer.parseInt(attr[1]);
                if (frequency > maxFrequency) {
                    maxFrequency = frequency;
                }
                map.put(attr[0], frequency);
            }

            /**
             * Incremental removal is not implemented.
             *
             * @throws UnsupportedOperationException always
             */
            @Override
            public void remove(String line) {
                throw new UnsupportedOperationException("not yet support menthod!");
            }
        },
        WordConfTools.get("trigram.path", "classpath:trigram.txt"));
}
/** * 三元语法模型 * * @author 杨尚川 */ public class Trigram { private static final Logger LOGGER = LoggerFactory.getLogger(Trigram.class); private static final DoubleArrayGenericTrie DOUBLE_ARRAY_GENERIC_TRIE = new DoubleArrayGenericTrie(WordConfTools.getInt("trigram.double.array.trie.size", 9800000)); private static int maxFrequency = 0; static { reload(); } public static void reload() { AutoDetector.loadAndWatch( new ResourceLoader() { @Override public void clear() { DOUBLE_ARRAY_GENERIC_TRIE.clear(); } @Override public void load(List<String> lines) { LOGGER.info("初始化trigram"); Map<String, Integer> map = new HashMap<>(); for (String line : lines) { try { addLine(line, map); } catch (Exception e) { LOGGER.error("错误的trigram数据:" + line); } } int size = map.size(); DOUBLE_ARRAY_GENERIC_TRIE.putAll(map); LOGGER.info("trigram初始化完毕,trigram数据条数:" + size); } @Override public void add(String line) { throw new RuntimeException("not yet support menthod!"); } private void addLine(String line, Map<String, Integer> map) { String[] attr = line.split("\\s+"); int frequency = Integer.parseInt(attr[1]); if (frequency > maxFrequency) { maxFrequency = frequency; } map.put(attr[0], frequency); } @Override public void remove(String line) { throw new RuntimeException("not yet support menthod!"); } }, WordConfTools.get("trigram.path", "classpath:trigram.txt")); } public static int getMaxFrequency() { return maxFrequency; } /** * 一次性计算多种分词结果的三元模型分值 * * @param sentences 多种分词结果 * @return 分词结果及其对应的分值 */ public static Map<List<Word>, Float> trigram(List<Word>... 
sentences) { Map<List<Word>, Float> map = new HashMap<>(); // 计算多种分词结果的分值 for (List<Word> sentence : sentences) { if (map.get(sentence) != null) { // 相同的分词结果只计算一次分值 continue; } float score = 0; // 计算其中一种分词结果的分值 if (sentence.size() > 2) { for (int i = 0; i < sentence.size() - 2; i++) { String first = sentence.get(i).getText(); String second = sentence.get(i + 1).getText(); String third = sentence.get(i + 2).getText(); float trigramScore = getScore(first, second, third); if (trigramScore > 0) { score += trigramScore; } } } map.put(sentence, score); } return map; } /** * 计算分词结果的三元模型分值 * * @param words 分词结果 * @return 三元模型分值 */ public static float trigram(List<Word> words) { if (words.size() > 2) { float score = 0; for (int i = 0; i < words.size() - 2; i++) { score += Trigram.getScore( words.get(i).getText(), words.get(i + 1).getText(), words.get(i + 2).getText()); } return score; } return 0; } /** * 获取三个词前后紧挨着同时出现在语料库中的分值 分值被归一化了: 完全没有出现分值为0 出现频率最高的分值为1 * * @param first 第一个词 * @param second 第二个词 * @param third 第三个词 * @return 同时出现的分值 */ public static float getScore(String first, String second, String third) { int frequency = getFrequency(first, second, third); float score = frequency / (float) maxFrequency; if (LOGGER.isDebugEnabled()) { if (score > 0) { LOGGER.debug("三元模型 " + first + ":" + second + ":" + third + " 获得分值:" + score); } } return score; } public static int getFrequency(String first, String second, String third) { Integer value = DOUBLE_ARRAY_GENERIC_TRIE.get(first + ":" + second + ":" + third); if (value == null) { return 0; } return value; } }