//    @Bean
 /**
  * Sets the {@code dic.path} configuration entry to the two classpath dictionary files and then
  * asks {@code DictionaryFactory} to reload.
  *
  * @return always {@code true}
  */
 public boolean doconfig() {
   final String dictionaryPaths = "classpath:dic.txt,classpath:config/mydic.txt";
   WordConfTools.set("dic.path", dictionaryPaths);
   // The dictionary path was just changed, so the dictionaries have to be reloaded.
   DictionaryFactory.reload();
   return true;
 }
Beispiel #2
0
  /**
   * Builds the loader that maintains the {@code quantifiers} collection and passes it to
   * {@code AutoDetector.loadAndWatch} together with the configured quantifier resource path
   * ({@code quantifier.path}, defaulting to {@code classpath:quantifier.txt}).
   *
   * <p>Only lines that are exactly one character long are treated as quantifiers; every other line
   * is logged and skipped.
   */
  public static void reload() {
    ResourceLoader quantifierLoader =
        new ResourceLoader() {

          /** Empties the quantifier collection. */
          @Override
          public void clear() {
            quantifiers.clear();
          }

          /** Adds every valid line, logging duplicates and invalid entries. */
          @Override
          public void load(List<String> lines) {
            LOGGER.info("初始化数量词");
            for (String entry : lines) {
              if (entry.length() != 1) {
                LOGGER.info("忽略不合法数量词:" + entry);
                continue;
              }
              char quantifier = entry.charAt(0);
              if (!quantifiers.contains(quantifier)) {
                quantifiers.add(quantifier);
              } else {
                LOGGER.info("配置文件有重复项:" + entry);
              }
            }
            LOGGER.info("数量词初始化完毕,数量词个数:" + quantifiers.size());
          }

          /** Adds a single quantifier if the line is exactly one character long. */
          @Override
          public void add(String line) {
            if (line.length() != 1) {
              LOGGER.info("忽略不合法数量词:" + line);
              return;
            }
            char quantifier = line.charAt(0);
            quantifiers.add(quantifier);
          }

          /** Removes a single quantifier if the line is exactly one character long. */
          @Override
          public void remove(String line) {
            if (line.length() != 1) {
              LOGGER.info("忽略不合法数量词:" + line);
              return;
            }
            char quantifier = line.charAt(0);
            quantifiers.remove(quantifier);
          }
        };
    AutoDetector.loadAndWatch(
        quantifierLoader, WordConfTools.get("quantifier.path", "classpath:quantifier.txt"));
  }
Beispiel #3
0
  /**
   * Passes a trigram loader to {@code AutoDetector.loadAndWatch} together with the configured
   * trigram resource path ({@code trigram.path}, defaulting to {@code classpath:trigram.txt}).
   *
   * <p>The loader supports bulk {@code load} and {@code clear} only; the incremental {@code add}
   * and {@code remove} operations throw {@link UnsupportedOperationException}.
   */
  public static void reload() {
    AutoDetector.loadAndWatch(
        new ResourceLoader() {

          /** Empties the trigram trie. */
          @Override
          public void clear() {
            DOUBLE_ARRAY_GENERIC_TRIE.clear();
          }

          /**
           * Parses every line into a key/frequency pair and stores all pairs in the trie in one
           * call. Lines that cannot be parsed are logged and skipped.
           */
          @Override
          public void load(List<String> lines) {
            LOGGER.info("初始化trigram");
            Map<String, Integer> map = new HashMap<>();
            for (String line : lines) {
              try {
                addLine(line, map);
              } catch (Exception e) {
                // Best effort: one malformed line must not abort the whole load, but the cause
                // (missing column, non-numeric frequency, ...) is kept in the log.
                LOGGER.error("错误的trigram数据:" + line, e);
              }
            }
            int size = map.size();
            DOUBLE_ARRAY_GENERIC_TRIE.putAll(map);
            LOGGER.info("trigram初始化完毕,trigram数据条数:" + size);
          }

          /** Incremental additions are not supported by this loader. */
          @Override
          public void add(String line) {
            throw new UnsupportedOperationException("not yet support menthod!");
          }

          /**
           * Splits the line on whitespace, reads the first token as the key and the second as the
           * integer frequency, records the pair and raises {@code maxFrequency} if needed.
           */
          private void addLine(String line, Map<String, Integer> map) {
            String[] attr = line.split("\\s+");
            int frequency = Integer.parseInt(attr[1]);
            // NOTE(review): maxFrequency only ever grows here and nothing in this method resets
            // it, so it may stay stale after a reload with smaller frequencies - confirm.
            if (frequency > maxFrequency) {
              maxFrequency = frequency;
            }
            map.put(attr[0], frequency);
          }

          /** Incremental removals are not supported by this loader. */
          @Override
          public void remove(String line) {
            throw new UnsupportedOperationException("not yet support menthod!");
          }
        },
        WordConfTools.get("trigram.path", "classpath:trigram.txt"));
  }
Beispiel #4
0
/**
 * Trigram language model.
 *
 * <p>Trigram frequencies are kept in a static trie keyed by {@code first:second:third}. Scores are
 * the frequency divided by the largest frequency seen while loading.
 *
 * @author 杨尚川
 */
public class Trigram {
  private static final Logger LOGGER = LoggerFactory.getLogger(Trigram.class);
  private static final DoubleArrayGenericTrie DOUBLE_ARRAY_GENERIC_TRIE =
      new DoubleArrayGenericTrie(WordConfTools.getInt("trigram.double.array.trie.size", 9800000));
  /** Largest frequency among the loaded entries; 0 while nothing is loaded. */
  private static int maxFrequency = 0;

  static {
    reload();
  }

  /**
   * Passes a trigram loader to {@code AutoDetector.loadAndWatch} together with the configured
   * trigram resource path ({@code trigram.path}, defaulting to {@code classpath:trigram.txt}).
   *
   * <p>The loader supports bulk {@code load} and {@code clear} only; the incremental {@code add}
   * and {@code remove} operations throw {@link UnsupportedOperationException}.
   */
  public static void reload() {
    AutoDetector.loadAndWatch(
        new ResourceLoader() {

          /** Empties the trie and resets the maximum frequency derived from its contents. */
          @Override
          public void clear() {
            DOUBLE_ARRAY_GENERIC_TRIE.clear();
            // The maximum belongs to the data being discarded; without this reset a reload with
            // smaller frequencies would keep normalizing against the stale, larger value.
            maxFrequency = 0;
          }

          /**
           * Parses every line into a key/frequency pair and stores all pairs in the trie in one
           * call. Lines that cannot be parsed are logged and skipped.
           */
          @Override
          public void load(List<String> lines) {
            LOGGER.info("初始化trigram");
            Map<String, Integer> map = new HashMap<>();
            for (String line : lines) {
              try {
                addLine(line, map);
              } catch (Exception e) {
                // Best effort: one malformed line must not abort the whole load, but the cause
                // (missing column, non-numeric frequency, ...) is kept in the log.
                LOGGER.error("错误的trigram数据:" + line, e);
              }
            }
            int size = map.size();
            DOUBLE_ARRAY_GENERIC_TRIE.putAll(map);
            LOGGER.info("trigram初始化完毕,trigram数据条数:" + size);
          }

          /** Incremental additions are not supported by this loader. */
          @Override
          public void add(String line) {
            throw new UnsupportedOperationException("not yet support menthod!");
          }

          /**
           * Splits the line on whitespace, reads the first token as the key and the second as the
           * integer frequency, records the pair and raises {@code maxFrequency} if needed.
           */
          private void addLine(String line, Map<String, Integer> map) {
            String[] attr = line.split("\\s+");
            int frequency = Integer.parseInt(attr[1]);
            if (frequency > maxFrequency) {
              maxFrequency = frequency;
            }
            map.put(attr[0], frequency);
          }

          /** Incremental removals are not supported by this loader. */
          @Override
          public void remove(String line) {
            throw new UnsupportedOperationException("not yet support menthod!");
          }
        },
        WordConfTools.get("trigram.path", "classpath:trigram.txt"));
  }

  /**
   * Returns the largest frequency among the loaded trigram entries.
   *
   * @return the maximum frequency, or 0 if nothing is loaded
   */
  public static int getMaxFrequency() {
    return maxFrequency;
  }

  /**
   * Computes the trigram score of several candidate segmentations in one call.
   *
   * @param sentences the candidate segmentations
   * @return each distinct segmentation mapped to its score
   */
  @SafeVarargs // the varargs array is only read, never stored or written to
  public static Map<List<Word>, Float> trigram(List<Word>... sentences) {
    Map<List<Word>, Float> map = new HashMap<>();
    for (List<Word> sentence : sentences) {
      if (map.containsKey(sentence)) {
        // Identical segmentations are scored only once.
        continue;
      }
      float score = 0;
      // Sum the scores of every window of three consecutive words.
      if (sentence.size() > 2) {
        for (int i = 0; i < sentence.size() - 2; i++) {
          String first = sentence.get(i).getText();
          String second = sentence.get(i + 1).getText();
          String third = sentence.get(i + 2).getText();
          float trigramScore = getScore(first, second, third);
          if (trigramScore > 0) {
            score += trigramScore;
          }
        }
      }
      map.put(sentence, score);
    }

    return map;
  }

  /**
   * Computes the trigram score of a single segmentation.
   *
   * @param words the segmentation result
   * @return the sum of the scores of every window of three consecutive words, or 0 if there are
   *     fewer than three words
   */
  public static float trigram(List<Word> words) {
    if (words.size() > 2) {
      float score = 0;
      for (int i = 0; i < words.size() - 2; i++) {
        score +=
            Trigram.getScore(
                words.get(i).getText(), words.get(i + 1).getText(), words.get(i + 2).getText());
      }
      return score;
    }
    return 0;
  }

  /**
   * Returns the score for three words occurring directly after one another in the corpus. The
   * score is normalized: 0 when the trigram never occurs, 1 for the most frequent trigram.
   *
   * @param first the first word
   * @param second the second word
   * @param third the third word
   * @return the normalized score; 0 when no trigram data is loaded
   */
  public static float getScore(String first, String second, String third) {
    // Read once so the guard and the division use the same value.
    int max = maxFrequency;
    if (max <= 0) {
      // Nothing loaded: dividing by zero here would yield NaN (0 / 0.0f) and poison any sum.
      return 0;
    }
    int frequency = getFrequency(first, second, third);
    float score = frequency / (float) max;
    if (LOGGER.isDebugEnabled()) {
      if (score > 0) {
        LOGGER.debug("三元模型 " + first + ":" + second + ":" + third + " 获得分值:" + score);
      }
    }
    return score;
  }

  /**
   * Looks up the stored frequency of the trigram {@code first:second:third}.
   *
   * @param first the first word
   * @param second the second word
   * @param third the third word
   * @return the stored frequency, or 0 if the trie has no entry for the trigram
   */
  public static int getFrequency(String first, String second, String third) {
    Integer value = DOUBLE_ARRAY_GENERIC_TRIE.get(first + ":" + second + ":" + third);
    if (value == null) {
      return 0;
    }
    return value;
  }
}