예제 #1
0
  /*
   * Builds the user-defined dictionary forest when the class is loaded.
   *
   * Step 1: read the reader returned by MyStaticValue.getUserDefineReader() and insert every
   *         non-blank line whose first tab-separated field is not already in the system
   *         dictionary.
   * Step 2: resolve the user dictionary path (MyStaticValue.userDefinePath, falling back to the
   *         "userLibrary" resource-bundle entry) and, if it names an existing file, insert each
   *         non-blank line as a Value unless its keyword is already in the system dictionary.
   *
   * Both readers are closed in finally blocks so that a failure in the middle of a file does not
   * leak the underlying stream. A static initializer cannot propagate checked exceptions, so
   * failures are reported (with the stack trace) rather than rethrown.
   */
  static {
    try {
      long start = System.currentTimeMillis();
      // Assigned inline (not in a helper) so this stays valid even if FOREST is declared final.
      FOREST = new Forest();

      // Load the built-in supplementary dictionary first.
      BufferedReader br = MyStaticValue.getUserDefineReader();
      try {
        String line = null;
        while ((line = br.readLine()) != null) {
          if (StringUtil.isBlank(line) || InitDictionary.isInSystemDic(line.split("\t")[0])) {
            continue;
          }
          Library.insertWord(FOREST, line);
        }
      } finally {
        if (br != null) {
          br.close();
        }
      }

      // Resolve the user dictionary location: explicit path first, resource bundle second.
      String path = MyStaticValue.userDefinePath;
      if (path == null) {
        path = MyStaticValue.rb.getString("userLibrary");
      }

      // Load the user-defined dictionary, if the resolved path is a regular file.
      if (path != null && new File(path).isFile()) {
        BufferedReader userReader = IOUtil.getReader(path, "UTF-8");
        try {
          String line = null;
          while ((line = userReader.readLine()) != null) {
            if (StringUtil.isBlank(line)) {
              continue;
            }
            String[] strs = line.split("\t");
            // Only lines with exactly three fields carry their own parameters;
            // every other shape falls back to the default PARAMER.
            Value value;
            if (strs.length != 3) {
              value = new Value(strs[0], PARAMER);
            } else {
              value = new Value(strs[0], strs[1], strs[2]);
            }

            if (!InitDictionary.isInSystemDic(value.getKeyword())) {
              Library.insertWord(FOREST, value);
            }
          }
        } finally {
          if (userReader != null) {
            userReader.close();
          }
        }
      } else {
        System.err.println("用户自定义词典:" + path + ", 没有这个文件!");
      }
      System.out.println("加载用户自定义词典完成用时:" + (System.currentTimeMillis() - start));
    } catch (Exception e) {
      // Report the cause as well; previously the exception was dropped silently.
      System.out.println("加载用户自定义词典加载失败:");
      e.printStackTrace();
    }
  }
예제 #2
0
 /**
  * Looks up the id of a word in the dictionary arrays.
  *
  * <p>The walk starts at the code of the first character and, for every following character,
  * moves to {@code base[current] + nextChar}. The walk is abandoned when the new position lies
  * beyond the end of {@code check}, or when {@code check} at that position is neither
  * {@code -1} nor the position the walk came from.
  *
  * @param str the word to look up
  * @return the position reached after the last character, or {@code 0} when {@code str} is
  *     blank or the walk is abandoned
  */
 public static int getWordId(String str) {
   if (StringUtil.isBlank(str)) {
     return 0;
   }
   final int length = str.length();
   int current = str.charAt(0);
   int index = 1;
   while (index < length) {
     final int parent = current;
     current = base[parent] + str.charAt(index);
     index++;
     // Position falls outside the check array: no such transition.
     if (current >= check.length) {
       return 0;
     }
     // The slot is claimed by a different predecessor.
     if (check[current] != -1 && check[current] != parent) {
       return 0;
     }
   }
   return current;
 }
예제 #3
0
 /**
  * Tells whether a word is present in the system dictionary.
  *
  * <p>The word is traced through the {@code base}/{@code check} arrays one character at a time;
  * if the trace completes, the word counts as present when the {@code status} value at the
  * final position is greater than {@code 1}.
  *
  * @param str the word to test
  * @return {@code true} when {@code str} is blank or the final position has a status above
  *     {@code 1}; {@code false} when the trace leaves the {@code check} array, hits a slot
  *     owned by another predecessor, or ends on a status of {@code 1} or less
  */
 public static boolean isInSystemDic(String str) {
   if (StringUtil.isBlank(str)) {
     return true;
   }
   int node = str.charAt(0);
   for (int pos = 1, len = str.length(); pos < len; pos++) {
     final int previous = node;
     node = base[previous] + str.charAt(pos);
     if (node >= check.length) {
       return false;
     }
     // A slot is usable when it is unowned (-1) or owned by the position we came from.
     final boolean usable = check[node] == -1 || check[node] == previous;
     if (!usable) {
       return false;
     }
   }
   return status[node] > 1;
 }
예제 #4
0
  /**
   * Fills the dictionary arrays ({@code base}, {@code check}, {@code status}, {@code words},
   * {@code termNatures}, {@code IN_SYSTEM}) from the given reader.
   *
   * <p>Each line is split on tabs and read as: field 0 = array index, field 1 = word,
   * field 2 = base value, field 3 = check value, field 4 = status, field 5 = nature string
   * (the literal {@code "null"} means the entry carries no word). After the main pass,
   * single-character keys from the person and company attribute maps are added for slots whose
   * status is not above {@code 1}, and finally {@code jianFan.dic} is read to fill
   * {@code IN_SYSTEM} entries that are still {@code 0}.
   *
   * <p>Both the supplied reader and the {@code jianFan.dic} reader are closed before this method
   * returns, whether it completes normally or throws.
   *
   * @param reader source of the tab-separated dictionary lines; closed by this method
   * @throws Exception if reading fails or a line cannot be parsed (for example a non-numeric
   *     index, or fewer fields than expected)
   */
  public static void initArrays(BufferedReader reader) throws Exception {
    try {
      /** Required for person-name recognition. */
      HashMap<String, PersonNatureAttr> personMap = new PersonAttrLibrary().getPersonMap();
      PersonNatureAttr personAttr = null;

      /** Required for organization-name recognition. */
      HashMap<String, CompanyNatureAttr> companyMap = new CompanyAttrLibrary().getCompanyMap();
      CompanyNatureAttr companyAttr = null;

      HashMap<String, NewWordNatureAttr> newWordMap = new NewWordAttrLibrary().getNewWordMap();
      NewWordNatureAttr newWordAttr = null;

      /** Load the dictionary proper. */
      String temp = null;
      String[] strs = null;
      int num = 0;
      while ((temp = reader.readLine()) != null) {
        strs = temp.split("\t");
        num = Integer.parseInt(strs[0]);
        base[num] = Integer.parseInt(strs[2]);
        check[num] = Integer.parseInt(strs[3]);
        status[num] = Byte.parseByte(strs[4]);
        if (!"null".equals(strs[5])) {
          words[num] = strs[1];
          // For entries with status below 4, register every character of the word in
          // IN_SYSTEM, mapped to itself.
          if (status[num] < 4) {
            for (int i = 0; i < strs[1].length(); i++) {
              IN_SYSTEM[strs[1].charAt(i)] = strs[1].charAt(i);
            }
          }
          // Load the part-of-speech natures.
          TermNatures tn = new TermNatures(TermNature.setNatureStrToArray(strs[5]), num);
          // Attach person-name attributes when the word has them.
          if ((personAttr = personMap.get(strs[1])) != null) {
            tn.setPersonNatureAttr(personAttr);
          }
          // Attach company (organization) attributes when the word has them.
          if ((companyAttr = companyMap.get(strs[1])) != null) {
            tn.setCompanyAttr(companyAttr);
          }

          // Attach new-word attributes when the word has them.
          if ((newWordAttr = newWordMap.get(strs[1])) != null) {
            // Update the word-formation probability with the total frequency.
            newWordAttr.updateAll(tn.allFreq);
            tn.setNewWordAttr(newWordAttr);
          }

          termNatures[num] = tn;
        }
      }

      // Supplement person-name natures for single-character keys.
      Set<Entry<String, PersonNatureAttr>> entrySet = personMap.entrySet();
      char c = 0;
      TermNatures tn = null;
      for (Entry<String, PersonNatureAttr> entry : entrySet) {
        if (entry.getKey().length() == 1) {
          c = entry.getKey().charAt(0);
          if (status[c] > 1) {
            continue;
          }
          // Slot is still empty: initialise it as a stand-alone single-character entry.
          if (status[c] == 0) {
            base[c] = c;
            check[c] = -1;
            status[c] = 3;
            words[c] = entry.getKey();
          }

          if ((tn = termNatures[c]) == null) {
            tn = new TermNatures(TermNature.NR);
          }
          tn.setPersonNatureAttr(entry.getValue());
          termNatures[c] = tn;
        }
      }

      // Supplement company natures for single-character keys.
      Set<Entry<String, CompanyNatureAttr>> cnSet = companyMap.entrySet();
      for (Entry<String, CompanyNatureAttr> entry : cnSet) {
        if (entry.getKey().length() == 1) {
          c = entry.getKey().charAt(0);
          if (status[c] > 1) {
            continue;
          }
          // Slot is still empty: initialise it as a stand-alone single-character entry.
          if (status[c] == 0) {
            base[c] = c;
            check[c] = -1;
            status[c] = 3;
            words[c] = entry.getKey();
          }

          if ((tn = termNatures[c]) == null) {
            tn = new TermNatures(TermNature.NULL);
          }
          tn.setCompanyAttr(entry.getValue());
          termNatures[c] = tn;
        }
      }

      // Simplified/traditional character conversion table.
      BufferedReader reader2 = DicReader.getReader("jianFan.dic");
      try {
        while ((temp = reader2.readLine()) != null) {
          temp = temp.trim();
          if (StringUtil.isBlank(temp)) {
            continue;
          }
          // Only fill characters not already registered by the main dictionary pass.
          // NOTE(review): a trimmed line shorter than three characters makes charAt(2) throw;
          // presumably every line is "<char><separator><char>" - confirm against jianFan.dic.
          if (IN_SYSTEM[temp.charAt(0)] == 0) {
            IN_SYSTEM[temp.charAt(0)] = temp.charAt(2);
          }
        }
      } finally {
        if (reader2 != null) {
          reader2.close();
        }
      }
    } finally {
      // Close the caller-supplied reader on the failure path as well as on success.
      if (reader != null) {
        reader.close();
      }
    }
  }