Beispiel #1
0
  /**
   * Loads the word-to-word (bigram) association table from {@code bigramdict.dic}.
   *
   * <p>Each non-blank line is expected to have the form {@code from@to<TAB>freq}. Both words are
   * looked up with {@code DATDictionary.getItem}. A word that resolves to {@code AnsjItem.NULL}
   * but contains {@code '#'} is replaced by {@code AnsjItem.BEGIN} (left side) or
   * {@code AnsjItem.END} (right side). Pairs in which either side is still {@code AnsjItem.NULL}
   * are ignored. The frequency is stored in the left item's {@code bigramEntryMap}, keyed by the
   * index of the right item; the map is created on first use.
   *
   * <p>A line that lacks the tab or the {@code '@'} separator, or whose frequency is not an
   * integer, is reported on {@code System.err} and skipped, so one bad entry does not discard
   * the rest of the table. I/O failures are printed and end the load. The reader is always closed.
   */
  public static void initBigramTables() {
    BufferedReader reader = null;
    try {
      reader = IOUtil.getReader(DicReader.getInputStream("bigramdict.dic"), "UTF-8");
      String line;
      while ((line = reader.readLine()) != null) {
        if (StringUtil.isBlank(line)) {
          continue;
        }

        // Column 0 is "from@to", column 1 is the frequency.
        String[] columns = line.split("\t");
        if (columns.length < 2) {
          System.err.println("bigramdict.dic: skipping line without frequency column: " + line);
          continue;
        }

        String[] pair = columns[0].split("@");
        if (pair.length < 2) {
          System.err.println("bigramdict.dic: skipping line without from@to pair: " + line);
          continue;
        }

        int freq;
        try {
          freq = Integer.parseInt(columns[1]);
        } catch (NumberFormatException e) {
          // Handle per line so a single bad number does not abort the whole load.
          System.err.println("bigramdict.dic: skipping line with invalid frequency: " + line);
          continue;
        }

        AnsjItem fromItem = DATDictionary.getItem(pair[0]);
        AnsjItem toItem = DATDictionary.getItem(pair[1]);

        // Unknown words containing '#' stand for the sentence boundary markers.
        if (fromItem == AnsjItem.NULL && pair[0].contains("#")) {
          fromItem = AnsjItem.BEGIN;
        }
        if (toItem == AnsjItem.NULL && pair[1].contains("#")) {
          toItem = AnsjItem.END;
        }

        if (fromItem == AnsjItem.NULL || toItem == AnsjItem.NULL) {
          continue;
        }

        if (fromItem.bigramEntryMap == null) {
          fromItem.bigramEntryMap = new HashMap<Integer, Integer>();
        }
        fromItem.bigramEntryMap.put(toItem.getIndex(), freq);
      }
    } catch (UnsupportedEncodingException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      IOUtil.close(reader);
    }
  }
Beispiel #2
0
  /**
   * Deserializes the name part-of-speech frequency table.
   *
   * <p>Reads a single object from the {@code person/asian_name_freq.data} resource and returns it
   * cast to a map. If reading fails with an {@code IOException} or a
   * {@code ClassNotFoundException}, the stack trace is printed and the initially created empty
   * map is returned instead. Both streams are closed before returning.
   *
   * @return the deserialized object cast to a map, or an empty map when loading failed
   */
  @SuppressWarnings("unchecked")
  public static Map<String, int[][]> getPersonFreqMap() {
    // Fallback result used when the resource cannot be read.
    Map<String, int[][]> freqMap = new HashMap<String, int[][]>(0);
    InputStream rawIn = null;
    ObjectInputStream objectIn = null;
    try {
      rawIn = DicReader.getInputStream("person/asian_name_freq.data");
      objectIn = new ObjectInputStream(rawIn);
      Object loaded = objectIn.readObject();
      freqMap = (Map<String, int[][]>) loaded;
    } catch (ClassNotFoundException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      try {
        if (objectIn != null) {
          objectIn.close();
        }
        if (rawIn != null) {
          rawIn.close();
        }
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
    return freqMap;
  }
Beispiel #3
0
  /**
   * Fills the static dictionary arrays ({@code base}, {@code check}, {@code status},
   * {@code words}, {@code termNatures}, {@code IN_SYSTEM}) from the core dictionary.
   *
   * <p>Each line read from {@code reader} is split on tabs into: array index, word, base value,
   * check value, status and a nature string (or the literal {@code "null"}). After the main load,
   * single-character keys of the person and company attribute maps are back-filled into the
   * arrays, and finally {@code jianFan.dic} is read to fill still-empty {@code IN_SYSTEM} slots.
   *
   * <p>Both {@code reader} and the internally opened {@code jianFan.dic} reader are closed when
   * this method finishes, whether it completes normally or throws.
   *
   * @param reader source of the core dictionary lines; closed by this method
   * @throws Exception if reading or parsing any of the dictionaries fails
   */
  public static void initArrays(BufferedReader reader) throws Exception {
    try {
      // Attribute table required for person-name recognition.
      HashMap<String, PersonNatureAttr> personMap = new PersonAttrLibrary().getPersonMap();
      PersonNatureAttr personAttr = null;

      // Attribute table required for organization-name recognition.
      HashMap<String, CompanyNatureAttr> companyMap = new CompanyAttrLibrary().getCompanyMap();
      CompanyNatureAttr companyAttr = null;

      // Attribute table for new-word discovery.
      HashMap<String, NewWordNatureAttr> newWordMap = new NewWordAttrLibrary().getNewWordMap();
      NewWordNatureAttr newWordAttr = null;

      // Load the core dictionary, one tab-separated record per line.
      String temp = null;
      String[] strs = null;
      int num = 0;
      while ((temp = reader.readLine()) != null) {
        strs = temp.split("\t");
        num = Integer.parseInt(strs[0]);
        base[num] = Integer.parseInt(strs[2]);
        check[num] = Integer.parseInt(strs[3]);
        status[num] = Byte.parseByte(strs[4]);
        if (!"null".equals(strs[5])) {
          String word = strs[1];
          words[num] = word;
          if (status[num] < 4) {
            // Register every character of the word as mapping to itself.
            for (int i = 0; i < word.length(); i++) {
              IN_SYSTEM[word.charAt(i)] = word.charAt(i);
            }
          }
          // Load the part-of-speech (nature) information.
          TermNatures tn = new TermNatures(TermNature.setNatureStrToArray(strs[5]), num);
          // Attach the person-name attribute if the word has one.
          if ((personAttr = personMap.get(word)) != null) {
            tn.setPersonNatureAttr(personAttr);
          }
          // Attach the organization-name attribute if the word has one.
          if ((companyAttr = companyMap.get(word)) != null) {
            tn.setCompanyAttr(companyAttr);
          }
          // Attach the new-word attribute if the word has one.
          if ((newWordAttr = newWordMap.get(word)) != null) {
            // Update the word-formation probability with the total frequency.
            newWordAttr.updateAll(tn.allFreq);
            tn.setNewWordAttr(newWordAttr);
          }

          termNatures[num] = tn;
        }
      }

      // Back-fill person-name attributes for single-character keys.
      Set<Entry<String, PersonNatureAttr>> entrySet = personMap.entrySet();
      char c = 0;
      TermNatures tn = null;
      for (Entry<String, PersonNatureAttr> entry : entrySet) {
        if (entry.getKey().length() == 1) {
          c = entry.getKey().charAt(0);
          if (status[c] > 1) {
            continue;
          }
          if (status[c] == 0) {
            // Slot is unused: create a standalone single-character entry.
            base[c] = c;
            check[c] = -1;
            status[c] = 3;
            words[c] = entry.getKey();
          }

          if ((tn = termNatures[c]) == null) {
            tn = new TermNatures(TermNature.NR);
          }
          tn.setPersonNatureAttr(entry.getValue());
          termNatures[c] = tn;
        }
      }

      // Back-fill organization-name attributes for single-character keys.
      Set<Entry<String, CompanyNatureAttr>> cnSet = companyMap.entrySet();
      for (Entry<String, CompanyNatureAttr> entry : cnSet) {
        if (entry.getKey().length() == 1) {
          c = entry.getKey().charAt(0);
          if (status[c] > 1) {
            continue;
          }
          if (status[c] == 0) {
            // Slot is unused: create a standalone single-character entry.
            base[c] = c;
            check[c] = -1;
            status[c] = 3;
            words[c] = entry.getKey();
          }

          if ((tn = termNatures[c]) == null) {
            tn = new TermNatures(TermNature.NULL);
          }
          tn.setCompanyAttr(entry.getValue());
          termNatures[c] = tn;
        }
      }

      // Simplified/traditional character conversion table.
      BufferedReader reader2 = DicReader.getReader("jianFan.dic");
      try {
        while ((temp = reader2.readLine()) != null) {
          temp = temp.trim();
          if (StringUtil.isBlank(temp)) {
            continue;
          }
          // Only fill slots not already claimed by the core dictionary; the target
          // character sits at index 2 of the line.
          if (IN_SYSTEM[temp.charAt(0)] == 0) {
            IN_SYSTEM[temp.charAt(0)] = temp.charAt(2);
          }
        }
      } finally {
        reader2.close();
      }
    } finally {
      // Close the caller-supplied reader even when loading fails part-way.
      reader.close();
    }
  }
Beispiel #4
0
 /**
  * Opens the part-of-speech association table.
  *
  * @return the reader obtained from {@code DicReader.getReader} for {@code nature/nature.table}
  */
 public static BufferedReader getNatureTableReader() {
   final String resource = "nature/nature.table";
   return DicReader.getReader(resource);
 }
Beispiel #5
0
 /**
  * Opens the frequency dictionary of single characters used in person names.
  *
  * @return the reader obtained from {@code DicReader.getReader} for {@code person/name_freq.dic}
  */
 public static BufferedReader getPersonFreqReader() {
   final String resource = "person/name_freq.dic";
   return DicReader.getReader(resource);
 }
Beispiel #6
0
 /**
  * Opens the English dictionary.
  *
  * @return the reader obtained from {@code DicReader.getReader} for {@code englishLibrary.dic}
  */
 public static BufferedReader getEnglishReader() {
   final String resource = "englishLibrary.dic";
   return DicReader.getReader(resource);
 }
Beispiel #7
0
 /**
  * Opens the part-of-speech table.
  *
  * @return the reader obtained from {@code DicReader.getReader} for {@code nature/nature.map}
  */
 public static BufferedReader getNatureMapReader() {
   final String resource = "nature/nature.map";
   return DicReader.getReader(resource);
 }
Beispiel #8
0
 /**
  * Opens the number dictionary.
  *
  * @return the reader obtained from {@code DicReader.getReader} for {@code numberLibrary.dic}
  */
 public static BufferedReader getNumberReader() {
   final String resource = "numberLibrary.dic";
   return DicReader.getReader(resource);
 }
Beispiel #9
0
 /**
  * Opens the core dictionary.
  *
  * @return the reader obtained from {@code DicReader.getReader} for {@code arrays.dic}
  */
 public static BufferedReader getArraysReader() {
   final String resource = "arrays.dic";
   return DicReader.getReader(resource);
 }
Beispiel #10
0
 /**
  * Opens the new-word frequency dictionary.
  *
  * @return the reader obtained from {@code DicReader.getReader} for
  *     {@code newWord/new_word_freq.dic}
  */
 public static BufferedReader getNewWordReader() {
   final String resource = "newWord/new_word_freq.dic";
   return DicReader.getReader(resource);
 }
Beispiel #11
0
 /**
  * Opens the organization-name dictionary.
  *
  * @return the reader obtained from {@code DicReader.getReader} for {@code company/company.data}
  */
 public static BufferedReader getCompanReader() {
   final String resource = "company/company.data";
   return DicReader.getReader(resource);
 }
Beispiel #12
0
 /**
  * Opens the person-name dictionary.
  *
  * @return the reader obtained from {@code DicReader.getReader} for {@code person/person.dic}
  */
 public static BufferedReader getPersonReader() {
   final String resource = "person/person.dic";
   return DicReader.getReader(resource);
 }