/** * 匹配词段 * * @param charArray * @param begin * @param length * @param searchHit * @return Hit */ public Hit match(char[] charArray, int begin, int length, Hit searchHit) { if (searchHit == null) { // 如果hit为空,新建 searchHit = new Hit(); // 设置hit的其实文本位置 searchHit.setBegin(begin); } else { // 否则要将HIT状态重置 searchHit.setUnmatch(); } // 设置hit的当前处理位置 searchHit.setEnd(begin); Character keyChar = new Character(charArray[begin]); DictSegment ds = null; // 引用实例变量为本地变量,避免查询时遇到更新的同步问题 DictSegment[] segmentArray = this.childrenArray; Map<Character, DictSegment> segmentMap = this.childrenMap; // STEP1 在节点中查找keyChar对应的DictSegment if (segmentArray != null) { // 在数组中查找 for (DictSegment seg : segmentArray) { if (seg != null && seg.nodeChar.equals(keyChar)) { // 找到匹配的段 ds = seg; } } } else if (segmentMap != null) { // 在map中查找 ds = (DictSegment) segmentMap.get(keyChar); } // STEP2 找到DictSegment,判断词的匹配状态,是否继续递归,还是返回结果 if (ds != null) { if (length > 1) { // 词未匹配完,继续往下搜索 return ds.match(charArray, begin + 1, length - 1, searchHit); } else if (length == 1) { // 搜索最后一个char if (ds.nodeState == 1) { // 添加HIT状态为完全匹配 searchHit.setMatch(); } if (ds.hasNextNode()) { // 添加HIT状态为前缀匹配 searchHit.setPrefix(); // 记录当前位置的DictSegment searchHit.setMatchedDictSegment(ds); } return searchHit; } } // STEP3 没有找到DictSegment, 将HIT设置为不匹配 return searchHit; }
/**
 * Loads the preposition dictionary into {@code _PrepDict}.
 *
 * <p>The file is {@code Dictionary.PATH_DIC_PREP} under the configured dictionary root and
 * is read as UTF-8, one word per line. Lines that are empty after trimming are skipped.
 * Read errors are logged and end the load early; they are not rethrown.
 *
 * @throws RuntimeException if the dictionary file cannot be opened; the underlying
 *     {@link FileNotFoundException} is attached as the cause
 */
private void loadPrepDict() {
    _PrepDict = new DictSegment((char) 0);
    File file = new File(configuration.getDictRoot(), Dictionary.PATH_DIC_PREP);

    InputStream is;
    try {
        is = new FileInputStream(file);
    } catch (FileNotFoundException e) {
        logger.error("ik-analyzer", e);
        // Keep the cause so the failing path is visible in the stack trace.
        throw new RuntimeException("Preposition Dictionary not found!!!", e);
    }

    try {
        BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
        String theWord;
        while ((theWord = br.readLine()) != null) {
            String word = theWord.trim();
            if (!"".equals(word)) {
                _PrepDict.fillSegment(word.toCharArray());
            }
        }
    } catch (IOException e) {
        logger.error("ik-analyzer", e);
    } finally {
        try {
            is.close();
        } catch (IOException e) {
            logger.error("ik-analyzer", e);
        }
    }
}
/** 加载量词词典 */ private void loadQuantifierDict() { // 建立一个量词典实例 _QuantifierDict = new DictSegment((char) 0); // 读取量词词典文件 File file = new File(configuration.getDictRoot(), Dictionary.PATH_DIC_QUANTIFIER); InputStream is = null; try { is = new FileInputStream(file); } catch (FileNotFoundException e) { logger.error("ik-analyzer", e); } try { BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512); String theWord = null; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { _QuantifierDict.fillSegment(theWord.trim().toCharArray()); } } while (theWord != null); } catch (IOException ioe) { logger.error("Quantifier Dictionary loading exception."); } finally { try { if (is != null) { is.close(); is = null; } } catch (IOException e) { logger.error("ik-analyzer", e); } } }
/** * 加载填充词典片段 * * @param charArray * @param begin * @param length */ public synchronized void fillSegment(char[] charArray, int begin, int length) { // 获取字典表中的汉字对象 Character beginChar = new Character(charArray[begin]); Character keyChar = charMap.get(beginChar); // 字典中没有该字,则将其添加入字典 if (keyChar == null) { charMap.put(beginChar, beginChar); keyChar = beginChar; } // 搜索当前节点的存储,查询对应keyChar的keyChar,如果没有则创建 DictSegment ds = lookforSegment(keyChar); // 处理keyChar对应的segment if (length > 1) { // 词元还没有完全加入词典树 ds.fillSegment(charArray, begin + 1, length - 1); } else if (length == 1) { // 已经是词元的最后一个char,设置当前节点状态为1,表明一个完整的词 ds.nodeState = 1; } }
/** 加载用户配置的扩展词典到主词库表 */ private void loadExtDict() { // 加载扩展词典配置 List<String> extDictFiles = configuration.getExtDictionarys(); if (extDictFiles != null) { InputStream is = null; for (String extDictName : extDictFiles) { // 读取扩展词典文件 logger.info("[Dict Loading]" + extDictName); File file = new File(configuration.getDictRoot(), extDictName); try { is = new FileInputStream(file); } catch (FileNotFoundException e) { logger.error("ik-analyzer", e); } // 如果找不到扩展的字典,则忽略 if (is == null) { continue; } try { BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512); String theWord = null; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { // 加载扩展词典数据到主内存词典中 _MainDict.fillSegment(theWord.trim().toCharArray()); } } while (theWord != null); } catch (IOException e) { logger.error("ik-analyzer", e); } finally { try { if (is != null) { is.close(); is = null; } } catch (IOException e) { logger.error("ik-analyzer", e); } } } } }
/** 加载主词典及扩展词典 */ private void loadMainDict() { // 建立一个主词典实例 _MainDict = new DictSegment((char) 0); // 读取主词典文件 File file = new File(configuration.getDictRoot(), Dictionary.PATH_DIC_MAIN); InputStream is = null; try { is = new FileInputStream(file); } catch (FileNotFoundException e) { e.printStackTrace(); } try { BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512); String theWord = null; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { _MainDict.fillSegment(theWord.trim().toCharArray()); } } while (theWord != null); } catch (IOException e) { logger.error("ik-analyzer", e); } finally { try { if (is != null) { is.close(); is = null; } } catch (IOException e) { logger.error("ik-analyzer", e); } } // 加载扩展词典 this.loadExtDict(); }
/** 加载用户扩展的停止词词典 */ private void loadStopWordDict() { // 建立主词典实例 _StopWords = new DictSegment((char) 0); // 读取主词典文件 File file = new File(configuration.getDictRoot(), Dictionary.PATH_DIC_STOP); InputStream is = null; try { is = new FileInputStream(file); } catch (FileNotFoundException e) { e.printStackTrace(); } try { BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512); String theWord = null; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { _StopWords.fillSegment(theWord.trim().toCharArray()); } } while (theWord != null); } catch (IOException e) { logger.error("ik-analyzer", e); } finally { try { if (is != null) { is.close(); is = null; } } catch (IOException e) { logger.error("ik-analyzer", e); } } // 加载扩展停止词典 List<String> extStopWordDictFiles = configuration.getExtStopWordDictionarys(); if (extStopWordDictFiles != null) { is = null; for (String extStopWordDictName : extStopWordDictFiles) { logger.info("[Dict Loading]" + extStopWordDictName); // 读取扩展词典文件 file = new File(configuration.getDictRoot(), extStopWordDictName); try { is = new FileInputStream(file); } catch (FileNotFoundException e) { logger.error("ik-analyzer", e); } // 如果找不到扩展的字典,则忽略 if (is == null) { continue; } try { BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512); String theWord = null; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { // 加载扩展停止词典数据到内存中 _StopWords.fillSegment(theWord.trim().toCharArray()); } } while (theWord != null); } catch (IOException e) { logger.error("ik-analyzer", e); } finally { try { if (is != null) { is.close(); is = null; } } catch (IOException e) { logger.error("ik-analyzer", e); } } } } }
/**
 * Continues matching from the dictionary segment stored in an earlier hit.
 *
 * <p>The segment is taken from {@code matchedHit.getMatchedDictSegment()} and asked to
 * match exactly one character, the one at {@code currentIndex}, reusing the same hit.
 *
 * @param charArray the characters being matched
 * @param currentIndex index in {@code charArray} of the next character to match
 * @param matchedHit hit from a previous match that carries the segment to resume from
 * @return the hit returned by the segment's {@code match} call
 */
public Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) {
    return matchedHit.getMatchedDictSegment().match(charArray, currentIndex, 1, matchedHit);
}