예제 #1
0
  /**
   * Matches a run of characters against the children of this node, consuming one character per
   * recursion level.
   *
   * @param charArray characters being matched
   * @param begin index in {@code charArray} of the character looked up at this node
   * @param length number of characters left to match, counting the one at {@code begin}
   * @param searchHit hit object to reuse; when {@code null} a new {@code Hit} is created and its
   *     begin is set to {@code begin}, otherwise {@code setUnmatch()} is called on it first
   * @return the hit, with its end set to the last index examined; it is flagged as a match and/or
   *     a prefix only when the final character was found among the children
   */
  public Hit match(char[] charArray, int begin, int length, Hit searchHit) {
    if (searchHit == null) {
      // No hit supplied: create one and record the starting text position.
      searchHit = new Hit();
      searchHit.setBegin(begin);
    } else {
      // A reused hit must have its state reset before this lookup.
      searchHit.setUnmatch();
    }
    // Record the position currently being processed.
    searchHit.setEnd(begin);

    Character keyChar = Character.valueOf(charArray[begin]);
    DictSegment ds = null;

    // Copy the instance fields into locals so a concurrent update of the node
    // cannot swap the storage out from under this lookup.
    DictSegment[] segmentArray = this.childrenArray;
    Map<Character, DictSegment> segmentMap = this.childrenMap;

    // STEP 1: find the child DictSegment for keyChar.
    if (segmentArray != null) {
      for (DictSegment seg : segmentArray) {
        if (seg != null && seg.nodeChar.equals(keyChar)) {
          ds = seg;
          // Stop at the first hit; assumes a node holds at most one child per character.
          break;
        }
      }
    } else if (segmentMap != null) {
      ds = segmentMap.get(keyChar);
    }

    // STEP 2: a child was found; either recurse or report the match state.
    if (ds != null) {
      if (length > 1) {
        // More characters remain: continue the search one level down.
        return ds.match(charArray, begin + 1, length - 1, searchHit);
      } else if (length == 1) {
        // This was the last character to match.
        if (ds.nodeState == 1) {
          // The child is flagged with state 1: report a full match.
          searchHit.setMatch();
        }
        if (ds.hasNextNode()) {
          // The child has further nodes: report a prefix match and remember
          // the child so matching can resume from it.
          searchHit.setPrefix();
          searchHit.setMatchedDictSegment(ds);
        }
        return searchHit;
      }
    }
    // STEP 3: no child found (or length < 1); the hit is returned as prepared above.
    return searchHit;
  }
  /**
   * Loads the preposition dictionary into a newly created {@code _PrepDict}.
   *
   * <p>The file at {@code Dictionary.PATH_DIC_PREP} under the configured dictionary root is read
   * as UTF-8, one word per line; blank lines are skipped and every other line is trimmed and
   * passed to {@code fillSegment}. I/O errors while reading are logged and end the load early.
   *
   * @throws RuntimeException if the dictionary file cannot be opened; the underlying
   *     {@link FileNotFoundException} is attached as the cause
   */
  private void loadPrepDict() {
    _PrepDict = new DictSegment((char) 0);
    File file = new File(configuration.getDictRoot(), Dictionary.PATH_DIC_PREP);

    InputStream is;
    try {
      is = new FileInputStream(file);
    } catch (FileNotFoundException e) {
      logger.error("ik-analyzer", e);
      // Keep the original exception as the cause so the missing path is not lost.
      throw new RuntimeException("Preposition Dictionary not found!!!", e);
    }

    try {
      BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
      String theWord;
      while ((theWord = br.readLine()) != null) {
        String word = theWord.trim();
        if (!"".equals(word)) {
          _PrepDict.fillSegment(word.toCharArray());
        }
      }
    } catch (IOException e) {
      logger.error("ik-analyzer", e);
    } finally {
      try {
        is.close();
      } catch (IOException e) {
        logger.error("ik-analyzer", e);
      }
    }
  }
  /**
   * Loads the quantifier dictionary into a newly created {@code _QuantifierDict}.
   *
   * <p>The file at {@code Dictionary.PATH_DIC_QUANTIFIER} under the configured dictionary root is
   * read as UTF-8, one word per line; blank lines are skipped and every other line is trimmed and
   * passed to {@code fillSegment}. I/O errors while reading are logged and end the load early.
   *
   * @throws RuntimeException if the dictionary file cannot be opened; the underlying
   *     {@link FileNotFoundException} is attached as the cause
   */
  private void loadQuantifierDict() {
    // Create the quantifier dictionary root.
    _QuantifierDict = new DictSegment((char) 0);
    // Locate the quantifier dictionary file.
    File file = new File(configuration.getDictRoot(), Dictionary.PATH_DIC_QUANTIFIER);

    InputStream is;
    try {
      is = new FileInputStream(file);
    } catch (FileNotFoundException e) {
      logger.error("ik-analyzer", e);
      // Fail with a clear message instead of a NullPointerException from
      // wrapping a null stream further down.
      throw new RuntimeException("Quantifier Dictionary not found!!!", e);
    }

    try {
      BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
      String theWord;
      while ((theWord = br.readLine()) != null) {
        String word = theWord.trim();
        if (!"".equals(word)) {
          _QuantifierDict.fillSegment(word.toCharArray());
        }
      }
    } catch (IOException ioe) {
      // Pass the exception along so the log shows what actually failed.
      logger.error("Quantifier Dictionary loading exception.", ioe);
    } finally {
      try {
        is.close();
      } catch (IOException e) {
        logger.error("ik-analyzer", e);
      }
    }
  }
예제 #4
0
  /**
   * Inserts a word fragment into the dictionary tree below this node, one character per
   * recursion level.
   *
   * <p>The method is {@code synchronized} on this node.
   *
   * @param charArray characters of the word being inserted
   * @param begin index in {@code charArray} of the character handled at this node
   * @param length number of characters left to insert, counting the one at {@code begin}
   */
  public synchronized void fillSegment(char[] charArray, int begin, int length) {
    // Look the character up in charMap so the stored Character instance is reused.
    Character beginChar = Character.valueOf(charArray[begin]);
    Character keyChar = charMap.get(beginChar);
    if (keyChar == null) {
      // First time this character is seen: register it.
      charMap.put(beginChar, beginChar);
      keyChar = beginChar;
    }

    // Fetch the child segment for keyChar; per the original comment, lookforSegment
    // creates the child when none exists yet.
    DictSegment ds = lookforSegment(keyChar);
    if (length > 1) {
      // The word is not fully inserted yet: continue with the next character.
      ds.fillSegment(charArray, begin + 1, length - 1);
    } else if (length == 1) {
      // Last character of the word: state 1 marks the child as ending a complete word.
      ds.nodeState = 1;
    }
  }
  /**
   * Loads the user-configured extension dictionaries into {@code _MainDict}.
   *
   * <p>Each name returned by {@code configuration.getExtDictionarys()} is resolved against the
   * configured dictionary root and read as UTF-8, one word per line; blank lines are skipped and
   * every other line is trimmed and passed to {@code fillSegment}. A file that cannot be opened
   * is logged and skipped; an I/O error while reading is logged and ends that file's load.
   */
  private void loadExtDict() {
    // Fetch the configured extension dictionary names.
    List<String> extDictFiles = configuration.getExtDictionarys();
    if (extDictFiles == null) {
      return;
    }

    for (String extDictName : extDictFiles) {
      logger.info("[Dict Loading]" + extDictName);
      File file = new File(configuration.getDictRoot(), extDictName);

      // The stream is scoped to this iteration so a failed close() on one file
      // can never leave a stale stream to be read for the next one.
      InputStream is;
      try {
        is = new FileInputStream(file);
      } catch (FileNotFoundException e) {
        logger.error("ik-analyzer", e);
        // A missing extension dictionary is ignored.
        continue;
      }

      try {
        BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
        String theWord;
        while ((theWord = br.readLine()) != null) {
          String word = theWord.trim();
          if (!"".equals(word)) {
            // Add the extension word to the in-memory main dictionary.
            _MainDict.fillSegment(word.toCharArray());
          }
        }
      } catch (IOException e) {
        logger.error("ik-analyzer", e);
      } finally {
        try {
          is.close();
        } catch (IOException e) {
          logger.error("ik-analyzer", e);
        }
      }
    }
  }
  /**
   * Loads the main dictionary into a newly created {@code _MainDict}, then loads the extension
   * dictionaries via {@code loadExtDict()}.
   *
   * <p>The file at {@code Dictionary.PATH_DIC_MAIN} under the configured dictionary root is read
   * as UTF-8, one word per line; blank lines are skipped and every other line is trimmed and
   * passed to {@code fillSegment}. I/O errors while reading are logged and end the main file's
   * load early; the extension dictionaries are still loaded afterwards.
   *
   * @throws RuntimeException if the main dictionary file cannot be opened; the underlying
   *     {@link FileNotFoundException} is attached as the cause
   */
  private void loadMainDict() {
    // Create the main dictionary root.
    _MainDict = new DictSegment((char) 0);

    // Locate the main dictionary file.
    File file = new File(configuration.getDictRoot(), Dictionary.PATH_DIC_MAIN);

    InputStream is;
    try {
      is = new FileInputStream(file);
    } catch (FileNotFoundException e) {
      logger.error("ik-analyzer", e);
      // Fail with a clear message instead of a NullPointerException from
      // wrapping a null stream further down.
      throw new RuntimeException("Main Dictionary not found!!!", e);
    }

    try {
      BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
      String theWord;
      while ((theWord = br.readLine()) != null) {
        String word = theWord.trim();
        if (!"".equals(word)) {
          _MainDict.fillSegment(word.toCharArray());
        }
      }
    } catch (IOException e) {
      logger.error("ik-analyzer", e);
    } finally {
      try {
        is.close();
      } catch (IOException e) {
        logger.error("ik-analyzer", e);
      }
    }

    // Load the extension dictionaries on top of the main one.
    this.loadExtDict();
  }
  /**
   * Loads the stop-word dictionary and the user-configured extension stop-word dictionaries into
   * a newly created {@code _StopWords}.
   *
   * <p>The file at {@code Dictionary.PATH_DIC_STOP} is loaded first, followed by each name
   * returned by {@code configuration.getExtStopWordDictionarys()}; all paths are resolved against
   * the configured dictionary root. Files are read as UTF-8, one word per line; blank lines are
   * skipped and every other line is trimmed and passed to {@code fillSegment}. An extension file
   * that cannot be opened is logged and skipped; an I/O error while reading is logged and ends
   * that file's load.
   *
   * @throws RuntimeException if the file at {@code Dictionary.PATH_DIC_STOP} cannot be opened;
   *     the underlying {@link FileNotFoundException} is attached as the cause
   */
  private void loadStopWordDict() {
    // Create the stop-word dictionary root.
    _StopWords = new DictSegment((char) 0);

    // Locate the built-in stop-word file.
    File file = new File(configuration.getDictRoot(), Dictionary.PATH_DIC_STOP);

    InputStream is;
    try {
      is = new FileInputStream(file);
    } catch (FileNotFoundException e) {
      logger.error("ik-analyzer", e);
      // Fail with a clear message instead of a NullPointerException from
      // wrapping a null stream further down.
      throw new RuntimeException("Stop Word Dictionary not found!!!", e);
    }

    try {
      BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
      String theWord;
      while ((theWord = br.readLine()) != null) {
        String word = theWord.trim();
        if (!"".equals(word)) {
          _StopWords.fillSegment(word.toCharArray());
        }
      }
    } catch (IOException e) {
      logger.error("ik-analyzer", e);
    } finally {
      try {
        is.close();
      } catch (IOException e) {
        logger.error("ik-analyzer", e);
      }
    }

    // Load the extension stop-word dictionaries.
    List<String> extStopWordDictFiles = configuration.getExtStopWordDictionarys();
    if (extStopWordDictFiles == null) {
      return;
    }

    for (String extStopWordDictName : extStopWordDictFiles) {
      logger.info("[Dict Loading]" + extStopWordDictName);

      File extFile = new File(configuration.getDictRoot(), extStopWordDictName);

      // The stream is scoped to this iteration so a failed close() on one file
      // can never leave a stale stream to be read for the next one.
      InputStream extIs;
      try {
        extIs = new FileInputStream(extFile);
      } catch (FileNotFoundException e) {
        logger.error("ik-analyzer", e);
        // A missing extension dictionary is ignored.
        continue;
      }

      try {
        BufferedReader br = new BufferedReader(new InputStreamReader(extIs, "UTF-8"), 512);
        String theWord;
        while ((theWord = br.readLine()) != null) {
          String word = theWord.trim();
          if (!"".equals(word)) {
            // Add the extension stop word to the in-memory dictionary.
            _StopWords.fillSegment(word.toCharArray());
          }
        }
      } catch (IOException e) {
        logger.error("ik-analyzer", e);
      } finally {
        try {
          extIs.close();
        } catch (IOException e) {
          logger.error("ik-analyzer", e);
        }
      }
    }
  }
 /**
  * Resumes matching from the DictSegment recorded in a previous hit, looking up exactly one more
  * character.
  *
  * @param charArray characters being matched
  * @param currentIndex index in {@code charArray} of the character to look up next
  * @param matchedHit earlier hit whose recorded DictSegment is the starting node; it is passed
  *     on to {@code match} as the hit to reuse
  * @return the hit returned by {@code match}
  */
 public Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) {
   // Delegate to the recorded node with a length of 1: only the character at
   // currentIndex is matched.
   return matchedHit.getMatchedDictSegment().match(charArray, currentIndex, 1, matchedHit);
 }