예제 #1
0
파일: Segment.java 프로젝트: zhoufeng/HanLP
  /**
   * Fast atom segmentation, intended as a replacement for the slower {@code atomSegment}.
   *
   * <p>Scans {@code charArray[start, end)} and emits one {@link AtomNode} per maximal run of
   * consecutive characters whose {@code CharType.get(...)} value is identical. Each node is built
   * from the text of the run and the type of the run.
   *
   * <p>Decimal numbers: a {@code '.'} that directly follows a numeric run is merged into that run,
   * together with the digits after it, only when the character right after the dot lies inside the
   * range and is itself of type {@code CharType.CT_NUM}. A dot that is not followed by a digit
   * (e.g. {@code "3."} or {@code "3.a"}) is emitted as its own atom, which matches the rule used by
   * {@code atomSegment}.
   *
   * @param charArray the text to segment
   * @param start index of the first character to segment (inclusive)
   * @param end index at which segmentation stops (exclusive)
   * @return the atoms in text order; an empty list when {@code start >= end}
   */
  protected static List<AtomNode> quickAtomSegment(char[] charArray, int start, int end) {
    List<AtomNode> atomNodeList = new LinkedList<AtomNode>();
    // Nothing to segment; also avoids reading charArray[start] on an empty range.
    if (start >= end) {
      return atomNodeList;
    }
    int offsetAtom = start;
    int preType = CharType.get(charArray[offsetAtom]);
    int curType;
    while (++offsetAtom < end) {
      curType = CharType.get(charArray[offsetAtom]);
      if (curType != preType) {
        // Floating-point recognition: only treat '.' as a decimal point when a digit follows it
        // inside the range; otherwise the dot must become an atom of its own.
        if (charArray[offsetAtom] == '.'
            && preType == CharType.CT_NUM
            && offsetAtom + 1 < end
            && CharType.get(charArray[offsetAtom + 1]) == CharType.CT_NUM) {
          // Consume the fractional digits; stops on the first non-digit or at end.
          while (++offsetAtom < end) {
            curType = CharType.get(charArray[offsetAtom]);
            if (curType != CharType.CT_NUM) {
              break;
            }
          }
        }
        atomNodeList.add(new AtomNode(new String(charArray, start, offsetAtom - start), preType));
        start = offsetAtom;
      }
      preType = curType;
    }
    // offsetAtom == end means the last run has not been flushed yet. When the decimal branch
    // consumed everything up to end, offsetAtom is end + 1 here and the run was already added.
    if (offsetAtom == end) {
      atomNodeList.add(new AtomNode(new String(charArray, start, offsetAtom - start), preType));
    }

    return atomNodeList;
  }
예제 #2
0
파일: Segment.java 프로젝트: zhoufeng/HanLP
  /**
   * Atom segmentation.
   *
   * <p>Works in two passes over {@code charArray[start, end)}:
   *
   * <ol>
   *   <li>Compute an adjusted character type for every position. A {@code '.'} whose following
   *       character has type {@code CharType.CT_NUM} is treated as {@code CT_NUM}; a {@code '.'}
   *       followed by an ASCII digit that is not {@code CT_NUM} becomes {@code CT_SINGLE}; any
   *       {@code CT_LETTER} becomes {@code CT_SINGLE}. The look-ahead is bounded by the array
   *       length, not by {@code end}, so the character at index {@code end} may be inspected.
   *   <li>Cut atoms: Chinese, index, delimiter and "other" characters each form an atom of their
   *       own; consecutive {@code CT_SINGLE} or {@code CT_NUM} characters of the same type are
   *       merged into one atom; everything else is emitted character by character.
   * </ol>
   *
   * @param charArray the text to segment
   * @param start index to start from (inclusive)
   * @param end index to stop at (exclusive)
   * @return a list of atom nodes covering all characters from {@code start} to {@code end}
   */
  protected static List<AtomNode> atomSegment(char[] charArray, int start, int end) {
    List<AtomNode> atoms = new ArrayList<AtomNode>();
    int[] types = new int[end - start];

    // Pass 1: build the adjusted type for each character in the range.
    for (int pos = start; pos < end; ++pos) {
      char ch = charArray[pos];
      int type = CharType.get(ch);

      if (ch == '.' && pos + 1 < charArray.length) {
        char next = charArray[pos + 1];
        if (CharType.get(next) == CharType.CT_NUM) {
          type = CharType.CT_NUM;
        } else if (next >= '0' && next <= '9') {
          type = CharType.CT_SINGLE;
        } else if (type == CharType.CT_LETTER) {
          type = CharType.CT_SINGLE;
        }
      } else if (type == CharType.CT_LETTER) {
        type = CharType.CT_SINGLE;
      }

      types[pos - start] = type;
    }

    // Pass 2: cut the range into atoms according to the adjusted types.
    int cursor = start;
    while (cursor < end) {
      int type = types[cursor - start];

      boolean standsAlone =
          type == CharType.CT_CHINESE
              || type == CharType.CT_INDEX
              || type == CharType.CT_DELIMITER
              || type == CharType.CT_OTHER;
      if (standsAlone) {
        atoms.add(new AtomNode(String.valueOf(charArray[cursor]), type));
        cursor++;
        continue;
      }

      boolean mergeable = type == CharType.CT_SINGLE || type == CharType.CT_NUM;
      if (!mergeable || cursor >= end - 1) {
        // Every remaining case, including a mergeable character in the last position.
        atoms.add(new AtomNode(charArray[cursor], type));
        cursor++;
        continue;
      }

      // Letters, digits, or a '.' followed by a digit: extend the atom while the type repeats.
      int runEnd = cursor + 1;
      while (runEnd < end && types[runEnd - start] == type) {
        runEnd++;
      }
      atoms.add(new AtomNode(new String(charArray, cursor, runEnd - cursor), type));
      cursor = runEnd;
    }

    return atoms;
  }