Java CharType Examples

Programming Language: Java

Namespace/Package Name: com.hankcs.hanlp.dictionary.other

Class/Type: CharType

Examples at hotexamples.com: 2

Java CharType - 2 examples found. These are the top-rated real-world Java examples of com.hankcs.hanlp.dictionary.other.CharType extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

get(2)

Example #1

Show file

File: Segment.java Project: zhoufeng/HanLP

  /**
   * Fast atom segmentation; intended to replace the original, slower method.
   *
   * <p>Scans {@code charArray} from {@code start} (inclusive) to {@code end} (exclusive) and emits
   * one {@code AtomNode} for each maximal run of consecutive characters whose {@code CharType}
   * value is the same. A {@code '.'} that directly follows a {@code CT_NUM} run is kept inside
   * that run only when the character after it is still inside the range and is itself
   * {@code CT_NUM}; otherwise the dot starts a new atom instead of being glued to the number.
   *
   * @param charArray the characters to segment
   * @param start index of the first character to segment (inclusive)
   * @param end index at which segmentation stops (exclusive)
   * @return the atom nodes in text order; an empty list when {@code start >= end}
   */
  protected static List<AtomNode> quickAtomSegment(char[] charArray, int start, int end) {
    List<AtomNode> atomNodeList = new LinkedList<AtomNode>();
    // Empty range: nothing to segment, and charArray[start] may not even exist.
    if (start >= end) {
      return atomNodeList;
    }
    int offsetAtom = start;
    int preType = CharType.get(charArray[offsetAtom]);
    int curType;
    while (++offsetAtom < end) {
      curType = CharType.get(charArray[offsetAtom]);
      if (curType != preType) {
        // Floating-point recognition: absorb the '.' and the digits after it, but only when a
        // numeric character really follows the dot inside [start, end).
        if (charArray[offsetAtom] == '.'
            && preType == CharType.CT_NUM
            && offsetAtom + 1 < end
            && CharType.get(charArray[offsetAtom + 1]) == CharType.CT_NUM) {
          while (++offsetAtom < end) {
            curType = CharType.get(charArray[offsetAtom]);
            if (curType != CharType.CT_NUM) {
              break;
            }
          }
        }
        // Close the run that ended just before offsetAtom and start the next one there.
        atomNodeList.add(new AtomNode(new String(charArray, start, offsetAtom - start), preType));
        start = offsetAtom;
      }
      preType = curType;
    }
    // Emit the trailing run. If the fraction scan above already consumed everything up to end,
    // the outer loop's pre-increment leaves offsetAtom at end + 1, so the tail is not added twice.
    if (offsetAtom == end) {
      atomNodeList.add(new AtomNode(new String(charArray, start, offsetAtom - start), preType));
    }

    return atomNodeList;
  }

Example #2

Show file

File: Segment.java Project: zhoufeng/HanLP

  /**
   * Atom segmentation.
   *
   * <p>Works in two passes. The first pass assigns a type code to every character in the range;
   * the second pass walks those codes and cuts the text into atom nodes, merging consecutive
   * characters that share a {@code CT_SINGLE} or {@code CT_NUM} code into one node.
   *
   * @param charArray the characters to segment
   * @param start index to begin at (inclusive)
   * @param end index to stop at (exclusive)
   * @return a list of the atom nodes built from the characters between start and end
   */
  protected static List<AtomNode> atomSegment(char[] charArray, int start, int end) {
    final int[] types = new int[end - start];
    final int lastIndex = charArray.length - 1;

    // Pass 1: build the per-character type array.
    for (int i = 0; i < types.length; i++) {
      final int pos = start + i;
      final char ch = charArray[pos];
      int type = CharType.get(ch);

      // NOTE(review): the look-ahead is bounded by charArray.length, not by end, so a '.' at
      // end - 1 is classified using a character outside the requested range - confirm intended.
      final boolean dotWithFollower = ch == '.' && pos < lastIndex;
      if (dotWithFollower && CharType.get(charArray[pos + 1]) == CharType.CT_NUM) {
        // A dot followed by a numeric character counts as part of a number.
        type = CharType.CT_NUM;
      } else if (dotWithFollower && charArray[pos + 1] >= '0' && charArray[pos + 1] <= '9') {
        type = CharType.CT_SINGLE;
      } else if (type == CharType.CT_LETTER) {
        // Letters are handled with the same code as CT_SINGLE from here on.
        type = CharType.CT_SINGLE;
      }
      types[i] = type;
    }

    // Pass 2: cut atoms according to the type array.
    final List<AtomNode> nodes = new ArrayList<AtomNode>();
    int cursor = start;
    while (cursor < end) {
      final int type = types[cursor - start];
      final boolean standalone =
          type == CharType.CT_CHINESE
              || type == CharType.CT_INDEX
              || type == CharType.CT_DELIMITER
              || type == CharType.CT_OTHER;
      final boolean mergeable = type == CharType.CT_SINGLE || type == CharType.CT_NUM;

      if (standalone) {
        // These types always form a one-character atom.
        nodes.add(new AtomNode(String.valueOf(charArray[cursor]), type));
        cursor++;
      } else if (mergeable && cursor < end - 1) {
        // Letters, digits, or a decimal point followed by a digit: keep taking characters for
        // as long as the type code stays the same.
        int runEnd = cursor + 1;
        while (runEnd < end && types[runEnd - start] == type) {
          runEnd++;
        }
        nodes.add(new AtomNode(new String(charArray, cursor, runEnd - cursor), type));
        cursor = runEnd;
      } else {
        // Every other case, including a mergeable character sitting at the very last position.
        nodes.add(new AtomNode(charArray[cursor], type));
        cursor++;
      }
    }

    return nodes;
  }