/** * 快速原子分词,希望用这个方法替换掉原来缓慢的方法 * * @param charArray * @param start * @param end * @return */ protected static List<AtomNode> quickAtomSegment(char[] charArray, int start, int end) { List<AtomNode> atomNodeList = new LinkedList<AtomNode>(); int offsetAtom = start; int preType = CharType.get(charArray[offsetAtom]); int curType; while (++offsetAtom < end) { curType = CharType.get(charArray[offsetAtom]); if (curType != preType) { // 浮点数识别 if (charArray[offsetAtom] == '.' && preType == CharType.CT_NUM) { while (++offsetAtom < end) { curType = CharType.get(charArray[offsetAtom]); if (curType != CharType.CT_NUM) break; } } atomNodeList.add(new AtomNode(new String(charArray, start, offsetAtom - start), preType)); start = offsetAtom; } preType = curType; } if (offsetAtom == end) atomNodeList.add(new AtomNode(new String(charArray, start, offsetAtom - start), preType)); return atomNodeList; }
/** * 原子分词 * * @param charArray * @param start 从start开始(包含) * @param end 到end结束(不包含end) * @return 一个列表,代表从start到from的所有字构成的原子节点 */ protected static List<AtomNode> atomSegment(char[] charArray, int start, int end) { List<AtomNode> atomSegment = new ArrayList<AtomNode>(); int pCur = start, nCurType, nNextType; StringBuilder sb = new StringBuilder(); char c; int[] charTypeArray = new int[end - start]; // 生成对应单个汉字的字符类型数组 for (int i = 0; i < charTypeArray.length; ++i) { c = charArray[i + start]; charTypeArray[i] = CharType.get(c); if (c == '.' && i + start < (charArray.length - 1) && CharType.get(charArray[i + start + 1]) == CharType.CT_NUM) charTypeArray[i] = CharType.CT_NUM; else if (c == '.' && i + start < (charArray.length - 1) && charArray[i + start + 1] >= '0' && charArray[i + start + 1] <= '9') charTypeArray[i] = CharType.CT_SINGLE; else if (charTypeArray[i] == CharType.CT_LETTER) charTypeArray[i] = CharType.CT_SINGLE; } // 根据字符类型数组中的内容完成原子切割 while (pCur < end) { nCurType = charTypeArray[pCur - start]; if (nCurType == CharType.CT_CHINESE || nCurType == CharType.CT_INDEX || nCurType == CharType.CT_DELIMITER || nCurType == CharType.CT_OTHER) { String single = String.valueOf(charArray[pCur]); if (single.length() != 0) atomSegment.add(new AtomNode(single, nCurType)); pCur++; } // 如果是字符、数字或者后面跟随了数字的小数点“.”则一直取下去。 else if (pCur < end - 1 && ((nCurType == CharType.CT_SINGLE) || nCurType == CharType.CT_NUM)) { sb.delete(0, sb.length()); sb.append(charArray[pCur]); boolean reachEnd = true; while (pCur < end - 1) { nNextType = charTypeArray[++pCur - start]; if (nNextType == nCurType) sb.append(charArray[pCur]); else { reachEnd = false; break; } } atomSegment.add(new AtomNode(sb.toString(), nCurType)); if (reachEnd) pCur++; } // 对于所有其它情况 else { atomSegment.add(new AtomNode(charArray[pCur], nCurType)); pCur++; } } return atomSegment; }