/**
 * Marks the character span {@code [start, end)} as belonging to a pre-filtered token:
 * every index in the span is mapped to {@code filtered} in {@code filteredTokenMap},
 * and the corresponding character in the working buffer is overwritten with
 * {@code PREDEFINED_TOKEN_REPLACEMENT} so the main scan treats it as filtered.
 *
 * @param text             working buffer whose characters are overwritten in place
 * @param start            first index of the span (inclusive)
 * @param end              end of the span (exclusive)
 * @param filtered         token to associate with each index in the span
 * @param filteredTokenMap index-to-token map updated for every position in the span
 */
private void markFiltered(
        StringBuilder text, int start, int end, Token filtered, Map<Integer, Token> filteredTokenMap) {
    int pos = start;
    while (pos < end) {
        filteredTokenMap.put(pos, filtered);
        text.setCharAt(pos, PREDEFINED_TOKEN_REPLACEMENT);
        pos++;
    }
}
@Override public List<Token> tokenize(String text) { if (text == null || text.trim().length() == 0) return null; List<Token> tokens = new ArrayList<Token>(); Map<Integer, Token> filteredTokenMap = new HashMap<Integer, Token>(); // 전처리 과정에서 PREDEFINED_TOKEN_PATTERN 에 의해 걸러진 토큰들 StringBuilder buf = new StringBuilder(text); List<Token> filteredTokens = filterPredefinedPatterns(buf, filteredTokenMap); tokens.addAll(filteredTokens); char ch; String temp = ""; CharType currCharType = CharType.ETC; CharType prevCharType; int tokenIndex = 0; for (int i = 0, len = text.length(); i < len; i++) { ch = buf.charAt(i); prevCharType = currCharType; if (filteredTokenMap.containsKey(i)) { currCharType = CharType.FILTERED; } else { currCharType = determineCharType(ch); } if (i != 0) { if (prevCharType != currCharType) { // System.out.println("["+i+"]prevCharType != currCharType =>"+ temp + // "," + ch +"," + prevCharType + "," + currCharType); if (prevCharType != CharType.FILTERED) { // System.out.println(" created token:"+ temp + "," + // prevCharType); tokens.add(new Token(temp, prevCharType, tokenIndex)); } tokenIndex = i; temp = ""; } } temp = (new StringBuilder(String.valueOf(temp))).append(ch).toString(); } if (temp.trim().length() > 0) { Token t = new Token(temp, currCharType, tokenIndex); tokens.add(t); } Collections.sort(tokens); return tokens; }