@Override public List<Token> tokenize(String text) { if (text == null || text.trim().length() == 0) return null; List<Token> tokens = new ArrayList<Token>(); Map<Integer, Token> filteredTokenMap = new HashMap<Integer, Token>(); // 전처리 과정에서 PREDEFINED_TOKEN_PATTERN 에 의해 걸러진 토큰들 StringBuilder buf = new StringBuilder(text); List<Token> filteredTokens = filterPredefinedPatterns(buf, filteredTokenMap); tokens.addAll(filteredTokens); char ch; String temp = ""; CharType currCharType = CharType.ETC; CharType prevCharType; int tokenIndex = 0; for (int i = 0, len = text.length(); i < len; i++) { ch = buf.charAt(i); prevCharType = currCharType; if (filteredTokenMap.containsKey(i)) { currCharType = CharType.FILTERED; } else { currCharType = determineCharType(ch); } if (i != 0) { if (prevCharType != currCharType) { // System.out.println("["+i+"]prevCharType != currCharType =>"+ temp + // "," + ch +"," + prevCharType + "," + currCharType); if (prevCharType != CharType.FILTERED) { // System.out.println(" created token:"+ temp + "," + // prevCharType); tokens.add(new Token(temp, prevCharType, tokenIndex)); } tokenIndex = i; temp = ""; } } temp = (new StringBuilder(String.valueOf(temp))).append(ch).toString(); } if (temp.trim().length() > 0) { Token t = new Token(temp, currCharType, tokenIndex); tokens.add(t); } Collections.sort(tokens); return tokens; }
// 미리 정의된 패턴과 일치하는 부분을 걸러낸다 (ㅜㅜ, 숫자 등) private List<Token> filterPredefinedPatterns( StringBuilder buf, Map<Integer, Token> filteredTokenMap) { List<Token> result = new ArrayList<Token>(); FilterTokenPattern[] predefinedPatterns = FilterTokenPattern.getPredefinedPatterns(); List<Token> filteredTokens; for (FilterTokenPattern each : predefinedPatterns) { filteredTokens = match(buf, each, filteredTokenMap); if (filteredTokens.size() > 0) { result.addAll(filteredTokens); } } return result; }
// 패턴에 매칭되는 토큰 리스트를 만든다. private List<Token> match( StringBuilder text, FilterTokenPattern tokenPattern, Map<Integer, Token> filteredTokenMap) { List<Token> tokenList = new ArrayList<Token>(); for (Matcher matcher = tokenPattern.getPattern().matcher(text); matcher.find(); ) { Token token = new Token( text.substring(matcher.start(), matcher.end()), tokenPattern.getCharType(), matcher.start()); tokenList.add(token); markFiltered(text, matcher.start(), matcher.end(), token, filteredTokenMap); } return tokenList; }