Beispiel #1
0
  /**
   * Splits {@code text} into tokens.
   *
   * <p>Spans matching the predefined filter patterns are extracted first by {@code
   * filterPredefinedPatterns}. The characters are then scanned left to right and grouped into runs
   * of consecutive characters that share the same {@code CharType}; each run that is not marked as
   * filtered becomes one token whose index is the position of the run's first character. The
   * combined list is sorted with {@link Collections#sort(List)} before it is returned.
   *
   * @param text the text to tokenize
   * @return the sorted tokens, or {@code null} when {@code text} is {@code null} or is empty after
   *     {@code trim()}
   */
  @Override
  public List<Token> tokenize(String text) {
    if (text == null || text.trim().length() == 0) return null;

    List<Token> tokens = new ArrayList<Token>();
    // Tokens filtered out by the predefined token patterns during preprocessing. Any index that is
    // a key of this map is treated as FILTERED by the scan below.
    Map<Integer, Token> filteredTokenMap = new HashMap<Integer, Token>();
    StringBuilder buf = new StringBuilder(text);

    tokens.addAll(filterPredefinedPatterns(buf, filteredTokenMap));

    // Characters of the run currently being collected; reused instead of re-copying a String for
    // every character.
    StringBuilder run = new StringBuilder();
    CharType currCharType = CharType.ETC;
    int tokenIndex = 0;

    for (int i = 0, len = text.length(); i < len; i++) {
      char ch = buf.charAt(i);

      CharType prevCharType = currCharType;
      currCharType = filteredTokenMap.containsKey(i) ? CharType.FILTERED : determineCharType(ch);

      // A change of character type closes the current run (there is nothing to close at i == 0).
      if (i != 0 && prevCharType != currCharType) {
        // FILTERED runs are skipped: their tokens were already added from
        // filterPredefinedPatterns above.
        if (prevCharType != CharType.FILTERED) {
          tokens.add(new Token(run.toString(), prevCharType, tokenIndex));
        }
        tokenIndex = i;
        run.setLength(0);
      }

      run.append(ch);
    }

    // Flush the last run. It is dropped when blank, and also when FILTERED so that the end of the
    // text follows the same rule as the loop above.
    if (currCharType != CharType.FILTERED) {
      String last = run.toString();
      if (last.trim().length() > 0) {
        tokens.add(new Token(last, currCharType, tokenIndex));
      }
    }

    Collections.sort(tokens);
    return tokens;
  }
Beispiel #2
0
  /**
   * Filters out the parts of {@code buf} that match the predefined patterns (for example "ㅜㅜ" or
   * numbers).
   *
   * <p>Every pattern returned by {@code FilterTokenPattern.getPredefinedPatterns()} is applied in
   * array order through {@code match}, and the tokens of all patterns are concatenated in that
   * order.
   *
   * @param buf the text buffer handed to {@code match} for each pattern
   * @param filteredTokenMap map handed to {@code match} for each pattern
   * @return the tokens produced by all predefined patterns; empty when nothing matched
   */
  private List<Token> filterPredefinedPatterns(
      StringBuilder buf, Map<Integer, Token> filteredTokenMap) {
    List<Token> collected = new ArrayList<Token>();

    for (FilterTokenPattern pattern : FilterTokenPattern.getPredefinedPatterns()) {
      // Adding an empty list is a no-op, so no explicit size check is needed.
      collected.addAll(match(buf, pattern, filteredTokenMap));
    }

    return collected;
  }
Beispiel #3
0
  /**
   * Builds the list of tokens that match the given pattern.
   *
   * <p>For each match a token is created from the matched substring, the pattern's character type
   * and the match start offset; the matched range is then passed to {@code markFiltered} together
   * with the token and {@code filteredTokenMap}.
   *
   * @param text the buffer to search; it is also handed to {@code markFiltered} after each match
   * @param tokenPattern supplies the regular expression and the character type of created tokens
   * @param filteredTokenMap map handed to {@code markFiltered} for every match
   * @return the tokens created for the matches, in match order; empty when there is no match
   */
  private List<Token> match(
      StringBuilder text, FilterTokenPattern tokenPattern, Map<Integer, Token> filteredTokenMap) {
    List<Token> matched = new ArrayList<Token>();
    Matcher matcher = tokenPattern.getPattern().matcher(text);

    while (matcher.find()) {
      int start = matcher.start();
      int end = matcher.end();

      Token token = new Token(text.substring(start, end), tokenPattern.getCharType(), start);
      matched.add(token);
      markFiltered(text, start, end, token, filteredTokenMap);
    }

    return matched;
  }