예제 #1
1
 private void markFiltered(
     StringBuilder text,
     int start,
     int end,
     Token filtered,
     Map<Integer, Token> filteredTokenMap) {
   for (int i = start; i < end; i++) {
     filteredTokenMap.put(i, filtered);
     text.setCharAt(i, PREDEFINED_TOKEN_REPLACEMENT);
   }
 }
예제 #2
0
  @Override
  public List<Token> tokenize(String text) {
    if (text == null || text.trim().length() == 0) return null;

    List<Token> tokens = new ArrayList<Token>();
    Map<Integer, Token> filteredTokenMap =
        new HashMap<Integer, Token>(); // 전처리 과정에서 PREDEFINED_TOKEN_PATTERN 에 의해 걸러진 토큰들
    StringBuilder buf = new StringBuilder(text);

    List<Token> filteredTokens = filterPredefinedPatterns(buf, filteredTokenMap);
    tokens.addAll(filteredTokens);

    char ch;
    String temp = "";
    CharType currCharType = CharType.ETC;
    CharType prevCharType;
    int tokenIndex = 0;

    for (int i = 0, len = text.length(); i < len; i++) {
      ch = buf.charAt(i);

      prevCharType = currCharType;
      if (filteredTokenMap.containsKey(i)) {
        currCharType = CharType.FILTERED;
      } else {
        currCharType = determineCharType(ch);
      }

      if (i != 0) {
        if (prevCharType != currCharType) {
          //                    System.out.println("["+i+"]prevCharType != currCharType =>"+ temp +
          // "," + ch +"," + prevCharType + "," + currCharType);
          if (prevCharType != CharType.FILTERED) {
            //                        System.out.println("  created token:"+ temp + "," +
            // prevCharType);
            tokens.add(new Token(temp, prevCharType, tokenIndex));
          }
          tokenIndex = i;
          temp = "";
        }
      }

      temp = (new StringBuilder(String.valueOf(temp))).append(ch).toString();
    }

    if (temp.trim().length() > 0) {
      Token t = new Token(temp, currCharType, tokenIndex);
      tokens.add(t);
    }

    Collections.sort(tokens);
    return tokens;
  }