Code example #1
File: TokenScanner.java  Project: asgeirf/okapi
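The process() method below runs every registered regex rule over an incoming token stream: for each rule that passes the locale check, it matches the rule's pre-compiled pattern against the tokens the rule accepts as input, and turns each regex hit into an immutable Lexem positioned relative to the original text.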
  public Lexems process(String text, LocaleId language, Tokens tokens) {

    Lexems lexems = new Lexems();

    for (LexerRule item : rules) {

      RegexRule rule = (RegexRule) item;

      // Skip rules that do not apply to the current locale
      if (!checkRule(rule, language)) continue;

      List<Integer> inTokenIDs = rule.getInTokenIDs();

      // Use the pattern pre-compiled in lexer_init(); skip rules without one
      Pattern pattern = patterns.get(rule);
      if (pattern == null) continue;

      for (Token token : tokens) {

        // if (token.isDeleted()) continue;

        // Only scan tokens whose ID this rule accepts as input
        if (inTokenIDs.contains(token.getTokenId())) {

          Range r = token.getRange();
          Matcher matcher = pattern.matcher(token.getValue());
          int groupIndex = rule.getRegexGroup();

          // Walk every match of the pattern in the token's text
          while (matcher.find()) {

            int start = matcher.start(groupIndex);
            int end = matcher.end(groupIndex);

            // start/end are -1 when the group did not participate in the match
            if (start > -1 && end > -1) {

              // Match offsets are token-local; shift by the token's range start
              // to position the lexem in the original text
              Lexem lexem = new Lexem(rule.getLexemId(), matcher.group(groupIndex),
                  r.start + start, r.start + end);
              lexem.setAnnotation(new InputTokenAnnotation(token));
              lexem.setImmutable(true);
              lexems.add(lexem);

              // Delete the original token; other rules can still extract
              // parts of it
              if (!rule.getKeepInput()) token.delete();
            }
          }
        }
      }
    }

    return lexems;
  }
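
For orientation, a minimal usage sketch follows. It is hypothetical, not taken from the project: tokenize() stands in for whatever upstream step produces the Tokens list, and the scanner is assumed to have been initialized by the framework (lexer_init() is protected).

// Hypothetical driver; TokenScanner, Tokens, Lexems, Lexem, and LocaleId are
// okapi types, while tokenize() is an assumed stand-in for the real tokenizer.
String text = "Contact support@example.com for help.";
Tokens tokens = tokenize(text); // hypothetical upstream tokenizer

TokenScanner scanner = new TokenScanner(); // assumes the framework runs lexer_init()
Lexems lexems = scanner.process(text, LocaleId.ENGLISH, tokens);

for (Lexem lexem : lexems) {
  System.out.println(lexem); // each lexem carries its range in the original text
}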
Code example #2
File: TokenScanner.java  Project: asgeirf/okapi
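lexer_init() builds the rule-to-pattern map that process() consults: each RegexRule's pattern string is compiled once, with its regex options, so matching never recompiles a pattern.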
  @Override
  protected void lexer_init() {

    // Compile each rule's regex once up front; process() reuses these patterns
    patterns = new LinkedHashMap<LexerRule, Pattern>();
    rules = getRules();

    for (LexerRule item : rules) {

      RegexRule rule = (RegexRule) item;

      // Rules without a pattern string are stored with a null value,
      // which process() skips
      Pattern pattern = null;
      if (rule.getPattern() != null)
        pattern = Pattern.compile(rule.getPattern(), rule.getRegexOptions());

      patterns.put(rule, pattern);
    }
  }
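
The compile-once idea is worth seeing in isolation. Below is a self-contained sketch of the same technique, assuming nothing from okapi: Rule is a stand-in for LexerRule/RegexRule, and pattern-less rules map to null exactly as lexer_init() stores them.

import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Pattern;

// Self-contained illustration of the caching idea; Rule is a stand-in for
// LexerRule/RegexRule and is not an okapi type.
record Rule(String pattern, int flags) {}

class PatternCache {
  private final Map<Rule, Pattern> patterns = new LinkedHashMap<>();

  // Compile every rule's regex once; pattern-less rules map to null so the
  // matching loop can skip them, mirroring lexer_init()/process() above.
  void init(Iterable<Rule> rules) {
    for (Rule rule : rules) {
      Pattern p = (rule.pattern() == null)
          ? null
          : Pattern.compile(rule.pattern(), rule.flags());
      patterns.put(rule, p);
    }
  }

  Pattern lookup(Rule rule) {
    return patterns.get(rule);
  }
}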