/**
 * Runs every regex rule against the tokens that the rule accepts and turns
 * each regex match into an immutable Lexem positioned in the original text.
 */
public Lexems process(String text, LocaleId language, Tokens tokens) {
    Lexems lexems = new Lexems();

    for (LexerRule item : rules) {
        RegexRule rule = (RegexRule) item;
        if (!checkRule(rule, language)) continue;

        List<Integer> inTokenIDs = rule.getInTokenIDs();
        Pattern pattern = patterns.get(rule);
        if (pattern == null) continue;

        for (Token token : tokens) {
            // if (token.isDeleted()) continue;
            if (inTokenIDs.contains(token.getTokenId())) {
                Range r = token.getRange();
                Matcher matcher = pattern.matcher(token.getValue());
                int groupIndex = rule.getRegexGroup();

                while (matcher.find()) {
                    int start = matcher.start(groupIndex);
                    int end = matcher.end(groupIndex);

                    if (start > -1 && end > -1) {
                        // Map the match offsets within the token's value back to
                        // positions in the original text via the token's range.
                        Lexem lexem = new Lexem(rule.getLexemId(), matcher.group(groupIndex),
                                r.start + start, r.start + end);
                        lexem.setAnnotation(new InputTokenAnnotation(token));
                        lexem.setImmutable(true);
                        lexems.add(lexem);

                        if (!rule.getKeepInput())
                            // Delete the original token; other rules can still extract parts of it
                            token.delete();
                    }
                }
            }
        }
    }
    return lexems;
}
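/*
 * Illustrative example of the offset mapping in process() above (not part of the lexer):
 * if a token covers the text range [10, 20) and the regex group matches at
 * offsets [3, 7) within token.getValue(), the resulting Lexem spans
 * [10 + 3, 10 + 7) = [13, 17) in the original text.
 */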
@Override
protected void lexer_init() {
    // Compile each rule's regex once up front so process() can reuse the
    // cached Pattern instead of recompiling it for every token.
    patterns = new LinkedHashMap<LexerRule, Pattern>();
    rules = getRules();

    for (LexerRule item : rules) {
        RegexRule rule = (RegexRule) item;
        Pattern pattern = null;
        if (rule.getPattern() != null)
            pattern = Pattern.compile(rule.getPattern(), rule.getRegexOptions());
        patterns.put(rule, pattern);
    }
}
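/*
 * Hypothetical configuration sketch (assumes RegexRule exposes setters such as
 * setPattern(String) and setRegexOptions(int); the rule and its pattern are
 * illustrative only, not part of this class):
 *
 *   RegexRule emailRule = new RegexRule();
 *   emailRule.setPattern("[\\w.+-]+@[\\w-]+\\.[\\w.]+");
 *   emailRule.setRegexOptions(Pattern.CASE_INSENSITIVE);
 *
 * lexer_init() would then compile the pattern once and cache it in 'patterns',
 * so process() never recompiles it while scanning tokens.
 */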