/**
 * Builds the leaf node for a single literal character.
 *
 * When the compiler is in case-insensitive mode and the character is a
 * letter that appears outside of a character class, the result is a
 * two-element character class holding both cases of the letter.  In every
 * other situation a plain TokenNode is produced.
 *
 * @param token    the literal character
 * @param position the leaf position to assign to the new node
 * @return the node representing the character
 */
SyntaxNode _newTokenNode(char token, int position) {
  // Case folding is only applied outside of character classes and only
  // when the pattern is being compiled case-insensitively.
  if (__inCharacterClass || __caseSensitive) {
    return new TokenNode(token, position);
  }

  // Characters without a case are left as simple tokens.
  if (!_isUpperCase(token) && !_isLowerCase(token)) {
    return new TokenNode(token, position);
  }

  CharacterClassNode bothCases = new CharacterClassNode(position);
  bothCases._addToken(token);
  bothCases._addToken(_toggleCase(token));
  return bothCases;
}
/**
 * Parses a single atom: a parenthesized sub-expression, a bracketed
 * character class, the '.' wildcard, a backslash escape, or a literal
 * non-metacharacter.
 *
 * Note: handling of '^' and '$' as line-marker tokens was present in this
 * method at one time but is disabled; those characters are not given any
 * special treatment here.
 *
 * @return the syntax node for the parsed atom
 * @throws MalformedPatternException if the lookahead character cannot start
 *         an atom, or if a nested parse fails
 */
private SyntaxNode __atom() throws MalformedPatternException {
  switch (__lookahead) {
    case '(': {
      // Grouping: parse the enclosed regex and keep the paren counters
      // in step with what has been consumed.
      __match('(');
      ++__openParen;
      SyntaxNode group = __regex();
      __match(')');
      ++__closeParen;
      return group;
    }

    case '[':
      return __characterClass();

    case '.': {
      // The wildcard is modelled as a negated character class; in
      // multiline mode '\n' is added to the negated set.
      __match('.');
      CharacterClassNode wildcard = new NegativeCharacterClassNode(__position++);
      if (__multiline) {
        wildcard._addToken('\n');
      }
      return wildcard;
    }

    case '\\':
      return __backslashToken();

    default:
      break;
  }

  if (__isMetachar(__lookahead)) {
    throw new MalformedPatternException(
        "Parse error: unexpected character " + __lookahead + " at position " + __bytesRead);
  }

  // Ordinary literal character.
  SyntaxNode literal = _newTokenNode(__lookahead, __position++);
  __match(__lookahead);
  return literal;
}
private SyntaxNode __characterClass() throws MalformedPatternException { char lastToken, token; SyntaxNode node; CharacterClassNode current; __match('['); __inCharacterClass = true; if (__lookahead == '^') { __match('^'); current = new NegativeCharacterClassNode(__position++); } else current = new CharacterClassNode(__position++); while (__lookahead != ']' && __lookahead != _END_OF_INPUT) { if (__lookahead == '\\') { node = __backslashToken(); --__position; // __backslashToken() (actually newTokenNode()) does not take care of // case insensitivity when __inCharacterClass is true. if (node instanceof TokenNode) { lastToken = ((TokenNode) node)._token; current._addToken(lastToken); if (!__caseSensitive) current._addToken(_toggleCase(lastToken)); } else { CharacterClassNode slash; slash = (CharacterClassNode) node; // This could be made more efficient by manipulating the // characterSet elements of the CharacterClassNodes but // for the moment, this is more clear. for (token = 0; token < LeafNode._NUM_TOKENS; token++) { if (slash._matches(token)) current._addToken(token); } // A byproduct of this act is that when a '-' occurs after // a \d, \w, etc. it is not interpreted as a range and no // parse exception is thrown. // This is considered a feature and not a bug for now. continue; } } else { lastToken = __lookahead; current._addToken(__lookahead); if (!__caseSensitive) current._addToken(_toggleCase(__lookahead)); __match(__lookahead); } // In Perl, a - is a token if it occurs at the beginning // or end of the character class. Anywhere else, it indicates // a range. // A byproduct of this implementation is that if a '-' occurs // after the end of a range, it is interpreted as a '-' and no // exception is thrown. e.g., the second dash in [a-z-x] // This is considered a feature and not a bug for now. 
if (__lookahead == '-') { __match('-'); if (__lookahead == ']') { current._addToken('-'); break; } else if (__lookahead == '\\') { node = __backslashToken(); --__position; if (node instanceof TokenNode) token = ((TokenNode) node)._token; else throw new MalformedPatternException( "Parse error: invalid range specified at position " + __bytesRead); } else { token = __lookahead; __match(__lookahead); } if (token < lastToken) throw new MalformedPatternException( "Parse error: invalid range specified at position " + __bytesRead); current._addTokenRange(lastToken + 1, token); if (!__caseSensitive) current._addTokenRange(_toggleCase((char) (lastToken + 1)), _toggleCase(token)); } } __match(']'); __inCharacterClass = false; return current; }
private SyntaxNode __backslashToken() throws MalformedPatternException { SyntaxNode current; char token; int number; __match('\\'); if (__lookahead == 'x') { __match('x'); // Parse a hexadecimal number current = _newTokenNode((char) __parseUnsignedInteger(16, 2, 2), __position++); } else if (__lookahead == 'c') { __match('c'); // Create a control character token = Character.toUpperCase(__lookahead); token = (char) (token > 63 ? token - 64 : token + 64); current = new TokenNode(token, __position++); __match(__lookahead); } else if (__lookahead >= '0' && __lookahead <= '9') { __match(__lookahead); if (__lookahead >= '0' && __lookahead <= '9') { // We have an octal character or a multi-digit backreference. // Assume octal character for now. __putback(); number = __parseUnsignedInteger(10, 2, 3); number = Integer.parseInt(Integer.toString(number), 8); current = _newTokenNode((char) number, __position++); } else { // We have either \0, an escaped digit, or a backreference. __putback(); if (__lookahead == '0') { // \0 matches the null character __match('0'); current = new TokenNode('\0', __position++); } else { // Either an escaped digit or backreference. 
number = Character.digit(__lookahead, 10); current = _newTokenNode(__lookahead, __position++); __match(__lookahead); } } } else if (__lookahead == 'b') { // Inside of a character class the \b means backspace, otherwise // it means a word boundary // if(__inCharacterClass) // \b always means backspace current = new TokenNode('\b', __position++); /* else current = new TokenNode((char)LeafNode._WORD_BOUNDARY_MARKER_TOKEN, position++); */ __match('b'); } /*else if(__lookahead == 'B' && !__inCharacterClass){ current = new TokenNode((char)LeafNode._NONWORD_BOUNDARY_MARKER_TOKEN, position++); __match('B'); } */ else { CharacterClassNode characterSet; token = __lookahead; switch (__lookahead) { case 'n': token = '\n'; break; case 'r': token = '\r'; break; case 't': token = '\t'; break; case 'f': token = '\f'; break; } switch (token) { case 'd': characterSet = new CharacterClassNode(__position++); characterSet._addTokenRange('0', '9'); current = characterSet; break; case 'D': characterSet = new NegativeCharacterClassNode(__position++); characterSet._addTokenRange('0', '9'); current = characterSet; break; case 'w': characterSet = new CharacterClassNode(__position++); characterSet._addTokenRange('0', '9'); characterSet._addTokenRange('a', 'z'); characterSet._addTokenRange('A', 'Z'); characterSet._addToken('_'); current = characterSet; break; case 'W': characterSet = new NegativeCharacterClassNode(__position++); characterSet._addTokenRange('0', '9'); characterSet._addTokenRange('a', 'z'); characterSet._addTokenRange('A', 'Z'); characterSet._addToken('_'); current = characterSet; break; case 's': characterSet = new CharacterClassNode(__position++); characterSet._addToken(' '); characterSet._addToken('\f'); characterSet._addToken('\n'); characterSet._addToken('\r'); characterSet._addToken('\t'); current = characterSet; break; case 'S': characterSet = new NegativeCharacterClassNode(__position++); characterSet._addToken(' '); characterSet._addToken('\f'); 
characterSet._addToken('\n'); characterSet._addToken('\r'); characterSet._addToken('\t'); current = characterSet; break; default: current = _newTokenNode(token, __position++); break; } __match(__lookahead); } return current; }