Ejemplo n.º 1
0
  /**
   * Parses a bracketed character class, starting at the opening '[' and consuming input up to
   * and including the closing ']'.
   *
   * <p>A leading '^' produces a {@code NegativeCharacterClassNode}; otherwise a plain
   * {@code CharacterClassNode} is built. Members may be literal characters, backslash escapes,
   * or ranges written as {@code first-last}. When the pattern is case insensitive, the
   * opposite-case form of every literal and of every range member is added as well.
   *
   * <p>{@code __inCharacterClass} is set for the duration of a successful parse and cleared
   * before returning.
   *
   * @return the node describing the parsed character class
   * @throws MalformedPatternException if a range ends with a class escape such as \d, or if
   *     the end of a range is smaller than its start
   */
  private SyntaxNode __characterClass() throws MalformedPatternException {
    char lastToken, token;
    SyntaxNode node;
    CharacterClassNode current;

    __match('[');
    __inCharacterClass = true;

    if (__lookahead == '^') {
      __match('^');
      current = new NegativeCharacterClassNode(__position++);
    } else current = new CharacterClassNode(__position++);

    while (__lookahead != ']' && __lookahead != _END_OF_INPUT) {

      if (__lookahead == '\\') {
        node = __backslashToken();
        // __backslashToken() advances __position for the node it creates; that node is
        // folded into this class rather than kept, so give the position back.
        --__position;

        // __backslashToken() (actually newTokenNode()) does not take care of
        // case insensitivity when __inCharacterClass is true.
        if (node instanceof TokenNode) {
          lastToken = ((TokenNode) node)._token;
          current._addToken(lastToken);
          if (!__caseSensitive) current._addToken(_toggleCase(lastToken));
        } else {
          CharacterClassNode slash;
          slash = (CharacterClassNode) node;
          // This could be made more efficient by manipulating the
          // characterSet elements of the CharacterClassNodes but
          // for the moment, this is more clear.
          for (token = 0; token < LeafNode._NUM_TOKENS; token++) {
            if (slash._matches(token)) current._addToken(token);
          }

          // A byproduct of this act is that when a '-' occurs after
          // a \d, \w, etc. it is not interpreted as a range and no
          // parse exception is thrown.
          // This is considered a feature and not a bug for now.
          continue;
        }
      } else {
        lastToken = __lookahead;
        current._addToken(__lookahead);
        if (!__caseSensitive) current._addToken(_toggleCase(__lookahead));
        __match(__lookahead);
      }

      // In Perl, a - is a token if it occurs at the beginning
      // or end of the character class.  Anywhere else, it indicates
      // a range.
      // A byproduct of this implementation is that if a '-' occurs
      // after the end of a range, it is interpreted as a '-' and no
      // exception is thrown. e.g., the second dash in [a-z-x]
      // This is considered a feature and not a bug for now.
      if (__lookahead == '-') {
        __match('-');
        if (__lookahead == ']') {
          // A trailing dash is a literal '-'.
          current._addToken('-');
          break;
        } else if (__lookahead == '\\') {
          node = __backslashToken();
          --__position;
          if (node instanceof TokenNode) token = ((TokenNode) node)._token;
          else
            throw new MalformedPatternException(
                "Parse error: invalid range specified at position " + __bytesRead);
        } else {
          token = __lookahead;
          __match(__lookahead);
        }

        if (token < lastToken)
          throw new MalformedPatternException(
              "Parse error: invalid range specified at position " + __bytesRead);

        // lastToken itself (and its case counterpart) was already added above.
        current._addTokenRange(lastToken + 1, token);
        if (!__caseSensitive) {
          // Toggle every member individually. Toggling only the two endpoints yields a
          // wrong (possibly inverted or over-wide) range whenever an endpoint is not a
          // letter, e.g. [_-c] or [!-Z].
          for (int member = lastToken + 1; member <= token; member++) {
            current._addToken(_toggleCase((char) member));
          }
        }
      }
    }

    __match(']');
    __inCharacterClass = false;
    return current;
  }
Ejemplo n.º 2
0
  /**
   * Parses a single backslash escape sequence, starting at the backslash, and returns the
   * syntax node representing it.
   *
   * <p>Handled forms:
   *
   * <ul>
   *   <li>{@code \xHH} - a character given by exactly two hexadecimal digits
   *   <li>{@code \cX} - a control character derived from X
   *   <li>{@code \0} - the null character
   *   <li>a backslash followed by two or three digits - an octal character code
   *   <li>a backslash followed by a single non-zero digit - that digit as a literal token
   *   <li>{@code \b} - backspace (always; word boundaries are not supported here)
   *   <li>{@code \n \r \t \f} - the usual control characters
   *   <li>{@code \d \D \w \W \s \S} - predefined character classes
   *   <li>anything else - the escaped character itself
   * </ul>
   *
   * <p>Every path consumes the escape from the input and advances {@code __position} by one.
   *
   * @return a {@code TokenNode} (or whatever {@code _newTokenNode} yields) for single
   *     characters, or a {@code CharacterClassNode} / {@code NegativeCharacterClassNode} for
   *     the predefined classes
   * @throws MalformedPatternException if a multi-digit numeric escape contains the digits 8
   *     or 9 and therefore is not a valid octal number, or if a called helper rejects the
   *     input
   */
  private SyntaxNode __backslashToken() throws MalformedPatternException {
    SyntaxNode current;
    char token;

    __match('\\');

    if (__lookahead == 'x') {
      __match('x');
      // Parse a hexadecimal number
      current = _newTokenNode((char) __parseUnsignedInteger(16, 2, 2), __position++);
    } else if (__lookahead == 'c') {
      __match('c');
      // Create a control character
      token = Character.toUpperCase(__lookahead);
      token = (char) (token > 63 ? token - 64 : token + 64);
      current = new TokenNode(token, __position++);
      __match(__lookahead);
    } else if (__lookahead >= '0' && __lookahead <= '9') {
      // Peek one character past the first digit to tell single-digit and
      // multi-digit escapes apart; the digit is put back in either case.
      __match(__lookahead);

      if (__lookahead >= '0' && __lookahead <= '9') {
        // We have an octal character or a multi-digit backreference.
        // Assume octal character for now.
        __putback();
        int digits = __parseUnsignedInteger(10, 2, 3);
        int number;
        try {
          // The digits were read as a decimal number; reinterpret them as octal.
          number = Integer.parseInt(Integer.toString(digits), 8);
        } catch (NumberFormatException e) {
          // Digits 8 and 9 are not octal. Report this as a pattern error instead of
          // letting an unchecked exception escape from the compiler.
          throw new MalformedPatternException(
              "Parse error: invalid octal digit in escape sequence \\" + digits);
        }
        current = _newTokenNode((char) number, __position++);
      } else {
        // We have either \0, an escaped digit, or a backreference.
        __putback();
        if (__lookahead == '0') {
          // \0 matches the null character
          __match('0');
          current = new TokenNode('\0', __position++);
        } else {
          // Either an escaped digit or backreference; treated as the literal digit.
          current = _newTokenNode(__lookahead, __position++);
          __match(__lookahead);
        }
      }
    } else if (__lookahead == 'b') {
      // Inside of a character class the \b means backspace, otherwise
      // it means a word boundary
      // if(__inCharacterClass)
      // \b always means backspace
      current = new TokenNode('\b', __position++);
      /*
           else
      current = new TokenNode((char)LeafNode._WORD_BOUNDARY_MARKER_TOKEN,
      			position++);
      			*/
      __match('b');
    } /*else if(__lookahead == 'B' && !__inCharacterClass){
        current = new TokenNode((char)LeafNode._NONWORD_BOUNDARY_MARKER_TOKEN,
           position++);
        __match('B');
      } */ else {
      CharacterClassNode characterSet;
      token = __lookahead;

      // First translate the control-character escapes to the characters they denote.
      switch (__lookahead) {
        case 'n':
          token = '\n';
          break;
        case 'r':
          token = '\r';
          break;
        case 't':
          token = '\t';
          break;
        case 'f':
          token = '\f';
          break;
      }

      // Then expand the predefined classes; everything else is a literal token.
      switch (token) {
        case 'd':
          characterSet = new CharacterClassNode(__position++);
          characterSet._addTokenRange('0', '9');
          current = characterSet;
          break;
        case 'D':
          characterSet = new NegativeCharacterClassNode(__position++);
          characterSet._addTokenRange('0', '9');
          current = characterSet;
          break;
        case 'w':
          characterSet = new CharacterClassNode(__position++);
          characterSet._addTokenRange('0', '9');
          characterSet._addTokenRange('a', 'z');
          characterSet._addTokenRange('A', 'Z');
          characterSet._addToken('_');
          current = characterSet;
          break;
        case 'W':
          characterSet = new NegativeCharacterClassNode(__position++);
          characterSet._addTokenRange('0', '9');
          characterSet._addTokenRange('a', 'z');
          characterSet._addTokenRange('A', 'Z');
          characterSet._addToken('_');
          current = characterSet;
          break;
        case 's':
          characterSet = new CharacterClassNode(__position++);
          characterSet._addToken(' ');
          characterSet._addToken('\f');
          characterSet._addToken('\n');
          characterSet._addToken('\r');
          characterSet._addToken('\t');
          current = characterSet;
          break;
        case 'S':
          characterSet = new NegativeCharacterClassNode(__position++);
          characterSet._addToken(' ');
          characterSet._addToken('\f');
          characterSet._addToken('\n');
          characterSet._addToken('\r');
          characterSet._addToken('\t');
          current = characterSet;
          break;
        default:
          current = _newTokenNode(token, __position++);
          break;
      }

      // Consume the character that followed the backslash.
      __match(__lookahead);
    }

    return current;
  }