Beispiel #1
0
 /**
  * Handles an escaped octal digit inside a character class (invoked by {@code fetchTokenInCC}
  * for the escaped characters {@code '0'} through {@code '7'}).
  *
  * <p>Does nothing unless {@code syntax.opEscOctal3()} is set. Otherwise it calls
  * {@code unfetch()}, scans up to three octal digits with {@code scanUnsignedOctalNumber(3)}
  * and stores the result in {@code token} as a {@code RAW_BYTE}. If the scan did not move
  * {@code p}, the value 0 is stored instead; that situation is not treated as an error.
  *
  * @throws ValueException with {@code ERR_TOO_BIG_NUMBER} if the scanner reports a negative value
  */
 private void fetchTokenInCCFor_digit() {
   if (!syntax.opEscOctal3()) {
     return;
   }

   // NOTE(review): presumably steps back onto the digit the caller already fetched, so that
   // it becomes part of the scanned number -- confirm against unfetch().
   unfetch();
   final int scanStart = p;
   final int scanned = scanUnsignedOctalNumber(3);
   if (scanned < 0) {
     throw new ValueException(ERR_TOO_BIG_NUMBER);
   }

   token.type = TokenType.RAW_BYTE;
   // Nothing consumed by the scan: fall back to 0 rather than failing.
   token.setC(p == scanStart ? 0 : scanned);
 }
Beispiel #2
0
 /**
  * Handles an escaped {@code '0'} outside a character class (invoked by {@code fetchToken}).
  *
  * <p>When {@code syntax.opEscOctal3()} is set, further octal digits are scanned (at most two
  * if the current character {@code c} is {@code '0'}, otherwise at most three) and the result
  * is stored in {@code token} as a {@code RAW_BYTE}. If the scan did not move {@code p}, the
  * value 0 is stored; that situation is not treated as an error.
  *
  * <p>When the option is not set, the only effect is a call to {@code inc()} if {@code c} is
  * not {@code '0'}.
  *
  * @throws ValueException with {@code ERR_TOO_BIG_NUMBER} if the scanner reports a negative value
  */
 private void fetchTokenFor_zero() {
   if (!syntax.opEscOctal3()) {
     if (c != '0') {
       inc();
     }
     return;
   }

   final int scanStart = p;
   final int maxDigits;
   if (c == '0') {
     maxDigits = 2;
   } else {
     maxDigits = 3;
   }

   final int scanned = scanUnsignedOctalNumber(maxDigits);
   if (scanned < 0) {
     throw new ValueException(ERR_TOO_BIG_NUMBER);
   }

   token.type = TokenType.RAW_BYTE;
   // Nothing consumed by the scan: fall back to 0 rather than failing.
   token.setC(p == scanStart ? 0 : scanned);
 }
Beispiel #3
0
  /**
   * Handles what follows an escaped {@code 'x'} inside a character class (invoked by
   * {@code fetchTokenInCC}).
   *
   * <p>Two forms are recognised, tried in this order:
   * <ul>
   *   <li>a brace form, when the next character is <code>'{'</code> and
   *       {@code syntax.opEscXBraceHex8()} is set: up to eight hex digits are scanned and, if
   *       at least one character was consumed after the brace and a <code>'}'</code> follows,
   *       the token becomes a {@code CODE_POINT}; otherwise {@code p} is restored and the
   *       token is left as the caller set it;</li>
   *   <li>a short form, when {@code syntax.opEscXHex2()} is set: up to two hex digits are
   *       scanned and the token becomes a {@code RAW_BYTE} (value 0 if nothing was read).</li>
   * </ul>
   * With no input left, or with neither form applicable, the method changes nothing.
   *
   * @throws ValueException with {@code ERR_TOO_BIG_WIDE_CHAR_VALUE} or
   *     {@code ERR_TOO_BIG_NUMBER} if the scanner reports a negative value, or with
   *     {@code ERR_TOO_LONG_WIDE_CHAR_VALUE} if another hex digit follows the scanned
   *     brace-form digits
   */
  private void fetchTokenInCCFor_x() {
    if (!left()) {
      return;
    }
    final int startPos = p;

    if (peekIs('{') && syntax.opEscXBraceHex8()) {
      inc();
      final int codePoint = scanUnsignedHexadecimalNumber(8);
      if (codePoint < 0) {
        throw new ValueException(ERR_TOO_BIG_WIDE_CHAR_VALUE);
      }
      // A hex digit still pending after the scan means the literal has too many digits.
      if (left() && EncodingHelper.isXDigit(peek())) {
        throw new ValueException(ERR_TOO_LONG_WIDE_CHAR_VALUE);
      }

      final boolean properlyClosed = p > startPos + 1 && left() && peekIs('}');
      if (!properlyClosed) {
        // Nothing was read, or the format is invalid: rewind and leave the token untouched.
        p = startPos;
        return;
      }

      inc();
      token.type = TokenType.CODE_POINT;
      token.setCode(codePoint);
      return;
    }

    if (!syntax.opEscXHex2()) {
      return;
    }

    final int scanned = scanUnsignedHexadecimalNumber(2);
    if (scanned < 0) {
      throw new ValueException(ERR_TOO_BIG_NUMBER);
    }
    token.type = TokenType.RAW_BYTE;
    // Nothing consumed by the scan: fall back to 0 rather than failing.
    token.setC(p == startPos ? 0 : scanned);
  }
Beispiel #4
0
  /**
   * Reads the next token outside a character class and records it in {@code token}.
   *
   * <p>If no input is left, the token type is set to {@code EOT}. Otherwise one character is
   * fetched and the token starts out as {@code STRING}; the branches below refine the type
   * depending on the character and on which operators the active {@code syntax} enables.
   *
   * <p>Two main paths exist:
   * <ul>
   *   <li><b>Escaped</b>: the character equals {@code syntax.metaCharTable.esc} and
   *       {@code syntax.op2IneffectiveEscape()} is false. A second character is fetched,
   *       {@code token.escaped} is set to {@code true}, and the character is dispatched
   *       through the first {@code switch}.</li>
   *   <li><b>Unescaped</b>: {@code token.escaped} is set to {@code false}. If variable meta
   *       characters are in effect, {@code fetchTokenFor_metaChars()} handles the character;
   *       otherwise it is dispatched through the second {@code switch}.</li>
   * </ul>
   *
   * <p>Input that produces no token restarts the loop via {@code continue start}: a
   * {@code (?#...)} comment group, a {@code #} comment when the extend option is set, and
   * whitespace when the extend option is set.
   *
   * @return the type stored in {@code token.type}
   * @throws SyntaxException with {@code ERR_END_PATTERN_AT_ESCAPE} if the input ends right
   *     after the escape character, or with {@code ERR_END_PATTERN_IN_GROUP} if the input ends
   *     inside a {@code (?#...)} comment group
   */
  protected final TokenType fetchToken() {
    // mark(); // out
    // Label targeted by the "continue start" statements below (stands in for a goto).
    start:
    while (true) {
      if (!left()) {
        token.type = TokenType.EOT;
        return token.type;
      }

      // Default classification; refined by the branches below.
      token.type = TokenType.STRING;
      token.backP = p;

      fetch();

      if (c == syntax.metaCharTable.esc
          && !syntax.op2IneffectiveEscape()) { // IS_MC_ESC_CODE(code, syn)
        if (!left()) {
          throw new SyntaxException(ERR_END_PATTERN_AT_ESCAPE);
        }

        // Re-anchor backP at the current position, i.e. after the escape character was fetched.
        token.backP = p;
        fetch();

        token.setC(c);
        token.escaped = true;
        // Each case only takes effect when the matching syntax option is enabled; otherwise
        // the token keeps the STRING type with the escaped character as its value.
        switch (c) {
          case '*':
            if (syntax.opEscAsteriskZeroInf()) {
              fetchTokenFor_repeat(0, QuantifierNode.REPEAT_INFINITE);
            }
            break;
          case '+':
            if (syntax.opEscPlusOneInf()) {
              fetchTokenFor_repeat(1, QuantifierNode.REPEAT_INFINITE);
            }
            break;
          case '?':
            if (syntax.opEscQMarkZeroOne()) {
              fetchTokenFor_repeat(0, 1);
            }
            break;
          case '{':
            if (syntax.opEscBraceInterval()) {
              fetchTokenFor_openBrace();
            }
            break;
          case '|':
            if (syntax.opEscVBarAlt()) {
              token.type = TokenType.ALT;
            }
            break;
          case '(':
            if (syntax.opEscLParenSubexp()) {
              token.type = TokenType.SUBEXP_OPEN;
            }
            break;
          case ')':
            if (syntax.opEscLParenSubexp()) {
              token.type = TokenType.SUBEXP_CLOSE;
            }
            break;
          case 'w':
            if (syntax.opEscWWord()) {
              fetchTokenInCCFor_charType(
                  false, Config.NON_UNICODE_SDW ? CharacterType.W : CharacterType.WORD);
            }
            break;
          case 'W':
            if (syntax.opEscWWord()) {
              fetchTokenInCCFor_charType(
                  true, Config.NON_UNICODE_SDW ? CharacterType.W : CharacterType.WORD);
            }
            break;
          case 'b':
            if (syntax.opEscBWordBound()) {
              fetchTokenFor_anchor(AnchorType.WORD_BOUND);
            }
            break;
          case 'B':
            if (syntax.opEscBWordBound()) {
              fetchTokenFor_anchor(AnchorType.NOT_WORD_BOUND);
            }
            break;
          case '<':
            if (Config.USE_WORD_BEGIN_END && syntax.opEscLtGtWordBeginEnd()) {
              fetchTokenFor_anchor(AnchorType.WORD_BEGIN);
            }
            break;
          case '>':
            if (Config.USE_WORD_BEGIN_END && syntax.opEscLtGtWordBeginEnd()) {
              fetchTokenFor_anchor(AnchorType.WORD_END);
            }
            break;
          case 's':
            if (syntax.opEscSWhiteSpace()) {
              fetchTokenInCCFor_charType(
                  false, Config.NON_UNICODE_SDW ? CharacterType.S : CharacterType.SPACE);
            }
            break;
          case 'S':
            if (syntax.opEscSWhiteSpace()) {
              fetchTokenInCCFor_charType(
                  true, Config.NON_UNICODE_SDW ? CharacterType.S : CharacterType.SPACE);
            }
            break;
          case 'd':
            if (syntax.opEscDDigit()) {
              fetchTokenInCCFor_charType(
                  false, Config.NON_UNICODE_SDW ? CharacterType.D : CharacterType.DIGIT);
            }
            break;
          case 'D':
            if (syntax.opEscDDigit()) {
              fetchTokenInCCFor_charType(
                  true, Config.NON_UNICODE_SDW ? CharacterType.D : CharacterType.DIGIT);
            }
            break;
          case 'h':
            if (syntax.op2EscHXDigit()) {
              fetchTokenInCCFor_charType(false, CharacterType.XDIGIT);
            }
            break;
          case 'H':
            if (syntax.op2EscHXDigit()) {
              fetchTokenInCCFor_charType(true, CharacterType.XDIGIT);
            }
            break;
          case 'A':
            if (syntax.opEscAZBufAnchor()) {
              fetchTokenFor_anchor(AnchorType.BEGIN_BUF);
            }
            break;
          case 'Z':
            if (syntax.opEscAZBufAnchor()) {
              fetchTokenFor_anchor(AnchorType.SEMI_END_BUF);
            }
            break;
          case 'z':
            if (syntax.opEscAZBufAnchor()) {
              fetchTokenFor_anchor(AnchorType.END_BUF);
            }
            break;
          case 'G':
            if (syntax.opEscCapitalGBeginAnchor()) {
              fetchTokenFor_anchor(AnchorType.BEGIN_POSITION);
            }
            break;
          case '`':
            if (syntax.op2EscGnuBufAnchor()) {
              fetchTokenFor_anchor(AnchorType.BEGIN_BUF);
            }
            break;
          case '\'':
            if (syntax.op2EscGnuBufAnchor()) {
              fetchTokenFor_anchor(AnchorType.END_BUF);
            }
            break;
          // The x, u, digit and zero helpers are called unconditionally; any syntax checks
          // happen inside them.
          case 'x':
            fetchTokenFor_xBrace();
            break;
          case 'u':
            fetchTokenFor_uHex();
            break;
          case '1':
          case '2':
          case '3':
          case '4':
          case '5':
          case '6':
          case '7':
          case '8':
          case '9':
            fetchTokenFor_digit();
            break;
          case '0':
            fetchTokenFor_zero();
            break;

          default:
            // Any other escaped character: let fetchEscapedValue() interpret it and compare
            // the result with the raw character stored in the token above.
            unfetch();
            final int num = fetchEscapedValue();

            /* set_raw: */
            if (token.getC() != num) {
              // The escape produced a different value: report it as a code point.
              token.type = TokenType.CODE_POINT;
              token.setCode(num);
            } else {
                /* string: the value is unchanged, so the token stays a STRING; reposition p */
                /* to one past backP (backP was set just before the escaped char was fetched) */
              p = token.backP + 1;
            }
            break;
        } // switch (c)

      } else {
        token.setC(c);
        token.escaped = false;

        // With variable meta characters in effect the helper classifies the character, and
        // the break leaves the while loop, so the fixed switch below is skipped.
        if (Config.USE_VARIABLE_META_CHARS
            && (c != MetaChar.INEFFECTIVE_META_CHAR && syntax.opVariableMetaCharacters())) {
          fetchTokenFor_metaChars();
          break;
        }

        {
          switch (c) {
            case '.':
              if (syntax.opDotAnyChar()) {
                token.type = TokenType.ANYCHAR;
              }
              break;
            case '*':
              if (syntax.opAsteriskZeroInf()) {
                fetchTokenFor_repeat(0, QuantifierNode.REPEAT_INFINITE);
              }
              break;
            case '+':
              if (syntax.opPlusOneInf()) {
                fetchTokenFor_repeat(1, QuantifierNode.REPEAT_INFINITE);
              }
              break;
            case '?':
              if (syntax.opQMarkZeroOne()) {
                fetchTokenFor_repeat(0, 1);
              }
              break;
            case '{':
              if (syntax.opBraceInterval()) {
                fetchTokenFor_openBrace();
              }
              break;
            case '|':
              if (syntax.opVBarAlt()) {
                token.type = TokenType.ALT;
              }
              break;

            case '(':
              if (peekIs('?') && syntax.op2QMarkGroupEffect()) {
                inc();
                if (peekIs('#')) {
                  // "(?#" opens a comment group: consume input up to the first ')' that is
                  // not preceded by the escape character, then restart without a token.
                  fetch();
                  while (true) {
                    if (!left()) {
                      throw new SyntaxException(ERR_END_PATTERN_IN_GROUP);
                    }
                    fetch();
                    if (c == syntax.metaCharTable.esc) {
                      // Skip the character after an escape so an escaped ')' does not end
                      // the comment.
                      if (left()) {
                        fetch();
                      }
                    } else {
                      if (c == ')') {
                        break;
                      }
                    }
                  }
                  continue start; // goto start
                }
                // NOTE(review): not a comment group; presumably this undoes the inc() over
                // '?' so the '?' is seen again later -- confirm against unfetch().
                unfetch();
              }

              if (syntax.opLParenSubexp()) {
                token.type = TokenType.SUBEXP_OPEN;
              }
              break;
            case ')':
              if (syntax.opLParenSubexp()) {
                token.type = TokenType.SUBEXP_CLOSE;
              }
              break;
            case '^':
              if (syntax.opLineAnchor()) {
                fetchTokenFor_anchor(
                    isSingleline(env.option) ? AnchorType.BEGIN_BUF : AnchorType.BEGIN_LINE);
              }
              break;
            case '$':
              if (syntax.opLineAnchor()) {
                fetchTokenFor_anchor(
                    isSingleline(env.option) ? AnchorType.END_BUF : AnchorType.END_LINE);
              }
              break;
            case '[':
              if (syntax.opBracketCC()) {
                token.type = TokenType.CC_CC_OPEN;
              }
              break;
            case ']':
              // An unescaped ']' is left as a plain STRING token here. The two lines below
              // are disabled code in C syntax, kept for reference:
              // if (*src > env->pattern)   /* /].../ is allowed. */
              // CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]");
              break;
            case '#':
              if (Option.isExtend(env.option)) {
                // Extend option: '#' starts a comment that runs to the next newline (or to
                // the end of input); skip it and restart without a token.
                while (left()) {
                  fetch();
                  if (EncodingHelper.isNewLine(c)) {
                    break;
                  }
                }
                continue start; // goto start
              }
              break;

            case ' ':
            case '\t':
            case '\n':
            case '\r':
            case '\f':
              // Extend option: whitespace produces no token; restart the loop.
              if (Option.isExtend(env.option)) {
                continue start; // goto start
              }
              break;

            default: // string
              break;
          } // switch
        }
      }

      // A token has been produced; leave the loop.
      break;
    } // while
    return token.type;
  }
Beispiel #5
0
  /**
   * Reads the next token while inside a character class and records it in {@code token}.
   *
   * <p>With no input left the token type becomes {@code EOT}. Otherwise one character is
   * fetched and the token starts out as an unescaped {@code CHAR}. The character is then
   * tested, in this order:
   * <ol>
   *   <li>{@code ']'} makes the token {@code CC_CLOSE};</li>
   *   <li>{@code '-'} makes the token {@code CC_RANGE};</li>
   *   <li>the escape character ({@code syntax.metaCharTable.esc}) starts an escape sequence,
   *       but only when {@code syntax.backSlashEscapeInCC()} is set; otherwise the token
   *       stays a plain {@code CHAR};</li>
   *   <li>{@code '&'} is delegated to {@code fetchTokenInCCFor_and()}.</li>
   * </ol>
   *
   * @return the type stored in {@code token.type}
   * @throws SyntaxException with {@code ERR_END_PATTERN_AT_ESCAPE} if the input ends right
   *     after the escape character
   */
  protected final TokenType fetchTokenInCC() {
    if (!left()) {
      token.type = TokenType.EOT;
      return token.type;
    }

    fetch();
    token.type = TokenType.CHAR;
    token.setC(c);
    token.escaped = false;

    if (c == ']') {
      token.type = TokenType.CC_CLOSE;
      return token.type;
    }
    if (c == '-') {
      token.type = TokenType.CC_RANGE;
      return token.type;
    }
    // The escape test must come before the '&' test so that the precedence between the two
    // stays as it always was, whatever the escape character is configured to be.
    if (c != syntax.metaCharTable.esc) {
      if (c == '&') {
        fetchTokenInCCFor_and();
      }
      return token.type;
    }

    // From here on the fetched character is the escape character.
    if (!syntax.backSlashEscapeInCC()) {
      return token.type;
    }
    if (!left()) {
      throw new SyntaxException(ERR_END_PATTERN_AT_ESCAPE);
    }

    fetch();
    token.escaped = true;
    token.setC(c);

    // For the class escapes, the upper-case letter selects the negated class; the first
    // argument is therefore true exactly when c is the upper-case variant.
    switch (c) {
      case 'w':
      case 'W':
        fetchTokenInCCFor_charType(
            c == 'W', Config.NON_UNICODE_SDW ? CharacterType.W : CharacterType.WORD);
        break;
      case 'd':
      case 'D':
        fetchTokenInCCFor_charType(
            c == 'D', Config.NON_UNICODE_SDW ? CharacterType.D : CharacterType.DIGIT);
        break;
      case 's':
      case 'S':
        fetchTokenInCCFor_charType(
            c == 'S', Config.NON_UNICODE_SDW ? CharacterType.S : CharacterType.SPACE);
        break;
      case 'h':
      case 'H':
        if (syntax.op2EscHXDigit()) {
          fetchTokenInCCFor_charType(c == 'H', CharacterType.XDIGIT);
        }
        break;
      case 'x':
        fetchTokenInCCFor_x();
        break;
      case 'u':
        fetchTokenInCCFor_u();
        break;
      case '0':
      case '1':
      case '2':
      case '3':
      case '4':
      case '5':
      case '6':
      case '7':
        fetchTokenInCCFor_digit();
        break;

      default:
        // Any other escaped character: let fetchEscapedValue() interpret it; only when the
        // result differs from the raw character does the token become a code point.
        unfetch();
        final int escapedValue = fetchEscapedValue();
        if (token.getC() != escapedValue) {
          token.setCode(escapedValue);
          token.type = TokenType.CODE_POINT;
        }
        break;
    } // switch

    return token.type;
  }