public Token(int tokenClass, String lexeme) { // InputBuffer's escapeString handles all escapes in a string, including u's // RegExp literals should not escape the u's either if (tokenClass != STRINGLITERAL_TOKEN && lexeme.indexOf("\\u") != -1) // contains a unicode escape char? { StringBuffer buffer = new StringBuffer(); int len = lexeme.length(); for (int x = 0; x < len; x++) { if (x + 2 < len && lexeme.charAt(x) == '\\' && lexeme.charAt(x + 1) == 'u') { if (x != 0 && lexeme.charAt(x - 1) == '\\') // watch out for '\\u' { buffer.append(lexeme.charAt(x)); continue; } int thisChar = 0; int y, digit; // calculate numeric value, bail if invalid for (y = x + 2; y < x + 6 && y < len; y++) { digit = Character.digit(lexeme.charAt(y), 16); if (digit == -1) break; thisChar = (thisChar << 4) + digit; } if (y != x + 6 || Character.isDefined((char) thisChar) == false) // if there was a problem or the char is invalid just escape the '\''u' // with 'u' { buffer.append(lexeme.charAt(++x)); } else // use Character class to convert unicode codePoint into a char ( note, this will // handle a wider set of unicode codepoints than the c++ impl does). { // jdk 1.5.2 only, but handles extended chars: char[] ca = Character.toChars(thisChar); char c = (char) thisChar; buffer.append(c); x += 5; } } else { buffer.append(lexeme.charAt(x)); } } this.tokenClass = tokenClass; this.lexeme = buffer.toString(); } else { this.tokenClass = tokenClass; this.lexeme = lexeme; } }
/**
 * Scans forward in the input and returns the next token.
 *
 * <p>The scanner is a hand-written state machine. The loop switches on the {@code state}
 * field; each case either returns the value of {@code makeToken(...)} /
 * {@code makeCommentToken(...)} or assigns a new {@code state} and loops again. When
 * {@code debug} is set, the current state and {@code pos()} are printed on every iteration.
 *
 * <p>States handled by this method:
 * <ul>
 *   <li>{@code start_state}: reads one character, calls {@code mark()}, and dispatches.
 *       <ul>
 *         <li>ASCII lowercase letters set {@code maybe_reserved} and fall through to the
 *             uppercase / '_' / '$' cases, which enter {@code A_state}.</li>
 *         <li>Quotes scan a whole string literal in place. A backslash sets
 *             {@code needs_escape} and skips the following character (an escaped CR LF or LF
 *             is skipped as a unit). An unescaped line terminator reports an error and still
 *             yields STRINGLITERAL_TOKEN. A 0 character reports an error and yields
 *             EOS_TOKEN.</li>
 *         <li>Operators and punctuation are resolved with one or two characters of lookahead,
 *             using {@code retract()} to give back what was not consumed.</li>
 *         <li>ASCII whitespace is skipped; CR and LF set {@code isFirstTokenOnLine}.</li>
 *         <li>A 0 character yields EOS_TOKEN.</li>
 *         <li>Anything else is classified through {@code input.nextcharClass}.</li>
 *       </ul>
 *   </li>
 *   <li>{@code A_state}: consumes an identifier.
 *       <ul>
 *         <li>Any character other than an ASCII lowercase letter clears
 *             {@code maybe_reserved}.</li>
 *         <li>A backslash sets {@code needs_escape}.</li>
 *         <li>The text comes from {@code input.copyReplaceUnicodeEscapes(needs_escape)}.</li>
 *         <li>{@code reservedWord} is consulted only while {@code maybe_reserved} is still
 *             true; otherwise, or on a miss, IDENTIFIER_TOKEN is produced.</li>
 *       </ul>
 *   </li>
 *   <li>{@code zero_state}, {@code hexinteger_state}, {@code decimalinteger_state},
 *       {@code decimal_state}, {@code exponentstart_state}, {@code exponent_state}: numeric
 *       literals, returned as NUMBERLITERAL_TOKEN with {@code input.copy()}. A trailing type
 *       suffix letter is kept only when {@code ctx.statics.es4_numerics} is true; otherwise
 *       it is retracted.</li>
 *   <li>{@code dot_state}: one, two or three dots, dot-less-than, or a decimal starting with
 *       a dot.</li>
 *   <li>{@code slash_state}: handles everything that starts with a slash.
 *       <ul>
 *         <li>Line comments and block comments.</li>
 *         <li>Slash-greater-than as XMLTAGENDEND_TOKEN, only when {@code inXML > 0}.</li>
 *         <li>Otherwise division versus regular expression, decided from
 *             {@code currentToken.lookback}: after an identifier, number literal, ')', '}'
 *             or ']' the slash is DIV_TOKEN / DIVASSIGN_TOKEN; in every other case
 *             {@code slashregexp_state} is entered.</li>
 *       </ul>
 *   </li>
 *   <li>{@code slashregexp_state}, {@code regexp_state}: the regexp body up to an unescaped
 *       slash, then the flags g, i, m, s, x. Seen flags are recorded as bits 0x01..0x10 of
 *       {@code regexp_flags}; a repeated flag or any other identifier-part character reports
 *       {@code kError_Lexical_General}.</li>
 *   <li>{@code xmlcommentorcdatastart_state}, {@code xmlcdata_state},
 *       {@code xmlcomment_state}, {@code xmlpi_state}: XML markup, returned as
 *       XMLMARKUP_TOKEN with the text from {@code startofxml} to {@code pos()}.</li>
 *   <li>{@code xmltext_state}, {@code xmlliteral_state}, {@code endxmlname_state}: XML text
 *       and literal scanning, tracking nesting in {@code level}.</li>
 *   <li>{@code blockcommentstart_state}, {@code blockcomment_state},
 *       {@code doccomment_state}, {@code doccommentstar_state},
 *       {@code doccommenttag_state}: entered only when {@code save_comments} is true.
 *       <ul>
 *         <li>Raw comment characters are appended to {@code blockcommentbuf}.</li>
 *         <li>Doc comment text is appended to {@code doctextbuf}, obtained lazily from
 *             {@code getDocTextBuffer(doctagname)}. Each at-sign tag closes the current
 *             element and, once the tag name is complete, opens a new CDATA-wrapped element
 *             named after the tag.</li>
 *       </ul>
 *   </li>
 *   <li>{@code error_state}: reports {@code kError_Lexical_General}, calls
 *       {@code skiperror()} and resumes in {@code start_state}.</li>
 *   <li>any other value: reports "invalid scanner state" and returns EOS_TOKEN.</li>
 * </ul>
 *
 * <p>{@code startofxml} is captured from {@code pos()} when the method is entered and is the
 * start offset for every XML substring extracted during this call.
 *
 * <p>NOTE(review): the 0xffef case compares {@code nextchar()} with 0xffffffbb and
 * 0xffffffbf. {@code c} is a {@code char} assigned from {@code nextchar()}, so if that
 * method returns a {@code char} these comparisons can never be true and the branch always
 * ends in {@code error_state} - confirm.
 *
 * <p>NOTE(review): when {@code save_comments} is false, the block-comment skip loop exits on
 * a 0 character without calling {@code error}, whereas {@code blockcomment_state} reports
 * {@code kError_BlockCommentNotTerminated} - confirm whether this is intended.
 *
 * <p>NOTE(review): no transition in this method leads from {@code start_state} into
 * {@code xmltext_state}, {@code xmlliteral_state} / {@code endxmlname_state} or
 * {@code doccommentvalue_state}; presumably {@code state} is set from outside for those -
 * verify against callers.
 *
 * @param resetState when true, {@code isFirstTokenOnLine} is cleared before scanning starts
 * @return the value returned by {@code makeToken} or {@code makeCommentToken} for the
 *     recognized token
 */
public int nexttoken(boolean resetState) { String doctagname = "description"; StringBuilder doctextbuf = null; int startofxml = pos(); StringBuilder blockcommentbuf = null; char regexp_flags = 0; // used to track option flags encountered in a regexp expression. Initialized in // regexp_state boolean maybe_reserved = false; char c = 0; if (resetState) { isFirstTokenOnLine = false; } while (true) { if (debug) { System.out.println("state = " + state + ", next = " + pos()); } switch (state) { case start_state: { c = nextchar(); mark(); switch (c) { case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z': maybe_reserved = true; case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z': case '_': case '$': state = A_state; continue; case 0xffef: // could not have worked...case 0xffffffef: // ??? not in Character type // range ??? 
if (nextchar() == 0xffffffbb && nextchar() == 0xffffffbf) { // ISSUE: set encoding scheme to utf-8, and implement support for utf8 state = start_state; } else { state = error_state; } continue; case '@': return makeToken(ATSIGN_TOKEN); case '\'': case '\"': { char startquote = (char) c; boolean needs_escape = false; while ((c = nextchar()) != startquote) { if (c == '\\') { needs_escape = true; c = nextchar(); // special case: escaped eol strips crlf or lf if (c == '\r') c = nextchar(); if (c == '\n') continue; } else if (c == '\r' || c == '\n') { if (startquote == '\'') error(kError_Lexical_LineTerminatorInSingleQuotedStringLiteral); else error(kError_Lexical_LineTerminatorInDoubleQuotedStringLiteral); break; } else if (c == 0) { error(kError_Lexical_EndOfStreamInStringLiteral); return makeToken(EOS_TOKEN); } } return makeToken( STRINGLITERAL_TOKEN, input.copyReplaceStringEscapes(needs_escape)); } case '-': // tokens: -- -= - switch (nextchar()) { case '-': return makeToken(MINUSMINUS_TOKEN); case '=': return makeToken(MINUSASSIGN_TOKEN); default: retract(); return makeToken(MINUS_TOKEN); } case '!': // tokens: ! 
!= !=== if (nextchar() == '=') { if (nextchar() == '=') return makeToken(STRICTNOTEQUALS_TOKEN); retract(); return makeToken(NOTEQUALS_TOKEN); } retract(); return makeToken(NOT_TOKEN); case '%': // tokens: % %= switch (nextchar()) { case '=': return makeToken(MODULUSASSIGN_TOKEN); default: retract(); return makeToken(MODULUS_TOKEN); } case '&': // tokens: & &= && &&= c = nextchar(); if (c == '=') return makeToken(BITWISEANDASSIGN_TOKEN); if (c == '&') { if (nextchar() == '=') return makeToken(LOGICALANDASSIGN_TOKEN); retract(); return makeToken(LOGICALAND_TOKEN); } retract(); return makeToken(BITWISEAND_TOKEN); case '#': // # is short for use if (HAS_HASHPRAGMAS) { return makeToken(USE_TOKEN); } state = error_state; continue; case '(': return makeToken(LEFTPAREN_TOKEN); case ')': return makeToken(RIGHTPAREN_TOKEN); case '*': // tokens: *= * if (nextchar() == '=') return makeToken(MULTASSIGN_TOKEN); retract(); return makeToken(MULT_TOKEN); case ',': return makeToken(COMMA_TOKEN); case '.': state = dot_state; continue; case '/': state = slash_state; continue; case ':': // tokens: : :: if (nextchar() == ':') { return makeToken(DOUBLECOLON_TOKEN); } retract(); return makeToken(COLON_TOKEN); case ';': return makeToken(SEMICOLON_TOKEN); case '?': return makeToken(QUESTIONMARK_TOKEN); case '[': return makeToken(LEFTBRACKET_TOKEN); case ']': return makeToken(RIGHTBRACKET_TOKEN); case '^': // tokens: ^= ^ if (nextchar() == '=') return makeToken(BITWISEXORASSIGN_TOKEN); retract(); return makeToken(BITWISEXOR_TOKEN); case '{': return makeToken(LEFTBRACE_TOKEN); case '|': // tokens: | |= || ||= c = nextchar(); if (c == '=') return makeToken(BITWISEORASSIGN_TOKEN); if (c == '|') { if (nextchar() == '=') return makeToken(LOGICALORASSIGN_TOKEN); retract(); return makeToken(LOGICALOR_TOKEN); } retract(); return makeToken(BITWISEOR_TOKEN); case '}': return makeToken(RIGHTBRACE_TOKEN); case '~': return makeToken(BITWISENOT_TOKEN); case '+': // tokens: ++ += + c = nextchar(); if (c 
== '+') return makeToken(PLUSPLUS_TOKEN); if (c == '=') return makeToken(PLUSASSIGN_TOKEN); retract(); return makeToken(PLUS_TOKEN); case '<': switch (nextchar()) { case '<': // tokens: << <<= if (nextchar() == '=') return makeToken(LEFTSHIFTASSIGN_TOKEN); retract(); return makeToken(LEFTSHIFT_TOKEN); case '=': return makeToken(LESSTHANOREQUALS_TOKEN); case '/': return makeToken(XMLTAGSTARTEND_TOKEN); case '!': state = xmlcommentorcdatastart_state; continue; case '?': state = xmlpi_state; continue; } retract(); return makeToken(LESSTHAN_TOKEN); case '=': // tokens: === == = if (nextchar() == '=') { if (nextchar() == '=') return makeToken(STRICTEQUALS_TOKEN); retract(); return makeToken(EQUALS_TOKEN); } retract(); return makeToken(ASSIGN_TOKEN); case '>': // tokens: > >= >> >>= >>> >>>= state = start_state; switch (nextchar()) { case '>': switch (nextchar()) { case '>': if (nextchar() == '=') return makeToken(UNSIGNEDRIGHTSHIFTASSIGN_TOKEN); retract(); return makeToken(UNSIGNEDRIGHTSHIFT_TOKEN); case '=': return makeToken(RIGHTSHIFTASSIGN_TOKEN); default: retract(); return makeToken(RIGHTSHIFT_TOKEN); } case '=': return makeToken(GREATERTHANOREQUALS_TOKEN); } retract(); return makeToken(GREATERTHAN_TOKEN); case '0': state = zero_state; continue; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': state = decimalinteger_state; continue; case ' ': // ascii range white space case '\t': case 0x000b: case 0x000c: case 0x0085: case 0x00a0: continue; case '\n': // ascii line terminators. 
case '\r': isFirstTokenOnLine = true; continue; case 0: return makeToken(EOS_TOKEN); default: switch (input.nextcharClass((char) c, true)) { case Lu: case Ll: case Lt: case Lm: case Lo: case Nl: maybe_reserved = false; state = A_state; continue; case Zs: // unicode whitespace and control-characters case Cc: case Cf: continue; case Zl: // unicode line terminators case Zp: isFirstTokenOnLine = true; continue; default: state = error_state; continue; } } } /* * prefix: <letter> */ case A_state: { boolean needs_escape = c == '\\'; // ??? really should only be true if the word started with a backslash while (true) { c = nextchar(); if (c >= 'a' && c <= 'z') { continue; } if ((c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '$' || c == '_') { maybe_reserved = false; continue; } if (c <= 0x7f) // in ascii range & mostly not a valid char { if (c == '\\') { needs_escape = true; // close enough, we just want to minimize rescans for unicode escapes } else { retract(); break; } } // else outside ascii range (or an escape sequence ) switch (input.nextcharClass(c, false)) { case Lu: case Ll: case Lt: case Lm: case Lo: case Nl: case Mn: case Mc: case Nd: case Pc: maybe_reserved = false; input.nextcharClass(c, true); // advance input cursor continue; } retract(); break; } state = start_state; String s = input.copyReplaceUnicodeEscapes(needs_escape); if (maybe_reserved) { Integer i = reservedWord.get(s); if (i != null) return makeToken((int) i); } return makeToken(IDENTIFIER_TOKEN, s); } /* * prefix: 0 * accepts: 0x... | 0X... | 01... | 0... 
| 0 */ case zero_state: switch (nextchar()) { case 'x': case 'X': switch (nextchar()) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': state = hexinteger_state; break; default: state = start_state; error(kError_Lexical_General); } continue; case '.': state = decimal_state; continue; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': state = decimalinteger_state; continue; case 'E': case 'e': state = exponentstart_state; continue; case 'd': case 'm': case 'i': case 'u': if (!ctx.statics.es4_numerics) retract(); state = start_state; return makeToken(NUMBERLITERAL_TOKEN, input.copy()); default: retract(); state = start_state; return makeToken(NUMBERLITERAL_TOKEN, input.copy()); } /* * prefix: 0x<hex digits> * accepts: 0x123f */ case hexinteger_state: switch (nextchar()) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': state = hexinteger_state; continue; case 'u': case 'i': if (!ctx.statics.es4_numerics) retract(); state = start_state; return makeToken(NUMBERLITERAL_TOKEN, input.copy()); default: retract(); state = start_state; return makeToken(NUMBERLITERAL_TOKEN, input.copy()); } /* * prefix: . * accepts: .123 | . 
*/ case dot_state: switch (nextchar()) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': state = decimal_state; continue; case '.': state = start_state; if (nextchar() == '.') return makeToken(TRIPLEDOT_TOKEN); retract(); return makeToken(DOUBLEDOT_TOKEN); case '<': state = start_state; return makeToken(DOTLESSTHAN_TOKEN); default: retract(); state = start_state; return makeToken(DOT_TOKEN); } /* * prefix: N * accepts: 0.123 | 1.23 | 123 | 1e23 | 1e-23 */ case decimalinteger_state: switch (nextchar()) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': state = decimalinteger_state; continue; case '.': state = decimal_state; continue; case 'd': case 'm': case 'u': case 'i': if (!ctx.statics.es4_numerics) retract(); state = start_state; return makeToken(NUMBERLITERAL_TOKEN, input.copy()); case 'E': case 'e': state = exponentstart_state; continue; default: retract(); state = start_state; return makeToken(NUMBERLITERAL_TOKEN, input.copy()); } /* * prefix: N. * accepts: 0.1 | 1e23 | 1e-23 */ case decimal_state: switch (nextchar()) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': state = decimal_state; continue; case 'd': case 'm': if (!ctx.statics.es4_numerics) retract(); state = start_state; return makeToken(NUMBERLITERAL_TOKEN, input.copy()); case 'E': case 'e': state = exponentstart_state; continue; default: retract(); state = start_state; return makeToken(NUMBERLITERAL_TOKEN, input.copy()); } /* * prefix: ..e * accepts: ..eN | ..e+N | ..e-N */ case exponentstart_state: switch (nextchar()) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case '+': case '-': state = exponent_state; continue; default: error(kError_Lexical_General); state = start_state; continue; // Issue: needs specific error here. 
} /* * prefix: ..e * accepts: ..eN | ..e+N | ..e-N */ case exponent_state: switch (nextchar()) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': state = exponent_state; continue; case 'd': case 'm': if (!ctx.statics.es4_numerics) retract(); state = start_state; return makeToken(NUMBERLITERAL_TOKEN, input.copy()); default: retract(); state = start_state; return makeToken(NUMBERLITERAL_TOKEN, input.copy()); } /* * prefix: / */ case slash_state: { c = nextchar(); switch (c) { case '/': // line comment state = start_state; line_comment: while ((c = nextchar()) != 0) { if (c == '\r' || c == '\n') { isFirstTokenOnLine = true; if (save_comments == false) { break line_comment; } retract(); // don't include newline in line comment. (Sec 7.3) return makeCommentToken( SLASHSLASHCOMMENT_TOKEN, input.copyReplaceUnicodeEscapes()); } } continue; case '*': if (save_comments == false) { block_comment: while ((c = nextchar()) != 0) { if (c == '\r' || c == '\n') isFirstTokenOnLine = true; if (c == '*') { c = nextchar(); if (c == '/') { break block_comment; } retract(); } } state = start_state; } else { if (blockcommentbuf == null) blockcommentbuf = new StringBuilder(); blockcommentbuf.append("/*"); state = blockcommentstart_state; } continue; case '>': if (inXML > 0) // ignore this if outside an XML context { state = start_state; return makeToken(XMLTAGENDEND_TOKEN); } // FALL THROUGH default: // If the last token read is any of these, then the '/' must start a div or // div_assign... 
int lb = currentToken.lookback; if (lb == IDENTIFIER_TOKEN || lb == NUMBERLITERAL_TOKEN || lb == RIGHTPAREN_TOKEN || lb == RIGHTBRACE_TOKEN || lb == RIGHTBRACKET_TOKEN) { /* * tokens: /= / */ state = start_state; if (c == '=') return makeToken(DIVASSIGN_TOKEN); retract(); return makeToken(DIV_TOKEN); } state = slashregexp_state; retract(); continue; } } /* * tokens: /<regexpbody>/<regexpflags> */ case slashregexp_state: switch (nextchar()) { case '\\': nextchar(); continue; case '/': regexp_flags = 0; state = regexp_state; continue; case 0: case '\n': case '\r': error(kError_Lexical_General); state = start_state; continue; default: state = slashregexp_state; continue; } /* * tokens: g | i | m | s | x . Note that s and x are custom extensions to match perl's functionality * Also note we handle this via an array of boolean flags instead of state change logic. * (5,1) + (5,2) + (5,3) + (5,4) + (5,5) is just too many states to handle this via state logic */ case regexp_state: c = nextchar(); switch (c) { case 'g': if ((regexp_flags & 0x01) == 0) { regexp_flags |= 0x01; continue; } error(kError_Lexical_General); state = start_state; continue; case 'i': if ((regexp_flags & 0x02) == 0) { regexp_flags |= 0x02; continue; } error(kError_Lexical_General); state = start_state; continue; case 'm': if ((regexp_flags & 0x04) == 0) { regexp_flags |= 0x04; continue; } error(kError_Lexical_General); state = start_state; continue; case 's': if ((regexp_flags & 0x08) == 0) { regexp_flags |= 0x08; continue; } error(kError_Lexical_General); state = start_state; continue; case 'x': if ((regexp_flags & 0x10) == 0) { regexp_flags |= 0x10; continue; } error(kError_Lexical_General); state = start_state; continue; default: if (Character.isJavaIdentifierPart(c)) { error(kError_Lexical_General); state = start_state; continue; } retract(); state = start_state; return makeToken(REGEXPLITERAL_TOKEN, input.copyReplaceUnicodeEscapes()); } /* * prefix: <! 
*/ case xmlcommentorcdatastart_state: switch (nextchar()) { case '[': if (nextchar() == 'C' && nextchar() == 'D' && nextchar() == 'A' && nextchar() == 'T' && nextchar() == 'A' && nextchar() == '[') { state = xmlcdata_state; continue; } break; // error case '-': if (nextchar() == '-') { state = xmlcomment_state; continue; } } error(kError_Lexical_General); state = start_state; continue; case xmlcdata_state: switch (nextchar()) { case ']': if (nextchar() == ']' && nextchar() == '>') { state = start_state; return makeToken( XMLMARKUP_TOKEN, input.substringReplaceUnicodeEscapes(startofxml, pos())); } continue; case 0: error(kError_Lexical_General); state = start_state; } continue; case xmlcomment_state: while ((c = nextchar()) != '-' && c != 0) ; if (c == '-' && nextchar() != '-') { continue; } // got -- if next is > ok else error if (nextchar() == '>') { state = start_state; return makeToken( XMLMARKUP_TOKEN, input.substringReplaceUnicodeEscapes(startofxml, pos())); } error(kError_Lexical_General); state = start_state; continue; case xmlpi_state: while ((c = nextchar()) != '?' && c != 0) ; if (c == '?' 
&& nextchar() == '>') { state = start_state; return makeToken( XMLMARKUP_TOKEN, input.substringReplaceUnicodeEscapes(startofxml, pos())); } if (c == 0) { error(kError_Lexical_General); state = start_state; } continue; case xmltext_state: { switch (nextchar()) { case '<': case '{': { retract(); String xmltext = input.substringReplaceUnicodeEscapes(startofxml, pos()); if (xmltext != null) { state = start_state; return makeToken(XMLTEXT_TOKEN, xmltext); } else // if there is no leading text, then just return punctuation token to avoid // empty text tokens { switch (nextchar()) { case '<': switch (nextchar()) { case '/': state = start_state; return makeToken(XMLTAGSTARTEND_TOKEN); case '!': state = xmlcommentorcdatastart_state; continue; case '?': state = xmlpi_state; continue; default: retract(); state = start_state; return makeToken(LESSTHAN_TOKEN); } case '{': state = start_state; return makeToken(LEFTBRACE_TOKEN); } } } case 0: state = start_state; return makeToken(EOS_TOKEN); } continue; } case xmlliteral_state: switch (nextchar()) { case '{': // return makeToken( XMLPART_TOKEN return makeToken( XMLPART_TOKEN, input.substringReplaceUnicodeEscapes(startofxml, pos() - 1)); case '<': if (nextchar() == '/') { --level; nextchar(); mark(); retract(); state = endxmlname_state; } else { ++level; state = xmlliteral_state; } continue; case '/': if (nextchar() == '>') { --level; if (level == 0) { state = start_state; return makeToken( XMLLITERAL_TOKEN, input.substringReplaceUnicodeEscapes(startofxml, pos() + 1)); } } continue; case 0: retract(); error(kError_Lexical_NoMatchingTag); state = start_state; continue; default: continue; } case endxmlname_state: c = nextchar(); if (Character.isJavaIdentifierPart(c) || c == ':') { continue; } switch (c) { case '{': // return makeToken( XMLPART_TOKEN { String xmltext = input.substringReplaceUnicodeEscapes(startofxml, pos() - 1); return makeToken(XMLPART_TOKEN, xmltext); } case '>': retract(); nextchar(); if (level == 0) { String 
xmltext = input.substringReplaceUnicodeEscapes(startofxml, pos() + 1); state = start_state; return makeToken(XMLLITERAL_TOKEN, xmltext); } state = xmlliteral_state; continue; default: state = xmlliteral_state; continue; } /* * prefix: /* */ case blockcommentstart_state: { c = nextchar(); blockcommentbuf.append(c); switch (c) { case '*': if (nextchar() == '/') { state = start_state; return makeCommentToken(BLOCKCOMMENT_TOKEN, new String()); } retract(); state = doccomment_state; continue; case 0: error(kError_BlockCommentNotTerminated); state = start_state; continue; case '\n': case '\r': isFirstTokenOnLine = true; default: state = blockcomment_state; continue; } } /* * prefix: /** */ case doccomment_state: { c = nextchar(); blockcommentbuf.append(c); switch (c) { case '*': state = doccommentstar_state; continue; case '@': if (doctextbuf == null) doctextbuf = getDocTextBuffer(doctagname); if (doctagname.length() > 0) { doctextbuf.append("]]></").append(doctagname).append(">"); } doctagname = ""; state = doccommenttag_state; continue; case '\r': case '\n': isFirstTokenOnLine = true; if (doctextbuf == null) doctextbuf = getDocTextBuffer(doctagname); doctextbuf.append('\n'); continue; case 0: error(kError_BlockCommentNotTerminated); state = start_state; continue; default: if (doctextbuf == null) doctextbuf = getDocTextBuffer(doctagname); doctextbuf.append((char) (c)); continue; } } case doccommentstar_state: { c = nextchar(); blockcommentbuf.append(c); switch (c) { case '/': { if (doctextbuf == null) doctextbuf = getDocTextBuffer(doctagname); if (doctagname.length() > 0) { doctextbuf.append("]]></").append(doctagname).append(">"); } String doctext = doctextbuf.toString(); // ??? does this needs escape conversion ??? 
state = start_state; return makeCommentToken(DOCCOMMENT_TOKEN, doctext); } case '*': continue; case 0: error(kError_BlockCommentNotTerminated); state = start_state; continue; default: state = doccomment_state; continue; // if not a slash, then keep looking for an end comment. } } /* * prefix: @ */ case doccommenttag_state: { c = nextchar(); switch (c) { case '*': state = doccommentstar_state; continue; case ' ': case '\t': case '\r': case '\n': { if (doctextbuf == null) doctextbuf = getDocTextBuffer(doctagname); // skip extra whitespace --fixes bug on tag text parsing // --but really, the problem is not here, it's in whatever reads asdoc output.. // --So if that gets fixed, feel free to delete the following. while ((c = nextchar()) == ' ' || c == '\t') ; retract(); if (doctagname.length() > 0) { doctextbuf.append("\n<").append(doctagname).append("><![CDATA["); } state = doccomment_state; continue; } case 0: error(kError_BlockCommentNotTerminated); state = start_state; continue; default: doctagname += (char) (c); continue; } } /* * prefix: /** */ case doccommentvalue_state: switch (nextchar()) { case '*': state = doccommentstar_state; continue; case '@': state = doccommenttag_state; continue; case 0: error(kError_BlockCommentNotTerminated); state = start_state; continue; default: state = doccomment_state; continue; } /* * prefix: /* */ case blockcomment_state: { c = nextchar(); blockcommentbuf.append(c); switch (c) { case '*': c = nextchar(); if (c == '/') { state = start_state; blockcommentbuf.append(c); String blocktext = blockcommentbuf.toString(); // ??? 
needs escape conversion return makeCommentToken(BLOCKCOMMENT_TOKEN, blocktext); } retract(); break; case '\r': case '\n': isFirstTokenOnLine = true; break; case 0: error(kError_BlockCommentNotTerminated); state = start_state; break; } continue; } /* * skip error */ case error_state: error(kError_Lexical_General); skiperror(); state = start_state; continue; default: error("invalid scanner state"); state = start_state; return makeToken(EOS_TOKEN); } } }