/** * Parse a comment. Parse a remark markup. * * <p>From the <a href="http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.4">HTML 4.01 * Specification, W3C Recommendation 24 December 1999</a> * http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.4 * * <p><cite> 3.2.4 Comments * * <p>HTML comments have the following syntax: * * <p><code> * <!-- this is a comment --><p> * <!-- and so is this one,<p> * which occupies more than one line --><p> * </code> White space is not permitted between the markup declaration open delimiter("<!") and * the comment open delimiter ("--"), but is permitted between the comment close delimiter ("--") * and the markup declaration close delimiter (">"). A common error is to include a string of * hyphens ("---") within a comment. Authors should avoid putting two or more adjacent hyphens * inside comments. Information that appears between comments has no special meaning (e.g., * character references are not interpreted as such). Note that comments are markup. * * <p></cite> * * <p>This method uses a state machine with the following states: * * <ol> * <li>state 0 - prior to the first open delimiter (first dash) * <li>state 1 - prior to the second open delimiter (second dash) * <li>state 2 - prior to the first closing delimiter (first dash) * <li>state 3 - prior to the second closing delimiter (second dash) * <li>state 4 - prior to the terminating > * </ol> * * <p>All comment text (everything excluding the < and >), is included in the remark text. * We allow terminators like --!> even though this isn't part of the spec. * * @param start The position at which to start scanning. * @param quotesmart If <code>true</code>, strings ignore quoted contents. * @return The parsed node. * @exception ParserException If a problem occurs reading from the source. */ protected Node parseRemark(int start, boolean quotesmart) throws ParserException { boolean done; char ch; int state; done = false; state = 0; while (!done) { ch = mPage.getCharacter(mCursor); if (Page.EOF == ch) done = true; else switch (state) { case 0: // prior to the first open delimiter if ('>' == ch) done = true; if ('-' == ch) state = 1; else return (parseString(start, quotesmart)); break; case 1: // prior to the second open delimiter if ('-' == ch) { // handle <!--> because netscape does ch = mPage.getCharacter(mCursor); if (Page.EOF == ch) done = true; else if ('>' == ch) done = true; else { mPage.ungetCharacter(mCursor); state = 2; } } else return (parseString(start, quotesmart)); break; case 2: // prior to the first closing delimiter if ('-' == ch) state = 3; else if (Page.EOF == ch) return (parseString(start, quotesmart)); // no terminator break; case 3: // prior to the second closing delimiter if ('-' == ch) state = 4; else state = 2; break; case 4: // prior to the terminating > if ('>' == ch) done = true; else if (Character.isWhitespace(ch)) { // stay in state 4 } else if (!STRICT_REMARKS && (('-' == ch) || ('!' == ch))) { // stay in state 4 } else // bug #1345049 HTMLParser should not terminate a comment with ---> // should maybe issue a warning mentioning STRICT_REMARKS state = 2; break; default: throw new IllegalStateException("how the f**k did we get in state " + state); } } return (makeRemark(start, mCursor.getPosition())); }
/** * Get the next node from the source. * * @param quotesmart If <code>true</code>, strings ignore quoted contents. * @return A Remark, Text or Tag, or <code>null</code> if no more lexemes are present. * @exception ParserException If there is a problem with the underlying page. */ public Node nextNode(boolean quotesmart) throws ParserException { int start; char ch; Node ret; // debugging suppport if (-1 != mDebugLineTrigger) { Page page = getPage(); int lineno = page.row(mCursor); if (mDebugLineTrigger < lineno) mDebugLineTrigger = lineno + 1; // trigger on next line too } start = mCursor.getPosition(); ch = mPage.getCharacter(mCursor); switch (ch) { case Page.EOF: ret = null; break; case '<': ch = mPage.getCharacter(mCursor); if (Page.EOF == ch) ret = makeString(start, mCursor.getPosition()); else if ('%' == ch) { mPage.ungetCharacter(mCursor); ret = parseJsp(start); } else if ('?' == ch) { mPage.ungetCharacter(mCursor); ret = parsePI(start); } else if ('/' == ch || '%' == ch || Character.isLetter(ch)) { mPage.ungetCharacter(mCursor); ret = parseTag(start); } else if ('!' == ch) { ch = mPage.getCharacter(mCursor); if (Page.EOF == ch) ret = makeString(start, mCursor.getPosition()); else { if ('>' == ch) // handle <!> ret = makeRemark(start, mCursor.getPosition()); else { mPage.ungetCharacter(mCursor); // remark/tag need this char if ('-' == ch) ret = parseRemark(start, quotesmart); else { mPage.ungetCharacter(mCursor); // tag needs prior one too ret = parseTag(start); } } } } else ret = parseString(start, quotesmart); break; default: mPage.ungetCharacter(mCursor); // string needs to see leading foreslash ret = parseString(start, quotesmart); break; } return (ret); }
/** * Advance the cursor through a JIS escape sequence. * * @param cursor A cursor positioned within the escape sequence. * @exception ParserException If a problem occurs reading from the source. */ protected void scanJIS(Cursor cursor) throws ParserException { boolean done; char ch; int state; done = false; state = 0; while (!done) { ch = mPage.getCharacter(cursor); if (Page.EOF == ch) done = true; else switch (state) { case 0: if (0x1b == ch) // escape state = 1; break; case 1: if ('(' == ch) state = 2; else state = 0; break; case 2: if ('J' == ch) done = true; else state = 0; break; default: throw new IllegalStateException("state " + state); } } }
/** * Parse a tag. Parse the name and attributes from a start tag. * * <p>From the <a href="http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.2">HTML 4.01 * Specification, W3C Recommendation 24 December 1999</a> * http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.2 * * <p><cite> 3.2.2 Attributes * * <p>Elements may have associated properties, called attributes, which may have values (by * default, or set by authors or scripts). Attribute/value pairs appear before the final ">" of an * element's start tag. Any number of (legal) attribute value pairs, separated by spaces, may * appear in an element's start tag. They may appear in any order. * * <p>In this example, the id attribute is set for an H1 element: <code> * <H1 id="section1"> * </code> This is an identified heading thanks to the id attribute <code> * </H1> * </code> By default, SGML requires that all attribute values be delimited using either double * quotation marks (ASCII decimal 34) or single quotation marks (ASCII decimal 39). Single quote * marks can be included within the attribute value when the value is delimited by double quote * marks, and vice versa. Authors may also use numeric character references to represent double * quotes (&#34;) and single quotes (&#39;). For doublequotes authors can also use the * character entity reference &quot;. * * <p>In certain cases, authors may specify the value of an attribute without any quotation marks. * The attribute value may only contain letters (a-z and A-Z), digits (0-9), hyphens (ASCII * decimal 45), periods (ASCII decimal 46), underscores (ASCII decimal 95), and colons (ASCII * decimal 58). We recommend using quotation marks even when it is possible to eliminate them. * * <p>Attribute names are always case-insensitive. * * <p>Attribute values are generally case-insensitive. The definition of each attribute in the * reference manual indicates whether its value is case-insensitive. * * <p>All the attributes defined by this specification are listed in the attribute index. * * <p></cite> * * <p>This method uses a state machine with the following states: * * <ol> * <li>state 0 - outside of any attribute * <li>state 1 - within attributre name * <li>state 2 - equals hit * <li>state 3 - within naked attribute value. * <li>state 4 - within single quoted attribute value * <li>state 5 - within double quoted attribute value * <li>state 6 - whitespaces after attribute name could lead to state 2 (=)or state 0 * </ol> * * <p>The starting point for the various components is stored in an array of integers that match * the initiation point for the states one-for-one, i.e. bookmarks[0] is where state 0 began, * bookmarks[1] is where state 1 began, etc. Attributes are stored in a <code>Vector</code> having * one slot for each whitespace or attribute/value pair. The first slot is for attribute name * (kind of like a standalone attribute). * * @param start The position at which to start scanning. * @return The parsed tag. * @exception ParserException If a problem occurs reading from the source. */ protected Node parseTag(int start) throws ParserException { boolean done; char ch; int state; int[] bookmarks; Vector attributes; done = false; attributes = new Vector(); state = 0; bookmarks = new int[8]; bookmarks[0] = mCursor.getPosition(); while (!done) { bookmarks[state + 1] = mCursor.getPosition(); ch = mPage.getCharacter(mCursor); switch (state) { case 0: // outside of any attribute if ((Page.EOF == ch) || ('>' == ch) || ('<' == ch)) { if ('<' == ch) { // don't consume the opening angle mPage.ungetCharacter(mCursor); bookmarks[state + 1] = mCursor.getPosition(); } whitespace(attributes, bookmarks); done = true; } else if (!Character.isWhitespace(ch)) { whitespace(attributes, bookmarks); state = 1; } break; case 1: // within attribute name if ((Page.EOF == ch) || ('>' == ch) || ('<' == ch)) { if ('<' == ch) { // don't consume the opening angle mPage.ungetCharacter(mCursor); bookmarks[state + 1] = mCursor.getPosition(); } standalone(attributes, bookmarks); done = true; } else if (Character.isWhitespace(ch)) { // whitespaces might be followed by next attribute or an equal sign // see Bug #891058 Bug in lexer. bookmarks[6] = bookmarks[2]; // setting the bookmark[0] is done in state 6 if applicable state = 6; } else if ('=' == ch) state = 2; break; case 2: // equals hit if ((Page.EOF == ch) || ('>' == ch)) { empty(attributes, bookmarks); done = true; } else if ('\'' == ch) { state = 4; bookmarks[4] = bookmarks[3]; } else if ('"' == ch) { state = 5; bookmarks[5] = bookmarks[3]; } else if (Character.isWhitespace(ch)) { // collect white spaces after "=" into the assignment string; // do nothing // see Bug #891058 Bug in lexer. } else state = 3; break; case 3: // within naked attribute value if ((Page.EOF == ch) || ('>' == ch)) { naked(attributes, bookmarks); done = true; } else if (Character.isWhitespace(ch)) { naked(attributes, bookmarks); bookmarks[0] = bookmarks[4]; state = 0; } break; case 4: // within single quoted attribute value if (Page.EOF == ch) { single_quote(attributes, bookmarks); done = true; // complain? } else if ('\'' == ch) { single_quote(attributes, bookmarks); bookmarks[0] = bookmarks[5] + 1; state = 0; } break; case 5: // within double quoted attribute value if (Page.EOF == ch) { double_quote(attributes, bookmarks); done = true; // complain? } else if ('"' == ch) { double_quote(attributes, bookmarks); bookmarks[0] = bookmarks[6] + 1; state = 0; } break; // patch for lexer state correction by // Gernot Fricke // See Bug # 891058 Bug in lexer. case 6: // undecided for state 0 or 2 // we have read white spaces after an attributte name if (Page.EOF == ch) { // same as last else clause standalone(attributes, bookmarks); bookmarks[0] = bookmarks[6]; mPage.ungetCharacter(mCursor); state = 0; } else if (Character.isWhitespace(ch)) { // proceed } else if ('=' == ch) // yepp. the white spaces belonged to the equal. { bookmarks[2] = bookmarks[6]; bookmarks[3] = bookmarks[7]; state = 2; } else { // white spaces were not ended by equal // meaning the attribute was a stand alone attribute // now: create the stand alone attribute and rewind // the cursor to the end of the white spaces // and restart scanning as whitespace attribute. standalone(attributes, bookmarks); bookmarks[0] = bookmarks[6]; mPage.ungetCharacter(mCursor); state = 0; } break; default: throw new IllegalStateException("how the f**k did we get in state " + state); } } return (makeTag(start, mCursor.getPosition(), attributes)); }
/** * Parse a string node. Scan characters until "</", "<%", "<!" or < followed by a * letter is encountered, or the input stream is exhausted, in which case <code>null</code> is * returned. * * @param start The position at which to start scanning. * @param quotesmart If <code>true</code>, strings ignore quoted contents. * @return The parsed node. * @exception ParserException If a problem occurs reading from the source. */ protected Node parseString(int start, boolean quotesmart) throws ParserException { boolean done; char ch; char quote; done = false; quote = 0; while (!done) { ch = mPage.getCharacter(mCursor); if (Page.EOF == ch) done = true; else if (0x1b == ch) // escape { ch = mPage.getCharacter(mCursor); if (Page.EOF == ch) done = true; else if ('$' == ch) { ch = mPage.getCharacter(mCursor); if (Page.EOF == ch) done = true; else if ('B' == ch) scanJIS(mCursor); else { mPage.ungetCharacter(mCursor); mPage.ungetCharacter(mCursor); } } else mPage.ungetCharacter(mCursor); } else if (quotesmart && (0 == quote) && (('\'' == ch) || ('"' == ch))) quote = ch; // enter quoted state // patch from Gernot Fricke to handle escaped closing quote else if (quotesmart && (0 != quote) && ('\\' == ch)) { ch = mPage.getCharacter(mCursor); // try to consume escape if ((Page.EOF != ch) && ('\\' != ch) // escaped backslash && (ch != quote)) // escaped quote character // ( reflects ["] or ['] whichever opened the quotation) mPage.ungetCharacter(mCursor); // unconsume char if char not an escape } else if (quotesmart && (ch == quote)) quote = 0; // exit quoted state else if (quotesmart && (0 == quote) && (ch == '/')) { // handle multiline and double slash comments (with a quote) // in script like: // I can't handle single quotations. ch = mPage.getCharacter(mCursor); if (Page.EOF == ch) done = true; else if ('/' == ch) { do ch = mPage.getCharacter(mCursor); while ((Page.EOF != ch) && ('\n' != ch)); } else if ('*' == ch) { do { do ch = mPage.getCharacter(mCursor); while ((Page.EOF != ch) && ('*' != ch)); ch = mPage.getCharacter(mCursor); if (ch == '*') mPage.ungetCharacter(mCursor); } while ((Page.EOF != ch) && ('/' != ch)); } else mPage.ungetCharacter(mCursor); } else if ((0 == quote) && ('<' == ch)) { ch = mPage.getCharacter(mCursor); if (Page.EOF == ch) done = true; // the order of these tests might be optimized for speed: else if ('/' == ch || Character.isLetter(ch) || '!' == ch || '%' == ch || '?' == ch) { done = true; mPage.ungetCharacter(mCursor); mPage.ungetCharacter(mCursor); } else { // it's not a tag, so keep going, but check for quotes mPage.ungetCharacter(mCursor); } } } return (makeString(start, mCursor.getPosition())); }
/** * Return CDATA as a text node. Slightly less rigid than {@link #parseCDATA()} this method * provides for parsing CDATA that may contain quoted strings that have embedded ETAGO ("</") * delimiters and skips single and multiline comments. * * @param quotesmart If <code>true</code> the strict definition of CDATA is extended to allow for * single or double quoted ETAGO ("</") sequences. * @return The <code>TextNode</code> of the CDATA or <code>null</code> if none. * @see #parseCDATA() * @exception ParserException If a problem occurs reading from the source. */ public Node parseCDATA(boolean quotesmart) throws ParserException { int start; int state; boolean done; char quote; char ch; int end; boolean comment; start = mCursor.getPosition(); state = 0; done = false; quote = 0; comment = false; while (!done) { ch = mPage.getCharacter(mCursor); switch (state) { case 0: // prior to ETAGO switch (ch) { case Page.EOF: done = true; break; case '\'': if (quotesmart && !comment) if (0 == quote) quote = '\''; // enter quoted state else if ('\'' == quote) quote = 0; // exit quoted state break; case '"': if (quotesmart && !comment) if (0 == quote) quote = '"'; // enter quoted state else if ('"' == quote) quote = 0; // exit quoted state break; case '\\': if (quotesmart) if (0 != quote) { ch = mPage.getCharacter(mCursor); // try to consume escaped character if (Page.EOF == ch) done = true; else if ((ch != '\\') && (ch != quote)) // unconsume char if character was not an escapable char. mPage.ungetCharacter(mCursor); } break; case '/': if (quotesmart) if (0 == quote) { // handle multiline and double slash comments (with a quote) ch = mPage.getCharacter(mCursor); if (Page.EOF == ch) done = true; else if ('/' == ch) comment = true; else if ('*' == ch) { do { do ch = mPage.getCharacter(mCursor); while ((Page.EOF != ch) && ('*' != ch)); ch = mPage.getCharacter(mCursor); if (ch == '*') mPage.ungetCharacter(mCursor); } while ((Page.EOF != ch) && ('/' != ch)); } else mPage.ungetCharacter(mCursor); } break; case '\n': comment = false; break; case '<': if (quotesmart) { if (0 == quote) state = 1; } else state = 1; break; default: break; } break; case 1: // < switch (ch) { case Page.EOF: done = true; break; case '/': state = 2; break; case '!': ch = mPage.getCharacter(mCursor); if (Page.EOF == ch) done = true; else if ('-' == ch) { ch = mPage.getCharacter(mCursor); if (Page.EOF == ch) done = true; else if ('-' == ch) state = 3; else state = 0; } else state = 0; break; default: state = 0; break; } break; case 2: // </ comment = false; if (Page.EOF == ch) done = true; else if (Character.isLetter(ch)) { done = true; // back up to the start of ETAGO mPage.ungetCharacter(mCursor); mPage.ungetCharacter(mCursor); mPage.ungetCharacter(mCursor); } else state = 0; break; case 3: // <! comment = false; if (Page.EOF == ch) done = true; else if ('-' == ch) { ch = mPage.getCharacter(mCursor); if (Page.EOF == ch) done = true; else if ('-' == ch) { ch = mPage.getCharacter(mCursor); if (Page.EOF == ch) done = true; else if ('>' == ch) state = 0; else { mPage.ungetCharacter(mCursor); mPage.ungetCharacter(mCursor); } } else mPage.ungetCharacter(mCursor); } break; default: throw new IllegalStateException("how the f**k did we get in state " + state); } } end = mCursor.getPosition(); return (makeString(start, end)); }
/** * Parse an XML processing instruction. Scan characters until "?>" is encountered, or the input * stream is exhausted, in which case <code>null</code> is returned. * * @param start The position at which to start scanning. * @return The parsed node. * @exception ParserException If a problem occurs reading from the source. */ protected Node parsePI(int start) throws ParserException { boolean done; char ch; int state; Vector attributes; int code; done = false; state = 0; code = 0; attributes = new Vector(); // <?xyz?> // 011112d while (!done) { ch = mPage.getCharacter(mCursor); switch (state) { case 0: // prior to the question mark switch (ch) { case '?': // <? code = mCursor.getPosition(); attributes.addElement(new PageAttribute(mPage, start + 1, code, -1, -1, (char) 0)); state = 1; break; // case Page.EOF: // <\0 // case '>': // <> default: done = true; break; } break; case 1: // prior to the closing question mark switch (ch) { case Page.EOF: // <?x\0 case '>': // <?x> done = true; break; case '\'': case '"': // <?..." state = ch; break; case '?': // <?...? state = 2; break; default: // <?...x break; } break; case 2: switch (ch) { case Page.EOF: // <?x..?\0 done = true; break; case '>': state = 3; done = true; break; default: // <?...?x state = 1; break; } break; case '"': switch (ch) { case Page.EOF: // <?x.."\0 done = true; break; case '"': state = 1; break; default: // <?...'.x break; } break; case '\'': switch (ch) { case Page.EOF: // <?x..'\0 done = true; break; case '\'': state = 1; break; default: // <?..."..x break; } break; default: throw new IllegalStateException("how the f**k did we get in state " + state); } } if (3 == state) // normal exit { if (0 != code) { state = mCursor.getPosition() - 2; // reuse state attributes.addElement(new PageAttribute(mPage, code, state, -1, -1, (char) 0)); attributes.addElement(new PageAttribute(mPage, state, state + 1, -1, -1, (char) 0)); } else throw new IllegalStateException("processing instruction with no content"); } else return (parseString(start, true)); // hmmm, true? return (makeTag(start, mCursor.getPosition(), attributes)); }
/** * Parse a java server page node. Scan characters until "%>" is encountered, or the input * stream is exhausted, in which case <code>null</code> is returned. * * @param start The position at which to start scanning. * @return The parsed node. * @exception ParserException If a problem occurs reading from the source. */ protected Node parseJsp(int start) throws ParserException { boolean done; char ch; int state; Vector attributes; int code; done = false; state = 0; code = 0; attributes = new Vector(); // <%xyz%> // 012223d // <%=xyz%> // 0122223d // <%@xyz%d // 0122223d while (!done) { ch = mPage.getCharacter(mCursor); switch (state) { case 0: // prior to the percent switch (ch) { case '%': // <% state = 1; break; // case Page.EOF: // <\0 // case '>': // <> default: done = true; break; } break; case 1: // prior to the optional qualifier switch (ch) { case Page.EOF: // <%\0 case '>': // <%> done = true; break; case '=': // <%= case '@': // <%@ code = mCursor.getPosition(); attributes.addElement(new PageAttribute(mPage, start + 1, code, -1, -1, (char) 0)); state = 2; break; default: // <%x code = mCursor.getPosition() - 1; attributes.addElement(new PageAttribute(mPage, start + 1, code, -1, -1, (char) 0)); state = 2; break; } break; case 2: // prior to the closing percent switch (ch) { case Page.EOF: // <%x\0 case '>': // <%x> done = true; break; case '\'': case '"': // <%???" state = ch; break; case '%': // <%???% state = 3; break; case '/': // // or /* ch = mPage.getCharacter(mCursor); if (ch == '/') { // find the \n or \r while (true) { ch = mPage.getCharacter(mCursor); if (ch == Page.EOF) { done = true; break; } else if (ch == '\n' || ch == '\r') { break; } } } else if (ch == '*') { do { do ch = mPage.getCharacter(mCursor); while ((Page.EOF != ch) && ('*' != ch)); ch = mPage.getCharacter(mCursor); if (ch == '*') mPage.ungetCharacter(mCursor); } while ((Page.EOF != ch) && ('/' != ch)); } else mPage.ungetCharacter(mCursor); break; default: // <%???x break; } break; case 3: switch (ch) { case Page.EOF: // <%x??%\0 done = true; break; case '>': state = 4; done = true; break; default: // <%???%x state = 2; break; } break; case '"': switch (ch) { case Page.EOF: // <%x??"\0 done = true; break; case '"': state = 2; break; default: // <%???'??x break; } break; case '\'': switch (ch) { case Page.EOF: // <%x??'\0 done = true; break; case '\'': state = 2; break; default: // <%???"??x break; } break; default: throw new IllegalStateException("how the f**k did we get in state " + state); } } if (4 == state) // normal exit { if (0 != code) { state = mCursor.getPosition() - 2; // reuse state attributes.addElement(new PageAttribute(mPage, code, state, -1, -1, (char) 0)); attributes.addElement(new PageAttribute(mPage, state, state + 1, -1, -1, (char) 0)); } else throw new IllegalStateException("jsp with no code!"); } else return (parseString(start, true)); // hmmm, true? return (makeTag(start, mCursor.getPosition(), attributes)); }