/**
 * Creates a compiler over the given HTML trees, stylesheets and script placeholders.
 *
 * <p>The two input lists are copied into new lists, every script placeholder is indexed by its
 * {@code source.placeholderId}, and an {@link HtmlAttributeRewriter} is created from the schemas,
 * the plugin meta, the embedded-content collection held by this instance, and the message queue.
 *
 * @param ihtmlRoots roots of trees to process and the baseURI used to resolve URIs in those
 *     nodes.
 * @param validatedStylesheets CSS style-sheets that have had unsafe constructs removed and had
 *     rules rewritten.
 * @param placeholderScripts placeholder IDs per unsanitized JS programs. Scripts are extracted
 *     early on and turned into separate jobs, so that cached results can be used for scripts even
 *     when the non-script details of the containing HTML page change.
 * @param cssSchema specifies how STYLE attributes are rewritten. It is only handed to the
 *     attribute rewriter; this constructor does not keep its own reference.
 * @param htmlSchema specifies how elements and attributes are handled.
 * @param meta specifies how URLs and other attributes are rewritten.
 * @param mc message context retained by this compiler; how it is used is not visible in this
 *     constructor.
 * @param mq receives messages about invalid attribute values.
 */
public TemplateCompiler(
    List<? extends IhtmlRoot> ihtmlRoots,
    List<? extends ValidatedStylesheet> validatedStylesheets,
    List<? extends ScriptPlaceholder> placeholderScripts,
    CssSchema cssSchema,
    HtmlSchema htmlSchema,
    PluginMeta meta,
    MessageContext mc,
    MessageQueue mq) {
  // Collaborators that are stored exactly as supplied.
  this.htmlSchema = htmlSchema;
  this.meta = meta;
  this.mc = mc;
  this.mq = mq;

  // Snapshot the caller's lists so later changes to them are not seen here.
  this.ihtmlRoots = Lists.newArrayList(ihtmlRoots);
  this.validatedStylesheets = Lists.newArrayList(validatedStylesheets);

  // Index each extracted script by the ID of the placeholder that stands in for it.
  for (ScriptPlaceholder placeholder : placeholderScripts) {
    scriptsPerPlaceholder.put(placeholder.source.placeholderId, placeholder);
  }

  this.aRewriter = new HtmlAttributeRewriter(meta, cssSchema, htmlSchema, embeddedContent, mq);
}
/** * An abstract renderer for JavaScript tokens that ensures that implementations don't fall afoul of * JavaScript's syntactic quirks. * * @author [email protected] */ abstract class BufferingRenderer implements TokenConsumer { private final List<Object> pending = Lists.newArrayList(); private final Concatenator out; /** @param out receives the rendered text. */ BufferingRenderer(Concatenator out) { this.out = out; } /** @throws NullPointerException if out raises an IOException and ioExceptionHandler is null. */ public final void noMoreTokens() { JsTokenAdjacencyChecker adjChecker = new JsTokenAdjacencyChecker(); String lastToken = null; boolean noOutputWritten = true; List<String> outputTokens = splitTokens(pending); pending.clear(); String pendingSpace = null; for (int i = 0, nTokens = outputTokens.size(); i < nTokens; ++i) { String token = outputTokens.get(i); if (token.charAt(0) == '\n' || " ".equals(token)) { pendingSpace = token; continue; } if (TokenClassification.isComment(token)) { // Make sure we don't get into a situation where we have to output // a newline to end a line comment, but can't output a newline because // it would break a restricted production. // When we see a line comment, scan forward until the next non-comment // token. If the canBreakBetween check fails, then remove any // line-breaks by rewriting the comment. // We have to rewrite multi-line block comments, since ES3 and ES5 say // that a multi-line comment is replaced with a newline for the // purposes of semicolon insertion. // // This is inconsistently implemented, but the rewriting works // regardless of whether an implementation actually treats the // comment as a newline for semicolon insertion. 
String nextToken = null; for (int j = i + 1; j < nTokens; ++j) { switch (TokenClassification.classify(outputTokens.get(j))) { case SPACE: case LINEBREAK: case COMMENT: continue; default: break; } nextToken = outputTokens.get(j); break; } if (!JsRenderUtil.canBreakBetween(lastToken, nextToken)) { token = removeLinebreaksFromComment(token); if (pendingSpace != null) { pendingSpace = " "; } } } boolean needSpaceBefore = adjChecker.needSpaceBefore(token); if (pendingSpace == null && needSpaceBefore) { pendingSpace = " "; } if (pendingSpace != null) { if (pendingSpace.charAt(0) == '\n') { if (!JsRenderUtil.canBreakBetween(lastToken, token)) { pendingSpace = " "; } else if (noOutputWritten) { pendingSpace = pendingSpace.substring(1); } } out.append(pendingSpace); pendingSpace = null; } out.append(token); noOutputWritten = false; if (!TokenClassification.isComment(token)) { lastToken = token; } } out.noMoreTokens(); } /** * May receive line-break or comment tokens. Implementations may ignore comment tokens, but the * client is responsible for making sure that comments are well-formed, do not contain code (e.g. * conditional compilation code), and do not violate any containment requirements, such as not * containing the string {@code </script>}. */ public final void consume(String text) { if ("".equals(text)) { return; } pending.add(text); } public final void mark(@Nullable FilePosition mark) { if (mark != null && !InputSource.UNKNOWN.equals(mark.source())) { pending.add(mark); } } private static String removeLinebreaksFromComment(String token) { if (TokenClassification.isLineComment(token)) { token = "/*" + token.substring(2) + "*/"; } StringBuilder sb = new StringBuilder(token); // Section 5.1.2 hinges on whether a MultiLineComment contains a // line-terminator char, so make sure it does not. 
for (int i = sb.length(); --i >= 0; ) { if (JsLexer.isJsLineSeparator(sb.charAt(i))) { sb.setCharAt(i, ' '); } } // Make sure that turning a line comment into a MultiLineComment didn't // cause a */ in the line comment to become lexically significant. for (int e = sb.length() - 3, i; (i = sb.lastIndexOf("*/", e)) >= 0; ) { sb.setCharAt(i + 1, ' '); } return sb.toString(); } /** * Generates a list of output tokens consisting of non-whitespace tokens, space tokens ({@code " * "}) and newline tokens ({@code '\n'} followed by any number of spaces). * * @param tokens a heterogeneous array containing {@code String} tokens and {@code FilePosition} * marks. * @return the strings in tokens in order with newline and space tokens inserted as appropriate. */ abstract List<String> splitTokens(List<Object> tokens); }
/** * A lexer that recognizes the <a href="http://www.w3.org/TR/CSS21/grammar.html#scanner">CSS 2.1 * Grammar</a> plus line comments as interpreted by most browsers. * * <p>TODO(mikesamuel): CSS2.1 has changed lexical conventions to effectively decode escapes at lex * time in most contexts. E.g., the rule <code>"@import" IMPORT_SYM</code> now reads * <code>@{I}{M}{P}{O}{R}{T} {return IMPORT_SYM;}</code> and <code>{num}ms TIME * </code> now reads <code>{num}{M}{S} {return TIME;}</code>. * * @author [email protected] */ public final class CssLexer implements TokenStream<CssTokenType> { private final CssSplitter splitter; private final LinkedList<Token<CssTokenType>> pending = Lists.newLinkedList(); // TODO(mikesamuel): all clients should pass in a proper queue public CssLexer(CharProducer cp) { this(cp, DevNullMessageQueue.singleton(), false); } /** * @param allowSubstitutions true iff ${...} style substitutions should be allowed as described at * {@link CssTokenType#SUBSTITUTION} */ public CssLexer(CharProducer cp, MessageQueue mq, boolean allowSubstitutions) { assert null != cp; this.splitter = new CssSplitter(cp, mq, allowSubstitutions); } public boolean hasNext() throws ParseException { return !pending.isEmpty() || splitter.hasNext(); } public Token<CssTokenType> next() throws ParseException { produce(); if (null == pending) { throw new NoSuchElementException(); } return pending.removeFirst(); } /** * True iff ${...} style substitutions should be allowed as described at {@link * CssTokenType#SUBSTITUTION} * * @see #allowSubstitutions(boolean) */ public boolean areSubstitutionsAllowed() { return splitter.areSubstitutionsAllowed(); } /** * Changes the substitution policy for this lexer. 
* * @see #areSubstitutionsAllowed() */ public void allowSubstitutions(boolean allow) { splitter.allowSubstitutions(allow); } /** Decodes escapes in an identifier */ public static String decodeCssIdentifier(CharSequence ident) { StringBuilder sb = null; int pos = 0; for (int i = 0, n = ident.length(); i < n; ) { if (ident.charAt(i) == '\\') { if (sb == null) { sb = new StringBuilder(); } sb.append(ident, pos, i); int codepoint = 0; while (++i < n && isHexChar(ident.charAt(i))) { char ch = ident.charAt(i); codepoint <<= 4; if (ch >= '0' && ch <= '9') { codepoint |= ch - '0'; } else if (ch >= 'a' && ch <= 'f') { codepoint |= ch + 10 - 'a'; } else { codepoint |= ch + 10 - 'A'; } } sb.appendCodePoint(codepoint < Character.MAX_CODE_POINT ? codepoint : 0xfffd); if (i < n && isSpaceChar(ident.charAt(i))) { ++i; } pos = i; } else { ++i; } } if (sb == null) { return ident.toString(); } return sb.append(ident, pos, ident.length()).toString(); } /** * * * <pre> * nmstart [_a-z]|{nonascii}|{escape} * nonascii [\200-\377] * </pre> * * @return true iff ch is a nmstart and is not an escape. Call {@link #decodeCssIdentifier} before * this method to figure out whether an escape sequence is a nmstart */ public static boolean isNmStart(char ch) { return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || (ch >= 0200 && ch <= 0377) || ch == '_'; } /** * If the character producer has not been exhausted, ensures that there is a token on pending on * pending. */ private void produce() throws ParseException { if (!pending.isEmpty()) { return; } if (!splitter.hasNext()) { return; } Token<CssTokenType> t = splitter.next(); pending.add(t); if (t.type == CssTokenType.PUNCTUATION && splitter.hasNext()) { if ("!".equals(t.text)) { // Join !important // IMPORTANT_SYM "!"({w}|{comment})*{I}{M}{P}{O}{R}{T}{A}{N}{T} Token<CssTokenType> t2 = splitter.next(); while (t2 != null && (t2.type == CssTokenType.SPACE || t2.type == CssTokenType.COMMENT)) { pending.add(t2); t2 = splitter.hasNext() ? 
splitter.next() : null; } // The !important is significant regardless of case and whether or not a // letter is hex escaped. if (null != t2) { pending.add(t2); if (t2.type == CssTokenType.IDENT && Strings.eqIgnoreCase("important", decodeCssIdentifier(t2.text))) { reduce(CssTokenType.DIRECTIVE); } } } else if ("-".equals(t.text)) { // Join '-'{nmstart}{nmchar}* Token<CssTokenType> t2 = splitter.next(); if (null != t2) { pending.add(t2); if (t2.type == CssTokenType.IDENT) { reduce(CssTokenType.IDENT); } } } } } /** * Reduces the pending tokens to a single token with the given type. For example, if the pending * list contains an identifier followed by an open parenthesis, then it can be reduced to a single * function token. This is necessitated by CSS2's odd lexical convention which classifies as * single tokens things that most other languages treat as sequences of primitive tokens. * * <p>Modifies the pending list in place. */ private void reduce(CssTokenType type) { StringBuilder sb = new StringBuilder(); for (Token<CssTokenType> t : pending) { sb.append(t.text); } FilePosition fp = FilePosition.span(pending.getFirst().pos, pending.getLast().pos); pending.clear(); pending.add(Token.instance(sb.toString(), type, fp)); } /** Is the given character a whitespace character according to the CSS 2 spec. */ public static boolean isSpaceChar(char ch) { // s [ \t\r\n\f]+ // w {s}? switch (ch) { case ' ': case '\t': case '\r': case '\n': case '\f': return true; default: return false; } } /** Is the given character a hex digit? */ public static boolean isHexChar(char ch) { // h [0-9a-f] return (ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F'); } }