/** * A mapping of identifiers which ensures that no use of an identifier in a scope masks a use of an * identifier in an outer scope. * * <p>Javascript is not a block scoped language, but IHTML constructs are block scoped, so we * alpha-rename all variables to prevent collisions. * * @param <NAME> a type that can work as a hashtable key * @param <BINDING> type of data associated with a particular name. This can be any object related * to the result of resolving the name. * @author [email protected] */ public final class NameContext<NAME, BINDING> { private final NameContext<NAME, BINDING> parent; private final Iterator<String> nameGenerator; /** maps names in original source to details about the renamed instance. */ private final Map<NAME, VarInfo<NAME, BINDING>> vars = Maps.newLinkedHashMap(); public static final class VarInfo<NAME, BINDING> { public final NAME origName; public final String newName; public final FilePosition declaredAt; private BINDING binding; private VarInfo(NAME origName, String newName, FilePosition declaredAt) { assert origName != null; this.origName = origName; this.newName = newName; this.declaredAt = declaredAt; } public BINDING getBinding() { return binding; } public void bind(BINDING binding) { this.binding = binding; } @Override public String toString() { return "(" + getClass().getSimpleName() + " " + origName + ")"; } } /** * Creates a context with no parent context. * * @param nameGenerator an infinite iterator that returns safe identifiers and that never returns * the same String twice. Typically, a {@link com.google.caja.util.SafeIdentifierMaker}. */ public NameContext(Iterator<String> nameGenerator) { this(null, nameGenerator); } private NameContext(NameContext<NAME, BINDING> parent, Iterator<String> nameGenerator) { this.parent = parent; this.nameGenerator = nameGenerator; } /** * Produces a context that has the same name generator and which has this context as its parent. */ public NameContext<NAME, BINDING> makeChildContext() { return new NameContext<NAME, BINDING>(this, this.nameGenerator); } /** * The context that is used to resolve original names that have not been declared in this context, * or null if no such context. */ public NameContext<NAME, BINDING> getParentContext() { return parent; } /** * Introduce a new declaration which will mask any declaration with the same name in the {@link * #getParentContext} context. */ public VarInfo<NAME, BINDING> declare(NAME origName, FilePosition declSite) throws RedeclarationException { VarInfo<NAME, BINDING> d = vars.get(origName); if (d == null) { String newName = nameGenerator.next(); VarInfo<NAME, BINDING> vi = new VarInfo<NAME, BINDING>(origName, newName, declSite); vars.put(origName, vi); return vi; } else { FilePosition dPos = d.declaredAt; throw new RedeclarationException( new Message( RewriterMessageType.CANNOT_REDECLARE_VAR, declSite, MessagePart.Factory.valueOf(origName.toString()), dPos)); } } /** * Find a declaration with the given original name, looking in ancestor contexts if {@code * declare(originalName, ...)} was never called on this context. */ public VarInfo<NAME, BINDING> lookup(NAME originalName) { for (NameContext<NAME, BINDING> c = this; c != null; c = c.parent) { VarInfo<NAME, BINDING> vi = c.vars.get(originalName); if (vi != null) { return vi; } } return null; } /** The set of vars declared in this context, not including any in ancestor contexts. */ public Iterable<VarInfo<NAME, BINDING>> vars() { return Collections.unmodifiableMap(vars).values(); } /** The name generator used to generate names for new declarations. */ public Iterator<String> getNameGenerator() { return nameGenerator; } public static class RedeclarationException extends CajaException { public RedeclarationException(Message m, Throwable th) { super(m, th); } public RedeclarationException(Message m) { this(m, null); } } }
/** * Compiles an HTML document to a chunk of safe static HTML, and a bit of javascript which attaches * event handlers and other dynamic attributes, and executes inline scripts. * * <p>Requires that CSS be rewritten, that inline scripts have been replaced with {@link Placeholder * placeholders}, and that the output JS be run through the CajitaRewriter. * * @author [email protected] */ public class TemplateCompiler { private final List<IhtmlRoot> ihtmlRoots; private final List<ValidatedStylesheet> validatedStylesheets; private final HtmlSchema htmlSchema; private final PluginMeta meta; private final MessageContext mc; private final MessageQueue mq; private final HtmlAttributeRewriter aRewriter; /** * Maps {@link Node}s to JS parse trees. * * <ul> * <li>If the value is {@code null}, then the literal value in the original parse tree may be * used. * <li>If the node is an attribute, then the value is an expression that returns a (key, value) * pair. * <li>If the node is a text node inside a script block, then the value is an {@link * UncajoledModule}. * <li>Otherwise, the value is a JavaScript expression which evaluates to the dynamic text * value. * </ul> */ private final Map<Node, ParseTreeNode> scriptsPerNode = Maps.newIdentityHashMap(); /** * Maps placeholder IDs to JS programs. * * <p>We extract scripts early on and turn them into separate jobs, so that we can use cached * results for scripts even when the non-script details of the containing HTML page changes. */ private final Map<String, ScriptPlaceholder> scriptsPerPlaceholder = Maps.newHashMap(); private final Map<Attr, EmbeddedContent> embeddedContent = Maps.newIdentityHashMap(); /** * @param ihtmlRoots roots of trees to process and the baseURI used to resolve URIs in those * nodes. * @param validatedStylesheets CSS style-sheets that have had unsafe constructs removed and had * rules rewritten. * @param placeholderScripts placeholder IDs per unsanitized JS programs. We extract scripts early * on and turn them into separate jobs, so that we can use cached results for scripts even * when the non-script details of the containing HTML page changes. * @param meta specifies how URLs and other attributes are rewritten. * @param cssSchema specifies how STYLE attributes are rewritten. * @param htmlSchema specifies how elements and attributes are handled. * @param mq receives messages about invalid attribute values. */ public TemplateCompiler( List<? extends IhtmlRoot> ihtmlRoots, List<? extends ValidatedStylesheet> validatedStylesheets, List<? extends ScriptPlaceholder> placeholderScripts, CssSchema cssSchema, HtmlSchema htmlSchema, PluginMeta meta, MessageContext mc, MessageQueue mq) { this.ihtmlRoots = Lists.newArrayList(ihtmlRoots); this.validatedStylesheets = Lists.newArrayList(validatedStylesheets); for (ScriptPlaceholder ph : placeholderScripts) { scriptsPerPlaceholder.put(ph.source.placeholderId, ph); } this.htmlSchema = htmlSchema; this.meta = meta; this.mc = mc; this.mq = mq; this.aRewriter = new HtmlAttributeRewriter(meta, cssSchema, htmlSchema, embeddedContent, mq); } /** * Examines the HTML document and writes messages about problematic portions to the message queue * passed to the constructor. */ private void inspect() { if (!mq.hasMessageAtLevel(MessageLevel.FATAL_ERROR)) { for (IhtmlRoot ihtmlRoot : ihtmlRoots) { HtmlEmbeddedContentFinder finder = new HtmlEmbeddedContentFinder(htmlSchema, ihtmlRoot.baseUri, mq, mc); for (EmbeddedContent c : finder.findEmbeddedContent(ihtmlRoot.root)) { Node src = c.getSource(); if (src instanceof Attr) { embeddedContent.put((Attr) src, c); } } inspect(ihtmlRoot.source, ihtmlRoot.root, ElKey.forHtmlElement("div")); } } } private void inspect(JobEnvelope source, Node n, ElKey containingHtmlElement) { switch (n.getNodeType()) { case Node.ELEMENT_NODE: inspectElement(source, (Element) n, containingHtmlElement); break; case Node.TEXT_NODE: case Node.CDATA_SECTION_NODE: inspectText((Text) n, containingHtmlElement); break; case Node.DOCUMENT_FRAGMENT_NODE: inspectFragment(source, (DocumentFragment) n, containingHtmlElement); break; default: // Since they don't show in the scriptsPerNode map, they won't appear in // any output trees. break; } } /** * @param containingHtmlElement the name of the HTML element containing el. If the HTML element is * contained inside a template construct then this name may differ from el's immediate parent. */ private void inspectElement(JobEnvelope source, Element el, ElKey containingHtmlElement) { ElKey elKey = ElKey.forElement(el); // Recurse early so that ihtml:dynamic elements have been parsed before we // process the attributes element list. for (Node child : Nodes.childrenOf(el)) { inspect(source, child, elKey); } // For each attribute allowed on this element type, ensure that // (1) If it is not specified, and its default value is not allowed, then // it is added with a known safe value. // (2) Its value is rewritten as appropriate. // We don't have to worry about disallowed attributes since those will // not be present in scriptsPerNode. The TemplateSanitizer should have // stripped those out. The TemplateSanitizer should also have stripped out // disallowed elements. if (!htmlSchema.isElementAllowed(elKey)) { return; } HTML.Element elInfo = htmlSchema.lookupElement(elKey); List<HTML.Attribute> attrs = elInfo.getAttributes(); if (attrs != null) { for (HTML.Attribute a : attrs) { AttribKey attrKey = a.getKey(); if (!htmlSchema.isAttributeAllowed(attrKey)) { continue; } Attr attr = null; String aUri = attrKey.ns.uri; String aName = attrKey.localName; Attr unsafe = el.getAttributeNodeNS(aUri, aName); if (unsafe != null && a.getValueCriterion().accept(unsafe.getValue())) { attr = unsafe; } else if ((a.getDefaultValue() != null && !a.getValueCriterion().accept(a.getDefaultValue())) || !a.isOptional()) { attr = el.getOwnerDocument().createAttributeNS(aUri, aName); String safeValue; if (a.getType() == HTML.Attribute.Type.URI) { safeValue = "" + Nodes.getFilePositionFor(el).source().getUri(); } else { safeValue = a.getSafeValue(); } if (safeValue == null) { mq.addMessage( IhtmlMessageType.MISSING_ATTRIB, Nodes.getFilePositionFor(el), elKey, attrKey); continue; } attr.setNodeValue(safeValue); el.setAttributeNodeNS(attr); } if (attr != null) { inspectHtmlAttribute(source, attr, a); } } } scriptsPerNode.put(el, null); } private void inspectText(Text t, ElKey containingHtmlElement) { if (!htmlSchema.isElementAllowed(containingHtmlElement)) { return; } scriptsPerNode.put(t, null); } private void inspectFragment( JobEnvelope source, DocumentFragment f, ElKey containingHtmlElement) { scriptsPerNode.put(f, null); for (Node child : Nodes.childrenOf(f)) { // We know that top level text nodes in a document fragment // are not significant if they are just newlines and indentation. // This decreases output size significantly. if (isWhitespaceOnlyTextNode(child)) { continue; } inspect(source, child, containingHtmlElement); } } private static boolean isWhitespaceOnlyTextNode(Node child) { // This leaves whitespace without a leading EOL character intact. // TODO(ihab.awad): Investigate why this is the right criterion to use. return child.getNodeType() == Node.TEXT_NODE // excludes CDATA sections && "".equals(child.getNodeValue().replaceAll("[\r\n]+[ \t]*", "")); } /** * For an HTML attribute, decides whether the value is valid according to the schema and if it is * valid, sets a value into {@link #scriptsPerNode}. The expression is null if the current value * is fine, or a StringLiteral if it can be statically rewritten. */ private void inspectHtmlAttribute(JobEnvelope source, Attr attr, HTML.Attribute info) { if (Placeholder.ID_ATTR.is(attr) && scriptsPerPlaceholder.containsKey(attr.getValue())) { scriptsPerNode.put(attr, null); } else { HtmlAttributeRewriter.SanitizedAttr r = aRewriter.sanitizeStringValue(HtmlAttributeRewriter.fromAttr(attr, info, source)); if (r.isSafe) { scriptsPerNode.put(attr, r.result); } // Otherwise the SanitizeHtmlStage should have emitted a warning. } } /** * Builds a tree of only the safe HTML parts ignoring IHTML elements. If there are embedded script * elements, then these will be removed, and nodes may have synthetic IDs added so that the * generated code can split them into the elements present when each script is executed. * * <p>On introspection, the code will find that the output DOM is missing the SCRIPT elements * originally on the page. We consider this a known observable fact of our transformation. If we * wish to hid that as well, we could change {@link SafeHtmlMaker} to include empty SCRIPT nodes. * However, that would make the output larger -- and, anyway, the text content of these nodes * would *still* not be identical to the original. * * @param doc a DOM {@link Document} object to be used as a factory for DOM nodes; it is not * processed or transformed in any way. */ public Pair<List<SafeHtmlChunk>, List<SafeJsChunk>> getSafeHtml(Document doc) { // Inspect the document. inspect(); // Compile CSS to HTML when appropriate or to JS where not. // It always ends up at the top either way. List<SafeStylesheet> css = new SafeCssMaker(validatedStylesheets, doc).make(); // Emit safe HTML with JS which attaches dynamic attributes. SafeHtmlMaker htmlMaker = new SafeHtmlMaker( meta, mc, doc, scriptsPerNode, scriptsPerPlaceholder, ihtmlRoots, aRewriter.getHandlers()); return htmlMaker.make(css); } }