Example #1
0
 /**
  * Creates a compiler over the given IHTML trees, style-sheets and extracted scripts.
  *
  * <p>The root and style-sheet lists are copied, and each script placeholder is indexed by the
  * placeholder ID of its source.
  *
  * @param ihtmlRoots roots of trees to process and the baseURI used to resolve URIs in those
  *     nodes.
  * @param validatedStylesheets CSS style-sheets that have had unsafe constructs removed and had
  *     rules rewritten.
  * @param placeholderScripts placeholder IDs per unsanitized JS programs. Scripts are extracted
  *     early on and turned into separate jobs, so that cached results can be used for scripts
  *     even when the non-script details of the containing HTML page change.
  * @param cssSchema specifies how STYLE attributes are rewritten.
  * @param htmlSchema specifies how elements and attributes are handled.
  * @param meta specifies how URLs and other attributes are rewritten.
  * @param mc message context retained by this compiler; how it is used is not visible in this
  *     constructor.
  * @param mq receives messages about invalid attribute values.
  */
 public TemplateCompiler(
     List<? extends IhtmlRoot> ihtmlRoots,
     List<? extends ValidatedStylesheet> validatedStylesheets,
     List<? extends ScriptPlaceholder> placeholderScripts,
     CssSchema cssSchema,
     HtmlSchema htmlSchema,
     PluginMeta meta,
     MessageContext mc,
     MessageQueue mq) {
   // Plain collaborators first.
   this.htmlSchema = htmlSchema;
   this.meta = meta;
   this.mc = mc;
   this.mq = mq;

   // Defensive copies of the caller's lists.
   this.ihtmlRoots = Lists.newArrayList(ihtmlRoots);
   this.validatedStylesheets = Lists.newArrayList(validatedStylesheets);

   // Index every extracted script by the placeholder ID of its source.
   for (ScriptPlaceholder placeholder : placeholderScripts) {
     scriptsPerPlaceholder.put(placeholder.source.placeholderId, placeholder);
   }

   this.aRewriter = new HtmlAttributeRewriter(meta, cssSchema, htmlSchema, embeddedContent, mq);
 }
Example #2
0
/**
 * Base class for JavaScript token renderers. Tokens and position marks are buffered as they
 * arrive and are only written out, with spacing chosen so that JavaScript's syntactic quirks are
 * respected, once {@link #noMoreTokens()} is called.
 *
 * @author [email protected]
 */
abstract class BufferingRenderer implements TokenConsumer {
  private final List<Object> pending = Lists.newArrayList();
  private final Concatenator out;

  /** @param out receives the rendered text. */
  BufferingRenderer(Concatenator out) {
    this.out = out;
  }

  /**
   * Flushes everything buffered so far to the output and then signals the output that no more
   * tokens will arrive.
   *
   * @throws NullPointerException if out raises an IOException and ioExceptionHandler is null.
   */
  public final void noMoreTokens() {
    JsTokenAdjacencyChecker adjacency = new JsTokenAdjacencyChecker();

    String prevCodeToken = null;
    boolean nothingEmitted = true;
    List<String> tokens = splitTokens(pending);
    pending.clear();
    String queuedSpace = null;

    final int count = tokens.size();
    int next = 0;
    while (next < count) {
      String token = tokens.get(next++);

      // Whitespace is never written directly; it is remembered and emitted (possibly altered)
      // in front of the next non-whitespace token.
      if (isWhitespaceToken(token)) {
        queuedSpace = token;
        continue;
      }

      if (TokenClassification.isComment(token)) {
        // A line comment must be terminated by a newline, and ES3 and ES5 say that a multi-line
        // block comment counts as a newline for the purposes of semicolon insertion.  Either
        // could break a restricted production, so look ahead to the next token that is neither
        // whitespace nor a comment, and if a line-break is not allowed between the last code
        // token and that one, rewrite the comment so that it contains no line-breaks.
        //
        // Implementations are inconsistent about treating such comments as newlines, but the
        // rewritten form is safe either way.
        String following = nextSignificantToken(tokens, next, count);
        if (!JsRenderUtil.canBreakBetween(prevCodeToken, following)) {
          token = removeLinebreaksFromComment(token);
          if (queuedSpace != null) {
            queuedSpace = " ";
          }
        }
      }

      // The adjacency checker is consulted for every emitted token, whether or not a space is
      // already queued.
      boolean spaceRequired = adjacency.needSpaceBefore(token);
      if (spaceRequired && queuedSpace == null) {
        queuedSpace = " ";
      }

      if (queuedSpace != null) {
        String spaceToEmit = queuedSpace;
        queuedSpace = null;
        if (spaceToEmit.charAt(0) == '\n') {
          if (JsRenderUtil.canBreakBetween(prevCodeToken, token)) {
            if (nothingEmitted) {
              // Drop the leading newline at the very start of the output.
              spaceToEmit = spaceToEmit.substring(1);
            }
          } else {
            // A line-break is not allowed here, so degrade it to a single space.
            spaceToEmit = " ";
          }
        }
        out.append(spaceToEmit);
      }

      out.append(token);
      nothingEmitted = false;
      if (!TokenClassification.isComment(token)) {
        prevCodeToken = token;
      }
    }
    out.noMoreTokens();
  }

  /**
   * Buffers a token. May receive line-break or comment tokens. Implementations may ignore comment
   * tokens, but the client is responsible for making sure that comments are well-formed, do not
   * contain code (e.g. conditional compilation code), and do not violate any containment
   * requirements, such as not containing the string {@code </script>}.
   *
   * @param text the token text; the empty string is ignored.
   */
  public final void consume(String text) {
    if (!"".equals(text)) {
      pending.add(text);
    }
  }

  /**
   * Buffers a position mark.
   *
   * @param mark a position; ignored when null or when its source is {@code InputSource.UNKNOWN}.
   */
  public final void mark(@Nullable FilePosition mark) {
    if (mark == null) {
      return;
    }
    if (InputSource.UNKNOWN.equals(mark.source())) {
      return;
    }
    pending.add(mark);
  }

  /** True for the space token {@code " "} and for tokens that start with a newline. */
  private static boolean isWhitespaceToken(String token) {
    return token.charAt(0) == '\n' || " ".equals(token);
  }

  /**
   * Finds the first token in {@code tokens[start, end)} that is not classified as a space, a
   * line-break or a comment.
   *
   * @return that token, or null if there is none.
   */
  private static String nextSignificantToken(List<String> tokens, int start, int end) {
    for (int j = start; j < end; ++j) {
      String candidate = tokens.get(j);
      switch (TokenClassification.classify(candidate)) {
        case SPACE:
        case LINEBREAK:
        case COMMENT:
          break; // keep scanning
        default:
          return candidate;
      }
    }
    return null;
  }

  /**
   * Rewrites a comment so that it contains no line terminators. A line comment is first turned
   * into a block comment.
   */
  private static String removeLinebreaksFromComment(String token) {
    String blockForm =
        TokenClassification.isLineComment(token) ? "/*" + token.substring(2) + "*/" : token;
    StringBuilder sb = new StringBuilder(blockForm);

    // Section 5.1.2 hinges on whether a MultiLineComment contains a line-terminator char, so
    // blank out every one of them.
    for (int k = 0, len = sb.length(); k < len; ++k) {
      if (JsLexer.isJsLineSeparator(sb.charAt(k))) {
        sb.setCharAt(k, ' ');
      }
    }

    // Turning a line comment into a MultiLineComment may have made an embedded */ lexically
    // significant.  Defuse every */ other than the closing one by blanking its slash.
    int searchFrom = sb.length() - 3;
    int at = sb.lastIndexOf("*/", searchFrom);
    while (at >= 0) {
      sb.setCharAt(at + 1, ' ');
      at = sb.lastIndexOf("*/", searchFrom);
    }
    return sb.toString();
  }

  /**
   * Generates a list of output tokens consisting of non-whitespace tokens, space tokens ({@code "
   * "}) and newline tokens ({@code '\n'} followed by any number of spaces).
   *
   * @param tokens a heterogeneous list containing {@code String} tokens and {@code FilePosition}
   *     marks.
   * @return the strings in tokens in order with newline and space tokens inserted as appropriate.
   */
  abstract List<String> splitTokens(List<Object> tokens);
}
Example #3
0
/**
 * A lexer that recognizes the <a href="http://www.w3.org/TR/CSS21/grammar.html#scanner">CSS 2.1
 * Grammar</a> plus line comments as interpreted by most browsers.
 *
 * <p>Tokens come from a {@link CssSplitter}; this class additionally joins a few token sequences
 * that CSS treats as a single token ({@code !important} directives and identifiers with a leading
 * {@code -}).
 *
 * <p>TODO(mikesamuel): CSS2.1 has changed lexical conventions to effectively decode escapes at lex
 * time in most contexts. E.g., the rule <code>"@import"              IMPORT_SYM</code> now reads
 * <code>@{I}{M}{P}{O}{R}{T}    {return IMPORT_SYM;}</code> and <code>{num}ms                TIME
 * </code> now reads <code>{num}{M}{S}            {return TIME;}</code>.
 *
 * @author [email protected]
 */
public final class CssLexer implements TokenStream<CssTokenType> {
  /** CSS 2.1 allows between one and six hex digits in a unicode escape. */
  private static final int MAX_HEX_ESCAPE_DIGITS = 6;

  private final CssSplitter splitter;
  /** Tokens that have been read from the splitter but not yet handed to the caller. */
  private final LinkedList<Token<CssTokenType>> pending = Lists.newLinkedList();

  // TODO(mikesamuel): all clients should pass in a proper queue
  /** Creates a lexer that discards messages and does not allow substitutions. */
  public CssLexer(CharProducer cp) {
    this(cp, DevNullMessageQueue.singleton(), false);
  }

  /**
   * @param cp the source of characters to lex; must not be null.
   * @param mq passed to the underlying splitter.
   * @param allowSubstitutions true iff ${...} style substitutions should be allowed as described at
   *     {@link CssTokenType#SUBSTITUTION}
   */
  public CssLexer(CharProducer cp, MessageQueue mq, boolean allowSubstitutions) {
    assert null != cp;
    this.splitter = new CssSplitter(cp, mq, allowSubstitutions);
  }

  /** True iff a buffered token is available or the splitter can produce another one. */
  public boolean hasNext() throws ParseException {
    return !pending.isEmpty() || splitter.hasNext();
  }

  /**
   * Returns the next token, joining multi-part tokens where necessary.
   *
   * @throws NoSuchElementException if the input is exhausted.
   */
  public Token<CssTokenType> next() throws ParseException {
    produce();
    // pending is never null, so emptiness is the condition that signals exhausted input.
    if (pending.isEmpty()) {
      throw new NoSuchElementException();
    }
    return pending.removeFirst();
  }

  /**
   * True iff ${...} style substitutions should be allowed as described at {@link
   * CssTokenType#SUBSTITUTION}
   *
   * @see #allowSubstitutions(boolean)
   */
  public boolean areSubstitutionsAllowed() {
    return splitter.areSubstitutionsAllowed();
  }

  /**
   * Changes the substitution policy for this lexer.
   *
   * @see #areSubstitutionsAllowed()
   */
  public void allowSubstitutions(boolean allow) {
    splitter.allowSubstitutions(allow);
  }

  /**
   * Decodes escapes in an identifier.
   *
   * <ul>
   *   <li>A backslash followed by one to six hex digits decodes to the code point with that value,
   *       or to U+FFFD if the value is above {@link Character#MAX_CODE_POINT}. A single
   *       whitespace character directly after the digits is consumed as part of the escape.
   *   <li>A backslash followed by any other character decodes to that character.
   *   <li>A backslash at the very end of the input decodes to U+FFFD.
   * </ul>
   *
   * @param ident the raw identifier text.
   * @return the identifier with all escapes decoded.
   */
  public static String decodeCssIdentifier(CharSequence ident) {
    StringBuilder sb = null;
    int pos = 0; // start of the run of literal characters not yet copied to sb
    for (int i = 0, n = ident.length(); i < n; ) {
      if (ident.charAt(i) != '\\') {
        ++i;
        continue;
      }
      if (sb == null) {
        sb = new StringBuilder();
      }
      sb.append(ident, pos, i);
      ++i; // step over the backslash

      // Capping the digit count follows the grammar and also keeps the value within an int, so
      // appendCodePoint can never be handed a negative (overflowed) value.
      int codepoint = 0;
      int nDigits = 0;
      while (i < n && nDigits < MAX_HEX_ESCAPE_DIGITS && isHexChar(ident.charAt(i))) {
        codepoint = (codepoint << 4) | Character.digit(ident.charAt(i), 16);
        ++i;
        ++nDigits;
      }

      if (nDigits == 0) {
        if (i < n) {
          // Non-hex escape: the escaped character stands for itself.  Leave it at the start of
          // the next literal run and step over it so that an escaped backslash does not start
          // another escape.
          pos = i;
          ++i;
        } else {
          // Dangling backslash at the end of the input.
          sb.append('\ufffd');
          pos = i;
        }
        continue;
      }

      sb.appendCodePoint(codepoint <= Character.MAX_CODE_POINT ? codepoint : 0xfffd);
      // A single whitespace character terminates a hex escape and is not part of the identifier.
      if (i < n && isSpaceChar(ident.charAt(i))) {
        ++i;
      }
      pos = i;
    }
    if (sb == null) {
      return ident.toString();
    }
    return sb.append(ident, pos, ident.length()).toString();
  }

  /**
   * Tests whether a character can start a name, per the CSS 2.1 productions
   *
   * <pre>
   * nmstart    [_a-z]|{nonascii}|{escape}
   * nonascii   [\200-\377]
   * </pre>
   *
   * Letters are matched regardless of case.
   *
   * @return true iff ch is a nmstart and is not an escape. Call {@link #decodeCssIdentifier} before
   *     this method to figure out whether an escape sequence is a nmstart
   */
  public static boolean isNmStart(char ch) {
    return (ch >= 'a' && ch <= 'z')
        || (ch >= 'A' && ch <= 'Z')
        || (ch >= 0200 && ch <= 0377)
        || ch == '_';
  }

  /**
   * If the character producer has not been exhausted, ensures that there is a token on pending.
   *
   * <p>When the token read is a {@code !} or {@code -} punctuation token, the tokens that follow
   * are also read so that {@code !important} and {@code -ident} can be joined into one token.
   */
  private void produce() throws ParseException {
    if (!pending.isEmpty()) {
      return;
    }
    if (!splitter.hasNext()) {
      return;
    }

    Token<CssTokenType> t = splitter.next();
    pending.add(t);
    if (t.type == CssTokenType.PUNCTUATION && splitter.hasNext()) {
      if ("!".equals(t.text)) { // Join !important
        // IMPORTANT_SYM        "!"({w}|{comment})*{I}{M}{P}{O}{R}{T}{A}{N}{T}
        Token<CssTokenType> t2 = splitter.next();
        while (t2 != null && (t2.type == CssTokenType.SPACE || t2.type == CssTokenType.COMMENT)) {
          pending.add(t2);
          t2 = splitter.hasNext() ? splitter.next() : null;
        }
        // The !important is significant regardless of case and whether or not a
        // letter is escaped.
        if (null != t2) {
          pending.add(t2);
          if (t2.type == CssTokenType.IDENT
              && Strings.eqIgnoreCase("important", decodeCssIdentifier(t2.text))) {
            reduce(CssTokenType.DIRECTIVE);
          }
        }
      } else if ("-".equals(t.text)) { // Join '-'{nmstart}{nmchar}*
        Token<CssTokenType> t2 = splitter.next();
        if (null != t2) {
          pending.add(t2);
          if (t2.type == CssTokenType.IDENT) {
            reduce(CssTokenType.IDENT);
          }
        }
      }
    }
  }

  /**
   * Reduces the pending tokens to a single token with the given type. For example, if the pending
   * list contains an identifier followed by an open parenthesis, then it can be reduced to a single
   * function token. This is necessitated by CSS2's odd lexical convention which classifies as
   * single tokens things that most other languages treat as sequences of primitive tokens.
   *
   * <p>Modifies the pending list in place. The pending list must not be empty.
   *
   * @param type the type of the combined token.
   */
  private void reduce(CssTokenType type) {
    StringBuilder sb = new StringBuilder();
    for (Token<CssTokenType> t : pending) {
      sb.append(t.text);
    }
    FilePosition fp = FilePosition.span(pending.getFirst().pos, pending.getLast().pos);
    pending.clear();
    pending.add(Token.instance(sb.toString(), type, fp));
  }

  /** Is the given character a whitespace character according to the CSS 2 spec. */
  public static boolean isSpaceChar(char ch) {
    // s      [ \t\r\n\f]+
    // w      {s}?
    switch (ch) {
      case ' ':
      case '\t':
      case '\r':
      case '\n':
      case '\f':
        return true;
      default:
        return false;
    }
  }

  /** Is the given character a hex digit? */
  public static boolean isHexChar(char ch) {
    // h     [0-9a-f]
    return (ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F');
  }
}