@Override
 public HTMLCleanerConfiguration getDefaultConfiguration() {
   // The default configuration registers the body, list item, list, font and attribute
   // filters, in that order. The list is a fixed-size view: copy it before adding filters.
   HTMLCleanerConfiguration defaults = new DefaultHTMLCleanerConfiguration();
   defaults.setFilters(
       Arrays.<HTMLFilter>asList(
           this.bodyFilter,
           this.listItemFilter,
           this.listFilter,
           this.fontFilter,
           this.attributeFilter));
   return defaults;
 }
 /**
  * Test {@link UniqueIdFilter}: with the "uniqueId" filter appended to the default filters, a
  * repeated {@code id} value is expected to be renamed in the cleaned output.
  */
 @Test
 public void duplicateIds() throws Exception {
   String input = "<p id=\"x\">1</p><p id=\"xy\">2</p><p id=\"x\">3</p>";
   String expected = "<p id=\"x\">1</p><p id=\"xy\">2</p><p id=\"x0\">3</p>";

   // Work on a copy of the default filters so that the extra filter can be appended.
   HTMLCleanerConfiguration configuration =
       this.mocker.getComponentUnderTest().getDefaultConfiguration();
   List<HTMLFilter> extendedFilters = new ArrayList<HTMLFilter>();
   extendedFilters.addAll(configuration.getFilters());
   extendedFilters.add(this.mocker.<HTMLFilter>getInstance(HTMLFilter.class, "uniqueId"));
   configuration.setFilters(extendedFilters);

   String cleaned =
       HTMLUtils.toString(
           this.mocker.getComponentUnderTest().clean(new StringReader(input), configuration));
   Assert.assertEquals(HEADER_FULL + expected + FOOTER, cleaned);
 }
 /** Check that the caller decides which filters run: with an empty list, none is applied. */
 @Test
 public void explicitFilterList() throws ComponentLookupException {
   HTMLCleanerConfiguration noFilterConfig =
       this.mocker.getComponentUnderTest().getDefaultConfiguration();
   noFilterConfig.setFilters(Collections.<HTMLFilter>emptyList());

   StringReader input = new StringReader("something");
   String cleaned =
       HTMLUtils.toString(this.mocker.getComponentUnderTest().clean(input, noFilterConfig));

   // Had the default body filter been executed, the result would have been <p>something</p>.
   Assert.assertEquals(HEADER_FULL + "something" + FOOTER, cleaned);
 }
  /**
   * Check the "restricted" parameter: script and style elements are expected to come out of the
   * cleaning as pre elements.
   */
  @Test
  public void restrictedHtml() throws ComponentLookupException {
    HTMLCleanerConfiguration restrictedConfig =
        this.mocker.getComponentUnderTest().getDefaultConfiguration();

    // Keep the default parameters and only switch the restricted mode on.
    Map<String, String> restrictedParameters =
        new HashMap<String, String>(restrictedConfig.getParameters());
    restrictedParameters.put("restricted", "true");
    restrictedConfig.setParameters(restrictedParameters);

    String script = "<script>alert(\"foo\")</script>";
    String cleanedScript =
        HTMLUtils.toString(
            this.mocker.getComponentUnderTest().clean(new StringReader(script), restrictedConfig));
    Assert.assertEquals(HEADER_FULL + "<pre>alert(\"foo\")</pre>" + FOOTER, cleanedScript);

    String style = "<style>p {color:white;}</style>";
    String cleanedStyle =
        HTMLUtils.toString(
            this.mocker.getComponentUnderTest().clean(new StringReader(style), restrictedConfig));
    Assert.assertEquals(HEADER_FULL + "<pre>p {color:white;}</pre>" + FOOTER, cleanedStyle);
  }
  /**
   * Build the HTML Cleaner settings used for a cleaning run.
   *
   * @param configuration the configuration to use for the cleaning; its {@link
   *     HTMLCleanerConfiguration#NAMESPACES_AWARE} parameter, when set, overrides the default
   *     namespace handling
   * @return the default {@link CleanerProperties} to be used for cleaning
   */
  private CleanerProperties getDefaultCleanerProperties(HTMLCleanerConfiguration configuration) {
    CleanerProperties properties = new CleanerProperties();
    properties.setOmitUnknownTags(true);

    // HTML Cleaner defaults to the compact notation for empty elements. We turn it off: the
    // expanded notation is valid XHTML too, and it also renders fine in browsers that only
    // support HTML.
    properties.setUseEmptyElementTags(false);

    // Script and style content gets wrapped in CDATA blocks.
    properties.setUseCdataForScriptAndStyle(true);

    // Namespace awareness stays on unless the caller provided a NAMESPACES_AWARE parameter, in
    // which case the parsed value of that parameter wins.
    String requested = configuration.getParameters().get(HTMLCleanerConfiguration.NAMESPACES_AWARE);
    properties.setNamespacesAware(requested == null || Boolean.parseBoolean(requested));

    return properties;
  }
  /**
   * Build the tag transformations applied on top of the base transformations done by HTML
   * Cleaner.
   *
   * @param configuration the cleaner configuration; when its {@link
   *     HTMLCleanerConfiguration#RESTRICTED} parameter is "true" (case-insensitive), two extra
   *     transformations are registered for the script and style tags
   * @return the default cleaning transformations to perform on tags
   */
  private CleanerTransformations getDefaultCleanerTransformations(
      HTMLCleanerConfiguration configuration) {
    CleanerTransformations transformations = new CleanerTransformations();

    // Plain tag renames, as {source tag, target tag} pairs, registered in this order.
    String[][] tagRenames = {
      {HTMLConstants.TAG_B, HTMLConstants.TAG_STRONG},
      {HTMLConstants.TAG_I, HTMLConstants.TAG_EM},
      {HTMLConstants.TAG_U, HTMLConstants.TAG_INS},
      {HTMLConstants.TAG_S, HTMLConstants.TAG_DEL},
      {HTMLConstants.TAG_STRIKE, HTMLConstants.TAG_DEL}
    };
    for (String[] rename : tagRenames) {
      transformations.addTransformation(new TagTransformation(rename[0], rename[1], false));
    }

    // The center tag is turned into a paragraph tag that carries the centering as a style
    // attribute.
    TagTransformation centerToParagraph =
        new TagTransformation(HTMLConstants.TAG_CENTER, HTMLConstants.TAG_P, false);
    centerToParagraph.addAttributeTransformation(
        HTMLConstants.ATTRIBUTE_STYLE, "text-align:center");
    transformations.addTransformation(centerToParagraph);

    // In restricted mode the script and style tags are both mapped to the pre tag.
    String restricted = configuration.getParameters().get(HTMLCleanerConfiguration.RESTRICTED);
    if (Boolean.parseBoolean(restricted)) {
      transformations.addTransformation(
          new TagTransformation(HTMLConstants.TAG_SCRIPT, HTMLConstants.TAG_PRE, false));
      transformations.addTransformation(
          new TagTransformation(HTMLConstants.TAG_STYLE, HTMLConstants.TAG_PRE, false));
    }

    return transformations;
  }
  /**
   * Clean the HTML read from the passed reader into a W3C DOM document declared with the XHTML
   * 1.0 Strict doctype, then run the configured filters on that document.
   *
   * @param originalHtmlContent the reader providing the HTML content to clean
   * @param configuration the configuration providing the cleaning parameters and the filters to
   *     apply, in list order, on the cleaned document
   * @return the cleaned and filtered document
   * @throws RuntimeException if HTML Cleaner fails or if the DOM serialization cannot be set up
   */
  @Override
  public Document clean(Reader originalHtmlContent, HTMLCleanerConfiguration configuration) {
    // An HtmlCleaner instance is cheap to create, so one is built per call instead of being
    // cached. This is also extra safe with regards to multithreading (even though HTML Cleaner
    // is already supposed to be thread safe).
    CleanerProperties properties = getDefaultCleanerProperties(configuration);
    HtmlCleaner htmlCleaner = new HtmlCleaner(properties);
    htmlCleaner.setTransformations(getDefaultCleanerTransformations(configuration));

    TagNode rootNode;
    try {
      rootNode = htmlCleaner.clean(originalHtmlContent);
    } catch (Exception e) {
      // Not expected to happen since no IO is involved; having to handle it here is considered
      // a flaw in the design of HTML Cleaner.
      throw new RuntimeException("Unhandled error when cleaning HTML", e);
    }

    // Turn the TagNode into a W3C DOM. Ideally it would be enough to set a DoctypeToken for
    // "-//W3C//DTD XHTML 1.0 Strict//EN" on the node and to call
    // new DomSerializer(properties, false).createDOM(rootNode), but SF's HTML Cleaner seems to
    // omit the DocType declaration while serializing. See
    // https://sourceforge.net/tracker/index.php?func=detail&aid=2062318&group_id=183053&atid=903696
    // Workaround: serialize into a temporary document, create a new document with the proper
    // DocType declaration, and move the root node from the temporary document to the new one.
    Document cleanedDocument;
    try {
      // SF's HTML Cleaner has a bug: it doesn't recognize CDATA blocks. Character escaping is
      // therefore turned off (the false value passed below) and done in XMLUtils.toString()
      // instead. Note that this can cause problems for code that doesn't serialize the W3C DOM
      // to a String, since the characters won't be escaped for it. See
      // https://sourceforge.net/tracker/index.php?func=detail&aid=2691888&group_id=183053&atid=903696
      Document temporaryDocument = new XWikiDOMSerializer(properties, false).createDOM(rootNode);

      DOMImplementation implementation =
          DocumentBuilderFactory.newInstance().newDocumentBuilder().getDOMImplementation();
      DocumentType xhtmlStrict =
          implementation.createDocumentType(
              QUALIFIED_NAME_HTML,
              "-//W3C//DTD XHTML 1.0 Strict//EN",
              "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd");
      cleanedDocument = implementation.createDocument(null, QUALIFIED_NAME_HTML, xhtmlStrict);

      // Swap the empty root element of the new document for the cleaned root element.
      cleanedDocument.replaceChild(
          cleanedDocument.adoptNode(temporaryDocument.getDocumentElement()),
          cleanedDocument.getDocumentElement());
    } catch (ParserConfigurationException ex) {
      throw new RuntimeException("Error while serializing TagNode into w3c dom.", ex);
    }

    // Last step: let every configured filter process the document, in list order.
    for (HTMLFilter htmlFilter : configuration.getFilters()) {
      htmlFilter.filter(cleanedDocument, configuration.getParameters());
    }

    return cleanedDocument;
  }