/**
 * Builds a fresh default configuration whose filter list is, in this order: body, list item, list, font and
 * attribute filter.
 *
 * @return a new {@link DefaultHTMLCleanerConfiguration} pre-populated with the default filters
 */
@Override
public HTMLCleanerConfiguration getDefaultConfiguration() {
    HTMLCleanerConfiguration defaults = new DefaultHTMLCleanerConfiguration();
    // The order of the list is preserved as given here.
    defaults.setFilters(
        Arrays.asList(
            this.bodyFilter,
            this.listItemFilter,
            this.listFilter,
            this.fontFilter,
            this.attributeFilter));
    return defaults;
}
/**
 * Test {@link UniqueIdFilter}: when the "uniqueId" filter is appended to the default filters, the second
 * element carrying the id {@code x} is expected to come out with the id {@code x0}.
 */
@Test
public void duplicateIds() throws Exception {
    String input = "<p id=\"x\">1</p><p id=\"xy\">2</p><p id=\"x\">3</p>";
    String expectedBody = "<p id=\"x\">1</p><p id=\"xy\">2</p><p id=\"x0\">3</p>";

    // Start from the default filters and append the filter under test at the end.
    HTMLCleanerConfiguration configuration =
        this.mocker.getComponentUnderTest().getDefaultConfiguration();
    List<HTMLFilter> filtersWithUniqueId = new ArrayList<HTMLFilter>(configuration.getFilters());
    filtersWithUniqueId.add(this.mocker.<HTMLFilter>getInstance(HTMLFilter.class, "uniqueId"));
    configuration.setFilters(filtersWithUniqueId);

    String cleaned =
        HTMLUtils.toString(
            this.mocker.getComponentUnderTest().clean(new StringReader(input), configuration));

    Assert.assertEquals(HEADER_FULL + expectedBody + FOOTER, cleaned);
}
/** Verify that we can control what filters are used for cleaning. */ @Test public void explicitFilterList() throws ComponentLookupException { HTMLCleanerConfiguration configuration = this.mocker.getComponentUnderTest().getDefaultConfiguration(); configuration.setFilters(Collections.<HTMLFilter>emptyList()); String result = HTMLUtils.toString( this.mocker .getComponentUnderTest() .clean(new StringReader("something"), configuration)); // Note that if the default Body filter had been executed the result would have been: // <p>something</p>. Assert.assertEquals(HEADER_FULL + "something" + FOOTER, result); }
/**
 * Verify that the restricted parameter works: with {@code restricted=true}, both {@code <script>} and
 * {@code <style>} elements are expected to be output as {@code <pre>} elements.
 */
@Test
public void restrictedHtml() throws ComponentLookupException {
    HTMLCleanerConfiguration configuration =
        this.mocker.getComponentUnderTest().getDefaultConfiguration();

    // Copy the default parameters and switch the restricted mode on.
    Map<String, String> restrictedParameters =
        new HashMap<String, String>(configuration.getParameters());
    restrictedParameters.put("restricted", "true");
    configuration.setParameters(restrictedParameters);

    String scriptResult =
        HTMLUtils.toString(
            this.mocker
                .getComponentUnderTest()
                .clean(new StringReader("<script>alert(\"foo\")</script>"), configuration));
    Assert.assertEquals(HEADER_FULL + "<pre>alert(\"foo\")</pre>" + FOOTER, scriptResult);

    String styleResult =
        HTMLUtils.toString(
            this.mocker
                .getComponentUnderTest()
                .clean(new StringReader("<style>p {color:white;}</style>"), configuration));
    Assert.assertEquals(HEADER_FULL + "<pre>p {color:white;}</pre>" + FOOTER, styleResult);
}
/** * @param configuration the configuration to use for the cleaning * @return the default {@link CleanerProperties} to be used for cleaning. */ private CleanerProperties getDefaultCleanerProperties(HTMLCleanerConfiguration configuration) { CleanerProperties defaultProperties = new CleanerProperties(); defaultProperties.setOmitUnknownTags(true); // HTML Cleaner uses the compact notation by default but we don't want that since: // - it's more work and not required since not compact notation is valid XHTML // - expanded elements can also be rendered fine in browsers that only support HTML. defaultProperties.setUseEmptyElementTags(false); // Wrap script and style content in CDATA blocks defaultProperties.setUseCdataForScriptAndStyle(true); // If the caller has defined NAMESPACE_AWARE configuration property then use it, otherwise use // our default. String param = configuration.getParameters().get(HTMLCleanerConfiguration.NAMESPACES_AWARE); boolean namespacesAware = (param != null) ? Boolean.parseBoolean(param) : true; defaultProperties.setNamespacesAware(namespacesAware); return defaultProperties; }
/**
 * Builds the tag transformations applied on top of HTML Cleaner's own ones: a fixed set of tag renames, the
 * {@code center} to styled {@code p} conversion and, when the {@link HTMLCleanerConfiguration#RESTRICTED}
 * parameter is "true" (case-insensitive), the {@code script}/{@code style} to {@code pre} renames.
 *
 * @param configuration The cleaner configuration.
 * @return the default cleaning transformations to perform on tags, in addition to the base
 *     transformations done by HTML Cleaner
 */
private CleanerTransformations getDefaultCleanerTransformations(
    HTMLCleanerConfiguration configuration) {
    CleanerTransformations transformations = new CleanerTransformations();

    // Plain renames as {source tag, destination tag} pairs, registered in this order.
    // NOTE(review): the third constructor argument is false for every transformation; presumably it is
    // HtmlCleaner's "preserve source attributes" flag - confirm against the TagTransformation API.
    String[][] simpleRenames = {
        {HTMLConstants.TAG_B, HTMLConstants.TAG_STRONG},
        {HTMLConstants.TAG_I, HTMLConstants.TAG_EM},
        {HTMLConstants.TAG_U, HTMLConstants.TAG_INS},
        {HTMLConstants.TAG_S, HTMLConstants.TAG_DEL},
        {HTMLConstants.TAG_STRIKE, HTMLConstants.TAG_DEL}
    };
    for (String[] rename : simpleRenames) {
        transformations.addTransformation(new TagTransformation(rename[0], rename[1], false));
    }

    // <center> becomes a paragraph carrying an explicit centering style.
    TagTransformation centerToParagraph =
        new TagTransformation(HTMLConstants.TAG_CENTER, HTMLConstants.TAG_P, false);
    centerToParagraph.addAttributeTransformation(HTMLConstants.ATTRIBUTE_STYLE, "text-align:center");
    transformations.addTransformation(centerToParagraph);

    // In restricted mode, script and style elements are turned into <pre> elements.
    String restricted = configuration.getParameters().get(HTMLCleanerConfiguration.RESTRICTED);
    if (Boolean.parseBoolean(restricted)) {
        transformations.addTransformation(
            new TagTransformation(HTMLConstants.TAG_SCRIPT, HTMLConstants.TAG_PRE, false));
        transformations.addTransformation(
            new TagTransformation(HTMLConstants.TAG_STYLE, HTMLConstants.TAG_PRE, false));
    }

    return transformations;
}
@Override public Document clean(Reader originalHtmlContent, HTMLCleanerConfiguration configuration) { Document result; // Note: Instantiation of an HtmlCleaner object is cheap so there's no need to cache an instance // of it, // especially since this makes it extra safe with regards to multithreading (even though HTML // Cleaner is // already supposed to be thread safe). CleanerProperties cleanerProperties = getDefaultCleanerProperties(configuration); HtmlCleaner cleaner = new HtmlCleaner(cleanerProperties); cleaner.setTransformations(getDefaultCleanerTransformations(configuration)); TagNode cleanedNode; try { cleanedNode = cleaner.clean(originalHtmlContent); } catch (Exception e) { // This shouldn't happen since we're not doing any IO... I consider this a flaw in the design // of HTML // Cleaner. throw new RuntimeException("Unhandled error when cleaning HTML", e); } // Serialize the cleanedNode TagNode into a w3c dom. Ideally following code should be enough. // But SF's HTML Cleaner seems to omit the DocType declaration while serializing. // See // https://sourceforge.net/tracker/index.php?func=detail&aid=2062318&group_id=183053&atid=903696 // cleanedNode.setDocType(new DoctypeToken("html", "PUBLIC", "-//W3C//DTD XHTML 1.0 // Strict//EN", // "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd")); // try { // result = new DomSerializer(cleanerProperties, false).createDOM(cleanedNode); // } catch(ParserConfigurationException ex) { } // As a workaround, we must serialize the cleanedNode into a temporary w3c document, create a // new w3c document // with proper DocType declaration and move the root node from the temporary document to the new // one. try { // Since there's a bug in SF's HTML Cleaner in that it doesn't recognize CDATA blocks we need // to turn off // character escaping (hence the false value passed) and do the escaping in // XMLUtils.toString(). 
Note that // this can cause problem for code not serializing the W3C DOM to a String since it won't have // the // characters escaped. // See // https://sourceforge.net/tracker/index.php?func=detail&aid=2691888&group_id=183053&atid=903696 Document tempDoc = new XWikiDOMSerializer(cleanerProperties, false).createDOM(cleanedNode); DOMImplementation domImpl = DocumentBuilderFactory.newInstance().newDocumentBuilder().getDOMImplementation(); DocumentType docType = domImpl.createDocumentType( QUALIFIED_NAME_HTML, "-//W3C//DTD XHTML 1.0 Strict//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"); result = domImpl.createDocument(null, QUALIFIED_NAME_HTML, docType); result.replaceChild( result.adoptNode(tempDoc.getDocumentElement()), result.getDocumentElement()); } catch (ParserConfigurationException ex) { throw new RuntimeException("Error while serializing TagNode into w3c dom.", ex); } // Finally apply filters. for (HTMLFilter filter : configuration.getFilters()) { filter.filter(result, configuration.getParameters()); } return result; }