/**
 * Checks {@link UniqueIdFilter}: when the "uniqueId" filter is appended to the default filters, a second
 * element reusing the id {@code x} is expected to come out with the id {@code x0}, while the first
 * occurrence and the distinct id {@code xy} are left untouched.
 *
 * @throws Exception if the component under test or the filter cannot be looked up, or cleaning fails
 */
@Test
public void duplicateIds() throws Exception {
    String input = "<p id=\"x\">1</p><p id=\"xy\">2</p><p id=\"x\">3</p>";
    String expected = "<p id=\"x\">1</p><p id=\"xy\">2</p><p id=\"x0\">3</p>";

    // Start from the default configuration and append the filter under test to its filter list.
    HTMLCleanerConfiguration configuration = this.mocker.getComponentUnderTest().getDefaultConfiguration();
    List<HTMLFilter> filtersWithUniqueId = new ArrayList<HTMLFilter>(configuration.getFilters());
    HTMLFilter uniqueIdFilter = this.mocker.<HTMLFilter>getInstance(HTMLFilter.class, "uniqueId");
    filtersWithUniqueId.add(uniqueIdFilter);
    configuration.setFilters(filtersWithUniqueId);

    // Clean the input with the extended configuration and compare the serialized result.
    String cleaned = HTMLUtils.toString(
        this.mocker.getComponentUnderTest().clean(new StringReader(input), configuration));

    Assert.assertEquals(HEADER_FULL + expected + FOOTER, cleaned);
}
@Override public Document clean(Reader originalHtmlContent, HTMLCleanerConfiguration configuration) { Document result; // Note: Instantiation of an HtmlCleaner object is cheap so there's no need to cache an instance // of it, // especially since this makes it extra safe with regards to multithreading (even though HTML // Cleaner is // already supposed to be thread safe). CleanerProperties cleanerProperties = getDefaultCleanerProperties(configuration); HtmlCleaner cleaner = new HtmlCleaner(cleanerProperties); cleaner.setTransformations(getDefaultCleanerTransformations(configuration)); TagNode cleanedNode; try { cleanedNode = cleaner.clean(originalHtmlContent); } catch (Exception e) { // This shouldn't happen since we're not doing any IO... I consider this a flaw in the design // of HTML // Cleaner. throw new RuntimeException("Unhandled error when cleaning HTML", e); } // Serialize the cleanedNode TagNode into a w3c dom. Ideally following code should be enough. // But SF's HTML Cleaner seems to omit the DocType declaration while serializing. // See // https://sourceforge.net/tracker/index.php?func=detail&aid=2062318&group_id=183053&atid=903696 // cleanedNode.setDocType(new DoctypeToken("html", "PUBLIC", "-//W3C//DTD XHTML 1.0 // Strict//EN", // "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd")); // try { // result = new DomSerializer(cleanerProperties, false).createDOM(cleanedNode); // } catch(ParserConfigurationException ex) { } // As a workaround, we must serialize the cleanedNode into a temporary w3c document, create a // new w3c document // with proper DocType declaration and move the root node from the temporary document to the new // one. try { // Since there's a bug in SF's HTML Cleaner in that it doesn't recognize CDATA blocks we need // to turn off // character escaping (hence the false value passed) and do the escaping in // XMLUtils.toString(). 
Note that // this can cause problem for code not serializing the W3C DOM to a String since it won't have // the // characters escaped. // See // https://sourceforge.net/tracker/index.php?func=detail&aid=2691888&group_id=183053&atid=903696 Document tempDoc = new XWikiDOMSerializer(cleanerProperties, false).createDOM(cleanedNode); DOMImplementation domImpl = DocumentBuilderFactory.newInstance().newDocumentBuilder().getDOMImplementation(); DocumentType docType = domImpl.createDocumentType( QUALIFIED_NAME_HTML, "-//W3C//DTD XHTML 1.0 Strict//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"); result = domImpl.createDocument(null, QUALIFIED_NAME_HTML, docType); result.replaceChild( result.adoptNode(tempDoc.getDocumentElement()), result.getDocumentElement()); } catch (ParserConfigurationException ex) { throw new RuntimeException("Error while serializing TagNode into w3c dom.", ex); } // Finally apply filters. for (HTMLFilter filter : configuration.getFilters()) { filter.filter(result, configuration.getParameters()); } return result; }