/**
 * Sanitizes an HTML fragment against a small, purpose-built whitelist.
 *
 * <p>A fresh {@code Whitelist} is created on every call. The tags {@code txt}, {@code p},
 * {@code table}, {@code tbody}, {@code tr}, {@code td} and {@code img} are added to it, and
 * the {@code src} attribute is registered for {@code img}.
 *
 * @param rs the HTML fragment to sanitize
 * @return whatever {@code Jsoup.clean} produces for {@code rs} under that whitelist
 */
protected String cleaner(String rs) {
    final Whitelist allowed = new Whitelist()
            .addTags("txt", "p", "table", "tbody", "tr", "td", "img")
            .addAttributes("img", "src");
    return Jsoup.clean(rs, allowed);
}
static {
    // Register additional trusted tags on the shared content filter.
    content_filter.addTags("embed", "object", "param", "div", "font", "del");

    // Register additional trusted attributes: tag-specific ones first ...
    content_filter.addAttributes(
            "embed",
            "src", "quality", "width", "height",
            "allowFullScreen", "allowScriptAccess",
            "flashvars", "name", "type", "pluginspage");
    content_filter.addAttributes("param", "name", "value");
    content_filter.addAttributes("object", "width", "height", "classid", "codebase");

    // ... then the ones registered under the ":all" key.
    // NOTE(review): "on" is registered as a literal attribute name; presumably it was meant
    // to cover event handlers, which it would not do if names are matched exactly - confirm.
    content_filter.addAttributes(":all", "style", "class", "id", "name", "on");
}
@Test public void shouldCleanHTMLContentReadability() throws IOException { // http://trib.al/KsvH2JE // http://tcrn.ch/1NxNAZJ String urll = "http://tcrn.ch/1NxNAZJ"; URL url = new URL(urll); final Readability extractor = new Readability(url, 10000); extractor.init(); String content = extractor.html(); Whitelist wl = Whitelist.relaxed(); // add additional tags here as necessary wl.addTags("figure"); String clean = Jsoup.clean(content, wl); Document mDocument = new Document(""); Element html = mDocument.createElement("html"); Element head = mDocument.createElement("head"); Element body = mDocument.createElement("body"); html.appendChild(head); body = body.append(clean).select("div").first(); html.appendChild(body); mDocument.appendChild(html); // Document document = Jsoup.parse(content); // Element head = document.head(); // String style = "<STYLE type=\"text/css\">"+ // "blockquote{"+ // " display:block;"+ // " background: #fff;"+ // " padding: 15px 20px 15px 45px;"+ // " margin: 0 0 20px;"+ // " position: relative;"+ // " "+ // " /*Font*/"+ // " font-family: Georgia, serif;"+ // " font-size: 16px;"+ // " line-height: 1.2;"+ // " color: #666;"+ // " text-align: justify;"+ // " "+ // " /*Borders - (Optional)*/"+ // " border-left: 15px solid #76AABA;"+ // " border-right: 2px solid #76AABA;"+ // " "+ // " /*Box Shadow - (Optional)*/"+ // " -moz-box-shadow: 2px 2px 15px #ccc;"+ // " -webkit-box-shadow: 2px 2px 15px #ccc;"+ // " box-shadow: 2px 2px 15px #ccc;"+ // "}"+ // ""+ // "blockquote::before{"+ // " content: \"\\201C\"; /*Unicode for Left Double Quote*/"+ // " "+ // " /*Font*/"+ // " font-family: Georgia, serif;"+ // " font-size: 60px;"+ // " font-weight: bold;"+ // " color: #999;"+ // " "+ // " /*Positioning*/"+ // " position: absolute;"+ // " left: 10px;"+ // " top:5px;"+ // "}"+ // ""+ // "blockquote::after{"+ // " /*Reset to make sure"+ // " content: \"\";*/"+ // " "+ // " content: \"\\201D\"; /*Unicode for Left Double Quote*/"+ // " "+ // " /*Font*/"+ // " 
font-family: Georgia, serif;"+ // " font-size: 60px;"+ // " font-weight: bold;"+ // " color: #999;"+ // " "+ // " /*Positioning*/"+ // " position: absolute;"+ // " right: 10px;"+ // " bottom:5px;"+ // "}"+ // ""+ // "blockquote a{"+ // " text-decoration: none;"+ // " background: #eee;"+ // " cursor: pointer;"+ // " padding: 0 3px;"+ // " color: #76AABA;"+ // "}"+ // ""+ // "blockquote a:hover{"+ // " color: #666;"+ // "}"+ // ""+ // "blockquote em{"+ // " font-style: italic;"+ // "}"+ // "</STYLE>"; // head.append(style); // // content = document.html(); LOG.debug("Article content : {}", mDocument.html()); // LOG.debug("Article content outer: {}", contentOuter); }