Example #1
0
  /**
   * Sanitizes an HTML fragment against a fixed whitelist using {@code Jsoup.clean}.
   *
   * <p>The whitelist is built from scratch on every call and permits only the tags
   * {@code txt}, {@code p}, {@code table}, {@code tbody}, {@code tr}, {@code td} and
   * {@code img}; the only attribute permitted is {@code src} on {@code img}.
   *
   * <p>NOTE(review): no protocol restriction is registered for {@code img src}, so the
   * attribute value does not appear to be limited to particular URL schemes — confirm this
   * is intended.
   *
   * @param rs the raw HTML fragment to sanitize
   * @return the result of {@code Jsoup.clean} for {@code rs} and the whitelist above
   */
  protected String cleaner(String rs) {
    // Tag and attribute registrations are independent of each other, so they are
    // expressed as a single fluent chain.
    final Whitelist permitted =
        new Whitelist()
            .addTags("txt", "p", "table", "tbody", "tr", "td", "img")
            .addAttributes("img", "src");

    return Jsoup.clean(rs, permitted);
  }
Example #2
0
 static {
   // 增加可信标签到白名单
   content_filter.addTags("embed", "object", "param", "div", "font", "del");
   // 增加可信属性
   content_filter.addAttributes(":all", "style", "class", "id", "name", "on");
   content_filter.addAttributes("object", "width", "height", "classid", "codebase");
   content_filter.addAttributes("param", "name", "value");
   content_filter.addAttributes(
       "embed",
       "src",
       "quality",
       "width",
       "height",
       "allowFullScreen",
       "allowScriptAccess",
       "flashvars",
       "name",
       "type",
       "pluginspage");
 }
Example #3
0
  /**
   * Extracts an article with {@code Readability}, sanitizes the extracted HTML with jsoup's
   * relaxed whitelist (plus {@code figure}), wraps the first {@code <div>} of the sanitized
   * markup in a freshly built {@code html}/{@code head} skeleton, and logs the resulting
   * document at debug level.
   *
   * <p>NOTE(review): the test depends on a hard-coded external URL and makes no assertions
   * about the extracted content beyond requiring that a {@code <div>} is present.
   *
   * @throws IOException if the URL is malformed; presumably also propagated from the
   *     extractor when the page cannot be fetched (not visible here — confirm)
   */
  @Test
  public void shouldCleanHTMLContentReadability() throws IOException {
    // Alternative sample article: http://trib.al/KsvH2JE
    final URL url = new URL("http://tcrn.ch/1NxNAZJ");

    final Readability extractor = new Readability(url, 10000);
    extractor.init();
    final String content = extractor.html();

    // Relaxed whitelist; add additional tags here as necessary.
    final Whitelist wl = Whitelist.relaxed();
    wl.addTags("figure");
    final String clean = Jsoup.clean(content, wl);

    final Document mDocument = new Document("");
    final Element html = mDocument.createElement("html");
    final Element head = mDocument.createElement("head");
    final Element body = mDocument.createElement("body");

    html.appendChild(head);

    // Only the first <div> of the sanitized markup is attached to the document.
    // first() yields null when nothing matches, so fail with a clear message instead of
    // handing null to appendChild.
    final Element firstDiv = body.append(clean).select("div").first();
    if (firstDiv == null) {
      throw new AssertionError("Sanitized article content contains no <div> element for " + url);
    }
    html.appendChild(firstDiv);
    mDocument.appendChild(html);

    LOG.debug("Article content : {}", mDocument.html());
  }