public ContentModel view(String url) { ContentModel model = new ContentModel(); try { NodeFilter filter = new TagNameFilter("html"); Parser parser = new Parser(); parser.setURL(SearchHelper.decrypt(url)); parser.setEncoding(parser.getEncoding()); // parser.setEncoding("gb2312"); NodeList list = parser.extractAllNodesThatMatch(filter); for (int i = 0; i < list.size(); i++) { String s = list.elementAt(i).toHtml(); model.setContent(s); } } catch (Exception e) { e.printStackTrace(); } return model; }
/**
 * Builds the HTML fragment shown for a search-result listing.
 *
 * <p>The page at {@code SearchHelper.SEARCH_URL_BAIDU + param} is fetched and its
 * {@code <body>} markup is re-parsed twice:
 * <ol>
 *   <li>Every {@code <table>} is inspected:
 *     <ul>
 *       <li>tables whose markup contains the text {@code "div"} are skipped;</li>
 *       <li>a table containing "相关搜索" (related searches) is wrapped in
 *           {@code <div id="rs">};</li>
 *       <li>any other table becomes a {@code <div class="content">} holding the nodes
 *           returned by {@code extractHtml(table, type)}; link tags are wrapped in
 *           {@code <h3 class="t">}, except links whose plain text is "百度快照"
 *           (Baidu snapshot), which are dropped.</li>
 *     </ul>
 *   </li>
 *   <li>Every {@code <p>} whose markup contains {@code "page"} is appended inside a
 *       {@code <p id="page">} wrapper (pagination data).</li>
 * </ol>
 *
 * <p>Failures are handled on a best-effort basis: an exception is printed to standard
 * error and whatever markup was assembled before the failure is still returned.
 *
 * @param param query-string fragment appended verbatim to the search URL
 * @param type  passed through unchanged to {@code extractHtml}
 * @return a model whose content is the assembled HTML (possibly empty, never {@code null})
 */
public ContentModel listHtml(String param, String type) {
    ContentModel model = new ContentModel();
    // Purely local buffer, so the unsynchronized StringBuilder is sufficient.
    StringBuilder html = new StringBuilder();
    try {
        Parser parser = new Parser();
        parser.setURL(SearchHelper.SEARCH_URL_BAIDU + param);
        parser.setEncoding(parser.getEncoding());
        NodeList bodyNodes = parser.extractAllNodesThatMatch(new TagNameFilter("body"));
        String body = bodyNodes.toHtml();
        // Read the encoding only after the body was extracted, as the original code did,
        // and reuse that single value for both secondary parsers.
        String encoding = parser.getEncoding();

        // Result entries: one <table> per entry.
        Parser tableParser = new Parser();
        tableParser.setInputHTML(body);
        tableParser.setEncoding(encoding);
        NodeList tables = tableParser.extractAllNodesThatMatch(new TagNameFilter("table"));
        for (int i = 0; i < tables.size(); i++) {
            Node table = tables.elementAt(i);
            String tableHtml = table.toHtml();
            if (tableHtml.contains("div")) {
                continue;
            }
            if (tableHtml.contains("相关搜索")) {
                html.append("<div id=\"rs\">").append(tableHtml).append("</div>");
                continue;
            }
            html.append("<div class=\"content\">");
            for (Node n : extractHtml(table, type)) {
                if (n instanceof LinkTag) {
                    if ("百度快照".equals(n.toPlainTextString())) {
                        continue;
                    }
                    html.append("<h3 class=\"t\">").append(n.toHtml()).append("</h3>");
                } else {
                    html.append(n.toHtml());
                }
            }
            html.append("<br/></div><br>");
        }

        // Collect pagination data.
        Parser pageParser = new Parser();
        pageParser.setInputHTML(body);
        pageParser.setEncoding(encoding);
        NodeList paragraphs = pageParser.extractAllNodesThatMatch(new TagNameFilter("p"));
        for (int i = 0; i < paragraphs.size(); i++) {
            String paragraphHtml = paragraphs.elementAt(i).toHtml();
            if (!paragraphHtml.contains("page")) {
                continue;
            }
            // The wrapper is opened as <p>, so it must be closed as </p>; it was
            // previously closed with </div>, which produced unbalanced markup.
            html.append("<p id=\"page\">").append(paragraphHtml).append("</p>");
        }
    } catch (Exception e) {
        // Deliberately best-effort: report the problem and return what was built so far.
        e.printStackTrace();
    }
    model.setContent(html.toString());
    return model;
}