/** * 递归钻取正文信息 * * @param nodeP * @return */ @SuppressWarnings("unchecked") protected List<Node> extractHtml(Node nodeP, String type) throws Exception { NodeList nodeList = nodeP.getChildren(); if ((nodeList == null) || (nodeList.size() == 0)) { return null; } ArrayList tableList = new ArrayList(); try { for (NodeIterator e = nodeList.elements(); e.hasMoreNodes(); ) { Node node = (Node) e.nextNode(); if (node instanceof LinkTag) { tableList.add(node); } else if (node instanceof ScriptTag || node instanceof StyleTag || node instanceof SelectTag) { } else if (node instanceof TextNode) { if (node.getText().length() > 0) { tableList.add(node); } } else { List tempList = extractHtml(node, type); if ((tempList != null) && (tempList.size() > 0)) { Iterator ti = tempList.iterator(); while (ti.hasNext()) { tableList.add(ti.next()); } } } } } catch (Exception e) { return null; } if ((tableList != null) && (tableList.size() > 0)) { TableContext tc = new TableContext(); tc.setLinkList(new ArrayList()); tc.setTextBuffer(new StringBuffer()); tableNumber++; tc.setTableRow(tableNumber); Iterator ti = tableList.iterator(); // 得到设置的搜索URL String baseUrl = Config.getSingleConfig(ConfigItem.SEARCH_BASE_URL); while (ti.hasNext()) { Node node = (Node) ti.next(); if (node instanceof LinkTag) { LinkTag linkTag = (LinkTag) node; if (!"1".equalsIgnoreCase(type)) { linkTag.setAttribute( "href", baseUrl + SearchHelper.encrypt(linkTag.getAttribute("href"))); } tc.getLinkList().add(linkTag); } else { tc.getTextBuffer().append(node.getText()); } } return tableList; } return null; }
public ContentModel view(String url) { ContentModel model = new ContentModel(); try { NodeFilter filter = new TagNameFilter("html"); Parser parser = new Parser(); parser.setURL(SearchHelper.decrypt(url)); parser.setEncoding(parser.getEncoding()); // parser.setEncoding("gb2312"); NodeList list = parser.extractAllNodesThatMatch(filter); for (int i = 0; i < list.size(); i++) { String s = list.elementAt(i).toHtml(); model.setContent(s); } } catch (Exception e) { e.printStackTrace(); } return model; }