/**
 * Fetch the content of the target page.
 *
 * @param htmlPageContent the raw HTML of the page
 * @param preUrl          the URL the page was fetched from
 * @throws ParserException if the HTML cannot be parsed
 * <p>Added by Ethan Lam at 2011-11-23
 */
public void fetchHtmlContent(String htmlPageContent, String preUrl) throws ParserException {
    Parser parser = new Parser();
    parser.setInputHTML(htmlPageContent);
    // Match <div class="blkContainerSblkCon">, the container that holds the article body.
    NodeFilter filter = new AndFilter(
            new TagNameFilter("div"),
            new HasAttributeFilter("class", "blkContainerSblkCon"));
    NodeList nodeList = parser.parse(filter);
    NodeIterator it = nodeList.elements();
    StringBuffer htmlContent = new StringBuffer();
    while (it.hasMoreNodes()) {
        Div div = (Div) it.nextNode();
        NodeList nl = div.getChildren();
        if (nl == null) continue;
        NodeIterator sub = nl.elements();
        while (sub.hasMoreNodes()) {
            Node t = sub.nextNode();
            // Only paragraph tags carry the article text we want to keep.
            if (t instanceof ParagraphTag) {
                htmlContent.append(((ParagraphTag) t).getStringText());
            }
        }
    }
    // Skip pages with no extractable content.
    if ("".equals(htmlContent.toString().trim())) return;
    Page page = new Page();
    page.setUrl(preUrl);
    page.setSegment(htmlContent.toString());
    LoggerUtil.info(preUrl + " fetched page content:", htmlContent.toString());
    pageSer.save(page);
}
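
/**
 * A minimal, illustrative sketch (not part of the original crawler) showing what
 * the div/class filter chain above matches; the inline HTML sample is an assumption,
 * and the persistence step is replaced by println. It relies only on the htmlparser
 * types already imported by this class.
 */
public static void demoFetchFilter() throws ParserException {
    String html = "<div class=\"blkContainerSblkCon\"><p>first paragraph</p>"
            + "<p>second paragraph</p></div>"
            + "<div class=\"other\"><p>ignored</p></div>";
    Parser parser = new Parser();
    parser.setInputHTML(html);
    // Same filter chain as fetchHtmlContent: a <div> whose class is blkContainerSblkCon.
    NodeFilter filter = new AndFilter(
            new TagNameFilter("div"),
            new HasAttributeFilter("class", "blkContainerSblkCon"));
    NodeList divs = parser.parse(filter);
    for (NodeIterator it = divs.elements(); it.hasMoreNodes(); ) {
        Div div = (Div) it.nextNode();
        NodeList children = div.getChildren();
        if (children == null) continue;
        for (NodeIterator sub = children.elements(); sub.hasMoreNodes(); ) {
            Node t = sub.nextNode();
            if (t instanceof ParagraphTag) {
                // Prints "first paragraph" and "second paragraph"; the other div is skipped.
                System.out.println(((ParagraphTag) t).getStringText());
            }
        }
    }
}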
/**
 * Process the target hyperlink nodes.
 *
 * @param htmlPageContent the raw HTML of the page
 * @param preUrl          the URL the page was fetched from
 */
public void dealLinkNodes(String htmlPageContent, String preUrl) {
    try {
        Parser parser = new Parser();
        parser.setInputHTML(htmlPageContent);
        // Match <a target="_blank"> tags, which point to the pages to crawl next.
        NodeFilter filter = new AndFilter(
                new TagNameFilter("a"),
                new HasAttributeFilter("target", "_blank"));
        NodeList nodeList = parser.parse(filter);
        LoggerUtil.info("ParserHandler",
                "number of new link nodes found by the crawler: " + (nodeList != null ? nodeList.size() : 0));
        NodeIterator it = nodeList.elements();
        while (it.hasMoreNodes()) {
            Node node = it.nextNode();
            if (node instanceof LinkTag) {
                LinkTag link = (LinkTag) node;
                // Skip links rejected by the filter handler; enqueue the rest for crawling.
                if (!filterHandler.isLinkTagFilter(link)) {
                    LoggerUtil.debug("ParserHandler ", link.getLink(), link.getLinkText());
                    CrawlQueue.getQueueManager()
                            .newNode(link.getLinkText(), link.getLink(), preUrl);
                }
            }
        }
    } catch (Exception e) {
        // Do not swallow failures silently; at least record the parse error.
        LoggerUtil.info("ParserHandler", "dealLinkNodes failed for " + preUrl + ": " + e.getMessage());
    }
}
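
/**
 * A minimal, illustrative sketch (not part of the original crawler) showing which
 * anchors the target="_blank" filter above selects; the inline HTML sample is an
 * assumption, and the queue/filter collaborators are replaced by println.
 */
public static void demoLinkFilter() throws ParserException {
    String html = "<a href=\"http://example.com/a\" target=\"_blank\">next page</a>"
            + "<a href=\"http://example.com/b\">same window, ignored</a>";
    Parser parser = new Parser();
    parser.setInputHTML(html);
    // Same filter chain as dealLinkNodes: anchors that open in a new window.
    NodeFilter filter = new AndFilter(
            new TagNameFilter("a"),
            new HasAttributeFilter("target", "_blank"));
    NodeList links = parser.parse(filter);
    for (NodeIterator it = links.elements(); it.hasMoreNodes(); ) {
        Node node = it.nextNode();
        if (node instanceof LinkTag) {
            LinkTag link = (LinkTag) node;
            // Prints "http://example.com/a -> next page"; the second anchor is filtered out.
            System.out.println(link.getLink() + " -> " + link.getLinkText());
        }
    }
}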