/**
 * Recursively drills down into a node to extract body text and links.
 *
 * @param nodeP the parent node to walk
 * @param type when "1", link hrefs are left untouched; otherwise they are rewritten through the
 *     configured search base URL
 * @return the collected link and text nodes, or null if there are none
 */
@SuppressWarnings("unchecked")
protected List<Node> extractHtml(Node nodeP, String type) throws Exception {
    NodeList nodeList = nodeP.getChildren();
    if ((nodeList == null) || (nodeList.size() == 0)) {
        return null;
    }
    List<Node> tableList = new ArrayList<Node>();
    try {
        for (NodeIterator e = nodeList.elements(); e.hasMoreNodes(); ) {
            Node node = e.nextNode();
            if (node instanceof LinkTag) {
                tableList.add(node);
            } else if (node instanceof ScriptTag
                    || node instanceof StyleTag
                    || node instanceof SelectTag) {
                // skip script, style and select contents
            } else if (node instanceof TextNode) {
                if (node.getText().length() > 0) {
                    tableList.add(node);
                }
            } else {
                List<Node> tempList = extractHtml(node, type);
                if ((tempList != null) && (tempList.size() > 0)) {
                    tableList.addAll(tempList);
                }
            }
        }
    } catch (Exception e) {
        return null;
    }
    if (tableList.size() > 0) {
        TableContext tc = new TableContext();
        tc.setLinkList(new ArrayList());
        tc.setTextBuffer(new StringBuffer());
        tableNumber++;
        tc.setTableRow(tableNumber);
        // Get the configured search base URL.
        String baseUrl = Config.getSingleConfig(ConfigItem.SEARCH_BASE_URL);
        for (Node node : tableList) {
            if (node instanceof LinkTag) {
                LinkTag linkTag = (LinkTag) node;
                if (!"1".equalsIgnoreCase(type)) {
                    linkTag.setAttribute(
                            "href", baseUrl + SearchHelper.encrypt(linkTag.getAttribute("href")));
                }
                tc.getLinkList().add(linkTag);
            } else {
                tc.getTextBuffer().append(node.getText());
            }
        }
        return tableList;
    }
    return null;
}
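The method above mixes the recursive walk with link rewriting and TableContext bookkeeping. A minimal, self-contained sketch of just the walk, assuming only the htmlparser API (TableContext, Config, and SearchHelper are project classes and are omitted):

import java.util.ArrayList;
import java.util.List;

import org.htmlparser.Node;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class TextAndLinkCollector {
    /** Recursively collects link tags and non-blank text nodes beneath a parent node. */
    public static List<Node> collect(Node parent) throws ParserException {
        List<Node> result = new ArrayList<Node>();
        NodeList children = parent.getChildren();
        if (children == null) {
            return result;
        }
        for (NodeIterator e = children.elements(); e.hasMoreNodes(); ) {
            Node node = e.nextNode();
            if (node instanceof LinkTag) {
                result.add(node);             // keep links whole, do not descend
            } else if (node instanceof TextNode) {
                if (node.getText().trim().length() > 0) {
                    result.add(node);         // keep non-blank text
                }
            } else {
                result.addAll(collect(node)); // recurse into composite tags
            }
        }
        return result;
    }
}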
/**
 * Parses the document content into HTML nodes and visits each top-level node.
 *
 * @throws ParserException if the content cannot be parsed
 */
private void parseHtml() throws ParserException {
    htmlTags = new ArrayList();
    Parser parser = new Parser();
    parser.setInputHTML(fDocument.get());
    for (NodeIterator e = parser.elements(); e.hasMoreNodes(); ) {
        Node node = e.nextNode();
        VHtmlNodeVisitor htmlNodeVisitor = new VHtmlNodeVisitor();
        node.accept(htmlNodeVisitor);
    }
}
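VHtmlNodeVisitor is project-specific. For reference, a minimal sketch of the same visitor-driven traversal using a hypothetical TagDumpVisitor:

import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.Text;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.NodeVisitor;

public class TagDumpVisitor extends NodeVisitor {
    @Override
    public void visitTag(Tag tag) {
        System.out.println("open: " + tag.getTagName());
    }

    @Override
    public void visitStringNode(Text text) {
        System.out.println("text: " + text.getText().trim());
    }

    public static void main(String[] args) throws ParserException {
        Parser parser = new Parser();
        parser.setInputHTML("<html><body><p>hello</p></body></html>");
        // visitAllNodesWith drives the same accept() calls as the loop above.
        parser.visitAllNodesWith(new TagDumpVisitor());
    }
}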
/** Test regular expression matching: */
public void testRegularExpression() throws Exception {
    String target =
        "\n"
            + "\n"
            + "Most recently, in the Western Conference final, the Flames knocked off \n"
            + "the San Jose Sharks, the Pacific Division champions, to become the first \n"
            + "Canadian team to reach the Stanley Cup Championship series since 1994.";
    String html =
        "<html><head><title>CBC Sports Online: NHL Playoffs</title></head>"
            + "<body><h1>CBC SPORTS ONLINE</h1>\n"
            + "The Calgary Flames have already defeated three NHL division winners \n"
            + "during their improbable playoff run. If they are to hoist the Stanley \n"
            + "Cup they'll have to go through one more. <p><table ALIGN=\"Right\" width=196 CELLPADDING=0 cellspacing=0 hspace=4> <tr><td><img src=\"/gfx/topstory/sports/iginla_j0524.jpg\" width=194 height=194 hspace=3 border=1><br>\n"
            + "\n"
            + "<font SIZE=\"1\" FACE=\"verdana,arial\">\n"
            + "Jarome Iginla skates during the Flames' practice on Monday. Calgary takes on the Tampa Bay Lightning in the Stanley Cup finals beginning Tuesday night in Tampa\n"
            + "</font></td></tr></table>\n"
            + "\n"
            + "\n"
            + "In the post-season's first round, the Flames defeated the Vancouver \n"
            + "Canucks, the Northwest Division winners, in seven tough games. <p>\n"
            + "\n"
            + "In Round 2 it was the Detroit Red Wings, who not only won the Central \n"
            + "Division, but also boasted the NHL's best overall record during the \n"
            + "regular season, who fell to the Flames. <p>"
            + target
            + "<p>\n"
            + "\n"
            + "Up next for the Flames is the Tampa Bay Lighting -- the runaway winners \n"
            + "of the NHL's Southeast Division and the Eastern Conference's best team \n"
            + "during the regular season. <p>\n"
            + "\n"
            + "The Lighting advanced by beating the Philadelphia Flyers in the Eastern \n"
            + "Conference final. <p>\n"
            + "</body></html>\n";
    Lexer lexer;
    Parser parser;
    RegexFilter filter;
    NodeIterator iterator;
    int count;

    lexer = new Lexer(html);
    parser = new Parser(lexer);
    filter =
        new RegexFilter(
            "(19|20)\\d\\d([- \\\\/.](0[1-9]|1[012])[- \\\\/.](0[1-9]|[12][0-9]|3[01]))?");
    count = 0;
    for (iterator = parser.extractAllNodesThatMatch(filter).elements();
            iterator.hasMoreNodes(); ) {
        assertEquals("text wrong", target, iterator.nextNode().toHtml());
        count++;
    }
    assertEquals("wrong count", 1, count);
}
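The test exercises RegexFilter through a long fixture; the same technique in isolation looks like this (a sketch; the single-argument RegexFilter constructor finds the pattern anywhere within a text node rather than requiring a full match):

import org.htmlparser.Parser;
import org.htmlparser.filters.RegexFilter;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class YearFinder {
    public static void main(String[] args) throws ParserException {
        Parser parser = new Parser();
        parser.setInputHTML("<p>Founded in 1994, relaunched in 2004.</p>");
        // Keep every text node containing a 19xx/20xx year.
        NodeList matches = parser.extractAllNodesThatMatch(new RegexFilter("(19|20)\\d\\d"));
        for (NodeIterator it = matches.elements(); it.hasMoreNodes(); ) {
            System.out.println(it.nextNode().toHtml());
        }
    }
}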
/**
 * Assign the underlying node filter for this wrapper.
 *
 * @param filter The filter to wrap.
 * @param context The parser to use for conditioning this filter. Some filters need contextual
 *     information to provide to the user, i.e. for tag names or attribute names or values, so
 *     the Parser context is provided.
 */
public void setNodeFilter(NodeFilter filter, Parser context) {
    Set set;

    mFilter = (TagNameFilter) filter;
    set = new HashSet();
    context.reset();
    try {
        for (NodeIterator iterator = context.elements(); iterator.hasMoreNodes(); )
            addName(set, iterator.nextNode());
    } catch (ParserException pe) {
        // oh well, we tried
    }
    for (Iterator iterator = set.iterator(); iterator.hasNext(); )
        mName.addItem(iterator.next());
    mName.setSelectedItem(mFilter.getName());
}
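addName and the combo-box plumbing (mName, mFilter) belong to the enclosing class. A standalone sketch of the name-gathering step, under the assumption that addName records tag names into the set:

import java.util.HashSet;
import java.util.Set;

import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;

public class TagNameCollector {
    /** Collects the distinct tag names appearing in a parsed document. */
    public static Set<String> collectNames(Parser parser) throws ParserException {
        Set<String> names = new HashSet<String>();
        parser.reset(); // rewind so elements() starts from the first node
        for (NodeIterator it = parser.elements(); it.hasMoreNodes(); ) {
            addName(names, it.nextNode());
        }
        return names;
    }

    private static void addName(Set<String> set, Node node) {
        if (node instanceof Tag && !((Tag) node).isEndTag()) {
            set.add(((Tag) node).getTagName());
        }
        if (node.getChildren() != null) { // descend into composite tags
            for (SimpleNodeIterator it = node.getChildren().elements(); it.hasMoreNodes(); ) {
                addName(set, it.nextNode());
            }
        }
    }
}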
public void testSelectors() throws Exception {
    String html =
        "<html><head><title>sample title</title></head><body inserterr=\"true\" yomama=\"false\">"
            + "<h3 id=\"heading\">big </invalid>heading</h3><ul id=\"things\">"
            + "<li><br word=\"broken\"/>>moocow"
            + "<li><applet/>doohickey"
            + "<li class=\"last\"><b class=\"item\">final<br>item</b></ul></body></html>";
    Lexer l;
    Parser p;
    CssSelectorNodeFilter it;
    NodeIterator i;
    int count;

    l = new Lexer(html);
    p = new Parser(l);
    it = new CssSelectorNodeFilter("li + li");
    count = 0;
    for (i = p.extractAllNodesThatMatch(it).elements(); i.hasMoreNodes(); ) {
        assertEquals("tag name wrong", "LI", ((Tag) i.nextNode()).getTagName());
        count++;
    }
    assertEquals("wrong count", 2, count);
}
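CssSelectorNodeFilter also handles plain class selectors; a smaller sketch against the same kind of markup:

import org.htmlparser.Parser;
import org.htmlparser.filters.CssSelectorNodeFilter;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class SelectorDemo {
    public static void main(String[] args) throws ParserException {
        Parser parser = new Parser();
        parser.setInputHTML("<ul><li class=\"last\"><b class=\"item\">final</b></li></ul>");
        // Select <b> elements carrying class "item".
        NodeList hits = parser.extractAllNodesThatMatch(new CssSelectorNodeFilter("b.item"));
        for (NodeIterator it = hits.elements(); it.hasMoreNodes(); ) {
            System.out.println(it.nextNode().toHtml());
        }
    }
}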
@Override
public void crawl(Parser parser) throws ParserException {
    List<LCOdds> data = new ArrayList<LCOdds>();
    NodeList nl = parser.parse(new CssSelectorNodeFilter(ROOT));
    for (NodeIterator it = nl.elements(); it.hasMoreNodes(); ) {
        NodeList cells = it.nextNode().getChildren();
        cells.keepAllNodesThatMatch(tdFilter);
        LCOdds lc = parseRow(cells);
        if (null != lc) {
            data.add(lc);
        }
    }
    // persist
    if (data.size() < 1) {
        log.warn(" -- [ 06_LC_2 ] data is empty !");
    }
    storeData("lc_odds", data);
}
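tdFilter and parseRow are defined elsewhere in the class. A hedged sketch of what they plausibly look like; the String[] return is a stand-in, since the real LCOdds bean is not shown here:

import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;

// Hypothetical sketch: tdFilter is most likely a plain tag-name filter, and
// parseRow reads cells by position. Field names and the return type here are
// invented for illustration only.
class RowParserSketch {
    static final TagNameFilter tdFilter = new TagNameFilter("td");

    static String[] parseRow(NodeList cells) {
        if (cells == null || cells.size() < 2) {
            return null; // header or malformed row
        }
        // Return raw cell text; a real parseRow would populate an LCOdds bean.
        return new String[] {
            cells.elementAt(0).toPlainTextString().trim(),
            cells.elementAt(1).toPlainTextString().trim()
        };
    }
}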
/**
 * Fetches the body content of the corresponding page.
 *
 * @param htmlPageContent the raw HTML of the page
 * @param preUrl the URL the page was fetched from
 * @throws ParserException
 *     <p>Added by Ethan Lam at 2011-11-23
 */
public void fetchHtmlContent(String htmlPageContent, String preUrl) throws ParserException {
    Parser parser = new Parser();
    parser.setInputHTML(htmlPageContent);
    NodeFilter filter =
        new AndFilter(
            new TagNameFilter("div"), new HasAttributeFilter("class", "blkContainerSblkCon"));
    NodeList nodeList = parser.parse(filter);
    NodeIterator it = nodeList.elements();
    Div div = null;
    StringBuffer htmlContent = new StringBuffer();
    while (it.hasMoreNodes()) {
        div = (Div) it.nextNode();
        NodeList nl = div.getChildren();
        if (nl == null) {
            continue;
        }
        NodeIterator sub = nl.elements();
        while (sub.hasMoreNodes()) {
            Node t = sub.nextNode();
            if (t instanceof ParagraphTag) {
                // LoggerUtil.info("fetchHtmlContent:", ((ParagraphTag) t).getStringText());
                htmlContent.append(((ParagraphTag) t).getStringText());
            }
        }
    }
    if ("".equals(htmlContent.toString().trim())) {
        return;
    }
    Page page = new Page();
    page.setUrl(preUrl);
    page.setSegment(htmlContent.toString());
    LoggerUtil.info(preUrl + " fetched page content:", htmlContent.toString());
    pageSer.save(page);
}
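The div-plus-class filtering can be condensed considerably when only the text is needed; a sketch using NodeList.asString() (Page, LoggerUtil, and pageSer are project classes and are omitted):

import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class ArticleBodyExtractor {
    /** Returns the plain text of every div with the given class attribute. */
    public static String extract(String html, String cssClass) throws ParserException {
        Parser parser = new Parser();
        parser.setInputHTML(html);
        NodeList divs = parser.parse(
            new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", cssClass)));
        return divs.asString(); // concatenated text content of the matched divs
    }
}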
/**
 * Processes the target hyperlink nodes (anchors with target="_blank").
 *
 * @param htmlPageContent the raw HTML of the page
 * @param preUrl the URL the page was fetched from
 * @throws Exception
 */
public void dealLinkNodes(String htmlPageContent, String preUrl) {
    try {
        Parser parser = new Parser();
        parser.setInputHTML(htmlPageContent);
        NodeFilter filter =
            new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("target", "_blank"));
        NodeList nodeList = parser.parse(filter);
        LoggerUtil.info(
            "ParserHandler",
            "new nodes found by the crawler: " + (nodeList != null ? nodeList.size() : 0));
        NodeIterator it = nodeList.elements();
        while (it.hasMoreNodes()) {
            Node node = it.nextNode();
            if (node instanceof LinkTag) {
                if (!filterHandler.isLinkTagFilter((LinkTag) node)) {
                    LoggerUtil.debug(
                        "ParserHandler ", ((LinkTag) node).getLink(), ((LinkTag) node).getLinkText());
                    CrawlQueue.getQueueManager()
                        .newNode(((LinkTag) node).getLinkText(), ((LinkTag) node).getLink(), preUrl);
                }
            }
        }
    } catch (Exception e) {
        // Parse failures are silently swallowed; nothing is crawled from this page.
    }
}
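Stripped of the queue and filter plumbing, the link-harvesting core is small; a self-contained sketch (CrawlQueue, filterHandler, and LoggerUtil are omitted):

import java.util.ArrayList;
import java.util.List;

import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class LinkExtractor {
    /** Collects the href of every anchor with target="_blank" in the page. */
    public static List<String> extractBlankTargetLinks(String html) throws ParserException {
        List<String> links = new ArrayList<String>();
        Parser parser = new Parser();
        parser.setInputHTML(html);
        NodeList nodes = parser.parse(
            new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("target", "_blank")));
        for (NodeIterator it = nodes.elements(); it.hasMoreNodes(); ) {
            Node node = it.nextNode();
            if (node instanceof LinkTag) {
                links.add(((LinkTag) node).getLink());
            }
        }
        return links;
    }
}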
public ConversationID findCorrespondingHTMLFormConversation(ConversationID samlId) {
    ConversationModel conversationModel = this.model.getConversationModel();
    HttpUrl samlHttpUrl = conversationModel.getRequestUrl(samlId);
    int samlConversationIndex = conversationModel.getIndexOfConversation(samlId);
    for (int conversationIndex = samlConversationIndex - 1;
            conversationIndex >= 0;
            conversationIndex--) {
        ConversationID id = conversationModel.getConversationAt(conversationIndex);
        Response response = conversationModel.getResponse(id);
        HttpUrl httpUrl = conversationModel.getRequestUrl(id);
        Object parsedContent = Parser.parse(httpUrl, response);
        if (null == parsedContent) {
            continue;
        }
        if (!(parsedContent instanceof org.htmlparser.util.NodeList)) {
            continue;
        }
        org.htmlparser.util.NodeList htmlNodeList = (org.htmlparser.util.NodeList) parsedContent;
        org.htmlparser.util.NodeList forms = htmlNodeList.searchFor(FormTag.class);
        try {
            for (NodeIterator ni = forms.elements(); ni.hasMoreNodes(); ) {
                FormTag form = (FormTag) ni.nextNode();
                String formAction = form.getAttribute("action");
                HttpUrl formActionHttpUrl = new HttpUrl(formAction);
                if (samlHttpUrl.equals(formActionHttpUrl)) {
                    return id;
                }
            }
        } catch (ParserException ex) {
            this._logger.log(Level.WARNING, "Looking for forms, got ''{0}''", ex);
        } catch (MalformedURLException ex) {
            this._logger.log(Level.WARNING, "Malformed action url: {0}", ex.getMessage());
        }
    }
    return null;
}
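The heart of the loop is "collect every form's action and compare it to the SAML URL". A sketch of that step alone, with the WebScarab conversation model omitted:

import java.util.ArrayList;
import java.util.List;

import org.htmlparser.tags.FormTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

class FormActionSketch {
    /** Returns the action attribute of every form tag in an already-parsed node list. */
    static List<String> formActions(NodeList htmlNodeList) throws ParserException {
        List<String> actions = new ArrayList<String>();
        NodeList forms = htmlNodeList.searchFor(FormTag.class);
        for (NodeIterator ni = forms.elements(); ni.hasMoreNodes(); ) {
            FormTag form = (FormTag) ni.nextNode();
            actions.add(form.getAttribute("action"));
        }
        return actions;
    }
}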