@Test @Ignore public void test01() throws Exception { System.out.println("-------------------------------"); WebClient webClient = new WebClient(BrowserVersion.CHROME); try { WebClientOptions options = webClient.getOptions(); options.setThrowExceptionOnFailingStatusCode(false); options.setThrowExceptionOnScriptError(false); options.setCssEnabled(false); options.setJavaScriptEnabled(true); options.setTimeout(50000); // webClient.setAjaxController(new NicelyResynchronizingAjaxController()); HtmlPage pageOrgin = webClient.getPage("http://lvyou.baidu.com/jinhua/jingdian"); Thread.sleep(5000); DomNodeList<DomNode> pageNodes = pageOrgin.querySelectorAll(".pagination"); HtmlDivision pageDiv = (HtmlDivision) pageNodes.get(0); DomNodeList<HtmlElement> liElements = pageDiv.getElementsByTagName("li"); Integer pageSize = liElements.size() - 1; for (int pageNow = 0; pageNow < pageSize; pageNow++) { DomNodeList<HtmlElement> pageAnchors = liElements.get(pageNow).getElementsByTagName("a"); HtmlAnchor pageAnchor = (HtmlAnchor) pageAnchors.get(0); HtmlPage page = pageAnchor.click(); Thread.sleep(10000); DomElement jViewDom = page.getElementById("J-view-list-container"); DomNodeList<HtmlElement> lis = jViewDom.getElementsByTagName("li"); for (HtmlElement li : lis) { DomNodeList<DomNode> titleNodes = li.querySelectorAll(".title"); HtmlAnchor titleAnchor = (HtmlAnchor) titleNodes.get(0); System.out.println("---------------标题----------------"); log.debug("{}", titleAnchor.asText()); DomNodeList<DomNode> picNodes = li.querySelectorAll(".pic"); HtmlAnchor picAnchor = (HtmlAnchor) picNodes.get(0); System.out.println("---------------详情URL----------------"); String detailUrl = "http://lvyou.baidu.com" + picAnchor.getAttribute("href"); log.debug("{}", detailUrl); DomNodeList<HtmlElement> imgEelements = picAnchor.getElementsByTagName("img"); for (HtmlElement imgEelement : imgEelements) { System.out.println("---------------图片----------------"); log.debug("{}", imgEelement.getAttribute("src")); } DomNodeList<DomNode> sumNodes = li.querySelectorAll(".view-userSays"); HtmlDivision sumDiv = (HtmlDivision) sumNodes.get(0); DomNodeList<HtmlElement> sumElements = sumDiv.getElementsByTagName("p"); HtmlParagraph sumPara = (HtmlParagraph) sumElements.get(0); System.out.println("---------------摘要----------------"); log.debug("{}", sumPara.asText()); } } } finally { webClient.close(); } System.out.println("-------------------------------"); }
@Test @Ignore public void test() throws Exception { System.out.println("-------------------------------"); WebClient webClient = new WebClient(BrowserVersion.CHROME); webClient.getOptions().setCssEnabled(false); webClient.getOptions().setJavaScriptEnabled(false); HtmlPage page = webClient.getPage("http://news.163.com/domestic/"); // DomNodeList<HtmlElement> elements = page.getElementBy System.out.println("---------------标题----------------"); DomNodeList<DomNode> domNodes = page.querySelectorAll(".item-top"); // log.debug("{}", domNodes); for (DomNode domNode : domNodes) { HtmlDivision htmlDivision = (HtmlDivision) domNode; DomNodeList<HtmlElement> aElements = htmlDivision.getElementsByTagName("a"); HtmlAnchor htmlAnchor = (HtmlAnchor) aElements.get(0); // HTMLHeadingElement htmlHeading2 = (HTMLHeadingElement) // htmlDivision.getElementsByTagName("h2"); // HtmlAnchor htmlAnchor = (HtmlAnchor) htmlDivision.getElementsByTagName("a"); log.debug("{}", htmlAnchor.asText()); log.debug("{}", htmlAnchor.getAttribute("href")); DomNodeList<HtmlElement> pElements = htmlDivision.getElementsByTagName("p"); HtmlParagraph htmlParagraph = (HtmlParagraph) pElements.get(0); log.debug("{}", htmlParagraph.asText()); DomNodeList<HtmlElement> iEelements = htmlDivision.getElementsByTagName("img"); for (HtmlElement iEelement : iEelements) { log.debug("{}", iEelement.getAttribute("src")); } String detailUrl = htmlAnchor.getAttribute("href"); if (detailUrl.equals("http://news.163.com/15/1215/17/BAT2L8RB00014JB6.html#f=dlist")) { HtmlPage detailPage = webClient.getPage(detailUrl); System.out.println("---------------正文----------------"); DomElement endTextElement = detailPage.getElementById("endText"); log.debug("{}", endTextElement.asText()); System.out.println("---------------图片----------------"); DomNodeList<DomNode> imgNodes = endTextElement.querySelectorAll(".f_center"); for (DomNode imgNode : imgNodes) { HtmlParagraph imgpara = (HtmlParagraph) imgNode; DomNodeList<HtmlElement> endImgs = imgpara.getElementsByTagName("img"); for (HtmlElement endImg : endImgs) { log.debug("{}", endImg.getAttribute("src")); } } } } webClient.close(); System.out.println("-------------------------------"); }