public static void main(String[] args) { Searcher searcher = new JSoupBaiduSearcher(); SearchResult searchResult = searcher.search("杨尚川", 1); List<Webpage> webpages = searchResult.getWebpages(); if (webpages != null) { int i = 1; LOG.info( "搜索结果 当前第 " + searchResult.getPage() + " 页,页面大小为:" + searchResult.getPageSize() + " 共有结果数:" + searchResult.getTotal()); for (Webpage webpage : webpages) { LOG.info("搜索结果 " + (i++) + " :"); LOG.info("标题:" + webpage.getTitle()); LOG.info("URL:" + webpage.getUrl()); LOG.info("摘要:" + webpage.getSummary()); LOG.info("正文:" + webpage.getContent()); LOG.info(""); } } else { LOG.error("没有搜索到结果"); } }
@Override public SearchResult search(String keyword, int page) { int pageSize = 10; // 百度搜索结果每页大小为10,pn参数代表的不是页数,而是返回结果的开始数 // 如获取第一页则pn=0,第二页则pn=10,第三页则pn=20,以此类推,抽象出模式:(page-1)*pageSize String url = "http://www.baidu.com/s?pn=" + (page - 1) * pageSize + "&wd=" + keyword; SearchResult searchResult = new SearchResult(); searchResult.setPage(page); List<Webpage> webpages = new ArrayList<>(); try { Document document = Jsoup.connect(url).get(); // 获取搜索结果数目 int total = getBaiduSearchResultCount(document); searchResult.setTotal(total); int len = 10; if (total < 1) { return null; } // 如果搜索到的结果不足一页 if (total < 10) { len = total; } for (int i = 0; i < len; i++) { String titleCssQuery = "html body div div div div#content_left div#" + (i + 1 + (page - 1) * pageSize) + ".result.c-container h3.t a"; String summaryCssQuery = "html body div div div div#content_left div#" + (i + 1 + (page - 1) * pageSize) + ".result.c-container div.c-abstract"; LOG.debug("titleCssQuery:" + titleCssQuery); LOG.debug("summaryCssQuery:" + summaryCssQuery); Element titleElement = document.select(titleCssQuery).first(); String href = ""; String titleText = ""; if (titleElement != null) { titleText = titleElement.text(); href = titleElement.attr("href"); } else { // 处理百度百科 titleCssQuery = "html body div#out div#in div#wrapper div#container div#content_left div#1.result-op h3.t a"; summaryCssQuery = "html body div#out div#in div#wrapper div#container div#content_left div#1.result-op div p"; LOG.debug("处理百度百科 titleCssQuery:" + titleCssQuery); LOG.debug("处理百度百科 summaryCssQuery:" + summaryCssQuery); titleElement = document.select(titleCssQuery).first(); if (titleElement != null) { titleText = titleElement.text(); href = titleElement.attr("href"); } } LOG.debug(titleText); Element summaryElement = document.select(summaryCssQuery).first(); // 处理百度知道 if (summaryElement == null) { summaryCssQuery = summaryCssQuery.replace("div.c-abstract", "font"); LOG.debug("处理百度知道 summaryCssQuery:" + summaryCssQuery); summaryElement = document.select(summaryCssQuery).first(); } String summaryText = ""; if (summaryElement != null) { summaryText = summaryElement.text(); } LOG.debug(summaryText); if (titleText != null && !"".equals(titleText.trim()) && summaryText != null && !"".equals(summaryText.trim())) { Webpage webpage = new Webpage(); webpage.setTitle(titleText); webpage.setUrl(href); webpage.setSummary(summaryText); if (href != null) { String content = Tools.getHTMLContent(href); webpage.setContent(content); } else { LOG.info("页面正确提取失败"); } webpages.add(webpage); } else { LOG.error("获取搜索结果列表项出错:" + titleText + " - " + summaryText); } } } catch (IOException ex) { LOG.error("搜索出错", ex); } searchResult.setWebpages(webpages); ; return searchResult; }