/**
 * Builds the crawl configuration for this processor: cookie/host domain, the single start URL and
 * the browser user agent sent with each request.
 *
 * @return a freshly created {@code Site} on every call
 */
@Override
public Site getSite() {
  final String domain = "www.diandian.com";
  final String startUrl = "http://17dujingdian.com/";
  final String userAgent =
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31";
  // NOTE(review): the domain and the start URL point at different hosts - confirm this is intended.
  return Site.me().setDomain(domain).addStartUrl(startUrl).setUserAgent(userAgent);
}
/** * @author [email protected] <br> * @since 0.5.1 */ public class ZhihuPageProcessor implements PageProcessor { private Site site = Site.me() .setCycleRetryTimes(5) .setRetryTimes(5) .setSleepTime(500) .setTimeOut(3 * 60 * 1000) .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0") .addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") .addHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3") .setCharset("UTF-8"); private static final int voteNum = 1000; @Override public void process(Page page) { List<String> relativeUrl = page.getHtml().xpath("//li[@class='item clearfix']/div/a/@href").all(); page.addTargetRequests(relativeUrl); relativeUrl = page.getHtml() .xpath("//div[@id='zh-question-related-questions']//a[@class='question_link']/@href") .all(); page.addTargetRequests(relativeUrl); List<String> answers = page.getHtml().xpath("//div[@id='zh-question-answer-wrap']/div").all(); boolean exist = false; for (String answer : answers) { String vote = new Html(answer) .xpath("//div[@class='zm-votebar']//span[@class='count']/text()") .toString(); if (Integer.valueOf(vote) >= voteNum) { page.putField("vote", vote); page.putField("content", new Html(answer).xpath("//div[@class='zm-editable-content']")); page.putField("userid", new Html(answer).xpath("//a[@class='author-link']/@href")); exist = true; } } if (!exist) { page.setSkip(true); } } @Override public Site getSite() { return site; } public static void main(String[] args) { Spider.create(new ZhihuPageProcessor()) .addUrl("http://www.zhihu.com/search?type=question&q=java") .addPipeline(new FilePipeline("D:\\webmagic\\")) .thread(5) .run(); } }
/**
 * Downloads a page with a pooled WebDriver and returns its rendered HTML.
 *
 * <p>The driver navigates to the request URL, waits {@code sleepTime} milliseconds, applies the
 * site's cookies and then reads the {@code outerHTML} of the root element. The driver is always
 * handed back to the pool, even when navigation or extraction fails.
 *
 * @param request the request to download
 * @param task the task supplying the site configuration (cookies)
 * @return the downloaded page, or {@code null} when the thread was interrupted while waiting for
 *     a driver
 */
@Override
public Page download(Request request, Task task) {
  checkInit();
  WebDriver webDriver;
  try {
    webDriver = webDriverPool.get();
  } catch (InterruptedException e) {
    logger.warn("interrupted", e);
    // Preserve the interrupt so the calling worker can observe it.
    Thread.currentThread().interrupt();
    return null;
  }
  try {
    logger.info("downloading page " + request.getUrl());
    webDriver.get(request.getUrl());
    try {
      Thread.sleep(sleepTime);
    } catch (InterruptedException e) {
      logger.warn("interrupted", e);
      Thread.currentThread().interrupt();
    }
    WebDriver.Options manage = webDriver.manage();
    Site site = task.getSite();
    if (site.getCookies() != null) {
      for (Map.Entry<String, String> cookieEntry : site.getCookies().entrySet()) {
        Cookie cookie = new Cookie(cookieEntry.getKey(), cookieEntry.getValue());
        manage.addCookie(cookie);
      }
    }

    /*
     * TODO You can add mouse event or other processes
     *
     * @author: [email protected]
     */

    WebElement webElement = webDriver.findElement(By.xpath("/html"));
    String content = webElement.getAttribute("outerHTML");
    Page page = new Page();
    page.setRawText(content);
    page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl())));
    page.setUrl(new PlainText(request.getUrl()));
    page.setRequest(request);
    return page;
  } finally {
    // Without this a single failing page would permanently remove a driver from the pool.
    webDriverPool.returnToPool(webDriver);
  }
}
/** @author chenruoyu */ public class BlogPageProcessor implements PageProcessor { // 1926267847 public static final String URL_LIST = "http://blog\\.sina\\.com\\.cn/s/articlelist_1197161814_0_\\d+\\.html"; public static final String URL_POST = "http://blog\\.sina\\.com\\.cn/s/blog_\\w+\\.html"; private Site site = Site.me().setDomain("blog.sina.com.cn").setSleepTime(3000); @Override public void process(Page page) { // 列表页 if (page.getUrl().regex(URL_LIST).match()) { page.addTargetRequests( page.getHtml().xpath("//div[@class=\"articleList\"]").links().regex(URL_POST).all()); page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all()); // 文章页 } else { page.putField( "title", page.getHtml().xpath("//div[@class='articalTitle']/h2/tidyText()").toString().trim()); page.putField( "date", page.getHtml() .xpath("//div[@id='articlebody']//span[@class='time SG_txtc']") .regex("\\((.*)\\)")); // 博文正文全文 String passage = page.getHtml().xpath("//div[@id='sina_keyword_ad_area2']/allText()").toString(); page.putField("passage", passage); // 博文标签 //*[@id='sina_keyword_ad_area']/table/tbody/tr/td[1]/h3/a String tag = page.getHtml() .xpath("//div[@id='sina_keyword_ad_area']/table/tbody/tr/td[1]/h3/a/text()") .toString() .trim(); page.putField("tag", tag); System.out.println("\n"); } } @Override public Site getSite() { return site; } // 1926267847 public static void main(String[] args) { Spider.create(new BlogPageProcessor()) .addUrl("http://blog.sina.com.cn/s/articlelist_1197161814_0_1.html") .run(); } }
/**
 * Returns the HTTP client for the given site, creating and caching one per domain on first use.
 *
 * <p>A {@code null} site bypasses the cache and asks the generator for a client directly. For a
 * non-null site the cache is consulted first without locking; on a miss it is consulted again
 * while holding this object's monitor before a new client is generated and stored.
 *
 * @param site the site configuration, may be {@code null}
 * @return the client to use for the site
 */
private CloseableHttpClient getHttpClient(Site site) {
  if (site == null) {
    return httpClientGenerator.getClient(null);
  }
  final String key = site.getDomain();
  CloseableHttpClient cached = httpClients.get(key);
  if (cached != null) {
    return cached;
  }
  synchronized (this) {
    // Another thread may have stored a client between the first lookup and taking the lock.
    cached = httpClients.get(key);
    if (cached == null) {
      cached = httpClientGenerator.getClient(site);
      httpClients.put(key, cached);
    }
    return cached;
  }
}
/** * @author [email protected] <br> * @since 0.4.0 */ public class BaiduBaikePageProcessor implements IPageProcessor { private Site site = Site.create() // .setHttpProxy(new HttpHost("127.0.0.1",8888)) .setRetryTimes(3) .setSleepTime(1000) .setUseGzip(true); @Override public void process(Page page) { page.putField("name", page.getHtml().css("h1.title div.lemmaTitleH1", "text").toString()); page.putField( "description", page.getHtml().xpath("//div[@id='lemmaContent-0']//div[@class='para']/allText()")); } @Override public Site getSite() { return site; } public static void main(String[] args) { // single download Spider spider = Spider.create(new BaiduBaikePageProcessor()).thread(2); String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8"; ResultItems resultItems = spider.<ResultItems>get(String.format(urlTemplate, "水力发电")); System.out.println(resultItems); // multidownload List<String> list = new ArrayList<String>(); list.add(String.format(urlTemplate, "风力发电")); list.add(String.format(urlTemplate, "太阳能")); list.add(String.format(urlTemplate, "地热发电")); list.add(String.format(urlTemplate, "地热发电")); List<ResultItems> resultItemses = spider.<ResultItems>getAll(list); for (ResultItems resultItemse : resultItemses) { System.out.println(resultItemse.getAll()); } spider.close(); } }
/** * 华军软件园[中国] app搜索抓取 * url:http://search.newhua.com/search_list.php?searchname=MT&searchsid=6&app=search&controller=index&action=search&type=news * ID:23 需要两次请求 * * @version 1.0.0 */ public class PageProOnlineDown implements PageProcessor { // 日志管理对象 private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(PageProOnlineDown.class); // 定义网站编码,以及间隔时间 Site site = Site.me() .setCharset("utf-8") .setRetryTimes(PropertiesUtil.getRetryTimes()) .setSleepTime(PropertiesUtil.getInterval()); /** 返回结果结果集 */ private Set<Apk> resSet = Sets.newHashSet(); /** * process the page, extract urls to fetch, extract the data and store * * @param page */ @Override public Apk process(Page page) { LOGGER.debug("crawler url: {}", page.getUrl()); // 获取搜索页面 if (page.getUrl().regex("http://www\\.onlinedown\\.net/.*").match()) { LOGGER.debug("match success, url:{}", page.getUrl()); List<String> urlList = page.getHtml().links().regex("http://www\\.onlinedown\\.net/.*").all(); Set<String> sets = Sets.newHashSet(urlList); for (String url : sets) { if (PageProUrlFilter.isUrlReasonable(url)) { if (url.contains("http://www.onlinedown.net/android/soft")) { // url="http://www.onlinedown.net/"+url; url = url.replaceAll("//android", ""); System.out.println("aaa"); } page.addTargetRequest(url); } } // 打印搜索结果url LOGGER.debug("app info results urls: {}", page.getTargetRequests()); } // 获取信息 if (page.getUrl().regex("http://www\\.onlinedown\\.net/soft/.*").match()) { PageProOnlineDown_Detail.getApkDetail(page, resSet); } // 更新下载地址 if (page.getUrl().regex("http://www\\.onlinedown\\.net/softdown/.*").match()) { Html html = page.getHtml(); Apk apk = PageProOnlineDown_Detail.getApkDetail(page, resSet); page.putField("apk", apk); if (page.getResultItems().get("apk") == null) { page.setSkip(true); } } else { page.setSkip(true); } return null; } /** * get the site settings * * @return site * @see Site */ @Override public Site getSite() { return site; } @Override public List<Apk> processMulti(Page 
page) { // TODO Auto-generated method stub return null; } }
/* * 原则: * 当前page下要求解析出来的字段都要保证在当前page生命周期内解析完成。 * 哪些规则是当前page下的,由page.getRequest().getFiledRuleId()决定 */ @Override public void process(Page page) throws PageProcessException { Request originalReq = page.getRequest(); // 源rquest,非null Integer fieldRuleId = originalReq.getFieldRuleId(); if (fieldRuleId != null) { if (fieldRuleId == 20) System.out.println("test"); } Request nextRequest = originalReq.getNextRequest(); // 抽出下一步request,允许null Request templast = nextRequest; // 创建请求链时需要的临时节点 // 找到当前page下要求解析的字段 final List<SpiderFieldRule> dependenceFieldRules = new ArrayList<SpiderFieldRule>(); for (SpiderFieldRule fieldRule : fieldRules) { if (fieldRule.getParentId() == (fieldRuleId == null ? 0 : fieldRuleId)) { dependenceFieldRules.add(fieldRule); } } // 开始解析当前page下要求解析的字段 for (SpiderFieldRule fieldRule : dependenceFieldRules) { List<String> results; StringBuilder sb; switch (fieldRule.getType()) { case 0: results = page.getHtml().regex(fieldRule.getRule()).all(); if (results.size() == 0) { if (fieldRule.getAllowEmpty() == 1) { throw new PageProcessException( String.format( "{fieldname:@#%s@#,fieldrule:@#%s@#,parentId:@#%d@#,type:@#regex@#}", fieldRule.getFieldName(), fieldRule.getRule(), fieldRule.getParentId())); } } // 判断是否会产生新的下载请求,如果产生新的下载请求,则当前规则只解析顶级层,如果有子规则,要=到新的下载完成之后才能解析 if (fieldRule.getAdditionDownload() == 1) { // 将fieldRule.getId()保存下来,新请求的页面将筛选parentid为该id的规则进行解析 for (String result : results) { if (!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); } Request additionReq = new Request(result); additionReq.setFieldRuleId(fieldRule.getId()); if (templast == null) { nextRequest = additionReq; templast = nextRequest; } else { templast.setNextRequest(additionReq); templast = additionReq; } if (fieldRule.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( fieldRule.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else if 
(fieldRule.getAdditionRequest() == 1) { for (String result : results) { if (!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); } Request additionReq = new Request(result); additionReq.setFieldRuleId(fieldRule.getId()); transmitResultItem(page, additionReq); page.addTargetRequest(additionReq); if (fieldRule.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( fieldRule.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else { if (fieldRule.getNeedPersistence() == 0) { sb = new StringBuilder(); for (String result : results) { sb.append(result + ","); } page.putField( fieldRule.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } ruleComplierLoop(fieldRule.getId(), page, nextRequest, templast); } break; case 1: results = page.getHtml().xpath(fieldRule.getRule()).all(); if (results.size() == 0) { if (fieldRule.getAllowEmpty() == 1) { throw new PageProcessException( String.format( "{fieldname:@#%s@#,fieldrule:@#%s@#,parentId:@#%d@#,type:@#xpath@#}", fieldRule.getFieldName(), fieldRule.getRule(), fieldRule.getParentId())); } } // 判断是否会产生新的下载请求 if (fieldRule.getAdditionDownload() == 1) { // 将fieldRule.getId()保存下来,新请求的页面将筛选parentid为该id的规则进行解析 for (String result : results) { if (!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); } Request additionReq = new Request(result); additionReq.setFieldRuleId(fieldRule.getId()); if (templast == null) { nextRequest = additionReq; templast = nextRequest; } else { templast.setNextRequest(additionReq); templast = additionReq; } if (fieldRule.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( fieldRule.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else if (fieldRule.getAdditionRequest() == 1) { for (String result : results) { if 
(!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); } Request additionReq = new Request(result); additionReq.setFieldRuleId(fieldRule.getId()); transmitResultItem(page, additionReq); page.addTargetRequest(additionReq); if (fieldRule.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( fieldRule.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else { if (fieldRule.getNeedPersistence() == 0) { sb = new StringBuilder(); for (String result : results) { sb.append(result + ","); } page.putField( fieldRule.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } ruleComplierLoop(fieldRule.getId(), page, nextRequest, templast); } break; case 2: results = page.getHtml().css(fieldRule.getRule()).all(); if (results.size() == 0) { if (fieldRule.getAllowEmpty() == 1) { throw new PageProcessException( String.format( "{fieldname:@#%s@#,fieldrule:@#%s@#,parentId:@#%d@#,type:@#css@#}", fieldRule.getFieldName(), fieldRule.getRule(), fieldRule.getParentId())); } } // 判断是否会产生新的下载请求 if (fieldRule.getAdditionDownload() == 1) { // 将fieldRule.getId()保存下来,新请求的页面将筛选parentid为该id的规则进行解析 for (String result : results) { if (!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); } Request additionReq = new Request(result); additionReq.setFieldRuleId(fieldRule.getId()); if (templast == null) { nextRequest = additionReq; templast = nextRequest; } else { templast.setNextRequest(additionReq); templast = additionReq; } if (fieldRule.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( fieldRule.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else if (fieldRule.getAdditionRequest() == 1) { for (String result : results) { if (!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); 
} Request additionReq = new Request(result); additionReq.setFieldRuleId(fieldRule.getId()); transmitResultItem(page, additionReq); page.addTargetRequest(additionReq); if (fieldRule.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( fieldRule.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else { if (fieldRule.getNeedPersistence() == 0) { sb = new StringBuilder(); for (String result : results) { sb.append(result + ","); } page.putField( fieldRule.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } ruleComplierLoop(fieldRule.getId(), page, nextRequest, templast); } break; default: if (page.getRequest().getExtra(fieldRule.getFieldName()) == null) { throw new PageProcessException( String.format( "{fieldname:@#%s@#,fieldrule:@#%s@#,type:@#orig@#}", fieldRule.getFieldName(), fieldRule.getRule())); } page.putField( fieldRule.getFieldName(), page.getRequest().getExtra(fieldRule.getFieldName())); break; } } // 最后判断一下当前page有没有产生新的下载请求或任务请求,如果有,将page.getResultItems()中的解析结果通过request传递到下一个page中去 if (nextRequest != null) { transmitResultItem(page, nextRequest); page.setSkip(true); originalReq.setNextRequest(nextRequest); } }
/** * 当某个解析规则不会产生新的下载请求时(这种情况下当前page生命周期已经结束),当前page必须解析完该规则下的所有字段,存在子规则层层嵌套的情况 * * @param parentRuleId * @param page * @param nextRequest * @param templast * @throws PageProcessException */ private void ruleComplierLoop(int parentRuleId, Page page, Request nextRequest, Request templast) throws PageProcessException { List<SpiderFieldRule> childs = new ArrayList<SpiderFieldRule>(); for (SpiderFieldRule fieldRule : fieldRules) { if (fieldRule.getParentId() == parentRuleId) { childs.add(fieldRule); } } if (childs.size() == 0) { return; } else { for (SpiderFieldRule child : childs) { List<String> results; StringBuilder sb; switch (child.getType()) { case 0: results = page.getHtml().regex(child.getRule()).all(); if (results.size() == 0) { if (child.getAllowEmpty() == 1) { throw new PageProcessException( String.format( "{fieldname:@#%s@#,fieldrule:@#%s@#,parentId:@#%d@#,type:@#regex@#}", child.getFieldName(), child.getRule(), child.getParentId())); } } // 判断是否会产生新的下载请求 if (child.getAdditionDownload() == 1) { // page.setIncludeAddition(true); // 将fieldRule.getId()保存下来,新请求的页面将筛选parentid为该id的规则进行解析 for (String result : results) { if (!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); } Request additionReq = new Request(result); additionReq.setFieldRuleId(child.getId()); if (templast == null) { nextRequest = additionReq; templast = nextRequest; } else { templast.setNextRequest(additionReq); templast = additionReq; } if (child.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( child.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else if (child.getAdditionRequest() == 1) { for (String result : results) { if (!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); } Request additionReq = new Request(result); additionReq.setFieldRuleId(child.getId()); page.addTargetRequest(additionReq); if 
(child.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( child.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else { if (child.getNeedPersistence() == 0) { sb = new StringBuilder(); for (String result : results) { sb.append(result + ","); } page.putField( child.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } ruleComplierLoop(child.getId(), page, nextRequest, templast); } break; case 1: results = page.getHtml().xpath(child.getRule()).all(); if (results.size() == 0) { if (child.getAllowEmpty() == 1) { throw new PageProcessException( String.format( "{fieldname:@#%s@#,fieldrule:@#%s@#,parentId:@#%d@#,type:@#xpath@#}", child.getFieldName(), child.getRule(), child.getParentId())); } } // 判断是否会产生新的下载请求 if (child.getAdditionDownload() == 1) { // page.setIncludeAddition(true); // 将fieldRule.getId()保存下来,新请求的页面将筛选parentid为该id的规则进行解析 for (String result : results) { if (!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); } Request additionReq = new Request(result); additionReq.setFieldRuleId(child.getId()); if (templast == null) { nextRequest = additionReq; templast = nextRequest; } else { templast.setNextRequest(additionReq); templast = additionReq; } if (child.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( child.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else if (child.getAdditionRequest() == 1) { for (String result : results) { if (!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); } Request additionReq = new Request(result); additionReq.setFieldRuleId(child.getId()); page.addTargetRequest(additionReq); if (child.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( child.getFieldName(), 
StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else { if (child.getNeedPersistence() == 0) { sb = new StringBuilder(); for (String result : results) { sb.append(result + ","); } page.putField( child.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } ruleComplierLoop(child.getId(), page, nextRequest, templast); } break; case 2: results = page.getHtml().css(child.getRule()).all(); if (results.size() == 0) { if (child.getAllowEmpty() == 1) { throw new PageProcessException( String.format( "{fieldname:@#%s@#,fieldrule:@#%s@#,parentId:@#%d@#,type:@#css@#}", child.getFieldName(), child.getRule(), child.getParentId())); } } // 判断是否会产生新的下载请求 if (child.getAdditionDownload() == 1) { // page.setIncludeAddition(true); // 将fieldRule.getId()保存下来,新请求的页面将筛选parentid为该id的规则进行解析 for (String result : results) { if (!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); } Request additionReq = new Request(result); additionReq.setFieldRuleId(child.getId()); if (templast == null) { nextRequest = additionReq; templast = nextRequest; } else { templast.setNextRequest(additionReq); templast = additionReq; } if (child.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( child.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else if (child.getAdditionRequest() == 1) { for (String result : results) { if (!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); } Request additionReq = new Request(result); additionReq.setFieldRuleId(child.getId()); page.addTargetRequest(additionReq); if (child.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( child.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else { if (child.getNeedPersistence() == 0) { sb = new StringBuilder(); for (String result : results) { 
sb.append(result + ","); } page.putField( child.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } ruleComplierLoop(child.getId(), page, nextRequest, templast); } break; default: if (page.getRequest().getExtra(child.getFieldName()) == null) { throw new PageProcessException( String.format( "{fieldname:@#%s@#,fieldrule:@#%s@#,type:@#orig@#}", child.getFieldName(), child.getRule())); } page.putField(child.getFieldName(), page.getRequest().getExtra(child.getFieldName())); break; } } } }
/** * 软件盒子 http://www.itopdog.cn/ Itopdog #81 (1)该网站的关键字搜索结果的翻页有错误,通过页面给定的结果无法进入到正确的搜索结果中 * 但是可以通过手动构造翻页url来获取,尝试多个关键字的搜索,搜索结果最多只有两页,因此,手动构造第二页的url (2)此网站的下载次数是及时更新的 (3)该网站有些应用已经无法下载 * * @author DMT */ public class Itopdog implements PageProcessor { Site site = Site.me().setCharset("utf-8").setRetryTimes(0).setSleepTime(3); @Override public Apk process(Page page) { // index page // http://www.itopdog.cn/home.php?type=az&ct=home&ac=search&q=%E6%B5%8F%E8%A7%88%E5%99%A8 if (page.getUrl().regex("http://www\\.itopdog\\.cn/home\\.php\\?*").match()) { // app的具体介绍页面 List<String> url1 = page.getHtml() .links("//div[@class='panel']") .regex("http://www\\.itopdog\\.cn/az.*") .all(); // 添加下一页url(翻页) 第2页 List<String> url2 = page.getHtml() .links("//div[@class='clearfix pagewrap']") .regex("http://www\\.itopdog\\.cn/.*") .all(); if (url2.isEmpty() == false) { String url = page.getUrl() + "&per_page=20"; url1.add(url); } // remove the duplicate urls in list HashSet<String> urlSet = new HashSet<String>(url1); // add the urls to page Iterator<String> it = urlSet.iterator(); while (it.hasNext()) { page.addTargetRequest(it.next()); } } // the app detail page if (page.getUrl().regex("http://www\\.itopdog\\.cn/az.*").match()) { // Apk apk = null; // String appName = null; //app名字 // String appDetailUrl = null; //具体页面url // String appDownloadUrl = null; //app下载地址 // String osPlatform = null ; //运行平台 // String appVersion = null; //app版本 // String appSize = null; //app大小 // String appUpdateDate = null; //更新日期 // String appType = null; //下载的文件类型 apk?zip?rar?ipa? 
// String appvender = null; //app开发者 APK这个类中尚未添加 // String appDownloadedTime=null; //app的下载次数 // // //有的名字里面包含版本号,有的不包含 // String nameString=page.getHtml().xpath("//font[@class='h2_css']/text()").toString(); // if(nameString != null && nameString.contains("V")) // { // appName=nameString.substring(0,nameString.indexOf("V")-1); // appVersion = nameString.substring(nameString.indexOf("V")+1,nameString.length()); // } // else if(nameString != null && nameString.contains("v")) // { // appName=nameString.substring(0,nameString.indexOf("v")-1); // appVersion = nameString.substring(nameString.indexOf("V")+1,nameString.length()); // } // else if(nameString != null && nameString.contains(".")) // { // appName=nameString.substring(0,nameString.indexOf(".")-1); // appVersion = nameString.substring(nameString.indexOf(".")-1,nameString.length()); // } // else // { // appName = nameString; // appVersion = null; // } // // appDetailUrl = page.getUrl().toString(); // // appDownloadUrl = page.getHtml().xpath("//div[@class='down-btn']/a/@href").toString(); // // osPlatform = page.getHtml().xpath("//dl[@class='clearfix // appinfo']/dd[4]/text()").toString(); // // String sizeString = page.getHtml().xpath("//dl[@class='clearfix // appinfo']/dd[1]/text()").toString(); // appSize = sizeString; // // String updatedateString = page.getHtml().xpath("//div[@class='six // code2d']/strong/text()").toString(); // appUpdateDate = // updatedateString.substring(updatedateString.indexOf(":")+1,updatedateString.length()); // // String typeString = "apk"; // appType =typeString; // // appvender=null; // // //下载次数是动态获取的,使用downloadTimeUrl构造出获取下载次数的链接 // String // id=appDetailUrl.substring(appDetailUrl.indexOf("-")+1,appDetailUrl.lastIndexOf(".")-1); // String downloadTimeUrl="http://www.itopdog.cn/home.php?ct=home&ac=get_updown_api&id="+id; // String line=null; // try { // //打开一个网址,获取源文件,这个网址里面是 //// { //// state: true, //// up: "0", //// down: "0", //// up_per: "0%", //// down_per: "0%", //// 
down_all: "3" //// } // URL url=new URL(downloadTimeUrl); // BufferedReader reader; // reader = new BufferedReader(new InputStreamReader(url.openStream())); // for(int i=0;i<7;i++) // line=reader.readLine(); // //line=document.write('30168'); // } catch (Exception e) { // } // if(line != null) // appDownloadedTime =line.substring(line.indexOf("\"")+1,line.lastIndexOf("\"")-1); // //// String DownloadedTimeString = // page.getHtml().xpath("//ul[@class='mdccs']/li[9]/text()").toString(); //// appDownloadedTime = // DownloadedTimeString.substring(DownloadedTimeString.indexOf(":")+1,DownloadedTimeString.length()); // // // // System.out.println("appName="+appName); // System.out.println("appDetailUrl="+appDetailUrl); // System.out.println("appDownloadUrl="+appDownloadUrl); // System.out.println("osPlatform="+osPlatform); // System.out.println("appVersion="+appVersion); // System.out.println("appSize="+appSize); // System.out.println("appUpdateDate="+appUpdateDate); // System.out.println("appType="+appType); // System.out.println("appvender="+appvender); // System.out.println("appDownloadedTime="+appDownloadedTime); // // if(appName != null && appDownloadUrl != null){ // apk = new Apk(appName,appDetailUrl,appDownloadUrl,osPlatform // ,appVersion,appSize,appUpdateDate,appType,null); // } // return Itopdog_Detail.getApkDetail(page); } return null; } @Override public Site getSite() { return site; } @Override public List<Apk> processMulti(Page page) { // TODO Auto-generated method stub return null; } }
/** Created by yiang on 2015/4/9. */ @Component public class IndeedCrawler { private static int beginId = 0; private static int endid = 10000; private int threadNum = 10; @Qualifier("IndeedListPipeline") @Autowired private PageModelPipeline indeedListPipeline; @Qualifier("IndeedDetailPipeline") @Autowired private PageModelPipeline indeedDetailPipeline; @Resource private IndeedDao indeedDao; private static Site site = Site.me() .setTimeOut(20000) .setSleepTime(5000) .setUserAgent( "Mozilla/5.0 (compatible; Baiduspider/2.0; " + "+http://www.baidu.com/search/spider.html)"); // 列表页爬取 public void crawlList() { while (true) { // 取标签 String tag = indeedDao.getTags(); if (tag == null || tag.equals("")) { break; } int num = indeedDao.getTagNum(tag); // int endid=(int)num/10; if (num > 1000) { endid = 1000; } else { endid = num; } List<String> urls = new ArrayList<String>(); for (int id = beginId; id <= endid; id += 10) { urls.add("http://www.indeed.com/jobs?q=" + tag + "&sort=date&start=" + id); // urls.add("http://cn.indeed.com/%E5%B7%A5%E4%BD%9C?q=" + tag + "&sort=date&start=" + id); } OOSpider.create(site, indeedListPipeline, IndeedList.class) .startUrls(urls) .setSpawnUrl(false) .thread(threadNum) .run(); // 时间戳标记 indeedDao.setTimestamp(tag); } } // 详情页爬取 public void crawlDeatil() { while (true) { List<String> l = indeedDao.getTargetUrls(); if (l == null) { break; } OOSpider.create(site, indeedDetailPipeline, IndeedDetail.class) .startUrls(l) .thread(threadNum) .setSpawnUrl(false) .run(); } } public static void main(String[] args) { // 加载配置 ApplicationContext applicationContext = new ClassPathXmlApplicationContext("classpath:/spring/applicationContext*.xml"); final IndeedCrawler indeedCrawler = applicationContext.getBean(IndeedCrawler.class); indeedCrawler.crawlList(); // indeedCrawler.crawlDeatil(); } }
/** * 安卓乐园[中国] app搜索抓取 url:http://search.520apk.com/cse/search?q=QQ&s=17910776473296434043&nsid=1 * * @version 1.0.0 */ public class PagePro520apk implements PageProcessor { // 日志管理对象 private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(PagePro520apk.class); // 定义网站编码,以及间隔时间 Site site = Site.me() .setCharset("utf-8") .setRetryTimes(PropertiesUtil.getRetryTimes()) .setSleepTime(PropertiesUtil.getInterval()); /** * process the page, extract urls to fetch, extract the data and store * * @param page */ @Override public Apk process(Page page) { LOGGER.debug("crawler url: {}", page.getUrl()); // 获取搜索页面 if (page.getUrl().regex("http://search\\.520apk\\.com/cse/search\\?q=.*").match()) { LOGGER.debug("match success, url:{}", page.getUrl()); // 获取详细链接,以及分页链接 List<String> urlList = page.getHtml().links("//div[@id='results']/div[@class='result f s0']/h3").all(); urlList.addAll(page.getHtml().links("//div[@class='pager clearfix']").all()); Iterator<String> iter = Sets.newHashSet(urlList).iterator(); while (iter.hasNext()) { page.addTargetRequest(iter.next()); } // 打印搜索结果url LOGGER.debug("app info results urls: {}", page.getTargetRequests()); } // 获取信息 if (page.getUrl().regex("http://www.520apk.com/android/*").match()) { return PagePro520apk_Detail.getApkDetail(page); } return null; } /** * get the site settings * * @return site * @see us.codecraft.webmagic.Site */ @Override public Site getSite() { return site; } @Override public List<Apk> processMulti(Page page) { // TODO Auto-generated method stub return null; } }
/** * 游戏狗[中国] app搜索抓取 url:http://search.gamedog.cn/app/?keyword=QQ&platform=Android id:12 * * @version 1.0.0 */ public class PageProGameDog implements PageProcessor { // 日志管理对象 private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(PageProGameDog.class); // 定义网站编码,以及间隔时间 Site site = Site.me() .setCharset("utf-8") .setRetryTimes(PropertiesUtil.getRetryTimes()) .setSleepTime(PropertiesUtil.getInterval()); /** * process the page, extract urls to fetch, extract the data and store * * @param page */ @Override public Apk process(Page page) { LOGGER.debug("crawler url: {}", page.getUrl()); // 获取搜索页面 if (page.getUrl().regex("http://android\\.gamedog\\.cn/.*").match()) { LOGGER.debug("match success, url:{}", page.getUrl()); // 获取详细链接,以及分页链接 List<String> urlList = page.getHtml().links().regex("http://android\\.gamedog\\.cn/.*").all(); Set<String> cacheSet = Sets.newHashSet(); cacheSet.addAll(urlList); for (String temp : cacheSet) { if (PageProUrlFilter.isUrlReasonable(temp)) page.addTargetRequest(temp); } // 打印搜索结果url LOGGER.debug("app info results urls: {}", page.getTargetRequests()); } // 获取信息 if (page.getUrl().regex("http://android.gamedog.cn/soft|game|online.*").match()) { Html html = page.getHtml(); Apk apk = PageProGameDog_Detail.getApkDetail(page); page.putField("apk", apk); if (page.getResultItems().get("apk") == null) { page.setSkip(true); } } else { page.setSkip(true); } return null; } /** * get the site settings * * @return site * @see Site */ @Override public Site getSite() { return site; } @Override public List<Apk> processMulti(Page page) { // TODO Auto-generated method stub return null; } }
/** * 苹果资讯 app搜索抓取 * url:http://www.baidu.com/s?ie=UTF-8&wd=%E8%B6%85%E8%83%BD%E9%99%86%E6%88%98%20site:shouyou.178.com * * @version 1.0.0 */ public class PageProIfan178 implements PageProcessor { // 日志管理对象 private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(PageProIfan178.class); // 定义网站编码,以及间隔时间 Site site = Site.me().setCharset("utf-8").setRetryTimes(2).setSleepTime(3); /** * process the page, extract urls to fetch, extract the data and store * * @param page */ @Override public Apk process(Page page) { LOGGER.debug("crawler url: {}", page.getUrl()); // 获取搜索页面 if (page.getUrl().regex("http://www\\.baidu\\.com/s\\?ie=UTF-8.*").match()) { LOGGER.debug("match success, url:{}", page.getUrl()); // 获取详细链接,以及分页链接 List<String> urlList = page.getHtml().links("//div[@id='content_left']/div/h3[@class='t']").all(); Iterator<String> iter = Sets.newHashSet(urlList).iterator(); while (iter.hasNext()) { page.addTargetRequest(iter.next()); } // 打印搜索结果url LOGGER.debug("app info results urls: {}", page.getTargetRequests()); } // 获取信息 if (page.getUrl().regex("http://www\\.baidu\\.com/link\\?url=.*").match()) { // 获取dom对象 // Html html = page.getHtml(); // // // 找出对应需要信息 // String appDetailUrl = page.getUrl().toString(); // String appName = // html.xpath("//div[@class='page-page']/div[@class='t1']/h1/text()").toString(); // if (StringUtils.isEmpty(appName)) { // appName = html.xpath("//div[@class='box-dw-l-t']/h1/strong/text()").get(); // } // String appVersion = null; // String appDownloadUrl = html.xpath("//div[@class='clearfix t2']/a/@href").get(); // if (StringUtils.isEmpty(appDownloadUrl)) { // appDownloadUrl = // html.xpath("//div[@class='dw-btn']/a[@class='dw-btn2']/@href").get(); // } // String osPlatform = null; // String appSize = // StringUtils.substringAfterLast(html.xpath("//div[@class='txt']/div[@class='clearfix // inf']/p[1]/text()").get(), ":"); // String appUpdateDate = null; // String downloadNum = null; // String appDesc = 
html.xpath("//div[@class='app_detail_infor']/p/text()").get(); // if (StringUtils.isEmpty(appDesc)) { // appDesc = // html.xpath("//div[@class='box-dw-l']/div[@class='jianjie']/p/text()").get(); // } // String appType = null; // // LOGGER.debug("name:{}, version: {}, url:{}, size: {}, appType: {}, os: {}, // date:{}, appDesc:{}", appName, appVersion, appDownloadUrl, appSize, appType, osPlatform, // appUpdateDate, appDesc); // // if (null != appName && null != appDownloadUrl) { // Apk apk = new Apk(appName, appDetailUrl, appDownloadUrl, osPlatform, // appVersion, appSize, appUpdateDate, null != appType ? appType : "APK"); // apk.setAppDescription(appDesc); return PageProIfan178_Detail.getApkDetail(page); } return null; } /** * get the site settings * * @return site * @see us.codecraft.webmagic.Site */ @Override public Site getSite() { return site; } @Override public List<Apk> processMulti(Page page) { // TODO Auto-generated method stub return null; } }
@Component("SpiderZhaopinProcessor") public class SpiderZhaopinProcessor implements PageProcessor { public Site site = Site.me().setRetryTimes(3).setSleepTime(3000); private static final Logger logger = LoggerFactory.getLogger(SpiderZhaopinProcessor.class); private String pageUrl; private static HashMap<String, Integer> doneLinks = new HashMap<String, Integer>(); private static Integer doneNum = 0; public SpiderZhaopinProcessor() { site.setDomain("zhaopin.com"); site.setUserAgent( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36"); } @Override public Site getSite() { return site; } @Override public void process(Page page) { String title, companyName, salary; String keyStr, valueStr; // 属性 HashMap<String, String> propsMap = new HashMap<String, String>(); pageUrl = page.getRequest().getUrl(); Html pageHtml = page.getHtml(); Selectable pageRefLinks = page.getHtml().links(); // 1.页面是否已经存在过 synchronized (doneLinks) { if (doneLinks.containsKey(pageUrl)) { page.setSkip(true); return; } } String regEx = "http://jobs.zhaopin.com/(\\d)+.htm\\?([\\w=&]+)"; Pattern p = Pattern.compile(regEx); if (p.matcher(pageUrl).find()) { // 可以处理 System.out.println("找到一个:" + pageUrl); String pageTitle[] = pageHtml.getDocument().title().split("-"); title = pageTitle[0].replaceAll("招聘", ""); companyName = pageTitle[1]; List<String> keyList = pageHtml.xpath("//ul[@class='terminal-ul']/li").all(); String line = "", lines[]; for (int ii = 0; ii < keyList.size(); ii++) { if (keyList.get(ii) == null) { logger.warn("why keylist is null!"); continue; } line = StringUtil.html2text(keyList.get(ii)); lines = line.split(":"); if (lines.length != 2) continue; keyStr = lines[0]; valueStr = lines[1]; logger.debug(keyStr + ":" + valueStr); if (!(valueStr == null || keyStr == null)) { if (keyStr.trim().length() > 0 && valueStr.trim().length() > 0) propsMap.put(keyStr, valueStr); } } salary = propsMap.get("职位月薪"); if (salary == null) 
salary = ""; List<String> textList = pageHtml.xpath("//div[@class='tab-cont-box']/div[@class='tab-inner-cont']").all(); String jobDesc = textList.get(0); String companyDesc = textList.get(1); // save page.putField("jobTitle", title.trim()); page.putField("company", companyName.trim()); page.putField("salary", salary); page.putField("keyword", ""); page.putField("descr", jobDesc.trim()); page.putField("companyDesc", companyDesc); page.putField("props", propsMap); page.putField("url", pageUrl); page.putField("source", "zhaopin"); } else { page.setSkip(true); } // 分页、列表 page.addTargetRequests( pageRefLinks.regex("http://[\\w,\\/-_]+.zhaopin.com/[\\w,\\/.-?&_]+").all()); // page.addTargetRequests(pageRefLinks.regex(regEx).all()); // page.addTargetRequests(pageRefLinks.regex("http://sou.zhaopin.com/[\\w,\\/.-?&_]+").all()); synchronized (doneLinks) { doneLinks.put(pageUrl, doneNum++); SpiderRecord.addKeyNum("Zhaopin_all", doneNum); } } private String clearHtml(String tag) { return StringUtil.html2text(tag).replaceAll(":", "").trim(); } }
/* * author : 罗一鑫 * date : 2015/10/19 * 根据播放页html,从中抓取qitanid * * */ public class CommentCrawler implements PageProcessor { private static String part1OfUrl = "http://api.t.iqiyi.com/qx_api/comment/get_video_comments?aid="; private static String part2OfUrl = "&categoryid=1&cb=fnsucc&escape=true&need_reply=true&need_subject=true&need_total=1&page="; private static String part3OfUrl = "&page_size=10&page_size_reply=3&qitan_comment_type=1&qitancallback=fnsucc&qitanid="; private static String part4OfUrl = "&sort=hot&t=&tvid="; /* * xpath解析 * */ private static String TOTAL = "//div/div[@class='wrapper']/div[@class='wrapper-left']/" + "div[@id='block-I']/div[@id='qitancommonarea']/@"; private static String QITANID_XPATH = TOTAL + "data-qitancomment-qitanid"; private static String TVID_XPATH = TOTAL + "data-qitancomment-tvid"; private static String TITLE_XPATH = "//head/title"; /* * 站点设置 * */ private Site site = Site.me() .setDomain("http://www.iqiyi.com/") .setSleepTime(3000) .setUserAgent( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) " + "AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); @Override public Site getSite() { // TODO Auto-generated method stub return this.site; } @Override public void process(Page page) { if (!GlobalVar.isFilePlayUrlsReaded) { GlobalVar.isFilePlayUrlsReaded = true; FileInputStream fis = null; InputStreamReader isr = null; BufferedReader br = null; String str = null; try { fis = new FileInputStream("./playUrls.txt"); isr = new InputStreamReader(fis); br = new BufferedReader(isr); while ((str = br.readLine()) != null) { page.addTargetRequest(str); } } catch (Exception e) { e.printStackTrace(); } finally { try { br.close(); isr.close(); fis.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } System.out.println("test"); // TODO Auto-generated method stub page.putField( "qitanid", page.getHtml().xpath(QITANID_XPATH).toString() + "," + 
page.getHtml().xpath(TVID_XPATH).toString() + "," + page.getHtml().xpath(TITLE_XPATH).toString().split(">")[1].split("-")[0]); } }
public class PagePro360 implements PageProcessor { // Site site = Site.me().setCharset("utf-8").setRetryTimes(2).setSleepTime(0); Site site = Site.me() .setCharset("utf-8") .setRetryTimes(PropertiesUtil.getRetryTimes()) .setSleepTime(PropertiesUtil.getInterval()); private Logger LOGGER = LoggerFactory.getLogger(PagePro360.class); public Apk process(Page page) { List<String> urls = page.getHtml().links().regex("(http://zhushou\\.360\\.cn/.*)").all(); Set<String> cacheSet = Sets.newHashSet(); cacheSet.addAll(urls); // 构造分页 // http://zhushou.360.cn/list/index/cid/1 // if(page.getUrl().regex("(http://zhushou\\.360\\.cn/detail/list/index/.*)").match()){ if (page.getRequest().getUrl().equals("http://zhushou.360.cn/list/index/cid/1") || page.getRequest().getUrl().equals("http://zhushou.360.cn/list/index/cid/2")) { String pageStr = page.getHtml().regex("(pg\\.pageCount\\s=\\s\\w+)").toString(); int pageCount = Integer.parseInt(pageStr.substring(15)); List<String> url1 = new ArrayList<String>(); for (int i = 2; i <= pageCount; i++) { url1.add(page.getRequest().getUrl() + "?page=" + i); } page.addTargetRequests(url1); } // 剔除锚点.*?#.* // #expand,#next,#prev,#comment,#nogo,#guess-like,#btn-install-now-log,#comment-list,#report for (String url : cacheSet) { if (url.toString().endsWith("#expand") || url.toString().endsWith("#next") || url.toString().endsWith("#prev") || url.toString().endsWith("#comment") || url.toString().endsWith("#nogo") || url.toString().endsWith("#guess-like") || url.toString().endsWith("#btn-install-now-log") || url.toString().endsWith("#comment-list") || url.toString().endsWith("#report")) { LOGGER.error("anchor:" + url.toString()); } else { LOGGER.info(url.toString()); page.addTargetRequest(url); } } // 提取页面信息 if (page.getUrl().regex("(http://zhushou\\.360\\.cn/detail/index/soft_id/.*)").match()) { Html html = page.getHtml(); Apk apk = PagePro360_Detail.getApkDetail(page); page.putField("apk", apk); if (page.getResultItems().get("apk") == null) { 
page.setSkip(true); } } else { page.setSkip(true); } return null; } public static void main(String[] args) { String url = "http://zhushou.360.cn/list/index/cid/1?page=24#expand"; // // if(url.endsWith("#expand||#next||#prev||#comment||#nogo||#guess-like||#btn-install-now-log")){ if (url.endsWith("#expand || #next")) { System.out.println("true"); } } @Override public List<Apk> processMulti(Page page) { // TODO Auto-generated method stub return null; } public Site getSite() { return site; } }
@Override public Page download(Request request, Task task) { Site site = null; if (task != null) { site = task.getSite(); } Set<Integer> acceptStatCode; String charset = null; Map<String, String> headers = null; if (site != null) { acceptStatCode = site.getAcceptStatCode(); charset = site.getCharset(); headers = site.getHeaders(); } else { acceptStatCode = Sets.newHashSet(200); } logger.info("downloading page " + request.getUrl()); RequestBuilder requestBuilder = RequestBuilder.get().setUri(request.getUrl()); if (headers != null) { for (Map.Entry<String, String> headerEntry : headers.entrySet()) { requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue()); } } RequestConfig.Builder requestConfigBuilder = RequestConfig.custom() .setConnectionRequestTimeout(site.getTimeOut()) .setSocketTimeout(site.getTimeOut()) .setConnectTimeout(site.getTimeOut()) .setCookieSpec(CookieSpecs.BEST_MATCH); if (site != null && site.getHttpProxy() != null) { requestConfigBuilder.setProxy(site.getHttpProxy()); } requestBuilder.setConfig(requestConfigBuilder.build()); CloseableHttpResponse httpResponse = null; try { httpResponse = getHttpClient(site).execute(requestBuilder.build()); int statusCode = httpResponse.getStatusLine().getStatusCode(); if (acceptStatCode.contains(statusCode)) { // charset if (charset == null) { String value = httpResponse.getEntity().getContentType().getValue(); charset = UrlUtils.getCharset(value); } return handleResponse(request, charset, httpResponse, task); } else { logger.warn("code error " + statusCode + "\t" + request.getUrl()); return null; } } catch (IOException e) { logger.warn("download page " + request.getUrl() + " error", e); if (site.getCycleRetryTimes() > 0) { return addToCycleRetry(request, site); } return null; } finally { try { if (httpResponse != null) { // ensure the connection is released back to pool EntityUtils.consume(httpResponse.getEntity()); } } catch (IOException e) { logger.warn("close response fail", e); } } }
public SpiderZhaopinProcessor() { site.setDomain("zhaopin.com"); site.setUserAgent( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36"); }
/** * url:http://s.shuiguo.com/qq_1_1.html id:38 * * @version 1.0.0 */ public class PageProShuiGuo implements PageProcessor { // 日志管理对象 private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(PageProShuiGuo.class); // 定义网站编码,以及间隔时间 Site site = Site.me() .setCharset("utf-8") .setRetryTimes(PropertiesUtil.getRetryTimes()) .setSleepTime(PropertiesUtil.getInterval()); /** * process the page, extract urls to fetch, extract the data and store * * @param page */ @Override public Apk process(Page page) { LOGGER.debug("crawler url: {}", page.getUrl()); // 获取搜索页面 if ((page.getUrl().regex("http://www.shuiguo.com/android/.*").match() && page.getUrl().get().contains(".html")) || page.getUrl().regex("http://a\\.shuiguo\\.com/phb/").match()) { LOGGER.debug("match success, url:{}", page.getUrl()); // 获取详细链接,以及分页链接 List<String> urlList = page.getHtml().links().regex("http://www\\.shuiguo\\.com/android/.*").all(); Iterator<String> iter = Sets.newHashSet(urlList).iterator(); while (iter.hasNext()) { page.addTargetRequest(iter.next()); } // 打印搜索结果url LOGGER.debug("app info results urls: {}", page.getTargetRequests()); } // 获取信息 if (page.getUrl().regex("http://www\\.shuiguo\\.com/android/.*").match() && !page.getUrl().get().endsWith(".html")) { Html html = page.getHtml(); Apk apk; try { apk = PageProShuiGuo_Detail.getApkDetail(page); page.putField("apk", apk); if (page.getResultItems().get("apk") == null) { page.setSkip(true); } } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } else { page.setSkip(true); } return null; } /** * get the site settings * * @return site * @see Site */ @Override public Site getSite() { return site; } @Override public List<Apk> processMulti(Page page) { // TODO Auto-generated method stub return null; } }