// 列表页爬取 public void crawlList() { while (true) { // 取标签 String tag = indeedDao.getTags(); if (tag == null || tag.equals("")) { break; } int num = indeedDao.getTagNum(tag); // int endid=(int)num/10; if (num > 1000) { endid = 1000; } else { endid = num; } List<String> urls = new ArrayList<String>(); for (int id = beginId; id <= endid; id += 10) { urls.add("http://www.indeed.com/jobs?q=" + tag + "&sort=date&start=" + id); // urls.add("http://cn.indeed.com/%E5%B7%A5%E4%BD%9C?q=" + tag + "&sort=date&start=" + id); } OOSpider.create(site, indeedListPipeline, IndeedList.class) .startUrls(urls) .setSpawnUrl(false) .thread(threadNum) .run(); // 时间戳标记 indeedDao.setTimestamp(tag); } }
// 详情页爬取 public void crawlDeatil() { while (true) { List<String> l = indeedDao.getTargetUrls(); if (l == null) { break; } OOSpider.create(site, indeedDetailPipeline, IndeedDetail.class) .startUrls(l) .thread(threadNum) .setSpawnUrl(false) .run(); } }