示例#1
0
  // Crawl list pages
  /**
   * Crawls list pages tag by tag until the DAO stops handing out tags.
   *
   * <p>For every tag returned by {@code indeedDao.getTags()} this method:
   * <ol>
   *   <li>reads {@code indeedDao.getTagNum(tag)} and stores it, capped at 1000, in {@code endid};</li>
   *   <li>builds one list-page URL per {@code start} offset, from {@code beginId} up to and
   *       including {@code endid}, in steps of 10;</li>
   *   <li>runs an {@code OOSpider} over those URLs with {@code threadNum} threads and
   *       {@code setSpawnUrl(false)};</li>
   *   <li>marks the tag with a timestamp via {@code indeedDao.setTimestamp(tag)}.</li>
   * </ol>
   *
   * <p>The loop ends as soon as {@code getTags()} returns {@code null} or an empty string.
   */
  public void crawlList() {
    for (String tag = indeedDao.getTags();
        tag != null && !tag.isEmpty();
        tag = indeedDao.getTags()) {

      // Upper bound for the "start" offset: never page past 1000.
      // NOTE(review): presumably getTagNum is the result count for the tag - confirm in the DAO.
      int num = indeedDao.getTagNum(tag);
      endid = Math.min(num, 1000);

      // NOTE(review): tag is placed into the query string as-is (no URL encoding);
      // confirm that tags are stored in a URL-safe form.
      String urlPrefix = "http://www.indeed.com/jobs?q=" + tag + "&sort=date&start=";
      List<String> pageUrls = new ArrayList<String>();
      int offset = beginId;
      while (offset <= endid) {
        pageUrls.add(urlPrefix + offset);
        offset += 10;
      }

      OOSpider.create(site, indeedListPipeline, IndeedList.class)
          .startUrls(pageUrls)
          .setSpawnUrl(false)
          .thread(threadNum)
          .run();

      // Timestamp mark for the tag that was just crawled.
      indeedDao.setTimestamp(tag);
    }
  }
示例#2
0
  // Crawl detail pages
  /**
   * Crawls detail pages in batches until the DAO has no more target URLs.
   *
   * <p>Each pass fetches a batch from {@code indeedDao.getTargetUrls()} and runs an
   * {@code OOSpider} over it with {@code threadNum} threads and {@code setSpawnUrl(false)}.
   * The loop stops when the batch is {@code null} or empty.
   */
  public void crawlDeatil() {
    while (true) {
      List<String> targetUrls = indeedDao.getTargetUrls();
      // Stop on an empty batch as well as on null: if the DAO ever returns an empty
      // (non-null) list, a null-only check would keep starting spiders with nothing
      // to crawl and the loop would never end.
      if (targetUrls == null || targetUrls.isEmpty()) {
        break;
      }
      OOSpider.create(site, indeedDetailPipeline, IndeedDetail.class)
          .startUrls(targetUrls)
          .thread(threadNum)
          .setSpawnUrl(false)
          .run();
    }
  }