Пример #1
0
 /**
  * Tells whether the given task qualifies for {@code acceptProcess}.
  *
  * @param task the crawler task to inspect
  * @return {@code true} only when the task is not finished and its fetch status equals 1
  */
 @Override
 protected boolean accept(CrawlerTask task) {
   // The fetch status is consulted only for tasks that are still unfinished.
   final boolean unfinished = !task.isFinished();
   return unfinished && task.getFetchStatus() == 1;
 }
Пример #2
0
  /**
   * Processes a fetched Zhihu page according to the shape of the task URL.
   *
   * <p>For a profile URL (matches {@code http://www.zhihu.com/people/\S+} but not the
   * {@code /followees} pattern) the HTML is parsed into a {@code ZhihuAccount}, the generated
   * entity is stored on the task, and the profile's {@code /followees} URL is queued as a
   * secondary URL unless {@code RedisFilter.put} returns {@code true} for it.
   *
   * <p>For a followees URL every anchor pointing at a profile is collected and queued as a main
   * URL, again subject to {@code RedisFilter.put}.
   *
   * <p>URLs matching neither pattern leave the task untouched.
   *
   * @param task the crawler task carrying the URL and the fetched HTML
   * @throws NumberFormatException if the agree/thanks counters on a profile page are not plain
   *     integers
   * @see inteldt.aspider.custom.framework.Processor#acceptProcess(inteldt.aspider.custom.framework.CrawlerTask)
   */
  @Override
  protected void acceptProcess(CrawlerTask task) {
    String taskUrl = task.getUrl();
    boolean followeesPage =
        RegexUtil.isMatched(taskUrl, "http://www.zhihu.com/people/\\S+/followees");

    // Extract user information from a profile page.
    if (!followeesPage && RegexUtil.isMatched(taskUrl, "http://www.zhihu.com/people/\\S+")) {
      /* Part 1: extract the user's profile fields. */
      ZhihuAccount account = new ZhihuAccount();
      Document doc = Jsoup.parse(task.getHtml());
      // account ID: everything after the last '/' of the URL
      account.setAccount(taskUrl.substring(taskUrl.lastIndexOf("/") + 1));
      // name: taken from the second span.name node; guard the index so a page with fewer
      // matches leaves the name unset instead of throwing IndexOutOfBoundsException
      Elements nameElems = doc.select("span.name");
      if (nameElems.size() > 1) {
        account.setName(nameElems.get(1).text());
      }
      // byname
      Element elem = doc.select("span.bio").first();
      if (elem != null) {
        account.setByname(elem.text());
      }
      // avatar: the remote image URL is stored as-is; the image is not downloaded
      elem = doc.select(".avatar").select(".avatar-l").first();
      if (elem != null) {
        account.setAvatar(elem.attr("src"));
      }
      // gender: "男" (male) when the male profile icon is present, otherwise "女" (female)
      if (doc.select(".icon").select(".icon-profile-male").isEmpty()) {
        account.setGender("女");
      } else {
        account.setGender("男");
      }
      // location
      elem = doc.select("span.location").select(".item").first();
      if (elem != null) {
        account.setLocation(elem.text());
      }
      // business
      elem = doc.select("span.business").select(".item").first();
      if (elem != null) {
        account.setBusiness(elem.text());
      }
      // company
      elem = doc.select("span.employment").select(".item").first();
      if (elem != null) {
        account.setCompany(elem.text());
      }
      // position
      elem = doc.select("span.position").select(".item").first();
      if (elem != null) {
        account.setPosition(elem.text());
      }
      // education
      elem = doc.select("span.education").select(".item").first();
      if (elem != null) {
        account.setEducation(elem.text());
      }
      // master
      elem = doc.select("span.education-extra").select(".item").first();
      if (elem != null) {
        account.setMaster(elem.text());
      }
      // description
      elem = doc.select("span.description").select(".unfold-item").first();
      if (elem != null) {
        account.setDescription(elem.text());
      }
      // okayNum (agree count)
      elem = doc.select("span.zm-profile-header-user-agree > strong").first();
      if (elem != null) {
        account.setOkayNum(Integer.parseInt(elem.text()));
      }
      // thksNum (thanks count)
      elem = doc.select("span.zm-profile-header-user-thanks > strong").first();
      if (elem != null) {
        account.setThksNum(Integer.parseInt(elem.text()));
      }
      // goodTopic: always set (possibly empty); this must not depend on whether the
      // thanks counter above was found
      List<String> goodTopic = new ArrayList<String>();
      for (Element topic : doc.select("a.zg-gray-darker")) {
        goodTopic.add(topic.text());
      }
      account.setGoodTopic(goodTopic);

      task.setEntity(account.generateEntity());
      System.out.println(account);

      /* Part 2: queue the link to this user's followees page. */
      String followeesUrl = taskUrl + "/followees";
      // links stays null when the filter returns true.
      // NOTE(review): presumably put() returns true for an already-seen URL -- confirm.
      List<String> links = null;
      if (!RedisFilter.put(taskUrl, followeesUrl)) {
        links = new ArrayList<String>();
        links.add(followeesUrl);
        TaskManager.addSecondaryUrl(followeesUrl); // enqueue the task
      }
      task.setLinks(links);
      task.setLinkExtractorFinished(true);
    }

    // Extract the followees' profile links from a followees page.
    if (followeesPage) {
      Document doc = Jsoup.parse(task.getHtml());
      Elements elems = doc.select("a[href^=http://www.zhihu.com/people/]");
      List<String> links = new ArrayList<String>();
      for (Element elem : elems) {
        String url = elem.absUrl("href");
        if (RegexUtil.isMatched(url, "http://www.zhihu.com/people/\\S+")) {
          if (!RedisFilter.put(taskUrl, url)) {
            links.add(url);
            TaskManager.addMainUrl(url); // enqueue the task
          }
        }
      }
      task.setLinks(links);
      task.setLinkExtractorFinished(true);
    }
  }
Пример #3
0
 /**
  * Handles a task that was not accepted by flagging it as finished.
  *
  * @param task the crawler task to close out
  */
 @Override
 protected void rejectProcess(CrawlerTask task) {
   // Mark the task as finished.
   task.setFinished(true);
 }