/**
 * Builds the crawl configuration for this processor: domain, a single start URL and a desktop
 * Chrome user-agent string.
 *
 * <p>NOTE(review): the configured domain and the start URL point at different hosts - confirm
 * this is intentional.
 *
 * @return a freshly created {@link Site} configuration
 */
@Override
 public Site getSite() {
   final String userAgent =
       "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31";
   return Site.me()
       .setDomain("www.diandian.com")
       .addStartUrl("http://17dujingdian.com/")
       .setUserAgent(userAgent);
 }
Example #2
0
/**
 * Page processor that follows question links found on Zhihu search/question pages and keeps a
 * page only when at least one of its answers has a vote count of {@link #voteNum} or more.
 *
 * @author [email protected] <br>
 * @since 0.5.1
 */
public class ZhihuPageProcessor implements PageProcessor {

  private Site site =
      Site.me()
          .setCycleRetryTimes(5)
          .setRetryTimes(5)
          .setSleepTime(500)
          .setTimeOut(3 * 60 * 1000)
          .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0")
          .addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
          .addHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3")
          .setCharset("UTF-8");

  /** Minimum vote count an answer needs for its page to be kept. */
  private static final int voteNum = 1000;

  /**
   * Queues linked questions for crawling and extracts vote, content and author link of answers
   * that reach the vote threshold. When several answers qualify, later ones overwrite the fields
   * written by earlier ones. Pages without a qualifying answer are marked as skipped.
   *
   * @param page the downloaded page to process
   */
  @Override
  public void process(Page page) {
    // Question links from the result list.
    List<String> relativeUrl =
        page.getHtml().xpath("//li[@class='item clearfix']/div/a/@href").all();
    page.addTargetRequests(relativeUrl);
    // Question links from the "related questions" box.
    relativeUrl =
        page.getHtml()
            .xpath("//div[@id='zh-question-related-questions']//a[@class='question_link']/@href")
            .all();
    page.addTargetRequests(relativeUrl);
    List<String> answers = page.getHtml().xpath("//div[@id='zh-question-answer-wrap']/div").all();
    boolean exist = false;
    for (String answer : answers) {
      // Parse each answer fragment once and reuse it for all three selections.
      Html answerHtml = new Html(answer);
      String vote =
          answerHtml
              .xpath("//div[@class='zm-votebar']//span[@class='count']/text()")
              .toString();
      if (parseVoteCount(vote) >= voteNum) {
        page.putField("vote", vote);
        page.putField("content", answerHtml.xpath("//div[@class='zm-editable-content']"));
        page.putField("userid", answerHtml.xpath("//a[@class='author-link']/@href"));
        exist = true;
      }
    }
    if (!exist) {
      page.setSkip(true);
    }
  }

  /**
   * Converts the extracted vote text to a number without throwing.
   *
   * @param vote raw vote text, possibly {@code null} or non-numeric
   * @return the parsed count, or {@code -1} when the text is missing or not a plain integer, so
   *     that such answers never reach the threshold instead of aborting the whole page
   */
  private static int parseVoteCount(String vote) {
    if (vote == null) {
      return -1;
    }
    try {
      return Integer.parseInt(vote.trim());
    } catch (NumberFormatException ignored) {
      // Missing or abbreviated counts are treated as "below threshold".
      return -1;
    }
  }

  /**
   * Returns the site configuration used by this processor.
   *
   * @return the shared {@link Site} instance
   */
  @Override
  public Site getSite() {
    return site;
  }

  /**
   * Runs the crawler from a Zhihu search URL with five threads, writing results through a
   * {@code FilePipeline}.
   *
   * @param args unused
   */
  public static void main(String[] args) {
    Spider.create(new ZhihuPageProcessor())
        .addUrl("http://www.zhihu.com/search?type=question&q=java")
        .addPipeline(new FilePipeline("D:\\webmagic\\"))
        .thread(5)
        .run();
  }
}
Example #3
0
  /**
   * Downloads a page by loading its URL in a pooled {@code WebDriver}, waiting {@code sleepTime}
   * milliseconds, adding the site's cookies to the driver and capturing the outer HTML of the
   * root element.
   *
   * <p>The driver is always handed back to the pool, even when loading or extraction fails, so a
   * failing page cannot drain the pool.
   *
   * @param request the request whose URL is loaded
   * @param task the task supplying the {@link Site} (cookies)
   * @return the populated page, or {@code null} if the thread was interrupted while waiting for
   *     a driver
   */
  @Override
  public Page download(Request request, Task task) {
    checkInit();
    WebDriver webDriver;
    try {
      webDriver = webDriverPool.get();
    } catch (InterruptedException e) {
      logger.warn("interrupted", e);
      // Preserve the interrupt for callers further up the stack.
      Thread.currentThread().interrupt();
      return null;
    }
    try {
      logger.info("downloading page " + request.getUrl());
      webDriver.get(request.getUrl());
      try {
        Thread.sleep(sleepTime);
      } catch (InterruptedException e) {
        // Keep going with whatever has rendered so far, but do not lose the interrupt.
        logger.warn("interrupted", e);
        Thread.currentThread().interrupt();
      }
      WebDriver.Options manage = webDriver.manage();
      Site site = task.getSite();
      if (site.getCookies() != null) {
        for (Map.Entry<String, String> cookieEntry : site.getCookies().entrySet()) {
          Cookie cookie = new Cookie(cookieEntry.getKey(), cookieEntry.getValue());
          manage.addCookie(cookie);
        }
      }

      /*
       * TODO You can add mouse event or other processes
       *
       * @author: [email protected]
       */

      WebElement webElement = webDriver.findElement(By.xpath("/html"));
      String content = webElement.getAttribute("outerHTML");
      Page page = new Page();
      page.setRawText(content);
      page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl())));
      page.setUrl(new PlainText(request.getUrl()));
      page.setRequest(request);
      return page;
    } finally {
      webDriverPool.returnToPool(webDriver);
    }
  }
/**
 * Page processor for a Sina blog: list pages feed further list and post URLs into the crawl,
 * post pages yield title, date, passage and tag fields.
 *
 * @author chenruoyu
 */
public class BlogPageProcessor implements PageProcessor {

  // 1926267847
  /** Regex matching article-list pages of the crawled blog. */
  public static final String URL_LIST =
      "http://blog\\.sina\\.com\\.cn/s/articlelist_1197161814_0_\\d+\\.html";

  /** Regex matching individual blog post pages. */
  public static final String URL_POST = "http://blog\\.sina\\.com\\.cn/s/blog_\\w+\\.html";

  private Site site = Site.me().setDomain("blog.sina.com.cn").setSleepTime(3000);

  /**
   * Dispatches on the page URL: list pages only contribute new requests, every other page is
   * treated as a post and its fields are extracted.
   *
   * @param page the downloaded page
   */
  @Override
  public void process(Page page) {
    // List page
    if (page.getUrl().regex(URL_LIST).match()) {
      page.addTargetRequests(
          page.getHtml().xpath("//div[@class=\"articleList\"]").links().regex(URL_POST).all());
      page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all());
      // Article page
    } else {
      // Title and tag may be absent from the page; trim only when something was extracted
      // instead of failing with a NullPointerException.
      page.putField(
          "title",
          trimOrNull(
              page.getHtml().xpath("//div[@class='articalTitle']/h2/tidyText()").toString()));
      page.putField(
          "date",
          page.getHtml()
              .xpath("//div[@id='articlebody']//span[@class='time SG_txtc']")
              .regex("\\((.*)\\)"));
      // Full text of the blog post
      String passage =
          page.getHtml().xpath("//div[@id='sina_keyword_ad_area2']/allText()").toString();
      page.putField("passage", passage);
      // Blog post tag: //*[@id='sina_keyword_ad_area']/table/tbody/tr/td[1]/h3/a
      String tag =
          trimOrNull(
              page.getHtml()
                  .xpath("//div[@id='sina_keyword_ad_area']/table/tbody/tr/td[1]/h3/a/text()")
                  .toString());
      page.putField("tag", tag);
      System.out.println("\n");
    }
  }

  /**
   * Trims the given text when present.
   *
   * @param text extracted text, possibly {@code null}
   * @return the trimmed text, or {@code null} when {@code text} is {@code null}
   */
  private static String trimOrNull(String text) {
    return text == null ? null : text.trim();
  }

  /**
   * Returns the site configuration used by this processor.
   *
   * @return the shared {@link Site} instance
   */
  @Override
  public Site getSite() {
    return site;
  }

  // 1926267847
  /**
   * Starts the crawl from the first article-list page.
   *
   * @param args unused
   */
  public static void main(String[] args) {
    Spider.create(new BlogPageProcessor())
        .addUrl("http://blog.sina.com.cn/s/articlelist_1197161814_0_1.html")
        .run();
  }
}
 /**
  * Returns the HTTP client to use for the given site.
  *
  * <p>For a {@code null} site a client is requested from the generator on every call. Otherwise
  * clients are kept in {@code httpClients} keyed by the site's domain: the map is consulted once
  * without locking and, on a miss, once more while holding this instance's monitor before a new
  * client is generated and stored.
  *
  * @param site the site configuration, may be {@code null}
  * @return the client for the site's domain
  */
 private CloseableHttpClient getHttpClient(Site site) {
   if (site == null) {
     return httpClientGenerator.getClient(null);
   }
   final String domain = site.getDomain();
   CloseableHttpClient client = httpClients.get(domain);
   if (client != null) {
     return client;
   }
   synchronized (this) {
     // Another thread may have created the client while we were waiting for the monitor.
     client = httpClients.get(domain);
     if (client == null) {
       client = httpClientGenerator.getClient(site);
       httpClients.put(domain, client);
     }
     return client;
   }
 }
/**
 * Page processor that extracts the entry name and description from Baidu Baike pages, plus a
 * {@code main} method demonstrating single and batch synchronous fetching.
 *
 * @author [email protected] <br>
 * @since 0.4.0
 */
public class BaiduBaikePageProcessor implements IPageProcessor {

  private Site site =
      Site.create() // .setHttpProxy(new HttpHost("127.0.0.1",8888))
          .setRetryTimes(3)
          .setSleepTime(1000)
          .setUseGzip(true);

  /**
   * Stores the entry title under {@code "name"} and the text of the first content block under
   * {@code "description"}.
   *
   * @param page the downloaded page
   */
  @Override
  public void process(Page page) {
    page.putField("name", page.getHtml().css("h1.title div.lemmaTitleH1", "text").toString());
    page.putField(
        "description",
        page.getHtml().xpath("//div[@id='lemmaContent-0']//div[@class='para']/allText()"));
  }

  /**
   * Returns the site configuration used by this processor.
   *
   * @return the shared {@link Site} instance
   */
  @Override
  public Site getSite() {
    return site;
  }

  /**
   * Fetches one search URL, then a batch of search URLs, printing the extracted results. The
   * spider is closed even when a fetch fails.
   *
   * @param args unused
   */
  public static void main(String[] args) {
    Spider spider = Spider.create(new BaiduBaikePageProcessor()).thread(2);
    try {
      // single download
      String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8";
      ResultItems resultItems = spider.<ResultItems>get(String.format(urlTemplate, "水力发电"));
      System.out.println(resultItems);

      // multidownload
      List<String> list = new ArrayList<String>();
      list.add(String.format(urlTemplate, "风力发电"));
      list.add(String.format(urlTemplate, "太阳能"));
      list.add(String.format(urlTemplate, "地热发电"));
      // NOTE(review): the same keyword is queued twice - confirm whether this duplicate is
      // deliberate.
      list.add(String.format(urlTemplate, "地热发电"));
      List<ResultItems> resultItemses = spider.<ResultItems>getAll(list);
      for (ResultItems items : resultItemses) {
        System.out.println(items.getAll());
      }
    } finally {
      // Release the spider's resources regardless of how the fetches ended.
      spider.close();
    }
  }
}
/**
 * App search crawler for Onlinedown (华军软件园, China).
 * url:http://search.newhua.com/search_list.php?searchname=MT&searchsid=6&app=search&controller=index&action=search&type=news
 * ID:23; two requests are needed.
 *
 * @version 1.0.0
 */
public class PageProOnlineDown implements PageProcessor {

  // Logger for this class
  private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(PageProOnlineDown.class);

  // Site charset, retry count and request interval
  Site site =
      Site.me()
          .setCharset("utf-8")
          .setRetryTimes(PropertiesUtil.getRetryTimes())
          .setSleepTime(PropertiesUtil.getInterval());

  /** Result set handed to the detail extractor. */
  private Set<Apk> resSet = Sets.newHashSet();

  /**
   * Processes the page: queues further onlinedown.net URLs, runs the detail extractor on
   * {@code /soft/} and {@code /softdown/} pages and stores the extracted {@code Apk} under the
   * {@code "apk"} field for {@code /softdown/} pages. Every page that is not a
   * {@code /softdown/} page, or that yields no apk, is marked as skipped.
   *
   * @param page the downloaded page
   * @return always {@code null}; results are delivered through the page's result items
   */
  @Override
  public Apk process(Page page) {
    LOGGER.debug("crawler url: {}", page.getUrl());

    // Search / listing pages
    if (page.getUrl().regex("http://www\\.onlinedown\\.net/.*").match()) {
      LOGGER.debug("match success, url:{}", page.getUrl());

      List<String> urlList = page.getHtml().links().regex("http://www\\.onlinedown\\.net/.*").all();

      // De-duplicate before queueing.
      Set<String> sets = Sets.newHashSet(urlList);
      for (String url : sets) {
        if (PageProUrlFilter.isUrlReasonable(url)) {
          if (url.contains("http://www.onlinedown.net/android/soft")) {
            // url="http://www.onlinedown.net/"+url;
            // Literal replacement; the pattern contains no regex metacharacters.
            // NOTE(review): "//android" only occurs when the URL has a doubled slash - confirm
            // this is the intended rewrite.
            url = url.replace("//android", "");
            LOGGER.debug("android soft url after rewrite: {}", url);
          }

          page.addTargetRequest(url);
        }
      }
      // Log the queued result urls
      LOGGER.debug("app info results urls: {}", page.getTargetRequests());
    }

    // Extract app information
    if (page.getUrl().regex("http://www\\.onlinedown\\.net/soft/.*").match()) {

      PageProOnlineDown_Detail.getApkDetail(page, resSet);
    }

    // Update the download address
    if (page.getUrl().regex("http://www\\.onlinedown\\.net/softdown/.*").match()) {
      Apk apk = PageProOnlineDown_Detail.getApkDetail(page, resSet);

      page.putField("apk", apk);
      if (page.getResultItems().get("apk") == null) {
        page.setSkip(true);
      }
    } else {
      page.setSkip(true);
    }
    return null;
  }

  /**
   * get the site settings
   *
   * @return site
   * @see Site
   */
  @Override
  public Site getSite() {
    return site;
  }

  /**
   * Not implemented.
   *
   * @param page the downloaded page
   * @return always {@code null}
   */
  @Override
  public List<Apk> processMulti(Page page) {
    // TODO Auto-generated method stub
    return null;
  }
}
  /**
   * Applies the field rules that belong to the current page.
   *
   * <p>Principle: every field that has to be parsed from the current page must be parsed within
   * the current page's lifecycle. Which rules belong to the current page is decided by
   * {@code page.getRequest().getFieldRuleId()}: rules whose parent id equals that id (or 0 when
   * the id is {@code null}) are evaluated.
   *
   * <p>Rule types 0, 1 and 2 select with regex, xpath and css respectively and then share the
   * same handling; any other type copies the field from the request's extras.
   *
   * @param page the downloaded page
   * @throws PageProcessException when a selector rule yields no result and the rule's
   *     {@code allowEmpty} flag is 1, or when an extras-based rule finds no extra for its field
   */
  @Override
  public void process(Page page) throws PageProcessException {

    Request originalReq = page.getRequest(); // originating request, expected to be non-null
    Integer fieldRuleId = originalReq.getFieldRuleId();

    Request nextRequest = originalReq.getNextRequest(); // head of the follow-up chain, may be null
    Request templast = nextRequest; // cursor used to append to the chain

    // Collect the rules that have to be parsed on the current page.
    final int currentParentId = fieldRuleId == null ? 0 : fieldRuleId;
    final List<SpiderFieldRule> dependenceFieldRules = new ArrayList<SpiderFieldRule>();
    for (SpiderFieldRule fieldRule : fieldRules) {
      if (fieldRule.getParentId() == currentParentId) {
        dependenceFieldRules.add(fieldRule);
      }
    }

    // Evaluate those rules.
    for (SpiderFieldRule fieldRule : dependenceFieldRules) {
      final List<String> results;
      final String emptyResultFormat;
      switch (fieldRule.getType()) {
        case 0:
          results = page.getHtml().regex(fieldRule.getRule()).all();
          emptyResultFormat = "{fieldname:@#%s@#,fieldrule:@#%s@#,parentId:@#%d@#,type:@#regex@#}";
          break;
        case 1:
          results = page.getHtml().xpath(fieldRule.getRule()).all();
          emptyResultFormat = "{fieldname:@#%s@#,fieldrule:@#%s@#,parentId:@#%d@#,type:@#xpath@#}";
          break;
        case 2:
          results = page.getHtml().css(fieldRule.getRule()).all();
          emptyResultFormat = "{fieldname:@#%s@#,fieldrule:@#%s@#,parentId:@#%d@#,type:@#css@#}";
          break;
        default:
          // No selector: the value is expected to travel with the request as an extra.
          if (page.getRequest().getExtra(fieldRule.getFieldName()) == null) {
            throw new PageProcessException(
                String.format(
                    "{fieldname:@#%s@#,fieldrule:@#%s@#,type:@#orig@#}",
                    fieldRule.getFieldName(), fieldRule.getRule()));
          }
          page.putField(
              fieldRule.getFieldName(), page.getRequest().getExtra(fieldRule.getFieldName()));
          continue;
      }

      // NOTE(review): an empty result fails when allowEmpty == 1 - confirm the flag's polarity.
      if (results.isEmpty() && fieldRule.getAllowEmpty() == 1) {
        throw new PageProcessException(
            String.format(
                emptyResultFormat,
                fieldRule.getFieldName(), fieldRule.getRule(), fieldRule.getParentId()));
      }

      if (fieldRule.getAdditionDownload() == 1) {
        // The rule spawns follow-up downloads: only this level is parsed now. The rule id is
        // stored on each new request so the downloaded page selects the rules whose parent id
        // is that id.
        for (String result : results) {
          String target = result;
          if (!target.startsWith("http")) {
            target = "http://" + site.getDomain() + target.trim().replace("|", "%7C");
          }
          Request additionReq = new Request(target);
          additionReq.setFieldRuleId(fieldRule.getId());
          if (templast == null) {
            nextRequest = additionReq;
            templast = nextRequest;
          } else {
            templast.setNextRequest(additionReq);
            templast = additionReq;
          }
          if (fieldRule.getNeedPersistence() == 0) {
            // Each result overwrites the field, so the last result is the one that remains.
            page.putField(
                fieldRule.getFieldName(),
                StringUtils.substringBeforeLast((target + ",").trim(), ","));
          }
        }
      } else if (fieldRule.getAdditionRequest() == 1) {
        // The rule spawns independent crawl requests that inherit the current result items.
        for (String result : results) {
          String target = result;
          if (!target.startsWith("http")) {
            target = "http://" + site.getDomain() + target.trim().replace("|", "%7C");
          }
          Request additionReq = new Request(target);
          additionReq.setFieldRuleId(fieldRule.getId());
          transmitResultItem(page, additionReq);
          page.addTargetRequest(additionReq);
          if (fieldRule.getNeedPersistence() == 0) {
            page.putField(
                fieldRule.getFieldName(),
                StringUtils.substringBeforeLast((target + ",").trim(), ","));
          }
        }
      } else {
        // Plain field: store all results comma-separated, then parse nested child rules on
        // this same page.
        if (fieldRule.getNeedPersistence() == 0) {
          StringBuilder sb = new StringBuilder();
          for (String result : results) {
            sb.append(result).append(",");
          }
          page.putField(
              fieldRule.getFieldName(),
              StringUtils.substringBeforeLast(sb.toString().trim(), ","));
        }
        // NOTE(review): nextRequest/templast are passed by value, so a chain started inside
        // ruleComplierLoop while templast is null is not visible here - confirm intended.
        ruleComplierLoop(fieldRule.getId(), page, nextRequest, templast);
      }
    }

    // If follow-up downloads exist, hand the results parsed so far to the next request, skip
    // the pipelines for this page and attach the chain to the original request.
    if (nextRequest != null) {
      transmitResultItem(page, nextRequest);
      page.setSkip(true);
      originalReq.setNextRequest(nextRequest);
    }
  }
  /**
   * Parses, on the current page, every rule whose parent id is {@code parentRuleId}. When a rule
   * does not produce a new download request (the current page's lifecycle ends here), all fields
   * below that rule have to be parsed now; child rules may be nested several levels deep, which
   * is handled by recursion.
   *
   * <p>NOTE(review): {@code nextRequest} and {@code templast} are reassigned locally only; the
   * caller's variables are not updated - confirm this is intended.
   *
   * @param parentRuleId id of the rule whose children are evaluated
   * @param page the page being parsed
   * @param nextRequest head of the follow-up request chain, may be {@code null}
   * @param templast cursor used to append to the chain, may be {@code null}
   * @throws PageProcessException when a selector rule yields no result and its
   *     {@code allowEmpty} flag is 1, or when an extras-based rule finds no extra for its field
   */
  private void ruleComplierLoop(int parentRuleId, Page page, Request nextRequest, Request templast)
      throws PageProcessException {

    List<SpiderFieldRule> children = new ArrayList<SpiderFieldRule>();
    for (SpiderFieldRule candidate : fieldRules) {
      if (candidate.getParentId() == parentRuleId) {
        children.add(candidate);
      }
    }
    if (children.isEmpty()) {
      return;
    }

    for (SpiderFieldRule child : children) {
      final List<String> matches;
      final String emptyResultFormat;
      switch (child.getType()) {
        case 0:
          matches = page.getHtml().regex(child.getRule()).all();
          emptyResultFormat = "{fieldname:@#%s@#,fieldrule:@#%s@#,parentId:@#%d@#,type:@#regex@#}";
          break;
        case 1:
          matches = page.getHtml().xpath(child.getRule()).all();
          emptyResultFormat = "{fieldname:@#%s@#,fieldrule:@#%s@#,parentId:@#%d@#,type:@#xpath@#}";
          break;
        case 2:
          matches = page.getHtml().css(child.getRule()).all();
          emptyResultFormat = "{fieldname:@#%s@#,fieldrule:@#%s@#,parentId:@#%d@#,type:@#css@#}";
          break;
        default:
          // No selector: take the value from the request's extras.
          if (page.getRequest().getExtra(child.getFieldName()) == null) {
            throw new PageProcessException(
                String.format(
                    "{fieldname:@#%s@#,fieldrule:@#%s@#,type:@#orig@#}",
                    child.getFieldName(), child.getRule()));
          }
          page.putField(child.getFieldName(), page.getRequest().getExtra(child.getFieldName()));
          continue;
      }

      if (matches.isEmpty() && child.getAllowEmpty() == 1) {
        throw new PageProcessException(
            String.format(
                emptyResultFormat, child.getFieldName(), child.getRule(), child.getParentId()));
      }

      if (child.getAdditionDownload() == 1) {
        // Follow-up downloads: remember the rule id on each request so the downloaded page
        // selects the rules whose parent id is that id.
        for (String match : matches) {
          String link = match;
          if (!link.startsWith("http")) {
            link = "http://" + site.getDomain() + link.trim().replace("|", "%7C");
          }
          Request followUp = new Request(link);
          followUp.setFieldRuleId(child.getId());
          if (templast == null) {
            nextRequest = followUp;
            templast = nextRequest;
          } else {
            templast.setNextRequest(followUp);
            templast = followUp;
          }
          if (child.getNeedPersistence() == 0) {
            page.putField(
                child.getFieldName(), StringUtils.substringBeforeLast((link + ",").trim(), ","));
          }
        }
      } else if (child.getAdditionRequest() == 1) {
        // Independent crawl requests, queued directly on the page.
        for (String match : matches) {
          String link = match;
          if (!link.startsWith("http")) {
            link = "http://" + site.getDomain() + link.trim().replace("|", "%7C");
          }
          Request followUp = new Request(link);
          followUp.setFieldRuleId(child.getId());
          page.addTargetRequest(followUp);
          if (child.getNeedPersistence() == 0) {
            page.putField(
                child.getFieldName(), StringUtils.substringBeforeLast((link + ",").trim(), ","));
          }
        }
      } else {
        // Plain field: store all matches comma-separated, then descend into nested rules.
        if (child.getNeedPersistence() == 0) {
          StringBuilder joined = new StringBuilder();
          for (String match : matches) {
            joined.append(match).append(",");
          }
          page.putField(
              child.getFieldName(),
              StringUtils.substringBeforeLast(joined.toString().trim(), ","));
        }
        ruleComplierLoop(child.getId(), page, nextRequest, templast);
      }
    }
  }
Example #10
0
/**
 * Crawler for the "Software Box" site (http://www.itopdog.cn/), Itopdog #81.
 *
 * <p>Notes carried over from the original author:
 *
 * <ol>
 *   <li>The pagination links on the keyword-search result page are broken: following them does
 *       not lead to the correct results. The next page can, however, be reached through a
 *       hand-built URL. Searches with several keywords never produced more than two pages, so
 *       only the URL of page two is constructed.
 *   <li>The download counter of this site is updated in real time.
 *   <li>Some applications on this site can no longer be downloaded.
 * </ol>
 *
 * @author DMT
 */
public class Itopdog implements PageProcessor {
  /** Query suffix appended to a search URL to request the second page of results. */
  private static final String SECOND_PAGE_SUFFIX = "&per_page=20";

  Site site = Site.me().setCharset("utf-8").setRetryTimes(0).setSleepTime(3);

  /**
   * Processes a search-result page and/or an application detail page.
   *
   * <p>For a search page, every detail link found in the result panel is queued, plus the
   * hand-built URL of the second result page when a pager is present. For a detail page the
   * extraction is delegated to {@code Itopdog_Detail}.
   *
   * @param page the downloaded page
   * @return the value of {@code Itopdog_Detail.getApkDetail(page)} for a detail page, otherwise
   *     {@code null}
   */
  @Override
  public Apk process(Page page) {
    // Search result page, e.g.
    // http://www.itopdog.cn/home.php?type=az&ct=home&ac=search&q=%E6%B5%8F%E8%A7%88%E5%99%A8
    if (page.getUrl().regex("http://www\\.itopdog\\.cn/home\\.php\\?*").match()) {
      // Links to the application detail pages.
      List<String> targets =
          page.getHtml()
              .links("//div[@class='panel']")
              .regex("http://www\\.itopdog\\.cn/az.*")
              .all();

      // The pager block only tells us that a next page exists; its links themselves are unusable.
      List<String> pagerLinks =
          page.getHtml()
              .links("//div[@class='clearfix pagewrap']")
              .regex("http://www\\.itopdog\\.cn/.*")
              .all();

      // Build the second-page URL by hand. Skip this when the current URL already carries the
      // suffix: page two still shows the pager, and appending the suffix again would create a
      // never-ending chain of distinct URLs.
      String currentUrl = page.getUrl().toString();
      if (!pagerLinks.isEmpty() && !currentUrl.contains(SECOND_PAGE_SUFFIX)) {
        targets.add(currentUrl + SECOND_PAGE_SUFFIX);
      }

      // Queue each URL once.
      for (String target : new HashSet<String>(targets)) {
        page.addTargetRequest(target);
      }
    }

    // Application detail page: all field extraction is done by Itopdog_Detail.
    if (page.getUrl().regex("http://www\\.itopdog\\.cn/az.*").match()) {
      return Itopdog_Detail.getApkDetail(page);
    }

    return null;
  }

  /** Returns the site settings used by this processor. */
  @Override
  public Site getSite() {
    return site;
  }

  /**
   * Not implemented.
   *
   * @param page the downloaded page (ignored)
   * @return always {@code null}
   */
  @Override
  public List<Apk> processMulti(Page page) {
    return null;
  }
}
Example #11
0
/**
 * Drives the Indeed crawl: list pages per tag first, job detail pages afterwards.
 *
 * <p>Created by yiang on 2015/4/9.
 */
@Component
public class IndeedCrawler {
  /** Value of the {@code start} parameter for the first list page of a tag. */
  private static int beginId = 0;

  /** Largest {@code start} value requested for the current tag; recomputed per tag. */
  private static int endid = 10000;

  /** Number of spider threads per crawl run. */
  private int threadNum = 10;

  @Qualifier("IndeedListPipeline")
  @Autowired
  private PageModelPipeline indeedListPipeline;

  @Qualifier("IndeedDetailPipeline")
  @Autowired
  private PageModelPipeline indeedDetailPipeline;

  @Resource private IndeedDao indeedDao;

  private static Site site =
      Site.me()
          .setTimeOut(20000)
          .setSleepTime(5000)
          .setUserAgent(
              "Mozilla/5.0 (compatible; Baiduspider/2.0; "
                  + "+http://www.baidu.com/search/spider.html)");

  /**
   * Crawls the list pages of every tag handed out by the DAO.
   *
   * <p>The loop ends when {@code indeedDao.getTags()} returns {@code null} or an empty string.
   * For each tag, list URLs are generated with {@code start} stepping by 10 from {@code beginId}
   * up to the tag's result count, capped at 1000.
   */
  public void crawlList() {

    while (true) {
      // Fetch the next tag.
      String tag = indeedDao.getTags();
      if (tag == null || tag.isEmpty()) {
        break;
      }
      // Never request a start offset beyond 1000.
      endid = Math.min(indeedDao.getTagNum(tag), 1000);

      List<String> urls = new ArrayList<String>();
      for (int id = beginId; id <= endid; id += 10) {
        urls.add("http://www.indeed.com/jobs?q=" + tag + "&sort=date&start=" + id);
      }
      OOSpider.create(site, indeedListPipeline, IndeedList.class)
          .startUrls(urls)
          .setSpawnUrl(false)
          .thread(threadNum)
          .run();

      // Timestamp the tag once its list pages are done.
      // NOTE(review): presumably this keeps getTags() from returning it again; verify in IndeedDao.
      indeedDao.setTimestamp(tag);
    }
  }

  /**
   * Crawls job detail pages in batches supplied by {@code indeedDao.getTargetUrls()}.
   *
   * <p>The loop ends when the DAO returns {@code null} or an empty list. The empty-list check
   * prevents spinning forever on a DAO that signals "nothing left" with an empty result.
   */
  public void crawlDeatil() {
    while (true) {

      List<String> targetUrls = indeedDao.getTargetUrls();
      if (targetUrls == null || targetUrls.isEmpty()) {
        break;
      }
      OOSpider.create(site, indeedDetailPipeline, IndeedDetail.class)
          .startUrls(targetUrls)
          .thread(threadNum)
          .setSpawnUrl(false)
          .run();
    }
  }

  /**
   * Loads the Spring context and runs the list crawl.
   *
   * @param args ignored
   */
  public static void main(String[] args) {
    // Load the Spring configuration.
    ApplicationContext applicationContext =
        new ClassPathXmlApplicationContext("classpath:/spring/applicationContext*.xml");
    final IndeedCrawler indeedCrawler = applicationContext.getBean(IndeedCrawler.class);

    indeedCrawler.crawlList();
    // The detail crawl (crawlDeatil) is currently not started from here.
  }
}
Example #12
0
/**
 * App search crawler for 520apk ("Android Paradise", China). Example search URL:
 * http://search.520apk.com/cse/search?q=QQ&s=17910776473296434043&nsid=1
 *
 * @version 1.0.0
 */
public class PagePro520apk implements PageProcessor {

  // Logger for this processor.
  private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(PagePro520apk.class);

  // Site charset, retry count and interval between requests.
  Site site =
      Site.me()
          .setCharset("utf-8")
          .setRetryTimes(PropertiesUtil.getRetryTimes())
          .setSleepTime(PropertiesUtil.getInterval());

  /**
   * Processes the page: queues follow-up URLs from search pages and extracts the app data from
   * detail pages.
   *
   * @param page the downloaded page
   * @return the value of {@code PagePro520apk_Detail.getApkDetail(page)} for a detail page,
   *     otherwise {@code null}
   */
  @Override
  public Apk process(Page page) {
    LOGGER.debug("crawler url: {}", page.getUrl());

    // Search result page.
    if (page.getUrl().regex("http://search\\.520apk\\.com/cse/search\\?q=.*").match()) {
      LOGGER.debug("match success, url:{}", page.getUrl());

      // Detail links of the results plus the pager links.
      List<String> urlList =
          page.getHtml().links("//div[@id='results']/div[@class='result f s0']/h3").all();
      urlList.addAll(page.getHtml().links("//div[@class='pager clearfix']").all());

      // Queue each URL once.
      for (String url : Sets.newHashSet(urlList)) {
        page.addTargetRequest(url);
      }

      // Log the URLs queued from this search page.
      LOGGER.debug("app info results urls: {}", page.getTargetRequests());
    }

    // Detail page. The dots are escaped so they only match literal dots, consistent with the
    // search-page pattern above.
    if (page.getUrl().regex("http://www\\.520apk\\.com/android/*").match()) {
      return PagePro520apk_Detail.getApkDetail(page);
    }
    return null;
  }

  /**
   * get the site settings
   *
   * @return site
   * @see us.codecraft.webmagic.Site
   */
  @Override
  public Site getSite() {
    return site;
  }

  /**
   * Not implemented.
   *
   * @param page the downloaded page (ignored)
   * @return always {@code null}
   */
  @Override
  public List<Apk> processMulti(Page page) {
    return null;
  }
}
Example #13
0
/**
 * App search crawler for GameDog (China). Example search URL:
 * http://search.gamedog.cn/app/?keyword=QQ&platform=Android (id: 12)
 *
 * @version 1.0.0
 */
public class PageProGameDog implements PageProcessor {

  // Logger for this processor.
  private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(PageProGameDog.class);

  // Site charset, retry count and interval between requests.
  Site site =
      Site.me()
          .setCharset("utf-8")
          .setRetryTimes(PropertiesUtil.getRetryTimes())
          .setSleepTime(PropertiesUtil.getInterval());

  /**
   * Processes the page: queues follow-up URLs and stores the extracted app under the
   * {@code "apk"} field for detail pages.
   *
   * <p>Pages that are not detail pages, or for which no app could be extracted, are marked as
   * skipped.
   *
   * @param page the downloaded page
   * @return always {@code null}; the result is published through {@code page.putField}
   */
  @Override
  public Apk process(Page page) {
    LOGGER.debug("crawler url: {}", page.getUrl());

    // Any page of the android sub-site: collect further links.
    if (page.getUrl().regex("http://android\\.gamedog\\.cn/.*").match()) {
      LOGGER.debug("match success, url:{}", page.getUrl());

      // All same-site links, de-duplicated.
      List<String> urlList = page.getHtml().links().regex("http://android\\.gamedog\\.cn/.*").all();
      Set<String> cacheSet = Sets.newHashSet();
      cacheSet.addAll(urlList);

      for (String candidate : cacheSet) {
        if (PageProUrlFilter.isUrlReasonable(candidate)) {
          page.addTargetRequest(candidate);
        }
      }

      // Log the URLs queued from this page.
      LOGGER.debug("app info results urls: {}", page.getTargetRequests());
    }

    // Detail page. The alternatives are grouped: without the group the pattern reads as
    // "…/soft" OR "game" OR "online.*", and the bare "game" matches every gamedog URL.
    // The group is non-capturing so the selector's group handling is unaffected.
    if (page.getUrl().regex("http://android\\.gamedog\\.cn/(?:soft|game|online).*").match()) {
      Apk apk = PageProGameDog_Detail.getApkDetail(page);

      page.putField("apk", apk);
      if (apk == null) {
        page.setSkip(true);
      }
    } else {
      page.setSkip(true);
    }

    return null;
  }

  /**
   * get the site settings
   *
   * @return site
   * @see Site
   */
  @Override
  public Site getSite() {
    return site;
  }

  /**
   * Not implemented.
   *
   * @param page the downloaded page (ignored)
   * @return always {@code null}
   */
  @Override
  public List<Apk> processMulti(Page page) {
    return null;
  }
}
Example #14
0
/**
 * Apple news app search crawler, driven through a Baidu site search. Example URL:
 * http://www.baidu.com/s?ie=UTF-8&wd=%E8%B6%85%E8%83%BD%E9%99%86%E6%88%98%20site:shouyou.178.com
 *
 * @version 1.0.0
 */
public class PageProIfan178 implements PageProcessor {

  // Logger for this processor.
  private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(PageProIfan178.class);

  // Site charset, retry count and interval between requests.
  Site site = Site.me().setCharset("utf-8").setRetryTimes(2).setSleepTime(3);

  /**
   * Queues the result links of a Baidu search page and hands Baidu redirect-link pages to
   * {@code PageProIfan178_Detail}.
   *
   * @param page the downloaded page
   * @return the value of {@code PageProIfan178_Detail.getApkDetail(page)} for a
   *     {@code /link?url=} page, otherwise {@code null}
   */
  @Override
  public Apk process(Page page) {
    LOGGER.debug("crawler url: {}", page.getUrl());

    boolean isSearchPage =
        page.getUrl().regex("http://www\\.baidu\\.com/s\\?ie=UTF-8.*").match();
    if (isSearchPage) {
      queueSearchResults(page);
    }

    boolean isResultLink =
        page.getUrl().regex("http://www\\.baidu\\.com/link\\?url=.*").match();
    // Field extraction for the target page is done entirely by PageProIfan178_Detail.
    return isResultLink ? PageProIfan178_Detail.getApkDetail(page) : null;
  }

  /** Adds every distinct result-title link of a search page to the crawl queue. */
  private void queueSearchResults(Page page) {
    LOGGER.debug("match success, url:{}", page.getUrl());

    List<String> resultLinks =
        page.getHtml().links("//div[@id='content_left']/div/h3[@class='t']").all();

    for (String link : Sets.newHashSet(resultLinks)) {
      page.addTargetRequest(link);
    }

    // Log the URLs queued from this search page.
    LOGGER.debug("app info results urls: {}", page.getTargetRequests());
  }

  /**
   * get the site settings
   *
   * @return site
   * @see us.codecraft.webmagic.Site
   */
  @Override
  public Site getSite() {
    return site;
  }

  /**
   * Not implemented.
   *
   * @param page the downloaded page (ignored)
   * @return always {@code null}
   */
  @Override
  public List<Apk> processMulti(Page page) {
    return null;
  }
}
/**
 * Page processor for zhaopin.com: extracts job postings from job detail pages and queues further
 * zhaopin.com links found on every processed page.
 *
 * <p>Processed URLs are remembered in a static map shared by all instances; every access to that
 * map is synchronized on the map itself.
 */
@Component("SpiderZhaopinProcessor")
public class SpiderZhaopinProcessor implements PageProcessor {

  public Site site = Site.me().setRetryTimes(3).setSleepTime(3000);

  private static final Logger logger = LoggerFactory.getLogger(SpiderZhaopinProcessor.class);

  /** Recognizes job detail URLs; compiled once instead of on every page. */
  private static final Pattern JOB_DETAIL_URL =
      Pattern.compile("http://jobs.zhaopin.com/(\\d)+.htm\\?([\\w=&]+)");

  /** URLs already processed, mapped to their running sequence number. */
  private static HashMap<String, Integer> doneLinks = new HashMap<String, Integer>();

  private static Integer doneNum = 0;

  /** Sets the domain and user agent on the site settings. */
  public SpiderZhaopinProcessor() {

    site.setDomain("zhaopin.com");
    site.setUserAgent(
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36");
  }

  /** Returns the site settings used by this processor. */
  @Override
  public Site getSite() {
    return site;
  }

  /**
   * Processes one page.
   *
   * <p>A URL that was processed before is skipped immediately. A job detail page has its title,
   * company, salary, descriptions and key/value properties stored as result fields; any other
   * page is marked as skipped. In both of the latter cases the zhaopin.com links of the page are
   * queued and the URL is recorded as done.
   *
   * <p>Missing parts (no company in the title, fewer than two description blocks) are stored as
   * empty strings rather than aborting the page with an index error.
   *
   * @param page the downloaded page
   */
  @Override
  public void process(Page page) {

    // Kept local: this processor may be shared, so per-page state must not live in a field.
    String pageUrl = page.getRequest().getUrl();

    Html pageHtml = page.getHtml();
    Selectable pageRefLinks = page.getHtml().links();

    // 1. Has this page been handled already?
    synchronized (doneLinks) {
      if (doneLinks.containsKey(pageUrl)) {

        page.setSkip(true);
        return;
      }
    }

    if (JOB_DETAIL_URL.matcher(pageUrl).find()) {

      // A job detail page: extract it.
      System.out.println("找到一个:" + pageUrl);

      // The document title is split on "-": first part is the job title, second the company.
      String[] titleParts = pageHtml.getDocument().title().split("-");
      String title = titleParts.length > 0 ? titleParts[0].replaceAll("招聘", "") : "";
      String companyName = titleParts.length > 1 ? titleParts[1] : "";

      HashMap<String, String> propsMap = extractProperties(pageHtml);

      String salary = propsMap.get("职位月薪");
      if (salary == null) {
        salary = "";
      }

      List<String> textList =
          pageHtml.xpath("//div[@class='tab-cont-box']/div[@class='tab-inner-cont']").all();

      // First block is the job description, second the company description.
      String jobDesc = textList.size() > 0 ? textList.get(0) : "";
      String companyDesc = textList.size() > 1 ? textList.get(1) : "";

      // save
      page.putField("jobTitle", title.trim());
      page.putField("company", companyName.trim());
      page.putField("salary", salary);

      page.putField("keyword", "");
      page.putField("descr", jobDesc.trim());
      page.putField("companyDesc", companyDesc);

      page.putField("props", propsMap);

      page.putField("url", pageUrl);
      page.putField("source", "zhaopin");

    } else {
      page.setSkip(true);
    }
    // Pagination and list pages: follow links within zhaopin.com.
    page.addTargetRequests(
        pageRefLinks.regex("http://[\\w,\\/-_]+.zhaopin.com/[\\w,\\/.-?&_]+").all());

    synchronized (doneLinks) {
      doneLinks.put(pageUrl, doneNum++);
      SpiderRecord.addKeyNum("Zhaopin_all", doneNum);
    }
  }

  /**
   * Reads the {@code terminal-ul} list items and returns those that split on ":" into exactly
   * one non-blank key and one non-blank value.
   */
  private HashMap<String, String> extractProperties(Html pageHtml) {
    HashMap<String, String> propsMap = new HashMap<String, String>();

    List<String> keyList = pageHtml.xpath("//ul[@class='terminal-ul']/li").all();
    for (String item : keyList) {

      if (item == null) {
        logger.warn("why keylist is null!");
        continue;
      }

      String[] pair = StringUtil.html2text(item).split(":");
      if (pair.length != 2) {
        continue;
      }

      String keyStr = pair[0];
      String valueStr = pair[1];

      logger.debug(keyStr + ":" + valueStr);
      if (keyStr.trim().length() > 0 && valueStr.trim().length() > 0) {
        propsMap.put(keyStr, valueStr);
      }
    }
    return propsMap;
  }

  /** Converts an HTML fragment to text, removes every ":" and trims the result. */
  private String clearHtml(String tag) {

    return StringUtil.html2text(tag).replaceAll(":", "").trim();
  }
}
/*
 * author : Luo Yixin (罗一鑫)
 * date : 2015/10/19
 * Extracts the qitanid from the HTML of a playback page.
 *
 * */
public class CommentCrawler implements PageProcessor {
  private static String part1OfUrl =
      "http://api.t.iqiyi.com/qx_api/comment/get_video_comments?aid=";
  private static String part2OfUrl =
      "&categoryid=1&cb=fnsucc&escape=true&need_reply=true&need_subject=true&need_total=1&page=";
  private static String part3OfUrl =
      "&page_size=10&page_size_reply=3&qitan_comment_type=1&qitancallback=fnsucc&qitanid=";
  private static String part4OfUrl = "&sort=hot&t=&tvid=";
  /*
   * XPath expressions
   * */
  private static String TOTAL =
      "//div/div[@class='wrapper']/div[@class='wrapper-left']/"
          + "div[@id='block-I']/div[@id='qitancommonarea']/@";
  private static String QITANID_XPATH = TOTAL + "data-qitancomment-qitanid";
  private static String TVID_XPATH = TOTAL + "data-qitancomment-tvid";
  private static String TITLE_XPATH = "//head/title";

  /*
   * Site settings
   * */
  private Site site =
      Site.me()
          .setDomain("http://www.iqiyi.com/")
          .setSleepTime(3000)
          .setUserAgent(
              "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) "
                  + "AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");

  /** Returns the site settings used by this processor. */
  @Override
  public Site getSite() {
    return this.site;
  }

  /**
   * Stores {@code "<qitanid>,<tvid>,<title>"} under the {@code "qitanid"} field.
   *
   * <p>On the first call (tracked through {@code GlobalVar.isFilePlayUrlsReaded}) every line of
   * {@code ./playUrls.txt} is additionally queued as a target request.
   *
   * @param page the downloaded playback page
   */
  @Override
  public void process(Page page) {
    if (!GlobalVar.isFilePlayUrlsReaded) {
      GlobalVar.isFilePlayUrlsReaded = true;
      queuePlayUrlsFromFile(page);
    }
    page.putField(
        "qitanid",
        page.getHtml().xpath(QITANID_XPATH).toString()
            + ","
            + page.getHtml().xpath(TVID_XPATH).toString()
            + ","
            + extractTitle(page.getHtml().xpath(TITLE_XPATH).toString()));
  }

  /**
   * Queues every line of {@code ./playUrls.txt} as a target request.
   *
   * <p>Failures are printed and otherwise ignored. The streams are closed only if they were
   * actually opened, so a missing file no longer causes a NullPointerException during cleanup.
   */
  private void queuePlayUrlsFromFile(Page page) {
    FileInputStream fis = null;
    BufferedReader br = null;
    try {
      fis = new FileInputStream("./playUrls.txt");
      br = new BufferedReader(new InputStreamReader(fis));
      String line;
      while ((line = br.readLine()) != null) {
        page.addTargetRequest(line);
      }
    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      try {
        if (br != null) {
          // Closing the reader also closes the wrapped streams.
          br.close();
        } else if (fis != null) {
          fis.close();
        }
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
  }

  /**
   * Returns the text between the first {@code '>'} of the title element and the first
   * {@code '-'} after it, or an empty string when the element is missing or has no such text.
   */
  private static String extractTitle(String titleElement) {
    if (titleElement == null) {
      return "";
    }
    String[] afterTag = titleElement.split(">");
    if (afterTag.length < 2) {
      return "";
    }
    String[] beforeDash = afterTag[1].split("-");
    return beforeDash.length > 0 ? beforeDash[0] : "";
  }
}
Example #17
0
/**
 * Page processor for the 360 app store (zhushou.360.cn): follows same-site links, builds the
 * pagination URLs of the two category index pages and stores the extracted app of detail pages
 * under the {@code "apk"} field.
 */
public class PagePro360 implements PageProcessor {
  /** Anchor suffixes of links that point back into a page and must not be queued. */
  private static final String[] IGNORED_ANCHORS = {
    "#expand",
    "#next",
    "#prev",
    "#comment",
    "#nogo",
    "#guess-like",
    "#btn-install-now-log",
    "#comment-list",
    "#report"
  };

  /** Length of the {@code "pg.pageCount = "} prefix in front of the page count. */
  private static final int PAGE_COUNT_PREFIX_LENGTH = 15;

  Site site =
      Site.me()
          .setCharset("utf-8")
          .setRetryTimes(PropertiesUtil.getRetryTimes())
          .setSleepTime(PropertiesUtil.getInterval());

  private static final Logger LOGGER = LoggerFactory.getLogger(PagePro360.class);

  /**
   * Processes one page.
   *
   * <p>Pages that are not detail pages, or for which no app could be extracted, are marked as
   * skipped.
   *
   * @param page the downloaded page
   * @return always {@code null}; the result is published through {@code page.putField}
   */
  @Override
  public Apk process(Page page) {
    List<String> urls = page.getHtml().links().regex("(http://zhushou\\.360\\.cn/.*)").all();

    Set<String> cacheSet = Sets.newHashSet();
    cacheSet.addAll(urls);

    // Build the pagination for the two category index pages, e.g.
    // http://zhushou.360.cn/list/index/cid/1
    String requestUrl = page.getRequest().getUrl();
    if ("http://zhushou.360.cn/list/index/cid/1".equals(requestUrl)
        || "http://zhushou.360.cn/list/index/cid/2".equals(requestUrl)) {
      addPagingRequests(page, requestUrl);
    }

    // Drop links that only differ by an in-page anchor.
    for (String url : cacheSet) {
      if (isAnchorUrl(url)) {
        LOGGER.error("anchor:" + url);
      } else {
        LOGGER.info(url);
        page.addTargetRequest(url);
      }
    }

    // Extract the app from a detail page.
    if (page.getUrl().regex("(http://zhushou\\.360\\.cn/detail/index/soft_id/.*)").match()) {
      Apk apk = PagePro360_Detail.getApkDetail(page);

      page.putField("apk", apk);
      if (apk == null) {
        page.setSkip(true);
      }
    } else {
      page.setSkip(true);
    }
    return null;
  }

  /**
   * Queues {@code <indexUrl>?page=2 .. ?page=<pageCount>}, reading the page count from the
   * {@code pg.pageCount = N} snippet of the page. Does nothing (apart from a warning) when the
   * snippet is missing or is not a number.
   */
  private void addPagingRequests(Page page, String indexUrl) {
    String pageStr = page.getHtml().regex("(pg\\.pageCount\\s=\\s\\w+)").toString();
    if (pageStr == null || pageStr.length() <= PAGE_COUNT_PREFIX_LENGTH) {
      LOGGER.warn("page count not found, url: " + indexUrl);
      return;
    }

    int pageCount;
    try {
      pageCount = Integer.parseInt(pageStr.substring(PAGE_COUNT_PREFIX_LENGTH));
    } catch (NumberFormatException e) {
      LOGGER.warn("page count is not a number: " + pageStr + ", url: " + indexUrl);
      return;
    }

    List<String> pagingUrls = new ArrayList<String>();
    for (int i = 2; i <= pageCount; i++) {
      pagingUrls.add(indexUrl + "?page=" + i);
    }
    page.addTargetRequests(pagingUrls);
  }

  /** Returns whether the URL ends with one of the ignored in-page anchors. */
  private static boolean isAnchorUrl(String url) {
    for (String anchor : IGNORED_ANCHORS) {
      if (url.endsWith(anchor)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Manual check of the anchor filter against a sample URL.
   *
   * @param args ignored
   */
  public static void main(String[] args) {
    String url = "http://zhushou.360.cn/list/index/cid/1?page=24#expand";
    // endsWith takes one literal suffix; several suffixes have to be tested one by one.
    if (isAnchorUrl(url)) {
      System.out.println("true");
    }
  }

  /**
   * Not implemented.
   *
   * @param page the downloaded page (ignored)
   * @return always {@code null}
   */
  @Override
  public List<Apk> processMulti(Page page) {
    return null;
  }

  /** Returns the site settings used by this processor. */
  @Override
  public Site getSite() {
    return site;
  }
}
 /**
  * Downloads the page for a request with an HTTP GET.
  *
  * <p>Accepted status codes, charset, headers, timeouts and proxy are taken from the task's site
  * when one is available. Without a site only status 200 is accepted and no timeouts, headers
  * or proxy are configured.
  *
  * @param request the request to download
  * @param task the task the request belongs to; may be {@code null}
  * @return the result of {@code handleResponse} for an accepted status code; the result of
  *     {@code addToCycleRetry} after an I/O error when the site allows cycle retries;
  *     otherwise {@code null}
  */
 @Override
 public Page download(Request request, Task task) {
   Site site = null;
   if (task != null) {
     site = task.getSite();
   }
   Set<Integer> acceptStatCode;
   String charset = null;
   Map<String, String> headers = null;
   if (site != null) {
     acceptStatCode = site.getAcceptStatCode();
     charset = site.getCharset();
     headers = site.getHeaders();
   } else {
     acceptStatCode = Sets.newHashSet(200);
   }
   logger.info("downloading page " + request.getUrl());
   RequestBuilder requestBuilder = RequestBuilder.get().setUri(request.getUrl());
   if (headers != null) {
     for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
       requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
     }
   }
   RequestConfig.Builder requestConfigBuilder =
       RequestConfig.custom().setCookieSpec(CookieSpecs.BEST_MATCH);
   // Timeouts and proxy come from the site; without a site the builder's defaults apply.
   if (site != null) {
     requestConfigBuilder
         .setConnectionRequestTimeout(site.getTimeOut())
         .setSocketTimeout(site.getTimeOut())
         .setConnectTimeout(site.getTimeOut());
     if (site.getHttpProxy() != null) {
       requestConfigBuilder.setProxy(site.getHttpProxy());
     }
   }
   requestBuilder.setConfig(requestConfigBuilder.build());
   CloseableHttpResponse httpResponse = null;
   try {
     httpResponse = getHttpClient(site).execute(requestBuilder.build());
     int statusCode = httpResponse.getStatusLine().getStatusCode();
     if (acceptStatCode.contains(statusCode)) {
       // No charset configured: take it from the Content-Type header, if the response has one.
       if (charset == null
           && httpResponse.getEntity() != null
           && httpResponse.getEntity().getContentType() != null) {
         charset = UrlUtils.getCharset(httpResponse.getEntity().getContentType().getValue());
       }
       return handleResponse(request, charset, httpResponse, task);
     } else {
       logger.warn("code error " + statusCode + "\t" + request.getUrl());
       return null;
     }
   } catch (IOException e) {
     logger.warn("download page " + request.getUrl() + " error", e);
     if (site != null && site.getCycleRetryTimes() > 0) {
       return addToCycleRetry(request, site);
     }
     return null;
   } finally {
     try {
       if (httpResponse != null) {
         // ensure the connection is released back to pool
         EntityUtils.consume(httpResponse.getEntity());
       }
     } catch (IOException e) {
       logger.warn("close response fail", e);
     }
   }
 }
  public SpiderZhaopinProcessor() {

    site.setDomain("zhaopin.com");
    site.setUserAgent(
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36");
  }
Example #20
0
/**
 * App crawler for shuiguo.com. Example URL: http://s.shuiguo.com/qq_1_1.html (id: 38)
 *
 * @version 1.0.0
 */
public class PageProShuiGuo implements PageProcessor {

  // Logger for this processor.
  private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(PageProShuiGuo.class);

  // Site charset, retry count and interval between requests.
  Site site =
      Site.me()
          .setCharset("utf-8")
          .setRetryTimes(PropertiesUtil.getRetryTimes())
          .setSleepTime(PropertiesUtil.getInterval());

  /**
   * Processes the page: queues the android links found on list pages and stores the extracted
   * app under the {@code "apk"} field for detail pages.
   *
   * <p>A page is marked as skipped when it is not a detail page, when no app could be extracted,
   * or when the extraction fails with an exception.
   *
   * @param page the downloaded page
   * @return always {@code null}; the result is published through {@code page.putField}
   */
  @Override
  public Apk process(Page page) {
    LOGGER.debug("crawler url: {}", page.getUrl());

    // List pages: android URLs containing ".html", or the ranking page.
    if ((page.getUrl().regex("http://www\\.shuiguo\\.com/android/.*").match()
            && page.getUrl().get().contains(".html"))
        || page.getUrl().regex("http://a\\.shuiguo\\.com/phb/").match()) {
      LOGGER.debug("match success, url:{}", page.getUrl());

      // Detail links and pager links.
      List<String> urlList =
          page.getHtml().links().regex("http://www\\.shuiguo\\.com/android/.*").all();

      // Queue each URL once.
      for (String url : Sets.newHashSet(urlList)) {
        page.addTargetRequest(url);
      }

      // Log the URLs queued from this page.
      LOGGER.debug("app info results urls: {}", page.getTargetRequests());
    }

    // Detail pages: android URLs that do not end with ".html".
    if (page.getUrl().regex("http://www\\.shuiguo\\.com/android/.*").match()
        && !page.getUrl().get().endsWith(".html")) {
      try {
        Apk apk = PageProShuiGuo_Detail.getApkDetail(page);
        page.putField("apk", apk);
        if (apk == null) {
          page.setSkip(true);
        }
      } catch (Exception e) {
        // Extraction failed: record the cause and skip the page so no empty result is emitted.
        LOGGER.error("apk detail extraction failed, url: " + page.getUrl(), e);
        page.setSkip(true);
      }

    } else {
      page.setSkip(true);
    }
    return null;
  }

  /**
   * get the site settings
   *
   * @return site
   * @see Site
   */
  @Override
  public Site getSite() {
    return site;
  }

  /**
   * Not implemented.
   *
   * @param page the downloaded page (ignored)
   * @return always {@code null}
   */
  @Override
  public List<Apk> processMulti(Page page) {
    return null;
  }
}