Пример #1
0
 /**
  * Enqueues the request's URL for the given task unless that URL was already recorded.
  *
  * <p>A Redis sorted set keyed by {@code SET_PREFIX + task.getUUID()} records every URL that has
  * been pushed, and a Redis list keyed by {@code QUEUE_PREFIX + task.getUUID()} holds the pending
  * URLs. A URL is appended to the list only when it is absent from the sorted set.
  *
  * @param request the request whose URL should be queued
  * @param task the task whose UUID selects the Redis keys
  */
 @Override
 public synchronized void push(Request request, Task task) {
   Jedis jedis = pool.getResource();
   try {
     // Use the sorted set for URL de-duplication: a null rank means the URL is not a member.
     if (jedis.zrank(SET_PREFIX + task.getUUID(), request.getUrl()) == null) {
       // Use the list as the queue of pending URLs.
       jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl());
       jedis.zadd(SET_PREFIX + task.getUUID(), System.currentTimeMillis(), request.getUrl());
     }
   } finally {
     // Always hand the connection back, even when a Redis command throws, so the pool
     // is not drained by failed pushes.
     pool.returnResource(jedis);
   }
 }
Пример #2
0
 /**
  * Removes and returns the next queued request for the given task.
  *
  * <p>Pops the head of the Redis list keyed by {@code QUEUE_PREFIX + task.getUUID()}.
  *
  * @param task the task whose UUID selects the Redis list
  * @return a new {@code Request} wrapping the popped URL, or {@code null} when nothing was popped
  */
 @Override
 public synchronized Request poll(Task task) {
   Jedis jedis = pool.getResource();
   String url;
   try {
     url = jedis.lpop(QUEUE_PREFIX + task.getUUID());
   } finally {
     // Return the connection even if the pop fails, otherwise it is lost to the pool.
     pool.returnResource(jedis);
   }
   if (url == null) {
     return null;
   }
   return new Request(url);
 }
 /**
  * Serializes {@code o} to JSON and writes it to a {@code .json} file under
  * {@code this.path + "/" + task.getUUID() + "/"}.
  *
  * <p>The file name is {@code ((HasKey) o).key()} when {@code o} implements {@code HasKey};
  * otherwise it is the MD5 hex digest of the object's reflective string form. An
  * {@code IOException} is logged as a warning and not rethrown.
  *
  * @param o the object to persist
  * @param task the task whose UUID selects the output sub-directory
  */
 @Override
 public void process(Object o, Task task) {
   String path = this.path + "/" + task.getUUID() + "/";
   try {
     String filename;
     if (o instanceof HasKey) {
       filename = path + ((HasKey) o).key() + ".json";
     } else {
       filename = path + DigestUtils.md5Hex(ToStringBuilder.reflectionToString(o)) + ".json";
     }
     PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(filename)));
     try {
       printWriter.write(JSON.toJSONString(o));
     } finally {
       // Close in finally so the file handle is released even if serialization or the
       // write throws a runtime exception.
       printWriter.close();
     }
   } catch (IOException e) {
     logger.warn("write file error", e);
   }
 }
Пример #4
0
  /**
   * Loads the request's URL in a pooled {@code WebDriver} and returns the rendered page.
   *
   * <p>After navigating, the method sleeps for {@code sleepTime} milliseconds, adds the site's
   * cookies (if any) to the driver, and then captures the {@code outerHTML} of the root
   * {@code /html} element as the page content. The driver is returned to the pool on every exit
   * path, including when a driver call throws.
   *
   * @param request the request whose URL is loaded
   * @param task the task supplying the {@code Site} (cookies)
   * @return the populated {@code Page}, or {@code null} if the thread was interrupted while
   *     waiting for a driver from the pool
   */
  @Override
  public Page download(Request request, Task task) {
    checkInit();
    WebDriver webDriver;
    try {
      webDriver = webDriverPool.get();
    } catch (InterruptedException e) {
      logger.warn("interrupted", e);
      // Preserve the interrupt status for the caller; we are bailing out anyway.
      Thread.currentThread().interrupt();
      return null;
    }
    // Remember an interrupt received during the sleep and restore it only once the download
    // is finished, so the remaining driver calls are not affected by a pending interrupt.
    boolean interrupted = false;
    try {
      logger.info("downloading page " + request.getUrl());
      webDriver.get(request.getUrl());
      try {
        Thread.sleep(sleepTime);
      } catch (InterruptedException e) {
        logger.warn("interrupted while waiting after page load", e);
        interrupted = true;
      }
      WebDriver.Options manage = webDriver.manage();
      Site site = task.getSite();
      if (site.getCookies() != null) {
        for (Map.Entry<String, String> cookieEntry : site.getCookies().entrySet()) {
          Cookie cookie = new Cookie(cookieEntry.getKey(), cookieEntry.getValue());
          manage.addCookie(cookie);
        }
      }

      /*
       * TODO You can add mouse event or other processes
       *
       * @author: [email protected]
       */

      WebElement webElement = webDriver.findElement(By.xpath("/html"));
      String content = webElement.getAttribute("outerHTML");
      Page page = new Page();
      page.setRawText(content);
      page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl())));
      page.setUrl(new PlainText(request.getUrl()));
      page.setRequest(request);
      return page;
    } finally {
      // Always give the driver back; otherwise a failed navigation or lookup would
      // permanently remove it from the pool.
      webDriverPool.returnToPool(webDriver);
      if (interrupted) {
        Thread.currentThread().interrupt();
      }
    }
  }
Пример #5
0
 /**
  * Fetches the request's URL with an HTTP GET and converts the response into a {@code Page}.
  *
  * <p>Accepted status codes, charset, headers, timeouts and proxy come from the task's
  * {@code Site} when one is available. With no task or no site, only status 200 is accepted,
  * no extra headers are sent, and timeouts are left at the {@code RequestConfig} builder's
  * defaults.
  *
  * @param request the request whose URL is fetched
  * @param task the task supplying the {@code Site}; may be {@code null}
  * @return the handled page; {@code null} when the status code is not accepted or when an
  *     {@code IOException} occurs and cycle retry is not enabled; otherwise the result of
  *     {@code addToCycleRetry}
  */
 @Override
 public Page download(Request request, Task task) {
   Site site = null;
   if (task != null) {
     site = task.getSite();
   }
   Set<Integer> acceptStatCode;
   String charset = null;
   Map<String, String> headers = null;
   if (site != null) {
     acceptStatCode = site.getAcceptStatCode();
     charset = site.getCharset();
     headers = site.getHeaders();
   } else {
     acceptStatCode = Sets.newHashSet(200);
   }
   logger.info("downloading page " + request.getUrl());
   RequestBuilder requestBuilder = RequestBuilder.get().setUri(request.getUrl());
   if (headers != null) {
     for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
       requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
     }
   }
   RequestConfig.Builder requestConfigBuilder =
       RequestConfig.custom().setCookieSpec(CookieSpecs.BEST_MATCH);
   // Site-specific settings are applied only when a site exists; dereferencing a null
   // site here would defeat the null handling above.
   if (site != null) {
     requestConfigBuilder
         .setConnectionRequestTimeout(site.getTimeOut())
         .setSocketTimeout(site.getTimeOut())
         .setConnectTimeout(site.getTimeOut());
     if (site.getHttpProxy() != null) {
       requestConfigBuilder.setProxy(site.getHttpProxy());
     }
   }
   requestBuilder.setConfig(requestConfigBuilder.build());
   CloseableHttpResponse httpResponse = null;
   try {
     httpResponse = getHttpClient(site).execute(requestBuilder.build());
     int statusCode = httpResponse.getStatusLine().getStatusCode();
     if (acceptStatCode.contains(statusCode)) {
       // No charset configured: derive it from the response's Content-Type header.
       // NOTE(review): this chain throws NullPointerException if the response has no
       // entity or no Content-Type header - confirm whether that can occur here.
       if (charset == null) {
         String value = httpResponse.getEntity().getContentType().getValue();
         charset = UrlUtils.getCharset(value);
       }
       return handleResponse(request, charset, httpResponse, task);
     } else {
       logger.warn("code error " + statusCode + "\t" + request.getUrl());
       return null;
     }
   } catch (IOException e) {
     logger.warn("download page " + request.getUrl() + " error", e);
     // Cycle retry is a site-level setting, so it is unavailable without a site.
     if (site != null && site.getCycleRetryTimes() > 0) {
       return addToCycleRetry(request, site);
     }
     return null;
   } finally {
     try {
       if (httpResponse != null) {
         // ensure the connection is released back to pool
         EntityUtils.consume(httpResponse.getEntity());
       }
     } catch (IOException e) {
       logger.warn("close response fail", e);
     }
   }
 }
Пример #6
0
 /**
  * Builds a file name by concatenating {@code filePath}, the task's UUID and the given
  * {@code filename}, in that order, without inserting any separators.
  *
  * @param filename the trailing part of the name
  * @return the concatenated file name
  */
 private String getFileName(String filename) {
   final String taskPrefix = filePath + task.getUUID();
   return taskPrefix + filename;
 }