@Override public synchronized void push(Request request, Task task) { Jedis jedis = pool.getResource(); // 使用SortedSet进行url去重 if (jedis.zrank(SET_PREFIX + task.getUUID(), request.getUrl()) == null) { // 使用List保存队列 jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl()); jedis.zadd(SET_PREFIX + task.getUUID(), System.currentTimeMillis(), request.getUrl()); } pool.returnResource(jedis); }
@Override public synchronized Request poll(Task task) { Jedis jedis = pool.getResource(); String url = jedis.lpop(QUEUE_PREFIX + task.getUUID()); pool.returnResource(jedis); if (url == null) { return null; } return new Request(url); }
@Override public void process(Object o, Task task) { String path = this.path + "/" + task.getUUID() + "/"; try { String filename; if (o instanceof HasKey) { filename = path + ((HasKey) o).key() + ".json"; } else { filename = path + DigestUtils.md5Hex(ToStringBuilder.reflectionToString(o)) + ".json"; } PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(filename))); printWriter.write(JSON.toJSONString(o)); printWriter.close(); } catch (IOException e) { logger.warn("write file error", e); } }
@Override public Page download(Request request, Task task) { checkInit(); WebDriver webDriver; try { webDriver = webDriverPool.get(); } catch (InterruptedException e) { logger.warn("interrupted", e); return null; } logger.info("downloading page " + request.getUrl()); webDriver.get(request.getUrl()); try { Thread.sleep(sleepTime); } catch (InterruptedException e) { e.printStackTrace(); } WebDriver.Options manage = webDriver.manage(); Site site = task.getSite(); if (site.getCookies() != null) { for (Map.Entry<String, String> cookieEntry : site.getCookies().entrySet()) { Cookie cookie = new Cookie(cookieEntry.getKey(), cookieEntry.getValue()); manage.addCookie(cookie); } } /* * TODO You can add mouse event or other processes * * @author: [email protected] */ WebElement webElement = webDriver.findElement(By.xpath("/html")); String content = webElement.getAttribute("outerHTML"); Page page = new Page(); page.setRawText(content); page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl()))); page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); webDriverPool.returnToPool(webDriver); return page; }
@Override public Page download(Request request, Task task) { Site site = null; if (task != null) { site = task.getSite(); } Set<Integer> acceptStatCode; String charset = null; Map<String, String> headers = null; if (site != null) { acceptStatCode = site.getAcceptStatCode(); charset = site.getCharset(); headers = site.getHeaders(); } else { acceptStatCode = Sets.newHashSet(200); } logger.info("downloading page " + request.getUrl()); RequestBuilder requestBuilder = RequestBuilder.get().setUri(request.getUrl()); if (headers != null) { for (Map.Entry<String, String> headerEntry : headers.entrySet()) { requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue()); } } RequestConfig.Builder requestConfigBuilder = RequestConfig.custom() .setConnectionRequestTimeout(site.getTimeOut()) .setSocketTimeout(site.getTimeOut()) .setConnectTimeout(site.getTimeOut()) .setCookieSpec(CookieSpecs.BEST_MATCH); if (site != null && site.getHttpProxy() != null) { requestConfigBuilder.setProxy(site.getHttpProxy()); } requestBuilder.setConfig(requestConfigBuilder.build()); CloseableHttpResponse httpResponse = null; try { httpResponse = getHttpClient(site).execute(requestBuilder.build()); int statusCode = httpResponse.getStatusLine().getStatusCode(); if (acceptStatCode.contains(statusCode)) { // charset if (charset == null) { String value = httpResponse.getEntity().getContentType().getValue(); charset = UrlUtils.getCharset(value); } return handleResponse(request, charset, httpResponse, task); } else { logger.warn("code error " + statusCode + "\t" + request.getUrl()); return null; } } catch (IOException e) { logger.warn("download page " + request.getUrl() + " error", e); if (site.getCycleRetryTimes() > 0) { return addToCycleRetry(request, site); } return null; } finally { try { if (httpResponse != null) { // ensure the connection is released back to pool EntityUtils.consume(httpResponse.getEntity()); } } catch (IOException e) { logger.warn("close response fail", e); } } }
private String getFileName(String filename) { return filePath + task.getUUID() + filename; }