/**
 * Enqueues a request unless its URL has already been recorded in {@code urls}.
 *
 * @param request request to schedule; its URL is the de-duplication key
 * @param task owning task (not used by this implementation)
 */
@Override
public synchronized void push(Request request, Task task) {
  if (logger.isDebugEnabled()) {
    logger.debug("push to queue " + request.getUrl());
  }
  // add() reports false when the URL was already present.
  boolean firstSeen = urls.add(request.getUrl());
  if (!firstSeen) {
    return;
  }
  queue.add(request);
}
@Override public synchronized void push(Request request, Task task) { Jedis jedis = pool.getResource(); // 使用SortedSet进行url去重 if (jedis.zrank(SET_PREFIX + task.getUUID(), request.getUrl()) == null) { // 使用List保存队列 jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl()); jedis.zadd(SET_PREFIX + task.getUUID(), System.currentTimeMillis(), request.getUrl()); } pool.returnResource(jedis); }
/** * 将当前页面解析到的字段传递给新的request * * @param page * @param nextRequest */ private void transmitResultItem(Page page, Request nextRequest) { // 将当前page解析下来的字段转交到nextRequest中 Map<String, Object> fields = page.getResultItems().getAll(); for (Entry<String, Object> entry : fields.entrySet()) { nextRequest.addInheritField(entry.getKey(), entry.getValue()); } }
/**
 * Builds a {@link Page} from an HTTP response.
 *
 * <p>The response body is read fully as text using {@code charset}; the page also records
 * the request, the request URL and the response status code.
 *
 * @param request request that produced the response
 * @param charset charset name passed to {@code IOUtils.toString} when decoding the body
 * @param httpResponse response to read
 * @param task owning task (not used here)
 * @return the populated page
 * @throws IOException if reading the response body fails
 */
protected Page handleResponse(
    Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
  final String body = IOUtils.toString(httpResponse.getEntity().getContent(), charset);

  final Page result = new Page();
  result.setRawText(body);
  result.setUrl(new PlainText(request.getUrl()));
  result.setRequest(request);

  final int status = httpResponse.getStatusLine().getStatusCode();
  result.setStatusCode(status);
  return result;
}
/**
 * Downloads a page with a pooled {@code WebDriver} and returns its rendered HTML.
 *
 * <p>The driver loads the URL, the thread sleeps for {@code sleepTime}, the site's cookies
 * (if any) are added to the driver, and the {@code outerHTML} of the root {@code /html}
 * element becomes the page content. The driver is always returned to the pool, including
 * when loading or extraction throws.
 *
 * @param request request to download
 * @param task task supplying the site configuration (cookies)
 * @return the downloaded page, or {@code null} if the thread was interrupted while waiting
 *     for a driver
 */
@Override
public Page download(Request request, Task task) {
  checkInit();
  WebDriver webDriver;
  try {
    webDriver = webDriverPool.get();
  } catch (InterruptedException e) {
    logger.warn("interrupted", e);
    // Keep the interrupt visible to whoever manages this thread.
    Thread.currentThread().interrupt();
    return null;
  }
  try {
    logger.info("downloading page " + request.getUrl());
    webDriver.get(request.getUrl());
    try {
      Thread.sleep(sleepTime);
    } catch (InterruptedException e) {
      // Best effort: carry on with whatever has rendered, but restore the interrupt flag.
      logger.warn("interrupted while waiting for page " + request.getUrl(), e);
      Thread.currentThread().interrupt();
    }
    WebDriver.Options manage = webDriver.manage();
    Site site = task.getSite();
    if (site.getCookies() != null) {
      for (Map.Entry<String, String> cookieEntry : site.getCookies().entrySet()) {
        Cookie cookie = new Cookie(cookieEntry.getKey(), cookieEntry.getValue());
        manage.addCookie(cookie);
      }
    }

    // TODO: mouse events or other page interactions can be added here.

    WebElement webElement = webDriver.findElement(By.xpath("/html"));
    String content = webElement.getAttribute("outerHTML");
    Page page = new Page();
    page.setRawText(content);
    page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl())));
    page.setUrl(new PlainText(request.getUrl()));
    page.setRequest(request);
    return page;
  } finally {
    // Without this a failing page would permanently remove a driver from the pool.
    webDriverPool.returnToPool(webDriver);
  }
}
/* * 原则: * 当前page下要求解析出来的字段都要保证在当前page生命周期内解析完成。 * 哪些规则是当前page下的,由page.getRequest().getFiledRuleId()决定 */ @Override public void process(Page page) throws PageProcessException { Request originalReq = page.getRequest(); // 源rquest,非null Integer fieldRuleId = originalReq.getFieldRuleId(); if (fieldRuleId != null) { if (fieldRuleId == 20) System.out.println("test"); } Request nextRequest = originalReq.getNextRequest(); // 抽出下一步request,允许null Request templast = nextRequest; // 创建请求链时需要的临时节点 // 找到当前page下要求解析的字段 final List<SpiderFieldRule> dependenceFieldRules = new ArrayList<SpiderFieldRule>(); for (SpiderFieldRule fieldRule : fieldRules) { if (fieldRule.getParentId() == (fieldRuleId == null ? 0 : fieldRuleId)) { dependenceFieldRules.add(fieldRule); } } // 开始解析当前page下要求解析的字段 for (SpiderFieldRule fieldRule : dependenceFieldRules) { List<String> results; StringBuilder sb; switch (fieldRule.getType()) { case 0: results = page.getHtml().regex(fieldRule.getRule()).all(); if (results.size() == 0) { if (fieldRule.getAllowEmpty() == 1) { throw new PageProcessException( String.format( "{fieldname:@#%s@#,fieldrule:@#%s@#,parentId:@#%d@#,type:@#regex@#}", fieldRule.getFieldName(), fieldRule.getRule(), fieldRule.getParentId())); } } // 判断是否会产生新的下载请求,如果产生新的下载请求,则当前规则只解析顶级层,如果有子规则,要=到新的下载完成之后才能解析 if (fieldRule.getAdditionDownload() == 1) { // 将fieldRule.getId()保存下来,新请求的页面将筛选parentid为该id的规则进行解析 for (String result : results) { if (!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); } Request additionReq = new Request(result); additionReq.setFieldRuleId(fieldRule.getId()); if (templast == null) { nextRequest = additionReq; templast = nextRequest; } else { templast.setNextRequest(additionReq); templast = additionReq; } if (fieldRule.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( fieldRule.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else if 
(fieldRule.getAdditionRequest() == 1) { for (String result : results) { if (!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); } Request additionReq = new Request(result); additionReq.setFieldRuleId(fieldRule.getId()); transmitResultItem(page, additionReq); page.addTargetRequest(additionReq); if (fieldRule.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( fieldRule.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else { if (fieldRule.getNeedPersistence() == 0) { sb = new StringBuilder(); for (String result : results) { sb.append(result + ","); } page.putField( fieldRule.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } ruleComplierLoop(fieldRule.getId(), page, nextRequest, templast); } break; case 1: results = page.getHtml().xpath(fieldRule.getRule()).all(); if (results.size() == 0) { if (fieldRule.getAllowEmpty() == 1) { throw new PageProcessException( String.format( "{fieldname:@#%s@#,fieldrule:@#%s@#,parentId:@#%d@#,type:@#xpath@#}", fieldRule.getFieldName(), fieldRule.getRule(), fieldRule.getParentId())); } } // 判断是否会产生新的下载请求 if (fieldRule.getAdditionDownload() == 1) { // 将fieldRule.getId()保存下来,新请求的页面将筛选parentid为该id的规则进行解析 for (String result : results) { if (!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); } Request additionReq = new Request(result); additionReq.setFieldRuleId(fieldRule.getId()); if (templast == null) { nextRequest = additionReq; templast = nextRequest; } else { templast.setNextRequest(additionReq); templast = additionReq; } if (fieldRule.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( fieldRule.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else if (fieldRule.getAdditionRequest() == 1) { for (String result : results) { if 
(!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); } Request additionReq = new Request(result); additionReq.setFieldRuleId(fieldRule.getId()); transmitResultItem(page, additionReq); page.addTargetRequest(additionReq); if (fieldRule.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( fieldRule.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else { if (fieldRule.getNeedPersistence() == 0) { sb = new StringBuilder(); for (String result : results) { sb.append(result + ","); } page.putField( fieldRule.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } ruleComplierLoop(fieldRule.getId(), page, nextRequest, templast); } break; case 2: results = page.getHtml().css(fieldRule.getRule()).all(); if (results.size() == 0) { if (fieldRule.getAllowEmpty() == 1) { throw new PageProcessException( String.format( "{fieldname:@#%s@#,fieldrule:@#%s@#,parentId:@#%d@#,type:@#css@#}", fieldRule.getFieldName(), fieldRule.getRule(), fieldRule.getParentId())); } } // 判断是否会产生新的下载请求 if (fieldRule.getAdditionDownload() == 1) { // 将fieldRule.getId()保存下来,新请求的页面将筛选parentid为该id的规则进行解析 for (String result : results) { if (!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); } Request additionReq = new Request(result); additionReq.setFieldRuleId(fieldRule.getId()); if (templast == null) { nextRequest = additionReq; templast = nextRequest; } else { templast.setNextRequest(additionReq); templast = additionReq; } if (fieldRule.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( fieldRule.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else if (fieldRule.getAdditionRequest() == 1) { for (String result : results) { if (!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); 
} Request additionReq = new Request(result); additionReq.setFieldRuleId(fieldRule.getId()); transmitResultItem(page, additionReq); page.addTargetRequest(additionReq); if (fieldRule.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( fieldRule.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else { if (fieldRule.getNeedPersistence() == 0) { sb = new StringBuilder(); for (String result : results) { sb.append(result + ","); } page.putField( fieldRule.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } ruleComplierLoop(fieldRule.getId(), page, nextRequest, templast); } break; default: if (page.getRequest().getExtra(fieldRule.getFieldName()) == null) { throw new PageProcessException( String.format( "{fieldname:@#%s@#,fieldrule:@#%s@#,type:@#orig@#}", fieldRule.getFieldName(), fieldRule.getRule())); } page.putField( fieldRule.getFieldName(), page.getRequest().getExtra(fieldRule.getFieldName())); break; } } // 最后判断一下当前page有没有产生新的下载请求或任务请求,如果有,将page.getResultItems()中的解析结果通过request传递到下一个page中去 if (nextRequest != null) { transmitResultItem(page, nextRequest); page.setSkip(true); originalReq.setNextRequest(nextRequest); } }
/** * 当某个解析规则不会产生新的下载请求时(这种情况下当前page生命周期已经结束),当前page必须解析完该规则下的所有字段,存在子规则层层嵌套的情况 * * @param parentRuleId * @param page * @param nextRequest * @param templast * @throws PageProcessException */ private void ruleComplierLoop(int parentRuleId, Page page, Request nextRequest, Request templast) throws PageProcessException { List<SpiderFieldRule> childs = new ArrayList<SpiderFieldRule>(); for (SpiderFieldRule fieldRule : fieldRules) { if (fieldRule.getParentId() == parentRuleId) { childs.add(fieldRule); } } if (childs.size() == 0) { return; } else { for (SpiderFieldRule child : childs) { List<String> results; StringBuilder sb; switch (child.getType()) { case 0: results = page.getHtml().regex(child.getRule()).all(); if (results.size() == 0) { if (child.getAllowEmpty() == 1) { throw new PageProcessException( String.format( "{fieldname:@#%s@#,fieldrule:@#%s@#,parentId:@#%d@#,type:@#regex@#}", child.getFieldName(), child.getRule(), child.getParentId())); } } // 判断是否会产生新的下载请求 if (child.getAdditionDownload() == 1) { // page.setIncludeAddition(true); // 将fieldRule.getId()保存下来,新请求的页面将筛选parentid为该id的规则进行解析 for (String result : results) { if (!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); } Request additionReq = new Request(result); additionReq.setFieldRuleId(child.getId()); if (templast == null) { nextRequest = additionReq; templast = nextRequest; } else { templast.setNextRequest(additionReq); templast = additionReq; } if (child.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( child.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else if (child.getAdditionRequest() == 1) { for (String result : results) { if (!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); } Request additionReq = new Request(result); additionReq.setFieldRuleId(child.getId()); page.addTargetRequest(additionReq); if 
(child.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( child.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else { if (child.getNeedPersistence() == 0) { sb = new StringBuilder(); for (String result : results) { sb.append(result + ","); } page.putField( child.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } ruleComplierLoop(child.getId(), page, nextRequest, templast); } break; case 1: results = page.getHtml().xpath(child.getRule()).all(); if (results.size() == 0) { if (child.getAllowEmpty() == 1) { throw new PageProcessException( String.format( "{fieldname:@#%s@#,fieldrule:@#%s@#,parentId:@#%d@#,type:@#xpath@#}", child.getFieldName(), child.getRule(), child.getParentId())); } } // 判断是否会产生新的下载请求 if (child.getAdditionDownload() == 1) { // page.setIncludeAddition(true); // 将fieldRule.getId()保存下来,新请求的页面将筛选parentid为该id的规则进行解析 for (String result : results) { if (!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); } Request additionReq = new Request(result); additionReq.setFieldRuleId(child.getId()); if (templast == null) { nextRequest = additionReq; templast = nextRequest; } else { templast.setNextRequest(additionReq); templast = additionReq; } if (child.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( child.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else if (child.getAdditionRequest() == 1) { for (String result : results) { if (!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); } Request additionReq = new Request(result); additionReq.setFieldRuleId(child.getId()); page.addTargetRequest(additionReq); if (child.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( child.getFieldName(), 
StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else { if (child.getNeedPersistence() == 0) { sb = new StringBuilder(); for (String result : results) { sb.append(result + ","); } page.putField( child.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } ruleComplierLoop(child.getId(), page, nextRequest, templast); } break; case 2: results = page.getHtml().css(child.getRule()).all(); if (results.size() == 0) { if (child.getAllowEmpty() == 1) { throw new PageProcessException( String.format( "{fieldname:@#%s@#,fieldrule:@#%s@#,parentId:@#%d@#,type:@#css@#}", child.getFieldName(), child.getRule(), child.getParentId())); } } // 判断是否会产生新的下载请求 if (child.getAdditionDownload() == 1) { // page.setIncludeAddition(true); // 将fieldRule.getId()保存下来,新请求的页面将筛选parentid为该id的规则进行解析 for (String result : results) { if (!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); } Request additionReq = new Request(result); additionReq.setFieldRuleId(child.getId()); if (templast == null) { nextRequest = additionReq; templast = nextRequest; } else { templast.setNextRequest(additionReq); templast = additionReq; } if (child.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( child.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else if (child.getAdditionRequest() == 1) { for (String result : results) { if (!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); } Request additionReq = new Request(result); additionReq.setFieldRuleId(child.getId()); page.addTargetRequest(additionReq); if (child.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( child.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else { if (child.getNeedPersistence() == 0) { sb = new StringBuilder(); for (String result : results) { 
sb.append(result + ","); } page.putField( child.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } ruleComplierLoop(child.getId(), page, nextRequest, templast); } break; default: if (page.getRequest().getExtra(child.getFieldName()) == null) { throw new PageProcessException( String.format( "{fieldname:@#%s@#,fieldrule:@#%s@#,type:@#orig@#}", child.getFieldName(), child.getRule())); } page.putField(child.getFieldName(), page.getRequest().getExtra(child.getFieldName())); break; } } } }
@Override public Page download(Request request, Task task) { Site site = null; if (task != null) { site = task.getSite(); } Set<Integer> acceptStatCode; String charset = null; Map<String, String> headers = null; if (site != null) { acceptStatCode = site.getAcceptStatCode(); charset = site.getCharset(); headers = site.getHeaders(); } else { acceptStatCode = Sets.newHashSet(200); } logger.info("downloading page " + request.getUrl()); RequestBuilder requestBuilder = RequestBuilder.get().setUri(request.getUrl()); if (headers != null) { for (Map.Entry<String, String> headerEntry : headers.entrySet()) { requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue()); } } RequestConfig.Builder requestConfigBuilder = RequestConfig.custom() .setConnectionRequestTimeout(site.getTimeOut()) .setSocketTimeout(site.getTimeOut()) .setConnectTimeout(site.getTimeOut()) .setCookieSpec(CookieSpecs.BEST_MATCH); if (site != null && site.getHttpProxy() != null) { requestConfigBuilder.setProxy(site.getHttpProxy()); } requestBuilder.setConfig(requestConfigBuilder.build()); CloseableHttpResponse httpResponse = null; try { httpResponse = getHttpClient(site).execute(requestBuilder.build()); int statusCode = httpResponse.getStatusLine().getStatusCode(); if (acceptStatCode.contains(statusCode)) { // charset if (charset == null) { String value = httpResponse.getEntity().getContentType().getValue(); charset = UrlUtils.getCharset(value); } return handleResponse(request, charset, httpResponse, task); } else { logger.warn("code error " + statusCode + "\t" + request.getUrl()); return null; } } catch (IOException e) { logger.warn("download page " + request.getUrl() + " error", e); if (site.getCycleRetryTimes() > 0) { return addToCycleRetry(request, site); } return null; } finally { try { if (httpResponse != null) { // ensure the connection is released back to pool EntityUtils.consume(httpResponse.getEntity()); } } catch (IOException e) { logger.warn("close response fail", e); } } }
/**
 * Queues the request and appends its URL as a line to {@code fileUrlWriter}.
 *
 * @param request request to enqueue
 * @param task owning task (not used here)
 */
@Override
protected void pushWhenNoDuplicate(Request request, Task task) {
  queue.add(request);
  final String url = request.getUrl();
  fileUrlWriter.println(url);
}