Example #1
0
 @Override
 public synchronized void push(Request request, Task task) {
   if (logger.isDebugEnabled()) {
     logger.debug("push to queue " + request.getUrl());
   }
   if (urls.add(request.getUrl())) {
     queue.add(request);
   }
 }
Example #2
0
 @Override
 public synchronized void push(Request request, Task task) {
   Jedis jedis = pool.getResource();
   // 使用SortedSet进行url去重
   if (jedis.zrank(SET_PREFIX + task.getUUID(), request.getUrl()) == null) {
     // 使用List保存队列
     jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl());
     jedis.zadd(SET_PREFIX + task.getUUID(), System.currentTimeMillis(), request.getUrl());
   }
   pool.returnResource(jedis);
 }
 /**
  * 将当前页面解析到的字段传递给新的request
  *
  * @param page
  * @param nextRequest
  */
 private void transmitResultItem(Page page, Request nextRequest) {
   // 将当前page解析下来的字段转交到nextRequest中
   Map<String, Object> fields = page.getResultItems().getAll();
   for (Entry<String, Object> entry : fields.entrySet()) {
     nextRequest.addInheritField(entry.getKey(), entry.getValue());
   }
 }
 protected Page handleResponse(
     Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
   String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset);
   Page page = new Page();
   page.setRawText(content);
   page.setUrl(new PlainText(request.getUrl()));
   page.setRequest(request);
   page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
   return page;
 }
Example #5
0
  @Override
  public Page download(Request request, Task task) {
    checkInit();
    WebDriver webDriver;
    try {
      webDriver = webDriverPool.get();
    } catch (InterruptedException e) {
      logger.warn("interrupted", e);
      return null;
    }
    logger.info("downloading page " + request.getUrl());
    webDriver.get(request.getUrl());
    try {
      Thread.sleep(sleepTime);
    } catch (InterruptedException e) {
      e.printStackTrace();
    }
    WebDriver.Options manage = webDriver.manage();
    Site site = task.getSite();
    if (site.getCookies() != null) {
      for (Map.Entry<String, String> cookieEntry : site.getCookies().entrySet()) {
        Cookie cookie = new Cookie(cookieEntry.getKey(), cookieEntry.getValue());
        manage.addCookie(cookie);
      }
    }

    /*
     * TODO You can add mouse event or other processes
     *
     * @author: [email protected]
     */

    WebElement webElement = webDriver.findElement(By.xpath("/html"));
    String content = webElement.getAttribute("outerHTML");
    Page page = new Page();
    page.setRawText(content);
    page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl())));
    page.setUrl(new PlainText(request.getUrl()));
    page.setRequest(request);
    webDriverPool.returnToPool(webDriver);
    return page;
  }
  /*
   * 原则:
   * 	当前page下要求解析出来的字段都要保证在当前page生命周期内解析完成。
   * 	哪些规则是当前page下的,由page.getRequest().getFiledRuleId()决定
   */
  @Override
  public void process(Page page) throws PageProcessException {

    Request originalReq = page.getRequest(); // 源rquest,非null
    Integer fieldRuleId = originalReq.getFieldRuleId();
    if (fieldRuleId != null) {
      if (fieldRuleId == 20) System.out.println("test");
    }

    Request nextRequest = originalReq.getNextRequest(); // 抽出下一步request,允许null
    Request templast = nextRequest; // 创建请求链时需要的临时节点

    // 找到当前page下要求解析的字段
    final List<SpiderFieldRule> dependenceFieldRules = new ArrayList<SpiderFieldRule>();

    for (SpiderFieldRule fieldRule : fieldRules) {
      if (fieldRule.getParentId() == (fieldRuleId == null ? 0 : fieldRuleId)) {
        dependenceFieldRules.add(fieldRule);
      }
    }
    // 开始解析当前page下要求解析的字段
    for (SpiderFieldRule fieldRule : dependenceFieldRules) {
      List<String> results;
      StringBuilder sb;
      switch (fieldRule.getType()) {
        case 0:
          results = page.getHtml().regex(fieldRule.getRule()).all();
          if (results.size() == 0) {
            if (fieldRule.getAllowEmpty() == 1) {
              throw new PageProcessException(
                  String.format(
                      "{fieldname:@#%s@#,fieldrule:@#%s@#,parentId:@#%d@#,type:@#regex@#}",
                      fieldRule.getFieldName(), fieldRule.getRule(), fieldRule.getParentId()));
            }
          }
          // 判断是否会产生新的下载请求,如果产生新的下载请求,则当前规则只解析顶级层,如果有子规则,要=到新的下载完成之后才能解析
          if (fieldRule.getAdditionDownload() == 1) {
            // 将fieldRule.getId()保存下来,新请求的页面将筛选parentid为该id的规则进行解析
            for (String result : results) {
              if (!result.startsWith("http")) {
                result = "http://" + site.getDomain() + result.trim().replace("|", "%7C");
              }
              Request additionReq = new Request(result);
              additionReq.setFieldRuleId(fieldRule.getId());
              if (templast == null) {
                nextRequest = additionReq;
                templast = nextRequest;
              } else {
                templast.setNextRequest(additionReq);
                templast = additionReq;
              }
              if (fieldRule.getNeedPersistence() == 0) {
                sb = new StringBuilder();
                sb.append(result + ",");
                page.putField(
                    fieldRule.getFieldName(),
                    StringUtils.substringBeforeLast(sb.toString().trim(), ","));
              }
            }
          } else if (fieldRule.getAdditionRequest() == 1) {
            for (String result : results) {
              if (!result.startsWith("http")) {
                result = "http://" + site.getDomain() + result.trim().replace("|", "%7C");
              }
              Request additionReq = new Request(result);
              additionReq.setFieldRuleId(fieldRule.getId());
              transmitResultItem(page, additionReq);
              page.addTargetRequest(additionReq);
              if (fieldRule.getNeedPersistence() == 0) {
                sb = new StringBuilder();
                sb.append(result + ",");
                page.putField(
                    fieldRule.getFieldName(),
                    StringUtils.substringBeforeLast(sb.toString().trim(), ","));
              }
            }
          } else {
            if (fieldRule.getNeedPersistence() == 0) {
              sb = new StringBuilder();
              for (String result : results) {
                sb.append(result + ",");
              }
              page.putField(
                  fieldRule.getFieldName(),
                  StringUtils.substringBeforeLast(sb.toString().trim(), ","));
            }
            ruleComplierLoop(fieldRule.getId(), page, nextRequest, templast);
          }
          break;
        case 1:
          results = page.getHtml().xpath(fieldRule.getRule()).all();
          if (results.size() == 0) {
            if (fieldRule.getAllowEmpty() == 1) {
              throw new PageProcessException(
                  String.format(
                      "{fieldname:@#%s@#,fieldrule:@#%s@#,parentId:@#%d@#,type:@#xpath@#}",
                      fieldRule.getFieldName(), fieldRule.getRule(), fieldRule.getParentId()));
            }
          }
          // 判断是否会产生新的下载请求
          if (fieldRule.getAdditionDownload() == 1) {
            // 将fieldRule.getId()保存下来,新请求的页面将筛选parentid为该id的规则进行解析
            for (String result : results) {
              if (!result.startsWith("http")) {
                result = "http://" + site.getDomain() + result.trim().replace("|", "%7C");
              }
              Request additionReq = new Request(result);
              additionReq.setFieldRuleId(fieldRule.getId());
              if (templast == null) {
                nextRequest = additionReq;
                templast = nextRequest;
              } else {
                templast.setNextRequest(additionReq);
                templast = additionReq;
              }
              if (fieldRule.getNeedPersistence() == 0) {
                sb = new StringBuilder();
                sb.append(result + ",");
                page.putField(
                    fieldRule.getFieldName(),
                    StringUtils.substringBeforeLast(sb.toString().trim(), ","));
              }
            }
          } else if (fieldRule.getAdditionRequest() == 1) {
            for (String result : results) {
              if (!result.startsWith("http")) {
                result = "http://" + site.getDomain() + result.trim().replace("|", "%7C");
              }
              Request additionReq = new Request(result);
              additionReq.setFieldRuleId(fieldRule.getId());
              transmitResultItem(page, additionReq);
              page.addTargetRequest(additionReq);
              if (fieldRule.getNeedPersistence() == 0) {
                sb = new StringBuilder();
                sb.append(result + ",");
                page.putField(
                    fieldRule.getFieldName(),
                    StringUtils.substringBeforeLast(sb.toString().trim(), ","));
              }
            }
          } else {
            if (fieldRule.getNeedPersistence() == 0) {
              sb = new StringBuilder();
              for (String result : results) {
                sb.append(result + ",");
              }
              page.putField(
                  fieldRule.getFieldName(),
                  StringUtils.substringBeforeLast(sb.toString().trim(), ","));
            }
            ruleComplierLoop(fieldRule.getId(), page, nextRequest, templast);
          }
          break;
        case 2:
          results = page.getHtml().css(fieldRule.getRule()).all();
          if (results.size() == 0) {
            if (fieldRule.getAllowEmpty() == 1) {
              throw new PageProcessException(
                  String.format(
                      "{fieldname:@#%s@#,fieldrule:@#%s@#,parentId:@#%d@#,type:@#css@#}",
                      fieldRule.getFieldName(), fieldRule.getRule(), fieldRule.getParentId()));
            }
          }
          // 判断是否会产生新的下载请求
          if (fieldRule.getAdditionDownload() == 1) {
            // 将fieldRule.getId()保存下来,新请求的页面将筛选parentid为该id的规则进行解析
            for (String result : results) {
              if (!result.startsWith("http")) {
                result = "http://" + site.getDomain() + result.trim().replace("|", "%7C");
              }
              Request additionReq = new Request(result);
              additionReq.setFieldRuleId(fieldRule.getId());
              if (templast == null) {
                nextRequest = additionReq;
                templast = nextRequest;
              } else {
                templast.setNextRequest(additionReq);
                templast = additionReq;
              }
              if (fieldRule.getNeedPersistence() == 0) {
                sb = new StringBuilder();
                sb.append(result + ",");
                page.putField(
                    fieldRule.getFieldName(),
                    StringUtils.substringBeforeLast(sb.toString().trim(), ","));
              }
            }
          } else if (fieldRule.getAdditionRequest() == 1) {
            for (String result : results) {
              if (!result.startsWith("http")) {
                result = "http://" + site.getDomain() + result.trim().replace("|", "%7C");
              }
              Request additionReq = new Request(result);
              additionReq.setFieldRuleId(fieldRule.getId());
              transmitResultItem(page, additionReq);
              page.addTargetRequest(additionReq);
              if (fieldRule.getNeedPersistence() == 0) {
                sb = new StringBuilder();
                sb.append(result + ",");
                page.putField(
                    fieldRule.getFieldName(),
                    StringUtils.substringBeforeLast(sb.toString().trim(), ","));
              }
            }
          } else {
            if (fieldRule.getNeedPersistence() == 0) {
              sb = new StringBuilder();
              for (String result : results) {
                sb.append(result + ",");
              }
              page.putField(
                  fieldRule.getFieldName(),
                  StringUtils.substringBeforeLast(sb.toString().trim(), ","));
            }
            ruleComplierLoop(fieldRule.getId(), page, nextRequest, templast);
          }
          break;
        default:
          if (page.getRequest().getExtra(fieldRule.getFieldName()) == null) {
            throw new PageProcessException(
                String.format(
                    "{fieldname:@#%s@#,fieldrule:@#%s@#,type:@#orig@#}",
                    fieldRule.getFieldName(), fieldRule.getRule()));
          }
          page.putField(
              fieldRule.getFieldName(), page.getRequest().getExtra(fieldRule.getFieldName()));
          break;
      }
    }
    // 最后判断一下当前page有没有产生新的下载请求或任务请求,如果有,将page.getResultItems()中的解析结果通过request传递到下一个page中去
    if (nextRequest != null) {
      transmitResultItem(page, nextRequest);
      page.setSkip(true);
      originalReq.setNextRequest(nextRequest);
    }
  }
  /**
   * 当某个解析规则不会产生新的下载请求时(这种情况下当前page生命周期已经结束),当前page必须解析完该规则下的所有字段,存在子规则层层嵌套的情况
   *
   * @param parentRuleId
   * @param page
   * @param nextRequest
   * @param templast
   * @throws PageProcessException
   */
  private void ruleComplierLoop(int parentRuleId, Page page, Request nextRequest, Request templast)
      throws PageProcessException {

    List<SpiderFieldRule> childs = new ArrayList<SpiderFieldRule>();
    for (SpiderFieldRule fieldRule : fieldRules) {
      if (fieldRule.getParentId() == parentRuleId) {
        childs.add(fieldRule);
      }
    }
    if (childs.size() == 0) {
      return;
    } else {
      for (SpiderFieldRule child : childs) {
        List<String> results;
        StringBuilder sb;
        switch (child.getType()) {
          case 0:
            results = page.getHtml().regex(child.getRule()).all();
            if (results.size() == 0) {
              if (child.getAllowEmpty() == 1) {
                throw new PageProcessException(
                    String.format(
                        "{fieldname:@#%s@#,fieldrule:@#%s@#,parentId:@#%d@#,type:@#regex@#}",
                        child.getFieldName(), child.getRule(), child.getParentId()));
              }
            }
            // 判断是否会产生新的下载请求
            if (child.getAdditionDownload() == 1) {
              // page.setIncludeAddition(true);
              // 将fieldRule.getId()保存下来,新请求的页面将筛选parentid为该id的规则进行解析
              for (String result : results) {
                if (!result.startsWith("http")) {
                  result = "http://" + site.getDomain() + result.trim().replace("|", "%7C");
                }
                Request additionReq = new Request(result);
                additionReq.setFieldRuleId(child.getId());
                if (templast == null) {
                  nextRequest = additionReq;
                  templast = nextRequest;
                } else {
                  templast.setNextRequest(additionReq);
                  templast = additionReq;
                }
                if (child.getNeedPersistence() == 0) {
                  sb = new StringBuilder();
                  sb.append(result + ",");
                  page.putField(
                      child.getFieldName(),
                      StringUtils.substringBeforeLast(sb.toString().trim(), ","));
                }
              }
            } else if (child.getAdditionRequest() == 1) {
              for (String result : results) {
                if (!result.startsWith("http")) {
                  result = "http://" + site.getDomain() + result.trim().replace("|", "%7C");
                }
                Request additionReq = new Request(result);
                additionReq.setFieldRuleId(child.getId());
                page.addTargetRequest(additionReq);
                if (child.getNeedPersistence() == 0) {
                  sb = new StringBuilder();
                  sb.append(result + ",");
                  page.putField(
                      child.getFieldName(),
                      StringUtils.substringBeforeLast(sb.toString().trim(), ","));
                }
              }
            } else {
              if (child.getNeedPersistence() == 0) {
                sb = new StringBuilder();
                for (String result : results) {
                  sb.append(result + ",");
                }
                page.putField(
                    child.getFieldName(),
                    StringUtils.substringBeforeLast(sb.toString().trim(), ","));
              }
              ruleComplierLoop(child.getId(), page, nextRequest, templast);
            }
            break;
          case 1:
            results = page.getHtml().xpath(child.getRule()).all();
            if (results.size() == 0) {
              if (child.getAllowEmpty() == 1) {
                throw new PageProcessException(
                    String.format(
                        "{fieldname:@#%s@#,fieldrule:@#%s@#,parentId:@#%d@#,type:@#xpath@#}",
                        child.getFieldName(), child.getRule(), child.getParentId()));
              }
            }
            // 判断是否会产生新的下载请求
            if (child.getAdditionDownload() == 1) {
              // page.setIncludeAddition(true);
              // 将fieldRule.getId()保存下来,新请求的页面将筛选parentid为该id的规则进行解析
              for (String result : results) {
                if (!result.startsWith("http")) {
                  result = "http://" + site.getDomain() + result.trim().replace("|", "%7C");
                }
                Request additionReq = new Request(result);
                additionReq.setFieldRuleId(child.getId());
                if (templast == null) {
                  nextRequest = additionReq;
                  templast = nextRequest;
                } else {
                  templast.setNextRequest(additionReq);
                  templast = additionReq;
                }
                if (child.getNeedPersistence() == 0) {
                  sb = new StringBuilder();
                  sb.append(result + ",");
                  page.putField(
                      child.getFieldName(),
                      StringUtils.substringBeforeLast(sb.toString().trim(), ","));
                }
              }
            } else if (child.getAdditionRequest() == 1) {
              for (String result : results) {
                if (!result.startsWith("http")) {
                  result = "http://" + site.getDomain() + result.trim().replace("|", "%7C");
                }
                Request additionReq = new Request(result);
                additionReq.setFieldRuleId(child.getId());
                page.addTargetRequest(additionReq);
                if (child.getNeedPersistence() == 0) {
                  sb = new StringBuilder();
                  sb.append(result + ",");
                  page.putField(
                      child.getFieldName(),
                      StringUtils.substringBeforeLast(sb.toString().trim(), ","));
                }
              }
            } else {
              if (child.getNeedPersistence() == 0) {
                sb = new StringBuilder();
                for (String result : results) {
                  sb.append(result + ",");
                }
                page.putField(
                    child.getFieldName(),
                    StringUtils.substringBeforeLast(sb.toString().trim(), ","));
              }
              ruleComplierLoop(child.getId(), page, nextRequest, templast);
            }
            break;
          case 2:
            results = page.getHtml().css(child.getRule()).all();
            if (results.size() == 0) {
              if (child.getAllowEmpty() == 1) {
                throw new PageProcessException(
                    String.format(
                        "{fieldname:@#%s@#,fieldrule:@#%s@#,parentId:@#%d@#,type:@#css@#}",
                        child.getFieldName(), child.getRule(), child.getParentId()));
              }
            }
            // 判断是否会产生新的下载请求
            if (child.getAdditionDownload() == 1) {
              // page.setIncludeAddition(true);
              // 将fieldRule.getId()保存下来,新请求的页面将筛选parentid为该id的规则进行解析
              for (String result : results) {
                if (!result.startsWith("http")) {
                  result = "http://" + site.getDomain() + result.trim().replace("|", "%7C");
                }
                Request additionReq = new Request(result);
                additionReq.setFieldRuleId(child.getId());
                if (templast == null) {
                  nextRequest = additionReq;
                  templast = nextRequest;
                } else {
                  templast.setNextRequest(additionReq);
                  templast = additionReq;
                }
                if (child.getNeedPersistence() == 0) {
                  sb = new StringBuilder();
                  sb.append(result + ",");
                  page.putField(
                      child.getFieldName(),
                      StringUtils.substringBeforeLast(sb.toString().trim(), ","));
                }
              }
            } else if (child.getAdditionRequest() == 1) {
              for (String result : results) {
                if (!result.startsWith("http")) {
                  result = "http://" + site.getDomain() + result.trim().replace("|", "%7C");
                }
                Request additionReq = new Request(result);
                additionReq.setFieldRuleId(child.getId());
                page.addTargetRequest(additionReq);
                if (child.getNeedPersistence() == 0) {
                  sb = new StringBuilder();
                  sb.append(result + ",");
                  page.putField(
                      child.getFieldName(),
                      StringUtils.substringBeforeLast(sb.toString().trim(), ","));
                }
              }
            } else {
              if (child.getNeedPersistence() == 0) {
                sb = new StringBuilder();
                for (String result : results) {
                  sb.append(result + ",");
                }
                page.putField(
                    child.getFieldName(),
                    StringUtils.substringBeforeLast(sb.toString().trim(), ","));
              }
              ruleComplierLoop(child.getId(), page, nextRequest, templast);
            }
            break;
          default:
            if (page.getRequest().getExtra(child.getFieldName()) == null) {
              throw new PageProcessException(
                  String.format(
                      "{fieldname:@#%s@#,fieldrule:@#%s@#,type:@#orig@#}",
                      child.getFieldName(), child.getRule()));
            }
            page.putField(child.getFieldName(), page.getRequest().getExtra(child.getFieldName()));
            break;
        }
      }
    }
  }
 @Override
 public Page download(Request request, Task task) {
   Site site = null;
   if (task != null) {
     site = task.getSite();
   }
   Set<Integer> acceptStatCode;
   String charset = null;
   Map<String, String> headers = null;
   if (site != null) {
     acceptStatCode = site.getAcceptStatCode();
     charset = site.getCharset();
     headers = site.getHeaders();
   } else {
     acceptStatCode = Sets.newHashSet(200);
   }
   logger.info("downloading page " + request.getUrl());
   RequestBuilder requestBuilder = RequestBuilder.get().setUri(request.getUrl());
   if (headers != null) {
     for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
       requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
     }
   }
   RequestConfig.Builder requestConfigBuilder =
       RequestConfig.custom()
           .setConnectionRequestTimeout(site.getTimeOut())
           .setSocketTimeout(site.getTimeOut())
           .setConnectTimeout(site.getTimeOut())
           .setCookieSpec(CookieSpecs.BEST_MATCH);
   if (site != null && site.getHttpProxy() != null) {
     requestConfigBuilder.setProxy(site.getHttpProxy());
   }
   requestBuilder.setConfig(requestConfigBuilder.build());
   CloseableHttpResponse httpResponse = null;
   try {
     httpResponse = getHttpClient(site).execute(requestBuilder.build());
     int statusCode = httpResponse.getStatusLine().getStatusCode();
     if (acceptStatCode.contains(statusCode)) {
       // charset
       if (charset == null) {
         String value = httpResponse.getEntity().getContentType().getValue();
         charset = UrlUtils.getCharset(value);
       }
       return handleResponse(request, charset, httpResponse, task);
     } else {
       logger.warn("code error " + statusCode + "\t" + request.getUrl());
       return null;
     }
   } catch (IOException e) {
     logger.warn("download page " + request.getUrl() + " error", e);
     if (site.getCycleRetryTimes() > 0) {
       return addToCycleRetry(request, site);
     }
     return null;
   } finally {
     try {
       if (httpResponse != null) {
         // ensure the connection is released back to pool
         EntityUtils.consume(httpResponse.getEntity());
       }
     } catch (IOException e) {
       logger.warn("close response fail", e);
     }
   }
 }
 @Override
 protected void pushWhenNoDuplicate(Request request, Task task) {
   queue.add(request);
   fileUrlWriter.println(request.getUrl());
 }