/* * 原则: * 当前page下要求解析出来的字段都要保证在当前page生命周期内解析完成。 * 哪些规则是当前page下的,由page.getRequest().getFiledRuleId()决定 */ @Override public void process(Page page) throws PageProcessException { Request originalReq = page.getRequest(); // 源rquest,非null Integer fieldRuleId = originalReq.getFieldRuleId(); if (fieldRuleId != null) { if (fieldRuleId == 20) System.out.println("test"); } Request nextRequest = originalReq.getNextRequest(); // 抽出下一步request,允许null Request templast = nextRequest; // 创建请求链时需要的临时节点 // 找到当前page下要求解析的字段 final List<SpiderFieldRule> dependenceFieldRules = new ArrayList<SpiderFieldRule>(); for (SpiderFieldRule fieldRule : fieldRules) { if (fieldRule.getParentId() == (fieldRuleId == null ? 0 : fieldRuleId)) { dependenceFieldRules.add(fieldRule); } } // 开始解析当前page下要求解析的字段 for (SpiderFieldRule fieldRule : dependenceFieldRules) { List<String> results; StringBuilder sb; switch (fieldRule.getType()) { case 0: results = page.getHtml().regex(fieldRule.getRule()).all(); if (results.size() == 0) { if (fieldRule.getAllowEmpty() == 1) { throw new PageProcessException( String.format( "{fieldname:@#%s@#,fieldrule:@#%s@#,parentId:@#%d@#,type:@#regex@#}", fieldRule.getFieldName(), fieldRule.getRule(), fieldRule.getParentId())); } } // 判断是否会产生新的下载请求,如果产生新的下载请求,则当前规则只解析顶级层,如果有子规则,要=到新的下载完成之后才能解析 if (fieldRule.getAdditionDownload() == 1) { // 将fieldRule.getId()保存下来,新请求的页面将筛选parentid为该id的规则进行解析 for (String result : results) { if (!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); } Request additionReq = new Request(result); additionReq.setFieldRuleId(fieldRule.getId()); if (templast == null) { nextRequest = additionReq; templast = nextRequest; } else { templast.setNextRequest(additionReq); templast = additionReq; } if (fieldRule.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( fieldRule.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else if (fieldRule.getAdditionRequest() == 1) { for (String result : results) { if (!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); } Request additionReq = new Request(result); additionReq.setFieldRuleId(fieldRule.getId()); transmitResultItem(page, additionReq); page.addTargetRequest(additionReq); if (fieldRule.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( fieldRule.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else { if (fieldRule.getNeedPersistence() == 0) { sb = new StringBuilder(); for (String result : results) { sb.append(result + ","); } page.putField( fieldRule.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } ruleComplierLoop(fieldRule.getId(), page, nextRequest, templast); } break; case 1: results = page.getHtml().xpath(fieldRule.getRule()).all(); if (results.size() == 0) { if (fieldRule.getAllowEmpty() == 1) { throw new PageProcessException( String.format( "{fieldname:@#%s@#,fieldrule:@#%s@#,parentId:@#%d@#,type:@#xpath@#}", fieldRule.getFieldName(), fieldRule.getRule(), fieldRule.getParentId())); } } // 判断是否会产生新的下载请求 if (fieldRule.getAdditionDownload() == 1) { // 将fieldRule.getId()保存下来,新请求的页面将筛选parentid为该id的规则进行解析 for (String result : results) { if (!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); } Request additionReq = new Request(result); additionReq.setFieldRuleId(fieldRule.getId()); if (templast == null) { nextRequest = additionReq; templast = nextRequest; } else { templast.setNextRequest(additionReq); templast = additionReq; } if (fieldRule.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( fieldRule.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else if (fieldRule.getAdditionRequest() == 1) { for (String result : results) { if (!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); } Request additionReq = new Request(result); additionReq.setFieldRuleId(fieldRule.getId()); transmitResultItem(page, additionReq); page.addTargetRequest(additionReq); if (fieldRule.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( fieldRule.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else { if (fieldRule.getNeedPersistence() == 0) { sb = new StringBuilder(); for (String result : results) { sb.append(result + ","); } page.putField( fieldRule.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } ruleComplierLoop(fieldRule.getId(), page, nextRequest, templast); } break; case 2: results = page.getHtml().css(fieldRule.getRule()).all(); if (results.size() == 0) { if (fieldRule.getAllowEmpty() == 1) { throw new PageProcessException( String.format( "{fieldname:@#%s@#,fieldrule:@#%s@#,parentId:@#%d@#,type:@#css@#}", fieldRule.getFieldName(), fieldRule.getRule(), fieldRule.getParentId())); } } // 判断是否会产生新的下载请求 if (fieldRule.getAdditionDownload() == 1) { // 将fieldRule.getId()保存下来,新请求的页面将筛选parentid为该id的规则进行解析 for (String result : results) { if (!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); } Request additionReq = new Request(result); additionReq.setFieldRuleId(fieldRule.getId()); if (templast == null) { nextRequest = additionReq; templast = nextRequest; } else { templast.setNextRequest(additionReq); templast = additionReq; } if (fieldRule.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( fieldRule.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else if (fieldRule.getAdditionRequest() == 1) { for (String result : results) { if (!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); } Request additionReq = new Request(result); additionReq.setFieldRuleId(fieldRule.getId()); transmitResultItem(page, additionReq); page.addTargetRequest(additionReq); if (fieldRule.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( fieldRule.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else { if (fieldRule.getNeedPersistence() == 0) { sb = new StringBuilder(); for (String result : results) { sb.append(result + ","); } page.putField( fieldRule.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } ruleComplierLoop(fieldRule.getId(), page, nextRequest, templast); } break; default: if (page.getRequest().getExtra(fieldRule.getFieldName()) == null) { throw new PageProcessException( String.format( "{fieldname:@#%s@#,fieldrule:@#%s@#,type:@#orig@#}", fieldRule.getFieldName(), fieldRule.getRule())); } page.putField( fieldRule.getFieldName(), page.getRequest().getExtra(fieldRule.getFieldName())); break; } } // 最后判断一下当前page有没有产生新的下载请求或任务请求,如果有,将page.getResultItems()中的解析结果通过request传递到下一个page中去 if (nextRequest != null) { transmitResultItem(page, nextRequest); page.setSkip(true); originalReq.setNextRequest(nextRequest); } }
/** * 当某个解析规则不会产生新的下载请求时(这种情况下当前page生命周期已经结束),当前page必须解析完该规则下的所有字段,存在子规则层层嵌套的情况 * * @param parentRuleId * @param page * @param nextRequest * @param templast * @throws PageProcessException */ private void ruleComplierLoop(int parentRuleId, Page page, Request nextRequest, Request templast) throws PageProcessException { List<SpiderFieldRule> childs = new ArrayList<SpiderFieldRule>(); for (SpiderFieldRule fieldRule : fieldRules) { if (fieldRule.getParentId() == parentRuleId) { childs.add(fieldRule); } } if (childs.size() == 0) { return; } else { for (SpiderFieldRule child : childs) { List<String> results; StringBuilder sb; switch (child.getType()) { case 0: results = page.getHtml().regex(child.getRule()).all(); if (results.size() == 0) { if (child.getAllowEmpty() == 1) { throw new PageProcessException( String.format( "{fieldname:@#%s@#,fieldrule:@#%s@#,parentId:@#%d@#,type:@#regex@#}", child.getFieldName(), child.getRule(), child.getParentId())); } } // 判断是否会产生新的下载请求 if (child.getAdditionDownload() == 1) { // page.setIncludeAddition(true); // 将fieldRule.getId()保存下来,新请求的页面将筛选parentid为该id的规则进行解析 for (String result : results) { if (!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); } Request additionReq = new Request(result); additionReq.setFieldRuleId(child.getId()); if (templast == null) { nextRequest = additionReq; templast = nextRequest; } else { templast.setNextRequest(additionReq); templast = additionReq; } if (child.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( child.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else if (child.getAdditionRequest() == 1) { for (String result : results) { if (!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); } Request additionReq = new Request(result); additionReq.setFieldRuleId(child.getId()); page.addTargetRequest(additionReq); if (child.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( child.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else { if (child.getNeedPersistence() == 0) { sb = new StringBuilder(); for (String result : results) { sb.append(result + ","); } page.putField( child.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } ruleComplierLoop(child.getId(), page, nextRequest, templast); } break; case 1: results = page.getHtml().xpath(child.getRule()).all(); if (results.size() == 0) { if (child.getAllowEmpty() == 1) { throw new PageProcessException( String.format( "{fieldname:@#%s@#,fieldrule:@#%s@#,parentId:@#%d@#,type:@#xpath@#}", child.getFieldName(), child.getRule(), child.getParentId())); } } // 判断是否会产生新的下载请求 if (child.getAdditionDownload() == 1) { // page.setIncludeAddition(true); // 将fieldRule.getId()保存下来,新请求的页面将筛选parentid为该id的规则进行解析 for (String result : results) { if (!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); } Request additionReq = new Request(result); additionReq.setFieldRuleId(child.getId()); if (templast == null) { nextRequest = additionReq; templast = nextRequest; } else { templast.setNextRequest(additionReq); templast = additionReq; } if (child.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( child.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else if (child.getAdditionRequest() == 1) { for (String result : results) { if (!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); } Request additionReq = new Request(result); additionReq.setFieldRuleId(child.getId()); page.addTargetRequest(additionReq); if (child.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( child.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else { if (child.getNeedPersistence() == 0) { sb = new StringBuilder(); for (String result : results) { sb.append(result + ","); } page.putField( child.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } ruleComplierLoop(child.getId(), page, nextRequest, templast); } break; case 2: results = page.getHtml().css(child.getRule()).all(); if (results.size() == 0) { if (child.getAllowEmpty() == 1) { throw new PageProcessException( String.format( "{fieldname:@#%s@#,fieldrule:@#%s@#,parentId:@#%d@#,type:@#css@#}", child.getFieldName(), child.getRule(), child.getParentId())); } } // 判断是否会产生新的下载请求 if (child.getAdditionDownload() == 1) { // page.setIncludeAddition(true); // 将fieldRule.getId()保存下来,新请求的页面将筛选parentid为该id的规则进行解析 for (String result : results) { if (!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); } Request additionReq = new Request(result); additionReq.setFieldRuleId(child.getId()); if (templast == null) { nextRequest = additionReq; templast = nextRequest; } else { templast.setNextRequest(additionReq); templast = additionReq; } if (child.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( child.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else if (child.getAdditionRequest() == 1) { for (String result : results) { if (!result.startsWith("http")) { result = "http://" + site.getDomain() + result.trim().replace("|", "%7C"); } Request additionReq = new Request(result); additionReq.setFieldRuleId(child.getId()); page.addTargetRequest(additionReq); if (child.getNeedPersistence() == 0) { sb = new StringBuilder(); sb.append(result + ","); page.putField( child.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } } } else { if (child.getNeedPersistence() == 0) { sb = new StringBuilder(); for (String result : results) { sb.append(result + ","); } page.putField( child.getFieldName(), StringUtils.substringBeforeLast(sb.toString().trim(), ",")); } ruleComplierLoop(child.getId(), page, nextRequest, templast); } break; default: if (page.getRequest().getExtra(child.getFieldName()) == null) { throw new PageProcessException( String.format( "{fieldname:@#%s@#,fieldrule:@#%s@#,type:@#orig@#}", child.getFieldName(), child.getRule())); } page.putField(child.getFieldName(), page.getRequest().getExtra(child.getFieldName())); break; } } } }