@Override public void process(Page page) { String title, companyName, salary; String keyStr, valueStr; // 属性 HashMap<String, String> propsMap = new HashMap<String, String>(); pageUrl = page.getRequest().getUrl(); Html pageHtml = page.getHtml(); Selectable pageRefLinks = page.getHtml().links(); // 1.页面是否已经存在过 synchronized (doneLinks) { if (doneLinks.containsKey(pageUrl)) { page.setSkip(true); return; } } String regEx = "http://jobs.zhaopin.com/(\\d)+.htm\\?([\\w=&]+)"; Pattern p = Pattern.compile(regEx); if (p.matcher(pageUrl).find()) { // 可以处理 System.out.println("找到一个:" + pageUrl); String pageTitle[] = pageHtml.getDocument().title().split("-"); title = pageTitle[0].replaceAll("招聘", ""); companyName = pageTitle[1]; List<String> keyList = pageHtml.xpath("//ul[@class='terminal-ul']/li").all(); String line = "", lines[]; for (int ii = 0; ii < keyList.size(); ii++) { if (keyList.get(ii) == null) { logger.warn("why keylist is null!"); continue; } line = StringUtil.html2text(keyList.get(ii)); lines = line.split(":"); if (lines.length != 2) continue; keyStr = lines[0]; valueStr = lines[1]; logger.debug(keyStr + ":" + valueStr); if (!(valueStr == null || keyStr == null)) { if (keyStr.trim().length() > 0 && valueStr.trim().length() > 0) propsMap.put(keyStr, valueStr); } } salary = propsMap.get("职位月薪"); if (salary == null) salary = ""; List<String> textList = pageHtml.xpath("//div[@class='tab-cont-box']/div[@class='tab-inner-cont']").all(); String jobDesc = textList.get(0); String companyDesc = textList.get(1); // save page.putField("jobTitle", title.trim()); page.putField("company", companyName.trim()); page.putField("salary", salary); page.putField("keyword", ""); page.putField("descr", jobDesc.trim()); page.putField("companyDesc", companyDesc); page.putField("props", propsMap); page.putField("url", pageUrl); page.putField("source", "zhaopin"); } else { page.setSkip(true); } // 分页、列表 page.addTargetRequests( 
pageRefLinks.regex("http://[\\w,\\/-_]+.zhaopin.com/[\\w,\\/.-?&_]+").all()); // page.addTargetRequests(pageRefLinks.regex(regEx).all()); // page.addTargetRequests(pageRefLinks.regex("http://sou.zhaopin.com/[\\w,\\/.-?&_]+").all()); synchronized (doneLinks) { doneLinks.put(pageUrl, doneNum++); SpiderRecord.addKeyNum("Zhaopin_all", doneNum); } }
/**
 * Converts an HTML fragment to text via {@code StringUtil.html2text}, removes every
 * {@code ':'} character from the result and trims surrounding whitespace.
 *
 * @param tag the HTML fragment to clean
 * @return the text of the fragment without colons and without leading/trailing whitespace
 */
private String clearHtml(String tag) {
    String plainText = StringUtil.html2text(tag);
    String withoutColons = plainText.replaceAll(":", "");
    return withoutColons.trim();
}