public BaiduSpider() {
    this.seed = "baidu";
    this.host = Config.getInstance().getSpiderSeedHost(seed);
    this.hrUrl = Config.getInstance().getSpiderSeedUrls(seed)[0];
    dateFormat = new SimpleDateFormat("yyyy-MM-dd", new Locale("en"));
    // Matches the description block on a job detail page:
    // "工作职责" = job responsibilities, "职位要求" = job requirements.
    // Non-greedy [\s\S]*? spans arbitrary content between the headings.
    contentPattern = Pattern.compile("<h4>工作职责:</h4><div[\\s\\S]*?</div><h4>职位要求:</h4><div[\\s\\S]*?</div>");
}
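// The methods in this class assume roughly the following imports and fields.
// This is a minimal sketch reconstructed from the code itself; the exact
// declarations may differ in the real source, and USERAGENT, crawlAllPages and
// log are assumed to be inherited from a base spider class:
//
//     import java.io.BufferedReader;
//     import java.io.InputStreamReader;
//     import java.net.HttpURLConnection;
//     import java.net.URL;
//     import java.net.URLEncoder;
//     import java.text.SimpleDateFormat;
//     import java.util.*;
//     import java.util.regex.Matcher;
//     import java.util.regex.Pattern;
//     import com.mongodb.BasicDBList;
//     import com.mongodb.BasicDBObject;
//     import com.mongodb.util.JSON;
//
//     private String seed;                 // config key, "baidu"
//     private String host;                 // seed host from config
//     private String hrUrl;                // job-list endpoint POSTed to by crawl(int)
//     private SimpleDateFormat dateFormat; // parses "jobTime" values, yyyy-MM-dd
//     private Pattern contentPattern;      // extracts the description block from detail pages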
private Post parseJob(BasicDBObject job) throws Exception {
    Post post = new Post().setTitle(job.getString("jobName"));
    post.setCompany("百度")                             // "Baidu"
        .setDepartment(job.getString("jobDepartMent"))  // field names ("jobDepartMent",
        .setAddress(job.getString("jobAera"));          // "jobAera") are spelled as in the upstream JSON
    post.setType(job.getString("jobKind"))
        .setSeed(seed)
        .setHost(host)
        .setLink(job.getString("jobUrl"));
    post.setAuthor(seed)
        .setPubtime(dateFormat.parse(job.getString("jobTime")))
        .setCreateTime(new Date());
    // Fetch the detail page and extract the job description block.
    if (post.getLink() != null && !post.getLink().isEmpty()) {
        HttpURLConnection conn = (HttpURLConnection) new URL(post.getLink()).openConnection();
        conn.setRequestProperty("Host", "hr.baidu.com");
        conn.setRequestProperty("Referer", "http://hr.baidu.com/static/jobList.html");
        conn.setRequestProperty("User-Agent", USERAGENT);
        conn.setRequestProperty("Cookie", Config.getInstance().getSpiderCookie(seed));
        conn.connect();
        // Read the page as UTF-8; the platform default charset could garble
        // the Chinese content the regex needs to match.
        StringBuilder buf = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), "UTF-8"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                buf.append(line);
            }
        }
        Matcher matcher = contentPattern.matcher(buf.toString());
        if (matcher.find()) {
            post.setContent(matcher.group());
        }
    }
    return post;
}
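// For reference, the detail pages are expected to contain markup of the shape
// matched by contentPattern. This fragment is reconstructed from the regex
// itself, not captured from a live page:
//
//     <h4>工作职责:</h4><div> ... responsibilities ... </div>
//     <h4>职位要求:</h4><div> ... requirements ... </div>
//
// Since readLine() strips newlines, the whole page is concatenated into one
// string before matching, so the pattern never has to cross a line break.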
private String crawl(int curPage) throws Exception {
    HttpURLConnection conn = (HttpURLConnection) new URL(hrUrl).openConnection();
    conn.setRequestProperty("Host", "hr.baidu.com");
    conn.setRequestProperty("Origin", "http://hr.baidu.com");
    conn.setRequestProperty("Referer", "http://hr.baidu.com/static/jobList.html");
    conn.setRequestProperty("User-Agent", USERAGENT);
    conn.setRequestProperty("X-Requested-With", "XMLHttpRequest");
    conn.setRequestProperty("Cookie", Config.getInstance().getSpiderCookie(seed));
    conn.setRequestMethod("POST");
    conn.setDoOutput(true);
    // Form body: place=全部 ("all" locations), type=全部 ("all" job types).
    // URL-encode the values and send the body as UTF-8 so the Chinese
    // characters survive the round trip; the original sent them unencoded in
    // the platform default charset.
    String body = "place=" + URLEncoder.encode("全部", "UTF-8")
            + "&type=" + URLEncoder.encode("全部", "UTF-8")
            + "&currentPage=" + curPage;
    conn.getOutputStream().write(body.getBytes("UTF-8"));
    conn.connect();
    StringBuilder buf = new StringBuilder();
    try (BufferedReader reader = new BufferedReader(
            new InputStreamReader(conn.getInputStream(), "UTF-8"))) {
        String line;
        while ((line = reader.readLine()) != null) {
            buf.append(line);
        }
    }
    return buf.toString();
}
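// For reference, crawl(int) is expected to return a JSON payload shaped like
// the fragment below. The field names are taken directly from the parsing
// code in crawl() that follows; the values are illustrative only:
//
//     {
//       "jobList": {
//         "totalRows": "137",
//         "jobMes": [
//           { "jobName": "...", "jobDepartMent": "...", "jobAera": "...",
//             "jobKind": "...", "jobUrl": "http://...", "jobTime": "2014-01-01" }
//         ]
//       }
//     }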
@Override
public List<Post> crawl() throws Exception {
    log.info("crawl " + seed + " starts...");
    List<Post> posts = new ArrayList<Post>();
    int page = 1;
    boolean hasNextPage = true;
    while (hasNextPage) {
        log.info("current page " + page);
        String buf = this.crawl(page);
        // Parse the JSON payload and collect every job on this page.
        BasicDBObject result = (BasicDBObject) JSON.parse(buf);
        BasicDBObject jobList = (BasicDBObject) result.get("jobList");
        BasicDBList jobs = (BasicDBList) jobList.get("jobMes");
        for (int i = 0; i < jobs.size(); i++) {
            Post post = this.parseJob((BasicDBObject) jobs.get(i));
            posts.add(post);
        }
        // Decide whether to fetch the next page. The server reports the total
        // row count; the ceil division assumes 10 jobs per page.
        if (!crawlAllPages) {
            break;
        }
        String rows = jobList.getString("totalRows");
        if (rows != null && !rows.isEmpty()) {
            int total = (int) Math.ceil(Double.parseDouble(rows) / 10);
            hasNextPage = page < total;
        } else {
            hasNextPage = false;
        }
        if (hasNextPage) {
            page++;
            // Politeness delay between page requests, taken from config.
            Thread.sleep(Config.getInstance().getSpiderPolite());
        }
    }
    log.info("crawl " + seed + " done");
    return posts;
}
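// Minimal usage sketch (hypothetical driver code; persisting the returned
// posts is outside the scope of this class):
//
//     BaiduSpider spider = new BaiduSpider();
//     List<Post> posts = spider.crawl();
//     System.out.println("fetched " + posts.size() + " posts");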