/** * 爬取入口,按产品号遍历下载产品页面 * * @param sourceDir 源文件夹 * @param proNum 产品数量 * @param beginProId 开始产品号 * @throws ClientProtocolException * @throws URISyntaxException * @throws IOException */ public static void crawlZOL(String sourceDir, int proNum, int beginProId) throws ClientProtocolException, URISyntaxException, IOException { int proId = beginProId; int iIP = 0; IPAddress ip; String today = DateFormater.getDateofToday(); String sourcePath = sourceDir + "/" + today; new File(sourcePath).mkdirs(); String html; ip = IPs.get(iIP); String hostName = ip.getHost(); int port = ip.getPort(); for (int j = 0; j < proNum; j++) { StringBuilder sb = new StringBuilder(); boolean validPage = false; int page = 1; while (true) { String requestURL = "http://detail.zol.com.cn/xhr3_Review_GetListAndPage_order=1%5EisFilter=1%5EproId=" + proId + "%5Epage=" + page + ".html"; html = HTTPHandler.getHTML(requestURL, hostName, port); int iReconn = 0; while (html.equals("null")) { html = HTTPHandler.getHTML(requestURL, hostName, port); iReconn++; System.out.println("****" + ip.toString() + " reconnected " + iReconn + " time(s)****"); if (iReconn == 4) { // 4 break; } } if (html.equals("null")) { System.out.println("****5 consecutive connections were failed, now using next IP****"); if (iIP == IPNum - 1) { System.out.println( "****All valid proxy IPs have been tried, still can not get all the data. Now trying the valid proxy IP list again.****"); iIP = 0; System.out.println("****Turn to" + IPs.get(iIP) + ", start connecting****"); } else { iIP++; System.out.println("****Turn to" + IPs.get(iIP) + ", start connecting****"); } } if (html.length() > 4700) { System.out.println( "ok!parsing id:" + proId + "\t page : " + page + "\t length:" + html.length()); sb.append(html + "\n"); validPage = true; page++; } else { break; } } if (validPage) { FileOperation.writeString( sb.toString(), sourcePath + "/" + String.valueOf(proId) + ".html"); } proId++; } }