Beispiel #1
0
 /**
  * Crawl entry point: iterates over consecutive product ids and downloads each product's review
  * pages.
  *
  * <p>For each of the {@code proNum} ids starting at {@code beginProId}, review-list pages 1, 2,
  * ... are requested through the current proxy. A response longer than 4700 characters is
  * treated as a page with content and appended to the product's buffer; the first shorter
  * response ends paging for that product. If at least one page was accepted, the buffer is
  * written to {@code <sourceDir>/<today>/<proId>.html}, where {@code <today>} is the string
  * returned by {@code DateFormater.getDateofToday()}.
  *
  * <p>A fetch is considered failed when {@code HTTPHandler.getHTML} yields the literal string
  * {@code "null"} (or a {@code null} reference). A failed request is repeated up to four more
  * times on the same proxy; after five failures the next proxy in {@code IPs} is selected
  * (wrapping around at {@code IPNum}) and the same page is requested again. Once every proxy has
  * failed for one page, the product is abandoned with whatever pages were already collected.
  *
  * @param sourceDir root output directory; a sub-directory named after today's date is created
  * @param proNum number of consecutive product ids to crawl
  * @param beginProId first product id to crawl
  * @throws ClientProtocolException propagated from the HTTP/file helpers this method calls
  * @throws URISyntaxException propagated from the HTTP/file helpers this method calls
  * @throws IOException propagated from the HTTP/file helpers this method calls
  */
 public static void crawlZOL(String sourceDir, int proNum, int beginProId)
     throws ClientProtocolException, URISyntaxException, IOException {
   // NOTE(review): 4700 presumably is the size of an empty review-list response — confirm.
   final int minValidLength = 4700;
   // Extra attempts on the same proxy after the first failed request (5 attempts in total).
   final int maxReconnects = 4;

   String sourcePath = sourceDir + "/" + DateFormater.getDateofToday();
   new File(sourcePath).mkdirs();

   int proId = beginProId;
   int iIP = 0;
   IPAddress ip = IPs.get(iIP);
   String hostName = ip.getHost();
   int port = ip.getPort();

   for (int j = 0; j < proNum; j++) {
     StringBuilder sb = new StringBuilder();
     boolean validPage = false;
     int page = 1;
     // Number of proxies that have already failed for the page currently being fetched.
     int failedProxies = 0;
     while (true) {
       String requestURL =
           "http://detail.zol.com.cn/xhr3_Review_GetListAndPage_order=1%5EisFilter=1%5EproId="
               + proId
               + "%5Epage="
               + page
               + ".html";
       String html = HTTPHandler.getHTML(requestURL, hostName, port);
       int iReconn = 0;
       // Null-safe failure check: a real null reference is treated like the "null" marker.
       while ((html == null || "null".equals(html)) && iReconn < maxReconnects) {
         html = HTTPHandler.getHTML(requestURL, hostName, port);
         iReconn++;
         System.out.println("****" + ip.toString() + " reconnected " + iReconn + " time(s)****");
       }
       if (html == null || "null".equals(html)) {
         System.out.println("****5 consecutive connections were failed, now using next IP****");
         if (iIP == IPNum - 1) {
           System.out.println(
               "****All valid proxy IPs have been tried, still can not get all the data. Now trying the valid proxy IP list again.****");
           iIP = 0;
         } else {
           iIP++;
         }
         // Actually switch to the selected proxy; without refreshing these the rotation
         // would only change the index and every request would keep using the old proxy.
         ip = IPs.get(iIP);
         hostName = ip.getHost();
         port = ip.getPort();
         System.out.println("****Turn to" + ip + ", start connecting****");

         failedProxies++;
         if (failedProxies < IPNum) {
           // Retry the same page through the newly selected proxy.
           continue;
         }
         // Every proxy failed for this page: give up on this product.
         break;
       }
       if (html.length() > minValidLength) {
         System.out.println(
             "ok!parsing id:" + proId + "\t page : " + page + "\t length:" + html.length());
         sb.append(html).append("\n");
         validPage = true;
         page++;
         failedProxies = 0;
       } else {
         // Short response: no further pages for this product.
         break;
       }
     }
     if (validPage) {
       FileOperation.writeString(sb.toString(), sourcePath + "/" + proId + ".html");
     }
     proId++;
   }
 }