@Override public void process() { List<UserData> alllist = new ArrayList<UserData>(); List<UserData> list = new ArrayList<UserData>(); String url = getRealUrl(siteinfo, gloaburl); String nexturl = url; HtmlInfo html = htmlInfo(CollectDataType.FANS.name()); int count = 1; try { while (nexturl != null && !nexturl.equals("")) { list.clear(); html.setOrignUrl(nexturl); try { http.getContent(html, user); // // html.setContent(common.util.StringUtil.getContent("filedown/FANS/sina/50b7702c4c3dc15a1cf1c56155b08d46.htm")); nexturl = ((WeiboMonitorXpathExtractor) ((XpathExtractor) xpath)) .templateRelation(list, html, count, id + "", nexturl); if (list.size() == 0) { Systemconfig.sysLog.log(url + "元数据页面解析为空!!"); break; } Systemconfig.sysLog.log(url + "元数据页面解析完成。"); Systemconfig.dbService.getNorepeatData(list, ""); alllist.addAll(list); url = nexturl; count++; if (nexturl != null) TimeUtil.rest(siteinfo.getDownInterval()); } catch (Exception e) { e.printStackTrace(); try { Systemconfig.dbService.saveLog(siteFlag, key, 3, url + "\r\n" + e.getMessage()); } catch (IOException e1) { e1.printStackTrace(); } break; } } Systemconfig.dbService.saveDatas(alllist); } catch (IOException e) { e.printStackTrace(); } finally { alllist.clear(); list.clear(); } }
@Override public void parseUrl(List<WeixinData> list, Node dom, Component component, String... args) { if (args[0] == null || args[0] == "" || args[1] == null || args[1] == "") return; String cookie = args[1]; // String referer = args[1]; List<String> results = StringUtil.regMatches(args[0], "<url>", "/url", true); for (int i = 0; i < results.size(); i++) { String tmpUrl = results.get(i); tmpUrl = "http://weixin.sogou.com" + tmpUrl.substring(tmpUrl.indexOf("CDATA[") + 6, tmpUrl.lastIndexOf("]]>")); String loc = null; try { HttpURLConnection conn = (HttpURLConnection) new URL(tmpUrl).openConnection(); conn.addRequestProperty( "User-Agent", "Mozilla/5.0 (Windows NT 6.1; rv:38.0) Gecko/20100101 Firefox/38.0"); conn.setRequestProperty("Cookie", cookie); // conn.setRequestProperty("Referer", referer); HttpURLConnection.setFollowRedirects(false); conn.setFollowRedirects(false); conn.connect(); loc = conn.getHeaderField("Location"); if (loc != null) Systemconfig.sysLog.log(conn.getResponseMessage()); Systemconfig.sysLog.log("real url: " + loc); int sleepTime = 30 + (int) (Math.random() * 20); Systemconfig.sysLog.log("sleep..." + sleepTime); TimeUtil.rest(sleepTime); } catch (MalformedURLException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } list.get(i).setUrl(loc == null ? "err." : loc); } }