public void process() { String url = getRealUrl(data); if (url == null) return; HtmlInfo html = htmlInfo("DATA"); try { if (url != null && !url.equals("")) { html.setOrignUrl(url); html.setAgent(false); http.getContent(html); // html.setContent(); if (html.getContent() == null) { return; } // 解析数据 xpath.templateContentPage(data, html); Systemconfig.sysLog.log(data.getTitle() + "解析完成。。。"); Systemconfig.dbService.saveData(data); synchronized (key) { key.savedCountIncrease(); } Systemconfig.sysLog.log(data.getTitle() + "保存完成。。。"); } } catch (Exception e) { Systemconfig.sysLog.log("采集出现异常" + url, e); } finally { if (count != null) count.countDown(); } }
@Override public String templateListPage(List<WeixinData> list, HtmlInfo html, int page, String... keyword) throws SAXException, IOException { list.clear(); /** keyword 0: search_keyword 1: search_url(list) 2: ... 3: cookies */ String cookie = keyword[3]; Siteinfo siteinfo = Systemconfig.allSiteinfos.get(html.getSite()); Node domtree = getRealDOM(html); if (domtree == null) { Systemconfig.sysLog.log("DOM解析为NULL!!"); return null; } CommonComponent comp = getRealComp( siteinfo, html.getType().substring(0, html.getType().indexOf(File.separator))); // 得到元数据的配置组件 processList( list, domtree, comp.getComponents(), args(html.getContent(), cookie, String.valueOf(siteinfo.getSiteFlag()), keyword)); if (list.size() == 0) return null; attrSet(list, siteinfo.getSiteFlag(), keyword[0], Integer.parseInt(keyword[2])); return parseNext( domtree, comp.getComponents().get("next"), new String[] {keyword[1], page + ""}); }
@Override public void process() { List<UserData> alllist = new ArrayList<UserData>(); List<UserData> list = new ArrayList<UserData>(); String url = getRealUrl(siteinfo, gloaburl); String nexturl = url; HtmlInfo html = htmlInfo(CollectDataType.FANS.name()); int count = 1; try { while (nexturl != null && !nexturl.equals("")) { list.clear(); html.setOrignUrl(nexturl); try { http.getContent(html, user); // // html.setContent(common.util.StringUtil.getContent("filedown/FANS/sina/50b7702c4c3dc15a1cf1c56155b08d46.htm")); nexturl = ((WeiboMonitorXpathExtractor) ((XpathExtractor) xpath)) .templateRelation(list, html, count, id + "", nexturl); if (list.size() == 0) { Systemconfig.sysLog.log(url + "元数据页面解析为空!!"); break; } Systemconfig.sysLog.log(url + "元数据页面解析完成。"); Systemconfig.dbService.getNorepeatData(list, ""); alllist.addAll(list); url = nexturl; count++; if (nexturl != null) TimeUtil.rest(siteinfo.getDownInterval()); } catch (Exception e) { e.printStackTrace(); try { Systemconfig.dbService.saveLog(siteFlag, key, 3, url + "\r\n" + e.getMessage()); } catch (IOException e1) { e1.printStackTrace(); } break; } } Systemconfig.dbService.saveDatas(alllist); } catch (IOException e) { e.printStackTrace(); } finally { alllist.clear(); list.clear(); } }
public void parseNumber(WeixinData data, Node dom, Component component, String... args) { // http://mp.weixin.qq.com/s?__biz=MjM5ODE1NTMxMQ==&mid=201653867&idx=1&sn=6f3445a3640eb09ce7cfa5a49509f165&3rd=MzA3MDU4NTYzMw==&scene=6#rd String biz = ""; String mid = ""; String uin = ""; String key = ""; String fromFile = StringUtil.getContent("config/WeixinKey/WeixinKey.txt"); try { biz = StringUtil.regMatcher(data.getUrl(), "__biz=", "&"); mid = StringUtil.regMatcher(data.getUrl(), "mid=", "&"); for (String string : fromFile.split("&")) { if (string.contains("uin")) uin = string.split("=")[1].trim(); if (string.contains("key")) key = string.split("=")[1].trim(); } } catch (Exception e) { e.printStackTrace(); } String url = "http://mp.weixin.qq.com" + "/mp/getappmsgext?" + "__biz=" + biz + "&mid=" + mid + "&uin=" + uin + "&key=" + key // + // "&pass_ticket=b3hV91xTLYZxRGKemRNz%2FAi4VKElPnwHYUNtoV8w4dE%3D" + ""; HtmlInfo html = new HtmlInfo(); String charSet = "UTF-8"; html.setType("DATA"); html.setEncode(charSet); html.setOrignUrl(url); html.setCookie("Set-Cookie: wxuin=20156425; Path=/; Expires=Fri, 02-Jan-1970 00:00:00 GMT"); html.setUa( "Mozilla/5.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/600.1.3 (KHTML, like Gecko) Version/8.0 Mobile/12A4345d Safari/600.1.4"); SimpleHttpProcess shp = new SimpleHttpProcess(); shp.getContent(html); String content = html.getContent(); int retry = 0; while (!content.contains("read_num")) { if (retry++ > 3) break; Systemconfig.sysLog.log("请获取key后输入任意内容回车继续...输入c忽略(很可能无法继续采集,不推荐)"); System.err.println("请获取key后输入任意内容回车继续...输入c忽略(很可能无法继续采集,不推荐)"); Scanner input = new Scanner(System.in); String s = input.next(); if (s.equals("c") || s.equals("C")) break; fromFile = StringUtil.getContent("config/WeixinKey/WeixinKey.txt"); try { for (String string : fromFile.split("&")) { if (string.contains("uin")) uin = string.split("=")[1].trim(); if (string.contains("key")) key = string.split("=")[1].trim(); } } catch (Exception e) { e.printStackTrace(); } url = "http://mp.weixin.qq.com" + "/mp/getappmsgext?" + "__biz=" + biz + "&mid=" + mid + "&uin=" + uin + "&key=" + key; html = new HtmlInfo(); charSet = "UTF-8"; html.setType("DATA"); html.setEncode(charSet); html.setOrignUrl(url); html.setCookie("Set-Cookie: wxuin=20156425; Path=/; Expires=Fri, 02-Jan-1970 00:00:00 GMT"); html.setUa( "Mozilla/5.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/600.1.3 (KHTML, like Gecko) Version/8.0 Mobile/12A4345d Safari/600.1.4"); shp = new SimpleHttpProcess(); shp.getContent(html); content = html.getContent(); } String readNumStr = StringUtil.regMatcher(content, "\"read_num\":", ","); String praiseNumStr = StringUtil.regMatcher(content, "\"like_num\":", ","); try { if (readNumStr != null) data.setReadNum(Integer.parseInt(readNumStr)); if (praiseNumStr != null) data.setPraiseNum(Integer.parseInt(praiseNumStr)); } catch (Exception e) { e.printStackTrace(); } }