/**
 * Brief (summary): extracts one brief per matched fragment of the raw text in {@code args[0]}
 * and stores it on the entry of {@code list} at the same position.
 *
 * <p>Fragments are the results of {@code StringUtil.regMatches} for the {@code content168}
 * delimiters; the brief is whatever {@code StringUtil.regMatcher} returns for the
 * {@code CDATA[ ... ]} delimiters inside each fragment. Fragments beyond the end of
 * {@code list} are ignored instead of raising {@link IndexOutOfBoundsException}.
 *
 * @param list entries to update in place; presumably already populated by the title parser —
 *     TODO confirm against the caller
 * @param dom unused by this implementation
 * @param component unused by this implementation
 * @param args {@code args[0]} is the raw text to scan; nothing happens when it is missing,
 *     {@code null} or empty
 */
@Override
public void parseBrief(List<WeixinData> list, Node dom, Component component, String... args) {
    // Compare string content with isEmpty(); "==" only compares references.
    if (args == null || args.length == 0 || args[0] == null || args[0].isEmpty()) {
        return;
    }
    List<String> results = StringUtil.regMatches(args[0], "content168>", "/content168", true);
    // Entries are matched by position, so never index past the end of the target list.
    int count = Math.min(results.size(), list.size());
    for (int i = 0; i < count; i++) {
        String brief = StringUtil.regMatcher(results.get(i), "CDATA\\[", "\\]");
        list.get(i).setBrief(brief);
    }
}
/**
 * Title: creates one new {@code WeixinData} per matched title fragment of the raw text in
 * {@code args[0]} and appends it to {@code list}.
 *
 * <p>Fragments are the results of {@code StringUtil.regMatches} for the {@code title}
 * delimiters; the title is whatever {@code StringUtil.regMatcher} returns for the
 * {@code CDATA[ ... ]} delimiters inside each fragment.
 *
 * @param list receives one new entry per matched title, in match order
 * @param dom unused by this implementation
 * @param component unused by this implementation
 * @param args {@code args[0]} is the raw text to scan; nothing happens when it is missing,
 *     {@code null} or empty
 */
@Override
public void parseTitle(List<WeixinData> list, Node dom, Component component, String... args) {
    // Compare string content with isEmpty(); "==" only compares references.
    if (args == null || args.length == 0 || args[0] == null || args[0].isEmpty()) {
        return;
    }
    List<String> results = StringUtil.regMatches(args[0], "title>", "/title", true);
    for (String fragment : results) {
        WeixinData vd = new WeixinData();
        vd.setTitle(StringUtil.regMatcher(fragment, "CDATA\\[", "\\]"));
        list.add(vd);
    }
}
/**
 * Source: sets the source of a single entry from the first node selected by the component's
 * XPath.
 *
 * <p>Does nothing when {@code component} is {@code null}, when {@code commonList} returns
 * {@code null}, or when the node list has no first item.
 *
 * @param data entry whose source is set
 * @param dom node the XPath is evaluated against
 * @param component supplies the XPath expression
 * @param strings unused by this implementation
 */
@Override
public void parseSource(WeixinData data, Node dom, Component component, String... strings) {
    if (component != null) {
        NodeList nodes = commonList(component.getXpath(), dom);
        if (nodes != null) {
            Node first = nodes.item(0);
            if (first != null) {
                String text = first.getTextContent();
                data.setSource(StringUtil.format(text));
            }
        }
    }
}
/**
 * Source: sets the source of every entry in {@code list} from the node at the same position in
 * the node list returned by {@code head(...)}.
 *
 * <p>Does nothing when {@code component} is {@code null} or when {@code head(...)} returns
 * {@code null}.
 *
 * @param list entries to update in place, matched to nodes by index
 * @param dom node the XPath is evaluated against
 * @param component supplies the XPath expression and the component name passed to
 *     {@code head(...)}
 * @param strings unused by this implementation
 */
@Override
public void parseSource(List<WeixinData> list, Node dom, Component component, String... strings) {
    if (component == null) {
        return;
    }
    NodeList nodes = head(component.getXpath(), dom, list.size(), component.getName());
    if (nodes == null) {
        return;
    }
    int idx = 0;
    while (idx < nodes.getLength()) {
        String text = nodes.item(idx).getTextContent();
        list.get(idx).setSource(StringUtil.format(text));
        idx++;
    }
}
/**
 * Image URLs: joins the formatted text of every node selected by the component's XPath into one
 * string, each value followed by {@code ";"}, and stores it on {@code data}.
 *
 * <p>Does nothing when {@code component} is {@code null} or when {@code commonList} returns
 * {@code null}. An empty node list stores the empty string.
 *
 * @param data entry whose image URL field is set
 * @param dom node the XPath is evaluated against
 * @param component supplies the XPath expression
 * @param args unused by this implementation
 */
@Override
public void parseImgUrl(WeixinData data, Node dom, Component component, String... args) {
    if (component == null) {
        return;
    }
    NodeList nl = commonList(component.getXpath(), dom);
    if (nl == null) {
        return;
    }
    // StringBuilder avoids re-copying the accumulated string on every iteration.
    StringBuilder imgs = new StringBuilder();
    for (int i = 0; i < nl.getLength(); i++) {
        imgs.append(StringUtil.format(nl.item(i).getTextContent())).append(';');
    }
    data.setImgUrl(imgs.toString());
}
/**
 * Source: sets the source of a news entry from the first node selected by the component's
 * XPath, with the {@code "来源:"} label removed.
 *
 * <p>The fixed default {@code "慧聪橡胶网"} is used when the resulting text is empty or longer
 * than 20 characters. Does nothing when {@code component} is {@code null} or when
 * {@code commonList} returns {@code null}.
 *
 * @param data entry whose source is set
 * @param dom node the XPath is evaluated against
 * @param component supplies the XPath expression
 * @param args unused by this implementation
 */
@Override
public void parseSource(NewsData data, Node dom, Component component, String... args) {
    if (component == null) {
        return;
    }
    NodeList nodes = commonList(component.getXpath(), dom);
    if (nodes == null) {
        return;
    }
    String source = "";
    Node first = nodes.item(0);
    if (first != null) {
        source = StringUtil.format(first.getTextContent());
    }
    source = source.replace("来源:", "");
    // Empty or implausibly long text falls back to the site name.
    if (source.equals("") || source.length() > 20) {
        source = "慧聪橡胶网";
    }
    data.setSource(source);
}
/**
 * Author: reads the first node selected by the component's XPath, removes every {@code "●"}
 * character, trims the result and stores it on {@code data}.
 *
 * <p>Does nothing when {@code component} is {@code null} or when {@code commonList} returns
 * {@code null}. When the node list has no first item, the empty string is stored.
 *
 * <p>NOTE(review): the value is written with {@code setSource}, not an author setter. This may
 * be a copy-paste slip — confirm against {@code NewsData} before changing it.
 *
 * @param data entry that receives the extracted text
 * @param dom node the XPath is evaluated against
 * @param component supplies the XPath expression
 * @param args unused by this implementation
 */
@Override
public void parseAuthor(NewsData data, Node dom, Component component, String... args) {
    if (component != null) {
        NodeList nodes = commonList(component.getXpath(), dom);
        if (nodes != null) {
            String text = "";
            Node first = nodes.item(0);
            if (first != null) {
                text = StringUtil.format(first.getTextContent());
            }
            String cleaned = text.replace("●", "");
            data.setSource(cleaned.trim());
        }
    }
}
@Override public void parseUrl(List<WeixinData> list, Node dom, Component component, String... args) { if (args[0] == null || args[0] == "" || args[1] == null || args[1] == "") return; String cookie = args[1]; // String referer = args[1]; List<String> results = StringUtil.regMatches(args[0], "<url>", "/url", true); for (int i = 0; i < results.size(); i++) { String tmpUrl = results.get(i); tmpUrl = "http://weixin.sogou.com" + tmpUrl.substring(tmpUrl.indexOf("CDATA[") + 6, tmpUrl.lastIndexOf("]]>")); String loc = null; try { HttpURLConnection conn = (HttpURLConnection) new URL(tmpUrl).openConnection(); conn.addRequestProperty( "User-Agent", "Mozilla/5.0 (Windows NT 6.1; rv:38.0) Gecko/20100101 Firefox/38.0"); conn.setRequestProperty("Cookie", cookie); // conn.setRequestProperty("Referer", referer); HttpURLConnection.setFollowRedirects(false); conn.setFollowRedirects(false); conn.connect(); loc = conn.getHeaderField("Location"); if (loc != null) Systemconfig.sysLog.log(conn.getResponseMessage()); Systemconfig.sysLog.log("real url: " + loc); int sleepTime = 30 + (int) (Math.random() * 20); Systemconfig.sysLog.log("sleep..." + sleepTime); TimeUtil.rest(sleepTime); } catch (MalformedURLException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } list.get(i).setUrl(loc == null ? "err." : loc); } }
/**
 * Source: sets the source of a news entry from the first node selected by the component's
 * XPath.
 *
 * <p>The text after the {@code "来源:"} label is kept. If {@code "发布日期"} occurs, everything
 * from it onward is cut off, spaces are removed and the rest is trimmed. The fixed default
 * {@code "中国橡胶工业协会-轮胎分会"} is used when the formatted text is {@code null}, contains
 * {@code "本站"}, or is longer than 20 characters. Does nothing when {@code component} is
 * {@code null} or when {@code commonList} returns {@code null}.
 *
 * @param data entry whose source is set
 * @param dom node the XPath is evaluated against
 * @param component supplies the XPath expression
 * @param args unused by this implementation
 */
@Override
public void parseSource(NewsData data, Node dom, Component component, String... args) {
    if (component == null) {
        return;
    }
    NodeList nl = commonList(component.getXpath(), dom);
    if (nl == null) {
        return;
    }
    final String fallback = "中国橡胶工业协会-轮胎分会";
    String str = "";
    if (nl.item(0) != null) {
        str = StringUtil.format(nl.item(0).getTextContent());
    }
    // The null check must precede the first dereference; otherwise it can never be reached.
    if (str == null) {
        data.setSource(fallback);
        return;
    }
    if (str.contains("来源:")) {
        str = str.substring(str.indexOf("来源:") + 3);
    }
    if (str.contains("发布日期")) {
        str = str.substring(0, str.indexOf("发布日期")).replace(" ", "").trim();
    }
    if (str.contains("本站") || str.length() > 20) {
        str = fallback;
    }
    data.setSource(str);
}
public void parseNumber(WeixinData data, Node dom, Component component, String... args) { // http://mp.weixin.qq.com/s?__biz=MjM5ODE1NTMxMQ==&mid=201653867&idx=1&sn=6f3445a3640eb09ce7cfa5a49509f165&3rd=MzA3MDU4NTYzMw==&scene=6#rd String biz = ""; String mid = ""; String uin = ""; String key = ""; String fromFile = StringUtil.getContent("config/WeixinKey/WeixinKey.txt"); try { biz = StringUtil.regMatcher(data.getUrl(), "__biz=", "&"); mid = StringUtil.regMatcher(data.getUrl(), "mid=", "&"); for (String string : fromFile.split("&")) { if (string.contains("uin")) uin = string.split("=")[1].trim(); if (string.contains("key")) key = string.split("=")[1].trim(); } } catch (Exception e) { e.printStackTrace(); } String url = "http://mp.weixin.qq.com" + "/mp/getappmsgext?" + "__biz=" + biz + "&mid=" + mid + "&uin=" + uin + "&key=" + key // + // "&pass_ticket=b3hV91xTLYZxRGKemRNz%2FAi4VKElPnwHYUNtoV8w4dE%3D" + ""; HtmlInfo html = new HtmlInfo(); String charSet = "UTF-8"; html.setType("DATA"); html.setEncode(charSet); html.setOrignUrl(url); html.setCookie("Set-Cookie: wxuin=20156425; Path=/; Expires=Fri, 02-Jan-1970 00:00:00 GMT"); html.setUa( "Mozilla/5.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/600.1.3 (KHTML, like Gecko) Version/8.0 Mobile/12A4345d Safari/600.1.4"); SimpleHttpProcess shp = new SimpleHttpProcess(); shp.getContent(html); String content = html.getContent(); int retry = 0; while (!content.contains("read_num")) { if (retry++ > 3) break; Systemconfig.sysLog.log("请获取key后输入任意内容回车继续...输入c忽略(很可能无法继续采集,不推荐)"); System.err.println("请获取key后输入任意内容回车继续...输入c忽略(很可能无法继续采集,不推荐)"); Scanner input = new Scanner(System.in); String s = input.next(); if (s.equals("c") || s.equals("C")) break; fromFile = StringUtil.getContent("config/WeixinKey/WeixinKey.txt"); try { for (String string : fromFile.split("&")) { if (string.contains("uin")) uin = string.split("=")[1].trim(); if (string.contains("key")) key = string.split("=")[1].trim(); } } catch (Exception e) { 
e.printStackTrace(); } url = "http://mp.weixin.qq.com" + "/mp/getappmsgext?" + "__biz=" + biz + "&mid=" + mid + "&uin=" + uin + "&key=" + key; html = new HtmlInfo(); charSet = "UTF-8"; html.setType("DATA"); html.setEncode(charSet); html.setOrignUrl(url); html.setCookie("Set-Cookie: wxuin=20156425; Path=/; Expires=Fri, 02-Jan-1970 00:00:00 GMT"); html.setUa( "Mozilla/5.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/600.1.3 (KHTML, like Gecko) Version/8.0 Mobile/12A4345d Safari/600.1.4"); shp = new SimpleHttpProcess(); shp.getContent(html); content = html.getContent(); } String readNumStr = StringUtil.regMatcher(content, "\"read_num\":", ","); String praiseNumStr = StringUtil.regMatcher(content, "\"like_num\":", ","); try { if (readNumStr != null) data.setReadNum(Integer.parseInt(readNumStr)); if (praiseNumStr != null) data.setPraiseNum(Integer.parseInt(praiseNumStr)); } catch (Exception e) { e.printStackTrace(); } }