/**
 * Collects the orders listed on the Alipay consumption-record pages for a date range.
 *
 * <p>The "advanced" record page ({@code advanced.htm}) is requested first and its orders are
 * extracted. If that page carries a "next page" link, the links are followed one page at a
 * time. Otherwise the "basic" record page ({@code standard.htm}) is requested, the total
 * order count is read from its pager, and pages 2..N are fetched assuming 20 orders per page.
 *
 * <p>When the total count cannot be read on the basic page, {@code "orderNum"} is added to the
 * page's warning-attribute set. If that set is non-empty at the end, the page is handed to a
 * {@link LoggerProcessor}.
 *
 * @param crawlerPage page context passed to every fetch; also receives the warnings
 * @param beginDate   first day of the range (rendered through {@code format(Date)})
 * @param endDate     last day of the range (rendered through {@code format(Date)})
 * @param loginName   login name forwarded to {@code extractOrders}
 * @return every order map returned by {@code extractOrders} for the visited pages
 */
private List<Map<String, String>> extractOrdersBetween(
        CrawlerPage crawlerPage, Date beginDate, Date endDate, String loginName) {
    final int pageSize = 20;
    final String nextPageRegex = "class=\"page-next\" href=\"(.*?)\">下一页></a>";
    final String orderNumRegex = "<div class=\"page-link\">.*?共(.*?)条";
    final String dateRangeQuery =
            "&beginDate=" + format(beginDate) + "&endDate=" + format(endDate);

    String url = "https://consumeprod.alipay.com/record/advanced.htm?beginTime=00%3A00&endTime=24%3A00"
            + "&dateRange=customDate&status=all&keyword=bizOutNo&keyValue=&dateType=createDate&minAmount=&maxAmount="
            + "&fundFlow=all&tradeType=ALL&categoryId=&_input_charset=utf-8"
            + dateRangeQuery;
    HtmlBean htmlBean = getNewSourceCode(crawlerPage, url);
    String sourceCode = htmlBean.getSourceCode();

    List<Map<String, String>> list = new ArrayList<>();
    list.addAll(extractOrders(sourceCode, loginName, htmlBean.getHbaseKey()));

    Matcher matcher = RegExpUtil.getMatcher(sourceCode, nextPageRegex);
    if (matcher.find()) {
        // Advanced edition: keep following the "next page" link until a page has none.
        do {
            String nextPageUrl = matcher.group(1);
            if (nextPageUrl.startsWith("https://consumeprod.alipay.com")) {
                // The href is taken from raw HTML, so ampersands may be entity-escaped;
                // decode them before requesting the URL.
                // NOTE(review): "minAount" looks like a typo for "minAmount"; left unchanged
                // because it is sent to the server as-is — confirm before correcting.
                nextPageUrl = nextPageUrl.replace("&amp;", "&") + "&maxAmount=&minAount=&keyValue=";
            }
            sourceCode = getNewSourceCode(crawlerPage, nextPageUrl).getSourceCode();
            // Follow-up pages are stored under the hbase key of the first page.
            list.addAll(extractOrders(sourceCode, loginName, htmlBean.getHbaseKey()));
            matcher = RegExpUtil.getMatcher(sourceCode, nextPageRegex);
        } while (matcher.find());
    } else {
        // Basic edition: derive the page count from the total number of orders.
        url = "https://consumeprod.alipay.com/record/standard.htm?_input_charset=utf-8&tradeType=ALL&dateRange=customDate&status=all&fundFlow=all&&dateType=createDate"
                + dateRangeQuery;
        htmlBean = getNewSourceCode(crawlerPage, url);
        sourceCode = htmlBean.getSourceCode();

        int orderNum = -1;
        matcher = RegExpUtil.getMatcher(sourceCode, orderNumRegex);
        if (matcher.find()) {
            String rawCount = HtmlUtils.escapeHtmlTag(matcher.group(1));
            try {
                orderNum = Integer.parseInt(rawCount == null ? "" : rawCount.trim());
            } catch (NumberFormatException ignored) {
                // Leave orderNum negative so the unreadable count is reported as a warning
                // below instead of aborting the crawl.
            }
        }

        if (orderNum >= 0) {
            int pageNum = orderNum / pageSize + (orderNum % pageSize == 0 ? 0 : 1);
            // NOTE(review): page 1 of the standard listing is fetched only for the count and
            // is not passed to extractOrders — presumably the initial request already yielded
            // those rows; confirm.
            for (int i = 2; i <= pageNum; i++) {
                sourceCode = getNewSourceCode(crawlerPage, url + "&pageNum=" + i).getSourceCode();
                list.addAll(extractOrders(sourceCode, loginName, htmlBean.getHbaseKey()));
            }
        } else {
            crawlerPage.getMetaData().getExtractorResult().getWarnAtrrSet().add("orderNum");
        }
    }

    if (crawlerPage.getMetaData().getExtractorResult().getWarnAtrrSet().size() > 0) {
        new LoggerProcessor().process(crawlerPage);
    }
    return list;
}
/**
 * Extracts one map per order row ({@code <tr id="J-item-...">}) found in the page source.
 *
 * <p>For every row, each regex in {@code orderRegexMap} is applied and capture group 1 is
 * stored under the regex's key. Keys whose regex does not match are recorded as warning
 * attributes on a fresh {@link CrawlerPage}, which is passed to a {@link LoggerProcessor}.
 * When both {@code "time-d"} and {@code "time-h"} are present and non-empty, a
 * {@code "payTime"} entry is derived from them. A trailing {@code "|"} and surrounding
 * whitespace are removed from {@code "receiverName"}. Every map also receives
 * {@code "source"} ({@code "ZHIFUBAO"}), {@code "alipayName"} and {@code "hbaseKey"}.
 *
 * <p>If no order row is found at all, the warning attribute {@code "alipayOrderBlock"} is
 * logged and an empty list is returned.
 *
 * @param sourceCode page source to scan
 * @param loginName  value stored under {@code "alipayName"}
 * @param hbaseKey   value stored under {@code "hbaseKey"}
 * @return the extracted order maps, in page order; never {@code null}
 */
public List<Map<String, String>> extractOrders(
        String sourceCode, String loginName, String hbaseKey) {
    List<Map<String, String>> list = new ArrayList<>();
    String orderBlockRegex = "<tr id=\"J-item-\\d+\".*?class=\"J-item.*?\">.*?</tr>";
    Matcher matcher = RegExpUtil.getMatcher(sourceCode, orderBlockRegex);
    LoggerProcessor loggerProcessor = new LoggerProcessor();

    while (matcher.find()) {
        // A fresh page per row keeps each row's warnings separate.
        CrawlerPage crawlerPage = new CrawlerPage();
        Map<String, String> map = new HashMap<>();
        String orderBlock = matcher.group();

        for (Map.Entry<String, String> entry : orderRegexMap.entrySet()) {
            String propertyName = entry.getKey();
            Matcher propertyMatcher = RegExpUtil.getMatcher(orderBlock, entry.getValue());
            if (!propertyMatcher.find()) {
                crawlerPage.getMetaData().getExtractorResult().getWarnAtrrSet().add(propertyName);
                continue;
            }
            String value = UnicodeDecoderUtils.decodeUnicode(
                    HtmlUtils.escapeHtmlTag(propertyMatcher.group(1)));
            if (value != null && "amount".equals(propertyName)) {
                value = value.replace(" ", "");
            }
            map.put(propertyName, value);
        }
        if (crawlerPage.getMetaData().getExtractorResult().getWarnAtrrSet().size() > 0) {
            loggerProcessor.process(crawlerPage);
        }

        String day = map.get("time-d");
        String time = map.get("time-h");
        if (day != null && !day.isEmpty() && time != null && !time.isEmpty()) {
            // HH (hour of day, 0-23): the scraped time has no AM/PM marker, and with "hh" a
            // 12:xx value is interpreted as 00:xx by SimpleDateFormat-style parsing.
            // NOTE(review): format(String, String) is not visible here — confirm it treats
            // the second argument as a date pattern.
            map.put("payTime", format(day + " " + time, "yyyy.MM.dd HH:mm"));
        }

        String receiverName = map.get("receiverName");
        if (StringUtils.isNotBlank(receiverName)) {
            if (receiverName.endsWith("|")) {
                receiverName = receiverName.substring(0, receiverName.length() - 1);
            }
            map.put("receiverName", receiverName.trim());
        }

        map.put("source", "ZHIFUBAO");
        map.put("alipayName", loginName);
        map.put("hbaseKey", hbaseKey);
        list.add(map);
    }

    // Every matched row adds exactly one map, so an empty list means no order row was found.
    // (Re-running the matcher with find(1) would throw on empty input and skip index 0.)
    if (list.isEmpty()) {
        CrawlerPage crawlerPage = new CrawlerPage();
        crawlerPage.getMetaData().getExtractorResult().getWarnAtrrSet().add("alipayOrderBlock");
        loggerProcessor.process(crawlerPage);
    }
    return list;
}
/**
 * Matches the page source against every regex in {@code regexMap} and returns the values
 * that were found.
 *
 * <p>For each entry, capture group 1 of the first match (passed through
 * {@code HtmlUtils.escapeHtmlTag}) is stored under the entry's key. Keys whose regex does not
 * match are recorded as warning attributes on a new {@link CrawlerPage} carrying {@code url};
 * if any were recorded, that page is handed to a {@link LoggerProcessor}.
 *
 * @param regexMap   key to regex; each regex is read for capture group 1
 * @param sourceCode page source to search
 * @param url        URI set on the page used for warning reporting
 * @return map of key to matched value, containing only the keys whose regex matched
 */
public Map<String, String> getMatchedValues(
        Map<String, String> regexMap, String sourceCode, String url) {
    Map<String, String> matched = new HashMap<>();
    CrawlerPage page = new CrawlerPage();
    page.getUriData().setStrUri(url);

    for (Map.Entry<String, String> rule : regexMap.entrySet()) {
        String name = rule.getKey();
        Matcher found = RegExpUtil.getMatcher(sourceCode, rule.getValue());
        if (!found.find()) {
            // Remember which attribute could not be extracted.
            page.getMetaData().getExtractorResult().getWarnAtrrSet().add(name);
            continue;
        }
        matched.put(name, HtmlUtils.escapeHtmlTag(found.group(1)));
    }

    boolean hasWarnings = page.getMetaData().getExtractorResult().getWarnAtrrSet().size() > 0;
    if (hasWarnings) {
        new LoggerProcessor().process(page);
    }
    return matched;
}