Code example #1
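  /**
   * Crawls Alipay's transaction-record pages between beginDate and endDate and
   * collects every order: on the advanced layout it follows the "next page"
   * link, on the standard layout it derives the page count from the order
   * total and iterates pageNum.
   */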
 private List<Map<String, String>> extractOrdersBetween(
     CrawlerPage crawlerPage, Date beginDate, Date endDate, String loginName) {
   String url =
       "https://consumeprod.alipay.com/record/advanced.htm?beginTime=00%3A00&endTime=24%3A00"
           + "&dateRange=customDate&status=all&keyword=bizOutNo&keyValue=&dateType=createDate&minAmount=&maxAmount="
           + "&fundFlow=all&tradeType=ALL&categoryId=&_input_charset=utf-8";
   url = url + "&beginDate=" + format(beginDate) + "&endDate=" + format(endDate);
   HtmlBean htmlBean = getNewSourceCode(crawlerPage, url);
   String sourceCode = htmlBean.getSourceCode();
    // Matches the advanced layout's "next page" (下一页) link.
    String nextPageRegex = "class=\"page-next\" href=\"(.*?)\">下一页></a>";
    // Matches the total order count ("共 N 条") shown on the standard layout.
    String orderNumRegex = "<div class=\"page-link\">.*?共(.*?)条";
   List<Map<String, String>> list = new ArrayList<>();
   list.addAll(extractOrders(sourceCode, loginName, htmlBean.getHbaseKey()));
   Matcher matcher = RegExpUtil.getMatcher(sourceCode, nextPageRegex);
   if (matcher.find()) {
      /* Advanced layout: keep following the "next page" link. */
     while (sourceCode.contains("下一页")) {
       matcher = RegExpUtil.getMatcher(sourceCode, nextPageRegex);
       if (matcher.find()) {
         String nextPageUrl = matcher.group(1);
         if (nextPageUrl.startsWith("https://consumeprod.alipay.com")) {
            nextPageUrl += "&maxAmount=&minAmount=&keyValue=";
            nextPageUrl = nextPageUrl.replace("&amp;", "&");
         }
         sourceCode = getNewSourceCode(crawlerPage, nextPageUrl).getSourceCode();
         List<Map<String, String>> orders =
             extractOrders(sourceCode, loginName, htmlBean.getHbaseKey());
         list.addAll(orders);
       } else {
         break;
       }
     }
   } else {
      /* Standard layout: compute the page count and iterate pageNum. */
      url =
          "https://consumeprod.alipay.com/record/standard.htm?_input_charset=utf-8&tradeType=ALL&dateRange=customDate&status=all&fundFlow=all&dateType=createDate";
     url = url + "&beginDate=" + format(beginDate) + "&endDate=" + format(endDate);
     htmlBean = getNewSourceCode(crawlerPage, url);
     sourceCode = htmlBean.getSourceCode();
     matcher = RegExpUtil.getMatcher(sourceCode, orderNumRegex);
     if (matcher.find()) {
       int orderNum = Integer.parseInt(HtmlUtils.escapeHtmlTag(matcher.group(1)));
        // 20 orders per page; round the page count up.
        int pageNum = (orderNum % 20 == 0) ? (orderNum / 20) : (orderNum / 20 + 1);
       for (int i = 2; i <= pageNum; i++) {
         sourceCode = getNewSourceCode(crawlerPage, url + "&pageNum=" + i).getSourceCode();
         List<Map<String, String>> orders =
             extractOrders(sourceCode, loginName, htmlBean.getHbaseKey());
         list.addAll(orders);
       }
     } else {
       crawlerPage.getMetaData().getExtractorResult().getWarnAtrrSet().add("orderNum");
     }
   }
    if (!crawlerPage.getMetaData().getExtractorResult().getWarnAtrrSet().isEmpty()) {
     LoggerProcessor loggerProcessor = new LoggerProcessor();
     loggerProcessor.process(crawlerPage);
   }
   return list;
 }
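Example #1 builds its query URLs with a format(Date) helper that is not part of this listing. A minimal sketch, assuming the helper emits the same yyyy.MM.dd pattern that example #2 parses payTime with, might look like this:

 import java.text.SimpleDateFormat;
 import java.util.Date;

 // Hypothetical helper, not from the original source: formats a Date for the
 // beginDate/endDate query parameters. The yyyy.MM.dd pattern is an assumption
 // inferred from the "yyyy.MM.dd HH:mm" pattern used for payTime in example #2.
 private String format(Date date) {
   return new SimpleDateFormat("yyyy.MM.dd").format(date);
 }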
Code example #2
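  /**
   * Extracts one field map per <tr id="J-item-..."> order block in the page
   * source, applying each regex in orderRegexMap and logging any property
   * whose pattern fails to match.
   */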
 public List<Map<String, String>> extractOrders(
     String sourceCode, String loginName, String hbaseKey) {
   List<Map<String, String>> list = new ArrayList<>();
    // One <tr id="J-item-N"> element per order row.
    String orderBlockRegex = "<tr id=\"J-item-\\d+\".*?class=\"J-item.*?\">.*?</tr>";
   Matcher matcher = RegExpUtil.getMatcher(sourceCode, orderBlockRegex);
   LoggerProcessor loggerProcessor = new LoggerProcessor();
   while (matcher.find()) {
     CrawlerPage crawlerPage = new CrawlerPage();
     Map<String, String> map = new HashMap<>();
     String orderBlock = matcher.group();
     for (Map.Entry<String, String> entry : orderRegexMap.entrySet()) {
       String propertyName = entry.getKey();
       String propertyValue = entry.getValue();
       Matcher propertyMatcher = RegExpUtil.getMatcher(orderBlock, propertyValue);
       if (propertyMatcher.find()) {
         String value =
             UnicodeDecoderUtils.decodeUnicode(HtmlUtils.escapeHtmlTag(propertyMatcher.group(1)));
         if (propertyName.equals("amount")) {
            value = value.replace(" ", "");
         }
         map.put(propertyName, value);
       } else {
         crawlerPage.getMetaData().getExtractorResult().getWarnAtrrSet().add(entry.getKey());
       }
     }
      if (!crawlerPage.getMetaData().getExtractorResult().getWarnAtrrSet().isEmpty()) {
       loggerProcessor.process(crawlerPage);
     }
     if (map.containsKey("time-d")
         && !map.get("time-d").isEmpty()
         && map.containsKey("time-h")
         && !map.get("time-h").isEmpty()) {
       map.put("payTime", format(map.get("time-d") + " " + map.get("time-h"), "yyyy.MM.dd hh:mm"));
     }
     if (map.containsKey("receiverName") && StringUtils.isNotBlank(map.get("receiverName"))) {
       String receiverName = map.get("receiverName");
       if (receiverName.endsWith("|")) {
         receiverName = receiverName.substring(0, receiverName.length() - 1);
       }
       map.put("receiverName", receiverName.trim());
     }
     map.put("source", "ZHIFUBAO");
     map.put("alipayName", loginName);
     map.put("hbaseKey", hbaseKey);
     list.add(map);
   }
    // Warn when the page yielded no order blocks at all.
    if (list.isEmpty()) {
     CrawlerPage crawlerPage = new CrawlerPage();
     crawlerPage.getMetaData().getExtractorResult().getWarnAtrrSet().add("alipayOrderBlock");
     loggerProcessor.process(crawlerPage);
   }
   return list;
 }
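Example #2 reads from an orderRegexMap field that the listing never defines. A hypothetical initialization is sketched below for the keys the method actually uses (time-d, time-h, amount, receiverName); the patterns are illustrative placeholders and would have to be written against Alipay's real markup:

 import java.util.LinkedHashMap;
 import java.util.Map;

 // Hypothetical field, not from the original source: maps each order property
 // to a regex whose group(1) extracts it from one <tr> order block. The
 // patterns below are placeholders, not Alipay's actual markup.
 private static final Map<String, String> orderRegexMap = new LinkedHashMap<>();

 static {
   orderRegexMap.put("time-d", "<p class=\"time-d\">(.*?)</p>");
   orderRegexMap.put("time-h", "<p class=\"time-h\">(.*?)</p>");
   orderRegexMap.put("amount", "<span class=\"amount\">(.*?)</span>");
   orderRegexMap.put("receiverName", "<p class=\"name\">(.*?)</p>");
 }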
Code example #3
  /**
   * Matches the page source against each regex in regexMap and returns a map
   * of the key-value pairs that matched.
   *
   * @param regexMap map from property name to a regex whose group(1) captures the value
   * @param sourceCode page source to match against
   * @param url page URL, recorded on the CrawlerPage for warning logs
   * @return the extracted values, keyed by property name, for the regexes that matched
   */
 public Map<String, String> getMatchedValues(
     Map<String, String> regexMap, String sourceCode, String url) {
   Map<String, String> values = new HashMap<>();
   CrawlerPage crawlerPage = new CrawlerPage();
   crawlerPage.getUriData().setStrUri(url);
    for (Map.Entry<String, String> entry : regexMap.entrySet()) {
      Matcher matcher = RegExpUtil.getMatcher(sourceCode, entry.getValue());
      if (matcher.find()) {
        values.put(entry.getKey(), HtmlUtils.escapeHtmlTag(matcher.group(1)));
      } else {
        crawlerPage.getMetaData().getExtractorResult().getWarnAtrrSet().add(entry.getKey());
      }
   }
    if (!crawlerPage.getMetaData().getExtractorResult().getWarnAtrrSet().isEmpty()) {
     LoggerProcessor loggerProcessor = new LoggerProcessor();
     loggerProcessor.process(crawlerPage);
   }
   return values;
 }
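A short usage sketch for getMatchedValues, with invented keys and patterns just to show the contract: each regex must expose the wanted value as group(1), and any key that fails to match is recorded in the page's WarnAtrrSet and logged. Here pageSource stands for the already-fetched HTML.

 // Illustrative call; the keys and patterns below are made up for the example.
 Map<String, String> regexMap = new HashMap<>();
 regexMap.put("userName", "<span id=\"userName\">(.*?)</span>");
 regexMap.put("balance", "<span id=\"balance\">(.*?)</span>");

 Map<String, String> values =
     getMatchedValues(regexMap, pageSource, "https://consumeprod.alipay.com/record/standard.htm");
 // values holds an entry per key whose regex matched; keys that failed to
 // match were added to the page's WarnAtrrSet and passed to LoggerProcessor.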