/**
 * Extracts the {@code standardid} query value from the link in each data row of the
 * {@code ctl00_ContentPlaceHolder1_StandardView} table and appends it to the shared {@code map}
 * entry for {@code pageNo}.
 *
 * <p>A row is recorded by index in the shared {@code error} map (and skipped) when its link is
 * missing, the link has no {@code href}, or the {@code href} carries no {@code standardid=}
 * parameter.
 *
 * @param pageNo key under which ids and failed row indexes are stored
 * @param html page markup; handed to the parser with the {@code gb2312} charset name
 * @throws Exception if the markup cannot be parsed or does not have the expected table layout
 */
private static void setStandardIdsToMap(Integer pageNo, String html) throws Exception {
  final String idParam = "standardid=";

  Parser parser = Parser.createParser(html, "gb2312");
  AndFilter tableFilter =
      new AndFilter(
          new TagNameFilter("table"),
          new HasAttributeFilter("id", "ctl00_ContentPlaceHolder1_StandardView"));
  NodeList nodes = parser.parse(tableFilter);
  TableTag table = (TableTag) nodes.elementAt(0);
  TableRow[] rows = table.getRows();

  // Row 0 is skipped (presumably the header row - confirm against the page).
  for (int i = 1; i < rows.length; i++) {
    TableColumn[] cols = rows[i].getColumns();
    TableColumn col = cols[3];
    LinkTag tag = (LinkTag) ((Div) col.getChildren().elementAt(1)).getChildren().elementAt(2);

    String href = (tag == null) ? null : tag.getAttribute("href");
    int start = (href == null) ? -1 : href.indexOf(idParam);
    if (start < 0) {
      // No link, no href, or no standardid parameter: remember the row and move on.
      List<Integer> failedRows = error.get(pageNo);
      if (failedRows == null) {
        failedRows = new ArrayList<Integer>();
      }
      failedRows.add(i);
      error.put(pageNo, failedRows);
      continue;
    }

    // The value runs up to the next '&' AFTER the parameter, or to the end of the href when
    // standardid is the last parameter.
    int valueStart = start + idParam.length();
    int end = href.indexOf('&', valueStart);
    if (end < 0) {
      end = href.length();
    }
    String standardId = href.substring(valueStart, end);

    List<String> ids = map.get(pageNo);
    if (ids == null) {
      ids = new ArrayList<String>();
    }
    ids.add(standardId);
    map.put(pageNo, ids);
  }
}
/**
 * Reads rows 1 to 10 of the sixth {@code table} element in {@code text} and appends two strings
 * per row to the shared {@code detailMap} entry for {@code key}: the joined text of column 1 and
 * the joined text of column 3.
 *
 * <p>For each of those columns, the text nodes found among the children of the column's second
 * child are concatenated with single spaces and the result is trimmed.
 *
 * @param key key of the {@code detailMap} entry to extend
 * @param text page markup; handed to the parser with the {@code gb2312} charset name
 * @throws Exception if parsing fails or the page does not have the expected layout
 */
private static void addDetailToMap(String key, String text) throws Exception {
  Parser htmlParser = Parser.createParser(text, "gb2312");
  TagNameFilter anyTable = new TagNameFilter("table");
  NodeList tables = htmlParser.parse(anyTable);
  TableTag detailTable = (TableTag) tables.elementAt(5);
  TableRow[] tableRows = detailTable.getRows();

  // Columns whose text is collected, in the order the values are appended.
  int[] valueColumns = {1, 3};

  for (int rowIdx = 1; rowIdx < 11; rowIdx++) {
    TableColumn[] cells = tableRows[rowIdx].getColumns();
    String[] values = new String[valueColumns.length];

    for (int v = 0; v < valueColumns.length; v++) {
      NodeList parts = cells[valueColumns[v]].getChildren().elementAt(1).getChildren();
      StringBuilder joined = new StringBuilder();
      for (int p = 0; p < parts.size(); p++) {
        if (parts.elementAt(p) instanceof TextNode) {
          joined.append(parts.elementAt(p).getText()).append(" ");
        }
      }
      values[v] = joined.toString().trim();
    }

    List<String> details = detailMap.get(key);
    if (details == null) {
      details = new ArrayList<String>();
    }
    details.add(values[0]);
    details.add(values[1]);
    detailMap.put(key, details);
  }
}
@Override public LotteryDraw fetchResultDetail(String phase) { LotteryDraw lotteryDraw = null; lotteryDraw = nowPhaseResult(); if (phase == null || "".equals(phase) || lotteryDraw.getPhase().equals(phase)) { return lotteryDraw; } else { lotteryDraw = null; } String url = RESULT_MORE_LOCALITY_URL; String data = null; String pageInfo = "结果页面" + url; String encoding = "utf-8"; String logHeader = "==" + lotteryScope + "==" + siteName + "==" + pageInfo + "==抓取==" + getLotteryType().getName() + "=="; try { data = CoreFetcherUtils.URLGet(url, null, encoding); } catch (Exception e) { logger.error("获取html数据失败" + e.getMessage()); return null; } if (data == null || data.indexOf("404 Not Found") > 0 || data.isEmpty()) { logger.error(logHeader + "data is null or 404 Not Found"); return null; } Parser parser = null; try { parser = Parser.createParser(data, encoding); } catch (Exception e) { logger.error("解析html页面失败" + e.getMessage()); return null; } NodeFilter filter = new HasAttributeFilter("class", "mytable"); NodeList nodeList = null; try { nodeList = parser.extractAllNodesThatMatch(filter); TableTag tableTag = (TableTag) nodeList.elementAt(0); TableRow[] tableRows = tableTag.getRows(); for (int i = 1; i < tableRows.length; i++) { TableColumn[] tableColumns = tableRows[i].getColumns(); String phaseTmp = tableColumns[0].toPlainTextString(); if (phaseTmp != null && !"".equals(phaseTmp) && phase.equals(phaseTmp)) { lotteryDraw = new LotteryDraw(); // 彩期 lotteryDraw.setPhase(phaseTmp); // 开奖结果 String strResult = tableColumns[1].toPlainTextString(); strResult = strResult.trim().replace(" ", ","); lotteryDraw.setResult(strResult); // 彩种 lotteryDraw.setLotteryType(super.getLotteryType()); break; } } } catch (ParserException e) { logger.error("数据解析错误==" + e.getMessage(), e); return null; } return lotteryDraw; }
/** 从课表处,分课表 */ public List<Courses> parseCourses(String html) { Parser parser = new Parser(); try { parser.setInputHTML(html); parser.setEncoding("utf-8"); } catch (ParserException e) { e.printStackTrace(); } NodeFilter filter = new NodeClassFilter(TableTag.class); NodeList nodeList = null; try { nodeList = parser.extractAllNodesThatMatch(filter); } catch (ParserException e) { e.printStackTrace(); } List<Courses> list = new ArrayList<Courses>(); String schoolyear = ""; String semester = ""; for (int i = 0; i < nodeList.size(); i++) { if (nodeList.elementAt(i) instanceof TableTag) { TableTag tag = (TableTag) nodeList.elementAt(i); TableRow[] rows = tag.getRows(); for (int j = 0; j < rows.length; j++) { TableRow row = (TableRow) rows[j]; TableColumn[] columns = row.getColumns(); Courses courses = null; boolean isCourse = false; for (int k = 0; k < columns.length; k++) { Node columnNode = columns[k]; String info = columnNode.toPlainTextString().trim(); String temp = "学年学期:"; int start = info.indexOf(temp); int len = "2012-2013".length(); if (start != -1) { start = start + temp.length(); schoolyear = info.substring(start, start + len); // semester = info.substring(start+len+2); // 网络正常时候测试学期改为数字 semester = info.substring(start + len + 3, start + len + 4); if ("一".equals(semester)) { semester = "1"; } else if ("二".equals(semester)) { semester = "2"; } } if (k == 1 && info.indexOf("[") != -1) { courses = new Courses(); String courseCode = info.substring(1, 9); String coursesname = info.substring(10); courses.setCourseCode(courseCode); courses.setCoursesname(coursesname); isCourse = true; } if (k == 2 && isCourse) { double credit = Double.parseDouble(info); courses.setCredit(credit); } if (k == 3 && isCourse) { courses.setType(info); } if (k == 4 && isCourse) { courses.setLeanType(info); } if (k == 5 && isCourse) { courses.setCheckType(info); } if (k == 6 && isCourse) { courses.setGetType(info); } if (k == 7 && isCourse) { // double score=Double.parseDouble(info); 
courses.setScore(info); } if (k == 8 && isCourse) { courses.setRemark(info); } } // end for k if (courses != null) { courses.setSchoolYear(schoolyear); courses.setSemester(semester); list.add(courses); } } // end for j } } return list; }
public List<TimeTable> parseTimeTables(String html) { Parser parser = new Parser(); try { parser.setInputHTML(html); parser.setEncoding("utf-8"); } catch (ParserException e) { // TODO Auto-generated catch block e.printStackTrace(); } List<TimeTable> list = new ArrayList<TimeTable>(); NodeFilter filter = new NodeClassFilter(TableTag.class); NodeList nodeList = null; try { nodeList = parser.extractAllNodesThatMatch(filter); } catch (ParserException e) { // TODO Auto-generated catch block e.printStackTrace(); } for (int i = 0; i < nodeList.size(); i++) { if (nodeList.elementAt(i) instanceof TableTag) { TableTag tag = (TableTag) nodeList.elementAt(i); if (tag.getText().indexOf("[课程号]") == -1) { continue; } TableRow[] rows = tag.getRows(); for (int j = 1; j < rows.length; j++) { TableRow row = (TableRow) rows[j]; TableColumn[] columns = row.getColumns(); boolean isCourse = false; TimeTable timeTable = null; for (int k = 0; k < columns.length; k++) { Node columnNode = columns[k]; String info = columnNode.toPlainTextString().trim(); System.out.println(info + "===" + k); switch (k) { case 1: int start = info.indexOf("["); int end = info.indexOf("]"); timeTable = new TimeTable(); timeTable.setCourseCode(info.substring(start + 1, end)); timeTable.setCourseName(info.substring(end + 1)); break; case 3: timeTable.setCredit(Double.parseDouble(info)); break; case 4: timeTable.setType(info); break; case 5: int t_start = info.indexOf("]"); timeTable.setTeacher(info.substring(t_start + 1)); break; case 8: List<TimeAndAdress> ta_list = praseStr(info); for (TimeAndAdress ta : ta_list) { timeTable.setAddress(ta.getAddress()); timeTable.setTime(ta.getTime()); timeTable.setCycle(ta.getCycle()); timeTable.setSingleDouble(ta.getSingleDouble()); timeTable.setWeek(ta.getWeek()); list.add(timeTable.clone()); } break; default: break; } } } // end for j } } return list; }
@Override public List<JclqScheduleItem> fetchJclqSchedule(String officialDate) throws FetchFailedException { Map<String, String> headerParams = new HashMap<String, String>(); headerParams.put("Referer", "http://info.sporttery.cn"); headerParams.put( "User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.168 Safari/535.19"); List<JclqScheduleItem> jclqScheduleItemList = new ArrayList<JclqScheduleItem>(); String encoding = "gbk"; JclqScheduleItem jclqScheduleItem = null; String siteName = "中国竞彩网网[竞彩篮球赛程]"; String logHeader = siteName + SCHEDULE_URL; try { String webInfo = CoreFetcherUtils.URLGetWithHeaderParams(SCHEDULE_URL, headerParams, null, encoding); if (webInfo == null || webInfo.indexOf("404 Not Found") > 0) { logger.error(logHeader + ",data is null or 404 Not Found"); throw new FetchFailedException("404 Not Found"); } Parser parser = Parser.createParser(webInfo, encoding); NodeList nodeList = parser.extractAllNodesThatMatch(new CssSelectorNodeFilter("div[class='box-tbl']")); if (null != nodeList && nodeList.size() > 0) { NodeFilter tableFilter = new TagNameFilter("table"); Parser parser2 = Parser.createParser(nodeList.toHtml(), encoding); NodeList tableNodeList = parser2.extractAllNodesThatMatch(tableFilter); if (tableNodeList != null && tableNodeList.size() > 0) { TableTag catchTableTag = new TableTag(); catchTableTag = (TableTag) tableNodeList.elementAt(0); if (catchTableTag != null) { TableRow[] catchRows = catchTableTag.getRows(); TableColumn[] catchColumns = null; for (int i = 2; i < catchRows.length; i++) { catchColumns = catchRows[i].getColumns(); if (catchColumns != null && catchColumns.length >= 6) { jclqScheduleItem = new JclqScheduleItem(); String officialNum = catchColumns[0].toPlainTextString().trim(); if (officialNum.length() < 5) { continue; } // 先解析比赛时间 String matchDateStr = catchColumns[3].toPlainTextString().trim(); String[] yearStr = matchDateStr.split("-"); if 
(yearStr[0].length() <= 2) { matchDateStr = "20" + matchDateStr + ":00"; } else { matchDateStr = matchDateStr + ":00"; } Date matchDate = CoreDateUtils.parseDate(matchDateStr, CoreDateUtils.DATETIME); Calendar matchDateCalendar = Calendar.getInstance(); matchDateCalendar.setTime(matchDate); matchDateCalendar.add(Calendar.MINUTE, 1); jclqScheduleItem.setMatchDate(matchDateCalendar.getTime()); // 根据周几、当前时间和比赛时间计算官方发布的日期 Calendar cd = Calendar.getInstance(); // 将时分秒等区域清零 cd.set(Calendar.HOUR_OF_DAY, 0); cd.set(Calendar.MINUTE, 0); cd.set(Calendar.SECOND, 0); cd.set(Calendar.MILLISECOND, 0); int nowWeekDay = cd.get(Calendar.DAY_OF_WEEK); int fetchWeekDay = weekDay.get(officialNum.substring(0, 2)); if (nowWeekDay != fetchWeekDay) { int m = fetchWeekDay - nowWeekDay; if (m < -1) { cd.add(Calendar.DATE, m + 7); } else { cd.add(Calendar.DATE, m); } } // 如果计算出来的日期超过了比赛时间,减去一周 if (cd.after(matchDateCalendar)) { cd.add(Calendar.DATE, -7); } // 如果计算出来的日期距离比赛时间相隔超过一周,加上一周的倍数 // 一周的毫秒数 long weekTimeInMillis = 3600 * 1000 * 24 * 7; long diffTimeInMillis = matchDateCalendar.getTimeInMillis() - cd.getTimeInMillis(); if (diffTimeInMillis > weekTimeInMillis) { // 计算相差几周 int diffWeekCount = (int) (diffTimeInMillis / weekTimeInMillis); cd.add(Calendar.DATE, 7 * diffWeekCount); } jclqScheduleItem.setMatchNum( CoreDateUtils.formatDate(cd.getTime(), "yyyyMMdd") + LotteryConstant.JCLQ_MATCH_NUM_CODE_DEFAULT + officialNum.substring(2)); jclqScheduleItem.setOfficialDate( CoreDateUtils.parseDate(CoreDateUtils.formatDate(cd.getTime()))); Integer oNum = null; try { oNum = Integer.valueOf(officialNum.substring(2)); } catch (Exception e) { logger.error("截取官方编码时,转换为Integer错误", e); } jclqScheduleItem.setOfficialNum(oNum); jclqScheduleItem.setMatchName( JclqUtil.convertMatchName( catchColumns[1].toPlainTextString().trim(), LotteryType.JCLQ_SF, FetcherType.T_PENGINEAPI)); String team = catchColumns[2].toPlainTextString().trim(); String[] teamStr = team.split("VS"); 
jclqScheduleItem.setAwayTeam(teamStr[0].trim()); jclqScheduleItem.setHomeTeam(teamStr[1].trim()); if ("已开售".equals(catchColumns[4].toPlainTextString().trim())) { jclqScheduleItem.setStatus(JclqRaceStatus.OPEN); } else { jclqScheduleItem.setStatus(JclqRaceStatus.UNOPEN); } if (catchColumns[5].toPlainTextString().trim().indexOf("胜负单关") > 0) { jclqScheduleItem.setDynamicSaleSfStatus(JclqDynamicSaleStatus.SALE_UNOPEN); } else { jclqScheduleItem.setDynamicSaleSfStatus(JclqDynamicSaleStatus.SALE_OPEN); } if (catchColumns[5].toPlainTextString().trim().indexOf("胜负过关") > 0) { jclqScheduleItem.setStaticSaleSfStatus(JclqStaticSaleStatus.SALE_UNOPEN); } else { jclqScheduleItem.setStaticSaleSfStatus(JclqStaticSaleStatus.SALE_OPEN); } if (catchColumns[5].toPlainTextString().trim().indexOf("让分胜负单关") > 0) { jclqScheduleItem.setDynamicSaleRfsfStatus(JclqDynamicSaleStatus.SALE_UNOPEN); } else { jclqScheduleItem.setDynamicSaleRfsfStatus(JclqDynamicSaleStatus.SALE_OPEN); } if (catchColumns[5].toPlainTextString().trim().indexOf("让分胜负过关") > 0) { jclqScheduleItem.setStaticSaleRfsfStatus(JclqStaticSaleStatus.SALE_UNOPEN); } else { jclqScheduleItem.setStaticSaleRfsfStatus(JclqStaticSaleStatus.SALE_OPEN); } if (catchColumns[5].toPlainTextString().trim().indexOf("胜分差单关") > 0) { jclqScheduleItem.setDynamicSaleSfcStatus(JclqDynamicSaleStatus.SALE_UNOPEN); } else { jclqScheduleItem.setDynamicSaleSfcStatus(JclqDynamicSaleStatus.SALE_OPEN); } if (catchColumns[5].toPlainTextString().trim().indexOf("胜分差过关") > 0) { jclqScheduleItem.setStaticSaleSfcStatus(JclqStaticSaleStatus.SALE_UNOPEN); } else { jclqScheduleItem.setStaticSaleSfcStatus(JclqStaticSaleStatus.SALE_OPEN); } if (catchColumns[5].toPlainTextString().trim().indexOf("大小分单关") > 0) { jclqScheduleItem.setDynamicSaleDxfStatus(JclqDynamicSaleStatus.SALE_UNOPEN); } else { jclqScheduleItem.setDynamicSaleDxfStatus(JclqDynamicSaleStatus.SALE_OPEN); } if (catchColumns[5].toPlainTextString().trim().indexOf("大小分过关") > 0) { 
jclqScheduleItem.setStaticSaleDxfStatus(JclqStaticSaleStatus.SALE_UNOPEN); } else { jclqScheduleItem.setStaticSaleDxfStatus(JclqStaticSaleStatus.SALE_OPEN); } jclqScheduleItemList.add(jclqScheduleItem); } } // end for catchRows } // end if catchTableTag!=null } // end if(tableNodeList!=null&&tableNodeList.size()>0) } else { logger.error(logHeader + "竞彩篮球赛程数据表格不存在,返回null"); throw new FetchFailedException("竞彩篮球赛程数据表格不存在"); } } catch (Exception e) { logger.error(logHeader + "竞彩篮球赛程错误" + e.getMessage(), e); throw new FetchFailedException(e.getMessage()); } return jclqScheduleItemList; }
/** * Creates a list of Grids based on the given HTML string. This works only for table-based HTML * documents. * * @param html the HTML string. * @return a list of Grids. */ public static List<Grid> fromHtml(String html) throws Exception { if (html == null || html.trim().isEmpty()) { return null; } List<Grid> grids = new ArrayList<>(); Parser parser = Parser.createParser(html, "UTF-8"); Node[] tables = parser.extractAllNodesThatMatch(new TagNameFilter("table")).toNodeArray(); for (Node t : tables) { Grid grid = new ListGrid(); TableTag table = (TableTag) t; TableRow[] rows = table.getRows(); Integer firstColumnCount = null; for (TableRow row : rows) { if (getColumnCount(row) == 0) // Ignore if no cells { log.warn("Ignoring row with no columns"); continue; } Node[] cells = row.getChildren().extractAllNodesThatMatch(HTML_ROW_FILTER).toNodeArray(); if (firstColumnCount == null) // First row becomes header { firstColumnCount = getColumnCount(row); for (Node c : cells) { TagNode cell = (TagNode) c; grid.addHeader(new GridHeader(getValue(cell), false, false)); Integer colSpan = MathUtils.parseInt(cell.getAttribute("colspan")); if (colSpan != null && colSpan > 1) { grid.addEmptyHeaders((colSpan - 1)); } } } else // Rest becomes rows { if (firstColumnCount != getColumnCount(row)) // Ignore { log.warn( "Ignoring row which has " + row.getColumnCount() + " columns since table has " + firstColumnCount + " columns"); continue; } grid.addRow(); for (Node c : cells) { // TODO row span TagNode cell = (TagNode) c; grid.addValue(getValue(cell)); Integer colSpan = MathUtils.parseInt(cell.getAttribute("colspan")); if (colSpan != null && colSpan > 1) { grid.addEmptyValues((colSpan - 1)); } } } } grids.add(grid); } return grids; }
public List<TimeTable> parseTimeTables(String html) { Parser parser = new Parser(); try { parser.setInputHTML(html); parser.setEncoding("utf-8"); } catch (ParserException e) { // TODO Auto-generated catch block e.printStackTrace(); } List<TimeTable> list = new ArrayList<TimeTable>(); NodeFilter tagfilter = new NodeClassFilter(TableTag.class); NodeFilter idFilter = new HasAttributeFilter("id", "reportArea"); NodeFilter filter = new AndFilter(tagfilter, idFilter); NodeList nodeList = null; try { nodeList = parser.extractAllNodesThatMatch(filter); } catch (ParserException e) { // TODO Auto-generated catch block e.printStackTrace(); } for (int i = 0; i < nodeList.size(); i++) { if (nodeList.elementAt(i) instanceof TableTag) { TableTag tag = (TableTag) nodeList.elementAt(i); TableRow[] rows = tag.getRows(); for (int j = 0; j < rows.length; j++) { TableRow row = (TableRow) rows[j]; TableColumn[] columns = row.getColumns(); boolean isCourse = false; TimeTable timeTable = null; for (int k = 0; k < columns.length; k++) { Node columnNode = columns[k]; String info = columnNode.toPlainTextString().trim(); // System.out.println(info+"=="+k); if (k == 1 && info.indexOf("[") != -1) { timeTable = new TimeTable(); String courseCode = info.substring(1, 9); String coursesname = info.substring(10); timeTable.setCourseName(coursesname); timeTable.setCourseCode(courseCode); isCourse = true; } if (k == 2 && isCourse) { double credit = Double.parseDouble(info); timeTable.setCredit(credit); } if (k == 3 && isCourse) { timeTable.setType(info); } if (k == 4 && isCourse) { timeTable.setTeacher(info); } if (k == 5 && isCourse) { timeTable.setClassId(info); } if (k == 6 && isCourse) { timeTable.setClassNum(info); } if (k == 11 && isCourse) { List<TimeAndAdress> ta_list = praseStr(info); for (TimeAndAdress ta : ta_list) { timeTable.setAddress(ta.getAddress()); timeTable.setTime(ta.getTime()); timeTable.setCycle(ta.getCycle()); timeTable.setSingleDouble(ta.getSingleDouble()); 
timeTable.setWeek(ta.getWeek()); list.add(timeTable.clone()); } } } // end for k } // end for j } } return list; }