// 获取页面指定内容的Link public static List getLinksByConditions(String result, String coditions, String codeKind) { List links = null; Parser parser; NodeList nodelist; // 页面编码配置 To do by shengf parser = Parser.createParser(result, codeKind); NodeFilter linkFilter = new NodeClassFilter(LinkTag.class); try { links = new ArrayList(); nodelist = parser.parse(linkFilter); Node[] nodes = nodelist.toNodeArray(); int count = 1; for (int i = 0; i < nodes.length; i++) { Node node = nodes[i]; if (node instanceof LinkTag) { LinkTag link = (LinkTag) node; if (link.toHtml().indexOf(coditions) != -1) { links.add(link); count++; if (count > CatchNum) { return links; } } } } } catch (ParserException e) { // TODO Auto-generated catch block e.printStackTrace(); } return links; }
/**
 * Verifies that visiting {@code HTML_WITH_LINK} with a {@code UrlModifyingVisitor} configured
 * with the prefix {@code "localhost://"} produces {@code MODIFIED_HTML}.
 */
public void testUrlModificationWithVisitor() throws Exception {
  Parser htmlParser = Parser.createParser(HTML_WITH_LINK, null);
  UrlModifyingVisitor urlVisitor = new UrlModifyingVisitor("localhost://");
  htmlParser.visitAllNodesWith(urlVisitor);
  assertStringEquals("Expected HTML", MODIFIED_HTML, urlVisitor.getModifiedResult());
}
/** * 获取新闻的内容 * * @param newsContentFilter * @param parser * @return content 新闻内容 */ public String getNewsContent(NodeFilter newsContentFilter, Parser parser) { String content = null; StringBuilder builder = new StringBuilder(); try { NodeList newsContentList = parser.parse(newsContentFilter); for (int i = 0; i < newsContentList.size(); i++) { Div newsContenTag = (Div) newsContentList.elementAt(i); builder = builder.append(newsContenTag.getStringText()); } content = builder.toString(); // 转换为String 类型。 if (content != null) { parser.reset(); parser = Parser.createParser(content, "utf8"); StringBean sb = new StringBean(); sb.setCollapse(true); parser.visitAllNodesWith(sb); content = sb.getStrings(); // String s = "\";} else{ document.getElementById('TurnAD444').innerHTML = \"\";} } // showTurnAD444(intTurnAD444); }catch(e){}"; content = content.replaceAll("\\\".*[a-z].*\\}", ""); content = content.replace("[我来说两句]", ""); } else { System.out.println("没有得到新闻内容!"); } } catch (ParserException ex) { Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex); } return content; }
public static void main(String[] args) throws Exception { RequestConfig requestConfig = RequestConfig.custom().setCookieSpec(CookieSpecs.STANDARD_STRICT).build(); CloseableHttpClient httpclient = HttpClients.custom().setDefaultRequestConfig(requestConfig).build(); int count = 1; for (int i = 0; i <= 16; i++) { int index = i; // System.out.println(index); HttpGet httpGet = new HttpGet(url3 + index + url4); HttpResponse response = httpclient.execute(httpGet); HttpEntity entity = response.getEntity(); String htmls = null; if (entity != null) { htmls = EntityUtils.toString(entity).replace("\t", " "); } Parser parser = Parser.createParser(htmls, "utf-8"); AndFilter dFilter = new AndFilter(new TagNameFilter("h2"), new HasAttributeFilter("class", "field-content")); NodeList nodes3 = parser.extractAllNodesThatMatch(dFilter); for (int k = 0; k < nodes3.size(); k++) { htmls = nodes3.elementAt(k).toHtml(); parser = Parser.createParser(htmls, "utf-8"); AndFilter ProfessionNameFilter = new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("href")); NodeList nodes4 = parser.extractAllNodesThatMatch(ProfessionNameFilter); for (int j = 0; j < nodes4.size(); j++) { LinkTag link = (LinkTag) nodes4.elementAt(j); // if(link.getAttribute("href").contains("http://www.ulster.ac.uk/")) { // .replaceAll("<span[\\s\\S]*/span>","") String temp = link.toHtml(); System.out.println( "{\"" + count + "\",\"http://www.chi.ac.uk/" + link.getAttribute("href") + "\",\"" + html2Str(temp).replace("\r\n", "").trim() + "\",\"0\"},"); count++; } } } } // System.out.println("DONE."); }
/**
 * Parses the page and stores the {@code value} attribute of its
 * {@code <input id="__EVENTVALIDATION">} element in the static {@code eventValidation} field.
 *
 * @param html the page HTML; parsed with the gb2312 character set
 * @throws ParserException if parsing fails or the page contains no such input element
 */
public static void setEventValidation(String html) throws ParserException {
  Parser parser = Parser.createParser(html, "gb2312");
  AndFilter filter =
      new AndFilter(
          new TagNameFilter("input"), new HasAttributeFilter("id", "__EVENTVALIDATION"));
  NodeList nodes = parser.parse(filter);
  // Fail with a clear message instead of a NullPointerException when the field is missing.
  if (nodes.size() == 0 || !(nodes.elementAt(0) instanceof InputTag)) {
    throw new ParserException("No <input id=\"__EVENTVALIDATION\"> element found in page");
  }
  InputTag node = (InputTag) nodes.elementAt(0);
  eventValidation = node.getAttribute("value");
}
/**
 * Parses the page and stores the {@code value} attribute of its
 * {@code <input id="__VIEWSTATE">} element in the static {@code viewState} field.
 *
 * @param html the page HTML; parsed with the gb2312 character set
 * @throws Exception if parsing fails, or an {@code IllegalStateException} if the page contains
 *     no such input element
 */
public static void setViewState(String html) throws Exception {
  Parser parser = Parser.createParser(html, "gb2312");
  AndFilter filter =
      new AndFilter(new TagNameFilter("input"), new HasAttributeFilter("id", "__VIEWSTATE"));
  NodeList nodes = parser.parse(filter);
  // Fail with a clear message instead of a NullPointerException when the field is missing.
  if (nodes.size() == 0 || !(nodes.elementAt(0) instanceof InputTag)) {
    throw new IllegalStateException("No <input id=\"__VIEWSTATE\"> element found in page");
  }
  InputTag node = (InputTag) nodes.elementAt(0);
  viewState = node.getAttribute("value");
}
/**
 * Reads the {@code standardid} of every data row of the
 * {@code ctl00_ContentPlaceHolder1_StandardView} table and appends it to {@code map} under
 * {@code pageNo}.
 *
 * <p>The id is taken from the {@code href} of the link found in the fourth column of each row
 * (row 0 is skipped). Rows without a link, without an {@code href}, or whose {@code href} has no
 * {@code standardid=} parameter are recorded by row index in {@code error} under {@code pageNo}.
 *
 * @param pageNo key under which ids and failed row indexes are stored
 * @param html the page HTML; parsed with the gb2312 character set
 * @throws Exception if parsing fails or the page does not have the expected table layout
 */
private static void setStandardIdsToMap(Integer pageNo, String html) throws Exception {
  Parser parser = Parser.createParser(html, "gb2312");
  AndFilter tableFilter =
      new AndFilter(
          new TagNameFilter("table"),
          new HasAttributeFilter("id", "ctl00_ContentPlaceHolder1_StandardView"));
  NodeList nodes = parser.parse(tableFilter);
  TableTag table = (TableTag) nodes.elementAt(0);
  TableRow[] rows = table.getRows();
  for (int i = 1; i < rows.length; i++) {
    TableColumn[] cols = rows[i].getColumns();
    TableColumn col = cols[3];
    LinkTag tag = (LinkTag) ((Div) col.getChildren().elementAt(1)).getChildren().elementAt(2);
    String standardId = (tag == null) ? null : extractStandardId(tag.getAttribute("href"));
    if (standardId == null) {
      List<Integer> failedRows = error.get(pageNo);
      if (failedRows == null) {
        failedRows = new ArrayList<Integer>();
      }
      failedRows.add(i);
      error.put(pageNo, failedRows);
      continue;
    }
    List<String> ids = map.get(pageNo);
    if (ids == null) {
      ids = new ArrayList<String>();
    }
    ids.add(standardId);
    map.put(pageNo, ids);
  }
}

/**
 * Returns the value of the {@code standardid} query parameter of {@code href}, or {@code null}
 * when {@code href} is {@code null} or has no such parameter.
 */
private static String extractStandardId(String href) {
  if (href == null) {
    return null;
  }
  final String key = "standardid=";
  int keyStart = href.indexOf(key);
  if (keyStart < 0) {
    return null;
  }
  int valueStart = keyStart + key.length();
  // Look for the terminating '&' only after the parameter itself; an earlier '&' (or none at
  // all) previously made substring() throw.
  int valueEnd = href.indexOf('&', valueStart);
  if (valueEnd < 0) {
    valueEnd = href.length();
  }
  return href.substring(valueStart, valueEnd);
}
public static void setInnerHTML(Element root, String html) { // remove old root childs OverrideNodeList<Node> list = (OverrideNodeList<Node>) root.getChildNodes(); list.getList().clear(); if (html != null) { Parser parser = Parser.createParser(html, "UTF-8"); try { parser.visitAllNodesWith(new GwtNodeVisitor(root)); } catch (ParserException e) { throw new RuntimeException( "error while parsing <" + root.getTagName() + "> element's innerHTML : " + html, e); } } }
/** Test a better method of modifying an HTML page. */ public void testPageModification() throws Exception { Parser parser = Parser.createParser(HTML_WITH_LINK, null); NodeList list = parser.parse(null); // no filter // make an inner class that does the same thing as the UrlModifyingVisitor NodeVisitor visitor = new NodeVisitor() { String linkPrefix = "localhost://"; public void visitTag(Tag tag) { if (tag instanceof LinkTag) ((LinkTag) tag).setLink(linkPrefix + ((LinkTag) tag).getLink()); else if (tag instanceof ImageTag) ((ImageTag) tag).setImageURL(linkPrefix + ((ImageTag) tag).getImageURL()); } }; list.visitAllNodesWith(visitor); String result = list.toHtml(); assertStringEquals("Expected HTML", MODIFIED_HTML, result); }
/**
 * Reads rows 1 to 10 of the sixth {@code <table>} of the page and appends two values per row to
 * the list stored in {@code detailMap} under {@code key}.
 *
 * <p>The two values are the text found in the second and in the fourth column of the row; see
 * {@link #joinTextNodes(NodeList)} for how the text is assembled.
 *
 * @param key key of the list in {@code detailMap}; the list is created when absent
 * @param text the page HTML; parsed with the gb2312 character set
 * @throws Exception if parsing fails or the page does not have the expected table layout
 */
private static void addDetailToMap(String key, String text) throws Exception {
  NodeList tables = Parser.createParser(text, "gb2312").parse(new TagNameFilter("table"));
  TableRow[] rows = ((TableTag) tables.elementAt(5)).getRows();
  for (int rowIndex = 1; rowIndex < 11; rowIndex++) {
    TableColumn[] cols = rows[rowIndex].getColumns();
    String firstValue = joinTextNodes(cols[1].getChildren().elementAt(1).getChildren());
    String secondValue = joinTextNodes(cols[3].getChildren().elementAt(1).getChildren());

    List<String> values = detailMap.get(key);
    if (values == null) {
      values = new ArrayList<String>();
    }
    values.add(firstValue);
    values.add(secondValue);
    detailMap.put(key, values);
  }
}

/**
 * Concatenates the text of every {@code TextNode} directly contained in {@code children},
 * separated by single spaces, and trims the result.
 */
private static String joinTextNodes(NodeList children) {
  StringBuilder joined = new StringBuilder();
  for (int j = 0; j < children.size(); j++) {
    if (children.elementAt(j) instanceof TextNode) {
      joined.append(children.elementAt(j).getText()).append(" ");
    }
  }
  return joined.toString().trim();
}
// 土地交易单独处理 public static List getLinksByConditions2(String result, String coditions, String codeKind) { List links = null; Parser parser; NodeList nodelist; parser = Parser.createParser(result, codeKind); NodeFilter linkFilter = new NodeClassFilter(LinkTag.class); try { links = new ArrayList(); nodelist = parser.parse(linkFilter); Node[] nodes = nodelist.toNodeArray(); int count = 1; for (int i = 0; i < nodes.length; i++) { Node node = nodes[i]; if (node instanceof LinkTag) { LinkTag link = (LinkTag) node; if ((link.toHtml().indexOf(coditions) != -1) && (link.getChildrenHTML().indexOf("查看") == -1)) { // System.out.println(link.toHtml()); // System.out.println(link.getLink()); // System.out.println("test:" + link.getChildrenHTML()); // Node nextNode = link.getParent().getNextSibling(); // System.out.println(nextNode.getChildren().toHtml().replaceAll("/r/n","").trim()); // nextNode = // nextNode.getNextSibling().getNextSibling(); // System.out.println(nextNode.getChildren().toHtml().replaceAll("/r/n","").trim()); links.add(link); count++; if (count > CatchNum) { return links; } } } } } catch (ParserException e) { // TODO Auto-generated catch block e.printStackTrace(); } return links; }
public static HashMap<String, String> SouthamptonGetDetails(String[] url) { while (true) { try { HashMap<String, String> result = new HashMap<String, String>(); RequestConfig requestConfig = RequestConfig.custom().setSocketTimeout(10000).setConnectTimeout(10000).build(); CloseableHttpClient httpclient = HttpClients.custom().setDefaultRequestConfig(requestConfig).build(); HttpGet httpGet = new HttpGet(url[1]); HttpResponse response = httpclient.execute(httpGet); HttpEntity entity = response.getEntity(); String htmls = null; if (entity != null) { htmls = EntityUtils.toString(entity).replace("\t", " "); // System.out.println(htmls); } System.out.println("Got reply!"); // htmls=HTMLFilter(htmls); Parser parser = null; // **********************************get school********************** parser = Parser.createParser(htmls.replace("span", "form"), "utf-8"); AndFilter SFilter = new AndFilter( new TagNameFilter("form"), // table class="CSCPreviewTable grey" new HasAttributeFilter("class", "first-owner")); NodeList nodes4 = parser.extractAllNodesThatMatch(SFilter); if (nodes4.size() > 0) { String school = html2Str(nodes4.elementAt(0).toHtml()); result.put("School", school); } // **********************************get entry structure********************** parser = Parser.createParser(htmls, "utf-8"); AndFilter ESFilter = new AndFilter( new TagNameFilter("div"), // table class="CSCPreviewTable grey" new HasAttributeFilter("class", "body__inner w-doublewide copy")); NodeList nodes1 = parser.extractAllNodesThatMatch(ESFilter); String structure = ""; String[] ProgramURL = null; if (nodes1.size() > 0) { String AllContents = nodes1.toHtml(); String[] SP = AllContents.split("<h2 id="); for (int i = 1; i < SP.length; i++) { String row = "<h2 id=" + SP[i]; if (row.contains("<h2 id=\"requirements\">Requirements</h2>")) // Structure { structure = (html2Str( row.replace("<br />", "\r\n") .replace("</strong>", "") .replace("<strong>", "") .replace("</", "\r\n</") .replace("\t", " ") 
.replace("&", " ")) .replace("\r\n\r\n", "\r\n")); structure = HTMLFilter(structure); result.put("Structure", structure); } // <a href="/program/BSC">Bachelor of Science (BSC)</a> else if (row.contains("<h2 id=\"relevant-degrees\">Relevant Degrees</h2>")) { parser = Parser.createParser(row, "utf-8"); AndFilter ProfessionNameFilter = new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("href")); NodeList nodes5 = parser.extractAllNodesThatMatch(ProfessionNameFilter); for (int j = 0; j < nodes4.size(); j++) { LinkTag link = (LinkTag) nodes5.elementAt(j); if (!link.getAttribute("href").equals("#")) { String code = link.getAttribute("href").replace("/program/", ""); ProgramURL = getProgram(code); result.put("Scholarship", code); // title=HTMLFilter(html2Str(link.toHtml())); } } } } } // ****************IELTS result.put("IELTS Average Requirement", "6.5"); result.put("IELTS Lowest Requirement", "6.0"); // **************************get title & type********************** result.put("Title", url[4] + " " + url[2]); result.put("Level", url[3]); if (ProgramURL != null) { result.put("Type", ProgramURL[2]); result.put("Length (months)", ProgramURL[5]); } httpclient.close(); return result; } catch (Exception ee) { System.out.println("Retrying..." + url[0]); ee.printStackTrace(); } } } // ...
/**
 * Downloads the schedule page at {@code SCHEDULE_URL} and converts the rows of the first
 * {@code <table>} inside {@code div[class='box-tbl']} into {@code JclqScheduleItem} objects.
 *
 * <p>Rows are read starting at index 2; rows with fewer than six columns or whose first column
 * is shorter than five characters are skipped. For each remaining row the method sets the match
 * time, the match number, the official date and number, the match name, both teams, the sale
 * status and eight per-play-type sale flags.
 *
 * @param officialDate not referenced anywhere in this method body
 * @return the parsed schedule items; empty when the {@code box-tbl} div is present but holds no
 *     table
 * @throws FetchFailedException when the page is {@code null}, is detected as a 404 page, has no
 *     {@code box-tbl} div, or when any other exception occurs while fetching or parsing
 */
@Override
public List<JclqScheduleItem> fetchJclqSchedule(String officialDate) throws FetchFailedException {
  Map<String, String> headerParams = new HashMap<String, String>();
  headerParams.put("Referer", "http://info.sporttery.cn");
  headerParams.put(
      "User-Agent",
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.168 Safari/535.19");
  List<JclqScheduleItem> jclqScheduleItemList = new ArrayList<JclqScheduleItem>();
  String encoding = "gbk";
  JclqScheduleItem jclqScheduleItem = null;
  String siteName = "中国竞彩网网[竞彩篮球赛程]";
  String logHeader = siteName + SCHEDULE_URL;
  try {
    String webInfo =
        CoreFetcherUtils.URLGetWithHeaderParams(SCHEDULE_URL, headerParams, null, encoding);
    // NOTE(review): "> 0" does not match a body that starts with "404 Not Found" — confirm
    // whether ">= 0" was intended.
    if (webInfo == null || webInfo.indexOf("404 Not Found") > 0) {
      logger.error(logHeader + ",data is null or 404 Not Found");
      throw new FetchFailedException("404 Not Found");
    }
    Parser parser = Parser.createParser(webInfo, encoding);
    NodeList nodeList =
        parser.extractAllNodesThatMatch(new CssSelectorNodeFilter("div[class='box-tbl']"));
    if (null != nodeList && nodeList.size() > 0) {
      // Re-parse only the matched div(s) and look for tables inside them.
      NodeFilter tableFilter = new TagNameFilter("table");
      Parser parser2 = Parser.createParser(nodeList.toHtml(), encoding);
      NodeList tableNodeList = parser2.extractAllNodesThatMatch(tableFilter);
      if (tableNodeList != null && tableNodeList.size() > 0) {
        // The freshly constructed TableTag is replaced immediately by the parsed one.
        TableTag catchTableTag = new TableTag();
        catchTableTag = (TableTag) tableNodeList.elementAt(0);
        if (catchTableTag != null) {
          TableRow[] catchRows = catchTableTag.getRows();
          TableColumn[] catchColumns = null;
          // Starts at row 2 — presumably the first two rows are headers; confirm against page.
          for (int i = 2; i < catchRows.length; i++) {
            catchColumns = catchRows[i].getColumns();
            if (catchColumns != null && catchColumns.length >= 6) {
              jclqScheduleItem = new JclqScheduleItem();
              String officialNum = catchColumns[0].toPlainTextString().trim();
              if (officialNum.length() < 5) {
                continue;
              }
              // Parse the match time first. A year part of at most two characters gets the
              // prefix "20"; ":00" is appended in both cases.
              String matchDateStr = catchColumns[3].toPlainTextString().trim();
              String[] yearStr = matchDateStr.split("-");
              if (yearStr[0].length() <= 2) {
                matchDateStr = "20" + matchDateStr + ":00";
              } else {
                matchDateStr = matchDateStr + ":00";
              }
              Date matchDate = CoreDateUtils.parseDate(matchDateStr, CoreDateUtils.DATETIME);
              Calendar matchDateCalendar = Calendar.getInstance();
              matchDateCalendar.setTime(matchDate);
              // The stored match time is one minute later than the parsed one.
              matchDateCalendar.add(Calendar.MINUTE, 1);
              jclqScheduleItem.setMatchDate(matchDateCalendar.getTime());
              // Derive the officially published date from the weekday prefix of the official
              // number, the current time and the match time.
              Calendar cd = Calendar.getInstance();
              // Zero the hour, minute, second and millisecond fields.
              cd.set(Calendar.HOUR_OF_DAY, 0);
              cd.set(Calendar.MINUTE, 0);
              cd.set(Calendar.SECOND, 0);
              cd.set(Calendar.MILLISECOND, 0);
              int nowWeekDay = cd.get(Calendar.DAY_OF_WEEK);
              // weekDay maps the first two characters of the official number to a value that
              // is compared with Calendar.DAY_OF_WEEK.
              int fetchWeekDay = weekDay.get(officialNum.substring(0, 2));
              if (nowWeekDay != fetchWeekDay) {
                int m = fetchWeekDay - nowWeekDay;
                // More than one day back in the week: use next week's occurrence instead.
                if (m < -1) {
                  cd.add(Calendar.DATE, m + 7);
                } else {
                  cd.add(Calendar.DATE, m);
                }
              }
              // If the computed date lies after the match time, go back one week.
              if (cd.after(matchDateCalendar)) {
                cd.add(Calendar.DATE, -7);
              }
              // If the computed date is more than a week before the match time, add whole
              // weeks.
              // Number of milliseconds in one week.
              long weekTimeInMillis = 3600 * 1000 * 24 * 7;
              long diffTimeInMillis =
                  matchDateCalendar.getTimeInMillis() - cd.getTimeInMillis();
              if (diffTimeInMillis > weekTimeInMillis) {
                // Number of whole weeks between the two dates.
                int diffWeekCount = (int) (diffTimeInMillis / weekTimeInMillis);
                cd.add(Calendar.DATE, 7 * diffWeekCount);
              }
              // Match number = yyyyMMdd of the official date + default code + the official
              // number without its two-character weekday prefix.
              jclqScheduleItem.setMatchNum(
                  CoreDateUtils.formatDate(cd.getTime(), "yyyyMMdd")
                      + LotteryConstant.JCLQ_MATCH_NUM_CODE_DEFAULT
                      + officialNum.substring(2));
              jclqScheduleItem.setOfficialDate(
                  CoreDateUtils.parseDate(CoreDateUtils.formatDate(cd.getTime())));
              Integer oNum = null;
              try {
                oNum = Integer.valueOf(officialNum.substring(2));
              } catch (Exception e) {
                // A non-numeric suffix is logged and the official number stays null.
                logger.error("截取官方编码时,转换为Integer错误", e);
              }
              jclqScheduleItem.setOfficialNum(oNum);
              jclqScheduleItem.setMatchName(
                  JclqUtil.convertMatchName(
                      catchColumns[1].toPlainTextString().trim(),
                      LotteryType.JCLQ_SF,
                      FetcherType.T_PENGINEAPI));
              // Column 2 holds both teams separated by "VS": away team first, home team second.
              String team = catchColumns[2].toPlainTextString().trim();
              String[] teamStr = team.split("VS");
              jclqScheduleItem.setAwayTeam(teamStr[0].trim());
              jclqScheduleItem.setHomeTeam(teamStr[1].trim());
              if ("已开售".equals(catchColumns[4].toPlainTextString().trim())) {
                jclqScheduleItem.setStatus(JclqRaceStatus.OPEN);
              } else {
                jclqScheduleItem.setStatus(JclqRaceStatus.UNOPEN);
              }
              // Column 5 is searched for play-type names; a name that is found marks that play
              // type as SALE_UNOPEN.
              // NOTE(review): every check below uses "> 0", so a name at the very start of the
              // column text is not detected; and "胜负单关"/"胜负过关" are substrings of
              // "让分胜负单关"/"让分胜负过关", so the shorter checks also fire on the longer
              // names — confirm against the real column format.
              if (catchColumns[5].toPlainTextString().trim().indexOf("胜负单关") > 0) {
                jclqScheduleItem.setDynamicSaleSfStatus(JclqDynamicSaleStatus.SALE_UNOPEN);
              } else {
                jclqScheduleItem.setDynamicSaleSfStatus(JclqDynamicSaleStatus.SALE_OPEN);
              }
              if (catchColumns[5].toPlainTextString().trim().indexOf("胜负过关") > 0) {
                jclqScheduleItem.setStaticSaleSfStatus(JclqStaticSaleStatus.SALE_UNOPEN);
              } else {
                jclqScheduleItem.setStaticSaleSfStatus(JclqStaticSaleStatus.SALE_OPEN);
              }
              if (catchColumns[5].toPlainTextString().trim().indexOf("让分胜负单关") > 0) {
                jclqScheduleItem.setDynamicSaleRfsfStatus(JclqDynamicSaleStatus.SALE_UNOPEN);
              } else {
                jclqScheduleItem.setDynamicSaleRfsfStatus(JclqDynamicSaleStatus.SALE_OPEN);
              }
              if (catchColumns[5].toPlainTextString().trim().indexOf("让分胜负过关") > 0) {
                jclqScheduleItem.setStaticSaleRfsfStatus(JclqStaticSaleStatus.SALE_UNOPEN);
              } else {
                jclqScheduleItem.setStaticSaleRfsfStatus(JclqStaticSaleStatus.SALE_OPEN);
              }
              if (catchColumns[5].toPlainTextString().trim().indexOf("胜分差单关") > 0) {
                jclqScheduleItem.setDynamicSaleSfcStatus(JclqDynamicSaleStatus.SALE_UNOPEN);
              } else {
                jclqScheduleItem.setDynamicSaleSfcStatus(JclqDynamicSaleStatus.SALE_OPEN);
              }
              if (catchColumns[5].toPlainTextString().trim().indexOf("胜分差过关") > 0) {
                jclqScheduleItem.setStaticSaleSfcStatus(JclqStaticSaleStatus.SALE_UNOPEN);
              } else {
                jclqScheduleItem.setStaticSaleSfcStatus(JclqStaticSaleStatus.SALE_OPEN);
              }
              if (catchColumns[5].toPlainTextString().trim().indexOf("大小分单关") > 0) {
                jclqScheduleItem.setDynamicSaleDxfStatus(JclqDynamicSaleStatus.SALE_UNOPEN);
              } else {
                jclqScheduleItem.setDynamicSaleDxfStatus(JclqDynamicSaleStatus.SALE_OPEN);
              }
              if (catchColumns[5].toPlainTextString().trim().indexOf("大小分过关") > 0) {
                jclqScheduleItem.setStaticSaleDxfStatus(JclqStaticSaleStatus.SALE_UNOPEN);
              } else {
                jclqScheduleItem.setStaticSaleDxfStatus(JclqStaticSaleStatus.SALE_OPEN);
              }
              jclqScheduleItemList.add(jclqScheduleItem);
            }
          } // end for catchRows
        } // end if catchTableTag!=null
      } // end if(tableNodeList!=null&&tableNodeList.size()>0)
    } else {
      logger.error(logHeader + "竞彩篮球赛程数据表格不存在,返回null");
      throw new FetchFailedException("竞彩篮球赛程数据表格不存在");
    }
  } catch (Exception e) {
    // This also catches the FetchFailedExceptions thrown above; they are logged a second time
    // and rethrown as a new FetchFailedException carrying only the message.
    logger.error(logHeader + "竞彩篮球赛程错误" + e.getMessage(), e);
    throw new FetchFailedException(e.getMessage());
  }
  return jclqScheduleItemList;
}
/** * Creates a list of Grids based on the given HTML string. This works only for table-based HTML * documents. * * @param html the HTML string. * @return a list of Grids. */ public static List<Grid> fromHtml(String html) throws Exception { if (html == null || html.trim().isEmpty()) { return null; } List<Grid> grids = new ArrayList<>(); Parser parser = Parser.createParser(html, "UTF-8"); Node[] tables = parser.extractAllNodesThatMatch(new TagNameFilter("table")).toNodeArray(); for (Node t : tables) { Grid grid = new ListGrid(); TableTag table = (TableTag) t; TableRow[] rows = table.getRows(); Integer firstColumnCount = null; for (TableRow row : rows) { if (getColumnCount(row) == 0) // Ignore if no cells { log.warn("Ignoring row with no columns"); continue; } Node[] cells = row.getChildren().extractAllNodesThatMatch(HTML_ROW_FILTER).toNodeArray(); if (firstColumnCount == null) // First row becomes header { firstColumnCount = getColumnCount(row); for (Node c : cells) { TagNode cell = (TagNode) c; grid.addHeader(new GridHeader(getValue(cell), false, false)); Integer colSpan = MathUtils.parseInt(cell.getAttribute("colspan")); if (colSpan != null && colSpan > 1) { grid.addEmptyHeaders((colSpan - 1)); } } } else // Rest becomes rows { if (firstColumnCount != getColumnCount(row)) // Ignore { log.warn( "Ignoring row which has " + row.getColumnCount() + " columns since table has " + firstColumnCount + " columns"); continue; } grid.addRow(); for (Node c : cells) { // TODO row span TagNode cell = (TagNode) c; grid.addValue(getValue(cell)); Integer colSpan = MathUtils.parseInt(cell.getAttribute("colspan")); if (colSpan != null && colSpan > 1) { grid.addEmptyValues((colSpan - 1)); } } } } grids.add(grid); } return grids; }
@Override protected LotteryDraw parseLotteryDrawResult(String html) { LotteryType lotteryType = this.getLotteryType(); Parser parser = null; try { parser = Parser.createParser(html, CharsetConstant.CHARSET_UTF8); } catch (Exception e) { logger.error("解析html内容出错: {}", html, e); return null; } LotteryDraw lotteryDraw = new LotteryDraw(); lotteryDraw.setLotteryType(lotteryType); // 解析基本信息 try { NodeFilter tInfoFilter = new HasAttributeFilter("class", "tInfo"); NodeList tInfoNodeList = parser.extractAllNodesThatMatch(tInfoFilter); if (tInfoNodeList.size() == 0) { return null; } parser.setInputHTML(tInfoNodeList.elementAt(0).toHtml()); // 取四个红色部分,依次为彩期、销售总额、开奖日期、开奖号码 NodeFilter redFilter = new HasAttributeFilter("class", "fc-red"); NodeList redNodeList = parser.extractAllNodesThatMatch(redFilter); if (redNodeList.size() < 4) { logger.error("解析的内容不符合要求: {}", tInfoNodeList.elementAt(0).toHtml()); return null; } lotteryDraw.setPhase(redNodeList.elementAt(0).toPlainTextString().trim()); lotteryDraw.setVolumeOfSales( StringUtils.replace(redNodeList.elementAt(1).toPlainTextString().trim(), ",", "")); Date drawDate = CoreDateUtils.parseDate( redNodeList.elementAt(2).toPlainTextString().trim(), "yyyy年MM月dd日"); if (drawDate != null) { lotteryDraw.setTimeDraw(CoreDateUtils.formatDateTime(drawDate)); } lotteryDraw.setResult( StringUtils.replace(redNodeList.elementAt(3).toPlainTextString().trim(), " ", ",")); } catch (ParserException e) { logger.error(e.getMessage(), e); return null; } // 解析详情信息 try { parser.setInputHTML(html); NodeFilter dInfoFilter = new HasAttributeFilter("class", "dInfo"); NodeList dInfoNodeList = parser.extractAllNodesThatMatch(dInfoFilter); if (dInfoNodeList.size() == 0) { return null; } parser.setInputHTML(dInfoNodeList.elementAt(0).toHtml()); NodeFilter prizeFilter = new TagNameFilter("p"); NodeList prizeNodeList = parser.extractAllNodesThatMatch(prizeFilter); if (prizeNodeList.size() == 0) { return null; } String[] splitted = prizeNodeList .elementAt(0) 
.toPlainTextString() .split("--------------------------------------------------"); if (splitted.length < 2) { logger.error("未解析到{}开奖详情: {}", lotteryType.getName()); return lotteryDraw; } splitted = StringUtils.split(splitted[1].trim(), " "); List<LotteryDrawPrizeItem> resultDetail = new ArrayList<LotteryDrawPrizeItem>(); int index = -1; LotteryDrawPrizeItem prizeItem = null; for (int i = 0; i < splitted.length; i++) { String s = splitted[i].trim(); if (s.length() == 0) { continue; } index++; if (index % 4 == 0) { // 一行有4列 index = 0; prizeItem = new LotteryDrawPrizeItem(); resultDetail.add(prizeItem); } switch (index) { case 0: prizeItem.setName(s); break; case 1: prizeItem.setWinningCount(StringUtils.replace(s, "注", "")); break; case 2: prizeItem.setPrizeAmount( CoreStringUtils.replaceAll( s, new String[][] { {"元", ""}, {",", ""} })); break; default: break; } } lotteryDraw.setResultDetail(resultDetail); } catch (ParserException e) { logger.error(e.getMessage(), e); } return lotteryDraw; }
@Override public LotteryDraw fetchResultDetail(String phase) { LotteryDraw lotteryDraw = null; lotteryDraw = nowPhaseResult(); if (phase == null || "".equals(phase) || lotteryDraw.getPhase().equals(phase)) { return lotteryDraw; } else { lotteryDraw = null; } String url = RESULT_MORE_LOCALITY_URL; String data = null; String pageInfo = "结果页面" + url; String encoding = "utf-8"; String logHeader = "==" + lotteryScope + "==" + siteName + "==" + pageInfo + "==抓取==" + getLotteryType().getName() + "=="; try { data = CoreFetcherUtils.URLGet(url, null, encoding); } catch (Exception e) { logger.error("获取html数据失败" + e.getMessage()); return null; } if (data == null || data.indexOf("404 Not Found") > 0 || data.isEmpty()) { logger.error(logHeader + "data is null or 404 Not Found"); return null; } Parser parser = null; try { parser = Parser.createParser(data, encoding); } catch (Exception e) { logger.error("解析html页面失败" + e.getMessage()); return null; } NodeFilter filter = new HasAttributeFilter("class", "mytable"); NodeList nodeList = null; try { nodeList = parser.extractAllNodesThatMatch(filter); TableTag tableTag = (TableTag) nodeList.elementAt(0); TableRow[] tableRows = tableTag.getRows(); for (int i = 1; i < tableRows.length; i++) { TableColumn[] tableColumns = tableRows[i].getColumns(); String phaseTmp = tableColumns[0].toPlainTextString(); if (phaseTmp != null && !"".equals(phaseTmp) && phase.equals(phaseTmp)) { lotteryDraw = new LotteryDraw(); // 彩期 lotteryDraw.setPhase(phaseTmp); // 开奖结果 String strResult = tableColumns[1].toPlainTextString(); strResult = strResult.trim().replace(" ", ","); lotteryDraw.setResult(strResult); // 彩种 lotteryDraw.setLotteryType(super.getLotteryType()); break; } } } catch (ParserException e) { logger.error("数据解析错误==" + e.getMessage(), e); return null; } return lotteryDraw; }
public static HashMap<String, String> SouthamptonGetDetails2(String[] url) // for ECS { while (true) { try { HashMap<String, String> result = new HashMap<String, String>(); RequestConfig requestConfig = RequestConfig.custom().setSocketTimeout(10000).setConnectTimeout(10000).build(); CloseableHttpClient httpclient = HttpClients.custom().setDefaultRequestConfig(requestConfig).build(); HttpGet httpGet = new HttpGet(url[1]); HttpResponse response = httpclient.execute(httpGet); HttpEntity entity = response.getEntity(); String htmls = null; if (entity != null) { htmls = EntityUtils.toString(entity).replace("\t", " "); // System.out.println(htmls); } System.out.println("Got reply!"); // htmls=HTMLFilter(htmls); Parser parser = null; HtmlPage page = new HtmlPage(parser); if (htmls.contains("September") || htmls.contains("september")) { result.put("Month of Entry", "9"); } else if (htmls.contains("October") || htmls.contains("october")) { result.put("Month of Entry", "10"); } else { result.put("Month of Entry", ""); } // div class="widgetCourse" h1 // **************************get title & type********************** parser = Parser.createParser(htmls, "utf-8"); AndFilter TitleFilter = new AndFilter( new TagNameFilter("h2"), new HasAttributeFilter("class", "uos-sia-title")); NodeList nodes4 = parser.extractAllNodesThatMatch(TitleFilter); if (nodes4.size() > 0) { String title = HTMLFilter(html2Str(nodes4.toHtml())); result.put("Title", title); result.put("Type", GetType(title)); } // **********************************get school********************** result.put("School", "Electronics and Computer Science (ECS)"); // **********************************get fee********************** Pattern p = Pattern.compile("£[0-9]+"); Matcher m = p.matcher(htmls.replace(",", "")); ArrayList<Integer> money = new ArrayList<Integer>(); while (m.find()) { money.add(Integer.parseInt(m.group().replace("£", ""))); } int max = 0; for (int w = 0; w < money.size(); w++) { if (money.get(w) > max) { max = 
money.get(w); } } if (max != 0) { System.out.println(max); result.put("Tuition Fee", "" + max); } // **************************get entry********************** parser = Parser.createParser(htmls, "utf-8"); AndFilter EntryFilter = new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "pane_entry")); NodeList nodes5 = parser.extractAllNodesThatMatch(EntryFilter); String entryAll = ""; if (nodes5.size() > 0) { for (int i = 0; i < nodes5.size(); i++) { Node node = (Node) nodes5.elementAt(i); entryAll = (html2Str(node.toHtml().replace(">", "> "))).replace("\r", ""); entryAll = entryAll.replace("\n", " "); entryAll = HTMLFilter(entryAll); result.put("Academic Entry Requirement", entryAll); } } // **************************get entry********************** parser = Parser.createParser(htmls, "utf-8"); AndFilter StructureFilter = new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "pane_modules")); NodeList nodes6 = parser.extractAllNodesThatMatch(StructureFilter); String structure = ""; if (nodes6.size() > 0) { for (int i = 0; i < nodes6.size(); i++) { Node node = (Node) nodes6.elementAt(i); structure = (html2Str( node.toHtml() .replace("<br />", "\r\n") .replace("</strong>", "") .replace("<strong>", "") .replace("</", "\r\n</") .replace("\t", " ") .replace("&", " ")) .replace("\r\n\r\n", "\r\n")); structure = HTMLFilter(structure); result.put("Structure", structure); } } // *****************Length String length = getLastYear(structure); result.put("Length (months)", length); // ****************IELTS String International = entryAll; ArrayList<String> list = new ArrayList<String>(); if (International.contains("7.5")) { list.add("7.5"); } if (International.contains("7.0") || International.contains(" 7 ")) { list.add("7.0"); } if (International.contains("6.5")) { list.add("6.5"); } if (International.contains("6.0") || International.contains(" 6 ")) { list.add("6.0"); } if (International.contains("5.5")) { list.add("5.5"); } if (list.size() == 1) 
{ result.put("IELTS Average Requirement", list.get(0)); result.put("IELTS Lowest Requirement", list.get(0)); } else if (list.size() >= 2) { result.put("IELTS Average Requirement", list.get(0)); result.put("IELTS Lowest Requirement", list.get(1)); } else { result.put("IELTS Average Requirement", "6.0"); result.put("IELTS Lowest Requirement", "5.5"); } // finance/ result.put("Level", "Undergraduate"); result.put("Scholarship", ""); httpclient.close(); return result; } catch (Exception ee) { System.out.println("Retrying..."); ee.printStackTrace(); } } } // ...