public static void dealOnePage(String url, int starNo) { try { Parser parser = new Parser((HttpURLConnection) (new URL(url)).openConnection()); NodeList tableSet = parser.extractAllNodesThatMatch(new HasAttributeFilter("bgcolor", "#DDE1FF")); parser = new Parser(new Lexer(tableSet.toHtml())); NodeList tdSet = parser.extractAllNodesThatMatch(new HasAttributeFilter("tr")); parser = new Parser(new Lexer(tdSet.toHtml())); PrototypicalNodeFactory p = new PrototypicalNodeFactory(); p.registerTag(new SpanTag()); parser.setNodeFactory(p); NodeList spanSet = parser.extractAllNodesThatMatch(new HasAttributeFilter("span")); int index = 0; for (int i = 5; i < spanSet.size(); i = i + 5) { String str = spanSet.elementAt(i).toPlainTextString(); String now = "" + (starNo * 100 + index); index++; while (str.compareTo(now) != 0) { System.out.println(now); now = "" + (starNo * 100 + index); index++; } // System.out.println(str); } } catch (ParserException e) { e.printStackTrace(); } catch (MalformedURLException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } }
/**
 * Collects article links from a remote page.
 *
 * <p>The page at {@code url} is read as GB2312 and its link tags are extracted. Those tags are
 * serialised back to HTML and parsed a second time; every resulting link whose lower-cased
 * address starts with {@code pre} and whose text is not "详细内容" (ignoring case) is stored in
 * {@code LINKHASH} under its address.
 *
 * @param url address of the page to scan
 * @param pre prefix that the lower-cased link address must start with
 * @throws Exception if the page cannot be fetched or parsed
 */
void doc(String url, String pre) throws Exception {
    Parser pageParser = new Parser();
    pageParser.setURL(url);
    pageParser.setEncoding("GB2312");
    NodeList firstPass = pageParser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
    if (firstPass == null || firstPass.size() <= 0) {
        return;
    }

    // Second pass over the serialised link tags only.
    Parser linkParser = new Parser();
    linkParser.setInputHTML(firstPass.toHtml());
    NodeList anchors = linkParser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
    if (anchors == null || anchors.size() <= 0) {
        return;
    }

    for (int idx = 0; idx < anchors.size(); idx++) {
        LinkTag anchor = (LinkTag) anchors.elementAt(idx);
        if (!anchor.getLink().toLowerCase().startsWith(pre)
                || anchor.getLinkText().equalsIgnoreCase("详细内容")) {
            continue;
        }
        LinkBean entry = new LinkBean();
        entry.setLink(anchor.getLink());
        entry.setName(anchor.getLinkText());
        LINKHASH.put(anchor.getLink(), entry);
    }
}
/**
 * Collects article links from an HTML string.
 *
 * <p>Link tags are extracted from {@code content}, serialised back to HTML and parsed a second
 * time. Every resulting link whose lower-cased address starts with {@code pre} and whose text is
 * not "详细内容" (ignoring case) is considered: if {@code articleDocCache} has no entry for the
 * link's key it is stored in {@code LINKHASH} under its address, otherwise an info message is
 * logged and the link is skipped.
 *
 * @param content HTML to scan
 * @param pre prefix that the lower-cased link address must start with
 * @throws Exception if the HTML cannot be parsed
 */
void docByHTML(String content, String pre) throws Exception {
    Parser htmlParser = new Parser();
    htmlParser.setInputHTML(content);
    htmlParser.setEncoding("GB2312");
    NodeList firstPass = htmlParser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
    if (firstPass == null || firstPass.size() <= 0) {
        return;
    }

    // Second pass over the serialised link tags only.
    Parser linkParser = new Parser();
    linkParser.setInputHTML(firstPass.toHtml());
    NodeList anchors = linkParser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
    if (anchors == null || anchors.size() <= 0) {
        return;
    }

    for (int idx = 0; idx < anchors.size(); idx++) {
        LinkTag anchor = (LinkTag) anchors.elementAt(idx);
        if (!anchor.getLink().toLowerCase().startsWith(pre)
                || anchor.getLinkText().equalsIgnoreCase("详细内容")) {
            continue;
        }
        if (null != articleDocCache.get(getKey(anchor.getLink()))) {
            logger.info(">> 已存在 [" + anchor.getLink() + "] 地址");
            continue;
        }
        LinkBean entry = new LinkBean();
        entry.setLink(anchor.getLink());
        entry.setName(anchor.getLinkText());
        LINKHASH.put(anchor.getLink(), entry);
    }
}
/** Test a better method of modifying an HTML page. */ public void testPageModification() throws Exception { Parser parser = Parser.createParser(HTML_WITH_LINK, null); NodeList list = parser.parse(null); // no filter // make an inner class that does the same thing as the UrlModifyingVisitor NodeVisitor visitor = new NodeVisitor() { String linkPrefix = "localhost://"; public void visitTag(Tag tag) { if (tag instanceof LinkTag) ((LinkTag) tag).setLink(linkPrefix + ((LinkTag) tag).getLink()); else if (tag instanceof ImageTag) ((ImageTag) tag).setImageURL(linkPrefix + ((ImageTag) tag).getImageURL()); } }; list.visitAllNodesWith(visitor); String result = list.toHtml(); assertStringEquals("Expected HTML", MODIFIED_HTML, result); }
public static HashMap<String, String> SouthamptonGetDetails(String[] url) { while (true) { try { HashMap<String, String> result = new HashMap<String, String>(); RequestConfig requestConfig = RequestConfig.custom().setSocketTimeout(10000).setConnectTimeout(10000).build(); CloseableHttpClient httpclient = HttpClients.custom().setDefaultRequestConfig(requestConfig).build(); HttpGet httpGet = new HttpGet(url[1]); HttpResponse response = httpclient.execute(httpGet); HttpEntity entity = response.getEntity(); String htmls = null; if (entity != null) { htmls = EntityUtils.toString(entity).replace("\t", " "); // System.out.println(htmls); } System.out.println("Got reply!"); // htmls=HTMLFilter(htmls); Parser parser = null; // **********************************get school********************** parser = Parser.createParser(htmls.replace("span", "form"), "utf-8"); AndFilter SFilter = new AndFilter( new TagNameFilter("form"), // table class="CSCPreviewTable grey" new HasAttributeFilter("class", "first-owner")); NodeList nodes4 = parser.extractAllNodesThatMatch(SFilter); if (nodes4.size() > 0) { String school = html2Str(nodes4.elementAt(0).toHtml()); result.put("School", school); } // **********************************get entry structure********************** parser = Parser.createParser(htmls, "utf-8"); AndFilter ESFilter = new AndFilter( new TagNameFilter("div"), // table class="CSCPreviewTable grey" new HasAttributeFilter("class", "body__inner w-doublewide copy")); NodeList nodes1 = parser.extractAllNodesThatMatch(ESFilter); String structure = ""; String[] ProgramURL = null; if (nodes1.size() > 0) { String AllContents = nodes1.toHtml(); String[] SP = AllContents.split("<h2 id="); for (int i = 1; i < SP.length; i++) { String row = "<h2 id=" + SP[i]; if (row.contains("<h2 id=\"requirements\">Requirements</h2>")) // Structure { structure = (html2Str( row.replace("<br />", "\r\n") .replace("</strong>", "") .replace("<strong>", "") .replace("</", "\r\n</") .replace("\t", " ") 
.replace("&", " ")) .replace("\r\n\r\n", "\r\n")); structure = HTMLFilter(structure); result.put("Structure", structure); } // <a href="/program/BSC">Bachelor of Science (BSC)</a> else if (row.contains("<h2 id=\"relevant-degrees\">Relevant Degrees</h2>")) { parser = Parser.createParser(row, "utf-8"); AndFilter ProfessionNameFilter = new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("href")); NodeList nodes5 = parser.extractAllNodesThatMatch(ProfessionNameFilter); for (int j = 0; j < nodes4.size(); j++) { LinkTag link = (LinkTag) nodes5.elementAt(j); if (!link.getAttribute("href").equals("#")) { String code = link.getAttribute("href").replace("/program/", ""); ProgramURL = getProgram(code); result.put("Scholarship", code); // title=HTMLFilter(html2Str(link.toHtml())); } } } } } // ****************IELTS result.put("IELTS Average Requirement", "6.5"); result.put("IELTS Lowest Requirement", "6.0"); // **************************get title & type********************** result.put("Title", url[4] + " " + url[2]); result.put("Level", url[3]); if (ProgramURL != null) { result.put("Type", ProgramURL[2]); result.put("Length (months)", ProgramURL[5]); } httpclient.close(); return result; } catch (Exception ee) { System.out.println("Retrying..." + url[0]); ee.printStackTrace(); } } } // ...
@Override public List<JclqScheduleItem> fetchJclqSchedule(String officialDate) throws FetchFailedException { Map<String, String> headerParams = new HashMap<String, String>(); headerParams.put("Referer", "http://info.sporttery.cn"); headerParams.put( "User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.168 Safari/535.19"); List<JclqScheduleItem> jclqScheduleItemList = new ArrayList<JclqScheduleItem>(); String encoding = "gbk"; JclqScheduleItem jclqScheduleItem = null; String siteName = "中国竞彩网网[竞彩篮球赛程]"; String logHeader = siteName + SCHEDULE_URL; try { String webInfo = CoreFetcherUtils.URLGetWithHeaderParams(SCHEDULE_URL, headerParams, null, encoding); if (webInfo == null || webInfo.indexOf("404 Not Found") > 0) { logger.error(logHeader + ",data is null or 404 Not Found"); throw new FetchFailedException("404 Not Found"); } Parser parser = Parser.createParser(webInfo, encoding); NodeList nodeList = parser.extractAllNodesThatMatch(new CssSelectorNodeFilter("div[class='box-tbl']")); if (null != nodeList && nodeList.size() > 0) { NodeFilter tableFilter = new TagNameFilter("table"); Parser parser2 = Parser.createParser(nodeList.toHtml(), encoding); NodeList tableNodeList = parser2.extractAllNodesThatMatch(tableFilter); if (tableNodeList != null && tableNodeList.size() > 0) { TableTag catchTableTag = new TableTag(); catchTableTag = (TableTag) tableNodeList.elementAt(0); if (catchTableTag != null) { TableRow[] catchRows = catchTableTag.getRows(); TableColumn[] catchColumns = null; for (int i = 2; i < catchRows.length; i++) { catchColumns = catchRows[i].getColumns(); if (catchColumns != null && catchColumns.length >= 6) { jclqScheduleItem = new JclqScheduleItem(); String officialNum = catchColumns[0].toPlainTextString().trim(); if (officialNum.length() < 5) { continue; } // 先解析比赛时间 String matchDateStr = catchColumns[3].toPlainTextString().trim(); String[] yearStr = matchDateStr.split("-"); if 
(yearStr[0].length() <= 2) { matchDateStr = "20" + matchDateStr + ":00"; } else { matchDateStr = matchDateStr + ":00"; } Date matchDate = CoreDateUtils.parseDate(matchDateStr, CoreDateUtils.DATETIME); Calendar matchDateCalendar = Calendar.getInstance(); matchDateCalendar.setTime(matchDate); matchDateCalendar.add(Calendar.MINUTE, 1); jclqScheduleItem.setMatchDate(matchDateCalendar.getTime()); // 根据周几、当前时间和比赛时间计算官方发布的日期 Calendar cd = Calendar.getInstance(); // 将时分秒等区域清零 cd.set(Calendar.HOUR_OF_DAY, 0); cd.set(Calendar.MINUTE, 0); cd.set(Calendar.SECOND, 0); cd.set(Calendar.MILLISECOND, 0); int nowWeekDay = cd.get(Calendar.DAY_OF_WEEK); int fetchWeekDay = weekDay.get(officialNum.substring(0, 2)); if (nowWeekDay != fetchWeekDay) { int m = fetchWeekDay - nowWeekDay; if (m < -1) { cd.add(Calendar.DATE, m + 7); } else { cd.add(Calendar.DATE, m); } } // 如果计算出来的日期超过了比赛时间,减去一周 if (cd.after(matchDateCalendar)) { cd.add(Calendar.DATE, -7); } // 如果计算出来的日期距离比赛时间相隔超过一周,加上一周的倍数 // 一周的毫秒数 long weekTimeInMillis = 3600 * 1000 * 24 * 7; long diffTimeInMillis = matchDateCalendar.getTimeInMillis() - cd.getTimeInMillis(); if (diffTimeInMillis > weekTimeInMillis) { // 计算相差几周 int diffWeekCount = (int) (diffTimeInMillis / weekTimeInMillis); cd.add(Calendar.DATE, 7 * diffWeekCount); } jclqScheduleItem.setMatchNum( CoreDateUtils.formatDate(cd.getTime(), "yyyyMMdd") + LotteryConstant.JCLQ_MATCH_NUM_CODE_DEFAULT + officialNum.substring(2)); jclqScheduleItem.setOfficialDate( CoreDateUtils.parseDate(CoreDateUtils.formatDate(cd.getTime()))); Integer oNum = null; try { oNum = Integer.valueOf(officialNum.substring(2)); } catch (Exception e) { logger.error("截取官方编码时,转换为Integer错误", e); } jclqScheduleItem.setOfficialNum(oNum); jclqScheduleItem.setMatchName( JclqUtil.convertMatchName( catchColumns[1].toPlainTextString().trim(), LotteryType.JCLQ_SF, FetcherType.T_PENGINEAPI)); String team = catchColumns[2].toPlainTextString().trim(); String[] teamStr = team.split("VS"); 
jclqScheduleItem.setAwayTeam(teamStr[0].trim()); jclqScheduleItem.setHomeTeam(teamStr[1].trim()); if ("已开售".equals(catchColumns[4].toPlainTextString().trim())) { jclqScheduleItem.setStatus(JclqRaceStatus.OPEN); } else { jclqScheduleItem.setStatus(JclqRaceStatus.UNOPEN); } if (catchColumns[5].toPlainTextString().trim().indexOf("胜负单关") > 0) { jclqScheduleItem.setDynamicSaleSfStatus(JclqDynamicSaleStatus.SALE_UNOPEN); } else { jclqScheduleItem.setDynamicSaleSfStatus(JclqDynamicSaleStatus.SALE_OPEN); } if (catchColumns[5].toPlainTextString().trim().indexOf("胜负过关") > 0) { jclqScheduleItem.setStaticSaleSfStatus(JclqStaticSaleStatus.SALE_UNOPEN); } else { jclqScheduleItem.setStaticSaleSfStatus(JclqStaticSaleStatus.SALE_OPEN); } if (catchColumns[5].toPlainTextString().trim().indexOf("让分胜负单关") > 0) { jclqScheduleItem.setDynamicSaleRfsfStatus(JclqDynamicSaleStatus.SALE_UNOPEN); } else { jclqScheduleItem.setDynamicSaleRfsfStatus(JclqDynamicSaleStatus.SALE_OPEN); } if (catchColumns[5].toPlainTextString().trim().indexOf("让分胜负过关") > 0) { jclqScheduleItem.setStaticSaleRfsfStatus(JclqStaticSaleStatus.SALE_UNOPEN); } else { jclqScheduleItem.setStaticSaleRfsfStatus(JclqStaticSaleStatus.SALE_OPEN); } if (catchColumns[5].toPlainTextString().trim().indexOf("胜分差单关") > 0) { jclqScheduleItem.setDynamicSaleSfcStatus(JclqDynamicSaleStatus.SALE_UNOPEN); } else { jclqScheduleItem.setDynamicSaleSfcStatus(JclqDynamicSaleStatus.SALE_OPEN); } if (catchColumns[5].toPlainTextString().trim().indexOf("胜分差过关") > 0) { jclqScheduleItem.setStaticSaleSfcStatus(JclqStaticSaleStatus.SALE_UNOPEN); } else { jclqScheduleItem.setStaticSaleSfcStatus(JclqStaticSaleStatus.SALE_OPEN); } if (catchColumns[5].toPlainTextString().trim().indexOf("大小分单关") > 0) { jclqScheduleItem.setDynamicSaleDxfStatus(JclqDynamicSaleStatus.SALE_UNOPEN); } else { jclqScheduleItem.setDynamicSaleDxfStatus(JclqDynamicSaleStatus.SALE_OPEN); } if (catchColumns[5].toPlainTextString().trim().indexOf("大小分过关") > 0) { 
jclqScheduleItem.setStaticSaleDxfStatus(JclqStaticSaleStatus.SALE_UNOPEN); } else { jclqScheduleItem.setStaticSaleDxfStatus(JclqStaticSaleStatus.SALE_OPEN); } jclqScheduleItemList.add(jclqScheduleItem); } } // end for catchRows } // end if catchTableTag!=null } // end if(tableNodeList!=null&&tableNodeList.size()>0) } else { logger.error(logHeader + "竞彩篮球赛程数据表格不存在,返回null"); throw new FetchFailedException("竞彩篮球赛程数据表格不存在"); } } catch (Exception e) { logger.error(logHeader + "竞彩篮球赛程错误" + e.getMessage(), e); throw new FetchFailedException(e.getMessage()); } return jclqScheduleItemList; }
public static HashMap<String, String> SouthamptonGetDetails2(String[] url) // for ECS { while (true) { try { HashMap<String, String> result = new HashMap<String, String>(); RequestConfig requestConfig = RequestConfig.custom().setSocketTimeout(10000).setConnectTimeout(10000).build(); CloseableHttpClient httpclient = HttpClients.custom().setDefaultRequestConfig(requestConfig).build(); HttpGet httpGet = new HttpGet(url[1]); HttpResponse response = httpclient.execute(httpGet); HttpEntity entity = response.getEntity(); String htmls = null; if (entity != null) { htmls = EntityUtils.toString(entity).replace("\t", " "); // System.out.println(htmls); } System.out.println("Got reply!"); // htmls=HTMLFilter(htmls); Parser parser = null; HtmlPage page = new HtmlPage(parser); if (htmls.contains("September") || htmls.contains("september")) { result.put("Month of Entry", "9"); } else if (htmls.contains("October") || htmls.contains("october")) { result.put("Month of Entry", "10"); } else { result.put("Month of Entry", ""); } // div class="widgetCourse" h1 // **************************get title & type********************** parser = Parser.createParser(htmls, "utf-8"); AndFilter TitleFilter = new AndFilter( new TagNameFilter("h2"), new HasAttributeFilter("class", "uos-sia-title")); NodeList nodes4 = parser.extractAllNodesThatMatch(TitleFilter); if (nodes4.size() > 0) { String title = HTMLFilter(html2Str(nodes4.toHtml())); result.put("Title", title); result.put("Type", GetType(title)); } // **********************************get school********************** result.put("School", "Electronics and Computer Science (ECS)"); // **********************************get fee********************** Pattern p = Pattern.compile("£[0-9]+"); Matcher m = p.matcher(htmls.replace(",", "")); ArrayList<Integer> money = new ArrayList<Integer>(); while (m.find()) { money.add(Integer.parseInt(m.group().replace("£", ""))); } int max = 0; for (int w = 0; w < money.size(); w++) { if (money.get(w) > max) { max = 
money.get(w); } } if (max != 0) { System.out.println(max); result.put("Tuition Fee", "" + max); } // **************************get entry********************** parser = Parser.createParser(htmls, "utf-8"); AndFilter EntryFilter = new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "pane_entry")); NodeList nodes5 = parser.extractAllNodesThatMatch(EntryFilter); String entryAll = ""; if (nodes5.size() > 0) { for (int i = 0; i < nodes5.size(); i++) { Node node = (Node) nodes5.elementAt(i); entryAll = (html2Str(node.toHtml().replace(">", "> "))).replace("\r", ""); entryAll = entryAll.replace("\n", " "); entryAll = HTMLFilter(entryAll); result.put("Academic Entry Requirement", entryAll); } } // **************************get entry********************** parser = Parser.createParser(htmls, "utf-8"); AndFilter StructureFilter = new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "pane_modules")); NodeList nodes6 = parser.extractAllNodesThatMatch(StructureFilter); String structure = ""; if (nodes6.size() > 0) { for (int i = 0; i < nodes6.size(); i++) { Node node = (Node) nodes6.elementAt(i); structure = (html2Str( node.toHtml() .replace("<br />", "\r\n") .replace("</strong>", "") .replace("<strong>", "") .replace("</", "\r\n</") .replace("\t", " ") .replace("&", " ")) .replace("\r\n\r\n", "\r\n")); structure = HTMLFilter(structure); result.put("Structure", structure); } } // *****************Length String length = getLastYear(structure); result.put("Length (months)", length); // ****************IELTS String International = entryAll; ArrayList<String> list = new ArrayList<String>(); if (International.contains("7.5")) { list.add("7.5"); } if (International.contains("7.0") || International.contains(" 7 ")) { list.add("7.0"); } if (International.contains("6.5")) { list.add("6.5"); } if (International.contains("6.0") || International.contains(" 6 ")) { list.add("6.0"); } if (International.contains("5.5")) { list.add("5.5"); } if (list.size() == 1) 
{ result.put("IELTS Average Requirement", list.get(0)); result.put("IELTS Lowest Requirement", list.get(0)); } else if (list.size() >= 2) { result.put("IELTS Average Requirement", list.get(0)); result.put("IELTS Lowest Requirement", list.get(1)); } else { result.put("IELTS Average Requirement", "6.0"); result.put("IELTS Lowest Requirement", "5.5"); } // finance/ result.put("Level", "Undergraduate"); result.put("Scholarship", ""); httpclient.close(); return result; } catch (Exception ee) { System.out.println("Retrying..."); ee.printStackTrace(); } } } // ...
/**
 * Fetches a search result page and rebuilds a simplified HTML fragment from it.
 *
 * <p>The page at {@code SearchHelper.SEARCH_URL_BAIDU + param} is loaded and its {@code body}
 * extracted. Each {@code table} in the body is handled as follows: tables whose HTML contains
 * "div" are skipped; a table containing "相关搜索" is wrapped in {@code <div id="rs">}; every
 * other table is turned into a {@code <div class="content">} built from the nodes returned by
 * {@code extractHtml}, with links wrapped in {@code <h3 class="t">} and links whose text is
 * "百度快照" dropped. Finally every {@code p} element whose HTML contains "page" is appended
 * inside {@code <p id="page">}.
 *
 * <p>Exceptions are printed and not rethrown; the model then holds whatever was assembled up to
 * that point.
 *
 * @param param text appended to the search URL
 * @param type passed through to {@code extractHtml}
 * @return a model whose content is the assembled HTML
 */
public ContentModel listHtml(String param, String type) {
    ContentModel model = new ContentModel();
    StringBuilder html = new StringBuilder();
    try {
        Parser parser = new Parser();
        parser.setURL(SearchHelper.SEARCH_URL_BAIDU + param);
        parser.setEncoding(parser.getEncoding());
        NodeList list = parser.extractAllNodesThatMatch(new TagNameFilter("body"));
        String body = list.toHtml();

        // Result tables.
        Parser content = new Parser();
        content.setInputHTML(body);
        content.setEncoding(parser.getEncoding());
        NodeList content_list = content.extractAllNodesThatMatch(new TagNameFilter("table"));
        for (int i = 0; i < content_list.size(); i++) {
            Node table = content_list.elementAt(i);
            String s = table.toHtml();
            if (s.indexOf("div") != -1) {
                continue;
            }
            if (s.indexOf("相关搜索") != -1) {
                html.append("<div id=\"rs\">").append(s).append("</div>");
                continue;
            }
            html.append("<div class=\"content\">");
            for (Node n : extractHtml(table, type)) {
                if (n instanceof LinkTag) {
                    if (n.toPlainTextString().equals("百度快照")) {
                        continue;
                    }
                    html.append("<h3 class=\"t\">").append(n.toHtml()).append("</h3>");
                } else {
                    html.append(n.toHtml());
                }
            }
            html.append("<br/></div><br>");
        }

        // Pagination block.
        Parser page = new Parser();
        page.setInputHTML(body);
        page.setEncoding(parser.getEncoding());
        NodeList page_list = page.extractAllNodesThatMatch(new TagNameFilter("p"));
        for (int i = 0; i < page_list.size(); i++) {
            String s = page_list.elementAt(i).toHtml();
            if (s.indexOf("page") == -1) {
                continue;
            }
            // Close the wrapper with </p>: the former stray </div> did not match the opening
            // <p> and could close an enclosing div of the page embedding this fragment.
            html.append("<p id=\"page\">").append(s).append("</p>");
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    model.setContent(html.toString());
    return model;
}