/**
 * Collects article links from an HTML fragment.
 *
 * <p>An anchor is considered when its lower-cased target starts with {@code pre} and its link
 * text is not "详细内容". If {@code articleDocCache} holds no entry for the target's key, a
 * {@code LinkBean} is stored in {@code LINKHASH} under the link URL; otherwise the hit is logged.
 *
 * @param content HTML text to scan
 * @param pre prefix the lower-cased link target has to start with
 * @throws Exception if the HTML cannot be parsed
 */
void docByHTML(String content, String pre) throws Exception {
  Parser firstPass = new Parser();
  firstPass.setInputHTML(content);
  firstPass.setEncoding("GB2312");
  NodeList anchors = firstPass.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
  if (anchors == null || anchors.size() == 0) {
    return;
  }

  // Second pass: the serialized anchors are parsed again and the link tags extracted anew.
  Parser secondPass = new Parser();
  secondPass.setInputHTML(anchors.toHtml());
  NodeList links = secondPass.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
  if (links == null || links.size() == 0) {
    return;
  }

  for (int idx = 0; idx < links.size(); idx++) {
    LinkTag anchor = (LinkTag) links.elementAt(idx);
    boolean prefixMatches = anchor.getLink().toLowerCase().startsWith(pre);
    if (!prefixMatches || anchor.getLinkText().equalsIgnoreCase("详细内容")) {
      continue;
    }
    if (articleDocCache.get(getKey(anchor.getLink())) != null) {
      logger.info(">> 已存在 [" + anchor.getLink() + "] 地址");
      continue;
    }
    LinkBean entry = new LinkBean();
    entry.setLink(anchor.getLink());
    entry.setName(anchor.getLinkText());
    LINKHASH.put(anchor.getLink(), entry);
  }
}
/**
 * Loads the page at {@code url} and collects its article links.
 *
 * <p>Each anchor whose lower-cased target starts with {@code pre} and whose link text is not
 * "详细内容" is stored in {@code LINKHASH} as a {@code LinkBean}, keyed by the link URL.
 *
 * @param url location of the page to scan
 * @param pre prefix the lower-cased link target has to start with
 * @throws Exception if the page cannot be fetched or parsed
 */
void doc(String url, String pre) throws Exception {
  Parser pageParser = new Parser();
  pageParser.setURL(url);
  pageParser.setEncoding("GB2312");
  NodeList anchors = pageParser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
  if (anchors == null || anchors.size() == 0) {
    return;
  }

  // Second pass: the serialized anchors are parsed again and the link tags extracted anew.
  Parser fragmentParser = new Parser();
  fragmentParser.setInputHTML(anchors.toHtml());
  NodeList links = fragmentParser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
  if (links == null || links.size() == 0) {
    return;
  }

  for (int idx = 0; idx < links.size(); idx++) {
    LinkTag anchor = (LinkTag) links.elementAt(idx);
    boolean prefixMatches = anchor.getLink().toLowerCase().startsWith(pre);
    if (!prefixMatches || anchor.getLinkText().equalsIgnoreCase("详细内容")) {
      continue;
    }
    LinkBean entry = new LinkBean();
    entry.setLink(anchor.getLink());
    entry.setName(anchor.getLinkText());
    LINKHASH.put(anchor.getLink(), entry);
  }
}
public static void dealOnePage(String url, int starNo) { try { Parser parser = new Parser((HttpURLConnection) (new URL(url)).openConnection()); NodeList tableSet = parser.extractAllNodesThatMatch(new HasAttributeFilter("bgcolor", "#DDE1FF")); parser = new Parser(new Lexer(tableSet.toHtml())); NodeList tdSet = parser.extractAllNodesThatMatch(new HasAttributeFilter("tr")); parser = new Parser(new Lexer(tdSet.toHtml())); PrototypicalNodeFactory p = new PrototypicalNodeFactory(); p.registerTag(new SpanTag()); parser.setNodeFactory(p); NodeList spanSet = parser.extractAllNodesThatMatch(new HasAttributeFilter("span")); int index = 0; for (int i = 5; i < spanSet.size(); i = i + 5) { String str = spanSet.elementAt(i).toPlainTextString(); String now = "" + (starNo * 100 + index); index++; while (str.compareTo(now) != 0) { System.out.println(now); now = "" + (starNo * 100 + index); index++; } // System.out.println(str); } } catch (ParserException e) { e.printStackTrace(); } catch (MalformedURLException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } }
// If there is a <title> element on the start page, use that as our AU // name. String recomputeRegName() { if (!isStarted()) { // This can get invoked (seveeral times, mostly from logging) before // enough mechanism has started to make it possible to resolve the CuUrl // below. return null; } try { CachedUrl cu = makeCachedUrl(m_registryUrl); if (cu == null) return null; URL cuUrl = CuUrl.fromCu(cu); Parser parser = new Parser(cuUrl.toString()); NodeList nodelst = parser.extractAllNodesThatMatch(new NodeClassFilter(TitleTag.class)); Node nodes[] = nodelst.toNodeArray(); recomputeRegName = false; if (nodes.length < 1) return null; // Get the first title found TitleTag tag = (TitleTag) nodes[0]; if (tag == null) return null; return tag.getTitle(); } catch (MalformedURLException e) { log.warning("recomputeRegName", e); return null; } catch (ParserException e) { if (e.getThrowable() instanceof FileNotFoundException) { log.warning("recomputeRegName: " + e.getThrowable().toString()); } else { log.warning("recomputeRegName", e); } return null; } }
public void checkprice() throws Exception { // System.out.println("checking Aptamil url [" + page.url + "]"); URL url = new URL(page.url); HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection(); urlConnection.setConnectTimeout(Constant.connect_timeout); urlConnection.connect(); Parser parser = new Parser(urlConnection); parser.setEncoding(Constant.ENCODE); NodeClassFilter div_filter = new NodeClassFilter(Div.class); OrFilter filters = new OrFilter(); filters.setPredicates(new NodeFilter[] {div_filter}); NodeList list = parser.extractAllNodesThatMatch(filters); for (int i = 0; i < list.size(); i++) { Node tag = list.elementAt(i); if (tag instanceof Div) { Div d = (Div) tag; String divclass = d.getAttribute("class"); if ("pl_addToBasket".equalsIgnoreCase(divclass)) { // return getName(d); } } } }
/** * 对新闻URL进行解析提取新闻,同时将新闻插入到数据库中。 * * @param url 新闻连接。 */ public void parser(String url) { try { parser = new Parser(url); // NodeFilter contentFilter = new AndFilter(new TagNameFilter("div"), new // HasAttributeFilter("class", "TRS_PreAppend")); // parser.reset(); //记得每次用完parser后,要重置一次parser。要不然就得不到我们想要的内容了。 NodeFilter innerFilter = new AndFilter(new TagNameFilter("p"), new HasAttributeFilter("class", "MsoNormal")); NodeFilter xk = new HasParentFilter(innerFilter); NodeList nodes = parser.extractAllNodesThatMatch(xk); System.out.println(nodes.size()); for (int i = 0; i < nodes.size(); i++) { Node time = nodes.elementAt(i); // System.out.println(time.toPlainTextString().trim().replace(" ", // "").replaceAll("[\\t\\n\\r]", "").replaceAll(" ", "")); System.out.println( replaceBlank(time.getLastChild().getText().replaceAll("span", "").replaceAll(" ", ""))); } } catch (ParserException ex) { Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex); } }
public static void main(String[] args) throws Exception { RequestConfig requestConfig = RequestConfig.custom().setCookieSpec(CookieSpecs.STANDARD_STRICT).build(); CloseableHttpClient httpclient = HttpClients.custom().setDefaultRequestConfig(requestConfig).build(); int count = 1; for (int i = 0; i <= 16; i++) { int index = i; // System.out.println(index); HttpGet httpGet = new HttpGet(url3 + index + url4); HttpResponse response = httpclient.execute(httpGet); HttpEntity entity = response.getEntity(); String htmls = null; if (entity != null) { htmls = EntityUtils.toString(entity).replace("\t", " "); } Parser parser = Parser.createParser(htmls, "utf-8"); AndFilter dFilter = new AndFilter(new TagNameFilter("h2"), new HasAttributeFilter("class", "field-content")); NodeList nodes3 = parser.extractAllNodesThatMatch(dFilter); for (int k = 0; k < nodes3.size(); k++) { htmls = nodes3.elementAt(k).toHtml(); parser = Parser.createParser(htmls, "utf-8"); AndFilter ProfessionNameFilter = new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("href")); NodeList nodes4 = parser.extractAllNodesThatMatch(ProfessionNameFilter); for (int j = 0; j < nodes4.size(); j++) { LinkTag link = (LinkTag) nodes4.elementAt(j); // if(link.getAttribute("href").contains("http://www.ulster.ac.uk/")) { // .replaceAll("<span[\\s\\S]*/span>","") String temp = link.toHtml(); System.out.println( "{\"" + count + "\",\"http://www.chi.ac.uk/" + link.getAttribute("href") + "\",\"" + html2Str(temp).replace("\r\n", "").trim() + "\",\"0\"},"); count++; } } } } // System.out.println("DONE."); }
// 获取一个网站上的链接,filter来过滤链接 public static Set<String> extracLinks(String url, Cobweb cobweb) { Set<String> links = new HashSet<String>(); try { Parser parser = new Parser(url); parser.setEncoding(cobweb.getCharSet()); // 过滤<frame >标签的filter,用来提取frame 标签里的src 属性 NodeFilter frameFilter = new NodeFilter() { public boolean accept(Node node) { if (node.getText().startsWith("frame src=")) { return true; } else { return false; } } }; // OrFilter 来设置过滤<a> 标签和<frame> 标签 OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter); // 得到所有经过过滤的标签 NodeList list = parser.extractAllNodesThatMatch(linkFilter); for (int i = 0; i < list.size(); i++) { Node tag = list.elementAt(i); if (tag instanceof LinkTag) { // <a> 标签 LinkTag link = (LinkTag) tag; String linkUrl = link.getLink(); // URL if (cobweb.accept(linkUrl)) { links.add( // java.net.URLEncoder.encode(linkUrl)); linkUrl .replaceAll("\\?", "\\%3F") // 转码 .replaceAll("\\&", "\\%26") .replaceAll("\\|", "\\%124") .replaceAll("\\#", "")); } ; } else { // <frame>标签 // 提取frame 里src 属性的链接,如<frame src="test.html"/> String frame = tag.getText(); int start = frame.indexOf("src="); frame = frame.substring(start); int end = frame.indexOf(" "); if (end == -1) { end = frame.indexOf(">"); } String frameUrl = frame.substring(5, end - 1); if (cobweb.accept(frameUrl)) { links.add(frameUrl); } } } } catch (ParserException e) { e.printStackTrace(); } return links; }
/** * 获取滚动品牌 * * @param path * @param city * @param fileName * @return */ public static Map<String, String> getBrandInfo(String path, String city, String fileName) { Map<String, String> brandMap = new LinkedHashMap<String, String>(); try { StringBuilder filePath = new StringBuilder(); filePath.append(PATH); filePath.append(city); filePath.append(INCLUDE); filePath.append(fileName); filePath.append(STUFF); // 开始解析 Parser parser = new Parser(filePath.toString()); // 过滤出<a></a>标签 NodeFilter divFilter = new NodeClassFilter(Div.class); NodeList classList = parser.extractAllNodesThatMatch(divFilter); NodeList hrefList = null; NodeList imgList = null; Node picNode = null; Node hrefNode = null; Node imgNode = null; String classStr = ""; String hrefStr = ""; String imgStr = ""; String imgClass = ""; for (int i = 0; i < classList.size(); i++) { picNode = classList.elementAt(i); classStr = ((Div) picNode).getAttribute("class"); if ("business_list_pic".equalsIgnoreCase(classStr)) { hrefList = picNode.getChildren(); for (int j = 0; j < hrefList.size(); j++) { hrefNode = hrefList.elementAt(j); if (hrefNode instanceof LinkTag) { hrefStr = ((LinkTag) hrefNode).getLink(); // 有用品牌id,获取到id hrefStr = MParseBrand.getBrandId(hrefStr); imgList = hrefNode.getChildren(); for (int k = 0; k < imgList.size(); k++) { imgNode = imgList.elementAt(k); if (imgNode instanceof ImageTag) { imgClass = ((ImageTag) imgNode).getAttribute("class"); if (null != imgClass) { imgStr = ((ImageTag) imgNode).getAttribute("src"); if (null == imgStr) { imgStr = ((ImageTag) imgNode).getAttribute("original"); } } } } brandMap.put(hrefStr, imgStr); } } } } } catch (Exception e) { e.printStackTrace(); } return brandMap; }
/** Test regular expression matching: */ public void testRegularExpression() throws Exception { String target = "\n" + "\n" + "Most recently, in the Western Conference final, the Flames knocked off \n" + "the San Jose Sharks, the Pacific Division champions, to become the first \n" + "Canadian team to reach the Stanley Cup Championship series since 1994."; String html = "<html><head><title>CBC Sports Online: NHL Playoffs</title></head>" + "<body><h1>CBC SPORTS ONLINE</h1>\n" + "The Calgary Flames have already defeated three NHL division winners \n" + "during their improbable playoff run. If they are to hoist the Stanley \n" + "Cup they'll have to go through one more. <p><table ALIGN=\"Right\" width=196 CELLPADDING=0 cellspacing=0 hspace=4> <tr><td><img src=\"/gfx/topstory/sports/iginla_j0524.jpg\" width=194 height=194 hspace=3 border=1><br>\n" + "\n" + "<font SIZE=\"1\" FACE=\"verdana,arial\">\n" + "Jarome Iginla skates during the Flames' practice on Monday. Calgary takes on the Tampa Bay Lightning in the Stanley Cup finals beginning Tuesday night in Tampa\n" + "</font></td></tr></table>\n" + "\n" + "\n" + "In the post-season's first round, the Flames defeated the Vancouver \n" + "Canucks, the Northwest Division winners, in seven tough games. <p>\n" + "\n" + "In Round 2 it was the Detroit Red Wings, who not only won the Central \n" + "Division, but also boasted the NHL's best overall record during the \n" + "regular season, who fell to the Flames. <p>" + target + "<p>\n" + "\n" + "Up next for the Flames is the Tampa Bay Lighting -- the runaway winners \n" + "of the NHL's Southeast Division and the Eastern Conference's best team \n" + "during the regular season. <p>\n" + "\n" + "The Lighting advanced by beating the Philadelphia Flyers in the Eastern \n" + "Conference final. 
<p>\n" + "</body></html>\n"; Lexer lexer; Parser parser; RegexFilter filter; NodeIterator iterator; int count; lexer = new Lexer(html); parser = new Parser(lexer); filter = new RegexFilter( "(19|20)\\d\\d([- \\\\/.](0[1-9]|1[012])[- \\\\/.](0[1-9]|[12][0-9]|3[01]))?"); count = 0; for (iterator = parser.extractAllNodesThatMatch(filter).elements(); iterator.hasMoreNodes(); ) { assertEquals("text wrong", target, iterator.nextNode().toHtml()); count++; } assertEquals("wrong count", 1, count); }
/**
 * Returns the targets of all anchors found in the document at {@code url}, in document order.
 *
 * @param url resource to parse
 * @return link targets as reported by {@code LinkTag.getLink()}
 * @throws ParserException if the resource cannot be opened or parsed
 */
public static List<String> getLinks(String url) throws ParserException {
  Parser htmlParser = new Parser(url);
  List<String> links = new LinkedList<String>();
  NodeList anchors = htmlParser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
  int total = anchors.size();
  int pos = 0;
  while (pos < total) {
    LinkTag anchor = (LinkTag) anchors.elementAt(pos);
    links.add(anchor.getLink());
    pos++;
  }
  return links;
}
// 获取一个网站上的链接,filter 用来过滤链接 public static Set<String> extracLinks(String url, LinkFilter filter) { Set<String> links = new HashSet<String>(); try { Parser parser = new Parser(url); // parser.setEncoding("utf8"); // 过滤 <frame >标签的 filter,用来提取 frame 标签里的 src 属性所表示的链接 NodeFilter frameFilter = new NodeFilter() { /** */ private static final long serialVersionUID = 1L; public boolean accept(Node node) { if (node.getText().startsWith("iframe") && node.getText().contains("src=")) { return true; } else { return false; } } }; // OrFilter 来设置过滤 <a> 标签和 <frame> 标签 OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter); // 得到所有经过过滤的标签 NodeList list = parser.extractAllNodesThatMatch(linkFilter); for (int i = 0; i < list.size(); i++) { Node tag = list.elementAt(i); if (tag instanceof LinkTag) // <a> 标签 { LinkTag link = (LinkTag) tag; String linkUrl = link.getLink(); // url可能出现在src,href等属性中 if (filter.accept(linkUrl)) links.add(linkUrl); } else // <frame> 标签 { // 提取 frame 里 src 属性的链接如 <frame src="test.html"/> String frame = tag.getText(); int start = frame.indexOf("src=\""); frame = frame.substring(start); int end = frame.indexOf("\">"); if (end == -1) { end = frame.indexOf("?"); } String frameUrl = frame.substring(5, end - 1); if (filter.accept(frameUrl)) links.add(frameUrl); } } } catch (ParserException e) { e.printStackTrace(); } return links; }
/** * @param url * @return * @throws Exception */ String author(String url) throws Exception { Parser parser = new Parser(); parser.setURL(url); parser.setEncoding("GB2312"); NodeFilter fileter = new NodeClassFilter(Div.class); NodeList list = parser .extractAllNodesThatMatch(fileter) .extractAllNodesThatMatch(new HasAttributeFilter("class", "otb14")); String author = null; if (list != null && list.size() > 0) { Div div = (Div) list.elementAt(0); String tmp = div.getStringText(); author = tmp; } if (null == author) { // logger.debug("重新解析作者栏"); parser = new Parser(); parser.setURL(url); parser.setEncoding("GB2312"); NodeFilter fileter1 = new NodeClassFilter(Div.class); NodeList list1 = parser .extractAllNodesThatMatch(fileter1) .extractAllNodesThatMatch(new HasAttributeFilter("class", "pop_2_1_2")); if (null != list1 && list1.size() > 0) { Div div = (Div) list1.elementAt(1); String tmp = div.getStringText(); author = tmp.substring(tmp.indexOf("</a>") + 4); logger.debug("author:" + author); } } return author; }
public boolean checkprice() { System.out.println("checking amazon url:" + page.url); try { URL url = new URL(page.url); HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection(); urlConnection.setConnectTimeout(Constant.connect_timeout); Parser parser = new Parser(urlConnection); parser.setEncoding(Constant.ENCODE); // OrFilter lastFilter = new OrFilter(); // lastFilter.setPredicates(new NodeFilter[] { // new NodeClassFilter(TableTag.class), // new NodeClassFilter(Div.class) }); // // NodeList list = parser.extractAllNodesThatMatch(lastFilter); NodeList list = parser.extractAllNodesThatMatch(new NodeClassFilter(Div.class)); System.out.println("size:" + list.size()); for (int i = 0; i < list.size(); i++) { Node tag = list.elementAt(i); if (tag instanceof Div) { Div d = (Div) tag; System.out.println(d.getAttribute("id")); if (d.getAttribute("id").startsWith("result_")) { // found one product try { AmazonProduct product = new AmazonProduct(); product.name = d.getAttribute("name"); getPriceAndLabel(d, product); } catch (Exception e) { e.printStackTrace(); } } } } } catch (Exception e) { System.out.println(e.getMessage()); } return false; }
/**
 * Checks that the CSS selector {@code li + li} matches exactly two nodes of the sample
 * document and that each of them is an {@code LI} tag.
 */
public void testSelectors() throws Exception {
  String html =
      "<html><head><title>sample title</title></head><body inserterr=\"true\" yomama=\"false\"><h3 id=\"heading\">big </invalid>heading</h3><ul id=\"things\"><li><br word=\"broken\"/>>moocow<li><applet/>doohickey<li class=\"last\"><b class=\"item\">final<br>item</b></ul></body></html>";

  Parser parser = new Parser(new Lexer(html));
  CssSelectorNodeFilter selector = new CssSelectorNodeFilter("li + li");

  NodeList matches = parser.extractAllNodesThatMatch(selector);
  int count = 0;
  for (int idx = 0; idx < matches.size(); idx++) {
    Tag matched = (Tag) matches.elementAt(idx);
    assertEquals("tag name wrong", "LI", matched.getTagName());
    count++;
  }
  assertEquals("wrong count", 2, count);
}
/** * 根据URL获取内容 * * @param url * @return * @throws Exception */ static String content(String url) throws Exception { Parser parser = new Parser(); parser.setURL(url); parser.setEncoding("GB2312"); NodeFilter fileter = new NodeClassFilter(Div.class); NodeList list = parser .extractAllNodesThatMatch(fileter) .extractAllNodesThatMatch(new HasAttributeFilter("id", "contentDiv")); String content = null; if (null != list && list.size() > 0) { Div div = (Div) list.elementAt(0); String tmp = div.getStringText(); // logger.debug("author:"+tmp); content = tmp; } return content; }
/**
 * Returns the HTML of every node that has the given tag name and attribute value.
 *
 * <p>Example: {@code HtmlParseUtil.getContentByTagNameAndAttribute(sourse, "div", "class",
 * "hello")} yields all {@code div} nodes carrying {@code class="hello"}. Parse errors are
 * printed and the results gathered so far are returned.
 *
 * @param sourse resource handed to the parser
 * @param tagName tag name to match
 * @param attribute attribute name to match
 * @param attributeValue attribute value to match
 * @return HTML of the matching nodes, never {@code null}
 */
public static List<String> getContentByTagNameAndAttribute(
    String sourse, String tagName, String attribute, String attributeValue) {
  List<String> fragments = new ArrayList<String>();
  NodeFilter combined =
      new AndFilter(new TagNameFilter(tagName), new HasAttributeFilter(attribute, attributeValue));
  try {
    NodeList matches = new Parser(sourse).extractAllNodesThatMatch(combined);
    int total = matches.size();
    for (int idx = 0; idx < total; idx++) {
      fragments.add(matches.elementAt(idx).toHtml());
    }
  } catch (ParserException e) {
    e.printStackTrace();
  }
  return fragments;
}
public ContentModel view(String url) { ContentModel model = new ContentModel(); try { NodeFilter filter = new TagNameFilter("html"); Parser parser = new Parser(); parser.setURL(SearchHelper.decrypt(url)); parser.setEncoding(parser.getEncoding()); // parser.setEncoding("gb2312"); NodeList list = parser.extractAllNodesThatMatch(filter); for (int i = 0; i < list.size(); i++) { String s = list.elementAt(i).toHtml(); model.setContent(s); } } catch (Exception e) { e.printStackTrace(); } return model; }
public void checkprice() throws Exception { // System.out.println("checking drugstore url:" + page.getUrl()); String cookies = ""; // DrugstoreLogin.getCookies(); URL url = new URL(page.getUrl()); HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection(); urlConnection.setConnectTimeout(Constant.connect_timeout); urlConnection.setRequestProperty( "User-Agent", "Mozilla/5.0 (compatible; MSIE 6.0; Windows NT)"); urlConnection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded"); urlConnection.setRequestProperty("Cookie", cookies); urlConnection.connect(); // InputStream is = urlConnection.getInputStream(); // // BufferedReader reader = new BufferedReader(new // InputStreamReader(is)); // // String s; // StringBuilder result = new StringBuilder(); // while (((s = reader.readLine()) != null)) { // result.append(s); // } // // System.out.println("result= " + result.toString()); // // is.close(); Parser parser = new Parser(urlConnection); parser.setEncoding(Constant.ENCODE); NodeFilter name_filter3 = new AndFilter( new NodeClassFilter(Div.class), new HasAttributeFilter("id", "divAvailablity")); NodeFilter name_filter4 = new AndFilter(new NodeClassFilter(Div.class), new HasAttributeFilter("id", "productprice")); OrFilter lastFilter = new OrFilter(); lastFilter.setPredicates( new NodeFilter[] {new NodeClassFilter(TitleTag.class), name_filter3, name_filter4}); NodeList list = parser.extractAllNodesThatMatch(lastFilter); for (int i = 0; i < list.size(); i++) { Node tag = list.elementAt(i); if (tag instanceof TitleTag) { TitleTag d = (TitleTag) tag; page.title = d.getTitle().replaceAll("drugstore.com", "").replaceAll("\\|", "").trim(); } else if (tag instanceof Div) { Div d = (Div) tag; String sStr = d.getStringText(); // System.out.println(sStr); if ("productprice".equalsIgnoreCase(d.getAttribute("id"))) { page.price = getPrice(sStr); } else if ("divAvailablity".equalsIgnoreCase(d.getAttribute("id"))) { if (sStr.indexOf("In Stock") >= 0 || 
sStr.indexOf("in stock") >= 0) { page.instock = true; } } // System.out.println(d.getStringText()); // getinStock(d); } } }
/** 从课表处,分课表 */ public List<Courses> parseCourses(String html) { Parser parser = new Parser(); try { parser.setInputHTML(html); parser.setEncoding("utf-8"); } catch (ParserException e) { e.printStackTrace(); } NodeFilter filter = new NodeClassFilter(TableTag.class); NodeList nodeList = null; try { nodeList = parser.extractAllNodesThatMatch(filter); } catch (ParserException e) { e.printStackTrace(); } List<Courses> list = new ArrayList<Courses>(); String schoolyear = ""; String semester = ""; for (int i = 0; i < nodeList.size(); i++) { if (nodeList.elementAt(i) instanceof TableTag) { TableTag tag = (TableTag) nodeList.elementAt(i); TableRow[] rows = tag.getRows(); for (int j = 0; j < rows.length; j++) { TableRow row = (TableRow) rows[j]; TableColumn[] columns = row.getColumns(); Courses courses = null; boolean isCourse = false; for (int k = 0; k < columns.length; k++) { Node columnNode = columns[k]; String info = columnNode.toPlainTextString().trim(); String temp = "学年学期:"; int start = info.indexOf(temp); int len = "2012-2013".length(); if (start != -1) { start = start + temp.length(); schoolyear = info.substring(start, start + len); // semester = info.substring(start+len+2); // 网络正常时候测试学期改为数字 semester = info.substring(start + len + 3, start + len + 4); if ("一".equals(semester)) { semester = "1"; } else if ("二".equals(semester)) { semester = "2"; } } if (k == 1 && info.indexOf("[") != -1) { courses = new Courses(); String courseCode = info.substring(1, 9); String coursesname = info.substring(10); courses.setCourseCode(courseCode); courses.setCoursesname(coursesname); isCourse = true; } if (k == 2 && isCourse) { double credit = Double.parseDouble(info); courses.setCredit(credit); } if (k == 3 && isCourse) { courses.setType(info); } if (k == 4 && isCourse) { courses.setLeanType(info); } if (k == 5 && isCourse) { courses.setCheckType(info); } if (k == 6 && isCourse) { courses.setGetType(info); } if (k == 7 && isCourse) { // double score=Double.parseDouble(info); 
courses.setScore(info); } if (k == 8 && isCourse) { courses.setRemark(info); } } // end for k if (courses != null) { courses.setSchoolYear(schoolyear); courses.setSemester(semester); list.add(courses); } } // end for j } } return list; }
public List<TimeTable> parseTimeTables(String html) { Parser parser = new Parser(); try { parser.setInputHTML(html); parser.setEncoding("utf-8"); } catch (ParserException e) { // TODO Auto-generated catch block e.printStackTrace(); } List<TimeTable> list = new ArrayList<TimeTable>(); NodeFilter filter = new NodeClassFilter(TableTag.class); NodeList nodeList = null; try { nodeList = parser.extractAllNodesThatMatch(filter); } catch (ParserException e) { // TODO Auto-generated catch block e.printStackTrace(); } for (int i = 0; i < nodeList.size(); i++) { if (nodeList.elementAt(i) instanceof TableTag) { TableTag tag = (TableTag) nodeList.elementAt(i); if (tag.getText().indexOf("[课程号]") == -1) { continue; } TableRow[] rows = tag.getRows(); for (int j = 1; j < rows.length; j++) { TableRow row = (TableRow) rows[j]; TableColumn[] columns = row.getColumns(); boolean isCourse = false; TimeTable timeTable = null; for (int k = 0; k < columns.length; k++) { Node columnNode = columns[k]; String info = columnNode.toPlainTextString().trim(); System.out.println(info + "===" + k); switch (k) { case 1: int start = info.indexOf("["); int end = info.indexOf("]"); timeTable = new TimeTable(); timeTable.setCourseCode(info.substring(start + 1, end)); timeTable.setCourseName(info.substring(end + 1)); break; case 3: timeTable.setCredit(Double.parseDouble(info)); break; case 4: timeTable.setType(info); break; case 5: int t_start = info.indexOf("]"); timeTable.setTeacher(info.substring(t_start + 1)); break; case 8: List<TimeAndAdress> ta_list = praseStr(info); for (TimeAndAdress ta : ta_list) { timeTable.setAddress(ta.getAddress()); timeTable.setTime(ta.getTime()); timeTable.setCycle(ta.getCycle()); timeTable.setSingleDouble(ta.getSingleDouble()); timeTable.setWeek(ta.getWeek()); list.add(timeTable.clone()); } break; default: break; } } } // end for j } } return list; }
/* * In order to find full text PDF you need to find the citation_pdf_url meta tag in the * abstract html pull out the pdf url normalize it (reorder params...) and find the matching * cached URL */ protected ArticleFiles processAbstract(CachedUrl absCu, Matcher absMat) { NodeList nl = null; ArticleFiles af = new ArticleFiles(); if (absCu != null && absCu.hasContent()) { // TEMPORARY: set absCU as default full text CU in case there is // no PDF CU with content; the current metadata manager currently // uses only the full text CU, but this will change with the new // metadata schema that can have multiple CUs for an article. af.setFullTextCu(absCu); af.setRoleCu(ArticleFiles.ROLE_ABSTRACT, absCu); try { InputStreamSource is = new InputStreamSource(new Stream(absCu.getUnfilteredInputStream())); Page pg = new Page(is); Lexer lx = new Lexer(pg); Parser parser = new Parser(lx); Lexer.STRICT_REMARKS = false; NodeFilter nf = new NodeFilter() { public boolean accept(Node node) { if (!(node instanceof MetaTag)) return false; MetaTag meta = (MetaTag) node; if (!"citation_pdf_url".equalsIgnoreCase(meta.getMetaTagName())) return false; return true; } }; nl = parser.extractAllNodesThatMatch(nf); } catch (ParserException e) { log.debug("Unable to parse abstract page html", e); } catch (UnsupportedEncodingException e) { log.debug("Bad encoding in abstact page html", e); } finally { absCu.release(); } } try { if (nl != null) { if (nl.size() > 0) { // minimally encode URL to prevent URL constructor // from stripping trailing spaces String pdfUrlStr = ((MetaTag) nl.elementAt(0)).getMetaContent(); URL pdfUrl = new URL(UrlUtil.minimallyEncodeUrl(pdfUrlStr)); List<String> paramList = new ArrayList<String>(); paramList.add("fileType"); paramList.add("fileId"); paramList.add("fileName"); pdfUrl = reArrangeUrlParams(pdfUrl, paramList); if (!pdfUrl.getHost().startsWith("www.")) { pdfUrl = new URL(pdfUrl.getProtocol(), "www." 
+ pdfUrl.getHost(), pdfUrl.getFile()); } // note: must leave URL encoded because that's how we store URLs CachedUrl pdfCu = au.makeCachedUrl(pdfUrl.toString()); if (pdfCu != null && pdfCu.hasContent()) { // replace absCU with pdfCU if exists and has content af.setFullTextCu(pdfCu); af.setRoleCu(ArticleFiles.ROLE_FULL_TEXT_PDF, pdfCu); } } } } catch (MalformedURLException e) { log.debug("Badly formatted pdf url link", e); } catch (IllegalArgumentException e) { log.debug("Badly formatted pdf url link", e); } return af; }
public static HashMap<String, String> SouthamptonGetDetails(String[] url) { while (true) { try { HashMap<String, String> result = new HashMap<String, String>(); RequestConfig requestConfig = RequestConfig.custom().setSocketTimeout(10000).setConnectTimeout(10000).build(); CloseableHttpClient httpclient = HttpClients.custom().setDefaultRequestConfig(requestConfig).build(); HttpGet httpGet = new HttpGet(url[1]); HttpResponse response = httpclient.execute(httpGet); HttpEntity entity = response.getEntity(); String htmls = null; if (entity != null) { htmls = EntityUtils.toString(entity).replace("\t", " "); // System.out.println(htmls); } System.out.println("Got reply!"); // htmls=HTMLFilter(htmls); Parser parser = null; // **********************************get school********************** parser = Parser.createParser(htmls.replace("span", "form"), "utf-8"); AndFilter SFilter = new AndFilter( new TagNameFilter("form"), // table class="CSCPreviewTable grey" new HasAttributeFilter("class", "first-owner")); NodeList nodes4 = parser.extractAllNodesThatMatch(SFilter); if (nodes4.size() > 0) { String school = html2Str(nodes4.elementAt(0).toHtml()); result.put("School", school); } // **********************************get entry structure********************** parser = Parser.createParser(htmls, "utf-8"); AndFilter ESFilter = new AndFilter( new TagNameFilter("div"), // table class="CSCPreviewTable grey" new HasAttributeFilter("class", "body__inner w-doublewide copy")); NodeList nodes1 = parser.extractAllNodesThatMatch(ESFilter); String structure = ""; String[] ProgramURL = null; if (nodes1.size() > 0) { String AllContents = nodes1.toHtml(); String[] SP = AllContents.split("<h2 id="); for (int i = 1; i < SP.length; i++) { String row = "<h2 id=" + SP[i]; if (row.contains("<h2 id=\"requirements\">Requirements</h2>")) // Structure { structure = (html2Str( row.replace("<br />", "\r\n") .replace("</strong>", "") .replace("<strong>", "") .replace("</", "\r\n</") .replace("\t", " ") 
.replace("&", " ")) .replace("\r\n\r\n", "\r\n")); structure = HTMLFilter(structure); result.put("Structure", structure); } // <a href="/program/BSC">Bachelor of Science (BSC)</a> else if (row.contains("<h2 id=\"relevant-degrees\">Relevant Degrees</h2>")) { parser = Parser.createParser(row, "utf-8"); AndFilter ProfessionNameFilter = new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("href")); NodeList nodes5 = parser.extractAllNodesThatMatch(ProfessionNameFilter); for (int j = 0; j < nodes4.size(); j++) { LinkTag link = (LinkTag) nodes5.elementAt(j); if (!link.getAttribute("href").equals("#")) { String code = link.getAttribute("href").replace("/program/", ""); ProgramURL = getProgram(code); result.put("Scholarship", code); // title=HTMLFilter(html2Str(link.toHtml())); } } } } } // ****************IELTS result.put("IELTS Average Requirement", "6.5"); result.put("IELTS Lowest Requirement", "6.0"); // **************************get title & type********************** result.put("Title", url[4] + " " + url[2]); result.put("Level", url[3]); if (ProgramURL != null) { result.put("Type", ProgramURL[2]); result.put("Length (months)", ProgramURL[5]); } httpclient.close(); return result; } catch (Exception ee) { System.out.println("Retrying..." + url[0]); ee.printStackTrace(); } } } // ...
@Override protected LotteryDraw parseLotteryDrawResult(String html) { LotteryType lotteryType = this.getLotteryType(); Parser parser = null; try { parser = Parser.createParser(html, CharsetConstant.CHARSET_UTF8); } catch (Exception e) { logger.error("解析html内容出错: {}", html, e); return null; } LotteryDraw lotteryDraw = new LotteryDraw(); lotteryDraw.setLotteryType(lotteryType); // 解析基本信息 try { NodeFilter tInfoFilter = new HasAttributeFilter("class", "tInfo"); NodeList tInfoNodeList = parser.extractAllNodesThatMatch(tInfoFilter); if (tInfoNodeList.size() == 0) { return null; } parser.setInputHTML(tInfoNodeList.elementAt(0).toHtml()); // 取四个红色部分,依次为彩期、销售总额、开奖日期、开奖号码 NodeFilter redFilter = new HasAttributeFilter("class", "fc-red"); NodeList redNodeList = parser.extractAllNodesThatMatch(redFilter); if (redNodeList.size() < 4) { logger.error("解析的内容不符合要求: {}", tInfoNodeList.elementAt(0).toHtml()); return null; } lotteryDraw.setPhase(redNodeList.elementAt(0).toPlainTextString().trim()); lotteryDraw.setVolumeOfSales( StringUtils.replace(redNodeList.elementAt(1).toPlainTextString().trim(), ",", "")); Date drawDate = CoreDateUtils.parseDate( redNodeList.elementAt(2).toPlainTextString().trim(), "yyyy年MM月dd日"); if (drawDate != null) { lotteryDraw.setTimeDraw(CoreDateUtils.formatDateTime(drawDate)); } lotteryDraw.setResult( StringUtils.replace(redNodeList.elementAt(3).toPlainTextString().trim(), " ", ",")); } catch (ParserException e) { logger.error(e.getMessage(), e); return null; } // 解析详情信息 try { parser.setInputHTML(html); NodeFilter dInfoFilter = new HasAttributeFilter("class", "dInfo"); NodeList dInfoNodeList = parser.extractAllNodesThatMatch(dInfoFilter); if (dInfoNodeList.size() == 0) { return null; } parser.setInputHTML(dInfoNodeList.elementAt(0).toHtml()); NodeFilter prizeFilter = new TagNameFilter("p"); NodeList prizeNodeList = parser.extractAllNodesThatMatch(prizeFilter); if (prizeNodeList.size() == 0) { return null; } String[] splitted = prizeNodeList .elementAt(0) 
.toPlainTextString() .split("--------------------------------------------------"); if (splitted.length < 2) { logger.error("未解析到{}开奖详情: {}", lotteryType.getName()); return lotteryDraw; } splitted = StringUtils.split(splitted[1].trim(), " "); List<LotteryDrawPrizeItem> resultDetail = new ArrayList<LotteryDrawPrizeItem>(); int index = -1; LotteryDrawPrizeItem prizeItem = null; for (int i = 0; i < splitted.length; i++) { String s = splitted[i].trim(); if (s.length() == 0) { continue; } index++; if (index % 4 == 0) { // 一行有4列 index = 0; prizeItem = new LotteryDrawPrizeItem(); resultDetail.add(prizeItem); } switch (index) { case 0: prizeItem.setName(s); break; case 1: prizeItem.setWinningCount(StringUtils.replace(s, "注", "")); break; case 2: prizeItem.setPrizeAmount( CoreStringUtils.replaceAll( s, new String[][] { {"元", ""}, {",", ""} })); break; default: break; } } lotteryDraw.setResultDetail(resultDetail); } catch (ParserException e) { logger.error(e.getMessage(), e); } return lotteryDraw; }
@Override public LotteryDraw fetchResultDetail(String phase) { LotteryDraw lotteryDraw = null; lotteryDraw = nowPhaseResult(); if (phase == null || "".equals(phase) || lotteryDraw.getPhase().equals(phase)) { return lotteryDraw; } else { lotteryDraw = null; } String url = RESULT_MORE_LOCALITY_URL; String data = null; String pageInfo = "结果页面" + url; String encoding = "utf-8"; String logHeader = "==" + lotteryScope + "==" + siteName + "==" + pageInfo + "==抓取==" + getLotteryType().getName() + "=="; try { data = CoreFetcherUtils.URLGet(url, null, encoding); } catch (Exception e) { logger.error("获取html数据失败" + e.getMessage()); return null; } if (data == null || data.indexOf("404 Not Found") > 0 || data.isEmpty()) { logger.error(logHeader + "data is null or 404 Not Found"); return null; } Parser parser = null; try { parser = Parser.createParser(data, encoding); } catch (Exception e) { logger.error("解析html页面失败" + e.getMessage()); return null; } NodeFilter filter = new HasAttributeFilter("class", "mytable"); NodeList nodeList = null; try { nodeList = parser.extractAllNodesThatMatch(filter); TableTag tableTag = (TableTag) nodeList.elementAt(0); TableRow[] tableRows = tableTag.getRows(); for (int i = 1; i < tableRows.length; i++) { TableColumn[] tableColumns = tableRows[i].getColumns(); String phaseTmp = tableColumns[0].toPlainTextString(); if (phaseTmp != null && !"".equals(phaseTmp) && phase.equals(phaseTmp)) { lotteryDraw = new LotteryDraw(); // 彩期 lotteryDraw.setPhase(phaseTmp); // 开奖结果 String strResult = tableColumns[1].toPlainTextString(); strResult = strResult.trim().replace(" ", ","); lotteryDraw.setResult(strResult); // 彩种 lotteryDraw.setLotteryType(super.getLotteryType()); break; } } } catch (ParserException e) { logger.error("数据解析错误==" + e.getMessage(), e); return null; } return lotteryDraw; }
// 获取一个网站上的链接,filter 用来过滤链接 public static Set<String> extracLinks(String url, NodeFilter filter) { Set<String> links = new HashSet<String>(); try { Parser parser = new Parser(url); parser.setEncoding("UTF-8"); @SuppressWarnings("serial") NodeFilter frameFilter = new NodeFilter() { public boolean accept(Node node) { if (node.getText().startsWith("frame src=")) { return true; } else { return false; } } }; OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter); NodeList list = parser.extractAllNodesThatMatch(linkFilter); System.out.println("length=" + list.size()); for (int i = 0; i < list.size(); i++) { Node tag = list.elementAt(i); if (tag instanceof LinkTag) { // <a> 标签 LinkTag link = (LinkTag) tag; String linkUrl = link.getLink(); // URL /* * if (filter.accept(linkUrl)) { links.add(linkUrl); } */ System.out.println("linkUrl=" + linkUrl); if (filter.accept(tag)) { links.add(linkUrl); } } else { // <frame> 标签 // 提取 frame 里 src 属性的链接,如 <frame src="test.html"/> String frame = tag.getText(); int start = frame.indexOf("src="); frame = frame.substring(start); int end = frame.indexOf(" "); if (end == -1) { end = frame.indexOf(">"); } String frameUrl = frame.substring(5, end - 1); // if (filter.accept(frameUrl)) { // links.add(frameUrl); // } System.out.println("frameUrl=" + frameUrl); if (filter.accept(tag)) { links.add(frameUrl); } } } /* * NodeFilter filter = new TagNameFilter("DIV"); NodeList nodes = * parser.extractAllNodesThatMatch(filter); if(nodes!=null) { for * (int i = 0; i < nodes.size(); i++) { Node textnode = (Node) * nodes.elementAt(i); * System.out.println("getText:"+textnode.getText()); * System.out.println * ("================================================="); } } */ /* * for(NodeIterator i = parser.elements (); i.hasMoreNodes(); ) { * Node node = i.nextNode(); * System.out.println("getText:"+node.getText()); * System.out.println("getPlainText:"+node.toPlainTextString()); * System.out.println("toHtml:"+node.toHtml()); * 
System.out.println("toHtml(true):"+node.toHtml(true)); * System.out.println("toHtml(false):"+node.toHtml(false)); * System.out.println("toString:"+node.toString()); * System.out.println * ("================================================="); } */ /* * TextExtractingVisitor visitor = new TextExtractingVisitor(); * parser.visitAllNodesWith(visitor); String textInPage = * visitor.getExtractedText(); System.out.println(textInPage); */ } catch (ParserException e) { e.printStackTrace(); } return links; }
/** * Creates a list of Grids based on the given HTML string. This works only for table-based HTML * documents. * * @param html the HTML string. * @return a list of Grids. */ public static List<Grid> fromHtml(String html) throws Exception { if (html == null || html.trim().isEmpty()) { return null; } List<Grid> grids = new ArrayList<>(); Parser parser = Parser.createParser(html, "UTF-8"); Node[] tables = parser.extractAllNodesThatMatch(new TagNameFilter("table")).toNodeArray(); for (Node t : tables) { Grid grid = new ListGrid(); TableTag table = (TableTag) t; TableRow[] rows = table.getRows(); Integer firstColumnCount = null; for (TableRow row : rows) { if (getColumnCount(row) == 0) // Ignore if no cells { log.warn("Ignoring row with no columns"); continue; } Node[] cells = row.getChildren().extractAllNodesThatMatch(HTML_ROW_FILTER).toNodeArray(); if (firstColumnCount == null) // First row becomes header { firstColumnCount = getColumnCount(row); for (Node c : cells) { TagNode cell = (TagNode) c; grid.addHeader(new GridHeader(getValue(cell), false, false)); Integer colSpan = MathUtils.parseInt(cell.getAttribute("colspan")); if (colSpan != null && colSpan > 1) { grid.addEmptyHeaders((colSpan - 1)); } } } else // Rest becomes rows { if (firstColumnCount != getColumnCount(row)) // Ignore { log.warn( "Ignoring row which has " + row.getColumnCount() + " columns since table has " + firstColumnCount + " columns"); continue; } grid.addRow(); for (Node c : cells) { // TODO row span TagNode cell = (TagNode) c; grid.addValue(getValue(cell)); Integer colSpan = MathUtils.parseInt(cell.getAttribute("colspan")); if (colSpan != null && colSpan > 1) { grid.addEmptyValues((colSpan - 1)); } } } } grids.add(grid); } return grids; }
public List<TimeTable> parseTimeTables(String html) { Parser parser = new Parser(); try { parser.setInputHTML(html); parser.setEncoding("utf-8"); } catch (ParserException e) { // TODO Auto-generated catch block e.printStackTrace(); } List<TimeTable> list = new ArrayList<TimeTable>(); NodeFilter tagfilter = new NodeClassFilter(TableTag.class); NodeFilter idFilter = new HasAttributeFilter("id", "reportArea"); NodeFilter filter = new AndFilter(tagfilter, idFilter); NodeList nodeList = null; try { nodeList = parser.extractAllNodesThatMatch(filter); } catch (ParserException e) { // TODO Auto-generated catch block e.printStackTrace(); } for (int i = 0; i < nodeList.size(); i++) { if (nodeList.elementAt(i) instanceof TableTag) { TableTag tag = (TableTag) nodeList.elementAt(i); TableRow[] rows = tag.getRows(); for (int j = 0; j < rows.length; j++) { TableRow row = (TableRow) rows[j]; TableColumn[] columns = row.getColumns(); boolean isCourse = false; TimeTable timeTable = null; for (int k = 0; k < columns.length; k++) { Node columnNode = columns[k]; String info = columnNode.toPlainTextString().trim(); // System.out.println(info+"=="+k); if (k == 1 && info.indexOf("[") != -1) { timeTable = new TimeTable(); String courseCode = info.substring(1, 9); String coursesname = info.substring(10); timeTable.setCourseName(coursesname); timeTable.setCourseCode(courseCode); isCourse = true; } if (k == 2 && isCourse) { double credit = Double.parseDouble(info); timeTable.setCredit(credit); } if (k == 3 && isCourse) { timeTable.setType(info); } if (k == 4 && isCourse) { timeTable.setTeacher(info); } if (k == 5 && isCourse) { timeTable.setClassId(info); } if (k == 6 && isCourse) { timeTable.setClassNum(info); } if (k == 11 && isCourse) { List<TimeAndAdress> ta_list = praseStr(info); for (TimeAndAdress ta : ta_list) { timeTable.setAddress(ta.getAddress()); timeTable.setTime(ta.getTime()); timeTable.setCycle(ta.getCycle()); timeTable.setSingleDouble(ta.getSingleDouble()); 
timeTable.setWeek(ta.getWeek()); list.add(timeTable.clone()); } } } // end for k } // end for j } } return list; }
@Override public List<JclqScheduleItem> fetchJclqSchedule(String officialDate) throws FetchFailedException { Map<String, String> headerParams = new HashMap<String, String>(); headerParams.put("Referer", "http://info.sporttery.cn"); headerParams.put( "User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.168 Safari/535.19"); List<JclqScheduleItem> jclqScheduleItemList = new ArrayList<JclqScheduleItem>(); String encoding = "gbk"; JclqScheduleItem jclqScheduleItem = null; String siteName = "中国竞彩网网[竞彩篮球赛程]"; String logHeader = siteName + SCHEDULE_URL; try { String webInfo = CoreFetcherUtils.URLGetWithHeaderParams(SCHEDULE_URL, headerParams, null, encoding); if (webInfo == null || webInfo.indexOf("404 Not Found") > 0) { logger.error(logHeader + ",data is null or 404 Not Found"); throw new FetchFailedException("404 Not Found"); } Parser parser = Parser.createParser(webInfo, encoding); NodeList nodeList = parser.extractAllNodesThatMatch(new CssSelectorNodeFilter("div[class='box-tbl']")); if (null != nodeList && nodeList.size() > 0) { NodeFilter tableFilter = new TagNameFilter("table"); Parser parser2 = Parser.createParser(nodeList.toHtml(), encoding); NodeList tableNodeList = parser2.extractAllNodesThatMatch(tableFilter); if (tableNodeList != null && tableNodeList.size() > 0) { TableTag catchTableTag = new TableTag(); catchTableTag = (TableTag) tableNodeList.elementAt(0); if (catchTableTag != null) { TableRow[] catchRows = catchTableTag.getRows(); TableColumn[] catchColumns = null; for (int i = 2; i < catchRows.length; i++) { catchColumns = catchRows[i].getColumns(); if (catchColumns != null && catchColumns.length >= 6) { jclqScheduleItem = new JclqScheduleItem(); String officialNum = catchColumns[0].toPlainTextString().trim(); if (officialNum.length() < 5) { continue; } // 先解析比赛时间 String matchDateStr = catchColumns[3].toPlainTextString().trim(); String[] yearStr = matchDateStr.split("-"); if 
(yearStr[0].length() <= 2) { matchDateStr = "20" + matchDateStr + ":00"; } else { matchDateStr = matchDateStr + ":00"; } Date matchDate = CoreDateUtils.parseDate(matchDateStr, CoreDateUtils.DATETIME); Calendar matchDateCalendar = Calendar.getInstance(); matchDateCalendar.setTime(matchDate); matchDateCalendar.add(Calendar.MINUTE, 1); jclqScheduleItem.setMatchDate(matchDateCalendar.getTime()); // 根据周几、当前时间和比赛时间计算官方发布的日期 Calendar cd = Calendar.getInstance(); // 将时分秒等区域清零 cd.set(Calendar.HOUR_OF_DAY, 0); cd.set(Calendar.MINUTE, 0); cd.set(Calendar.SECOND, 0); cd.set(Calendar.MILLISECOND, 0); int nowWeekDay = cd.get(Calendar.DAY_OF_WEEK); int fetchWeekDay = weekDay.get(officialNum.substring(0, 2)); if (nowWeekDay != fetchWeekDay) { int m = fetchWeekDay - nowWeekDay; if (m < -1) { cd.add(Calendar.DATE, m + 7); } else { cd.add(Calendar.DATE, m); } } // 如果计算出来的日期超过了比赛时间,减去一周 if (cd.after(matchDateCalendar)) { cd.add(Calendar.DATE, -7); } // 如果计算出来的日期距离比赛时间相隔超过一周,加上一周的倍数 // 一周的毫秒数 long weekTimeInMillis = 3600 * 1000 * 24 * 7; long diffTimeInMillis = matchDateCalendar.getTimeInMillis() - cd.getTimeInMillis(); if (diffTimeInMillis > weekTimeInMillis) { // 计算相差几周 int diffWeekCount = (int) (diffTimeInMillis / weekTimeInMillis); cd.add(Calendar.DATE, 7 * diffWeekCount); } jclqScheduleItem.setMatchNum( CoreDateUtils.formatDate(cd.getTime(), "yyyyMMdd") + LotteryConstant.JCLQ_MATCH_NUM_CODE_DEFAULT + officialNum.substring(2)); jclqScheduleItem.setOfficialDate( CoreDateUtils.parseDate(CoreDateUtils.formatDate(cd.getTime()))); Integer oNum = null; try { oNum = Integer.valueOf(officialNum.substring(2)); } catch (Exception e) { logger.error("截取官方编码时,转换为Integer错误", e); } jclqScheduleItem.setOfficialNum(oNum); jclqScheduleItem.setMatchName( JclqUtil.convertMatchName( catchColumns[1].toPlainTextString().trim(), LotteryType.JCLQ_SF, FetcherType.T_PENGINEAPI)); String team = catchColumns[2].toPlainTextString().trim(); String[] teamStr = team.split("VS"); 
jclqScheduleItem.setAwayTeam(teamStr[0].trim()); jclqScheduleItem.setHomeTeam(teamStr[1].trim()); if ("已开售".equals(catchColumns[4].toPlainTextString().trim())) { jclqScheduleItem.setStatus(JclqRaceStatus.OPEN); } else { jclqScheduleItem.setStatus(JclqRaceStatus.UNOPEN); } if (catchColumns[5].toPlainTextString().trim().indexOf("胜负单关") > 0) { jclqScheduleItem.setDynamicSaleSfStatus(JclqDynamicSaleStatus.SALE_UNOPEN); } else { jclqScheduleItem.setDynamicSaleSfStatus(JclqDynamicSaleStatus.SALE_OPEN); } if (catchColumns[5].toPlainTextString().trim().indexOf("胜负过关") > 0) { jclqScheduleItem.setStaticSaleSfStatus(JclqStaticSaleStatus.SALE_UNOPEN); } else { jclqScheduleItem.setStaticSaleSfStatus(JclqStaticSaleStatus.SALE_OPEN); } if (catchColumns[5].toPlainTextString().trim().indexOf("让分胜负单关") > 0) { jclqScheduleItem.setDynamicSaleRfsfStatus(JclqDynamicSaleStatus.SALE_UNOPEN); } else { jclqScheduleItem.setDynamicSaleRfsfStatus(JclqDynamicSaleStatus.SALE_OPEN); } if (catchColumns[5].toPlainTextString().trim().indexOf("让分胜负过关") > 0) { jclqScheduleItem.setStaticSaleRfsfStatus(JclqStaticSaleStatus.SALE_UNOPEN); } else { jclqScheduleItem.setStaticSaleRfsfStatus(JclqStaticSaleStatus.SALE_OPEN); } if (catchColumns[5].toPlainTextString().trim().indexOf("胜分差单关") > 0) { jclqScheduleItem.setDynamicSaleSfcStatus(JclqDynamicSaleStatus.SALE_UNOPEN); } else { jclqScheduleItem.setDynamicSaleSfcStatus(JclqDynamicSaleStatus.SALE_OPEN); } if (catchColumns[5].toPlainTextString().trim().indexOf("胜分差过关") > 0) { jclqScheduleItem.setStaticSaleSfcStatus(JclqStaticSaleStatus.SALE_UNOPEN); } else { jclqScheduleItem.setStaticSaleSfcStatus(JclqStaticSaleStatus.SALE_OPEN); } if (catchColumns[5].toPlainTextString().trim().indexOf("大小分单关") > 0) { jclqScheduleItem.setDynamicSaleDxfStatus(JclqDynamicSaleStatus.SALE_UNOPEN); } else { jclqScheduleItem.setDynamicSaleDxfStatus(JclqDynamicSaleStatus.SALE_OPEN); } if (catchColumns[5].toPlainTextString().trim().indexOf("大小分过关") > 0) { 
jclqScheduleItem.setStaticSaleDxfStatus(JclqStaticSaleStatus.SALE_UNOPEN); } else { jclqScheduleItem.setStaticSaleDxfStatus(JclqStaticSaleStatus.SALE_OPEN); } jclqScheduleItemList.add(jclqScheduleItem); } } // end for catchRows } // end if catchTableTag!=null } // end if(tableNodeList!=null&&tableNodeList.size()>0) } else { logger.error(logHeader + "竞彩篮球赛程数据表格不存在,返回null"); throw new FetchFailedException("竞彩篮球赛程数据表格不存在"); } } catch (Exception e) { logger.error(logHeader + "竞彩篮球赛程错误" + e.getMessage(), e); throw new FetchFailedException(e.getMessage()); } return jclqScheduleItemList; }
/**
 * Processes every post on the page at {@code address}.
 *
 * <p>Posts not yet present in {@code Main.post_post_hash} get their pictures collected, both from
 * photo-post remarks matched by {@code lores} and from the images inside photoset iframes, and
 * are then passed to {@code Main.handlePost}. Posts already known are passed to {@code
 * handleNonDownloadPost}.
 *
 * @param address address of the page to process
 * @return {@code false} when the page contains no posts; {@code true} otherwise, including when
 *     an error occurred (the stack trace is printed and a status message is issued)
 */
private static boolean handleURL(String address) {
  Main.status(String.format("Processing page \"%s\".", address));
  try {
    Node[] post_nodes = getPosts(address).toNodeArray();
    if (post_nodes.length == 0) {
      return false;
    }
    for (Node post_node : post_nodes) {
      if (!(post_node instanceof TagNode)) {
        continue;
      }
      TagNode post = (TagNode) post_node;
      // The numeric post id follows a five-character prefix in the "id" attribute.
      Post new_post = new Post(Long.parseLong(post.getAttribute("id").substring(5)));
      if (Main.post_post_hash.containsKey(new_post)) {
        new_post = post_post_hash.get(new_post);
        handleNonDownloadPost(new_post);
        continue;
      }

      // Single-photo posts: the picture URL is embedded in a remark node.
      NodeList photo_posts = getPhotoPosts(post.getChildren());
      NodeList remarks = getRemarks(photo_posts);
      for (Node node : remarks.toNodeArray()) {
        Matcher matcher = lores.matcher(node.getText());
        if (!matcher.find()) {
          // No picture URL in this remark; previously an empty URL was processed and the
          // resulting exception aborted the whole page.
          continue;
        }
        String media_url = matcher.group();
        media_url = media_url.substring(17, media_url.length() - 1);
        String thumb = thumbnailUrlFor(media_url);
        if (thumb == null) {
          continue;
        }
        URL thumb_url = new URL(thumb);
        new_post.pictures.add(new Picture(new URL(media_url), thumb_url));
      }

      // Photoset posts: the pictures are the images inside the anchors of each iframe.
      NodeList photoset_posts = getPhotosetPosts(post.getChildren());
      NodeList iframes = getIFrames(photoset_posts);
      for (Node node : iframes.toNodeArray()) {
        if (!(node instanceof TagNode)) {
          continue;
        }
        String iframe_url = ((TagNode) node).getAttribute("src");
        if (iframe_url == null) {
          continue;
        }
        Parser parser2 = new Parser(iframe_url);
        NodeList a_list = parser2.extractAllNodesThatMatch(new TagNameFilter("a"));
        Node[] img_array =
            a_list.extractAllNodesThatMatch(new TagNameFilter("img"), true).toNodeArray();
        // Iterate over the images themselves; the loop used to be bounded by the number of
        // anchors while indexing the image array.
        for (Node img : img_array) {
          if (!(img instanceof TagNode)) {
            continue;
          }
          String media_url = ((TagNode) img).getAttribute("src");
          if (media_url == null) {
            continue;
          }
          String thumb = thumbnailUrlFor(media_url);
          if (thumb == null) {
            continue;
          }
          URL thumb_url = new URL(thumb);
          new_post.pictures.add(new Picture(new URL(media_url), thumb_url));
        }
      }
      Main.handlePost(new_post);
    }
  } catch (Exception ex) {
    ex.printStackTrace();
    Main.status("Error handling post.");
  }
  return true;
}

/**
 * Builds the thumbnail address for a picture by replacing the part between the last underscore
 * and the last dot (both exclusive of the dot) with "_75sq".
 *
 * @param media_url address of the full picture
 * @return the thumbnail address, or {@code null} when {@code media_url} has no underscore
 *     followed later by a dot
 */
private static String thumbnailUrlFor(String media_url) {
  int underscore = media_url.lastIndexOf("_");
  int dot = media_url.lastIndexOf(".");
  if (underscore < 0 || dot < underscore) {
    return null;
  }
  // Only the trailing size token is swapped; String.replace would also rewrite any earlier
  // occurrence of the same token in the address.
  return media_url.substring(0, underscore) + "_75sq" + media_url.substring(dot);
}