public void checkprice() throws Exception { // System.out.println("checking Aptamil url [" + page.url + "]"); URL url = new URL(page.url); HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection(); urlConnection.setConnectTimeout(Constant.connect_timeout); urlConnection.connect(); Parser parser = new Parser(urlConnection); parser.setEncoding(Constant.ENCODE); NodeClassFilter div_filter = new NodeClassFilter(Div.class); OrFilter filters = new OrFilter(); filters.setPredicates(new NodeFilter[] {div_filter}); NodeList list = parser.extractAllNodesThatMatch(filters); for (int i = 0; i < list.size(); i++) { Node tag = list.elementAt(i); if (tag instanceof Div) { Div d = (Div) tag; String divclass = d.getAttribute("class"); if ("pl_addToBasket".equalsIgnoreCase(divclass)) { // return getName(d); } } } }
/** * 获取新闻的内容 * * @param newsContentFilter * @param parser * @return content 新闻内容 */ public String getNewsContent(NodeFilter newsContentFilter, Parser parser) { String content = null; StringBuilder builder = new StringBuilder(); try { NodeList newsContentList = parser.parse(newsContentFilter); for (int i = 0; i < newsContentList.size(); i++) { Div newsContenTag = (Div) newsContentList.elementAt(i); builder = builder.append(newsContenTag.getStringText()); } content = builder.toString(); // 转换为String 类型。 if (content != null) { parser.reset(); parser = Parser.createParser(content, "utf8"); StringBean sb = new StringBean(); sb.setCollapse(true); parser.visitAllNodesWith(sb); content = sb.getStrings(); // String s = "\";} else{ document.getElementById('TurnAD444').innerHTML = \"\";} } // showTurnAD444(intTurnAD444); }catch(e){}"; content = content.replaceAll("\\\".*[a-z].*\\}", ""); content = content.replace("[我来说两句]", ""); } else { System.out.println("没有得到新闻内容!"); } } catch (ParserException ex) { Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex); } return content; }
/** * 方法:获取对应的页面内容 * * @param htmlPageContent * @param preUrl * @throws ParserException * <p>Add By Ethan Lam At 2011-11-23 */ public void fetchHtmlContent(String htmlPageContent, String preUrl) throws ParserException { Parser parser = new Parser(); parser.setInputHTML(htmlPageContent); NodeFilter filter = new AndFilter( new TagNameFilter("div"), new HasAttributeFilter("class", "blkContainerSblkCon")); NodeList nodeList = parser.parse(filter); NodeIterator it = nodeList.elements(); Div div = null; StringBuffer htmlContent = new StringBuffer(); while (it.hasMoreNodes()) { div = (Div) it.nextNode(); NodeList nl = div.getChildren(); if (nl == null) continue; NodeIterator sub = nl.elements(); while (sub.hasMoreNodes()) { Node t = sub.nextNode(); if (t instanceof ParagraphTag) { // LoggerUtil.info("fetchHtmlContent:",((ParagraphTag) t).getStringText()); htmlContent.append(((ParagraphTag) t).getStringText()); } } } if ("".equals(htmlContent.toString().trim())) return; Page page = new Page(); page.setUrl(preUrl); page.setSegment(htmlContent.toString()); LoggerUtil.info(preUrl + "获取到的页面内容:", htmlContent.toString()); pageSer.save(page); }
/**
 * Reads the date of a news article.
 *
 * <p>Every node matched by {@code dateFilter} is cast to {@link Div}; the markup text of the last
 * match wins.
 *
 * @param dateFilter filter selecting the node(s) that contain the date
 * @param parser parser for the article page
 * @return the text of the last matching node, or {@code null} when nothing matched or parsing
 *     failed
 */
public String getNewsDate(NodeFilter dateFilter, Parser parser) {
  String latest = null;
  try {
    NodeList matches = parser.parse(dateFilter);
    int position = 0;
    while (position < matches.size()) {
      latest = ((Div) matches.elementAt(position)).getStringText();
      position++;
    }
  } catch (ParserException ex) {
    Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex);
  }
  return latest;
}
/**
 * Reads the responsible editor (the author) of a news article.
 *
 * <p>Every node matched by {@code newsauthorFilter} is cast to {@link Div}; the markup text of
 * the last match wins.
 *
 * @param newsauthorFilter filter selecting the node(s) that contain the author
 * @param parser parser for the article page
 * @return the text of the last matching node, or an empty string when nothing matched or parsing
 *     failed
 */
public String getNewsAuthor(NodeFilter newsauthorFilter, Parser parser) {
  String latest = "";
  try {
    NodeList matches = parser.parse(newsauthorFilter);
    int position = 0;
    while (position < matches.size()) {
      latest = ((Div) matches.elementAt(position)).getStringText();
      position++;
    }
  } catch (ParserException ex) {
    Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex);
  }
  return latest;
}
public boolean checkprice() { System.out.println("checking amazon url:" + page.url); try { URL url = new URL(page.url); HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection(); urlConnection.setConnectTimeout(Constant.connect_timeout); Parser parser = new Parser(urlConnection); parser.setEncoding(Constant.ENCODE); // OrFilter lastFilter = new OrFilter(); // lastFilter.setPredicates(new NodeFilter[] { // new NodeClassFilter(TableTag.class), // new NodeClassFilter(Div.class) }); // // NodeList list = parser.extractAllNodesThatMatch(lastFilter); NodeList list = parser.extractAllNodesThatMatch(new NodeClassFilter(Div.class)); System.out.println("size:" + list.size()); for (int i = 0; i < list.size(); i++) { Node tag = list.elementAt(i); if (tag instanceof Div) { Div d = (Div) tag; System.out.println(d.getAttribute("id")); if (d.getAttribute("id").startsWith("result_")) { // found one product try { AmazonProduct product = new AmazonProduct(); product.name = d.getAttribute("name"); getPriceAndLabel(d, product); } catch (Exception e) { e.printStackTrace(); } } } } } catch (Exception e) { System.out.println(e.getMessage()); } return false; }
/** * 根据URL获取内容 * * @param url * @return * @throws Exception */ static String content(String url) throws Exception { Parser parser = new Parser(); parser.setURL(url); parser.setEncoding("GB2312"); NodeFilter fileter = new NodeClassFilter(Div.class); NodeList list = parser .extractAllNodesThatMatch(fileter) .extractAllNodesThatMatch(new HasAttributeFilter("id", "contentDiv")); String content = null; if (null != list && list.size() > 0) { Div div = (Div) list.elementAt(0); String tmp = div.getStringText(); // logger.debug("author:"+tmp); content = tmp; } return content; }
/**
 * Regression test for badly nested DIV tags.
 *
 * <p>The markup is extracted from "http://scores.nba.com/games/20031029/scoreboard.html", which
 * has many table columns with unclosed DIV tags because the closing DIV lacks its slash. This
 * used to cause java.lang.StackOverflowError on Windows. The test exercises the non-recursive
 * CompositeTagScanner with the walk back through the parse stack. See also Bug #750117
 * StackOverFlow while Node-Iteration and others.
 */
public void testInvalidNesting() throws ParserException {
  // Note the second "<div>" where a "</div>" was intended.
  String brokenCell =
      "<td><div class=\"ScoreBoardSec\"> <a target=\"_parent\" class=\"ScoreBoardSec\" href=\"http://www.nba.com/heat/\">Heat</a><div></td>";
  String html =
      "<table cellspacing=\"2\" cellpadding=\"0\" border=\"0\" width=\"600\">\n"
          + "<tr>\n"
          + brokenCell
          + "\n"
          + "</tr>\n"
          + "</table>";
  createParser(html);
  parseAndAssertNodeCount(1);

  // table
  assertType("table", TableTag.class, node[0]);
  TableTag scoreTable = (TableTag) node[0];
  assertTrue("table should have 3 nodes", 3 == scoreTable.getChildCount());

  // row
  assertType("row", TableRow.class, scoreTable.childAt(1));
  TableRow scoreRow = (TableRow) scoreTable.childAt(1);
  assertTrue("row should have 3 nodes", 3 == scoreRow.getChildCount());

  // column
  assertType("column", TableColumn.class, scoreRow.childAt(1));
  TableColumn scoreCell = (TableColumn) scoreRow.childAt(1);
  assertTrue("column should have 1 node", 1 == scoreCell.getChildCount());

  // outer div and its link
  assertType("element", Div.class, scoreCell.childAt(0));
  Div outerDiv = (Div) scoreCell.childAt(0);
  assertTrue("div should have 3 nodes", 3 == outerDiv.getChildCount());
  assertType("link", LinkTag.class, outerDiv.childAt(1));
  LinkTag teamLink = (LinkTag) outerDiv.childAt(1);
  assertTrue("link contents", teamLink.getLink().equals("http://www.nba.com/heat/"));

  // the mistyped closing tag shows up as an empty nested div
  assertType("bogus div", Div.class, outerDiv.childAt(2));
  Div bogusDiv = (Div) outerDiv.childAt(2);
  assertTrue("bogus div should have no children", 0 == bogusDiv.getChildCount());
}
/** * @param url * @return * @throws Exception */ String author(String url) throws Exception { Parser parser = new Parser(); parser.setURL(url); parser.setEncoding("GB2312"); NodeFilter fileter = new NodeClassFilter(Div.class); NodeList list = parser .extractAllNodesThatMatch(fileter) .extractAllNodesThatMatch(new HasAttributeFilter("class", "otb14")); String author = null; if (list != null && list.size() > 0) { Div div = (Div) list.elementAt(0); String tmp = div.getStringText(); author = tmp; } if (null == author) { // logger.debug("重新解析作者栏"); parser = new Parser(); parser.setURL(url); parser.setEncoding("GB2312"); NodeFilter fileter1 = new NodeClassFilter(Div.class); NodeList list1 = parser .extractAllNodesThatMatch(fileter1) .extractAllNodesThatMatch(new HasAttributeFilter("class", "pop_2_1_2")); if (null != list1 && list1.size() > 0) { Div div = (Div) list1.elementAt(1); String tmp = div.getStringText(); author = tmp.substring(tmp.indexOf("</a>") + 4); logger.debug("author:" + author); } } return author; }
@Override public void execute() { try { // 根据URL地址,获取网页内容 String html = HttpUtils.getHtml(httpclient, url); if (html == null) { throw new RuntimeException("无法获取【" + url + "】网址的内容"); } Topic a = new Topic(); // 设置文章的来源 a.setSource("www.ibm.com"); // 对网页内容进行分析和提取 // 设置文章的标题 MetaTag titleTag = ParseUtils.parseTag(html, MetaTag.class, "name", "title"); a.setTitle(titleTag.getMetaContent()); // 设置文章的关键字 MetaTag keywordTag = ParseUtils.parseTag(html, MetaTag.class, "name", "Keywords"); if (keywordTag.getMetaContent().length() > 255) { a.setKeyword(keywordTag.getMetaContent().substring(0, 255)); } // 设置文章的简介 MetaTag introTag = ParseUtils.parseTag(html, MetaTag.class, "name", "Abstract"); a.setSummary(introTag.getMetaContent()); // 设置文章的作者 List<Div> authors = ParseUtils.parseTags(html, Div.class, "class", "author"); String author = ""; for (int i = 0; i < authors.size(); i++) { if (i != 0) { author = author + ","; } Div div = authors.get(i); author = author + ParseUtils.parseTag(div.getStringText(), LinkTag.class).getStringText(); } a.setAuthor(author); // 设置文章的内容 String content = StringUtils.substringBetween(html, "<!-- MAIN_COLUMN_CONTENT_BEGIN -->", "<!-- CMA"); // 查询文章的内容中所包含的图片,并下载到upload目录,然后创建Attachment对象,设置到Article对象中 List<ImageTag> imageTags = ParseUtils.parseTags(content, ImageTag.class); if (imageTags != null) { for (ImageTag it : imageTags) { // 得到图片所在的路径目录 String baseUrl = url.substring(0, url.lastIndexOf("/") + 1); // 这个是<img>标签中的src的值 String imageUrl = it.getImageURL(); // 图片的绝对路径 String absoluteUrl = baseUrl + imageUrl; // : "文章标题/xxx.jpg" String imageName = a.getTitle().replaceAll("/|\\\\|\\:|\\*|\\?|\\||\\<|>", "_") + "/" + imageUrl; // 把图片保存到upload目录 // 首先确定,保存到本地的图片的路径 String imageLocalFile = ""; // Attachment.ATTACHMENT_DIR + imageName; // 如果图片已经被下载到本地,则不再下载 if (!new File(imageLocalFile).exists()) { // 下载图片的信息 byte[] image = HttpUtils.getImage(httpclient, absoluteUrl); // 直接使用new FileOutputStream(imageLocalFile)这种方式,创建一个 // 
文件输出流,存在的问题就是:如果这个文件所在的目录不存在,则创建不了 // 输出流,会抛出异常! // 所以,使用辅助的工具类来创建一个文件输出流:FileUtils.openOutputStream(new File(imageLocalFile)) // 通过这个方法,当文件所在的父目录不存在的时候,将自动创建其所有的父目录 IOUtils.write(image, FileUtils.openOutputStream(new File(imageLocalFile))); System.out.println("图片【" + absoluteUrl + "】已下载"); } // 针对每张图片,创建一个Attachment对象 Attachment attachment = new Attachment(); attachment.setType("image/jpeg"); attachment.setOldName(imageName); // a.addAttachment(attachment); } } // 修改content中的所有图片的src的值 // 将src的值,加上前缀:upload_image/文章标题/图片.jpg content = ParseUtils.modifyImageUrl( content, "upload_image/" + a.getTitle().replaceAll("/|\\\\|\\:|\\*|\\?|\\||\\<|>", "_") + "/"); // 删除<hr>和"回首页"的链接标签 content = ParseUtils.reomveTags(content, Div.class, "class", "ibm-alternate-rule"); content = ParseUtils.reomveTags( content, ParagraphTag.class, "class", "ibm-ind-link ibm-back-to-top"); a.setContent(content); // 将文章对象放入HttpContext List<Topic> articles = new ArrayList<Topic>(); articles.add(a); context.setAttribute("articles", articles); } catch (Exception e) { e.printStackTrace(); } }
public void checkprice() throws Exception { // System.out.println("checking drugstore url:" + page.getUrl()); String cookies = ""; // DrugstoreLogin.getCookies(); URL url = new URL(page.getUrl()); HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection(); urlConnection.setConnectTimeout(Constant.connect_timeout); urlConnection.setRequestProperty( "User-Agent", "Mozilla/5.0 (compatible; MSIE 6.0; Windows NT)"); urlConnection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded"); urlConnection.setRequestProperty("Cookie", cookies); urlConnection.connect(); // InputStream is = urlConnection.getInputStream(); // // BufferedReader reader = new BufferedReader(new // InputStreamReader(is)); // // String s; // StringBuilder result = new StringBuilder(); // while (((s = reader.readLine()) != null)) { // result.append(s); // } // // System.out.println("result= " + result.toString()); // // is.close(); Parser parser = new Parser(urlConnection); parser.setEncoding(Constant.ENCODE); NodeFilter name_filter3 = new AndFilter( new NodeClassFilter(Div.class), new HasAttributeFilter("id", "divAvailablity")); NodeFilter name_filter4 = new AndFilter(new NodeClassFilter(Div.class), new HasAttributeFilter("id", "productprice")); OrFilter lastFilter = new OrFilter(); lastFilter.setPredicates( new NodeFilter[] {new NodeClassFilter(TitleTag.class), name_filter3, name_filter4}); NodeList list = parser.extractAllNodesThatMatch(lastFilter); for (int i = 0; i < list.size(); i++) { Node tag = list.elementAt(i); if (tag instanceof TitleTag) { TitleTag d = (TitleTag) tag; page.title = d.getTitle().replaceAll("drugstore.com", "").replaceAll("\\|", "").trim(); } else if (tag instanceof Div) { Div d = (Div) tag; String sStr = d.getStringText(); // System.out.println(sStr); if ("productprice".equalsIgnoreCase(d.getAttribute("id"))) { page.price = getPrice(sStr); } else if ("divAvailablity".equalsIgnoreCase(d.getAttribute("id"))) { if (sStr.indexOf("In Stock") >= 0 || 
sStr.indexOf("in stock") >= 0) { page.instock = true; } } // System.out.println(d.getStringText()); // getinStock(d); } } }
private void scanPage() throws IOException, ParserException, ParseException { URL u = new URL(this.url); HttpURLConnection conn = (HttpURLConnection) u.openConnection(); Parser parser = new Parser(conn); System.setProperty("sun.net.client.defaultConnectTimeout", "30000000"); // jdk1.4换成这个,连接超时 System.setProperty("sun.net.client.defaultReadTimeout", "30000000"); // jdk1.4换成这个,读操作超时 // con.setConnectTimeout(5000);//jdk 1.5换成这个,连接超时 // con.setReadTimeout(5000);//jdk 1.5换成这个,读操作超时 parser.setEncoding("UTF-8"); NodeFilter filter = new NodeClassFilter(CompositeTag.class); NodeList tags = parser.extractAllNodesThatMatch(filter); SimpleNodeIterator iter = tags.elements(); CompositeTag tag = null; while (iter.hasMoreNodes()) { tag = (CompositeTag) iter.nextNode(); String id = tag.getAttribute("id"); String cls = tag.getAttribute("class"); if ((tag instanceof LinkTag)) { LinkTag lt = (LinkTag) tag; if (cls == null) { continue; } if (cls.startsWith("gae-click*Product-Page*Breadcrumb*Category")) { this.category = lt.getStringText(); continue; } if (cls.startsWith("gae-click*Product-Page*Breadcrumb*Sub-Category")) { this.subCategory = lt.getStringText(); continue; } if (cls.startsWith("gae-click*Product-Page*Breadcrumb*Brand")) { this.brand = lt.getStringText(); continue; } if (cls.startsWith("gae-click*Product-Page*PrForm*Free-Shipping")) { this.freight = "Free Shipping!"; } else if (cls.equalsIgnoreCase("link fn")) { this.pname = lt.getStringText(); continue; } } else if ((tag instanceof LabelTag)) { LabelTag lt = (LabelTag) tag; if ((id != null) && (id.startsWith("label")) && (cls != null) && (cls.startsWith("d"))) { String l = lt.getLabel(); l = l.replace("\n", ""); int idx = l.indexOf(40); if (idx > 0) { l = l.substring(0, idx); } this.dimNames.put(cls, l); } } else if (!(tag instanceof SelectTag)) { if ((tag instanceof Span)) { if ((id != null) && (id.equalsIgnoreCase("sku"))) { String sku = tag.getStringText(); this.pid = sku.substring(sku.indexOf(35) + 1); } } else if 
((tag instanceof Bullet)) { Bullet b = (Bullet) tag; String text = b.getStringText().trim(); if (text.startsWith("Weight")) { int idx = text.indexOf(":"); this.weight = text.substring(idx + 1).trim(); } } else if ((tag instanceof Div)) { Div div = (Div) tag; if (cls == null) { continue; } if (cls.equalsIgnoreCase("description")) { StringBuilder sb = new StringBuilder(); BulletList bullets = (BulletList) div.getChild(0); SimpleNodeIterator bls = bullets.elements(); while (bls.hasMoreNodes()) { Node n = bls.nextNode(); if ((n instanceof Bullet)) { Bullet bl = (Bullet) n; sb.append(bl.getStringText()); } } this.intro = sb.toString(); } } else if ((this.items == null) && ((tag instanceof ScriptTag))) { this.items = readScript((ScriptTag) tag); } } } }