/** * 获取新闻的内容 * * @param newsContentFilter * @param parser * @return content 新闻内容 */ public String getNewsContent(NodeFilter newsContentFilter, Parser parser) { String content = null; StringBuilder builder = new StringBuilder(); try { NodeList newsContentList = parser.parse(newsContentFilter); for (int i = 0; i < newsContentList.size(); i++) { Div newsContenTag = (Div) newsContentList.elementAt(i); builder = builder.append(newsContenTag.getStringText()); } content = builder.toString(); // 转换为String 类型。 if (content != null) { parser.reset(); parser = Parser.createParser(content, "utf8"); StringBean sb = new StringBean(); sb.setCollapse(true); parser.visitAllNodesWith(sb); content = sb.getStrings(); // String s = "\";} else{ document.getElementById('TurnAD444').innerHTML = \"\";} } // showTurnAD444(intTurnAD444); }catch(e){}"; content = content.replaceAll("\\\".*[a-z].*\\}", ""); content = content.replace("[我来说两句]", ""); } else { System.out.println("没有得到新闻内容!"); } } catch (ParserException ex) { Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex); } return content; }
/**
 * Reads the news date from the page.
 *
 * <p>Each node matched by {@code dateFilter} is cast to {@code Div}; the inner text of the last
 * match wins.
 *
 * @param dateFilter filter selecting the div that carries the date
 * @param parser parser for the article page
 * @return the inner text of the last matching div, or {@code null} when nothing matched or a
 *     {@code ParserException} was raised (the exception is logged)
 */
public String getNewsDate(NodeFilter dateFilter, Parser parser) {
    String latest = null;
    try {
        final NodeList matches = parser.parse(dateFilter);
        int index = 0;
        while (index < matches.size()) {
            final Div holder = (Div) matches.elementAt(index);
            latest = holder.getStringText();
            index++;
        }
    } catch (ParserException parseFailure) {
        Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, parseFailure);
    }
    return latest;
}
/**
 * Reads the responsible editor (i.e. the author) of the news item.
 *
 * <p>Each node matched by {@code newsauthorFilter} is cast to {@code Div}; the inner text of the
 * last match wins.
 *
 * @param newsauthorFilter filter selecting the div that carries the author
 * @param parser parser for the article page
 * @return the inner text of the last matching div, or an empty string when nothing matched or a
 *     {@code ParserException} was raised (the exception is logged)
 */
public String getNewsAuthor(NodeFilter newsauthorFilter, Parser parser) {
    String editor = "";
    try {
        final NodeList found = parser.parse(newsauthorFilter);
        for (int pos = 0; pos < found.size(); ++pos) {
            editor = ((Div) found.elementAt(pos)).getStringText();
        }
    } catch (ParserException failure) {
        Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, failure);
    }
    return editor;
}
/** * @param url * @return * @throws Exception */ String author(String url) throws Exception { Parser parser = new Parser(); parser.setURL(url); parser.setEncoding("GB2312"); NodeFilter fileter = new NodeClassFilter(Div.class); NodeList list = parser .extractAllNodesThatMatch(fileter) .extractAllNodesThatMatch(new HasAttributeFilter("class", "otb14")); String author = null; if (list != null && list.size() > 0) { Div div = (Div) list.elementAt(0); String tmp = div.getStringText(); author = tmp; } if (null == author) { // logger.debug("重新解析作者栏"); parser = new Parser(); parser.setURL(url); parser.setEncoding("GB2312"); NodeFilter fileter1 = new NodeClassFilter(Div.class); NodeList list1 = parser .extractAllNodesThatMatch(fileter1) .extractAllNodesThatMatch(new HasAttributeFilter("class", "pop_2_1_2")); if (null != list1 && list1.size() > 0) { Div div = (Div) list1.elementAt(1); String tmp = div.getStringText(); author = tmp.substring(tmp.indexOf("</a>") + 4); logger.debug("author:" + author); } } return author; }
/** * 根据URL获取内容 * * @param url * @return * @throws Exception */ static String content(String url) throws Exception { Parser parser = new Parser(); parser.setURL(url); parser.setEncoding("GB2312"); NodeFilter fileter = new NodeClassFilter(Div.class); NodeList list = parser .extractAllNodesThatMatch(fileter) .extractAllNodesThatMatch(new HasAttributeFilter("id", "contentDiv")); String content = null; if (null != list && list.size() > 0) { Div div = (Div) list.elementAt(0); String tmp = div.getStringText(); // logger.debug("author:"+tmp); content = tmp; } return content; }
@Override public void execute() { try { // 根据URL地址,获取网页内容 String html = HttpUtils.getHtml(httpclient, url); if (html == null) { throw new RuntimeException("无法获取【" + url + "】网址的内容"); } Topic a = new Topic(); // 设置文章的来源 a.setSource("www.ibm.com"); // 对网页内容进行分析和提取 // 设置文章的标题 MetaTag titleTag = ParseUtils.parseTag(html, MetaTag.class, "name", "title"); a.setTitle(titleTag.getMetaContent()); // 设置文章的关键字 MetaTag keywordTag = ParseUtils.parseTag(html, MetaTag.class, "name", "Keywords"); if (keywordTag.getMetaContent().length() > 255) { a.setKeyword(keywordTag.getMetaContent().substring(0, 255)); } // 设置文章的简介 MetaTag introTag = ParseUtils.parseTag(html, MetaTag.class, "name", "Abstract"); a.setSummary(introTag.getMetaContent()); // 设置文章的作者 List<Div> authors = ParseUtils.parseTags(html, Div.class, "class", "author"); String author = ""; for (int i = 0; i < authors.size(); i++) { if (i != 0) { author = author + ","; } Div div = authors.get(i); author = author + ParseUtils.parseTag(div.getStringText(), LinkTag.class).getStringText(); } a.setAuthor(author); // 设置文章的内容 String content = StringUtils.substringBetween(html, "<!-- MAIN_COLUMN_CONTENT_BEGIN -->", "<!-- CMA"); // 查询文章的内容中所包含的图片,并下载到upload目录,然后创建Attachment对象,设置到Article对象中 List<ImageTag> imageTags = ParseUtils.parseTags(content, ImageTag.class); if (imageTags != null) { for (ImageTag it : imageTags) { // 得到图片所在的路径目录 String baseUrl = url.substring(0, url.lastIndexOf("/") + 1); // 这个是<img>标签中的src的值 String imageUrl = it.getImageURL(); // 图片的绝对路径 String absoluteUrl = baseUrl + imageUrl; // : "文章标题/xxx.jpg" String imageName = a.getTitle().replaceAll("/|\\\\|\\:|\\*|\\?|\\||\\<|>", "_") + "/" + imageUrl; // 把图片保存到upload目录 // 首先确定,保存到本地的图片的路径 String imageLocalFile = ""; // Attachment.ATTACHMENT_DIR + imageName; // 如果图片已经被下载到本地,则不再下载 if (!new File(imageLocalFile).exists()) { // 下载图片的信息 byte[] image = HttpUtils.getImage(httpclient, absoluteUrl); // 直接使用new FileOutputStream(imageLocalFile)这种方式,创建一个 // 
文件输出流,存在的问题就是:如果这个文件所在的目录不存在,则创建不了 // 输出流,会抛出异常! // 所以,使用辅助的工具类来创建一个文件输出流:FileUtils.openOutputStream(new File(imageLocalFile)) // 通过这个方法,当文件所在的父目录不存在的时候,将自动创建其所有的父目录 IOUtils.write(image, FileUtils.openOutputStream(new File(imageLocalFile))); System.out.println("图片【" + absoluteUrl + "】已下载"); } // 针对每张图片,创建一个Attachment对象 Attachment attachment = new Attachment(); attachment.setType("image/jpeg"); attachment.setOldName(imageName); // a.addAttachment(attachment); } } // 修改content中的所有图片的src的值 // 将src的值,加上前缀:upload_image/文章标题/图片.jpg content = ParseUtils.modifyImageUrl( content, "upload_image/" + a.getTitle().replaceAll("/|\\\\|\\:|\\*|\\?|\\||\\<|>", "_") + "/"); // 删除<hr>和"回首页"的链接标签 content = ParseUtils.reomveTags(content, Div.class, "class", "ibm-alternate-rule"); content = ParseUtils.reomveTags( content, ParagraphTag.class, "class", "ibm-ind-link ibm-back-to-top"); a.setContent(content); // 将文章对象放入HttpContext List<Topic> articles = new ArrayList<Topic>(); articles.add(a); context.setAttribute("articles", articles); } catch (Exception e) { e.printStackTrace(); } }
public void checkprice() throws Exception { // System.out.println("checking drugstore url:" + page.getUrl()); String cookies = ""; // DrugstoreLogin.getCookies(); URL url = new URL(page.getUrl()); HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection(); urlConnection.setConnectTimeout(Constant.connect_timeout); urlConnection.setRequestProperty( "User-Agent", "Mozilla/5.0 (compatible; MSIE 6.0; Windows NT)"); urlConnection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded"); urlConnection.setRequestProperty("Cookie", cookies); urlConnection.connect(); // InputStream is = urlConnection.getInputStream(); // // BufferedReader reader = new BufferedReader(new // InputStreamReader(is)); // // String s; // StringBuilder result = new StringBuilder(); // while (((s = reader.readLine()) != null)) { // result.append(s); // } // // System.out.println("result= " + result.toString()); // // is.close(); Parser parser = new Parser(urlConnection); parser.setEncoding(Constant.ENCODE); NodeFilter name_filter3 = new AndFilter( new NodeClassFilter(Div.class), new HasAttributeFilter("id", "divAvailablity")); NodeFilter name_filter4 = new AndFilter(new NodeClassFilter(Div.class), new HasAttributeFilter("id", "productprice")); OrFilter lastFilter = new OrFilter(); lastFilter.setPredicates( new NodeFilter[] {new NodeClassFilter(TitleTag.class), name_filter3, name_filter4}); NodeList list = parser.extractAllNodesThatMatch(lastFilter); for (int i = 0; i < list.size(); i++) { Node tag = list.elementAt(i); if (tag instanceof TitleTag) { TitleTag d = (TitleTag) tag; page.title = d.getTitle().replaceAll("drugstore.com", "").replaceAll("\\|", "").trim(); } else if (tag instanceof Div) { Div d = (Div) tag; String sStr = d.getStringText(); // System.out.println(sStr); if ("productprice".equalsIgnoreCase(d.getAttribute("id"))) { page.price = getPrice(sStr); } else if ("divAvailablity".equalsIgnoreCase(d.getAttribute("id"))) { if (sStr.indexOf("In Stock") >= 0 || 
sStr.indexOf("in stock") >= 0) { page.instock = true; } } // System.out.println(d.getStringText()); // getinStock(d); } } }