/** Test scan with data which is of diff nodes type */ public void testScan() throws ParserException { createParser( "<A HREF=\"mytest.html\"><IMG SRC=\"abcd.jpg\">Hello World</A>", "http://www.yahoo.com"); parser.setNodeFactory( new PrototypicalNodeFactory( new Tag[] { new LinkTag(), new ImageTag(), })); parseAndAssertNodeCount(1); assertTrue("Node should be a link node", node[0] instanceof LinkTag); LinkTag linkTag = (LinkTag) node[0]; // Get the link data and cross-check Node[] dataNode = new Node[10]; int i = 0; for (SimpleNodeIterator e = linkTag.children(); e.hasMoreNodes(); ) { dataNode[i++] = e.nextNode(); } assertEquals("Number of data nodes", new Integer(2), new Integer(i)); assertTrue("First data node should be an Image Node", dataNode[0] instanceof ImageTag); assertTrue("Second data node shouls be a String Node", dataNode[1] instanceof Text); // Check the contents of each data node ImageTag imageTag = (ImageTag) dataNode[0]; assertEquals("Image URL", "http://www.yahoo.com/abcd.jpg", imageTag.getImageURL()); Text stringNode = (Text) dataNode[1]; assertEquals("String Contents", "Hello World", stringNode.getText()); }
public void testBadImageInLinkBug() throws ParserException { createParser( "<a href=\"registration.asp?EventID=1272\"><img border=\"0\" src=\"\\images\\register.gif\"</a>", "http://www.fedpage.com/Event.asp?EventID=1272"); parseAndAssertNodeCount(1); assertTrue("Node should be a HTMLLinkTag", node[0] instanceof LinkTag); LinkTag linkTag = (LinkTag) node[0]; // Get the image tag from the link Node insideNodes[] = new Node[10]; int j = 0; for (SimpleNodeIterator e = linkTag.children(); e.hasMoreNodes(); ) { insideNodes[j++] = e.nextNode(); } assertEquals("Number of contained internal nodes", 1, j); assertTrue(insideNodes[0] instanceof ImageTag); ImageTag imageTag = (ImageTag) insideNodes[0]; assertEquals( "Image Tag Location", "http://www.fedpage.com/images\\register.gif", imageTag.getImageURL()); }
public void testLinkDataContents() throws ParserException { createParser( "<a href=\"http://transfer.go.com/cgi/atransfer.pl?goto=http://www.signs.movies.com&name=114332&srvc=nws&context=283&guid=4AD5723D-C802-4310-A388-0B24E1A79689\" target=\"_new\"><img src=\"http://ad.abcnews.com/ad/sponsors/buena_vista_pictures/bvpi-ban0003.gif\" width=468 height=60 border=\"0\" alt=\"See Signs in Theaters 8-2 - Starring Mel Gibson\" align=><font face=\"verdana,arial,helvetica\" SIZE=\"1\"><b></b></font></a>", "http://transfer.go.com"); parser.setNodeFactory( new PrototypicalNodeFactory( new Tag[] { new LinkTag(), new ImageTag(), })); parseAndAssertNodeCount(1); assertTrue("Node 0 should be a link tag", node[0] instanceof LinkTag); LinkTag linkTag = (LinkTag) node[0]; assertEquals( "Link URL", "http://transfer.go.com/cgi/atransfer.pl?goto=http://www.signs.movies.com&name=114332&srvc=nws&context=283&guid=4AD5723D-C802-4310-A388-0B24E1A79689", linkTag.getLink()); assertEquals("Link Text", "", linkTag.getLinkText()); Node[] containedNodes = new Node[10]; int i = 0; for (SimpleNodeIterator e = linkTag.children(); e.hasMoreNodes(); ) { containedNodes[i++] = e.nextNode(); } assertEquals("There should be 5 contained nodes in the link tag", 5, i); assertTrue( "First contained node should be an image tag", containedNodes[0] instanceof ImageTag); ImageTag imageTag = (ImageTag) containedNodes[0]; assertEquals( "Image Location", "http://ad.abcnews.com/ad/sponsors/buena_vista_pictures/bvpi-ban0003.gif", imageTag.getImageURL()); assertEquals("Image Height", "60", imageTag.getAttribute("HEIGHT")); assertEquals("Image Width", "468", imageTag.getAttribute("WIDTH")); assertEquals("Image Border", "0", imageTag.getAttribute("BORDER")); assertEquals( "Image Alt", "See Signs in Theaters 8-2 - Starring Mel Gibson", imageTag.getAttribute("ALT")); assertTrue("Second contained node should be Tag", containedNodes[1] instanceof Tag); Tag tag1 = (Tag) containedNodes[1]; assertEquals( "Tag Contents", "font face=\"verdana,arial,helvetica\" SIZE=\"1\"", tag1.getText()); assertTrue("Third contained node should be Tag", containedNodes[2] instanceof Tag); Tag tag2 = (Tag) containedNodes[2]; assertEquals("Tag Contents", "b", tag2.getText()); assertTrue("Fourth contained node should be a Tag", containedNodes[3] instanceof Tag); Tag tag = (Tag) containedNodes[3]; assertTrue("Fourth contained node should be an EndTag", tag.isEndTag()); assertEquals("Fourth Tag contents", "/b", tag.getText()); assertTrue("Fifth contained node should be a Tag", containedNodes[4] instanceof Tag); tag = (Tag) containedNodes[4]; assertTrue("Fifth contained node should be an EndTag", tag.isEndTag()); assertEquals("Fifth Tag contents", "/font", tag.getText()); }
@Override public void execute() { try { // 根据URL地址,获取网页内容 String html = HttpUtils.getHtml(httpclient, url); if (html == null) { throw new RuntimeException("无法获取【" + url + "】网址的内容"); } Topic a = new Topic(); // 设置文章的来源 a.setSource("www.ibm.com"); // 对网页内容进行分析和提取 // 设置文章的标题 MetaTag titleTag = ParseUtils.parseTag(html, MetaTag.class, "name", "title"); a.setTitle(titleTag.getMetaContent()); // 设置文章的关键字 MetaTag keywordTag = ParseUtils.parseTag(html, MetaTag.class, "name", "Keywords"); if (keywordTag.getMetaContent().length() > 255) { a.setKeyword(keywordTag.getMetaContent().substring(0, 255)); } // 设置文章的简介 MetaTag introTag = ParseUtils.parseTag(html, MetaTag.class, "name", "Abstract"); a.setSummary(introTag.getMetaContent()); // 设置文章的作者 List<Div> authors = ParseUtils.parseTags(html, Div.class, "class", "author"); String author = ""; for (int i = 0; i < authors.size(); i++) { if (i != 0) { author = author + ","; } Div div = authors.get(i); author = author + ParseUtils.parseTag(div.getStringText(), LinkTag.class).getStringText(); } a.setAuthor(author); // 设置文章的内容 String content = StringUtils.substringBetween(html, "<!-- MAIN_COLUMN_CONTENT_BEGIN -->", "<!-- CMA"); // 查询文章的内容中所包含的图片,并下载到upload目录,然后创建Attachment对象,设置到Article对象中 List<ImageTag> imageTags = ParseUtils.parseTags(content, ImageTag.class); if (imageTags != null) { for (ImageTag it : imageTags) { // 得到图片所在的路径目录 String baseUrl = url.substring(0, url.lastIndexOf("/") + 1); // 这个是<img>标签中的src的值 String imageUrl = it.getImageURL(); // 图片的绝对路径 String absoluteUrl = baseUrl + imageUrl; // : "文章标题/xxx.jpg" String imageName = a.getTitle().replaceAll("/|\\\\|\\:|\\*|\\?|\\||\\<|>", "_") + "/" + imageUrl; // 把图片保存到upload目录 // 首先确定,保存到本地的图片的路径 String imageLocalFile = ""; // Attachment.ATTACHMENT_DIR + imageName; // 如果图片已经被下载到本地,则不再下载 if (!new File(imageLocalFile).exists()) { // 下载图片的信息 byte[] image = HttpUtils.getImage(httpclient, absoluteUrl); // 直接使用new FileOutputStream(imageLocalFile)这种方式,创建一个 // 文件输出流,存在的问题就是:如果这个文件所在的目录不存在,则创建不了 // 输出流,会抛出异常! // 所以,使用辅助的工具类来创建一个文件输出流:FileUtils.openOutputStream(new File(imageLocalFile)) // 通过这个方法,当文件所在的父目录不存在的时候,将自动创建其所有的父目录 IOUtils.write(image, FileUtils.openOutputStream(new File(imageLocalFile))); System.out.println("图片【" + absoluteUrl + "】已下载"); } // 针对每张图片,创建一个Attachment对象 Attachment attachment = new Attachment(); attachment.setType("image/jpeg"); attachment.setOldName(imageName); // a.addAttachment(attachment); } } // 修改content中的所有图片的src的值 // 将src的值,加上前缀:upload_image/文章标题/图片.jpg content = ParseUtils.modifyImageUrl( content, "upload_image/" + a.getTitle().replaceAll("/|\\\\|\\:|\\*|\\?|\\||\\<|>", "_") + "/"); // 删除<hr>和"回首页"的链接标签 content = ParseUtils.reomveTags(content, Div.class, "class", "ibm-alternate-rule"); content = ParseUtils.reomveTags( content, ParagraphTag.class, "class", "ibm-ind-link ibm-back-to-top"); a.setContent(content); // 将文章对象放入HttpContext List<Topic> articles = new ArrayList<Topic>(); articles.add(a); context.setAttribute("articles", articles); } catch (Exception e) { e.printStackTrace(); } }