Example #1
0
  /**
   * Tests scanning a link whose content mixes node types: an image tag followed by plain text.
   *
   * <p>The parser is configured with a node factory that registers only {@code LinkTag} and
   * {@code ImageTag} prototypes. The test then checks that the single parsed node is a link, that
   * the link has exactly two children (an {@code ImageTag} and a {@code Text}), and that the
   * image URL and the text contents match the fixture.
   *
   * @throws ParserException declared for the parser calls made by this test
   */
  public void testScan() throws ParserException {
    createParser(
        "<A HREF=\"mytest.html\"><IMG SRC=\"abcd.jpg\">Hello World</A>", "http://www.yahoo.com");
    parser.setNodeFactory(
        new PrototypicalNodeFactory(
            new Tag[] {
              new LinkTag(), new ImageTag(),
            }));
    parseAndAssertNodeCount(1);
    assertTrue("Node should be a link node", node[0] instanceof LinkTag);

    LinkTag linkTag = (LinkTag) node[0];
    // Copy the link's children into a scratch array so they can be checked by position.
    Node[] dataNode = new Node[10];
    int i = 0;
    for (SimpleNodeIterator e = linkTag.children(); e.hasMoreNodes(); ) {
      dataNode[i++] = e.nextNode();
    }
    // Compare as ints: boxing through the deprecated Integer(int) constructor is unnecessary.
    assertEquals("Number of data nodes", 2, i);
    assertTrue("First data node should be an Image Node", dataNode[0] instanceof ImageTag);
    assertTrue("Second data node shouls be a String Node", dataNode[1] instanceof Text);

    // Check the contents of each data node.
    ImageTag imageTag = (ImageTag) dataNode[0];
    assertEquals("Image URL", "http://www.yahoo.com/abcd.jpg", imageTag.getImageURL());
    Text stringNode = (Text) dataNode[1];
    assertEquals("String Contents", "Hello World", stringNode.getText());
  }
Example #2
0
  /**
   * Checks that an {@code <img>} tag which is missing its closing {@code >} inside a link is
   * still reported as the link's single {@code ImageTag} child, and that its backslash-separated
   * {@code src} value yields the expected image URL.
   *
   * @throws ParserException declared for the parser calls made by this test
   */
  public void testBadImageInLinkBug() throws ParserException {
    final String html =
        "<a href=\"registration.asp?EventID=1272\"><img border=\"0\" src=\"\\images\\register.gif\"</a>";
    final String pageUrl = "http://www.fedpage.com/Event.asp?EventID=1272";
    createParser(html, pageUrl);
    parseAndAssertNodeCount(1);
    assertTrue("Node should be a HTMLLinkTag", node[0] instanceof LinkTag);
    LinkTag link = (LinkTag) node[0];

    // Gather the nodes nested inside the link so the image tag can be inspected.
    Node[] children = new Node[10];
    int childCount = 0;
    SimpleNodeIterator iterator = link.children();
    while (iterator.hasMoreNodes()) {
      children[childCount] = iterator.nextNode();
      childCount++;
    }

    assertEquals("Number of contained internal nodes", 1, childCount);
    assertTrue(children[0] instanceof ImageTag);
    ImageTag image = (ImageTag) children[0];
    final String expectedLocation = "http://www.fedpage.com/images\\register.gif";
    assertEquals("Image Tag Location", expectedLocation, image.getImageURL());
  }
Example #3
0
 /**
  * Parses a link that wraps an image followed by empty {@code <font><b>} markup and verifies the
  * link URL, the empty link text, and each of the link's five child nodes in document order:
  * the image tag, the {@code font} and {@code b} start tags, and the {@code /b} and
  * {@code /font} end tags.
  *
  * @throws ParserException declared for the parser calls made by this test
  */
 public void testLinkDataContents() throws ParserException {
   final String html =
       "<a href=\"http://transfer.go.com/cgi/atransfer.pl?goto=http://www.signs.movies.com&name=114332&srvc=nws&context=283&guid=4AD5723D-C802-4310-A388-0B24E1A79689\" target=\"_new\"><img src=\"http://ad.abcnews.com/ad/sponsors/buena_vista_pictures/bvpi-ban0003.gif\" width=468 height=60 border=\"0\" alt=\"See Signs in Theaters 8-2 - Starring Mel Gibson\" align=><font face=\"verdana,arial,helvetica\" SIZE=\"1\"><b></b></font></a>";
   createParser(html, "http://transfer.go.com");
   Tag[] prototypes = {new LinkTag(), new ImageTag()};
   parser.setNodeFactory(new PrototypicalNodeFactory(prototypes));
   parseAndAssertNodeCount(1);

   // The single top-level node must be the link itself.
   assertTrue("Node 0 should be a link tag", node[0] instanceof LinkTag);
   LinkTag link = (LinkTag) node[0];
   final String expectedLink =
       "http://transfer.go.com/cgi/atransfer.pl?goto=http://www.signs.movies.com&name=114332&srvc=nws&context=283&guid=4AD5723D-C802-4310-A388-0B24E1A79689";
   assertEquals("Link URL", expectedLink, link.getLink());
   assertEquals("Link Text", "", link.getLinkText());

   // Copy the link's children into an array so they can be checked by position.
   Node[] children = new Node[10];
   int childCount = 0;
   SimpleNodeIterator iterator = link.children();
   while (iterator.hasMoreNodes()) {
     children[childCount] = iterator.nextNode();
     childCount++;
   }
   assertEquals("There should be 5 contained nodes in the link tag", 5, childCount);

   // Child 0: the image and its attributes.
   assertTrue("First contained node should be an image tag", children[0] instanceof ImageTag);
   ImageTag image = (ImageTag) children[0];
   final String expectedImageLocation =
       "http://ad.abcnews.com/ad/sponsors/buena_vista_pictures/bvpi-ban0003.gif";
   assertEquals("Image Location", expectedImageLocation, image.getImageURL());
   assertEquals("Image Height", "60", image.getAttribute("HEIGHT"));
   assertEquals("Image Width", "468", image.getAttribute("WIDTH"));
   assertEquals("Image Border", "0", image.getAttribute("BORDER"));
   final String expectedAlt = "See Signs in Theaters 8-2 - Starring Mel Gibson";
   assertEquals("Image Alt", expectedAlt, image.getAttribute("ALT"));

   // Child 1: the <font> start tag.
   assertTrue("Second contained node should be Tag", children[1] instanceof Tag);
   Tag fontStart = (Tag) children[1];
   final String expectedFontText = "font face=\"verdana,arial,helvetica\" SIZE=\"1\"";
   assertEquals("Tag Contents", expectedFontText, fontStart.getText());

   // Child 2: the <b> start tag.
   assertTrue("Third contained node should be Tag", children[2] instanceof Tag);
   Tag boldStart = (Tag) children[2];
   assertEquals("Tag Contents", "b", boldStart.getText());

   // Child 3: the </b> end tag.
   assertTrue("Fourth contained node should be a Tag", children[3] instanceof Tag);
   Tag boldEnd = (Tag) children[3];
   assertTrue("Fourth contained node should be an EndTag", boldEnd.isEndTag());
   assertEquals("Fourth Tag contents", "/b", boldEnd.getText());

   // Child 4: the </font> end tag.
   assertTrue("Fifth contained node should be a Tag", children[4] instanceof Tag);
   Tag fontEnd = (Tag) children[4];
   assertTrue("Fifth contained node should be an EndTag", fontEnd.isEndTag());
   assertEquals("Fifth Tag contents", "/font", fontEnd.getText());
 }
Example #4
0
  /**
   * Fetches the page at {@code url}, builds a {@code Topic} from it and stores a one-element
   * list containing that topic in {@code context} under the {@code "articles"} attribute.
   *
   * <p>Steps, in order:
   *
   * <ol>
   *   <li>Download the HTML through {@code HttpUtils.getHtml}; a {@code null} result raises a
   *       {@code RuntimeException}.
   *   <li>Fill in source (fixed to {@code "www.ibm.com"}), title, keywords and summary from the
   *       {@code title}, {@code Keywords} and {@code Abstract} meta tags. Keywords longer than
   *       255 characters are truncated to 255.
   *   <li>Join the link text found in every {@code div class="author"} with commas to form the
   *       author.
   *   <li>Take the content between the {@code MAIN_COLUMN_CONTENT_BEGIN} marker and the next
   *       {@code <!-- CMA} marker, download the images it references, rewrite the image URLs
   *       and strip the separator and back-to-top elements.
   * </ol>
   *
   * <p>Any exception raised along the way is printed via {@code printStackTrace()} and not
   * propagated; in that case the {@code "articles"} attribute is not set by this call.
   */
  @Override
  public void execute() {

    try {
      // Fetch the page content for the configured URL.
      String html = HttpUtils.getHtml(httpclient, url);

      if (html == null) {
        throw new RuntimeException("无法获取【" + url + "】网址的内容");
      }

      Topic topic = new Topic();

      // Source of the article.
      topic.setSource("www.ibm.com");

      // Analyse the page and extract the individual fields.
      // Title.
      MetaTag titleTag = ParseUtils.parseTag(html, MetaTag.class, "name", "title");
      topic.setTitle(titleTag.getMetaContent());

      // Keywords: always stored, truncated only when they exceed the limit. Previously the
      // keyword was set only when it was too long, so normal-length keywords were dropped.
      final int maxKeywordLength = 255;
      MetaTag keywordTag = ParseUtils.parseTag(html, MetaTag.class, "name", "Keywords");
      String keyword = keywordTag.getMetaContent();
      if (keyword.length() > maxKeywordLength) {
        keyword = keyword.substring(0, maxKeywordLength);
      }
      topic.setKeyword(keyword);

      // Summary.
      MetaTag introTag = ParseUtils.parseTag(html, MetaTag.class, "name", "Abstract");
      topic.setSummary(introTag.getMetaContent());

      // Author(s): comma-separated link text of every <div class="author">.
      List<Div> authors = ParseUtils.parseTags(html, Div.class, "class", "author");
      StringBuilder author = new StringBuilder();
      for (int i = 0; i < authors.size(); i++) {
        if (i != 0) {
          author.append(",");
        }
        Div div = authors.get(i);
        author.append(ParseUtils.parseTag(div.getStringText(), LinkTag.class).getStringText());
      }
      topic.setAuthor(author.toString());

      // Article body.
      String content =
          StringUtils.substringBetween(html, "<!-- MAIN_COLUMN_CONTENT_BEGIN -->", "<!-- CMA");

      // Title with characters that are awkward in file names replaced by '_'; used both for
      // the per-article image folder name and for the rewritten image URLs below.
      String safeTitle = topic.getTitle().replaceAll("/|\\\\|\\:|\\*|\\?|\\||\\<|>", "_");

      // Find the images referenced by the content and download each one.
      List<ImageTag> imageTags = ParseUtils.parseTags(content, ImageTag.class);
      if (imageTags != null) {
        // Directory part of the page URL (everything up to and including the last '/').
        String baseUrl = url.substring(0, url.lastIndexOf("/") + 1);

        for (ImageTag it : imageTags) {

          // Value of the src attribute of the <img> tag.
          String imageUrl = it.getImageURL();

          // Absolute image URL; assumes src is relative to the page directory — TODO confirm.
          String absoluteUrl = baseUrl + imageUrl;

          // Shape: "<article title>/xxx.jpg"
          String imageName = safeTitle + "/" + imageUrl;

          // Local path the image is saved to.
          // NOTE(review): this is currently the empty path; the intended value appears to be
          // Attachment.ATTACHMENT_DIR + imageName. With an empty path the existence check below
          // is always false and opening the output stream is expected to fail — confirm the
          // target directory and restore it.
          String imageLocalFile = "";

          // Skip the download when the image is already present locally.
          if (!new File(imageLocalFile).exists()) {
            byte[] image = HttpUtils.getImage(httpclient, absoluteUrl);
            // FileUtils.openOutputStream creates missing parent directories, which a plain
            // FileOutputStream would not. The caller owns the returned stream, so it is closed
            // here to avoid leaking a file handle per image.
            java.io.OutputStream out = FileUtils.openOutputStream(new File(imageLocalFile));
            try {
              IOUtils.write(image, out);
            } finally {
              out.close();
            }
            System.out.println("图片【" + absoluteUrl + "】已下载");
          }

          // One Attachment per image.
          // NOTE(review): the attachment is never added to the topic (addAttachment is
          // disabled), so it is currently discarded.
          Attachment attachment = new Attachment();
          attachment.setType("image/jpeg");
          attachment.setOldName(imageName);
        }
      }

      // Rewrite every image src in the content with the prefix
      // "upload_image/<article title>/".
      content = ParseUtils.modifyImageUrl(content, "upload_image/" + safeTitle + "/");

      // Remove the <hr>-style separator divs and the "back to top" link paragraphs.
      content = ParseUtils.reomveTags(content, Div.class, "class", "ibm-alternate-rule");
      content =
          ParseUtils.reomveTags(
              content, ParagraphTag.class, "class", "ibm-ind-link ibm-back-to-top");

      topic.setContent(content);

      // Publish the article through the context.
      List<Topic> articles = new ArrayList<Topic>();
      articles.add(topic);

      context.setAttribute("articles", articles);
    } catch (Exception e) {
      // Best effort: failures are reported on stderr and not propagated to the caller.
      e.printStackTrace();
    }
  }