Пример #1
0
  /**
   * Downloads {@code page.url} and walks every {@code <div>} on the page looking for the one
   * whose {@code class} attribute is {@code pl_addToBasket}.
   *
   * <p>The matching branch is currently a no-op (its body is commented out), so at the moment the
   * method only verifies that the page can be fetched and parsed.
   *
   * @throws Exception if the URL is malformed, the connection fails, or the HTML cannot be parsed
   */
  public void checkprice() throws Exception {
    URL url = new URL(page.url);
    HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection();
    try {
      urlConnection.setConnectTimeout(Constant.connect_timeout);
      urlConnection.connect();

      Parser parser = new Parser(urlConnection);
      parser.setEncoding(Constant.ENCODE);

      // A single class filter is enough; wrapping it in an OrFilter added nothing.
      NodeList list = parser.extractAllNodesThatMatch(new NodeClassFilter(Div.class));

      for (int i = 0; i < list.size(); i++) {
        Node tag = list.elementAt(i);
        if (tag instanceof Div) {
          Div d = (Div) tag;
          String divclass = d.getAttribute("class");
          if ("pl_addToBasket".equalsIgnoreCase(divclass)) {
            // return getName(d);
          }
        }
      }
    } finally {
      // Release the underlying socket even when parsing fails.
      urlConnection.disconnect();
    }
  }
Пример #2
0
  /**
   * Extracts the plain-text body of a news article.
   *
   * <p>The inner HTML of every node matched by {@code newsContentFilter} is concatenated, parsed
   * again, and reduced to text with a {@code StringBean}. A trailing inline-script fragment and the
   * "[我来说两句]" marker are then removed.
   *
   * @param newsContentFilter filter selecting the nodes that hold the article body; every match is
   *     cast to {@code Div}
   * @param parser parser for the article page; it is reset once the matches have been collected
   * @return the article text, an empty string when nothing matched, or {@code null} when parsing
   *     throws a {@code ParserException}
   */
  public String getNewsContent(NodeFilter newsContentFilter, Parser parser) {
    String content = null;
    StringBuilder builder = new StringBuilder();

    try {
      NodeList newsContentList = parser.parse(newsContentFilter);
      for (int i = 0; i < newsContentList.size(); i++) {
        Div newsContenTag = (Div) newsContentList.elementAt(i);
        builder.append(newsContenTag.getStringText());
      }
      content = builder.toString();

      // Reset the caller's parser in every case, exactly as the original code path did.
      parser.reset();

      // builder.toString() is never null, so "nothing found" has to be detected by length.
      if (content.length() > 0) {
        // Use a separate parser for the extracted fragment instead of reassigning the parameter.
        Parser textParser = Parser.createParser(content, "utf8");
        StringBean sb = new StringBean();
        sb.setCollapse(true);
        textParser.visitAllNodesWith(sb);
        content = sb.getStrings();

        // Strip leftover ad-script text such as
        //   ";} else{ document.getElementById('TurnAD444').innerHTML = "";} } ... }catch(e){}
        content = content.replaceAll("\\\".*[a-z].*\\}", "");

        content = content.replace("[我来说两句]", "");

      } else {
        System.out.println("没有得到新闻内容!");
      }

    } catch (ParserException ex) {
      Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex);
    }

    return content;
  }
Пример #3
0
  /**
   * Extracts the paragraph text of a page and stores it as a {@code Page} record.
   *
   * <p>Only {@code <p>} elements that are direct children of a {@code div} with class
   * {@code blkContainerSblkCon} are collected. Nothing is saved when the collected text is blank.
   *
   * @param htmlPageContent raw HTML of the page
   * @param preUrl URL recorded on the saved {@code Page}
   * @throws ParserException if the HTML cannot be parsed
   *     <p>Add By Ethan Lam At 2011-11-23
   */
  public void fetchHtmlContent(String htmlPageContent, String preUrl) throws ParserException {
    Parser htmlParser = new Parser();
    htmlParser.setInputHTML(htmlPageContent);

    NodeFilter containerFilter =
        new AndFilter(
            new TagNameFilter("div"), new HasAttributeFilter("class", "blkContainerSblkCon"));
    NodeList containers = htmlParser.parse(containerFilter);

    StringBuffer paragraphs = new StringBuffer();
    for (NodeIterator outer = containers.elements(); outer.hasMoreNodes(); ) {
      Div container = (Div) outer.nextNode();
      NodeList children = container.getChildren();
      if (children != null) {
        for (NodeIterator inner = children.elements(); inner.hasMoreNodes(); ) {
          Node child = inner.nextNode();
          if (child instanceof ParagraphTag) {
            paragraphs.append(((ParagraphTag) child).getStringText());
          }
        }
      }
    }

    String segment = paragraphs.toString();
    if (segment.trim().length() == 0) {
      // Nothing worth persisting.
      return;
    }

    Page page = new Page();
    page.setUrl(preUrl);
    page.setSegment(segment);
    LoggerUtil.info(preUrl + "获取到的页面内容:", segment);
    pageSer.save(page);
  }
Пример #4
0
  /**
   * Reads the publication date of a news article.
   *
   * @param dateFilter filter selecting the node(s) that hold the date; every match is cast to
   *     {@code Div}
   * @param parser parser for the article page
   * @return the inner HTML of the last matching node, or {@code null} when nothing matched or
   *     parsing failed
   */
  public String getNewsDate(NodeFilter dateFilter, Parser parser) {
    String latest = null;
    try {
      NodeList matches = parser.parse(dateFilter);
      int index = 0;
      while (index < matches.size()) {
        Div dateDiv = (Div) matches.elementAt(index);
        // Later matches overwrite earlier ones, so the last one wins.
        latest = dateDiv.getStringText();
        index++;
      }
    } catch (ParserException ex) {
      Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex);
    }
    return latest;
  }
Пример #5
0
  /**
   * Reads the responsible editor (i.e. the author) of a news article.
   *
   * @param newsauthorFilter filter selecting the node(s) that hold the author; every match is cast
   *     to {@code Div}
   * @param parser parser for the article page
   * @return the inner HTML of the last matching node, or an empty string when nothing matched or
   *     parsing failed
   */
  public String getNewsAuthor(NodeFilter newsauthorFilter, Parser parser) {
    String result = "";
    try {
      NodeList matches = parser.parse(newsauthorFilter);
      for (int pos = 0; pos != matches.size(); ++pos) {
        Div authorDiv = (Div) matches.elementAt(pos);
        // Keep only the most recent match.
        result = authorDiv.getStringText();
      }
    } catch (ParserException ex) {
      Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex);
    }
    return result;
  }
Пример #6
0
  /**
   * Downloads the Amazon listing at {@code page.url} and hands every product container (a
   * {@code div} whose {@code id} starts with {@code result_}) to {@code getPriceAndLabel}.
   *
   * <p>Failures while processing one product are printed and do not stop the scan. Failures while
   * fetching or parsing the page are printed and end the scan.
   *
   * @return always {@code false}; the method never reports a positive result
   */
  public boolean checkprice() {
    System.out.println("checking amazon url:" + page.url);
    HttpURLConnection urlConnection = null;
    try {

      URL url = new URL(page.url);
      urlConnection = (HttpURLConnection) url.openConnection();
      urlConnection.setConnectTimeout(Constant.connect_timeout);

      Parser parser = new Parser(urlConnection);
      parser.setEncoding(Constant.ENCODE);

      NodeList list = parser.extractAllNodesThatMatch(new NodeClassFilter(Div.class));
      System.out.println("size:" + list.size());

      for (int i = 0; i < list.size(); i++) {
        Node tag = list.elementAt(i);
        if (!(tag instanceof Div)) {
          continue;
        }

        Div d = (Div) tag;
        String id = d.getAttribute("id");
        System.out.println(id);

        // Divs without an id are common; skip them instead of letting an NPE abort the scan.
        if (id == null || !id.startsWith("result_")) {
          continue;
        }

        // found one product
        try {
          AmazonProduct product = new AmazonProduct();
          product.name = d.getAttribute("name");
          getPriceAndLabel(d, product);

        } catch (Exception e) {
          e.printStackTrace();
        }
      }

    } catch (Exception e) {
      System.out.println(e.getMessage());
    } finally {
      // Release the underlying socket whether or not the scan succeeded.
      if (urlConnection != null) {
        urlConnection.disconnect();
      }
    }
    return false;
  }
Пример #7
0
  /**
   * Fetches a page and returns the inner HTML of its content container.
   *
   * <p>The page is decoded as GB2312. The container is the first {@code div} with
   * {@code id="contentDiv"}.
   *
   * @param url address of the page to fetch
   * @return the inner HTML of the content {@code div}, or {@code null} when the page has none
   * @throws Exception if the page cannot be fetched or parsed
   */
  static String content(String url) throws Exception {
    Parser pageParser = new Parser();
    pageParser.setURL(url);
    pageParser.setEncoding("GB2312");

    NodeList allDivs = pageParser.extractAllNodesThatMatch(new NodeClassFilter(Div.class));
    NodeList contentDivs =
        allDivs.extractAllNodesThatMatch(new HasAttributeFilter("id", "contentDiv"));

    if (contentDivs == null || contentDivs.size() <= 0) {
      return null;
    }
    Div first = (Div) contentDivs.elementAt(0);
    return first.getStringText();
  }
 /**
  * Regression test for badly nested DIV tags.
  *
  * <p>The markup is extracted from "http://scores.nba.com/games/20031029/scoreboard.html", which
  * has many table columns with unclosed DIV tags because the closing DIV lacks its slash. This
  * caused java.lang.StackOverflowError on Windows. The test exercises the non-recursive
  * CompositeTagScanner, which walks back through the parse stack. See also Bug #750117
  * StackOverFlow while Node-Iteration and others.
  */
 public void testInvalidNesting() throws ParserException {
   String markup =
       "<table cellspacing=\"2\" cellpadding=\"0\" border=\"0\" width=\"600\">\n"
           + "<tr>\n"
           + "<td><div class=\"ScoreBoardSec\">&nbsp;<a  target=\"_parent\" class=\"ScoreBoardSec\" href=\"http://www.nba.com/heat/\">Heat</a><div></td>\n"
           + "</tr>\n"
           + "</table>";
   createParser(markup);
   parseAndAssertNodeCount(1);

   // table -> row -> column
   assertType("table", TableTag.class, node[0]);
   TableTag scoreTable = (TableTag) node[0];
   assertTrue("table should have 3 nodes", scoreTable.getChildCount() == 3);

   assertType("row", TableRow.class, scoreTable.childAt(1));
   TableRow onlyRow = (TableRow) scoreTable.childAt(1);
   assertTrue("row should have 3 nodes", onlyRow.getChildCount() == 3);

   assertType("column", TableColumn.class, onlyRow.childAt(1));
   TableColumn cell = (TableColumn) onlyRow.childAt(1);
   assertTrue("column should have 1 node", cell.getChildCount() == 1);

   // column -> outer div -> link and the bogus, slash-less "closing" div
   assertType("element", Div.class, cell.childAt(0));
   Div outerDiv = (Div) cell.childAt(0);
   assertTrue("div should have 3 nodes", outerDiv.getChildCount() == 3);

   assertType("link", LinkTag.class, outerDiv.childAt(1));
   LinkTag teamLink = (LinkTag) outerDiv.childAt(1);
   assertTrue("link contents", teamLink.getLink().equals("http://www.nba.com/heat/"));

   assertType("bogus div", Div.class, outerDiv.childAt(2));
   Div bogusDiv = (Div) outerDiv.childAt(2);
   assertTrue("bogus div should have no children", bogusDiv.getChildCount() == 0);
 }
Пример #9
0
  /**
   * Fetches a page (decoded as GB2312) and extracts the author line.
   *
   * <p>The first {@code div} with class {@code otb14} is tried first. When the page has none, the
   * page is fetched again and the second {@code div} with class {@code pop_2_1_2} is used, keeping
   * only the text after its first {@code </a>}.
   *
   * @param url address of the page to fetch
   * @return the author markup/text, or {@code null} when neither layout matched
   * @throws Exception if the page cannot be fetched or parsed
   */
  String author(String url) throws Exception {
    Parser parser = new Parser();
    parser.setURL(url);
    parser.setEncoding("GB2312");

    NodeFilter fileter = new NodeClassFilter(Div.class);
    NodeList list =
        parser
            .extractAllNodesThatMatch(fileter)
            .extractAllNodesThatMatch(new HasAttributeFilter("class", "otb14"));
    String author = null;
    if (list != null && list.size() > 0) {
      Div div = (Div) list.elementAt(0);
      author = div.getStringText();
    }

    if (null == author) {
      // Fall back to the alternative layout: parse the author bar again.
      parser = new Parser();
      parser.setURL(url);
      parser.setEncoding("GB2312");

      NodeFilter fileter1 = new NodeClassFilter(Div.class);
      NodeList list1 =
          parser
              .extractAllNodesThatMatch(fileter1)
              .extractAllNodesThatMatch(new HasAttributeFilter("class", "pop_2_1_2"));
      // Index 1 (the second match) is read below, so at least two matches are required.
      if (null != list1 && list1.size() > 1) {
        Div div = (Div) list1.elementAt(1);
        String tmp = div.getStringText();
        // Drop everything up to and including the first closing anchor. When there is no
        // anchor, keep the text whole instead of blindly cutting its first three characters.
        int anchorEnd = tmp.indexOf("</a>");
        author = anchorEnd >= 0 ? tmp.substring(anchorEnd + 4) : tmp;
        logger.debug("author:" + author);
      }
    }
    return author;
  }
Пример #10
0
  /**
   * Crawls one article page and publishes it as a single-element {@code Topic} list under the
   * {@code "articles"} attribute of {@code context}.
   *
   * <p>Steps: fetch the HTML at {@code url}; read title, keywords and abstract from the
   * {@code <meta>} tags; join the author names with commas; cut out the main content; download
   * the images it references; rewrite image URLs; strip rule and back-to-top markup.
   *
   * <p>Any failure is printed with its stack trace, and nothing is published in that case.
   */
  @Override
  public void execute() {

    try {
      // Fetch the page content for the URL.
      String html = HttpUtils.getHtml(httpclient, url);

      if (html == null) {
        throw new RuntimeException("无法获取【" + url + "】网址的内容");
      }

      Topic a = new Topic();

      // Source of the article.
      a.setSource("www.ibm.com");

      // Analyse the page and extract the individual fields.
      // Title.
      MetaTag titleTag = ParseUtils.parseTag(html, MetaTag.class, "name", "title");
      a.setTitle(titleTag.getMetaContent());

      // Keywords: always stored, cut to 255 characters when longer. Previously keywords of
      // 255 characters or fewer were silently dropped.
      MetaTag keywordTag = ParseUtils.parseTag(html, MetaTag.class, "name", "Keywords");
      String keyword = keywordTag.getMetaContent();
      if (keyword.length() > 255) {
        keyword = keyword.substring(0, 255);
      }
      a.setKeyword(keyword);

      // Summary.
      MetaTag introTag = ParseUtils.parseTag(html, MetaTag.class, "name", "Abstract");
      a.setSummary(introTag.getMetaContent());

      // Authors, comma separated.
      List<Div> authors = ParseUtils.parseTags(html, Div.class, "class", "author");
      StringBuilder author = new StringBuilder();
      for (int i = 0; i < authors.size(); i++) {
        if (i != 0) {
          author.append(",");
        }
        Div div = authors.get(i);
        author.append(ParseUtils.parseTag(div.getStringText(), LinkTag.class).getStringText());
      }
      a.setAuthor(author.toString());

      // Main content.
      String content =
          StringUtils.substringBetween(html, "<!-- MAIN_COLUMN_CONTENT_BEGIN -->", "<!-- CMA");

      // Title with file-system-hostile characters replaced; used as the image directory name.
      String titleDir = a.getTitle().replaceAll("/|\\\\|\\:|\\*|\\?|\\||\\<|>", "_");

      // Find the images referenced by the content and download each one.
      List<ImageTag> imageTags = ParseUtils.parseTags(content, ImageTag.class);
      if (imageTags != null) {
        // Directory part of the article URL; image src values are resolved against it.
        String baseUrl = url.substring(0, url.lastIndexOf("/") + 1);

        for (ImageTag it : imageTags) {

          // Value of the src attribute of the <img> tag.
          String imageUrl = it.getImageURL();

          // Absolute URL of the image.
          String absoluteUrl = baseUrl + imageUrl;

          // "article title/xxx.jpg"
          String imageName = titleDir + "/" + imageUrl;

          // Local path the image is saved to.
          // NOTE(review): the directory prefix is commented out, so this is currently an empty
          // path and the write below cannot succeed — confirm the intended upload directory.
          String imageLocalFile = ""; // Attachment.ATTACHMENT_DIR + imageName;

          // Skip the download when the image already exists locally.
          File localFile = new File(imageLocalFile);
          if (!localFile.exists()) {
            byte[] image = HttpUtils.getImage(httpclient, absoluteUrl);
            // writeByteArrayToFile creates missing parent directories (like openOutputStream)
            // and, unlike the previous code, also closes the stream it opens.
            FileUtils.writeByteArrayToFile(localFile, image);
            System.out.println("图片【" + absoluteUrl + "】已下载");
          }

          // One Attachment per image (not yet attached to the topic).
          Attachment attachment = new Attachment();
          attachment.setType("image/jpeg");
          attachment.setOldName(imageName);
          // a.addAttachment(attachment);
        }
      }

      // Prefix every image src in the content: upload_image/<article title>/<image>
      content = ParseUtils.modifyImageUrl(content, "upload_image/" + titleDir + "/");

      // Remove the <hr>-style rules and the "back to top" links.
      content = ParseUtils.reomveTags(content, Div.class, "class", "ibm-alternate-rule");
      content =
          ParseUtils.reomveTags(
              content, ParagraphTag.class, "class", "ibm-ind-link ibm-back-to-top");

      a.setContent(content);

      // Publish the article through the context.
      List<Topic> articles = new ArrayList<Topic>();
      articles.add(a);

      context.setAttribute("articles", articles);
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
Пример #11
0
  /**
   * Downloads the drugstore.com product page at {@code page.getUrl()} and fills in
   * {@code page.title}, {@code page.price} and {@code page.instock}.
   *
   * <p>The title comes from the {@code <title>} tag with "drugstore.com" and "|" removed, the
   * price from the {@code div} with id {@code productprice}, and the stock flag from the
   * {@code div} with id {@code divAvailablity}. {@code page.instock} is only ever set to
   * {@code true} here; it is left untouched when the page does not say "in stock".
   *
   * @throws Exception if the URL is malformed, the connection fails, or the HTML cannot be parsed
   */
  public void checkprice() throws Exception {
    String cookies = ""; // DrugstoreLogin.getCookies();

    URL url = new URL(page.getUrl());
    HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection();
    try {
      urlConnection.setConnectTimeout(Constant.connect_timeout);
      urlConnection.setRequestProperty(
          "User-Agent", "Mozilla/5.0 (compatible; MSIE 6.0; Windows NT)");
      urlConnection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
      urlConnection.setRequestProperty("Cookie", cookies);
      urlConnection.connect();

      Parser parser = new Parser(urlConnection);
      parser.setEncoding(Constant.ENCODE);

      NodeFilter name_filter3 =
          new AndFilter(
              new NodeClassFilter(Div.class), new HasAttributeFilter("id", "divAvailablity"));
      NodeFilter name_filter4 =
          new AndFilter(
              new NodeClassFilter(Div.class), new HasAttributeFilter("id", "productprice"));

      OrFilter lastFilter = new OrFilter();
      lastFilter.setPredicates(
          new NodeFilter[] {new NodeClassFilter(TitleTag.class), name_filter3, name_filter4});

      NodeList list = parser.extractAllNodesThatMatch(lastFilter);
      for (int i = 0; i < list.size(); i++) {
        Node tag = list.elementAt(i);
        if (tag instanceof TitleTag) {
          TitleTag d = (TitleTag) tag;
          // Literal replace: with replaceAll the '.' in "drugstore.com" matched any character.
          page.title = d.getTitle().replace("drugstore.com", "").replace("|", "").trim();
        } else if (tag instanceof Div) {
          Div d = (Div) tag;
          String sStr = d.getStringText();
          if ("productprice".equalsIgnoreCase(d.getAttribute("id"))) {
            page.price = getPrice(sStr);
          } else if ("divAvailablity".equalsIgnoreCase(d.getAttribute("id"))) {
            if (sStr.indexOf("In Stock") >= 0 || sStr.indexOf("in stock") >= 0) {
              page.instock = true;
            }
          }
        }
      }
    } finally {
      // Release the underlying socket even when parsing fails.
      urlConnection.disconnect();
    }
  }
Пример #12
0
  /**
   * Downloads the product page at {@code this.url} and fills this object's product fields from
   * the tags found on it.
   *
   * <p>Every composite tag is inspected once and dispatched on its type:
   *
   * <ul>
   *   <li>links: category, sub-category, brand, free-shipping flag and product name, selected by
   *       their {@code class} attribute;
   *   <li>labels: dimension names, keyed by {@code class}, with any trailing "(...)" removed;
   *   <li>the span with id {@code sku}: product id (the text after '#');
   *   <li>bullets starting with "Weight": weight (the text after ':');
   *   <li>the div with class {@code description}: intro, built from the bullets of its first
   *       child;
   *   <li>the first script tag seen while {@code items} is still unset: handed to
   *       {@code readScript}.
   * </ul>
   *
   * @throws IOException if the connection cannot be opened
   * @throws ParserException if the HTML cannot be parsed
   * @throws ParseException declared for {@code readScript} — presumably thrown when the script
   *     content cannot be parsed; verify against that method
   */
  private void scanPage() throws IOException, ParserException, ParseException {
    URL u = new URL(this.url);
    HttpURLConnection conn = (HttpURLConnection) u.openConnection();
    Parser parser = new Parser(conn);
    // NOTE(review): these JVM-wide properties are set after the connection object exists and
    // use a very large value (30,000,000 ms); confirm they take effect and are intended.
    System.setProperty("sun.net.client.defaultConnectTimeout", "30000000"); // JDK 1.4: connect
    System.setProperty("sun.net.client.defaultReadTimeout", "30000000"); // JDK 1.4: read
    // con.setConnectTimeout(5000); // JDK 1.5 equivalent, connect timeout
    // con.setReadTimeout(5000); // JDK 1.5 equivalent, read timeout
    parser.setEncoding("UTF-8");
    NodeFilter filter = new NodeClassFilter(CompositeTag.class);
    NodeList tags = parser.extractAllNodesThatMatch(filter);
    SimpleNodeIterator iter = tags.elements();

    CompositeTag tag = null;
    while (iter.hasMoreNodes()) {
      tag = (CompositeTag) iter.nextNode();
      String id = tag.getAttribute("id");
      String cls = tag.getAttribute("class");
      if ((tag instanceof LinkTag)) {
        LinkTag lt = (LinkTag) tag;

        if (cls == null) {
          continue;
        }
        if (cls.startsWith("gae-click*Product-Page*Breadcrumb*Category")) {
          this.category = lt.getStringText();
          continue;
        }
        if (cls.startsWith("gae-click*Product-Page*Breadcrumb*Sub-Category")) {
          this.subCategory = lt.getStringText();
          continue;
        }
        if (cls.startsWith("gae-click*Product-Page*Breadcrumb*Brand")) {
          this.brand = lt.getStringText();
          continue;
        }
        if (cls.startsWith("gae-click*Product-Page*PrForm*Free-Shipping")) {
          this.freight = "Free Shipping!";
        } else if (cls.equalsIgnoreCase("link fn")) {
          this.pname = lt.getStringText();
          continue;
        }
      } else if ((tag instanceof LabelTag)) {
        LabelTag lt = (LabelTag) tag;
        if ((id != null) && (id.startsWith("label")) && (cls != null) && (cls.startsWith("d"))) {
          String l = lt.getLabel();
          l = l.replace("\n", "");
          // Keep only the part before an opening parenthesis, if there is one.
          int idx = l.indexOf('(');
          if (idx > 0) {
            l = l.substring(0, idx);
          }
          this.dimNames.put(cls, l);
        }
      } else if (!(tag instanceof SelectTag)) {
        if ((tag instanceof Span)) {
          if ((id != null) && (id.equalsIgnoreCase("sku"))) {
            String sku = tag.getStringText();
            // The product id follows the '#'; without a '#' the whole text is used.
            this.pid = sku.substring(sku.indexOf('#') + 1);
          }
        } else if ((tag instanceof Bullet)) {
          Bullet b = (Bullet) tag;
          String text = b.getStringText().trim();

          if (text.startsWith("Weight")) {
            int idx = text.indexOf(":");
            this.weight = text.substring(idx + 1).trim();
          }

        } else if ((tag instanceof Div)) {
          Div div = (Div) tag;
          if (cls == null) {
            continue;
          }
          if (cls.equalsIgnoreCase("description")) {
            // The first child is expected to be the bullet list. Skip the description when it
            // is missing or of another type instead of failing the whole scan on a bad cast.
            Node first = div.getChild(0);
            if (first instanceof BulletList) {
              StringBuilder sb = new StringBuilder();
              SimpleNodeIterator bls = ((BulletList) first).elements();
              while (bls.hasMoreNodes()) {
                Node n = bls.nextNode();
                if ((n instanceof Bullet)) {
                  Bullet bl = (Bullet) n;
                  sb.append(bl.getStringText());
                }
              }
              this.intro = sb.toString();
            }
          }
        } else if ((this.items == null) && ((tag instanceof ScriptTag))) {
          this.items = readScript((ScriptTag) tag);
        }
      }
    }
  }