コード例 #1
0
  /**
   * Extracts the paragraph text of a page and persists it.
   *
   * <p>The markup is parsed for {@code div} tags selected by a {@code class} attribute filter on
   * {@code blkContainerSblkCon}. For each matching div, the string text of every direct child that
   * is a {@code ParagraphTag} is appended, without any separator, in iteration order. If the
   * collected text is empty or whitespace-only, nothing is saved. Otherwise a {@code Page} holding
   * {@code preUrl} and the collected text is created, the text is logged, and the page is passed
   * to {@code pageSer.save}.
   *
   * <p>Originally added by Ethan Lam, 2011-11-23.
   *
   * @param htmlPageContent HTML markup to parse
   * @param preUrl value stored as the URL of the saved page
   * @throws ParserException if the parser or a node iterator reports a failure
   */
  public void fetchHtmlContent(String htmlPageContent, String preUrl) throws ParserException {
    Parser htmlParser = new Parser();
    htmlParser.setInputHTML(htmlPageContent);

    NodeFilter divTag = new TagNameFilter("div");
    NodeFilter containerClass = new HasAttributeFilter("class", "blkContainerSblkCon");
    NodeList containers = htmlParser.parse(new AndFilter(divTag, containerClass));

    StringBuilder collected = new StringBuilder();
    for (NodeIterator divs = containers.elements(); divs.hasMoreNodes(); ) {
      Div container = (Div) divs.nextNode();
      NodeList children = container.getChildren();
      // A div without children contributes nothing.
      if (children != null) {
        for (NodeIterator kids = children.elements(); kids.hasMoreNodes(); ) {
          Node child = kids.nextNode();
          if (child instanceof ParagraphTag) {
            collected.append(((ParagraphTag) child).getStringText());
          }
        }
      }
    }

    String segment = collected.toString();
    // Skip persistence entirely when no visible paragraph text was found.
    if (segment.trim().length() == 0) {
      return;
    }

    Page page = new Page();
    page.setUrl(preUrl);
    page.setSegment(segment);
    LoggerUtil.info(preUrl + "获取到的页面内容:", segment);
    pageSer.save(page);
  }
コード例 #2
0
  /**
   * Collects the target hyperlink nodes of a page and queues the accepted ones for crawling.
   *
   * <p>The markup is parsed for {@code a} tags selected by a {@code target} attribute filter on
   * {@code _blank}. Every resulting {@code LinkTag} for which
   * {@code filterHandler.isLinkTagFilter} returns {@code false} is logged and registered through
   * {@code CrawlQueue.getQueueManager().newNode(linkText, link, preUrl)}.
   *
   * <p>This method is best-effort and never throws: any exception raised while parsing, filtering
   * or queueing stops processing of the remaining links and is logged instead of being
   * propagated.
   *
   * @param htmlPageContent HTML markup to scan for links
   * @param preUrl value passed as the third argument of {@code newNode}; presumably the URL of
   *     the page the links were found on (confirm against the queue manager)
   */
  public void dealLinkNodes(String htmlPageContent, String preUrl) {
    try {
      Parser parser = new Parser();
      parser.setInputHTML(htmlPageContent);
      NodeFilter filter =
          new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("target", "_blank"));
      NodeList nodeList = parser.parse(filter);
      LoggerUtil.info("ParserHandler", "爬虫得到新的节点个数:" + (nodeList != null ? nodeList.size() : 0));
      // The log line above already treats a null result as "no nodes"; honor that here too
      // instead of dereferencing it.
      if (nodeList == null) {
        return;
      }
      NodeIterator it = nodeList.elements();
      while (it.hasMoreNodes()) {
        Node node = it.nextNode();
        if (!(node instanceof LinkTag)) {
          continue;
        }
        LinkTag link = (LinkTag) node;
        // isLinkTagFilter returning true means the link is to be skipped.
        if (filterHandler.isLinkTagFilter(link)) {
          continue;
        }
        LoggerUtil.debug("ParserHandler  ", link.getLink(), link.getLinkText());
        CrawlQueue.getQueueManager().newNode(link.getLinkText(), link.getLink(), preUrl);
      }
    } catch (Exception e) {
      // Keep the crawl going, but record the failure rather than hiding it.
      LoggerUtil.info("ParserHandler", "dealLinkNodes failed for " + preUrl + ": " + e);
    }
  }