Ejemplo n.º 1
0
  @Test
  @Ignore
  public void test01() throws Exception {
    System.out.println("-------------------------------");
    WebClient webClient = new WebClient(BrowserVersion.CHROME);
    try {
      WebClientOptions options = webClient.getOptions();
      options.setThrowExceptionOnFailingStatusCode(false);
      options.setThrowExceptionOnScriptError(false);
      options.setCssEnabled(false);
      options.setJavaScriptEnabled(true);
      options.setTimeout(50000);
      // webClient.setAjaxController(new NicelyResynchronizingAjaxController());
      HtmlPage pageOrgin = webClient.getPage("http://lvyou.baidu.com/jinhua/jingdian");
      Thread.sleep(5000);
      DomNodeList<DomNode> pageNodes = pageOrgin.querySelectorAll(".pagination");
      HtmlDivision pageDiv = (HtmlDivision) pageNodes.get(0);
      DomNodeList<HtmlElement> liElements = pageDiv.getElementsByTagName("li");
      Integer pageSize = liElements.size() - 1;
      for (int pageNow = 0; pageNow < pageSize; pageNow++) {
        DomNodeList<HtmlElement> pageAnchors = liElements.get(pageNow).getElementsByTagName("a");
        HtmlAnchor pageAnchor = (HtmlAnchor) pageAnchors.get(0);
        HtmlPage page = pageAnchor.click();
        Thread.sleep(10000);
        DomElement jViewDom = page.getElementById("J-view-list-container");
        DomNodeList<HtmlElement> lis = jViewDom.getElementsByTagName("li");
        for (HtmlElement li : lis) {
          DomNodeList<DomNode> titleNodes = li.querySelectorAll(".title");
          HtmlAnchor titleAnchor = (HtmlAnchor) titleNodes.get(0);
          System.out.println("---------------标题----------------");
          log.debug("{}", titleAnchor.asText());

          DomNodeList<DomNode> picNodes = li.querySelectorAll(".pic");
          HtmlAnchor picAnchor = (HtmlAnchor) picNodes.get(0);
          System.out.println("---------------详情URL----------------");
          String detailUrl = "http://lvyou.baidu.com" + picAnchor.getAttribute("href");
          log.debug("{}", detailUrl);
          DomNodeList<HtmlElement> imgEelements = picAnchor.getElementsByTagName("img");
          for (HtmlElement imgEelement : imgEelements) {
            System.out.println("---------------图片----------------");
            log.debug("{}", imgEelement.getAttribute("src"));
          }

          DomNodeList<DomNode> sumNodes = li.querySelectorAll(".view-userSays");
          HtmlDivision sumDiv = (HtmlDivision) sumNodes.get(0);
          DomNodeList<HtmlElement> sumElements = sumDiv.getElementsByTagName("p");
          HtmlParagraph sumPara = (HtmlParagraph) sumElements.get(0);
          System.out.println("---------------摘要----------------");
          log.debug("{}", sumPara.asText());
        }
      }
    } finally {
      webClient.close();
    }
    System.out.println("-------------------------------");
  }
Ejemplo n.º 2
0
  @Test
  @Ignore
  public void test() throws Exception {
    System.out.println("-------------------------------");

    WebClient webClient = new WebClient(BrowserVersion.CHROME);
    webClient.getOptions().setCssEnabled(false);
    webClient.getOptions().setJavaScriptEnabled(false);
    HtmlPage page = webClient.getPage("http://news.163.com/domestic/");
    // DomNodeList<HtmlElement> elements = page.getElementBy

    System.out.println("---------------标题----------------");
    DomNodeList<DomNode> domNodes = page.querySelectorAll(".item-top");
    // log.debug("{}", domNodes);
    for (DomNode domNode : domNodes) {
      HtmlDivision htmlDivision = (HtmlDivision) domNode;
      DomNodeList<HtmlElement> aElements = htmlDivision.getElementsByTagName("a");
      HtmlAnchor htmlAnchor = (HtmlAnchor) aElements.get(0);
      // HTMLHeadingElement htmlHeading2 = (HTMLHeadingElement)
      // htmlDivision.getElementsByTagName("h2");
      // HtmlAnchor htmlAnchor = (HtmlAnchor) htmlDivision.getElementsByTagName("a");
      log.debug("{}", htmlAnchor.asText());
      log.debug("{}", htmlAnchor.getAttribute("href"));

      DomNodeList<HtmlElement> pElements = htmlDivision.getElementsByTagName("p");
      HtmlParagraph htmlParagraph = (HtmlParagraph) pElements.get(0);
      log.debug("{}", htmlParagraph.asText());

      DomNodeList<HtmlElement> iEelements = htmlDivision.getElementsByTagName("img");
      for (HtmlElement iEelement : iEelements) {
        log.debug("{}", iEelement.getAttribute("src"));
      }

      String detailUrl = htmlAnchor.getAttribute("href");
      if (detailUrl.equals("http://news.163.com/15/1215/17/BAT2L8RB00014JB6.html#f=dlist")) {
        HtmlPage detailPage = webClient.getPage(detailUrl);
        System.out.println("---------------正文----------------");
        DomElement endTextElement = detailPage.getElementById("endText");
        log.debug("{}", endTextElement.asText());

        System.out.println("---------------图片----------------");
        DomNodeList<DomNode> imgNodes = endTextElement.querySelectorAll(".f_center");
        for (DomNode imgNode : imgNodes) {
          HtmlParagraph imgpara = (HtmlParagraph) imgNode;
          DomNodeList<HtmlElement> endImgs = imgpara.getElementsByTagName("img");
          for (HtmlElement endImg : endImgs) {
            log.debug("{}", endImg.getAttribute("src"));
          }
        }
      }
    }
    webClient.close();
    System.out.println("-------------------------------");
  }