Пример #1
0
  /**
   * Loads the URL of {@code request} in a Chrome browser and parses the rendered page source with
   * Jsoup.
   *
   * <p>The browser is started with the request's user agent (when one is set) and, when the spider
   * returns a proxy for the request, with that proxy configured for HTTP, FTP and SSL traffic. The
   * browser is always shut down before this method returns, including when navigation or
   * page-source retrieval throws.
   *
   * @param request the request providing the URL to load and the user agent to present
   * @param spider the spider used to look up the proxy and to create the result container
   * @return the result items whose resource is the parsed document
   * @throws IOException if one of the calls made here reports an I/O failure
   */
  private ResultItems myDownload(Request request, Spider spider) throws IOException {
    ResultItems resultItems = new ResultItems(request, spider);

    DesiredCapabilities cap = DesiredCapabilities.chrome();
    ChromeOptions co = new ChromeOptions();
    String userAgent = request.getUserAgent();
    // Only override the user agent when one is actually provided; otherwise Chrome would be
    // started with the literal string "null" (or an empty value) as its user agent.
    if (userAgent != null && !userAgent.isEmpty()) {
      co.addArguments("--user-agent=" + userAgent);
    }
    cap.setCapability(ChromeOptions.CAPABILITY, co);

    Proxy myProxy = spider.getProxy(request);
    if (myProxy != null) {
      // Selenium expects the proxy as a single "host:port" string.
      String proxy = myProxy.getHost() + ":" + myProxy.getPort();
      org.openqa.selenium.Proxy p = new org.openqa.selenium.Proxy();
      p.setHttpProxy(proxy).setFtpProxy(proxy).setSslProxy(proxy);
      cap.setCapability(CapabilityType.PROXY, p);
    }

    WebDriver driver = new ChromeDriver(cap);
    String pageSource;
    try {
      driver.get(request.getUrl());
      // Get the html source of the page
      pageSource = driver.getPageSource();
    } finally {
      // Close the browser even if loading the page failed, so no Chrome process is leaked.
      driver.quit();
    }
    return resultItems.setResource(Jsoup.parse(pageSource, request.getUrl()));
  }
 /**
  * Fetches and processes the detail page of a single video.
  *
  * <p>The request is obtained from {@code VideoDetailPageProcessor.getRequest}, downloaded through
  * the {@code JsoupDownloader} singleton, handed to a new {@code FilePipeline} when the download
  * did not use the cache, and finally passed to {@code VideoDetailPageProcessor.instance}. The
  * video's {@code "series"} value is then printed to standard output.
  *
  * @param video the entity describing the video whose detail page is processed
  * @throws IOException if one of the calls made here reports an I/O failure
  */
 private void process(KindEntity video) throws IOException {
   Request detailRequest = VideoDetailPageProcessor.getRequest(video);
   ResultItems detailPage =
       JsoupDownloader.getInstance().download(detailRequest, new SpiderAdapter());

   // Only freshly downloaded pages go through the file pipeline; cached ones are skipped.
   boolean servedFromCache = detailPage.isCacheUsed();
   if (!servedFromCache) {
     FilePipeline filePipeline = new FilePipeline();
     filePipeline.process(detailPage);
   }

   VideoDetailPageProcessor.instance.process(detailPage);
   System.out.println("series" + video.get("series"));
 }
Пример #3
0
  /**
   * Extracts a three-level category tree from the page's document and adds every category to
   * {@code page}.
   *
   * <p>Each {@code div.txt-list-category-v2} element yields a top-level category, named after its
   * {@code h3} text and coded with its {@code id} attribute. Inside such an element, every
   * {@code a} element is inspected in document order:
   *
   * <ul>
   *   <li>an anchor with an empty {@code href} starts a new second-level category, which becomes
   *       the parent of the linked anchors that follow it;
   *   <li>an anchor with a non-empty {@code href} becomes a third-level category carrying the
   *       URL-decoded absolute link.
   * </ul>
   *
   * <p>Anchors whose trimmed text is empty are skipped, as are unlinked anchors whose text starts
   * with character code 160.
   *
   * @param page the result items providing the document and receiving the extracted categories
   * @throws RuntimeException if the {@code utf-8} encoding is unsupported, or if a linked anchor
   *     appears before any second-level category has been seen in its section
   */
  @Override
  public void process(ResultItems page) {
    Document document = (Document) page.getResource();

    for (Element section : document.select("div.txt-list-category-v2")) {
      String sectionTitle = section.select("h3").text();
      String sectionCode = section.attr("id");
      CategoryEntity ancestor =
          new CategoryEntity().setName(sectionTitle).setSite(SiteName.Taobao).setCode(sectionCode);
      getLogger().trace(ancestor);
      page.addItem(ancestor);

      // Most recent second-level category; linked anchors that follow are attached to it.
      CategoryEntity parent = null;
      for (Element anchor : section.select("a")) {
        boolean hasLink = !anchor.attr("href").isEmpty();

        if (hasLink) {
          String link = anchor.absUrl("href");
          try {
            link = java.net.URLDecoder.decode(link, "utf-8");
          } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(link, e);
          }
          String label = anchor.text().trim();
          if (label.isEmpty()) {
            continue;
          }
          // The entity is built before the parent check so that it can appear in the error message.
          CategoryEntity grand =
              new CategoryEntity()
                  .setName(label)
                  .setUrl(link)
                  .setSite(SiteName.Taobao)
                  .setParent(parent);
          if (parent == null) {
            throw new RuntimeException("no parent of " + grand);
          }
          getLogger().trace(grand);
          page.addItem(grand);
          continue;
        }

        String label = anchor.text().trim();
        // Code 160 is U+00A0 (no-break space), which String.trim() leaves in place.
        if (label.isEmpty() || label.charAt(0) == 160) {
          continue;
        }
        parent = new CategoryEntity().setName(label).setSite(SiteName.Taobao).setParent(ancestor);
        getLogger().trace(parent);
        page.addItem(parent);
      }
    }
  }