private ResultItems myDownload(Request request, Spider spider) throws IOException { ResultItems resultItems = new ResultItems(request, spider); DesiredCapabilities cap = DesiredCapabilities.chrome(); ChromeOptions co = new ChromeOptions(); String userAgent = request.getUserAgent(); co.addArguments("--user-agent=" + userAgent); cap.setCapability(ChromeOptions.CAPABILITY, co); Proxy myProxy = spider.getProxy(request); if (myProxy != null) { String proxy = myProxy.getHost() + ":" + myProxy.getPort(); org.openqa.selenium.Proxy p = new org.openqa.selenium.Proxy(); p.setHttpProxy(proxy).setFtpProxy(proxy).setSslProxy(proxy); cap.setCapability(CapabilityType.PROXY, p); } WebDriver driver = new ChromeDriver(cap); driver.get(request.getUrl()); // Get the html source of the page String pageSource = driver.getPageSource(); // Close the browser driver.quit(); return resultItems.setResource(Jsoup.parse(pageSource, request.getUrl())); }
/**
 * Downloads the detail page for the given video and hands it to the detail
 * page processor.
 *
 * <p>When the download result does not report {@code isCacheUsed()}, the page
 * is first passed to a new {@code FilePipeline}. Afterwards the video's
 * {@code "series"} value is printed to standard output.
 *
 * @param video the entity from which the detail-page request is built
 * @throws IOException propagated from the download or processing steps
 */
private void process(KindEntity video) throws IOException {
    Request detailRequest = VideoDetailPageProcessor.getRequest(video);
    ResultItems detailPage =
            JsoupDownloader.getInstance().download(detailRequest, new SpiderAdapter());

    boolean freshlyDownloaded = !detailPage.isCacheUsed();
    if (freshlyDownloaded) {
        // Only pages that did not come from the cache go through the file pipeline.
        new FilePipeline().process(detailPage);
    }

    VideoDetailPageProcessor.instance.process(detailPage);
    System.out.println("series" + video.get("series"));
}
/**
 * Extracts a three-level category tree from the page's document and adds
 * every category to {@code page}.
 *
 * <p>Each {@code div.txt-list-category-v2} yields a top-level category (name
 * from its {@code h3}, code from its {@code id}). Within it, anchors are
 * visited in document order: an anchor without an {@code href} starts a new
 * second-level category, and an anchor with an {@code href} becomes a
 * third-level category under the most recent second-level one.
 *
 * @param page the result holder whose resource is the jsoup {@code Document}
 *             to scan and to which the categories are added
 * @throws RuntimeException if a linked anchor appears before any second-level
 *                          category in its block, or if the UTF-8 decoder is
 *                          unavailable
 */
@Override
public void process(ResultItems page) {
    Document document = (Document) page.getResource();

    for (Element block : document.select("div.txt-list-category-v2")) {
        String topName = block.select("h3").text();
        String topCode = block.attr("id");
        CategoryEntity top = new CategoryEntity()
                .setName(topName)
                .setSite(SiteName.Taobao)
                .setCode(topCode);
        getLogger().trace(top);
        page.addItem(top);

        // Most recent second-level category seen in this block.
        CategoryEntity currentParent = null;
        for (Element anchor : block.select("a")) {
            String label = anchor.text().trim();
            boolean hasLink = !anchor.attr("href").isEmpty();

            if (hasLink) {
                // The URL is decoded before the label check, as a decoding
                // failure must surface even for anchors without text.
                String rawUrl = anchor.absUrl("href");
                String decodedUrl;
                try {
                    decodedUrl = java.net.URLDecoder.decode(rawUrl, "utf-8");
                } catch (UnsupportedEncodingException e) {
                    throw new RuntimeException(rawUrl, e);
                }
                if (label.isEmpty()) {
                    continue;
                }
                CategoryEntity leaf = new CategoryEntity()
                        .setName(label)
                        .setUrl(decodedUrl)
                        .setSite(SiteName.Taobao)
                        .setParent(currentParent);
                if (currentParent == null) {
                    throw new RuntimeException("no parent of " + leaf);
                }
                getLogger().trace(leaf);
                page.addItem(leaf);
                continue;
            }

            // trim() does not strip U+00A0 (char code 160), so labels that
            // start with a non-breaking space are skipped explicitly.
            if (label.isEmpty() || label.charAt(0) == '\u00A0') {
                continue;
            }
            currentParent = new CategoryEntity()
                    .setName(label)
                    .setSite(SiteName.Taobao)
                    .setParent(top);
            getLogger().trace(currentParent);
            page.addItem(currentParent);
        }
    }
}