/**
 * Builds a new HTTP client on top of the shared {@code connectionManager}, configured from
 * the given site settings.
 *
 * <ul>
 *   <li>User-Agent: the site's value when it has one, otherwise an empty string.</li>
 *   <li>Gzip: when {@code site} is {@code null} or has gzip enabled, an interceptor adds
 *       {@code Accept-Encoding: gzip} to every request that does not already carry that
 *       header.</li>
 *   <li>Sockets: keep-alive and TCP no-delay are switched on.</li>
 *   <li>Retry handler and cookies: applied only when {@code site} is not {@code null}.</li>
 * </ul>
 *
 * @param site crawl settings; may be {@code null}, in which case only the site-independent
 *     defaults above are applied
 * @return a freshly built client
 */
private CloseableHttpClient generateClient(Site site) {
    HttpClientBuilder httpClientBuilder =
            HttpClients.custom().setConnectionManager(connectionManager);

    if (site != null && site.getUserAgent() != null) {
        httpClientBuilder.setUserAgent(site.getUserAgent());
    } else {
        httpClientBuilder.setUserAgent("");
    }

    if (site == null || site.isUseGzip()) {
        httpClientBuilder.addInterceptorFirst(
                new HttpRequestInterceptor() {
                    @Override
                    public void process(final HttpRequest request, final HttpContext context)
                            throws HttpException, IOException {
                        // Respect an Accept-Encoding header the caller has already set.
                        if (!request.containsHeader("Accept-Encoding")) {
                            request.addHeader("Accept-Encoding", "gzip");
                        }
                    }
                });
    }

    SocketConfig socketConfig =
            SocketConfig.custom().setSoKeepAlive(true).setTcpNoDelay(true).build();
    httpClientBuilder.setDefaultSocketConfig(socketConfig);

    if (site != null) {
        // Second argument 'true' also allows retrying requests that were already sent.
        httpClientBuilder.setRetryHandler(
                new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true));
        // Cookie seeding reads from the site, so it must be skipped when there is none;
        // calling it unconditionally threw a NullPointerException for a null site.
        generateCookie(httpClientBuilder, site);
    }

    return httpClientBuilder.build();
}
/**
 * Installs a default cookie store on the builder, pre-populated with the cookies declared
 * on the site.
 *
 * <p>Cookies from {@code site.getCookies()} are bound to {@code site.getDomain()}; cookies
 * from {@code site.getAllCookies()} are bound to the domain that keys each inner map.
 *
 * @param httpClientBuilder builder that receives the cookie store
 * @param site source of the cookies; when {@code null} an empty cookie store is installed
 */
private void generateCookie(HttpClientBuilder httpClientBuilder, Site site) {
    CookieStore cookieStore = new BasicCookieStore();
    // The caller accepts a null site, so guard here instead of failing with an NPE.
    if (site != null) {
        addCookiesForDomain(cookieStore, site.getDomain(), site.getCookies());
        for (Map.Entry<String, Map<String, String>> domainEntry :
                site.getAllCookies().entrySet()) {
            addCookiesForDomain(cookieStore, domainEntry.getKey(), domainEntry.getValue());
        }
    }
    httpClientBuilder.setDefaultCookieStore(cookieStore);
}

/** Adds every name/value pair in {@code cookies} to the store, bound to {@code domain}. */
private static void addCookiesForDomain(
        CookieStore cookieStore, String domain, Map<String, String> cookies) {
    for (Map.Entry<String, String> cookieEntry : cookies.entrySet()) {
        BasicClientCookie cookie =
                new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
        cookie.setDomain(domain);
        cookieStore.addCookie(cookie);
    }
}
/**
 * Example entry point: loads the Spring context from the classpath, fetches the
 * {@code ScreeningWeixinPipeline} bean and runs an {@code OOSpider} for
 * {@code ScreeningWeixinModel} against a single seed URL.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    ApplicationContext context =
            new ClassPathXmlApplicationContext("classpath:/applicationContext*.xml");
    ScreeningWeixinPipeline weixinPipeline = context.getBean(ScreeningWeixinPipeline.class);

    // Site settings: sleep time 1000 and 30 cycle retries.
    Site crawlSettings = Site.me().setSleepTime(1000).setCycleRetryTimes(30);
    String seedUrl =
            "http://m.wepiao.com/data/v5/cinemas/cities/10/sched_city_cinema_10_1003249.json?cityId=10&cinemaId=1003249";

    OOSpider.create(crawlSettings, weixinPipeline, ScreeningWeixinModel.class)
            .addUrl(seedUrl)
            .thread(1)
            .run();
}
/** * @author [email protected] <br> * @since 0.4.0 */ public class BaiduBaikePageProcessor implements PageProcessor { private Site site = Site.me() // .setHttpProxy(new HttpHost("127.0.0.1",8888)) .setRetryTimes(3) .setSleepTime(1000) .setUseGzip(true); public static void main(String[] args) { // single download Spider spider = Spider.create(new BaiduBaikePageProcessor()).thread(2); String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8"; ResultItems resultItems = spider.<ResultItems>get(String.format(urlTemplate, "水力发电")); System.out.println(resultItems); // multidownload List<String> list = new ArrayList<String>(); list.add(String.format(urlTemplate, "风力发电")); list.add(String.format(urlTemplate, "太阳能")); list.add(String.format(urlTemplate, "地热发电")); list.add(String.format(urlTemplate, "地热发电")); List<ResultItems> resultItemses = spider.<ResultItems>getAll(list); for (ResultItems resultItemse : resultItemses) { System.out.println(resultItemse.getAll()); } spider.close(); } @Override public void process(Page page) { page.putField("name", page.getHtml().css("h1.title div.lemmaTitleH1", "text").toString()); page.putField( "description", page.getHtml().xpath("//div[@id='lemmaContent-0']//div[@class='para']/allText()")); } @Override public Site getSite() { return site; } }