예제 #1
0
  /**
   * Builds an HTTP client configured from the given site settings.
   *
   * <p>The client uses {@code connectionManager} as its connection manager, enables socket
   * keep-alive and TCP no-delay, and, when a site is supplied, applies the site's user agent,
   * gzip preference, retry count and cookies.
   *
   * @param site crawl settings to apply; may be {@code null}, in which case an empty user agent
   *     is used, gzip is requested, and no retry handler or site cookies are installed
   * @return a newly built client
   */
  private CloseableHttpClient generateClient(Site site) {
    HttpClientBuilder httpClientBuilder =
        HttpClients.custom().setConnectionManager(connectionManager);

    // Fall back to an empty user agent when the site does not provide one.
    if (site != null && site.getUserAgent() != null) {
      httpClientBuilder.setUserAgent(site.getUserAgent());
    } else {
      httpClientBuilder.setUserAgent("");
    }

    // Gzip is requested by default (no site) or when the site asks for it.
    if (site == null || site.isUseGzip()) {
      httpClientBuilder.addInterceptorFirst(
          new HttpRequestInterceptor() {

            @Override
            public void process(final HttpRequest request, final HttpContext context)
                throws HttpException, IOException {
              // Respect an Accept-Encoding header the request already carries.
              if (!request.containsHeader("Accept-Encoding")) {
                request.addHeader("Accept-Encoding", "gzip");
              }
            }
          });
    }

    SocketConfig socketConfig =
        SocketConfig.custom().setSoKeepAlive(true).setTcpNoDelay(true).build();
    httpClientBuilder.setDefaultSocketConfig(socketConfig);

    if (site != null) {
      httpClientBuilder.setRetryHandler(
          new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true));
      // Without a site there are no cookies to install, so cookie setup is skipped instead of
      // passing null through; the builder is then left to use its own default cookie store.
      generateCookie(httpClientBuilder, site);
    }
    return httpClientBuilder.build();
  }
예제 #2
0
 /**
  * Installs a default cookie store on the builder, populated from the site's cookies.
  *
  * <p>Cookies from {@code site.getCookies()} are bound to {@code site.getDomain()}; cookies from
  * {@code site.getAllCookies()} are bound to the domain they are keyed under.
  *
  * @param httpClientBuilder builder that receives the cookie store
  * @param site source of the cookies; when {@code null} an empty cookie store is installed
  */
 private void generateCookie(HttpClientBuilder httpClientBuilder, Site site) {
   CookieStore cookieStore = new BasicCookieStore();
   // A null site has no cookies to contribute; previously this dereferenced it and threw.
   if (site != null) {
     for (Map.Entry<String, String> cookieEntry : site.getCookies().entrySet()) {
       BasicClientCookie cookie =
           new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
       cookie.setDomain(site.getDomain());
       cookieStore.addCookie(cookie);
     }
     for (Map.Entry<String, Map<String, String>> domainEntry : site.getAllCookies().entrySet()) {
       String domain = domainEntry.getKey();
       for (Map.Entry<String, String> cookieEntry : domainEntry.getValue().entrySet()) {
         BasicClientCookie cookie =
             new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
         cookie.setDomain(domain);
         cookieStore.addCookie(cookie);
       }
     }
   }
   httpClientBuilder.setDefaultCookieStore(cookieStore);
 }
예제 #3
0
  /**
   * Demo entry point: loads the Spring context, looks up the {@link ScreeningWeixinPipeline}
   * bean, and runs a single-threaded {@link OOSpider} over one seed URL.
   *
   * @param args command-line arguments (unused)
   */
  public static void main(String[] args) {
    ApplicationContext springContext =
        new ClassPathXmlApplicationContext("classpath:/applicationContext*.xml");
    ScreeningWeixinPipeline weixinPipeline = springContext.getBean(ScreeningWeixinPipeline.class);

    // Crawl settings and the seed URL are named here so the spider setup below reads linearly.
    Site crawlSite = Site.me().setSleepTime(1000).setCycleRetryTimes(30);
    String seedUrl =
        "http://m.wepiao.com/data/v5/cinemas/cities/10/sched_city_cinema_10_1003249.json?cityId=10&cinemaId=1003249";

    OOSpider.create(crawlSite, weixinPipeline, ScreeningWeixinModel.class)
        .addUrl(seedUrl)
        .thread(1)
        .run();
  }
/**
 * Example {@link PageProcessor} that pulls an entry's name and description out of Baidu Baike
 * pages, with a {@link #main(String[])} demo showing single and batch downloads.
 *
 * @author [email protected] <br>
 * @since 0.4.0
 */
public class BaiduBaikePageProcessor implements PageProcessor {

  // Site settings used by this processor. To send requests through a proxy, additionally chain
  // .setHttpProxy(new HttpHost("127.0.0.1", 8888)).
  private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setUseGzip(true);

  /**
   * Fetches one keyword on its own, then a batch of keywords, printing the extracted results.
   *
   * @param args command-line arguments (unused)
   */
  public static void main(String[] args) {
    Spider spider = Spider.create(new BaiduBaikePageProcessor()).thread(2);
    // The spider is closed in finally so it is released even if a download throws.
    try {
      String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8";

      // single download
      ResultItems single = spider.<ResultItems>get(String.format(urlTemplate, "水力发电"));
      System.out.println(single);

      // multidownload
      List<String> urls = new ArrayList<String>();
      urls.add(String.format(urlTemplate, "风力发电"));
      urls.add(String.format(urlTemplate, "太阳能"));
      urls.add(String.format(urlTemplate, "地热发电"));
      // NOTE(review): the same keyword is added a second time — confirm the duplicate is intended.
      urls.add(String.format(urlTemplate, "地热发电"));
      List<ResultItems> batch = spider.<ResultItems>getAll(urls);
      for (ResultItems items : batch) {
        System.out.println(items.getAll());
      }
    } finally {
      spider.close();
    }
  }

  /**
   * Stores two fields on the page: {@code name}, taken from the text of the title element, and
   * {@code description}, taken from the paragraphs of the first content section.
   *
   * @param page the downloaded page to extract from
   */
  @Override
  public void process(Page page) {
    page.putField("name", page.getHtml().css("h1.title div.lemmaTitleH1", "text").toString());
    page.putField(
        "description",
        page.getHtml().xpath("//div[@id='lemmaContent-0']//div[@class='para']/allText()"));
  }

  /**
   * Returns the site settings this processor was configured with.
   *
   * @return the site configuration
   */
  @Override
  public Site getSite() {
    return site;
  }
}