示例#1
0
  /**
   * Boots the Spring context from the {@code applicationContext*.xml} files on the classpath,
   * looks up the {@code ScreeningWeixinPipeline} bean and runs a single-threaded {@code OOSpider}
   * for {@code ScreeningWeixinModel} against one start URL.
   *
   * @param args command-line arguments; not used
   */
  public static void main(String[] args) {
    ApplicationContext context =
        new ClassPathXmlApplicationContext("classpath:/applicationContext*.xml");
    final ScreeningWeixinPipeline weixinPipeline = context.getBean(ScreeningWeixinPipeline.class);

    // Site settings used for the crawl: sleep time 1000, cycle retry times 30.
    Site crawlSite = Site.me().setSleepTime(1000).setCycleRetryTimes(30);

    // The only start URL handed to the spider.
    String startUrl =
        "http://m.wepiao.com/data/v5/cinemas/cities/10/sched_city_cinema_10_1003249.json?cityId=10&cinemaId=1003249";

    OOSpider.create(crawlSite, weixinPipeline, ScreeningWeixinModel.class)
        .addUrl(startUrl)
        .thread(1)
        .run();
  }
/**
 * {@link PageProcessor} that extracts the entry name and description from a Baidu Baike page.
 *
 * <p>The {@link #main(String[])} method demonstrates fetching a single URL with {@code
 * Spider.get} and a batch of URLs with {@code Spider.getAll}.
 *
 * @author [email protected] <br>
 * @since 0.4.0
 */
public class BaiduBaikePageProcessor implements PageProcessor {

  /** Site settings returned by {@link #getSite()}: 3 retries, sleep time 1000, gzip enabled. */
  private final Site site =
      Site.me() // .setHttpProxy(new HttpHost("127.0.0.1",8888))
          .setRetryTimes(3)
          .setSleepTime(1000)
          .setUseGzip(true);

  /**
   * Downloads one search result page and prints it, then downloads a list of search result pages
   * and prints the extracted fields of each.
   *
   * @param args command-line arguments; not used
   */
  public static void main(String[] args) {
    Spider spider = Spider.create(new BaiduBaikePageProcessor()).thread(2);
    String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8";
    try {
      // single download
      ResultItems single = spider.<ResultItems>get(String.format(urlTemplate, "水力发电"));
      System.out.println(single);

      // multidownload
      List<String> urls = new ArrayList<String>();
      urls.add(String.format(urlTemplate, "风力发电"));
      urls.add(String.format(urlTemplate, "太阳能"));
      urls.add(String.format(urlTemplate, "地热发电"));
      urls.add(String.format(urlTemplate, "地热发电"));
      List<ResultItems> results = spider.<ResultItems>getAll(urls);
      for (ResultItems result : results) {
        System.out.println(result.getAll());
      }
    } finally {
      // Close the spider even when a download or print fails, so it is always released.
      spider.close();
    }
  }

  /**
   * Stores two fields on the page: {@code name}, the text selected by the CSS query {@code
   * h1.title div.lemmaTitleH1}, and {@code description}, the result of the XPath query over the
   * {@code para} blocks inside {@code lemmaContent-0}.
   *
   * @param page the downloaded page to extract from
   */
  @Override
  public void process(Page page) {
    page.putField("name", page.getHtml().css("h1.title div.lemmaTitleH1", "text").toString());
    page.putField(
        "description",
        page.getHtml().xpath("//div[@id='lemmaContent-0']//div[@class='para']/allText()"));
  }

  /**
   * Returns the site settings configured for this processor.
   *
   * @return the {@link Site} instance held by this processor
   */
  @Override
  public Site getSite() {
    return site;
  }
}