/**
 * Builds the crawl settings for this processor: domain, start URL and a desktop-browser user
 * agent.
 *
 * <p>NOTE(review): the configured domain and the start URL name different hosts — confirm this is
 * intended.
 *
 * @return a freshly created {@link Site} on every call
 */
@Override
 public Site getSite() {
   final String userAgent =
       "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31";
   return Site.me()
       .setDomain("www.diandian.com")
       .addStartUrl("http://17dujingdian.com/")
       .setUserAgent(userAgent);
 }
示例#2
0
/**
 * Page processor for Zhihu: follows question links found on search and question pages and keeps
 * answers whose vote count is at least {@code voteNum}.
 *
 * @author [email protected] <br>
 * @since 0.5.1
 */
public class ZhihuPageProcessor implements PageProcessor {

  private Site site =
      Site.me()
          .setCycleRetryTimes(5)
          .setRetryTimes(5)
          .setSleepTime(500)
          .setTimeOut(3 * 60 * 1000)
          .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0")
          .addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
          .addHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3")
          .setCharset("UTF-8");

  /** Minimum number of votes an answer needs to be exported. */
  private static final int voteNum = 1000;

  /**
   * Queues the question links found on the page and exports the answer fields ("vote", "content",
   * "userid") for answers that reach the vote threshold. The page is skipped when no answer
   * qualifies.
   *
   * <p>Every qualifying answer writes the same three field names, so when several answers qualify
   * the values of the last one win.
   *
   * @param page the downloaded page
   */
  @Override
  public void process(Page page) {
    // Links taken from list items (class 'item clearfix').
    List<String> relativeUrl =
        page.getHtml().xpath("//li[@class='item clearfix']/div/a/@href").all();
    page.addTargetRequests(relativeUrl);

    // Links taken from the "related questions" box of a question page.
    relativeUrl =
        page.getHtml()
            .xpath("//div[@id='zh-question-related-questions']//a[@class='question_link']/@href")
            .all();
    page.addTargetRequests(relativeUrl);

    List<String> answers = page.getHtml().xpath("//div[@id='zh-question-answer-wrap']/div").all();
    boolean exist = false;
    for (String answer : answers) {
      // Parse the answer fragment once and reuse it for all three selections.
      Html answerHtml = new Html(answer);
      String vote =
          answerHtml.xpath("//div[@class='zm-votebar']//span[@class='count']/text()").toString();
      if (parseVoteCount(vote) >= voteNum) {
        page.putField("vote", vote);
        page.putField("content", answerHtml.xpath("//div[@class='zm-editable-content']"));
        page.putField("userid", answerHtml.xpath("//a[@class='author-link']/@href"));
        exist = true;
      }
    }
    if (!exist) {
      page.setSkip(true);
    }
  }

  /**
   * Converts the vote text of an answer to a number.
   *
   * @param vote raw text of the vote counter, may be {@code null}
   * @return the parsed count, or {@code -1} when the text is missing or not a plain integer, so
   *     that such answers never reach the threshold instead of aborting the whole page
   */
  private static int parseVoteCount(String vote) {
    if (vote == null) {
      return -1;
    }
    try {
      return Integer.parseInt(vote.trim());
    } catch (NumberFormatException e) {
      return -1;
    }
  }

  @Override
  public Site getSite() {
    return site;
  }

  /**
   * Starts a five-thread crawl from a Zhihu question search for "java" and writes the results
   * through a {@code FilePipeline} rooted at {@code D:\webmagic\}.
   *
   * @param args unused
   */
  public static void main(String[] args) {
    Spider.create(new ZhihuPageProcessor())
        .addUrl("http://www.zhihu.com/search?type=question&q=java")
        .addPipeline(new FilePipeline("D:\\webmagic\\"))
        .thread(5)
        .run();
  }
}
/**
 * Page processor for a Sina blog: walks the article-list pages and exports title, date, body text
 * and tag of every post page.
 *
 * @author chenruoyu
 */
public class BlogPageProcessor implements PageProcessor {

  // 1926267847
  /** Regex for article-list pages of the crawled blog. */
  public static final String URL_LIST =
      "http://blog\\.sina\\.com\\.cn/s/articlelist_1197161814_0_\\d+\\.html";

  /** Regex for single post pages. */
  public static final String URL_POST = "http://blog\\.sina\\.com\\.cn/s/blog_\\w+\\.html";

  private Site site = Site.me().setDomain("blog.sina.com.cn").setSleepTime(3000);

  /**
   * On a list page, queues the post links and further list pages; on any other page, exports the
   * post fields "title", "date", "passage" and "tag".
   *
   * @param page the downloaded page
   */
  @Override
  public void process(Page page) {
    if (page.getUrl().regex(URL_LIST).match()) {
      // List page: follow posts inside the article list, plus any other list page.
      page.addTargetRequests(
          page.getHtml().xpath("//div[@class=\"articleList\"]").links().regex(URL_POST).all());
      page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all());
    } else {
      // Post page. Selections that match nothing yield null, so trim null-safely
      // instead of failing the whole page with a NullPointerException.
      page.putField(
          "title",
          trimOrNull(
              page.getHtml().xpath("//div[@class='articalTitle']/h2/tidyText()").toString()));
      page.putField(
          "date",
          page.getHtml()
              .xpath("//div[@id='articlebody']//span[@class='time SG_txtc']")
              .regex("\\((.*)\\)"));
      // Full text of the post body.
      String passage =
          page.getHtml().xpath("//div[@id='sina_keyword_ad_area2']/allText()").toString();
      page.putField("passage", passage);
      // Post tag: //*[@id='sina_keyword_ad_area']/table/tbody/tr/td[1]/h3/a
      String tag =
          trimOrNull(
              page.getHtml()
                  .xpath("//div[@id='sina_keyword_ad_area']/table/tbody/tr/td[1]/h3/a/text()")
                  .toString());
      page.putField("tag", tag);
      // Blank separator written to stdout after each post page.
      System.out.println("\n");
    }
  }

  /**
   * Trims a possibly missing string.
   *
   * @param text the text to trim, may be {@code null}
   * @return the trimmed text, or {@code null} when {@code text} is {@code null}
   */
  private static String trimOrNull(String text) {
    return text == null ? null : text.trim();
  }

  @Override
  public Site getSite() {
    return site;
  }

  // 1926267847
  /**
   * Starts a crawl from the first article-list page.
   *
   * @param args unused
   */
  public static void main(String[] args) {
    Spider.create(new BlogPageProcessor())
        .addUrl("http://blog.sina.com.cn/s/articlelist_1197161814_0_1.html")
        .run();
  }
}
/**
 * Page processor for zhaopin.com: exports job postings from job-detail pages and keeps following
 * links on the zhaopin.com domain. Processed URLs are remembered in a static map shared by all
 * instances.
 */
@Component("SpiderZhaopinProcessor")
public class SpiderZhaopinProcessor implements PageProcessor {

  public Site site = Site.me().setRetryTimes(3).setSleepTime(3000);

  private static final Logger logger = LoggerFactory.getLogger(SpiderZhaopinProcessor.class);

  /**
   * Job-detail URL pattern, compiled once instead of on every page.
   *
   * <p>NOTE(review): the dots in the host and before "htm" are unescaped and therefore match any
   * character; kept as in the original expression.
   */
  private static final Pattern JOB_DETAIL_URL =
      Pattern.compile("http://jobs.zhaopin.com/(\\d)+.htm\\?([\\w=&]+)");

  /** URLs already processed, mapped to their running sequence number; guarded by its monitor. */
  private static HashMap<String, Integer> doneLinks = new HashMap<String, Integer>();

  private static Integer doneNum = 0;

  public SpiderZhaopinProcessor() {

    site.setDomain("zhaopin.com");
    site.setUserAgent(
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36");
  }

  @Override
  public Site getSite() {
    return site;
  }

  /**
   * Skips pages whose URL was already processed; otherwise exports the job fields when the URL is
   * a job-detail page (and skips the page when it is not), queues every zhaopin.com link found on
   * the page and records the URL as done.
   *
   * @param page the downloaded page
   */
  @Override
  public void process(Page page) {
    // Kept in a local rather than an instance field: the synchronized blocks below show this
    // method is expected to run on several threads, and a shared field could be overwritten by
    // another page halfway through.
    String pageUrl = page.getRequest().getUrl();

    Html pageHtml = page.getHtml();
    Selectable pageRefLinks = pageHtml.links();

    // 1. Has this page been handled before?
    synchronized (doneLinks) {
      if (doneLinks.containsKey(pageUrl)) {
        page.setSkip(true);
        return;
      }
    }

    if (JOB_DETAIL_URL.matcher(pageUrl).find()) {
      System.out.println("找到一个:" + pageUrl);
      extractJob(page, pageHtml, pageUrl);
    } else {
      page.setSkip(true);
    }

    // Pagination and list pages.
    // NOTE(review): inside the character classes "/-_" and ".-?" are ranges, not three literal
    // characters — confirm that is intended.
    page.addTargetRequests(
        pageRefLinks.regex("http://[\\w,\\/-_]+.zhaopin.com/[\\w,\\/.-?&_]+").all());

    synchronized (doneLinks) {
      doneLinks.put(pageUrl, doneNum++);
      SpiderRecord.addKeyNum("Zhaopin_all", doneNum);
    }
  }

  /**
   * Reads title, company, property list and description blocks from a job-detail page and stores
   * them as result fields. Parts that are missing from the page are exported as empty strings
   * instead of raising an index error.
   *
   * @param page the page receiving the result fields
   * @param pageHtml parsed HTML of {@code page}
   * @param pageUrl URL of {@code page}
   */
  private void extractJob(Page page, Html pageHtml, String pageUrl) {
    // The document title is split on "-": first part is the job title, second the company.
    String[] titleParts = pageHtml.getDocument().title().split("-");
    String title = titleParts.length > 0 ? titleParts[0].replaceAll("招聘", "") : "";
    String companyName = titleParts.length > 1 ? titleParts[1] : "";

    // Properties: each list item is expected to read "key:value".
    HashMap<String, String> propsMap = new HashMap<String, String>();
    List<String> keyList = pageHtml.xpath("//ul[@class='terminal-ul']/li").all();
    for (String item : keyList) {
      if (item == null) {
        logger.warn("why keylist is null!");
        continue;
      }

      String[] parts = StringUtil.html2text(item).split(":");
      if (parts.length != 2) {
        continue;
      }

      String keyStr = parts[0];
      String valueStr = parts[1];
      logger.debug(keyStr + ":" + valueStr);
      if (keyStr.trim().length() > 0 && valueStr.trim().length() > 0) {
        propsMap.put(keyStr, valueStr);
      }
    }

    String salary = propsMap.get("职位月薪");
    if (salary == null) {
      salary = "";
    }

    // First block is used as the job description, second as the company description.
    List<String> textList =
        pageHtml.xpath("//div[@class='tab-cont-box']/div[@class='tab-inner-cont']").all();
    String jobDesc = textList.size() > 0 ? textList.get(0) : "";
    String companyDesc = textList.size() > 1 ? textList.get(1) : "";

    // save
    page.putField("jobTitle", title.trim());
    page.putField("company", companyName.trim());
    page.putField("salary", salary);

    page.putField("keyword", "");
    page.putField("descr", jobDesc.trim());
    page.putField("companyDesc", companyDesc);

    page.putField("props", propsMap);

    page.putField("url", pageUrl);
    page.putField("source", "zhaopin");
  }

  /** Converts an HTML fragment to text, removes every ":" and trims the result. */
  private String clearHtml(String tag) {

    return StringUtil.html2text(tag).replaceAll(":", "").trim();
  }
}
示例#5
0
/**
 * Software box http://www.itopdog.cn/ Itopdog #81.
 *
 * <ol>
 *   <li>The paging links of the site's keyword search are broken: following the links given on
 *       the page does not lead to the right results, but a hand-built paging URL does. Searches
 *       with several keywords never produced more than two result pages, so the URL of the second
 *       page is built manually.
 *   <li>The download counters of the site are updated in real time.
 *   <li>Some apps of the site can no longer be downloaded.
 * </ol>
 *
 * @author DMT
 */
public class Itopdog implements PageProcessor {
  Site site = Site.me().setCharset("utf-8").setRetryTimes(0).setSleepTime(3);

  /**
   * On a search page, queues the app detail links and, when paging links are present, the
   * manually built URL of the second result page. On an app detail page, returns the extracted
   * app.
   *
   * @param page the downloaded page
   * @return the value of {@code Itopdog_Detail.getApkDetail(page)} for detail pages, otherwise
   *     {@code null}
   */
  @Override
  public Apk process(Page page) {
    // index page
    //	http://www.itopdog.cn/home.php?type=az&ct=home&ac=search&q=%E6%B5%8F%E8%A7%88%E5%99%A8
    if (page.getUrl().regex("http://www\\.itopdog\\.cn/home\\.php\\?*").match()) {
      // Links to the app detail pages.
      List<String> url1 =
          page.getHtml()
              .links("//div[@class='panel']")
              .regex("http://www\\.itopdog\\.cn/az.*")
              .all();

      // Paging links; only their presence is used.
      List<String> url2 =
          page.getHtml()
              .links("//div[@class='clearfix pagewrap']")
              .regex("http://www\\.itopdog\\.cn/.*")
              .all();

      // Build the URL of page two, but only from a URL that is not paged yet. Without this
      // guard page two queues "...&per_page=20&per_page=20", which is a new URL again, and the
      // crawl never ends.
      String currentUrl = String.valueOf(page.getUrl());
      if (!url2.isEmpty() && !currentUrl.contains("per_page=")) {
        url1.add(currentUrl + "&per_page=20");
      }

      // Drop duplicate URLs, then queue them.
      HashSet<String> urlSet = new HashSet<String>(url1);
      for (String target : urlSet) {
        page.addTargetRequest(target);
      }
    }

    // The app detail page; field extraction is delegated to Itopdog_Detail.
    if (page.getUrl().regex("http://www\\.itopdog\\.cn/az.*").match()) {
      return Itopdog_Detail.getApkDetail(page);
    }

    return null;
  }

  @Override
  public Site getSite() {
    return site;
  }

  /** Not implemented; always returns {@code null}. */
  @Override
  public List<Apk> processMulti(Page page) {
    // TODO Auto-generated method stub
    return null;
  }
}
示例#6
0
/**
 * Crawls indeed.com in two stages: result lists per tag, then the job-detail pages recorded for
 * later fetching. Tags and target URLs are read from {@code IndeedDao}.
 *
 * <p>Created by yiang on 2015/4/9.
 */
@Component
public class IndeedCrawler {
  /** First value of the {@code start} query parameter. */
  private static final int BEGIN_ID = 0;

  /** Highest {@code start} offset requested for one tag. */
  private static final int MAX_START_OFFSET = 1000;

  private int threadNum = 10;

  @Qualifier("IndeedListPipeline")
  @Autowired
  private PageModelPipeline indeedListPipeline;

  @Qualifier("IndeedDetailPipeline")
  @Autowired
  private PageModelPipeline indeedDetailPipeline;

  @Resource private IndeedDao indeedDao;

  private static Site site =
      Site.me()
          .setTimeOut(20000)
          .setSleepTime(5000)
          .setUserAgent(
              "Mozilla/5.0 (compatible; Baiduspider/2.0; "
                  + "+http://www.baidu.com/search/spider.html)");

  /**
   * Crawls the result lists: takes tags from the DAO until none is left and, for each tag,
   * requests the list pages in steps of ten up to the smaller of the tag's count and
   * {@code MAX_START_OFFSET}, then stamps the tag.
   */
  public void crawlList() {

    while (true) {
      // Fetch the next tag; stop when there is none.
      String tag = indeedDao.getTags();
      if (tag == null || tag.equals("")) {
        break;
      }

      // A local is enough here: the limit is recomputed for every tag and is not needed by any
      // other method, so no shared static state has to be mutated.
      int lastOffset = Math.min(indeedDao.getTagNum(tag), MAX_START_OFFSET);

      List<String> urls = new ArrayList<String>();
      for (int id = BEGIN_ID; id <= lastOffset; id += 10) {
        urls.add("http://www.indeed.com/jobs?q=" + tag + "&sort=date&start=" + id);
      }
      OOSpider.create(site, indeedListPipeline, IndeedList.class)
          .startUrls(urls)
          .setSpawnUrl(false)
          .thread(threadNum)
          .run();

      // Mark the tag with a timestamp.
      indeedDao.setTimestamp(tag);
    }
  }

  /**
   * Crawls the detail pages: repeatedly takes a batch of target URLs from the DAO and fetches it,
   * until the DAO returns no batch. An empty batch ends the loop as well; otherwise the method
   * would spin forever on a DAO that reports "nothing left" with an empty list.
   *
   * <p>The misspelled method name is kept for existing callers.
   */
  public void crawlDeatil() {
    while (true) {

      List<String> l = indeedDao.getTargetUrls();
      if (l == null || l.isEmpty()) {
        break;
      }
      OOSpider.create(site, indeedDetailPipeline, IndeedDetail.class)
          .startUrls(l)
          .thread(threadNum)
          .setSpawnUrl(false)
          .run();
    }
  }

  /**
   * Loads the Spring context from {@code classpath:/spring/applicationContext*.xml} and runs the
   * list crawl.
   *
   * @param args unused
   */
  public static void main(String[] args) {
    // Load the configuration.
    ApplicationContext applicationContext =
        new ClassPathXmlApplicationContext("classpath:/spring/applicationContext*.xml");
    final IndeedCrawler indeedCrawler = applicationContext.getBean(IndeedCrawler.class);

    indeedCrawler.crawlList();
    // indeedCrawler.crawlDeatil();

  }
}
示例#7
0
/**
 * 520apk (Android paradise, China) app search crawler.
 * url:http://search.520apk.com/cse/search?q=QQ&s=17910776473296434043&nsid=1
 *
 * @version 1.0.0
 */
public class PagePro520apk implements PageProcessor {

  // Logger of this class.
  private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(PagePro520apk.class);

  // Site encoding, retry count and sleep interval.
  Site site =
      Site.me()
          .setCharset("utf-8")
          .setRetryTimes(PropertiesUtil.getRetryTimes())
          .setSleepTime(PropertiesUtil.getInterval());

  /**
   * Queues result and pager links of a search page and extracts the app of a detail page.
   *
   * @param page the downloaded page
   * @return the value of {@code PagePro520apk_Detail.getApkDetail(page)} when the URL matches the
   *     detail pattern, otherwise {@code null}
   */
  @Override
  public Apk process(Page page) {
    LOGGER.debug("crawler url: {}", page.getUrl());

    // Search result page.
    boolean isSearchPage =
        page.getUrl().regex("http://search\\.520apk\\.com/cse/search\\?q=.*").match();
    if (isSearchPage) {
      LOGGER.debug("match success, url:{}", page.getUrl());

      // Detail links first, pager links appended to the same list.
      List<String> links =
          page.getHtml().links("//div[@id='results']/div[@class='result f s0']/h3").all();
      List<String> pagerLinks = page.getHtml().links("//div[@class='pager clearfix']").all();
      links.addAll(pagerLinks);

      // Queue each distinct link once.
      for (String link : Sets.newHashSet(links)) {
        page.addTargetRequest(link);
      }

      LOGGER.debug("app info results urls: {}", page.getTargetRequests());
    }

    // App detail page.
    boolean isDetailPage = page.getUrl().regex("http://www.520apk.com/android/*").match();
    return isDetailPage ? PagePro520apk_Detail.getApkDetail(page) : null;
  }

  /**
   * Returns the crawl settings.
   *
   * @return site
   * @see us.codecraft.webmagic.Site
   */
  @Override
  public Site getSite() {
    return site;
  }

  /** Not implemented; always returns {@code null}. */
  @Override
  public List<Apk> processMulti(Page page) {
    // TODO Auto-generated method stub
    return null;
  }
}
示例#8
0
/**
 * GameDog (China) app search crawler.
 * url:http://search.gamedog.cn/app/?keyword=QQ&platform=Android id:12
 *
 * @version 1.0.0
 */
public class PageProGameDog implements PageProcessor {

  // Logger of this class.
  private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(PageProGameDog.class);

  // Site encoding, retry count and sleep interval.
  Site site =
      Site.me()
          .setCharset("utf-8")
          .setRetryTimes(PropertiesUtil.getRetryTimes())
          .setSleepTime(PropertiesUtil.getInterval());

  /**
   * Queues the acceptable android.gamedog.cn links of the page and, for URLs under
   * {@code /soft}, {@code /game} or {@code /online}, stores the extracted app in the "apk" field.
   * The page is skipped when it is not such a URL or no app was extracted.
   *
   * @param page the downloaded page
   * @return always {@code null}; the app is published through the "apk" field
   */
  @Override
  public Apk process(Page page) {
    LOGGER.debug("crawler url: {}", page.getUrl());

    // Any page of the Android sub-site.
    if (page.getUrl().regex("http://android\\.gamedog\\.cn/.*").match()) {
      LOGGER.debug("match success, url:{}", page.getUrl());

      // Collect detail and paging links, de-duplicated.
      List<String> urlList = page.getHtml().links().regex("http://android\\.gamedog\\.cn/.*").all();
      Set<String> cacheSet = Sets.newHashSet();
      cacheSet.addAll(urlList);

      for (String temp : cacheSet) {
        if (PageProUrlFilter.isUrlReasonable(temp)) {
          page.addTargetRequest(temp);
        }
      }

      LOGGER.debug("app info results urls: {}", page.getTargetRequests());
    }

    // Detail pages. The alternatives must be grouped: without the parentheses the expression
    // reads "<prefix>/soft" OR "game" OR "online.*", and the bare "game" alternative matches
    // every gamedog URL.
    if (page.getUrl().regex("http://android\\.gamedog\\.cn/(soft|game|online).*").match()) {
      Apk apk = PageProGameDog_Detail.getApkDetail(page);

      page.putField("apk", apk);
      if (page.getResultItems().get("apk") == null) {
        page.setSkip(true);
      }
    } else {
      page.setSkip(true);
    }

    return null;
  }

  /**
   * Returns the crawl settings.
   *
   * @return site
   * @see Site
   */
  @Override
  public Site getSite() {
    return site;
  }

  /** Not implemented; always returns {@code null}. */
  @Override
  public List<Apk> processMulti(Page page) {
    // TODO Auto-generated method stub
    return null;
  }
}
示例#9
0
/**
 * Crawler for shuiguo.com. url:http://s.shuiguo.com/qq_1_1.html id:38
 *
 * @version 1.0.0
 */
public class PageProShuiGuo implements PageProcessor {

  // Logger of this class.
  private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(PageProShuiGuo.class);

  // Site encoding, retry count and sleep interval.
  Site site =
      Site.me()
          .setCharset("utf-8")
          .setRetryTimes(PropertiesUtil.getRetryTimes())
          .setSleepTime(PropertiesUtil.getInterval());

  /**
   * Queues the Android links of list pages and stores the extracted app of detail pages in the
   * "apk" field. Android URLs containing ".html" (and the ranking page) are handled as list
   * pages; Android URLs not ending in ".html" are handled as detail pages. The page is skipped
   * when it is not a detail page, when no app was extracted, or when extraction failed.
   *
   * @param page the downloaded page
   * @return always {@code null}; the app is published through the "apk" field
   */
  @Override
  public Apk process(Page page) {
    LOGGER.debug("crawler url: {}", page.getUrl());

    // List pages. Dots are escaped so that the host is matched literally, as in the other
    // expressions of this class.
    if ((page.getUrl().regex("http://www\\.shuiguo\\.com/android/.*").match()
            && page.getUrl().get().contains(".html"))
        || page.getUrl().regex("http://a\\.shuiguo\\.com/phb/").match()) {
      LOGGER.debug("match success, url:{}", page.getUrl());

      // Detail and paging links, each distinct link queued once.
      List<String> urlList =
          page.getHtml().links().regex("http://www\\.shuiguo\\.com/android/.*").all();
      for (String url : Sets.newHashSet(urlList)) {
        page.addTargetRequest(url);
      }

      LOGGER.debug("app info results urls: {}", page.getTargetRequests());
    }

    // Detail pages.
    if (page.getUrl().regex("http://www\\.shuiguo\\.com/android/.*").match()
        && !page.getUrl().get().endsWith(".html")) {
      try {
        Apk apk = PageProShuiGuo_Detail.getApkDetail(page);
        page.putField("apk", apk);
        if (page.getResultItems().get("apk") == null) {
          page.setSkip(true);
        }
      } catch (Exception e) {
        // Report through the class logger and skip the page, so a failed extraction is handled
        // like an extraction that found nothing.
        LOGGER.error("failed to extract apk detail, url: " + page.getUrl(), e);
        page.setSkip(true);
      }

    } else {
      page.setSkip(true);
    }
    return null;
  }

  /**
   * Returns the crawl settings.
   *
   * @return site
   * @see Site
   */
  @Override
  public Site getSite() {
    return site;
  }

  /** Not implemented; always returns {@code null}. */
  @Override
  public List<Apk> processMulti(Page page) {
    // TODO Auto-generated method stub
    return null;
  }
}
/**
 * Onlinedown / newhua software park (China) app search crawler.
 * url:http://search.newhua.com/search_list.php?searchname=MT&searchsid=6&app=search&controller=index&action=search&type=news
 * ID:23. Needs two requests per app.
 *
 * @version 1.0.0
 */
public class PageProOnlineDown implements PageProcessor {

  // Logger of this class.
  private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(PageProOnlineDown.class);

  // Site encoding, retry count and sleep interval.
  Site site =
      Site.me()
          .setCharset("utf-8")
          .setRetryTimes(PropertiesUtil.getRetryTimes())
          .setSleepTime(PropertiesUtil.getInterval());

  /** Result set handed to the detail extractor on both the /soft/ and the /softdown/ request. */
  private Set<Apk> resSet = Sets.newHashSet();

  /**
   * Queues the acceptable onlinedown.net links of the page, passes /soft/ pages to the detail
   * extractor, and stores the app extracted from /softdown/ pages in the "apk" field. Every page
   * that is not a /softdown/ page, or that yields no app, is skipped.
   *
   * @param page the downloaded page
   * @return always {@code null}; the app is published through the "apk" field
   */
  @Override
  public Apk process(Page page) {
    LOGGER.debug("crawler url: {}", page.getUrl());

    // Any page of the site: collect further links.
    if (page.getUrl().regex("http://www\\.onlinedown\\.net/.*").match()) {
      LOGGER.debug("match success, url:{}", page.getUrl());

      List<String> found = page.getHtml().links().regex("http://www\\.onlinedown\\.net/.*").all();

      for (String candidate : Sets.newHashSet(found)) {
        if (!PageProUrlFilter.isUrlReasonable(candidate)) {
          continue;
        }
        String target = candidate;
        if (target.contains("http://www.onlinedown.net/android/soft")) {
          // NOTE(review): a URL containing ".net/android/soft" has no "//android" in it, so this
          // replacement looks like a no-op — confirm what was intended.
          target = target.replaceAll("//android", "");
          // NOTE(review): looks like leftover debug output.
          System.out.println("aaa");
        }
        page.addTargetRequest(target);
      }

      LOGGER.debug("app info results urls: {}", page.getTargetRequests());
    }

    // First request: the software page; the return value is not used here.
    if (page.getUrl().regex("http://www\\.onlinedown\\.net/soft/.*").match()) {
      PageProOnlineDown_Detail.getApkDetail(page, resSet);
    }

    // Second request: the download page, which updates the download address.
    boolean isDownloadPage = page.getUrl().regex("http://www\\.onlinedown\\.net/softdown/.*").match();
    if (!isDownloadPage) {
      page.setSkip(true);
      return null;
    }

    Apk apk = PageProOnlineDown_Detail.getApkDetail(page, resSet);
    page.putField("apk", apk);
    if (page.getResultItems().get("apk") == null) {
      page.setSkip(true);
    }
    return null;
  }

  /**
   * Returns the crawl settings.
   *
   * @return site
   * @see Site
   */
  @Override
  public Site getSite() {
    return site;
  }

  /** Not implemented; always returns {@code null}. */
  @Override
  public List<Apk> processMulti(Page page) {
    // TODO Auto-generated method stub
    return null;
  }
}
/*
 * author : 罗一鑫
 * date : 2015/10/19
 * Extracts the qitanid from the HTML of a play page.
 *
 * */
public class CommentCrawler implements PageProcessor {
  private static String part1OfUrl =
      "http://api.t.iqiyi.com/qx_api/comment/get_video_comments?aid=";
  private static String part2OfUrl =
      "&categoryid=1&cb=fnsucc&escape=true&need_reply=true&need_subject=true&need_total=1&page=";
  private static String part3OfUrl =
      "&page_size=10&page_size_reply=3&qitan_comment_type=1&qitancallback=fnsucc&qitanid=";
  private static String part4OfUrl = "&sort=hot&t=&tvid=";
  /*
   * XPath expressions
   * */
  private static String TOTAL =
      "//div/div[@class='wrapper']/div[@class='wrapper-left']/"
          + "div[@id='block-I']/div[@id='qitancommonarea']/@";
  private static String QITANID_XPATH = TOTAL + "data-qitancomment-qitanid";
  private static String TVID_XPATH = TOTAL + "data-qitancomment-tvid";
  private static String TITLE_XPATH = "//head/title";

  /*
   * Site settings
   * */
  private Site site =
      Site.me()
          .setDomain("http://www.iqiyi.com/")
          .setSleepTime(3000)
          .setUserAgent(
              "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) "
                  + "AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");

  @Override
  public Site getSite() {
    return this.site;
  }

  /**
   * On the first call, queues every line of {@code ./playUrls.txt} as a target request. On every
   * call, stores "qitanid,tvid,title" of the page in the "qitanid" field.
   *
   * @param page the downloaded page
   */
  @Override
  public void process(Page page) {
    if (!GlobalVar.isFilePlayUrlsReaded) {
      GlobalVar.isFilePlayUrlsReaded = true;
      enqueuePlayUrls(page);
    }
    System.out.println("test");
    page.putField(
        "qitanid",
        page.getHtml().xpath(QITANID_XPATH).toString()
            + ","
            + page.getHtml().xpath(TVID_XPATH).toString()
            + ","
            + extractTitle(page.getHtml().xpath(TITLE_XPATH).toString()));
  }

  /**
   * Adds each line of {@code ./playUrls.txt} to the page's target requests. Failures are printed
   * and otherwise ignored, as before; the reader is closed only when it was actually opened, so a
   * missing file no longer ends in a NullPointerException thrown from the cleanup code.
   *
   * @param page the page receiving the target requests
   */
  private void enqueuePlayUrls(Page page) {
    BufferedReader br = null;
    try {
      br = new BufferedReader(new InputStreamReader(new FileInputStream("./playUrls.txt")));
      String str;
      while ((str = br.readLine()) != null) {
        page.addTargetRequest(str);
      }
    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      if (br != null) {
        try {
          // Closing the outer reader also closes the wrapped streams.
          br.close();
        } catch (IOException e) {
          e.printStackTrace();
        }
      }
    }
  }

  /**
   * Cuts the leading part of the title out of the selected title element: the text after the
   * first ">" up to the first "-".
   *
   * @param titleHtml the selected title element as a string, may be {@code null}
   * @return the extracted part, or an empty string when the element is missing or has no text
   *     after its opening tag
   */
  private static String extractTitle(String titleHtml) {
    if (titleHtml == null) {
      return "";
    }
    String[] afterTag = titleHtml.split(">");
    if (afterTag.length < 2) {
      return "";
    }
    String[] beforeDash = afterTag[1].split("-");
    return beforeDash.length > 0 ? beforeDash[0] : "";
  }
}
示例#12
0
/**
 * Page processor for the 360 app store (zhushou.360.cn): follows the store's links, builds the
 * paging URLs of the two category index pages and stores the app of detail pages in the "apk"
 * field.
 */
public class PagePro360 implements PageProcessor {
  /** Anchor suffixes of links that are not queued. */
  private static final String[] ANCHOR_SUFFIXES = {
    "#expand",
    "#next",
    "#prev",
    "#comment",
    "#nogo",
    "#guess-like",
    "#btn-install-now-log",
    "#comment-list",
    "#report"
  };

  /** Number of characters in front of the value in the matched page-count snippet. */
  private static final int PAGE_COUNT_VALUE_OFFSET = 15;

  Site site =
      Site.me()
          .setCharset("utf-8")
          .setRetryTimes(PropertiesUtil.getRetryTimes())
          .setSleepTime(PropertiesUtil.getInterval());

  private Logger LOGGER = LoggerFactory.getLogger(PagePro360.class);

  /**
   * Queues paging URLs (category index pages only) and every store link that does not end in a
   * known anchor, then extracts the app when the page is a detail page. The page is skipped when
   * it is not a detail page or no app was extracted.
   *
   * @param page the downloaded page
   * @return always {@code null}; the app is published through the "apk" field
   */
  @Override
  public Apk process(Page page) {
    List<String> urls = page.getHtml().links().regex("(http://zhushou\\.360\\.cn/.*)").all();

    Set<String> cacheSet = Sets.newHashSet();
    cacheSet.addAll(urls);

    // Build the paging URLs, e.g. http://zhushou.360.cn/list/index/cid/1?page=2
    String requestUrl = page.getRequest().getUrl();
    if (requestUrl.equals("http://zhushou.360.cn/list/index/cid/1")
        || requestUrl.equals("http://zhushou.360.cn/list/index/cid/2")) {
      int pageCount =
          parsePageCount(page.getHtml().regex("(pg\\.pageCount\\s=\\s\\w+)").toString());
      List<String> url1 = new ArrayList<String>();
      for (int i = 2; i <= pageCount; i++) {
        url1.add(requestUrl + "?page=" + i);
      }

      page.addTargetRequests(url1);
    }

    // Leave out in-page anchors:
    // #expand,#next,#prev,#comment,#nogo,#guess-like,#btn-install-now-log,#comment-list,#report
    for (String url : cacheSet) {
      if (endsWithAnchor(url)) {
        LOGGER.error("anchor:" + url);
      } else {
        LOGGER.info(url);
        page.addTargetRequest(url);
      }
    }

    // Extract the app information.
    if (page.getUrl().regex("(http://zhushou\\.360\\.cn/detail/index/soft_id/.*)").match()) {
      Apk apk = PagePro360_Detail.getApkDetail(page);

      page.putField("apk", apk);
      if (page.getResultItems().get("apk") == null) {
        page.setSkip(true);
      }
    } else {
      page.setSkip(true);
    }
    return null;
  }

  /**
   * Reads the page count out of the matched "pg.pageCount = N" snippet.
   *
   * @param snippet the matched text, or {@code null} when the page contains no such snippet
   * @return the page count, or {@code 0} when the snippet is missing or its value is not a
   *     number, in which case no paging URL is built instead of failing the whole page
   */
  private int parsePageCount(String snippet) {
    if (snippet == null || snippet.length() <= PAGE_COUNT_VALUE_OFFSET) {
      return 0;
    }
    try {
      return Integer.parseInt(snippet.substring(PAGE_COUNT_VALUE_OFFSET));
    } catch (NumberFormatException e) {
      LOGGER.warn("unexpected page count snippet: " + snippet);
      return 0;
    }
  }

  /** Tells whether {@code url} ends with one of the anchor suffixes that are not queued. */
  private static boolean endsWithAnchor(String url) {
    for (String suffix : ANCHOR_SUFFIXES) {
      if (url.endsWith(suffix)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Manual check of {@code String.endsWith}: prints "true" when the sample URL ends with the
   * literal text "#expand || #next", which it does not.
   *
   * @param args unused
   */
  public static void main(String[] args) {
    String url = "http://zhushou.360.cn/list/index/cid/1?page=24#expand";
    if (url.endsWith("#expand || #next")) {
      System.out.println("true");
    }
  }

  /** Not implemented; always returns {@code null}. */
  @Override
  public List<Apk> processMulti(Page page) {
    // TODO Auto-generated method stub
    return null;
  }

  @Override
  public Site getSite() {
    return site;
  }
}
示例#13
0
/**
 * Apple news app search crawler.
 * url:http://www.baidu.com/s?ie=UTF-8&wd=%E8%B6%85%E8%83%BD%E9%99%86%E6%88%98%20site:shouyou.178.com
 *
 * @version 1.0.0
 */
public class PageProIfan178 implements PageProcessor {

  // Logger of this class.
  private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(PageProIfan178.class);

  // Site encoding, retry count and sleep interval.
  Site site = Site.me().setCharset("utf-8").setRetryTimes(2).setSleepTime(3);

  /**
   * Queues the result links of a Baidu search page and extracts the app from a Baidu redirect
   * link page.
   *
   * @param page the downloaded page
   * @return the value of {@code PageProIfan178_Detail.getApkDetail(page)} when the URL is a
   *     redirect link, otherwise {@code null}
   */
  @Override
  public Apk process(Page page) {
    LOGGER.debug("crawler url: {}", page.getUrl());

    // Search result page.
    boolean isSearchPage = page.getUrl().regex("http://www\\.baidu\\.com/s\\?ie=UTF-8.*").match();
    if (isSearchPage) {
      LOGGER.debug("match success, url:{}", page.getUrl());

      // Result links, each distinct link queued once.
      List<String> resultLinks =
          page.getHtml().links("//div[@id='content_left']/div/h3[@class='t']").all();
      for (String link : Sets.newHashSet(resultLinks)) {
        page.addTargetRequest(link);
      }

      LOGGER.debug("app info results urls: {}", page.getTargetRequests());
    }

    // Redirect link page; the field extraction is delegated to PageProIfan178_Detail.
    boolean isLinkPage = page.getUrl().regex("http://www\\.baidu\\.com/link\\?url=.*").match();
    return isLinkPage ? PageProIfan178_Detail.getApkDetail(page) : null;
  }

  /**
   * Returns the crawl settings.
   *
   * @return site
   * @see us.codecraft.webmagic.Site
   */
  @Override
  public Site getSite() {
    return site;
  }

  /** Not implemented; always returns {@code null}. */
  @Override
  public List<Apk> processMulti(Page page) {
    // TODO Auto-generated method stub
    return null;
  }
}