Code example #1
 /**
  * Start method: loads the seed URL list, derives a host-based filter rule
  * for each seed, and launches a keyword search per seed.
  *
  * @param kw the search keyword passed through to {@code search}
  */
 public void start(String kw) {
   UrlseedDao dao = new UrlseedDao();
   List<String> list = dao.findAll(null);
   if (list == null) {
     return; // no seeds configured — nothing to crawl
   }
   for (String li : list) {
     int first = li.indexOf('.');
     int last = li.lastIndexOf('.');
     // A seed needs at least two dots (e.g. "www.example.com") for the
     // extraction below; otherwise substring(first + 1, last) would throw
     // StringIndexOutOfBoundsException (begin > end). Skip malformed seeds.
     if (first < 0 || first >= last) {
       continue;
     }
     String match = li.substring(first + 1, last);
     // Quote the extracted fragment so dots (and any other regex
     // metacharacters) in the domain are matched literally, not as wildcards.
     String quoted = java.util.regex.Pattern.quote(match);
     regexRule.addRule("^http://.*" + quoted + ".*/.*");
     regexRule.addRule("^https://.*" + quoted + ".*/.*");
     regexRule.addRule("^ftp://.*" + quoted + ".*/.*");
     search(li, kw);
   }
 }
Code example #2
File: PicCrawler.java — Project: dluobo/SpiderWeb
  /**
   * Visits a page: saves its content to disk when the URL looks like an image,
   * then collects follow-up links matching the site filter.
   *
   * @param page the fetched page (URL, HTML, raw content, parsed document)
   * @return the links to crawl next
   */
  @Override
  public Links visitAndGetNextLinks(Page page) {
    System.out.println(page.getHtml());
    String url = page.getUrl();
    // One combined pattern instead of three separate Pattern.matches calls.
    if (Pattern.matches(".*(jpg|png|gif)$", url)) {
      // Preserve the actual image type instead of saving everything as ".jpg".
      String ext = url.endsWith("png") ? "png" : url.endsWith("gif") ? "gif" : "jpg";
      try {
        FileUtils.writeFileWithParent(
            "download/" + id.incrementAndGet() + "." + ext, page.getContent());
        System.out.println("download:" + url);
      } catch (IOException e) {
        e.printStackTrace();
      }
    }

    MyLinks nextLinks = new MyLinks();
    RegexRule rr = new RegexRule();
    rr.addRule(".*meishij.*");
    nextLinks.addAllFromDocument(page.getDoc(), rr);
    nextLinks.filterImgUrl(page.getDoc(), rr);

    System.out.println(nextLinks.size());
    try {
      // Politeness delay between page visits.
      Thread.sleep(1500);
    } catch (InterruptedException e) {
      // Restore the interrupt flag instead of swallowing the interruption.
      Thread.currentThread().interrupt();
    }
    return nextLinks;
  }