Example #1
0
  /**
   * Handles one fetched page: stores it on disk when its URL looks like an image, then gathers
   * the links to visit next.
   *
   * <p>The page HTML and the number of gathered links are printed to standard output. When the
   * URL ends in {@code jpg}, {@code png} or {@code gif}, the page content is written to
   * {@code download/<counter><extension>}, where the extension mirrors the URL suffix. An
   * {@link IOException} during the write is reported and otherwise ignored, so link extraction
   * still runs. Links are gathered from the page document with a {@code RegexRule} holding
   * {@code .*meishij.*}. The method sleeps 1500 ms before returning.
   *
   * @param page the fetched page
   * @return the links gathered from the page's document
   */
  @Override
  public Links visitAndGetNextLinks(Page page) {
    System.out.println(page.getHtml());

    String url = page.getUrl();
    // Keep the real image type in the file name instead of labelling every download ".jpg".
    String extension = null;
    if (url.endsWith("jpg")) {
      extension = ".jpg";
    } else if (url.endsWith("png")) {
      extension = ".png";
    } else if (url.endsWith("gif")) {
      extension = ".gif";
    }

    if (extension != null) {
      try {
        FileUtils.writeFileWithParent(
            "download/" + id.incrementAndGet() + extension, page.getContent());
        System.out.println("download:" + url);
      } catch (IOException e) {
        // Best effort: a failed download must not stop link extraction.
        e.printStackTrace();
      }
    }

    MyLinks nextLinks = new MyLinks();
    RegexRule rr = new RegexRule();
    rr.addRule(".*meishij.*");
    nextLinks.addAllFromDocument(page.getDoc(), rr);
    nextLinks.filterImgUrl(page.getDoc(), rr);

    System.out.println(nextLinks.size());
    try {
      // NOTE(review): presumably a politeness delay between requests - confirm.
      Thread.sleep(1500);
    } catch (InterruptedException e) {
      // Restore the interrupt flag so the surrounding crawler can observe the interruption.
      Thread.currentThread().interrupt();
      e.printStackTrace();
    }
    return nextLinks;
  }
 /**
  * Entry point: loads the seed URLs, registers crawl rules derived from each seed, and runs a
  * search on that seed for the given keyword.
  *
  * <p>For every seed, the text between its first and last {@code '.'} is embedded in three rules
  * (one each for {@code http}, {@code https} and {@code ftp}) that are added to
  * {@code regexRule}. A seed that is {@code null} or does not contain two distinct dots cannot
  * yield such a fragment; it is reported on standard error and skipped instead of aborting the
  * whole run with a {@link StringIndexOutOfBoundsException}.
  *
  * @param kw the keyword passed to {@code search} for every usable seed
  */
 public void start(String kw) {
   UrlseedDao dao = new UrlseedDao();
   List<String> list = dao.findAll(null);
   for (String li : list) {
     int firstDot = (li == null) ? -1 : li.indexOf('.');
     int lastDot = (li == null) ? -1 : li.lastIndexOf('.');
     if (firstDot < 0 || firstDot == lastDot) {
       // substring(firstDot + 1, lastDot) would throw here; skip this seed only.
       System.err.println("skip seed, cannot derive rule: " + li);
       continue;
     }
     // NOTE(review): the fragment is inserted into the regex unescaped, so any '.' inside it
     // matches an arbitrary character - confirm whether that is intended.
     String match = li.substring(firstDot + 1, lastDot);
     regexRule.addRule("^http://.*" + match + ".*/.*"); // ignore case?
     regexRule.addRule("^https://.*" + match + ".*/.*");
     regexRule.addRule("^ftp://.*" + match + ".*/.*");
     //			regexRule.addRule("-.*[.][(jpg)|(png)|(gif)|(bmp)|(jpeg)]$");
     search(li, kw);
   }
 }
Example #3
0
 /**
  * Prepares the crawl context for the seed {@code http://www.court.gov.cn/zgcpwsw/}.
  *
  * <p>Resets the result array, creates a {@code CRAWL} context whose original invoker is this
  * object, and fills the context content with: auto-parse enabled, URL filter rules, a top-N of
  * 1000, a depth of 4, the single seed, a page callback, and the {@code BREADTH} crawler type.
  * The callback stores the trimmed text of the {@code #wenshu} element for every page whose last
  * URL path segment matches {@code t\d*_\d*\.htm}.
  */
 @Override
 public void init() {
   array = new JSONArray();

   crawlContext = new Context();
   crawlContext.setContextType(ContextType.CRAWL);
   crawlContext.setOriginalInvoker(this);

   content = new CrawlContextContent();
   content.setAutoParse(true);

   // URL filter: three negative rules followed by one positive rule.
   RegexRule urlFilter = new RegexRule();
   urlFilter.addNegative("^(file|ftp|mailto):");
   urlFilter.addNegative(
       "\\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|CSS|sit|SIT|eps|EPS|wmf|WMF|zip|ZIP|ppt|PPT|mpg|MPG|xls|XLS|gz|GZ|rpm|RPM|tgz|TGZ|mov|MOV|exe|EXE|jpeg|JPEG|bmp|BMP|js|JS)$");
   urlFilter.addNegative("[?*!@=]");
   urlFilter.addPositive("http://www.court.gov.cn/zgcpwsw/[a-z]*/.*");

   seeds = new ArrayList<>();
   seeds.add("http://www.court.gov.cn/zgcpwsw/");

   content.setTopN(1000);
   content.setRegexRule(urlFilter);
   content.setDepth(4);
   content.setSeeds(seeds);

   pluginProcess = new PluginProcess();
   IPluginProcess wenshuCollector =
       new IPluginProcess() {
         @Override
         public Links process(Page page, Links links) {
           // Only the last path segment of the URL decides whether the page is collected.
           String[] segments = page.getUrl().split("/");
           String lastSegment = segments[segments.length - 1];
           if (!lastSegment.matches("t\\d*_\\d*\\.htm")) {
             return links;
           }
           String wenshuText = page.getDoc().select("#wenshu").text();
           JSONObject entry = new JSONObject();
           entry.put("content", wenshuText.trim());
           array.put(entry);
           return links;
         }
       };
   pluginProcess.setProcess(wenshuCollector);

   content.setProcess(pluginProcess);
   content.setCrawlerType(Crawler.BREADTH);
   crawlContext.setContextContent(content);
   logger.info("court initiated");
 }