/**
 * Processes one fetched page: prints its HTML, saves the page content to disk when the URL ends
 * in an image extension, and gathers the links to follow next.
 *
 * <p>Image detection uses the URL only: a URL matching {@code .*jpg$}, {@code .*png$} or
 * {@code .*gif$} is written to {@code download/<counter>.<extension>}. A failed write is reported
 * and otherwise ignored so that link collection still happens.
 *
 * <p>The method sleeps for 1500 ms before returning (presumably to throttle requests — confirm
 * with the crawler's scheduling). If the thread is interrupted during that pause, the interrupt
 * flag is restored so the caller can observe it.
 *
 * @param page the fetched page
 * @return the links collected from the page's document under the {@code .*meishij.*} rule
 */
@Override
public Links visitAndGetNextLinks(Page page) {
    System.out.println(page.getHtml());

    String url = page.getUrl();
    for (String extension : new String[] {"jpg", "png", "gif"}) {
        if (!Pattern.matches(".*" + extension + "$", url)) {
            continue;
        }
        try {
            // Keep the matched extension: PNG and GIF content used to be stored as ".jpg".
            // `id` is a counter declared outside this block; it is advanced once per download.
            FileUtils.writeFileWithParent(
                    "download/" + id.incrementAndGet() + "." + extension, page.getContent());
            System.out.println("download:" + url);
        } catch (IOException e) {
            // Best effort: a failed download must not stop link extraction.
            e.printStackTrace();
        }
        // A URL can end in only one of the extensions.
        break;
    }

    MyLinks nextLinks = new MyLinks();
    RegexRule rr = new RegexRule();
    rr.addRule(".*meishij.*");
    nextLinks.addAllFromDocument(page.getDoc(), rr);
    // NOTE(review): filterImgUrl is defined elsewhere; presumably it adds image URLs from the
    // document that pass the rule — confirm against MyLinks.
    nextLinks.filterImgUrl(page.getDoc(), rr);
    System.out.println(nextLinks.size());

    try {
        Thread.sleep(1500);
    } catch (InterruptedException e) {
        // Restore the interrupt status instead of swallowing it, so a shutdown request
        // is still visible to the code that runs this visitor.
        Thread.currentThread().interrupt();
    }
    return nextLinks;
}
/** 启动方法,根据关键字解析种子列表,并设置规则 */ public void start(String kw) { UrlseedDao dao = new UrlseedDao(); List<String> list = dao.findAll(null); for (String li : list) { String match = li.substring(li.indexOf(".") + 1, li.lastIndexOf(".")); regexRule.addRule("^http://.*" + match + ".*/.*"); // 忽略大小写? regexRule.addRule("^https://.*" + match + ".*/.*"); regexRule.addRule("^ftp://.*" + match + ".*/.*"); // regexRule.addRule("-.*[.][(jpg)|(png)|(gif)|(bmp)|(jpeg)]$"); search(li, kw); } }
/**
 * Builds the crawl configuration for the {@code http://www.court.gov.cn/zgcpwsw/} seed.
 *
 * <p>Sets up a fresh result array, a crawl-type context whose original invoker is this object,
 * and a context content with auto-parse enabled, top-N 1000, depth 4 and breadth crawler type.
 * The URL rule rejects {@code file:}/{@code ftp:}/{@code mailto:} URLs, URLs ending in the listed
 * static-resource extensions, and URLs containing any of {@code ? * ! @ =}; it accepts URLs under
 * {@code http://www.court.gov.cn/zgcpwsw/[a-z]*&#47;}.
 *
 * <p>The registered page callback looks at the last {@code '/'}-separated segment of the page
 * URL; when it matches {@code t\d*_\d*\.htm}, the trimmed text of the {@code #wenshu} element is
 * stored under the key {@code "content"} in a new JSON object appended to the result array. The
 * callback always returns the links it was given.
 */
@Override
public void init() {
    array = new JSONArray();

    crawlContext = new Context();
    crawlContext.setContextType(ContextType.CRAWL);
    crawlContext.setOriginalInvoker(this);

    content = new CrawlContextContent();
    content.setAutoParse(true);

    // URL filter: three exclusions followed by the single accepted URL shape.
    RegexRule urlFilter = new RegexRule();
    urlFilter.addNegative("^(file|ftp|mailto):");
    urlFilter.addNegative(
            "\\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|CSS|sit|SIT|eps|EPS|wmf|WMF|zip|ZIP|ppt|PPT|mpg|MPG|xls|XLS|gz|GZ|rpm|RPM|tgz|TGZ|mov|MOV|exe|EXE|jpeg|JPEG|bmp|BMP|js|JS)$");
    urlFilter.addNegative("[?*!@=]");
    urlFilter.addPositive("http://www.court.gov.cn/zgcpwsw/[a-z]*/.*");

    seeds = new ArrayList<>();
    seeds.add("http://www.court.gov.cn/zgcpwsw/");

    content.setTopN(1000);
    content.setRegexRule(urlFilter);
    content.setDepth(4);
    content.setSeeds(seeds);

    // Callback that records the "#wenshu" text of pages whose file name looks like t<n>_<n>.htm.
    IPluginProcess wenshuCollector =
            new IPluginProcess() {
                @Override
                public Links process(Page page, Links links) {
                    String[] segments = page.getUrl().split("/");
                    String lastSegment = segments[segments.length - 1];
                    if (!lastSegment.matches("t\\d*_\\d*\\.htm")) {
                        return links;
                    }
                    String text = page.getDoc().select("#wenshu").text();
                    JSONObject entry = new JSONObject();
                    entry.put("content", text.trim());
                    array.put(entry);
                    return links;
                }
            };
    pluginProcess = new PluginProcess();
    pluginProcess.setProcess(wenshuCollector);

    content.setProcess(pluginProcess);
    content.setCrawlerType(Crawler.BREADTH);
    crawlContext.setContextContent(content);

    logger.info("court initiated");
}