/**
 * Handles a context delivered by another module and dispatches on its type.
 *
 * <p>Contexts are only acted upon when {@code source} is an {@code IPluginManager};
 * anything else is ignored. The first accepted source is stored in {@code invoker}.
 * An {@code INIT} context triggers {@link #init()}, a {@code CRAWL} context triggers
 * {@code crawl()}, and a {@code REPORT} context prints {@code "done"} to standard output.
 *
 * @param source  the module that sent the context
 * @param context the context whose type selects the action
 * @param target  the addressed module; not used by this method
 */
@Override
public void reciveContext(IModule source, Context context, IModule target) {
    // Only a plugin manager is allowed to drive this plugin.
    if (!(source instanceof IPluginManager)) {
        return;
    }

    // Remember the first manager that contacted us; later ones do not replace it.
    if (invoker == null) {
        invoker = source;
    }

    // The checks are intentionally independent and re-read the type each time,
    // exactly as the handlers may be invoked in sequence.
    if (ContextType.INIT == context.getContextType()) {
        init();
    }
    if (ContextType.CRAWL == context.getContextType()) {
        crawl();
    }
    if (ContextType.REPORT == context.getContextType()) {
        System.out.println("done");
    }
}
/**
 * Prepares the crawl configuration for the court.gov.cn document site.
 *
 * <p>Resets the result {@code array}, creates a {@code CRAWL} context with this object as
 * its original invoker, and fills a {@code CrawlContextContent} with the URL rules, the
 * single seed URL, the limits (top-N 1000, depth 4), the breadth crawler type and a
 * page-processing callback. The callback appends the trimmed text of the {@code #wenshu}
 * element of every matching page to {@code array} as {@code {"content": ...}}.
 */
@Override
public void init() {
    array = new JSONArray();

    crawlContext = new Context();
    crawlContext.setContextType(ContextType.CRAWL);
    crawlContext.setOriginalInvoker(this);

    content = new CrawlContextContent();
    content.setAutoParse(true);

    // NOTE(review): presumably "negative" rules exclude URLs and "positive" rules admit
    // them; confirm against RegexRule.
    RegexRule regexRule = new RegexRule();
    regexRule.addNegative("^(file|ftp|mailto):");
    regexRule.addNegative(
        "\\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|CSS|sit|SIT|eps|EPS|wmf|WMF|zip|ZIP|ppt|PPT|mpg|MPG|xls|XLS|gz|GZ|rpm|RPM|tgz|TGZ|mov|MOV|exe|EXE|jpeg|JPEG|bmp|BMP|js|JS)$");
    regexRule.addNegative("[?*!@=]");
    regexRule.addPositive("http://www.court.gov.cn/zgcpwsw/[a-z]*/.*");

    seeds = new ArrayList<>();
    seeds.add("http://www.court.gov.cn/zgcpwsw/");

    content.setTopN(1000);
    content.setRegexRule(regexRule);
    content.setDepth(4);
    content.setSeeds(seeds);

    pluginProcess = new PluginProcess();
    pluginProcess.setProcess(
        new IPluginProcess() {
            @Override
            public Links process(Page page, Links links) {
                if (isDocumentPageUrl(page.getUrl())) {
                    String text = page.getDoc().select("#wenshu").text();
                    JSONObject object = new JSONObject();
                    object.put("content", text.trim());
                    array.put(object);
                }
                // Links are passed through untouched; this callback only harvests content.
                return links;
            }
        });
    content.setProcess(pluginProcess);
    content.setCrawlerType(Crawler.BREADTH);

    crawlContext.setContextContent(content);
    logger.info("court initiated");
}

/**
 * Tells whether the last path segment of {@code url} has the form
 * {@code t<digits>_<digits>.htm}.
 *
 * <p>The URL is split once on {@code "/"}. {@code String.split} discards trailing empty
 * strings, so a URL made only of slashes yields an empty array; that case (and a
 * {@code null} URL) returns {@code false} instead of failing with an index of -1.
 *
 * @param url the page URL, may be {@code null}
 * @return {@code true} if the last non-empty-trailing segment matches the document pattern
 */
private boolean isDocumentPageUrl(String url) {
    if (url == null) {
        return false;
    }
    String[] segments = url.split("/");
    if (segments.length == 0) {
        return false;
    }
    return segments[segments.length - 1].matches("t\\d*_\\d*\\.htm");
}