Exemple #1
0
 /**
  * Handles a context delivered by another module.
  *
  * <p>Only contexts whose {@code source} is an {@link IPluginManager} are acted upon; anything
  * else is ignored. The first such source is stored in {@code invoker}. Depending on the
  * context type this then calls {@link #init()}, calls {@code crawl()}, or prints
  * {@code "done"} to standard output.
  *
  * @param source module that sent the context
  * @param context context whose type selects the action to perform
  * @param target not used by this implementation
  */
 @Override
 public void reciveContext(IModule source, Context context, IModule target) {
   // Nothing to do unless the sender is a plugin manager.
   if (!(source instanceof IPluginManager)) {
     return;
   }

   // Keep the first plugin manager seen; later ones do not replace it.
   if (null == invoker) {
     invoker = source;
   }

   // The type is looked up again for every check, so a change made while
   // handling one phase is still observed by the checks that follow.
   if (ContextType.INIT == context.getContextType()) {
     init();
   }
   if (ContextType.CRAWL == context.getContextType()) {
     crawl();
   }
   if (ContextType.REPORT == context.getContextType()) {
     System.out.println("done");
   }
 }
Exemple #2
0
 /**
  * Prepares the crawl configuration for the court.gov.cn judgment pages.
  *
  * <p>Resets {@code array}, builds a new {@code crawlContext} of type
  * {@link ContextType#CRAWL} with this object as its original invoker, and fills a
  * {@link CrawlContextContent} with the URL rules, seed URL, limits, crawler type and the
  * page-processing callback. A log line is written once everything is wired up.
  *
  * <p>The callback inspects the last path segment of each crawled page's URL; when it matches
  * {@code t\d*_\d*\.htm} the text of the {@code #wenshu} element is trimmed and appended to
  * {@code array} as a JSON object under the key {@code "content"}. The links passed in are
  * always returned unchanged.
  */
 @Override
 public void init() {
   array = new JSONArray();

   crawlContext = new Context();
   crawlContext.setContextType(ContextType.CRAWL);
   crawlContext.setOriginalInvoker(this);

   content = new CrawlContextContent();
   content.setAutoParse(true);

   // URL rules. NOTE(review): "negative" rules presumably exclude matching URLs and
   // "positive" rules presumably admit them -- confirm against RegexRule.
   RegexRule regexRule = new RegexRule();
   regexRule.addNegative("^(file|ftp|mailto):");
   regexRule.addNegative(
       "\\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|CSS|sit|SIT|eps|EPS|wmf|WMF|zip|ZIP|ppt|PPT|mpg|MPG|xls|XLS|gz|GZ|rpm|RPM|tgz|TGZ|mov|MOV|exe|EXE|jpeg|JPEG|bmp|BMP|js|JS)$");
   regexRule.addNegative("[?*!@=]");
   regexRule.addPositive("http://www.court.gov.cn/zgcpwsw/[a-z]*/.*");

   seeds = new ArrayList<>();
   seeds.add("http://www.court.gov.cn/zgcpwsw/");

   content.setTopN(1000);
   content.setRegexRule(regexRule);
   content.setDepth(4);
   content.setSeeds(seeds);

   pluginProcess = new PluginProcess();
   pluginProcess.setProcess(
       new IPluginProcess() {
         @Override
         public Links process(Page page, Links links) {
           // Split once. String.split drops trailing empty strings, so a URL made up
           // only of slashes yields an empty array; guard before indexing the last
           // segment instead of failing with ArrayIndexOutOfBoundsException.
           String[] segments = page.getUrl().split("/");
           if (segments.length > 0
               && segments[segments.length - 1].matches("t\\d*_\\d*\\.htm")) {
             String text = page.getDoc().select("#wenshu").text();
             JSONObject object = new JSONObject();
             object.put("content", text.trim());
             array.put(object);
           }
           return links;
         }
       });
   content.setProcess(pluginProcess);
   content.setCrawlerType(Crawler.BREADTH);

   crawlContext.setContextContent(content);
   logger.info("court initiated");
 }