public NewsFrontier(Environment env, CrawlConfig config, DocIDServer docIdServer) {
   super(env, config, docIdServer);
   this.counters = new Counters(env, config);
   this.docIdServer = docIdServer;
   try {
     // Queue of URLs waiting to be fetched; persisted when crawling is resumable.
     workQueues = new WorkQueues(env, "PendingURLsDB", config.isResumableCrawling());
     if (config.isResumableCrawling()) {
       // Restore the scheduled-pages counter and reschedule any URLs that were
       // still in flight when the previous crawl stopped.
       scheduledPages = counters.getValue(Counters.ReservedCounterNames.SCHEDULED_PAGES);
       inProcessPages = new InProcessPagesDB(env);
       long numPreviouslyInProcessPages = inProcessPages.getLength();
       if (numPreviouslyInProcessPages > 0) {
         logger.info("Rescheduling " + numPreviouslyInProcessPages + " URLs from the previous crawl.");
         // scheduleAll() counts these again below, so remove them from the tally first.
         scheduledPages -= numPreviouslyInProcessPages;
         // Drain the in-process store in batches: read up to 100 URLs,
         // reschedule them, then delete exactly that many from the store.
         while (true) {
           List<WebURL> urls = inProcessPages.get(100);
           if (urls.isEmpty()) {
             break;
           }
           scheduleAll(urls);
           inProcessPages.delete(urls.size());
         }
       }
     } else {
       inProcessPages = null;
       scheduledPages = 0;
     }
   } catch (DatabaseException e) {
     // Log the full stack trace; a null workQueues marks the Frontier as unusable.
     logger.error("Error while initializing the Frontier", e);
     workQueues = null;
   }
 }
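
The recovery loop above drains InProcessPagesDB in fixed-size batches: read up to 100 entries, reschedule them, then delete exactly that many. Below is a minimal standalone sketch of the same read/reschedule/delete pattern, using a hypothetical in-memory PendingStore in place of the Berkeley DB-backed store; all names in the sketch are illustrative, not crawler4j API.

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.List;

public class DrainDemo {

  // Hypothetical stand-in for InProcessPagesDB: get(n) reads without
  // consuming; delete(n) drops the first n entries, mirroring the
  // get(100)/delete(urls.size()) pairing in the constructor above.
  static class PendingStore {
    private final ArrayDeque<String> entries = new ArrayDeque<>();

    void put(String url) { entries.add(url); }

    List<String> get(int max) {
      List<String> batch = new ArrayList<>();
      for (String url : entries) {
        if (batch.size() == max) {
          break;
        }
        batch.add(url);
      }
      return batch;
    }

    void delete(int n) {
      for (int i = 0; i < n && !entries.isEmpty(); i++) {
        entries.removeFirst();
      }
    }
  }

  public static void main(String[] args) {
    PendingStore store = new PendingStore();
    for (int i = 0; i < 250; i++) {
      store.put("http://example.com/page" + i);
    }
    // Same loop shape as the constructor: batch, reschedule, delete.
    while (true) {
      List<String> batch = store.get(100);
      if (batch.isEmpty()) {
        break;
      }
      System.out.println("Rescheduling " + batch.size() + " URLs");
      store.delete(batch.size());
    }
  }
}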
Example #2
 // Lazily create the per-crawl gather directory and the DocumentList backed by it.
 private void prepareList() {
   if (this.list == null) {
     String path = Config.getContextRealPath() + CrawlConfig.getWebGatherDir();
     // Ensure a trailing separator before appending the crawl ID.
     if (!path.endsWith("/") && !path.endsWith("\\")) {
       path = path + "/";
     }
     path = path + this.config.getID() + "/";
     File f = new File(path);
     if (!f.exists()) {
       f.mkdirs();
     }
     this.list = new DocumentList(path);
   }
 }
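
For comparison, the separator handling and mkdirs() call in prepareList() can be collapsed with java.nio.file on Java 7+. This is a sketch under the assumption that the context path, gather directory, and crawl ID arrive as plain strings; prepareGatherDir is a hypothetical helper, not part of the class above.

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

public class GatherDirSketch {

  // Builds <contextRealPath>/<webGatherDir>/<crawlId> and creates any
  // missing directories; Paths.get handles separators, so the manual
  // endsWith("/") / endsWith("\\") checks become unnecessary.
  static Path prepareGatherDir(String contextRealPath, String webGatherDir,
                               String crawlId) throws IOException {
    Path dir = Paths.get(contextRealPath, webGatherDir, crawlId);
    return Files.createDirectories(dir); // no-op if it already exists
  }

  public static void main(String[] args) throws IOException {
    Path dir = prepareGatherDir("/var/www/app", "gather", "crawl-42");
    System.out.println("Document list directory: " + dir);
  }
}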