/**
 * Creates the frontier, opening the pending-URLs work queue and, when resumable
 * crawling is enabled, re-scheduling any URLs left in-process by a previous run.
 *
 * @param env         Berkeley DB environment backing the queues and counters
 * @param config      crawl configuration (resumability flag is read here)
 * @param docIdServer document-id service, forwarded to the superclass and retained
 */
public NewsFrontier(Environment env, CrawlConfig config, DocIDServer docIdServer) {
    super(env, config, docIdServer);
    this.counters = new Counters(env, config);
    this.docIdServer = docIdServer;
    try {
        workQueues = new WorkQueues(env, "PendingURLsDB", config.isResumableCrawling());
        if (config.isResumableCrawling()) {
            scheduledPages = counters.getValue(Counters.ReservedCounterNames.SCHEDULED_PAGES);
            inProcessPages = new InProcessPagesDB(env);
            long numPreviouslyInProcessPages = inProcessPages.getLength();
            if (numPreviouslyInProcessPages > 0) {
                logger.info("Rescheduling " + numPreviouslyInProcessPages + " URLs from previous crawl.");
                // These pages were counted as scheduled before the crash/stop;
                // subtract them now so re-scheduling below doesn't double-count.
                scheduledPages -= numPreviouslyInProcessPages;
                // Drain the in-process DB in batches of 100, re-queueing each batch.
                while (true) {
                    List<WebURL> urls = inProcessPages.get(100);
                    if (urls.isEmpty()) {
                        break;
                    }
                    scheduleAll(urls);
                    inProcessPages.delete(urls.size());
                }
            }
        } else {
            inProcessPages = null;
            scheduledPages = 0;
        }
    } catch (DatabaseException e) {
        // FIX: previously only e.getMessage() was logged, discarding the stack
        // trace. Pass the exception so the full cause is recorded. The frontier
        // is deliberately left with workQueues == null (existing contract) rather
        // than rethrowing, to keep caller behavior unchanged.
        logger.error("Error while initializing the Frontier: " + e.getMessage(), e);
        workQueues = null;
    }
}
/**
 * Lazily initializes the document list on first use: builds the per-crawl
 * gather directory path (context root + gather dir + crawl id), creates the
 * directory if it does not yet exist, and wraps it in a {@code DocumentList}.
 * No-op if the list has already been created.
 */
private void prepareList() {
    if (this.list != null) {
        return;
    }
    String dir = Config.getContextRealPath() + CrawlConfig.getWebGatherDir();
    // Normalize: ensure the base path ends with a separator before appending the id.
    boolean endsWithSeparator = dir.endsWith("/") || dir.endsWith("\\");
    if (!endsWithSeparator) {
        dir = dir + "/";
    }
    dir = dir + this.config.getID() + "/";
    File gatherDir = new File(dir);
    if (!gatherDir.exists()) {
        gatherDir.mkdirs();
    }
    this.list = new DocumentList(dir);
}