/**
 * Creates the frontier on top of the given database environment.
 *
 * Opens the "PendingURLsDB" work queue. When resumable crawling is enabled,
 * the scheduled-pages counter is restored from {@code counters}, and any URLs
 * still recorded in the in-process store are moved back into the schedule in
 * batches of 100. When resumable crawling is disabled, no in-process store is
 * kept ({@code inProcessPages} is {@code null}) and the scheduled-pages count
 * starts at zero.
 *
 * If the database cannot be initialized the failure is logged (with its stack
 * trace) and {@code workQueues} is left {@code null}.
 *
 * @param env         database environment backing the queues and counters
 * @param config      crawl configuration; {@code isResumableCrawling()} selects the mode
 * @param docIdServer document-id server stored for later use by this frontier
 */
public Frontier(Environment env, CrawlConfig config, DocIDServer docIdServer) {
    super(config);
    this.counters = new Counters(env, config);
    this.docIdServer = docIdServer;
    try {
        workQueues = new WorkQueues(env, "PendingURLsDB", config.isResumableCrawling());
        if (config.isResumableCrawling()) {
            scheduledPages = counters.getValue(ReservedCounterNames.SCHEDULED_PAGES);
            inProcessPages = new InProcessPagesDB(env);
            long numPreviouslyInProcessPages = inProcessPages.getLength();
            if (numPreviouslyInProcessPages > 0) {
                logger.info("Rescheduling " + numPreviouslyInProcessPages + " URLs from previous crawl.");
                // NOTE(review): presumably scheduleAll() counts these URLs again,
                // hence the subtraction up front — confirm against scheduleAll().
                scheduledPages -= numPreviouslyInProcessPages;
                // Drain the in-process store batch by batch until it is empty.
                while (true) {
                    List<WebURL> urls = inProcessPages.get(100);
                    if (urls.isEmpty()) {
                        break;
                    }
                    scheduleAll(urls);
                    inProcessPages.delete(urls.size());
                }
            }
        } else {
            inProcessPages = null;
            scheduledPages = 0;
        }
    } catch (DatabaseException e) {
        // Pass the exception itself so the stack trace is not lost.
        logger.error("Error while initializing the Frontier: " + e.getMessage(), e);
        workQueues = null;
    }
}
public void getNextURLs(int max, List<WebURL> result) { while (true) { synchronized (mutex) { if (isFinished) { return; } try { List<WebURL> curResults = workQueues.get(max); workQueues.delete(curResults.size()); if (inProcessPages != null) { for (WebURL curPage : curResults) { inProcessPages.put(curPage); } } result.addAll(curResults); } catch (DatabaseException e) { logger.error("Error while getting next urls: " + e.getMessage()); e.printStackTrace(); } if (result.size() > 0) { return; } } try { synchronized (waitingList) { waitingList.wait(); } } catch (InterruptedException ignored) { // Do nothing } if (isFinished) { return; } } }
/**
 * Records that a page has been processed.
 *
 * Always increments the processed-pages counter. When an in-process store
 * exists, the URL is also removed from it; a warning is logged if the removal
 * reports failure.
 *
 * @param webURL the URL whose processing has completed
 */
public void setProcessed(WebURL webURL) {
    counters.increment(ReservedCounterNames.PROCESSED_PAGES);

    // No in-process store means there is nothing further to update.
    if (inProcessPages == null) {
        return;
    }

    boolean removed = inProcessPages.removeURL(webURL);
    if (!removed) {
        logger.warn("Could not remove: " + webURL.getURL() + " from list of processed pages.");
    }
}
/**
 * Returns the number of entries currently held in the in-process store.
 *
 * @return the in-process store's length, or 0 when there is no in-process
 *         store ({@code inProcessPages} is {@code null})
 */
public long getNumberOfAssignedPages() {
    // inProcessPages may be null; avoid a NullPointerException in that case.
    if (inProcessPages == null) {
        return 0;
    }
    return inProcessPages.getLength();
}