@Override
public void run() {
    try {
        for (int i = 0; i < otherWorkerIPPort.size(); i++) {
            BlockingQueue<URL> queue = getUrlsForOtherCrawlers().get(i);
            if (queue.isEmpty()) {
                continue;
            }

            // Build the POST body: NEW_URLS_PARAM=<url1>;<url2>;...
            StringBuilder contentBuilder = new StringBuilder();
            contentBuilder.append(DispatcherConstants.NEW_URLS_PARAM + "=");

            // Send at most URLS_TO_SEND URLs per worker per run
            int items = DispatcherConstants.URLS_TO_SEND;
            while (!queue.isEmpty() && items > 0) {
                URL url = queue.take();
                String cleansedURL = URLEncoder.encode(url.toString(), CrawlerConstants.CHARSET);
                contentBuilder.append(cleansedURL + ";");
                items--;
            }

            String urlString = "http://" + otherWorkerIPPort.get(i) + "/worker/"
                    + DispatcherConstants.ADD_URLS_URL;
            URL url = new URL(urlString);
            DispatcherUtils.sendHttpRequest(
                    url, contentBuilder.toString(), DispatcherUtils.Method.POST, true);
        }
    } catch (Exception e) {
        Utils.logStackTrace(e);
    }
}
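// Illustrative sketch only (not taken from this project): the receiving worker is
// expected to reverse the encoding above, splitting the NEW_URLS_PARAM value on ';'
// and URL-decoding each entry with the same charset. The helper name decodeUrlBatch
// is hypothetical and assumes java.net.URLDecoder and java.util.List/ArrayList are
// imported; only the ';' separator, the charset constant, and the parameter format
// come from the run() method above.
private static List<URL> decodeUrlBatch(String newUrlsValue) throws Exception {
    List<URL> urls = new ArrayList<URL>();
    for (String encoded : newUrlsValue.split(";")) {
        if (encoded.isEmpty()) {
            continue; // the sender appends a trailing ';' after every URL
        }
        urls.add(new URL(URLDecoder.decode(encoded, CrawlerConstants.CHARSET)));
    }
    return urls;
}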
/** Start the crawler. */
public void startCrawler() {
    try {
        logger.info(CLASSNAME + ": Starting crawler");
        initialise();
    } catch (Exception e) {
        Utils.logStackTrace(e);
    }
}
/**
 * Returns the number of documents that have been crawled and saved.
 *
 * @return the number of files in the storage directory, or -1 if it cannot be read
 */
public int getNumCrawledDocuments() {
    try {
        File storageDirectory =
                new File(CrawlerConstants.DB_DIRECTORY + CrawlerConstants.STORAGE_DIRECTORY);
        return storageDirectory.list().length;
    } catch (Exception e) {
        Utils.logStackTrace(e);
        return -1;
    }
}
/**
 * Set up the entire crawler framework.
 *
 * @throws NoSuchAlgorithmException
 */
private void initialise() throws NoSuchAlgorithmException {
    this.dbEnvDir = CrawlerConstants.DB_DIRECTORY;

    this.storageDirectory = CrawlerConstants.DB_DIRECTORY + CrawlerConstants.STORAGE_DIRECTORY;
    Utils.createDirectory(this.storageDirectory);

    this.urlStorageDirectory = CrawlerConstants.DB_DIRECTORY + CrawlerConstants.URL_STORAGE_DIRECTORY;
    Utils.createDirectory(this.urlStorageDirectory);

    initialiseDb();

    // Work queues shared between the crawler's thread pools
    this.getCrawlQueue = new ArrayBlockingQueue<URL>(CrawlerConstants.QUEUE_CAPACITY);
    this.contentForLinkExtractor =
            new ArrayBlockingQueue<RawCrawledItem>(CrawlerConstants.QUEUE_CAPACITY);
    this.preRedistributionNewURLQueue =
            new ArrayBlockingQueue<URL>(CrawlerConstants.NEW_URL_QUEUE_CAPACITY);

    this.siteInfoMap = new ConcurrentHashMap<String, SiteInfo>();
    this.sitesCrawledThisSession = new HashSet<String>();

    // One outbound queue per remote worker, indexed to match otherWorkerIPPort
    this.urlsForOtherCrawlers = new ConcurrentHashMap<Integer, BlockingQueue<URL>>();
    for (int i = 0; i < this.otherWorkerIPPort.size(); i++) {
        this.urlsForOtherCrawlers.put(
                i, new ArrayBlockingQueue<URL>(CrawlerConstants.SMALL_QUEUE_CAPACITY));
    }

    linkQueuerThreadPool();
    initialiseLinkExtractorThreadPool();
    initialiseGetThreadPool();
    initialiseMatcherPool();

    loadInStartingURLS();

    // Periodically push queued URLs to the workers that own them (see RedistributeURLsTask)
    Timer timer = new Timer();
    timer.scheduleAtFixedRate(
            new RedistributeURLsTask(), 0, DispatcherConstants.REDISTRIBUTE_URLS_FREQUENCY_MS);
}
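// Hypothetical sketch of how a newly discovered URL could be routed into one of the
// per-worker queues created in initialise(). The real assignment policy lives elsewhere
// in this project (presumably the matcher threads started by initialiseMatcherPool());
// hashing the hostname to a worker index is only an assumption used for illustration.
private void routeToWorkerQueue(URL url) throws InterruptedException {
    int worker = Math.floorMod(url.getHost().hashCode(), otherWorkerIPPort.size());
    // put() blocks if that worker's outbound queue is already full
    urlsForOtherCrawlers.get(worker).put(url);
}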