Example #1
    @Override
    public void run() {

      try {
        for (int i = 0; i < otherWorkerIPPort.size(); i++) {

          // Queue of URLs destined for the i-th remote worker
          BlockingQueue<URL> queue = getUrlsForOtherCrawlers().get(i);

          if (queue.isEmpty()) {
            continue;
          }

          // Build the POST body: NEW_URLS_PARAM=url1;url2;...
          StringBuilder contentBuilder = new StringBuilder();
          contentBuilder.append(DispatcherConstants.NEW_URLS_PARAM + "=");

          // Drain at most URLS_TO_SEND URLs from the queue into this request
          int items = DispatcherConstants.URLS_TO_SEND;

          while (!queue.isEmpty() && items > 0) {
            URL url = queue.take();
            String cleansedURL = URLEncoder.encode(url.toString(), CrawlerConstants.CHARSET);
            contentBuilder.append(cleansedURL + ";");
            items--;
          }

          // POST the batch to the remote worker's "add URLs" endpoint
          String urlString =
              "http://" + otherWorkerIPPort.get(i) + "/worker/" + DispatcherConstants.ADD_URLS_URL;
          URL url = new URL(urlString);
          DispatcherUtils.sendHttpRequest(
              url, contentBuilder.toString(), DispatcherUtils.Method.POST, true);
        }

      } catch (Exception e) {
        Utils.logStackTrace(e);
      }
    }
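
The body built above is a single form parameter of the shape NEW_URLS_PARAM=url1;url2;..., with every URL individually encoded in CrawlerConstants.CHARSET. The receiving side is not shown here; the following is a minimal sketch of how a worker's ADD_URLS_URL handler could decode that parameter (the method name parseNewUrlsParam is hypothetical, and the sketch assumes java.net.URLDecoder, java.util.ArrayList and java.util.List are imported):

    // Sketch (hypothetical helper): split and decode the posted URL batch.
    private List<URL> parseNewUrlsParam(String body) throws Exception {
      List<URL> urls = new ArrayList<URL>();
      String prefix = DispatcherConstants.NEW_URLS_PARAM + "=";
      if (body == null || !body.startsWith(prefix)) {
        return urls;
      }
      for (String encoded : body.substring(prefix.length()).split(";")) {
        if (!encoded.isEmpty()) {
          urls.add(new URL(URLDecoder.decode(encoded, CrawlerConstants.CHARSET)));
        }
      }
      return urls;
    }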
Example #2
 /** Start the crawler */
 public void startCrawler() {
   try {
     logger.info(CLASSNAME + ": Starting crawler");
     initialise();
   } catch (Exception e) {
     Utils.logStackTrace(e);
   }
 }
Example #3
 /**
  * Returns the number of documents that have been crawled and saved
  *
  * @return the number of crawled and saved documents, or -1 if the storage directory cannot be read
  */
 public int getNumCrawledDocuments() {
   try {
     File storageDirectory =
         new File(CrawlerConstants.DB_DIRECTORY + CrawlerConstants.STORAGE_DIRECTORY);
     return storageDirectory.list().length;
   } catch (Exception e) {
     Utils.logStackTrace(e);
     return -1;
   }
 }
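
Note that File.list() returns null when the path does not denote a readable directory, so the catch block above is also what maps a missing storage directory to -1 (via the resulting NullPointerException). A null-checked variant, shown here only as a sketch and not part of the project, makes that behaviour explicit:

    // Sketch: same result without relying on an exception when the storage
    // directory is missing or unreadable.
    public int getNumCrawledDocumentsNullSafe() {
      File storageDirectory =
          new File(CrawlerConstants.DB_DIRECTORY + CrawlerConstants.STORAGE_DIRECTORY);
      String[] entries = storageDirectory.list();
      return entries == null ? -1 : entries.length;
    }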
Example #4
  /**
   * Set up the entire crawler framework
   *
   * @throws NoSuchAlgorithmException if an algorithm required during initialisation is unavailable
   */
  private void initialise() throws NoSuchAlgorithmException {

    this.dbEnvDir = CrawlerConstants.DB_DIRECTORY;
    this.storageDirectory = CrawlerConstants.DB_DIRECTORY + CrawlerConstants.STORAGE_DIRECTORY;
    Utils.createDirectory(this.storageDirectory);
    this.urlStorageDirectory =
        CrawlerConstants.DB_DIRECTORY + CrawlerConstants.URL_STORAGE_DIRECTORY;
    Utils.createDirectory(this.urlStorageDirectory);

    initialiseDb();

    this.getCrawlQueue = new ArrayBlockingQueue<URL>(CrawlerConstants.QUEUE_CAPACITY);

    this.contentForLinkExtractor =
        new ArrayBlockingQueue<RawCrawledItem>(CrawlerConstants.QUEUE_CAPACITY);
    this.preRedistributionNewURLQueue =
        new ArrayBlockingQueue<URL>(CrawlerConstants.NEW_URL_QUEUE_CAPACITY);

    this.siteInfoMap = new ConcurrentHashMap<String, SiteInfo>();
    this.sitesCrawledThisSession = new HashSet<String>();

    this.urlsForOtherCrawlers = new ConcurrentHashMap<Integer, BlockingQueue<URL>>();
    for (int i = 0; i < this.otherWorkerIPPort.size(); i++) {
      this.urlsForOtherCrawlers.put(
          i, new ArrayBlockingQueue<URL>(CrawlerConstants.SMALL_QUEUE_CAPACITY));
    }

    linkQueuerThreadPool();
    initialiseLinkExtractorThreadPool();
    initialiseGetThreadPool();
    initialiseMatcherPool();
    loadInStartingURLS();
    Timer timer = new Timer();
    timer.scheduleAtFixedRate(
        new RedistributeURLsTask(), 0, DispatcherConstants.REDISTRIBUTE_URLS_FREQUENCY_MS);
  }
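
The final lines hand the redistribution work to a java.util.Timer: scheduleAtFixedRate(task, 0, period) runs the task once immediately and then every period milliseconds on the timer's single background thread, which is how RedistributeURLsTask (whose run() is presumably the redistribution loop shown in Example #1) keeps pushing queued URLs to the other workers. Below is a self-contained sketch of that scheduling pattern, with placeholder names and a placeholder 5000 ms period rather than the project's own constants:

    import java.util.Timer;
    import java.util.TimerTask;

    public class FixedRateSketch {

      public static void main(String[] args) throws InterruptedException {
        Timer timer = new Timer();
        // First run fires immediately (delay 0), then every 5000 ms,
        // mirroring how RedistributeURLsTask is scheduled in initialise().
        timer.scheduleAtFixedRate(
            new TimerTask() {
              @Override
              public void run() {
                System.out.println("redistribute tick");
              }
            },
            0,
            5000);

        // Let a few ticks happen, then stop the timer thread.
        Thread.sleep(12000);
        timer.cancel();
      }
    }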