/**
 * Requests shutdown of the processing loop.
 *
 * <p>Sets the {@code stop} flag, tries to enqueue an exit task on the crawler response queue
 * (logging a warning if the queue rejects it), and finally aborts the request that is currently
 * recorded in {@code activeRequest}, if there is one. The abort is performed while holding
 * {@code requestLock}.
 */
void stop() {
  stop = true;

  final boolean sentinelQueued = crawlerResponseQueue.offer(CrawlerTask.createExitTask());
  if (!sentinelQueued) {
    logger.warning("Failed to add STOP sentinel to crawler response queue");
  }

  synchronized (requestLock) {
    if (activeRequest == null) {
      // Nothing is in flight, so there is nothing to abort.
      return;
    }
    activeRequest.abort();
  }
}
Exemple #2
0
  /**
   * Inserts the given crawler tasks into {@code yamaloo.crawlertask}, skipping every task whose
   * (BatchID, UrlHash) pair already exists in the table.
   *
   * <p>Rows are queued as a JDBC batch that is executed and committed after every 1000 tasks,
   * followed by one final execute/commit for the remainder. Auto-commit is turned off for the
   * duration of the call and is set back to {@code true} on exit, whether or not the insert
   * succeeded. On failure the uncommitted part of the work is rolled back first (chunks that were
   * already committed stay committed) and the original failure is rethrown.
   *
   * @param list tasks to insert
   * @throws Throwable if a JDBC call or a task accessor fails
   */
  public void insertCrawlerTaskList(Iterable<CrawlerTask> list) throws Throwable {
    conn.setAutoCommit(false);
    try {
      PreparedStatement statement =
          conn.prepareStatement(
              "INSERT INTO `yamaloo`.`crawlertask`"
                  + " (`BatchID`, `UrlHash`, `Url`, `Depth`, `ParentTaskID`, `Status`, `CreateTime`)"
                  + " SELECT ?, ?, ?, ?, ?, ?, ? FROM dual"
                  + " WHERE not exists"
                  + " (select * from `yamaloo`.`crawlertask` where BatchID = ? AND UrlHash = ?); ");
      try {
        int count = 0;
        for (CrawlerTask task : list) {
          statement.setInt(1, task.getBatchID());
          statement.setString(2, task.getUrlHash());
          statement.setString(3, task.getUrl().toString());
          statement.setInt(4, task.getDepth());
          statement.setInt(5, task.getParentTaskID());
          statement.setString(6, task.getStatus().toString());
          statement.setTimestamp(7, task.getCreateTime());
          // Parameters 8 and 9 feed the NOT EXISTS sub-query that filters out duplicates.
          statement.setInt(8, task.getBatchID());
          statement.setString(9, task.getUrlHash());

          statement.addBatch();
          count++;

          // Flush in chunks of 1000 rows.
          if (count >= 1000) {
            count = 0;
            statement.executeBatch();
            conn.commit();
          }
        }

        // Flush whatever is left after the last full chunk.
        statement.executeBatch();
        conn.commit();
      } finally {
        // Release the statement even when a batch fails.
        statement.close();
      }
    } catch (Throwable failure) {
      // Discard the uncommitted chunk; otherwise re-enabling auto-commit below would commit it.
      try {
        conn.rollback();
      } catch (Exception ignored) {
        // The original failure is more useful to the caller than a secondary rollback error.
      }
      throw failure;
    } finally {
      conn.setAutoCommit(true);
    }
  }
Exemple #3
0
  // crawlertaskdetailview
  /**
   * Loads all rows of {@code yamaloo.crawlertaskdetailview} whose {@code BatchID} equals the given
   * id and converts each row with {@code CrawlerTask.Parse}.
   *
   * @param batchID batch whose tasks are requested
   * @return the parsed tasks in the order the query returned them; empty if no row matched
   * @throws Throwable if a JDBC call or row parsing fails
   */
  public List<CrawlerTask> getBatchTasks(int batchID) throws Throwable {
    PreparedStatement statement =
        conn.prepareStatement(
            "SELECT * FROM `yamaloo`.`crawlertaskdetailview` WHERE `BatchID` = ?");
    try {
      statement.setInt(1, batchID);

      List<CrawlerTask> list = new ArrayList<CrawlerTask>();
      ResultSet rs = statement.executeQuery();

      while (rs.next()) {
        CrawlerTask task = CrawlerTask.Parse(rs);
        list.add(task);
      }

      return list;
    } finally {
      // Runs on the failure path too; closing the statement also closes its current ResultSet.
      statement.close();
    }
  }
Exemple #4
0
  // TODO: The Priority field is not used yet (MySQL makes this painful).
  // Order by RAND() to shuffle URLs from the same site.
  // Otherwise a package may be full of URLs from the same site, which
  // will hurt performance because of the QPS limit.
  /**
   * Assigns up to {@code count} unclaimed tasks to the given crawler and returns the crawler's
   * work package.
   *
   * <p>Step 1 updates at most {@code count} rows of {@code yamaloo.crawlertask} whose status is
   * {@code Statuses.None}, ordered by depth and then randomly, setting their status to
   * {@code Statuses.Crawling} and their {@code CrawlerID} to {@code crawlerID}. Step 2 selects
   * every task that is in {@code Crawling} status for this crawler, joined with its batch to
   * obtain {@code SiteID}, ordered by depth and task id. Note that step 2 is not limited to the
   * rows touched by step 1: tasks that were already in {@code Crawling} status for this crawler
   * are returned as well.
   *
   * @param crawlerID id of the crawler that claims the tasks
   * @param count maximum number of new tasks to claim
   * @return the tasks currently assigned to the crawler; empty if there are none
   * @throws Throwable if a JDBC call or row parsing fails
   */
  public List<CrawlerTask> getCrawlerTaskPackage(int crawlerID, int count) throws Throwable {
    PreparedStatement claimStatement =
        conn.prepareStatement(
            "UPDATE `yamaloo`.`crawlertask`"
                + " SET `Status` = ?,"
                + " `CrawlerID` = ?"
                + " WHERE `Status` = ?"
                + " ORDER BY Depth, RAND()"
                + " LIMIT ?");
    try {
      claimStatement.setString(1, Statuses.Crawling.toString());
      claimStatement.setInt(2, crawlerID);
      claimStatement.setString(3, Statuses.None.toString());
      claimStatement.setInt(4, count);
      claimStatement.execute();
    } finally {
      // Release the statement even when the update fails.
      claimStatement.close();
    }

    // FIFO
    PreparedStatement fetchStatement =
        conn.prepareStatement(
            "SELECT B.SiteID, T.*"
                + " FROM (SELECT * FROM `yamaloo`.`crawlertask` WHERE `Status` = ? AND `CrawlerID` = ?) AS T"
                + " INNER JOIN `yamaloo`.`batch` AS B"
                + " ON T.BatchID = B.BatchID"
                + " ORDER BY T.Depth, CrawlerTaskID; ");
    try {
      fetchStatement.setString(1, Statuses.Crawling.toString());
      fetchStatement.setInt(2, crawlerID);

      List<CrawlerTask> list = new ArrayList<CrawlerTask>();
      ResultSet rs = fetchStatement.executeQuery();

      while (rs.next()) {
        CrawlerTask task = CrawlerTask.Parse(rs);
        list.add(task);
      }

      return list;
    } finally {
      // Closing the statement also closes its current ResultSet.
      fetchStatement.close();
    }
  }
Exemple #5
0
  /**
   * Writes the crawl result fields of the given tasks back to {@code yamaloo.crawlertask}.
   *
   * <p>For every task the row identified by its {@code CrawlerTaskID} gets its status, crawl
   * begin/end time, retry count and content type updated. Updates are queued as a JDBC batch that
   * is executed and committed after every 1000 tasks, followed by one final execute/commit for
   * the remainder. Auto-commit is turned off for the duration of the call and is set back to
   * {@code true} on exit, whether or not the update succeeded. On failure the uncommitted part of
   * the work is rolled back first (chunks that were already committed stay committed) and the
   * original failure is rethrown.
   *
   * @param list tasks whose rows should be updated
   * @throws Throwable if a JDBC call or a task accessor fails
   */
  public void updateCrawlerTaskList(List<CrawlerTask> list) throws Throwable {
    conn.setAutoCommit(false);
    try {
      PreparedStatement statement =
          conn.prepareStatement(
              "UPDATE `yamaloo`.`crawlertask`"
                  + " SET `Status` = ?,"
                  + " `CrawlBeginTime` = ?,"
                  + " `CrawlEndTime` = ?,"
                  + " `RetryCount` = ?,"
                  + " `ContentType` = ?"
                  + " WHERE CrawlerTaskID = ?");
      try {
        int count = 0;
        for (CrawlerTask task : list) {
          statement.setString(1, task.getStatus().toString());
          statement.setTimestamp(2, task.getCrawlBeginTime());
          statement.setTimestamp(3, task.getCrawlEndTime());
          statement.setInt(4, task.getRetryCount());
          statement.setString(5, task.getContentType());
          statement.setInt(6, task.getCrawlerTaskID());

          statement.addBatch();
          count++;

          // Flush in chunks of 1000 rows.
          if (count >= 1000) {
            count = 0;
            statement.executeBatch();
            conn.commit();
          }
        }

        // Flush whatever is left after the last full chunk.
        statement.executeBatch();
        conn.commit();
      } finally {
        // Release the statement even when a batch fails.
        statement.close();
      }
    } catch (Throwable failure) {
      // Discard the uncommitted chunk; otherwise re-enabling auto-commit below would commit it.
      try {
        conn.rollback();
      } catch (Exception ignored) {
        // The original failure is more useful to the caller than a secondary rollback error.
      }
      throw failure;
    } finally {
      conn.setAutoCommit(true);
    }
  }
  /**
   * Takes tasks from the crawler response queue and hands each response to its response processor
   * until the {@code stop} flag is set, an exit task is received, or the outstanding-task counter
   * reaches zero.
   *
   * <p>For every non-exit task the loop: records the task's request as {@code activeRequest},
   * runs the response processor if the task has a response, clears {@code activeRequest},
   * consumes the response entity, bumps the completed-task counter and progress, and reports the
   * task's exception to the crawler if it caused one.
   *
   * @throws InterruptedException if the thread is interrupted while waiting on the response queue
   */
  private void runLoop() throws InterruptedException {
    while (!stop) {
      CrawlerTask task = crawlerResponseQueue.take();
      if (task.isExitTask()) {
        // Pass the sentinel on: a fresh exit task goes to the request queue and the received one
        // is put back on the response queue (presumably so other queue consumers stop as well).
        crawlerRequestQueue.add(CrawlerTask.createExitTask());
        crawlerResponseQueue.add(task);
        return;
      }
      HttpUriRequest req = task.getRequest();
      // Publish the in-flight request under requestLock, the same lock that guards the clearing
      // below and the read/abort in stop(); an unguarded write is not guaranteed to be visible
      // to the thread that wants to abort the request.
      synchronized (requestLock) {
        activeRequest = req;
      }
      try {
        if (task.getResponse() != null) {
          task.getResponseProcessor()
              .processResponse(crawler, req, task.getResponse(), task.getArgument());
        }
      } catch (Exception e) {
        // A failing processor must not kill the loop; log and continue with the next task.
        logger.log(
            Level.WARNING, "Unexpected exception processing crawler request: " + req.getURI(), e);
      } finally {
        synchronized (requestLock) {
          activeRequest = null;
        }
        final HttpEntity entity =
            (task.getResponse() == null) ? (null) : task.getResponse().getRawResponse().getEntity();
        if (entity != null) {
          try {
            // Consume the entity content regardless of how response processing ended.
            EntityUtils.consume(entity);
          } catch (IOException e) {
            logger.log(
                Level.WARNING,
                "I/O exception consuming request entity content for "
                    + req.getURI()
                    + " : "
                    + e.getMessage());
          }
        }
      }

      synchronized (counter) {
        counter.addCompletedTask();
        crawler.updateProgress();
      }
      if (task.causedException()) {
        crawler.notifyException(req, task.getException());
      }

      // Last outstanding task finished: post exit tasks on both queues and leave the loop.
      if (outstandingTasks.decrementAndGet() <= 0) {
        crawlerRequestQueue.add(CrawlerTask.createExitTask());
        crawlerResponseQueue.add(CrawlerTask.createExitTask());
        return;
      }
    }
  }