Example #1
0
  /**
   * Counts the rows of {@code yamaloo.crawlertask} that belong to the given batch and whose
   * {@code Status} equals either {@code Statuses.None} or {@code Statuses.Crawling}.
   *
   * @param batchID value bound to the {@code BatchID} filter of the query
   * @return the {@code COUNT(*)} value, or {@code -1} if the query unexpectedly yields no row
   * @throws Throwable if preparing, executing, or reading the query fails
   */
  public int getCrawlingTaskCount(int batchID) throws Throwable {
    PreparedStatement statement =
        conn.prepareStatement(
            "SELECT COUNT(*) FROM `yamaloo`.`crawlertask` WHERE BatchID = ? AND Status IN (?, ?)");
    try {
      statement.setInt(1, batchID);
      statement.setString(2, Statuses.None.toString());
      statement.setString(3, Statuses.Crawling.toString());

      ResultSet rs = statement.executeQuery();
      try {
        // COUNT(*) normally produces exactly one row; -1 signals the unexpected empty result
        // instead of reading from a cursor that was never positioned on a row.
        return rs.next() ? rs.getInt(1) : -1;
      } finally {
        rs.close();
      }
    } finally {
      // Release the statement even when binding, execution, or reading throws.
      statement.close();
    }
  }
Example #2
0
  // TODO: Priority field is not used yet, f**k mysql
  // Order by RAND() to shuffle URLs from the same site.
  // Otherwise a package may be filled with URLs from a single site, which
  // hurts performance because of the per-site QPS limit.
  /**
   * Assigns up to {@code count} unassigned tasks to a crawler and returns that crawler's tasks.
   *
   * <p>Works in two steps:
   *
   * <ol>
   *   <li>An {@code UPDATE} sets {@code Status} to {@code Statuses.Crawling} and {@code CrawlerID}
   *       to {@code crawlerID} on at most {@code count} rows whose {@code Status} is
   *       {@code Statuses.None}, picked in {@code Depth, RAND()} order.
   *   <li>A {@code SELECT} loads every row with {@code Status = Crawling} and the given
   *       {@code CrawlerID}, joined with {@code yamaloo.batch} to obtain {@code SiteID}, ordered
   *       by {@code Depth, CrawlerTaskID}.
   * </ol>
   *
   * <p>Because the second step filters only on status and crawler id, the returned list also
   * contains rows assigned to this crawler by earlier calls that are still marked
   * {@code Crawling}, so it may hold more than {@code count} entries. The two statements are not
   * wrapped in an explicit transaction here.
   *
   * @param crawlerID id written to the claimed rows and used to select the result
   * @param count maximum number of rows the {@code UPDATE} may claim
   * @return the tasks parsed from the result set via {@code CrawlerTask.Parse}; empty if none
   * @throws Throwable if any database operation or row parsing fails
   */
  public List<CrawlerTask> getCrawlerTaskPackage(int crawlerID, int count) throws Throwable {
    // Step 1: claim tasks for this crawler.
    PreparedStatement claim =
        conn.prepareStatement(
            "UPDATE `yamaloo`.`crawlertask`"
                + " SET `Status` = ?,"
                + " `CrawlerID` = ?"
                + " WHERE `Status` = ?"
                + " ORDER BY Depth, RAND()"
                + " LIMIT ?");
    try {
      claim.setString(1, Statuses.Crawling.toString());
      claim.setInt(2, crawlerID);
      claim.setString(3, Statuses.None.toString());
      claim.setInt(4, count);
      claim.executeUpdate();
    } finally {
      // Release the statement even when binding or execution throws.
      claim.close();
    }

    // Step 2: read back the crawler's tasks, FIFO within each depth.
    PreparedStatement query =
        conn.prepareStatement(
            "SELECT B.SiteID, T.*"
                + " FROM (SELECT * FROM `yamaloo`.`crawlertask` WHERE `Status` = ? AND `CrawlerID` = ?) AS T"
                + " INNER JOIN `yamaloo`.`batch` AS B"
                + " ON T.BatchID = B.BatchID"
                + " ORDER BY T.Depth, CrawlerTaskID; ");
    try {
      query.setString(1, Statuses.Crawling.toString());
      query.setInt(2, crawlerID);

      List<CrawlerTask> list = new ArrayList<CrawlerTask>();
      ResultSet rs = query.executeQuery();
      try {
        while (rs.next()) {
          list.add(CrawlerTask.Parse(rs));
        }
      } finally {
        rs.close();
      }
      return list;
    } finally {
      query.close();
    }
  }