// crawlertaskdetailview public List<CrawlerTask> getBatchTasks(int batchID) throws Throwable { PreparedStatement statement = conn.prepareStatement( "SELECT * FROM `yamaloo`.`crawlertaskdetailview` WHERE `BatchID` = ?"); statement.setInt(1, batchID); List<CrawlerTask> list = new ArrayList<CrawlerTask>(); ResultSet rs = statement.executeQuery(); while (rs.next()) { CrawlerTask task = CrawlerTask.Parse(rs); list.add(task); } statement.close(); return list; }
// TODO: Priority field is not used yet, f**k mysql // Order by RAND() to shuttle Urls from same site // Otherwise, you may have a package full of Urls from the same site, which // will slow the performance due to QPS limit public List<CrawlerTask> getCrawlerTaskPackage(int crawlerID, int count) throws Throwable { PreparedStatement statement = conn.prepareStatement( "UPDATE `yamaloo`.`crawlertask`" + " SET `Status` = ?," + " `CrawlerID` = ?" + " WHERE `Status` = ?" + " ORDER BY Depth, RAND()" + " LIMIT ?"); statement.setString(1, Statuses.Crawling.toString()); statement.setInt(2, crawlerID); statement.setString(3, Statuses.None.toString()); statement.setInt(4, count); statement.execute(); statement.close(); // FIFO statement = conn.prepareStatement( "SELECT B.SiteID, T.*" + " FROM (SELECT * FROM `yamaloo`.`crawlertask` WHERE `Status` = ? AND `CrawlerID` = ?) AS T" + " INNER JOIN `yamaloo`.`batch` AS B" + " ON T.BatchID = B.BatchID" + " ORDER BY T.Depth, CrawlerTaskID; "); statement.setString(1, Statuses.Crawling.toString()); statement.setInt(2, crawlerID); List<CrawlerTask> list = new ArrayList<CrawlerTask>(); ResultSet rs = statement.executeQuery(); while (rs.next()) { CrawlerTask task = CrawlerTask.Parse(rs); list.add(task); } statement.close(); return list; }