void stop() {
    stop = true;
    if (!crawlerResponseQueue.offer(CrawlerTask.createExitTask())) {
        logger.warning("Failed to add STOP sentinel to crawler response queue");
    }
    synchronized (requestLock) {
        if (activeRequest != null) {
            activeRequest.abort();
        }
    }
}
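// A minimal, standalone sketch of the stop() pattern above: the fetching thread tracks the
// request it is currently executing under a lock, and another thread can abort it mid-flight.
// Assumes Apache HttpClient 4.x; the class, method, and URL names here are illustrative
// placeholders, not part of the crawler code.
import java.io.IOException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

class AbortSketch {
    private final Object requestLock = new Object();
    private HttpGet activeRequest;

    void fetch(String url) {
        HttpGet get = new HttpGet(url);
        synchronized (requestLock) {
            activeRequest = get;
        }
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(get)) {
            // ... read the response here ...
        } catch (IOException e) {
            // An abort() from another thread surfaces here as an I/O failure.
        } finally {
            synchronized (requestLock) {
                activeRequest = null;
            }
        }
    }

    void stop() {
        synchronized (requestLock) {
            if (activeRequest != null) {
                activeRequest.abort();
            }
        }
    }
}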
public void insertCrawlerTaskList(Iterable<CrawlerTask> list) throws Throwable {
    conn.setAutoCommit(false);
    // INSERT ... SELECT ... WHERE NOT EXISTS skips tasks whose (BatchID, UrlHash) pair is
    // already present in the table.
    PreparedStatement statement = conn.prepareStatement(
            "INSERT INTO `yamaloo`.`crawlertask`"
            + " (`BatchID`, `UrlHash`, `Url`, `Depth`, `ParentTaskID`, `Status`, `CreateTime`)"
            + " SELECT ?, ?, ?, ?, ?, ?, ? FROM dual"
            + " WHERE NOT EXISTS"
            + " (SELECT * FROM `yamaloo`.`crawlertask` WHERE BatchID = ? AND UrlHash = ?);");
    int count = 0;
    for (CrawlerTask task : list) {
        statement.setInt(1, task.getBatchID());
        statement.setString(2, task.getUrlHash());
        statement.setString(3, task.getUrl().toString());
        statement.setInt(4, task.getDepth());
        statement.setInt(5, task.getParentTaskID());
        statement.setString(6, task.getStatus().toString());
        statement.setTimestamp(7, task.getCreateTime());
        statement.setInt(8, task.getBatchID());
        statement.setString(9, task.getUrlHash());
        statement.addBatch();
        count++;
        // Flush and commit every 1000 rows to keep the batch size bounded.
        if (count >= 1000) {
            count = 0;
            statement.executeBatch();
            conn.commit();
        }
    }
    statement.executeBatch();
    conn.commit();
    statement.close();
    conn.setAutoCommit(true);
}
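// A minimal, standalone sketch of the same flush-every-N JDBC batching pattern used by
// insertCrawlerTaskList and updateCrawlerTaskList. The table and column names are
// illustrative placeholders, not part of the crawler's schema.
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.util.List;

class BatchInsertSketch {
    static void insertAll(Connection conn, List<String> urls) throws Exception {
        conn.setAutoCommit(false);
        PreparedStatement ps = conn.prepareStatement("INSERT INTO example_table (url) VALUES (?)");
        int count = 0;
        for (String url : urls) {
            ps.setString(1, url);
            ps.addBatch();
            // Flush and commit every 1000 rows so the pending batch never grows unbounded.
            if (++count % 1000 == 0) {
                ps.executeBatch();
                conn.commit();
            }
        }
        ps.executeBatch();   // flush the final partial batch
        conn.commit();
        ps.close();
        conn.setAutoCommit(true);
    }
}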
// Reads from the crawlertaskdetailview view.
public List<CrawlerTask> getBatchTasks(int batchID) throws Throwable {
    PreparedStatement statement = conn.prepareStatement(
            "SELECT * FROM `yamaloo`.`crawlertaskdetailview` WHERE `BatchID` = ?");
    statement.setInt(1, batchID);
    List<CrawlerTask> list = new ArrayList<CrawlerTask>();
    ResultSet rs = statement.executeQuery();
    while (rs.next()) {
        CrawlerTask task = CrawlerTask.Parse(rs);
        list.add(task);
    }
    statement.close();
    return list;
}
// TODO: the Priority field is not used yet.
// ORDER BY RAND() shuffles URLs from the same site; otherwise a package could be filled
// with URLs from a single site, which slows crawling because of the per-site QPS limit.
public List<CrawlerTask> getCrawlerTaskPackage(int crawlerID, int count) throws Throwable {
    // Claim up to `count` pending tasks by stamping them with this crawler's ID.
    PreparedStatement statement = conn.prepareStatement(
            "UPDATE `yamaloo`.`crawlertask`"
            + " SET `Status` = ?,"
            + " `CrawlerID` = ?"
            + " WHERE `Status` = ?"
            + " ORDER BY Depth, RAND()"
            + " LIMIT ?");
    statement.setString(1, Statuses.Crawling.toString());
    statement.setInt(2, crawlerID);
    statement.setString(3, Statuses.None.toString());
    statement.setInt(4, count);
    statement.execute();
    statement.close();

    // Fetch the claimed tasks back in FIFO order (by depth, then task ID).
    statement = conn.prepareStatement(
            "SELECT B.SiteID, T.*"
            + " FROM (SELECT * FROM `yamaloo`.`crawlertask` WHERE `Status` = ? AND `CrawlerID` = ?) AS T"
            + " INNER JOIN `yamaloo`.`batch` AS B"
            + " ON T.BatchID = B.BatchID"
            + " ORDER BY T.Depth, CrawlerTaskID;");
    statement.setString(1, Statuses.Crawling.toString());
    statement.setInt(2, crawlerID);
    List<CrawlerTask> list = new ArrayList<CrawlerTask>();
    ResultSet rs = statement.executeQuery();
    while (rs.next()) {
        CrawlerTask task = CrawlerTask.Parse(rs);
        list.add(task);
    }
    statement.close();
    return list;
}
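// A minimal, standalone sketch of the claim-then-fetch pattern used by getCrawlerTaskPackage:
// an UPDATE ... LIMIT stamps a batch of pending rows with this worker's ID, and the follow-up
// SELECT returns only the rows this worker just claimed, so two crawlers never receive the
// same task. Table and column names here are illustrative placeholders.
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.util.ArrayList;
import java.util.List;

class ClaimThenFetchSketch {
    static List<String> claim(Connection conn, int workerId, int limit) throws Exception {
        PreparedStatement claim = conn.prepareStatement(
                "UPDATE task SET status = 'CLAIMED', worker_id = ? WHERE status = 'PENDING' LIMIT ?");
        claim.setInt(1, workerId);
        claim.setInt(2, limit);
        claim.executeUpdate();
        claim.close();

        PreparedStatement fetch = conn.prepareStatement(
                "SELECT url FROM task WHERE status = 'CLAIMED' AND worker_id = ?");
        fetch.setInt(1, workerId);
        ResultSet rs = fetch.executeQuery();
        List<String> urls = new ArrayList<>();
        while (rs.next()) {
            urls.add(rs.getString("url"));
        }
        fetch.close();
        return urls;
    }
}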
public void updateCrawlerTaskList(List<CrawlerTask> list) throws Throwable {
    conn.setAutoCommit(false);
    PreparedStatement statement = conn.prepareStatement(
            "UPDATE `yamaloo`.`crawlertask`"
            + " SET `Status` = ?,"
            + " `CrawlBeginTime` = ?,"
            + " `CrawlEndTime` = ?,"
            + " `RetryCount` = ?,"
            + " `ContentType` = ?"
            + " WHERE CrawlerTaskID = ?");
    int count = 0;
    for (CrawlerTask task : list) {
        statement.setString(1, task.getStatus().toString());
        statement.setTimestamp(2, task.getCrawlBeginTime());
        statement.setTimestamp(3, task.getCrawlEndTime());
        statement.setInt(4, task.getRetryCount());
        statement.setString(5, task.getContentType());
        statement.setInt(6, task.getCrawlerTaskID());
        statement.addBatch();
        count++;
        if (count >= 1000) {
            count = 0;
            statement.executeBatch();
            conn.commit();
        }
    }
    statement.executeBatch();
    conn.commit();
    statement.close();
    conn.setAutoCommit(true);
}
private void runLoop() throws InterruptedException {
    while (!stop) {
        CrawlerTask task = crawlerResponseQueue.take();
        if (task.isExitTask()) {
            // Propagate the exit sentinel so the request loop and other response workers
            // also shut down.
            crawlerRequestQueue.add(CrawlerTask.createExitTask());
            crawlerResponseQueue.add(task);
            return;
        }
        HttpUriRequest req = task.getRequest();
        synchronized (requestLock) {
            activeRequest = req;
        }
        try {
            if (task.getResponse() != null) {
                task.getResponseProcessor()
                        .processResponse(crawler, req, task.getResponse(), task.getArgument());
            }
        } catch (Exception e) {
            logger.log(Level.WARNING,
                    "Unexpected exception processing crawler request: " + req.getURI(), e);
        } finally {
            synchronized (requestLock) {
                activeRequest = null;
            }
            // Fully consume the response entity so the underlying connection can be reused.
            final HttpEntity entity = (task.getResponse() == null)
                    ? null
                    : task.getResponse().getRawResponse().getEntity();
            if (entity != null) {
                try {
                    EntityUtils.consume(entity);
                } catch (IOException e) {
                    logger.log(Level.WARNING,
                            "I/O exception consuming request entity content for "
                                    + req.getURI() + " : " + e.getMessage());
                }
            }
        }
        synchronized (counter) {
            counter.addCompletedTask();
            crawler.updateProgress();
        }
        if (task.causedException()) {
            crawler.notifyException(req, task.getException());
        }
        if (outstandingTasks.decrementAndGet() <= 0) {
            // All outstanding tasks are done: send exit sentinels to both queues and finish.
            crawlerRequestQueue.add(CrawlerTask.createExitTask());
            crawlerResponseQueue.add(CrawlerTask.createExitTask());
            return;
        }
    }
}
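// A minimal, standalone sketch of the "poison pill" shutdown used by runLoop: when a worker
// takes the exit sentinel it puts it back on the queue so every other worker blocked on the
// same queue also sees it and exits. The names below (POISON, Worker) are illustrative, not
// from the crawler code.
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

class PoisonPillSketch {
    private static final String POISON = "<exit>";
    private static final BlockingQueue<String> queue = new LinkedBlockingQueue<>();

    static class Worker implements Runnable {
        public void run() {
            try {
                while (true) {
                    String item = queue.take();
                    if (item == POISON) {
                        queue.add(POISON);   // re-enqueue so the next worker also stops
                        return;
                    }
                    System.out.println(Thread.currentThread().getName() + " handled " + item);
                }
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        }
    }

    public static void main(String[] args) {
        for (int i = 0; i < 3; i++) {
            new Thread(new Worker()).start();
        }
        for (int i = 0; i < 10; i++) {
            queue.add("task-" + i);
        }
        queue.add(POISON);   // one sentinel is enough; workers propagate it
    }
}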