/** * Returns whether a component task-thread should be closed because the containing JobInProgress * has completed or the task is killed by the user */ public boolean shouldClose(TaskAttemptID taskid) { /** * If the task hasn't been closed yet, and it belongs to a completed TaskInProgress close it. * * <p>However, for completed map tasks we do not close the task which actually was the one * responsible for _completing_ the TaskInProgress. */ if (tasksReportedClosed.contains(taskid)) { if (tasksToKill.keySet().contains(taskid)) return true; else return false; } boolean close = false; TaskStatus ts = taskStatuses.get(taskid); if ((ts != null) && ((this.failed) || ((job.getStatus().getRunState() != JobStatus.RUNNING && (job.getStatus().getRunState() != JobStatus.PREP))))) { tasksReportedClosed.add(taskid); close = true; } else if ((completes > 0) && // isComplete() is synchronized! !(isMapTask() && !jobSetup && !jobCleanup && isComplete(taskid))) { tasksReportedClosed.add(taskid); close = true; } else if (isCommitPending(taskid) && !shouldCommit(taskid)) { tasksReportedClosed.add(taskid); close = true; } else { close = tasksToKill.keySet().contains(taskid); } return close; }
/** Return a Task that can be sent to a TaskTracker for execution. */ public Task getTaskToRun(String taskTracker) { // Create the 'taskid'; do not count the 'killed' tasks against the job! TaskAttemptID taskid = null; if (nextTaskId < (MAX_TASK_EXECS + maxTaskAttempts + numKilledTasks)) { // Make sure that the attempts are unqiue across restarts int attemptId = job.getNumRestarts() * NUM_ATTEMPTS_PER_RESTART + nextTaskId; taskid = new TaskAttemptID(id, attemptId); ++nextTaskId; } else { LOG.warn( "Exceeded limit of " + (MAX_TASK_EXECS + maxTaskAttempts) + " (plus " + numKilledTasks + " killed)" + " attempts for the tip '" + getTIPId() + "'"); return null; } // keep track of the last time we started an attempt at this TIP // used to calculate the progress rate of this TIP setDispatchTime(taskid, JobTracker.getClock().getTime()); if (0 == execStartTime) { // assume task starts running now execStartTime = JobTracker.getClock().getTime(); } return addRunningTask(taskid, taskTracker); }
/**
 * Recomputes this TIP's progress rate as the best rate among its attempts and pushes the change
 * into the job-wide running-task statistics.
 *
 * <p>Only attempts that are RUNNING, SUCCEEDED or COMMIT_PENDING are considered. An attempt's rate
 * is its progress divided by the time since it was dispatched (at least 1).
 *
 * <p>This method is not synchronized; the existing contract is that the caller already holds the
 * JobInProgress lock (it is called from TIP.updateStatus and JIP.refreshCandidate*).
 *
 * @param currentTime the time to measure elapsed dispatch time against
 */
public void updateProgressRate(long currentTime) {
  double fastestRate = 0;

  for (TaskStatus attempt : taskStatuses.values()) {
    TaskStatus.State state = attempt.getRunState();
    boolean contributes =
        state == TaskStatus.State.RUNNING
            || state == TaskStatus.State.SUCCEEDED
            || state == TaskStatus.State.COMMIT_PENDING;
    if (!contributes) {
      continue;
    }

    // Clamp the elapsed time to 1 so a just-dispatched attempt never divides by zero.
    double attemptRate =
        attempt.getProgress() / Math.max(1, currentTime - getDispatchTime(attempt.getTaskID()));
    if (attemptRate > fastestRate) {
      fastestRate = attemptRate;
    }
  }

  // Swap the old rate for the new one in the statistics shared by sibling tasks.
  job.getRunningTaskStatistics(isMapTask()).updateStatistics(progressRate, fastestRate);
  progressRate = fastestRate;
}
/** * Can this task be speculated? This requires that it isn't done or almost done and that it isn't * already being speculatively executed. * * <p>Added for use by queue scheduling algorithms. * * @param currentTime */ boolean canBeSpeculated(long currentTime) { if (skipping || !isRunnable() || !isRunning() || completes != 0 || isOnlyCommitPending() || activeTasks.size() > MAX_TASK_EXECS) { return false; } if (isSpeculativeForced()) { return true; } // no speculation for first few seconds if (currentTime - lastDispatchTime < speculativeLag) { return false; } DataStatistics taskStats = job.getRunningTaskStatistics(isMapTask()); if (LOG.isDebugEnabled()) { LOG.debug( "activeTasks.size(): " + activeTasks.size() + " " + activeTasks.firstKey() + " task's progressrate: " + progressRate + " taskStats : " + taskStats); } // if the task is making progress fast enough to complete within // the acceptable duration allowed for each task - do not speculate if ((maxProgressRateForSpeculation > 0) && (progressRate > maxProgressRateForSpeculation)) { return false; } if (isMapTask() ? job.shouldSpeculateAllRemainingMaps() : job.shouldSpeculateAllRemainingReduces()) { if (LOG.isDebugEnabled()) { LOG.debug("Speculate " + getTIPId() + " because the job is almost finished"); } return true; } // Find if task should be speculated based on standard deviation // the max difference allowed between the tasks's progress rate // and the mean progress rate of sibling tasks. double maxDiff = (taskStats.std() == 0 ? taskStats.mean() / 3 : job.getSlowTaskThreshold() * taskStats.std()); // if stddev > mean - we are stuck. cap the max difference at a // more meaningful number. maxDiff = Math.min(maxDiff, taskStats.mean() * job.getStddevMeanRatioMax()); return (taskStats.mean() - progressRate > maxDiff); }
/** Indicate that one of the taskids in this TaskInProgress has failed. */ public void incompleteSubTask(TaskAttemptID taskid, JobStatus jobStatus) { // // Note the failure and its location // TaskStatus status = taskStatuses.get(taskid); String trackerName; String trackerHostName = null; TaskStatus.State taskState = TaskStatus.State.FAILED; if (status != null) { trackerName = status.getTaskTracker(); trackerHostName = JobInProgressTraits.convertTrackerNameToHostName(trackerName); // Check if the user manually KILLED/FAILED this task-attempt... Boolean shouldFail = tasksToKill.remove(taskid); if (shouldFail != null) { if (status.getRunState() == TaskStatus.State.FAILED || status.getRunState() == TaskStatus.State.KILLED) { taskState = (shouldFail) ? TaskStatus.State.FAILED : TaskStatus.State.KILLED; } else { taskState = (shouldFail) ? TaskStatus.State.FAILED_UNCLEAN : TaskStatus.State.KILLED_UNCLEAN; } status.setRunState(taskState); addDiagnosticInfo(taskid, "Task has been " + taskState + " by the user"); } taskState = status.getRunState(); if (taskState != TaskStatus.State.FAILED && taskState != TaskStatus.State.KILLED && taskState != TaskStatus.State.FAILED_UNCLEAN && taskState != TaskStatus.State.KILLED_UNCLEAN) { LOG.info( "Task '" + taskid + "' running on '" + trackerName + "' in state: '" + taskState + "' being failed!"); status.setRunState(TaskStatus.State.FAILED); taskState = TaskStatus.State.FAILED; } // tasktracker went down and failed time was not reported. if (0 == status.getFinishTime()) { status.setFinishTime(JobTracker.getClock().getTime()); } } this.activeTasks.remove(taskid); // Since we do not fail completed reduces (whose outputs go to hdfs), we // should note this failure only for completed maps, only if this taskid; // completed this map. 
however if the job is done, there is no need to // manipulate completed maps if (this.isMapTask() && !jobSetup && !jobCleanup && isComplete(taskid) && jobStatus.getRunState() != JobStatus.SUCCEEDED) { this.completes--; // Reset the successfulTaskId since we don't have a SUCCESSFUL task now resetSuccessfulTaskid(); } // Note that there can be failures of tasks that are hosted on a machine // that has not yet registered with restarted jobtracker // recalculate the counts only if its a genuine failure if (tasks.contains(taskid)) { if (taskState == TaskStatus.State.FAILED) { numTaskFailures++; machinesWhereFailed.add(trackerHostName); if (maxSkipRecords > 0) { // skipping feature enabled LOG.debug("TaskInProgress adding" + status.getNextRecordRange()); failedRanges.add(status.getNextRecordRange()); skipping = startSkipping(); } } else if (taskState == TaskStatus.State.KILLED) { numKilledTasks++; } } if (numTaskFailures >= maxTaskAttempts) { LOG.info("TaskInProgress " + getTIPId() + " has failed " + numTaskFailures + " times."); kill(); } }
/** * A status message from a client has arrived. It updates the status of a single * component-thread-task, which might result in an overall TaskInProgress status update. * * @return has the task changed its state noticeably? */ synchronized boolean updateStatus(TaskStatus status) { TaskAttemptID taskid = status.getTaskID(); String taskTracker = status.getTaskTracker(); String diagInfo = status.getDiagnosticInfo(); TaskStatus oldStatus = taskStatuses.get(taskid); boolean changed = true; if (diagInfo != null && diagInfo.length() > 0) { long runTime = status.getRunTime(); LOG.info( "Error from " + taskid + " on " + taskTracker + " runTime(msec) " + runTime + ": " + diagInfo); addDiagnosticInfo(taskid, diagInfo); } if (skipping) { failedRanges.updateState(status); } if (oldStatus != null) { TaskStatus.State oldState = oldStatus.getRunState(); TaskStatus.State newState = status.getRunState(); // We should never recieve a duplicate success/failure/killed // status update for the same taskid! This is a safety check, // and is addressed better at the TaskTracker to ensure this. // @see {@link TaskTracker.transmitHeartbeat()} if ((newState != TaskStatus.State.RUNNING && newState != TaskStatus.State.COMMIT_PENDING && newState != TaskStatus.State.FAILED_UNCLEAN && newState != TaskStatus.State.KILLED_UNCLEAN && newState != TaskStatus.State.UNASSIGNED) && (oldState == newState)) { LOG.warn( "Recieved duplicate status update of '" + newState + "' for '" + taskid + "' of TIP '" + getTIPId() + "'" + "oldTT=" + oldStatus.getTaskTracker() + " while newTT=" + status.getTaskTracker()); return false; } // The task is not allowed to move from completed back to running. // We have seen out of order status messagesmoving tasks from complete // to running. This is a spot fix, but it should be addressed more // globally. 
if ((newState == TaskStatus.State.RUNNING || newState == TaskStatus.State.UNASSIGNED) && (oldState == TaskStatus.State.FAILED || oldState == TaskStatus.State.KILLED || oldState == TaskStatus.State.FAILED_UNCLEAN || oldState == TaskStatus.State.KILLED_UNCLEAN || oldState == TaskStatus.State.SUCCEEDED || oldState == TaskStatus.State.COMMIT_PENDING)) { return false; } // Do not accept any status once the task is marked FAILED/KILLED // This is to handle the case of the JobTracker timing out a task // due to launch delay, but the TT comes back with any state or // TT got expired if (oldState == TaskStatus.State.FAILED || oldState == TaskStatus.State.KILLED) { tasksToKill.put(taskid, true); return false; } changed = oldState != newState; } // if task is a cleanup attempt, do not replace the complete status, // update only specific fields. // For example, startTime should not be updated, // but finishTime has to be updated. if (!isCleanupAttempt(taskid)) { taskStatuses.put(taskid, status); // we don't want to include setup tasks in the task execution stats if (!isJobSetupTask() && !isJobCleanupTask() && ((isMapTask() && job.hasSpeculativeMaps()) || (!isMapTask() && job.hasSpeculativeReduces()))) { updateProgressRate(JobTracker.getClock().getTime()); } } else { taskStatuses .get(taskid) .statusUpdate( status.getRunState(), status.getProgress(), status.getStateString(), status.getPhase(), status.getFinishTime()); } // Recompute progress recomputeProgress(); return changed; }
/** Adds a previously running task to this tip. This is used in case of jobtracker restarts. */ public Task addRunningTask(TaskAttemptID taskid, String taskTracker, boolean taskCleanup) { // 1 slot is enough for taskCleanup task int numSlotsNeeded = taskCleanup ? 1 : numSlotsRequired; // create the task Task t = null; if (isMapTask()) { LOG.debug( "attempt " + numTaskFailures + " sending skippedRecords " + failedRanges.getIndicesCount()); String splitClass = null; BytesWritable split; if (!jobSetup && !jobCleanup) { splitClass = rawSplit.getClassName(); split = rawSplit.getBytes(); } else { split = new BytesWritable(); } t = new MapTask(jobFile, taskid, partition, splitClass, split, numSlotsNeeded, job.getUser()); } else { t = new ReduceTask(jobFile, taskid, partition, numMaps, numSlotsNeeded, job.getUser()); } if (jobCleanup) { t.setJobCleanupTask(); } if (jobSetup) { t.setJobSetupTask(); } if (taskCleanup) { t.setTaskCleanupTask(); t.setState(taskStatuses.get(taskid).getRunState()); cleanupTasks.put(taskid, taskTracker); } t.setConf(conf); LOG.debug("Launching task with skipRanges:" + failedRanges.getSkipRanges()); t.setSkipRanges(failedRanges.getSkipRanges()); t.setSkipping(skipping); if (failedRanges.isTestAttempt()) { t.setWriteSkipRecs(false); } if (activeTasks.size() >= 1) { speculativeTaskId = taskid; } else { speculativeTaskId = null; } activeTasks.put(taskid, taskTracker); tasks.add(taskid); // Ask JobTracker to note that the task exists // jobtracker.createTaskEntry(taskid, taskTracker, this); /* // code to find call paths to createTaskEntry StackTraceElement[] stackTraceElements = Thread.currentThread().getStackTrace(); boolean found = false; for (StackTraceElement s: stackTraceElements) { if (s.getMethodName().indexOf("heartbeat") != -1 || s.getMethodName().indexOf("findTask") != -1 || s.getMethodName().indexOf("createAndAddAttempt") != -1 || s.getMethodName().indexOf("processTaskAttempt") != -1) { found = true; break; } } if (!found) { 
RuntimeException e = new RuntimeException ("calling addRunningTask from outside heartbeat"); LOG.info(StringUtils.stringifyException(e)); throw (e); } */ // check and set the first attempt if (firstTaskId == null) { firstTaskId = taskid; } return t; }