/** * Indicate that one of the taskids in this already-completed TaskInProgress has successfully * completed; hence we mark this taskid as {@link TaskStatus.State.KILLED}. */ void alreadyCompletedTask(TaskAttemptID taskid) { // 'KILL' the task completedTask(taskid, TaskStatus.State.KILLED); // Note the reason for the task being 'KILLED' addDiagnosticInfo(taskid, "Already completed TIP"); LOG.info("Already complete TIP " + getTIPId() + " has completed task " + taskid); }
/** Kill the given task */ boolean killTask(TaskAttemptID taskId, boolean shouldFail, String diagnosticInfo) { TaskStatus st = taskStatuses.get(taskId); if (st != null && (st.getRunState() == TaskStatus.State.RUNNING || st.getRunState() == TaskStatus.State.COMMIT_PENDING || st.inTaskCleanupPhase() || st.getRunState() == TaskStatus.State.UNASSIGNED) && tasksToKill.put(taskId, shouldFail) == null) { addDiagnosticInfo(taskId, diagnosticInfo); LOG.info(diagnosticInfo); return true; } return false; }
/** Indicate that one of the taskids in this TaskInProgress has failed. */ public void incompleteSubTask(TaskAttemptID taskid, JobStatus jobStatus) { // // Note the failure and its location // TaskStatus status = taskStatuses.get(taskid); String trackerName; String trackerHostName = null; TaskStatus.State taskState = TaskStatus.State.FAILED; if (status != null) { trackerName = status.getTaskTracker(); trackerHostName = JobInProgressTraits.convertTrackerNameToHostName(trackerName); // Check if the user manually KILLED/FAILED this task-attempt... Boolean shouldFail = tasksToKill.remove(taskid); if (shouldFail != null) { if (status.getRunState() == TaskStatus.State.FAILED || status.getRunState() == TaskStatus.State.KILLED) { taskState = (shouldFail) ? TaskStatus.State.FAILED : TaskStatus.State.KILLED; } else { taskState = (shouldFail) ? TaskStatus.State.FAILED_UNCLEAN : TaskStatus.State.KILLED_UNCLEAN; } status.setRunState(taskState); addDiagnosticInfo(taskid, "Task has been " + taskState + " by the user"); } taskState = status.getRunState(); if (taskState != TaskStatus.State.FAILED && taskState != TaskStatus.State.KILLED && taskState != TaskStatus.State.FAILED_UNCLEAN && taskState != TaskStatus.State.KILLED_UNCLEAN) { LOG.info( "Task '" + taskid + "' running on '" + trackerName + "' in state: '" + taskState + "' being failed!"); status.setRunState(TaskStatus.State.FAILED); taskState = TaskStatus.State.FAILED; } // tasktracker went down and failed time was not reported. if (0 == status.getFinishTime()) { status.setFinishTime(JobTracker.getClock().getTime()); } } this.activeTasks.remove(taskid); // Since we do not fail completed reduces (whose outputs go to hdfs), we // should note this failure only for completed maps, only if this taskid; // completed this map. however if the job is done, there is no need to // manipulate completed maps if (this.isMapTask() && !jobSetup && !jobCleanup && isComplete(taskid) && jobStatus.getRunState() != JobStatus.SUCCEEDED) { this.completes--; // Reset the successfulTaskId since we don't have a SUCCESSFUL task now resetSuccessfulTaskid(); } // Note that there can be failures of tasks that are hosted on a machine // that has not yet registered with restarted jobtracker // recalculate the counts only if its a genuine failure if (tasks.contains(taskid)) { if (taskState == TaskStatus.State.FAILED) { numTaskFailures++; machinesWhereFailed.add(trackerHostName); if (maxSkipRecords > 0) { // skipping feature enabled LOG.debug("TaskInProgress adding" + status.getNextRecordRange()); failedRanges.add(status.getNextRecordRange()); skipping = startSkipping(); } } else if (taskState == TaskStatus.State.KILLED) { numKilledTasks++; } } if (numTaskFailures >= maxTaskAttempts) { LOG.info("TaskInProgress " + getTIPId() + " has failed " + numTaskFailures + " times."); kill(); } }
/** * A status message from a client has arrived. It updates the status of a single * component-thread-task, which might result in an overall TaskInProgress status update. * * @return has the task changed its state noticeably? */ synchronized boolean updateStatus(TaskStatus status) { TaskAttemptID taskid = status.getTaskID(); String taskTracker = status.getTaskTracker(); String diagInfo = status.getDiagnosticInfo(); TaskStatus oldStatus = taskStatuses.get(taskid); boolean changed = true; if (diagInfo != null && diagInfo.length() > 0) { long runTime = status.getRunTime(); LOG.info( "Error from " + taskid + " on " + taskTracker + " runTime(msec) " + runTime + ": " + diagInfo); addDiagnosticInfo(taskid, diagInfo); } if (skipping) { failedRanges.updateState(status); } if (oldStatus != null) { TaskStatus.State oldState = oldStatus.getRunState(); TaskStatus.State newState = status.getRunState(); // We should never recieve a duplicate success/failure/killed // status update for the same taskid! This is a safety check, // and is addressed better at the TaskTracker to ensure this. // @see {@link TaskTracker.transmitHeartbeat()} if ((newState != TaskStatus.State.RUNNING && newState != TaskStatus.State.COMMIT_PENDING && newState != TaskStatus.State.FAILED_UNCLEAN && newState != TaskStatus.State.KILLED_UNCLEAN && newState != TaskStatus.State.UNASSIGNED) && (oldState == newState)) { LOG.warn( "Recieved duplicate status update of '" + newState + "' for '" + taskid + "' of TIP '" + getTIPId() + "'" + "oldTT=" + oldStatus.getTaskTracker() + " while newTT=" + status.getTaskTracker()); return false; } // The task is not allowed to move from completed back to running. // We have seen out of order status messagesmoving tasks from complete // to running. This is a spot fix, but it should be addressed more // globally. if ((newState == TaskStatus.State.RUNNING || newState == TaskStatus.State.UNASSIGNED) && (oldState == TaskStatus.State.FAILED || oldState == TaskStatus.State.KILLED || oldState == TaskStatus.State.FAILED_UNCLEAN || oldState == TaskStatus.State.KILLED_UNCLEAN || oldState == TaskStatus.State.SUCCEEDED || oldState == TaskStatus.State.COMMIT_PENDING)) { return false; } // Do not accept any status once the task is marked FAILED/KILLED // This is to handle the case of the JobTracker timing out a task // due to launch delay, but the TT comes back with any state or // TT got expired if (oldState == TaskStatus.State.FAILED || oldState == TaskStatus.State.KILLED) { tasksToKill.put(taskid, true); return false; } changed = oldState != newState; } // if task is a cleanup attempt, do not replace the complete status, // update only specific fields. // For example, startTime should not be updated, // but finishTime has to be updated. if (!isCleanupAttempt(taskid)) { taskStatuses.put(taskid, status); // we don't want to include setup tasks in the task execution stats if (!isJobSetupTask() && !isJobCleanupTask() && ((isMapTask() && job.hasSpeculativeMaps()) || (!isMapTask() && job.hasSpeculativeReduces()))) { updateProgressRate(JobTracker.getClock().getTime()); } } else { taskStatuses .get(taskid) .statusUpdate( status.getRunState(), status.getProgress(), status.getStateString(), status.getPhase(), status.getFinishTime()); } // Recompute progress recomputeProgress(); return changed; }