/** Return a Task that can be sent to a TaskTracker for execution. */
public Task getTaskToRun(String taskTracker) {
  // Create the 'taskid'; do not count the 'killed' tasks against the job!
  TaskAttemptID taskid = null;
  if (nextTaskId < (MAX_TASK_EXECS + maxTaskAttempts + numKilledTasks)) {
    // Make sure that the attempts are unique across restarts
    int attemptId = job.getNumRestarts() * NUM_ATTEMPTS_PER_RESTART + nextTaskId;
    taskid = new TaskAttemptID(id, attemptId);
    ++nextTaskId;
  } else {
    LOG.warn("Exceeded limit of " + (MAX_TASK_EXECS + maxTaskAttempts) +
             " (plus " + numKilledTasks + " killed)" +
             " attempts for the tip '" + getTIPId() + "'");
    return null;
  }

  // Keep track of the last time we started an attempt at this TIP;
  // used to calculate the progress rate of this TIP.
  setDispatchTime(taskid, JobTracker.getClock().getTime());

  if (0 == execStartTime) {
    // assume task starts running now
    execStartTime = JobTracker.getClock().getTime();
  }
  return addRunningTask(taskid, taskTracker);
}
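// Illustrative sketch (not part of the original source): how the restart
// offset above keeps attempt ids unique. Assuming NUM_ATTEMPTS_PER_RESTART
// is 1000 (its value in stock Hadoop; the actual constant governs), the
// attempt created after two JobTracker restarts with nextTaskId == 2 gets
//
//   attemptId = 2 * 1000 + 2 = 2002
//
// while every id handed out before the restarts was below 1000, so ids from
// different JobTracker incarnations can never collide.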
/** The TIP's been ordered kill()ed. */
public void kill() {
  if (isComplete() || failed) {
    return;
  }
  this.failed = true;
  killed = true;
  this.execFinishTime = JobTracker.getClock().getTime();
  recomputeProgress();
}
/**
 * Indicate that one of the taskids in this TaskInProgress
 * has successfully completed!
 */
public void completed(TaskAttemptID taskid) {
  //
  // Record that this taskid is complete
  //
  completedTask(taskid, TaskStatus.State.SUCCEEDED);

  // Note the successful taskid
  setSuccessfulTaskid(taskid);

  //
  // Now that the TIP is complete, the other speculative
  // subtasks will be closed when the owning tasktracker
  // reports in and calls shouldClose() on this object.
  //
  this.completes++;
  this.execFinishTime = JobTracker.getClock().getTime();
  recomputeProgress();
}
/** Initialization common to Map and Reduce */
void init(JobID jobId) {
  this.startTime = JobTracker.getClock().getTime();
  this.id = new TaskID(jobId, isMapTask(), partition);
  this.skipping = startSkipping();
  long speculativeDuration;
  if (isMapTask()) {
    this.speculativeLag = conf.getMapSpeculativeLag();
    speculativeDuration = conf.getMapSpeculativeDuration();
  } else {
    this.speculativeLag = conf.getReduceSpeculativeLag();
    speculativeDuration = conf.getReduceSpeculativeDuration();
  }
  // Speculate only if 1/(1000 * progress_rate) > speculativeDuration,
  // i.e. speculate only if progress_rate < 1/(1000 * speculativeDuration)
  if (speculativeDuration > 0) {
    this.maxProgressRateForSpeculation = 1.0 / (1000.0 * speculativeDuration);
  } else {
    // disable this check for durations <= 0
    this.maxProgressRateForSpeculation = -1.0;
  }
}
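// Worked example (illustrative): with speculativeDuration = 60, read here
// as seconds (an assumption; the config's unit governs), the threshold
// above becomes
//
//   maxProgressRateForSpeculation = 1.0 / (1000.0 * 60) ~= 1.67e-5
//
// progress per millisecond. A task observed below that rate needs more than
// 60,000 ms to go from 0.0 to 1.0, so only tasks expected to run longer
// than speculativeDuration become speculation candidates.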
@SuppressWarnings("deprecation")
public SimulatorJobInProgress(JobID jobid, JobTracker jobtracker,
    JobConf default_conf, JobStory jobStory) {
  super(jobid, jobStory.getJobConf(), jobtracker);
  // jobSetupCleanupNeeded is set to false in the parent constructor,
  // though the default is true
  restartCount = 0;
  jobSetupCleanupNeeded = false;

  this.memoryPerMap = conf.getMemoryForMapTask();
  this.memoryPerReduce = conf.getMemoryForReduceTask();
  this.maxTaskFailuresPerTracker = conf.getMaxTaskFailuresPerTracker();

  this.jobId = jobid;
  String url = "http://" + jobtracker.getJobTrackerMachine() + ":"
      + jobtracker.getInfoPort() + "/jobdetails.jsp?jobid=" + jobid;
  this.jobtracker = jobtracker;
  this.conf = jobStory.getJobConf();
  this.priority = conf.getJobPriority();
  Path jobDir = jobtracker.getSystemDirectoryForJob(jobid);
  this.jobFile = new Path(jobDir, "job.xml");
  this.status = new JobStatus(jobid, 0.0f, 0.0f, 0.0f, 0.0f, JobStatus.PREP,
      priority, conf.getUser());
  this.profile = new JobProfile(jobStory.getUser(), jobid,
      this.jobFile.toString(), url, jobStory.getName(), conf.getQueueName());
  this.startTime = JobTracker.getClock().getTime();
  status.setStartTime(startTime);
  this.resourceEstimator = new ResourceEstimator(this);

  this.numMapTasks = jobStory.getNumberMaps();
  this.numReduceTasks = jobStory.getNumberReduces();
  this.taskCompletionEvents =
      new ArrayList<TaskCompletionEvent>(numMapTasks + numReduceTasks + 10);

  this.mapFailuresPercent = conf.getMaxMapTaskFailuresPercent();
  this.reduceFailuresPercent = conf.getMaxReduceTaskFailuresPercent();

  MetricsContext metricsContext = MetricsUtil.getContext("mapred");
  this.jobMetrics = MetricsUtil.createRecord(metricsContext, "job");
  this.jobMetrics.setTag("user", conf.getUser());
  this.jobMetrics.setTag("sessionId", conf.getSessionId());
  this.jobMetrics.setTag("jobName", conf.getJobName());
  this.jobMetrics.setTag("jobId", jobid.toString());

  this.maxLevel = jobtracker.getNumTaskCacheLevels();
  this.anyCacheLevel = this.maxLevel + 1;
  this.nonLocalMaps = new LinkedList<TaskInProgress>();
  this.nonLocalRunningMaps = new LinkedHashSet<TaskInProgress>();
  this.runningMapCache = new IdentityHashMap<Node, Set<TaskInProgress>>();
  this.nonRunningReduces = new LinkedList<TaskInProgress>();
  this.runningReduces = new LinkedHashSet<TaskInProgress>();
  this.slowTaskThreshold = Math.max(0.0f,
      conf.getFloat("mapred.speculative.execution.slowTaskThreshold", 1.0f));
  this.speculativeCap =
      conf.getFloat("mapred.speculative.execution.speculativeCap", 0.1f);
  this.slowNodeThreshold =
      conf.getFloat("mapred.speculative.execution.slowNodeThreshold", 1.0f);

  this.jobStory = jobStory;
  // this.jobHistory = this.jobtracker.getJobHistory();
}
// For initTasks, update information from the JobStory object
@Override
public synchronized void initTasks() throws IOException {
  boolean loggingEnabled = LOG.isDebugEnabled();
  if (loggingEnabled) {
    LOG.debug("(initTasks@SJIP) Starting Initialization for " + jobId);
  }
  numMapTasks = jobStory.getNumberMaps();
  numReduceTasks = jobStory.getNumberReduces();

  JobHistory.JobInfo.logSubmitted(getJobID(), conf, jobFile.toString(),
      this.startTime, hasRestarted());
  if (loggingEnabled) {
    LOG.debug("(initTasks@SJIP) Logged to job history for " + jobId);
  }

  // checkTaskLimits();
  if (loggingEnabled) {
    LOG.debug("(initTasks@SJIP) Checked task limits for " + jobId);
  }

  // local placeholder path handed to the simulated TaskInProgress objects;
  // shadows the jobFile field
  final String jobFile = "default";
  splits = getRawSplits(jobStory.getInputSplits());
  if (loggingEnabled) {
    LOG.debug("(initTasks@SJIP) Created splits for job = " + jobId
        + " number of splits = " + splits.length);
  }

  // createMapTasks(jobFile, splits);
  numMapTasks = splits.length;
  maps = new TaskInProgress[numMapTasks];
  for (int i = 0; i < numMapTasks; ++i) {
    inputLength += splits[i].getDataLength();
    maps[i] = new TaskInProgress(jobId, jobFile, splits[i], conf, this, i,
        numSlotsPerMap);
  }
  if (numMapTasks > 0) {
    nonRunningMapCache = createCache(splits, maxLevel);
    if (loggingEnabled) {
      LOG.debug("initTasks:numMaps=" + numMapTasks
          + " Size of nonRunningMapCache=" + nonRunningMapCache.size()
          + " for " + jobId);
    }
  }

  // set the launch time
  this.launchTime = JobTracker.getClock().getTime();

  // createReduceTasks(jobFile);
  //
  // Create reduce tasks
  //
  this.reduces = new TaskInProgress[numReduceTasks];
  for (int i = 0; i < numReduceTasks; i++) {
    reduces[i] = new TaskInProgress(jobId, jobFile, numMapTasks, i, conf,
        this, numSlotsPerReduce);
    nonRunningReduces.add(reduces[i]);
  }

  // Calculate the minimum number of maps to be complete before
  // we should start scheduling reduces
  completedMapsForReduceSlowstart = (int) Math.ceil(
      (conf.getFloat("mapred.reduce.slowstart.completed.maps",
          DEFAULT_COMPLETED_MAPS_PERCENT_FOR_REDUCE_SLOWSTART)
       * numMapTasks));

  tasksInited.set(true);
  if (loggingEnabled) {
    LOG.debug("Initializing job, now status = "
        + JobStatus.getJobRunState(getStatus().getRunState()));
  }
  setupComplete();
  if (loggingEnabled) {
    LOG.debug("Initializing job, inited-status = "
        + JobStatus.getJobRunState(getStatus().getRunState()));
  }
}
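// Worked example (illustrative): if the slowstart fraction resolves to 0.05
// (assumed here; DEFAULT_COMPLETED_MAPS_PERCENT_FOR_REDUCE_SLOWSTART or the
// "mapred.reduce.slowstart.completed.maps" setting governs) and
// numMapTasks = 100, then
//
//   completedMapsForReduceSlowstart = ceil(0.05 * 100) = 5,
//
// so no reduce task is scheduled until five maps have completed.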
/** Indicate that one of the taskids in this TaskInProgress has failed. */
public void incompleteSubTask(TaskAttemptID taskid, JobStatus jobStatus) {
  //
  // Note the failure and its location
  //
  TaskStatus status = taskStatuses.get(taskid);
  String trackerName;
  String trackerHostName = null;
  TaskStatus.State taskState = TaskStatus.State.FAILED;
  if (status != null) {
    trackerName = status.getTaskTracker();
    trackerHostName =
        JobInProgressTraits.convertTrackerNameToHostName(trackerName);
    // Check if the user manually KILLED/FAILED this task-attempt...
    Boolean shouldFail = tasksToKill.remove(taskid);
    if (shouldFail != null) {
      if (status.getRunState() == TaskStatus.State.FAILED ||
          status.getRunState() == TaskStatus.State.KILLED) {
        taskState = (shouldFail) ? TaskStatus.State.FAILED
                                 : TaskStatus.State.KILLED;
      } else {
        taskState = (shouldFail) ? TaskStatus.State.FAILED_UNCLEAN
                                 : TaskStatus.State.KILLED_UNCLEAN;
      }
      status.setRunState(taskState);
      addDiagnosticInfo(taskid, "Task has been " + taskState + " by the user");
    }

    taskState = status.getRunState();
    if (taskState != TaskStatus.State.FAILED &&
        taskState != TaskStatus.State.KILLED &&
        taskState != TaskStatus.State.FAILED_UNCLEAN &&
        taskState != TaskStatus.State.KILLED_UNCLEAN) {
      LOG.info("Task '" + taskid + "' running on '" + trackerName +
               "' in state: '" + taskState + "' being failed!");
      status.setRunState(TaskStatus.State.FAILED);
      taskState = TaskStatus.State.FAILED;
    }

    // tasktracker went down and failed time was not reported
    if (0 == status.getFinishTime()) {
      status.setFinishTime(JobTracker.getClock().getTime());
    }
  }

  this.activeTasks.remove(taskid);

  // Since we do not fail completed reduces (whose outputs go to hdfs), we
  // should note this failure only for completed maps, and only if this
  // taskid completed this map. However, if the job is done, there is no
  // need to manipulate completed maps.
  if (this.isMapTask() && !jobSetup && !jobCleanup && isComplete(taskid) &&
      jobStatus.getRunState() != JobStatus.SUCCEEDED) {
    this.completes--;

    // Reset the successfulTaskId since we don't have a SUCCESSFUL task now
    resetSuccessfulTaskid();
  }

  // Note that there can be failures of tasks that are hosted on a machine
  // that has not yet registered with a restarted jobtracker;
  // recalculate the counts only if it's a genuine failure.
  if (tasks.contains(taskid)) {
    if (taskState == TaskStatus.State.FAILED) {
      numTaskFailures++;
      machinesWhereFailed.add(trackerHostName);
      if (maxSkipRecords > 0) {
        // skipping feature enabled
        LOG.debug("TaskInProgress adding " + status.getNextRecordRange());
        failedRanges.add(status.getNextRecordRange());
        skipping = startSkipping();
      }
    } else if (taskState == TaskStatus.State.KILLED) {
      numKilledTasks++;
    }
  }

  if (numTaskFailures >= maxTaskAttempts) {
    LOG.info("TaskInProgress " + getTIPId() + " has failed "
        + numTaskFailures + " times.");
    kill();
  }
}
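// Illustrative note: only attempts that end up FAILED count toward
// numTaskFailures; KILLED attempts increment numKilledTasks, which (see the
// attempt-limit check in getTaskToRun() above) extends the attempt budget
// instead of consuming it. Assuming maxTaskAttempts = 4 (the usual
// mapred.map.max.attempts default), the fourth genuine failure triggers
// kill() on the whole TIP, while any number of user-issued kills never does.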
/**
 * A status message from a client has arrived. It updates the status of a
 * single component-thread-task, which might result in an overall
 * TaskInProgress status update.
 *
 * @return has the task changed its state noticeably?
 */
synchronized boolean updateStatus(TaskStatus status) {
  TaskAttemptID taskid = status.getTaskID();
  String taskTracker = status.getTaskTracker();
  String diagInfo = status.getDiagnosticInfo();
  TaskStatus oldStatus = taskStatuses.get(taskid);
  boolean changed = true;
  if (diagInfo != null && diagInfo.length() > 0) {
    long runTime = status.getRunTime();
    LOG.info("Error from " + taskid + " on " + taskTracker +
             " runTime(msec) " + runTime + ": " + diagInfo);
    addDiagnosticInfo(taskid, diagInfo);
  }

  if (skipping) {
    failedRanges.updateState(status);
  }

  if (oldStatus != null) {
    TaskStatus.State oldState = oldStatus.getRunState();
    TaskStatus.State newState = status.getRunState();

    // We should never receive a duplicate success/failure/killed
    // status update for the same taskid! This is a safety check,
    // and is addressed better at the TaskTracker to ensure this.
    // @see {@link TaskTracker.transmitHeartbeat()}
    if ((newState != TaskStatus.State.RUNNING &&
         newState != TaskStatus.State.COMMIT_PENDING &&
         newState != TaskStatus.State.FAILED_UNCLEAN &&
         newState != TaskStatus.State.KILLED_UNCLEAN &&
         newState != TaskStatus.State.UNASSIGNED) &&
        (oldState == newState)) {
      LOG.warn("Received duplicate status update of '" + newState +
               "' for '" + taskid + "' of TIP '" + getTIPId() + "'" +
               " oldTT=" + oldStatus.getTaskTracker() +
               " while newTT=" + status.getTaskTracker());
      return false;
    }

    // The task is not allowed to move from completed back to running.
    // We have seen out-of-order status messages moving tasks from complete
    // to running. This is a spot fix, but it should be addressed more
    // globally.
    if ((newState == TaskStatus.State.RUNNING ||
         newState == TaskStatus.State.UNASSIGNED) &&
        (oldState == TaskStatus.State.FAILED ||
         oldState == TaskStatus.State.KILLED ||
         oldState == TaskStatus.State.FAILED_UNCLEAN ||
         oldState == TaskStatus.State.KILLED_UNCLEAN ||
         oldState == TaskStatus.State.SUCCEEDED ||
         oldState == TaskStatus.State.COMMIT_PENDING)) {
      return false;
    }

    // Do not accept any status once the task is marked FAILED/KILLED.
    // This is to handle the case of the JobTracker timing out a task
    // due to launch delay, but the TT comes back with any state, or
    // the TT got expired.
    if (oldState == TaskStatus.State.FAILED ||
        oldState == TaskStatus.State.KILLED) {
      tasksToKill.put(taskid, true);
      return false;
    }

    changed = oldState != newState;
  }

  // If the task is a cleanup attempt, do not replace the complete status;
  // update only specific fields. For example, startTime should not be
  // updated, but finishTime has to be updated.
  if (!isCleanupAttempt(taskid)) {
    taskStatuses.put(taskid, status);
    // we don't want to include setup tasks in the task execution stats
    if (!isJobSetupTask() && !isJobCleanupTask() &&
        ((isMapTask() && job.hasSpeculativeMaps()) ||
         (!isMapTask() && job.hasSpeculativeReduces()))) {
      updateProgressRate(JobTracker.getClock().getTime());
    }
  } else {
    taskStatuses.get(taskid).statusUpdate(status.getRunState(),
        status.getProgress(), status.getStateString(), status.getPhase(),
        status.getFinishTime());
  }

  // Recompute progress
  recomputeProgress();
  return changed;
}
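// Illustrative summary (not in the original source) of the guards above,
// read as oldState -> newState:
//
//   SUCCEEDED/FAILED/KILLED -> same state      : dropped as a duplicate report
//   finished or COMMIT_PENDING -> RUNNING
//                                or UNASSIGNED : dropped (no moving backwards)
//   FAILED or KILLED -> anything               : dropped, and the attempt is
//                                                queued in tasksToKill so the
//                                                TaskTracker is told to kill it
//
// Every other update is accepted, recorded in taskStatuses, and folded into
// the TIP's progress via recomputeProgress().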