Beispiel #1
0
  /** Return a Task that can be sent to a TaskTracker for execution. */
  public Task getTaskToRun(String taskTracker) {

    // Create the 'taskid'; do not count the 'killed' tasks against the job!
    TaskAttemptID taskid = null;
    if (nextTaskId < (MAX_TASK_EXECS + maxTaskAttempts + numKilledTasks)) {
      // Make sure that the attempts are unqiue across restarts
      int attemptId = job.getNumRestarts() * NUM_ATTEMPTS_PER_RESTART + nextTaskId;
      taskid = new TaskAttemptID(id, attemptId);
      ++nextTaskId;
    } else {
      LOG.warn(
          "Exceeded limit of "
              + (MAX_TASK_EXECS + maxTaskAttempts)
              + " (plus "
              + numKilledTasks
              + " killed)"
              + " attempts for the tip '"
              + getTIPId()
              + "'");
      return null;
    }
    // keep track of the last time we started an attempt at this TIP
    // used to calculate the progress rate of this TIP
    setDispatchTime(taskid, JobTracker.getClock().getTime());
    if (0 == execStartTime) {
      // assume task starts running now
      execStartTime = JobTracker.getClock().getTime();
    }
    return addRunningTask(taskid, taskTracker);
  }
Beispiel #2
0
 /** The TIP's been ordered kill()ed. */
 public void kill() {
   if (isComplete() || failed) {
     return;
   }
   this.failed = true;
   killed = true;
   this.execFinishTime = JobTracker.getClock().getTime();
   recomputeProgress();
 }
Beispiel #3
0
  /** Indicate that one of the taskids in this TaskInProgress has successfully completed! */
  public void completed(TaskAttemptID taskid) {
    //
    // Record that this taskid is complete
    //
    completedTask(taskid, TaskStatus.State.SUCCEEDED);

    // Note the successful taskid
    setSuccessfulTaskid(taskid);

    //
    // Now that the TIP is complete, the other speculative
    // subtasks will be closed when the owning tasktracker
    // reports in and calls shouldClose() on this object.
    //

    this.completes++;
    this.execFinishTime = JobTracker.getClock().getTime();
    recomputeProgress();
  }
Beispiel #4
0
  /** Initialization common to Map and Reduce */
  void init(JobID jobId) {
    this.startTime = JobTracker.getClock().getTime();
    this.id = new TaskID(jobId, isMapTask(), partition);
    this.skipping = startSkipping();
    long speculativeDuration;
    if (isMapTask()) {
      this.speculativeLag = conf.getMapSpeculativeLag();
      speculativeDuration = conf.getMapSpeculativeDuration();
    } else {
      this.speculativeLag = conf.getReduceSpeculativeLag();
      speculativeDuration = conf.getReduceSpeculativeDuration();
    }

    // speculate only if 1/(1000 * progress_rate) > speculativeDuration
    // ie. :
    // speculate only if progress_rate < 1/(1000 * speculativeDuration)

    if (speculativeDuration > 0) {
      this.maxProgressRateForSpeculation = 1.0 / (1000.0 * speculativeDuration);
    } else {
      // disable this check for durations <= 0
      this.maxProgressRateForSpeculation = -1.0;
    }
  }
  @SuppressWarnings("deprecation")
  public SimulatorJobInProgress(
      JobID jobid, JobTracker jobtracker, JobConf default_conf, JobStory jobStory) {
    super(jobid, jobStory.getJobConf(), jobtracker);
    // jobSetupCleanupNeeded set to false in parent cstr, though
    // default is true

    restartCount = 0;
    jobSetupCleanupNeeded = false;

    this.memoryPerMap = conf.getMemoryForMapTask();
    this.memoryPerReduce = conf.getMemoryForReduceTask();
    this.maxTaskFailuresPerTracker = conf.getMaxTaskFailuresPerTracker();

    this.jobId = jobid;
    String url =
        "http://"
            + jobtracker.getJobTrackerMachine()
            + ":"
            + jobtracker.getInfoPort()
            + "/jobdetails.jsp?jobid="
            + jobid;
    this.jobtracker = jobtracker;
    this.conf = jobStory.getJobConf();
    this.priority = conf.getJobPriority();
    Path jobDir = jobtracker.getSystemDirectoryForJob(jobid);
    this.jobFile = new Path(jobDir, "job.xml");
    this.status =
        new JobStatus(jobid, 0.0f, 0.0f, 0.0f, 0.0f, JobStatus.PREP, priority, conf.getUser());
    this.profile =
        new JobProfile(
            jobStory.getUser(),
            jobid,
            this.jobFile.toString(),
            url,
            jobStory.getName(),
            conf.getQueueName());
    this.startTime = JobTracker.getClock().getTime();
    status.setStartTime(startTime);
    this.resourceEstimator = new ResourceEstimator(this);

    this.numMapTasks = jobStory.getNumberMaps();
    this.numReduceTasks = jobStory.getNumberReduces();
    this.taskCompletionEvents =
        new ArrayList<TaskCompletionEvent>(numMapTasks + numReduceTasks + 10);

    this.mapFailuresPercent = conf.getMaxMapTaskFailuresPercent();
    this.reduceFailuresPercent = conf.getMaxReduceTaskFailuresPercent();
    MetricsContext metricsContext = MetricsUtil.getContext("mapred");
    this.jobMetrics = MetricsUtil.createRecord(metricsContext, "job");
    this.jobMetrics.setTag("user", conf.getUser());
    this.jobMetrics.setTag("sessionId", conf.getSessionId());
    this.jobMetrics.setTag("jobName", conf.getJobName());
    this.jobMetrics.setTag("jobId", jobid.toString());

    this.maxLevel = jobtracker.getNumTaskCacheLevels();
    this.anyCacheLevel = this.maxLevel + 1;
    this.nonLocalMaps = new LinkedList<TaskInProgress>();
    this.nonLocalRunningMaps = new LinkedHashSet<TaskInProgress>();
    this.runningMapCache = new IdentityHashMap<Node, Set<TaskInProgress>>();
    this.nonRunningReduces = new LinkedList<TaskInProgress>();
    this.runningReduces = new LinkedHashSet<TaskInProgress>();
    this.slowTaskThreshold =
        Math.max(0.0f, conf.getFloat("mapred.speculative.execution.slowTaskThreshold", 1.0f));
    this.speculativeCap = conf.getFloat("mapred.speculative.execution.speculativeCap", 0.1f);
    this.slowNodeThreshold = conf.getFloat("mapred.speculative.execution.slowNodeThreshold", 1.0f);

    this.jobStory = jobStory;
    //    this.jobHistory = this.jobtracker.getJobHistory();
  }
  // for initTasks, update information from JobStory object
  @Override
  public synchronized void initTasks() throws IOException {
    boolean loggingEnabled = LOG.isDebugEnabled();
    if (loggingEnabled) {
      LOG.debug("(initTasks@SJIP) Starting Initialization for " + jobId);
    }
    numMapTasks = jobStory.getNumberMaps();
    numReduceTasks = jobStory.getNumberReduces();

    JobHistory.JobInfo.logSubmitted(
        getJobID(), conf, jobFile.toString(), this.startTime, hasRestarted());
    if (loggingEnabled) {
      LOG.debug("(initTasks@SJIP) Logged to job history for " + jobId);
    }

    //    checkTaskLimits();

    if (loggingEnabled) {
      LOG.debug("(initTasks@SJIP) Checked task limits for " + jobId);
    }

    final String jobFile = "default";
    splits = getRawSplits(jobStory.getInputSplits());
    if (loggingEnabled) {
      LOG.debug(
          "(initTasks@SJIP) Created splits for job = "
              + jobId
              + " number of splits = "
              + splits.length);
    }

    //    createMapTasks(jobFile, splits);

    numMapTasks = splits.length;
    maps = new TaskInProgress[numMapTasks];
    for (int i = 0; i < numMapTasks; ++i) {
      inputLength += splits[i].getDataLength();
      maps[i] = new TaskInProgress(jobId, jobFile, splits[i], conf, this, i, numSlotsPerMap);
    }
    if (numMapTasks > 0) {
      nonRunningMapCache = createCache(splits, maxLevel);
      if (loggingEnabled) {
        LOG.debug(
            "initTasks:numMaps="
                + numMapTasks
                + " Size of nonRunningMapCache="
                + nonRunningMapCache.size()
                + " for "
                + jobId);
      }
    }

    // set the launch time
    this.launchTime = JobTracker.getClock().getTime();

    //    createReduceTasks(jobFile);

    //
    // Create reduce tasks
    //
    this.reduces = new TaskInProgress[numReduceTasks];
    for (int i = 0; i < numReduceTasks; i++) {
      reduces[i] =
          new TaskInProgress(jobId, jobFile, numMapTasks, i, conf, this, numSlotsPerReduce);
      nonRunningReduces.add(reduces[i]);
    }

    // Calculate the minimum number of maps to be complete before
    // we should start scheduling reduces
    completedMapsForReduceSlowstart =
        (int)
            Math.ceil(
                (conf.getFloat(
                        "mapred.reduce.slowstart." + "completed.maps",
                        DEFAULT_COMPLETED_MAPS_PERCENT_FOR_REDUCE_SLOWSTART)
                    * numMapTasks));

    tasksInited.set(true);
    if (loggingEnabled) {
      LOG.debug(
          "Initializing job, nowstatus = " + JobStatus.getJobRunState(getStatus().getRunState()));
    }
    setupComplete();

    if (loggingEnabled) {
      LOG.debug(
          "Initializing job, inited-status = "
              + JobStatus.getJobRunState(getStatus().getRunState()));
    }
  }
Beispiel #7
0
  /** Indicate that one of the taskids in this TaskInProgress has failed. */
  public void incompleteSubTask(TaskAttemptID taskid, JobStatus jobStatus) {
    //
    // Note the failure and its location
    //
    TaskStatus status = taskStatuses.get(taskid);
    String trackerName;
    String trackerHostName = null;
    TaskStatus.State taskState = TaskStatus.State.FAILED;
    if (status != null) {
      trackerName = status.getTaskTracker();
      trackerHostName = JobInProgressTraits.convertTrackerNameToHostName(trackerName);
      // Check if the user manually KILLED/FAILED this task-attempt...
      Boolean shouldFail = tasksToKill.remove(taskid);
      if (shouldFail != null) {
        if (status.getRunState() == TaskStatus.State.FAILED
            || status.getRunState() == TaskStatus.State.KILLED) {
          taskState = (shouldFail) ? TaskStatus.State.FAILED : TaskStatus.State.KILLED;
        } else {
          taskState =
              (shouldFail) ? TaskStatus.State.FAILED_UNCLEAN : TaskStatus.State.KILLED_UNCLEAN;
        }
        status.setRunState(taskState);
        addDiagnosticInfo(taskid, "Task has been " + taskState + " by the user");
      }

      taskState = status.getRunState();
      if (taskState != TaskStatus.State.FAILED
          && taskState != TaskStatus.State.KILLED
          && taskState != TaskStatus.State.FAILED_UNCLEAN
          && taskState != TaskStatus.State.KILLED_UNCLEAN) {
        LOG.info(
            "Task '"
                + taskid
                + "' running on '"
                + trackerName
                + "' in state: '"
                + taskState
                + "' being failed!");
        status.setRunState(TaskStatus.State.FAILED);
        taskState = TaskStatus.State.FAILED;
      }

      // tasktracker went down and failed time was not reported.
      if (0 == status.getFinishTime()) {
        status.setFinishTime(JobTracker.getClock().getTime());
      }
    }

    this.activeTasks.remove(taskid);

    // Since we do not fail completed reduces (whose outputs go to hdfs), we
    // should note this failure only for completed maps, only if this taskid;
    // completed this map. however if the job is done, there is no need to
    // manipulate completed maps
    if (this.isMapTask()
        && !jobSetup
        && !jobCleanup
        && isComplete(taskid)
        && jobStatus.getRunState() != JobStatus.SUCCEEDED) {
      this.completes--;

      // Reset the successfulTaskId since we don't have a SUCCESSFUL task now
      resetSuccessfulTaskid();
    }

    // Note that there can be failures of tasks that are hosted on a machine
    // that has not yet registered with restarted jobtracker
    // recalculate the counts only if its a genuine failure
    if (tasks.contains(taskid)) {
      if (taskState == TaskStatus.State.FAILED) {
        numTaskFailures++;
        machinesWhereFailed.add(trackerHostName);
        if (maxSkipRecords > 0) {
          // skipping feature enabled
          LOG.debug("TaskInProgress adding" + status.getNextRecordRange());
          failedRanges.add(status.getNextRecordRange());
          skipping = startSkipping();
        }

      } else if (taskState == TaskStatus.State.KILLED) {
        numKilledTasks++;
      }
    }

    if (numTaskFailures >= maxTaskAttempts) {
      LOG.info("TaskInProgress " + getTIPId() + " has failed " + numTaskFailures + " times.");
      kill();
    }
  }
Beispiel #8
0
  /**
   * A status message from a client has arrived. It updates the status of a single
   * component-thread-task, which might result in an overall TaskInProgress status update.
   *
   * @return has the task changed its state noticeably?
   */
  synchronized boolean updateStatus(TaskStatus status) {
    TaskAttemptID taskid = status.getTaskID();
    String taskTracker = status.getTaskTracker();
    String diagInfo = status.getDiagnosticInfo();
    TaskStatus oldStatus = taskStatuses.get(taskid);
    boolean changed = true;
    if (diagInfo != null && diagInfo.length() > 0) {
      long runTime = status.getRunTime();
      LOG.info(
          "Error from "
              + taskid
              + " on "
              + taskTracker
              + " runTime(msec) "
              + runTime
              + ": "
              + diagInfo);
      addDiagnosticInfo(taskid, diagInfo);
    }

    if (skipping) {
      failedRanges.updateState(status);
    }

    if (oldStatus != null) {
      TaskStatus.State oldState = oldStatus.getRunState();
      TaskStatus.State newState = status.getRunState();

      // We should never recieve a duplicate success/failure/killed
      // status update for the same taskid! This is a safety check,
      // and is addressed better at the TaskTracker to ensure this.
      // @see {@link TaskTracker.transmitHeartbeat()}
      if ((newState != TaskStatus.State.RUNNING
              && newState != TaskStatus.State.COMMIT_PENDING
              && newState != TaskStatus.State.FAILED_UNCLEAN
              && newState != TaskStatus.State.KILLED_UNCLEAN
              && newState != TaskStatus.State.UNASSIGNED)
          && (oldState == newState)) {
        LOG.warn(
            "Recieved duplicate status update of '"
                + newState
                + "' for '"
                + taskid
                + "' of TIP '"
                + getTIPId()
                + "'"
                + "oldTT="
                + oldStatus.getTaskTracker()
                + " while newTT="
                + status.getTaskTracker());
        return false;
      }

      // The task is not allowed to move from completed back to running.
      // We have seen out of order status messagesmoving tasks from complete
      // to running. This is a spot fix, but it should be addressed more
      // globally.
      if ((newState == TaskStatus.State.RUNNING || newState == TaskStatus.State.UNASSIGNED)
          && (oldState == TaskStatus.State.FAILED
              || oldState == TaskStatus.State.KILLED
              || oldState == TaskStatus.State.FAILED_UNCLEAN
              || oldState == TaskStatus.State.KILLED_UNCLEAN
              || oldState == TaskStatus.State.SUCCEEDED
              || oldState == TaskStatus.State.COMMIT_PENDING)) {
        return false;
      }

      // Do not accept any status once the task is marked FAILED/KILLED
      // This is to handle the case of the JobTracker timing out a task
      // due to launch delay, but the TT comes back with any state or
      // TT got expired
      if (oldState == TaskStatus.State.FAILED || oldState == TaskStatus.State.KILLED) {
        tasksToKill.put(taskid, true);
        return false;
      }

      changed = oldState != newState;
    }
    // if task is a cleanup attempt, do not replace the complete status,
    // update only specific fields.
    // For example, startTime should not be updated,
    // but finishTime has to be updated.
    if (!isCleanupAttempt(taskid)) {
      taskStatuses.put(taskid, status);
      // we don't want to include setup tasks in the task execution stats
      if (!isJobSetupTask()
          && !isJobCleanupTask()
          && ((isMapTask() && job.hasSpeculativeMaps())
              || (!isMapTask() && job.hasSpeculativeReduces()))) {
        updateProgressRate(JobTracker.getClock().getTime());
      }
    } else {
      taskStatuses
          .get(taskid)
          .statusUpdate(
              status.getRunState(),
              status.getProgress(),
              status.getStateString(),
              status.getPhase(),
              status.getFinishTime());
    }

    // Recompute progress
    recomputeProgress();
    return changed;
  }