Ejemplo n.º 1
0
  /**
   * Applies a status report received for a single task attempt of this TaskInProgress.
   *
   * <p>Any diagnostic text carried by the report is logged and recorded, and (while record
   * skipping is active) the failed-range tracker is updated, before the state transition itself
   * is validated. When an earlier status exists for the attempt, the report is rejected if it:
   *
   * <ul>
   *   <li>repeats the previous state, unless that state is RUNNING, COMMIT_PENDING,
   *       FAILED_UNCLEAN, KILLED_UNCLEAN or UNASSIGNED;
   *   <li>would move the attempt back to RUNNING/UNASSIGNED after it already reached FAILED,
   *       KILLED, FAILED_UNCLEAN, KILLED_UNCLEAN, SUCCEEDED or COMMIT_PENDING;
   *   <li>arrives after the attempt was already marked FAILED or KILLED (the attempt is then also
   *       entered into {@code tasksToKill}).
   * </ul>
   *
   * <p>Accepted reports are stored and the overall progress is recomputed.
   *
   * @param status the status report for one attempt
   * @return {@code false} if the report was rejected or the run state did not change;
   *     {@code true} if the run state changed or no earlier status existed for the attempt
   */
  synchronized boolean updateStatus(TaskStatus status) {
    final TaskAttemptID taskid = status.getTaskID();
    final String taskTracker = status.getTaskTracker();
    final String diagInfo = status.getDiagnosticInfo();
    final TaskStatus previous = taskStatuses.get(taskid);

    final boolean hasDiagnostics = diagInfo != null && diagInfo.length() > 0;
    if (hasDiagnostics) {
      final long elapsedMsec = status.getRunTime();
      LOG.info(
          "Error from " + taskid + " on " + taskTracker
              + " runTime(msec) " + elapsedMsec + ": " + diagInfo);
      addDiagnosticInfo(taskid, diagInfo);
    }

    if (skipping) {
      failedRanges.updateState(status);
    }

    // With no earlier status on record, the report always counts as a change.
    boolean changed = true;
    if (previous != null) {
      final TaskStatus.State before = previous.getRunState();
      final TaskStatus.State after = status.getRunState();

      // States that may legitimately be reported more than once in a row.
      final boolean repeatable =
          after == TaskStatus.State.RUNNING
              || after == TaskStatus.State.COMMIT_PENDING
              || after == TaskStatus.State.FAILED_UNCLEAN
              || after == TaskStatus.State.KILLED_UNCLEAN
              || after == TaskStatus.State.UNASSIGNED;

      // A duplicate success/failure/killed update for the same attempt should never
      // arrive. This is only a safety net; the TaskTracker is the better place to
      // guarantee it (see TaskTracker.transmitHeartbeat()).
      if (!repeatable && before == after) {
        LOG.warn(
            "Recieved duplicate status update of '" + after
                + "' for '" + taskid
                + "' of TIP '" + getTIPId()
                + "'oldTT=" + previous.getTaskTracker()
                + " while newTT=" + status.getTaskTracker());
        return false;
      }

      final boolean alreadyFailedOrKilled =
          before == TaskStatus.State.FAILED || before == TaskStatus.State.KILLED;
      final boolean alreadyPastRunning =
          alreadyFailedOrKilled
              || before == TaskStatus.State.FAILED_UNCLEAN
              || before == TaskStatus.State.KILLED_UNCLEAN
              || before == TaskStatus.State.SUCCEEDED
              || before == TaskStatus.State.COMMIT_PENDING;
      final boolean movesBackwards =
          after == TaskStatus.State.RUNNING || after == TaskStatus.State.UNASSIGNED;

      // An attempt may not go from completed back to running. Out-of-order status
      // messages have been observed doing exactly that; this is a spot fix and
      // deserves a more global solution.
      if (movesBackwards && alreadyPastRunning) {
        return false;
      }

      // Once an attempt is marked FAILED/KILLED no further status is accepted. This
      // covers the JobTracker timing out a task because of launch delay (or the
      // tracker expiring) and the tracker later reporting some other state.
      if (alreadyFailedOrKilled) {
        tasksToKill.put(taskid, true);
        return false;
      }

      changed = before != after;
    }

    if (isCleanupAttempt(taskid)) {
      // For a cleanup attempt the stored status is kept and only selected fields are
      // refreshed: e.g. the start time must survive while the finish time is updated.
      final TaskStatus stored = taskStatuses.get(taskid);
      stored.statusUpdate(
          status.getRunState(),
          status.getProgress(),
          status.getStateString(),
          status.getPhase(),
          status.getFinishTime());
    } else {
      taskStatuses.put(taskid, status);
      // Job setup/cleanup tasks are kept out of the task execution statistics.
      final boolean auxiliaryTask = isJobSetupTask() || isJobCleanupTask();
      if (!auxiliaryTask) {
        final boolean speculationEnabled =
            isMapTask() ? job.hasSpeculativeMaps() : job.hasSpeculativeReduces();
        if (speculationEnabled) {
          updateProgressRate(JobTracker.getClock().getTime());
        }
      }
    }

    // Recompute progress
    recomputeProgress();
    return changed;
  }
Ejemplo n.º 2
0
  /**
   * Indicate that one of the taskids in this TaskInProgress has failed.
   *
   * <p>If a status is recorded for the attempt, its run state is forced to a failed/killed state
   * (honouring a pending entry in {@code tasksToKill}) and a missing finish time is filled in.
   * The attempt is removed from the active set, a completed map is un-completed when the job has
   * not succeeded, and the failure/kill counters are updated for attempts known to this TIP.
   * When the number of failures reaches {@code maxTaskAttempts} the whole TIP is killed.
   *
   * <p>An attempt without a recorded status is treated as FAILED. In that case there is no known
   * host and no record range, so the failed-machine set and the skip ranges are left untouched.
   *
   * @param taskid the attempt that failed or was killed
   * @param jobStatus status of the owning job; completed maps are only un-completed while the
   *     job's run state is not {@code JobStatus.SUCCEEDED}
   */
  public void incompleteSubTask(TaskAttemptID taskid, JobStatus jobStatus) {
    //
    // Note the failure and its location
    //
    TaskStatus status = taskStatuses.get(taskid);
    String trackerHostName = null;
    // Default used when no status is recorded for the attempt.
    TaskStatus.State taskState = TaskStatus.State.FAILED;
    if (status != null) {
      String trackerName = status.getTaskTracker();
      trackerHostName = JobInProgressTraits.convertTrackerNameToHostName(trackerName);
      // Check if the user manually KILLED/FAILED this task-attempt...
      Boolean shouldFail = tasksToKill.remove(taskid);
      if (shouldFail != null) {
        if (status.getRunState() == TaskStatus.State.FAILED
            || status.getRunState() == TaskStatus.State.KILLED) {
          taskState = (shouldFail) ? TaskStatus.State.FAILED : TaskStatus.State.KILLED;
        } else {
          // The attempt had not already reached FAILED/KILLED, so it is moved to the
          // corresponding *_UNCLEAN state instead.
          taskState =
              (shouldFail) ? TaskStatus.State.FAILED_UNCLEAN : TaskStatus.State.KILLED_UNCLEAN;
        }
        status.setRunState(taskState);
        addDiagnosticInfo(taskid, "Task has been " + taskState + " by the user");
      }

      // Any state other than the four failed/killed ones is coerced to FAILED.
      taskState = status.getRunState();
      if (taskState != TaskStatus.State.FAILED
          && taskState != TaskStatus.State.KILLED
          && taskState != TaskStatus.State.FAILED_UNCLEAN
          && taskState != TaskStatus.State.KILLED_UNCLEAN) {
        LOG.info(
            "Task '"
                + taskid
                + "' running on '"
                + trackerName
                + "' in state: '"
                + taskState
                + "' being failed!");
        status.setRunState(TaskStatus.State.FAILED);
        taskState = TaskStatus.State.FAILED;
      }

      // tasktracker went down and failed time was not reported.
      if (0 == status.getFinishTime()) {
        status.setFinishTime(JobTracker.getClock().getTime());
      }
    }

    this.activeTasks.remove(taskid);

    // Completed reduces (whose outputs go to hdfs) are never failed, so this failure
    // is only noted for a completed map, and only if this taskid is the one that
    // completed it. If the job is already done there is no need to touch the
    // completed maps.
    if (this.isMapTask()
        && !jobSetup
        && !jobCleanup
        && isComplete(taskid)
        && jobStatus.getRunState() != JobStatus.SUCCEEDED) {
      this.completes--;

      // Reset the successfulTaskId since we don't have a SUCCESSFUL task now
      resetSuccessfulTaskid();
    }

    // Note that there can be failures of tasks that are hosted on a machine
    // that has not yet registered with restarted jobtracker
    // recalculate the counts only if its a genuine failure
    if (tasks.contains(taskid)) {
      if (taskState == TaskStatus.State.FAILED) {
        numTaskFailures++;
        // Without a recorded status the host is unknown; do not record a null host.
        if (trackerHostName != null) {
          machinesWhereFailed.add(trackerHostName);
        }
        // The record range comes from the attempt's status, so skip-range bookkeeping
        // is only possible when a status exists.
        if (maxSkipRecords > 0 && status != null) {
          // skipping feature enabled
          LOG.debug("TaskInProgress adding" + status.getNextRecordRange());
          failedRanges.add(status.getNextRecordRange());
          skipping = startSkipping();
        }

      } else if (taskState == TaskStatus.State.KILLED) {
        numKilledTasks++;
      }
    }

    if (numTaskFailures >= maxTaskAttempts) {
      LOG.info("TaskInProgress " + getTIPId() + " has failed " + numTaskFailures + " times.");
      kill();
    }
  }
Ejemplo n.º 3
0
  /**
   * Adds a previously running task to this tip. This is used in case of jobtracker restarts.
   *
   * <p>A new {@code MapTask} or {@code ReduceTask} object is built for the attempt, flagged as
   * job-setup / job-cleanup / task-cleanup as applicable, given the current configuration and
   * skip-record settings, and the attempt is registered as active with this TIP.
   *
   * @param taskid the attempt to re-register
   * @param taskTracker the tracker recorded as hosting the attempt
   * @param taskCleanup {@code true} if the attempt is a task-cleanup attempt
   * @return the task object created for the attempt
   */
  public Task addRunningTask(TaskAttemptID taskid, String taskTracker, boolean taskCleanup) {
    // A task-cleanup attempt never needs more than one slot.
    final int slots = taskCleanup ? 1 : numSlotsRequired;

    // Build the task object for this attempt.
    final Task task;
    if (!isMapTask()) {
      task = new ReduceTask(jobFile, taskid, partition, numMaps, slots, job.getUser());
    } else {
      LOG.debug(
          "attempt " + numTaskFailures
              + " sending skippedRecords " + failedRanges.getIndicesCount());
      // Only regular maps carry a real split; setup/cleanup maps get an empty one.
      final boolean regularMap = !(jobSetup || jobCleanup);
      final String splitClass = regularMap ? rawSplit.getClassName() : null;
      final BytesWritable split = regularMap ? rawSplit.getBytes() : new BytesWritable();
      task = new MapTask(jobFile, taskid, partition, splitClass, split, slots, job.getUser());
    }

    if (jobCleanup) {
      task.setJobCleanupTask();
    }
    if (jobSetup) {
      task.setJobSetupTask();
    }
    if (taskCleanup) {
      task.setTaskCleanupTask();
      // The cleanup task starts from the run state already recorded for the attempt.
      task.setState(taskStatuses.get(taskid).getRunState());
      cleanupTasks.put(taskid, taskTracker);
    }

    task.setConf(conf);
    LOG.debug("Launching task with skipRanges:" + failedRanges.getSkipRanges());
    task.setSkipRanges(failedRanges.getSkipRanges());
    task.setSkipping(skipping);
    if (failedRanges.isTestAttempt()) {
      task.setWriteSkipRecs(false);
    }

    // If another attempt is already active, this one is recorded as the speculative
    // attempt; otherwise the speculative id is cleared. Must be decided before the
    // attempt itself is added to the active set.
    speculativeTaskId = (activeTasks.size() >= 1) ? taskid : null;
    activeTasks.put(taskid, taskTracker);
    tasks.add(taskid);

    // Ask JobTracker to note that the task exists (call currently disabled):
    // jobtracker.createTaskEntry(taskid, taskTracker, this);

    // Remember the first attempt ever added to this TIP.
    if (firstTaskId == null) {
      firstTaskId = taskid;
    }
    return task;
  }