Example #1
 public Vector completedJobs() {
   Vector v = new Vector();
   for (Iterator it = jobs.values().iterator(); it.hasNext(); ) {
     JobInProgress jip = (JobInProgress) it.next();
     JobStatus status = jip.getStatus();
     if (status.getRunState() == JobStatus.SUCCEEDED) {
       v.add(jip);
     }
   }
   return v;
 }
Example #2
 public Vector runningJobs() {
   Vector v = new Vector();
   for (Iterator it = jobs.values().iterator(); it.hasNext(); ) {
     JobInProgress jip = (JobInProgress) it.next();
     JobStatus status = jip.getStatus();
     if (status.getRunState() == JobStatus.RUNNING) {
       v.add(jip);
     }
   }
   return v;
 }
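Examples #1 and #2 differ only in the JobStatus run state they filter for (SUCCEEDED vs. RUNNING). A minimal sketch of the shared pattern, assuming the jobs field is declared as a Map<JobID, JobInProgress> (the original snippets use a raw Iterator and a cast instead); the helper name jobsWithRunState is illustrative, not from the original:
 // Hypothetical helper capturing the pattern shared by completedJobs() and runningJobs().
 // Assumes jobs is a Map<JobID, JobInProgress>; not part of the original snippets.
 private Vector<JobInProgress> jobsWithRunState(int runState) {
   Vector<JobInProgress> v = new Vector<JobInProgress>();
   for (JobInProgress jip : jobs.values()) {
     if (jip.getStatus().getRunState() == runState) {
       v.add(jip);
     }
   }
   return v;
 }
 // completedJobs() then reduces to: return jobsWithRunState(JobStatus.SUCCEEDED);
 // runningJobs() to:                return jobsWithRunState(JobStatus.RUNNING);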
Example #3
  /**
   * Check the ACLs for a user doing the passed operation.
   *
   * <ul>
   *   <li>If ACLs are disabled, allow all users.
   *   <li>Otherwise, if the operation is not a job operation (e.g. submit-job-to-queue), then
   *       allow only (a) the clusterOwner (who started the cluster), (b) cluster administrators
   *       and (c) members of the queue-submit-job ACL for the queue.
   *   <li>If the operation is a job operation, then allow only (a) the jobOwner, (b) the
   *       clusterOwner (who started the cluster), (c) cluster administrators, (d) members of the
   *       queue admins ACL for the queue and (e) members of the job ACL for the job operation.
   * </ul>
   *
   * @param job the job on which the operation is requested
   * @param callerUGI the user who is requesting the operation
   * @param operation the operation for which authorization is needed
   * @throws AccessControlException if the caller is not authorized to perform the operation
   */
  void checkAccess(JobInProgress job, UserGroupInformation callerUGI, Operation operation)
      throws AccessControlException {

    String queue = job.getProfile().getQueueName();
    String jobId = job.getJobID().toString();
    JobStatus jobStatus = job.getStatus();
    String jobOwner = jobStatus.getUsername();
    AccessControlList jobAcl = jobStatus.getJobACLs().get(operation.jobACLNeeded);

    checkAccess(jobId, callerUGI, queue, operation, jobOwner, jobAcl);
  }
Example #4
  /**
   * Check the ACLs for a user doing the passed job operation.
   *
   * <ul>
   *   <li>If ACLs are disabled, allow all users.
   *   <li>Otherwise, allow only (a) the jobOwner, (b) the clusterOwner (who started the
   *       cluster), (c) cluster administrators and (d) members of the job ACL for the job
   *       operation.
   * </ul>
   *
   * @param jobStatus the status of the job
   * @param callerUGI the user who is trying to perform the operation
   * @param queue the job queue name
   * @param operation the operation for which authorization is needed
   * @throws AccessControlException if the caller is not authorized to perform the operation
   */
  void checkAccess(
      JobStatus jobStatus, UserGroupInformation callerUGI, String queue, Operation operation)
      throws AccessControlException {

    String jobId = jobStatus.getJobID().toString();
    String jobOwner = jobStatus.getUsername();
    AccessControlList jobAcl = jobStatus.getJobACLs().get(operation.jobACLNeeded);

    // If acls are enabled, check if callerUGI is jobOwner, queue admin,
    // cluster admin or part of job ACL
    checkAccess(jobId, callerUGI, queue, operation, jobOwner, jobAcl);
  }
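Both checkAccess overloads above resolve the job owner and the operation's job ACL, then delegate to a shared checkAccess(jobId, callerUGI, queue, operation, jobOwner, jobAcl). A hedged sketch of a caller, assuming the mapred Operation enum's KILL_JOB value; the killJobIfAllowed method, the aclsManager field and killJob(...) are illustrative names, not from the original:
  // Illustrative only: guard a kill request with the ACL check above.
  // aclsManager, killJob(...) and this method name are assumed wiring, not original code.
  void killJobIfAllowed(JobInProgress jip, UserGroupInformation callerUGI)
      throws IOException, AccessControlException {
    // Throws AccessControlException unless callerUGI is the job owner, the cluster
    // owner, a cluster/queue administrator, or a member of the job's kill ACL.
    aclsManager.checkAccess(jip, callerUGI, Operation.KILL_JOB);
    killJob(jip);
  }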
Example #5
 public static JobStatus downgrade(org.apache.hadoop.mapreduce.JobStatus stat) {
   JobStatus old =
       new JobStatus(
           JobID.downgrade(stat.getJobID()),
           stat.getSetupProgress(),
           stat.getMapProgress(),
           stat.getReduceProgress(),
           stat.getCleanupProgress(),
           stat.getState().getValue(),
           JobPriority.valueOf(stat.getPriority().name()),
           stat.getUsername(),
           stat.getJobName(),
           stat.getJobFile(),
           stat.getTrackingUrl());
   old.setStartTime(stat.getStartTime());
   old.setFinishTime(stat.getFinishTime());
   old.setSchedulingInfo(stat.getSchedulingInfo());
   old.setHistoryFile(stat.getHistoryFile());
   return old;
 }
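 /**
  * Returns true if this job should keep waiting: either its configured booking time
  * (BOOKING_TIME) still lies in the future, or at least one of its declared dependency
  * jobs (BOOKING_DEPENDENCY_JOBID) has a recorded status that is not SUCCEEDED.
  */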
 private boolean isWait(JobInProgress job) {
   long bookingTime = job.getJobConf().getLong(BOOKING_TIME, 0);
   String[] dependencyJobs = job.getJobConf().getStrings(BOOKING_DEPENDENCY_JOBID, null);
   boolean bookingTimeFilter = false;
   boolean dependencyJobFilter = false;
   if (bookingTime >= System.currentTimeMillis()) {
     bookingTimeFilter = true;
   }
   if (null != dependencyJobs) {
     for (String dependencyJob : dependencyJobs) {
       JobStatus dependencyJobStatus = (JobStatus) finishJobStatus.get(dependencyJob);
       if (null != dependencyJobStatus
           && dependencyJobStatus.getRunState() != JobStatus.SUCCEEDED) {
         dependencyJobFilter = true;
       }
     }
   }
   return bookingTimeFilter || dependencyJobFilter;
 }
    public synchronized boolean statusUpdate(
        TaskAttemptID taskId, TaskStatus taskStatus, JvmContext context)
        throws IOException, InterruptedException {
      LOG.info(taskStatus.getStateString());
      int taskIndex = mapIds.indexOf(taskId);
      if (taskIndex >= 0) { // mapping
        float numTasks = (float) this.numMapTasks;
        partialMapProgress[taskIndex] = taskStatus.getProgress();
        mapCounters[taskIndex] = taskStatus.getCounters();
        float partialProgress = 0.0f;
        for (float f : partialMapProgress) {
          partialProgress += f;
        }
        status.setMapProgress(partialProgress / numTasks);
      } else {
        reduceCounters = taskStatus.getCounters();
        status.setReduceProgress(taskStatus.getProgress());
      }

      // ignore phase

      return true;
    }
  // for initTasks, update information from JobStory object
  @Override
  public synchronized void initTasks() throws IOException {
    boolean loggingEnabled = LOG.isDebugEnabled();
    if (loggingEnabled) {
      LOG.debug("(initTasks@SJIP) Starting Initialization for " + jobId);
    }
    numMapTasks = jobStory.getNumberMaps();
    numReduceTasks = jobStory.getNumberReduces();

    JobHistory.JobInfo.logSubmitted(
        getJobID(), conf, jobFile.toString(), this.startTime, hasRestarted());
    if (loggingEnabled) {
      LOG.debug("(initTasks@SJIP) Logged to job history for " + jobId);
    }

    //    checkTaskLimits();

    if (loggingEnabled) {
      LOG.debug("(initTasks@SJIP) Checked task limits for " + jobId);
    }

    final String jobFile = "default";
    splits = getRawSplits(jobStory.getInputSplits());
    if (loggingEnabled) {
      LOG.debug(
          "(initTasks@SJIP) Created splits for job = "
              + jobId
              + " number of splits = "
              + splits.length);
    }

    //    createMapTasks(jobFile, splits);

    numMapTasks = splits.length;
    maps = new TaskInProgress[numMapTasks];
    for (int i = 0; i < numMapTasks; ++i) {
      inputLength += splits[i].getDataLength();
      maps[i] = new TaskInProgress(jobId, jobFile, splits[i], conf, this, i, numSlotsPerMap);
    }
    if (numMapTasks > 0) {
      nonRunningMapCache = createCache(splits, maxLevel);
      if (loggingEnabled) {
        LOG.debug(
            "initTasks:numMaps="
                + numMapTasks
                + " Size of nonRunningMapCache="
                + nonRunningMapCache.size()
                + " for "
                + jobId);
      }
    }

    // set the launch time
    this.launchTime = JobTracker.getClock().getTime();

    //    createReduceTasks(jobFile);

    //
    // Create reduce tasks
    //
    this.reduces = new TaskInProgress[numReduceTasks];
    for (int i = 0; i < numReduceTasks; i++) {
      reduces[i] =
          new TaskInProgress(jobId, jobFile, numMapTasks, i, conf, this, numSlotsPerReduce);
      nonRunningReduces.add(reduces[i]);
    }

    // Calculate the minimum number of maps to be complete before
    // we should start scheduling reduces
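    // For example, with a slowstart fraction of 0.05 and 200 map tasks, reduces become
    // schedulable once ceil(0.05 * 200) = 10 maps have completed.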
    completedMapsForReduceSlowstart =
        (int)
            Math.ceil(
                (conf.getFloat(
                        "mapred.reduce.slowstart." + "completed.maps",
                        DEFAULT_COMPLETED_MAPS_PERCENT_FOR_REDUCE_SLOWSTART)
                    * numMapTasks));

    tasksInited.set(true);
    if (loggingEnabled) {
      LOG.debug(
          "Initializing job, now status = " + JobStatus.getJobRunState(getStatus().getRunState()));
    }
    setupComplete();

    if (loggingEnabled) {
      LOG.debug(
          "Initializing job, inited-status = "
              + JobStatus.getJobRunState(getStatus().getRunState()));
    }
  }
  /** Indicate that one of the taskids in this TaskInProgress has failed. */
  public void incompleteSubTask(TaskAttemptID taskid, JobStatus jobStatus) {
    //
    // Note the failure and its location
    //
    TaskStatus status = taskStatuses.get(taskid);
    String trackerName;
    String trackerHostName = null;
    TaskStatus.State taskState = TaskStatus.State.FAILED;
    if (status != null) {
      trackerName = status.getTaskTracker();
      trackerHostName = JobInProgressTraits.convertTrackerNameToHostName(trackerName);
      // Check if the user manually KILLED/FAILED this task-attempt...
      Boolean shouldFail = tasksToKill.remove(taskid);
      if (shouldFail != null) {
        if (status.getRunState() == TaskStatus.State.FAILED
            || status.getRunState() == TaskStatus.State.KILLED) {
          taskState = (shouldFail) ? TaskStatus.State.FAILED : TaskStatus.State.KILLED;
        } else {
          taskState =
              (shouldFail) ? TaskStatus.State.FAILED_UNCLEAN : TaskStatus.State.KILLED_UNCLEAN;
        }
        status.setRunState(taskState);
        addDiagnosticInfo(taskid, "Task has been " + taskState + " by the user");
      }

      taskState = status.getRunState();
      if (taskState != TaskStatus.State.FAILED
          && taskState != TaskStatus.State.KILLED
          && taskState != TaskStatus.State.FAILED_UNCLEAN
          && taskState != TaskStatus.State.KILLED_UNCLEAN) {
        LOG.info(
            "Task '"
                + taskid
                + "' running on '"
                + trackerName
                + "' in state: '"
                + taskState
                + "' being failed!");
        status.setRunState(TaskStatus.State.FAILED);
        taskState = TaskStatus.State.FAILED;
      }

      // tasktracker went down and failed time was not reported.
      if (0 == status.getFinishTime()) {
        status.setFinishTime(JobTracker.getClock().getTime());
      }
    }

    this.activeTasks.remove(taskid);

    // Since we do not fail completed reduces (whose outputs go to HDFS), we
    // should note this failure only for completed maps, and only if this taskid
    // completed this map. However, if the job is done, there is no need to
    // manipulate completed maps.
    if (this.isMapTask()
        && !jobSetup
        && !jobCleanup
        && isComplete(taskid)
        && jobStatus.getRunState() != JobStatus.SUCCEEDED) {
      this.completes--;

      // Reset the successfulTaskId since we don't have a SUCCESSFUL task now
      resetSuccessfulTaskid();
    }

    // Note that there can be failures of tasks that are hosted on a machine
    // that has not yet registered with the restarted jobtracker.
    // Recalculate the counts only if it's a genuine failure.
    if (tasks.contains(taskid)) {
      if (taskState == TaskStatus.State.FAILED) {
        numTaskFailures++;
        machinesWhereFailed.add(trackerHostName);
        if (maxSkipRecords > 0) {
          // skipping feature enabled
          LOG.debug("TaskInProgress adding" + status.getNextRecordRange());
          failedRanges.add(status.getNextRecordRange());
          skipping = startSkipping();
        }

      } else if (taskState == TaskStatus.State.KILLED) {
        numKilledTasks++;
      }
    }

    if (numTaskFailures >= maxTaskAttempts) {
      LOG.info("TaskInProgress " + getTIPId() + " has failed " + numTaskFailures + " times.");
      kill();
    }
  }
    @SuppressWarnings("unchecked")
    @Override
    public void run() {
      JobID jobId = profile.getJobID();
      JobContext jContext = new JobContextImpl(conf, jobId);
      org.apache.hadoop.mapreduce.OutputCommitter outputCommitter = null;
      try {
        outputCommitter = createOutputCommitter(conf.getUseNewMapper(), jobId, conf);
      } catch (Exception e) {
        LOG.info("Failed to createOutputCommitter", e);
        return;
      }

      try {
        TaskSplitMetaInfo[] taskSplitMetaInfos =
            SplitMetaInfoReader.readSplitMetaInfo(jobId, localFs, conf, systemJobDir);
        int numReduceTasks = job.getNumReduceTasks();
        if (numReduceTasks > 1 || numReduceTasks < 0) {
          // we only allow 0 or 1 reducer in local mode
          numReduceTasks = 1;
          job.setNumReduceTasks(1);
        }
        outputCommitter.setupJob(jContext);
        status.setSetupProgress(1.0f);

        Map<TaskAttemptID, MapOutputFile> mapOutputFiles =
            Collections.synchronizedMap(new HashMap<TaskAttemptID, MapOutputFile>());

        List<MapTaskRunnable> taskRunnables =
            getMapTaskRunnables(taskSplitMetaInfos, jobId, mapOutputFiles);

        ExecutorService mapService = createMapExecutor(taskRunnables.size());
        // Start populating the executor with work units.
        // They may begin running immediately (in other threads).
        for (Runnable r : taskRunnables) {
          mapService.submit(r);
        }

        try {
          mapService.shutdown(); // Instructs queue to drain.

          // Wait for tasks to finish; do not use a time-based timeout.
          // (See http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6179024)
          LOG.info("Waiting for map tasks");
          mapService.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
        } catch (InterruptedException ie) {
          // Cancel all threads.
          mapService.shutdownNow();
          throw ie;
        }

        LOG.info("Map task executor complete.");

        // After waiting for the map tasks to complete, if any of these
        // have thrown an exception, rethrow it now in the main thread context.
        for (MapTaskRunnable r : taskRunnables) {
          if (r.storedException != null) {
            throw new Exception(r.storedException);
          }
        }

        TaskAttemptID reduceId = new TaskAttemptID(new TaskID(jobId, false, 0), 0);
        try {
          if (numReduceTasks > 0) {
            ReduceTask reduce =
                new ReduceTask(systemJobFile.toString(), reduceId, 0, mapIds.size(), 1);
            reduce.setUser(UserGroupInformation.getCurrentUser().getShortUserName());
            JobConf localConf = new JobConf(job);
            localConf.set("mapreduce.jobtracker.address", "local");
            TaskRunner.setupChildMapredLocalDirs(reduce, localConf);
            // move map output to reduce input
            for (int i = 0; i < mapIds.size(); i++) {
              if (!this.isInterrupted()) {
                TaskAttemptID mapId = mapIds.get(i);
                Path mapOut = mapOutputFiles.get(mapId).getOutputFile();
                MapOutputFile localOutputFile = new MapOutputFile();
                localOutputFile.setConf(localConf);
                Path reduceIn =
                    localOutputFile.getInputFileForWrite(
                        mapId.getTaskID(), localFs.getFileStatus(mapOut).getLen());
                if (!localFs.mkdirs(reduceIn.getParent())) {
                  throw new IOException(
                      "Mkdirs failed to create " + reduceIn.getParent().toString());
                }
                if (!localFs.rename(mapOut, reduceIn)) {
                  throw new IOException("Couldn't rename " + mapOut);
                }
              } else {
                throw new InterruptedException();
              }
            }
            if (!this.isInterrupted()) {
              reduce.setJobFile(localJobFile.toString());
              localConf.setUser(reduce.getUser());
              reduce.localizeConfiguration(localConf);
              reduce.setConf(localConf);
              reduce_tasks += 1;
              myMetrics.launchReduce(reduce.getTaskID());
              reduce.run(localConf, this);
              myMetrics.completeReduce(reduce.getTaskID());
              reduce_tasks -= 1;
            } else {
              throw new InterruptedException();
            }
          }
        } finally {
          for (MapOutputFile output : mapOutputFiles.values()) {
            output.removeAll();
          }
        }
        // delete the temporary directory in output directory
        outputCommitter.commitJob(jContext);
        status.setCleanupProgress(1.0f);

        if (killed) {
          this.status.setRunState(JobStatus.KILLED);
        } else {
          this.status.setRunState(JobStatus.SUCCEEDED);
        }

        JobEndNotifier.localRunnerNotification(job, status);

      } catch (Throwable t) {
        try {
          outputCommitter.abortJob(jContext, org.apache.hadoop.mapreduce.JobStatus.State.FAILED);
        } catch (IOException ioe) {
          LOG.info("Error cleaning up job:" + id);
        }
        status.setCleanupProgress(1.0f);
        if (killed) {
          this.status.setRunState(JobStatus.KILLED);
        } else {
          this.status.setRunState(JobStatus.FAILED);
        }
        LOG.warn(id, t);

        JobEndNotifier.localRunnerNotification(job, status);

      } finally {
        try {
          fs.delete(systemJobFile.getParent(), true); // delete submit dir
          localFs.delete(localJobFile, true); // delete local copy
          // Cleanup distributed cache
          taskDistributedCacheManager.release();
          trackerDistributedCacheManager.purgeCache();
        } catch (IOException e) {
          LOG.warn("Error cleaning up " + id + ": " + e);
        }
      }
    }