/** Returns the jobs whose current run state is {@link JobStatus#SUCCEEDED}. */
public Vector completedJobs() {
  Vector v = new Vector();
  for (Iterator it = jobs.values().iterator(); it.hasNext(); ) {
    JobInProgress jip = (JobInProgress) it.next();
    JobStatus status = jip.getStatus();
    if (status.getRunState() == JobStatus.SUCCEEDED) {
      v.add(jip);
    }
  }
  return v;
}
/** Returns the jobs whose current run state is {@link JobStatus#RUNNING}. */
public Vector runningJobs() {
  Vector v = new Vector();
  for (Iterator it = jobs.values().iterator(); it.hasNext(); ) {
    JobInProgress jip = (JobInProgress) it.next();
    JobStatus status = jip.getStatus();
    if (status.getRunState() == JobStatus.RUNNING) {
      v.add(jip);
    }
  }
  return v;
}
/**
 * Check the ACLs for a user performing the passed operation.
 *
 * <ul>
 *   <li>If ACLs are disabled, allow all users.
 *   <li>Otherwise, if the operation is not a job operation (e.g. submit-job-to-queue), then
 *       allow only (a) the cluster owner (who started the cluster), (b) cluster administrators
 *       and (c) members of the queue's submit-job ACL.
 *   <li>If the operation is a job operation, then allow only (a) the job owner, (b) the cluster
 *       owner (who started the cluster), (c) cluster administrators, (d) members of the queue
 *       admins ACL for the queue and (e) members of the job ACL for the job operation.
 * </ul>
 *
 * @param job the job on which the operation is requested
 * @param callerUGI the user who is requesting the operation
 * @param operation the operation for which authorization is needed
 * @throws AccessControlException if the caller is not authorized to perform the operation
 */
void checkAccess(JobInProgress job, UserGroupInformation callerUGI, Operation operation)
    throws AccessControlException {
  String queue = job.getProfile().getQueueName();
  String jobId = job.getJobID().toString();
  JobStatus jobStatus = job.getStatus();
  String jobOwner = jobStatus.getUsername();
  AccessControlList jobAcl = jobStatus.getJobACLs().get(operation.jobACLNeeded);

  checkAccess(jobId, callerUGI, queue, operation, jobOwner, jobAcl);
}
/**
 * Check the ACLs for a user performing the passed job operation.
 *
 * <ul>
 *   <li>If ACLs are disabled, allow all users.
 *   <li>Otherwise, allow only (a) the job owner, (b) the cluster owner (who started the
 *       cluster), (c) cluster administrators and (d) members of the job ACL for the job
 *       operation.
 * </ul>
 *
 * @param jobStatus the status of the job
 * @param callerUGI the user who is trying to perform the operation
 * @param queue the job queue name
 * @param operation the operation for which authorization is needed
 * @throws AccessControlException if the caller is not authorized to perform the operation
 */
void checkAccess(
    JobStatus jobStatus, UserGroupInformation callerUGI, String queue, Operation operation)
    throws AccessControlException {
  String jobId = jobStatus.getJobID().toString();
  String jobOwner = jobStatus.getUsername();
  AccessControlList jobAcl = jobStatus.getJobACLs().get(operation.jobACLNeeded);

  // If ACLs are enabled, check whether callerUGI is the job owner, a queue admin,
  // a cluster admin or part of the job ACL.
  checkAccess(jobId, callerUGI, queue, operation, jobOwner, jobAcl);
}
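// Both overloads above delegate the final decision to a six-argument checkAccess that is not
// shown in this excerpt. As a rough illustration of the rules spelled out in the javadoc, a
// minimal sketch of that decision could look like the following. The class and method names
// (JobAclCheckSketch, isAuthorized) and the aclsEnabled/adminAcl parameters are assumptions
// for illustration only; the real check also consults the queue ACLs, which are omitted here.
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.authorize.AccessControlList;

public final class JobAclCheckSketch {
  static boolean isAuthorized(
      boolean aclsEnabled,
      UserGroupInformation callerUGI,
      String clusterOwner,
      AccessControlList adminAcl,
      String jobOwner,
      AccessControlList jobAcl) {
    if (!aclsEnabled) {
      return true; // ACLs disabled: allow all users
    }
    String caller = callerUGI.getShortUserName();
    return caller.equals(clusterOwner) // cluster owner (who started the cluster)
        || adminAcl.isUserAllowed(callerUGI) // cluster administrators
        || caller.equals(jobOwner) // job owner
        || (jobAcl != null && jobAcl.isUserAllowed(callerUGI)); // members of the job ACL
  }
}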
/**
 * Converts a new-API {@link org.apache.hadoop.mapreduce.JobStatus} into an old-API
 * {@code JobStatus}, copying progress, state, priority, identity and timing fields.
 */
public static JobStatus downgrade(org.apache.hadoop.mapreduce.JobStatus stat) {
  JobStatus old =
      new JobStatus(
          JobID.downgrade(stat.getJobID()),
          stat.getSetupProgress(),
          stat.getMapProgress(),
          stat.getReduceProgress(),
          stat.getCleanupProgress(),
          stat.getState().getValue(),
          JobPriority.valueOf(stat.getPriority().name()),
          stat.getUsername(),
          stat.getJobName(),
          stat.getJobFile(),
          stat.getTrackingUrl());
  old.setStartTime(stat.getStartTime());
  old.setFinishTime(stat.getFinishTime());
  old.setSchedulingInfo(stat.getSchedulingInfo());
  old.setHistoryFile(stat.getHistoryFile());
  return old;
}
/**
 * Returns true if this job should keep waiting: either its booking time has not been reached
 * yet, or one of the jobs it depends on has a recorded status that is not SUCCEEDED.
 */
private boolean isWait(JobInProgress job) {
  long bookingTime = job.getJobConf().getLong(BOOKING_TIME, 0);
  String[] dependencyJobs = job.getJobConf().getStrings(BOOKING_DEPENDENCY_JOBID, null);

  // Wait if the configured booking time lies in the future.
  boolean bookingTimeFilter = bookingTime >= System.currentTimeMillis();

  // Wait if any dependency job has a recorded status that is not SUCCEEDED.
  boolean dependencyJobFilter = false;
  if (null != dependencyJobs) {
    for (String dependencyJob : dependencyJobs) {
      JobStatus dependencyJobStatus = (JobStatus) finishJobStatus.get(dependencyJob);
      if (null != dependencyJobStatus
          && dependencyJobStatus.getRunState() != JobStatus.SUCCEEDED) {
        dependencyJobFilter = true;
      }
    }
  }
  return bookingTimeFilter || dependencyJobFilter;
}
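// A minimal usage sketch, assuming BOOKING_TIME and BOOKING_DEPENDENCY_JOBID are the
// configuration-key constants read by isWait above (their concrete key strings are not shown
// in this excerpt). The method name and the job IDs are illustrative only.
static void bookJob(org.apache.hadoop.mapred.JobConf conf) {
  // Hold the job until one hour from now...
  conf.setLong(BOOKING_TIME, System.currentTimeMillis() + 60L * 60L * 1000L);
  // ...and additionally until both of these jobs have SUCCEEDED.
  conf.setStrings(BOOKING_DEPENDENCY_JOBID, "job_201101011200_0001", "job_201101011200_0002");
}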
public synchronized boolean statusUpdate(
    TaskAttemptID taskId, TaskStatus taskStatus, JvmContext context)
    throws IOException, InterruptedException {
  LOG.info(taskStatus.getStateString());
  int taskIndex = mapIds.indexOf(taskId);
  if (taskIndex >= 0) {
    // This is a map task: record its partial progress and counters, then report the
    // overall map progress as the average across all map tasks.
    float numTasks = (float) this.numMapTasks;
    partialMapProgress[taskIndex] = taskStatus.getProgress();
    mapCounters[taskIndex] = taskStatus.getCounters();
    float partialProgress = 0.0f;
    for (float f : partialMapProgress) {
      partialProgress += f;
    }
    status.setMapProgress(partialProgress / numTasks);
  } else {
    // This is the reduce task: its progress is reported directly as the job's reduce progress.
    reduceCounters = taskStatus.getCounters();
    status.setReduceProgress(taskStatus.getProgress());
  }
  // ignore phase
  return true;
}
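// A small worked example of the averaging above, using illustrative values: with four map
// tasks reporting progress 1.0, 1.0, 0.5 and 0.1, the job-level map progress becomes
// (1.0 + 1.0 + 0.5 + 0.1) / 4 = 0.65. Reduce progress, by contrast, is taken directly from
// the single reduce task's status.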
// for initTasks, update information from JobStory object
@Override
public synchronized void initTasks() throws IOException {
  boolean loggingEnabled = LOG.isDebugEnabled();
  if (loggingEnabled) {
    LOG.debug("(initTasks@SJIP) Starting Initialization for " + jobId);
  }
  numMapTasks = jobStory.getNumberMaps();
  numReduceTasks = jobStory.getNumberReduces();

  JobHistory.JobInfo.logSubmitted(
      getJobID(), conf, jobFile.toString(), this.startTime, hasRestarted());
  if (loggingEnabled) {
    LOG.debug("(initTasks@SJIP) Logged to job history for " + jobId);
  }

  // checkTaskLimits();
  if (loggingEnabled) {
    LOG.debug("(initTasks@SJIP) Checked task limits for " + jobId);
  }

  final String jobFile = "default";
  splits = getRawSplits(jobStory.getInputSplits());
  if (loggingEnabled) {
    LOG.debug(
        "(initTasks@SJIP) Created splits for job = "
            + jobId
            + " number of splits = "
            + splits.length);
  }

  // createMapTasks(jobFile, splits);
  numMapTasks = splits.length;
  maps = new TaskInProgress[numMapTasks];
  for (int i = 0; i < numMapTasks; ++i) {
    inputLength += splits[i].getDataLength();
    maps[i] = new TaskInProgress(jobId, jobFile, splits[i], conf, this, i, numSlotsPerMap);
  }
  if (numMapTasks > 0) {
    nonRunningMapCache = createCache(splits, maxLevel);
    if (loggingEnabled) {
      LOG.debug(
          "initTasks:numMaps="
              + numMapTasks
              + " Size of nonRunningMapCache="
              + nonRunningMapCache.size()
              + " for "
              + jobId);
    }
  }

  // set the launch time
  this.launchTime = JobTracker.getClock().getTime();

  // createReduceTasks(jobFile);
  //
  // Create reduce tasks
  //
  this.reduces = new TaskInProgress[numReduceTasks];
  for (int i = 0; i < numReduceTasks; i++) {
    reduces[i] =
        new TaskInProgress(jobId, jobFile, numMapTasks, i, conf, this, numSlotsPerReduce);
    nonRunningReduces.add(reduces[i]);
  }

  // Calculate the minimum number of maps to be complete before
  // we should start scheduling reduces
  completedMapsForReduceSlowstart =
      (int)
          Math.ceil(
              (conf.getFloat(
                      "mapred.reduce.slowstart." + "completed.maps",
                      DEFAULT_COMPLETED_MAPS_PERCENT_FOR_REDUCE_SLOWSTART)
                  * numMapTasks));

  tasksInited.set(true);
  if (loggingEnabled) {
    LOG.debug(
        "Initializing job, nowstatus = "
            + JobStatus.getJobRunState(getStatus().getRunState()));
  }
  setupComplete();
  if (loggingEnabled) {
    LOG.debug(
        "Initializing job, inited-status = "
            + JobStatus.getJobRunState(getStatus().getRunState()));
  }
}
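// A quick worked example of the slow-start threshold computed above, assuming the usual
// default of 0.05 for mapred.reduce.slowstart.completed.maps: with 200 map tasks, reduces
// become schedulable once ceil(0.05 * 200) = 10 maps have completed.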
/** Indicate that one of the taskids in this TaskInProgress has failed. */
public void incompleteSubTask(TaskAttemptID taskid, JobStatus jobStatus) {
  //
  // Note the failure and its location
  //
  TaskStatus status = taskStatuses.get(taskid);
  String trackerName;
  String trackerHostName = null;
  TaskStatus.State taskState = TaskStatus.State.FAILED;
  if (status != null) {
    trackerName = status.getTaskTracker();
    trackerHostName = JobInProgressTraits.convertTrackerNameToHostName(trackerName);

    // Check if the user manually KILLED/FAILED this task-attempt...
    Boolean shouldFail = tasksToKill.remove(taskid);
    if (shouldFail != null) {
      if (status.getRunState() == TaskStatus.State.FAILED
          || status.getRunState() == TaskStatus.State.KILLED) {
        taskState = (shouldFail) ? TaskStatus.State.FAILED : TaskStatus.State.KILLED;
      } else {
        taskState =
            (shouldFail) ? TaskStatus.State.FAILED_UNCLEAN : TaskStatus.State.KILLED_UNCLEAN;
      }
      status.setRunState(taskState);
      addDiagnosticInfo(taskid, "Task has been " + taskState + " by the user");
    }

    taskState = status.getRunState();
    if (taskState != TaskStatus.State.FAILED
        && taskState != TaskStatus.State.KILLED
        && taskState != TaskStatus.State.FAILED_UNCLEAN
        && taskState != TaskStatus.State.KILLED_UNCLEAN) {
      LOG.info(
          "Task '"
              + taskid
              + "' running on '"
              + trackerName
              + "' in state: '"
              + taskState
              + "' being failed!");
      status.setRunState(TaskStatus.State.FAILED);
      taskState = TaskStatus.State.FAILED;
    }

    // tasktracker went down and failed time was not reported.
    if (0 == status.getFinishTime()) {
      status.setFinishTime(JobTracker.getClock().getTime());
    }
  }

  this.activeTasks.remove(taskid);

  // Since we do not fail completed reduces (whose outputs go to hdfs), we
  // should note this failure only for completed maps, and only if this taskid
  // completed the map. However, if the job is done, there is no need to
  // manipulate completed maps.
  if (this.isMapTask()
      && !jobSetup
      && !jobCleanup
      && isComplete(taskid)
      && jobStatus.getRunState() != JobStatus.SUCCEEDED) {
    this.completes--;

    // Reset the successfulTaskId since we don't have a SUCCESSFUL task now
    resetSuccessfulTaskid();
  }

  // Note that there can be failures of tasks that are hosted on a machine
  // that has not yet registered with the restarted jobtracker;
  // recalculate the counts only if it is a genuine failure.
  if (tasks.contains(taskid)) {
    if (taskState == TaskStatus.State.FAILED) {
      numTaskFailures++;
      machinesWhereFailed.add(trackerHostName);
      if (maxSkipRecords > 0) {
        // skipping feature enabled
        LOG.debug("TaskInProgress adding " + status.getNextRecordRange());
        failedRanges.add(status.getNextRecordRange());
        skipping = startSkipping();
      }
    } else if (taskState == TaskStatus.State.KILLED) {
      numKilledTasks++;
    }
  }

  if (numTaskFailures >= maxTaskAttempts) {
    LOG.info("TaskInProgress " + getTIPId() + " has failed " + numTaskFailures + " times.");
    kill();
  }
}
@SuppressWarnings("unchecked")
@Override
public void run() {
  JobID jobId = profile.getJobID();
  JobContext jContext = new JobContextImpl(conf, jobId);
  org.apache.hadoop.mapreduce.OutputCommitter outputCommitter = null;
  try {
    outputCommitter = createOutputCommitter(conf.getUseNewMapper(), jobId, conf);
  } catch (Exception e) {
    LOG.info("Failed to createOutputCommitter", e);
    return;
  }

  try {
    TaskSplitMetaInfo[] taskSplitMetaInfos =
        SplitMetaInfoReader.readSplitMetaInfo(jobId, localFs, conf, systemJobDir);

    int numReduceTasks = job.getNumReduceTasks();
    if (numReduceTasks > 1 || numReduceTasks < 0) {
      // we only allow 0 or 1 reducer in local mode
      numReduceTasks = 1;
      job.setNumReduceTasks(1);
    }
    outputCommitter.setupJob(jContext);
    status.setSetupProgress(1.0f);

    Map<TaskAttemptID, MapOutputFile> mapOutputFiles =
        Collections.synchronizedMap(new HashMap<TaskAttemptID, MapOutputFile>());

    List<MapTaskRunnable> taskRunnables =
        getMapTaskRunnables(taskSplitMetaInfos, jobId, mapOutputFiles);
    ExecutorService mapService = createMapExecutor(taskRunnables.size());

    // Start populating the executor with work units.
    // They may begin running immediately (in other threads).
    for (Runnable r : taskRunnables) {
      mapService.submit(r);
    }

    try {
      mapService.shutdown(); // Instructs queue to drain.

      // Wait for tasks to finish; do not use a time-based timeout.
      // (See http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6179024)
      LOG.info("Waiting for map tasks");
      mapService.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
    } catch (InterruptedException ie) {
      // Cancel all threads.
      mapService.shutdownNow();
      throw ie;
    }

    LOG.info("Map task executor complete.");

    // After waiting for the map tasks to complete, if any of these
    // have thrown an exception, rethrow it now in the main thread context.
    for (MapTaskRunnable r : taskRunnables) {
      if (r.storedException != null) {
        throw new Exception(r.storedException);
      }
    }

    TaskAttemptID reduceId = new TaskAttemptID(new TaskID(jobId, false, 0), 0);
    try {
      if (numReduceTasks > 0) {
        ReduceTask reduce =
            new ReduceTask(systemJobFile.toString(), reduceId, 0, mapIds.size(), 1);
        reduce.setUser(UserGroupInformation.getCurrentUser().getShortUserName());
        JobConf localConf = new JobConf(job);
        localConf.set("mapreduce.jobtracker.address", "local");
        TaskRunner.setupChildMapredLocalDirs(reduce, localConf);

        // move map output to reduce input
        for (int i = 0; i < mapIds.size(); i++) {
          if (!this.isInterrupted()) {
            TaskAttemptID mapId = mapIds.get(i);
            Path mapOut = mapOutputFiles.get(mapId).getOutputFile();
            MapOutputFile localOutputFile = new MapOutputFile();
            localOutputFile.setConf(localConf);
            Path reduceIn =
                localOutputFile.getInputFileForWrite(
                    mapId.getTaskID(), localFs.getFileStatus(mapOut).getLen());
            if (!localFs.mkdirs(reduceIn.getParent())) {
              throw new IOException(
                  "Mkdirs failed to create " + reduceIn.getParent().toString());
            }
            if (!localFs.rename(mapOut, reduceIn)) {
              throw new IOException("Couldn't rename " + mapOut);
            }
          } else {
            throw new InterruptedException();
          }
        }

        if (!this.isInterrupted()) {
          reduce.setJobFile(localJobFile.toString());
          localConf.setUser(reduce.getUser());
          reduce.localizeConfiguration(localConf);
          reduce.setConf(localConf);
          reduce_tasks += 1;
          myMetrics.launchReduce(reduce.getTaskID());
          reduce.run(localConf, this);
          myMetrics.completeReduce(reduce.getTaskID());
          reduce_tasks -= 1;
        } else {
          throw new InterruptedException();
        }
      }
    } finally {
      for (MapOutputFile output : mapOutputFiles.values()) {
        output.removeAll();
      }
    }

    // delete the temporary directory in output directory
    outputCommitter.commitJob(jContext);
    status.setCleanupProgress(1.0f);

    if (killed) {
      this.status.setRunState(JobStatus.KILLED);
    } else {
      this.status.setRunState(JobStatus.SUCCEEDED);
    }

    JobEndNotifier.localRunnerNotification(job, status);
  } catch (Throwable t) {
    try {
      outputCommitter.abortJob(jContext, org.apache.hadoop.mapreduce.JobStatus.State.FAILED);
    } catch (IOException ioe) {
      LOG.info("Error cleaning up job:" + id);
    }
    status.setCleanupProgress(1.0f);
    if (killed) {
      this.status.setRunState(JobStatus.KILLED);
    } else {
      this.status.setRunState(JobStatus.FAILED);
    }
    LOG.warn(id, t);

    JobEndNotifier.localRunnerNotification(job, status);
  } finally {
    try {
      fs.delete(systemJobFile.getParent(), true); // delete submit dir
      localFs.delete(localJobFile, true); // delete local copy
      // Cleanup distributed cache
      taskDistributedCacheManager.release();
      trackerDistributedCacheManager.purgeCache();
    } catch (IOException e) {
      LOG.warn("Error cleaning up " + id + ": " + e);
    }
  }
}
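// The map phase above relies on a common ExecutorService pattern: submit every runnable,
// call shutdown() so the queue drains, wait (effectively) forever with awaitTermination,
// then surface any exception a worker stashed away. A minimal self-contained sketch of that
// pattern follows; the names StoredExceptionRunnable and runAll are made up for illustration
// and are not part of LocalJobRunner.
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

class StoredExceptionRunnable implements Runnable {
  volatile Throwable storedException; // examined by the driver after the pool drains
  private final Runnable work;

  StoredExceptionRunnable(Runnable work) {
    this.work = work;
  }

  @Override
  public void run() {
    try {
      work.run();
    } catch (Throwable t) {
      storedException = t; // never let the exception die silently inside the pool
    }
  }
}

class ExecutorDrainSketch {
  static void runAll(List<StoredExceptionRunnable> tasks, int threads) throws Exception {
    ExecutorService pool = Executors.newFixedThreadPool(threads);
    for (Runnable r : tasks) {
      pool.submit(r);
    }
    try {
      pool.shutdown(); // stop accepting work; let queued tasks finish
      pool.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS); // no time-based timeout
    } catch (InterruptedException ie) {
      pool.shutdownNow(); // cancel everything if the driver itself is interrupted
      throw ie;
    }
    // Rethrow the first stored failure in the caller's thread, as the job runner does.
    for (StoredExceptionRunnable r : tasks) {
      if (r.storedException != null) {
        throw new Exception(r.storedException);
      }
    }
  }
}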