/** * Registers an newly incoming runtime task with the task manager. * * @param id the ID of the task to register * @param jobConfiguration the job configuration that has been attached to the original job graph * @param environment the environment of the task to be registered * @return the task to be started or <code>null</code> if a task with the same ID was already * running */ private Task createAndRegisterTask( final ExecutionVertexID id, final Configuration jobConfiguration, final RuntimeEnvironment environment) throws InsufficientResourcesException, IOException { if (id == null) { throw new IllegalArgumentException("Argument id is null"); } if (environment == null) { throw new IllegalArgumentException("Argument environment is null"); } // Task creation and registration must be atomic Task task; synchronized (this) { final Task runningTask = this.runningTasks.get(id); boolean registerTask = true; if (runningTask == null) { task = new Task(id, environment, this); } else { if (runningTask instanceof Task) { // Task is already running return null; } else { // There is already a replay task running, we will simply restart it task = runningTask; registerTask = false; } } if (registerTask) { // Register the task with the byte buffered channel manager this.channelManager.register(task); boolean enableProfiling = false; if (this.profiler != null && jobConfiguration.getBoolean(ProfilingUtils.PROFILE_JOB_KEY, true)) { enableProfiling = true; } // Register environment, input, and output gates for profiling if (enableProfiling) { task.registerProfiler(this.profiler, jobConfiguration); } this.runningTasks.put(id, task); } } return task; }
/** * Unregisters a finished or aborted task. * * @param id the ID of the task to be unregistered */ private void unregisterTask(final ExecutionVertexID id) { // Task de-registration must be atomic synchronized (this) { final Task task = this.runningTasks.remove(id); if (task == null) { LOG.error("Cannot find task with ID " + id + " to unregister"); return; } // remove the local tmp file for unregistered tasks. for (Entry<String, DistributedCacheEntry> e : DistributedCache.readFileInfoFromConfig(task.getEnvironment().getJobConfiguration())) { this.fileCache.deleteTmpFile(e.getKey(), e.getValue(), task.getJobID()); } // Unregister task from the byte buffered channel manager this.channelManager.unregister(id, task); // Unregister task from profiling task.unregisterProfiler(this.profiler); // Unregister task from memory manager task.unregisterMemoryManager(this.memoryManager); // Unregister task from library cache manager try { LibraryCacheManager.unregister(task.getJobID()); } catch (IOException e) { if (LOG.isDebugEnabled()) { LOG.debug("Unregistering the job vertex ID " + id + " caused an IOException"); } } } }
@Override public List<TaskSubmissionResult> submitTasks(final List<TaskDeploymentDescriptor> tasks) throws IOException { final List<TaskSubmissionResult> submissionResultList = new SerializableArrayList<TaskSubmissionResult>(); final List<Task> tasksToStart = new ArrayList<Task>(); // Make sure all tasks are fully registered before they are started for (final TaskDeploymentDescriptor tdd : tasks) { final JobID jobID = tdd.getJobID(); final ExecutionVertexID vertexID = tdd.getVertexID(); RuntimeEnvironment re; // retrieve the registered cache files from job configuration and create the local tmp file. Map<String, FutureTask<Path>> cpTasks = new HashMap<String, FutureTask<Path>>(); for (Entry<String, DistributedCacheEntry> e : DistributedCache.readFileInfoFromConfig(tdd.getJobConfiguration())) { FutureTask<Path> cp = this.fileCache.createTmpFile(e.getKey(), e.getValue(), jobID); cpTasks.put(e.getKey(), cp); } try { re = new RuntimeEnvironment( tdd, this.memoryManager, this.ioManager, new TaskInputSplitProvider(jobID, vertexID, this.globalInputSplitProvider), this.accumulatorProtocolProxy, cpTasks); } catch (Throwable t) { final TaskSubmissionResult result = new TaskSubmissionResult(vertexID, AbstractTaskResult.ReturnCode.DEPLOYMENT_ERROR); result.setDescription(StringUtils.stringifyException(t)); LOG.error(result.getDescription(), t); submissionResultList.add(result); continue; } final Configuration jobConfiguration = tdd.getJobConfiguration(); // Register the task Task task; try { task = createAndRegisterTask(vertexID, jobConfiguration, re); } catch (InsufficientResourcesException e) { final TaskSubmissionResult result = new TaskSubmissionResult( vertexID, AbstractTaskResult.ReturnCode.INSUFFICIENT_RESOURCES); result.setDescription(e.getMessage()); LOG.error(result.getDescription(), e); submissionResultList.add(result); continue; } if (task == null) { final TaskSubmissionResult result = new TaskSubmissionResult(vertexID, AbstractTaskResult.ReturnCode.TASK_NOT_FOUND); result.setDescription( "Task " + re.getTaskNameWithIndex() + " (" + vertexID + ") was already running"); LOG.error(result.getDescription()); submissionResultList.add(result); continue; } submissionResultList.add( new TaskSubmissionResult(vertexID, AbstractTaskResult.ReturnCode.SUCCESS)); tasksToStart.add(task); } // Now start the tasks for (final Task task : tasksToStart) { task.startExecution(); } return submissionResultList; }