/** Recover framework/worker information persisted by a prior incarnation of the RM. */ private void recoverWorkers() throws Exception { // if this application master starts as part of an ApplicationMaster/JobManager recovery, // then some worker tasks are most likely still alive and we can re-obtain them final List<MesosWorkerStore.Worker> tasksFromPreviousAttempts = workerStore.recoverWorkers(); if (!tasksFromPreviousAttempts.isEmpty()) { LOG.info("Retrieved {} TaskManagers from previous attempt", tasksFromPreviousAttempts.size()); List<Tuple2<TaskRequest, String>> toAssign = new ArrayList<>(tasksFromPreviousAttempts.size()); List<LaunchableTask> toLaunch = new ArrayList<>(tasksFromPreviousAttempts.size()); for (final MesosWorkerStore.Worker worker : tasksFromPreviousAttempts) { LaunchableMesosWorker launchable = createLaunchableMesosWorker(worker.taskID()); switch (worker.state()) { case New: workersInNew.put(extractResourceID(worker.taskID()), worker); toLaunch.add(launchable); break; case Launched: workersInLaunch.put(extractResourceID(worker.taskID()), worker); toAssign.add(new Tuple2<>(launchable.taskRequest(), worker.hostname().get())); break; case Released: workersBeingReturned.put(extractResourceID(worker.taskID()), worker); break; } taskRouter.tell(new TaskMonitor.TaskGoalStateUpdated(extractGoalState(worker)), self()); } // tell the launch coordinator about prior assignments if (toAssign.size() >= 1) { launchCoordinator.tell(new LaunchCoordinator.Assign(toAssign), self()); } // tell the launch coordinator to launch any new tasks if (toLaunch.size() >= 1) { launchCoordinator.tell(new LaunchCoordinator.Launch(toLaunch), self()); } } }
/** * Plan for some additional workers to be launched. * * @param numWorkers The number of workers to allocate. */ @Override protected void requestNewWorkers(int numWorkers) { try { List<TaskMonitor.TaskGoalStateUpdated> toMonitor = new ArrayList<>(numWorkers); List<LaunchableTask> toLaunch = new ArrayList<>(numWorkers); // generate new workers into persistent state and launch associated actors for (int i = 0; i < numWorkers; i++) { MesosWorkerStore.Worker worker = MesosWorkerStore.Worker.newWorker(workerStore.newTaskID()); workerStore.putWorker(worker); workersInNew.put(extractResourceID(worker.taskID()), worker); LaunchableMesosWorker launchable = createLaunchableMesosWorker(worker.taskID()); LOG.info( "Scheduling Mesos task {} with ({} MB, {} cpus).", launchable.taskID().getValue(), launchable.taskRequest().getMemory(), launchable.taskRequest().getCPUs()); toMonitor.add(new TaskMonitor.TaskGoalStateUpdated(extractGoalState(worker))); toLaunch.add(launchable); } // tell the task router about the new plans for (TaskMonitor.TaskGoalStateUpdated update : toMonitor) { taskRouter.tell(update, self()); } // tell the launch coordinator to launch the new tasks if (toLaunch.size() >= 1) { launchCoordinator.tell(new LaunchCoordinator.Launch(toLaunch), self()); } } catch (Exception ex) { fatalError("unable to request new workers", ex); } }