/** * Plan for some additional workers to be launched. * * @param numWorkers The number of workers to allocate. */ @Override protected void requestNewWorkers(int numWorkers) { try { List<TaskMonitor.TaskGoalStateUpdated> toMonitor = new ArrayList<>(numWorkers); List<LaunchableTask> toLaunch = new ArrayList<>(numWorkers); // generate new workers into persistent state and launch associated actors for (int i = 0; i < numWorkers; i++) { MesosWorkerStore.Worker worker = MesosWorkerStore.Worker.newWorker(workerStore.newTaskID()); workerStore.putWorker(worker); workersInNew.put(extractResourceID(worker.taskID()), worker); LaunchableMesosWorker launchable = createLaunchableMesosWorker(worker.taskID()); LOG.info( "Scheduling Mesos task {} with ({} MB, {} cpus).", launchable.taskID().getValue(), launchable.taskRequest().getMemory(), launchable.taskRequest().getCPUs()); toMonitor.add(new TaskMonitor.TaskGoalStateUpdated(extractGoalState(worker))); toLaunch.add(launchable); } // tell the task router about the new plans for (TaskMonitor.TaskGoalStateUpdated update : toMonitor) { taskRouter.tell(update, self()); } // tell the launch coordinator to launch the new tasks if (toLaunch.size() >= 1) { launchCoordinator.tell(new LaunchCoordinator.Launch(toLaunch), self()); } } catch (Exception ex) { fatalError("unable to request new workers", ex); } }
/** * Accept offers as advised by the launch coordinator. * * <p>Acceptance is routed through the RM to update the persistent state before forwarding the * message to Mesos. */ private void acceptOffers(AcceptOffers msg) { try { List<TaskMonitor.TaskGoalStateUpdated> toMonitor = new ArrayList<>(msg.operations().size()); // transition the persistent state of some tasks to Launched for (Protos.Offer.Operation op : msg.operations()) { if (op.getType() != Protos.Offer.Operation.Type.LAUNCH) { continue; } for (Protos.TaskInfo info : op.getLaunch().getTaskInfosList()) { MesosWorkerStore.Worker worker = workersInNew.remove(extractResourceID(info.getTaskId())); assert (worker != null); worker = worker.launchWorker(info.getSlaveId(), msg.hostname()); workerStore.putWorker(worker); workersInLaunch.put(extractResourceID(worker.taskID()), worker); LOG.info( "Launching Mesos task {} on host {}.", worker.taskID().getValue(), worker.hostname().get()); toMonitor.add(new TaskMonitor.TaskGoalStateUpdated(extractGoalState(worker))); } } // tell the task router about the new plans for (TaskMonitor.TaskGoalStateUpdated update : toMonitor) { taskRouter.tell(update, self()); } // send the acceptance message to Mesos schedulerDriver.acceptOffers(msg.offerIds(), msg.operations(), msg.filters()); } catch (Exception ex) { fatalError("unable to accept offers", ex); } }
/**
 * Derives the Mesos task goal state that corresponds to a worker's persisted state.
 *
 * @param worker the persistent worker information.
 * @return goal state information for the {@link TaskMonitor}.
 * @throws IllegalArgumentException if the worker's state is not one of New, Launched or
 *     Released.
 */
static TaskMonitor.TaskGoalState extractGoalState(MesosWorkerStore.Worker worker) {
    final TaskMonitor.TaskGoalState goalState;
    switch (worker.state()) {
        case New:
            // a new worker has no slave yet, so only the task ID is carried over
            goalState = new TaskMonitor.New(worker.taskID());
            break;
        case Launched:
            goalState = new TaskMonitor.Launched(worker.taskID(), worker.slaveID().get());
            break;
        case Released:
            goalState = new TaskMonitor.Released(worker.taskID(), worker.slaveID().get());
            break;
        default:
            throw new IllegalArgumentException("unsupported worker state");
    }
    return goalState;
}
/** Recover framework/worker information persisted by a prior incarnation of the RM. */ private void recoverWorkers() throws Exception { // if this application master starts as part of an ApplicationMaster/JobManager recovery, // then some worker tasks are most likely still alive and we can re-obtain them final List<MesosWorkerStore.Worker> tasksFromPreviousAttempts = workerStore.recoverWorkers(); if (!tasksFromPreviousAttempts.isEmpty()) { LOG.info("Retrieved {} TaskManagers from previous attempt", tasksFromPreviousAttempts.size()); List<Tuple2<TaskRequest, String>> toAssign = new ArrayList<>(tasksFromPreviousAttempts.size()); List<LaunchableTask> toLaunch = new ArrayList<>(tasksFromPreviousAttempts.size()); for (final MesosWorkerStore.Worker worker : tasksFromPreviousAttempts) { LaunchableMesosWorker launchable = createLaunchableMesosWorker(worker.taskID()); switch (worker.state()) { case New: workersInNew.put(extractResourceID(worker.taskID()), worker); toLaunch.add(launchable); break; case Launched: workersInLaunch.put(extractResourceID(worker.taskID()), worker); toAssign.add(new Tuple2<>(launchable.taskRequest(), worker.hostname().get())); break; case Released: workersBeingReturned.put(extractResourceID(worker.taskID()), worker); break; } taskRouter.tell(new TaskMonitor.TaskGoalStateUpdated(extractGoalState(worker)), self()); } // tell the launch coordinator about prior assignments if (toAssign.size() >= 1) { launchCoordinator.tell(new LaunchCoordinator.Assign(toAssign), self()); } // tell the launch coordinator to launch any new tasks if (toLaunch.size() >= 1) { launchCoordinator.tell(new LaunchCoordinator.Launch(toLaunch), self()); } } }
/** Plan for the removal of the given worker. */ private void releaseWorker(MesosWorkerStore.Worker worker) { try { LOG.info("Releasing worker {}", worker.taskID()); // update persistent state of worker to Released worker = worker.releaseWorker(); workerStore.putWorker(worker); workersBeingReturned.put(extractResourceID(worker.taskID()), worker); taskRouter.tell(new TaskMonitor.TaskGoalStateUpdated(extractGoalState(worker)), self()); if (worker.hostname().isDefined()) { // tell the launch coordinator that the task is being unassigned from the host, for planning // purposes launchCoordinator.tell( new LaunchCoordinator.Unassign(worker.taskID(), worker.hostname().get()), self()); } } catch (Exception ex) { fatalError("unable to release worker", ex); } }