/**
 * Plan for some additional workers to be launched.
 *
 * @param numWorkers The number of workers to allocate.
 */
@Override
protected void requestNewWorkers(int numWorkers) {
    try {
        List<TaskMonitor.TaskGoalStateUpdated> toMonitor = new ArrayList<>(numWorkers);
        List<LaunchableTask> toLaunch = new ArrayList<>(numWorkers);

        // generate new workers into persistent state and launch associated actors
        for (int i = 0; i < numWorkers; i++) {
            MesosWorkerStore.Worker worker =
                    MesosWorkerStore.Worker.newWorker(workerStore.newTaskID());
            workerStore.putWorker(worker);
            workersInNew.put(extractResourceID(worker.taskID()), worker);

            LaunchableMesosWorker launchable = createLaunchableMesosWorker(worker.taskID());

            LOG.info(
                    "Scheduling Mesos task {} with ({} MB, {} cpus).",
                    launchable.taskID().getValue(),
                    launchable.taskRequest().getMemory(),
                    launchable.taskRequest().getCPUs());

            toMonitor.add(new TaskMonitor.TaskGoalStateUpdated(extractGoalState(worker)));
            toLaunch.add(launchable);
        }

        // tell the task router about the new plans
        for (TaskMonitor.TaskGoalStateUpdated update : toMonitor) {
            taskRouter.tell(update, self());
        }

        // tell the launch coordinator to launch the new tasks
        if (toLaunch.size() >= 1) {
            launchCoordinator.tell(new LaunchCoordinator.Launch(toLaunch), self());
        }
    } catch (Exception ex) {
        fatalError("unable to request new workers", ex);
    }
}
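// For reference: the worker maps above (workersInNew, workersInLaunch, workersBeingReturned)
// are keyed by ResourceID. A minimal sketch of the extractResourceID helper, assuming a 1:1
// mapping from the Mesos task ID string to a Flink ResourceID (the actual helper is defined
// elsewhere in this class and is not shown in this excerpt):
//
//     static ResourceID extractResourceID(Protos.TaskID taskId) {
//         return new ResourceID(taskId.getValue());
//     }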
/** Called when connected to Mesos as a new framework. */
private void registered(Registered message) {
    connectionMonitor.tell(message, self());

    try {
        workerStore.setFrameworkID(Option.apply(message.frameworkId()));
    } catch (Exception ex) {
        fatalError("unable to store the assigned framework ID", ex);
        return;
    }

    launchCoordinator.tell(message, self());
    reconciliationCoordinator.tell(message, self());
    taskRouter.tell(message, self());
}
@Override
protected void initialize() throws Exception {
    LOG.info("Initializing Mesos resource master");

    workerStore.start();

    // create the scheduler driver to communicate with Mesos
    schedulerCallbackHandler = new SchedulerProxy(self());

    // register with Mesos
    FrameworkInfo.Builder frameworkInfo =
            mesosConfig.frameworkInfo().clone().setCheckpoint(true);

    Option<Protos.FrameworkID> frameworkID = workerStore.getFrameworkID();
    if (frameworkID.isEmpty()) {
        LOG.info("Registering as new framework.");
    } else {
        LOG.info(
                "Recovery scenario: re-registering using framework ID {}.",
                frameworkID.get().getValue());
        frameworkInfo.setId(frameworkID.get());
    }

    MesosConfiguration initializedMesosConfig = mesosConfig.withFrameworkInfo(frameworkInfo);
    MesosConfiguration.logMesosConfig(LOG, initializedMesosConfig);
    schedulerDriver = initializedMesosConfig.createDriver(schedulerCallbackHandler, false);

    // create supporting actors
    connectionMonitor = createConnectionMonitor();
    launchCoordinator = createLaunchCoordinator();
    reconciliationCoordinator = createReconciliationCoordinator();
    taskRouter = createTaskRouter();

    recoverWorkers();

    connectionMonitor.tell(new ConnectionMonitor.Start(), self());
    schedulerDriver.start();
}
/**
 * Called to shut down the cluster (not a failover situation).
 *
 * @param finalStatus The application status to report.
 * @param optionalDiagnostics An optional diagnostics message.
 */
@Override
protected void shutdownApplication(ApplicationStatus finalStatus, String optionalDiagnostics) {
    LOG.info("Shutting down and unregistering as a Mesos framework.");

    try {
        // unregister the framework, which implicitly removes all tasks.
        schedulerDriver.stop(false);
    } catch (Exception ex) {
        LOG.warn("unable to unregister the framework", ex);
    }

    try {
        workerStore.stop(true);
    } catch (Exception ex) {
        LOG.warn("unable to stop the worker state store", ex);
    }

    context().stop(self());
}
/** Plan for the removal of the given worker. */
private void releaseWorker(MesosWorkerStore.Worker worker) {
    try {
        LOG.info("Releasing worker {}", worker.taskID());

        // update persistent state of worker to Released
        worker = worker.releaseWorker();
        workerStore.putWorker(worker);
        workersBeingReturned.put(extractResourceID(worker.taskID()), worker);

        taskRouter.tell(new TaskMonitor.TaskGoalStateUpdated(extractGoalState(worker)), self());

        if (worker.hostname().isDefined()) {
            // tell the launch coordinator that the task is being unassigned from the host,
            // for planning purposes
            launchCoordinator.tell(
                    new LaunchCoordinator.Unassign(worker.taskID(), worker.hostname().get()),
                    self());
        }
    } catch (Exception ex) {
        fatalError("unable to release worker", ex);
    }
}
/** Recover framework/worker information persisted by a prior incarnation of the RM. */
private void recoverWorkers() throws Exception {
    // if this application master starts as part of an ApplicationMaster/JobManager recovery,
    // then some worker tasks are most likely still alive and we can re-obtain them
    final List<MesosWorkerStore.Worker> tasksFromPreviousAttempts = workerStore.recoverWorkers();

    if (!tasksFromPreviousAttempts.isEmpty()) {
        LOG.info(
                "Retrieved {} TaskManagers from previous attempt",
                tasksFromPreviousAttempts.size());

        List<Tuple2<TaskRequest, String>> toAssign =
                new ArrayList<>(tasksFromPreviousAttempts.size());
        List<LaunchableTask> toLaunch = new ArrayList<>(tasksFromPreviousAttempts.size());

        for (final MesosWorkerStore.Worker worker : tasksFromPreviousAttempts) {
            LaunchableMesosWorker launchable = createLaunchableMesosWorker(worker.taskID());

            switch (worker.state()) {
                case New:
                    workersInNew.put(extractResourceID(worker.taskID()), worker);
                    toLaunch.add(launchable);
                    break;
                case Launched:
                    workersInLaunch.put(extractResourceID(worker.taskID()), worker);
                    toAssign.add(new Tuple2<>(launchable.taskRequest(), worker.hostname().get()));
                    break;
                case Released:
                    workersBeingReturned.put(extractResourceID(worker.taskID()), worker);
                    break;
            }
            taskRouter.tell(new TaskMonitor.TaskGoalStateUpdated(extractGoalState(worker)), self());
        }

        // tell the launch coordinator about prior assignments
        if (toAssign.size() >= 1) {
            launchCoordinator.tell(new LaunchCoordinator.Assign(toAssign), self());
        }
        // tell the launch coordinator to launch any new tasks
        if (toLaunch.size() >= 1) {
            launchCoordinator.tell(new LaunchCoordinator.Launch(toLaunch), self());
        }
    }
}
/**
 * Accept offers as advised by the launch coordinator.
 *
 * <p>Acceptance is routed through the RM to update the persistent state before forwarding the
 * message to Mesos.
 */
private void acceptOffers(AcceptOffers msg) {
    try {
        List<TaskMonitor.TaskGoalStateUpdated> toMonitor =
                new ArrayList<>(msg.operations().size());

        // transition the persistent state of some tasks to Launched
        for (Protos.Offer.Operation op : msg.operations()) {
            if (op.getType() != Protos.Offer.Operation.Type.LAUNCH) {
                continue;
            }
            for (Protos.TaskInfo info : op.getLaunch().getTaskInfosList()) {
                // any task launched by the coordinator must have been planned in workersInNew
                MesosWorkerStore.Worker worker =
                        workersInNew.remove(extractResourceID(info.getTaskId()));
                assert (worker != null);

                worker = worker.launchWorker(info.getSlaveId(), msg.hostname());
                workerStore.putWorker(worker);
                workersInLaunch.put(extractResourceID(worker.taskID()), worker);

                LOG.info(
                        "Launching Mesos task {} on host {}.",
                        worker.taskID().getValue(),
                        worker.hostname().get());

                toMonitor.add(new TaskMonitor.TaskGoalStateUpdated(extractGoalState(worker)));
            }
        }

        // tell the task router about the new plans
        for (TaskMonitor.TaskGoalStateUpdated update : toMonitor) {
            taskRouter.tell(update, self());
        }

        // send the acceptance message to Mesos
        schedulerDriver.acceptOffers(msg.offerIds(), msg.operations(), msg.filters());
    } catch (Exception ex) {
        fatalError("unable to accept offers", ex);
    }
}
/** Invoked when a Mesos task reaches a terminal status. */
private void taskTerminated(Protos.TaskID taskID, Protos.TaskStatus status) {
    // this callback occurs for failed containers and for released containers alike
    final ResourceID id = extractResourceID(taskID);

    boolean existed;
    try {
        existed = workerStore.removeWorker(taskID);
    } catch (Exception ex) {
        fatalError("unable to remove worker", ex);
        return;
    }

    if (!existed) {
        LOG.info("Received a termination notice for an unrecognized worker: {}", id);
        return;
    }

    // check if this is a failed task or a released task
    if (workersBeingReturned.remove(id) != null) {
        // regular finished worker that we released
        LOG.info("Worker {} finished successfully with diagnostics: {}", id, status.getMessage());
    } else {
        // failed worker, either at startup, or running
        final MesosWorkerStore.Worker launched = workersInLaunch.remove(id);
        if (launched != null) {
            LOG.info(
                    "Mesos task {} failed, with a TaskManager in launch or registration. "
                            + "State: {} Reason: {} ({})",
                    id,
                    status.getState(),
                    status.getReason(),
                    status.getMessage());
            // we will trigger re-acquiring new workers at the end
        } else {
            // failed registered worker
            LOG.info(
                    "Mesos task {} failed, with a registered TaskManager. "
                            + "State: {} Reason: {} ({})",
                    id,
                    status.getState(),
                    status.getReason(),
                    status.getMessage());

            // notify the generic logic, which notifies the JobManager, etc.
            notifyWorkerFailed(id, "Mesos task " + id + " failed. State: " + status.getState());
        }

        // general failure logging
        failedTasksSoFar++;

        String diagMessage =
                String.format(
                        "Diagnostics for task %s in state %s : reason=%s message=%s",
                        id, status.getState(), status.getReason(), status.getMessage());
        sendInfoMessage(diagMessage);

        LOG.info(diagMessage);
        LOG.info("Total number of failed tasks so far: {}", failedTasksSoFar);

        // maxFailedTasks == -1 means an unlimited number of task failures is tolerated.
        if (maxFailedTasks >= 0 && failedTasksSoFar > maxFailedTasks) {
            String msg =
                    "Stopping Mesos session because the number of failed tasks ("
                            + failedTasksSoFar
                            + ") exceeded the maximum failed tasks ("
                            + maxFailedTasks
                            + "). This number is controlled by the '"
                            + ConfigConstants.MESOS_MAX_FAILED_TASKS
                            + "' configuration setting. "
                            + "By default it is the number of requested tasks.";

            LOG.error(msg);
            self().tell(
                    decorateMessage(new StopCluster(ApplicationStatus.FAILED, msg)),
                    ActorRef.noSender());

            // no need to do anything else
            return;
        }
    }

    // in case failed containers were among the finished containers, make
    // sure we re-examine and request new ones
    triggerCheckWorkers();
}
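// For reference: maxFailedTasks is expected to be read from the Flink configuration when the
// resource master is created. A plausible wiring (an assumption; the field names flinkConfig
// and numInitialTaskManagers are illustrative and the exact code lives in the constructor or
// factory method, not in this excerpt) would be:
//
//     // -1 disables the limit; otherwise default to the number of initially requested tasks
//     int maxFailedTasks =
//             flinkConfig.getInteger(ConfigConstants.MESOS_MAX_FAILED_TASKS, numInitialTaskManagers);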