Exemplo n.º 1
0
  /**
   * Used to update the existing deployment of a job: the task stored for the job on {@code host}
   * is overwritten with a task built from the job and the goal carried by {@code deployment}.
   *
   * @param host the host whose task entry is rewritten
   * @param deployment supplies the job id to look up and the goal to store
   * @throws HostNotFoundException declared for the host check done by {@code assertHostExists}
   * @throws JobNotDeployedException if no job is found for the deployment's job id; also declared
   *     for the task check done by {@code assertTaskExists}
   * @throws HeliosRuntimeException if serializing or writing the task fails for any reason
   */
  @Override
  public void updateDeployment(final String host, final Deployment deployment)
      throws HostNotFoundException, JobNotDeployedException {
    log.info("updating deployment {}: {}", deployment, host);

    final ZooKeeperClient zk = provider.get("updateDeployment");
    final JobId id = deployment.getJobId();

    // The job lookup comes first: a missing job is reported before the host is even checked.
    final Job definition = getJob(zk, id);
    if (null == definition) {
      throw new JobNotDeployedException(host, id);
    }

    assertHostExists(zk, host);
    assertTaskExists(zk, host, id);

    final String taskPath = Paths.configHostJob(host, id);
    final Task replacement = new Task(definition, deployment.getGoal());
    try {
      // Serialization stays inside the try so its failures are wrapped like write failures.
      zk.setData(taskPath, replacement.toJsonBytes());
    } catch (Exception e) {
      final String message = "updating deployment " + deployment + " on host " + host + " failed";
      throw new HeliosRuntimeException(message, e);
    }
  }
Exemplo n.º 2
0
  /**
   * Undeploys the job specified by {@code jobId} on {@code host}.
   *
   * <p>A single transaction rewrites the host's task for the job with the {@code UNDEPLOY} goal,
   * deletes the job-to-host relation node and deletes the host port node of every static port of
   * the job.
   *
   * @param host the host to undeploy from
   * @param jobId the job to undeploy
   * @return the deployment that was found on the host before the undeploy was issued
   * @throws HostNotFoundException declared for the host check done by {@code assertHostExists}
   * @throws JobNotDeployedException if the host has no deployment of the job, or the job itself
   *     can no longer be looked up
   * @throws HeliosRuntimeException if the transaction fails for any reason other than the task
   *     node already being gone
   */
  @Override
  public Deployment undeployJob(final String host, final JobId jobId)
      throws HostNotFoundException, JobNotDeployedException {
    log.info("undeploying {}: {}", jobId, host);
    final ZooKeeperClient client = provider.get("undeployJob");

    assertHostExists(client, host);

    final Deployment deployment = getDeployment(host, jobId);
    if (deployment == null) {
      throw new JobNotDeployedException(host, jobId);
    }

    // TODO (dano): Is this safe? can the ports of an undeployed job collide with a new deployment?
    // TODO (drewc):  If it's still in UNDEPLOY, that means the agent hasn't gotten to it
    //    yet, which means it probably won't see the new job yet either.  However, it may spin up
    //    a new supervisor for the new job before the old one is done being torn down.  So it can
    //    race and lose.  With a little change to the Agent where we manage the supervisors, with
    //    some coordination, we could remove the race, such that this race goes away.  Specifically,
    //    we're creating new Supervisors before updating existing ones.  If we swap that, part of
    //    the problem goes away, but we'd need some coordination between the Supervisor and the
    //    agent such that the Agent could wait until the Supervisor had handled the Goal change.
    //    Additionally, since ZK guarantees we'll see the writes in the proper order, we wouldn't
    //    need to deal with seeing the new job before the UNDEPLOY.

    final Job job = getJob(client, jobId);
    if (job == null) {
      // getJob can return null (updateDeployment guards against the same case). Report it with
      // the declared exception instead of passing null into Task and staticPorts below.
      throw new JobNotDeployedException(host, jobId);
    }

    final String path = Paths.configHostJob(host, jobId);
    final Task task = new Task(job, UNDEPLOY);
    final List<ZooKeeperOperation> operations =
        Lists.newArrayList(set(path, task.toJsonBytes()), delete(Paths.configJobHost(jobId, host)));

    final List<Integer> staticPorts = staticPorts(job);
    for (int port : staticPorts) {
      operations.add(delete(Paths.configHostPort(host, port)));
    }

    try {
      client.transaction(operations);
    } catch (NoNodeException e) {
      // The exception's path may be null, so compare from the known non-null side.
      if (path.equals(e.getPath())) {
        // NoNodeException on updating the deployment node may happen due to retry failures.
        // If the deployment isn't there anymore, we're done.
        return deployment;
      } else {
        // The relation node deletes should not fail unless there is a programming error.
        throw new HeliosRuntimeException("Removing deployment failed", e);
      }
    } catch (KeeperException e) {
      throw new HeliosRuntimeException("Removing deployment failed", e);
    }
    return deployment;
  }