/**
 * Updates the existing deployment of a job on a host by overwriting the task stored at the
 * host's job config path with one built from the stored job and the deployment's goal.
 *
 * @param host the host whose deployment is being updated
 * @param deployment supplies the job id to look up and the goal to store
 * @throws HostNotFoundException declared by this method; presumably raised by
 *     {@code assertHostExists} when the host is unknown (helper not visible here)
 * @throws JobNotDeployedException if no job is found for the deployment's job id; presumably
 *     also raised by {@code assertTaskExists} (helper not visible here)
 * @throws HeliosRuntimeException if serializing or writing the task fails for any reason
 */
@Override
public void updateDeployment(final String host, final Deployment deployment)
    throws HostNotFoundException, JobNotDeployedException {
  log.info("updating deployment {}: {}", deployment, host);

  final ZooKeeperClient zk = provider.get("updateDeployment");
  final JobId id = deployment.getJobId();

  // The job definition must exist before anything can be written for it.
  final Job storedJob = getJob(zk, id);
  if (storedJob == null) {
    throw new JobNotDeployedException(host, id);
  }

  // Both the host and the task on that host must already exist.
  assertHostExists(zk, host);
  assertTaskExists(zk, host, deployment.getJobId());

  final String taskPath = Paths.configHostJob(host, id);
  final Task updatedTask = new Task(storedJob, deployment.getGoal());

  try {
    // Serialization stays inside the try so its failures are wrapped like write failures.
    final byte[] taskBytes = updatedTask.toJsonBytes();
    zk.setData(taskPath, taskBytes);
  } catch (Exception e) {
    final String message =
        "updating deployment " + deployment + " on host " + host + " failed";
    throw new HeliosRuntimeException(message, e);
  }
}
/** Undeploys the job specified by {@code jobId} on {@code host}. */ @Override public Deployment undeployJob(final String host, final JobId jobId) throws HostNotFoundException, JobNotDeployedException { log.info("undeploying {}: {}", jobId, host); final ZooKeeperClient client = provider.get("undeployJob"); assertHostExists(client, host); final Deployment deployment = getDeployment(host, jobId); if (deployment == null) { throw new JobNotDeployedException(host, jobId); } // TODO (dano): Is this safe? can the ports of an undeployed job collide with a new deployment? // TODO (drewc): If it's still in UNDEPLOY, that means the agent hasn't gotten to it // yet, which means it probably won't see the new job yet either. However, it may spin up // a new supervisor for the new job before the old one is done being torn down. So it can // race and lose. With a little change to the Agent where we manage the supervisors, with // some coordination, we could remove the race, such that this race goes away. Specifically, // we're creating new Supervisors before updating existing ones. If we swap that, part of // the problem goes away, but we'd need some coordination between the Supervisor and the // agent such that the Agent could wait until the Supervisor had handled the Goal change. // Additionally, since ZK guarantees we'll see the writes in the proper order, we wouldn't // need to deal with seeing the new job before the UNDEPLOY. 
final Job job = getJob(client, jobId); final String path = Paths.configHostJob(host, jobId); final Task task = new Task(job, UNDEPLOY); final List<ZooKeeperOperation> operations = Lists.newArrayList(set(path, task.toJsonBytes()), delete(Paths.configJobHost(jobId, host))); final List<Integer> staticPorts = staticPorts(job); for (int port : staticPorts) { operations.add(delete(Paths.configHostPort(host, port))); } try { client.transaction(operations); } catch (NoNodeException e) { if (e.getPath().equals(path)) { // NoNodeException on updating the deployment node may happen due to retry failures. // If the deployment isn't there anymore, we're done. return deployment; } else { // The relation node deletes should not fail unless there is a programming error. throw new HeliosRuntimeException("Removing deployment failed", e); } } catch (KeeperException e) { throw new HeliosRuntimeException("Removing deployment failed", e); } return deployment; }