/**
 * Returns the current status of the host named by {@code host}.
 *
 * <p>The lookup is gated on the node at {@code Paths.configHostId(host)}: when that node is
 * absent no further reads are made and {@code null} is returned.
 *
 * @param host name of the host to inspect
 * @return the assembled {@link HostStatus}, or {@code null} if the host id node does not exist
 * @throws HeliosRuntimeException if checking for the host id node fails with a
 *     {@link KeeperException}
 */
@Override
public HostStatus getHostStatus(final String host) {
  final ZooKeeperClient client = provider.get("getHostStatus");

  final Stat stat;
  try {
    stat = client.exists(Paths.configHostId(host));
  } catch (KeeperException e) {
    throw new HeliosRuntimeException("Failed to check host status", e);
  }
  if (stat == null) {
    return null;
  }

  final boolean up = checkHostUp(client, host);
  final HostInfo hostInfo = getHostInfo(client, host);
  final AgentInfo agentInfo = getAgentInfo(client, host);
  final Map<JobId, Deployment> tasks = getTasks(client, host);
  final Map<JobId, TaskStatus> statuses = getTaskStatuses(client, host);
  final Map<String, String> environment = getEnvironment(client, host);

  return HostStatus.newBuilder()
      // getTasks returns null when the host's jobs node is missing; substitute an empty map so
      // the status never carries a null job map (same treatment as statuses below).
      .setJobs(fromNullable(tasks).or(Collections.<JobId, Deployment>emptyMap()))
      .setStatuses(fromNullable(statuses).or(EMPTY_STATUSES))
      .setHostInfo(hostInfo)
      .setAgentInfo(agentInfo)
      .setStatus(up ? UP : DOWN)
      .setEnvironment(environment)
      .build();
}
/**
 * Overwrites the stored task of a job that is already deployed on {@code host}.
 *
 * @param host host whose deployment is being updated
 * @param deployment supplies the job id and the goal written into the new task
 * @throws HostNotFoundException propagated from the host existence check
 * @throws JobNotDeployedException if the job cannot be read, or no task node for it exists on
 *     the host
 * @throws HeliosRuntimeException if serializing or writing the task fails
 */
@Override
public void updateDeployment(final String host, final Deployment deployment)
    throws HostNotFoundException, JobNotDeployedException {
  log.info("updating deployment {}: {}", deployment, host);

  final ZooKeeperClient zk = provider.get("updateDeployment");
  final JobId jobId = deployment.getJobId();

  final Job job = getJob(zk, jobId);
  if (job == null) {
    throw new JobNotDeployedException(host, jobId);
  }

  // Both preconditions are verified before the task node is touched.
  assertHostExists(zk, host);
  assertTaskExists(zk, host, deployment.getJobId());

  final String taskPath = Paths.configHostJob(host, jobId);
  final Task updatedTask = new Task(job, deployment.getGoal());
  try {
    final byte[] payload = updatedTask.toJsonBytes();
    zk.setData(taskPath, payload);
  } catch (Exception e) {
    throw new HeliosRuntimeException(
        "updating deployment " + deployment + " on host " + host + " failed", e);
  }
}
/** Deletes a job from ZooKeeper. Ensures that job is not currently running anywhere. */ @Override public Job removeJob(final JobId id) throws JobDoesNotExistException, JobStillDeployedException { log.info("removing job: id={}", id); final ZooKeeperClient client = provider.get("removeJob"); final Job job = getJob(client, id); if (job == null) { throw new JobDoesNotExistException(id); } // TODO (dano): handle retry failures try { final ImmutableList.Builder<ZooKeeperOperation> operations = ImmutableList.builder(); final UUID jobCreationOperationId = getJobCreation(client, id); if (jobCreationOperationId != null) { operations.add(delete(Paths.configJobCreation(id, jobCreationOperationId))); } operations.add( delete(Paths.configJobHosts(id)), delete(Paths.configJobRefShort(id)), delete(Paths.configJob(id))); client.transaction(operations.build()); } catch (final NoNodeException e) { throw new JobDoesNotExistException(id); } catch (final NotEmptyException e) { throw new JobStillDeployedException(id, listJobHosts(client, id)); } catch (final KeeperException e) { throw new HeliosRuntimeException("removing job " + id + " failed", e); } return job; }
/**
 * Returns a {@link Map} of {@link JobId} to {@link Job} objects for all of the jobs known.
 *
 * <p>A job whose node disappears between listing the jobs folder and reading the job is
 * skipped rather than failing the whole call.
 *
 * @return a map keyed by each parsed job's own id; empty if the jobs folder does not exist
 * @throws HeliosRuntimeException if listing, reading or parsing fails for any other reason
 */
@Override
public Map<JobId, Job> getJobs() {
  log.debug("getting jobs");
  final String folder = Paths.configJobs();
  final ZooKeeperClient client = provider.get("getJobs");
  try {
    final List<String> ids;
    try {
      ids = client.getChildren(folder);
    } catch (NoNodeException e) {
      // No jobs folder at all: there are simply no jobs.
      return Maps.newHashMap();
    }

    final Map<JobId, Job> descriptors = Maps.newHashMap();
    for (final String id : ids) {
      final JobId jobId = JobId.fromString(id);
      final String path = Paths.configJob(jobId);
      try {
        final byte[] data = client.getData(path);
        final Job descriptor = parse(data, Job.class);
        descriptors.put(descriptor.getId(), descriptor);
      } catch (NoNodeException ignored) {
        // The job was removed after it was listed; leave it out of the result.
        log.debug("job config node disappeared: {}", id);
      }
    }
    return descriptors;
  } catch (KeeperException | IOException e) {
    throw new HeliosRuntimeException("getting jobs failed", e);
  }
}
/** Adds a job into the configuration. */ @Override public void addJob(final Job job) throws JobExistsException { log.info("adding job: {}", job); final JobId id = job.getId(); final UUID operationId = UUID.randomUUID(); final String creationPath = Paths.configJobCreation(id, operationId); final ZooKeeperClient client = provider.get("addJob"); try { try { client.ensurePath(Paths.historyJob(id)); client.transaction( create(Paths.configJob(id), job), create(Paths.configJobRefShort(id), id), create(Paths.configJobHosts(id)), create(creationPath)); } catch (final NodeExistsException e) { if (client.exists(creationPath) != null) { // The job was created, we're done here return; } throw new JobExistsException(id.toString()); } } catch (final KeeperException e) { throw new HeliosRuntimeException("adding job " + job + " failed", e); } }
/**
 * Reads the deployments configured for {@code host}.
 *
 * @param client ZooKeeper client to read with
 * @param host host whose configured jobs are read
 * @return deployments keyed by job id, or {@code null} if the host's jobs folder does not exist;
 *     jobs whose node disappears while reading are omitted
 * @throws HeliosRuntimeException if reading or parsing fails for any other reason
 */
private Map<JobId, Deployment> getTasks(final ZooKeeperClient client, final String host) {
  final Map<JobId, Deployment> deployments = Maps.newHashMap();
  try {
    final List<String> children;
    try {
      children = client.getChildren(Paths.configHostJobs(host));
    } catch (KeeperException.NoNodeException e) {
      return null;
    }

    for (final String child : children) {
      final JobId jobId = JobId.fromString(child);
      final String taskPath = Paths.configHostJob(host, jobId);
      try {
        final Task task = parse(client.getData(taskPath), Task.class);
        deployments.put(jobId, Deployment.of(jobId, task.getGoal()));
      } catch (KeeperException.NoNodeException ignored) {
        log.debug("deployment config node disappeared: {}", child);
      }
    }
  } catch (KeeperException | IOException e) {
    throw new HeliosRuntimeException("getting deployment config failed", e);
  }
  return deployments;
}
/** Undeploys the job specified by {@code jobId} on {@code host}. */ @Override public Deployment undeployJob(final String host, final JobId jobId) throws HostNotFoundException, JobNotDeployedException { log.info("undeploying {}: {}", jobId, host); final ZooKeeperClient client = provider.get("undeployJob"); assertHostExists(client, host); final Deployment deployment = getDeployment(host, jobId); if (deployment == null) { throw new JobNotDeployedException(host, jobId); } // TODO (dano): Is this safe? can the ports of an undeployed job collide with a new deployment? // TODO (drewc): If it's still in UNDEPLOY, that means the agent hasn't gotten to it // yet, which means it probably won't see the new job yet either. However, it may spin up // a new supervisor for the new job before the old one is done being torn down. So it can // race and lose. With a little change to the Agent where we manage the supervisors, with // some coordination, we could remove the race, such that this race goes away. Specifically, // we're creating new Supervisors before updating existing ones. If we swap that, part of // the problem goes away, but we'd need some coordination between the Supervisor and the // agent such that the Agent could wait until the Supervisor had handled the Goal change. // Additionally, since ZK guarantees we'll see the writes in the proper order, we wouldn't // need to deal with seeing the new job before the UNDEPLOY. 
final Job job = getJob(client, jobId); final String path = Paths.configHostJob(host, jobId); final Task task = new Task(job, UNDEPLOY); final List<ZooKeeperOperation> operations = Lists.newArrayList(set(path, task.toJsonBytes()), delete(Paths.configJobHost(jobId, host))); final List<Integer> staticPorts = staticPorts(job); for (int port : staticPorts) { operations.add(delete(Paths.configHostPort(host, port))); } try { client.transaction(operations); } catch (NoNodeException e) { if (e.getPath().equals(path)) { // NoNodeException on updating the deployment node may happen due to retry failures. // If the deployment isn't there anymore, we're done. return deployment; } else { // The relation node deletes should not fail unless there is a programming error. throw new HeliosRuntimeException("Removing deployment failed", e); } } catch (KeeperException e) { throw new HeliosRuntimeException("Removing deployment failed", e); } return deployment; }
/**
 * Returns the current deployment state of {@code jobId} on {@code host}.
 *
 * @param host host to look on
 * @param jobId job to look up
 * @return a deployment built from the stored task's goal, or {@code null} if the task node does
 *     not exist
 * @throws HeliosRuntimeException if reading or parsing the task fails for any other reason
 */
@Override
public Deployment getDeployment(final String host, final JobId jobId) {
  final String taskPath = Paths.configHostJob(host, jobId);
  final ZooKeeperClient zk = provider.get("getDeployment");
  try {
    final Task stored = parse(zk.getData(taskPath), Task.class);
    return Deployment.of(jobId, stored.getGoal());
  } catch (KeeperException.NoNodeException e) {
    return null;
  } catch (KeeperException | IOException e) {
    throw new HeliosRuntimeException("getting deployment failed", e);
  }
}
/**
 * Returns a list of the host names of the currently running masters.
 *
 * <p>A master is included when its "up" status node exists.
 *
 * @return immutable list of master names, in the order ZooKeeper listed them
 * @throws HeliosRuntimeException if listing or checking a master fails
 */
@Override
public List<String> getRunningMasters() {
  final ZooKeeperClient zk = provider.get("getRunningMasters");
  final ImmutableList.Builder<String> running = ImmutableList.builder();
  try {
    for (final String candidate : zk.getChildren(Paths.statusMaster())) {
      if (zk.exists(Paths.statusMasterUp(candidate)) == null) {
        continue;
      }
      running.add(candidate);
    }
  } catch (KeeperException e) {
    throw new HeliosRuntimeException("listing masters failed", e);
  }
  return running.build();
}
/**
 * Tells whether the "up" status node of {@code host} exists.
 *
 * @param client ZooKeeper client to query
 * @param host host to check
 * @return {@code true} if the node exists
 * @throws HeliosRuntimeException if the existence check fails
 */
private boolean checkHostUp(final ZooKeeperClient client, final String host) {
  try {
    return client.exists(Paths.statusHostUp(host)) != null;
  } catch (KeeperException e) {
    throw new HeliosRuntimeException("getting host " + host + " up status failed", e);
  }
}
/**
 * Verifies that a task node for {@code jobId} exists on {@code host} by reading it.
 *
 * @param client ZooKeeper client to read with
 * @param host host expected to hold the task
 * @param jobId job whose task is expected
 * @throws JobNotDeployedException if the task node does not exist
 * @throws HeliosRuntimeException on any other {@link KeeperException}
 */
private void assertTaskExists(
    final ZooKeeperClient client, final String host, final JobId jobId)
    throws JobNotDeployedException {
  final String taskPath = Paths.configHostJob(host, jobId);
  try {
    // The data itself is irrelevant; only a successful read matters.
    client.getData(taskPath);
  } catch (NoNodeException e) {
    throw new JobNotDeployedException(host, jobId);
  } catch (KeeperException e) {
    throw new HeliosRuntimeException(e);
  }
}
/**
 * Looks for a creation marker belonging to job {@code id}.
 *
 * @param client ZooKeeper client to list with
 * @param id job whose creation marker is wanted
 * @return the operation id of the first child recognized as a creation marker, or {@code null}
 *     if none matches
 * @throws KeeperException if listing the parent node fails
 */
private UUID getJobCreation(final ZooKeeperClient client, final JobId id)
    throws KeeperException {
  final String parent = Paths.configHostJobCreationParent(id);
  for (final String candidate : client.getChildren(parent)) {
    if (!Paths.isConfigJobCreation(id, parent, candidate)) {
      continue;
    }
    return Paths.configJobCreationId(id, parent, candidate);
  }
  return null;
}
/**
 * Verifies that the config node of {@code host} exists by reading it.
 *
 * @param client ZooKeeper client to read with
 * @param host host expected to exist
 * @throws HostNotFoundException if the host config node does not exist
 * @throws HeliosRuntimeException on any other {@link KeeperException}
 */
private void assertHostExists(final ZooKeeperClient client, final String host)
    throws HostNotFoundException {
  final String hostPath = Paths.configHost(host);
  try {
    // The data itself is irrelevant; only a successful read matters.
    client.getData(hostPath);
  } catch (NoNodeException e) {
    throw new HostNotFoundException(host, e);
  } catch (KeeperException e) {
    throw new HeliosRuntimeException(e);
  }
}
/**
 * Verifies that the config node of job {@code id} exists.
 *
 * @param client ZooKeeper client to query
 * @param id job expected to exist
 * @throws JobDoesNotExistException if no stat is returned for the job node
 * @throws HeliosRuntimeException if the check itself fails
 */
private void assertJobExists(final ZooKeeperClient client, final JobId id)
    throws JobDoesNotExistException {
  final boolean present;
  try {
    present = client.stat(Paths.configJob(id)) != null;
  } catch (KeeperException e) {
    throw new HeliosRuntimeException("checking job existence failed", e);
  }
  if (!present) {
    throw new JobDoesNotExistException(id);
  }
}
/**
 * Reads the node at {@code path} and decodes it as JSON of the given type.
 *
 * @param client ZooKeeper client to read with
 * @param path node to read
 * @param type target type for decoding
 * @param name label used in the failure message
 * @param <T> decoded type
 * @return the decoded value, or {@code null} if the node does not exist
 * @throws HeliosRuntimeException if reading or decoding fails for any other reason
 */
private <T> T tryGetEntity(
    final ZooKeeperClient client, String path, TypeReference<T> type, String name) {
  try {
    return Json.read(client.getData(path), type);
  } catch (NoNodeException e) {
    return null;
  } catch (KeeperException | IOException e) {
    throw new HeliosRuntimeException("reading " + name + " info failed", e);
  }
}
/** Given a jobId, returns the N most recent events in it's history in the cluster. */ @Override public List<TaskStatusEvent> getJobHistory(final JobId jobId) throws JobDoesNotExistException { final Job descriptor = getJob(jobId); if (descriptor == null) { throw new JobDoesNotExistException(jobId); } final ZooKeeperClient client = provider.get("getJobHistory"); final List<String> hosts; try { hosts = client.getChildren(Paths.historyJobHosts(jobId)); } catch (NoNodeException e) { return emptyList(); } catch (KeeperException e) { throw Throwables.propagate(e); } final List<TaskStatusEvent> jsEvents = Lists.newArrayList(); for (String host : hosts) { final List<String> events; try { events = client.getChildren(Paths.historyJobHostEvents(jobId, host)); } catch (KeeperException e) { throw Throwables.propagate(e); } for (String event : events) { try { byte[] data = client.getData(Paths.historyJobHostEventsTimestamp(jobId, host, Long.valueOf(event))); final TaskStatus status = Json.read(data, TaskStatus.class); jsEvents.add(new TaskStatusEvent(status, Long.valueOf(event), host)); } catch (NoNodeException e) { // ignore, it went away before we read it } catch (KeeperException | IOException e) { throw Throwables.propagate(e); } } } return Ordering.from(EVENT_COMPARATOR).sortedCopy(jsEvents); }
private Job getJob(final ZooKeeperClient client, final JobId id) { final String path = Paths.configJob(id); try { final byte[] data = client.getData(path); return Json.read(data, Job.class); } catch (NoNodeException e) { // Return null to indicate that the job does not exist return null; } catch (KeeperException | IOException e) { throw new HeliosRuntimeException("getting job " + id + " failed", e); } }
/**
 * Lists the children of the hosts node of job {@code jobId}.
 *
 * @param client ZooKeeper client to list with
 * @param jobId job whose hosts are wanted
 * @return the child names as returned by ZooKeeper
 * @throws JobDoesNotExistException if the job's hosts node does not exist
 * @throws HeliosRuntimeException on any other {@link KeeperException}
 */
private List<String> listJobHosts(final ZooKeeperClient client, final JobId jobId)
    throws JobDoesNotExistException {
  try {
    return client.getChildren(Paths.configJobHosts(jobId));
  } catch (NoNodeException e) {
    throw new JobDoesNotExistException(jobId);
  } catch (KeeperException e) {
    throw new HeliosRuntimeException("failed to list hosts for job: " + jobId, e);
  }
}
/**
 * Reads the reported status of job {@code jobId} on {@code host}.
 *
 * @param client ZooKeeper client to read with
 * @param host host the status belongs to
 * @param jobId job the status belongs to
 * @return the parsed status, or {@code null} if the status node does not exist
 * @throws HeliosRuntimeException if reading or parsing fails for any other reason
 */
@Nullable
private TaskStatus getTaskStatus(
    final ZooKeeperClient client, final String host, final JobId jobId) {
  final String statusPath = Paths.statusHostJob(host, jobId);
  try {
    return parse(client.getData(statusPath), TaskStatus.class);
  } catch (NoNodeException ignored) {
    return null;
  } catch (KeeperException | IOException e) {
    throw new HeliosRuntimeException(
        "Getting task " + jobId + " status for host " + host + " failed", e);
  }
}
/**
 * Lists the job ids found under the status jobs folder of {@code host}.
 *
 * @param client ZooKeeper client to list with
 * @param host host whose jobs are listed
 * @return immutable list of parsed job ids, or {@code null} if the folder does not exist
 * @throws HeliosRuntimeException on any other {@link KeeperException}
 */
private List<JobId> listHostJobs(final ZooKeeperClient client, final String host) {
  final String folder = Paths.statusHostJobs(host);

  final List<String> names;
  try {
    names = client.getChildren(folder);
  } catch (KeeperException.NoNodeException e) {
    return null;
  } catch (KeeperException e) {
    throw new HeliosRuntimeException("List tasks for host failed: " + host, e);
  }

  final ImmutableList.Builder<JobId> parsed = ImmutableList.builder();
  for (final String name : names) {
    parsed.add(JobId.fromString(name));
  }
  return parsed.build();
}
/** * Registers a host into ZooKeeper. The {@code id} is initially generated randomly by the Agent * and persisted on disk. This way, in the event that you have two agents attempting to register * with the same value of @{code host}, the first one will win. */ @Override public void registerHost(final String host, final String id) { log.info("registering host: {}", host); final ZooKeeperClient client = provider.get("registerHost"); try { // TODO (dano): this code is replicated in AgentZooKeeperRegistrar // This would've been nice to do in a transaction but PathChildrenCache ensures paths // so we can't know what paths already exist so assembling a suitable transaction is too // painful. client.ensurePath(Paths.configHost(host)); client.ensurePath(Paths.configHostJobs(host)); client.ensurePath(Paths.configHostPorts(host)); client.ensurePath(Paths.statusHost(host)); client.ensurePath(Paths.statusHostJobs(host)); // Finish registration by creating the id node last client.createAndSetData(Paths.configHostId(host), id.getBytes(UTF_8)); } catch (Exception e) { throw new HeliosRuntimeException("registering host " + host + " failed", e); } }
/** * Undoes the effect of {@link ZooKeeperMasterModel#registerHost(String, String)}. Cleans up any * leftover host-related things. */ @Override public void deregisterHost(final String host) throws HostNotFoundException, HostStillInUseException { log.info("deregistering host: {}", host); final ZooKeeperClient client = provider.get("deregisterHost"); // TODO (dano): handle retry failures try { final List<ZooKeeperOperation> operations = Lists.newArrayList(); // Remove all jobs deployed to this host final List<JobId> jobs = listHostJobs(client, host); if (jobs == null) { if (client.exists(Paths.configHost(host)) == null) { throw new HostNotFoundException("host [" + host + "] does not exist"); } } for (JobId job : jobs) { final String hostJobPath = Paths.configHostJob(host, job); final List<String> nodes = client.listRecursive(hostJobPath); for (final String node : reverse(nodes)) { operations.add(delete(node)); } if (client.exists(Paths.configJobHost(job, host)) != null) { operations.add(delete(Paths.configJobHost(job, host))); } // Clean out the history for each job try { final List<String> history = client.listRecursive(Paths.historyJobHost(job, host)); for (String s : reverse(history)) { operations.add(delete(s)); } } catch (NoNodeException ignore) { } } operations.add(delete(Paths.configHostJobs(host))); // Remove the host status try { final List<String> nodes = client.listRecursive(Paths.statusHost(host)); for (final String node : reverse(nodes)) { operations.add(delete(node)); } } catch (NoNodeException ignore) { } // Remove port allocations try { final List<String> ports = client.getChildren(Paths.configHostPorts(host)); for (final String port : ports) { operations.add(delete(Paths.configHostPort(host, Integer.valueOf(port)))); } operations.add(delete(Paths.configHostPorts(host))); } catch (NoNodeException ignore) { } // Remove host id String idPath = Paths.configHostId(host); if (client.exists(idPath) != null) { operations.add(delete(idPath)); } // Remove 
host config root operations.add(delete(Paths.configHost(host))); client.transaction(operations); } catch (NotEmptyException e) { final HostStatus hostStatus = getHostStatus(host); final List<JobId> jobs = hostStatus != null ? ImmutableList.copyOf(hostStatus.getJobs().keySet()) : Collections.<JobId>emptyList(); throw new HostStillInUseException(host, jobs); } catch (NoNodeException e) { throw new HostNotFoundException(host); } catch (KeeperException e) { throw new HeliosRuntimeException(e); } }
// TODO(drewc): this kinda screams "long method" private void deployJobRetry( final ZooKeeperClient client, final String host, final Deployment deployment, int count) throws JobDoesNotExistException, JobAlreadyDeployedException, HostNotFoundException, JobPortAllocationConflictException { if (count == 3) { throw new HeliosRuntimeException( "3 failures (possibly concurrent modifications) while " + "deploying. Giving up."); } log.info("deploying {}: {} (retry={})", deployment, host, count); final JobId id = deployment.getJobId(); final Job job = getJob(id); if (job == null) { throw new JobDoesNotExistException(id); } final UUID operationId = UUID.randomUUID(); final String jobPath = Paths.configJob(id); final String taskPath = Paths.configHostJob(host, id); final String taskCreationPath = Paths.configHostJobCreation(host, id, operationId); final List<Integer> staticPorts = staticPorts(job); final Map<String, byte[]> portNodes = Maps.newHashMap(); final byte[] idJson = id.toJsonBytes(); for (final int port : staticPorts) { final String path = Paths.configHostPort(host, port); portNodes.put(path, idJson); } final Task task = new Task(job, deployment.getGoal()); final List<ZooKeeperOperation> operations = Lists.newArrayList( check(jobPath), create(portNodes), create(Paths.configJobHost(id, host))); // Attempt to read a task here. 
If it's goal is UNDEPLOY, it's as good as not existing try { final Node existing = client.getNode(taskPath); byte[] bytes = existing.getBytes(); Task readTask = Json.read(bytes, Task.class); if (readTask.getGoal() != Goal.UNDEPLOY) { throw new JobAlreadyDeployedException(host, id); } operations.add(check(taskPath, existing.getStat().getVersion())); operations.add(set(taskPath, task)); } catch (NoNodeException e) { operations.add(create(taskPath, task)); operations.add(create(taskCreationPath)); } catch (IOException | KeeperException e) { throw new HeliosRuntimeException("reading existing task description failed", e); } // TODO (dano): Failure handling is racy wrt agent and job modifications. try { client.transaction(operations); log.info("deployed {}: {} (retry={})", deployment, host, count); } catch (NoNodeException e) { // Either the job, the host or the task went away assertJobExists(client, id); assertHostExists(client, host); // If the job and host still exists, we likely tried to redeploy a job that had an UNDEPLOY // goal and lost the race with the agent removing the task before we could set it. Retry. 
deployJobRetry(client, host, deployment, count + 1); } catch (NodeExistsException e) { // Check for conflict due to transaction retry try { if (client.exists(taskCreationPath) != null) { // Our creation operation node existed, we're done here return; } } catch (KeeperException ex) { throw new HeliosRuntimeException("checking job deployment failed", ex); } try { // Check if the job was already deployed if (client.stat(taskPath) != null) { throw new JobAlreadyDeployedException(host, id); } } catch (KeeperException ex) { throw new HeliosRuntimeException("checking job deployment failed", e); } // Check for static port collisions for (final int port : staticPorts) { final String path = Paths.configHostPort(host, port); try { if (client.stat(path) == null) { continue; } final byte[] b = client.getData(path); final JobId existingJobId = parse(b, JobId.class); throw new JobPortAllocationConflictException(id, existingJobId, host, port); } catch (KeeperException | IOException ex) { throw new HeliosRuntimeException("checking port allocations failed", e); } } // Catch all for logic and ephemeral issues throw new HeliosRuntimeException("deploying job failed", e); } catch (KeeperException e) { throw new HeliosRuntimeException("deploying job failed", e); } }