/**
 * Collects the deployments configured for {@code host}, keyed by job id.
 *
 * <p>A task node that vanishes between listing the host's jobs and reading it is logged at
 * debug level and left out of the result.
 *
 * @param client the ZooKeeper client used for all reads
 * @param host the host whose configured jobs are read
 * @return the deployments found, or {@code null} when the host's jobs node does not exist
 * @throws HeliosRuntimeException if ZooKeeper access or parsing fails for any other reason
 */
private Map<JobId, Deployment> getTasks(final ZooKeeperClient client, final String host) {
  try {
    final List<String> children;
    try {
      children = client.getChildren(Paths.configHostJobs(host));
    } catch (KeeperException.NoNodeException e) {
      // No jobs node for this host: signalled to the caller with null.
      return null;
    }

    final Map<JobId, Deployment> deployments = Maps.newHashMap();
    for (final String child : children) {
      final JobId jobId = JobId.fromString(child);
      final String taskPath = Paths.configHostJob(host, jobId);
      try {
        final Task task = parse(client.getData(taskPath), Task.class);
        deployments.put(jobId, Deployment.of(jobId, task.getGoal()));
      } catch (KeeperException.NoNodeException ignored) {
        log.debug("deployment config node disappeared: {}", child);
      }
    }
    return deployments;
  } catch (KeeperException | IOException e) {
    throw new HeliosRuntimeException("getting deployment config failed", e);
  }
}
/**
 * Returns a {@link Map} of {@link JobId} to {@link Job} objects for all of the jobs known.
 *
 * <p>A job whose node is deleted between listing the jobs folder and reading the job is skipped
 * (and logged at debug level) rather than failing the whole listing.
 *
 * @return the known jobs keyed by id; an empty map when the jobs folder does not exist
 * @throws HeliosRuntimeException if ZooKeeper access or parsing fails for any other reason
 */
@Override
public Map<JobId, Job> getJobs() {
  log.debug("getting jobs");
  final String folder = Paths.configJobs();
  final ZooKeeperClient client = provider.get("getJobs");
  try {
    final List<String> ids;
    try {
      ids = client.getChildren(folder);
    } catch (NoNodeException e) {
      return Maps.newHashMap();
    }
    final Map<JobId, Job> descriptors = Maps.newHashMap();
    for (final String id : ids) {
      final JobId jobId = JobId.fromString(id);
      final String path = Paths.configJob(jobId);
      try {
        final byte[] data = client.getData(path);
        final Job descriptor = parse(data, Job.class);
        descriptors.put(descriptor.getId(), descriptor);
      } catch (NoNodeException ignored) {
        // The job was removed after we listed the folder; treat it as not present.
        log.debug("job config node disappeared: {}", id);
      }
    }
    return descriptors;
  } catch (KeeperException | IOException e) {
    throw new HeliosRuntimeException("getting jobs failed", e);
  }
}
/**
 * Verifies that the task node for {@code jobId} on {@code host} can be read.
 *
 * @throws JobNotDeployedException if the task node does not exist
 * @throws HeliosRuntimeException on any other ZooKeeper failure
 */
private void assertTaskExists(final ZooKeeperClient client, final String host, final JobId jobId)
    throws JobNotDeployedException {
  final String taskPath = Paths.configHostJob(host, jobId);
  try {
    // Only the read's success matters; the data itself is discarded.
    client.getData(taskPath);
  } catch (NoNodeException missing) {
    throw new JobNotDeployedException(host, jobId);
  } catch (KeeperException failure) {
    throw new HeliosRuntimeException(failure);
  }
}
/**
 * Verifies that the configuration node for {@code host} can be read.
 *
 * @throws HostNotFoundException if the host node does not exist
 * @throws HeliosRuntimeException on any other ZooKeeper failure
 */
private void assertHostExists(final ZooKeeperClient client, final String host)
    throws HostNotFoundException {
  try {
    final String hostPath = Paths.configHost(host);
    // Only the read's success matters; the data itself is discarded.
    client.getData(hostPath);
  } catch (NoNodeException missing) {
    throw new HostNotFoundException(host, missing);
  } catch (KeeperException failure) {
    throw new HeliosRuntimeException(failure);
  }
}
/**
 * Reads the node at {@code path} and deserializes its JSON content as {@code type}.
 *
 * @param client the ZooKeeper client used for the read
 * @param path the node to read
 * @param type the target type for deserialization
 * @param name a label for the entity, used only in the failure message
 * @return the deserialized entity, or {@code null} if the node does not exist
 * @throws HeliosRuntimeException if the read or the deserialization fails
 */
private <T> T tryGetEntity(
    final ZooKeeperClient client, String path, TypeReference<T> type, String name) {
  try {
    return Json.read(client.getData(path), type);
  } catch (NoNodeException absent) {
    return null;
  } catch (KeeperException | IOException failure) {
    throw new HeliosRuntimeException("reading " + name + " info failed", failure);
  }
}
private Job getJob(final ZooKeeperClient client, final JobId id) { final String path = Paths.configJob(id); try { final byte[] data = client.getData(path); return Json.read(data, Job.class); } catch (NoNodeException e) { // Return null to indicate that the job does not exist return null; } catch (KeeperException | IOException e) { throw new HeliosRuntimeException("getting job " + id + " failed", e); } }
/**
 * Reads the status reported for {@code jobId} on {@code host}.
 *
 * @return the parsed task status, or {@code null} if no status node exists
 * @throws HeliosRuntimeException if the read or the parsing fails
 */
@Nullable
private TaskStatus getTaskStatus(
    final ZooKeeperClient client, final String host, final JobId jobId) {
  final String statusPath = Paths.statusHostJob(host, jobId);
  try {
    return parse(client.getData(statusPath), TaskStatus.class);
  } catch (NoNodeException ignored) {
    return null;
  } catch (KeeperException | IOException failure) {
    throw new HeliosRuntimeException(
        "Getting task " + jobId + " status " + "for host " + host + " failed", failure);
  }
}
/**
 * Returns the current deployment state of {@code jobId} on {@code host}.
 *
 * @return the deployment built from the task's goal, or {@code null} if no task node exists
 * @throws HeliosRuntimeException if the read or the parsing fails
 */
@Override
public Deployment getDeployment(final String host, final JobId jobId) {
  final String taskPath = Paths.configHostJob(host, jobId);
  final ZooKeeperClient zk = provider.get("getDeployment");
  try {
    final Task task = parse(zk.getData(taskPath), Task.class);
    return Deployment.of(jobId, task.getGoal());
  } catch (KeeperException.NoNodeException absent) {
    return null;
  } catch (KeeperException | IOException failure) {
    throw new HeliosRuntimeException("getting deployment failed", failure);
  }
}
/** Given a jobId, returns the N most recent events in it's history in the cluster. */ @Override public List<TaskStatusEvent> getJobHistory(final JobId jobId) throws JobDoesNotExistException { final Job descriptor = getJob(jobId); if (descriptor == null) { throw new JobDoesNotExistException(jobId); } final ZooKeeperClient client = provider.get("getJobHistory"); final List<String> hosts; try { hosts = client.getChildren(Paths.historyJobHosts(jobId)); } catch (NoNodeException e) { return emptyList(); } catch (KeeperException e) { throw Throwables.propagate(e); } final List<TaskStatusEvent> jsEvents = Lists.newArrayList(); for (String host : hosts) { final List<String> events; try { events = client.getChildren(Paths.historyJobHostEvents(jobId, host)); } catch (KeeperException e) { throw Throwables.propagate(e); } for (String event : events) { try { byte[] data = client.getData(Paths.historyJobHostEventsTimestamp(jobId, host, Long.valueOf(event))); final TaskStatus status = Json.read(data, TaskStatus.class); jsEvents.add(new TaskStatusEvent(status, Long.valueOf(event), host)); } catch (NoNodeException e) { // ignore, it went away before we read it } catch (KeeperException | IOException e) { throw Throwables.propagate(e); } } } return Ordering.from(EVENT_COMPARATOR).sortedCopy(jsEvents); }
// TODO(drewc): this kinda screams "long method" private void deployJobRetry( final ZooKeeperClient client, final String host, final Deployment deployment, int count) throws JobDoesNotExistException, JobAlreadyDeployedException, HostNotFoundException, JobPortAllocationConflictException { if (count == 3) { throw new HeliosRuntimeException( "3 failures (possibly concurrent modifications) while " + "deploying. Giving up."); } log.info("deploying {}: {} (retry={})", deployment, host, count); final JobId id = deployment.getJobId(); final Job job = getJob(id); if (job == null) { throw new JobDoesNotExistException(id); } final UUID operationId = UUID.randomUUID(); final String jobPath = Paths.configJob(id); final String taskPath = Paths.configHostJob(host, id); final String taskCreationPath = Paths.configHostJobCreation(host, id, operationId); final List<Integer> staticPorts = staticPorts(job); final Map<String, byte[]> portNodes = Maps.newHashMap(); final byte[] idJson = id.toJsonBytes(); for (final int port : staticPorts) { final String path = Paths.configHostPort(host, port); portNodes.put(path, idJson); } final Task task = new Task(job, deployment.getGoal()); final List<ZooKeeperOperation> operations = Lists.newArrayList( check(jobPath), create(portNodes), create(Paths.configJobHost(id, host))); // Attempt to read a task here. 
If it's goal is UNDEPLOY, it's as good as not existing try { final Node existing = client.getNode(taskPath); byte[] bytes = existing.getBytes(); Task readTask = Json.read(bytes, Task.class); if (readTask.getGoal() != Goal.UNDEPLOY) { throw new JobAlreadyDeployedException(host, id); } operations.add(check(taskPath, existing.getStat().getVersion())); operations.add(set(taskPath, task)); } catch (NoNodeException e) { operations.add(create(taskPath, task)); operations.add(create(taskCreationPath)); } catch (IOException | KeeperException e) { throw new HeliosRuntimeException("reading existing task description failed", e); } // TODO (dano): Failure handling is racy wrt agent and job modifications. try { client.transaction(operations); log.info("deployed {}: {} (retry={})", deployment, host, count); } catch (NoNodeException e) { // Either the job, the host or the task went away assertJobExists(client, id); assertHostExists(client, host); // If the job and host still exists, we likely tried to redeploy a job that had an UNDEPLOY // goal and lost the race with the agent removing the task before we could set it. Retry. 
deployJobRetry(client, host, deployment, count + 1); } catch (NodeExistsException e) { // Check for conflict due to transaction retry try { if (client.exists(taskCreationPath) != null) { // Our creation operation node existed, we're done here return; } } catch (KeeperException ex) { throw new HeliosRuntimeException("checking job deployment failed", ex); } try { // Check if the job was already deployed if (client.stat(taskPath) != null) { throw new JobAlreadyDeployedException(host, id); } } catch (KeeperException ex) { throw new HeliosRuntimeException("checking job deployment failed", e); } // Check for static port collisions for (final int port : staticPorts) { final String path = Paths.configHostPort(host, port); try { if (client.stat(path) == null) { continue; } final byte[] b = client.getData(path); final JobId existingJobId = parse(b, JobId.class); throw new JobPortAllocationConflictException(id, existingJobId, host, port); } catch (KeeperException | IOException ex) { throw new HeliosRuntimeException("checking port allocations failed", e); } } // Catch all for logic and ephemeral issues throw new HeliosRuntimeException("deploying job failed", e); } catch (KeeperException e) { throw new HeliosRuntimeException("deploying job failed", e); } }