/**
 * Rewrites the stored task of a job on a host so that it carries the goal from
 * {@code deployment}.
 *
 * <p>The job is looked up first; the host and the task are then asserted to exist before the
 * task node's data is overwritten.
 *
 * @param host name of the host whose task entry is rewritten
 * @param deployment supplies the job id and the goal to store
 * @throws HostNotFoundException declared by this method; presumably raised by
 *     {@code assertHostExists} when the host is unknown
 * @throws JobNotDeployedException if the job lookup returns {@code null}; presumably also raised
 *     by {@code assertTaskExists}
 */
@Override
public void updateDeployment(final String host, final Deployment deployment)
    throws HostNotFoundException, JobNotDeployedException {
  log.info("updating deployment {}: {}", deployment, host);

  final ZooKeeperClient zk = provider.get("updateDeployment");
  final JobId id = deployment.getJobId();

  final Job deployedJob = getJob(zk, id);
  if (deployedJob == null) {
    throw new JobNotDeployedException(host, id);
  }

  assertHostExists(zk, host);
  assertTaskExists(zk, host, deployment.getJobId());

  final String taskPath = Paths.configHostJob(host, id);
  final Task updatedTask = new Task(deployedJob, deployment.getGoal());
  try {
    // Serialization stays inside the try so that its failures are wrapped as well.
    zk.setData(taskPath, updatedTask.toJsonBytes());
  } catch (Exception e) {
    final String message = "updating deployment " + deployment + " on host " + host + " failed";
    throw new HeliosRuntimeException(message, e);
  }
}
public void assertVolumes(final JobId jobId) throws Exception { // Wait for agent to come up awaitHostRegistered(client, testHost(), LONG_WAIT_SECONDS, SECONDS); awaitHostStatus(client, testHost(), UP, LONG_WAIT_SECONDS, SECONDS); // Deploy the job on the agent final Deployment deployment = Deployment.of(jobId, START); final JobDeployResponse deployed = client.deploy(deployment, testHost()).get(); assertEquals(JobDeployResponse.Status.OK, deployed.getStatus()); // Wait for the job to run final TaskStatus taskStatus = awaitJobState(client, testHost(), jobId, RUNNING, LONG_WAIT_SECONDS, SECONDS); assertJobEquals(job, taskStatus.getJob()); final Integer barPort = taskStatus.getPorts().get("bar").getExternalPort(); final Integer hostnamePort = taskStatus.getPorts().get("hostname").getExternalPort(); assert barPort != null; assert hostnamePort != null; // Read "foo" from /volume/bar final String foo = recvUtf8(barPort, 3); assertEquals("foo", foo); // Read hostname from /hostname final String hostname = getNewDockerClient().info().name(); final String mountedHostname = recvUtf8(hostnamePort, hostname.length()); assertEquals(hostname, mountedHostname); }
/**
 * Reads the deployments configured for {@code host}.
 *
 * @param client ZooKeeper client used for every read
 * @param host host whose configured jobs are listed
 * @return deployments keyed by job id, or {@code null} when the host's jobs node does not exist
 * @throws HeliosRuntimeException if a ZooKeeper or I/O failure other than a missing node occurs
 */
private Map<JobId, Deployment> getTasks(final ZooKeeperClient client, final String host) {
  final Map<JobId, Deployment> deployments = Maps.newHashMap();

  try {
    final String jobsPath = Paths.configHostJobs(host);

    final List<String> children;
    try {
      children = client.getChildren(jobsPath);
    } catch (KeeperException.NoNodeException e) {
      // No jobs node for this host: callers get null, not an empty map.
      return null;
    }

    for (final String child : children) {
      final JobId jobId = JobId.fromString(child);
      final String taskPath = Paths.configHostJob(host, jobId);
      try {
        final Task stored = parse(client.getData(taskPath), Task.class);
        deployments.put(jobId, Deployment.of(jobId, stored.getGoal()));
      } catch (KeeperException.NoNodeException ignored) {
        // The task node vanished between listing and reading; skip it.
        log.debug("deployment config node disappeared: {}", child);
      }
    }
  } catch (KeeperException | IOException e) {
    throw new HeliosRuntimeException("getting deployment config failed", e);
  }

  return deployments;
}
/**
 * Sends a {@code PATCH} for {@code /hosts/<host>/jobs/<jobId>} with {@code job} as the payload
 * and the token as a query parameter.
 *
 * @param job deployment carrying the job id and the goal to set
 * @param host host the job is deployed on
 * @param token value passed as the {@code token} query parameter
 * @return future holding the response converted to a {@link SetGoalResponse}
 */
public ListenableFuture<SetGoalResponse> setGoal(
    final Deployment job, final String host, final String token) {
  // Status codes handed to the response converter alongside the target class.
  final ImmutableSet<Integer> deserializeReturnCodes =
      ImmutableSet.of(HTTP_OK, HTTP_NOT_FOUND, HTTP_FORBIDDEN);

  return transform(
      request(
          uri(path("/hosts/%s/jobs/%s", host, job.getJobId()), ImmutableMap.of("token", token)),
          "PATCH",
          job),
      ConvertResponseToPojo.create(SetGoalResponse.class, deserializeReturnCodes));
}
/**
 * Sends a {@code PUT} for {@code /hosts/<host>/jobs/<jobId>} with {@code job} as the payload
 * and the token as a query parameter.
 *
 * @param job deployment to create on the host
 * @param host host to deploy to
 * @param token value passed as the {@code token} query parameter
 * @return future holding the response converted to a {@link JobDeployResponse}
 */
public ListenableFuture<JobDeployResponse> deploy(
    final Deployment job, final String host, final String token) {
  // Status codes handed to the response converter alongside the target class.
  final Set<Integer> convertibleCodes =
      ImmutableSet.of(
          HTTP_OK, HTTP_NOT_FOUND, HTTP_BAD_METHOD, HTTP_BAD_REQUEST, HTTP_FORBIDDEN);

  return transform(
      request(
          uri(path("/hosts/%s/jobs/%s", host, job.getJobId()), ImmutableMap.of("token", token)),
          "PUT",
          job),
      ConvertResponseToPojo.create(JobDeployResponse.class, convertibleCodes));
}
@Test public void test() throws Exception { startDefaultMaster(); startDefaultAgent(testHost()); final HeliosClient client = defaultClient(); // Create a job using an image exposing port 11211 but without mapping it final Job job1 = Job.newBuilder() .setName(testTag + "memcached") .setVersion("v1") .setImage("rohan/memcached-mini") .build(); final JobId jobId1 = job1.getId(); client.createJob(job1).get(); // Create a job using an image exposing port 11211 and map it to a specific external port final Job job2 = Job.newBuilder() .setName(testTag + "memcached") .setVersion("v2") .setImage("rohan/memcached-mini") .setPorts(ImmutableMap.of("tcp", PortMapping.of(11211, externalPort))) .build(); final JobId jobId2 = job2.getId(); client.createJob(job2).get(); // Wait for agent to come up awaitHostRegistered(client, testHost(), LONG_WAIT_MINUTES, MINUTES); awaitHostStatus(client, testHost(), UP, LONG_WAIT_MINUTES, MINUTES); // Deploy the jobs on the agent client.deploy(Deployment.of(jobId1, START), testHost()).get(); client.deploy(Deployment.of(jobId2, START), testHost()).get(); // Wait for the jobs to run awaitJobState(client, testHost(), jobId1, RUNNING, LONG_WAIT_MINUTES, MINUTES); awaitJobState(client, testHost(), jobId2, RUNNING, LONG_WAIT_MINUTES, MINUTES); }
/**
 * Looks up the deployment of {@code jobId} on {@code host}.
 *
 * @param host host to inspect
 * @param jobId job whose deployment is requested
 * @return the deployment built from the stored task's goal, or {@code null} if the task node
 *     does not exist
 * @throws HeliosRuntimeException if reading or parsing the task fails for any other reason
 */
@Override
public Deployment getDeployment(final String host, final JobId jobId) {
  final String taskPath = Paths.configHostJob(host, jobId);
  final ZooKeeperClient zk = provider.get("getDeployment");

  try {
    final Task stored = parse(zk.getData(taskPath), Task.class);
    return Deployment.of(jobId, stored.getGoal());
  } catch (KeeperException.NoNodeException e) {
    // A missing task node means the job is not deployed on this host.
    return null;
  } catch (KeeperException | IOException e) {
    throw new HeliosRuntimeException("getting deployment failed", e);
  }
}
// TODO(drewc): this kinda screams "long method" private void deployJobRetry( final ZooKeeperClient client, final String host, final Deployment deployment, int count) throws JobDoesNotExistException, JobAlreadyDeployedException, HostNotFoundException, JobPortAllocationConflictException { if (count == 3) { throw new HeliosRuntimeException( "3 failures (possibly concurrent modifications) while " + "deploying. Giving up."); } log.info("deploying {}: {} (retry={})", deployment, host, count); final JobId id = deployment.getJobId(); final Job job = getJob(id); if (job == null) { throw new JobDoesNotExistException(id); } final UUID operationId = UUID.randomUUID(); final String jobPath = Paths.configJob(id); final String taskPath = Paths.configHostJob(host, id); final String taskCreationPath = Paths.configHostJobCreation(host, id, operationId); final List<Integer> staticPorts = staticPorts(job); final Map<String, byte[]> portNodes = Maps.newHashMap(); final byte[] idJson = id.toJsonBytes(); for (final int port : staticPorts) { final String path = Paths.configHostPort(host, port); portNodes.put(path, idJson); } final Task task = new Task(job, deployment.getGoal()); final List<ZooKeeperOperation> operations = Lists.newArrayList( check(jobPath), create(portNodes), create(Paths.configJobHost(id, host))); // Attempt to read a task here. 
If it's goal is UNDEPLOY, it's as good as not existing try { final Node existing = client.getNode(taskPath); byte[] bytes = existing.getBytes(); Task readTask = Json.read(bytes, Task.class); if (readTask.getGoal() != Goal.UNDEPLOY) { throw new JobAlreadyDeployedException(host, id); } operations.add(check(taskPath, existing.getStat().getVersion())); operations.add(set(taskPath, task)); } catch (NoNodeException e) { operations.add(create(taskPath, task)); operations.add(create(taskCreationPath)); } catch (IOException | KeeperException e) { throw new HeliosRuntimeException("reading existing task description failed", e); } // TODO (dano): Failure handling is racy wrt agent and job modifications. try { client.transaction(operations); log.info("deployed {}: {} (retry={})", deployment, host, count); } catch (NoNodeException e) { // Either the job, the host or the task went away assertJobExists(client, id); assertHostExists(client, host); // If the job and host still exists, we likely tried to redeploy a job that had an UNDEPLOY // goal and lost the race with the agent removing the task before we could set it. Retry. 
deployJobRetry(client, host, deployment, count + 1); } catch (NodeExistsException e) { // Check for conflict due to transaction retry try { if (client.exists(taskCreationPath) != null) { // Our creation operation node existed, we're done here return; } } catch (KeeperException ex) { throw new HeliosRuntimeException("checking job deployment failed", ex); } try { // Check if the job was already deployed if (client.stat(taskPath) != null) { throw new JobAlreadyDeployedException(host, id); } } catch (KeeperException ex) { throw new HeliosRuntimeException("checking job deployment failed", e); } // Check for static port collisions for (final int port : staticPorts) { final String path = Paths.configHostPort(host, port); try { if (client.stat(path) == null) { continue; } final byte[] b = client.getData(path); final JobId existingJobId = parse(b, JobId.class); throw new JobPortAllocationConflictException(id, existingJobId, host, port); } catch (KeeperException | IOException ex) { throw new HeliosRuntimeException("checking port allocations failed", e); } } // Catch all for logic and ephemeral issues throw new HeliosRuntimeException("deploying job failed", e); } catch (KeeperException e) { throw new HeliosRuntimeException("deploying job failed", e); } }