/**
 * Assembles a {@link HostStatus} snapshot for the host named by {@code host}.
 *
 * <p>The host id node is checked first; only when it exists are the up/down flag, host info,
 * agent info, deployed tasks, task statuses and environment read and combined into the result.
 *
 * @param host the name of the host to look up
 * @return the assembled status, or {@code null} when no node exists at
 *     {@code Paths.configHostId(host)}
 * @throws HeliosRuntimeException if the existence check fails with a {@link KeeperException}
 */
@Override
public HostStatus getHostStatus(final String host) {
  final ZooKeeperClient zk = provider.get("getHostStatus");

  // Guard: without a host id node there is nothing to report.
  try {
    if (zk.exists(Paths.configHostId(host)) == null) {
      return null;
    }
  } catch (KeeperException e) {
    throw new HeliosRuntimeException("Failed to check host status", e);
  }

  // Gather each piece of the status in turn.
  final boolean isUp = checkHostUp(zk, host);
  final HostInfo info = getHostInfo(zk, host);
  final AgentInfo agent = getAgentInfo(zk, host);
  final Map<JobId, Deployment> deployments = getTasks(zk, host);
  final Map<JobId, TaskStatus> taskStatuses = getTaskStatuses(zk, host);
  final Map<String, String> env = getEnvironment(zk, host);

  return HostStatus.newBuilder()
      .setJobs(deployments)
      // A null status map is replaced by the shared empty default.
      .setStatuses(fromNullable(taskStatuses).or(EMPTY_STATUSES))
      .setHostInfo(info)
      .setAgentInfo(agent)
      .setStatus(isUp ? UP : DOWN)
      .setEnvironment(env)
      .build();
}
@Override protected void runOneIteration() { log.debug("Reaping agents"); final List<String> agents = masterModel.listHosts(); for (final String agent : agents) { try { final HostStatus hostStatus = masterModel.getHostStatus(agent); if (hostStatus == null || hostStatus.getStatus() != HostStatus.Status.DOWN) { // Host not found or host not DOWN -- nothing to do, move on to the next host continue; } final AgentInfo agentInfo = hostStatus.getAgentInfo(); if (agentInfo == null) { continue; } final long downSince = agentInfo.getStartTime() + agentInfo.getUptime(); final long downDurationMillis = clock.now().getMillis() - downSince; if (downDurationMillis >= timeoutMillis) { try { log.info( "Reaping dead agent '{}' (DOWN for {} hours)", agent, DurationFormatUtils.formatDurationHMS(downDurationMillis)); masterModel.deregisterHost(agent); } catch (Exception e) { log.warn("Failed to reap agent '{}'", agent, e); } } } catch (Exception e) { log.warn("Failed to determine if agent '{}' should be reaped", agent, e); } } }
/** * Undoes the effect of {@link ZooKeeperMasterModel#registerHost(String, String)}. Cleans up any * leftover host-related things. */ @Override public void deregisterHost(final String host) throws HostNotFoundException, HostStillInUseException { log.info("deregistering host: {}", host); final ZooKeeperClient client = provider.get("deregisterHost"); // TODO (dano): handle retry failures try { final List<ZooKeeperOperation> operations = Lists.newArrayList(); // Remove all jobs deployed to this host final List<JobId> jobs = listHostJobs(client, host); if (jobs == null) { if (client.exists(Paths.configHost(host)) == null) { throw new HostNotFoundException("host [" + host + "] does not exist"); } } for (JobId job : jobs) { final String hostJobPath = Paths.configHostJob(host, job); final List<String> nodes = client.listRecursive(hostJobPath); for (final String node : reverse(nodes)) { operations.add(delete(node)); } if (client.exists(Paths.configJobHost(job, host)) != null) { operations.add(delete(Paths.configJobHost(job, host))); } // Clean out the history for each job try { final List<String> history = client.listRecursive(Paths.historyJobHost(job, host)); for (String s : reverse(history)) { operations.add(delete(s)); } } catch (NoNodeException ignore) { } } operations.add(delete(Paths.configHostJobs(host))); // Remove the host status try { final List<String> nodes = client.listRecursive(Paths.statusHost(host)); for (final String node : reverse(nodes)) { operations.add(delete(node)); } } catch (NoNodeException ignore) { } // Remove port allocations try { final List<String> ports = client.getChildren(Paths.configHostPorts(host)); for (final String port : ports) { operations.add(delete(Paths.configHostPort(host, Integer.valueOf(port)))); } operations.add(delete(Paths.configHostPorts(host))); } catch (NoNodeException ignore) { } // Remove host id String idPath = Paths.configHostId(host); if (client.exists(idPath) != null) { operations.add(delete(idPath)); } // Remove 
host config root operations.add(delete(Paths.configHost(host))); client.transaction(operations); } catch (NotEmptyException e) { final HostStatus hostStatus = getHostStatus(host); final List<JobId> jobs = hostStatus != null ? ImmutableList.copyOf(hostStatus.getJobs().keySet()) : Collections.<JobId>emptyList(); throw new HostStillInUseException(host, jobs); } catch (NoNodeException e) { throw new HostNotFoundException(host); } catch (KeeperException e) { throw new HeliosRuntimeException(e); } }