/**
   * Assembles a status snapshot for the host named by {@code host}.
   *
   * <p>The host's config id node is checked first; when it is absent no further lookups are made.
   * Otherwise the up/down flag, host info, agent info, deployed tasks, task statuses and
   * environment are each read through the same client and combined into one {@link HostStatus}.
   *
   * @param host name of the host to look up
   * @return the assembled status, or {@code null} if the host's id node does not exist
   * @throws HeliosRuntimeException if the existence check fails with a {@link KeeperException}
   */
  @Override
  public HostStatus getHostStatus(final String host) {
    final ZooKeeperClient zk = provider.get("getHostStatus");

    try {
      if (zk.exists(Paths.configHostId(host)) == null) {
        // No id node for this host, so there is nothing to report.
        return null;
      }
    } catch (KeeperException e) {
      throw new HeliosRuntimeException("Failed to check host status", e);
    }

    // Read every piece of host state before building the result.
    final boolean isUp = checkHostUp(zk, host);
    final HostInfo info = getHostInfo(zk, host);
    final AgentInfo agent = getAgentInfo(zk, host);
    final Map<JobId, Deployment> deployments = getTasks(zk, host);
    final Map<JobId, TaskStatus> reportedStatuses = getTaskStatuses(zk, host);
    final Map<String, String> env = getEnvironment(zk, host);

    // A missing status map is replaced by the shared empty default.
    final Map<JobId, TaskStatus> effectiveStatuses =
        fromNullable(reportedStatuses).or(EMPTY_STATUSES);

    return HostStatus.newBuilder()
        .setJobs(deployments)
        .setStatuses(effectiveStatuses)
        .setHostInfo(info)
        .setAgentInfo(agent)
        .setStatus(isUp ? UP : DOWN)
        .setEnvironment(env)
        .build();
  }
Beispiel #2
0
  /**
   * Performs one reaping pass over every host returned by {@code masterModel.listHosts()}.
   *
   * <p>A host is deregistered when its status is DOWN, it has agent info, and the time elapsed
   * since {@code startTime + uptime} of that agent info is at least {@code timeoutMillis}.
   * Failures for a single host are logged as warnings and do not stop the pass.
   */
  @Override
  protected void runOneIteration() {
    log.debug("Reaping agents");
    final List<String> hostNames = masterModel.listHosts();
    for (final String hostName : hostNames) {
      try {
        final HostStatus status = masterModel.getHostStatus(hostName);
        final boolean isDown =
            status != null && status.getStatus() == HostStatus.Status.DOWN;
        if (!isDown) {
          // Host not found or host not DOWN -- nothing to do, move on to the next host
          continue;
        }

        final AgentInfo info = status.getAgentInfo();
        if (info != null) {
          // The agent's start time plus its reported uptime is taken as the moment it went down.
          final long lastSeenMillis = info.getStartTime() + info.getUptime();
          final long downForMillis = clock.now().getMillis() - lastSeenMillis;

          if (downForMillis >= timeoutMillis) {
            try {
              log.info(
                  "Reaping dead agent '{}' (DOWN for {} hours)",
                  hostName,
                  DurationFormatUtils.formatDurationHMS(downForMillis));
              masterModel.deregisterHost(hostName);
            } catch (Exception reapFailure) {
              log.warn("Failed to reap agent '{}'", hostName, reapFailure);
            }
          }
        }
      } catch (Exception checkFailure) {
        log.warn("Failed to determine if agent '{}' should be reaped", hostName, checkFailure);
      }
    }
  }
  /**
   * Undoes the effect of {@link ZooKeeperMasterModel#registerHost(String, String)}. Cleans up any
   * leftover host-related things.
   *
   * <p>Delete operations for the host's job nodes, job history, status nodes, port allocations,
   * id node and config root are collected into one list and submitted through a single
   * {@code client.transaction(...)} call.
   *
   * @param host name of the host to deregister
   * @throws HostNotFoundException if no job list is available and the host's config node does not
   *     exist, or if a {@code NoNodeException} escapes the cleanup
   * @throws HostStillInUseException if a {@code NotEmptyException} is raised; carries the jobs
   *     currently reported by {@link #getHostStatus(String)} (empty if the status is unavailable)
   * @throws HeliosRuntimeException on any other {@link KeeperException}
   */
  @Override
  public void deregisterHost(final String host)
      throws HostNotFoundException, HostStillInUseException {
    log.info("deregistering host: {}", host);
    final ZooKeeperClient client = provider.get("deregisterHost");
    // TODO (dano): handle retry failures
    try {
      final List<ZooKeeperOperation> operations = Lists.newArrayList();

      // Remove all jobs deployed to this host
      final List<JobId> listedJobs = listHostJobs(client, host);

      if (listedJobs == null && client.exists(Paths.configHost(host)) == null) {
        throw new HostNotFoundException("host [" + host + "] does not exist");
      }

      // The host exists but no job list is available: treat that as "no jobs" so the remaining
      // host nodes are still cleaned up instead of failing with a NullPointerException below.
      final List<JobId> jobs =
          listedJobs != null ? listedJobs : Collections.<JobId>emptyList();

      for (final JobId job : jobs) {
        final String hostJobPath = Paths.configHostJob(host, job);
        final List<String> nodes = client.listRecursive(hostJobPath);
        // Reversed so that the nodes listed last are deleted first.
        for (final String node : reverse(nodes)) {
          operations.add(delete(node));
        }
        if (client.exists(Paths.configJobHost(job, host)) != null) {
          operations.add(delete(Paths.configJobHost(job, host)));
        }
        // Clean out the history for each job
        try {
          final List<String> history = client.listRecursive(Paths.historyJobHost(job, host));
          for (final String s : reverse(history)) {
            operations.add(delete(s));
          }
        } catch (NoNodeException ignore) {
          // Best effort: no history recorded for this job on this host, nothing to delete.
        }
      }
      operations.add(delete(Paths.configHostJobs(host)));

      // Remove the host status
      try {
        final List<String> nodes = client.listRecursive(Paths.statusHost(host));
        for (final String node : reverse(nodes)) {
          operations.add(delete(node));
        }
      } catch (NoNodeException ignore) {
        // Best effort: the host has no status subtree, nothing to delete.
      }

      // Remove port allocations
      try {
        final List<String> ports = client.getChildren(Paths.configHostPorts(host));
        for (final String port : ports) {
          operations.add(delete(Paths.configHostPort(host, Integer.valueOf(port))));
        }
        operations.add(delete(Paths.configHostPorts(host)));
      } catch (NoNodeException ignore) {
        // Best effort: the host has no port allocation node, nothing to delete.
      }

      // Remove host id
      final String idPath = Paths.configHostId(host);
      if (client.exists(idPath) != null) {
        operations.add(delete(idPath));
      }

      // Remove host config root
      operations.add(delete(Paths.configHost(host)));

      client.transaction(operations);
    } catch (NotEmptyException e) {
      // Report the jobs the host status currently lists, if a status can be read at all.
      final HostStatus hostStatus = getHostStatus(host);
      final List<JobId> jobs =
          hostStatus != null
              ? ImmutableList.copyOf(hostStatus.getJobs().keySet())
              : Collections.<JobId>emptyList();
      throw new HostStillInUseException(host, jobs);
    } catch (NoNodeException e) {
      throw new HostNotFoundException(host);
    } catch (KeeperException e) {
      throw new HeliosRuntimeException(e);
    }
  }