Exemplo n.º 1
0
  /**
   * Looks up the current status of the host named by {@code host}.
   *
   * @param host name of the host to inspect
   * @return the assembled {@link HostStatus}, or {@code null} when the host's id node does not
   *     exist
   * @throws HeliosRuntimeException if the existence check against ZooKeeper fails
   */
  @Override
  public HostStatus getHostStatus(final String host) {
    final ZooKeeperClient client = provider.get("getHostStatus");

    final Stat idNodeStat;
    try {
      idNodeStat = client.exists(Paths.configHostId(host));
    } catch (KeeperException e) {
      throw new HeliosRuntimeException("Failed to check host status", e);
    }
    if (idNodeStat == null) {
      // No id node for this host: there is nothing to report.
      return null;
    }

    // Collect every piece of host state up front, then assemble the result in one go.
    final boolean isUp = checkHostUp(client, host);
    final HostInfo info = getHostInfo(client, host);
    final AgentInfo agent = getAgentInfo(client, host);
    final Map<JobId, Deployment> deployedJobs = getTasks(client, host);
    final Map<JobId, TaskStatus> reportedStatuses = getTaskStatuses(client, host);
    final Map<String, String> env = getEnvironment(client, host);

    // A missing status map is replaced by the shared empty default.
    final Map<JobId, TaskStatus> effectiveStatuses =
        fromNullable(reportedStatuses).or(EMPTY_STATUSES);

    return HostStatus.newBuilder()
        .setJobs(deployedJobs)
        .setStatuses(effectiveStatuses)
        .setHostInfo(info)
        .setAgentInfo(agent)
        .setStatus(isUp ? UP : DOWN)
        .setEnvironment(env)
        .build();
  }
Exemplo n.º 2
0
  /**
   * Rewrites the task node of an already deployed job on {@code host} so that it carries the goal
   * from {@code deployment}.
   *
   * @param host host whose deployment is updated
   * @param deployment carries the job id and the new goal
   * @throws HostNotFoundException propagated from the host existence assertion
   * @throws JobNotDeployedException if the job cannot be found, or propagated from the task
   *     existence assertion
   * @throws HeliosRuntimeException if writing the updated task fails
   */
  @Override
  public void updateDeployment(final String host, final Deployment deployment)
      throws HostNotFoundException, JobNotDeployedException {
    log.info("updating deployment {}: {}", deployment, host);

    final ZooKeeperClient client = provider.get("updateDeployment");
    final JobId jobId = deployment.getJobId();

    final Job job = getJob(client, jobId);
    if (job == null) {
      throw new JobNotDeployedException(host, jobId);
    }

    // Both the host and the task must already exist before the task is overwritten.
    assertHostExists(client, host);
    assertTaskExists(client, host, jobId);

    final String taskPath = Paths.configHostJob(host, jobId);
    final Task updatedTask = new Task(job, deployment.getGoal());
    try {
      final byte[] payload = updatedTask.toJsonBytes();
      client.setData(taskPath, payload);
    } catch (Exception e) {
      final String message = "updating deployment " + deployment + " on host " + host + " failed";
      throw new HeliosRuntimeException(message, e);
    }
  }
Exemplo n.º 3
0
  /**
   * Deletes a job from ZooKeeper in a single transaction.
   *
   * <p>A job that is still deployed somewhere is reported through
   * {@link JobStillDeployedException}, raised when the delete transaction hits a non-empty node.
   *
   * @param id id of the job to remove
   * @return the job that was removed
   * @throws JobDoesNotExistException if the job is unknown or one of its nodes is already gone
   * @throws JobStillDeployedException if a node to delete still has children
   * @throws HeliosRuntimeException on any other ZooKeeper failure
   */
  @Override
  public Job removeJob(final JobId id) throws JobDoesNotExistException, JobStillDeployedException {
    log.info("removing job: id={}", id);
    final ZooKeeperClient client = provider.get("removeJob");

    final Job existing = getJob(client, id);
    if (existing == null) {
      throw new JobDoesNotExistException(id);
    }

    // TODO (dano): handle retry failures
    try {
      final UUID creationId = getJobCreation(client, id);
      final ImmutableList.Builder<ZooKeeperOperation> deletions = ImmutableList.builder();

      // The creation marker is optional; delete it only when one is recorded.
      if (creationId != null) {
        deletions.add(delete(Paths.configJobCreation(id, creationId)));
      }
      deletions.add(delete(Paths.configJobHosts(id)));
      deletions.add(delete(Paths.configJobRefShort(id)));
      deletions.add(delete(Paths.configJob(id)));

      client.transaction(deletions.build());
    } catch (final NoNodeException e) {
      throw new JobDoesNotExistException(id);
    } catch (final NotEmptyException e) {
      throw new JobStillDeployedException(id, listJobHosts(client, id));
    } catch (final KeeperException e) {
      throw new HeliosRuntimeException("removing job " + id + " failed", e);
    }

    return existing;
  }
Exemplo n.º 4
0
 /**
  * Creates a config entry within the specified agent to un/deploy a job, or more generally,
  * changes the deployment status according to the {@link Goal} value in {@link Deployment}.
  *
  * <p>The work is delegated to {@code deployJobRetry}, starting with a count of zero.
  *
  * @param host host to deploy to
  * @param deployment the job id and goal to apply
  */
 @Override
 public void deployJob(final String host, final Deployment deployment)
     throws JobDoesNotExistException, JobAlreadyDeployedException, HostNotFoundException,
         JobPortAllocationConflictException {
   deployJobRetry(provider.get("deployJob"), host, deployment, 0);
 }
Exemplo n.º 5
0
 /**
  * Reads every known job and returns them as a {@link Map} keyed by {@link JobId}.
  *
  * @return a mutable map of job id to {@link Job}; empty when the jobs folder does not exist
  * @throws HeliosRuntimeException if listing or reading the job nodes fails
  */
 @Override
 public Map<JobId, Job> getJobs() {
   log.debug("getting jobs");
   final String jobsFolder = Paths.configJobs();
   final ZooKeeperClient client = provider.get("getJobs");
   try {
     final List<String> children;
     try {
       children = client.getChildren(jobsFolder);
     } catch (NoNodeException e) {
       // Only a missing jobs folder means "no jobs"; a node missing further down is an error.
       return Maps.newHashMap();
     }

     final Map<JobId, Job> jobsById = Maps.newHashMap();
     for (final String child : children) {
       final byte[] raw = client.getData(Paths.configJob(JobId.fromString(child)));
       final Job job = parse(raw, Job.class);
       jobsById.put(job.getId(), job);
     }
     return jobsById;
   } catch (KeeperException | IOException e) {
     throw new HeliosRuntimeException("getting jobs failed", e);
   }
 }
Exemplo n.º 6
0
 /**
  * Adds a job to the configuration.
  *
  * <p>The job nodes are created in one transaction together with a creation marker node unique
  * to this call. If the transaction reports that a node already exists, the marker is checked:
  * when it is present the job is treated as created by this call and the method returns
  * normally.
  *
  * @param job the job to add
  * @throws JobExistsException if a node already exists and this call's marker is absent
  * @throws HeliosRuntimeException on any other ZooKeeper failure
  */
 @Override
 public void addJob(final Job job) throws JobExistsException {
   log.info("adding job: {}", job);
   final JobId id = job.getId();
   final String creationPath = Paths.configJobCreation(id, UUID.randomUUID());
   final ZooKeeperClient client = provider.get("addJob");
   try {
     try {
       client.ensurePath(Paths.historyJob(id));
       client.transaction(
           create(Paths.configJob(id), job),
           create(Paths.configJobRefShort(id), id),
           create(Paths.configJobHosts(id)),
           create(creationPath));
     } catch (final NodeExistsException e) {
       final boolean createdByThisCall = client.exists(creationPath) != null;
       if (!createdByThisCall) {
         throw new JobExistsException(id.toString());
       }
       // The job was created, we're done here
     }
   } catch (final KeeperException e) {
     throw new HeliosRuntimeException("adding job " + job + " failed", e);
   }
 }
Exemplo n.º 7
0
 /**
  * Lists the names of the hosts/agents that have been registered.
  *
  * @return the children of the hosts config node, or an empty list when that node is missing
  * @throws HeliosRuntimeException on any other ZooKeeper failure
  */
 @Override
 public List<String> listHosts() {
   try {
     final ZooKeeperClient client = provider.get("listHosts");
     // TODO (dano): only return hosts whose agents completed registration (i.e. has id nodes)
     final List<String> hostNames = client.getChildren(Paths.configHosts());
     return hostNames;
   } catch (KeeperException.NoNodeException e) {
     return emptyList();
   } catch (KeeperException e) {
     throw new HeliosRuntimeException("listing hosts failed", e);
   }
 }
Exemplo n.º 8
0
  /**
   * Undeploys the job specified by {@code jobId} on {@code host}.
   *
   * <p>In one transaction the host's task node is rewritten with the {@code UNDEPLOY} goal, the
   * job-to-host relation node is deleted, and the host's static port nodes for the job are
   * deleted.
   *
   * @param host host to undeploy from
   * @param jobId job to undeploy
   * @return the deployment as it was read before the undeploy was issued
   * @throws HostNotFoundException propagated from the host existence assertion
   * @throws JobNotDeployedException if no deployment of the job exists on the host
   * @throws HeliosRuntimeException if the transaction fails for any reason other than the task
   *     node itself being gone
   */
  @Override
  public Deployment undeployJob(final String host, final JobId jobId)
      throws HostNotFoundException, JobNotDeployedException {
    log.info("undeploying {}: {}", jobId, host);
    final ZooKeeperClient client = provider.get("undeployJob");

    assertHostExists(client, host);

    final Deployment deployment = getDeployment(host, jobId);
    if (deployment == null) {
      throw new JobNotDeployedException(host, jobId);
    }

    // TODO (dano): Is this safe? can the ports of an undeployed job collide with a new deployment?
    // TODO (drewc):  If it's still in UNDEPLOY, that means the agent hasn't gotten to it
    //    yet, which means it probably won't see the new job yet either.  However, it may spin up
    //    a new supervisor for the new job before the old one is done being torn down.  So it can
    //    race and lose.  With a little change to the Agent where we manage the supervisors, with
    //    some coordination, we could remove the race, such that this race goes away.  Specifically,
    //    we're creating new Supervisors before updating existing ones.  If we swap that, part of
    //    the problem goes away, but we'd need some coordination between the Supervisor and the
    //    agent such that the Agent could wait until the Supervisor had handled the Goal change.
    //    Additionally, since ZK guarantees we'll see the writes in the proper order, we wouldn't
    //    need to deal with seeing the new job before the UNDEPLOY.

    final Job job = getJob(client, jobId);
    final String path = Paths.configHostJob(host, jobId);
    final Task task = new Task(job, UNDEPLOY);
    final List<ZooKeeperOperation> operations =
        Lists.newArrayList(set(path, task.toJsonBytes()), delete(Paths.configJobHost(jobId, host)));

    // Release every static port node of the job on this host in the same transaction.
    final List<Integer> staticPorts = staticPorts(job);
    for (int port : staticPorts) {
      operations.add(delete(Paths.configHostPort(host, port)));
    }

    try {
      client.transaction(operations);
    } catch (NoNodeException e) {
      // KeeperException.getPath() may return null when no path is attached to the error, so the
      // comparison is made from the known path to avoid a NullPointerException.
      if (path.equals(e.getPath())) {
        // NoNodeException on updating the deployment node may happen due to retry failures.
        // If the deployment isn't there anymore, we're done.
        return deployment;
      }
      // The relation node deletes should not fail unless there is a programming error.
      throw new HeliosRuntimeException("Removing deployment failed", e);
    } catch (KeeperException e) {
      throw new HeliosRuntimeException("Removing deployment failed", e);
    }
    return deployment;
  }
Exemplo n.º 9
0
 /**
  * Reads the deployment state of {@code jobId} on {@code host}.
  *
  * @param host host to look at
  * @param jobId job whose deployment is requested
  * @return a {@link Deployment} built from the stored task's goal, or {@code null} when the
  *     task node does not exist
  * @throws HeliosRuntimeException if reading or parsing the task node fails
  */
 @Override
 public Deployment getDeployment(final String host, final JobId jobId) {
   final String taskPath = Paths.configHostJob(host, jobId);
   final ZooKeeperClient client = provider.get("getDeployment");
   try {
     final Task storedTask = parse(client.getData(taskPath), Task.class);
     return Deployment.of(jobId, storedTask.getGoal());
   } catch (KeeperException.NoNodeException e) {
     // No task node means the job is not deployed on this host.
     return null;
   } catch (KeeperException | IOException e) {
     throw new HeliosRuntimeException("getting deployment failed", e);
   }
 }
Exemplo n.º 10
0
 /**
  * Lists the host names of the currently running masters.
  *
  * <p>A master counts as running when its "up" node exists beneath the master status node.
  *
  * @return an immutable list of the masters whose up node exists
  * @throws HeliosRuntimeException if any ZooKeeper call fails
  */
 @Override
 public List<String> getRunningMasters() {
   final ZooKeeperClient client = provider.get("getRunningMasters");
   try {
     final ImmutableList.Builder<String> running = ImmutableList.builder();
     for (final String candidate : client.getChildren(Paths.statusMaster())) {
       final boolean isUp = client.exists(Paths.statusMasterUp(candidate)) != null;
       if (isUp) {
         running.add(candidate);
       }
     }
     return running.build();
   } catch (KeeperException e) {
     throw new HeliosRuntimeException("listing masters failed", e);
   }
 }
Exemplo n.º 11
0
  /**
   * Given a jobId, returns the task status events recorded in its history across all hosts,
   * sorted with {@code EVENT_COMPARATOR}.
   *
   * @param jobId job whose history is requested
   * @return a sorted copy of the collected events; empty when the job has no history hosts node
   * @throws JobDoesNotExistException if the job is unknown
   */
  @Override
  public List<TaskStatusEvent> getJobHistory(final JobId jobId) throws JobDoesNotExistException {
    if (getJob(jobId) == null) {
      throw new JobDoesNotExistException(jobId);
    }

    final ZooKeeperClient client = provider.get("getJobHistory");

    final List<String> historyHosts;
    try {
      historyHosts = client.getChildren(Paths.historyJobHosts(jobId));
    } catch (NoNodeException e) {
      return emptyList();
    } catch (KeeperException e) {
      throw Throwables.propagate(e);
    }

    final List<TaskStatusEvent> collected = Lists.newArrayList();
    for (final String host : historyHosts) {
      final List<String> eventNames;
      try {
        eventNames = client.getChildren(Paths.historyJobHostEvents(jobId, host));
      } catch (KeeperException e) {
        throw Throwables.propagate(e);
      }

      for (final String eventName : eventNames) {
        try {
          // Each event node is named after its timestamp.
          final Long timestamp = Long.valueOf(eventName);
          final byte[] raw =
              client.getData(Paths.historyJobHostEventsTimestamp(jobId, host, timestamp));
          final TaskStatus status = Json.read(raw, TaskStatus.class);
          collected.add(new TaskStatusEvent(status, timestamp, host));
        } catch (NoNodeException e) {
          // ignore, it went away before we read it
        } catch (KeeperException | IOException e) {
          throw Throwables.propagate(e);
        }
      }
    }

    return Ordering.from(EVENT_COMPARATOR).sortedCopy(collected);
  }
Exemplo n.º 12
0
  /**
   * Registers a host into ZooKeeper. The {@code id} is initially generated randomly by the Agent
   * and persisted on disk. This way, in the event that you have two agents attempting to register
   * with the same value of {@code host}, the first one will win.
   *
   * @param host name of the host to register
   * @param id agent-generated id, stored UTF-8 encoded in the host's id node
   * @throws HeliosRuntimeException if any step of the registration fails
   */
  @Override
  public void registerHost(final String host, final String id) {
    log.info("registering host: {}", host);
    final ZooKeeperClient client = provider.get("registerHost");
    try {
      // TODO (dano): this code is replicated in AgentZooKeeperRegistrar

      // This would've been nice to do in a transaction but PathChildrenCache ensures paths
      // so we can't know what paths already exist so assembling a suitable transaction is too
      // painful.
      final String[] requiredPaths = {
        Paths.configHost(host),
        Paths.configHostJobs(host),
        Paths.configHostPorts(host),
        Paths.statusHost(host),
        Paths.statusHostJobs(host),
      };
      for (final String requiredPath : requiredPaths) {
        client.ensurePath(requiredPath);
      }

      // Finish registration by creating the id node last
      final byte[] idBytes = id.getBytes(UTF_8);
      client.createAndSetData(Paths.configHostId(host), idBytes);
    } catch (Exception e) {
      throw new HeliosRuntimeException("registering host " + host + " failed", e);
    }
  }
Exemplo n.º 13
0
  /**
   * Builds the current status of a job as a {@link JobStatus} object.
   *
   * <p>For every host the job is listed on, the host's task status and deployment are looked up,
   * and each one is included only when present.
   *
   * @param jobId job to report on
   * @return the job status, or {@code null} when the job does not exist
   */
  @Override
  public JobStatus getJobStatus(final JobId jobId) {
    final ZooKeeperClient client = provider.get("getJobStatus");

    final Job job = getJob(client, jobId);
    if (job == null) {
      return null;
    }

    final List<String> jobHosts;
    try {
      jobHosts = listJobHosts(client, jobId);
    } catch (JobDoesNotExistException e) {
      // The job went missing between the two lookups; report it as absent.
      return null;
    }

    final ImmutableMap.Builder<String, TaskStatus> statusesByHost = ImmutableMap.builder();
    final ImmutableMap.Builder<String, Deployment> deploymentsByHost = ImmutableMap.builder();
    for (final String host : jobHosts) {
      final TaskStatus status = getTaskStatus(client, host, jobId);
      if (status != null) {
        statusesByHost.put(host, status);
      }

      final Deployment deployment = getDeployment(host, jobId);
      if (deployment != null) {
        deploymentsByHost.put(host, deployment);
      }
    }

    return JobStatus.newBuilder()
        .setJob(job)
        .setDeployments(deploymentsByHost.build())
        .setTaskStatuses(statusesByHost.build())
        .build();
  }
Exemplo n.º 14
0
 /**
  * Fetches the configuration of the job specified by {@code id} as a {@link Job} object.
  *
  * @param id id of the job to fetch
  * @return whatever the client-based {@code getJob} overload returns for this id
  */
 @Override
 public Job getJob(final JobId id) {
   log.debug("getting job: {}", id);
   return getJob(provider.get("getJob"), id);
 }
Exemplo n.º 15
0
  /**
   * Undoes the effect of {@link ZooKeeperMasterModel#registerHost(String, String)}. Cleans up any
   * leftover host-related things.
   *
   * <p>All deletions (job nodes, job-to-host relations, per-host job history, host status, port
   * allocations, the id node and the host config root) are collected first and then executed in
   * one transaction.
   *
   * @param host name of the host to deregister
   * @throws HostNotFoundException if the host config node does not exist, or the transaction
   *     reports a missing node
   * @throws HostStillInUseException if the transaction reports a non-empty node
   * @throws HeliosRuntimeException on any other ZooKeeper failure
   */
  @Override
  public void deregisterHost(final String host)
      throws HostNotFoundException, HostStillInUseException {
    log.info("deregistering host: {}", host);
    final ZooKeeperClient client = provider.get("deregisterHost");
    // TODO (dano): handle retry failures
    try {
      final List<ZooKeeperOperation> operations = Lists.newArrayList();

      // Remove all jobs deployed to this host
      final List<JobId> jobs = listHostJobs(client, host);

      if (jobs == null && client.exists(Paths.configHost(host)) == null) {
        throw new HostNotFoundException("host [" + host + "] does not exist");
      }

      // The job list can be null while the host config node still exists; iterating it
      // unguarded would throw a NullPointerException, so the per-job cleanup is skipped then.
      // NOTE(review): presumably null means the host's jobs node is missing - confirm against
      // listHostJobs.
      if (jobs != null) {
        for (JobId job : jobs) {
          final String hostJobPath = Paths.configHostJob(host, job);
          final List<String> nodes = client.listRecursive(hostJobPath);
          // Reversed so that children are deleted before their parents.
          for (final String node : reverse(nodes)) {
            operations.add(delete(node));
          }
          if (client.exists(Paths.configJobHost(job, host)) != null) {
            operations.add(delete(Paths.configJobHost(job, host)));
          }
          // Clean out the history for each job
          try {
            final List<String> history = client.listRecursive(Paths.historyJobHost(job, host));
            for (String s : reverse(history)) {
              operations.add(delete(s));
            }
          } catch (NoNodeException ignored) {
            // No history recorded for this job on this host; nothing to delete.
          }
        }
      }
      operations.add(delete(Paths.configHostJobs(host)));

      // Remove the host status
      try {
        final List<String> nodes = client.listRecursive(Paths.statusHost(host));
        for (final String node : reverse(nodes)) {
          operations.add(delete(node));
        }
      } catch (NoNodeException ignored) {
        // The host has no status subtree; nothing to delete.
      }

      // Remove port allocations
      try {
        final List<String> ports = client.getChildren(Paths.configHostPorts(host));
        for (final String port : ports) {
          operations.add(delete(Paths.configHostPort(host, Integer.valueOf(port))));
        }
        operations.add(delete(Paths.configHostPorts(host)));
      } catch (NoNodeException ignored) {
        // The host has no ports node; nothing to delete.
      }

      // Remove host id
      String idPath = Paths.configHostId(host);
      if (client.exists(idPath) != null) {
        operations.add(delete(idPath));
      }

      // Remove host config root
      operations.add(delete(Paths.configHost(host)));

      client.transaction(operations);
    } catch (NotEmptyException e) {
      // Something still lives under a node being deleted: report the jobs currently on the host.
      final HostStatus hostStatus = getHostStatus(host);
      final List<JobId> jobs =
          hostStatus != null
              ? ImmutableList.copyOf(hostStatus.getJobs().keySet())
              : Collections.<JobId>emptyList();
      throw new HostStillInUseException(host, jobs);
    } catch (NoNodeException e) {
      throw new HostNotFoundException(host);
    } catch (KeeperException e) {
      throw new HeliosRuntimeException(e);
    }
  }