Ejemplo n.º 1
0
  /**
   * Wraps every request in a {@link SingularityRequestParent} that carries the request, its state
   * and, when one was found, its deploy state.
   *
   * <p>Deploy states are fetched with a single bulk lookup keyed by request ID. The three remaining
   * deploy-related constructor arguments are always passed as absent.
   *
   * @param requests the requests to wrap; this iterable is traversed twice
   * @return one parent per request, in iteration order
   */
  private List<SingularityRequestParent> getRequestsWithDeployState(
      Iterable<SingularityRequestWithState> requests) {
    // First pass: gather the IDs so the deploy states can be fetched in one call.
    final List<String> ids = Lists.newArrayList();
    for (SingularityRequestWithState item : requests) {
      ids.add(item.getRequest().getId());
    }

    final List<SingularityRequestParent> result = Lists.newArrayListWithCapacity(ids.size());
    final Map<String, SingularityRequestDeployState> statesById =
        deployManager.getRequestDeployStatesByRequestIds(ids);

    // Second pass: pair each request with its deploy state (absent when the lookup had no entry).
    for (SingularityRequestWithState item : requests) {
      final String id = item.getRequest().getId();
      final Optional<SingularityRequestDeployState> maybeState =
          Optional.fromNullable(statesById.get(id));

      final SingularityRequestParent parent =
          new SingularityRequestParent(
              item.getRequest(),
              item.getState(),
              maybeState,
              Optional.<SingularityDeploy>absent(),
              Optional.<SingularityDeploy>absent(),
              Optional.<SingularityPendingDeploy>absent());
      result.add(parent);
    }

    return result;
  }
Ejemplo n.º 2
0
  /**
   * Moves a request out of the SYSTEM_COOLDOWN state and, when it has an in-use deploy and is not
   * a one-off request, queues an IMMEDIATE pending request for it.
   *
   * @param requestId ID of the request to take out of cooldown
   * @param user optional user performing the action
   * @return the filled-in request, built from the request-with-state fetched before the change
   */
  @POST
  @Path("/request/{requestId}/exit-cooldown")
  public SingularityRequestParent exitCooldown(
      @PathParam("requestId") String requestId, @QueryParam("user") Optional<String> user) {
    final SingularityRequestWithState current = fetchRequestWithState(requestId);
    final RequestState currentState = current.getState();

    // Only requests that are actually cooling down may use this endpoint.
    checkConflict(
        currentState == RequestState.SYSTEM_COOLDOWN,
        "Request %s is not in SYSTEM_COOLDOWN state, it is in %s",
        requestId,
        currentState);

    final Optional<String> inUseDeployId = deployManager.getInUseDeployId(requestId);
    final long now = System.currentTimeMillis();

    requestManager.exitCooldown(current.getRequest(), now, user);

    final boolean shouldEnqueue = inUseDeployId.isPresent() && !current.getRequest().isOneOff();
    if (shouldEnqueue) {
      final SingularityPendingRequest pending =
          new SingularityPendingRequest(
              requestId,
              inUseDeployId.get(),
              now,
              user,
              PendingType.IMMEDIATE,
              Collections.<String>emptyList());
      requestManager.addToPendingQueue(pending);
    }

    return fillEntireRequest(current);
  }
  /**
   * Returns the stored statistics for a deploy, or a freshly built statistics object for the same
   * request/deploy pair when none are stored.
   *
   * @param requestId ID of the request owning the deploy
   * @param deployId ID of the deploy
   * @return the stored statistics if present, otherwise the builder's default instance
   */
  private SingularityDeployStatistics getDeployStatistics(String requestId, String deployId) {
    final Optional<SingularityDeployStatistics> stored =
        deployManager.getDeployStatistics(requestId, deployId);

    // The default is only built when nothing was stored.
    return stored.isPresent()
        ? stored.get()
        : new SingularityDeployStatisticsBuilder(requestId, deployId).build();
  }
Ejemplo n.º 4
0
  /**
   * Looks up the in-use deploy ID for a request and requires that one exists.
   *
   * @param requestId ID of the request
   * @return the in-use deploy ID
   */
  private String getAndCheckDeployId(String requestId) {
    final Optional<String> inUse = deployManager.getInUseDeployId(requestId);
    final boolean hasDeploy = inUse.isPresent();

    // NOTE(review): checkConflict presumably rejects the call when hasDeploy is false, so the
    // get() below is never reached on an absent value - confirm against its definition.
    checkConflict(hasDeploy, "Can not schedule/bounce a request (%s) with no deploy", requestId);

    return inUse.get();
  }
Ejemplo n.º 5
0
  /**
   * Enqueues new-task checks and health checks for the currently active tasks.
   *
   * <p>Tasks whose current simplified state is DONE are skipped. For every other task, a new-task
   * check is enqueued unless a pending deploy exists for the task's request/deploy pair, and a
   * health check is attempted when the task is RUNNING. A summary line with the counts and the
   * elapsed time is logged at the end.
   */
  private void enqueueHealthAndNewTaskChecks() {
    final long start = System.currentTimeMillis();

    final List<SingularityTask> activeTasks = taskManager.getActiveTasks();
    final Map<SingularityTaskId, SingularityTask> activeTaskMap =
        Maps.uniqueIndex(activeTasks, SingularityTaskIdHolder.getTaskIdFunction());

    final Map<SingularityTaskId, List<SingularityTaskHistoryUpdate>> taskUpdates =
        taskManager.getTaskHistoryUpdates(activeTaskMap.keySet());

    // Index pending deploys and requests once so the per-task lookups below are map reads.
    final Map<SingularityDeployKey, SingularityPendingDeploy> pendingDeploys =
        Maps.uniqueIndex(
            deployManager.getPendingDeploys(), SingularityDeployKey.FROM_PENDING_TO_DEPLOY_KEY);
    final Map<String, SingularityRequestWithState> idToRequest =
        Maps.uniqueIndex(
            requestManager.getRequests(), SingularityRequestWithState.REQUEST_STATE_TO_REQUEST_ID);

    int enqueuedNewTaskChecks = 0;
    int enqueuedHealthchecks = 0;

    for (Map.Entry<SingularityTaskId, SingularityTask> entry : activeTaskMap.entrySet()) {
      final SingularityTaskId taskId = entry.getKey();
      final SingularityTask task = entry.getValue();
      final SimplifiedTaskState simplifiedTaskState =
          SingularityTaskHistoryUpdate.getCurrentState(taskUpdates.get(taskId));

      if (simplifiedTaskState == SimplifiedTaskState.DONE) {
        continue; // nothing to check for a task that has already finished
      }

      final SingularityDeployKey deployKey =
          new SingularityDeployKey(taskId.getRequestId(), taskId.getDeployId());
      final Optional<SingularityPendingDeploy> pendingDeploy =
          Optional.fromNullable(pendingDeploys.get(deployKey));
      final Optional<SingularityRequestWithState> request =
          Optional.fromNullable(idToRequest.get(taskId.getRequestId()));

      // New-task checks are only enqueued for tasks that are not part of a pending deploy.
      if (!pendingDeploy.isPresent()) {
        newTaskChecker.enqueueNewTaskCheck(task, request, healthchecker);
        enqueuedNewTaskChecks++;
      }

      // The health checker reports whether it actually enqueued a check; count only those.
      if (simplifiedTaskState == SimplifiedTaskState.RUNNING
          && healthchecker.enqueueHealthcheck(task, pendingDeploy, request)) {
        enqueuedHealthchecks++;
      }
    }

    LOG.info(
        "Enqueued {} health checks and {} new task checks (out of {} active tasks) in {}",
        enqueuedHealthchecks,
        enqueuedNewTaskChecks,
        activeTasks.size(),
        JavaUtils.duration(start));
  }
  /**
   * Decides whether tasks should be scheduled for a pending request.
   *
   * @param pendingRequest the pending request being considered
   * @param maybeRequest the request (with state) it refers to, if one exists
   * @return {@code false} when {@code isRequestActive} rejects the request; otherwise the result
   *     of {@code isDeployInUse} for the pending request's deploy ID
   */
  private boolean shouldScheduleTasks(
      SingularityPendingRequest pendingRequest,
      Optional<SingularityRequestWithState> maybeRequest) {
    if (isRequestActive(maybeRequest)) {
      // The deploy state is only fetched once the request is known to be active.
      final Optional<SingularityRequestDeployState> deployState =
          deployManager.getRequestDeployState(pendingRequest.getRequestId());

      return isDeployInUse(deployState, pendingRequest.getDeployId(), false);
    }

    return false;
  }
Ejemplo n.º 7
0
  /**
   * Loads the active and pending deploys of a request into a holder.
   *
   * <p>Each deploy is fetched only when the request's deploy state has a marker for it; otherwise
   * the corresponding slot in the holder is absent.
   *
   * @param requestId ID of the request
   * @return a holder with the active and pending deploys, either of which may be absent
   */
  private SingularityRequestDeployHolder getDeployHolder(String requestId) {
    final Optional<SingularityRequestDeployState> maybeState =
        deployManager.getRequestDeployState(requestId);

    // No deploy state at all: both slots stay absent.
    if (!maybeState.isPresent()) {
      return new SingularityRequestDeployHolder(
          Optional.<SingularityDeploy>absent(), Optional.<SingularityDeploy>absent());
    }

    final SingularityRequestDeployState state = maybeState.get();

    Optional<SingularityDeploy> active = Optional.absent();
    if (state.getActiveDeploy().isPresent()) {
      active = deployManager.getDeploy(requestId, state.getActiveDeploy().get().getDeployId());
    }

    Optional<SingularityDeploy> pending = Optional.absent();
    if (state.getPendingDeploy().isPresent()) {
      pending = deployManager.getDeploy(requestId, state.getPendingDeploy().get().getDeployId());
    }

    return new SingularityRequestDeployHolder(active, pending);
  }
Ejemplo n.º 8
0
  /**
   * Queues a STARTUP pending request for an active request, unless there is a reason not to.
   *
   * <p>Nothing is queued when the request is ON_DEMAND or RUN_ONCE, when it has no active deploy,
   * or when it is scheduled and already has a pending task for the active deploy that was created
   * at or after the request's timestamp.
   *
   * @param requestWithState the request being examined
   * @param deployKeyToPendingTaskId existing pending task IDs, indexed by request/deploy key
   * @param timestamp timestamp to stamp on the queued pending request
   */
  private void checkActiveRequest(
      SingularityRequestWithState requestWithState,
      Map<SingularityDeployKey, SingularityPendingTaskId> deployKeyToPendingTaskId,
      final long timestamp) {
    final SingularityRequest request = requestWithState.getRequest();
    final RequestType type = request.getRequestType();

    // There's no situation where we'd want to schedule an On Demand or Run Once request at
    // startup, so don't even bother with them.
    if (type == RequestType.ON_DEMAND || type == RequestType.RUN_ONCE) {
      return;
    }

    final String requestId = request.getId();
    final Optional<SingularityRequestDeployState> maybeDeployState =
        deployManager.getRequestDeployState(requestId);

    final boolean hasActiveDeploy =
        maybeDeployState.isPresent() && maybeDeployState.get().getActiveDeploy().isPresent();
    if (!hasActiveDeploy) {
      LOG.debug("No active deploy for {} - not scheduling on startup", requestId);
      return;
    }

    final String activeDeployId = maybeDeployState.get().getActiveDeploy().get().getDeployId();

    if (request.isScheduled()) {
      final SingularityPendingTaskId existing =
          deployKeyToPendingTaskId.get(new SingularityDeployKey(requestId, activeDeployId));

      // A pending task at least as new as the request itself is still valid; keep it.
      final boolean alreadyPending =
          existing != null && existing.getCreatedAt() >= requestWithState.getTimestamp();
      if (alreadyPending) {
        LOG.info(
            "Not rescheduling {} because {} is newer than {}",
            requestId,
            existing,
            requestWithState.getTimestamp());
        return;
      }
    }

    final SingularityPendingRequest startupRequest =
        new SingularityPendingRequest(
            requestId,
            activeDeployId,
            timestamp,
            Optional.<String>absent(),
            PendingType.STARTUP,
            Optional.<Boolean>absent(),
            Optional.<String>absent());
    requestManager.addToPendingQueue(startupRequest);
  }
Ejemplo n.º 9
0
  /**
   * Queues an UPDATED_REQUEST pending request when an updated request needs rescheduling.
   *
   * <p>Nothing happens when there is no previous version of the request, when
   * {@code shouldReschedule} says no, or when the request has no in-use deploy.
   *
   * @param newRequest the request as it is after the update
   * @param maybeOldRequest the request as it was before the update, if it existed
   * @param timestamp timestamp to stamp on the queued pending request
   */
  private void checkReschedule(
      SingularityRequest newRequest, Optional<SingularityRequest> maybeOldRequest, long timestamp) {
    // Short-circuit keeps get() from being called on an absent old request.
    if (!maybeOldRequest.isPresent() || !shouldReschedule(newRequest, maybeOldRequest.get())) {
      return;
    }

    final Optional<String> inUseDeployId = deployManager.getInUseDeployId(newRequest.getId());
    if (!inUseDeployId.isPresent()) {
      return;
    }

    final SingularityPendingRequest updated =
        new SingularityPendingRequest(
            newRequest.getId(), inUseDeployId.get(), timestamp, PendingType.UPDATED_REQUEST);
    requestManager.addToPendingQueue(updated);
  }
Ejemplo n.º 10
0
  /**
   * Unpauses a PAUSED request and, when it has an in-use deploy and is not a one-off request,
   * queues an UNPAUSED pending request for it.
   *
   * <p>The unpause mail is sent before the request manager is asked to unpause the request.
   *
   * @param requestId ID of the request to unpause
   * @param user optional user performing the action
   * @return the filled-in request, reported with state ACTIVE and the unpause timestamp
   */
  @POST
  @Path("/request/{requestId}/unpause")
  @ApiOperation(
      value = "Unpause a Singularity Request, scheduling new tasks immediately",
      response = SingularityRequestParent.class)
  @ApiResponses({
    @ApiResponse(code = 409, message = "Request is not paused"),
  })
  public SingularityRequestParent unpause(
      @ApiParam("The request ID to unpause") @PathParam("requestId") String requestId,
      @ApiParam("Username of the person requesting the unpause") @QueryParam("user")
          Optional<String> user) {
    final SingularityRequestWithState paused = fetchRequestWithState(requestId);
    final RequestState currentState = paused.getState();

    checkConflict(
        currentState == RequestState.PAUSED,
        "Request %s is not in PAUSED state, it is in %s",
        requestId,
        currentState);

    mailer.sendRequestUnpausedMail(paused.getRequest(), user);

    final Optional<String> inUseDeployId = deployManager.getInUseDeployId(requestId);
    final long now = System.currentTimeMillis();

    requestManager.unpause(paused.getRequest(), now, user);

    final boolean shouldEnqueue = inUseDeployId.isPresent() && !paused.getRequest().isOneOff();
    if (shouldEnqueue) {
      final SingularityPendingRequest pending =
          new SingularityPendingRequest(
              requestId,
              inUseDeployId.get(),
              now,
              user,
              PendingType.UNPAUSED,
              Collections.<String>emptyList());
      requestManager.addToPendingQueue(pending);
    }

    // Report the request as ACTIVE as of "now" rather than re-reading it.
    final SingularityRequestWithState unpaused =
        new SingularityRequestWithState(paused.getRequest(), RequestState.ACTIVE, now);
    return fillEntireRequest(unpaused);
  }
  /**
   * Deletes stale pending tasks and returns the task requests that are still valid.
   *
   * <p>Two kinds of pending task are deleted: those in {@code pendingTasks} that have no
   * corresponding entry in {@code taskRequests}, and those in {@code taskRequests} for which
   * {@code matchesDeploy} rejects the request's current deploy state.
   *
   * @param pendingTasks all pending tasks
   * @param taskRequests the task requests that were resolved from the pending tasks
   * @return the task requests accepted by {@code matchesDeploy}, in their original order
   */
  private List<SingularityTaskRequest> checkForStaleScheduledTasks(
      List<SingularityPendingTask> pendingTasks, List<SingularityTaskRequest> taskRequests) {
    final int expected = taskRequests.size();
    final Set<String> resolvedPendingIds = Sets.newHashSetWithExpectedSize(expected);
    final Set<String> requestIds = Sets.newHashSetWithExpectedSize(expected);

    for (SingularityTaskRequest resolved : taskRequests) {
      resolvedPendingIds.add(resolved.getPendingTask().getPendingTaskId().getId());
      requestIds.add(resolved.getRequest().getId());
    }

    // Pending tasks without a resolved task request are stale.
    for (SingularityPendingTask candidate : pendingTasks) {
      final boolean resolved = resolvedPendingIds.contains(candidate.getPendingTaskId().getId());
      if (!resolved) {
        LOG.info("Removing stale pending task {}", candidate.getPendingTaskId());
        taskManager.deletePendingTask(candidate.getPendingTaskId());
      }
    }

    // TODO this check isn't necessary if we keep track better during deploys
    final Map<String, SingularityRequestDeployState> deployStates =
        deployManager.getRequestDeployStatesByRequestIds(requestIds);
    final List<SingularityTaskRequest> valid = Lists.newArrayListWithCapacity(expected);

    for (SingularityTaskRequest resolved : taskRequests) {
      final SingularityRequestDeployState deployState =
          deployStates.get(resolved.getRequest().getId());

      if (matchesDeploy(deployState, resolved)) {
        valid.add(resolved);
      } else {
        LOG.info(
            "Removing stale pending task {} because the deployId did not match active/pending deploys {}",
            resolved.getPendingTask().getPendingTaskId(),
            deployState);
        taskManager.deletePendingTask(resolved.getPendingTask().getPendingTaskId());
      }
    }

    return valid;
  }
  /**
   * Folds one finished task into a deploy's statistics and saves the result.
   *
   * <p>Updates the running average runtime, the task count, the last-finish timestamp and state
   * (only when this task finished later than the recorded one), the per-instance list of
   * sequential failure timestamps, the success count and the sequential retry counter.
   *
   * @param deployStatistics the statistics to start from; not modified, a builder copy is used
   * @param taskId ID of the finished task; supplies the start time and instance number
   * @param timestamp time at which the task finished
   * @param state final state of the task
   * @param scheduleResult how the follow-up was scheduled, if at all; RETRY increments the
   *     sequential retry counter, anything else resets it
   */
  private void updateDeployStatistics(
      SingularityDeployStatistics deployStatistics,
      SingularityTaskId taskId,
      long timestamp,
      ExtendedTaskState state,
      Optional<PendingType> scheduleResult) {
    SingularityDeployStatisticsBuilder bldr = deployStatistics.toBuilder();

    final long runtimeMillis = timestamp - taskId.getStartedAt();

    if (bldr.getAverageRuntimeMillis().isPresent()) {
      // Incremental mean: weight the old average by the number of tasks it was computed from.
      long newAvgRuntimeMillis =
          (bldr.getAverageRuntimeMillis().get() * bldr.getNumTasks() + runtimeMillis)
              / (bldr.getNumTasks() + 1);

      bldr.setAverageRuntimeMillis(Optional.of(newAvgRuntimeMillis));
    } else {
      bldr.setAverageRuntimeMillis(Optional.of(runtimeMillis));
    }

    bldr.setNumTasks(bldr.getNumTasks() + 1);

    // Updates can arrive out of order; only the latest finish defines the "last" state.
    if (!bldr.getLastFinishAt().isPresent() || timestamp > bldr.getLastFinishAt().get()) {
      bldr.setLastFinishAt(Optional.of(timestamp));
      bldr.setLastTaskState(Optional.of(state));
    }

    final ListMultimap<Integer, Long> instanceSequentialFailureTimestamps =
        bldr.getInstanceSequentialFailureTimestamps();
    // The list is mutated in place below, so the changes land in the builder's multimap.
    final List<Long> sequentialFailureTimestamps =
        instanceSequentialFailureTimestamps.get(taskId.getInstanceNo());

    if (!state.isSuccess()) {
      if (SingularityTaskHistoryUpdate.getUpdate(
              taskManager.getTaskHistoryUpdates(taskId), ExtendedTaskState.TASK_CLEANING)
          .isPresent()) {
        LOG.debug("{} failed with {} after cleaning - ignoring it for cooldown", taskId, state);
      } else {
        if (sequentialFailureTimestamps.size() < configuration.getCooldownAfterFailures()) {
          sequentialFailureTimestamps.add(timestamp);
        } else if (!sequentialFailureTimestamps.isEmpty()
            && timestamp > sequentialFailureTimestamps.get(0)) {
          // List is full: replace the oldest entry (index 0 after sorting) with the newer one.
          // The emptiness guard covers a configured limit of zero or less, where the list never
          // receives an element and get(0) would throw IndexOutOfBoundsException.
          sequentialFailureTimestamps.set(0, timestamp);
        }

        Collections.sort(sequentialFailureTimestamps);
      }
    } else {
      bldr.setNumSuccess(bldr.getNumSuccess() + 1);
      // A success breaks the run of sequential failures for this instance.
      sequentialFailureTimestamps.clear();
    }

    if (scheduleResult.isPresent() && scheduleResult.get() == PendingType.RETRY) {
      bldr.setNumSequentialRetries(bldr.getNumSequentialRetries() + 1);
    } else {
      bldr.setNumSequentialRetries(0);
    }

    final SingularityDeployStatistics newStatistics = bldr.build();

    LOG.trace("Saving new deploy statistics {}", newStatistics);

    deployManager.saveDeployStatistics(newStatistics);
  }
  /**
   * Reacts to a completed task: sends the completion mail, moves the request into or out of
   * cooldown, and schedules a replacement task where appropriate.
   *
   * @param task the completed task, if it could be loaded
   * @param taskId ID of the completed task
   * @param timestamp time of the completion update
   * @param state final state of the task
   * @param deployStatistics statistics for the task's deploy, used for the cooldown and retry
   *     decisions and passed on to scheduling
   * @param taskHistoryUpdateCreateResult result of saving the history update; anything other than
   *     CREATED is treated as an update that was already processed
   * @param stateCache scheduler state cache passed on to scheduling
   * @return the pending type that was used to schedule follow-up tasks (TASK_DONE or RETRY), or
   *     absent when nothing was scheduled
   */
  private Optional<PendingType> handleCompletedTaskWithStatistics(
      Optional<SingularityTask> task,
      SingularityTaskId taskId,
      long timestamp,
      ExtendedTaskState state,
      SingularityDeployStatistics deployStatistics,
      SingularityCreateResult taskHistoryUpdateCreateResult,
      SingularitySchedulerStateCache stateCache) {
    final Optional<SingularityRequestWithState> maybeRequestWithState =
        requestManager.getRequest(taskId.getRequestId());

    // Nothing is scheduled for a request that isRequestActive rejects.
    if (!isRequestActive(maybeRequestWithState)) {
      LOG.warn(
          "Not scheduling a new task, {} is {}",
          taskId.getRequestId(),
          SingularityRequestWithState.getRequestState(maybeRequestWithState));
      return Optional.absent();
    }

    // Local copy of the state: it is updated below when the request enters or leaves cooldown.
    RequestState requestState = maybeRequestWithState.get().getState();
    final SingularityRequest request = maybeRequestWithState.get().getRequest();

    final Optional<SingularityRequestDeployState> requestDeployState =
        deployManager.getRequestDeployState(request.getId());

    // Ignore completions of tasks whose deploy isDeployInUse does not accept.
    if (!isDeployInUse(requestDeployState, taskId.getDeployId(), true)) {
      LOG.debug(
          "Task {} completed, but it didn't match active deploy state {} - ignoring",
          taskId.getId(),
          requestDeployState);
      return Optional.absent();
    }

    // Mail only for updates seen for the first time, and never while the request is in cooldown.
    if (taskHistoryUpdateCreateResult == SingularityCreateResult.CREATED
        && requestState != RequestState.SYSTEM_COOLDOWN) {
      mailer.sendTaskCompletedMail(task, taskId, request, state);
    } else if (requestState == RequestState.SYSTEM_COOLDOWN) {
      LOG.debug("Not sending a task completed email because task {} is in SYSTEM_COOLDOWN", taskId);
    } else {
      LOG.debug(
          "Not sending a task completed email for task {} because Singularity already processed this update",
          taskId);
    }

    // A first-time failure may push the request into cooldown.
    if (!state.isSuccess()
        && taskHistoryUpdateCreateResult == SingularityCreateResult.CREATED
        && cooldown.shouldEnterCooldown(
            request, taskId, requestState, deployStatistics, timestamp)) {
      LOG.info("Request {} is entering cooldown due to task {}", request.getId(), taskId);
      requestState = RequestState.SYSTEM_COOLDOWN;
      requestManager.cooldown(request, System.currentTimeMillis());
      mailer.sendRequestInCooldownMail(request);
    }

    PendingType pendingType = PendingType.TASK_DONE;

    // Failed tasks may be retried immediately; otherwise only always-running requests get a
    // replacement task, and everything else stops here.
    if (!state.isSuccess() && shouldRetryImmediately(request, deployStatistics)) {
      LOG.debug("Retrying {} because {}", request.getId(), state);
      pendingType = PendingType.RETRY;
    } else if (!request.isAlwaysRunning()) {
      return Optional.absent();
    }

    // A success while in cooldown takes the request out of cooldown again.
    if (state.isSuccess() && requestState == RequestState.SYSTEM_COOLDOWN) {
      // TODO send not cooldown anymore email
      LOG.info("Request {} succeeded a task, removing from cooldown", request.getId());
      requestState = RequestState.ACTIVE;
      requestManager.exitCooldown(request, System.currentTimeMillis());
    }

    // NOTE(review): both get() calls below are unchecked; this assumes isDeployInUse(..., true)
    // returning true guarantees a present deploy state with an active deploy - confirm.
    SingularityPendingRequest pendingRequest =
        new SingularityPendingRequest(
            request.getId(),
            requestDeployState.get().getActiveDeploy().get().getDeployId(),
            System.currentTimeMillis(),
            pendingType);

    scheduleTasks(
        stateCache,
        request,
        requestState,
        deployStatistics,
        pendingRequest,
        getMatchingTaskIds(stateCache, request, pendingRequest));

    return Optional.of(pendingType);
  }
  /**
   * Processes slaves and racks that are in the STARTING_DECOMMISSION state.
   *
   * <p>Every active task on such a slave or rack is handed to {@code cleanupTaskDueToDecomission}.
   * A slave or rack with no active tasks left is marked DECOMMISSIONED in the local map. Each
   * request collected for rescheduling is then queued as a DECOMISSIONED_SLAVE_OR_RACK pending
   * request when it has an in-use deploy. Finally the slave and rack maps are passed to
   * {@code changeState}, and a summary is logged.
   *
   * @param stateCache scheduler state cache supplying the active task IDs
   */
  public void checkForDecomissions(SingularitySchedulerStateCache stateCache) {
    final long start = System.currentTimeMillis();

    final Set<String> requestIdsToReschedule = Sets.newHashSet();
    final Set<SingularityTaskId> matchingTaskIds = Sets.newHashSet();

    final Collection<SingularityTaskId> activeTaskIds = stateCache.getActiveTaskIds();

    final Map<SingularitySlave, MachineState> slaves =
        getDefaultMap(slaveManager.getObjectsFiltered(MachineState.STARTING_DECOMMISSION));

    for (SingularitySlave slave : slaves.keySet()) {
      boolean foundTask = false;

      for (SingularityTask activeTask : taskManager.getTasksOnSlave(activeTaskIds, slave)) {
        cleanupTaskDueToDecomission(requestIdsToReschedule, matchingTaskIds, activeTask, slave);
        foundTask = true;
      }

      // No active tasks remain on this slave: record the new state for an existing key.
      if (!foundTask) {
        slaves.put(slave, MachineState.DECOMMISSIONED);
      }
    }

    final Map<SingularityRack, MachineState> racks =
        getDefaultMap(rackManager.getObjectsFiltered(MachineState.STARTING_DECOMMISSION));

    for (SingularityRack rack : racks.keySet()) {
      boolean foundTask = false;

      for (SingularityTaskId activeTaskId : activeTaskIds) {
        if (!rack.getId().equals(activeTaskId.getRackId())) {
          continue; // task runs on a different rack
        }

        foundTask = true;

        // Skip tasks that are already in the matching set.
        if (matchingTaskIds.contains(activeTaskId)) {
          continue;
        }

        final Optional<SingularityTask> maybeTask = taskManager.getTask(activeTaskId);

        // The task data may be missing even though its ID is still listed as active. Skip it
        // instead of failing the whole check; foundTask is already set, so the rack is not marked
        // DECOMMISSIONED in this run.
        if (!maybeTask.isPresent()) {
          LOG.warn(
              "Couldn't find task {} on decomissioning rack {} - skipping cleanup",
              activeTaskId,
              rack.getId());
          continue;
        }

        cleanupTaskDueToDecomission(requestIdsToReschedule, matchingTaskIds, maybeTask.get(), rack);
      }

      if (!foundTask) {
        racks.put(rack, MachineState.DECOMMISSIONED);
      }
    }

    for (String requestId : requestIdsToReschedule) {
      LOG.trace("Rescheduling request {} due to decomissions", requestId);

      Optional<String> maybeDeployId = deployManager.getInUseDeployId(requestId);

      if (maybeDeployId.isPresent()) {
        requestManager.addToPendingQueue(
            new SingularityPendingRequest(
                requestId, maybeDeployId.get(), start, PendingType.DECOMISSIONED_SLAVE_OR_RACK));
      } else {
        LOG.warn("Not rescheduling a request ({}) because of no active deploy", requestId);
      }
    }

    changeState(slaves, slaveManager);
    changeState(racks, rackManager);

    if (slaves.isEmpty()
        && racks.isEmpty()
        && requestIdsToReschedule.isEmpty()
        && matchingTaskIds.isEmpty()) {
      LOG.trace("Decomission check found nothing");
    } else {
      LOG.info(
          "Found {} decomissioning slaves, {} decomissioning racks, rescheduling {} requests and scheduling {} tasks for cleanup in {}",
          slaves.size(),
          racks.size(),
          requestIdsToReschedule.size(),
          matchingTaskIds.size(),
          JavaUtils.duration(start));
    }
  }