/**
 * Builds a {@link SingularityRequestParent} for every supplied request, attaching the request's
 * deploy state when one is known.
 *
 * <p>The deploy states are fetched with a single bulk lookup keyed by request id. The deploy and
 * pending-deploy slots of each parent are always left absent by this method.
 *
 * @param requests the requests to wrap; iterated twice, so it must support repeated iteration
 * @return one parent per request, in iteration order
 */
private List<SingularityRequestParent> getRequestsWithDeployState(
    Iterable<SingularityRequestWithState> requests) {
  final List<String> ids = Lists.newArrayList();
  for (SingularityRequestWithState current : requests) {
    ids.add(current.getRequest().getId());
  }

  final List<SingularityRequestParent> result = Lists.newArrayListWithCapacity(ids.size());
  final Map<String, SingularityRequestDeployState> statesById =
      deployManager.getRequestDeployStatesByRequestIds(ids);

  // None of the detailed deploy information is resolved here.
  final Optional<SingularityDeploy> noDeploy = Optional.<SingularityDeploy>absent();
  final Optional<SingularityPendingDeploy> noPendingDeploy =
      Optional.<SingularityPendingDeploy>absent();

  for (SingularityRequestWithState current : requests) {
    final String requestId = current.getRequest().getId();
    final Optional<SingularityRequestDeployState> maybeState =
        Optional.fromNullable(statesById.get(requestId));

    result.add(
        new SingularityRequestParent(
            current.getRequest(),
            current.getState(),
            maybeState,
            noDeploy,
            noDeploy,
            noPendingDeploy));
  }

  return result;
}
/**
 * Takes a request out of {@code SYSTEM_COOLDOWN} and, when it has an in-use deploy and is not a
 * one-off request, queues it for immediate scheduling.
 *
 * <p>The call is rejected through {@code checkConflict} when the request is not currently in
 * {@code SYSTEM_COOLDOWN}.
 *
 * @param requestId id of the request to take out of cooldown
 * @param user optional user name, passed on to the request manager and the pending request
 * @return the request as it stands after leaving cooldown
 */
@POST
@Path("/request/{requestId}/exit-cooldown")
public SingularityRequestParent exitCooldown(
    @PathParam("requestId") String requestId, @QueryParam("user") Optional<String> user) {
  final SingularityRequestWithState requestWithState = fetchRequestWithState(requestId);

  checkConflict(
      requestWithState.getState() == RequestState.SYSTEM_COOLDOWN,
      "Request %s is not in SYSTEM_COOLDOWN state, it is in %s",
      requestId,
      requestWithState.getState());

  // Resolve the deploy before changing state, mirroring the order used by unpause.
  final Optional<String> maybeDeployId = deployManager.getInUseDeployId(requestId);

  final long now = System.currentTimeMillis();

  requestManager.exitCooldown(requestWithState.getRequest(), now, user);

  if (maybeDeployId.isPresent() && !requestWithState.getRequest().isOneOff()) {
    requestManager.addToPendingQueue(
        new SingularityPendingRequest(
            requestId,
            maybeDeployId.get(),
            now,
            user,
            PendingType.IMMEDIATE,
            Collections.<String>emptyList()));
  }

  // requestWithState was fetched before the transition and still carries SYSTEM_COOLDOWN;
  // answer with the post-transition state instead, the same way unpause does.
  // NOTE(review): assumes requestManager.exitCooldown moves the request to ACTIVE - confirm.
  return fillEntireRequest(
      new SingularityRequestWithState(requestWithState.getRequest(), RequestState.ACTIVE, now));
}
/**
 * Returns the stored statistics for a deploy, or a freshly built statistics object for that
 * request/deploy pair when none are stored yet.
 *
 * @param requestId id of the owning request
 * @param deployId id of the deploy
 * @return stored statistics if present, otherwise the result of a new builder's {@code build()}
 */
private SingularityDeployStatistics getDeployStatistics(String requestId, String deployId) {
  final Optional<SingularityDeployStatistics> stored =
      deployManager.getDeployStatistics(requestId, deployId);

  if (!stored.isPresent()) {
    return new SingularityDeployStatisticsBuilder(requestId, deployId).build();
  }

  return stored.get();
}
/**
 * Looks up the in-use deploy id of a request, failing through {@code checkConflict} when the
 * request has no deploy.
 *
 * @param requestId id of the request
 * @return the in-use deploy id
 */
private String getAndCheckDeployId(String requestId) {
  final Optional<String> inUseDeployId = deployManager.getInUseDeployId(requestId);
  final boolean hasDeploy = inUseDeployId.isPresent();

  checkConflict(hasDeploy, "Can not schedule/bounce a request (%s) with no deploy", requestId);

  return inUseDeployId.get();
}
/**
 * Enqueues new-task checks and health checks for the currently active tasks.
 *
 * <p>Tasks whose current simplified state is {@code DONE} are skipped. For the rest:
 *
 * <ul>
 *   <li>a new-task check is enqueued when the task's request/deploy pair has no pending deploy;
 *   <li>a health check is enqueued when the task is {@code RUNNING}; it is counted only when the
 *       healthchecker reports that it was actually enqueued.
 * </ul>
 *
 * A summary with both counters and the elapsed time is logged at INFO level.
 */
private void enqueueHealthAndNewTaskChecks() {
  final long start = System.currentTimeMillis();

  final List<SingularityTask> activeTasks = taskManager.getActiveTasks();
  final Map<SingularityTaskId, SingularityTask> activeTaskMap =
      Maps.uniqueIndex(activeTasks, SingularityTaskIdHolder.getTaskIdFunction());

  final Map<SingularityTaskId, List<SingularityTaskHistoryUpdate>> taskUpdates =
      taskManager.getTaskHistoryUpdates(activeTaskMap.keySet());

  final Map<SingularityDeployKey, SingularityPendingDeploy> pendingDeploys =
      Maps.uniqueIndex(
          deployManager.getPendingDeploys(), SingularityDeployKey.FROM_PENDING_TO_DEPLOY_KEY);
  final Map<String, SingularityRequestWithState> idToRequest =
      Maps.uniqueIndex(
          requestManager.getRequests(), SingularityRequestWithState.REQUEST_STATE_TO_REQUEST_ID);

  int enqueuedNewTaskChecks = 0;
  int enqueuedHealthchecks = 0;

  for (Map.Entry<SingularityTaskId, SingularityTask> entry : activeTaskMap.entrySet()) {
    SingularityTaskId taskId = entry.getKey();
    SingularityTask task = entry.getValue();
    SimplifiedTaskState simplifiedTaskState =
        SingularityTaskHistoryUpdate.getCurrentState(taskUpdates.get(taskId));

    if (simplifiedTaskState != SimplifiedTaskState.DONE) {
      SingularityDeployKey deployKey =
          new SingularityDeployKey(taskId.getRequestId(), taskId.getDeployId());
      Optional<SingularityPendingDeploy> pendingDeploy =
          Optional.fromNullable(pendingDeploys.get(deployKey));
      Optional<SingularityRequestWithState> request =
          Optional.fromNullable(idToRequest.get(taskId.getRequestId()));

      // Tasks that belong to a pending deploy do not get a new-task check here.
      if (!pendingDeploy.isPresent()) {
        newTaskChecker.enqueueNewTaskCheck(task, request, healthchecker);
        enqueuedNewTaskChecks++;
      }

      if (simplifiedTaskState == SimplifiedTaskState.RUNNING) {
        if (healthchecker.enqueueHealthcheck(task, pendingDeploy, request)) {
          enqueuedHealthchecks++;
        }
      }
    }
  }

  LOG.info(
      "Enqueued {} health checks and {} new task checks (out of {} active tasks) in {}",
      enqueuedHealthchecks,
      enqueuedNewTaskChecks,
      activeTasks.size(),
      JavaUtils.duration(start));
}
/**
 * Decides whether tasks should be scheduled for a pending request.
 *
 * @param pendingRequest the pending request under consideration
 * @param maybeRequest the request it refers to, if it exists
 * @return {@code false} when {@code isRequestActive} rejects the request; otherwise the result
 *     of {@code isDeployInUse} for the pending request's deploy id
 */
private boolean shouldScheduleTasks(
    SingularityPendingRequest pendingRequest,
    Optional<SingularityRequestWithState> maybeRequest) {
  if (isRequestActive(maybeRequest)) {
    final Optional<SingularityRequestDeployState> deployState =
        deployManager.getRequestDeployState(pendingRequest.getRequestId());

    return isDeployInUse(deployState, pendingRequest.getDeployId(), false);
  }

  return false;
}
/**
 * Loads the active and pending deploys of a request into a holder.
 *
 * <p>Either side of the holder is absent when the request has no deploy state, when the deploy
 * state has no marker for that side, or when the deploy lookup itself returns absent.
 *
 * @param requestId id of the request
 * @return holder with the resolved active and pending deploys
 */
private SingularityRequestDeployHolder getDeployHolder(String requestId) {
  final Optional<SingularityRequestDeployState> maybeState =
      deployManager.getRequestDeployState(requestId);

  if (!maybeState.isPresent()) {
    return new SingularityRequestDeployHolder(
        Optional.<SingularityDeploy>absent(), Optional.<SingularityDeploy>absent());
  }

  final SingularityRequestDeployState state = maybeState.get();

  final Optional<SingularityDeploy> active =
      state.getActiveDeploy().isPresent()
          ? deployManager.getDeploy(requestId, state.getActiveDeploy().get().getDeployId())
          : Optional.<SingularityDeploy>absent();

  final Optional<SingularityDeploy> pending =
      state.getPendingDeploy().isPresent()
          ? deployManager.getDeploy(requestId, state.getPendingDeploy().get().getDeployId())
          : Optional.<SingularityDeploy>absent();

  return new SingularityRequestDeployHolder(active, pending);
}
private void checkActiveRequest( SingularityRequestWithState requestWithState, Map<SingularityDeployKey, SingularityPendingTaskId> deployKeyToPendingTaskId, final long timestamp) { final SingularityRequest request = requestWithState.getRequest(); if (request.getRequestType() == RequestType.ON_DEMAND || request.getRequestType() == RequestType.RUN_ONCE) { return; // There's no situation where we'd want to schedule an On Demand or Run Once request // at startup, so don't even bother with them. } Optional<SingularityRequestDeployState> requestDeployState = deployManager.getRequestDeployState(request.getId()); if (!requestDeployState.isPresent() || !requestDeployState.get().getActiveDeploy().isPresent()) { LOG.debug("No active deploy for {} - not scheduling on startup", request.getId()); return; } final String activeDeployId = requestDeployState.get().getActiveDeploy().get().getDeployId(); if (request.isScheduled()) { SingularityDeployKey deployKey = new SingularityDeployKey(request.getId(), activeDeployId); SingularityPendingTaskId pendingTaskId = deployKeyToPendingTaskId.get(deployKey); if (pendingTaskId != null && pendingTaskId.getCreatedAt() >= requestWithState.getTimestamp()) { LOG.info( "Not rescheduling {} because {} is newer than {}", request.getId(), pendingTaskId, requestWithState.getTimestamp()); return; } } requestManager.addToPendingQueue( new SingularityPendingRequest( request.getId(), activeDeployId, timestamp, Optional.<String>absent(), PendingType.STARTUP, Optional.<Boolean>absent(), Optional.<String>absent())); }
/**
 * Queues an {@code UPDATED_REQUEST} pending request when an existing request was changed in a
 * way that {@code shouldReschedule} considers relevant and the request has an in-use deploy.
 *
 * @param newRequest the request as submitted
 * @param maybeOldRequest the previous version of the request, absent for a brand-new request
 * @param timestamp timestamp to stamp on the queued pending request
 */
private void checkReschedule(
    SingularityRequest newRequest,
    Optional<SingularityRequest> maybeOldRequest,
    long timestamp) {
  // Nothing to compare against, or nothing changed that matters for scheduling.
  if (!maybeOldRequest.isPresent() || !shouldReschedule(newRequest, maybeOldRequest.get())) {
    return;
  }

  final Optional<String> inUseDeployId = deployManager.getInUseDeployId(newRequest.getId());
  if (!inUseDeployId.isPresent()) {
    return;
  }

  requestManager.addToPendingQueue(
      new SingularityPendingRequest(
          newRequest.getId(), inUseDeployId.get(), timestamp, PendingType.UPDATED_REQUEST));
}
/**
 * Unpauses a paused request and, when it has an in-use deploy and is not a one-off request,
 * queues it for scheduling with pending type {@code UNPAUSED}.
 *
 * <p>The call is rejected through {@code checkConflict} when the request is not in the
 * {@code PAUSED} state. The "request unpaused" mail is sent before the state change is applied.
 *
 * @param requestId id of the request to unpause
 * @param user optional user name, passed to the mailer, the request manager and the pending
 *     request
 * @return the request, reported in the {@code ACTIVE} state with the unpause timestamp
 */
@POST
@Path("/request/{requestId}/unpause")
@ApiOperation(
    value = "Unpause a Singularity Request, scheduling new tasks immediately",
    response = SingularityRequestParent.class)
@ApiResponses({
  @ApiResponse(code = 409, message = "Request is not paused"),
})
public SingularityRequestParent unpause(
    @ApiParam("The request ID to unpause") @PathParam("requestId") String requestId,
    @ApiParam("Username of the person requesting the unpause") @QueryParam("user")
        Optional<String> user) {
  final SingularityRequestWithState paused = fetchRequestWithState(requestId);

  final boolean isPaused = paused.getState() == RequestState.PAUSED;
  checkConflict(
      isPaused, "Request %s is not in PAUSED state, it is in %s", requestId, paused.getState());

  mailer.sendRequestUnpausedMail(paused.getRequest(), user);

  final Optional<String> inUseDeployId = deployManager.getInUseDeployId(requestId);
  final long now = System.currentTimeMillis();

  requestManager.unpause(paused.getRequest(), now, user);

  if (inUseDeployId.isPresent() && !paused.getRequest().isOneOff()) {
    final SingularityPendingRequest unpausedRequest =
        new SingularityPendingRequest(
            requestId,
            inUseDeployId.get(),
            now,
            user,
            PendingType.UNPAUSED,
            Collections.<String>emptyList());
    requestManager.addToPendingQueue(unpausedRequest);
  }

  // The fetched state is stale by now; answer with the state the request was just moved to.
  final SingularityRequestWithState active =
      new SingularityRequestWithState(paused.getRequest(), RequestState.ACTIVE, now);
  return fillEntireRequest(active);
}
private List<SingularityTaskRequest> checkForStaleScheduledTasks( List<SingularityPendingTask> pendingTasks, List<SingularityTaskRequest> taskRequests) { final Set<String> foundPendingTaskId = Sets.newHashSetWithExpectedSize(taskRequests.size()); final Set<String> requestIds = Sets.newHashSetWithExpectedSize(taskRequests.size()); for (SingularityTaskRequest taskRequest : taskRequests) { foundPendingTaskId.add(taskRequest.getPendingTask().getPendingTaskId().getId()); requestIds.add(taskRequest.getRequest().getId()); } for (SingularityPendingTask pendingTask : pendingTasks) { if (!foundPendingTaskId.contains(pendingTask.getPendingTaskId().getId())) { LOG.info("Removing stale pending task {}", pendingTask.getPendingTaskId()); taskManager.deletePendingTask(pendingTask.getPendingTaskId()); } } // TODO this check isn't necessary if we keep track better during deploys final Map<String, SingularityRequestDeployState> deployStates = deployManager.getRequestDeployStatesByRequestIds(requestIds); final List<SingularityTaskRequest> taskRequestsWithValidDeploys = Lists.newArrayListWithCapacity(taskRequests.size()); for (SingularityTaskRequest taskRequest : taskRequests) { SingularityRequestDeployState requestDeployState = deployStates.get(taskRequest.getRequest().getId()); if (!matchesDeploy(requestDeployState, taskRequest)) { LOG.info( "Removing stale pending task {} because the deployId did not match active/pending deploys {}", taskRequest.getPendingTask().getPendingTaskId(), requestDeployState); taskManager.deletePendingTask(taskRequest.getPendingTask().getPendingTaskId()); } else { taskRequestsWithValidDeploys.add(taskRequest); } } return taskRequestsWithValidDeploys; }
private void updateDeployStatistics( SingularityDeployStatistics deployStatistics, SingularityTaskId taskId, long timestamp, ExtendedTaskState state, Optional<PendingType> scheduleResult) { SingularityDeployStatisticsBuilder bldr = deployStatistics.toBuilder(); if (bldr.getAverageRuntimeMillis().isPresent()) { long newAvgRuntimeMillis = (bldr.getAverageRuntimeMillis().get() * bldr.getNumTasks() + (timestamp - taskId.getStartedAt())) / (bldr.getNumTasks() + 1); bldr.setAverageRuntimeMillis(Optional.of(newAvgRuntimeMillis)); } else { bldr.setAverageRuntimeMillis(Optional.of(timestamp - taskId.getStartedAt())); } bldr.setNumTasks(bldr.getNumTasks() + 1); if (!bldr.getLastFinishAt().isPresent() || timestamp > bldr.getLastFinishAt().get()) { bldr.setLastFinishAt(Optional.of(timestamp)); bldr.setLastTaskState(Optional.of(state)); } final ListMultimap<Integer, Long> instanceSequentialFailureTimestamps = bldr.getInstanceSequentialFailureTimestamps(); final List<Long> sequentialFailureTimestamps = instanceSequentialFailureTimestamps.get(taskId.getInstanceNo()); if (!state.isSuccess()) { if (SingularityTaskHistoryUpdate.getUpdate( taskManager.getTaskHistoryUpdates(taskId), ExtendedTaskState.TASK_CLEANING) .isPresent()) { LOG.debug("{} failed with {} after cleaning - ignoring it for cooldown", taskId, state); } else { if (sequentialFailureTimestamps.size() < configuration.getCooldownAfterFailures()) { sequentialFailureTimestamps.add(timestamp); } else if (timestamp > sequentialFailureTimestamps.get(0)) { sequentialFailureTimestamps.set(0, timestamp); } Collections.sort(sequentialFailureTimestamps); } } else { bldr.setNumSuccess(bldr.getNumSuccess() + 1); sequentialFailureTimestamps.clear(); } if (scheduleResult.isPresent() && scheduleResult.get() == PendingType.RETRY) { bldr.setNumSequentialRetries(bldr.getNumSequentialRetries() + 1); } else { bldr.setNumSequentialRetries(0); } final SingularityDeployStatistics newStatistics = bldr.build(); LOG.trace("Saving new deploy 
statistics {}", newStatistics); deployManager.saveDeployStatistics(newStatistics); }
private Optional<PendingType> handleCompletedTaskWithStatistics( Optional<SingularityTask> task, SingularityTaskId taskId, long timestamp, ExtendedTaskState state, SingularityDeployStatistics deployStatistics, SingularityCreateResult taskHistoryUpdateCreateResult, SingularitySchedulerStateCache stateCache) { final Optional<SingularityRequestWithState> maybeRequestWithState = requestManager.getRequest(taskId.getRequestId()); if (!isRequestActive(maybeRequestWithState)) { LOG.warn( "Not scheduling a new task, {} is {}", taskId.getRequestId(), SingularityRequestWithState.getRequestState(maybeRequestWithState)); return Optional.absent(); } RequestState requestState = maybeRequestWithState.get().getState(); final SingularityRequest request = maybeRequestWithState.get().getRequest(); final Optional<SingularityRequestDeployState> requestDeployState = deployManager.getRequestDeployState(request.getId()); if (!isDeployInUse(requestDeployState, taskId.getDeployId(), true)) { LOG.debug( "Task {} completed, but it didn't match active deploy state {} - ignoring", taskId.getId(), requestDeployState); return Optional.absent(); } if (taskHistoryUpdateCreateResult == SingularityCreateResult.CREATED && requestState != RequestState.SYSTEM_COOLDOWN) { mailer.sendTaskCompletedMail(task, taskId, request, state); } else if (requestState == RequestState.SYSTEM_COOLDOWN) { LOG.debug("Not sending a task completed email because task {} is in SYSTEM_COOLDOWN", taskId); } else { LOG.debug( "Not sending a task completed email for task {} because Singularity already processed this update", taskId); } if (!state.isSuccess() && taskHistoryUpdateCreateResult == SingularityCreateResult.CREATED && cooldown.shouldEnterCooldown( request, taskId, requestState, deployStatistics, timestamp)) { LOG.info("Request {} is entering cooldown due to task {}", request.getId(), taskId); requestState = RequestState.SYSTEM_COOLDOWN; requestManager.cooldown(request, System.currentTimeMillis()); 
mailer.sendRequestInCooldownMail(request); } PendingType pendingType = PendingType.TASK_DONE; if (!state.isSuccess() && shouldRetryImmediately(request, deployStatistics)) { LOG.debug("Retrying {} because {}", request.getId(), state); pendingType = PendingType.RETRY; } else if (!request.isAlwaysRunning()) { return Optional.absent(); } if (state.isSuccess() && requestState == RequestState.SYSTEM_COOLDOWN) { // TODO send not cooldown anymore email LOG.info("Request {} succeeded a task, removing from cooldown", request.getId()); requestState = RequestState.ACTIVE; requestManager.exitCooldown(request, System.currentTimeMillis()); } SingularityPendingRequest pendingRequest = new SingularityPendingRequest( request.getId(), requestDeployState.get().getActiveDeploy().get().getDeployId(), System.currentTimeMillis(), pendingType); scheduleTasks( stateCache, request, requestState, deployStatistics, pendingRequest, getMatchingTaskIds(stateCache, request, pendingRequest)); return Optional.of(pendingType); }
/**
 * Processes slaves and racks that are in {@code STARTING_DECOMMISSION}.
 *
 * <p>Every active task found on such a slave or rack is handed to
 * {@code cleanupTaskDueToDecomission}, which collects the affected request ids and task ids. A
 * slave or rack with no active tasks left is marked {@code DECOMMISSIONED}. Each affected
 * request with an in-use deploy is then re-queued with pending type
 * {@code DECOMISSIONED_SLAVE_OR_RACK}, and the collected machine states are applied through
 * {@code changeState}.
 *
 * @param stateCache scheduler state cache supplying the active task ids
 */
public void checkForDecomissions(SingularitySchedulerStateCache stateCache) {
  final long start = System.currentTimeMillis();

  final Set<String> requestIdsToReschedule = Sets.newHashSet();
  final Set<SingularityTaskId> matchingTaskIds = Sets.newHashSet();

  final Collection<SingularityTaskId> activeTaskIds = stateCache.getActiveTaskIds();

  final Map<SingularitySlave, MachineState> slaves =
      getDefaultMap(slaveManager.getObjectsFiltered(MachineState.STARTING_DECOMMISSION));

  for (SingularitySlave slave : slaves.keySet()) {
    boolean foundTask = false;

    for (SingularityTask activeTask : taskManager.getTasksOnSlave(activeTaskIds, slave)) {
      cleanupTaskDueToDecomission(requestIdsToReschedule, matchingTaskIds, activeTask, slave);
      foundTask = true;
    }

    if (!foundTask) {
      slaves.put(slave, MachineState.DECOMMISSIONED);
    }
  }

  final Map<SingularityRack, MachineState> racks =
      getDefaultMap(rackManager.getObjectsFiltered(MachineState.STARTING_DECOMMISSION));

  for (SingularityRack rack : racks.keySet()) {
    boolean foundTask = false;

    for (SingularityTaskId activeTaskId : activeTaskIds) {
      if (!rack.getId().equals(activeTaskId.getRackId())) {
        continue;
      }

      // Any active task on the rack keeps it from being marked DECOMMISSIONED, even one that
      // was already picked up through its slave above.
      foundTask = true;

      if (matchingTaskIds.contains(activeTaskId)) {
        continue;
      }

      Optional<SingularityTask> maybeTask = taskManager.getTask(activeTaskId);

      if (!maybeTask.isPresent()) {
        // An unchecked get() here would throw and abort the whole decommission pass.
        LOG.warn(
            "Active task {} on decommissioning rack {} has no task data - skipping cleanup",
            activeTaskId,
            rack.getId());
        continue;
      }

      cleanupTaskDueToDecomission(
          requestIdsToReschedule, matchingTaskIds, maybeTask.get(), rack);
    }

    if (!foundTask) {
      racks.put(rack, MachineState.DECOMMISSIONED);
    }
  }

  for (String requestId : requestIdsToReschedule) {
    LOG.trace("Rescheduling request {} due to decomissions", requestId);

    Optional<String> maybeDeployId = deployManager.getInUseDeployId(requestId);

    if (maybeDeployId.isPresent()) {
      requestManager.addToPendingQueue(
          new SingularityPendingRequest(
              requestId, maybeDeployId.get(), start, PendingType.DECOMISSIONED_SLAVE_OR_RACK));
    } else {
      LOG.warn("Not rescheduling a request ({}) because of no active deploy", requestId);
    }
  }

  changeState(slaves, slaveManager);
  changeState(racks, rackManager);

  if (slaves.isEmpty()
      && racks.isEmpty()
      && requestIdsToReschedule.isEmpty()
      && matchingTaskIds.isEmpty()) {
    LOG.trace("Decomission check found nothing");
  } else {
    LOG.info(
        "Found {} decomissioning slaves, {} decomissioning racks, rescheduling {} requests and scheduling {} tasks for cleanup in {}",
        slaves.size(),
        racks.size(),
        requestIdsToReschedule.size(),
        matchingTaskIds.size(),
        JavaUtils.duration(start));
  }
}