public void loadSlavesAndRacksFromMaster(MesosMasterStateObject state) {
  Map<String, SingularitySlave> activeSlavesById = slaveManager.getObjectsByIdForState(MachineState.ACTIVE);
  Map<String, SingularityRack> activeRacksById = rackManager.getObjectsByIdForState(MachineState.ACTIVE);

  // Racks are shared by many slaves, so membership checks must stay against the
  // full active set while removals whittle down a copy; removing from
  // activeRacksById directly would make the second slave in a rack look new.
  Map<String, SingularityRack> remainingActiveRacks = Maps.newHashMap(activeRacksById);

  int slaves = 0;
  int racks = 0;

  for (MesosMasterSlaveObject slaveJsonObject : state.getSlaves()) {
    String slaveId = slaveJsonObject.getId();
    String rackId = slaveAndRackHelper.getRackId(slaveJsonObject.getAttributes());
    String host = slaveAndRackHelper.getMaybeTruncatedHost(slaveJsonObject.getHostname());

    if (activeSlavesById.containsKey(slaveId)) {
      activeSlavesById.remove(slaveId);
    } else {
      SingularitySlave newSlave = new SingularitySlave(slaveId, host, rackId);

      if (check(newSlave, slaveManager) == CheckResult.NEW) {
        slaves++;
      }
    }

    if (activeRacksById.containsKey(rackId)) {
      remainingActiveRacks.remove(rackId);
    } else {
      SingularityRack rack = new SingularityRack(rackId);

      if (check(rack, rackManager) == CheckResult.NEW) {
        racks++;
      }
    }
  }

  // Anything left in these maps was previously active but is absent from the
  // master's state: mark it missing rather than silently dropping it.
  for (SingularitySlave leftOverSlave : activeSlavesById.values()) {
    slaveManager.changeState(leftOverSlave, MachineState.MISSING_ON_STARTUP, Optional.<String>absent());
  }

  for (SingularityRack leftOverRack : remainingActiveRacks.values()) {
    rackManager.changeState(leftOverRack, MachineState.MISSING_ON_STARTUP, Optional.<String>absent());
  }

  LOG.info("Found {} new racks ({} missing) and {} new slaves ({} missing)", racks, remainingActiveRacks.size(), slaves, activeSlavesById.size());
}
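// The check(...) helper and its CheckResult enum aren't shown in this section.
// A minimal sketch of the assumed contract, for illustration only (names,
// generics, and the saveObject call are assumptions): persist the machine if
// it's unknown and report whether it was new.
enum CheckResult {
  NEW, ALREADY_KNOWN
}

private <T extends SingularityMachineAbstraction> CheckResult check(T object, AbstractMachineManager<T> manager) {
  // Hypothetical implementation sketch, not the project's actual helper.
  if (!manager.getObject(object.getId()).isPresent()) {
    manager.saveObject(object);
    return CheckResult.NEW;
  }
  return CheckResult.ALREADY_KNOWN;
}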
public void slaveLost(SlaveID slaveIdObj) {
  final String slaveId = slaveIdObj.getValue();

  Optional<SingularitySlave> slave = slaveManager.getObject(slaveId);

  if (slave.isPresent()) {
    slaveManager.changeState(slave.get(), MachineState.DEAD, Optional.<String>absent());

    checkRackAfterSlaveLoss(slave.get());
  } else {
    LOG.warn("Lost a slave {}, but didn't know about it", slaveId);
  }
}
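// For context: slaveLost is typically driven by the Mesos scheduler callback
// org.apache.mesos.Scheduler#slaveLost(SchedulerDriver, SlaveID). A
// hypothetical wiring sketch (the delegating class and the
// slaveAndRackManager field name are assumptions, not shown in this section):
@Override
public void slaveLost(SchedulerDriver driver, SlaveID slaveId) {
  slaveAndRackManager.slaveLost(slaveId);
}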
@Timed
public void checkStateAfterFinishedTask(SingularityTaskId taskId, String slaveId, SingularitySchedulerStateCache stateCache) {
  Optional<SingularitySlave> slave = slaveManager.getObject(slaveId);

  if (!slave.isPresent()) {
    final String message = String.format("Couldn't find slave with id %s for task %s", slaveId, taskId);
    LOG.warn(message);
    exceptionNotifier.notify(message, ImmutableMap.of("slaveId", slaveId, "taskId", taskId.toString()));
    return;
  }

  // If this was the last task on a decommissioning slave, the slave is done.
  if (slave.get().getCurrentState().getState() == MachineState.DECOMMISSIONING) {
    if (!hasTaskLeftOnSlave(taskId, slaveId, stateCache)) {
      slaveManager.changeState(slave.get(), MachineState.DECOMMISSIONED, slave.get().getCurrentState().getUser());
    }
  }

  Optional<SingularityRack> rack = rackManager.getObject(slave.get().getRackId());

  if (!rack.isPresent()) {
    final String message = String.format("Couldn't find rack with id %s for task %s", slave.get().getRackId(), taskId);
    LOG.warn(message);
    exceptionNotifier.notify(message, ImmutableMap.of("rackId", slave.get().getRackId(), "taskId", taskId.toString()));
    return;
  }

  // The same check at the rack level.
  if (rack.get().getCurrentState().getState() == MachineState.DECOMMISSIONING) {
    if (!hasTaskLeftOnRack(taskId, stateCache)) {
      rackManager.changeState(rack.get(), MachineState.DECOMMISSIONED, rack.get().getCurrentState().getUser());
    }
  }
}
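// hasTaskLeftOnSlave and hasTaskLeftOnRack aren't shown in this section. A
// minimal sketch of what hasTaskLeftOnSlave might look like, under the
// assumption that active tasks can be resolved to their slave through
// taskManager (the getMesosTask() accessor chain is an assumption):
private boolean hasTaskLeftOnSlave(SingularityTaskId taskId, String slaveId, SingularitySchedulerStateCache stateCache) {
  for (SingularityTaskId activeTaskId : stateCache.getActiveTaskIds()) {
    if (activeTaskId.equals(taskId)) {
      continue; // the task that just finished doesn't count
    }
    Optional<SingularityTask> maybeTask = taskManager.getTask(activeTaskId);
    if (maybeTask.isPresent() && slaveId.equals(maybeTask.get().getMesosTask().getSlaveId().getValue())) {
      return true;
    }
  }
  return false;
}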
private void checkRackAfterSlaveLoss(SingularitySlave lostSlave) {
  // The lost slave was already marked DEAD by the caller, so it no longer
  // shows up in the ACTIVE list and won't be counted here.
  List<SingularitySlave> slaves = slaveManager.getObjectsFiltered(MachineState.ACTIVE);

  int numInRack = 0;

  for (SingularitySlave slave : slaves) {
    if (slave.getRackId().equals(lostSlave.getRackId())) {
      numInRack++;
    }
  }

  LOG.info("Found {} slaves left in rack {}", numInRack, lostSlave.getRackId());

  if (numInRack == 0) {
    rackManager.changeState(lostSlave.getRackId(), MachineState.DEAD, Optional.<String>absent());
  }
}
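// checkRackAfterSlaveLoss passes a rack id (String) to changeState, while
// other call sites pass the machine object itself, so the manager presumably
// exposes an id-based overload. A sketch of that assumed overload (the
// silent-skip on a missing object is an assumption):
public void changeState(String objectId, MachineState newState, Optional<String> user) {
  Optional<T> maybeObject = getObject(objectId);
  if (maybeObject.isPresent()) {
    changeState(maybeObject.get(), newState, user);
  }
}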
public void checkForDecomissions(SingularitySchedulerStateCache stateCache) {
  final long start = System.currentTimeMillis();

  final Set<String> requestIdsToReschedule = Sets.newHashSet();
  final Set<SingularityTaskId> matchingTaskIds = Sets.newHashSet();

  final Collection<SingularityTaskId> activeTaskIds = stateCache.getActiveTaskIds();

  final Map<SingularitySlave, MachineState> slaves = getDefaultMap(slaveManager.getObjectsFiltered(MachineState.STARTING_DECOMMISSION));

  for (SingularitySlave slave : slaves.keySet()) {
    boolean foundTask = false;

    for (SingularityTask activeTask : taskManager.getTasksOnSlave(activeTaskIds, slave)) {
      cleanupTaskDueToDecomission(requestIdsToReschedule, matchingTaskIds, activeTask, slave);
      foundTask = true;
    }

    if (!foundTask) {
      // No running tasks left: the slave can skip DECOMMISSIONING and go
      // straight to DECOMMISSIONED. Overwriting the value of an existing key
      // is safe while iterating over keySet().
      slaves.put(slave, MachineState.DECOMMISSIONED);
    }
  }

  final Map<SingularityRack, MachineState> racks = getDefaultMap(rackManager.getObjectsFiltered(MachineState.STARTING_DECOMMISSION));

  for (SingularityRack rack : racks.keySet()) {
    boolean foundTask = false;

    for (SingularityTaskId activeTaskId : activeTaskIds) {
      if (!rack.getId().equals(activeTaskId.getRackId())) {
        continue;
      }

      foundTask = true;

      // Skip tasks already scheduled for cleanup via a decommissioning slave.
      if (matchingTaskIds.contains(activeTaskId)) {
        continue;
      }

      // Guard the fetch: the task may have finished since the cache snapshot.
      Optional<SingularityTask> maybeTask = taskManager.getTask(activeTaskId);
      if (maybeTask.isPresent()) {
        cleanupTaskDueToDecomission(requestIdsToReschedule, matchingTaskIds, maybeTask.get(), rack);
      }
    }

    if (!foundTask) {
      racks.put(rack, MachineState.DECOMMISSIONED);
    }
  }

  for (String requestId : requestIdsToReschedule) {
    LOG.trace("Rescheduling request {} due to decommissions", requestId);

    Optional<String> maybeDeployId = deployManager.getInUseDeployId(requestId);

    if (maybeDeployId.isPresent()) {
      requestManager.addToPendingQueue(new SingularityPendingRequest(requestId, maybeDeployId.get(), start, PendingType.DECOMISSIONED_SLAVE_OR_RACK));
    } else {
      LOG.warn("Not rescheduling request {} because it has no active deploy", requestId);
    }
  }

  changeState(slaves, slaveManager);
  changeState(racks, rackManager);

  if (slaves.isEmpty() && racks.isEmpty() && requestIdsToReschedule.isEmpty() && matchingTaskIds.isEmpty()) {
    LOG.trace("Decommission check found nothing");
  } else {
    LOG.info("Found {} decommissioning slaves, {} decommissioning racks, rescheduling {} requests and scheduling {} tasks for cleanup in {}",
        slaves.size(), racks.size(), requestIdsToReschedule.size(), matchingTaskIds.size(), JavaUtils.duration(start));
  }
}
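// getDefaultMap and the bulk changeState(Map, manager) helper aren't shown in
// this section. A plausible sketch, for illustration (names, generics, and
// AbstractMachineManager are assumptions): every machine starts at
// DECOMMISSIONING, the loops above flip the empty ones to DECOMMISSIONED, and
// the resulting states are written back in one pass.
private <T extends SingularityMachineAbstraction> Map<T, MachineState> getDefaultMap(List<T> objects) {
  Map<T, MachineState> map = Maps.newHashMapWithExpectedSize(objects.size());
  for (T object : objects) {
    map.put(object, MachineState.DECOMMISSIONING);
  }
  return map;
}

private <T extends SingularityMachineAbstraction> void changeState(Map<T, MachineState> map, AbstractMachineManager<T> manager) {
  for (Map.Entry<T, MachineState> entry : map.entrySet()) {
    manager.changeState(entry.getKey(), entry.getValue(), entry.getKey().getCurrentState().getUser());
  }
}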