private void enqueueHealthAndNewTaskChecks() {
  final long start = System.currentTimeMillis();

  final List<SingularityTask> activeTasks = taskManager.getActiveTasks();
  final Map<SingularityTaskId, SingularityTask> activeTaskMap =
      Maps.uniqueIndex(activeTasks, SingularityTaskIdHolder.getTaskIdFunction());
  final Map<SingularityTaskId, List<SingularityTaskHistoryUpdate>> taskUpdates =
      taskManager.getTaskHistoryUpdates(activeTaskMap.keySet());

  final Map<SingularityDeployKey, SingularityPendingDeploy> pendingDeploys =
      Maps.uniqueIndex(deployManager.getPendingDeploys(), SingularityDeployKey.FROM_PENDING_TO_DEPLOY_KEY);
  final Map<String, SingularityRequestWithState> idToRequest =
      Maps.uniqueIndex(requestManager.getRequests(), SingularityRequestWithState.REQUEST_STATE_TO_REQUEST_ID);

  int enqueuedNewTaskChecks = 0;
  int enqueuedHealthchecks = 0;

  for (Map.Entry<SingularityTaskId, SingularityTask> entry : activeTaskMap.entrySet()) {
    SingularityTaskId taskId = entry.getKey();
    SingularityTask task = entry.getValue();
    SimplifiedTaskState simplifiedTaskState = SingularityTaskHistoryUpdate.getCurrentState(taskUpdates.get(taskId));

    if (simplifiedTaskState != SimplifiedTaskState.DONE) {
      SingularityDeployKey deployKey = new SingularityDeployKey(taskId.getRequestId(), taskId.getDeployId());
      Optional<SingularityPendingDeploy> pendingDeploy = Optional.fromNullable(pendingDeploys.get(deployKey));
      Optional<SingularityRequestWithState> request = Optional.fromNullable(idToRequest.get(taskId.getRequestId()));

      if (!pendingDeploy.isPresent()) {
        newTaskChecker.enqueueNewTaskCheck(task, request, healthchecker);
        enqueuedNewTaskChecks++;
      }

      if (simplifiedTaskState == SimplifiedTaskState.RUNNING) {
        if (healthchecker.enqueueHealthcheck(task, pendingDeploy, request)) {
          enqueuedHealthchecks++;
        }
      }
    }
  }

  LOG.info("Enqueued {} health checks and {} new task checks (out of {} active tasks) in {}", enqueuedHealthchecks,
      enqueuedNewTaskChecks, activeTasks.size(), JavaUtils.duration(start));
}
private <T extends SingularityId> List<T> getChildrenAsIdsForParentsThrows(final String pathNameForLogs,
    final Collection<String> parents, final IdTranscoder<T> idTranscoder) throws Exception {
  if (parents.isEmpty()) {
    return Collections.emptyList();
  }

  final List<T> objects = Lists.newArrayListWithExpectedSize(parents.size());
  final CountDownLatch latch = new CountDownLatch(parents.size());
  final AtomicInteger missing = new AtomicInteger();

  // Curator delivers background callbacks on a single event thread, so the
  // unsynchronized list is safe to mutate here.
  final BackgroundCallback callback = new BackgroundCallback() {
    @Override
    public void processResult(CuratorFramework client, CuratorEvent event) throws Exception {
      if (event.getChildren() == null || event.getChildren().isEmpty()) {
        LOG.trace("Expected children for node {} - but found none", event.getPath());
        missing.incrementAndGet();
        latch.countDown();
        return;
      }
      objects.addAll(Lists.transform(event.getChildren(), idTranscoder));
      latch.countDown();
    }
  };

  final long start = System.currentTimeMillis();

  for (String parent : parents) {
    curator.getChildren().inBackground(callback).forPath(parent);
  }

  checkLatch(latch, pathNameForLogs);

  LOG.trace("Fetched {} objects from {} (missing {}) in {}", objects.size(), pathNameForLogs, missing.get(),
      JavaUtils.duration(start));

  return objects;
}
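// Hypothetical sketch (an assumption, not from the original source): the
// checkLatch helper used above presumably awaits the latch with a bounded
// timeout so a lost ZooKeeper response cannot hang the caller forever. The
// zkAsyncTimeoutMillis field and the exception type are assumptions.
private void checkLatch(CountDownLatch latch, String path) throws InterruptedException {
  // Wait up to the assumed timeout for every background callback to fire.
  if (!latch.await(zkAsyncTimeoutMillis, TimeUnit.MILLISECONDS)) {
    throw new IllegalStateException(String.format("Timed out waiting on latch for %s", path));
  }
}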
private <T> List<T> getAsyncThrows(final String pathNameForLogs, final Collection<String> paths,
    final Transcoder<T> transcoder) throws Exception {
  final List<T> objects = Lists.newArrayListWithCapacity(paths.size());

  if (paths.isEmpty()) {
    return objects;
  }

  final CountDownLatch latch = new CountDownLatch(paths.size());
  final AtomicInteger missing = new AtomicInteger();

  final BackgroundCallback callback = new BackgroundCallback() {
    @Override
    public void processResult(CuratorFramework client, CuratorEvent event) throws Exception {
      if (event.getData() == null || event.getData().length == 0) {
        LOG.trace("Expected active node {} but it wasn't there", event.getPath());
        missing.incrementAndGet();
        latch.countDown();
        return;
      }
      objects.add(transcoder.transcode(event.getData()));
      latch.countDown();
    }
  };

  final long start = System.currentTimeMillis();

  for (String path : paths) {
    curator.getData().inBackground(callback).forPath(path);
  }

  checkLatch(latch, pathNameForLogs);

  LOG.trace("Fetched {} objects from {} (missing {}) in {}", objects.size(), pathNameForLogs, missing.get(),
      JavaUtils.duration(start));

  return objects;
}
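// Hypothetical usage sketch (not from the original source): a caller might
// build one ZooKeeper path per task and fetch all payloads in parallel. The
// ACTIVE_TASKS_ROOT constant, taskIds variable, and taskTranscoder are assumptions.
final List<String> paths = Lists.newArrayListWithCapacity(taskIds.size());
for (SingularityTaskId taskId : taskIds) {
  paths.add(ZKPaths.makePath(ACTIVE_TASKS_ROOT, taskId.getId()));
}
final List<SingularityTask> tasks = getAsyncThrows("getActiveTasks", paths, taskTranscoder);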
private <T extends SingularityId> List<T> existsThrows(final String pathNameForLogs, final Collection<String> paths,
    final IdTranscoder<T> idTranscoder) throws Exception {
  if (paths.isEmpty()) {
    return Collections.emptyList();
  }

  final List<T> objects = Lists.newArrayListWithCapacity(paths.size());
  final CountDownLatch latch = new CountDownLatch(paths.size());

  final BackgroundCallback callback = new BackgroundCallback() {
    @Override
    public void processResult(CuratorFramework client, CuratorEvent event) throws Exception {
      if (event.getStat() == null) {
        // Node does not exist; just release the latch.
        latch.countDown();
        return;
      }
      objects.add(idTranscoder.apply(ZKPaths.getNodeFromPath(event.getPath())));
      latch.countDown();
    }
  };

  final long start = System.currentTimeMillis();

  for (String path : paths) {
    curator.checkExists().inBackground(callback).forPath(path);
  }

  checkLatch(latch, pathNameForLogs);

  LOG.trace("Found {} objects out of {} from {} in {}", objects.size(), paths.size(), pathNameForLogs,
      JavaUtils.duration(start));

  return objects;
}
public void download(S3Artifact s3Artifact, Path downloadTo) {
  final long start = System.currentTimeMillis();
  boolean success = false;

  try {
    downloadThrows(s3Artifact, downloadTo);
    success = true;
  } catch (Throwable t) {
    throw Throwables.propagate(t);
  } finally {
    log.info("S3 Download {}/{} finished {} after {}", s3Artifact.getS3Bucket(), s3Artifact.getS3ObjectKey(),
        success ? "successfully" : "with error", JavaUtils.duration(start));
  }
}
private void combineChunk(Path downloadTo, Path path) throws Exception {
  final long start = System.currentTimeMillis();
  long bytes = 0;

  log.info("Writing {} to {}", path, downloadTo);

  try (WritableByteChannel writeChannel =
      Files.newByteChannel(downloadTo, EnumSet.of(StandardOpenOption.APPEND, StandardOpenOption.WRITE))) {
    // DELETE_ON_CLOSE removes the chunk file once its bytes have been appended.
    try (FileChannel readChannel =
        FileChannel.open(path, EnumSet.of(StandardOpenOption.READ, StandardOpenOption.DELETE_ON_CLOSE))) {
      bytes = readChannel.size();
      readChannel.transferTo(0, bytes, writeChannel);
    }
  }

  log.info("Finished writing {} bytes in {}", bytes, JavaUtils.duration(start));
}
private void readInitialFiles() throws IOException {
  final long start = System.currentTimeMillis();
  LOG.info("Scanning for metadata files (*{}) in {}", configuration.getS3MetadataSuffix(),
      configuration.getS3MetadataDirectory());

  int foundFiles = 0;

  for (Path file : JavaUtils.iterable(configuration.getS3MetadataDirectory())) {
    if (!isS3MetadataFile(file)) {
      continue;
    }

    if (handleNewOrModifiedS3Metadata(file)) {
      foundFiles++;
    }
  }

  LOG.info("Found {} file(s) in {}", foundFiles, JavaUtils.duration(start));
}
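// Hypothetical helper sketch (an assumption, not from the original source):
// isS3MetadataFile presumably just matches the configured metadata suffix
// against the file name.
private boolean isS3MetadataFile(Path file) {
  return file.getFileName().toString().endsWith(configuration.getS3MetadataSuffix());
}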
public void startup(MasterInfo masterInfo, SchedulerDriver driver) throws Exception {
  final long start = System.currentTimeMillis();

  final String uri = mesosClient.getMasterUri(MesosUtils.getMasterHostAndPort(masterInfo));

  LOG.info("Starting up... fetching state data from {}", uri);

  zkDataMigrationRunner.checkMigrations();

  MesosMasterStateObject state = mesosClient.getMasterState(uri);

  slaveAndRackManager.loadSlavesAndRacksFromMaster(state);

  checkSchedulerForInconsistentState();

  enqueueHealthAndNewTaskChecks();

  taskReconciliation.startReconciliation();

  LOG.info("Finished startup after {}", JavaUtils.duration(start));
}
@Override
public void shutdown() {
  final long start = System.currentTimeMillis();
  LOG.info("Gracefully shutting down S3Uploader, this may take a few moments...");

  runLock.lock();
  try {
    if (!super.stop()) {
      LOG.info("Already shutting down, ignoring request");
      return;
    }
  } finally {
    runLock.unlock();
  }

  future.cancel(false);

  scheduler.shutdown();
  executorService.shutdown();

  LOG.info("Shut down in {}", JavaUtils.duration(start));
}
private boolean handleChunk(S3Artifact s3Artifact, Future<Path> future, Path downloadTo, int chunk, long start,
    long remainingMillis) {
  if (remainingMillis <= 0) {
    remainingMillis = 1;
  }

  try {
    Path path = future.get(remainingMillis, TimeUnit.MILLISECONDS);

    // Only chunks after the first need to be appended onto the destination file.
    if (chunk > 0) {
      combineChunk(downloadTo, path);
    }

    return true;
  } catch (TimeoutException te) {
    log.error("Chunk {} for {} timed out after {} - had {} remaining", chunk, s3Artifact.getFilename(),
        JavaUtils.duration(start), JavaUtils.durationFromMillis(remainingMillis));
    future.cancel(true);
    exceptionNotifier.notify(te,
        ImmutableMap.of("filename", s3Artifact.getFilename(), "chunk", Integer.toString(chunk)));
  } catch (Throwable t) {
    log.error("Error while handling chunk {} for {}", chunk, s3Artifact.getFilename(), t);
    exceptionNotifier.notify(t,
        ImmutableMap.of("filename", s3Artifact.getFilename(), "chunk", Integer.toString(chunk)));
  }

  return false;
}
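// Hypothetical driver sketch (not from the original source): how a caller
// might submit one future per chunk and feed each into handleChunk, charging
// elapsed time against an overall download deadline. The futures parameter
// and s3ChunkDownloadTimeoutMillis field are assumptions.
private boolean downloadChunks(S3Artifact s3Artifact, Path downloadTo, List<Future<Path>> futures) {
  final long start = System.currentTimeMillis();

  for (int chunk = 0; chunk < futures.size(); chunk++) {
    long remainingMillis = s3ChunkDownloadTimeoutMillis - (System.currentTimeMillis() - start);
    if (!handleChunk(s3Artifact, futures.get(chunk), downloadTo, chunk, start, remainingMillis)) {
      return false; // a failed or timed-out chunk aborts the whole download
    }
  }

  return true;
}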
public void drainPendingQueue(final SingularitySchedulerStateCache stateCache) {
  final long start = System.currentTimeMillis();

  final ImmutableList<SingularityPendingRequest> pendingRequests =
      ImmutableList.copyOf(requestManager.getPendingRequests());

  if (pendingRequests.isEmpty()) {
    LOG.trace("Pending queue was empty");
    return;
  }

  LOG.info("Pending queue had {} requests", pendingRequests.size());

  int totalNewScheduledTasks = 0;
  int heldForScheduledActiveTask = 0;
  int obsoleteRequests = 0;

  for (SingularityPendingRequest pendingRequest : pendingRequests) {
    Optional<SingularityRequestWithState> maybeRequest = requestManager.getRequest(pendingRequest.getRequestId());

    if (shouldScheduleTasks(pendingRequest, maybeRequest)) {
      final List<SingularityTaskId> matchingTaskIds =
          getMatchingTaskIds(stateCache, maybeRequest.get().getRequest(), pendingRequest);
      final SingularityDeployStatistics deployStatistics =
          getDeployStatistics(pendingRequest.getRequestId(), pendingRequest.getDeployId());
      final RequestState requestState = checkCooldown(maybeRequest.get(), deployStatistics);

      int numScheduledTasks = scheduleTasks(stateCache, maybeRequest.get().getRequest(), requestState,
          deployStatistics, pendingRequest, matchingTaskIds);

      if (numScheduledTasks == 0 && !matchingTaskIds.isEmpty() && maybeRequest.get().getRequest().isScheduled()
          && pendingRequest.getPendingType() == PendingType.NEW_DEPLOY) {
        LOG.trace("Holding pending request {} because it is scheduled and has an active task", pendingRequest);
        heldForScheduledActiveTask++;
        // Skip the delete below so the held request stays in the pending queue.
        continue;
      }

      LOG.debug("Pending request {} resulted in {} new scheduled tasks", pendingRequest, numScheduledTasks);

      totalNewScheduledTasks += numScheduledTasks;
    } else {
      LOG.debug("Pending request {} was obsolete (request {})", pendingRequest,
          SingularityRequestWithState.getRequestState(maybeRequest));

      obsoleteRequests++;
    }

    requestManager.deletePendingRequest(pendingRequest);
  }

  LOG.info("Scheduled {} new tasks ({} obsolete requests, {} held) in {}", totalNewScheduledTasks, obsoleteRequests,
      heldForScheduledActiveTask, JavaUtils.duration(start));
}
public void checkForDecomissions(SingularitySchedulerStateCache stateCache) {
  final long start = System.currentTimeMillis();

  final Set<String> requestIdsToReschedule = Sets.newHashSet();
  final Set<SingularityTaskId> matchingTaskIds = Sets.newHashSet();

  final Collection<SingularityTaskId> activeTaskIds = stateCache.getActiveTaskIds();

  final Map<SingularitySlave, MachineState> slaves =
      getDefaultMap(slaveManager.getObjectsFiltered(MachineState.STARTING_DECOMMISSION));

  for (SingularitySlave slave : slaves.keySet()) {
    boolean foundTask = false;

    for (SingularityTask activeTask : taskManager.getTasksOnSlave(activeTaskIds, slave)) {
      cleanupTaskDueToDecomission(requestIdsToReschedule, matchingTaskIds, activeTask, slave);
      foundTask = true;
    }

    if (!foundTask) {
      slaves.put(slave, MachineState.DECOMMISSIONED);
    }
  }

  final Map<SingularityRack, MachineState> racks =
      getDefaultMap(rackManager.getObjectsFiltered(MachineState.STARTING_DECOMMISSION));

  for (SingularityRack rack : racks.keySet()) {
    boolean foundTask = false;

    for (SingularityTaskId activeTaskId : activeTaskIds) {
      if (!rack.getId().equals(activeTaskId.getRackId())) {
        continue;
      }

      foundTask = true;

      if (matchingTaskIds.contains(activeTaskId)) {
        // Already scheduled for cleanup by the slave pass above.
        continue;
      }

      Optional<SingularityTask> maybeTask = taskManager.getTask(activeTaskId);
      cleanupTaskDueToDecomission(requestIdsToReschedule, matchingTaskIds, maybeTask.get(), rack);
    }

    if (!foundTask) {
      racks.put(rack, MachineState.DECOMMISSIONED);
    }
  }

  for (String requestId : requestIdsToReschedule) {
    LOG.trace("Rescheduling request {} due to decommissions", requestId);

    Optional<String> maybeDeployId = deployManager.getInUseDeployId(requestId);

    if (maybeDeployId.isPresent()) {
      requestManager.addToPendingQueue(new SingularityPendingRequest(requestId, maybeDeployId.get(), start,
          PendingType.DECOMISSIONED_SLAVE_OR_RACK));
    } else {
      LOG.warn("Not rescheduling a request ({}) because it has no active deploy", requestId);
    }
  }

  changeState(slaves, slaveManager);
  changeState(racks, rackManager);

  if (slaves.isEmpty() && racks.isEmpty() && requestIdsToReschedule.isEmpty() && matchingTaskIds.isEmpty()) {
    LOG.trace("Decommission check found nothing");
  } else {
    LOG.info("Found {} decommissioning slaves, {} decommissioning racks, rescheduling {} requests and scheduling {} tasks for cleanup in {}",
        slaves.size(), racks.size(), requestIdsToReschedule.size(), matchingTaskIds.size(),
        JavaUtils.duration(start));
  }
}
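// Hypothetical sketch (an assumption, not from the original source):
// getDefaultMap presumably seeds every machine with an in-progress state such
// as DECOMMISSIONING, so the loops above only need to overwrite entries for
// machines with no remaining tasks.
private <T extends SingularityMachineAbstraction> Map<T, MachineState> getDefaultMap(List<T> objects) {
  Map<T, MachineState> map = Maps.newHashMapWithExpectedSize(objects.size());
  for (T object : objects) {
    map.put(object, MachineState.DECOMMISSIONING);
  }
  return map;
}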