public SqlQueryScheduler( QueryStateMachine queryStateMachine, LocationFactory locationFactory, StageExecutionPlan plan, NodePartitioningManager nodePartitioningManager, NodeScheduler nodeScheduler, RemoteTaskFactory remoteTaskFactory, Session session, boolean summarizeTaskInfo, int splitBatchSize, ExecutorService executor, OutputBuffers rootOutputBuffers, NodeTaskMap nodeTaskMap, ExecutionPolicy executionPolicy) { this.queryStateMachine = requireNonNull(queryStateMachine, "queryStateMachine is null"); this.executionPolicy = requireNonNull(executionPolicy, "schedulerPolicyFactory is null"); this.summarizeTaskInfo = summarizeTaskInfo; // todo come up with a better way to build this, or eliminate this map ImmutableMap.Builder<StageId, StageScheduler> stageSchedulers = ImmutableMap.builder(); ImmutableMap.Builder<StageId, StageLinkage> stageLinkages = ImmutableMap.builder(); // Only fetch a distribution once per query to assure all stages see the same machine // assignments Map<PartitioningHandle, NodePartitionMap> partitioningCache = new HashMap<>(); List<SqlStageExecution> stages = createStages( Optional.empty(), new AtomicInteger(), locationFactory, plan.withBucketToPartition(Optional.of(new int[1])), nodeScheduler, remoteTaskFactory, session, splitBatchSize, partitioningHandle -> partitioningCache.computeIfAbsent( partitioningHandle, handle -> nodePartitioningManager.getNodePartitioningMap(session, handle)), executor, nodeTaskMap, stageSchedulers, stageLinkages); SqlStageExecution rootStage = stages.get(0); rootStage.setOutputBuffers(rootOutputBuffers); this.rootStageId = rootStage.getStageId(); this.stages = stages.stream().collect(toImmutableMap(SqlStageExecution::getStageId)); this.stageSchedulers = stageSchedulers.build(); this.stageLinkages = stageLinkages.build(); this.executor = executor; rootStage.addStateChangeListener( state -> { if (state == FINISHED) { queryStateMachine.transitionToFinishing(); } else if (state == CANCELED) { // output stage was canceled queryStateMachine.transitionToFailed( new PrestoException(USER_CANCELED, "Query was canceled")); } }); for (SqlStageExecution stage : stages) { stage.addStateChangeListener( state -> { if (queryStateMachine.isDone()) { return; } if (state == FAILED) { queryStateMachine.transitionToFailed( stage.getStageInfo().getFailureCause().toException()); } else if (state == ABORTED) { // this should never happen, since abort can only be triggered in query clean up after // the query is finished queryStateMachine.transitionToFailed( new PrestoException(INTERNAL_ERROR, "Query stage was aborted")); } else if (queryStateMachine.getQueryState() == QueryState.STARTING) { // if the stage has at least one task, we are running if (stage.hasTasks()) { queryStateMachine.transitionToRunning(); } } }); } }
private void schedule() { try (SetThreadName ignored = new SetThreadName("Query-%s", queryStateMachine.getQueryId())) { Set<StageId> completedStages = new HashSet<>(); ExecutionSchedule executionSchedule = executionPolicy.createExecutionSchedule(stages.values()); while (!executionSchedule.isFinished()) { List<CompletableFuture<?>> blockedStages = new ArrayList<>(); for (SqlStageExecution stage : executionSchedule.getStagesToSchedule()) { stage.beginScheduling(); // perform some scheduling work ScheduleResult result = stageSchedulers.get(stage.getStageId()).schedule(); // modify parent and children based on the results of the scheduling if (result.isFinished()) { stage.schedulingComplete(); } else if (!result.getBlocked().isDone()) { blockedStages.add(result.getBlocked()); } stageLinkages .get(stage.getStageId()) .processScheduleResults(stage.getState(), result.getNewTasks()); } // make sure to update stage linkage at least once per loop to catch async state changes // (e.g., partial cancel) for (SqlStageExecution stage : stages.values()) { if (!completedStages.contains(stage.getStageId()) && stage.getState().isDone()) { stageLinkages .get(stage.getStageId()) .processScheduleResults(stage.getState(), ImmutableSet.of()); completedStages.add(stage.getStageId()); } } // wait for a state change and then schedule again if (!blockedStages.isEmpty()) { tryGetFutureValue(firstCompletedFuture(blockedStages), 100, MILLISECONDS); for (CompletableFuture<?> blockedStage : blockedStages) { blockedStage.cancel(true); } } } for (SqlStageExecution stage : stages.values()) { StageState state = stage.getState(); if (state != SCHEDULED && state != RUNNING && !state.isDone()) { throw new PrestoException( INTERNAL_ERROR, format( "Scheduling is complete, but stage %s is in state %s", stage.getStageId(), state)); } } } catch (Throwable t) { queryStateMachine.transitionToFailed(t); throw Throwables.propagate(t); } finally { RuntimeException closeError = new RuntimeException(); for (StageScheduler scheduler : stageSchedulers.values()) { try { scheduler.close(); } catch (Throwable t) { queryStateMachine.transitionToFailed(t); closeError.addSuppressed(t); } } if (closeError.getSuppressed().length > 0) { throw closeError; } } }