public void cancelStage(StageId stageId)
{
    try (SetThreadName ignored = new SetThreadName("Query-%s", queryStateMachine.getQueryId())) {
        SqlStageExecution stage = requireNonNull(stages.get(stageId), () -> format("Stage %s does not exist", stageId));
        stage.cancel();
    }
}
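/**
 * Builds the full stage tree for the plan up front, wires each stage's state changes
 * into the query state machine, and records a per-stage scheduler and linkage for
 * later use by {@link #schedule()}.
 */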
public SqlQueryScheduler(
        QueryStateMachine queryStateMachine,
        LocationFactory locationFactory,
        StageExecutionPlan plan,
        NodePartitioningManager nodePartitioningManager,
        NodeScheduler nodeScheduler,
        RemoteTaskFactory remoteTaskFactory,
        Session session,
        boolean summarizeTaskInfo,
        int splitBatchSize,
        ExecutorService executor,
        OutputBuffers rootOutputBuffers,
        NodeTaskMap nodeTaskMap,
        ExecutionPolicy executionPolicy)
{
    this.queryStateMachine = requireNonNull(queryStateMachine, "queryStateMachine is null");
    this.executionPolicy = requireNonNull(executionPolicy, "executionPolicy is null");
    this.summarizeTaskInfo = summarizeTaskInfo;

    // todo come up with a better way to build this, or eliminate this map
    ImmutableMap.Builder<StageId, StageScheduler> stageSchedulers = ImmutableMap.builder();
    ImmutableMap.Builder<StageId, StageLinkage> stageLinkages = ImmutableMap.builder();

    // Only fetch a distribution once per query to ensure all stages see the same machine assignments
    Map<PartitioningHandle, NodePartitionMap> partitioningCache = new HashMap<>();

    List<SqlStageExecution> stages = createStages(
            Optional.empty(),
            new AtomicInteger(),
            locationFactory,
            plan.withBucketToPartition(Optional.of(new int[1])),
            nodeScheduler,
            remoteTaskFactory,
            session,
            splitBatchSize,
            partitioningHandle -> partitioningCache.computeIfAbsent(partitioningHandle, handle -> nodePartitioningManager.getNodePartitioningMap(session, handle)),
            executor,
            nodeTaskMap,
            stageSchedulers,
            stageLinkages);

    SqlStageExecution rootStage = stages.get(0);
    rootStage.setOutputBuffers(rootOutputBuffers);
    this.rootStageId = rootStage.getStageId();

    this.stages = stages.stream()
            .collect(toImmutableMap(SqlStageExecution::getStageId));

    this.stageSchedulers = stageSchedulers.build();
    this.stageLinkages = stageLinkages.build();

    this.executor = executor;

    rootStage.addStateChangeListener(state -> {
        if (state == FINISHED) {
            queryStateMachine.transitionToFinishing();
        }
        else if (state == CANCELED) {
            // output stage was canceled
            queryStateMachine.transitionToFailed(new PrestoException(USER_CANCELED, "Query was canceled"));
        }
    });

    for (SqlStageExecution stage : stages) {
        stage.addStateChangeListener(state -> {
            if (queryStateMachine.isDone()) {
                return;
            }
            if (state == FAILED) {
                queryStateMachine.transitionToFailed(stage.getStageInfo().getFailureCause().toException());
            }
            else if (state == ABORTED) {
                // this should never happen, since abort can only be triggered in query clean up after the query is finished
                queryStateMachine.transitionToFailed(new PrestoException(INTERNAL_ERROR, "Query stage was aborted"));
            }
            else if (queryStateMachine.getQueryState() == QueryState.STARTING) {
                // if the stage has at least one task, we are running
                if (stage.hasTasks()) {
                    queryStateMachine.transitionToRunning();
                }
            }
        });
    }
}
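/**
 * Drives the scheduling loop: asks the execution policy which stages to schedule,
 * runs each stage's scheduler for one step, propagates new tasks and stage state
 * through the stage linkages, and when all runnable stages are blocked waits up to
 * 100ms for the first blocked future to complete before looping again. Any failure
 * transitions the query to FAILED; all stage schedulers are closed on exit.
 */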
private void schedule()
{
    try (SetThreadName ignored = new SetThreadName("Query-%s", queryStateMachine.getQueryId())) {
        Set<StageId> completedStages = new HashSet<>();
        ExecutionSchedule executionSchedule = executionPolicy.createExecutionSchedule(stages.values());
        while (!executionSchedule.isFinished()) {
            List<CompletableFuture<?>> blockedStages = new ArrayList<>();
            for (SqlStageExecution stage : executionSchedule.getStagesToSchedule()) {
                stage.beginScheduling();

                // perform some scheduling work
                ScheduleResult result = stageSchedulers.get(stage.getStageId())
                        .schedule();

                // modify parent and children based on the results of the scheduling
                if (result.isFinished()) {
                    stage.schedulingComplete();
                }
                else if (!result.getBlocked().isDone()) {
                    blockedStages.add(result.getBlocked());
                }
                stageLinkages.get(stage.getStageId())
                        .processScheduleResults(stage.getState(), result.getNewTasks());
            }

            // make sure to update stage linkage at least once per loop to catch async state changes (e.g., partial cancel)
            for (SqlStageExecution stage : stages.values()) {
                if (!completedStages.contains(stage.getStageId()) && stage.getState().isDone()) {
                    stageLinkages.get(stage.getStageId())
                            .processScheduleResults(stage.getState(), ImmutableSet.of());
                    completedStages.add(stage.getStageId());
                }
            }

            // wait for a state change and then schedule again
            if (!blockedStages.isEmpty()) {
                tryGetFutureValue(firstCompletedFuture(blockedStages), 100, MILLISECONDS);
                for (CompletableFuture<?> blockedStage : blockedStages) {
                    blockedStage.cancel(true);
                }
            }
        }

        for (SqlStageExecution stage : stages.values()) {
            StageState state = stage.getState();
            if (state != SCHEDULED && state != RUNNING && !state.isDone()) {
                throw new PrestoException(INTERNAL_ERROR, format("Scheduling is complete, but stage %s is in state %s", stage.getStageId(), state));
            }
        }
    }
    catch (Throwable t) {
        queryStateMachine.transitionToFailed(t);
        throw Throwables.propagate(t);
    }
    finally {
        RuntimeException closeError = new RuntimeException();
        for (StageScheduler scheduler : stageSchedulers.values()) {
            try {
                scheduler.close();
            }
            catch (Throwable t) {
                queryStateMachine.transitionToFailed(t);
                closeError.addSuppressed(t);
            }
        }
        if (closeError.getSuppressed().length > 0) {
            throw closeError;
        }
    }
}
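/**
 * Recursively builds a SqlStageExecution for this plan fragment and each of its
 * sub-stages, choosing a scheduler per fragment: source-partitioned fragments place
 * splits dynamically, while partitioned fragments use the node partition map (with or
 * without a split source). The returned list starts with the stage for this fragment,
 * followed by its subtree; a state change listener cancels all child stages once
 * their parent is done.
 */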
private List<SqlStageExecution> createStages(
        Optional<SqlStageExecution> parent,
        AtomicInteger nextStageId,
        LocationFactory locationFactory,
        StageExecutionPlan plan,
        NodeScheduler nodeScheduler,
        RemoteTaskFactory remoteTaskFactory,
        Session session,
        int splitBatchSize,
        Function<PartitioningHandle, NodePartitionMap> partitioningCache,
        ExecutorService executor,
        NodeTaskMap nodeTaskMap,
        ImmutableMap.Builder<StageId, StageScheduler> stageSchedulers,
        ImmutableMap.Builder<StageId, StageLinkage> stageLinkages)
{
    ImmutableList.Builder<SqlStageExecution> stages = ImmutableList.builder();

    StageId stageId = new StageId(queryStateMachine.getQueryId(), String.valueOf(nextStageId.getAndIncrement()));
    SqlStageExecution stage = new SqlStageExecution(
            stageId,
            locationFactory.createStageLocation(stageId),
            plan.getFragment(),
            remoteTaskFactory,
            session,
            summarizeTaskInfo,
            nodeTaskMap,
            executor);

    stages.add(stage);

    Optional<int[]> bucketToPartition;
    PartitioningHandle partitioningHandle = plan.getFragment().getPartitioning();
    if (partitioningHandle.equals(SOURCE_DISTRIBUTION)) {
        // nodes are selected dynamically based on the constraints of the splits and the system load
        SplitSource splitSource = plan.getDataSource().get();
        NodeSelector nodeSelector = nodeScheduler.createNodeSelector(splitSource.getDataSourceName());
        SplitPlacementPolicy placementPolicy = new DynamicSplitPlacementPolicy(nodeSelector, stage::getAllTasks);
        stageSchedulers.put(stageId, new SourcePartitionedScheduler(stage, splitSource, placementPolicy, splitBatchSize));
        bucketToPartition = Optional.of(new int[1]);
    }
    else {
        // nodes are predetermined by the nodePartitionMap
        NodePartitionMap nodePartitionMap = partitioningCache.apply(plan.getFragment().getPartitioning());

        if (plan.getDataSource().isPresent()) {
            stageSchedulers.put(stageId, new FixedSourcePartitionedScheduler(
                    stage,
                    plan.getDataSource().get(),
                    nodePartitionMap,
                    splitBatchSize,
                    nodeScheduler.createNodeSelector(null)));
            bucketToPartition = Optional.of(nodePartitionMap.getBucketToPartition());
        }
        else {
            Map<Integer, Node> partitionToNode = nodePartitionMap.getPartitionToNode();
            // todo this should asynchronously wait a standard timeout period before failing
            checkCondition(!partitionToNode.isEmpty(), NO_NODES_AVAILABLE, "No worker nodes available");
            stageSchedulers.put(stageId, new FixedCountScheduler(stage, partitionToNode));
            bucketToPartition = Optional.of(nodePartitionMap.getBucketToPartition());
        }
    }

    ImmutableSet.Builder<SqlStageExecution> childStagesBuilder = ImmutableSet.builder();
    for (StageExecutionPlan subStagePlan : plan.getSubStages()) {
        List<SqlStageExecution> subTree = createStages(
                Optional.of(stage),
                nextStageId,
                locationFactory,
                subStagePlan.withBucketToPartition(bucketToPartition),
                nodeScheduler,
                remoteTaskFactory,
                session,
                splitBatchSize,
                partitioningCache,
                executor,
                nodeTaskMap,
                stageSchedulers,
                stageLinkages);
        stages.addAll(subTree);

        SqlStageExecution childStage = subTree.get(0);
        childStagesBuilder.add(childStage);
    }
    Set<SqlStageExecution> childStages = childStagesBuilder.build();
    stage.addStateChangeListener(newState -> {
        if (newState.isDone()) {
            childStages.forEach(SqlStageExecution::cancel);
        }
    });

    stageLinkages.put(stageId, new StageLinkage(plan.getFragment().getId(), parent, childStages));

    return stages.build();
}