public SqlQueryScheduler(
        QueryStateMachine queryStateMachine,
        LocationFactory locationFactory,
        StageExecutionPlan plan,
        NodePartitioningManager nodePartitioningManager,
        NodeScheduler nodeScheduler,
        RemoteTaskFactory remoteTaskFactory,
        Session session,
        boolean summarizeTaskInfo,
        int splitBatchSize,
        ExecutorService executor,
        OutputBuffers rootOutputBuffers,
        NodeTaskMap nodeTaskMap,
        ExecutionPolicy executionPolicy)
{
    this.queryStateMachine = requireNonNull(queryStateMachine, "queryStateMachine is null");
    this.executionPolicy = requireNonNull(executionPolicy, "executionPolicy is null");
    this.summarizeTaskInfo = summarizeTaskInfo;

    // todo come up with a better way to build this, or eliminate this map
    ImmutableMap.Builder<StageId, StageScheduler> stageSchedulers = ImmutableMap.builder();
    ImmutableMap.Builder<StageId, StageLinkage> stageLinkages = ImmutableMap.builder();

    // Only fetch a distribution once per query to ensure all stages see the same machine assignments
    Map<PartitioningHandle, NodePartitionMap> partitioningCache = new HashMap<>();

    List<SqlStageExecution> stages = createStages(
            Optional.empty(),
            new AtomicInteger(),
            locationFactory,
            plan.withBucketToPartition(Optional.of(new int[1])),
            nodeScheduler,
            remoteTaskFactory,
            session,
            splitBatchSize,
            partitioningHandle -> partitioningCache.computeIfAbsent(
                    partitioningHandle,
                    handle -> nodePartitioningManager.getNodePartitioningMap(session, handle)),
            executor,
            nodeTaskMap,
            stageSchedulers,
            stageLinkages);

    SqlStageExecution rootStage = stages.get(0);
    rootStage.setOutputBuffers(rootOutputBuffers);
    this.rootStageId = rootStage.getStageId();

    this.stages = stages.stream()
            .collect(toImmutableMap(SqlStageExecution::getStageId, identity()));

    this.stageSchedulers = stageSchedulers.build();
    this.stageLinkages = stageLinkages.build();

    this.executor = executor;

    rootStage.addStateChangeListener(state -> {
        if (state == FINISHED) {
            queryStateMachine.transitionToFinishing();
        }
        else if (state == CANCELED) {
            // output stage was canceled
            queryStateMachine.transitionToFailed(new PrestoException(USER_CANCELED, "Query was canceled"));
        }
    });

    for (SqlStageExecution stage : stages) {
        stage.addStateChangeListener(state -> {
            if (queryStateMachine.isDone()) {
                return;
            }
            if (state == FAILED) {
                queryStateMachine.transitionToFailed(stage.getStageInfo().getFailureCause().toException());
            }
            else if (state == ABORTED) {
                // this should never happen, since abort can only be triggered in query cleanup after the query is finished
                queryStateMachine.transitionToFailed(new PrestoException(INTERNAL_ERROR, "Query stage was aborted"));
            }
            else if (queryStateMachine.getQueryState() == QueryState.STARTING) {
                // if the stage has at least one task, we are running
                if (stage.hasTasks()) {
                    queryStateMachine.transitionToRunning();
                }
            }
        });
    }
}
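// A minimal, self-contained sketch (class and method names here are hypothetical, not
// part of the scheduler) of the memoization pattern the constructor uses for its
// partitioningCache: computeIfAbsent guarantees the expensive lookup runs at most once
// per handle, so every stage in the query resolves a given PartitioningHandle to the
// same node assignments.
import java.util.HashMap;
import java.util.Map;
import java.util.function.Function;

public class PartitioningCacheSketch
{
    public static void main(String[] args)
    {
        Map<String, long[]> cache = new HashMap<>();

        // Stands in for the partitioningHandle -> NodePartitionMap lambda above
        Function<String, long[]> cachedLookup =
                handle -> cache.computeIfAbsent(handle, PartitioningCacheSketch::expensiveLookup);

        // Both calls observe the identical instance; the lookup itself ran only once
        long[] first = cachedLookup.apply("fixed-hash");
        long[] second = cachedLookup.apply("fixed-hash");
        System.out.println(first == second); // true
    }

    private static long[] expensiveLookup(String handle)
    {
        System.out.println("resolving " + handle); // printed once per distinct handle
        return new long[] {1, 2, 3};
    }
}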
private List<SqlStageExecution> createStages(
        Optional<SqlStageExecution> parent,
        AtomicInteger nextStageId,
        LocationFactory locationFactory,
        StageExecutionPlan plan,
        NodeScheduler nodeScheduler,
        RemoteTaskFactory remoteTaskFactory,
        Session session,
        int splitBatchSize,
        Function<PartitioningHandle, NodePartitionMap> partitioningCache,
        ExecutorService executor,
        NodeTaskMap nodeTaskMap,
        ImmutableMap.Builder<StageId, StageScheduler> stageSchedulers,
        ImmutableMap.Builder<StageId, StageLinkage> stageLinkages)
{
    ImmutableList.Builder<SqlStageExecution> stages = ImmutableList.builder();

    StageId stageId = new StageId(queryStateMachine.getQueryId(), String.valueOf(nextStageId.getAndIncrement()));
    SqlStageExecution stage = new SqlStageExecution(
            stageId,
            locationFactory.createStageLocation(stageId),
            plan.getFragment(),
            remoteTaskFactory,
            session,
            summarizeTaskInfo,
            nodeTaskMap,
            executor);
    stages.add(stage);

    Optional<int[]> bucketToPartition;
    PartitioningHandle partitioningHandle = plan.getFragment().getPartitioning();
    if (partitioningHandle.equals(SOURCE_DISTRIBUTION)) {
        // nodes are selected dynamically based on the constraints of the splits and the system load
        SplitSource splitSource = plan.getDataSource().get();
        NodeSelector nodeSelector = nodeScheduler.createNodeSelector(splitSource.getDataSourceName());
        SplitPlacementPolicy placementPolicy = new DynamicSplitPlacementPolicy(nodeSelector, stage::getAllTasks);
        stageSchedulers.put(stageId, new SourcePartitionedScheduler(stage, splitSource, placementPolicy, splitBatchSize));
        bucketToPartition = Optional.of(new int[1]);
    }
    else {
        // nodes are predetermined by the nodePartitionMap
        NodePartitionMap nodePartitionMap = partitioningCache.apply(partitioningHandle);

        if (plan.getDataSource().isPresent()) {
            stageSchedulers.put(stageId, new FixedSourcePartitionedScheduler(
                    stage,
                    plan.getDataSource().get(),
                    nodePartitionMap,
                    splitBatchSize,
                    nodeScheduler.createNodeSelector(null)));
            bucketToPartition = Optional.of(nodePartitionMap.getBucketToPartition());
        }
        else {
            Map<Integer, Node> partitionToNode = nodePartitionMap.getPartitionToNode();
            // todo this should asynchronously wait a standard timeout period before failing
            checkCondition(!partitionToNode.isEmpty(), NO_NODES_AVAILABLE, "No worker nodes available");
            stageSchedulers.put(stageId, new FixedCountScheduler(stage, partitionToNode));
            bucketToPartition = Optional.of(nodePartitionMap.getBucketToPartition());
        }
    }

    ImmutableSet.Builder<SqlStageExecution> childStagesBuilder = ImmutableSet.builder();
    for (StageExecutionPlan subStagePlan : plan.getSubStages()) {
        List<SqlStageExecution> subTree = createStages(
                Optional.of(stage),
                nextStageId,
                locationFactory,
                subStagePlan.withBucketToPartition(bucketToPartition),
                nodeScheduler,
                remoteTaskFactory,
                session,
                splitBatchSize,
                partitioningCache,
                executor,
                nodeTaskMap,
                stageSchedulers,
                stageLinkages);
        stages.addAll(subTree);

        SqlStageExecution childStage = subTree.get(0);
        childStagesBuilder.add(childStage);
    }
    Set<SqlStageExecution> childStages = childStagesBuilder.build();

    // when this stage reaches a terminal state, cancel all of its child stages
    stage.addStateChangeListener(newState -> {
        if (newState.isDone()) {
            childStages.forEach(SqlStageExecution::cancel);
        }
    });

    stageLinkages.put(stageId, new StageLinkage(plan.getFragment().getId(), parent, childStages));

    return stages.build();
}
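// A minimal sketch (all types and names below are hypothetical stand-ins, not the real
// Presto classes) of the cancellation linkage createStages installs: every stage
// registers a listener so that when it reaches a terminal state, its child stages are
// cancelled, preventing orphaned work lower in the stage tree.
import java.util.ArrayList;
import java.util.List;
import java.util.function.Consumer;

public class StageLinkageSketch
{
    enum State
    {
        RUNNING, FINISHED, CANCELED;

        boolean isDone()
        {
            return this != RUNNING;
        }
    }

    static class Stage
    {
        private final String name;
        private final List<Consumer<State>> listeners = new ArrayList<>();

        Stage(String name)
        {
            this.name = name;
        }

        void addStateChangeListener(Consumer<State> listener)
        {
            listeners.add(listener);
        }

        void transitionTo(State newState)
        {
            listeners.forEach(listener -> listener.accept(newState));
        }

        void cancel()
        {
            System.out.println(name + " cancelled");
        }
    }

    public static void main(String[] args)
    {
        Stage parent = new Stage("parent");
        List<Stage> children = List.of(new Stage("child-0"), new Stage("child-1"));

        // same wiring as in createStages: a done parent cancels every child stage
        parent.addStateChangeListener(newState -> {
            if (newState.isDone()) {
                children.forEach(Stage::cancel);
            }
        });

        parent.transitionTo(State.FINISHED); // prints "child-0 cancelled", "child-1 cancelled"
    }
}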