Example #1
0
 /**
  * Cancels the stage registered under {@code stageId}.
  *
  * <p>The lookup and the cancel call run inside a {@code SetThreadName} scope labelled with the
  * query id.
  *
  * @param stageId id of the stage to cancel
  * @throws NullPointerException if {@code stages} has no entry for {@code stageId}; the message
  *     is {@code "Stage <stageId> does not exist"}
  */
 public void cancelStage(StageId stageId) {
   try (SetThreadName ignored = new SetThreadName("Query-%s", queryStateMachine.getQueryId())) {
     // Look up, verify and cancel in one expression; the message is only built on failure.
     requireNonNull(stages.get(stageId), () -> format("Stage %s does not exist", stageId))
         .cancel();
   }
 }
Example #2
0
  /**
   * Builds the scheduler for one query: creates the {@link SqlStageExecution} tree for {@code
   * plan}, records a scheduler and a linkage per stage, and wires stage state changes into the
   * query state machine.
   *
   * <p>State wiring established here:
   *
   * <ul>
   *   <li>root stage {@code FINISHED} moves the query to finishing;
   *   <li>root stage {@code CANCELED} fails the query with {@code USER_CANCELED};
   *   <li>any stage {@code FAILED} fails the query with that stage's failure cause;
   *   <li>any stage {@code ABORTED} fails the query with {@code INTERNAL_ERROR};
   *   <li>while the query is {@code STARTING}, any other state change of a stage that already
   *       has tasks moves the query to running.
   * </ul>
   *
   * @param queryStateMachine state machine of the query being scheduled; must not be null
   * @param locationFactory used to create the location of every stage
   * @param plan root of the stage execution plan
   * @param nodePartitioningManager source of node partition maps; each partitioning handle is
   *     resolved at most once and cached for the lifetime of this constructor call
   * @param nodeScheduler used to create node selectors for the stage schedulers
   * @param remoteTaskFactory passed through to every created stage
   * @param session session the query runs in
   * @param summarizeTaskInfo passed through to every created stage
   * @param splitBatchSize passed through to the source-partitioned schedulers
   * @param executor passed through to every created stage and retained by this scheduler
   * @param rootOutputBuffers output buffers installed on the root stage
   * @param nodeTaskMap passed through to every created stage
   * @param executionPolicy policy later used to build the execution schedule; must not be null
   * @throws NullPointerException if {@code queryStateMachine} or {@code executionPolicy} is null
   */
  public SqlQueryScheduler(
      QueryStateMachine queryStateMachine,
      LocationFactory locationFactory,
      StageExecutionPlan plan,
      NodePartitioningManager nodePartitioningManager,
      NodeScheduler nodeScheduler,
      RemoteTaskFactory remoteTaskFactory,
      Session session,
      boolean summarizeTaskInfo,
      int splitBatchSize,
      ExecutorService executor,
      OutputBuffers rootOutputBuffers,
      NodeTaskMap nodeTaskMap,
      ExecutionPolicy executionPolicy) {
    this.queryStateMachine = requireNonNull(queryStateMachine, "queryStateMachine is null");
    // The message names the actual parameter so a failure points at the right argument.
    this.executionPolicy = requireNonNull(executionPolicy, "executionPolicy is null");
    this.summarizeTaskInfo = summarizeTaskInfo;

    // todo come up with a better way to build this, or eliminate this map
    ImmutableMap.Builder<StageId, StageScheduler> stageSchedulers = ImmutableMap.builder();
    ImmutableMap.Builder<StageId, StageLinkage> stageLinkages = ImmutableMap.builder();

    // Only fetch a distribution once per query to assure all stages see the same machine
    // assignments
    Map<PartitioningHandle, NodePartitionMap> partitioningCache = new HashMap<>();

    // createStages fills both builders as a side effect; the root stage is the first element of
    // the returned list.
    List<SqlStageExecution> stages =
        createStages(
            Optional.empty(),
            new AtomicInteger(),
            locationFactory,
            plan.withBucketToPartition(Optional.of(new int[1])),
            nodeScheduler,
            remoteTaskFactory,
            session,
            splitBatchSize,
            partitioningHandle ->
                partitioningCache.computeIfAbsent(
                    partitioningHandle,
                    handle -> nodePartitioningManager.getNodePartitioningMap(session, handle)),
            executor,
            nodeTaskMap,
            stageSchedulers,
            stageLinkages);

    SqlStageExecution rootStage = stages.get(0);
    rootStage.setOutputBuffers(rootOutputBuffers);
    this.rootStageId = rootStage.getStageId();

    this.stages = stages.stream().collect(toImmutableMap(SqlStageExecution::getStageId));

    this.stageSchedulers = stageSchedulers.build();
    this.stageLinkages = stageLinkages.build();

    this.executor = executor;

    // The root stage decides the successful / user-cancelled outcome of the whole query.
    rootStage.addStateChangeListener(
        state -> {
          if (state == FINISHED) {
            queryStateMachine.transitionToFinishing();
          } else if (state == CANCELED) {
            // output stage was canceled
            queryStateMachine.transitionToFailed(
                new PrestoException(USER_CANCELED, "Query was canceled"));
          }
        });

    // Every stage (root included) can fail the query or move it from STARTING to RUNNING.
    for (SqlStageExecution stage : stages) {
      stage.addStateChangeListener(
          state -> {
            if (queryStateMachine.isDone()) {
              return;
            }
            if (state == FAILED) {
              queryStateMachine.transitionToFailed(
                  stage.getStageInfo().getFailureCause().toException());
            } else if (state == ABORTED) {
              // this should never happen, since abort can only be triggered in query clean up after
              // the query is finished
              queryStateMachine.transitionToFailed(
                  new PrestoException(INTERNAL_ERROR, "Query stage was aborted"));
            } else if (queryStateMachine.getQueryState() == QueryState.STARTING) {
              // if the stage has at least one task, we are running
              if (stage.hasTasks()) {
                queryStateMachine.transitionToRunning();
              }
            }
          });
    }
  }
Example #3
0
 /**
  * Calls {@code abort()} on every stage held in {@code stages}.
  *
  * <p>The calls run inside a {@code SetThreadName} scope labelled with the query id.
  */
 public void abort() {
   try (SetThreadName ignored = new SetThreadName("Query-%s", queryStateMachine.getQueryId())) {
     for (SqlStageExecution stage : stages.values()) {
       stage.abort();
     }
   }
 }
Example #4
0
  /**
   * Runs the scheduling loop for this query.
   *
   * <p>Builds an execution schedule from {@code executionPolicy} over all stages and, until that
   * schedule reports it is finished, repeatedly asks each schedulable stage's {@code
   * StageScheduler} to do work and forwards the outcome to the stage's {@code StageLinkage}.
   * After the loop, every stage must be {@code SCHEDULED}, {@code RUNNING} or done; otherwise a
   * {@code PrestoException} with {@code INTERNAL_ERROR} is raised.
   *
   * <p>Any {@code Throwable} raised while scheduling transitions the query to failed and is
   * rethrown. All stage schedulers are closed in the {@code finally} block regardless of outcome.
   */
  private void schedule() {
    try (SetThreadName ignored = new SetThreadName("Query-%s", queryStateMachine.getQueryId())) {
      // Ids of stages whose linkage has already been told that the stage reached a done state.
      Set<StageId> completedStages = new HashSet<>();
      ExecutionSchedule executionSchedule =
          executionPolicy.createExecutionSchedule(stages.values());
      while (!executionSchedule.isFinished()) {
        // Futures of stages that could not make progress during this pass.
        List<CompletableFuture<?>> blockedStages = new ArrayList<>();
        for (SqlStageExecution stage : executionSchedule.getStagesToSchedule()) {
          stage.beginScheduling();

          // perform some scheduling work
          ScheduleResult result = stageSchedulers.get(stage.getStageId()).schedule();

          // modify parent and children based on the results of the scheduling
          if (result.isFinished()) {
            stage.schedulingComplete();
          } else if (!result.getBlocked().isDone()) {
            blockedStages.add(result.getBlocked());
          }
          // The stage state is read after schedulingComplete() so the linkage sees the state
          // as it is at this point.
          stageLinkages
              .get(stage.getStageId())
              .processScheduleResults(stage.getState(), result.getNewTasks());
        }

        // make sure to update stage linkage at least once per loop to catch async state changes
        // (e.g., partial cancel)
        for (SqlStageExecution stage : stages.values()) {
          if (!completedStages.contains(stage.getStageId()) && stage.getState().isDone()) {
            // Notify the linkage exactly once per stage, with no new tasks.
            stageLinkages
                .get(stage.getStageId())
                .processScheduleResults(stage.getState(), ImmutableSet.of());
            completedStages.add(stage.getStageId());
          }
        }

        // wait for a state change and then schedule again
        if (!blockedStages.isEmpty()) {
          // NOTE(review): presumably waits at most 100 ms for the first blocked future to
          // complete — confirm against tryGetFutureValue / firstCompletedFuture.
          tryGetFutureValue(firstCompletedFuture(blockedStages), 100, MILLISECONDS);
          // Cancel this pass's futures; the next pass calls schedule() again and obtains a new
          // result for each stage.
          for (CompletableFuture<?> blockedStage : blockedStages) {
            blockedStage.cancel(true);
          }
        }
      }

      // Sanity check: once the schedule is finished no stage may still be in a pre-scheduled
      // state.
      for (SqlStageExecution stage : stages.values()) {
        StageState state = stage.getState();
        if (state != SCHEDULED && state != RUNNING && !state.isDone()) {
          throw new PrestoException(
              INTERNAL_ERROR,
              format(
                  "Scheduling is complete, but stage %s is in state %s",
                  stage.getStageId(), state));
        }
      }
    } catch (Throwable t) {
      // Fail the query first, then let the failure propagate to the caller.
      queryStateMachine.transitionToFailed(t);
      throw Throwables.propagate(t);
    } finally {
      // Close every scheduler even if some fail; failures are gathered as suppressed exceptions
      // on a single carrier exception.
      RuntimeException closeError = new RuntimeException();
      for (StageScheduler scheduler : stageSchedulers.values()) {
        try {
          scheduler.close();
        } catch (Throwable t) {
          queryStateMachine.transitionToFailed(t);
          closeError.addSuppressed(t);
        }
      }
      // NOTE(review): throwing from finally replaces any exception rethrown by the catch block
      // above; the original failure is then only visible through the query state machine.
      if (closeError.getSuppressed().length > 0) {
        throw closeError;
      }
    }
  }
Example #5
0
  /**
   * Recursively creates the stage for {@code plan} and the stages for all of its sub-plans.
   *
   * <p>For every stage created, a {@code StageScheduler} is put into {@code stageSchedulers} and
   * a {@code StageLinkage} into {@code stageLinkages}. The scheduler type depends on the
   * fragment's partitioning:
   *
   * <ul>
   *   <li>{@code SOURCE_DISTRIBUTION}: a {@code SourcePartitionedScheduler} with a dynamic split
   *       placement policy;
   *   <li>otherwise, with a data source: a {@code FixedSourcePartitionedScheduler};
   *   <li>otherwise, without a data source: a {@code FixedCountScheduler}, which requires at
   *       least one partition-to-node entry.
   * </ul>
   *
   * <p>Each stage also gets a listener that cancels its direct child stages once the stage
   * reaches a done state.
   *
   * @param parent stage that consumes this stage's output, or empty for the root
   * @param nextStageId counter supplying the numeric part of each new stage id
   * @param locationFactory used to create the stage location
   * @param plan plan of the stage to create
   * @param nodeScheduler used to create node selectors
   * @param remoteTaskFactory passed to the created stage
   * @param session passed to the created stage
   * @param splitBatchSize passed to the source-partitioned schedulers
   * @param partitioningCache resolves a partitioning handle to its node partition map
   * @param executor passed to the created stage
   * @param nodeTaskMap passed to the created stage
   * @param stageSchedulers builder receiving one scheduler per created stage
   * @param stageLinkages builder receiving one linkage per created stage
   * @return the created stages; the stage for {@code plan} comes first, followed by the stages of
   *     each sub-plan's subtree in iteration order
   */
  private List<SqlStageExecution> createStages(
      Optional<SqlStageExecution> parent,
      AtomicInteger nextStageId,
      LocationFactory locationFactory,
      StageExecutionPlan plan,
      NodeScheduler nodeScheduler,
      RemoteTaskFactory remoteTaskFactory,
      Session session,
      int splitBatchSize,
      Function<PartitioningHandle, NodePartitionMap> partitioningCache,
      ExecutorService executor,
      NodeTaskMap nodeTaskMap,
      ImmutableMap.Builder<StageId, StageScheduler> stageSchedulers,
      ImmutableMap.Builder<StageId, StageLinkage> stageLinkages) {
    StageId stageId =
        new StageId(queryStateMachine.getQueryId(), String.valueOf(nextStageId.getAndIncrement()));
    SqlStageExecution stage =
        new SqlStageExecution(
            stageId,
            locationFactory.createStageLocation(stageId),
            plan.getFragment(),
            remoteTaskFactory,
            session,
            summarizeTaskInfo,
            nodeTaskMap,
            executor);

    // This stage always leads the result list; descendants are appended below.
    ImmutableList.Builder<SqlStageExecution> created = ImmutableList.builder();
    created.add(stage);

    PartitioningHandle partitioningHandle = plan.getFragment().getPartitioning();
    Optional<int[]> bucketToPartition;
    if (!partitioningHandle.equals(SOURCE_DISTRIBUTION)) {
      // nodes are pre determined by the nodePartitionMap
      NodePartitionMap nodePartitionMap = partitioningCache.apply(partitioningHandle);

      StageScheduler scheduler;
      if (plan.getDataSource().isPresent()) {
        scheduler =
            new FixedSourcePartitionedScheduler(
                stage,
                plan.getDataSource().get(),
                nodePartitionMap,
                splitBatchSize,
                nodeScheduler.createNodeSelector(null));
      } else {
        Map<Integer, Node> partitionToNode = nodePartitionMap.getPartitionToNode();
        // todo this should asynchronously wait a standard timeout period before failing
        checkCondition(!partitionToNode.isEmpty(), NO_NODES_AVAILABLE, "No worker nodes available");
        scheduler = new FixedCountScheduler(stage, partitionToNode);
      }
      stageSchedulers.put(stageId, scheduler);
      bucketToPartition = Optional.of(nodePartitionMap.getBucketToPartition());
    } else {
      // nodes are selected dynamically based on the constraints of the splits and the system load
      SplitSource splitSource = plan.getDataSource().get();
      SplitPlacementPolicy dynamicPolicy =
          new DynamicSplitPlacementPolicy(
              nodeScheduler.createNodeSelector(splitSource.getDataSourceName()),
              stage::getAllTasks);
      stageSchedulers.put(
          stageId,
          new SourcePartitionedScheduler(stage, splitSource, dynamicPolicy, splitBatchSize));
      bucketToPartition = Optional.of(new int[1]);
    }

    // Recurse into each sub-plan; the head of every returned subtree is a direct child.
    ImmutableSet.Builder<SqlStageExecution> directChildren = ImmutableSet.builder();
    for (StageExecutionPlan childPlan : plan.getSubStages()) {
      List<SqlStageExecution> childTree =
          createStages(
              Optional.of(stage),
              nextStageId,
              locationFactory,
              childPlan.withBucketToPartition(bucketToPartition),
              nodeScheduler,
              remoteTaskFactory,
              session,
              splitBatchSize,
              partitioningCache,
              executor,
              nodeTaskMap,
              stageSchedulers,
              stageLinkages);
      created.addAll(childTree);
      directChildren.add(childTree.get(0));
    }
    Set<SqlStageExecution> childStages = directChildren.build();

    // Once this stage is done, cancel its direct children.
    stage.addStateChangeListener(
        state -> {
          if (!state.isDone()) {
            return;
          }
          for (SqlStageExecution child : childStages) {
            child.cancel();
          }
        });

    stageLinkages.put(stageId, new StageLinkage(plan.getFragment().getId(), parent, childStages));

    return created.build();
  }