Ejemplo n.º 1
0
  /**
   * Gives every external sort in the plan an equal share of the per-node memory budget.
   *
   * <p>The budget is the smallest of {@code DrillConfig.getMaxDirectMemory()}, the configured
   * {@code TOP_LEVEL_MAX_ALLOC} value and the {@code MAX_QUERY_MEMORY_PER_NODE_KEY} option. It is
   * divided by the number of external sorts multiplied by the {@code MAX_WIDTH_PER_NODE_KEY}
   * option. Plans without an external sort are left untouched.
   *
   * @param plan the physical plan whose external sorts receive a maximum allocation
   */
  private void setupSortMemoryAllocations(final PhysicalPlan plan) {
    // gather the external sort operators of the plan
    final List<ExternalSort> externalSorts = new LinkedList<>();
    for (final PhysicalOperator operator : plan.getSortedOperators()) {
      if (operator instanceof ExternalSort) {
        externalSorts.add((ExternalSort) operator);
      }
    }

    // nothing to configure when the plan does not sort
    if (externalSorts.isEmpty()) {
      return;
    }

    final OptionManager options = queryContext.getOptions();
    final long widthPerNode = options.getOption(ExecConstants.MAX_WIDTH_PER_NODE_KEY).num_val;
    final long configuredLimit =
        Math.min(
            DrillConfig.getMaxDirectMemory(),
            queryContext.getConfig().getLong(ExecConstants.TOP_LEVEL_MAX_ALLOC));
    final long nodeBudget =
        Math.min(
            configuredLimit,
            options.getOption(ExecConstants.MAX_QUERY_MEMORY_PER_NODE_KEY).num_val);

    // each sort may run at full width on a node, so split the budget across all of them
    final long maxSortAlloc = nodeBudget / (externalSorts.size() * widthPerNode);
    logger.debug("Max sort alloc: {}", maxSortAlloc);

    for (final ExternalSort sort : externalSorts) {
      sort.setMaxAllocation(maxSortAlloc);
    }
  }
Ejemplo n.º 2
0
  /**
   * Converts the relNode tree of a SELECT statement into a Drill logical rel tree.
   *
   * <p>Planning goes through {@code logicalPlanningVolcanoAndLopt} when the HEP join optimization
   * setting is enabled and through {@code logicalPlanningVolcano} otherwise. When the converted
   * tree contains a limit 0, the planner settings are forced into single mode.
   *
   * @param relNode root of the tree to convert
   * @return the converted Drill logical rel
   * @throws SqlUnsupportedException declared for unsupported queries; the
   *     {@code UnsupportedRelOperatorException} raised below when planning fails and the join
   *     check matches is presumably of this kind
   * @throws RelConversionException declared by the planning steps
   * @throws UnsupportedOperationException if the converted root is a {@code DrillStoreRel}
   */
  protected DrillRel convertToDrel(RelNode relNode)
      throws SqlUnsupportedException, RelConversionException {
    try {
      final DrillRel logicalRel;
      if (context.getPlannerSettings().isHepJoinOptEnabled()) {
        logicalRel = (DrillRel) logicalPlanningVolcanoAndLopt(relNode);
      } else {
        logicalRel = (DrillRel) logicalPlanningVolcano(relNode);
      }

      // a store rel is not accepted as the result of this conversion
      if (logicalRel instanceof DrillStoreRel) {
        throw new UnsupportedOperationException();
      }

      // distributed mode is overkill when the query only determines a schema (limit 0)
      if (FindLimit0Visitor.containsLimit0(logicalRel)) {
        context.getPlannerSettings().forceSingleMode();
      }
      return logicalRel;
    } catch (RelOptPlanner.CannotPlanException ex) {
      logger.error(ex.getMessage());

      final boolean joinCheckMatched =
          JoinUtils.checkCartesianJoin(
              relNode, new ArrayList<Integer>(), new ArrayList<Integer>());
      if (!joinCheckMatched) {
        throw ex;
      }
      throw new UnsupportedRelOperatorException(
          "This query cannot be planned possibly due to either a cartesian join or an inequality join");
    }
  }
Ejemplo n.º 3
0
  /**
   * Parses a JSON logical plan and processes it according to the result mode the plan requests.
   *
   * <p>A plan requesting {@code LOGICAL} output is rejected through {@code fail} and nothing else
   * is done with it. A plan requesting {@code PHYSICAL} output is converted and handed to
   * {@code returnPhysical}. Any other plan is converted and executed. Parsing and optimizer
   * failures are reported through {@code fail}.
   *
   * @param json the JSON text of the logical plan
   */
  private void parseAndRunLogicalPlan(String json) {

    try {
      LogicalPlan logicalPlan = context.getPlanReader().readLogicalPlan(json);

      if (logicalPlan.getProperties().resultMode == ResultMode.LOGICAL) {
        fail(
            "Failure running plan.  You requested a result mode of LOGICAL and submitted a logical plan.  In this case you're output mode must be PHYSICAL or EXEC.",
            new Exception());
        // the failure has been reported; do not go on to convert and run the rejected plan
        return;
      }
      if (logger.isDebugEnabled()) {
        logger.debug("Logical {}", logicalPlan.unparse(context.getConfig()));
      }
      PhysicalPlan physicalPlan = convert(logicalPlan);

      if (logicalPlan.getProperties().resultMode == ResultMode.PHYSICAL) {
        returnPhysical(physicalPlan);
        return;
      }

      if (logger.isDebugEnabled()) {
        logger.debug(
            "Physical {}", context.getConfig().getMapper().writeValueAsString(physicalPlan));
      }
      runPhysicalPlan(physicalPlan);
    } catch (IOException e) {
      fail("Failure while parsing logical plan.", e);
    } catch (OptimizerException e) {
      fail("Failure while converting logical plan to physical plan.", e);
    }
  }
Ejemplo n.º 4
0
  /**
   * Builds the work unit for a physical plan: the plan is split into fragments starting from its
   * first sorted operator, and the fragments are parallelized over the active endpoints.
   *
   * <p>When trace logging is enabled, a description of every plan fragment (position, major and
   * minor fragment ids, assigned endpoint address and pretty-printed fragment JSON) is written to
   * the log as one message.
   *
   * @param plan the physical plan to fragment and parallelize
   * @return the work unit produced by the parallelizer
   * @throws ExecutionSetupException declared by the fragmenting and parallelizing steps
   */
  private QueryWorkUnit getQueryWorkUnit(final PhysicalPlan plan) throws ExecutionSetupException {
    final PhysicalOperator rootOperator = plan.getSortedOperators(false).iterator().next();
    final Fragment rootFragment = rootOperator.accept(MakeFragmentsVisitor.INSTANCE, null);
    final SimpleParallelizer parallelizer = new SimpleParallelizer(queryContext);
    final QueryWorkUnit queryWorkUnit =
        parallelizer.getFragments(
            queryContext.getOptions().getOptionList(),
            queryContext.getCurrentEndpoint(),
            queryId,
            queryContext.getActiveEndpoints(),
            drillbitContext.getPlanReader(),
            rootFragment,
            initiatingClient.getSession(),
            queryContext.getQueryContextInfo());

    if (logger.isTraceEnabled()) {
      final StringBuilder sb = new StringBuilder();
      sb.append("PlanFragments for query ");
      sb.append(queryId);
      sb.append('\n');

      // one mapper serves every fragment; there is no need to build a new one per iteration
      final ObjectMapper objectMapper = new ObjectMapper();
      final List<PlanFragment> planFragments = queryWorkUnit.getFragments();
      final int fragmentCount = planFragments.size();
      int fragmentIndex = 0;
      for (final PlanFragment planFragment : planFragments) {
        final FragmentHandle fragmentHandle = planFragment.getHandle();
        sb.append("PlanFragment(");
        sb.append(++fragmentIndex);
        sb.append('/');
        sb.append(fragmentCount);
        sb.append(") major_fragment_id ");
        sb.append(fragmentHandle.getMajorFragmentId());
        sb.append(" minor_fragment_id ");
        sb.append(fragmentHandle.getMinorFragmentId());
        sb.append('\n');

        final DrillbitEndpoint endpointAssignment = planFragment.getAssignment();
        sb.append("  DrillbitEndpoint address ");
        sb.append(endpointAssignment.getAddress());
        sb.append('\n');

        String jsonString = "<<malformed JSON>>";
        sb.append("  fragment_json: ");
        try {
          final Object json = objectMapper.readValue(planFragment.getFragmentJson(), Object.class);
          jsonString = objectMapper.defaultPrettyPrintingWriter().writeValueAsString(json);
        } catch (final Exception ignored) {
          // best effort for a trace message: jsonString keeps its fallback value
        }
        sb.append(jsonString);
        sb.append('\n');
      }

      // log once after the loop; logging the shared builder inside the loop repeated every
      // earlier fragment in each later message
      logger.trace(sb.toString());
    }

    return queryWorkUnit;
  }
Ejemplo n.º 5
0
 /**
  * Writes the JSON form of a physical plan to the given logger at debug level.
  *
  * <p>The plan is only serialized when debug logging is enabled on {@code logger}.
  *
  * @param name label placed in front of the plan text
  * @param plan the physical plan to log
  * @param logger the logger that receives the message
  * @throws JsonProcessingException declared for the serialization of the plan
  */
 protected void log(final String name, final PhysicalPlan plan, final Logger logger)
     throws JsonProcessingException {
   if (!logger.isDebugEnabled()) {
     return;
   }
   final String planText = plan.unparse(context.getConfig().getMapper().writer());
   logger.debug(name + " : \n" + planText);
 }
Ejemplo n.º 6
0
 /**
  * Converts a logical plan into a physical plan with a {@code BasicOptimizer}.
  *
  * @param plan the logical plan to optimize
  * @return the physical plan produced by the optimizer
  * @throws OptimizerException declared by the optimizer
  */
 private PhysicalPlan convert(final LogicalPlan plan) throws OptimizerException {
   if (logger.isDebugEnabled()) {
     logger.debug("Converting logical plan {}.", plan.toJsonStringSafe(queryContext.getConfig()));
   }
   final BasicOptimizer optimizer = new BasicOptimizer(queryContext, initiatingClient);
   return optimizer.optimize(new BasicOptimizer.BasicOptimizationContext(queryContext), plan);
 }
Ejemplo n.º 7
0
  /**
   * Prepares the root fragment and either starts it or registers it until its input arrives.
   *
   * <p>A fragment context, incoming buffers, a status tracker, an executor and a root fragment
   * manager are created for the fragment. If the buffers report that they are done, the
   * fragment's runnable is handed to {@code bee}; otherwise the fragment manager is registered
   * with the work bus.
   *
   * @param rootFragment the plan fragment to set up
   * @param rootOperator the root operator passed to the fragment executor
   * @throws ExecutionSetupException declared by the setup steps
   */
  private void setupRootFragment(final PlanFragment rootFragment, final FragmentRoot rootOperator)
      throws ExecutionSetupException {
    @SuppressWarnings("resource")
    final FragmentContext fragmentContext =
        new FragmentContext(
            drillbitContext,
            rootFragment,
            queryContext,
            initiatingClient,
            drillbitContext.getFunctionImplementationRegistry());
    @SuppressWarnings("resource")
    final IncomingBuffers incoming = new IncomingBuffers(rootFragment, fragmentContext);
    fragmentContext.setBuffers(incoming);

    queryManager.addFragmentStatusTracker(rootFragment, true);

    // status reports of the root fragment travel over the tunnel to the current endpoint
    final ControlTunnel tunnel =
        drillbitContext.getController().getTunnel(queryContext.getCurrentEndpoint());
    final FragmentStatusReporter statusReporter =
        new FragmentStatusReporter(fragmentContext, tunnel);
    final FragmentExecutor executor =
        new FragmentExecutor(fragmentContext, rootFragment, statusReporter, rootOperator);
    final RootFragmentManager rootManager =
        new RootFragmentManager(rootFragment.getHandle(), incoming, executor);

    if (!incoming.isDone()) {
      // incoming data is still expected: record the fragment manager in the work bus
      drillbitContext.getWorkBus().addFragmentManager(rootManager);
    } else {
      // no incoming data to wait for: start the fragment runner right away
      bee.addFragmentRunner(rootManager.getRunnable());
    }
  }
Ejemplo n.º 8
0
 /**
  * Resumes the query.
  *
  * <p>Whatever the current state is, the resume flag is set and a resume signal is sent to the
  * pauses of the query context and to the executing fragments. Calling this method more than
  * once is allowed.
  */
 public void resume() {
   // remember that a resume was requested
   this.resume = true;

   // pauses held through the query context
   queryContext.getExecutionControls().unpauseAll();

   // pauses held through the fragment contexts
   queryManager.unpauseExecutingFragments(drillbitContext);
 }
Ejemplo n.º 9
0
 /**
  * Parses a JSON physical plan and runs it; a parse failure is reported through {@code fail}.
  *
  * @param json the JSON text of the physical plan
  */
 private void parseAndRunPhysicalPlan(String json) {
   try {
     runPhysicalPlan(context.getPlanReader().readPhysicalPlan(json));
   } catch (IOException parseFailure) {
     fail("Failure while parsing physical plan.", parseFailure);
   }
 }
Ejemplo n.º 10
0
 /**
  * Creates a handler from the given configuration.
  *
  * <p>The planner, context and HEP planner are taken from {@code config}; the target slice size
  * is read from the {@code SLICE_TARGET} option of the context.
  *
  * @param config source of the planner, context and HEP planner
  * @param textPlan pointer kept by the handler for the textual plan
  */
 public DefaultSqlHandler(SqlHandlerConfig config, Pointer<String> textPlan) {
   this.config = config;
   this.textPlan = textPlan;
   this.planner = config.getPlanner();
   this.context = config.getContext();
   this.hepPlanner = config.getHepPlanner();
   this.targetSliceSize =
       this.context.getOptions().getOption(ExecConstants.SLICE_TARGET).num_val;
 }
Ejemplo n.º 11
0
 /**
  * Wraps a physical operator tree into a {@code PhysicalPlan}.
  *
  * <p>The plan properties are: type {@code APACHE_DRILL_PHYSICAL}, version 1, the current option
  * list, result mode {@code EXEC} and this class's simple name as generator.
  *
  * @param op the operator passed to {@code getPops} to obtain the plan's operators
  * @return the assembled physical plan
  */
 protected PhysicalPlan convertToPlan(PhysicalOperator op) {
   final PlanPropertiesBuilder properties = PlanProperties.builder();
   properties.type(PlanType.APACHE_DRILL_PHYSICAL);
   properties.version(1);
   properties.options(new JSONOptions(context.getOptions().getOptionList()));
   properties.resultMode(ResultMode.EXEC);
   properties.generator(this.getClass().getSimpleName(), "");

   final PlanProperties builtProperties = properties.build();
   return new PhysicalPlan(builtProperties, getPops(op));
 }
Ejemplo n.º 12
0
 /**
  * Logs the JSON form of a physical plan at debug level.
  *
  * <p>The plan is only serialized when debug logging is enabled. A serialization failure is
  * logged as a warning and not propagated.
  *
  * @param plan the physical plan to log
  */
 private void log(final PhysicalPlan plan) {
   if (!logger.isDebugEnabled()) {
     return;
   }
   try {
     final String serialized = queryContext.getConfig().getMapper().writeValueAsString(plan);
     logger.debug("Physical {}", serialized);
   } catch (final IOException e) {
     logger.warn("Error while attempting to log physical plan.", e);
   }
 }
Ejemplo n.º 13
0
 /**
  * Creates a parallelizer configured from the options of the given query context.
  *
  * <p>The parallelization threshold is the {@code SLICE_TARGET} option, raised to 1 when the
  * option is zero or negative. The per-node width, global width and affinity factor come from
  * their respective options.
  *
  * @param context the query context whose options are read
  */
 public SimpleParallelizer(QueryContext context) {
   final OptionManager options = context.getOptions();

   final long sliceTarget = options.getOption(ExecConstants.SLICE_TARGET).num_val;
   this.parallelizationThreshold = Math.max(sliceTarget, 1);

   this.maxWidthPerNode =
       options.getOption(ExecConstants.MAX_WIDTH_PER_NODE_KEY).num_val.intValue();
   this.maxGlobalWidth =
       options.getOption(ExecConstants.MAX_WIDTH_GLOBAL_KEY).num_val.intValue();

   // NOTE(review): intValue() drops any fractional part of the float option; confirm that an
   // integral affinity factor is intended
   this.affinityFactor =
       options.getOption(ExecConstants.AFFINITY_FACTOR_KEY).float_val.intValue();
 }
Ejemplo n.º 14
0
 /**
  * Reports a query failure: the error is logged and converted, then a final {@code FAILED}
  * result carrying it is passed to {@code cleanupAndSendResult}.
  *
  * <p>If the query is already finished, an additional error line is logged; the failure result
  * is still built and sent.
  *
  * @param message description of the failure
  * @param t the cause of the failure
  */
 private void fail(String message, Throwable t) {
   if (isFinished()) {
     logger.error("Received a failure message query finished of: {}", message, t);
   }

   final DrillPBError convertedError =
       ErrorHelper.logAndConvertError(context.getCurrentEndpoint(), message, t, logger);

   cleanupAndSendResult(
       QueryResult.newBuilder()
           .addError(convertedError)
           .setIsLastChunk(true)
           .setQueryState(QueryState.FAILED)
           .setQueryId(queryId)
           .build());
 }
Ejemplo n.º 15
0
  /**
   * Limits how many "small" and "large" queries run at the same time, when queueing is enabled.
   *
   * <p>The plan's total operator cost is compared with the queue threshold option to pick the
   * large or the small distributed semaphore, and a lease is then requested from it with the
   * queue timeout. Acquiring blocks the calling thread until a slot is free or the timeout
   * elapses; per the original note this is called under run() and therefore consumes a thread
   * while waiting. Nothing happens when queueing is disabled.
   *
   * @param plan the query plan
   * @throws ForemanSetupException if obtaining the semaphore or the lease throws
   */
  private void acquireQuerySemaphore(final PhysicalPlan plan) throws ForemanSetupException {
    final OptionManager options = queryContext.getOptions();
    if (!options.getOption(ExecConstants.ENABLE_QUEUE)) {
      return;
    }

    final long costThreshold = options.getOption(ExecConstants.QUEUE_THRESHOLD_SIZE);

    // the plan's cost is the sum of the costs of all of its operators
    double planCost = 0;
    for (final PhysicalOperator operator : plan.getSortedOperators()) {
      planCost += operator.getCost();
    }

    final long timeoutMillis = options.getOption(ExecConstants.QUEUE_TIMEOUT);
    final String queueName;

    try {
      @SuppressWarnings("resource")
      final ClusterCoordinator coordinator = drillbitContext.getClusterCoordinator();
      final DistributedSemaphore semaphore;

      // costly plans go to the large queue, everything else to the small one
      if (planCost > costThreshold) {
        final int largeQueueSize = (int) options.getOption(ExecConstants.LARGE_QUEUE_SIZE);
        semaphore = coordinator.getSemaphore("query.large", largeQueueSize);
        queueName = "large";
      } else {
        final int smallQueueSize = (int) options.getOption(ExecConstants.SMALL_QUEUE_SIZE);
        semaphore = coordinator.getSemaphore("query.small", smallQueueSize);
        queueName = "small";
      }

      lease = semaphore.acquire(timeoutMillis, TimeUnit.MILLISECONDS);
    } catch (final Exception e) {
      throw new ForemanSetupException("Unable to acquire slot for query.", e);
    }

    // a null lease is treated as a timeout and reported as a resource error
    if (lease == null) {
      throw UserException.resourceError()
          .message(
              "Unable to acquire queue resources for query within timeout.  Timeout for %s queue was set at %d seconds.",
              queueName, timeoutMillis / 1000)
          .build(logger);
    }
  }
Ejemplo n.º 16
0
  /**
   * Runs the {@code PreProcessLogicalRel} visitor over the tree and returns the visited tree.
   *
   * <p>Per the original notes the traversal has two tasks: (1) replace the convert_from and
   * convert_to functions with their actual implementations, e.g. convert_from(EXPR, 'JSON')
   * becomes convert_fromjson(EXPR) (TODO: ideally all function rewrites would move here instead
   * of DrillOptiq); (2) detect unsupported functions in the tree.
   *
   * @param rel root of the tree to pre-process
   * @return the tree returned by the visitor
   * @throws SqlUnsupportedException presumably raised by {@code visitor.convertException()} when
   *     the traversal hit an unsupported function (confirm against the visitor)
   */
  private RelNode preprocessNode(RelNode rel) throws SqlUnsupportedException {
    final PreProcessLogicalRel preProcessor =
        PreProcessLogicalRel.createVisitor(
            planner.getTypeFactory(), context.getDrillOperatorTable());
    try {
      return rel.accept(preProcessor);
    } catch (UnsupportedOperationException ex) {
      // give the visitor the chance to raise its own exception before rethrowing the original
      preProcessor.convertException();
      throw ex;
    }
  }
Ejemplo n.º 17
0
  /**
   * Fragments, parallelizes and submits a physical plan for execution.
   *
   * <p>Only plans with result mode {@code EXEC} are run; any other mode is reported through
   * {@code fail} and the plan is not executed. Every fragment of the work unit is stored in the
   * cache before the fragments are handed to the fragment manager. Failures while fragmenting or
   * setting up the query are reported through {@code fail} as well.
   *
   * @param plan the physical plan to run
   */
  private void runPhysicalPlan(PhysicalPlan plan) {

    if (plan.getProperties().resultMode != ResultMode.EXEC) {
      fail(
          String.format(
              "Failure running plan.  You requested a result mode of %s and a physical plan can only be output as EXEC",
              plan.getProperties().resultMode),
          new Exception());
      // the failure has been reported; do not go on to run the rejected plan
      return;
    }
    PhysicalOperator rootOperator = plan.getSortedOperators(false).iterator().next();

    MakeFragmentsVisitor makeFragmentsVisitor = new MakeFragmentsVisitor();
    Fragment rootFragment;
    try {
      rootFragment = rootOperator.accept(makeFragmentsVisitor, null);
    } catch (FragmentSetupException e) {
      fail("Failure while fragmenting query.", e);
      return;
    }

    PlanningSet planningSet = StatsCollector.collectStats(rootFragment);
    SimpleParallelizer parallelizer = new SimpleParallelizer();

    try {
      QueryWorkUnit work =
          parallelizer.getFragments(
              context.getCurrentEndpoint(),
              queryId,
              context.getActiveEndpoints(),
              context.getPlanReader(),
              rootFragment,
              planningSet,
              context.getConfig().getInt(ExecConstants.GLOBAL_MAX_WIDTH),
              context.getConfig().getInt(ExecConstants.MAX_WIDTH_PER_ENDPOINT));

      this.context
          .getWorkBus()
          .setFragmentStatusListener(
              work.getRootFragment().getHandle().getQueryId(), fragmentManager);
      List<PlanFragment> leafFragments = Lists.newArrayList();
      List<PlanFragment> intermediateFragments = Lists.newArrayList();

      // store fragments in distributed grid.
      logger.debug("Storing fragments");
      for (PlanFragment f : work.getFragments()) {

        // store all fragments in grid since they are part of handshake.
        context.getCache().storeFragment(f);
        if (f.getLeafFragment()) {
          leafFragments.add(f);
        } else {
          intermediateFragments.add(f);
        }
      }

      logger.debug("Fragments stored.");

      logger.debug("Submitting fragments to run.");
      fragmentManager.runFragments(
          bee,
          work.getRootFragment(),
          work.getRootOperator(),
          initiatingClient,
          leafFragments,
          intermediateFragments);
      logger.debug("Fragments running.");

    } catch (ExecutionSetupException | RpcException e) {
      fail("Failure while setting up query.", e);
    }
  }
Ejemplo n.º 18
0
 /**
  * Converts a logical plan into a physical plan with a {@code BasicOptimizer}.
  *
  * @param plan the logical plan to optimize
  * @return the physical plan produced by the optimizer
  * @throws OptimizerException declared by the optimizer
  */
 private PhysicalPlan convert(LogicalPlan plan) throws OptimizerException {
   if (logger.isDebugEnabled()) {
     logger.debug("Converting logical plan {}.", plan.toJsonStringSafe(context.getConfig()));
   }
   final BasicOptimizer optimizer = new BasicOptimizer(DrillConfig.create(), context);
   return optimizer.optimize(new BasicOptimizer.BasicOptimizationContext(), plan);
 }
Ejemplo n.º 19
0
 /**
  * Returns a physical plan to the client instead of executing it: the plan is serialized to
  * JSON, wrapped in a {@code PhysicalFromLogicalExplain}, and the resulting direct plan is run.
  *
  * @param plan the physical plan to return
  */
 private void returnPhysical(PhysicalPlan plan) {
   final String planJson = plan.unparse(context.getConfig().getMapper().writer());
   final PhysicalFromLogicalExplain explain = new PhysicalFromLogicalExplain(planJson);
   runPhysicalPlan(DirectPlan.createDirectPlan(context, explain));
 }
Ejemplo n.º 20
0
  /**
   * Converts a Drill logical rel tree into a Drill physical rel ({@code Prel}) tree and applies
   * the post-planning rewrites to it.
   *
   * <p>The tree is first planned with the physical rule set. If memory estimation is enabled and
   * the estimate reports too little memory for the plan, hash join and hash aggregation are
   * switched off at query level and the tree is planned a second time. The planned tree then goes
   * through a fixed sequence of rewrites whose order matters (see the numbered comments below).
   *
   * @param drel root of the logical tree; its convention must be {@code DrillRel.DRILL_LOGICAL}
   * @return the rewritten physical rel tree
   * @throws RelConversionException declared by the planning and rewrite steps
   * @throws SqlUnsupportedException declared for unsupported queries; the
   *     {@code UnsupportedRelOperatorException} raised when planning fails and the join check
   *     matches is presumably of this kind
   * @throws IllegalArgumentException if {@code drel} does not have the Drill logical convention
   */
  protected Prel convertToPrel(RelNode drel)
      throws RelConversionException, SqlUnsupportedException {
    Preconditions.checkArgument(drel.getConvention() == DrillRel.DRILL_LOGICAL);
    final RelTraitSet traits =
        drel.getTraitSet().plus(Prel.DRILL_PHYSICAL).plus(DrillDistributionTrait.SINGLETON);
    Prel phyRelNode = transformToFinalizedPrel(drel, traits);

    final OptionManager queryOptions = context.getOptions();

    if (context.getPlannerSettings().isMemoryEstimationEnabled()
        && !MemoryEstimationVisitor.enoughMemory(
            phyRelNode, queryOptions, context.getActiveEndpoints().size())) {
      log("Not enough memory for this plan", phyRelNode, logger);
      logger.debug("Re-planning without hash operations.");

      queryOptions.setOption(
          OptionValue.createBoolean(
              OptionValue.OptionType.QUERY, PlannerSettings.HASHJOIN.getOptionName(), false));
      queryOptions.setOption(
          OptionValue.createBoolean(
              OptionValue.OptionType.QUERY, PlannerSettings.HASHAGG.getOptionName(), false));

      phyRelNode = transformToFinalizedPrel(drel, traits);
    }

    /*  The order of the following transformation is important */

    /*
     * 0.) For select * from join query, we need insert project on top of scan and a top project just
     * under screen operator. The project on top of scan will rename from * to T1*, while the top project
     * will rename T1* to *, before it output the final result. Only the top project will allow
     * duplicate columns, since user could "explicitly" ask for duplicate columns ( select *, col, *).
     * The rest of projects will remove the duplicate column when we generate POP in json format.
     */
    phyRelNode = StarColumnConverter.insertRenameProject(phyRelNode);

    /*
     * 1.)
     * Join might cause naming conflicts from its left and right child.
     * In such case, we have to insert Project to rename the conflicting names.
     */
    phyRelNode = JoinPrelRenameVisitor.insertRenameProject(phyRelNode);

    /*
     * 1.1) Swap left / right for INNER hash join, if left's row count is < (1 + margin) right's row count.
     * We want to have smaller dataset on the right side, since hash table builds on right side.
     */
    if (context.getPlannerSettings().isHashJoinSwapEnabled()) {
      phyRelNode =
          SwapHashJoinVisitor.swapHashJoin(
              phyRelNode,
              Double.valueOf(context.getPlannerSettings().getHashJoinSwapMarginFactor()));
    }

    /*
     * 1.2) Break up all expressions with complex outputs into their own project operations
     */
    phyRelNode =
        phyRelNode.accept(
            new SplitUpComplexExpressions(
                planner.getTypeFactory(),
                context.getDrillOperatorTable(),
                context.getPlannerSettings().functionImplementationRegistry),
            null);

    /*
     * 1.3) Projections that contain reference to flatten are rewritten as Flatten operators followed by Project
     */
    phyRelNode =
        phyRelNode.accept(
            new RewriteProjectToFlatten(planner.getTypeFactory(), context.getDrillOperatorTable()),
            null);

    /*
     * 2.)
     * Since our operators work via names rather than indices, we have to make sure to reorder any
     * output before we return data to the user as we may have accidentally shuffled things.
     * This adds a trivial project to reorder columns prior to output.
     */
    phyRelNode = FinalColumnReorderer.addFinalColumnOrdering(phyRelNode);

    /*
     * 3.)
     * If two fragments are both estimated to be parallelization one, remove the exchange
     * separating them
     */
    phyRelNode = ExcessiveExchangeIdentifier.removeExcessiveEchanges(phyRelNode, targetSliceSize);

    /* 4.)
     * Add ProducerConsumer after each scan if the option is set
     * Use the configured queueSize
     */
    /* DRILL-1617 Disabling ProducerConsumer as it produces incorrect results
    if (context.getOptions().getOption(PlannerSettings.PRODUCER_CONSUMER.getOptionName()).bool_val) {
      long queueSize = context.getOptions().getOption(PlannerSettings.PRODUCER_CONSUMER_QUEUE_SIZE.getOptionName()).num_val;
      phyRelNode = ProducerConsumerPrelVisitor.addProducerConsumerToScans(phyRelNode, (int) queueSize);
    }
    */

    /* 5.)
     * if the client does not support complex types (Map, Repeated)
     * insert a project which would convert them
     */
    if (!context.getSession().isSupportComplexTypes()) {
      logger.debug("Client does not support complex types, add ComplexToJson operator.");
      phyRelNode = ComplexToJsonPrelVisitor.addComplexToJsonPrel(phyRelNode);
    }

    /* 6.)
     * Insert LocalExchange (mux and/or demux) nodes
     */
    phyRelNode = InsertLocalExchangeVisitor.insertLocalExchanges(phyRelNode, queryOptions);

    /* 7.)
     * Next, we add any required selection vector removers given the supported encodings of each
     * operator. This will ultimately move to a new trait but we're managing here for now to avoid
     * introducing new issues in planning before the next release
     */
    phyRelNode = SelectionVectorPrelVisitor.addSelectionRemoversWhereNecessary(phyRelNode);

    /* 8.)
     * Finally, make sure that no rels are repeats.
     * This could happen in the case of querying the same table twice as Optiq may canonicalize these.
     */
    phyRelNode = RelUniqifier.uniqifyGraph(phyRelNode);

    return phyRelNode;
  }

  /**
   * Plans {@code drel} with the physical rule set and runs a {@code PrelFinalizer} over the
   * result.
   *
   * <p>When the planner cannot plan the tree, the planner's message is logged. If the join check
   * on {@code drel} matches, an {@code UnsupportedRelOperatorException} naming a cartesian or
   * inequality join as the possible reason is thrown; otherwise the planner's exception is
   * rethrown.
   *
   * @param drel root of the logical tree to plan
   * @param traits the traits required of the planned tree
   * @return the finalized physical rel
   * @throws RelConversionException declared by the planner transform
   * @throws SqlUnsupportedException declared for the unsupported-join case
   */
  private Prel transformToFinalizedPrel(RelNode drel, RelTraitSet traits)
      throws RelConversionException, SqlUnsupportedException {
    try {
      final RelNode relNode = planner.transform(DrillSqlWorker.PHYSICAL_MEM_RULES, traits, drel);
      return (Prel) relNode.accept(new PrelFinalizer());
    } catch (RelOptPlanner.CannotPlanException ex) {
      logger.error(ex.getMessage());

      if (JoinUtils.checkCartesianJoin(drel, new ArrayList<Integer>(), new ArrayList<Integer>())) {
        throw new UnsupportedRelOperatorException(
            "This query cannot be planned possibly due to either a cartesian join or an inequality join");
      } else {
        throw ex;
      }
    }
  }
Ejemplo n.º 21
0
 /**
  * Returns a physical plan to the client instead of executing it: the plan is serialized to
  * JSON, wrapped in a {@code PhysicalFromLogicalExplain}, and the resulting direct plan is run.
  *
  * @param plan the physical plan to return
  * @throws ExecutionSetupException declared for running the direct plan
  */
 private void returnPhysical(final PhysicalPlan plan) throws ExecutionSetupException {
   final String planJson = plan.unparse(queryContext.getConfig().getMapper().writer());
   final PhysicalFromLogicalExplain explain = new PhysicalFromLogicalExplain(planJson);
   runPhysicalPlan(DirectPlan.createDirectPlan(queryContext, explain));
 }
Ejemplo n.º 22
0
  /**
   * Sets up the non-root fragments for execution; some may be local and some remote. Messages
   * are sent immediately, so fragments may start returning data before this method completes.
   *
   * <p>Intermediate fragments are sent first and their acknowledgements are awaited, so that they
   * are set up by the time the leaf fragments start producing data. Leaf fragments are sent
   * afterwards without waiting.
   *
   * @param fragments the fragments
   * @throws ForemanException declared for the checked injection point before the leaf fragments
   *     are sent
   */
  private void setupNonRootFragments(final Collection<PlanFragment> fragments)
      throws ForemanException {
    /*
     * Each endpoint gets a single message, no matter how many fragments run there, and the
     * intermediate fragments have to be started before the leaves. Both needs are met by
     * sorting the fragments into two maps, by leaf/intermediate state and by target drillbit.
     */
    final Multimap<DrillbitEndpoint, PlanFragment> leavesByEndpoint = ArrayListMultimap.create();
    final Multimap<DrillbitEndpoint, PlanFragment> intermediatesByEndpoint =
        ArrayListMultimap.create();

    // every fragment gets a status tracker, whichever map it ends up in
    for (final PlanFragment fragment : fragments) {
      logger.trace(
          "Tracking intermediate remote node {} with data {}",
          fragment.getAssignment(),
          fragment.getFragmentJson());
      queryManager.addFragmentStatusTracker(fragment, false);

      final Multimap<DrillbitEndpoint, PlanFragment> destination =
          fragment.getLeafFragment() ? leavesByEndpoint : intermediatesByEndpoint;
      destination.put(fragment.getAssignment(), fragment);
    }

    /*
     * The latch is used to wait for the responses to the intermediate submissions.
     *
     * So that a failed RPC request cannot hang the process, the latch is always counted down
     * (see FragmentSubmitFailures); failures are collected separately so that we still learn
     * whether any submission failed.
     */
    final int endpointCount = intermediatesByEndpoint.keySet().size();
    final ExtendedLatch responseLatch = new ExtendedLatch(endpointCount);
    final FragmentSubmitFailures submitFailures = new FragmentSubmitFailures();

    // send the intermediate fragments, one message per endpoint
    for (final DrillbitEndpoint endpoint : intermediatesByEndpoint.keySet()) {
      sendRemoteFragments(
          endpoint, intermediatesByEndpoint.get(endpoint), responseLatch, submitFailures);
    }

    final long timeout = RPC_WAIT_IN_MSECS_PER_FRAGMENT * endpointCount;
    if (endpointCount > 0 && !responseLatch.awaitUninterruptibly(timeout)) {
      final long stillPending = responseLatch.getCount();
      throw UserException.connectionError()
          .message(
              "Exceeded timeout (%d) while waiting send intermediate work fragments to remote nodes. "
                  + "Sent %d and only heard response back from %d nodes.",
              timeout, endpointCount, endpointCount - stillPending)
          .build(logger);
    }

    // a failed intermediate submission fails the whole query
    final List<FragmentSubmitFailures.SubmissionException> failures =
        submitFailures.submissionExceptions;
    if (!failures.isEmpty()) {
      // list each failing endpoint's address once, in the order the failures were recorded
      final Set<DrillbitEndpoint> reported = Sets.newHashSet();
      final StringBuilder failedNodes = new StringBuilder();

      for (final FragmentSubmitFailures.SubmissionException failure : failures) {
        final DrillbitEndpoint endpoint = failure.drillbitEndpoint;
        if (!reported.add(endpoint)) {
          continue;
        }
        if (reported.size() > 1) {
          failedNodes.append(", ");
        }
        failedNodes.append(endpoint.getAddress());
      }
      throw UserException.connectionError(failures.get(0).rpcException)
          .message("Error setting up remote intermediate fragment execution")
          .addContext("Nodes with failures", failedNodes.toString())
          .build(logger);
    }

    injector.injectChecked(
        queryContext.getExecutionControls(), "send-fragments", ForemanException.class);

    /*
     * Send the remote (leaf) fragments without waiting for them. Any problems will come in
     * through the regular sendListener event delivery.
     */
    for (final DrillbitEndpoint endpoint : leavesByEndpoint.keySet()) {
      sendRemoteFragments(endpoint, leavesByEndpoint.get(endpoint), null, null);
    }
  }
Ejemplo n.º 23
0
 /**
  * Logs the unparsed form of a logical plan at debug level; the plan is only unparsed when
  * debug logging is enabled.
  *
  * @param plan the logical plan to log
  */
 private void log(final LogicalPlan plan) {
   if (!logger.isDebugEnabled()) {
     return;
   }
   logger.debug("Logical {}", plan.unparse(queryContext.getConfig()));
 }
Ejemplo n.º 24
0
  /**
   * Called by execution pool to do query setup, and kick off remote execution.
   *
   * <p>The request is dispatched on its type (logical plan, physical plan or SQL). Any failure
   * during setup moves the query to {@code QueryState.FAILED}; the only exception is an
   * {@code OutOfMemoryError} that is not about direct buffer memory, which terminates the JVM.
   *
   * <p>Note that completion of this function is not the end of the Foreman's role in the query's
   * lifecycle.
   */
  @Override
  public void run() {
    // rename the thread we're using for debugging purposes
    final Thread currentThread = Thread.currentThread();
    final String originalName = currentThread.getName();
    currentThread.setName(QueryIdHelper.getQueryId(queryId) + ":foreman");

    // track how long the query takes
    queryManager.markStartTime();

    try {
      // injection point at the very beginning of setup
      injector.injectChecked(
          queryContext.getExecutionControls(), "run-try-beginning", ForemanException.class);
      queryText = queryRequest.getPlan();

      // convert a run query request into action
      switch (queryRequest.getType()) {
        case LOGICAL:
          parseAndRunLogicalPlan(queryRequest.getPlan());
          break;
        case PHYSICAL:
          parseAndRunPhysicalPlan(queryRequest.getPlan());
          break;
        case SQL:
          runSQL(queryRequest.getPlan());
          break;
        default:
          throw new IllegalStateException();
      }
      // injection point at the end of a successful setup
      injector.injectChecked(
          queryContext.getExecutionControls(), "run-try-end", ForemanException.class);
    } catch (final OutOfMemoryException | OutOfMemoryRuntimeException e) {
      // out-of-memory exceptions are reported to the user as a memory error
      moveToState(QueryState.FAILED, UserException.memoryError(e).build(logger));
    } catch (final ForemanException e) {
      moveToState(QueryState.FAILED, e);
    } catch (AssertionError | Exception ex) {
      // anything else unexpected is wrapped so that the query fails with a ForemanException
      moveToState(
          QueryState.FAILED,
          new ForemanException(
              "Unexpected exception during fragment initialization: " + ex.getMessage(), ex));
    } catch (final OutOfMemoryError e) {
      // an error with exactly this message fails only the query; any other one exits the JVM
      if ("Direct buffer memory".equals(e.getMessage())) {
        moveToState(
            QueryState.FAILED,
            UserException.resourceError(e)
                .message("One or more nodes ran out of memory while executing the query.")
                .build(logger));
      } else {
        /*
         * FragmentExecutors use a DrillbitStatusListener to watch out for the death of their query's Foreman. So, if we
         * die here, they should get notified about that, and cancel themselves; we don't have to attempt to notify
         * them, which might not work under these conditions.
         */
        System.out.println("Node ran out of Heap memory, exiting.");
        e.printStackTrace();
        System.out.flush();
        System.exit(-1);
      }

    } finally {
      /*
       * Begin accepting external events.
       *
       * Doing this here in the finally clause will guarantee that it occurs. Otherwise, if there
       * is an exception anywhere during setup, it wouldn't occur, and any events that are generated
       * as a result of any partial setup that was done (such as the FragmentSubmitListener,
       * the ResponseSendListener, or an external call to cancel()), will hang the thread that makes the
       * event delivery call.
       *
       * If we do throw an exception during setup, and have already moved to QueryState.FAILED, we just need to
       * make sure that we can't make things any worse as those events are delivered, but allow
       * any necessary remaining cleanup to proceed.
       *
       * Note that cancellations cannot be simulated before this point (pauses can be injected, though),
       * because Foreman would wait on the cancelling thread to signal a resume and the cancelling
       * thread would wait on the Foreman to accept events.
       */
      acceptExternalEvents.countDown();

      // If we received the resume signal before the fragments were set up, that first call did
      // not actually resume the fragments. Since setup is done, all fragments must have been
      // delivered to remote nodes. Now we can resume.
      if (resume) {
        resume();
      }
      injector.injectPause(queryContext.getExecutionControls(), "foreman-ready", logger);

      // restore the thread's original name
      currentThread.setName(originalName);
    }

    /*
     * Note that despite the run() completing, the Foreman continues to exist, and receives
     * events (indirectly, through the QueryManager's use of stateListener), about fragment
     * completions. It won't go away until everything is completed, failed, or cancelled.
     */
  }