Example #1
0
    void sendFailure(UserRpcException e) {
      sendOnce();
      UserException uex = UserException.systemError(e).addIdentity(e.getEndpoint()).build(logger);

      logger.error("Unexpected Error while handling request message", e);

      OutboundRpcMessage outMessage =
          new OutboundRpcMessage(
              RpcMode.RESPONSE_FAILURE, 0, coordinationId, uex.getOrCreatePBError(false));

      if (RpcConstants.EXTRA_DEBUGGING) {
        logger.debug("Adding message to outbound buffer. {}", outMessage);
      }
      connection.getChannel().writeAndFlush(outMessage);
    }
Example #2
0
  @Override
  public void dataArrived(QueryDataBatch result, ConnectionThrottle throttle) {
    final QueryData header = result.getHeader();
    final DrillBuf data = result.getData();

    if (data != null) {
      count.addAndGet(header.getRowCount());
      try {
        loader.load(header.getDef(), data);
        // TODO:  Clean:  DRILL-2933:  That load(...) no longer throws
        // SchemaChangeException, so check/clean catch clause below.
      } catch (SchemaChangeException e) {
        submissionFailed(UserException.systemError(e).build(logger));
      }

      switch (format) {
        case TABLE:
          VectorUtil.showVectorAccessibleContent(loader, columnWidth);
          break;
        case TSV:
          VectorUtil.showVectorAccessibleContent(loader, "\t");
          break;
        case CSV:
          VectorUtil.showVectorAccessibleContent(loader, ",");
          break;
      }
      loader.clear();
    }

    result.release();
  }
Example #3
0
 public int await() throws Exception {
   latch.await();
   if (exception != null) {
     exception.addSuppressed(new DrillRuntimeException("Exception in executor threadpool"));
     throw exception;
   }
   return count.get();
 }
 @Override
 public void dropTable(String tableName) {
   try {
     plugin.getClient().deleteTable(tableName);
   } catch (Exception e) {
     throw UserException.dataWriteError(e)
         .message("Failure while trying to drop table '%s'.", tableName)
         .addContext("plugin", name)
         .build(logger);
   }
 }
Example #5
0
 @Override
 public void validate(OptionValue v) {
   super.validate(v);
   try {
     CompilerPolicy.valueOf(v.string_val.toUpperCase());
   } catch (IllegalArgumentException e) {
     throw UserException.validationError()
         .message(
             "Invalid value '%s' specified for option '%s'. Valid values are %s.",
             v.string_val, getOptionName(), Arrays.toString(CompilerPolicy.values()))
         .build(logger);
   }
 }
Example #6
0
 private void sendFinalState() {
   final FragmentState outcome = fragmentState.get();
   if (outcome == FragmentState.FAILED) {
     final FragmentHandle handle = getContext().getHandle();
     final UserException uex =
         UserException.systemError(deferredException.getAndClear())
             .addIdentity(getContext().getIdentity())
             .addContext(
                 "Fragment", handle.getMajorFragmentId() + ":" + handle.getMinorFragmentId())
             .build(logger);
     statusReporter.fail(uex);
   } else {
     statusReporter.stateChanged(outcome);
   }
 }
Example #7
0
  private void initCols(Schema schema) throws SchemaChangeException {
    ImmutableList.Builder<ProjectedColumnInfo> pciBuilder = ImmutableList.builder();

    for (int i = 0; i < schema.getColumnCount(); i++) {
      ColumnSchema col = schema.getColumnByIndex(i);

      final String name = col.getName();
      final Type kuduType = col.getType();
      MinorType minorType = TYPES.get(kuduType);
      if (minorType == null) {
        logger.warn(
            "Ignoring column that is unsupported.",
            UserException.unsupportedError()
                .message(
                    "A column you queried has a data type that is not currently supported by the Kudu storage plugin. "
                        + "The column's name was %s and its Kudu data type was %s. ",
                    name, kuduType.toString())
                .addContext("column Name", name)
                .addContext("plugin", "kudu")
                .build(logger));

        continue;
      }
      MajorType majorType;
      if (col.isNullable()) {
        majorType = Types.optional(minorType);
      } else {
        majorType = Types.required(minorType);
      }
      MaterializedField field = MaterializedField.create(name, majorType);
      final Class<? extends ValueVector> clazz =
          (Class<? extends ValueVector>)
              TypeHelper.getValueVectorClass(minorType, majorType.getMode());
      ValueVector vector = output.addField(field, clazz);
      vector.allocateNew();

      ProjectedColumnInfo pci = new ProjectedColumnInfo();
      pci.vv = vector;
      pci.kuduColumn = col;
      pci.index = i;
      pciBuilder.add(pci);
    }

    projectedCols = pciBuilder.build();
  }
Example #8
0
 private SelectionVector2 newSV2() throws OutOfMemoryException, InterruptedException {
   SelectionVector2 sv2 = new SelectionVector2(oAllocator);
   if (!sv2.allocateNewSafe(incoming.getRecordCount())) {
     try {
       final BatchGroup merged = mergeAndSpill(batchGroups);
       if (merged != null) {
         spilledBatchGroups.add(merged);
       } else {
         throw UserException.memoryError(
                 "Unable to allocate sv2 for %d records, and not enough batchGroups to spill.",
                 incoming.getRecordCount())
             .addContext("batchGroups.size", batchGroups.size())
             .addContext("spilledBatchGroups.size", spilledBatchGroups.size())
             .addContext("allocated memory", oAllocator.getAllocatedMemory())
             .addContext("allocator limit", oAllocator.getLimit())
             .build(logger);
       }
     } catch (SchemaChangeException e) {
       throw new RuntimeException(e);
     }
     int waitTime = 1;
     while (true) {
       try {
         Thread.sleep(waitTime * 1000);
       } catch (final InterruptedException e) {
         if (!context.shouldContinue()) {
           throw e;
         }
       }
       waitTime *= 2;
       if (sv2.allocateNewSafe(incoming.getRecordCount())) {
         break;
       }
       if (waitTime >= 32) {
         throw new OutOfMemoryException("Unable to allocate sv2 buffer after repeated attempts");
       }
     }
   }
   for (int i = 0; i < incoming.getRecordCount(); i++) {
     sv2.setIndex(i, (char) i);
   }
   sv2.setRecordCount(incoming.getRecordCount());
   return sv2;
 }
  /**
   * Generates the next record batch
   *
   * @return number of records in the batch
   */
  @Override
  public int next() {
    reader.resetForNextBatch();
    int cnt = 0;

    try {
      while (cnt < MAX_RECORDS_PER_BATCH && reader.parseNext()) {
        cnt++;
      }
      reader.finishBatch();
      return cnt;
    } catch (IOException | TextParsingException e) {
      throw UserException.dataReadError(e)
          .addContext(
              "Failure while reading file %s. Happened at or shortly before byte position %d.",
              split.getPath(), reader.getPos())
          .build(logger);
    }
  }
Example #10
0
  /**
   * This limits the number of "small" and "large" queries that a Drill cluster will run
   * simultaneously, if queueing is enabled. If the query is unable to run, this will block until it
   * can. Beware that this is called under run(), and so will consume a Thread while it waits for
   * the required distributed semaphore.
   *
   * @param plan the query plan
   * @throws ForemanSetupException
   */
  private void acquireQuerySemaphore(final PhysicalPlan plan) throws ForemanSetupException {
    final OptionManager optionManager = queryContext.getOptions();
    final boolean queuingEnabled = optionManager.getOption(ExecConstants.ENABLE_QUEUE);
    if (queuingEnabled) {
      final long queueThreshold = optionManager.getOption(ExecConstants.QUEUE_THRESHOLD_SIZE);
      double totalCost = 0;
      for (final PhysicalOperator ops : plan.getSortedOperators()) {
        totalCost += ops.getCost();
      }

      final long queueTimeout = optionManager.getOption(ExecConstants.QUEUE_TIMEOUT);
      final String queueName;

      try {
        @SuppressWarnings("resource")
        final ClusterCoordinator clusterCoordinator = drillbitContext.getClusterCoordinator();
        final DistributedSemaphore distributedSemaphore;

        // get the appropriate semaphore
        if (totalCost > queueThreshold) {
          final int largeQueue = (int) optionManager.getOption(ExecConstants.LARGE_QUEUE_SIZE);
          distributedSemaphore = clusterCoordinator.getSemaphore("query.large", largeQueue);
          queueName = "large";
        } else {
          final int smallQueue = (int) optionManager.getOption(ExecConstants.SMALL_QUEUE_SIZE);
          distributedSemaphore = clusterCoordinator.getSemaphore("query.small", smallQueue);
          queueName = "small";
        }

        lease = distributedSemaphore.acquire(queueTimeout, TimeUnit.MILLISECONDS);
      } catch (final Exception e) {
        throw new ForemanSetupException("Unable to acquire slot for query.", e);
      }

      if (lease == null) {
        throw UserException.resourceError()
            .message(
                "Unable to acquire queue resources for query within timeout.  Timeout for %s queue was set at %d seconds.",
                queueName, queueTimeout / 1000)
            .build(logger);
      }
    }
  }
  /**
   * Performs the initial setup required for the record reader. Initializes the input stream,
   * handling of the output record batch and the actual reader to be used.
   *
   * @param context operator context from which buffer's will be allocated and managed
   * @param outputMutator Used to create the schema in the output record batch
   * @throws ExecutionSetupException
   */
  @Override
  public void setup(OperatorContext context, OutputMutator outputMutator)
      throws ExecutionSetupException {

    oContext = context;
    readBuffer = context.getManagedBuffer(READ_BUFFER);
    whitespaceBuffer = context.getManagedBuffer(WHITE_SPACE_BUFFER);

    // setup Output, Input, and Reader
    try {
      TextOutput output = null;
      TextInput input = null;
      InputStream stream = null;

      // setup Output using OutputMutator
      if (settings.isHeaderExtractionEnabled()) {
        // extract header and use that to setup a set of VarCharVectors
        String[] fieldNames = extractHeader();
        output = new FieldVarCharOutput(outputMutator, fieldNames, getColumns(), isStarQuery());
      } else {
        // simply use RepeatedVarCharVector
        output = new RepeatedVarCharOutput(outputMutator, getColumns(), isStarQuery());
      }

      // setup Input using InputStream
      stream = dfs.openPossiblyCompressedStream(split.getPath());
      input =
          new TextInput(
              settings, stream, readBuffer, split.getStart(), split.getStart() + split.getLength());

      // setup Reader using Input and Output
      reader = new TextReader(settings, input, output, whitespaceBuffer);
      reader.start();

    } catch (SchemaChangeException | IOException e) {
      throw new ExecutionSetupException(
          String.format("Failure while setting up text reader for file %s", split.getPath()), e);
    } catch (IllegalArgumentException e) {
      throw UserException.dataReadError(e)
          .addContext("File Path", split.getPath().toString())
          .build(logger);
    }
  }
Example #12
0
    @Override
    public void close() {
      Preconditions.checkState(!isClosed);
      Preconditions.checkState(resultState != null);

      logger.info("foreman cleaning up.");
      injector.injectPause(queryContext.getExecutionControls(), "foreman-cleanup", logger);

      // remove the channel disconnected listener (doesn't throw)
      closeFuture.removeListener(closeListener);

      // log the query summary
      logQuerySummary();

      // These are straight forward removals from maps, so they won't throw.
      drillbitContext.getWorkBus().removeFragmentStatusListener(queryId);
      drillbitContext
          .getClusterCoordinator()
          .removeDrillbitStatusListener(queryManager.getDrillbitStatusListener());

      suppressingClose(queryContext);

      /*
       * We do our best to write the latest state, but even that could fail. If it does, we can't write
       * the (possibly newly failing) state, so we continue on anyway.
       *
       * We only need to do this if the resultState differs from the last recorded state
       */
      if (resultState != state) {
        suppressingClose(
            new AutoCloseable() {
              @Override
              public void close() throws Exception {
                recordNewState(resultState);
              }
            });
      }

      /*
       * Construct the response based on the latest resultState. The builder shouldn't fail.
       */
      final QueryResult.Builder resultBuilder =
          QueryResult.newBuilder().setQueryId(queryId).setQueryState(resultState);
      final UserException uex;
      if (resultException != null) {
        final boolean verbose =
            queryContext.getOptions().getOption(ExecConstants.ENABLE_VERBOSE_ERRORS_KEY).bool_val;
        uex =
            UserException.systemError(resultException)
                .addIdentity(queryContext.getCurrentEndpoint())
                .build(logger);
        resultBuilder.addError(uex.getOrCreatePBError(verbose));
      } else {
        uex = null;
      }

      // we store the final result here so we can capture any error/errorId in the profile for later
      // debugging.
      queryManager.writeFinalProfile(uex);

      /*
       * If sending the result fails, we don't really have any way to modify the result we tried to send;
       * it is possible it got sent but the result came from a later part of the code path. It is also
       * possible the connection has gone away, so this is irrelevant because there's nowhere to
       * send anything to.
       */
      try {
        // send whatever result we ended up with
        initiatingClient.sendResult(responseListener, resultBuilder.build(), true);
      } catch (final Exception e) {
        addException(e);
        logger.warn("Exception sending result to client", resultException);
      }

      // Remove the Foreman from the running query list.
      bee.retireForeman(Foreman.this);

      try {
        releaseLease();
      } finally {
        isClosed = true;
      }
    }
Example #13
0
  /**
   * Called by execution pool to do query setup, and kick off remote execution.
   *
   * <p>Note that completion of this function is not the end of the Foreman's role in the query's
   * lifecycle.
   */
  @Override
  public void run() {
    // rename the thread we're using for debugging purposes
    final Thread currentThread = Thread.currentThread();
    final String originalName = currentThread.getName();
    currentThread.setName(QueryIdHelper.getQueryId(queryId) + ":foreman");

    // track how long the query takes
    queryManager.markStartTime();

    try {
      injector.injectChecked(
          queryContext.getExecutionControls(), "run-try-beginning", ForemanException.class);
      queryText = queryRequest.getPlan();

      // convert a run query request into action
      switch (queryRequest.getType()) {
        case LOGICAL:
          parseAndRunLogicalPlan(queryRequest.getPlan());
          break;
        case PHYSICAL:
          parseAndRunPhysicalPlan(queryRequest.getPlan());
          break;
        case SQL:
          runSQL(queryRequest.getPlan());
          break;
        default:
          throw new IllegalStateException();
      }
      injector.injectChecked(
          queryContext.getExecutionControls(), "run-try-end", ForemanException.class);
    } catch (final OutOfMemoryException | OutOfMemoryRuntimeException e) {
      moveToState(QueryState.FAILED, UserException.memoryError(e).build(logger));
    } catch (final ForemanException e) {
      moveToState(QueryState.FAILED, e);
    } catch (AssertionError | Exception ex) {
      moveToState(
          QueryState.FAILED,
          new ForemanException(
              "Unexpected exception during fragment initialization: " + ex.getMessage(), ex));
    } catch (final OutOfMemoryError e) {
      if ("Direct buffer memory".equals(e.getMessage())) {
        moveToState(
            QueryState.FAILED,
            UserException.resourceError(e)
                .message("One or more nodes ran out of memory while executing the query.")
                .build(logger));
      } else {
        /*
         * FragmentExecutors use a DrillbitStatusListener to watch out for the death of their query's Foreman. So, if we
         * die here, they should get notified about that, and cancel themselves; we don't have to attempt to notify
         * them, which might not work under these conditions.
         */
        System.out.println("Node ran out of Heap memory, exiting.");
        e.printStackTrace();
        System.out.flush();
        System.exit(-1);
      }

    } finally {
      /*
       * Begin accepting external events.
       *
       * Doing this here in the finally clause will guarantee that it occurs. Otherwise, if there
       * is an exception anywhere during setup, it wouldn't occur, and any events that are generated
       * as a result of any partial setup that was done (such as the FragmentSubmitListener,
       * the ResponseSendListener, or an external call to cancel()), will hang the thread that makes the
       * event delivery call.
       *
       * If we do throw an exception during setup, and have already moved to QueryState.FAILED, we just need to
       * make sure that we can't make things any worse as those events are delivered, but allow
       * any necessary remaining cleanup to proceed.
       *
       * Note that cancellations cannot be simulated before this point, i.e. pauses can be injected, because Foreman
       * would wait on the cancelling thread to signal a resume and the cancelling thread would wait on the Foreman
       * to accept events.
       */
      acceptExternalEvents.countDown();

      // If we received the resume signal before fragments are setup, the first call does not
      // actually resume the
      // fragments. Since setup is done, all fragments must have been delivered to remote nodes. Now
      // we can resume.
      if (resume) {
        resume();
      }
      injector.injectPause(queryContext.getExecutionControls(), "foreman-ready", logger);

      // restore the thread's original name
      currentThread.setName(originalName);
    }

    /*
     * Note that despite the run() completing, the Foreman continues to exist, and receives
     * events (indirectly, through the QueryManager's use of stateListener), about fragment
     * completions. It won't go away until everything is completed, failed, or cancelled.
     */
  }
Example #14
0
  /**
   * Set up the non-root fragments for execution. Some may be local, and some may be remote.
   * Messages are sent immediately, so they may start returning data even before we complete this.
   *
   * @param fragments the fragments
   * @throws ForemanException
   */
  private void setupNonRootFragments(final Collection<PlanFragment> fragments)
      throws ForemanException {
    /*
     * We will send a single message to each endpoint, regardless of how many fragments will be
     * executed there. We need to start up the intermediate fragments first so that they will be
     * ready once the leaf fragments start producing data. To satisfy both of these, we will
     * make a pass through the fragments and put them into these two maps according to their
     * leaf/intermediate state, as well as their target drillbit.
     */
    final Multimap<DrillbitEndpoint, PlanFragment> leafFragmentMap = ArrayListMultimap.create();
    final Multimap<DrillbitEndpoint, PlanFragment> intFragmentMap = ArrayListMultimap.create();

    // record all fragments for status purposes.
    for (final PlanFragment planFragment : fragments) {
      logger.trace(
          "Tracking intermediate remote node {} with data {}",
          planFragment.getAssignment(),
          planFragment.getFragmentJson());
      queryManager.addFragmentStatusTracker(planFragment, false);
      if (planFragment.getLeafFragment()) {
        leafFragmentMap.put(planFragment.getAssignment(), planFragment);
      } else {
        intFragmentMap.put(planFragment.getAssignment(), planFragment);
      }
    }

    /*
     * We need to wait for the intermediates to be sent so that they'll be set up by the time
     * the leaves start producing data. We'll use this latch to wait for the responses.
     *
     * However, in order not to hang the process if any of the RPC requests fails, we always
     * count down (see FragmentSubmitFailures), but we count the number of failures so that we'll
     * know if any submissions did fail.
     */
    final int numIntFragments = intFragmentMap.keySet().size();
    final ExtendedLatch endpointLatch = new ExtendedLatch(numIntFragments);
    final FragmentSubmitFailures fragmentSubmitFailures = new FragmentSubmitFailures();

    // send remote intermediate fragments
    for (final DrillbitEndpoint ep : intFragmentMap.keySet()) {
      sendRemoteFragments(ep, intFragmentMap.get(ep), endpointLatch, fragmentSubmitFailures);
    }

    final long timeout = RPC_WAIT_IN_MSECS_PER_FRAGMENT * numIntFragments;
    if (numIntFragments > 0 && !endpointLatch.awaitUninterruptibly(timeout)) {
      long numberRemaining = endpointLatch.getCount();
      throw UserException.connectionError()
          .message(
              "Exceeded timeout (%d) while waiting send intermediate work fragments to remote nodes. "
                  + "Sent %d and only heard response back from %d nodes.",
              timeout, numIntFragments, numIntFragments - numberRemaining)
          .build(logger);
    }

    // if any of the intermediate fragment submissions failed, fail the query
    final List<FragmentSubmitFailures.SubmissionException> submissionExceptions =
        fragmentSubmitFailures.submissionExceptions;
    if (submissionExceptions.size() > 0) {
      Set<DrillbitEndpoint> endpoints = Sets.newHashSet();
      StringBuilder sb = new StringBuilder();
      boolean first = true;

      for (FragmentSubmitFailures.SubmissionException e :
          fragmentSubmitFailures.submissionExceptions) {
        DrillbitEndpoint endpoint = e.drillbitEndpoint;
        if (endpoints.add(endpoint)) {
          if (first) {
            first = false;
          } else {
            sb.append(", ");
          }
          sb.append(endpoint.getAddress());
        }
      }
      throw UserException.connectionError(submissionExceptions.get(0).rpcException)
          .message("Error setting up remote intermediate fragment execution")
          .addContext("Nodes with failures", sb.toString())
          .build(logger);
    }

    injector.injectChecked(
        queryContext.getExecutionControls(), "send-fragments", ForemanException.class);
    /*
     * Send the remote (leaf) fragments; we don't wait for these. Any problems will come in through
     * the regular sendListener event delivery.
     */
    for (final DrillbitEndpoint ep : leafFragmentMap.keySet()) {
      sendRemoteFragments(ep, leafFragmentMap.get(ep), null, null);
    }
  }
Example #15
0
 private static void throwAsUnsupportedException(UserException ex) throws Exception {
   SqlUnsupportedException.errorClassNameToException(
       ex.getOrCreatePBError(false).getException().getExceptionClass());
   throw ex;
 }
Example #16
0
  @Override
  public IterOutcome innerNext() {
    if (schema != null) {
      if (spillCount == 0) {
        return (getSelectionVector4().next()) ? IterOutcome.OK : IterOutcome.NONE;
      } else {
        Stopwatch w = Stopwatch.createStarted();
        int count = copier.next(targetRecordCount);
        if (count > 0) {
          long t = w.elapsed(TimeUnit.MICROSECONDS);
          logger.debug("Took {} us to merge {} records", t, count);
          container.setRecordCount(count);
          return IterOutcome.OK;
        } else {
          logger.debug("copier returned 0 records");
          return IterOutcome.NONE;
        }
      }
    }

    int totalCount = 0;
    int totalBatches = 0; // total number of batches received so far

    try {
      container.clear();
      outer:
      while (true) {
        IterOutcome upstream;
        if (first) {
          upstream = IterOutcome.OK_NEW_SCHEMA;
        } else {
          upstream = next(incoming);
        }
        if (upstream == IterOutcome.OK && sorter == null) {
          upstream = IterOutcome.OK_NEW_SCHEMA;
        }
        switch (upstream) {
          case NONE:
            if (first) {
              return upstream;
            }
            break outer;
          case NOT_YET:
            throw new UnsupportedOperationException();
          case STOP:
            return upstream;
          case OK_NEW_SCHEMA:
          case OK:
            VectorContainer convertedBatch;
            // only change in the case that the schema truly changes.  Artificial schema changes are
            // ignored.
            if (upstream == IterOutcome.OK_NEW_SCHEMA && !incoming.getSchema().equals(schema)) {
              if (schema != null) {
                if (unionTypeEnabled) {
                  this.schema = SchemaUtil.mergeSchemas(schema, incoming.getSchema());
                } else {
                  throw new SchemaChangeException(
                      "Schema changes not supported in External Sort. Please enable Union type");
                }
              } else {
                schema = incoming.getSchema();
              }
              convertedBatch = SchemaUtil.coerceContainer(incoming, schema, oContext);
              for (BatchGroup b : batchGroups) {
                b.setSchema(schema);
              }
              for (BatchGroup b : spilledBatchGroups) {
                b.setSchema(schema);
              }
              this.sorter = createNewSorter(context, convertedBatch);
            } else {
              convertedBatch = SchemaUtil.coerceContainer(incoming, schema, oContext);
            }
            if (first) {
              first = false;
            }
            if (convertedBatch.getRecordCount() == 0) {
              for (VectorWrapper<?> w : convertedBatch) {
                w.clear();
              }
              break;
            }
            SelectionVector2 sv2;
            if (incoming.getSchema().getSelectionVectorMode()
                == BatchSchema.SelectionVectorMode.TWO_BYTE) {
              sv2 = incoming.getSelectionVector2().clone();
            } else {
              try {
                sv2 = newSV2();
              } catch (InterruptedException e) {
                return IterOutcome.STOP;
              } catch (OutOfMemoryException e) {
                throw new OutOfMemoryException(e);
              }
            }

            int count = sv2.getCount();
            totalCount += count;
            totalBatches++;
            sorter.setup(context, sv2, convertedBatch);
            sorter.sort(sv2);
            RecordBatchData rbd = new RecordBatchData(convertedBatch, oAllocator);
            boolean success = false;
            try {
              rbd.setSv2(sv2);
              batchGroups.add(new BatchGroup(rbd.getContainer(), rbd.getSv2(), oContext));
              if (peakNumBatches < batchGroups.size()) {
                peakNumBatches = batchGroups.size();
                stats.setLongStat(Metric.PEAK_BATCHES_IN_MEMORY, peakNumBatches);
              }

              batchesSinceLastSpill++;
              if ( // If we haven't spilled so far, do we have enough memory for MSorter if this
              // turns out to be the last incoming batch?
              (spillCount == 0 && !hasMemoryForInMemorySort(totalCount))
                  ||
                  // If we haven't spilled so far, make sure we don't exceed the maximum number of
                  // batches SV4 can address
                  (spillCount == 0 && totalBatches > Character.MAX_VALUE)
                  ||
                  // TODO(DRILL-4438) - consider setting this threshold more intelligently,
                  // lowering caused a failing low memory condition (test in
                  // BasicPhysicalOpUnitTest)
                  // to complete successfully (although it caused perf decrease as there was more
                  // spilling)

                  // current memory used is more than 95% of memory usage limit of this operator
                  (oAllocator.getAllocatedMemory() > .95 * oAllocator.getLimit())
                  ||
                  // Number of incoming batches (BatchGroups) exceed the limit and number of
                  // incoming batches accumulated
                  // since the last spill exceed the defined limit
                  (batchGroups.size() > SPILL_THRESHOLD
                      && batchesSinceLastSpill >= SPILL_BATCH_GROUP_SIZE)) {

                if (firstSpillBatchCount == 0) {
                  firstSpillBatchCount = batchGroups.size();
                }

                if (spilledBatchGroups.size() > firstSpillBatchCount / 2) {
                  logger.info("Merging spills");
                  final BatchGroup merged = mergeAndSpill(spilledBatchGroups);
                  if (merged != null) {
                    spilledBatchGroups.addFirst(merged);
                  }
                }
                final BatchGroup merged = mergeAndSpill(batchGroups);
                if (merged != null) { // make sure we don't add null to spilledBatchGroups
                  spilledBatchGroups.add(merged);
                  batchesSinceLastSpill = 0;
                }
              }
              success = true;
            } finally {
              if (!success) {
                rbd.clear();
              }
            }
            break;
          case OUT_OF_MEMORY:
            logger.debug("received OUT_OF_MEMORY, trying to spill");
            if (batchesSinceLastSpill > 2) {
              final BatchGroup merged = mergeAndSpill(batchGroups);
              if (merged != null) {
                spilledBatchGroups.add(merged);
                batchesSinceLastSpill = 0;
              }
            } else {
              logger.debug("not enough batches to spill, sending OUT_OF_MEMORY downstream");
              return IterOutcome.OUT_OF_MEMORY;
            }
            break;
          default:
            throw new UnsupportedOperationException();
        }
      }

      if (totalCount == 0) {
        return IterOutcome.NONE;
      }
      if (spillCount == 0) {

        if (builder != null) {
          builder.clear();
          builder.close();
        }
        builder = new SortRecordBatchBuilder(oAllocator);

        for (BatchGroup group : batchGroups) {
          RecordBatchData rbd = new RecordBatchData(group.getContainer(), oAllocator);
          rbd.setSv2(group.getSv2());
          builder.add(rbd);
        }

        builder.build(context, container);
        sv4 = builder.getSv4();
        mSorter = createNewMSorter();
        mSorter.setup(context, oAllocator, getSelectionVector4(), this.container);

        // For testing memory-leak purpose, inject exception after mSorter finishes setup
        injector.injectUnchecked(context.getExecutionControls(), INTERRUPTION_AFTER_SETUP);
        mSorter.sort(this.container);

        // sort may have prematurely exited due to should continue returning false.
        if (!context.shouldContinue()) {
          return IterOutcome.STOP;
        }

        // For testing memory-leak purpose, inject exception after mSorter finishes sorting
        injector.injectUnchecked(context.getExecutionControls(), INTERRUPTION_AFTER_SORT);
        sv4 = mSorter.getSV4();

        container.buildSchema(SelectionVectorMode.FOUR_BYTE);
      } else { // some batches were spilled
        final BatchGroup merged = mergeAndSpill(batchGroups);
        if (merged != null) {
          spilledBatchGroups.add(merged);
        }
        batchGroups.addAll(spilledBatchGroups);
        spilledBatchGroups =
            null; // no need to cleanup spilledBatchGroups, all it's batches are in batchGroups now

        logger.warn(
            "Starting to merge. {} batch groups. Current allocated memory: {}",
            batchGroups.size(),
            oAllocator.getAllocatedMemory());
        VectorContainer hyperBatch = constructHyperBatch(batchGroups);
        createCopier(hyperBatch, batchGroups, container, false);

        int estimatedRecordSize = 0;
        for (VectorWrapper<?> w : batchGroups.get(0)) {
          try {
            estimatedRecordSize += TypeHelper.getSize(w.getField().getType());
          } catch (UnsupportedOperationException e) {
            estimatedRecordSize += 50;
          }
        }
        targetRecordCount =
            Math.min(MAX_BATCH_SIZE, Math.max(1, COPIER_BATCH_MEM_LIMIT / estimatedRecordSize));
        int count = copier.next(targetRecordCount);
        container.buildSchema(SelectionVectorMode.NONE);
        container.setRecordCount(count);
      }

      return IterOutcome.OK_NEW_SCHEMA;

    } catch (SchemaChangeException ex) {
      kill(false);
      context.fail(
          UserException.unsupportedError(ex)
              .message("Sort doesn't currently support sorts with changing schemas")
              .build(logger));
      return IterOutcome.STOP;
    } catch (ClassTransformationException | IOException ex) {
      kill(false);
      context.fail(ex);
      return IterOutcome.STOP;
    } catch (UnsupportedOperationException e) {
      throw new RuntimeException(e);
    }
  }
Example #17
0
  @Override
  public IterOutcome innerNext() {

    // if a special batch has been sent, we have no data in the incoming so exit early
    if (specialBatchSent) {
      return IterOutcome.NONE;
    }

    // this is only called on the first batch. Beyond this, the aggregator manages batches.
    if (aggregator == null || first) {
      IterOutcome outcome;
      if (first && incoming.getRecordCount() > 0) {
        first = false;
        outcome = IterOutcome.OK_NEW_SCHEMA;
      } else {
        outcome = next(incoming);
      }
      logger.debug("Next outcome of {}", outcome);
      switch (outcome) {
        case NONE:
          if (first && popConfig.getKeys().length == 0) {
            // if we have a straight aggregate and empty input batch, we need to handle it in a
            // different way
            constructSpecialBatch();
            first = false;
            // set state to indicate the fact that we have sent a special batch and input is empty
            specialBatchSent = true;
            return IterOutcome.OK;
          }
        case OUT_OF_MEMORY:
        case NOT_YET:
        case STOP:
          return outcome;
        case OK_NEW_SCHEMA:
          if (!createAggregator()) {
            done = true;
            return IterOutcome.STOP;
          }
          break;
        case OK:
          break;
        default:
          throw new IllegalStateException(String.format("unknown outcome %s", outcome));
      }
    }

    AggOutcome out = aggregator.doWork();
    recordCount = aggregator.getOutputCount();
    logger.debug("Aggregator response {}, records {}", out, aggregator.getOutputCount());
    switch (out) {
      case CLEANUP_AND_RETURN:
        if (!first) {
          container.zeroVectors();
        }
        done = true;
        // fall through
      case RETURN_OUTCOME:
        IterOutcome outcome = aggregator.getOutcome();
        if (outcome == IterOutcome.NONE && first) {
          first = false;
          done = true;
          return IterOutcome.OK_NEW_SCHEMA;
        } else if (outcome == IterOutcome.OK && first) {
          outcome = IterOutcome.OK_NEW_SCHEMA;
        } else if (outcome != IterOutcome.OUT_OF_MEMORY) {
          first = false;
        }
        return outcome;
      case UPDATE_AGGREGATOR:
        context.fail(
            UserException.unsupportedError()
                .message("Streaming aggregate does not support schema changes")
                .build(logger));
        close();
        killIncoming(false);
        return IterOutcome.STOP;
      default:
        throw new IllegalStateException(String.format("Unknown state %s.", out));
    }
  }
Example #18
0
  public BatchGroup mergeAndSpill(LinkedList<BatchGroup> batchGroups) throws SchemaChangeException {
    logger.debug("Copier allocator current allocation {}", copierAllocator.getAllocatedMemory());
    logger.debug(
        "mergeAndSpill: starting total size in memory = {}", oAllocator.getAllocatedMemory());
    VectorContainer outputContainer = new VectorContainer();
    List<BatchGroup> batchGroupList = Lists.newArrayList();
    int batchCount = batchGroups.size();
    for (int i = 0; i < batchCount / 2; i++) {
      if (batchGroups.size() == 0) {
        break;
      }
      BatchGroup batch = batchGroups.pollLast();
      assert batch != null : "Encountered a null batch during merge and spill operation";
      batchGroupList.add(batch);
    }

    if (batchGroupList.size() == 0) {
      return null;
    }
    int estimatedRecordSize = 0;
    for (VectorWrapper<?> w : batchGroupList.get(0)) {
      try {
        estimatedRecordSize += TypeHelper.getSize(w.getField().getType());
      } catch (UnsupportedOperationException e) {
        estimatedRecordSize += 50;
      }
    }
    int targetRecordCount = Math.max(1, COPIER_BATCH_MEM_LIMIT / estimatedRecordSize);
    VectorContainer hyperBatch = constructHyperBatch(batchGroupList);
    createCopier(hyperBatch, batchGroupList, outputContainer, true);

    int count = copier.next(targetRecordCount);
    assert count > 0;

    logger.debug(
        "mergeAndSpill: estimated record size = {}, target record count = {}",
        estimatedRecordSize,
        targetRecordCount);

    // 1 output container is kept in memory, so we want to hold on to it and transferClone
    // allows keeping ownership
    VectorContainer c1 = VectorContainer.getTransferClone(outputContainer, oContext);
    c1.buildSchema(BatchSchema.SelectionVectorMode.NONE);
    c1.setRecordCount(count);

    String spillDir = dirs.next();
    Path currSpillPath = new Path(Joiner.on("/").join(spillDir, fileName));
    currSpillDirs.add(currSpillPath);
    String outputFile = Joiner.on("/").join(currSpillPath, spillCount++);
    try {
      fs.deleteOnExit(currSpillPath);
    } catch (IOException e) {
      // since this is meant to be used in a batches's spilling, we don't propagate the exception
      logger.warn("Unable to mark spill directory " + currSpillPath + " for deleting on exit", e);
    }
    stats.setLongStat(Metric.SPILL_COUNT, spillCount);
    BatchGroup newGroup = new BatchGroup(c1, fs, outputFile, oContext);
    try (AutoCloseable a = AutoCloseables.all(batchGroupList)) {
      logger.info("Merging and spilling to {}", outputFile);
      while ((count = copier.next(targetRecordCount)) > 0) {
        outputContainer.buildSchema(BatchSchema.SelectionVectorMode.NONE);
        outputContainer.setRecordCount(count);
        // note that addBatch also clears the outputContainer
        newGroup.addBatch(outputContainer);
      }
      injector.injectChecked(
          context.getExecutionControls(), INTERRUPTION_WHILE_SPILLING, IOException.class);
      newGroup.closeOutputStream();
    } catch (Throwable e) {
      // we only need to cleanup newGroup if spill failed
      try {
        AutoCloseables.close(e, newGroup);
      } catch (Throwable t) {
        /* close() may hit the same IO issue; just ignore */
      }
      throw UserException.resourceError(e)
          .message("External Sort encountered an error while spilling to disk")
          .addContext(e.getMessage() /* more detail */)
          .build(logger);
    } finally {
      hyperBatch.clear();
    }
    logger.debug("mergeAndSpill: final total size in memory = {}", oAllocator.getAllocatedMemory());
    logger.info("Completed spilling to {}", outputFile);
    return newGroup;
  }
Example #19
0
  private StreamingAggregator createAggregatorInternal()
      throws SchemaChangeException, ClassTransformationException, IOException {
    ClassGenerator<StreamingAggregator> cg =
        CodeGenerator.getRoot(
            StreamingAggTemplate.TEMPLATE_DEFINITION, context.getFunctionRegistry());
    container.clear();

    LogicalExpression[] keyExprs = new LogicalExpression[popConfig.getKeys().length];
    LogicalExpression[] valueExprs = new LogicalExpression[popConfig.getExprs().length];
    TypedFieldId[] keyOutputIds = new TypedFieldId[popConfig.getKeys().length];

    ErrorCollector collector = new ErrorCollectorImpl();

    for (int i = 0; i < keyExprs.length; i++) {
      final NamedExpression ne = popConfig.getKeys()[i];
      final LogicalExpression expr =
          ExpressionTreeMaterializer.materialize(
              ne.getExpr(), incoming, collector, context.getFunctionRegistry());
      if (expr == null) {
        continue;
      }
      keyExprs[i] = expr;
      final MaterializedField outputField =
          MaterializedField.create(ne.getRef(), expr.getMajorType());
      final ValueVector vector = TypeHelper.getNewVector(outputField, oContext.getAllocator());
      keyOutputIds[i] = container.add(vector);
    }

    for (int i = 0; i < valueExprs.length; i++) {
      final NamedExpression ne = popConfig.getExprs()[i];
      final LogicalExpression expr =
          ExpressionTreeMaterializer.materialize(
              ne.getExpr(), incoming, collector, context.getFunctionRegistry());
      if (expr instanceof IfExpression) {
        throw UserException.unsupportedError(
                new UnsupportedOperationException(
                    "Union type not supported in aggregate functions"))
            .build(logger);
      }
      if (expr == null) {
        continue;
      }

      final MaterializedField outputField =
          MaterializedField.create(ne.getRef(), expr.getMajorType());
      ValueVector vector = TypeHelper.getNewVector(outputField, oContext.getAllocator());
      TypedFieldId id = container.add(vector);
      valueExprs[i] = new ValueVectorWriteExpression(id, expr, true);
    }

    if (collector.hasErrors()) {
      throw new SchemaChangeException(
          "Failure while materializing expression. " + collector.toErrorString());
    }

    setupIsSame(cg, keyExprs);
    setupIsSameApart(cg, keyExprs);
    addRecordValues(cg, valueExprs);
    outputRecordKeys(cg, keyOutputIds, keyExprs);
    outputRecordKeysPrev(cg, keyOutputIds, keyExprs);

    cg.getBlock("resetValues")._return(JExpr.TRUE);
    getIndex(cg);

    container.buildSchema(SelectionVectorMode.NONE);
    StreamingAggregator agg = context.getImplementationClass(cg);
    agg.setup(oContext, incoming, this);
    return agg;
  }
Example #20
0
  @Override
  public void run() {
    // if a cancel thread has already entered this executor, we have not reason to continue.
    if (!hasCloseoutThread.compareAndSet(false, true)) {
      return;
    }

    final Thread myThread = Thread.currentThread();
    myThreadRef.set(myThread);
    final String originalThreadName = myThread.getName();
    final FragmentHandle fragmentHandle = fragmentContext.getHandle();
    final DrillbitContext drillbitContext = fragmentContext.getDrillbitContext();
    final ClusterCoordinator clusterCoordinator = drillbitContext.getClusterCoordinator();
    final DrillbitStatusListener drillbitStatusListener = new FragmentDrillbitStatusListener();
    final String newThreadName = QueryIdHelper.getExecutorThreadName(fragmentHandle);

    try {

      myThread.setName(newThreadName);

      // if we didn't get the root operator when the executor was created, create it now.
      final FragmentRoot rootOperator =
          this.rootOperator != null
              ? this.rootOperator
              : drillbitContext.getPlanReader().readFragmentOperator(fragment.getFragmentJson());

      root = ImplCreator.getExec(fragmentContext, rootOperator);
      if (root == null) {
        return;
      }

      clusterCoordinator.addDrillbitStatusListener(drillbitStatusListener);
      updateState(FragmentState.RUNNING);

      acceptExternalEvents.countDown();
      injector.injectPause(fragmentContext.getExecutionControls(), "fragment-running", logger);

      final DrillbitEndpoint endpoint = drillbitContext.getEndpoint();
      logger.debug(
          "Starting fragment {}:{} on {}:{}",
          fragmentHandle.getMajorFragmentId(),
          fragmentHandle.getMinorFragmentId(),
          endpoint.getAddress(),
          endpoint.getUserPort());

      final UserGroupInformation queryUserUgi =
          fragmentContext.isImpersonationEnabled()
              ? ImpersonationUtil.createProxyUgi(fragmentContext.getQueryUserName())
              : ImpersonationUtil.getProcessUserUGI();

      queryUserUgi.doAs(
          new PrivilegedExceptionAction<Void>() {
            public Void run() throws Exception {
              injector.injectChecked(
                  fragmentContext.getExecutionControls(), "fragment-execution", IOException.class);
              /*
               * Run the query until root.next returns false OR we no longer need to continue.
               */
              while (shouldContinue() && root.next()) {
                // loop
              }

              return null;
            }
          });

    } catch (OutOfMemoryError | OutOfMemoryException e) {
      if (!(e instanceof OutOfMemoryError) || "Direct buffer memory".equals(e.getMessage())) {
        fail(UserException.memoryError(e).build(logger));
      } else {
        // we have a heap out of memory error. The JVM in unstable, exit.
        CatastrophicFailure.exit(
            e, "Unable to handle out of memory condition in FragmentExecutor.", -2);
      }
    } catch (AssertionError | Exception e) {
      fail(e);
    } finally {

      // no longer allow this thread to be interrupted. We synchronize here to make sure that cancel
      // can't set an
      // interruption after we have moved beyond this block.
      synchronized (myThreadRef) {
        myThreadRef.set(null);
        Thread.interrupted();
      }

      // We need to sure we countDown at least once. We'll do it here to guarantee that.
      acceptExternalEvents.countDown();

      // here we could be in FAILED, RUNNING, or CANCELLATION_REQUESTED
      cleanup(FragmentState.FINISHED);

      clusterCoordinator.removeDrillbitStatusListener(drillbitStatusListener);

      myThread.setName(originalThreadName);
    }
  }