void sendFailure(UserRpcException e) {
  sendOnce();
  UserException uex = UserException.systemError(e).addIdentity(e.getEndpoint()).build(logger);
  logger.error("Unexpected Error while handling request message", e);
  OutboundRpcMessage outMessage = new OutboundRpcMessage(
      RpcMode.RESPONSE_FAILURE, 0, coordinationId, uex.getOrCreatePBError(false));
  if (RpcConstants.EXTRA_DEBUGGING) {
    logger.debug("Adding message to outbound buffer. {}", outMessage);
  }
  connection.getChannel().writeAndFlush(outMessage);
}
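The snippets in this section share one error-reporting idiom: pick the UserException factory that matches the failure category, attach contextual detail, and finish with build(logger), which both logs the error and constructs the exception to throw or forward. Below is a minimal sketch of that chain, using only factory and builder methods that appear in the snippets here; the surrounding class, logger, and the file path/plugin values are illustrative assumptions, not code from Drill's source.

import org.apache.drill.common.exceptions.UserException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// Hypothetical illustration of the shared builder chain, not a method from Drill itself.
class ErrorReportingSketch {
  private static final Logger logger = LoggerFactory.getLogger(ErrorReportingSketch.class);

  void report(Throwable cause) {
    // systemError / dataReadError / dataWriteError / validationError / resourceError /
    // memoryError / unsupportedError / connectionError all return the same builder type.
    throw UserException.dataReadError(cause)
        .message("Failure while reading file %s", "/tmp/example.csv") // user-facing message
        .addContext("plugin", "example")                              // extra key/value context
        .build(logger);                                               // logs, then builds the exception
  }
}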
@Override
public void dataArrived(QueryDataBatch result, ConnectionThrottle throttle) {
  final QueryData header = result.getHeader();
  final DrillBuf data = result.getData();

  if (data != null) {
    count.addAndGet(header.getRowCount());
    try {
      loader.load(header.getDef(), data);
      // TODO: Clean: DRILL-2933: That load(...) no longer throws
      // SchemaChangeException, so check/clean catch clause below.
    } catch (SchemaChangeException e) {
      submissionFailed(UserException.systemError(e).build(logger));
    }

    switch (format) {
      case TABLE:
        VectorUtil.showVectorAccessibleContent(loader, columnWidth);
        break;
      case TSV:
        VectorUtil.showVectorAccessibleContent(loader, "\t");
        break;
      case CSV:
        VectorUtil.showVectorAccessibleContent(loader, ",");
        break;
    }
    loader.clear();
  }

  result.release();
}
public int await() throws Exception {
  latch.await();
  if (exception != null) {
    exception.addSuppressed(new DrillRuntimeException("Exception in executor threadpool"));
    throw exception;
  }
  return count.get();
}
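The two methods above come from a client-side results listener: dataArrived() loads and prints each incoming batch, and await() blocks until the query finishes, rethrowing any failure recorded via submissionFailed(). A hedged usage sketch follows, assuming the listener is (or closely resembles) Drill's PrintingResultsListener and that the constructor and runQuery signatures shown match the client API in use; the query string and column width are illustrative.

import org.apache.drill.common.config.DrillConfig;
import org.apache.drill.exec.client.DrillClient;
import org.apache.drill.exec.client.PrintingResultsListener;
import org.apache.drill.exec.client.QuerySubmitter.Format;
import org.apache.drill.exec.proto.UserBitShared.QueryType;

// Hypothetical driver: wires the listener into DrillClient and blocks on await().
public class ListenerUsageSketch {
  public static void main(String[] args) throws Exception {
    DrillConfig config = DrillConfig.create();
    try (DrillClient client = new DrillClient(config)) {
      client.connect();
      PrintingResultsListener listener = new PrintingResultsListener(config, Format.TABLE, 20);
      client.runQuery(QueryType.SQL, "SELECT * FROM cp.`employee.json` LIMIT 5", listener);
      int rows = listener.await(); // rethrows anything passed to submissionFailed()
      System.out.println("Fetched " + rows + " rows");
    }
  }
}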
@Override
public void dropTable(String tableName) {
  try {
    plugin.getClient().deleteTable(tableName);
  } catch (Exception e) {
    throw UserException.dataWriteError(e)
        .message("Failure while trying to drop table '%s'.", tableName)
        .addContext("plugin", name)
        .build(logger);
  }
}
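Because dropTable() surfaces the failure as a UserException built with dataWriteError, callers can branch on the error category rather than parsing the message. A small caller-side sketch, assuming a schema handle of Drill's AbstractSchema type; the helper class and table name are hypothetical.

import org.apache.drill.common.exceptions.UserException;
import org.apache.drill.exec.proto.UserBitShared.DrillPBError.ErrorType;
import org.apache.drill.exec.store.AbstractSchema;

// Hypothetical caller-side handling; DropTableHelper is illustrative only.
class DropTableHelper {
  void dropQuietly(AbstractSchema schema, String table) {
    try {
      schema.dropTable(table);
    } catch (UserException e) {
      // getErrorType() exposes the category chosen by the factory method
      // (dataWriteError -> DATA_WRITE), so callers can react to it directly.
      if (e.getErrorType() == ErrorType.DATA_WRITE) {
        System.err.println("Drop failed: " + e.getMessage());
      } else {
        throw e;
      }
    }
  }
}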
@Override
public void validate(OptionValue v) {
  super.validate(v);
  try {
    CompilerPolicy.valueOf(v.string_val.toUpperCase());
  } catch (IllegalArgumentException e) {
    throw UserException.validationError()
        .message(
            "Invalid value '%s' specified for option '%s'. Valid values are %s.",
            v.string_val, getOptionName(), Arrays.toString(CompilerPolicy.values()))
        .build(logger);
  }
}
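The validator above relies on a standard Java behavior: Enum.valueOf throws IllegalArgumentException for any string that is not an exact constant name, and the catch block converts that into a user-friendly validation error listing the legal values. A self-contained demonstration of that mechanism follows; the enum constants here are hypothetical stand-ins, not necessarily Drill's actual CompilerPolicy values.

// Self-contained sketch showing why the validator catches IllegalArgumentException.
public class EnumValidationDemo {
  enum CompilerPolicy { DEFAULT, JDK, JANINO }

  public static void main(String[] args) {
    System.out.println(CompilerPolicy.valueOf("JDK")); // prints JDK
    try {
      CompilerPolicy.valueOf("JDK9".toUpperCase());    // no such constant
    } catch (IllegalArgumentException e) {
      // this is the point where the option validator above builds its validationError
      System.out.println("rejected: " + e.getMessage());
    }
  }
}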
private void sendFinalState() {
  final FragmentState outcome = fragmentState.get();
  if (outcome == FragmentState.FAILED) {
    final FragmentHandle handle = getContext().getHandle();
    final UserException uex = UserException.systemError(deferredException.getAndClear())
        .addIdentity(getContext().getIdentity())
        .addContext("Fragment", handle.getMajorFragmentId() + ":" + handle.getMinorFragmentId())
        .build(logger);
    statusReporter.fail(uex);
  } else {
    statusReporter.stateChanged(outcome);
  }
}
private void initCols(Schema schema) throws SchemaChangeException {
  ImmutableList.Builder<ProjectedColumnInfo> pciBuilder = ImmutableList.builder();

  for (int i = 0; i < schema.getColumnCount(); i++) {
    ColumnSchema col = schema.getColumnByIndex(i);
    final String name = col.getName();
    final Type kuduType = col.getType();
    MinorType minorType = TYPES.get(kuduType);
    if (minorType == null) {
      logger.warn(
          "Ignoring unsupported column.",
          UserException.unsupportedError()
              .message(
                  "A column you queried has a data type that is not currently supported by the Kudu storage plugin. "
                      + "The column's name was %s and its Kudu data type was %s.",
                  name, kuduType.toString())
              .addContext("column Name", name)
              .addContext("plugin", "kudu")
              .build(logger));
      continue;
    }

    MajorType majorType;
    if (col.isNullable()) {
      majorType = Types.optional(minorType);
    } else {
      majorType = Types.required(minorType);
    }
    MaterializedField field = MaterializedField.create(name, majorType);
    final Class<? extends ValueVector> clazz = (Class<? extends ValueVector>)
        TypeHelper.getValueVectorClass(minorType, majorType.getMode());
    ValueVector vector = output.addField(field, clazz);
    vector.allocateNew();

    ProjectedColumnInfo pci = new ProjectedColumnInfo();
    pci.vv = vector;
    pci.kuduColumn = col;
    pci.index = i;
    pciBuilder.add(pci);
  }

  projectedCols = pciBuilder.build();
}
private SelectionVector2 newSV2() throws OutOfMemoryException, InterruptedException {
  SelectionVector2 sv2 = new SelectionVector2(oAllocator);
  if (!sv2.allocateNewSafe(incoming.getRecordCount())) {
    try {
      final BatchGroup merged = mergeAndSpill(batchGroups);
      if (merged != null) {
        spilledBatchGroups.add(merged);
      } else {
        throw UserException.memoryError(
                "Unable to allocate sv2 for %d records, and not enough batchGroups to spill.",
                incoming.getRecordCount())
            .addContext("batchGroups.size", batchGroups.size())
            .addContext("spilledBatchGroups.size", spilledBatchGroups.size())
            .addContext("allocated memory", oAllocator.getAllocatedMemory())
            .addContext("allocator limit", oAllocator.getLimit())
            .build(logger);
      }
    } catch (SchemaChangeException e) {
      throw new RuntimeException(e);
    }

    int waitTime = 1;
    while (true) {
      try {
        Thread.sleep(waitTime * 1000);
      } catch (final InterruptedException e) {
        if (!context.shouldContinue()) {
          throw e;
        }
      }
      waitTime *= 2;
      if (sv2.allocateNewSafe(incoming.getRecordCount())) {
        break;
      }
      if (waitTime >= 32) {
        throw new OutOfMemoryException("Unable to allocate sv2 buffer after repeated attempts");
      }
    }
  }

  for (int i = 0; i < incoming.getRecordCount(); i++) {
    sv2.setIndex(i, (char) i);
  }
  sv2.setRecordCount(incoming.getRecordCount());
  return sv2;
}
/**
 * Generates the next record batch.
 *
 * @return number of records in the batch
 */
@Override
public int next() {
  reader.resetForNextBatch();
  int cnt = 0;

  try {
    while (cnt < MAX_RECORDS_PER_BATCH && reader.parseNext()) {
      cnt++;
    }
    reader.finishBatch();
    return cnt;
  } catch (IOException | TextParsingException e) {
    throw UserException.dataReadError(e)
        .addContext(
            "Failure while reading file %s. Happened at or shortly before byte position %d.",
            split.getPath(), reader.getPos())
        .build(logger);
  }
}
/**
 * This limits the number of "small" and "large" queries that a Drill cluster will run
 * simultaneously, if queueing is enabled. If the query is unable to run, this will block until
 * it can. Beware that this is called under run(), and so will consume a Thread while it waits
 * for the required distributed semaphore.
 *
 * @param plan the query plan
 * @throws ForemanSetupException
 */
private void acquireQuerySemaphore(final PhysicalPlan plan) throws ForemanSetupException {
  final OptionManager optionManager = queryContext.getOptions();
  final boolean queuingEnabled = optionManager.getOption(ExecConstants.ENABLE_QUEUE);
  if (queuingEnabled) {
    final long queueThreshold = optionManager.getOption(ExecConstants.QUEUE_THRESHOLD_SIZE);
    double totalCost = 0;
    for (final PhysicalOperator ops : plan.getSortedOperators()) {
      totalCost += ops.getCost();
    }

    final long queueTimeout = optionManager.getOption(ExecConstants.QUEUE_TIMEOUT);
    final String queueName;

    try {
      @SuppressWarnings("resource")
      final ClusterCoordinator clusterCoordinator = drillbitContext.getClusterCoordinator();
      final DistributedSemaphore distributedSemaphore;

      // get the appropriate semaphore
      if (totalCost > queueThreshold) {
        final int largeQueue = (int) optionManager.getOption(ExecConstants.LARGE_QUEUE_SIZE);
        distributedSemaphore = clusterCoordinator.getSemaphore("query.large", largeQueue);
        queueName = "large";
      } else {
        final int smallQueue = (int) optionManager.getOption(ExecConstants.SMALL_QUEUE_SIZE);
        distributedSemaphore = clusterCoordinator.getSemaphore("query.small", smallQueue);
        queueName = "small";
      }

      lease = distributedSemaphore.acquire(queueTimeout, TimeUnit.MILLISECONDS);
    } catch (final Exception e) {
      throw new ForemanSetupException("Unable to acquire slot for query.", e);
    }

    if (lease == null) {
      throw UserException.resourceError()
          .message(
              "Unable to acquire queue resources for query within timeout. Timeout for %s queue was set at %d seconds.",
              queueName, queueTimeout / 1000)
          .build(logger);
    }
  }
}
/**
 * Performs the initial setup required for the record reader: initializes the input stream, the
 * handling of the output record batch, and the actual reader to be used.
 *
 * @param context operator context from which buffers will be allocated and managed
 * @param outputMutator used to create the schema in the output record batch
 * @throws ExecutionSetupException
 */
@Override
public void setup(OperatorContext context, OutputMutator outputMutator)
    throws ExecutionSetupException {
  oContext = context;
  readBuffer = context.getManagedBuffer(READ_BUFFER);
  whitespaceBuffer = context.getManagedBuffer(WHITE_SPACE_BUFFER);

  // setup Output, Input, and Reader
  try {
    TextOutput output = null;
    TextInput input = null;
    InputStream stream = null;

    // setup Output using OutputMutator
    if (settings.isHeaderExtractionEnabled()) {
      // extract header and use that to setup a set of VarCharVectors
      String[] fieldNames = extractHeader();
      output = new FieldVarCharOutput(outputMutator, fieldNames, getColumns(), isStarQuery());
    } else {
      // simply use RepeatedVarCharVector
      output = new RepeatedVarCharOutput(outputMutator, getColumns(), isStarQuery());
    }

    // setup Input using InputStream
    stream = dfs.openPossiblyCompressedStream(split.getPath());
    input = new TextInput(
        settings, stream, readBuffer, split.getStart(), split.getStart() + split.getLength());

    // setup Reader using Input and Output
    reader = new TextReader(settings, input, output, whitespaceBuffer);
    reader.start();
  } catch (SchemaChangeException | IOException e) {
    throw new ExecutionSetupException(
        String.format("Failure while setting up text reader for file %s", split.getPath()), e);
  } catch (IllegalArgumentException e) {
    throw UserException.dataReadError(e)
        .addContext("File Path", split.getPath().toString())
        .build(logger);
  }
}
@Override
public void close() {
  Preconditions.checkState(!isClosed);
  Preconditions.checkState(resultState != null);

  logger.info("foreman cleaning up.");
  injector.injectPause(queryContext.getExecutionControls(), "foreman-cleanup", logger);

  // remove the channel disconnected listener (doesn't throw)
  closeFuture.removeListener(closeListener);

  // log the query summary
  logQuerySummary();

  // These are straightforward removals from maps, so they won't throw.
  drillbitContext.getWorkBus().removeFragmentStatusListener(queryId);
  drillbitContext
      .getClusterCoordinator()
      .removeDrillbitStatusListener(queryManager.getDrillbitStatusListener());

  suppressingClose(queryContext);

  /*
   * We do our best to write the latest state, but even that could fail. If it does, we can't
   * write the (possibly newly failing) state, so we continue on anyway.
   *
   * We only need to do this if the resultState differs from the last recorded state.
   */
  if (resultState != state) {
    suppressingClose(
        new AutoCloseable() {
          @Override
          public void close() throws Exception {
            recordNewState(resultState);
          }
        });
  }

  /*
   * Construct the response based on the latest resultState. The builder shouldn't fail.
   */
  final QueryResult.Builder resultBuilder =
      QueryResult.newBuilder().setQueryId(queryId).setQueryState(resultState);
  final UserException uex;
  if (resultException != null) {
    final boolean verbose =
        queryContext.getOptions().getOption(ExecConstants.ENABLE_VERBOSE_ERRORS_KEY).bool_val;
    uex = UserException.systemError(resultException)
        .addIdentity(queryContext.getCurrentEndpoint())
        .build(logger);
    resultBuilder.addError(uex.getOrCreatePBError(verbose));
  } else {
    uex = null;
  }

  // We store the final result here so we can capture any error/errorId in the profile for
  // later debugging.
  queryManager.writeFinalProfile(uex);

  /*
   * If sending the result fails, we don't really have any way to modify the result we tried to
   * send; it is possible it got sent but the result came from a later part of the code path. It
   * is also possible the connection has gone away, so this is irrelevant because there's
   * nowhere to send anything to.
   */
  try {
    // send whatever result we ended up with
    initiatingClient.sendResult(responseListener, resultBuilder.build(), true);
  } catch (final Exception e) {
    addException(e);
    logger.warn("Exception sending result to client", resultException);
  }

  // Remove the Foreman from the running query list.
  bee.retireForeman(Foreman.this);

  try {
    releaseLease();
  } finally {
    isClosed = true;
  }
}
/**
 * Called by execution pool to do query setup, and kick off remote execution.
 *
 * <p>Note that completion of this function is not the end of the Foreman's role in the query's
 * lifecycle.
 */
@Override
public void run() {
  // rename the thread we're using for debugging purposes
  final Thread currentThread = Thread.currentThread();
  final String originalName = currentThread.getName();
  currentThread.setName(QueryIdHelper.getQueryId(queryId) + ":foreman");

  // track how long the query takes
  queryManager.markStartTime();

  try {
    injector.injectChecked(
        queryContext.getExecutionControls(), "run-try-beginning", ForemanException.class);
    queryText = queryRequest.getPlan();

    // convert a run query request into action
    switch (queryRequest.getType()) {
      case LOGICAL:
        parseAndRunLogicalPlan(queryRequest.getPlan());
        break;
      case PHYSICAL:
        parseAndRunPhysicalPlan(queryRequest.getPlan());
        break;
      case SQL:
        runSQL(queryRequest.getPlan());
        break;
      default:
        throw new IllegalStateException();
    }
    injector.injectChecked(
        queryContext.getExecutionControls(), "run-try-end", ForemanException.class);
  } catch (final OutOfMemoryException | OutOfMemoryRuntimeException e) {
    moveToState(QueryState.FAILED, UserException.memoryError(e).build(logger));
  } catch (final ForemanException e) {
    moveToState(QueryState.FAILED, e);
  } catch (AssertionError | Exception ex) {
    moveToState(
        QueryState.FAILED,
        new ForemanException(
            "Unexpected exception during fragment initialization: " + ex.getMessage(), ex));
  } catch (final OutOfMemoryError e) {
    if ("Direct buffer memory".equals(e.getMessage())) {
      moveToState(
          QueryState.FAILED,
          UserException.resourceError(e)
              .message("One or more nodes ran out of memory while executing the query.")
              .build(logger));
    } else {
      /*
       * FragmentExecutors use a DrillbitStatusListener to watch out for the death of their
       * query's Foreman. So, if we die here, they should get notified about that, and cancel
       * themselves; we don't have to attempt to notify them, which might not work under these
       * conditions.
       */
      System.out.println("Node ran out of Heap memory, exiting.");
      e.printStackTrace();
      System.out.flush();
      System.exit(-1);
    }
  } finally {
    /*
     * Begin accepting external events.
     *
     * Doing this here in the finally clause will guarantee that it occurs. Otherwise, if there
     * is an exception anywhere during setup, it wouldn't occur, and any events that are
     * generated as a result of any partial setup that was done (such as the
     * FragmentSubmitListener, the ResponseSendListener, or an external call to cancel()),
     * will hang the thread that makes the event delivery call.
     *
     * If we do throw an exception during setup, and have already moved to QueryState.FAILED,
     * we just need to make sure that we can't make things any worse as those events are
     * delivered, but allow any necessary remaining cleanup to proceed.
     *
     * Note that cancellations cannot be simulated before this point, i.e. pauses can be
     * injected, because Foreman would wait on the cancelling thread to signal a resume and the
     * cancelling thread would wait on the Foreman to accept events.
     */
    acceptExternalEvents.countDown();

    // If we received the resume signal before fragments are set up, the first call does not
    // actually resume the fragments. Since setup is done, all fragments must have been
    // delivered to remote nodes. Now we can resume.
    if (resume) {
      resume();
    }
    injector.injectPause(queryContext.getExecutionControls(), "foreman-ready", logger);

    // restore the thread's original name
    currentThread.setName(originalName);
  }

  /*
   * Note that despite the run() completing, the Foreman continues to exist, and receives
   * events (indirectly, through the QueryManager's use of stateListener), about fragment
   * completions. It won't go away until everything is completed, failed, or cancelled.
   */
}
/**
 * Set up the non-root fragments for execution. Some may be local, and some may be remote.
 * Messages are sent immediately, so they may start returning data even before we complete this.
 *
 * @param fragments the fragments
 * @throws ForemanException
 */
private void setupNonRootFragments(final Collection<PlanFragment> fragments)
    throws ForemanException {
  /*
   * We will send a single message to each endpoint, regardless of how many fragments will be
   * executed there. We need to start up the intermediate fragments first so that they will be
   * ready once the leaf fragments start producing data. To satisfy both of these, we will
   * make a pass through the fragments and put them into these two maps according to their
   * leaf/intermediate state, as well as their target drillbit.
   */
  final Multimap<DrillbitEndpoint, PlanFragment> leafFragmentMap = ArrayListMultimap.create();
  final Multimap<DrillbitEndpoint, PlanFragment> intFragmentMap = ArrayListMultimap.create();

  // record all fragments for status purposes.
  for (final PlanFragment planFragment : fragments) {
    logger.trace(
        "Tracking intermediate remote node {} with data {}",
        planFragment.getAssignment(),
        planFragment.getFragmentJson());
    queryManager.addFragmentStatusTracker(planFragment, false);
    if (planFragment.getLeafFragment()) {
      leafFragmentMap.put(planFragment.getAssignment(), planFragment);
    } else {
      intFragmentMap.put(planFragment.getAssignment(), planFragment);
    }
  }

  /*
   * We need to wait for the intermediates to be sent so that they'll be set up by the time
   * the leaves start producing data. We'll use this latch to wait for the responses.
   *
   * However, in order not to hang the process if any of the RPC requests fails, we always
   * count down (see FragmentSubmitFailures), but we count the number of failures so that
   * we'll know if any submissions did fail.
   */
  final int numIntFragments = intFragmentMap.keySet().size();
  final ExtendedLatch endpointLatch = new ExtendedLatch(numIntFragments);
  final FragmentSubmitFailures fragmentSubmitFailures = new FragmentSubmitFailures();

  // send remote intermediate fragments
  for (final DrillbitEndpoint ep : intFragmentMap.keySet()) {
    sendRemoteFragments(ep, intFragmentMap.get(ep), endpointLatch, fragmentSubmitFailures);
  }

  final long timeout = RPC_WAIT_IN_MSECS_PER_FRAGMENT * numIntFragments;
  if (numIntFragments > 0 && !endpointLatch.awaitUninterruptibly(timeout)) {
    long numberRemaining = endpointLatch.getCount();
    throw UserException.connectionError()
        .message(
            "Exceeded timeout (%d) while waiting to send intermediate work fragments to remote nodes. "
                + "Sent %d and only heard response back from %d nodes.",
            timeout, numIntFragments, numIntFragments - numberRemaining)
        .build(logger);
  }

  // if any of the intermediate fragment submissions failed, fail the query
  final List<FragmentSubmitFailures.SubmissionException> submissionExceptions =
      fragmentSubmitFailures.submissionExceptions;
  if (submissionExceptions.size() > 0) {
    Set<DrillbitEndpoint> endpoints = Sets.newHashSet();
    StringBuilder sb = new StringBuilder();
    boolean first = true;

    for (FragmentSubmitFailures.SubmissionException e :
        fragmentSubmitFailures.submissionExceptions) {
      DrillbitEndpoint endpoint = e.drillbitEndpoint;
      if (endpoints.add(endpoint)) {
        if (first) {
          first = false;
        } else {
          sb.append(", ");
        }
        sb.append(endpoint.getAddress());
      }
    }
    throw UserException.connectionError(submissionExceptions.get(0).rpcException)
        .message("Error setting up remote intermediate fragment execution")
        .addContext("Nodes with failures", sb.toString())
        .build(logger);
  }

  injector.injectChecked(
      queryContext.getExecutionControls(), "send-fragments", ForemanException.class);

  /*
   * Send the remote (leaf) fragments; we don't wait for these. Any problems will come in
   * through the regular sendListener event delivery.
   */
  for (final DrillbitEndpoint ep : leafFragmentMap.keySet()) {
    sendRemoteFragments(ep, leafFragmentMap.get(ep), null, null);
  }
}
private static void throwAsUnsupportedException(UserException ex) throws Exception {
  SqlUnsupportedException.errorClassNameToException(
      ex.getOrCreatePBError(false).getException().getExceptionClass());
  throw ex;
}
@Override
public IterOutcome innerNext() {
  if (schema != null) {
    if (spillCount == 0) {
      return (getSelectionVector4().next()) ? IterOutcome.OK : IterOutcome.NONE;
    } else {
      Stopwatch w = Stopwatch.createStarted();
      int count = copier.next(targetRecordCount);
      if (count > 0) {
        long t = w.elapsed(TimeUnit.MICROSECONDS);
        logger.debug("Took {} us to merge {} records", t, count);
        container.setRecordCount(count);
        return IterOutcome.OK;
      } else {
        logger.debug("copier returned 0 records");
        return IterOutcome.NONE;
      }
    }
  }

  int totalCount = 0;
  int totalBatches = 0; // total number of batches received so far

  try {
    container.clear();
    outer:
    while (true) {
      IterOutcome upstream;
      if (first) {
        upstream = IterOutcome.OK_NEW_SCHEMA;
      } else {
        upstream = next(incoming);
      }
      if (upstream == IterOutcome.OK && sorter == null) {
        upstream = IterOutcome.OK_NEW_SCHEMA;
      }
      switch (upstream) {
        case NONE:
          if (first) {
            return upstream;
          }
          break outer;
        case NOT_YET:
          throw new UnsupportedOperationException();
        case STOP:
          return upstream;
        case OK_NEW_SCHEMA:
        case OK:
          VectorContainer convertedBatch;
          // only change in the case that the schema truly changes. Artificial schema changes
          // are ignored.
          if (upstream == IterOutcome.OK_NEW_SCHEMA && !incoming.getSchema().equals(schema)) {
            if (schema != null) {
              if (unionTypeEnabled) {
                this.schema = SchemaUtil.mergeSchemas(schema, incoming.getSchema());
              } else {
                throw new SchemaChangeException(
                    "Schema changes not supported in External Sort. Please enable Union type");
              }
            } else {
              schema = incoming.getSchema();
            }
            convertedBatch = SchemaUtil.coerceContainer(incoming, schema, oContext);
            for (BatchGroup b : batchGroups) {
              b.setSchema(schema);
            }
            for (BatchGroup b : spilledBatchGroups) {
              b.setSchema(schema);
            }
            this.sorter = createNewSorter(context, convertedBatch);
          } else {
            convertedBatch = SchemaUtil.coerceContainer(incoming, schema, oContext);
          }
          if (first) {
            first = false;
          }
          if (convertedBatch.getRecordCount() == 0) {
            for (VectorWrapper<?> w : convertedBatch) {
              w.clear();
            }
            break;
          }
          SelectionVector2 sv2;
          if (incoming.getSchema().getSelectionVectorMode()
              == BatchSchema.SelectionVectorMode.TWO_BYTE) {
            sv2 = incoming.getSelectionVector2().clone();
          } else {
            try {
              sv2 = newSV2();
            } catch (InterruptedException e) {
              return IterOutcome.STOP;
            } catch (OutOfMemoryException e) {
              throw new OutOfMemoryException(e);
            }
          }
          int count = sv2.getCount();
          totalCount += count;
          totalBatches++;
          sorter.setup(context, sv2, convertedBatch);
          sorter.sort(sv2);
          RecordBatchData rbd = new RecordBatchData(convertedBatch, oAllocator);
          boolean success = false;
          try {
            rbd.setSv2(sv2);
            batchGroups.add(new BatchGroup(rbd.getContainer(), rbd.getSv2(), oContext));
            if (peakNumBatches < batchGroups.size()) {
              peakNumBatches = batchGroups.size();
              stats.setLongStat(Metric.PEAK_BATCHES_IN_MEMORY, peakNumBatches);
            }

            batchesSinceLastSpill++;
            if (
                // If we haven't spilled so far, do we have enough memory for MSorter if this
                // turns out to be the last incoming batch?
                (spillCount == 0 && !hasMemoryForInMemorySort(totalCount))
                    ||
                    // If we haven't spilled so far, make sure we don't exceed the maximum
                    // number of batches SV4 can address
                    (spillCount == 0 && totalBatches > Character.MAX_VALUE)
                    ||
                    // TODO(DRILL-4438) - consider setting this threshold more intelligently;
                    // lowering it caused a failing low-memory condition (test in
                    // BasicPhysicalOpUnitTest) to complete successfully (although it caused a
                    // perf decrease as there was more spilling)
                    // current memory used is more than 95% of the memory usage limit of this
                    // operator
                    (oAllocator.getAllocatedMemory() > .95 * oAllocator.getLimit())
                    ||
                    // Number of incoming batches (BatchGroups) exceeds the limit and the number
                    // of incoming batches accumulated since the last spill exceeds the defined
                    // limit
                    (batchGroups.size() > SPILL_THRESHOLD
                        && batchesSinceLastSpill >= SPILL_BATCH_GROUP_SIZE)) {

              if (firstSpillBatchCount == 0) {
                firstSpillBatchCount = batchGroups.size();
              }

              if (spilledBatchGroups.size() > firstSpillBatchCount / 2) {
                logger.info("Merging spills");
                final BatchGroup merged = mergeAndSpill(spilledBatchGroups);
                if (merged != null) {
                  spilledBatchGroups.addFirst(merged);
                }
              }
              final BatchGroup merged = mergeAndSpill(batchGroups);
              if (merged != null) { // make sure we don't add null to spilledBatchGroups
                spilledBatchGroups.add(merged);
                batchesSinceLastSpill = 0;
              }
            }
            success = true;
          } finally {
            if (!success) {
              rbd.clear();
            }
          }
          break;
        case OUT_OF_MEMORY:
          logger.debug("received OUT_OF_MEMORY, trying to spill");
          if (batchesSinceLastSpill > 2) {
            final BatchGroup merged = mergeAndSpill(batchGroups);
            if (merged != null) {
              spilledBatchGroups.add(merged);
              batchesSinceLastSpill = 0;
            }
          } else {
            logger.debug("not enough batches to spill, sending OUT_OF_MEMORY downstream");
            return IterOutcome.OUT_OF_MEMORY;
          }
          break;
        default:
          throw new UnsupportedOperationException();
      }
    }

    if (totalCount == 0) {
      return IterOutcome.NONE;
    }
    if (spillCount == 0) {
      if (builder != null) {
        builder.clear();
        builder.close();
      }
      builder = new SortRecordBatchBuilder(oAllocator);

      for (BatchGroup group : batchGroups) {
        RecordBatchData rbd = new RecordBatchData(group.getContainer(), oAllocator);
        rbd.setSv2(group.getSv2());
        builder.add(rbd);
      }

      builder.build(context, container);
      sv4 = builder.getSv4();
      mSorter = createNewMSorter();
      mSorter.setup(context, oAllocator, getSelectionVector4(), this.container);

      // For testing memory-leak purpose, inject exception after mSorter finishes setup
      injector.injectUnchecked(context.getExecutionControls(), INTERRUPTION_AFTER_SETUP);
      mSorter.sort(this.container);

      // sort may have prematurely exited due to shouldContinue() returning false.
      if (!context.shouldContinue()) {
        return IterOutcome.STOP;
      }

      // For testing memory-leak purpose, inject exception after mSorter finishes sorting
      injector.injectUnchecked(context.getExecutionControls(), INTERRUPTION_AFTER_SORT);
      sv4 = mSorter.getSV4();

      container.buildSchema(SelectionVectorMode.FOUR_BYTE);
    } else { // some batches were spilled
      final BatchGroup merged = mergeAndSpill(batchGroups);
      if (merged != null) {
        spilledBatchGroups.add(merged);
      }
      batchGroups.addAll(spilledBatchGroups);
      // no need to clean up spilledBatchGroups, all its batches are in batchGroups now
      spilledBatchGroups = null;

      logger.warn(
          "Starting to merge. {} batch groups. Current allocated memory: {}",
          batchGroups.size(),
          oAllocator.getAllocatedMemory());
      VectorContainer hyperBatch = constructHyperBatch(batchGroups);
      createCopier(hyperBatch, batchGroups, container, false);

      int estimatedRecordSize = 0;
      for (VectorWrapper<?> w : batchGroups.get(0)) {
        try {
          estimatedRecordSize += TypeHelper.getSize(w.getField().getType());
        } catch (UnsupportedOperationException e) {
          estimatedRecordSize += 50;
        }
      }
      targetRecordCount =
          Math.min(MAX_BATCH_SIZE, Math.max(1, COPIER_BATCH_MEM_LIMIT / estimatedRecordSize));
      int count = copier.next(targetRecordCount);
      container.buildSchema(SelectionVectorMode.NONE);
      container.setRecordCount(count);
    }

    return IterOutcome.OK_NEW_SCHEMA;

  } catch (SchemaChangeException ex) {
    kill(false);
    context.fail(
        UserException.unsupportedError(ex)
            .message("Sort doesn't currently support sorts with changing schemas")
            .build(logger));
    return IterOutcome.STOP;
  } catch (ClassTransformationException | IOException ex) {
    kill(false);
    context.fail(ex);
    return IterOutcome.STOP;
  } catch (UnsupportedOperationException e) {
    throw new RuntimeException(e);
  }
}
@Override
public IterOutcome innerNext() {
  // if a special batch has been sent, we have no data in the incoming so exit early
  if (specialBatchSent) {
    return IterOutcome.NONE;
  }

  // this is only called on the first batch. Beyond this, the aggregator manages batches.
  if (aggregator == null || first) {
    IterOutcome outcome;
    if (first && incoming.getRecordCount() > 0) {
      first = false;
      outcome = IterOutcome.OK_NEW_SCHEMA;
    } else {
      outcome = next(incoming);
    }
    logger.debug("Next outcome of {}", outcome);
    switch (outcome) {
      case NONE:
        if (first && popConfig.getKeys().length == 0) {
          // if we have a straight aggregate and empty input batch, we need to handle it in a
          // different way
          constructSpecialBatch();
          first = false;
          // set state to indicate the fact that we have sent a special batch and input is empty
          specialBatchSent = true;
          return IterOutcome.OK;
        }
      case OUT_OF_MEMORY:
      case NOT_YET:
      case STOP:
        return outcome;
      case OK_NEW_SCHEMA:
        if (!createAggregator()) {
          done = true;
          return IterOutcome.STOP;
        }
        break;
      case OK:
        break;
      default:
        throw new IllegalStateException(String.format("unknown outcome %s", outcome));
    }
  }

  AggOutcome out = aggregator.doWork();
  recordCount = aggregator.getOutputCount();
  logger.debug("Aggregator response {}, records {}", out, aggregator.getOutputCount());
  switch (out) {
    case CLEANUP_AND_RETURN:
      if (!first) {
        container.zeroVectors();
      }
      done = true;
      // fall through
    case RETURN_OUTCOME:
      IterOutcome outcome = aggregator.getOutcome();
      if (outcome == IterOutcome.NONE && first) {
        first = false;
        done = true;
        return IterOutcome.OK_NEW_SCHEMA;
      } else if (outcome == IterOutcome.OK && first) {
        outcome = IterOutcome.OK_NEW_SCHEMA;
      } else if (outcome != IterOutcome.OUT_OF_MEMORY) {
        first = false;
      }
      return outcome;
    case UPDATE_AGGREGATOR:
      context.fail(
          UserException.unsupportedError()
              .message("Streaming aggregate does not support schema changes")
              .build(logger));
      close();
      killIncoming(false);
      return IterOutcome.STOP;
    default:
      throw new IllegalStateException(String.format("Unknown state %s.", out));
  }
}
public BatchGroup mergeAndSpill(LinkedList<BatchGroup> batchGroups) throws SchemaChangeException {
  logger.debug("Copier allocator current allocation {}", copierAllocator.getAllocatedMemory());
  logger.debug(
      "mergeAndSpill: starting total size in memory = {}", oAllocator.getAllocatedMemory());
  VectorContainer outputContainer = new VectorContainer();
  List<BatchGroup> batchGroupList = Lists.newArrayList();
  int batchCount = batchGroups.size();
  for (int i = 0; i < batchCount / 2; i++) {
    if (batchGroups.size() == 0) {
      break;
    }
    BatchGroup batch = batchGroups.pollLast();
    assert batch != null : "Encountered a null batch during merge and spill operation";
    batchGroupList.add(batch);
  }

  if (batchGroupList.size() == 0) {
    return null;
  }

  int estimatedRecordSize = 0;
  for (VectorWrapper<?> w : batchGroupList.get(0)) {
    try {
      estimatedRecordSize += TypeHelper.getSize(w.getField().getType());
    } catch (UnsupportedOperationException e) {
      estimatedRecordSize += 50;
    }
  }

  int targetRecordCount = Math.max(1, COPIER_BATCH_MEM_LIMIT / estimatedRecordSize);
  VectorContainer hyperBatch = constructHyperBatch(batchGroupList);
  createCopier(hyperBatch, batchGroupList, outputContainer, true);

  int count = copier.next(targetRecordCount);
  assert count > 0;

  logger.debug(
      "mergeAndSpill: estimated record size = {}, target record count = {}",
      estimatedRecordSize,
      targetRecordCount);

  // 1 output container is kept in memory, so we want to hold on to it and transferClone
  // allows keeping ownership
  VectorContainer c1 = VectorContainer.getTransferClone(outputContainer, oContext);
  c1.buildSchema(BatchSchema.SelectionVectorMode.NONE);
  c1.setRecordCount(count);

  String spillDir = dirs.next();
  Path currSpillPath = new Path(Joiner.on("/").join(spillDir, fileName));
  currSpillDirs.add(currSpillPath);
  String outputFile = Joiner.on("/").join(currSpillPath, spillCount++);
  try {
    fs.deleteOnExit(currSpillPath);
  } catch (IOException e) {
    // since this is meant to be used in a batch's spilling, we don't propagate the exception
    logger.warn("Unable to mark spill directory " + currSpillPath + " for deleting on exit", e);
  }
  stats.setLongStat(Metric.SPILL_COUNT, spillCount);
  BatchGroup newGroup = new BatchGroup(c1, fs, outputFile, oContext);
  try (AutoCloseable a = AutoCloseables.all(batchGroupList)) {
    logger.info("Merging and spilling to {}", outputFile);
    while ((count = copier.next(targetRecordCount)) > 0) {
      outputContainer.buildSchema(BatchSchema.SelectionVectorMode.NONE);
      outputContainer.setRecordCount(count);
      // note that addBatch also clears the outputContainer
      newGroup.addBatch(outputContainer);
    }
    injector.injectChecked(
        context.getExecutionControls(), INTERRUPTION_WHILE_SPILLING, IOException.class);
    newGroup.closeOutputStream();
  } catch (Throwable e) {
    // we only need to clean up newGroup if the spill failed
    try {
      AutoCloseables.close(e, newGroup);
    } catch (Throwable t) {
      /* close() may hit the same IO issue; just ignore */
    }
    throw UserException.resourceError(e)
        .message("External Sort encountered an error while spilling to disk")
        .addContext(e.getMessage() /* more detail */)
        .build(logger);
  } finally {
    hyperBatch.clear();
  }
  logger.debug("mergeAndSpill: final total size in memory = {}", oAllocator.getAllocatedMemory());
  logger.info("Completed spilling to {}", outputFile);
  return newGroup;
}
private StreamingAggregator createAggregatorInternal()
    throws SchemaChangeException, ClassTransformationException, IOException {
  ClassGenerator<StreamingAggregator> cg =
      CodeGenerator.getRoot(
          StreamingAggTemplate.TEMPLATE_DEFINITION, context.getFunctionRegistry());
  container.clear();

  LogicalExpression[] keyExprs = new LogicalExpression[popConfig.getKeys().length];
  LogicalExpression[] valueExprs = new LogicalExpression[popConfig.getExprs().length];
  TypedFieldId[] keyOutputIds = new TypedFieldId[popConfig.getKeys().length];

  ErrorCollector collector = new ErrorCollectorImpl();

  for (int i = 0; i < keyExprs.length; i++) {
    final NamedExpression ne = popConfig.getKeys()[i];
    final LogicalExpression expr =
        ExpressionTreeMaterializer.materialize(
            ne.getExpr(), incoming, collector, context.getFunctionRegistry());
    if (expr == null) {
      continue;
    }
    keyExprs[i] = expr;
    final MaterializedField outputField =
        MaterializedField.create(ne.getRef(), expr.getMajorType());
    final ValueVector vector = TypeHelper.getNewVector(outputField, oContext.getAllocator());
    keyOutputIds[i] = container.add(vector);
  }

  for (int i = 0; i < valueExprs.length; i++) {
    final NamedExpression ne = popConfig.getExprs()[i];
    final LogicalExpression expr =
        ExpressionTreeMaterializer.materialize(
            ne.getExpr(), incoming, collector, context.getFunctionRegistry());
    if (expr instanceof IfExpression) {
      throw UserException.unsupportedError(
              new UnsupportedOperationException("Union type not supported in aggregate functions"))
          .build(logger);
    }
    if (expr == null) {
      continue;
    }

    final MaterializedField outputField =
        MaterializedField.create(ne.getRef(), expr.getMajorType());
    ValueVector vector = TypeHelper.getNewVector(outputField, oContext.getAllocator());
    TypedFieldId id = container.add(vector);
    valueExprs[i] = new ValueVectorWriteExpression(id, expr, true);
  }

  if (collector.hasErrors()) {
    throw new SchemaChangeException(
        "Failure while materializing expression. " + collector.toErrorString());
  }

  setupIsSame(cg, keyExprs);
  setupIsSameApart(cg, keyExprs);
  addRecordValues(cg, valueExprs);
  outputRecordKeys(cg, keyOutputIds, keyExprs);
  outputRecordKeysPrev(cg, keyOutputIds, keyExprs);

  cg.getBlock("resetValues")._return(JExpr.TRUE);
  getIndex(cg);

  container.buildSchema(SelectionVectorMode.NONE);
  StreamingAggregator agg = context.getImplementationClass(cg);
  agg.setup(oContext, incoming, this);
  return agg;
}
@Override
public void run() {
  // if a cancel thread has already entered this executor, we have no reason to continue.
  if (!hasCloseoutThread.compareAndSet(false, true)) {
    return;
  }

  final Thread myThread = Thread.currentThread();
  myThreadRef.set(myThread);
  final String originalThreadName = myThread.getName();
  final FragmentHandle fragmentHandle = fragmentContext.getHandle();
  final DrillbitContext drillbitContext = fragmentContext.getDrillbitContext();
  final ClusterCoordinator clusterCoordinator = drillbitContext.getClusterCoordinator();
  final DrillbitStatusListener drillbitStatusListener = new FragmentDrillbitStatusListener();
  final String newThreadName = QueryIdHelper.getExecutorThreadName(fragmentHandle);

  try {
    myThread.setName(newThreadName);

    // if we didn't get the root operator when the executor was created, create it now.
    final FragmentRoot rootOperator =
        this.rootOperator != null
            ? this.rootOperator
            : drillbitContext.getPlanReader().readFragmentOperator(fragment.getFragmentJson());

    root = ImplCreator.getExec(fragmentContext, rootOperator);
    if (root == null) {
      return;
    }

    clusterCoordinator.addDrillbitStatusListener(drillbitStatusListener);
    updateState(FragmentState.RUNNING);

    acceptExternalEvents.countDown();
    injector.injectPause(fragmentContext.getExecutionControls(), "fragment-running", logger);

    final DrillbitEndpoint endpoint = drillbitContext.getEndpoint();
    logger.debug(
        "Starting fragment {}:{} on {}:{}",
        fragmentHandle.getMajorFragmentId(),
        fragmentHandle.getMinorFragmentId(),
        endpoint.getAddress(),
        endpoint.getUserPort());

    final UserGroupInformation queryUserUgi =
        fragmentContext.isImpersonationEnabled()
            ? ImpersonationUtil.createProxyUgi(fragmentContext.getQueryUserName())
            : ImpersonationUtil.getProcessUserUGI();

    queryUserUgi.doAs(
        new PrivilegedExceptionAction<Void>() {
          public Void run() throws Exception {
            injector.injectChecked(
                fragmentContext.getExecutionControls(), "fragment-execution", IOException.class);
            /*
             * Run the query until root.next returns false OR we no longer need to continue.
             */
            while (shouldContinue() && root.next()) {
              // loop
            }
            return null;
          }
        });
  } catch (OutOfMemoryError | OutOfMemoryException e) {
    if (!(e instanceof OutOfMemoryError) || "Direct buffer memory".equals(e.getMessage())) {
      fail(UserException.memoryError(e).build(logger));
    } else {
      // we have a heap out-of-memory error. The JVM is unstable, exit.
      CatastrophicFailure.exit(
          e, "Unable to handle out of memory condition in FragmentExecutor.", -2);
    }
  } catch (AssertionError | Exception e) {
    fail(e);
  } finally {
    // No longer allow this thread to be interrupted. We synchronize here to make sure that
    // cancel can't set an interruption after we have moved beyond this block.
    synchronized (myThreadRef) {
      myThreadRef.set(null);
      Thread.interrupted();
    }

    // We need to be sure we countDown at least once. We'll do it here to guarantee that.
    acceptExternalEvents.countDown();

    // here we could be in FAILED, RUNNING, or CANCELLATION_REQUESTED
    cleanup(FragmentState.FINISHED);

    clusterCoordinator.removeDrillbitStatusListener(drillbitStatusListener);

    myThread.setName(originalThreadName);
  }
}