public class ExternalSortBatch extends AbstractRecordBatch<ExternalSort> { private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(ExternalSortBatch.class); private static final ControlsInjector injector = ControlsInjectorFactory.getInjector(ExternalSortBatch.class); private static final GeneratorMapping COPIER_MAPPING = new GeneratorMapping("doSetup", "doCopy", null, null); private static final MappingSet MAIN_MAPPING = new MappingSet( (String) null, null, ClassGenerator.DEFAULT_SCALAR_MAP, ClassGenerator.DEFAULT_SCALAR_MAP); private static final MappingSet LEFT_MAPPING = new MappingSet( "leftIndex", null, ClassGenerator.DEFAULT_SCALAR_MAP, ClassGenerator.DEFAULT_SCALAR_MAP); private static final MappingSet RIGHT_MAPPING = new MappingSet( "rightIndex", null, ClassGenerator.DEFAULT_SCALAR_MAP, ClassGenerator.DEFAULT_SCALAR_MAP); private static final MappingSet COPIER_MAPPING_SET = new MappingSet(COPIER_MAPPING, COPIER_MAPPING); private final int SPILL_BATCH_GROUP_SIZE; private final int SPILL_THRESHOLD; private final Iterator<String> dirs; private final RecordBatch incoming; private final BufferAllocator oAllocator; private final BufferAllocator copierAllocator; private BatchSchema schema; private SingleBatchSorter sorter; private SortRecordBatchBuilder builder; private MSorter mSorter; /** * A single PriorityQueueCopier instance is used for 2 purposes: 1. Merge sorted batches before * spilling 2. Merge sorted batches when all incoming data fits in memory */ private PriorityQueueCopier copier; private LinkedList<BatchGroup> batchGroups = Lists.newLinkedList(); private LinkedList<BatchGroup> spilledBatchGroups = Lists.newLinkedList(); private SelectionVector4 sv4; private FileSystem fs; private int spillCount = 0; private int batchesSinceLastSpill = 0; private boolean first = true; private int targetRecordCount; private final String fileName; private Set<Path> currSpillDirs = Sets.newTreeSet(); private int firstSpillBatchCount = 0; private int peakNumBatches = -1; /** * The copier uses the COPIER_BATCH_MEM_LIMIT to estimate the target number of records to return * in each batch. 
*/ private static final int COPIER_BATCH_MEM_LIMIT = 256 * 1024; public static final String INTERRUPTION_AFTER_SORT = "after-sort"; public static final String INTERRUPTION_AFTER_SETUP = "after-setup"; public static final String INTERRUPTION_WHILE_SPILLING = "spilling"; public enum Metric implements MetricDef { SPILL_COUNT, // number of times operator spilled to disk PEAK_SIZE_IN_MEMORY, // peak value for totalSizeInMemory PEAK_BATCHES_IN_MEMORY; // maximum number of batches kept in memory @Override public int metricId() { return ordinal(); } } public ExternalSortBatch(ExternalSort popConfig, FragmentContext context, RecordBatch incoming) throws OutOfMemoryException { super(popConfig, context, true); this.incoming = incoming; DrillConfig config = context.getConfig(); Configuration conf = new Configuration(); conf.set("fs.default.name", config.getString(ExecConstants.EXTERNAL_SORT_SPILL_FILESYSTEM)); try { this.fs = FileSystem.get(conf); } catch (IOException e) { throw new RuntimeException(e); } SPILL_BATCH_GROUP_SIZE = config.getInt(ExecConstants.EXTERNAL_SORT_SPILL_GROUP_SIZE); SPILL_THRESHOLD = config.getInt(ExecConstants.EXTERNAL_SORT_SPILL_THRESHOLD); dirs = Iterators.cycle(config.getStringList(ExecConstants.EXTERNAL_SORT_SPILL_DIRS)); oAllocator = oContext.getAllocator(); copierAllocator = oAllocator.newChildAllocator( oAllocator.getName() + ":copier", PriorityQueueCopier.INITIAL_ALLOCATION, PriorityQueueCopier.MAX_ALLOCATION); FragmentHandle handle = context.getHandle(); fileName = String.format( "%s_majorfragment%s_minorfragment%s_operator%s", QueryIdHelper.getQueryId(handle.getQueryId()), handle.getMajorFragmentId(), handle.getMinorFragmentId(), popConfig.getOperatorId()); } @Override public int getRecordCount() { if (sv4 != null) { return sv4.getCount(); } return container.getRecordCount(); } @Override public SelectionVector4 getSelectionVector4() { return sv4; } private void closeBatchGroups(Collection<BatchGroup> groups) { for (BatchGroup group : groups) { try { group.close(); } catch (Exception e) { // collect all failures and make sure to clean up all remaining batches. // Originally we would have thrown a RuntimeException that would propagate to // FragmentExecutor.closeOutResources(), where it would have been passed to context.fail(). // Passing the exception directly to context.fail(e) lets the cleanup process continue // instead of stopping right away, and also makes sure we collect any additional exceptions // we may get while cleaning up context.fail(e); } } } @Override public void close() { try { if (batchGroups != null) { closeBatchGroups(batchGroups); batchGroups = null; } if (spilledBatchGroups != null) { closeBatchGroups(spilledBatchGroups); spilledBatchGroups = null; } } finally { if (builder != null) { builder.clear(); builder.close(); } if (sv4 != null) { sv4.clear(); } try { if (copier != null) { copier.close(); } } catch (IOException e) { throw new RuntimeException(e); } finally { copierAllocator.close(); super.close(); if (mSorter != null) { mSorter.clear(); } for (Iterator iter = this.currSpillDirs.iterator(); iter.hasNext(); iter.remove()) { Path path = (Path) iter.next(); try { if (fs != null && path != null && fs.exists(path)) { if (fs.delete(path, true)) { fs.cancelDeleteOnExit(path); } } } catch (IOException e) { // since this is meant to be used in a batch's cleanup, we don't propagate the // exception logger.warn("Unable to delete spill directory " + path, e); } } } } } @Override public void buildSchema() throws SchemaChangeException {
IterOutcome outcome = next(incoming); switch (outcome) { case OK: case OK_NEW_SCHEMA: for (VectorWrapper<?> w : incoming) { ValueVector v = container.addOrGet(w.getField()); if (v instanceof AbstractContainerVector) { w.getValueVector().makeTransferPair(v); // Can we remove this hack? v.clear(); } v.allocateNew(); // Can we remove this? - SVR fails with NPE (TODO) } container.buildSchema(SelectionVectorMode.NONE); container.setRecordCount(0); break; case STOP: state = BatchState.STOP; break; case OUT_OF_MEMORY: state = BatchState.OUT_OF_MEMORY; break; case NONE: state = BatchState.DONE; break; default: break; } } @Override public IterOutcome innerNext() { if (schema != null) { if (spillCount == 0) { return (getSelectionVector4().next()) ? IterOutcome.OK : IterOutcome.NONE; } else { Stopwatch w = Stopwatch.createStarted(); int count = copier.next(targetRecordCount); if (count > 0) { long t = w.elapsed(TimeUnit.MICROSECONDS); logger.debug("Took {} us to merge {} records", t, count); container.setRecordCount(count); return IterOutcome.OK; } else { logger.debug("copier returned 0 records"); return IterOutcome.NONE; } } } int totalCount = 0; int totalBatches = 0; // total number of batches received so far try { container.clear(); outer: while (true) { IterOutcome upstream; if (first) { upstream = IterOutcome.OK_NEW_SCHEMA; } else { upstream = next(incoming); } if (upstream == IterOutcome.OK && sorter == null) { upstream = IterOutcome.OK_NEW_SCHEMA; } switch (upstream) { case NONE: if (first) { return upstream; } break outer; case NOT_YET: throw new UnsupportedOperationException(); case STOP: return upstream; case OK_NEW_SCHEMA: case OK: VectorContainer convertedBatch; // only change in the case that the schema truly changes. Artificial schema changes are // ignored. if (upstream == IterOutcome.OK_NEW_SCHEMA && !incoming.getSchema().equals(schema)) { if (schema != null) { if (unionTypeEnabled) { this.schema = SchemaUtil.mergeSchemas(schema, incoming.getSchema()); } else { throw new SchemaChangeException( "Schema changes not supported in External Sort. 
Please enable Union type"); } } else { schema = incoming.getSchema(); } convertedBatch = SchemaUtil.coerceContainer(incoming, schema, oContext); for (BatchGroup b : batchGroups) { b.setSchema(schema); } for (BatchGroup b : spilledBatchGroups) { b.setSchema(schema); } this.sorter = createNewSorter(context, convertedBatch); } else { convertedBatch = SchemaUtil.coerceContainer(incoming, schema, oContext); } if (first) { first = false; } if (convertedBatch.getRecordCount() == 0) { for (VectorWrapper<?> w : convertedBatch) { w.clear(); } break; } SelectionVector2 sv2; if (incoming.getSchema().getSelectionVectorMode() == BatchSchema.SelectionVectorMode.TWO_BYTE) { sv2 = incoming.getSelectionVector2().clone(); } else { try { sv2 = newSV2(); } catch (InterruptedException e) { return IterOutcome.STOP; } catch (OutOfMemoryException e) { throw new OutOfMemoryException(e); } } int count = sv2.getCount(); totalCount += count; totalBatches++; sorter.setup(context, sv2, convertedBatch); sorter.sort(sv2); RecordBatchData rbd = new RecordBatchData(convertedBatch, oAllocator); boolean success = false; try { rbd.setSv2(sv2); batchGroups.add(new BatchGroup(rbd.getContainer(), rbd.getSv2(), oContext)); if (peakNumBatches < batchGroups.size()) { peakNumBatches = batchGroups.size(); stats.setLongStat(Metric.PEAK_BATCHES_IN_MEMORY, peakNumBatches); } batchesSinceLastSpill++; if ( // If we haven't spilled so far, do we have enough memory for MSorter if this // turns out to be the last incoming batch? (spillCount == 0 && !hasMemoryForInMemorySort(totalCount)) || // If we haven't spilled so far, make sure we don't exceed the maximum number of // batches SV4 can address (spillCount == 0 && totalBatches > Character.MAX_VALUE) || // TODO(DRILL-4438) - consider setting this threshold more intelligently; // lowering it allowed a previously failing low-memory test (in BasicPhysicalOpUnitTest) // to complete successfully (although it caused a perf decrease as there was more spilling) // current memory used is more than 95% of memory usage limit of this operator (oAllocator.getAllocatedMemory() > .95 * oAllocator.getLimit()) || // Number of incoming batches (BatchGroups) exceeds the limit and the number of // incoming batches accumulated since the last spill exceeds the defined limit (batchGroups.size() > SPILL_THRESHOLD && batchesSinceLastSpill >= SPILL_BATCH_GROUP_SIZE)) { if (firstSpillBatchCount == 0) { firstSpillBatchCount = batchGroups.size(); } if (spilledBatchGroups.size() > firstSpillBatchCount / 2) { logger.info("Merging spills"); final BatchGroup merged = mergeAndSpill(spilledBatchGroups); if (merged != null) { spilledBatchGroups.addFirst(merged); } } final BatchGroup merged = mergeAndSpill(batchGroups); if (merged != null) { // make sure we don't add null to spilledBatchGroups spilledBatchGroups.add(merged); batchesSinceLastSpill = 0; } } success = true; } finally { if (!success) { rbd.clear(); } } break; case OUT_OF_MEMORY: logger.debug("received OUT_OF_MEMORY, trying to spill"); if (batchesSinceLastSpill > 2) { final BatchGroup merged = mergeAndSpill(batchGroups); if (merged != null) { spilledBatchGroups.add(merged); batchesSinceLastSpill = 0; } } else { logger.debug("not enough batches to spill, sending OUT_OF_MEMORY downstream"); return IterOutcome.OUT_OF_MEMORY; } break; default: throw new UnsupportedOperationException(); } } if (totalCount == 0) { return IterOutcome.NONE; } if (spillCount == 0) { if (builder != null) { builder.clear(); builder.close(); } builder = new SortRecordBatchBuilder(oAllocator); for
(BatchGroup group : batchGroups) { RecordBatchData rbd = new RecordBatchData(group.getContainer(), oAllocator); rbd.setSv2(group.getSv2()); builder.add(rbd); } builder.build(context, container); sv4 = builder.getSv4(); mSorter = createNewMSorter(); mSorter.setup(context, oAllocator, getSelectionVector4(), this.container); // For testing memory-leak purpose, inject exception after mSorter finishes setup injector.injectUnchecked(context.getExecutionControls(), INTERRUPTION_AFTER_SETUP); mSorter.sort(this.container); // sort may have prematurely exited due to shouldContinue() returning false. if (!context.shouldContinue()) { return IterOutcome.STOP; } // For testing memory-leak purpose, inject exception after mSorter finishes sorting injector.injectUnchecked(context.getExecutionControls(), INTERRUPTION_AFTER_SORT); sv4 = mSorter.getSV4(); container.buildSchema(SelectionVectorMode.FOUR_BYTE); } else { // some batches were spilled final BatchGroup merged = mergeAndSpill(batchGroups); if (merged != null) { spilledBatchGroups.add(merged); } batchGroups.addAll(spilledBatchGroups); spilledBatchGroups = null; // no need to clean up spilledBatchGroups, all its batches are in batchGroups now logger.warn( "Starting to merge. {} batch groups. Current allocated memory: {}", batchGroups.size(), oAllocator.getAllocatedMemory()); VectorContainer hyperBatch = constructHyperBatch(batchGroups); createCopier(hyperBatch, batchGroups, container, false); int estimatedRecordSize = 0; for (VectorWrapper<?> w : batchGroups.get(0)) { try { estimatedRecordSize += TypeHelper.getSize(w.getField().getType()); } catch (UnsupportedOperationException e) { estimatedRecordSize += 50; } } targetRecordCount = Math.min(MAX_BATCH_SIZE, Math.max(1, COPIER_BATCH_MEM_LIMIT / estimatedRecordSize)); int count = copier.next(targetRecordCount); container.buildSchema(SelectionVectorMode.NONE); container.setRecordCount(count); } return IterOutcome.OK_NEW_SCHEMA; } catch (SchemaChangeException ex) { kill(false); context.fail( UserException.unsupportedError(ex) .message("Sort doesn't currently support sorts with changing schemas") .build(logger)); return IterOutcome.STOP; } catch (ClassTransformationException | IOException ex) { kill(false); context.fail(ex); return IterOutcome.STOP; } catch (UnsupportedOperationException e) { throw new RuntimeException(e); } } private boolean hasMemoryForInMemorySort(int currentRecordCount) { long currentlyAvailable = popConfig.getMaxAllocation() - oAllocator.getAllocatedMemory(); long neededForInMemorySort = SortRecordBatchBuilder.memoryNeeded(currentRecordCount) + MSortTemplate.memoryNeeded(currentRecordCount); return currentlyAvailable > neededForInMemorySort; } public BatchGroup mergeAndSpill(LinkedList<BatchGroup> batchGroups) throws SchemaChangeException { logger.debug("Copier allocator current allocation {}", copierAllocator.getAllocatedMemory()); logger.debug( "mergeAndSpill: starting total size in memory = {}", oAllocator.getAllocatedMemory()); VectorContainer outputContainer = new VectorContainer(); List<BatchGroup> batchGroupList = Lists.newArrayList(); int batchCount = batchGroups.size(); for (int i = 0; i < batchCount / 2; i++) { if (batchGroups.size() == 0) { break; } BatchGroup batch = batchGroups.pollLast(); assert batch != null : "Encountered a null batch during merge and spill operation"; batchGroupList.add(batch); } if (batchGroupList.size() == 0) { return null; } int estimatedRecordSize = 0; for (VectorWrapper<?> w : batchGroupList.get(0)) { try { estimatedRecordSize +=
TypeHelper.getSize(w.getField().getType()); } catch (UnsupportedOperationException e) { estimatedRecordSize += 50; } } int targetRecordCount = Math.max(1, COPIER_BATCH_MEM_LIMIT / estimatedRecordSize); VectorContainer hyperBatch = constructHyperBatch(batchGroupList); createCopier(hyperBatch, batchGroupList, outputContainer, true); int count = copier.next(targetRecordCount); assert count > 0; logger.debug( "mergeAndSpill: estimated record size = {}, target record count = {}", estimatedRecordSize, targetRecordCount); // 1 output container is kept in memory, so we want to hold on to it and transferClone // allows keeping ownership VectorContainer c1 = VectorContainer.getTransferClone(outputContainer, oContext); c1.buildSchema(BatchSchema.SelectionVectorMode.NONE); c1.setRecordCount(count); String spillDir = dirs.next(); Path currSpillPath = new Path(Joiner.on("/").join(spillDir, fileName)); currSpillDirs.add(currSpillPath); String outputFile = Joiner.on("/").join(currSpillPath, spillCount++); try { fs.deleteOnExit(currSpillPath); } catch (IOException e) { // since this is meant to be used during a batch's spilling, we don't propagate the exception logger.warn("Unable to mark spill directory " + currSpillPath + " for deletion on exit", e); } stats.setLongStat(Metric.SPILL_COUNT, spillCount); BatchGroup newGroup = new BatchGroup(c1, fs, outputFile, oContext); try (AutoCloseable a = AutoCloseables.all(batchGroupList)) { logger.info("Merging and spilling to {}", outputFile); while ((count = copier.next(targetRecordCount)) > 0) { outputContainer.buildSchema(BatchSchema.SelectionVectorMode.NONE); outputContainer.setRecordCount(count); // note that addBatch also clears the outputContainer newGroup.addBatch(outputContainer); } injector.injectChecked( context.getExecutionControls(), INTERRUPTION_WHILE_SPILLING, IOException.class); newGroup.closeOutputStream(); } catch (Throwable e) { // we only need to clean up newGroup if the spill failed try { AutoCloseables.close(e, newGroup); } catch (Throwable t) { /* close() may hit the same IO issue; just ignore */ } throw UserException.resourceError(e) .message("External Sort encountered an error while spilling to disk") .addContext(e.getMessage() /* more detail */) .build(logger); } finally { hyperBatch.clear(); } logger.debug("mergeAndSpill: final total size in memory = {}", oAllocator.getAllocatedMemory()); logger.info("Completed spilling to {}", outputFile); return newGroup; } private SelectionVector2 newSV2() throws OutOfMemoryException, InterruptedException { SelectionVector2 sv2 = new SelectionVector2(oAllocator); if (!sv2.allocateNewSafe(incoming.getRecordCount())) { try { final BatchGroup merged = mergeAndSpill(batchGroups); if (merged != null) { spilledBatchGroups.add(merged); } else { throw UserException.memoryError( "Unable to allocate sv2 for %d records, and not enough batchGroups to spill.", incoming.getRecordCount()) .addContext("batchGroups.size", batchGroups.size()) .addContext("spilledBatchGroups.size", spilledBatchGroups.size()) .addContext("allocated memory", oAllocator.getAllocatedMemory()) .addContext("allocator limit", oAllocator.getLimit()) .build(logger); } } catch (SchemaChangeException e) { throw new RuntimeException(e); } int waitTime = 1; while (true) { try { Thread.sleep(waitTime * 1000); } catch (final InterruptedException e) { if (!context.shouldContinue()) { throw e; } } waitTime *= 2; if (sv2.allocateNewSafe(incoming.getRecordCount())) { break; } if (waitTime >= 32) { throw new OutOfMemoryException("Unable to allocate sv2 buffer
after repeated attempts"); } } } for (int i = 0; i < incoming.getRecordCount(); i++) { sv2.setIndex(i, (char) i); } sv2.setRecordCount(incoming.getRecordCount()); return sv2; } private VectorContainer constructHyperBatch(List<BatchGroup> batchGroupList) { VectorContainer cont = new VectorContainer(); for (MaterializedField field : schema) { ValueVector[] vectors = new ValueVector[batchGroupList.size()]; int i = 0; for (BatchGroup group : batchGroupList) { vectors[i++] = group .getValueAccessorById( field.getValueClass(), group.getValueVectorId(SchemaPath.getSimplePath(field.getPath())).getFieldIds()) .getValueVector(); } cont.add(vectors); } cont.buildSchema(BatchSchema.SelectionVectorMode.FOUR_BYTE); return cont; } private MSorter createNewMSorter() throws ClassTransformationException, IOException, SchemaChangeException { return createNewMSorter( this.context, this.popConfig.getOrderings(), this, MAIN_MAPPING, LEFT_MAPPING, RIGHT_MAPPING); } private MSorter createNewMSorter( FragmentContext context, List<Ordering> orderings, VectorAccessible batch, MappingSet mainMapping, MappingSet leftMapping, MappingSet rightMapping) throws ClassTransformationException, IOException, SchemaChangeException { CodeGenerator<MSorter> cg = CodeGenerator.get( MSorter.TEMPLATE_DEFINITION, context.getFunctionRegistry(), context.getOptions()); ClassGenerator<MSorter> g = cg.getRoot(); g.setMappingSet(mainMapping); for (Ordering od : orderings) { // first, we rewrite the evaluation stack for each side of the comparison. ErrorCollector collector = new ErrorCollectorImpl(); final LogicalExpression expr = ExpressionTreeMaterializer.materialize( od.getExpr(), batch, collector, context.getFunctionRegistry()); if (collector.hasErrors()) { throw new SchemaChangeException( "Failure while materializing expression. " + collector.toErrorString()); } g.setMappingSet(leftMapping); HoldingContainer left = g.addExpr(expr, ClassGenerator.BlkCreateMode.FALSE); g.setMappingSet(rightMapping); HoldingContainer right = g.addExpr(expr, ClassGenerator.BlkCreateMode.FALSE); g.setMappingSet(mainMapping); // next we wrap the two comparison sides and add the expression block for the comparison. LogicalExpression fh = FunctionGenerationHelper.getOrderingComparator( od.nullsSortHigh(), left, right, context.getFunctionRegistry()); HoldingContainer out = g.addExpr(fh, ClassGenerator.BlkCreateMode.FALSE); JConditional jc = g.getEvalBlock()._if(out.getValue().ne(JExpr.lit(0))); if (od.getDirection() == Direction.ASCENDING) { jc._then()._return(out.getValue()); } else { jc._then()._return(out.getValue().minus()); } g.rotateBlock(); } g.rotateBlock(); g.getEvalBlock()._return(JExpr.lit(0)); return context.getImplementationClass(cg); } public SingleBatchSorter createNewSorter(FragmentContext context, VectorAccessible batch) throws ClassTransformationException, IOException, SchemaChangeException { CodeGenerator<SingleBatchSorter> cg = CodeGenerator.get( SingleBatchSorter.TEMPLATE_DEFINITION, context.getFunctionRegistry(), context.getOptions()); ClassGenerator<SingleBatchSorter> g = cg.getRoot(); generateComparisons(g, batch); return context.getImplementationClass(cg); } private void generateComparisons(ClassGenerator<?> g, VectorAccessible batch) throws SchemaChangeException { g.setMappingSet(MAIN_MAPPING); for (Ordering od : popConfig.getOrderings()) { // first, we rewrite the evaluation stack for each side of the comparison. 
ErrorCollector collector = new ErrorCollectorImpl(); final LogicalExpression expr = ExpressionTreeMaterializer.materialize( od.getExpr(), batch, collector, context.getFunctionRegistry()); if (collector.hasErrors()) { throw new SchemaChangeException( "Failure while materializing expression. " + collector.toErrorString()); } g.setMappingSet(LEFT_MAPPING); HoldingContainer left = g.addExpr(expr, ClassGenerator.BlkCreateMode.FALSE); g.setMappingSet(RIGHT_MAPPING); HoldingContainer right = g.addExpr(expr, ClassGenerator.BlkCreateMode.FALSE); g.setMappingSet(MAIN_MAPPING); // next we wrap the two comparison sides and add the expression block for the comparison. LogicalExpression fh = FunctionGenerationHelper.getOrderingComparator( od.nullsSortHigh(), left, right, context.getFunctionRegistry()); HoldingContainer out = g.addExpr(fh, ClassGenerator.BlkCreateMode.FALSE); JConditional jc = g.getEvalBlock()._if(out.getValue().ne(JExpr.lit(0))); if (od.getDirection() == Direction.ASCENDING) { jc._then()._return(out.getValue()); } else { jc._then()._return(out.getValue().minus()); } g.rotateBlock(); } g.rotateBlock(); g.getEvalBlock()._return(JExpr.lit(0)); } private void createCopier( VectorAccessible batch, List<BatchGroup> batchGroupList, VectorContainer outputContainer, boolean spilling) throws SchemaChangeException { try { if (copier == null) { CodeGenerator<PriorityQueueCopier> cg = CodeGenerator.get( PriorityQueueCopier.TEMPLATE_DEFINITION, context.getFunctionRegistry(), context.getOptions()); ClassGenerator<PriorityQueueCopier> g = cg.getRoot(); generateComparisons(g, batch); g.setMappingSet(COPIER_MAPPING_SET); CopyUtil.generateCopies(g, batch, true); g.setMappingSet(MAIN_MAPPING); copier = context.getImplementationClass(cg); } else { copier.close(); } BufferAllocator allocator = spilling ? copierAllocator : oAllocator; for (VectorWrapper<?> i : batch) { ValueVector v = TypeHelper.getNewVector(i.getField(), allocator); outputContainer.add(v); } copier.setup(context, allocator, batch, batchGroupList, outputContainer); } catch (ClassTransformationException | IOException e) { throw new RuntimeException(e); } } @Override public WritableBatch getWritableBatch() { throw new UnsupportedOperationException("A sort batch is not writable."); } @Override protected void killIncoming(boolean sendUpstream) { incoming.kill(sendUpstream); } }
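// ---------------------------------------------------------------------------------------------
// Illustrative sketch (not part of the operator above): how ExternalSortBatch sizes the batches
// produced by the PriorityQueueCopier. It mirrors the formula used in innerNext()/mergeAndSpill():
// the record width is estimated per column via TypeHelper.getSize(), falling back to 50 bytes for
// types without a fixed size, and the target record count is COPIER_BATCH_MEM_LIMIT divided by
// that estimate, clamped to [1, MAX_BATCH_SIZE]. MAX_BATCH_SIZE is assumed to be 64K records here
// purely for the example.
class CopierBatchSizingSketch {
  static final int COPIER_BATCH_MEM_LIMIT = 256 * 1024; // same limit as in ExternalSortBatch
  static final int MAX_BATCH_SIZE = 65536;               // assumed value for illustration

  static int targetRecordCount(int estimatedRecordSize) {
    return Math.min(MAX_BATCH_SIZE, Math.max(1, COPIER_BATCH_MEM_LIMIT / estimatedRecordSize));
  }

  public static void main(String[] args) {
    // A row estimated at 100 bytes yields 262144 / 100 = 2621 records per merged batch,
    // while a very wide 1 MB row is clamped up to at least 1 record per batch.
    System.out.println(targetRecordCount(100));      // 2621
    System.out.println(targetRecordCount(1 << 20));  // 1
  }
}
// ---------------------------------------------------------------------------------------------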
/** * Foreman manages all the fragments (local and remote) for a single query where this is the * driving/root node. * * <p>The flow is as follows: - Foreman is submitted as a runnable. - Runnable does query planning. * - state changes from PENDING to RUNNING - Runnable sends out starting fragments - Status listeners * are activated - The Runnable's run() completes, but the Foreman stays around - Foreman listens * for state change messages. - state change messages can drive the state to FAILED or CANCELED, in * which case messages are sent to running fragments to terminate - when all fragments complete, * state change messages drive the state to COMPLETED */ public class Foreman implements Runnable { private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(Foreman.class); private static final org.slf4j.Logger queryLogger = org.slf4j.LoggerFactory.getLogger("query.logger"); private static final ControlsInjector injector = ControlsInjectorFactory.getInjector(Foreman.class); private static final ObjectMapper MAPPER = new ObjectMapper(); private static final long RPC_WAIT_IN_MSECS_PER_FRAGMENT = 5000; private final QueryId queryId; private final RunQuery queryRequest; private final QueryContext queryContext; private final QueryManager queryManager; // handles lower-level details of query execution private final WorkerBee bee; // provides an interface to submit tasks private final DrillbitContext drillbitContext; private final UserClientConnection initiatingClient; // used to send responses private volatile QueryState state; private boolean resume = false; private volatile DistributedLease lease; // used to limit the number of concurrent queries private FragmentExecutor rootRunner; // root Fragment private final ExtendedLatch acceptExternalEvents = new ExtendedLatch(); // gates acceptance of external events private final StateListener stateListener = new StateListener(); // source of external events private final ResponseSendListener responseListener = new ResponseSendListener(); private final StateSwitch stateSwitch = new StateSwitch(); private final ForemanResult foremanResult = new ForemanResult(); private final ConnectionClosedListener closeListener = new ConnectionClosedListener(); private final ChannelFuture closeFuture; private String queryText; /** * Constructor. Sets up the Foreman, but does not initiate any execution. * * @param bee used to submit additional work * @param drillbitContext * @param connection * @param queryId the id for the query * @param queryRequest the query to execute */ public Foreman( final WorkerBee bee, final DrillbitContext drillbitContext, final UserClientConnection connection, final QueryId queryId, final RunQuery queryRequest) { this.bee = bee; this.queryId = queryId; this.queryRequest = queryRequest; this.drillbitContext = drillbitContext; initiatingClient = connection; this.closeFuture = initiatingClient.getChannel().closeFuture(); closeFuture.addListener(closeListener); queryContext = new QueryContext(connection.getSession(), drillbitContext); queryManager = new QueryManager( queryId, queryRequest, drillbitContext.getPersistentStoreProvider(), stateListener, this); // TODO reference escapes before ctor is complete via stateListener, this recordNewState(QueryState.PENDING); } private class ConnectionClosedListener implements GenericFutureListener<Future<Void>> { @Override public void operationComplete(Future<Void> future) throws Exception { cancel(); } } /** * Get the QueryContext created for the query.
* * @return the QueryContext */ public QueryContext getQueryContext() { return queryContext; } /** * Get the QueryManager created for the query. * * @return the QueryManager */ public QueryManager getQueryManager() { return queryManager; } /** * Cancel the query. Asynchronous -- it may take some time for all remote fragments to be * terminated. */ public void cancel() { // Note this can be called from outside of run() on another thread, or after run() completes stateListener.moveToState(QueryState.CANCELLATION_REQUESTED, null); } /** * Resume the query. Regardless of the current state, this method sends a resume signal to all * fragments. This method can be called multiple times. */ public void resume() { resume = true; // resume all pauses through query context queryContext.getExecutionControls().unpauseAll(); // resume all pauses through all fragment contexts queryManager.unpauseExecutingFragments(drillbitContext); } /** * Called by execution pool to do query setup, and kick off remote execution. * * <p>Note that completion of this function is not the end of the Foreman's role in the query's * lifecycle. */ @Override public void run() { // rename the thread we're using for debugging purposes final Thread currentThread = Thread.currentThread(); final String originalName = currentThread.getName(); currentThread.setName(QueryIdHelper.getQueryId(queryId) + ":foreman"); // track how long the query takes queryManager.markStartTime(); try { injector.injectChecked( queryContext.getExecutionControls(), "run-try-beginning", ForemanException.class); queryText = queryRequest.getPlan(); // convert a run query request into action switch (queryRequest.getType()) { case LOGICAL: parseAndRunLogicalPlan(queryRequest.getPlan()); break; case PHYSICAL: parseAndRunPhysicalPlan(queryRequest.getPlan()); break; case SQL: runSQL(queryRequest.getPlan()); break; default: throw new IllegalStateException(); } injector.injectChecked( queryContext.getExecutionControls(), "run-try-end", ForemanException.class); } catch (final OutOfMemoryException | OutOfMemoryRuntimeException e) { moveToState(QueryState.FAILED, UserException.memoryError(e).build(logger)); } catch (final ForemanException e) { moveToState(QueryState.FAILED, e); } catch (AssertionError | Exception ex) { moveToState( QueryState.FAILED, new ForemanException( "Unexpected exception during fragment initialization: " + ex.getMessage(), ex)); } catch (final OutOfMemoryError e) { if ("Direct buffer memory".equals(e.getMessage())) { moveToState( QueryState.FAILED, UserException.resourceError(e) .message("One or more nodes ran out of memory while executing the query.") .build(logger)); } else { /* * FragmentExecutors use a DrillbitStatusListener to watch out for the death of their query's Foreman. So, if we * die here, they should get notified about that, and cancel themselves; we don't have to attempt to notify * them, which might not work under these conditions. */ System.out.println("Node ran out of Heap memory, exiting."); e.printStackTrace(); System.out.flush(); System.exit(-1); } } finally { /* * Begin accepting external events. * * Doing this here in the finally clause will guarantee that it occurs. Otherwise, if there * is an exception anywhere during setup, it wouldn't occur, and any events that are generated * as a result of any partial setup that was done (such as the FragmentSubmitListener, * the ResponseSendListener, or an external call to cancel()), will hang the thread that makes the * event delivery call. 
* * If we do throw an exception during setup, and have already moved to QueryState.FAILED, we just need to * make sure that we can't make things any worse as those events are delivered, but allow * any necessary remaining cleanup to proceed. * * Note that cancellations cannot be simulated before this point, i.e. pauses can be injected, because Foreman * would wait on the cancelling thread to signal a resume and the cancelling thread would wait on the Foreman * to accept events. */ acceptExternalEvents.countDown(); // If we received the resume signal before fragments are setup, the first call does not // actually resume the // fragments. Since setup is done, all fragments must have been delivered to remote nodes. Now // we can resume. if (resume) { resume(); } injector.injectPause(queryContext.getExecutionControls(), "foreman-ready", logger); // restore the thread's original name currentThread.setName(originalName); } /* * Note that despite the run() completing, the Foreman continues to exist, and receives * events (indirectly, through the QueryManager's use of stateListener), about fragment * completions. It won't go away until everything is completed, failed, or cancelled. */ } private void releaseLease() { while (lease != null) { try { lease.close(); lease = null; } catch (final InterruptedException e) { // if we end up here, the while loop will try again } catch (final Exception e) { logger.warn("Failure while releasing lease.", e); break; } } } private void parseAndRunLogicalPlan(final String json) throws ExecutionSetupException { LogicalPlan logicalPlan; try { logicalPlan = drillbitContext.getPlanReader().readLogicalPlan(json); } catch (final IOException e) { throw new ForemanException("Failure parsing logical plan.", e); } if (logicalPlan.getProperties().resultMode == ResultMode.LOGICAL) { throw new ForemanException( "Failure running plan. You requested a result mode of LOGICAL and submitted a logical plan. 
In this case your output mode must be PHYSICAL or EXEC."); } log(logicalPlan); final PhysicalPlan physicalPlan = convert(logicalPlan); if (logicalPlan.getProperties().resultMode == ResultMode.PHYSICAL) { returnPhysical(physicalPlan); return; } log(physicalPlan); runPhysicalPlan(physicalPlan); } private void log(final LogicalPlan plan) { if (logger.isDebugEnabled()) { logger.debug("Logical {}", plan.unparse(queryContext.getConfig())); } } private void log(final PhysicalPlan plan) { if (logger.isDebugEnabled()) { try { final String planText = queryContext.getConfig().getMapper().writeValueAsString(plan); logger.debug("Physical {}", planText); } catch (final IOException e) { logger.warn("Error while attempting to log physical plan.", e); } } } private void returnPhysical(final PhysicalPlan plan) throws ExecutionSetupException { final String jsonPlan = plan.unparse(queryContext.getConfig().getMapper().writer()); runPhysicalPlan( DirectPlan.createDirectPlan(queryContext, new PhysicalFromLogicalExplain(jsonPlan))); } public static class PhysicalFromLogicalExplain { public final String json; public PhysicalFromLogicalExplain(final String json) { this.json = json; } } private void parseAndRunPhysicalPlan(final String json) throws ExecutionSetupException { try { final PhysicalPlan plan = drillbitContext.getPlanReader().readPhysicalPlan(json); runPhysicalPlan(plan); } catch (final IOException e) { throw new ForemanSetupException("Failure while parsing physical plan.", e); } } private void runPhysicalPlan(final PhysicalPlan plan) throws ExecutionSetupException { validatePlan(plan); setupSortMemoryAllocations(plan); acquireQuerySemaphore(plan); final QueryWorkUnit work = getQueryWorkUnit(plan); final List<PlanFragment> planFragments = work.getFragments(); final PlanFragment rootPlanFragment = work.getRootFragment(); assert queryId == rootPlanFragment.getHandle().getQueryId(); drillbitContext .getWorkBus() .addFragmentStatusListener(queryId, queryManager.getFragmentStatusListener()); drillbitContext .getClusterCoordinator() .addDrillbitStatusListener(queryManager.getDrillbitStatusListener()); logger.debug("Submitting fragments to run."); // set up the root fragment first so we'll have incoming buffers available. setupRootFragment(rootPlanFragment, work.getRootOperator()); setupNonRootFragments(planFragments); drillbitContext.getAllocator().resetFragmentLimits(); // TODO a global effect for this query?!? moveToState(QueryState.RUNNING, null); logger.debug("Fragments running."); } private static void validatePlan(final PhysicalPlan plan) throws ForemanSetupException { if (plan.getProperties().resultMode != ResultMode.EXEC) { throw new ForemanSetupException( String.format( "Failure running plan.
You requested a result mode of %s and a physical plan can only be output as EXEC", plan.getProperties().resultMode)); } } private void setupSortMemoryAllocations(final PhysicalPlan plan) { // look for external sorts final List<ExternalSort> sortList = new LinkedList<>(); for (final PhysicalOperator op : plan.getSortedOperators()) { if (op instanceof ExternalSort) { sortList.add((ExternalSort) op); } } // if there are any sorts, compute the maximum allocation, and set it on them if (sortList.size() > 0) { final OptionManager optionManager = queryContext.getOptions(); final long maxWidthPerNode = optionManager.getOption(ExecConstants.MAX_WIDTH_PER_NODE_KEY).num_val; long maxAllocPerNode = Math.min( DrillConfig.getMaxDirectMemory(), queryContext.getConfig().getLong(ExecConstants.TOP_LEVEL_MAX_ALLOC)); maxAllocPerNode = Math.min( maxAllocPerNode, optionManager.getOption(ExecConstants.MAX_QUERY_MEMORY_PER_NODE_KEY).num_val); final long maxSortAlloc = maxAllocPerNode / (sortList.size() * maxWidthPerNode); logger.debug("Max sort alloc: {}", maxSortAlloc); for (final ExternalSort externalSort : sortList) { externalSort.setMaxAllocation(maxSortAlloc); } } } /** * This limits the number of "small" and "large" queries that a Drill cluster will run * simultaneously, if queueing is enabled. If the query is unable to run, this will block until it * can. Beware that this is called under run(), and so will consume a Thread while it waits for * the required distributed semaphore. * * @param plan the query plan * @throws ForemanSetupException */ private void acquireQuerySemaphore(final PhysicalPlan plan) throws ForemanSetupException { final OptionManager optionManager = queryContext.getOptions(); final boolean queuingEnabled = optionManager.getOption(ExecConstants.ENABLE_QUEUE); if (queuingEnabled) { final long queueThreshold = optionManager.getOption(ExecConstants.QUEUE_THRESHOLD_SIZE); double totalCost = 0; for (final PhysicalOperator ops : plan.getSortedOperators()) { totalCost += ops.getCost(); } final long queueTimeout = optionManager.getOption(ExecConstants.QUEUE_TIMEOUT); final String queueName; try { @SuppressWarnings("resource") final ClusterCoordinator clusterCoordinator = drillbitContext.getClusterCoordinator(); final DistributedSemaphore distributedSemaphore; // get the appropriate semaphore if (totalCost > queueThreshold) { final int largeQueue = (int) optionManager.getOption(ExecConstants.LARGE_QUEUE_SIZE); distributedSemaphore = clusterCoordinator.getSemaphore("query.large", largeQueue); queueName = "large"; } else { final int smallQueue = (int) optionManager.getOption(ExecConstants.SMALL_QUEUE_SIZE); distributedSemaphore = clusterCoordinator.getSemaphore("query.small", smallQueue); queueName = "small"; } lease = distributedSemaphore.acquire(queueTimeout, TimeUnit.MILLISECONDS); } catch (final Exception e) { throw new ForemanSetupException("Unable to acquire slot for query.", e); } if (lease == null) { throw UserException.resourceError() .message( "Unable to acquire queue resources for query within timeout. 
Timeout for %s queue was set at %d seconds.", queueName, queueTimeout / 1000) .build(logger); } } } Exception getCurrentException() { return foremanResult.getException(); } private QueryWorkUnit getQueryWorkUnit(final PhysicalPlan plan) throws ExecutionSetupException { final PhysicalOperator rootOperator = plan.getSortedOperators(false).iterator().next(); final Fragment rootFragment = rootOperator.accept(MakeFragmentsVisitor.INSTANCE, null); final SimpleParallelizer parallelizer = new SimpleParallelizer(queryContext); final QueryWorkUnit queryWorkUnit = parallelizer.getFragments( queryContext.getOptions().getOptionList(), queryContext.getCurrentEndpoint(), queryId, queryContext.getActiveEndpoints(), drillbitContext.getPlanReader(), rootFragment, initiatingClient.getSession(), queryContext.getQueryContextInfo()); if (logger.isTraceEnabled()) { final StringBuilder sb = new StringBuilder(); sb.append("PlanFragments for query "); sb.append(queryId); sb.append('\n'); final List<PlanFragment> planFragments = queryWorkUnit.getFragments(); final int fragmentCount = planFragments.size(); int fragmentIndex = 0; for (final PlanFragment planFragment : planFragments) { final FragmentHandle fragmentHandle = planFragment.getHandle(); sb.append("PlanFragment("); sb.append(++fragmentIndex); sb.append('/'); sb.append(fragmentCount); sb.append(") major_fragment_id "); sb.append(fragmentHandle.getMajorFragmentId()); sb.append(" minor_fragment_id "); sb.append(fragmentHandle.getMinorFragmentId()); sb.append('\n'); final DrillbitEndpoint endpointAssignment = planFragment.getAssignment(); sb.append(" DrillbitEndpoint address "); sb.append(endpointAssignment.getAddress()); sb.append('\n'); String jsonString = "<<malformed JSON>>"; sb.append(" fragment_json: "); final ObjectMapper objectMapper = new ObjectMapper(); try { final Object json = objectMapper.readValue(planFragment.getFragmentJson(), Object.class); jsonString = objectMapper.defaultPrettyPrintingWriter().writeValueAsString(json); } catch (final Exception e) { // we've already set jsonString to a fallback value } sb.append(jsonString); logger.trace(sb.toString()); } } return queryWorkUnit; } /** * Manages the end-state processing for Foreman. * * <p>End-state processing is tricky, because even if a query appears to succeed, but we then * encounter a problem during cleanup, we still want to mark the query as failed. So we have to * construct the successful result we would send, and then clean up before we send that result, * possibly changing that result if we encounter a problem during cleanup. We only send the result * when there is nothing left to do, so it will account for any possible problems. * * <p>The idea here is to make close()ing the ForemanResult do the final cleanup and sending. * Closing the result must be the last thing that is done by Foreman. */ private class ForemanResult implements AutoCloseable { private QueryState resultState = null; private volatile Exception resultException = null; private boolean isClosed = false; /** * Set up the result for a COMPLETED or CANCELED state. * * <p>Note that before sending this result, we execute cleanup steps that could result in this * result still being changed to a FAILED state. 
* * @param queryState one of COMPLETED or CANCELED */ public void setCompleted(final QueryState queryState) { Preconditions.checkArgument( (queryState == QueryState.COMPLETED) || (queryState == QueryState.CANCELED)); Preconditions.checkState(!isClosed); Preconditions.checkState(resultState == null); resultState = queryState; } /** * Set up the result for a FAILED state. * * <p>Failures that occur during cleanup processing will be added as suppressed exceptions. * * @param exception the exception that led to the FAILED state */ public void setFailed(final Exception exception) { Preconditions.checkArgument(exception != null); Preconditions.checkState(!isClosed); Preconditions.checkState(resultState == null); resultState = QueryState.FAILED; resultException = exception; } /** * Ignore the current status and force the given failure as current status. NOTE: Used only for * testing purposes. Shouldn't be used in production. */ public void setForceFailure(final Exception exception) { Preconditions.checkArgument(exception != null); Preconditions.checkState(!isClosed); resultState = QueryState.FAILED; resultException = exception; } /** * Add an exception to the result. All exceptions after the first become suppressed exceptions * hanging off the first. * * @param exception the exception to add */ private void addException(final Exception exception) { Preconditions.checkNotNull(exception); if (resultException == null) { resultException = exception; } else { resultException.addSuppressed(exception); } } /** * Expose the current exception (if it exists). This is useful for secondary reporting to the * query profile. * * @return the current Foreman result exception or null. */ public Exception getException() { return resultException; } /** * Close the given resource, catching and adding any caught exceptions via {@link * #addException(Exception)}. If an exception is caught, it will change the result state to * FAILED, regardless of its current value. * * @param autoCloseable the resource to close */ private void suppressingClose(final AutoCloseable autoCloseable) { Preconditions.checkState(!isClosed); Preconditions.checkState(resultState != null); if (autoCloseable == null) { return; } try { autoCloseable.close(); } catch (final Exception e) { /* * Even if the query completed successfully, we'll still report failure if we have * problems cleaning up. */ resultState = QueryState.FAILED; addException(e); } } private void logQuerySummary() { try { LoggedQuery q = new LoggedQuery( QueryIdHelper.getQueryId(queryId), queryContext.getQueryContextInfo().getDefaultSchemaName(), queryText, new Date(queryContext.getQueryContextInfo().getQueryStartTime()), new Date(System.currentTimeMillis()), state, queryContext.getSession().getCredentials().getUserName()); queryLogger.info(MAPPER.writeValueAsString(q)); } catch (Exception e) { logger.error("Failure while recording query information to query log.", e); } } @Override public void close() { Preconditions.checkState(!isClosed); Preconditions.checkState(resultState != null); logger.info("foreman cleaning up."); injector.injectPause(queryContext.getExecutionControls(), "foreman-cleanup", logger); // remove the channel disconnected listener (doesn't throw) closeFuture.removeListener(closeListener); // log the query summary logQuerySummary(); // These are straightforward removals from maps, so they won't throw.
drillbitContext.getWorkBus().removeFragmentStatusListener(queryId); drillbitContext .getClusterCoordinator() .removeDrillbitStatusListener(queryManager.getDrillbitStatusListener()); suppressingClose(queryContext); /* * We do our best to write the latest state, but even that could fail. If it does, we can't write * the (possibly newly failing) state, so we continue on anyway. * * We only need to do this if the resultState differs from the last recorded state */ if (resultState != state) { suppressingClose( new AutoCloseable() { @Override public void close() throws Exception { recordNewState(resultState); } }); } /* * Construct the response based on the latest resultState. The builder shouldn't fail. */ final QueryResult.Builder resultBuilder = QueryResult.newBuilder().setQueryId(queryId).setQueryState(resultState); final UserException uex; if (resultException != null) { final boolean verbose = queryContext.getOptions().getOption(ExecConstants.ENABLE_VERBOSE_ERRORS_KEY).bool_val; uex = UserException.systemError(resultException) .addIdentity(queryContext.getCurrentEndpoint()) .build(logger); resultBuilder.addError(uex.getOrCreatePBError(verbose)); } else { uex = null; } // we store the final result here so we can capture any error/errorId in the profile for later // debugging. queryManager.writeFinalProfile(uex); /* * If sending the result fails, we don't really have any way to modify the result we tried to send; * it is possible it got sent but the result came from a later part of the code path. It is also * possible the connection has gone away, so this is irrelevant because there's nowhere to * send anything to. */ try { // send whatever result we ended up with initiatingClient.sendResult(responseListener, resultBuilder.build(), true); } catch (final Exception e) { addException(e); logger.warn("Exception sending result to client", resultException); } // Remove the Foreman from the running query list. bee.retireForeman(Foreman.this); try { releaseLease(); } finally { isClosed = true; } } } private static class StateEvent { final QueryState newState; final Exception exception; StateEvent(final QueryState newState, final Exception exception) { this.newState = newState; this.exception = exception; } } private class StateSwitch extends EventProcessor<StateEvent> { public void moveToState(final QueryState newState, final Exception exception) { sendEvent(new StateEvent(newState, exception)); } @Override protected void processEvent(final StateEvent event) { final QueryState newState = event.newState; final Exception exception = event.exception; // TODO Auto-generated method stub logger.info("State change requested. {} --> {}", state, newState, exception); switch (state) { case PENDING: if (newState == QueryState.RUNNING) { recordNewState(QueryState.RUNNING); return; } // $FALL-THROUGH$ case RUNNING: { /* * For cases that cancel executing fragments, we have to record the new * state first, because the cancellation of the local root fragment will * cause this to be called recursively. */ switch (newState) { case CANCELLATION_REQUESTED: { assert exception == null; queryManager.markEndTime(); recordNewState(QueryState.CANCELLATION_REQUESTED); queryManager.cancelExecutingFragments(drillbitContext); foremanResult.setCompleted(QueryState.CANCELED); /* * We don't close the foremanResult until we've gotten * acknowledgements, which happens below in the case for current state * == CANCELLATION_REQUESTED. 
*/ return; } case COMPLETED: { assert exception == null; queryManager.markEndTime(); recordNewState(QueryState.COMPLETED); foremanResult.setCompleted(QueryState.COMPLETED); foremanResult.close(); return; } case FAILED: { assert exception != null; queryManager.markEndTime(); recordNewState(QueryState.FAILED); queryManager.cancelExecutingFragments(drillbitContext); foremanResult.setFailed(exception); foremanResult.close(); return; } default: throw new IllegalStateException("illegal transition from RUNNING to " + newState); } } case CANCELLATION_REQUESTED: if ((newState == QueryState.CANCELED) || (newState == QueryState.COMPLETED) || (newState == QueryState.FAILED)) { if (drillbitContext .getConfig() .getBoolean(ExecConstants.RETURN_ERROR_FOR_FAILURE_IN_CANCELLED_FRAGMENTS)) { if (newState == QueryState.FAILED) { assert exception != null; recordNewState(QueryState.FAILED); foremanResult.setForceFailure(exception); } } /* * These amount to a completion of the cancellation requests' cleanup; * now we can clean up and send the result. */ foremanResult.close(); } return; case CANCELED: case COMPLETED: case FAILED: logger.warn( "Dropping request to move to {} state as query is already at {} state (which is terminal).", newState, state); return; } throw new IllegalStateException( String.format( "Failure trying to change states: %s --> %s", state.name(), newState.name())); } } /** * Tells the foreman to move to a new state. * * @param newState the state to move to * @param exception if not null, the exception that drove this state transition (usually a * failure) */ private void moveToState(final QueryState newState, final Exception exception) { stateSwitch.moveToState(newState, exception); } private void recordNewState(final QueryState newState) { state = newState; queryManager.updateEphemeralState(newState); } private void runSQL(final String sql) throws ExecutionSetupException { final DrillSqlWorker sqlWorker = new DrillSqlWorker(queryContext); final Pointer<String> textPlan = new Pointer<>(); final PhysicalPlan plan = sqlWorker.getPlan(sql, textPlan); queryManager.setPlanText(textPlan.value); runPhysicalPlan(plan); } private PhysicalPlan convert(final LogicalPlan plan) throws OptimizerException { if (logger.isDebugEnabled()) { logger.debug("Converting logical plan {}.", plan.toJsonStringSafe(queryContext.getConfig())); } return new BasicOptimizer(queryContext, initiatingClient) .optimize(new BasicOptimizer.BasicOptimizationContext(queryContext), plan); } public QueryId getQueryId() { return queryId; } /** * Set up the root fragment (which will run locally), and submit it for execution. 
* * @param rootFragment * @param rootOperator * @throws ExecutionSetupException */ private void setupRootFragment(final PlanFragment rootFragment, final FragmentRoot rootOperator) throws ExecutionSetupException { @SuppressWarnings("resource") final FragmentContext rootContext = new FragmentContext( drillbitContext, rootFragment, queryContext, initiatingClient, drillbitContext.getFunctionImplementationRegistry()); @SuppressWarnings("resource") final IncomingBuffers buffers = new IncomingBuffers(rootFragment, rootContext); rootContext.setBuffers(buffers); queryManager.addFragmentStatusTracker(rootFragment, true); rootRunner = new FragmentExecutor( rootContext, rootFragment, queryManager.newRootStatusHandler(rootContext, drillbitContext), rootOperator); final RootFragmentManager fragmentManager = new RootFragmentManager(rootFragment.getHandle(), buffers, rootRunner); if (buffers.isDone()) { // if we don't have to wait for any incoming data, start the fragment runner. bee.addFragmentRunner(fragmentManager.getRunnable()); } else { // if we do, record the fragment manager in the workBus. // TODO aren't we managing our own work? What does this do? It looks like this will never get // run drillbitContext.getWorkBus().addFragmentManager(fragmentManager); } } /** * Set up the non-root fragments for execution. Some may be local, and some may be remote. * Messages are sent immediately, so they may start returning data even before we complete this. * * @param fragments the fragments * @throws ForemanException */ private void setupNonRootFragments(final Collection<PlanFragment> fragments) throws ForemanException { /* * We will send a single message to each endpoint, regardless of how many fragments will be * executed there. We need to start up the intermediate fragments first so that they will be * ready once the leaf fragments start producing data. To satisfy both of these, we will * make a pass through the fragments and put them into these two maps according to their * leaf/intermediate state, as well as their target drillbit. */ final Multimap<DrillbitEndpoint, PlanFragment> leafFragmentMap = ArrayListMultimap.create(); final Multimap<DrillbitEndpoint, PlanFragment> intFragmentMap = ArrayListMultimap.create(); // record all fragments for status purposes. for (final PlanFragment planFragment : fragments) { logger.trace( "Tracking intermediate remote node {} with data {}", planFragment.getAssignment(), planFragment.getFragmentJson()); queryManager.addFragmentStatusTracker(planFragment, false); if (planFragment.getLeafFragment()) { leafFragmentMap.put(planFragment.getAssignment(), planFragment); } else { intFragmentMap.put(planFragment.getAssignment(), planFragment); } } /* * We need to wait for the intermediates to be sent so that they'll be set up by the time * the leaves start producing data. We'll use this latch to wait for the responses. * * However, in order not to hang the process if any of the RPC requests fails, we always * count down (see FragmentSubmitFailures), but we count the number of failures so that we'll * know if any submissions did fail. 
*/ final int numIntFragments = intFragmentMap.keySet().size(); final ExtendedLatch endpointLatch = new ExtendedLatch(numIntFragments); final FragmentSubmitFailures fragmentSubmitFailures = new FragmentSubmitFailures(); // send remote intermediate fragments for (final DrillbitEndpoint ep : intFragmentMap.keySet()) { sendRemoteFragments(ep, intFragmentMap.get(ep), endpointLatch, fragmentSubmitFailures); } final long timeout = RPC_WAIT_IN_MSECS_PER_FRAGMENT * numIntFragments; if (numIntFragments > 0 && !endpointLatch.awaitUninterruptibly(timeout)) { long numberRemaining = endpointLatch.getCount(); throw UserException.connectionError() .message( "Exceeded timeout (%d) while waiting to send intermediate work fragments to remote nodes. " + "Sent %d and only heard response back from %d nodes.", timeout, numIntFragments, numIntFragments - numberRemaining) .build(logger); } // if any of the intermediate fragment submissions failed, fail the query final List<FragmentSubmitFailures.SubmissionException> submissionExceptions = fragmentSubmitFailures.submissionExceptions; if (submissionExceptions.size() > 0) { Set<DrillbitEndpoint> endpoints = Sets.newHashSet(); StringBuilder sb = new StringBuilder(); boolean first = true; for (FragmentSubmitFailures.SubmissionException e : fragmentSubmitFailures.submissionExceptions) { DrillbitEndpoint endpoint = e.drillbitEndpoint; if (endpoints.add(endpoint)) { if (first) { first = false; } else { sb.append(", "); } sb.append(endpoint.getAddress()); } } throw UserException.connectionError(submissionExceptions.get(0).rpcException) .message("Error setting up remote intermediate fragment execution") .addContext("Nodes with failures", sb.toString()) .build(logger); } injector.injectChecked( queryContext.getExecutionControls(), "send-fragments", ForemanException.class); /* * Send the remote (leaf) fragments; we don't wait for these. Any problems will come in through * the regular sendListener event delivery. */ for (final DrillbitEndpoint ep : leafFragmentMap.keySet()) { sendRemoteFragments(ep, leafFragmentMap.get(ep), null, null); } } /** * Send all the remote fragments belonging to a single target drillbit in one request. * * @param assignment the drillbit assigned to these fragments * @param fragments the set of fragments * @param latch the countdown latch used to track the requests to all endpoints * @param fragmentSubmitFailures the submission failure counter used to track the requests to all * endpoints */ private void sendRemoteFragments( final DrillbitEndpoint assignment, final Collection<PlanFragment> fragments, final CountDownLatch latch, final FragmentSubmitFailures fragmentSubmitFailures) { @SuppressWarnings("resource") final Controller controller = drillbitContext.getController(); final InitializeFragments.Builder fb = InitializeFragments.newBuilder(); for (final PlanFragment planFragment : fragments) { fb.addFragment(planFragment); } final InitializeFragments initFrags = fb.build(); logger.debug("Sending remote fragments to \nNode:\n{} \n\nData:\n{}", assignment, initFrags); final FragmentSubmitListener listener = new FragmentSubmitListener(assignment, initFrags, latch, fragmentSubmitFailures); controller.getTunnel(assignment).sendFragments(listener, initFrags); } public QueryState getState() { return state; } /** Used by {@link FragmentSubmitListener} to track the number of submission failures.
*/ private static class FragmentSubmitFailures { static class SubmissionException { final DrillbitEndpoint drillbitEndpoint; final RpcException rpcException; SubmissionException( @SuppressWarnings("unused") final DrillbitEndpoint drillbitEndpoint, final RpcException rpcException) { this.drillbitEndpoint = drillbitEndpoint; this.rpcException = rpcException; } } final List<SubmissionException> submissionExceptions = new LinkedList<>(); void addFailure(final DrillbitEndpoint drillbitEndpoint, final RpcException rpcException) { submissionExceptions.add(new SubmissionException(drillbitEndpoint, rpcException)); } } private class FragmentSubmitListener extends EndpointListener<Ack, InitializeFragments> { private final CountDownLatch latch; private final FragmentSubmitFailures fragmentSubmitFailures; /** * Constructor. * * @param endpoint the endpoint for the submission * @param value the initialize fragments message * @param latch the latch to count down when the status is known; may be null * @param fragmentSubmitFailures the counter to use for failures; must be non-null iff latch is * non-null */ public FragmentSubmitListener( final DrillbitEndpoint endpoint, final InitializeFragments value, final CountDownLatch latch, final FragmentSubmitFailures fragmentSubmitFailures) { super(endpoint, value); Preconditions.checkState((latch == null) == (fragmentSubmitFailures == null)); this.latch = latch; this.fragmentSubmitFailures = fragmentSubmitFailures; } @Override public void success(final Ack ack, final ByteBuf byteBuf) { if (latch != null) { latch.countDown(); } } @Override public void failed(final RpcException ex) { if (latch != null) { fragmentSubmitFailures.addFailure(endpoint, ex); latch.countDown(); } else { // since this won't be waited on, we can wait to deliver this event once the Foreman is // ready logger.debug("Failure while sending fragment. Stopping query.", ex); stateListener.moveToState(QueryState.FAILED, ex); } } @Override public void interrupted(final InterruptedException e) { // Foreman shouldn't get interrupted while waiting for the RPC outcome of fragment submission. // Consider the interrupt as failure. final String errMsg = "Interrupted while waiting for the RPC outcome of fragment submission."; logger.error(errMsg, e); failed(new RpcException(errMsg, e)); } } /** * Provides gated access to state transitions. * * <p>The StateListener waits on a latch before delivering state transitions to the Foreman. The * latch will be tripped when the Foreman is sufficiently set up that it can receive and process * external events from other threads. */ public class StateListener { /** * Move the Foreman to the specified new state. * * @param newState the state to move to * @param ex if moving to a failure state, the exception that led to the failure; used for * reporting to the user */ public void moveToState(final QueryState newState, final Exception ex) { acceptExternalEvents.awaitUninterruptibly(); Foreman.this.moveToState(newState, ex); } } /** Listens for the status of the RPC response sent to the user for the query. */ private class ResponseSendListener extends BaseRpcOutcomeListener<Ack> { @Override public void failed(final RpcException ex) { logger.info( "Failure while trying to communicate query result to initiating client. 
" + "This would happen if a client is disconnected before the response notice can be sent.", ex); stateListener.moveToState(QueryState.FAILED, ex); } @Override public void interrupted(final InterruptedException e) { logger.warn( "Interrupted while waiting for RPC outcome of sending final query result to initiating client."); stateListener.moveToState(QueryState.FAILED, e); } } }
/** * Responsible for running a single fragment on a single Drillbit. Listens/responds to status * requests and cancellation messages. */ public class FragmentExecutor implements Runnable { private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(FragmentExecutor.class); private static final ControlsInjector injector = ControlsInjectorFactory.getInjector(FragmentExecutor.class); private final AtomicBoolean hasCloseoutThread = new AtomicBoolean(false); private final String fragmentName; private final FragmentContext fragmentContext; private final FragmentStatusReporter statusReporter; private final DeferredException deferredException = new DeferredException(); private final PlanFragment fragment; private final FragmentRoot rootOperator; private final ReceiverExecutor receiverExecutor; private volatile RootExec root; private final AtomicReference<FragmentState> fragmentState = new AtomicReference<>(FragmentState.AWAITING_ALLOCATION); private final ExtendedLatch acceptExternalEvents = new ExtendedLatch(); // Thread that is currently executing the Fragment. Value is null if the fragment hasn't started // running yet or has already finished. private final AtomicReference<Thread> myThreadRef = new AtomicReference<>(null); /** * Create a FragmentExecutor where we need to parse and materialize the root operator. * * @param context * @param fragment * @param statusReporter */ public FragmentExecutor( final FragmentContext context, final PlanFragment fragment, final FragmentStatusReporter statusReporter) { this(context, fragment, statusReporter, null); } /** * Create a FragmentExecutor where we already have a root operator in memory. * * @param context * @param fragment * @param statusReporter * @param rootOperator */ public FragmentExecutor( final FragmentContext context, final PlanFragment fragment, final FragmentStatusReporter statusReporter, final FragmentRoot rootOperator) { this.fragmentContext = context; this.statusReporter = statusReporter; this.fragment = fragment; this.rootOperator = rootOperator; this.fragmentName = QueryIdHelper.getQueryIdentifier(context.getHandle()); this.receiverExecutor = new ReceiverExecutor(fragmentName, fragmentContext.getExecutor()); context.setExecutorState(new ExecutorStateImpl()); } @Override public String toString() { final StringBuilder builder = new StringBuilder(); builder.append("FragmentExecutor [fragmentContext="); builder.append(fragmentContext); builder.append(", fragmentState="); builder.append(fragmentState); builder.append("]"); return builder.toString(); } /** * Returns the current fragment status if the fragment is running. Otherwise, returns no status. * * @return FragmentStatus or null. */ public FragmentStatus getStatus() { /* * If the query is not in a running state, the operator tree is still being constructed and * there is no reason to poll for intermediate results. * * Previously the call to get the operator stats with the AbstractStatusReporter was happening * before this check. This caused a concurrent modification exception as the list of operator * stats is iterated over while collecting info, and added to while building the operator tree. */ if (fragmentState.get() != FragmentState.RUNNING) { return null; } return statusReporter.getStatus(FragmentState.RUNNING); } /** * Cancel the execution of this fragment if it is in an appropriate state. Messages come from external sources. * NOTE that this will be called from threads *other* than the one running this runnable(), so we * need to be careful about the state transitions that can result.
*/ public void cancel() { final boolean thisIsOnlyThread = hasCloseoutThread.compareAndSet(false, true); if (!thisIsOnlyThread) { acceptExternalEvents.awaitUninterruptibly(); /* * We set the cancel requested flag but the actual cancellation is managed by the run() loop, if called. */ updateState(FragmentState.CANCELLATION_REQUESTED); /* * Interrupt the thread so that it exits from any blocking operation it could be executing currently. We * synchronize here to ensure we don't accidentally create a race condition where we interrupt the close out * procedure of the main thread. */ synchronized (myThreadRef) { final Thread myThread = myThreadRef.get(); if (myThread != null) { logger.debug("Interrupting fragment thread {}", myThread.getName()); myThread.interrupt(); } } } else { // countdown so receiver fragment finished can proceed. acceptExternalEvents.countDown(); updateState(FragmentState.CANCELLATION_REQUESTED); cleanup(FragmentState.FINISHED); } } private void cleanup(FragmentState state) { closeOutResources(); updateState(state); // send the final state of the fragment. only the main execution thread can send the final state // and it can // only be sent once. sendFinalState(); } /** * Resume all the pauses within the current context. Note that this method will be called from * threads *other* than the one running this runnable(). Also, this method can be called multiple * times. */ public synchronized void unpause() { fragmentContext.getExecutionControls().unpauseAll(); } /** * Inform this fragment that one of its downstream partners no longer needs additional records. * This is most commonly called in the case that a limit query is executed. * * @param handle The downstream FragmentHandle of the Fragment that needs no more records from * this Fragment. */ public void receivingFragmentFinished(final FragmentHandle handle) { receiverExecutor.submitReceiverFinished(handle); } @Override public void run() { // if a cancel thread has already entered this executor, we have no reason to continue. if (!hasCloseoutThread.compareAndSet(false, true)) { return; } final Thread myThread = Thread.currentThread(); myThreadRef.set(myThread); final String originalThreadName = myThread.getName(); final FragmentHandle fragmentHandle = fragmentContext.getHandle(); final DrillbitContext drillbitContext = fragmentContext.getDrillbitContext(); final ClusterCoordinator clusterCoordinator = drillbitContext.getClusterCoordinator(); final DrillbitStatusListener drillbitStatusListener = new FragmentDrillbitStatusListener(); final String newThreadName = QueryIdHelper.getExecutorThreadName(fragmentHandle); try { myThread.setName(newThreadName); // if we didn't get the root operator when the executor was created, create it now. final FragmentRoot rootOperator = this.rootOperator != null ?
this.rootOperator : drillbitContext.getPlanReader().readFragmentOperator(fragment.getFragmentJson()); root = ImplCreator.getExec(fragmentContext, rootOperator); if (root == null) { return; } clusterCoordinator.addDrillbitStatusListener(drillbitStatusListener); updateState(FragmentState.RUNNING); acceptExternalEvents.countDown(); injector.injectPause(fragmentContext.getExecutionControls(), "fragment-running", logger); final DrillbitEndpoint endpoint = drillbitContext.getEndpoint(); logger.debug( "Starting fragment {}:{} on {}:{}", fragmentHandle.getMajorFragmentId(), fragmentHandle.getMinorFragmentId(), endpoint.getAddress(), endpoint.getUserPort()); final UserGroupInformation queryUserUgi = fragmentContext.isImpersonationEnabled() ? ImpersonationUtil.createProxyUgi(fragmentContext.getQueryUserName()) : ImpersonationUtil.getProcessUserUGI(); queryUserUgi.doAs( new PrivilegedExceptionAction<Void>() { public Void run() throws Exception { injector.injectChecked( fragmentContext.getExecutionControls(), "fragment-execution", IOException.class); /* * Run the query until root.next returns false OR we no longer need to continue. */ while (shouldContinue() && root.next()) { // loop } return null; } }); } catch (OutOfMemoryError | OutOfMemoryException e) { if (!(e instanceof OutOfMemoryError) || "Direct buffer memory".equals(e.getMessage())) { fail(UserException.memoryError(e).build(logger)); } else { // we have a heap out of memory error. The JVM is unstable, so exit. CatastrophicFailure.exit( e, "Unable to handle out of memory condition in FragmentExecutor.", -2); } } catch (AssertionError | Exception e) { fail(e); } finally { // no longer allow this thread to be interrupted. We synchronize here to make sure that cancel // can't set an // interruption after we have moved beyond this block. synchronized (myThreadRef) { myThreadRef.set(null); Thread.interrupted(); } // We need to be sure we countDown at least once. We'll do it here to guarantee that. acceptExternalEvents.countDown(); // here we could be in FAILED, RUNNING, or CANCELLATION_REQUESTED cleanup(FragmentState.FINISHED); clusterCoordinator.removeDrillbitStatusListener(drillbitStatusListener); myThread.setName(originalThreadName); } } /** * Utility method to check whether we are in a non-terminal state. * * @return Whether or not execution should continue. */ private boolean shouldContinue() { return !isCompleted() && FragmentState.CANCELLATION_REQUESTED != fragmentState.get(); } /** * Returns true if the fragment is in a terminal state. * * @return Whether this state is in a terminal state. */ private boolean isCompleted() { return isTerminal(fragmentState.get()); } private void sendFinalState() { final FragmentState outcome = fragmentState.get(); if (outcome == FragmentState.FAILED) { final FragmentHandle handle = getContext().getHandle(); final UserException uex = UserException.systemError(deferredException.getAndClear()) .addIdentity(getContext().getIdentity()) .addContext( "Fragment", handle.getMajorFragmentId() + ":" + handle.getMinorFragmentId()) .build(logger); statusReporter.fail(uex); } else { statusReporter.stateChanged(outcome); } } private void closeOutResources() { // first close the operators and release all memory. try { // Say executor was cancelled before setup. Now when executor actually runs, root is not // initialized, but this // method is called in finally. So root can be null. if (root != null) { root.close(); } } catch (final Exception e) { fail(e); } // then close the fragment context.
fragmentContext.close(); } private void warnStateChange(final FragmentState current, final FragmentState target) { logger.warn( fragmentName + ": Ignoring unexpected state transition {} --> {}", current.name(), target.name()); } private void errorStateChange(final FragmentState current, final FragmentState target) { final String msg = "%s: Invalid state transition %s --> %s"; throw new StateTransitionException( String.format(msg, fragmentName, current.name(), target.name())); } private synchronized boolean updateState(FragmentState target) { final FragmentState current = fragmentState.get(); logger.info(fragmentName + ": State change requested {} --> {}", current, target); switch (target) { case CANCELLATION_REQUESTED: switch (current) { case SENDING: case AWAITING_ALLOCATION: case RUNNING: fragmentState.set(target); statusReporter.stateChanged(target); return true; default: warnStateChange(current, target); return false; } case FINISHED: if (current == FragmentState.CANCELLATION_REQUESTED) { target = FragmentState.CANCELLED; } else if (current == FragmentState.FAILED) { target = FragmentState.FAILED; } // fall-through case FAILED: if (!isTerminal(current)) { fragmentState.set(target); // don't notify reporter until we finalize this terminal state. return true; } else if (current == FragmentState.FAILED) { // no warn since we can call fail multiple times. return false; } else if (current == FragmentState.CANCELLED && target == FragmentState.FAILED) { fragmentState.set(FragmentState.FAILED); return true; } else { warnStateChange(current, target); return false; } case RUNNING: if (current == FragmentState.AWAITING_ALLOCATION) { fragmentState.set(target); statusReporter.stateChanged(target); return true; } else { errorStateChange(current, target); } // these should never be requested. case CANCELLED: case SENDING: case AWAITING_ALLOCATION: default: errorStateChange(current, target); } // errorStateChange() throw should mean this is never executed throw new IllegalStateException(); } private boolean isTerminal(final FragmentState state) { return state == FragmentState.CANCELLED || state == FragmentState.FAILED || state == FragmentState.FINISHED; } /** * Capture an exception and store it. Update state to failed status (if not already there). * Does not immediately report status back to Foreman. Only the original thread can return status * to the Foreman. * * @param excep The failure that occurred.
*/ private void fail(final Throwable excep) { deferredException.addThrowable(excep); updateState(FragmentState.FAILED); } public FragmentContext getContext() { return fragmentContext; } private class ExecutorStateImpl implements ExecutorState { public boolean shouldContinue() { return FragmentExecutor.this.shouldContinue(); } public void fail(final Throwable t) { FragmentExecutor.this.fail(t); } public boolean isFailed() { return fragmentState.get() == FragmentState.FAILED; } public Throwable getFailureCause() { return deferredException.getException(); } } private class FragmentDrillbitStatusListener implements DrillbitStatusListener { @Override public void drillbitRegistered( final Set<CoordinationProtos.DrillbitEndpoint> registeredDrillbits) {} @Override public void drillbitUnregistered( final Set<CoordinationProtos.DrillbitEndpoint> unregisteredDrillbits) { // if the defunct Drillbit was running our Foreman, then cancel the query final DrillbitEndpoint foremanEndpoint = FragmentExecutor.this.fragmentContext.getForemanEndpoint(); if (unregisteredDrillbits.contains(foremanEndpoint)) { logger.warn( "Foreman {} no longer active. Cancelling fragment {}.", foremanEndpoint.getAddress(), QueryIdHelper.getQueryIdentifier(fragmentContext.getHandle())); FragmentExecutor.this.cancel(); } } } private class ReceiverExecutor extends SerializedExecutor { public ReceiverExecutor(String name, Executor underlyingExecutor) { super(name, underlyingExecutor); } @Override protected void runException(Runnable command, Throwable t) { logger.error("Failure while running command {}", command, t); } public void submitReceiverFinished(FragmentHandle handle) { execute(new ReceiverFinished(handle)); } } private class ReceiverFinished implements Runnable { final FragmentHandle handle; public ReceiverFinished(FragmentHandle handle) { super(); this.handle = handle; } @Override public void run() { acceptExternalEvents.awaitUninterruptibly(); if (root != null) { logger.info( "Applying request for early sender termination for {} -> {}.", QueryIdHelper.getFragmentId(getContext().getHandle()), QueryIdHelper.getFragmentId(handle)); root.receivingFragmentFinished(handle); } else { logger.warn( "Dropping request for early fragment termination for path {} -> {} as no root exec exists.", QueryIdHelper.getFragmentId(getContext().getHandle()), QueryIdHelper.getFragmentId(handle)); } } } }
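/*
 * Illustrative sketch (not part of the Drill sources above): run() and cancel() in FragmentExecutor
 * race on hasCloseoutThread.compareAndSet(false, true) so that exactly one thread ever owns the
 * close-out path; the losing thread either returns immediately (run()) or merely requests
 * cancellation and interrupts the owner (cancel()). A minimal stand-alone version of that
 * claim-once idiom is sketched below; the names CloseoutGuard, tryClaim, and closeIfFirst are
 * hypothetical and exist only for this example.
 */
class CloseoutGuard {
  private final java.util.concurrent.atomic.AtomicBoolean claimed =
      new java.util.concurrent.atomic.AtomicBoolean(false);

  /** Returns true for exactly one caller, no matter how many threads race here. */
  boolean tryClaim() {
    return claimed.compareAndSet(false, true);
  }

  /** Both the worker and a canceller may call this; only the first claimant runs the cleanup. */
  void closeIfFirst(Runnable cleanup) {
    if (tryClaim()) {
      cleanup.run(); // exactly one thread reaches this point
    }
    // losing callers return without cleaning up, mirroring how FragmentExecutor leaves
    // close-out to whichever thread claimed hasCloseoutThread first
  }
}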