@Override
public void buildSchema() throws SchemaChangeException {
  IterOutcome outcome = next(incoming);
  switch (outcome) {
    case OK:
    case OK_NEW_SCHEMA:
      for (VectorWrapper<?> w : incoming) {
        ValueVector v = container.addOrGet(w.getField());
        if (v instanceof AbstractContainerVector) {
          w.getValueVector().makeTransferPair(v); // Can we remove this hack?
          v.clear();
        }
        v.allocateNew(); // Can we remove this? - SVR fails with NPE (TODO)
      }
      container.buildSchema(SelectionVectorMode.NONE);
      container.setRecordCount(0);
      break;
    case STOP:
      state = BatchState.STOP;
      break;
    case OUT_OF_MEMORY:
      state = BatchState.OUT_OF_MEMORY;
      break;
    case NONE:
      state = BatchState.DONE;
      break;
    default:
      break;
  }
}
public void testCommon(String[] expectedResults, String physicalPlan, String resourceFile)
    throws Exception {
  try (RemoteServiceSet serviceSet = RemoteServiceSet.getLocalServiceSet();
      Drillbit bit = new Drillbit(CONFIG, serviceSet);
      DrillClient client = new DrillClient(CONFIG, serviceSet.getCoordinator())) {
    // Run the query.
    bit.run();
    client.connect();
    List<QueryDataBatch> results = client.runQuery(
        org.apache.drill.exec.proto.UserBitShared.QueryType.PHYSICAL,
        Files.toString(FileUtils.getResourceAsFile(physicalPlan), Charsets.UTF_8)
            .replace("#{TEST_FILE}", resourceFile));

    RecordBatchLoader batchLoader = new RecordBatchLoader(bit.getContext().getAllocator());
    QueryDataBatch batch = results.get(0);
    assertTrue(batchLoader.load(batch.getHeader().getDef(), batch.getData()));

    int i = 0;
    for (VectorWrapper<?> v : batchLoader) {
      ValueVector.Accessor accessor = v.getValueVector().getAccessor();
      System.out.println(accessor.getObject(0));
      assertEquals(expectedResults[i++], accessor.getObject(0).toString());
    }

    batchLoader.clear();
    for (QueryDataBatch b : results) {
      b.release();
    }
  }
}
/**
 * Invoked when we have a straight aggregate (no group-by expression) and our input is empty.
 * In this case we construct an outgoing batch with a record count of 1. For the nullable
 * vectors we don't set anything, since we want the output to be NULL. For the required vectors
 * (only produced by count()) we set the value to zero, since we don't zero out our buffers
 * while initially allocating them.
 */
private void constructSpecialBatch() {
  int exprIndex = 0;
  for (final VectorWrapper<?> vw : container) {
    final ValueVector vv = vw.getValueVector();
    AllocationHelper.allocateNew(vv, SPECIAL_BATCH_COUNT);
    vv.getMutator().setValueCount(SPECIAL_BATCH_COUNT);
    if (vv.getField().getType().getMode() == TypeProtos.DataMode.REQUIRED) {
      if (vv instanceof FixedWidthVector) {
        /*
         * The only case in which the aggregate should have a required vector is the count
         * function, whose output is always a FixedWidthVector (BigIntVector). Zero out the
         * vector.
         */
        ((FixedWidthVector) vv).zeroVector();
      } else {
        /*
         * Reaching this else block means we have a required vector of variable width. We
         * should never get here; raise an error, since we have set the record count to 1
         * without clearing the buffer.
         */
        throw new DrillRuntimeException(
            "A FixedWidthVector is the expected output vector type. Corresponding expression: "
                + popConfig.getExprs()[exprIndex].toString());
      }
    }
    exprIndex++;
  }
  container.setRecordCount(SPECIAL_BATCH_COUNT);
  recordCount = SPECIAL_BATCH_COUNT;
}
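// The rule constructSpecialBatch() enforces can be restated independently of Drill's vector
// classes: for the single synthetic row, nullable vectors stay unset (they read back as NULL),
// and a required vector is legal only for count(), which must be fixed-width and explicitly
// zeroed. A minimal sketch of that rule, using hypothetical stand-ins (Mode, SpecialBatchRule)
// rather than Drill's TypeProtos.DataMode and vector types:
enum Mode { REQUIRED, OPTIONAL }

final class SpecialBatchRule {
  /**
   * Returns true when the vector's single value must be explicitly zeroed,
   * false when it should be left unset so the row reads as NULL.
   */
  static boolean mustZero(Mode mode, boolean fixedWidth) {
    if (mode != Mode.REQUIRED) {
      return false; // nullable aggregate output: unset means NULL
    }
    if (fixedWidth) {
      return true; // count() output (BigInt): zero the freshly allocated buffer
    }
    // A required variable-width vector cannot appear in a no-group-by aggregate.
    throw new IllegalStateException("required variable-width vector in aggregate output");
  }
}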
private void createCopier(
    VectorAccessible batch,
    List<BatchGroup> batchGroupList,
    VectorContainer outputContainer,
    boolean spilling) throws SchemaChangeException {
  try {
    if (copier == null) {
      CodeGenerator<PriorityQueueCopier> cg = CodeGenerator.get(
          PriorityQueueCopier.TEMPLATE_DEFINITION,
          context.getFunctionRegistry(),
          context.getOptions());
      ClassGenerator<PriorityQueueCopier> g = cg.getRoot();
      generateComparisons(g, batch);
      g.setMappingSet(COPIER_MAPPING_SET);
      CopyUtil.generateCopies(g, batch, true);
      g.setMappingSet(MAIN_MAPPING);
      copier = context.getImplementationClass(cg);
    } else {
      copier.close();
    }

    BufferAllocator allocator = spilling ? copierAllocator : oAllocator;
    for (VectorWrapper<?> w : batch) {
      ValueVector v = TypeHelper.getNewVector(w.getField(), allocator);
      outputContainer.add(v);
    }
    copier.setup(context, allocator, batch, batchGroupList, outputContainer);
  } catch (ClassTransformationException | IOException e) {
    throw new RuntimeException(e);
  }
}
private VectorContainer getBatch() throws IOException {
  assert fs != null;
  assert path != null;
  if (inputStream == null) {
    inputStream = fs.open(path);
  }
  VectorAccessibleSerializable vas = new VectorAccessibleSerializable(allocator);
  Stopwatch watch = Stopwatch.createStarted();
  vas.readFromStream(inputStream);
  VectorContainer c = vas.get();
  if (schema != null) {
    c = SchemaUtil.coerceContainer(c, schema, context);
  }
  // logger.debug("Took {} us to read {} records",
  //     watch.elapsed(TimeUnit.MICROSECONDS), c.getRecordCount());
  spilledBatches--;
  currentContainer.zeroVectors();
  Iterator<VectorWrapper<?>> wrapperIterator = c.iterator();
  for (VectorWrapper<?> w : currentContainer) {
    TransferPair pair =
        wrapperIterator.next().getValueVector().makeTransferPair(w.getValueVector());
    pair.transfer();
  }
  currentContainer.setRecordCount(c.getRecordCount());
  c.zeroVectors();
  return c;
}
@Override
public void removeAllFields() {
  for (VectorWrapper<?> vw : container) {
    vw.clear();
  }
  container.clear();
  fieldVectorMap.clear();
}
/**
 * Called when the first batch comes in. Incoming batches are collected until a threshold is
 * met. At that point, the records in the batches are sorted and sampled, and the sampled
 * records are stored in the distributed cache. Once a sufficient fraction of the fragments
 * have shared their samples, each fragment grabs all the samples, sorts all the records,
 * builds a partition table, and attempts to push the partition table to the distributed
 * cache. Whichever table gets pushed first becomes the table used by all fragments for
 * partitioning.
 *
 * @return true if successful, false if failed
 */
private boolean getPartitionVectors() {
  try {
    if (!saveSamples()) {
      return false;
    }

    CachedVectorContainer finalTable = null;
    long val = minorFragmentSampleCount.incrementAndGet();
    logger.debug("Incremented mfsc, got {}", val);

    final long fragmentsBeforeProceed =
        (long) Math.ceil(sendingMajorFragmentWidth * completionFactor);
    final String finalTableKey = mapKey + "final";

    if (val == fragmentsBeforeProceed) {
      // We crossed the barrier; build the table and get the data.
      buildTable();
      finalTable = tableMap.get(finalTableKey);
    } else {
      // Wait until a sufficient number of fragments have submitted samples, or proceed
      // after xx ms have passed.
      // TODO: this should be polling.
      if (val < fragmentsBeforeProceed) {
        Thread.sleep(10);
      }
      for (int i = 0; i < 100 && finalTable == null; i++) {
        finalTable = tableMap.get(finalTableKey);
        if (finalTable != null) {
          break;
        }
        Thread.sleep(10);
      }
      if (finalTable == null) {
        buildTable();
      }
      finalTable = tableMap.get(finalTableKey);
    }

    Preconditions.checkState(finalTable != null);

    // Extract the vectors from the wrapper and add them to the partition vectors. These
    // vectors will be used for partitioning in the rest of this operator.
    for (VectorWrapper<?> w : finalTable.get()) {
      partitionVectors.add(w.getValueVector());
    }
  } catch (ClassTransformationException | IOException | SchemaChangeException
      | InterruptedException ex) {
    kill(false);
    logger.error("Failure while building final partition table.", ex);
    context.fail(ex);
    return false;
  }
  return true;
}
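// The barrier in getPartitionVectors() is just an atomic counter compared against a
// completion threshold. A minimal standalone sketch of the same arithmetic (not Drill code;
// the class name and the example values in the comment are assumptions for illustration):
import java.util.concurrent.atomic.AtomicLong;

final class SampleBarrier {
  private final AtomicLong sampleCount = new AtomicLong();
  private final long fragmentsBeforeProceed;

  SampleBarrier(int sendingMajorFragmentWidth, double completionFactor) {
    // e.g. width = 10 fragments, completionFactor = 0.75 -> proceed once 8 have reported
    this.fragmentsBeforeProceed =
        (long) Math.ceil(sendingMajorFragmentWidth * completionFactor);
  }

  /** Returns true for exactly one caller: the one that crosses the barrier. */
  boolean crossesBarrier() {
    return sampleCount.incrementAndGet() == fragmentsBeforeProceed;
  }
}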
/**
 * Sets up a projection that will transfer all of the columns in the batch, and also populates
 * the partition column based on which partition a record falls into in the partition table.
 *
 * @param batch the incoming batch to project
 * @throws SchemaChangeException if an ordering expression cannot be materialized
 */
protected void setupNewSchema(VectorAccessible batch) throws SchemaChangeException {
  container.clear();
  final ErrorCollector collector = new ErrorCollectorImpl();
  final List<TransferPair> transfers = Lists.newArrayList();

  final ClassGenerator<OrderedPartitionProjector> cg = CodeGenerator.getRoot(
      OrderedPartitionProjector.TEMPLATE_DEFINITION, context.getFunctionRegistry());

  for (VectorWrapper<?> vw : batch) {
    TransferPair tp = vw.getValueVector().getTransferPair();
    transfers.add(tp);
    container.add(tp.getTo());
  }

  cg.setMappingSet(mainMapping);

  int count = 0;
  for (Ordering od : popConfig.getOrderings()) {
    final LogicalExpression expr = ExpressionTreeMaterializer.materialize(
        od.getExpr(), batch, collector, context.getFunctionRegistry());
    if (collector.hasErrors()) {
      throw new SchemaChangeException(
          "Failure while materializing expression. " + collector.toErrorString());
    }
    cg.setMappingSet(incomingMapping);
    ClassGenerator.HoldingContainer left = cg.addExpr(expr, false);
    cg.setMappingSet(partitionMapping);
    ClassGenerator.HoldingContainer right = cg.addExpr(
        new ValueVectorReadExpression(new TypedFieldId(expr.getMajorType(), count++)), false);
    cg.setMappingSet(mainMapping);

    LogicalExpression fh =
        FunctionGenerationHelper.getComparator(left, right, context.getFunctionRegistry());
    ClassGenerator.HoldingContainer out = cg.addExpr(fh, false);
    JConditional jc = cg.getEvalBlock()._if(out.getValue().ne(JExpr.lit(0)));

    if (od.getDirection() == Direction.ASCENDING) {
      jc._then()._return(out.getValue());
    } else {
      jc._then()._return(out.getValue().minus());
    }
  }

  cg.getEvalBlock()._return(JExpr.lit(0));

  container.add(this.partitionKeyVector);
  container.buildSchema(batch.getSchema().getSelectionVectorMode());

  try {
    this.projector = context.getImplementationClass(cg);
    projector.setup(
        context, batch, this, transfers, partitionVectors, partitions, popConfig.getRef());
  } catch (ClassTransformationException | IOException e) {
    throw new SchemaChangeException("Failure while attempting to load generated class", e);
  }
}
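// The evaluation block generated above amounts to a lexicographic comparator: compare each
// ordering key in turn, negate the result for descending keys, and return the first non-zero
// comparison. A minimal hand-written equivalent of what the generated code does, assuming
// records reduce to arrays of Comparable keys (the class and field names here are
// illustrative, not from the source):
import java.util.Comparator;
import java.util.List;

final class OrderedComparator implements Comparator<Comparable[]> {
  private final List<Boolean> ascending; // one flag per ordering key

  OrderedComparator(List<Boolean> ascending) {
    this.ascending = ascending;
  }

  @Override
  @SuppressWarnings("unchecked")
  public int compare(Comparable[] left, Comparable[] right) {
    for (int i = 0; i < ascending.size(); i++) {
      int out = left[i].compareTo(right[i]);
      if (out != 0) {
        // Mirrors jc._then()._return(out) vs. jc._then()._return(out.minus()).
        return ascending.get(i) ? out : -out;
      }
    }
    return 0; // all keys equal
  }
}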
protected void doWork(VectorAccessible batch) {
  int recordCount = batch.getRecordCount();
  AllocationHelper.allocate(partitionKeyVector, recordCount, 50);
  projector.projectRecords(recordCount, 0);
  for (VectorWrapper<?> v : container) {
    ValueVector.Mutator m = v.getValueVector().getMutator();
    m.setValueCount(recordCount);
  }
}
@Override
public void copyRecords() {
  // logger.debug("Copying records.");
  final int recordCount = sv4.getCount();
  allocateVectors(recordCount);
  int outgoingPosition = 0;
  for (int svIndex = 0; svIndex < sv4.getCount(); svIndex++, outgoingPosition++) {
    int deRefIndex = sv4.get(svIndex);
    doEval(deRefIndex, outgoingPosition);
  }
  for (VectorWrapper<?> v : incoming) {
    v.clear();
  }
}
@Test
public void twoBitTwoExchange() throws Exception {
  RemoteServiceSet serviceSet = RemoteServiceSet.getLocalServiceSet();
  try (Drillbit bit1 = new Drillbit(CONFIG, serviceSet);
      Drillbit bit2 = new Drillbit(CONFIG, serviceSet);
      DrillClient client = new DrillClient(CONFIG, serviceSet.getCoordinator())) {
    bit1.run();
    bit2.run();
    client.connect();
    List<QueryResultBatch> results = client.runQuery(
        org.apache.drill.exec.proto.UserBitShared.QueryType.PHYSICAL,
        Files.toString(
            FileUtils.getResourceAsFile("/mergerecv/merging_receiver.json"), Charsets.UTF_8));
    int count = 0;
    RecordBatchLoader batchLoader = new RecordBatchLoader(client.getAllocator());

    // Print the results.
    for (QueryResultBatch b : results) {
      count += b.getHeader().getRowCount();
      for (int valueIdx = 0; valueIdx < b.getHeader().getRowCount(); valueIdx++) {
        List<Object> row = Lists.newArrayList();
        batchLoader.load(b.getHeader().getDef(), b.getData());
        for (VectorWrapper<?> vw : batchLoader) {
          row.add(
              vw.getValueVector().getField().toExpr()
                  + ":"
                  + vw.getValueVector().getAccessor().getObject(valueIdx));
        }
        for (Object cell : row) {
          if (cell == null) {
            System.out.print("<null> ");
            continue;
          }
          int len = cell.toString().length();
          System.out.print(cell + " ");
          for (int i = 0; i < (30 - len); ++i) {
            System.out.print(" ");
          }
        }
        System.out.println();
      }
      b.release();
      batchLoader.clear();
    }
    assertEquals(200, count);
  }
}
@Test
public void testMultipleProvidersMixedSizes() throws Exception {
  RemoteServiceSet serviceSet = RemoteServiceSet.getLocalServiceSet();
  try (Drillbit bit1 = new Drillbit(CONFIG, serviceSet);
      Drillbit bit2 = new Drillbit(CONFIG, serviceSet);
      DrillClient client = new DrillClient(CONFIG, serviceSet.getCoordinator())) {
    bit1.run();
    bit2.run();
    client.connect();
    List<QueryResultBatch> results = client.runQuery(
        org.apache.drill.exec.proto.UserBitShared.QueryType.PHYSICAL,
        Files.toString(
            FileUtils.getResourceAsFile("/mergerecv/multiple_providers.json"), Charsets.UTF_8));
    int count = 0;
    RecordBatchLoader batchLoader = new RecordBatchLoader(client.getAllocator());

    // Print the results.
    Long lastBlueValue = null;
    for (QueryResultBatch b : results) {
      count += b.getHeader().getRowCount();
      for (int valueIdx = 0; valueIdx < b.getHeader().getRowCount(); valueIdx++) {
        List<Object> row = Lists.newArrayList();
        batchLoader.load(b.getHeader().getDef(), b.getData());
        for (VectorWrapper<?> vw : batchLoader) {
          row.add(
              vw.getValueVector().getField().toExpr()
                  + ":"
                  + vw.getValueVector().getAccessor().getObject(valueIdx));
          if (vw.getValueVector()
              .getField()
              .getAsSchemaPath()
              .getRootSegment()
              .getPath()
              .equals("blue")) {
            // Assert that the order is ascending.
            if (((Long) vw.getValueVector().getAccessor().getObject(valueIdx)).longValue() == 0) {
              continue; // ignore initial 0's from sort
            }
            if (lastBlueValue != null) {
              assertTrue(
                  ((Long) vw.getValueVector().getAccessor().getObject(valueIdx)).longValue()
                      >= lastBlueValue.longValue());
            }
            lastBlueValue = (Long) vw.getValueVector().getAccessor().getObject(valueIdx);
          }
        }
        for (Object cell : row) {
          int len = cell.toString().length();
          System.out.print(cell + " ");
          for (int i = 0; i < (30 - len); ++i) {
            System.out.print(" ");
          }
        }
        System.out.println();
      }
      b.release();
      batchLoader.clear();
    }
    assertEquals(400, count);
  }
}
@Test
@Ignore
public void testParseParquetPhysicalPlan() throws Exception {
  RemoteServiceSet serviceSet = RemoteServiceSet.getLocalServiceSet();
  DrillConfig config = DrillConfig.create();
  try (Drillbit bit1 = new Drillbit(config, serviceSet);
      DrillClient client = new DrillClient(config, serviceSet.getCoordinator())) {
    bit1.run();
    client.connect();
    List<QueryDataBatch> results = client.runQuery(
        org.apache.drill.exec.proto.UserBitShared.QueryType.PHYSICAL,
        Resources.toString(Resources.getResource(fileName), Charsets.UTF_8));
    RecordBatchLoader loader = new RecordBatchLoader(bit1.getContext().getAllocator());
    int count = 0;
    for (QueryDataBatch b : results) {
      System.out.println(String.format("Got %d results", b.getHeader().getRowCount()));
      count += b.getHeader().getRowCount();
      loader.load(b.getHeader().getDef(), b.getData());
      for (VectorWrapper<?> vw : loader) {
        System.out.print(vw.getValueVector().getField().toExpr() + ": ");
        ValueVector vv = vw.getValueVector();
        for (int i = 0; i < vv.getAccessor().getValueCount(); i++) {
          Object o = vv.getAccessor().getObject(i);
          if (o instanceof byte[]) {
            System.out.print(" [" + new String((byte[]) o) + "]");
          } else {
            System.out.print(" [" + vv.getAccessor().getObject(i) + "]");
          }
          // break;
        }
        System.out.println();
      }
      loader.clear();
      b.release();
    }
    client.close();
    System.out.println(String.format("Got %d total results", count));
  }
}
@Override
public void buildSchema() throws SchemaChangeException {
  IterOutcome outcome = next(incoming);
  switch (outcome) {
    case NONE:
      state = BatchState.DONE;
      container.buildSchema(SelectionVectorMode.NONE);
      return;
    case OUT_OF_MEMORY:
      state = BatchState.OUT_OF_MEMORY;
      return;
    case STOP:
      state = BatchState.STOP;
      return;
    default:
      break;
  }
  if (!createAggregator()) {
    state = BatchState.DONE;
  }
  for (final VectorWrapper<?> w : container) {
    w.getValueVector().allocateNew();
  }
}
@Override
public void dataArrived(QueryDataBatch result, ConnectionThrottle throttle) {
  try {
    final int rows = result.getHeader().getRowCount();
    if (result.hasData()) {
      RecordBatchLoader loader = null;
      try {
        loader = new RecordBatchLoader(allocator);
        loader.load(result.getHeader().getDef(), result.getData());
        // TODO: Clean: DRILL-2933: That load(...) no longer throws
        // SchemaChangeException, so check/clean the catch clause below.
        for (int i = 0; i < loader.getSchema().getFieldCount(); ++i) {
          columns.add(loader.getSchema().getColumn(i).getPath());
        }
        for (int i = 0; i < rows; ++i) {
          final Map<String, String> record = Maps.newHashMap();
          for (VectorWrapper<?> vw : loader) {
            final String field = vw.getValueVector().getMetadata().getNamePart().getName();
            final ValueVector.Accessor accessor = vw.getValueVector().getAccessor();
            final Object value = i < accessor.getValueCount() ? accessor.getObject(i) : null;
            final String display = value == null ? null : value.toString();
            record.put(field, display);
          }
          results.add(record);
        }
      } finally {
        if (loader != null) {
          loader.clear();
        }
      }
    }
  } catch (SchemaChangeException e) {
    throw new RuntimeException(e);
  } finally {
    result.release();
  }
}
private IterOutcome doWork()
    throws ClassTransformationException, IOException, SchemaChangeException {
  if (allocationVectors != null) {
    for (ValueVector v : allocationVectors) {
      v.clear();
    }
  }
  allocationVectors = Lists.newArrayList();
  transfers.clear();

  final ClassGenerator<UnionAller> cg =
      CodeGenerator.getRoot(UnionAller.TEMPLATE_DEFINITION, context.getFunctionRegistry());
  int index = 0;
  for (VectorWrapper<?> vw : current) {
    ValueVector vvIn = vw.getValueVector();
    // Get the original input column name.
    SchemaPath inputPath = vvIn.getField().getPath();
    // Get the renamed column name.
    SchemaPath outputPath = outputFields.get(index).getPath();

    final ErrorCollector collector = new ErrorCollectorImpl();
    // Based on the input column names, minor types, and data modes, decide whether to
    // transfer directly, copy to rename the column, or cast the data type (minor type or
    // data mode).
    if (hasSameTypeAndMode(outputFields.get(index), vw.getValueVector().getField())) {
      // Transfer the column.
      if (outputFields.get(index).getPath().equals(inputPath)) {
        final LogicalExpression expr = ExpressionTreeMaterializer.materialize(
            inputPath, current, collector, context.getFunctionRegistry());
        if (collector.hasErrors()) {
          throw new SchemaChangeException(
              String.format(
                  "Failure while trying to materialize incoming schema. Errors:\n %s.",
                  collector.toErrorString()));
        }
        ValueVectorReadExpression vectorRead = (ValueVectorReadExpression) expr;
        ValueVector vvOut =
            container.addOrGet(MaterializedField.create(outputPath, vectorRead.getMajorType()));
        TransferPair tp = vvIn.makeTransferPair(vvOut);
        transfers.add(tp);
      } else {
        // Copy the data in order to rename the column.
        final LogicalExpression expr = ExpressionTreeMaterializer.materialize(
            inputPath, current, collector, context.getFunctionRegistry());
        if (collector.hasErrors()) {
          throw new SchemaChangeException(
              String.format(
                  "Failure while trying to materialize incoming schema. Errors:\n %s.",
                  collector.toErrorString()));
        }
        MaterializedField outputField = MaterializedField.create(outputPath, expr.getMajorType());
        ValueVector vv = container.addOrGet(outputField, callBack);
        allocationVectors.add(vv);
        TypedFieldId fid = container.getValueVectorId(outputField.getPath());
        ValueVectorWriteExpression write = new ValueVectorWriteExpression(fid, expr, true);
        cg.addExpr(write);
      }
    } else {
      // A cast is necessary.
      LogicalExpression expr = ExpressionTreeMaterializer.materialize(
          inputPath, current, collector, context.getFunctionRegistry());
      if (collector.hasErrors()) {
        throw new SchemaChangeException(
            String.format(
                "Failure while trying to materialize incoming schema. Errors:\n %s.",
                collector.toErrorString()));
      }

      // If the input's DataMode is required and the output's DataMode is not required,
      // cast to the mode with the least restriction.
      if (vvIn.getField().getType().getMode() == DataMode.REQUIRED
          && outputFields.get(index).getType().getMode() != DataMode.REQUIRED) {
        expr = ExpressionTreeMaterializer.convertToNullableType(
            expr,
            vvIn.getField().getType().getMinorType(),
            context.getFunctionRegistry(),
            collector);
        if (collector.hasErrors()) {
          throw new SchemaChangeException(
              String.format(
                  "Failure while trying to materialize incoming schema. Errors:\n %s.",
                  collector.toErrorString()));
        }
      }

      // If the two inputs' minor types differ, insert a cast before the union operation.
      if (vvIn.getField().getType().getMinorType()
          != outputFields.get(index).getType().getMinorType()) {
        expr = ExpressionTreeMaterializer.addCastExpression(
            expr, outputFields.get(index).getType(), context.getFunctionRegistry(), collector);
        if (collector.hasErrors()) {
          throw new SchemaChangeException(
              String.format(
                  "Failure while trying to materialize incoming schema. Errors:\n %s.",
                  collector.toErrorString()));
        }
      }

      final MaterializedField outputField =
          MaterializedField.create(outputPath, expr.getMajorType());
      ValueVector vector = container.addOrGet(outputField, callBack);
      allocationVectors.add(vector);
      TypedFieldId fid = container.getValueVectorId(outputField.getPath());

      boolean useSetSafe = !(vector instanceof FixedWidthVector);
      ValueVectorWriteExpression write = new ValueVectorWriteExpression(fid, expr, useSetSafe);
      cg.addExpr(write);
    }
    ++index;
  }

  unionall = context.getImplementationClass(cg.getCodeGenerator());
  unionall.setup(context, current, this, transfers);

  if (!schemaAvailable) {
    container.buildSchema(BatchSchema.SelectionVectorMode.NONE);
    schemaAvailable = true;
  }

  if (!doAlloc()) {
    return IterOutcome.OUT_OF_MEMORY;
  }

  recordCount = unionall.unionRecords(0, current.getRecordCount(), 0);
  setValueCount(recordCount);
  return IterOutcome.OK;
}
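// The per-column decision in doWork() reduces to a three-way dispatch keyed on whether the
// names match and whether the type and mode match. A minimal sketch of that dispatch (not
// Drill code; the class, enum, and parameter names are assumptions for illustration):
enum UnionAction { TRANSFER, COPY_RENAME, CAST }

final class UnionAllPlanner {
  /**
   * Decides how a single input column maps to the output schema. The parameters mirror
   * the fields doWork() consults on MaterializedField.
   */
  static UnionAction actionFor(
      String inName, String outName, boolean sameMinorType, boolean sameDataMode) {
    if (sameMinorType && sameDataMode) {
      // Same name: zero-copy transfer; different name: copy so the column is renamed.
      return inName.equals(outName) ? UnionAction.TRANSFER : UnionAction.COPY_RENAME;
    }
    // Differing minor type or data mode: insert a cast
    // (convert to nullable and/or cast to the output minor type).
    return UnionAction.CAST;
  }
}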
private void buildTable() throws SchemaChangeException, ClassTransformationException, IOException {
  // Get all the samples from the distributed map.
  SortRecordBatchBuilder containerBuilder =
      new SortRecordBatchBuilder(context.getAllocator(), MAX_SORT_BYTES);
  for (CachedVectorContainer w : mmap.get(mapKey)) {
    containerBuilder.add(w.get());
  }
  VectorContainer allSamplesContainer = new VectorContainer();
  containerBuilder.build(context, allSamplesContainer);

  List<Ordering> orderDefs = Lists.newArrayList();
  int i = 0;
  for (Ordering od : popConfig.getOrderings()) {
    SchemaPath sp = SchemaPath.getSimplePath("f" + i++);
    orderDefs.add(new Ordering(od.getDirection(), new FieldReference(sp)));
  }

  // Sort the incoming samples.
  SelectionVector4 newSv4 = containerBuilder.getSv4();
  Sorter sorter = SortBatch.createNewSorter(context, orderDefs, allSamplesContainer);
  sorter.setup(context, newSv4, allSamplesContainer);
  sorter.sort(newSv4, allSamplesContainer);

  // Copy every Nth record from the samples into a candidate partition table, where
  // N = totalSampledRecords / partitions. Attempt to push this to the distributed map.
  // Only the first candidate to get pushed will be used.
  VectorContainer candidatePartitionTable = new VectorContainer();
  SampleCopier copier = null;
  List<ValueVector> localAllocationVectors = Lists.newArrayList();
  copier = getCopier(
      newSv4, allSamplesContainer, candidatePartitionTable, orderDefs, localAllocationVectors);
  int allocationSize = 50;
  while (true) {
    for (ValueVector vv : localAllocationVectors) {
      AllocationHelper.allocate(vv, samplingFactor * partitions, allocationSize);
    }
    int skipRecords = containerBuilder.getSv4().getTotalCount() / partitions;
    if (copier.copyRecords(skipRecords, skipRecords, partitions - 1)) {
      assert copier.getOutputRecords() == partitions - 1
          : String.format(
              "output records: %d partitions: %d", copier.getOutputRecords(), partitions);
      for (VectorWrapper<?> vw : candidatePartitionTable) {
        vw.getValueVector().getMutator().setValueCount(copier.getOutputRecords());
      }
      break;
    } else {
      candidatePartitionTable.zeroVectors();
      allocationSize *= 2;
    }
  }
  candidatePartitionTable.setRecordCount(copier.getOutputRecords());

  WritableBatch batch = WritableBatch.getBatchNoHVWrap(
      candidatePartitionTable.getRecordCount(), candidatePartitionTable, false);
  CachedVectorContainer wrap =
      new CachedVectorContainer(batch, context.getDrillbitContext().getAllocator());
  tableMap.putIfAbsent(mapKey + "final", wrap, 1, TimeUnit.MINUTES);

  candidatePartitionTable.clear();
  allSamplesContainer.clear();
  containerBuilder.clear();
  wrap.clear();
}
@Override
public IterOutcome innerNext() {
  if (schema != null) {
    if (spillCount == 0) {
      return (getSelectionVector4().next()) ? IterOutcome.OK : IterOutcome.NONE;
    } else {
      Stopwatch w = Stopwatch.createStarted();
      int count = copier.next(targetRecordCount);
      if (count > 0) {
        long t = w.elapsed(TimeUnit.MICROSECONDS);
        logger.debug("Took {} us to merge {} records", t, count);
        container.setRecordCount(count);
        return IterOutcome.OK;
      } else {
        logger.debug("copier returned 0 records");
        return IterOutcome.NONE;
      }
    }
  }

  int totalCount = 0;
  int totalBatches = 0; // total number of batches received so far

  try {
    container.clear();
    outer:
    while (true) {
      IterOutcome upstream;
      if (first) {
        upstream = IterOutcome.OK_NEW_SCHEMA;
      } else {
        upstream = next(incoming);
      }
      if (upstream == IterOutcome.OK && sorter == null) {
        upstream = IterOutcome.OK_NEW_SCHEMA;
      }
      switch (upstream) {
        case NONE:
          if (first) {
            return upstream;
          }
          break outer;
        case NOT_YET:
          throw new UnsupportedOperationException();
        case STOP:
          return upstream;
        case OK_NEW_SCHEMA:
        case OK:
          VectorContainer convertedBatch;
          // Only act when the schema truly changes; artificial schema changes are ignored.
          if (upstream == IterOutcome.OK_NEW_SCHEMA && !incoming.getSchema().equals(schema)) {
            if (schema != null) {
              if (unionTypeEnabled) {
                this.schema = SchemaUtil.mergeSchemas(schema, incoming.getSchema());
              } else {
                throw new SchemaChangeException(
                    "Schema changes not supported in External Sort. Please enable Union type");
              }
            } else {
              schema = incoming.getSchema();
            }
            convertedBatch = SchemaUtil.coerceContainer(incoming, schema, oContext);
            for (BatchGroup b : batchGroups) {
              b.setSchema(schema);
            }
            for (BatchGroup b : spilledBatchGroups) {
              b.setSchema(schema);
            }
            this.sorter = createNewSorter(context, convertedBatch);
          } else {
            convertedBatch = SchemaUtil.coerceContainer(incoming, schema, oContext);
          }
          if (first) {
            first = false;
          }
          if (convertedBatch.getRecordCount() == 0) {
            for (VectorWrapper<?> w : convertedBatch) {
              w.clear();
            }
            break;
          }
          SelectionVector2 sv2;
          if (incoming.getSchema().getSelectionVectorMode()
              == BatchSchema.SelectionVectorMode.TWO_BYTE) {
            sv2 = incoming.getSelectionVector2().clone();
          } else {
            try {
              sv2 = newSV2();
            } catch (InterruptedException e) {
              return IterOutcome.STOP;
            } catch (OutOfMemoryException e) {
              throw new OutOfMemoryException(e);
            }
          }
          int count = sv2.getCount();
          totalCount += count;
          totalBatches++;
          sorter.setup(context, sv2, convertedBatch);
          sorter.sort(sv2);
          RecordBatchData rbd = new RecordBatchData(convertedBatch, oAllocator);
          boolean success = false;
          try {
            rbd.setSv2(sv2);
            batchGroups.add(new BatchGroup(rbd.getContainer(), rbd.getSv2(), oContext));
            if (peakNumBatches < batchGroups.size()) {
              peakNumBatches = batchGroups.size();
              stats.setLongStat(Metric.PEAK_BATCHES_IN_MEMORY, peakNumBatches);
            }
            batchesSinceLastSpill++;
            if (
                // If we haven't spilled so far, do we have enough memory for MSorter if
                // this turns out to be the last incoming batch?
                (spillCount == 0 && !hasMemoryForInMemorySort(totalCount))
                    // If we haven't spilled so far, make sure we don't exceed the maximum
                    // number of batches SV4 can address.
                    || (spillCount == 0 && totalBatches > Character.MAX_VALUE)
                    // TODO(DRILL-4438) - consider setting this threshold more intelligently;
                    // lowering it allowed a failing low-memory condition (test in
                    // BasicPhysicalOpUnitTest) to complete successfully (although it caused
                    // a perf decrease because there was more spilling).
                    // Current memory used is more than 95% of this operator's memory limit.
                    || (oAllocator.getAllocatedMemory() > .95 * oAllocator.getLimit())
                    // The number of incoming batches (BatchGroups) exceeds the limit, and
                    // the number of batches accumulated since the last spill exceeds the
                    // defined limit.
                    || (batchGroups.size() > SPILL_THRESHOLD
                        && batchesSinceLastSpill >= SPILL_BATCH_GROUP_SIZE)) {
              if (firstSpillBatchCount == 0) {
                firstSpillBatchCount = batchGroups.size();
              }
              if (spilledBatchGroups.size() > firstSpillBatchCount / 2) {
                logger.info("Merging spills");
                final BatchGroup merged = mergeAndSpill(spilledBatchGroups);
                if (merged != null) {
                  spilledBatchGroups.addFirst(merged);
                }
              }
              final BatchGroup merged = mergeAndSpill(batchGroups);
              if (merged != null) { // make sure we don't add null to spilledBatchGroups
                spilledBatchGroups.add(merged);
                batchesSinceLastSpill = 0;
              }
            }
            success = true;
          } finally {
            if (!success) {
              rbd.clear();
            }
          }
          break;
        case OUT_OF_MEMORY:
          logger.debug("received OUT_OF_MEMORY, trying to spill");
          if (batchesSinceLastSpill > 2) {
            final BatchGroup merged = mergeAndSpill(batchGroups);
            if (merged != null) {
              spilledBatchGroups.add(merged);
              batchesSinceLastSpill = 0;
            }
          } else {
            logger.debug("not enough batches to spill, sending OUT_OF_MEMORY downstream");
            return IterOutcome.OUT_OF_MEMORY;
          }
          break;
        default:
          throw new UnsupportedOperationException();
      }
    }

    if (totalCount == 0) {
      return IterOutcome.NONE;
    }
    if (spillCount == 0) {
      if (builder != null) {
        builder.clear();
        builder.close();
      }
      builder = new SortRecordBatchBuilder(oAllocator);

      for (BatchGroup group : batchGroups) {
        RecordBatchData rbd = new RecordBatchData(group.getContainer(), oAllocator);
        rbd.setSv2(group.getSv2());
        builder.add(rbd);
      }

      builder.build(context, container);
      sv4 = builder.getSv4();
      mSorter = createNewMSorter();
      mSorter.setup(context, oAllocator, getSelectionVector4(), this.container);

      // For testing a memory-leak scenario, inject an exception after mSorter finishes setup.
      injector.injectUnchecked(context.getExecutionControls(), INTERRUPTION_AFTER_SETUP);

      mSorter.sort(this.container);

      // The sort may have prematurely exited due to shouldContinue() returning false.
      if (!context.shouldContinue()) {
        return IterOutcome.STOP;
      }

      // For testing a memory-leak scenario, inject an exception after mSorter finishes sorting.
      injector.injectUnchecked(context.getExecutionControls(), INTERRUPTION_AFTER_SORT);
      sv4 = mSorter.getSV4();

      container.buildSchema(SelectionVectorMode.FOUR_BYTE);
    } else { // some batches were spilled
      final BatchGroup merged = mergeAndSpill(batchGroups);
      if (merged != null) {
        spilledBatchGroups.add(merged);
      }
      batchGroups.addAll(spilledBatchGroups);
      // No need to clean up spilledBatchGroups; all its batches are in batchGroups now.
      spilledBatchGroups = null;

      logger.warn(
          "Starting to merge. {} batch groups. Current allocated memory: {}",
          batchGroups.size(),
          oAllocator.getAllocatedMemory());
      VectorContainer hyperBatch = constructHyperBatch(batchGroups);
      createCopier(hyperBatch, batchGroups, container, false);

      int estimatedRecordSize = 0;
      for (VectorWrapper<?> w : batchGroups.get(0)) {
        try {
          estimatedRecordSize += TypeHelper.getSize(w.getField().getType());
        } catch (UnsupportedOperationException e) {
          estimatedRecordSize += 50;
        }
      }
      targetRecordCount =
          Math.min(MAX_BATCH_SIZE, Math.max(1, COPIER_BATCH_MEM_LIMIT / estimatedRecordSize));
      int count = copier.next(targetRecordCount);
      container.buildSchema(SelectionVectorMode.NONE);
      container.setRecordCount(count);
    }

    return IterOutcome.OK_NEW_SCHEMA;
  } catch (SchemaChangeException ex) {
    kill(false);
    context.fail(
        UserException.unsupportedError(ex)
            .message("Sort doesn't currently support sorts with changing schemas")
            .build(logger));
    return IterOutcome.STOP;
  } catch (ClassTransformationException | IOException ex) {
    kill(false);
    context.fail(ex);
    return IterOutcome.STOP;
  } catch (UnsupportedOperationException e) {
    throw new RuntimeException(e);
  }
}
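// The spill decision in innerNext() combines four independent triggers. A minimal standalone
// restatement of that predicate, with every threshold passed in by the caller (the class and
// parameter names here are illustrative; MAX_SV4_BATCHES corresponds to Character.MAX_VALUE
// in the original code):
final class SpillPolicy {
  /** Mirrors the spill condition in innerNext(): spill when any of the four triggers fires. */
  static boolean shouldSpill(
      int spillCount,
      boolean hasMemoryForInMemorySort,
      int totalBatches,
      long allocatedMemory,
      long memoryLimit,
      int batchGroupCount,
      int batchesSinceLastSpill,
      int spillThreshold,          // SPILL_THRESHOLD in the original
      int spillBatchGroupSize) {   // SPILL_BATCH_GROUP_SIZE in the original
    final int MAX_SV4_BATCHES = Character.MAX_VALUE;
    // Haven't spilled yet and the in-memory sort would not fit the final total...
    return (spillCount == 0 && !hasMemoryForInMemorySort)
        // ...or haven't spilled yet and SV4 could no longer address all batches...
        || (spillCount == 0 && totalBatches > MAX_SV4_BATCHES)
        // ...or the operator is within 5% of its memory limit...
        || (allocatedMemory > 0.95 * memoryLimit)
        // ...or both batch-count limits since the last spill have been exceeded.
        || (batchGroupCount > spillThreshold && batchesSinceLastSpill >= spillBatchGroupSize);
  }
}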
public BatchGroup mergeAndSpill(LinkedList<BatchGroup> batchGroups) throws SchemaChangeException {
  logger.debug("Copier allocator current allocation {}", copierAllocator.getAllocatedMemory());
  logger.debug(
      "mergeAndSpill: starting total size in memory = {}", oAllocator.getAllocatedMemory());
  VectorContainer outputContainer = new VectorContainer();
  List<BatchGroup> batchGroupList = Lists.newArrayList();
  int batchCount = batchGroups.size();
  for (int i = 0; i < batchCount / 2; i++) {
    if (batchGroups.size() == 0) {
      break;
    }
    BatchGroup batch = batchGroups.pollLast();
    assert batch != null : "Encountered a null batch during merge and spill operation";
    batchGroupList.add(batch);
  }

  if (batchGroupList.size() == 0) {
    return null;
  }

  int estimatedRecordSize = 0;
  for (VectorWrapper<?> w : batchGroupList.get(0)) {
    try {
      estimatedRecordSize += TypeHelper.getSize(w.getField().getType());
    } catch (UnsupportedOperationException e) {
      estimatedRecordSize += 50;
    }
  }
  int targetRecordCount = Math.max(1, COPIER_BATCH_MEM_LIMIT / estimatedRecordSize);
  VectorContainer hyperBatch = constructHyperBatch(batchGroupList);
  createCopier(hyperBatch, batchGroupList, outputContainer, true);

  int count = copier.next(targetRecordCount);
  assert count > 0;

  logger.debug(
      "mergeAndSpill: estimated record size = {}, target record count = {}",
      estimatedRecordSize,
      targetRecordCount);

  // One output container is kept in memory, so we want to hold on to it; transferClone
  // allows keeping ownership.
  VectorContainer c1 = VectorContainer.getTransferClone(outputContainer, oContext);
  c1.buildSchema(BatchSchema.SelectionVectorMode.NONE);
  c1.setRecordCount(count);

  String spillDir = dirs.next();
  Path currSpillPath = new Path(Joiner.on("/").join(spillDir, fileName));
  currSpillDirs.add(currSpillPath);
  String outputFile = Joiner.on("/").join(currSpillPath, spillCount++);
  try {
    fs.deleteOnExit(currSpillPath);
  } catch (IOException e) {
    // Since this is meant to be used during a batch's spilling, we don't propagate the
    // exception.
    logger.warn("Unable to mark spill directory " + currSpillPath + " for deleting on exit", e);
  }
  stats.setLongStat(Metric.SPILL_COUNT, spillCount);
  BatchGroup newGroup = new BatchGroup(c1, fs, outputFile, oContext);
  try (AutoCloseable a = AutoCloseables.all(batchGroupList)) {
    logger.info("Merging and spilling to {}", outputFile);
    while ((count = copier.next(targetRecordCount)) > 0) {
      outputContainer.buildSchema(BatchSchema.SelectionVectorMode.NONE);
      outputContainer.setRecordCount(count);
      // Note that addBatch also clears the outputContainer.
      newGroup.addBatch(outputContainer);
    }
    injector.injectChecked(
        context.getExecutionControls(), INTERRUPTION_WHILE_SPILLING, IOException.class);
    newGroup.closeOutputStream();
  } catch (Throwable e) {
    // We only need to clean up newGroup if the spill failed.
    try {
      AutoCloseables.close(e, newGroup);
    } catch (Throwable t) {
      /* close() may hit the same IO issue; just ignore */
    }
    throw UserException.resourceError(e)
        .message("External Sort encountered an error while spilling to disk")
        .addContext(e.getMessage() /* more detail */)
        .build(logger);
  } finally {
    hyperBatch.clear();
  }
  logger.debug("mergeAndSpill: final total size in memory = {}", oAllocator.getAllocatedMemory());
  logger.info("Completed spilling to {}", outputFile);
  return newGroup;
}
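// Both innerNext() and mergeAndSpill() size the merge output batch the same way: estimate the
// per-record byte width from the first batch group's schema (falling back to 50 bytes when a
// type's size is unknown), then divide the copier's memory budget by it. A standalone sketch
// of that arithmetic; the constant values below are assumed examples, not the originals
// (COPIER_BATCH_MEM_LIMIT and MAX_BATCH_SIZE in the source):
final class MergeBatchSizer {
  static final int COPIER_BATCH_MEM_LIMIT = 256 * 1024; // bytes; assumed example budget
  static final int UNKNOWN_TYPE_FALLBACK = 50;          // bytes per value, as in the source
  static final int MAX_BATCH_SIZE = 65536;              // records; assumed example cap

  /** Sums per-column widths; a non-positive width marks a type whose size is unknown. */
  static int estimateRecordSize(int[] columnWidths) {
    int estimated = 0;
    for (int width : columnWidths) {
      estimated += (width > 0) ? width : UNKNOWN_TYPE_FALLBACK;
    }
    return estimated;
  }

  /** Records per merged batch: fit the memory budget, but stay in [1, MAX_BATCH_SIZE]. */
  static int targetRecordCount(int estimatedRecordSize) {
    return Math.min(MAX_BATCH_SIZE, Math.max(1, COPIER_BATCH_MEM_LIMIT / estimatedRecordSize));
  }
}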
// This method is used by an inner class to clear the current record batch.
private void clearCurrentRecordBatch() {
  for (VectorWrapper<?> v : current) {
    v.clear();
  }
}
private boolean saveSamples()
    throws SchemaChangeException, ClassTransformationException, IOException {
  recordsSampled = 0;
  IterOutcome upstream;

  // Start collecting batches until recordsToSample records have been collected.
  SortRecordBatchBuilder builder =
      new SortRecordBatchBuilder(oContext.getAllocator(), MAX_SORT_BYTES);
  builder.add(incoming);

  recordsSampled += incoming.getRecordCount();

  outer:
  while (recordsSampled < recordsToSample) {
    upstream = next(incoming);
    switch (upstream) {
      case NONE:
      case NOT_YET:
      case STOP:
        upstreamNone = true;
        break outer;
      default:
        // fall through
    }
    builder.add(incoming);
    recordsSampled += incoming.getRecordCount();
    if (upstream == IterOutcome.NONE) {
      break;
    }
  }
  VectorContainer sortedSamples = new VectorContainer();
  builder.build(context, sortedSamples);

  // Sort the records according to the orderings given in the configuration.
  Sorter sorter = SortBatch.createNewSorter(context, popConfig.getOrderings(), sortedSamples);
  SelectionVector4 sv4 = builder.getSv4();
  sorter.setup(context, sv4, sortedSamples);
  sorter.sort(sv4, sortedSamples);

  // Project every Nth record into a new vector container, where
  // N = recordsSampled / (samplingFactor * partitions), using the expressions from the
  // Orderings to populate each column. There is one column for each Ordering in
  // popConfig.orderings.
  VectorContainer containerToCache = new VectorContainer();
  List<ValueVector> localAllocationVectors = Lists.newArrayList();
  SampleCopier copier = getCopier(
      sv4, sortedSamples, containerToCache, popConfig.getOrderings(), localAllocationVectors);
  int allocationSize = 50;
  while (true) {
    for (ValueVector vv : localAllocationVectors) {
      AllocationHelper.allocate(vv, samplingFactor * partitions, allocationSize);
    }
    if (copier.copyRecords(
        recordsSampled / (samplingFactor * partitions), 0, samplingFactor * partitions)) {
      break;
    } else {
      containerToCache.zeroVectors();
      allocationSize *= 2;
    }
  }
  for (VectorWrapper<?> vw : containerToCache) {
    vw.getValueVector().getMutator().setValueCount(copier.getOutputRecords());
  }
  containerToCache.setRecordCount(copier.getOutputRecords());

  // Get a distributed multimap handle from the distributed cache, put the vectors from the
  // new vector container into a serializable wrapper object, and then add it to the
  // distributed map.
  WritableBatch batch =
      WritableBatch.getBatchNoHVWrap(containerToCache.getRecordCount(), containerToCache, false);
  CachedVectorContainer sampleToSave = new CachedVectorContainer(batch, context.getAllocator());

  mmap.put(mapKey, sampleToSave);
  this.sampledIncomingBatches = builder.getHeldRecordBatches();
  builder.clear();
  batch.clear();
  containerToCache.clear();
  sampleToSave.clear();
  return true;
}
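// The sampling copy above (and the one in buildTable()) uses a simple retry loop: if the
// pre-allocated variable-width buffers turn out to be too small for the sampled values, zero
// the container, double the per-value allocation, and try again. A standalone sketch of that
// pattern, with a hypothetical tryCopy callback standing in for the
// AllocationHelper.allocate(...) + copier.copyRecords(...) pair:
import java.util.function.IntPredicate;

final class DoublingAllocationRetry {
  /**
   * Calls tryCopy with increasing per-value allocation sizes until it succeeds. tryCopy
   * receives the current allocation size and returns true on success; in the original code
   * a false return means the vectors ran out of space and the partial output is discarded.
   */
  static int copyWithRetry(IntPredicate tryCopy, int initialAllocationSize) {
    int allocationSize = initialAllocationSize; // the original starts at 50 bytes per value
    while (!tryCopy.test(allocationSize)) {
      allocationSize *= 2; // too small: discard partial output and retry with double the space
    }
    return allocationSize;
  }
}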