  /**
   * Reads the next previously spilled batch from disk and transfers its vectors
   * into {@code currentContainer}.
   */
  private VectorContainer getBatch() throws IOException {
    assert fs != null;
    assert path != null;
    if (inputStream == null) {
      inputStream = fs.open(path);
    }
    VectorAccessibleSerializable vas = new VectorAccessibleSerializable(allocator);
    Stopwatch watch = Stopwatch.createStarted();
    vas.readFromStream(inputStream);
    VectorContainer c = vas.get();
    if (schema != null) {
      c = SchemaUtil.coerceContainer(c, schema, context);
    }
    logger.debug("Took {} us to read {} records",
        watch.elapsed(TimeUnit.MICROSECONDS), c.getRecordCount());
    spilledBatches--;
    currentContainer.zeroVectors();
    // Transfer the vectors of the deserialized batch into currentContainer.
    Iterator<VectorWrapper<?>> wrapperIterator = c.iterator();
    for (VectorWrapper<?> w : currentContainer) {
      TransferPair pair =
          wrapperIterator.next().getValueVector().makeTransferPair(w.getValueVector());
      pair.transfer();
    }
    currentContainer.setRecordCount(c.getRecordCount());
    c.zeroVectors();
    return c;
  }
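  /**
   * Returns the index of the next record to copy. When the current container is
   * exhausted, refills it from disk via {@link #getBatch()}, and returns -1 once no
   * spilled batches remain. If a selection vector is present, the returned index
   * is resolved through it.
   */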
  public int getNextIndex() {
    if (pointer == getRecordCount()) {
      if (spilledBatches == 0) {
        return -1;
      }
      try {
        currentContainer.zeroVectors();
        getBatch();
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
      // Return the first record of the newly loaded batch.
      pointer = 1;
      return 0;
    }
    int val = pointer++;
    assert val < currentContainer.getRecordCount();
    if (sv2 != null) {
      val = sv2.getIndex(val);
    }
    return val;
  }
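  /**
   * Serializes the given container to the spill file and releases the container's
   * buffers afterwards. The output stream is opened lazily on the first call.
   */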
  public void addBatch(VectorContainer newContainer) throws IOException {
    assert fs != null;
    assert path != null;
    if (outputStream == null) {
      outputStream = fs.create(path);
    }
    int recordCount = newContainer.getRecordCount();
    WritableBatch batch = WritableBatch.getBatchNoHVWrap(recordCount, newContainer, false);
    VectorAccessibleSerializable outputBatch = new VectorAccessibleSerializable(batch, allocator);
    Stopwatch watch = Stopwatch.createStarted();
    outputBatch.writeToStream(outputStream);
    newContainer.zeroVectors();
    logger.debug("Took {} us to spill {} records",
        watch.elapsed(TimeUnit.MICROSECONDS), recordCount);
    spilledBatches++;
  }
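  /**
   * Releases all held buffers, closes any open spill streams, and deletes the
   * spill file from the file system.
   */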
  @Override
  public void close() throws IOException {
    currentContainer.zeroVectors();
    if (sv2 != null) {
      sv2.clear();
    }
    if (outputStream != null) {
      outputStream.close();
    }
    if (inputStream != null) {
      inputStream.close();
    }
    if (fs != null && fs.exists(path)) {
      fs.delete(path, false);
    }
  }
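  /**
   * Drives the operator's iteration. Batches buffered on the queue during sampling
   * are drained first; afterwards, incoming batches are processed directly. On the
   * first invocation the partition vectors are built before any batch is emitted.
   */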
  @Override
  public IterOutcome innerNext() {
    container.zeroVectors();

    // If we got IterOutcome.NONE while getting the partition vectors and there are
    // no batches left on the queue, we are done.
    if (upstreamNone && (batchQueue == null || batchQueue.isEmpty())) {
      return IterOutcome.NONE;
    }

    // If there are batches on the queue, process them first rather than calling incoming.next().
    if (batchQueue != null && !batchQueue.isEmpty()) {
      VectorContainer vc = batchQueue.poll();
      recordCount = vc.getRecordCount();
      try {
        // Must set up a new schema each time, because ValueVectors are not reused
        // between containers in the queue.
        setupNewSchema(vc);
      } catch (SchemaChangeException ex) {
        kill(false);
        logger.error("Failure during query", ex);
        context.fail(ex);
        return IterOutcome.STOP;
      }
      doWork(vc);
      vc.zeroVectors();
      return IterOutcome.OK_NEW_SCHEMA;
    }

    // Reaching this point, either this is the first iteration, or there are no
    // batches left on the queue and more are incoming.
    IterOutcome upstream = next(incoming);

    if (this.first && upstream == IterOutcome.OK) {
      throw new RuntimeException("Invalid state: First batch should have OK_NEW_SCHEMA");
    }

    // On the first iteration, generate the partition vectors before proceeding.
    if (this.first && upstream == IterOutcome.OK_NEW_SCHEMA) {
      if (!getPartitionVectors()) {
        cleanup();
        return IterOutcome.STOP;
      }
      batchQueue = new LinkedBlockingQueue<>(this.sampledIncomingBatches);
      first = false;

      // Now that we have the partition vectors, immediately process the first batch on the queue.
      VectorContainer vc = batchQueue.poll();
      try {
        setupNewSchema(vc);
      } catch (SchemaChangeException ex) {
        kill(false);
        logger.error("Failure during query", ex);
        context.fail(ex);
        return IterOutcome.STOP;
      }
      doWork(vc);
      recordCount = vc.getRecordCount();
      vc.zeroVectors();
      return IterOutcome.OK_NEW_SCHEMA;
    }

    // Now that all the batches on the queue have been processed, begin processing
    // the incoming batches. For the first one we need to generate a new schema,
    // even if the outcome is IterOutcome.OK. After that we can reuse the schema.
    if (!this.startedUnsampledBatches) {
      this.startedUnsampledBatches = true;
      if (upstream == IterOutcome.OK) {
        upstream = IterOutcome.OK_NEW_SCHEMA;
      }
    }

    switch (upstream) {
      case NONE:
      case NOT_YET:
      case STOP:
        cleanup();
        recordCount = 0;
        return upstream;
      case OK_NEW_SCHEMA:
        try {
          setupNewSchema(incoming);
        } catch (SchemaChangeException ex) {
          kill(false);
          logger.error("Failure during query", ex);
          context.fail(ex);
          return IterOutcome.STOP;
        }
        // fall through
      case OK:
        doWork(incoming);
        recordCount = incoming.getRecordCount();
        return upstream; // changed if upstream changed, otherwise normal
      default:
        throw new UnsupportedOperationException();
    }
  }
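  /**
   * Gathers the sample batches from the distributed map, sorts them, and copies every
   * Nth record (N = totalSampledRecords / partitions) into a candidate partition table
   * of partitions - 1 boundary records, which is then offered to the distributed cache.
   * Only the first candidate table to be pushed will be used.
   */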
  private void buildTable() throws SchemaChangeException, ClassTransformationException, IOException {
    // Gather all the samples from the distributed map into a single container.
    SortRecordBatchBuilder containerBuilder =
        new SortRecordBatchBuilder(context.getAllocator(), MAX_SORT_BYTES);
    for (CachedVectorContainer w : mmap.get(mapKey)) {
      containerBuilder.add(w.get());
    }
    VectorContainer allSamplesContainer = new VectorContainer();
    containerBuilder.build(context, allSamplesContainer);

    List<Ordering> orderDefs = Lists.newArrayList();
    int i = 0;
    for (Ordering od : popConfig.getOrderings()) {
      SchemaPath sp = SchemaPath.getSimplePath("f" + i++);
      orderDefs.add(new Ordering(od.getDirection(), new FieldReference(sp)));
    }

    // Sort the gathered sample data.
    SelectionVector4 newSv4 = containerBuilder.getSv4();
    Sorter sorter = SortBatch.createNewSorter(context, orderDefs, allSamplesContainer);
    sorter.setup(context, newSv4, allSamplesContainer);
    sorter.sort(newSv4, allSamplesContainer);

    // Copy every Nth record from the samples into a candidate partition table, where
    // N = totalSampledRecords / partitions. Attempt to push this to the distributed
    // map; only the first candidate to get pushed will be used.
    VectorContainer candidatePartitionTable = new VectorContainer();
    List<ValueVector> localAllocationVectors = Lists.newArrayList();
    SampleCopier copier = getCopier(
        newSv4, allSamplesContainer, candidatePartitionTable, orderDefs, localAllocationVectors);
    int allocationSize = 50;
    while (true) {
      for (ValueVector vv : localAllocationVectors) {
        AllocationHelper.allocate(vv, samplingFactor * partitions, allocationSize);
      }
      int skipRecords = containerBuilder.getSv4().getTotalCount() / partitions;
      if (copier.copyRecords(skipRecords, skipRecords, partitions - 1)) {
        assert copier.getOutputRecords() == partitions - 1
            : String.format(
                "output records: %d partitions: %d", copier.getOutputRecords(), partitions);
        for (VectorWrapper<?> vw : candidatePartitionTable) {
          vw.getValueVector().getMutator().setValueCount(copier.getOutputRecords());
        }
        break;
      } else {
        // Not enough space; double the allocation and retry.
        candidatePartitionTable.zeroVectors();
        allocationSize *= 2;
      }
    }
    candidatePartitionTable.setRecordCount(copier.getOutputRecords());

    WritableBatch batch = WritableBatch.getBatchNoHVWrap(
        candidatePartitionTable.getRecordCount(), candidatePartitionTable, false);
    CachedVectorContainer wrap =
        new CachedVectorContainer(batch, context.getDrillbitContext().getAllocator());
    tableMap.putIfAbsent(mapKey + "final", wrap, 1, TimeUnit.MINUTES);

    candidatePartitionTable.clear();
    allSamplesContainer.clear();
    containerBuilder.clear();
    wrap.clear();
  }
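  /**
   * Buffers incoming batches until roughly recordsToSample records have been collected,
   * sorts the sample locally, projects every Nth record according to the configured
   * orderings, and publishes the result to the distributed multimap under mapKey.
   */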
  private boolean saveSamples() throws SchemaChangeException, ClassTransformationException, IOException {
    recordsSampled = 0;
    IterOutcome upstream;

    // Collect batches until recordsToSample records have been gathered.
    SortRecordBatchBuilder builder =
        new SortRecordBatchBuilder(oContext.getAllocator(), MAX_SORT_BYTES);
    builder.add(incoming);
    recordsSampled += incoming.getRecordCount();
    outer:
    while (recordsSampled < recordsToSample) {
      upstream = next(incoming);
      switch (upstream) {
        case NONE:
        case NOT_YET:
        case STOP:
          upstreamNone = true;
          break outer;
        default:
          // fall through
      }
      builder.add(incoming);
      recordsSampled += incoming.getRecordCount();
      if (upstream == IterOutcome.NONE) {
        break;
      }
    }
    VectorContainer sortedSamples = new VectorContainer();
    builder.build(context, sortedSamples);

    // Sort the records according to the orderings given in the configuration.
    Sorter sorter = SortBatch.createNewSorter(context, popConfig.getOrderings(), sortedSamples);
    SelectionVector4 sv4 = builder.getSv4();
    sorter.setup(context, sv4, sortedSamples);
    sorter.sort(sv4, sortedSamples);

    // Project every Nth record into a new vector container, where
    // N = recordsSampled / (samplingFactor * partitions). Use the expressions from the
    // orderings to populate each column; there is one column for each ordering in
    // popConfig.orderings.
    VectorContainer containerToCache = new VectorContainer();
    List<ValueVector> localAllocationVectors = Lists.newArrayList();
    SampleCopier copier = getCopier(
        sv4, sortedSamples, containerToCache, popConfig.getOrderings(), localAllocationVectors);
    int allocationSize = 50;
    while (true) {
      for (ValueVector vv : localAllocationVectors) {
        AllocationHelper.allocate(vv, samplingFactor * partitions, allocationSize);
      }
      if (copier.copyRecords(
          recordsSampled / (samplingFactor * partitions), 0, samplingFactor * partitions)) {
        break;
      } else {
        // Not enough space; double the allocation and retry.
        containerToCache.zeroVectors();
        allocationSize *= 2;
      }
    }
    for (VectorWrapper<?> vw : containerToCache) {
      vw.getValueVector().getMutator().setValueCount(copier.getOutputRecords());
    }
    containerToCache.setRecordCount(copier.getOutputRecords());

    // Get a distributed multimap handle from the distributed cache, put the vectors
    // from the new vector container into a serializable wrapper object, and add it
    // to the distributed map.
    WritableBatch batch =
        WritableBatch.getBatchNoHVWrap(containerToCache.getRecordCount(), containerToCache, false);
    CachedVectorContainer sampleToSave = new CachedVectorContainer(batch, context.getAllocator());
    mmap.put(mapKey, sampleToSave);
    this.sampledIncomingBatches = builder.getHeldRecordBatches();
    builder.clear();
    batch.clear();
    containerToCache.clear();
    sampleToSave.clear();
    return true;
  }