Example #1
 private VectorContainer getBatch() throws IOException {
   assert fs != null;
   assert path != null;
   if (inputStream == null) {
     inputStream = fs.open(path);
   }
   VectorAccessibleSerializable vas = new VectorAccessibleSerializable(allocator);
   Stopwatch watch = Stopwatch.createStarted();
   vas.readFromStream(inputStream);
   VectorContainer c = vas.get();
   if (schema != null) {
     c = SchemaUtil.coerceContainer(c, schema, context);
   }
   // logger.debug("Took {} us to read {} records", watch.elapsed(TimeUnit.MICROSECONDS), c.getRecordCount());
   spilledBatches--;
   currentContainer.zeroVectors();
   Iterator<VectorWrapper<?>> wrapperIterator = c.iterator();
   for (VectorWrapper<?> w : currentContainer) {
     TransferPair pair =
         wrapperIterator.next().getValueVector().makeTransferPair(w.getValueVector());
     pair.transfer();
   }
   currentContainer.setRecordCount(c.getRecordCount());
   c.zeroVectors();
   return c;
 }
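
The loop above moves buffer ownership instead of copying: each deserialized vector is paired with its counterpart in currentContainer and its buffer is handed over through a TransferPair. A minimal sketch of the same pairing idiom in plain Java, using a hypothetical Vec type rather than Drill's API:

import java.util.Iterator;
import java.util.List;

class Vec {
  byte[] buf;

  // Hypothetical zero-copy handoff: the destination takes over this buffer.
  void transferTo(Vec dst) {
    dst.buf = this.buf;
    this.buf = null;
  }
}

class TransferDemo {
  // Walk two containers in lockstep, handing each source buffer to the
  // matching destination, mirroring the TransferPair loop in getBatch().
  static void transferAll(List<Vec> src, List<Vec> dst) {
    Iterator<Vec> it = src.iterator();
    for (Vec d : dst) {
      it.next().transferTo(d);
    }
  }
}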
Example #2
  public int getNextIndex() {
    int val;
    if (pointer == getRecordCount()) {
      if (spilledBatches == 0) {
        return -1;
      }
      try {
        currentContainer.zeroVectors();
        getBatch();
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
      pointer = 1;
      return 0;
    }
    val = pointer++;
    assert val < currentContainer.getRecordCount();
    if (sv2 != null) {
      // Remap the logical position through the selection vector to the
      // physical index in the underlying value vectors.
      val = sv2.getIndex(val);
    }

    return val;
  }
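
A caller drains records by looping until -1, which signals that the current batch and all spilled batches are exhausted. A sketch, where SpilledBatchReader is a hypothetical name for the (unnamed) enclosing class:

interface SpilledBatchReader {
  int getNextIndex(); // -1 once the current and all spilled batches are exhausted
}

class DrainDemo {
  // Hypothetical drain loop over the reader from Example #2.
  static void drain(SpilledBatchReader reader) {
    int index;
    while ((index = reader.getNextIndex()) != -1) {
      consume(index);
    }
  }

  // Placeholder for per-record work; index already reflects any sv2 remapping.
  static void consume(int index) {}
}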
Example #3
 public void addBatch(VectorContainer newContainer) throws IOException {
   assert fs != null;
   assert path != null;
   if (outputStream == null) {
     outputStream = fs.create(path);
   }
   int recordCount = newContainer.getRecordCount();
   WritableBatch batch = WritableBatch.getBatchNoHVWrap(recordCount, newContainer, false);
   VectorAccessibleSerializable outputBatch = new VectorAccessibleSerializable(batch, allocator);
   Stopwatch watch = Stopwatch.createStarted();
   outputBatch.writeToStream(outputStream);
   newContainer.zeroVectors();
   logger.debug(
       "Took {} us to spill {} records", watch.elapsed(TimeUnit.MICROSECONDS), recordCount);
   spilledBatches++;
 }
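
Together with getBatch() from Example #1, this forms the spill round trip: addBatch() serializes a container to the spill file and zeroes its in-memory buffers, while getBatch() later streams one batch back and refills currentContainer. A sketch, assuming both methods were accessible on the enclosing class, here given the hypothetical name SpilledBatchGroup:

class SpillRoundTripDemo {
  // Hypothetical round trip; SpilledBatchGroup stands in for the unnamed
  // enclosing class of Examples #1-#3.
  static void roundTrip(SpilledBatchGroup group, VectorContainer batch)
      throws IOException {
    group.addBatch(batch); // serialize to the spill stream, spilledBatches++
    group.getBatch();      // stream one batch back into currentContainer
  }
}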
Example #4
 @Override
 public void close() throws IOException {
   currentContainer.zeroVectors();
   if (sv2 != null) {
     sv2.clear();
   }
   if (outputStream != null) {
     outputStream.close();
   }
   if (inputStream != null) {
     inputStream.close();
   }
   if (fs != null && fs.exists(path)) {
     fs.delete(path, false);
   }
 }
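
Because close() releases the vectors, clears the selection vector, closes both streams, and deletes the spill file, a caller could tie the whole spill lifecycle to try-with-resources (a sketch, assuming the class implements java.io.Closeable; newGroup() is a placeholder factory):

// Hypothetical usage: the spill file is deleted and buffers are released
// when the block exits, even on error paths.
try (SpilledBatchGroup group = newGroup()) {
  group.addBatch(container);
  // ... later: read back via getNextIndex() until it returns -1 ...
}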
  @Override
  public IterOutcome innerNext() {
    container.zeroVectors();

    // If we got IterOutcome.NONE while getting the partition vectors and there
    // are no batches on the queue, then we are done.
    if (upstreamNone && (batchQueue == null || batchQueue.isEmpty())) return IterOutcome.NONE;

    // If there are batches on the queue, process them first, rather than
    // calling incoming.next().
    if (batchQueue != null && !batchQueue.isEmpty()) {
      VectorContainer vc = batchQueue.poll();
      recordCount = vc.getRecordCount();
      try {
        // Must set up a new schema each time, because ValueVectors are not
        // reused between containers in the queue.
        setupNewSchema(vc);
      } catch (SchemaChangeException ex) {
        kill(false);
        logger.error("Failure during query", ex);
        context.fail(ex);
        return IterOutcome.STOP;
      }
      doWork(vc);
      vc.zeroVectors();
      return IterOutcome.OK_NEW_SCHEMA;
    }

    // Reaching this point, either this is the first iteration, or there are no
    // batches left on the queue and more batches are incoming.
    IterOutcome upstream = next(incoming);

    if (this.first && upstream == IterOutcome.OK) {
      throw new RuntimeException("Invalid state: First batch should have OK_NEW_SCHEMA");
    }

    // If this is the first iteration, we need to generate the partition
    // vectors before we can proceed.
    if (this.first && upstream == IterOutcome.OK_NEW_SCHEMA) {
      if (!getPartitionVectors()) {
        cleanup();
        return IterOutcome.STOP;
      }

      batchQueue = new LinkedBlockingQueue<>(this.sampledIncomingBatches);
      first = false;

      // Now that we have the partition vectors, we immediately process the first batch on the queue
      VectorContainer vc = batchQueue.poll();
      try {
        setupNewSchema(vc);
      } catch (SchemaChangeException ex) {
        kill(false);
        logger.error("Failure during query", ex);
        context.fail(ex);
        return IterOutcome.STOP;
      }
      doWork(vc);
      recordCount = vc.getRecordCount();
      vc.zeroVectors();
      return IterOutcome.OK_NEW_SCHEMA;
    }

    // Now that all the batches on the queue are processed, we begin processing
    // the incoming batches. For the first one we need to generate a new schema,
    // even if the outcome is IterOutcome.OK. After that we can reuse the schema.
    if (!this.startedUnsampledBatches) {
      this.startedUnsampledBatches = true;
      if (upstream == IterOutcome.OK) upstream = IterOutcome.OK_NEW_SCHEMA;
    }
    switch (upstream) {
      case NONE:
      case NOT_YET:
      case STOP:
        cleanup();
        recordCount = 0;
        return upstream;
      case OK_NEW_SCHEMA:
        try {
          setupNewSchema(incoming);
        } catch (SchemaChangeException ex) {
          kill(false);
          logger.error("Failure during query", ex);
          context.fail(ex);
          return IterOutcome.STOP;
        }
        // fall through.
      case OK:
        doWork(incoming);
        recordCount = incoming.getRecordCount();
        return upstream; // OK_NEW_SCHEMA if the schema changed upstream, otherwise plain OK.
      default:
        throw new UnsupportedOperationException();
    }
  }
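
innerNext() implements Drill's iterator contract: the first batch must arrive as OK_NEW_SCHEMA (hence the RuntimeException above), plain OK batches reuse the established schema, and NONE/STOP end the stream. A downstream consumer follows the same contract, roughly like this (a sketch, not Drill's actual driver; consume() is a placeholder):

// Hypothetical driver loop illustrating the IterOutcome contract that
// innerNext() upholds.
void drive(RecordBatch batch) {
  while (true) {
    switch (batch.next()) {
      case OK_NEW_SCHEMA:
        // (Re)build downstream state against the new schema, then fall
        // through: a new-schema batch also carries data.
      case OK:
        consume(batch); // placeholder for per-batch processing
        break;
      case NONE:
      case NOT_YET:
      case STOP:
        return; // no more data, or the query failed upstream
      default:
        throw new UnsupportedOperationException();
    }
  }
}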
  private void buildTable()
      throws SchemaChangeException, ClassTransformationException, IOException {

    // Get all samples from distributed map

    SortRecordBatchBuilder containerBuilder =
        new SortRecordBatchBuilder(context.getAllocator(), MAX_SORT_BYTES);
    for (CachedVectorContainer w : mmap.get(mapKey)) {
      containerBuilder.add(w.get());
    }
    VectorContainer allSamplesContainer = new VectorContainer();
    containerBuilder.build(context, allSamplesContainer);

    List<Ordering> orderDefs = Lists.newArrayList();
    int i = 0;
    for (Ordering od : popConfig.getOrderings()) {
      SchemaPath sp = SchemaPath.getSimplePath("f" + i++);
      orderDefs.add(new Ordering(od.getDirection(), new FieldReference(sp)));
    }

    // Sort the incoming sample data.
    SelectionVector4 newSv4 = containerBuilder.getSv4();
    Sorter sorter = SortBatch.createNewSorter(context, orderDefs, allSamplesContainer);
    sorter.setup(context, newSv4, allSamplesContainer);
    sorter.sort(newSv4, allSamplesContainer);

    // Copy every Nth record from the samples into a candidate partition table,
    // where N = totalSampledRecords / partitions. Attempt to push this to the
    // distributed map; only the first candidate to get pushed will be used.
    VectorContainer candidatePartitionTable = new VectorContainer();
    List<ValueVector> localAllocationVectors = Lists.newArrayList();
    SampleCopier copier =
        getCopier(
            newSv4,
            allSamplesContainer,
            candidatePartitionTable,
            orderDefs,
            localAllocationVectors);
    int allocationSize = 50;
    while (true) {
      for (ValueVector vv : localAllocationVectors) {
        AllocationHelper.allocate(vv, samplingFactor * partitions, allocationSize);
      }
      int skipRecords = containerBuilder.getSv4().getTotalCount() / partitions;
      if (copier.copyRecords(skipRecords, skipRecords, partitions - 1)) {
        assert copier.getOutputRecords() == partitions - 1
            : String.format(
                "output records: %d, expected: %d", copier.getOutputRecords(), partitions - 1);
        for (VectorWrapper<?> vw : candidatePartitionTable) {
          vw.getValueVector().getMutator().setValueCount(copier.getOutputRecords());
        }
        break;
      } else {
        candidatePartitionTable.zeroVectors();
        allocationSize *= 2;
      }
    }
    candidatePartitionTable.setRecordCount(copier.getOutputRecords());
    WritableBatch batch =
        WritableBatch.getBatchNoHVWrap(
            candidatePartitionTable.getRecordCount(), candidatePartitionTable, false);
    CachedVectorContainer wrap =
        new CachedVectorContainer(batch, context.getDrillbitContext().getAllocator());
    tableMap.putIfAbsent(mapKey + "final", wrap, 1, TimeUnit.MINUTES);

    candidatePartitionTable.clear();
    allSamplesContainer.clear();
    containerBuilder.clear();
    wrap.clear();
  }
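
Both buildTable() and saveSamples() rely on the same grow-and-retry allocation idiom: size the destination vectors with a guessed width, attempt the copy, and double the allocation until it fits. The pattern in isolation, with the three abstract methods as hypothetical stand-ins for AllocationHelper.allocate(), SampleCopier.copyRecords(), and VectorContainer.zeroVectors():

abstract class GrowAndRetry {
  abstract void allocate(int allocationSize); // size the destination vectors
  abstract boolean tryCopy();                 // false if the buffers were too small
  abstract void release();                    // drop the undersized buffers

  void copyWithRetry() {
    int allocationSize = 50;
    while (true) {
      allocate(allocationSize);
      if (tryCopy()) {
        break; // the copy fit; value counts can now be set
      }
      release();
      allocationSize *= 2; // double and retry
    }
  }
}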
  private boolean saveSamples()
      throws SchemaChangeException, ClassTransformationException, IOException {
    recordsSampled = 0;
    IterOutcome upstream;

    // Start collecting batches until recordsToSample records have been collected

    SortRecordBatchBuilder builder =
        new SortRecordBatchBuilder(oContext.getAllocator(), MAX_SORT_BYTES);
    builder.add(incoming);

    recordsSampled += incoming.getRecordCount();

    outer:
    while (recordsSampled < recordsToSample) {
      upstream = next(incoming);
      switch (upstream) {
        case NONE:
        case NOT_YET:
        case STOP:
          upstreamNone = true;
          break outer;
        default:
          // fall through
      }
      builder.add(incoming);
      recordsSampled += incoming.getRecordCount();
      if (upstream == IterOutcome.NONE) break;
    }
    VectorContainer sortedSamples = new VectorContainer();
    builder.build(context, sortedSamples);

    // Sort the records according to the orderings given in the configuration.

    Sorter sorter = SortBatch.createNewSorter(context, popConfig.getOrderings(), sortedSamples);
    SelectionVector4 sv4 = builder.getSv4();
    sorter.setup(context, sv4, sortedSamples);
    sorter.sort(sv4, sortedSamples);

    // Project every Nth record to a new vector container, where
    // N = recordsSampled / (samplingFactor * partitions). Uses the expressions
    // from the Orderings to populate each column; there is one column for each
    // Ordering in popConfig.orderings.

    VectorContainer containerToCache = new VectorContainer();
    List<ValueVector> localAllocationVectors = Lists.newArrayList();
    SampleCopier copier =
        getCopier(
            sv4, sortedSamples, containerToCache, popConfig.getOrderings(), localAllocationVectors);
    int allocationSize = 50;
    while (true) {
      for (ValueVector vv : localAllocationVectors) {
        AllocationHelper.allocate(vv, samplingFactor * partitions, allocationSize);
      }
      if (copier.copyRecords(
          recordsSampled / (samplingFactor * partitions), 0, samplingFactor * partitions)) {
        break;
      } else {
        containerToCache.zeroVectors();
        allocationSize *= 2;
      }
    }
    for (VectorWrapper<?> vw : containerToCache) {
      vw.getValueVector().getMutator().setValueCount(copier.getOutputRecords());
    }
    containerToCache.setRecordCount(copier.getOutputRecords());

    // Get a distributed multimap handle from the distributed cache, put the
    // vectors from the new vector container into a serializable wrapper object,
    // and then add it to the distributed map.

    WritableBatch batch =
        WritableBatch.getBatchNoHVWrap(containerToCache.getRecordCount(), containerToCache, false);
    CachedVectorContainer sampleToSave = new CachedVectorContainer(batch, context.getAllocator());

    mmap.put(mapKey, sampleToSave);
    this.sampledIncomingBatches = builder.getHeldRecordBatches();
    builder.clear();
    batch.clear();
    containerToCache.clear();
    sampleToSave.clear();
    return true;
  }
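
The stride arithmetic in the copy above is easy to misread, so here it is with concrete (hypothetical) numbers: with 10,000 records sampled, a sampling factor of 10, and 4 partitions, the copier keeps every 250th record, 40 in total:

public class SamplingStrideDemo {
  public static void main(String[] args) {
    // Hypothetical numbers illustrating the arguments passed to
    // copier.copyRecords(...) in saveSamples().
    int recordsSampled = 10_000;
    int samplingFactor = 10;
    int partitions = 4;
    int stride = recordsSampled / (samplingFactor * partitions); // 10000 / 40 = 250
    int kept = samplingFactor * partitions;                      // 40 samples retained
    System.out.println("copy every " + stride + "th record, " + kept + " in total");
  }
}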