Exemplo n.º 1
0
public class ExternalSortBatch extends AbstractRecordBatch<ExternalSort> {
  private static final org.slf4j.Logger logger =
      org.slf4j.LoggerFactory.getLogger(ExternalSortBatch.class);
  private static final ControlsInjector injector =
      ControlsInjectorFactory.getInjector(ExternalSortBatch.class);

  private static final GeneratorMapping COPIER_MAPPING =
      new GeneratorMapping("doSetup", "doCopy", null, null);
  private static final MappingSet MAIN_MAPPING =
      new MappingSet(
          (String) null,
          null,
          ClassGenerator.DEFAULT_SCALAR_MAP,
          ClassGenerator.DEFAULT_SCALAR_MAP);
  private static final MappingSet LEFT_MAPPING =
      new MappingSet(
          "leftIndex", null, ClassGenerator.DEFAULT_SCALAR_MAP, ClassGenerator.DEFAULT_SCALAR_MAP);
  private static final MappingSet RIGHT_MAPPING =
      new MappingSet(
          "rightIndex", null, ClassGenerator.DEFAULT_SCALAR_MAP, ClassGenerator.DEFAULT_SCALAR_MAP);
  private static final MappingSet COPIER_MAPPING_SET =
      new MappingSet(COPIER_MAPPING, COPIER_MAPPING);

  private final int SPILL_BATCH_GROUP_SIZE;
  private final int SPILL_THRESHOLD;
  private final Iterator<String> dirs;
  private final RecordBatch incoming;
  private final BufferAllocator oAllocator;
  private final BufferAllocator copierAllocator;

  private BatchSchema schema;
  private SingleBatchSorter sorter;
  private SortRecordBatchBuilder builder;
  private MSorter mSorter;
  /**
   * A single PriorityQueueCopier instance is used for 2 purposes: 1. Merge sorted batches before
   * spilling 2. Merge sorted batches when all incoming data fits in memory
   */
  private PriorityQueueCopier copier;

  private LinkedList<BatchGroup> batchGroups = Lists.newLinkedList();
  private LinkedList<BatchGroup> spilledBatchGroups = Lists.newLinkedList();
  private SelectionVector4 sv4;
  private FileSystem fs;
  private int spillCount = 0;
  private int batchesSinceLastSpill = 0;
  private boolean first = true;
  private int targetRecordCount;
  private final String fileName;
  private Set<Path> currSpillDirs = Sets.newTreeSet();
  private int firstSpillBatchCount = 0;
  private int peakNumBatches = -1;

  /**
   * The copier uses the COPIER_BATCH_MEM_LIMIT to estimate the target number of records to return
   * in each batch.
   */
  private static final int COPIER_BATCH_MEM_LIMIT = 256 * 1024;

  public static final String INTERRUPTION_AFTER_SORT = "after-sort";
  public static final String INTERRUPTION_AFTER_SETUP = "after-setup";
  public static final String INTERRUPTION_WHILE_SPILLING = "spilling";

  public enum Metric implements MetricDef {
    SPILL_COUNT, // number of times operator spilled to disk
    PEAK_SIZE_IN_MEMORY, // peak value for totalSizeInMemory
    PEAK_BATCHES_IN_MEMORY; // maximum number of batches kept in memory

    @Override
    public int metricId() {
      return ordinal();
    }
  }

  public ExternalSortBatch(ExternalSort popConfig, FragmentContext context, RecordBatch incoming)
      throws OutOfMemoryException {
    super(popConfig, context, true);
    this.incoming = incoming;
    DrillConfig config = context.getConfig();
    Configuration conf = new Configuration();
    conf.set("fs.default.name", config.getString(ExecConstants.EXTERNAL_SORT_SPILL_FILESYSTEM));
    try {
      this.fs = FileSystem.get(conf);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    SPILL_BATCH_GROUP_SIZE = config.getInt(ExecConstants.EXTERNAL_SORT_SPILL_GROUP_SIZE);
    SPILL_THRESHOLD = config.getInt(ExecConstants.EXTERNAL_SORT_SPILL_THRESHOLD);
    dirs = Iterators.cycle(config.getStringList(ExecConstants.EXTERNAL_SORT_SPILL_DIRS));
    oAllocator = oContext.getAllocator();
    copierAllocator =
        oAllocator.newChildAllocator(
            oAllocator.getName() + ":copier",
            PriorityQueueCopier.INITIAL_ALLOCATION,
            PriorityQueueCopier.MAX_ALLOCATION);
    FragmentHandle handle = context.getHandle();
    fileName =
        String.format(
            "%s_majorfragment%s_minorfragment%s_operator%s",
            QueryIdHelper.getQueryId(handle.getQueryId()),
            handle.getMajorFragmentId(),
            handle.getMinorFragmentId(),
            popConfig.getOperatorId());
  }

  @Override
  public int getRecordCount() {
    if (sv4 != null) {
      return sv4.getCount();
    }
    return container.getRecordCount();
  }

  @Override
  public SelectionVector4 getSelectionVector4() {
    return sv4;
  }

  private void closeBatchGroups(Collection<BatchGroup> groups) {
    for (BatchGroup group : groups) {
      try {
        group.close();
      } catch (Exception e) {
        // collect all failure and make sure to cleanup all remaining batches
        // Originally we would have thrown a RuntimeException that would propagate to
        // FragmentExecutor.closeOutResources()
        // where it would have been passed to context.fail()
        // passing the exception directly to context.fail(e) will let the cleanup process continue
        // instead of stopping
        // right away, this will also make sure we collect any additional exception we may get while
        // cleaning up
        context.fail(e);
      }
    }
  }

  @Override
  public void close() {
    try {
      if (batchGroups != null) {
        closeBatchGroups(batchGroups);
        batchGroups = null;
      }
      if (spilledBatchGroups != null) {
        closeBatchGroups(spilledBatchGroups);
        spilledBatchGroups = null;
      }
    } finally {
      if (builder != null) {
        builder.clear();
        builder.close();
      }
      if (sv4 != null) {
        sv4.clear();
      }

      try {
        if (copier != null) {
          copier.close();
        }
      } catch (IOException e) {
        throw new RuntimeException(e);
      } finally {
        copierAllocator.close();
        super.close();

        if (mSorter != null) {
          mSorter.clear();
        }
        for (Iterator iter = this.currSpillDirs.iterator(); iter.hasNext(); iter.remove()) {
          Path path = (Path) iter.next();
          try {
            if (fs != null && path != null && fs.exists(path)) {
              if (fs.delete(path, true)) {
                fs.cancelDeleteOnExit(path);
              }
            }
          } catch (IOException e) {
            // since this is meant to be used in a batches's cleanup, we don't propagate the
            // exception
            logger.warn("Unable to delete spill directory " + path, e);
          }
        }
      }
    }
  }

  @Override
  public void buildSchema() throws SchemaChangeException {
    IterOutcome outcome = next(incoming);
    switch (outcome) {
      case OK:
      case OK_NEW_SCHEMA:
        for (VectorWrapper<?> w : incoming) {
          ValueVector v = container.addOrGet(w.getField());
          if (v instanceof AbstractContainerVector) {
            w.getValueVector().makeTransferPair(v); // Can we remove this hack?
            v.clear();
          }
          v.allocateNew(); // Can we remove this? - SVR fails with NPE (TODO)
        }
        container.buildSchema(SelectionVectorMode.NONE);
        container.setRecordCount(0);
        break;
      case STOP:
        state = BatchState.STOP;
        break;
      case OUT_OF_MEMORY:
        state = BatchState.OUT_OF_MEMORY;
        break;
      case NONE:
        state = BatchState.DONE;
        break;
      default:
        break;
    }
  }

  @Override
  public IterOutcome innerNext() {
    if (schema != null) {
      if (spillCount == 0) {
        return (getSelectionVector4().next()) ? IterOutcome.OK : IterOutcome.NONE;
      } else {
        Stopwatch w = Stopwatch.createStarted();
        int count = copier.next(targetRecordCount);
        if (count > 0) {
          long t = w.elapsed(TimeUnit.MICROSECONDS);
          logger.debug("Took {} us to merge {} records", t, count);
          container.setRecordCount(count);
          return IterOutcome.OK;
        } else {
          logger.debug("copier returned 0 records");
          return IterOutcome.NONE;
        }
      }
    }

    int totalCount = 0;
    int totalBatches = 0; // total number of batches received so far

    try {
      container.clear();
      outer:
      while (true) {
        IterOutcome upstream;
        if (first) {
          upstream = IterOutcome.OK_NEW_SCHEMA;
        } else {
          upstream = next(incoming);
        }
        if (upstream == IterOutcome.OK && sorter == null) {
          upstream = IterOutcome.OK_NEW_SCHEMA;
        }
        switch (upstream) {
          case NONE:
            if (first) {
              return upstream;
            }
            break outer;
          case NOT_YET:
            throw new UnsupportedOperationException();
          case STOP:
            return upstream;
          case OK_NEW_SCHEMA:
          case OK:
            VectorContainer convertedBatch;
            // only change in the case that the schema truly changes.  Artificial schema changes are
            // ignored.
            if (upstream == IterOutcome.OK_NEW_SCHEMA && !incoming.getSchema().equals(schema)) {
              if (schema != null) {
                if (unionTypeEnabled) {
                  this.schema = SchemaUtil.mergeSchemas(schema, incoming.getSchema());
                } else {
                  throw new SchemaChangeException(
                      "Schema changes not supported in External Sort. Please enable Union type");
                }
              } else {
                schema = incoming.getSchema();
              }
              convertedBatch = SchemaUtil.coerceContainer(incoming, schema, oContext);
              for (BatchGroup b : batchGroups) {
                b.setSchema(schema);
              }
              for (BatchGroup b : spilledBatchGroups) {
                b.setSchema(schema);
              }
              this.sorter = createNewSorter(context, convertedBatch);
            } else {
              convertedBatch = SchemaUtil.coerceContainer(incoming, schema, oContext);
            }
            if (first) {
              first = false;
            }
            if (convertedBatch.getRecordCount() == 0) {
              for (VectorWrapper<?> w : convertedBatch) {
                w.clear();
              }
              break;
            }
            SelectionVector2 sv2;
            if (incoming.getSchema().getSelectionVectorMode()
                == BatchSchema.SelectionVectorMode.TWO_BYTE) {
              sv2 = incoming.getSelectionVector2().clone();
            } else {
              try {
                sv2 = newSV2();
              } catch (InterruptedException e) {
                return IterOutcome.STOP;
              } catch (OutOfMemoryException e) {
                throw new OutOfMemoryException(e);
              }
            }

            int count = sv2.getCount();
            totalCount += count;
            totalBatches++;
            sorter.setup(context, sv2, convertedBatch);
            sorter.sort(sv2);
            RecordBatchData rbd = new RecordBatchData(convertedBatch, oAllocator);
            boolean success = false;
            try {
              rbd.setSv2(sv2);
              batchGroups.add(new BatchGroup(rbd.getContainer(), rbd.getSv2(), oContext));
              if (peakNumBatches < batchGroups.size()) {
                peakNumBatches = batchGroups.size();
                stats.setLongStat(Metric.PEAK_BATCHES_IN_MEMORY, peakNumBatches);
              }

              batchesSinceLastSpill++;
              if ( // If we haven't spilled so far, do we have enough memory for MSorter if this
              // turns out to be the last incoming batch?
              (spillCount == 0 && !hasMemoryForInMemorySort(totalCount))
                  ||
                  // If we haven't spilled so far, make sure we don't exceed the maximum number of
                  // batches SV4 can address
                  (spillCount == 0 && totalBatches > Character.MAX_VALUE)
                  ||
                  // TODO(DRILL-4438) - consider setting this threshold more intelligently,
                  // lowering caused a failing low memory condition (test in
                  // BasicPhysicalOpUnitTest)
                  // to complete successfully (although it caused perf decrease as there was more
                  // spilling)

                  // current memory used is more than 95% of memory usage limit of this operator
                  (oAllocator.getAllocatedMemory() > .95 * oAllocator.getLimit())
                  ||
                  // Number of incoming batches (BatchGroups) exceed the limit and number of
                  // incoming batches accumulated
                  // since the last spill exceed the defined limit
                  (batchGroups.size() > SPILL_THRESHOLD
                      && batchesSinceLastSpill >= SPILL_BATCH_GROUP_SIZE)) {

                if (firstSpillBatchCount == 0) {
                  firstSpillBatchCount = batchGroups.size();
                }

                if (spilledBatchGroups.size() > firstSpillBatchCount / 2) {
                  logger.info("Merging spills");
                  final BatchGroup merged = mergeAndSpill(spilledBatchGroups);
                  if (merged != null) {
                    spilledBatchGroups.addFirst(merged);
                  }
                }
                final BatchGroup merged = mergeAndSpill(batchGroups);
                if (merged != null) { // make sure we don't add null to spilledBatchGroups
                  spilledBatchGroups.add(merged);
                  batchesSinceLastSpill = 0;
                }
              }
              success = true;
            } finally {
              if (!success) {
                rbd.clear();
              }
            }
            break;
          case OUT_OF_MEMORY:
            logger.debug("received OUT_OF_MEMORY, trying to spill");
            if (batchesSinceLastSpill > 2) {
              final BatchGroup merged = mergeAndSpill(batchGroups);
              if (merged != null) {
                spilledBatchGroups.add(merged);
                batchesSinceLastSpill = 0;
              }
            } else {
              logger.debug("not enough batches to spill, sending OUT_OF_MEMORY downstream");
              return IterOutcome.OUT_OF_MEMORY;
            }
            break;
          default:
            throw new UnsupportedOperationException();
        }
      }

      if (totalCount == 0) {
        return IterOutcome.NONE;
      }
      if (spillCount == 0) {

        if (builder != null) {
          builder.clear();
          builder.close();
        }
        builder = new SortRecordBatchBuilder(oAllocator);

        for (BatchGroup group : batchGroups) {
          RecordBatchData rbd = new RecordBatchData(group.getContainer(), oAllocator);
          rbd.setSv2(group.getSv2());
          builder.add(rbd);
        }

        builder.build(context, container);
        sv4 = builder.getSv4();
        mSorter = createNewMSorter();
        mSorter.setup(context, oAllocator, getSelectionVector4(), this.container);

        // For testing memory-leak purpose, inject exception after mSorter finishes setup
        injector.injectUnchecked(context.getExecutionControls(), INTERRUPTION_AFTER_SETUP);
        mSorter.sort(this.container);

        // sort may have prematurely exited due to should continue returning false.
        if (!context.shouldContinue()) {
          return IterOutcome.STOP;
        }

        // For testing memory-leak purpose, inject exception after mSorter finishes sorting
        injector.injectUnchecked(context.getExecutionControls(), INTERRUPTION_AFTER_SORT);
        sv4 = mSorter.getSV4();

        container.buildSchema(SelectionVectorMode.FOUR_BYTE);
      } else { // some batches were spilled
        final BatchGroup merged = mergeAndSpill(batchGroups);
        if (merged != null) {
          spilledBatchGroups.add(merged);
        }
        batchGroups.addAll(spilledBatchGroups);
        spilledBatchGroups =
            null; // no need to cleanup spilledBatchGroups, all it's batches are in batchGroups now

        logger.warn(
            "Starting to merge. {} batch groups. Current allocated memory: {}",
            batchGroups.size(),
            oAllocator.getAllocatedMemory());
        VectorContainer hyperBatch = constructHyperBatch(batchGroups);
        createCopier(hyperBatch, batchGroups, container, false);

        int estimatedRecordSize = 0;
        for (VectorWrapper<?> w : batchGroups.get(0)) {
          try {
            estimatedRecordSize += TypeHelper.getSize(w.getField().getType());
          } catch (UnsupportedOperationException e) {
            estimatedRecordSize += 50;
          }
        }
        targetRecordCount =
            Math.min(MAX_BATCH_SIZE, Math.max(1, COPIER_BATCH_MEM_LIMIT / estimatedRecordSize));
        int count = copier.next(targetRecordCount);
        container.buildSchema(SelectionVectorMode.NONE);
        container.setRecordCount(count);
      }

      return IterOutcome.OK_NEW_SCHEMA;

    } catch (SchemaChangeException ex) {
      kill(false);
      context.fail(
          UserException.unsupportedError(ex)
              .message("Sort doesn't currently support sorts with changing schemas")
              .build(logger));
      return IterOutcome.STOP;
    } catch (ClassTransformationException | IOException ex) {
      kill(false);
      context.fail(ex);
      return IterOutcome.STOP;
    } catch (UnsupportedOperationException e) {
      throw new RuntimeException(e);
    }
  }

  private boolean hasMemoryForInMemorySort(int currentRecordCount) {
    long currentlyAvailable = popConfig.getMaxAllocation() - oAllocator.getAllocatedMemory();

    long neededForInMemorySort =
        SortRecordBatchBuilder.memoryNeeded(currentRecordCount)
            + MSortTemplate.memoryNeeded(currentRecordCount);

    return currentlyAvailable > neededForInMemorySort;
  }

  public BatchGroup mergeAndSpill(LinkedList<BatchGroup> batchGroups) throws SchemaChangeException {
    logger.debug("Copier allocator current allocation {}", copierAllocator.getAllocatedMemory());
    logger.debug(
        "mergeAndSpill: starting total size in memory = {}", oAllocator.getAllocatedMemory());
    VectorContainer outputContainer = new VectorContainer();
    List<BatchGroup> batchGroupList = Lists.newArrayList();
    int batchCount = batchGroups.size();
    for (int i = 0; i < batchCount / 2; i++) {
      if (batchGroups.size() == 0) {
        break;
      }
      BatchGroup batch = batchGroups.pollLast();
      assert batch != null : "Encountered a null batch during merge and spill operation";
      batchGroupList.add(batch);
    }

    if (batchGroupList.size() == 0) {
      return null;
    }
    int estimatedRecordSize = 0;
    for (VectorWrapper<?> w : batchGroupList.get(0)) {
      try {
        estimatedRecordSize += TypeHelper.getSize(w.getField().getType());
      } catch (UnsupportedOperationException e) {
        estimatedRecordSize += 50;
      }
    }
    int targetRecordCount = Math.max(1, COPIER_BATCH_MEM_LIMIT / estimatedRecordSize);
    VectorContainer hyperBatch = constructHyperBatch(batchGroupList);
    createCopier(hyperBatch, batchGroupList, outputContainer, true);

    int count = copier.next(targetRecordCount);
    assert count > 0;

    logger.debug(
        "mergeAndSpill: estimated record size = {}, target record count = {}",
        estimatedRecordSize,
        targetRecordCount);

    // 1 output container is kept in memory, so we want to hold on to it and transferClone
    // allows keeping ownership
    VectorContainer c1 = VectorContainer.getTransferClone(outputContainer, oContext);
    c1.buildSchema(BatchSchema.SelectionVectorMode.NONE);
    c1.setRecordCount(count);

    String spillDir = dirs.next();
    Path currSpillPath = new Path(Joiner.on("/").join(spillDir, fileName));
    currSpillDirs.add(currSpillPath);
    String outputFile = Joiner.on("/").join(currSpillPath, spillCount++);
    try {
      fs.deleteOnExit(currSpillPath);
    } catch (IOException e) {
      // since this is meant to be used in a batches's spilling, we don't propagate the exception
      logger.warn("Unable to mark spill directory " + currSpillPath + " for deleting on exit", e);
    }
    stats.setLongStat(Metric.SPILL_COUNT, spillCount);
    BatchGroup newGroup = new BatchGroup(c1, fs, outputFile, oContext);
    try (AutoCloseable a = AutoCloseables.all(batchGroupList)) {
      logger.info("Merging and spilling to {}", outputFile);
      while ((count = copier.next(targetRecordCount)) > 0) {
        outputContainer.buildSchema(BatchSchema.SelectionVectorMode.NONE);
        outputContainer.setRecordCount(count);
        // note that addBatch also clears the outputContainer
        newGroup.addBatch(outputContainer);
      }
      injector.injectChecked(
          context.getExecutionControls(), INTERRUPTION_WHILE_SPILLING, IOException.class);
      newGroup.closeOutputStream();
    } catch (Throwable e) {
      // we only need to cleanup newGroup if spill failed
      try {
        AutoCloseables.close(e, newGroup);
      } catch (Throwable t) {
        /* close() may hit the same IO issue; just ignore */
      }
      throw UserException.resourceError(e)
          .message("External Sort encountered an error while spilling to disk")
          .addContext(e.getMessage() /* more detail */)
          .build(logger);
    } finally {
      hyperBatch.clear();
    }
    logger.debug("mergeAndSpill: final total size in memory = {}", oAllocator.getAllocatedMemory());
    logger.info("Completed spilling to {}", outputFile);
    return newGroup;
  }

  private SelectionVector2 newSV2() throws OutOfMemoryException, InterruptedException {
    SelectionVector2 sv2 = new SelectionVector2(oAllocator);
    if (!sv2.allocateNewSafe(incoming.getRecordCount())) {
      try {
        final BatchGroup merged = mergeAndSpill(batchGroups);
        if (merged != null) {
          spilledBatchGroups.add(merged);
        } else {
          throw UserException.memoryError(
                  "Unable to allocate sv2 for %d records, and not enough batchGroups to spill.",
                  incoming.getRecordCount())
              .addContext("batchGroups.size", batchGroups.size())
              .addContext("spilledBatchGroups.size", spilledBatchGroups.size())
              .addContext("allocated memory", oAllocator.getAllocatedMemory())
              .addContext("allocator limit", oAllocator.getLimit())
              .build(logger);
        }
      } catch (SchemaChangeException e) {
        throw new RuntimeException(e);
      }
      int waitTime = 1;
      while (true) {
        try {
          Thread.sleep(waitTime * 1000);
        } catch (final InterruptedException e) {
          if (!context.shouldContinue()) {
            throw e;
          }
        }
        waitTime *= 2;
        if (sv2.allocateNewSafe(incoming.getRecordCount())) {
          break;
        }
        if (waitTime >= 32) {
          throw new OutOfMemoryException("Unable to allocate sv2 buffer after repeated attempts");
        }
      }
    }
    for (int i = 0; i < incoming.getRecordCount(); i++) {
      sv2.setIndex(i, (char) i);
    }
    sv2.setRecordCount(incoming.getRecordCount());
    return sv2;
  }

  private VectorContainer constructHyperBatch(List<BatchGroup> batchGroupList) {
    VectorContainer cont = new VectorContainer();
    for (MaterializedField field : schema) {
      ValueVector[] vectors = new ValueVector[batchGroupList.size()];
      int i = 0;
      for (BatchGroup group : batchGroupList) {
        vectors[i++] =
            group
                .getValueAccessorById(
                    field.getValueClass(),
                    group.getValueVectorId(SchemaPath.getSimplePath(field.getPath())).getFieldIds())
                .getValueVector();
      }
      cont.add(vectors);
    }
    cont.buildSchema(BatchSchema.SelectionVectorMode.FOUR_BYTE);
    return cont;
  }

  private MSorter createNewMSorter()
      throws ClassTransformationException, IOException, SchemaChangeException {
    return createNewMSorter(
        this.context,
        this.popConfig.getOrderings(),
        this,
        MAIN_MAPPING,
        LEFT_MAPPING,
        RIGHT_MAPPING);
  }

  private MSorter createNewMSorter(
      FragmentContext context,
      List<Ordering> orderings,
      VectorAccessible batch,
      MappingSet mainMapping,
      MappingSet leftMapping,
      MappingSet rightMapping)
      throws ClassTransformationException, IOException, SchemaChangeException {
    CodeGenerator<MSorter> cg =
        CodeGenerator.get(
            MSorter.TEMPLATE_DEFINITION, context.getFunctionRegistry(), context.getOptions());
    ClassGenerator<MSorter> g = cg.getRoot();
    g.setMappingSet(mainMapping);

    for (Ordering od : orderings) {
      // first, we rewrite the evaluation stack for each side of the comparison.
      ErrorCollector collector = new ErrorCollectorImpl();
      final LogicalExpression expr =
          ExpressionTreeMaterializer.materialize(
              od.getExpr(), batch, collector, context.getFunctionRegistry());
      if (collector.hasErrors()) {
        throw new SchemaChangeException(
            "Failure while materializing expression. " + collector.toErrorString());
      }
      g.setMappingSet(leftMapping);
      HoldingContainer left = g.addExpr(expr, ClassGenerator.BlkCreateMode.FALSE);
      g.setMappingSet(rightMapping);
      HoldingContainer right = g.addExpr(expr, ClassGenerator.BlkCreateMode.FALSE);
      g.setMappingSet(mainMapping);

      // next we wrap the two comparison sides and add the expression block for the comparison.
      LogicalExpression fh =
          FunctionGenerationHelper.getOrderingComparator(
              od.nullsSortHigh(), left, right, context.getFunctionRegistry());
      HoldingContainer out = g.addExpr(fh, ClassGenerator.BlkCreateMode.FALSE);
      JConditional jc = g.getEvalBlock()._if(out.getValue().ne(JExpr.lit(0)));

      if (od.getDirection() == Direction.ASCENDING) {
        jc._then()._return(out.getValue());
      } else {
        jc._then()._return(out.getValue().minus());
      }
      g.rotateBlock();
    }

    g.rotateBlock();
    g.getEvalBlock()._return(JExpr.lit(0));

    return context.getImplementationClass(cg);
  }

  public SingleBatchSorter createNewSorter(FragmentContext context, VectorAccessible batch)
      throws ClassTransformationException, IOException, SchemaChangeException {
    CodeGenerator<SingleBatchSorter> cg =
        CodeGenerator.get(
            SingleBatchSorter.TEMPLATE_DEFINITION,
            context.getFunctionRegistry(),
            context.getOptions());
    ClassGenerator<SingleBatchSorter> g = cg.getRoot();

    generateComparisons(g, batch);

    return context.getImplementationClass(cg);
  }

  private void generateComparisons(ClassGenerator<?> g, VectorAccessible batch)
      throws SchemaChangeException {
    g.setMappingSet(MAIN_MAPPING);

    for (Ordering od : popConfig.getOrderings()) {
      // first, we rewrite the evaluation stack for each side of the comparison.
      ErrorCollector collector = new ErrorCollectorImpl();
      final LogicalExpression expr =
          ExpressionTreeMaterializer.materialize(
              od.getExpr(), batch, collector, context.getFunctionRegistry());
      if (collector.hasErrors()) {
        throw new SchemaChangeException(
            "Failure while materializing expression. " + collector.toErrorString());
      }
      g.setMappingSet(LEFT_MAPPING);
      HoldingContainer left = g.addExpr(expr, ClassGenerator.BlkCreateMode.FALSE);
      g.setMappingSet(RIGHT_MAPPING);
      HoldingContainer right = g.addExpr(expr, ClassGenerator.BlkCreateMode.FALSE);
      g.setMappingSet(MAIN_MAPPING);

      // next we wrap the two comparison sides and add the expression block for the comparison.
      LogicalExpression fh =
          FunctionGenerationHelper.getOrderingComparator(
              od.nullsSortHigh(), left, right, context.getFunctionRegistry());
      HoldingContainer out = g.addExpr(fh, ClassGenerator.BlkCreateMode.FALSE);
      JConditional jc = g.getEvalBlock()._if(out.getValue().ne(JExpr.lit(0)));

      if (od.getDirection() == Direction.ASCENDING) {
        jc._then()._return(out.getValue());
      } else {
        jc._then()._return(out.getValue().minus());
      }
      g.rotateBlock();
    }

    g.rotateBlock();
    g.getEvalBlock()._return(JExpr.lit(0));
  }

  private void createCopier(
      VectorAccessible batch,
      List<BatchGroup> batchGroupList,
      VectorContainer outputContainer,
      boolean spilling)
      throws SchemaChangeException {
    try {
      if (copier == null) {
        CodeGenerator<PriorityQueueCopier> cg =
            CodeGenerator.get(
                PriorityQueueCopier.TEMPLATE_DEFINITION,
                context.getFunctionRegistry(),
                context.getOptions());
        ClassGenerator<PriorityQueueCopier> g = cg.getRoot();

        generateComparisons(g, batch);

        g.setMappingSet(COPIER_MAPPING_SET);
        CopyUtil.generateCopies(g, batch, true);
        g.setMappingSet(MAIN_MAPPING);
        copier = context.getImplementationClass(cg);
      } else {
        copier.close();
      }

      BufferAllocator allocator = spilling ? copierAllocator : oAllocator;
      for (VectorWrapper<?> i : batch) {
        ValueVector v = TypeHelper.getNewVector(i.getField(), allocator);
        outputContainer.add(v);
      }
      copier.setup(context, allocator, batch, batchGroupList, outputContainer);
    } catch (ClassTransformationException | IOException e) {
      throw new RuntimeException(e);
    }
  }

  @Override
  public WritableBatch getWritableBatch() {
    throw new UnsupportedOperationException("A sort batch is not writable.");
  }

  @Override
  protected void killIncoming(boolean sendUpstream) {
    incoming.kill(sendUpstream);
  }
}
Exemplo n.º 2
0
/**
 * Foreman manages all the fragments (local and remote) for a single query where this is the
 * driving/root node.
 *
 * <p>The flow is as follows: - Foreman is submitted as a runnable. - Runnable does query planning.
 * - state changes from PENDING to RUNNING - Runnable sends out starting fragments - Status listener
 * are activated - The Runnable's run() completes, but the Foreman stays around - Foreman listens
 * for state change messages. - state change messages can drive the state to FAILED or CANCELED, in
 * which case messages are sent to running fragments to terminate - when all fragments complete,
 * state change messages drive the state to COMPLETED
 */
public class Foreman implements Runnable {
  private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(Foreman.class);
  private static final org.slf4j.Logger queryLogger =
      org.slf4j.LoggerFactory.getLogger("query.logger");
  private static final ControlsInjector injector =
      ControlsInjectorFactory.getInjector(Foreman.class);

  private static final ObjectMapper MAPPER = new ObjectMapper();
  private static final long RPC_WAIT_IN_MSECS_PER_FRAGMENT = 5000;

  private final QueryId queryId;
  private final RunQuery queryRequest;
  private final QueryContext queryContext;
  private final QueryManager queryManager; // handles lower-level details of query execution
  private final WorkerBee bee; // provides an interface to submit tasks
  private final DrillbitContext drillbitContext;
  private final UserClientConnection initiatingClient; // used to send responses
  private volatile QueryState state;
  private boolean resume = false;

  private volatile DistributedLease lease; // used to limit the number of concurrent queries

  private FragmentExecutor rootRunner; // root Fragment

  private final ExtendedLatch acceptExternalEvents =
      new ExtendedLatch(); // gates acceptance of external events
  private final StateListener stateListener = new StateListener(); // source of external events
  private final ResponseSendListener responseListener = new ResponseSendListener();
  private final StateSwitch stateSwitch = new StateSwitch();
  private final ForemanResult foremanResult = new ForemanResult();
  private final ConnectionClosedListener closeListener = new ConnectionClosedListener();
  private final ChannelFuture closeFuture;

  private String queryText;

  /**
   * Constructor. Sets up the Foreman, but does not initiate any execution.
   *
   * @param bee used to submit additional work
   * @param drillbitContext
   * @param connection
   * @param queryId the id for the query
   * @param queryRequest the query to execute
   */
  public Foreman(
      final WorkerBee bee,
      final DrillbitContext drillbitContext,
      final UserClientConnection connection,
      final QueryId queryId,
      final RunQuery queryRequest) {
    this.bee = bee;
    this.queryId = queryId;
    this.queryRequest = queryRequest;
    this.drillbitContext = drillbitContext;

    initiatingClient = connection;
    this.closeFuture = initiatingClient.getChannel().closeFuture();
    closeFuture.addListener(closeListener);

    queryContext = new QueryContext(connection.getSession(), drillbitContext);
    queryManager =
        new QueryManager(
            queryId,
            queryRequest,
            drillbitContext.getPersistentStoreProvider(),
            stateListener,
            this); // TODO reference escapes before ctor is complete via stateListener, this

    recordNewState(QueryState.PENDING);
  }

  private class ConnectionClosedListener implements GenericFutureListener<Future<Void>> {
    @Override
    public void operationComplete(Future<Void> future) throws Exception {
      cancel();
    }
  }

  /**
   * Get the QueryContext created for the query.
   *
   * @return the QueryContext
   */
  public QueryContext getQueryContext() {
    return queryContext;
  }

  /**
   * Get the QueryManager created for the query.
   *
   * @return the QueryManager
   */
  public QueryManager getQueryManager() {
    return queryManager;
  }

  /**
   * Cancel the query. Asynchronous -- it may take some time for all remote fragments to be
   * terminated.
   */
  public void cancel() {
    // Note this can be called from outside of run() on another thread, or after run() completes
    stateListener.moveToState(QueryState.CANCELLATION_REQUESTED, null);
  }

  /**
   * Resume the query. Regardless of the current state, this method sends a resume signal to all
   * fragments. This method can be called multiple times.
   */
  public void resume() {
    resume = true;
    // resume all pauses through query context
    queryContext.getExecutionControls().unpauseAll();
    // resume all pauses through all fragment contexts
    queryManager.unpauseExecutingFragments(drillbitContext);
  }

  /**
   * Called by execution pool to do query setup, and kick off remote execution.
   *
   * <p>Note that completion of this function is not the end of the Foreman's role in the query's
   * lifecycle.
   */
  @Override
  public void run() {
    // rename the thread we're using for debugging purposes
    final Thread currentThread = Thread.currentThread();
    final String originalName = currentThread.getName();
    currentThread.setName(QueryIdHelper.getQueryId(queryId) + ":foreman");

    // track how long the query takes
    queryManager.markStartTime();

    try {
      injector.injectChecked(
          queryContext.getExecutionControls(), "run-try-beginning", ForemanException.class);
      queryText = queryRequest.getPlan();

      // convert a run query request into action
      switch (queryRequest.getType()) {
        case LOGICAL:
          parseAndRunLogicalPlan(queryRequest.getPlan());
          break;
        case PHYSICAL:
          parseAndRunPhysicalPlan(queryRequest.getPlan());
          break;
        case SQL:
          runSQL(queryRequest.getPlan());
          break;
        default:
          throw new IllegalStateException();
      }
      injector.injectChecked(
          queryContext.getExecutionControls(), "run-try-end", ForemanException.class);
    } catch (final OutOfMemoryException | OutOfMemoryRuntimeException e) {
      moveToState(QueryState.FAILED, UserException.memoryError(e).build(logger));
    } catch (final ForemanException e) {
      moveToState(QueryState.FAILED, e);
    } catch (AssertionError | Exception ex) {
      moveToState(
          QueryState.FAILED,
          new ForemanException(
              "Unexpected exception during fragment initialization: " + ex.getMessage(), ex));
    } catch (final OutOfMemoryError e) {
      if ("Direct buffer memory".equals(e.getMessage())) {
        moveToState(
            QueryState.FAILED,
            UserException.resourceError(e)
                .message("One or more nodes ran out of memory while executing the query.")
                .build(logger));
      } else {
        /*
         * FragmentExecutors use a DrillbitStatusListener to watch out for the death of their query's Foreman. So, if we
         * die here, they should get notified about that, and cancel themselves; we don't have to attempt to notify
         * them, which might not work under these conditions.
         */
        System.out.println("Node ran out of Heap memory, exiting.");
        e.printStackTrace();
        System.out.flush();
        System.exit(-1);
      }

    } finally {
      /*
       * Begin accepting external events.
       *
       * Doing this here in the finally clause will guarantee that it occurs. Otherwise, if there
       * is an exception anywhere during setup, it wouldn't occur, and any events that are generated
       * as a result of any partial setup that was done (such as the FragmentSubmitListener,
       * the ResponseSendListener, or an external call to cancel()), will hang the thread that makes the
       * event delivery call.
       *
       * If we do throw an exception during setup, and have already moved to QueryState.FAILED, we just need to
       * make sure that we can't make things any worse as those events are delivered, but allow
       * any necessary remaining cleanup to proceed.
       *
       * Note that cancellations cannot be simulated before this point, i.e. pauses can be injected, because Foreman
       * would wait on the cancelling thread to signal a resume and the cancelling thread would wait on the Foreman
       * to accept events.
       */
      acceptExternalEvents.countDown();

      // If we received the resume signal before fragments are setup, the first call does not
      // actually resume the
      // fragments. Since setup is done, all fragments must have been delivered to remote nodes. Now
      // we can resume.
      if (resume) {
        resume();
      }
      injector.injectPause(queryContext.getExecutionControls(), "foreman-ready", logger);

      // restore the thread's original name
      currentThread.setName(originalName);
    }

    /*
     * Note that despite the run() completing, the Foreman continues to exist, and receives
     * events (indirectly, through the QueryManager's use of stateListener), about fragment
     * completions. It won't go away until everything is completed, failed, or cancelled.
     */
  }

  private void releaseLease() {
    while (lease != null) {
      try {
        lease.close();
        lease = null;
      } catch (final InterruptedException e) {
        // if we end up here, the while loop will try again
      } catch (final Exception e) {
        logger.warn("Failure while releasing lease.", e);
        break;
      }
    }
  }

  private void parseAndRunLogicalPlan(final String json) throws ExecutionSetupException {
    LogicalPlan logicalPlan;
    try {
      logicalPlan = drillbitContext.getPlanReader().readLogicalPlan(json);
    } catch (final IOException e) {
      throw new ForemanException("Failure parsing logical plan.", e);
    }

    if (logicalPlan.getProperties().resultMode == ResultMode.LOGICAL) {
      throw new ForemanException(
          "Failure running plan.  You requested a result mode of LOGICAL and submitted a logical plan.  In this case you're output mode must be PHYSICAL or EXEC.");
    }

    log(logicalPlan);

    final PhysicalPlan physicalPlan = convert(logicalPlan);

    if (logicalPlan.getProperties().resultMode == ResultMode.PHYSICAL) {
      returnPhysical(physicalPlan);
      return;
    }

    log(physicalPlan);
    runPhysicalPlan(physicalPlan);
  }

  private void log(final LogicalPlan plan) {
    if (logger.isDebugEnabled()) {
      logger.debug("Logical {}", plan.unparse(queryContext.getConfig()));
    }
  }

  private void log(final PhysicalPlan plan) {
    if (logger.isDebugEnabled()) {
      try {
        final String planText = queryContext.getConfig().getMapper().writeValueAsString(plan);
        logger.debug("Physical {}", planText);
      } catch (final IOException e) {
        logger.warn("Error while attempting to log physical plan.", e);
      }
    }
  }

  private void returnPhysical(final PhysicalPlan plan) throws ExecutionSetupException {
    final String jsonPlan = plan.unparse(queryContext.getConfig().getMapper().writer());
    runPhysicalPlan(
        DirectPlan.createDirectPlan(queryContext, new PhysicalFromLogicalExplain(jsonPlan)));
  }

  public static class PhysicalFromLogicalExplain {
    public final String json;

    public PhysicalFromLogicalExplain(final String json) {
      this.json = json;
    }
  }

  private void parseAndRunPhysicalPlan(final String json) throws ExecutionSetupException {
    try {
      final PhysicalPlan plan = drillbitContext.getPlanReader().readPhysicalPlan(json);
      runPhysicalPlan(plan);
    } catch (final IOException e) {
      throw new ForemanSetupException("Failure while parsing physical plan.", e);
    }
  }

  private void runPhysicalPlan(final PhysicalPlan plan) throws ExecutionSetupException {
    validatePlan(plan);
    setupSortMemoryAllocations(plan);
    acquireQuerySemaphore(plan);

    final QueryWorkUnit work = getQueryWorkUnit(plan);
    final List<PlanFragment> planFragments = work.getFragments();
    final PlanFragment rootPlanFragment = work.getRootFragment();
    assert queryId == rootPlanFragment.getHandle().getQueryId();

    drillbitContext
        .getWorkBus()
        .addFragmentStatusListener(queryId, queryManager.getFragmentStatusListener());
    drillbitContext
        .getClusterCoordinator()
        .addDrillbitStatusListener(queryManager.getDrillbitStatusListener());

    logger.debug("Submitting fragments to run.");

    // set up the root fragment first so we'll have incoming buffers available.
    setupRootFragment(rootPlanFragment, work.getRootOperator());

    setupNonRootFragments(planFragments);
    drillbitContext.getAllocator().resetFragmentLimits(); // TODO a global effect for this query?!?

    moveToState(QueryState.RUNNING, null);
    logger.debug("Fragments running.");
  }

  private static void validatePlan(final PhysicalPlan plan) throws ForemanSetupException {
    if (plan.getProperties().resultMode != ResultMode.EXEC) {
      throw new ForemanSetupException(
          String.format(
              "Failure running plan.  You requested a result mode of %s and a physical plan can only be output as EXEC",
              plan.getProperties().resultMode));
    }
  }

  private void setupSortMemoryAllocations(final PhysicalPlan plan) {
    // look for external sorts
    final List<ExternalSort> sortList = new LinkedList<>();
    for (final PhysicalOperator op : plan.getSortedOperators()) {
      if (op instanceof ExternalSort) {
        sortList.add((ExternalSort) op);
      }
    }

    // if there are any sorts, compute the maximum allocation, and set it on them
    if (sortList.size() > 0) {
      final OptionManager optionManager = queryContext.getOptions();
      final long maxWidthPerNode =
          optionManager.getOption(ExecConstants.MAX_WIDTH_PER_NODE_KEY).num_val;
      long maxAllocPerNode =
          Math.min(
              DrillConfig.getMaxDirectMemory(),
              queryContext.getConfig().getLong(ExecConstants.TOP_LEVEL_MAX_ALLOC));
      maxAllocPerNode =
          Math.min(
              maxAllocPerNode,
              optionManager.getOption(ExecConstants.MAX_QUERY_MEMORY_PER_NODE_KEY).num_val);
      final long maxSortAlloc = maxAllocPerNode / (sortList.size() * maxWidthPerNode);
      logger.debug("Max sort alloc: {}", maxSortAlloc);

      for (final ExternalSort externalSort : sortList) {
        externalSort.setMaxAllocation(maxSortAlloc);
      }
    }
  }

  /**
   * This limits the number of "small" and "large" queries that a Drill cluster will run
   * simultaneously, if queueing is enabled. If the query is unable to run, this will block until it
   * can. Beware that this is called under run(), and so will consume a Thread while it waits for
   * the required distributed semaphore.
   *
   * @param plan the query plan
   * @throws ForemanSetupException
   */
  private void acquireQuerySemaphore(final PhysicalPlan plan) throws ForemanSetupException {
    final OptionManager optionManager = queryContext.getOptions();
    final boolean queuingEnabled = optionManager.getOption(ExecConstants.ENABLE_QUEUE);
    if (queuingEnabled) {
      final long queueThreshold = optionManager.getOption(ExecConstants.QUEUE_THRESHOLD_SIZE);
      double totalCost = 0;
      for (final PhysicalOperator ops : plan.getSortedOperators()) {
        totalCost += ops.getCost();
      }

      final long queueTimeout = optionManager.getOption(ExecConstants.QUEUE_TIMEOUT);
      final String queueName;

      try {
        @SuppressWarnings("resource")
        final ClusterCoordinator clusterCoordinator = drillbitContext.getClusterCoordinator();
        final DistributedSemaphore distributedSemaphore;

        // get the appropriate semaphore
        if (totalCost > queueThreshold) {
          final int largeQueue = (int) optionManager.getOption(ExecConstants.LARGE_QUEUE_SIZE);
          distributedSemaphore = clusterCoordinator.getSemaphore("query.large", largeQueue);
          queueName = "large";
        } else {
          final int smallQueue = (int) optionManager.getOption(ExecConstants.SMALL_QUEUE_SIZE);
          distributedSemaphore = clusterCoordinator.getSemaphore("query.small", smallQueue);
          queueName = "small";
        }

        lease = distributedSemaphore.acquire(queueTimeout, TimeUnit.MILLISECONDS);
      } catch (final Exception e) {
        throw new ForemanSetupException("Unable to acquire slot for query.", e);
      }

      if (lease == null) {
        throw UserException.resourceError()
            .message(
                "Unable to acquire queue resources for query within timeout.  Timeout for %s queue was set at %d seconds.",
                queueName, queueTimeout / 1000)
            .build(logger);
      }
    }
  }

  Exception getCurrentException() {
    return foremanResult.getException();
  }

  private QueryWorkUnit getQueryWorkUnit(final PhysicalPlan plan) throws ExecutionSetupException {
    final PhysicalOperator rootOperator = plan.getSortedOperators(false).iterator().next();
    final Fragment rootFragment = rootOperator.accept(MakeFragmentsVisitor.INSTANCE, null);
    final SimpleParallelizer parallelizer = new SimpleParallelizer(queryContext);
    final QueryWorkUnit queryWorkUnit =
        parallelizer.getFragments(
            queryContext.getOptions().getOptionList(),
            queryContext.getCurrentEndpoint(),
            queryId,
            queryContext.getActiveEndpoints(),
            drillbitContext.getPlanReader(),
            rootFragment,
            initiatingClient.getSession(),
            queryContext.getQueryContextInfo());

    if (logger.isTraceEnabled()) {
      final StringBuilder sb = new StringBuilder();
      sb.append("PlanFragments for query ");
      sb.append(queryId);
      sb.append('\n');

      final List<PlanFragment> planFragments = queryWorkUnit.getFragments();
      final int fragmentCount = planFragments.size();
      int fragmentIndex = 0;
      for (final PlanFragment planFragment : planFragments) {
        final FragmentHandle fragmentHandle = planFragment.getHandle();
        sb.append("PlanFragment(");
        sb.append(++fragmentIndex);
        sb.append('/');
        sb.append(fragmentCount);
        sb.append(") major_fragment_id ");
        sb.append(fragmentHandle.getMajorFragmentId());
        sb.append(" minor_fragment_id ");
        sb.append(fragmentHandle.getMinorFragmentId());
        sb.append('\n');

        final DrillbitEndpoint endpointAssignment = planFragment.getAssignment();
        sb.append("  DrillbitEndpoint address ");
        sb.append(endpointAssignment.getAddress());
        sb.append('\n');

        String jsonString = "<<malformed JSON>>";
        sb.append("  fragment_json: ");
        final ObjectMapper objectMapper = new ObjectMapper();
        try {
          final Object json = objectMapper.readValue(planFragment.getFragmentJson(), Object.class);
          jsonString = objectMapper.defaultPrettyPrintingWriter().writeValueAsString(json);
        } catch (final Exception e) {
          // we've already set jsonString to a fallback value
        }
        sb.append(jsonString);

        logger.trace(sb.toString());
      }
    }

    return queryWorkUnit;
  }

  /**
   * Manages the end-state processing for Foreman.
   *
   * <p>End-state processing is tricky, because even if a query appears to succeed, but we then
   * encounter a problem during cleanup, we still want to mark the query as failed. So we have to
   * construct the successful result we would send, and then clean up before we send that result,
   * possibly changing that result if we encounter a problem during cleanup. We only send the result
   * when there is nothing left to do, so it will account for any possible problems.
   *
   * <p>The idea here is to make close()ing the ForemanResult do the final cleanup and sending.
   * Closing the result must be the last thing that is done by Foreman.
   */
  private class ForemanResult implements AutoCloseable {
    private QueryState resultState = null;
    private volatile Exception resultException = null;
    private boolean isClosed = false;

    /**
     * Set up the result for a COMPLETED or CANCELED state.
     *
     * <p>Note that before sending this result, we execute cleanup steps that could result in this
     * result still being changed to a FAILED state.
     *
     * @param queryState one of COMPLETED or CANCELED
     */
    public void setCompleted(final QueryState queryState) {
      Preconditions.checkArgument(
          (queryState == QueryState.COMPLETED) || (queryState == QueryState.CANCELED));
      Preconditions.checkState(!isClosed);
      Preconditions.checkState(resultState == null);

      resultState = queryState;
    }

    /**
     * Set up the result for a FAILED state.
     *
     * <p>Failures that occur during cleanup processing will be added as suppressed exceptions.
     *
     * @param exception the exception that led to the FAILED state
     */
    public void setFailed(final Exception exception) {
      Preconditions.checkArgument(exception != null);
      Preconditions.checkState(!isClosed);
      Preconditions.checkState(resultState == null);

      resultState = QueryState.FAILED;
      resultException = exception;
    }

    /**
     * Ignore the current status and force the given failure as current status. NOTE: Used only for
     * testing purposes. Shouldn't be used in production.
     */
    public void setForceFailure(final Exception exception) {
      Preconditions.checkArgument(exception != null);
      Preconditions.checkState(!isClosed);

      resultState = QueryState.FAILED;
      resultException = exception;
    }

    /**
     * Add an exception to the result. All exceptions after the first become suppressed exceptions
     * hanging off the first.
     *
     * @param exception the exception to add
     */
    private void addException(final Exception exception) {
      Preconditions.checkNotNull(exception);

      if (resultException == null) {
        resultException = exception;
      } else {
        resultException.addSuppressed(exception);
      }
    }

    /**
     * Expose the current exception (if it exists). This is useful for secondary reporting to the
     * query profile.
     *
     * @return the current Foreman result exception or null.
     */
    public Exception getException() {
      return resultException;
    }

    /**
     * Close the given resource, catching and adding any caught exceptions via {@link
     * #addException(Exception)}. If an exception is caught, it will change the result state to
     * FAILED, regardless of what its current value.
     *
     * @param autoCloseable the resource to close
     */
    private void suppressingClose(final AutoCloseable autoCloseable) {
      Preconditions.checkState(!isClosed);
      Preconditions.checkState(resultState != null);

      if (autoCloseable == null) {
        return;
      }

      try {
        autoCloseable.close();
      } catch (final Exception e) {
        /*
         * Even if the query completed successfully, we'll still report failure if we have
         * problems cleaning up.
         */
        resultState = QueryState.FAILED;
        addException(e);
      }
    }

    private void logQuerySummary() {
      try {
        LoggedQuery q =
            new LoggedQuery(
                QueryIdHelper.getQueryId(queryId),
                queryContext.getQueryContextInfo().getDefaultSchemaName(),
                queryText,
                new Date(queryContext.getQueryContextInfo().getQueryStartTime()),
                new Date(System.currentTimeMillis()),
                state,
                queryContext.getSession().getCredentials().getUserName());
        queryLogger.info(MAPPER.writeValueAsString(q));
      } catch (Exception e) {
        logger.error("Failure while recording query information to query log.", e);
      }
    }

    @Override
    public void close() {
      Preconditions.checkState(!isClosed);
      Preconditions.checkState(resultState != null);

      logger.info("foreman cleaning up.");
      injector.injectPause(queryContext.getExecutionControls(), "foreman-cleanup", logger);

      // remove the channel disconnected listener (doesn't throw)
      closeFuture.removeListener(closeListener);

      // log the query summary
      logQuerySummary();

      // These are straight forward removals from maps, so they won't throw.
      drillbitContext.getWorkBus().removeFragmentStatusListener(queryId);
      drillbitContext
          .getClusterCoordinator()
          .removeDrillbitStatusListener(queryManager.getDrillbitStatusListener());

      suppressingClose(queryContext);

      /*
       * We do our best to write the latest state, but even that could fail. If it does, we can't write
       * the (possibly newly failing) state, so we continue on anyway.
       *
       * We only need to do this if the resultState differs from the last recorded state
       */
      if (resultState != state) {
        suppressingClose(
            new AutoCloseable() {
              @Override
              public void close() throws Exception {
                recordNewState(resultState);
              }
            });
      }

      /*
       * Construct the response based on the latest resultState. The builder shouldn't fail.
       */
      final QueryResult.Builder resultBuilder =
          QueryResult.newBuilder().setQueryId(queryId).setQueryState(resultState);
      final UserException uex;
      if (resultException != null) {
        final boolean verbose =
            queryContext.getOptions().getOption(ExecConstants.ENABLE_VERBOSE_ERRORS_KEY).bool_val;
        uex =
            UserException.systemError(resultException)
                .addIdentity(queryContext.getCurrentEndpoint())
                .build(logger);
        resultBuilder.addError(uex.getOrCreatePBError(verbose));
      } else {
        uex = null;
      }

      // we store the final result here so we can capture any error/errorId in the profile for later
      // debugging.
      queryManager.writeFinalProfile(uex);

      /*
       * If sending the result fails, we don't really have any way to modify the result we tried to send;
       * it is possible it got sent but the result came from a later part of the code path. It is also
       * possible the connection has gone away, so this is irrelevant because there's nowhere to
       * send anything to.
       */
      try {
        // send whatever result we ended up with
        initiatingClient.sendResult(responseListener, resultBuilder.build(), true);
      } catch (final Exception e) {
        addException(e);
        logger.warn("Exception sending result to client", resultException);
      }

      // Remove the Foreman from the running query list.
      bee.retireForeman(Foreman.this);

      try {
        releaseLease();
      } finally {
        isClosed = true;
      }
    }
  }

  private static class StateEvent {
    final QueryState newState;
    final Exception exception;

    StateEvent(final QueryState newState, final Exception exception) {
      this.newState = newState;
      this.exception = exception;
    }
  }

  private class StateSwitch extends EventProcessor<StateEvent> {
    public void moveToState(final QueryState newState, final Exception exception) {
      sendEvent(new StateEvent(newState, exception));
    }

    @Override
    protected void processEvent(final StateEvent event) {
      final QueryState newState = event.newState;
      final Exception exception = event.exception;

      // TODO Auto-generated method stub
      logger.info("State change requested.  {} --> {}", state, newState, exception);
      switch (state) {
        case PENDING:
          if (newState == QueryState.RUNNING) {
            recordNewState(QueryState.RUNNING);
            return;
          }

          // $FALL-THROUGH$

        case RUNNING:
          {
            /*
             * For cases that cancel executing fragments, we have to record the new
             * state first, because the cancellation of the local root fragment will
             * cause this to be called recursively.
             */
            switch (newState) {
              case CANCELLATION_REQUESTED:
                {
                  assert exception == null;
                  queryManager.markEndTime();
                  recordNewState(QueryState.CANCELLATION_REQUESTED);
                  queryManager.cancelExecutingFragments(drillbitContext);
                  foremanResult.setCompleted(QueryState.CANCELED);
                  /*
                   * We don't close the foremanResult until we've gotten
                   * acknowledgements, which happens below in the case for current state
                   * == CANCELLATION_REQUESTED.
                   */
                  return;
                }

              case COMPLETED:
                {
                  assert exception == null;
                  queryManager.markEndTime();
                  recordNewState(QueryState.COMPLETED);
                  foremanResult.setCompleted(QueryState.COMPLETED);
                  foremanResult.close();
                  return;
                }

              case FAILED:
                {
                  assert exception != null;
                  queryManager.markEndTime();
                  recordNewState(QueryState.FAILED);
                  queryManager.cancelExecutingFragments(drillbitContext);
                  foremanResult.setFailed(exception);
                  foremanResult.close();
                  return;
                }

              default:
                throw new IllegalStateException("illegal transition from RUNNING to " + newState);
            }
          }

        case CANCELLATION_REQUESTED:
          if ((newState == QueryState.CANCELED)
              || (newState == QueryState.COMPLETED)
              || (newState == QueryState.FAILED)) {

            if (drillbitContext
                .getConfig()
                .getBoolean(ExecConstants.RETURN_ERROR_FOR_FAILURE_IN_CANCELLED_FRAGMENTS)) {
              if (newState == QueryState.FAILED) {
                assert exception != null;
                recordNewState(QueryState.FAILED);
                foremanResult.setForceFailure(exception);
              }
            }
            /*
             * These amount to a completion of the cancellation requests' cleanup;
             * now we can clean up and send the result.
             */
            foremanResult.close();
          }
          return;

        case CANCELED:
        case COMPLETED:
        case FAILED:
          logger.warn(
              "Dropping request to move to {} state as query is already at {} state (which is terminal).",
              newState,
              state);
          return;
      }

      throw new IllegalStateException(
          String.format(
              "Failure trying to change states: %s --> %s", state.name(), newState.name()));
    }
  }

  /**
   * Tells the foreman to move to a new state.
   *
   * @param newState the state to move to
   * @param exception if not null, the exception that drove this state transition (usually a
   *     failure)
   */
  private void moveToState(final QueryState newState, final Exception exception) {
    stateSwitch.moveToState(newState, exception);
  }

  private void recordNewState(final QueryState newState) {
    state = newState;
    queryManager.updateEphemeralState(newState);
  }

  private void runSQL(final String sql) throws ExecutionSetupException {
    final DrillSqlWorker sqlWorker = new DrillSqlWorker(queryContext);
    final Pointer<String> textPlan = new Pointer<>();
    final PhysicalPlan plan = sqlWorker.getPlan(sql, textPlan);
    queryManager.setPlanText(textPlan.value);
    runPhysicalPlan(plan);
  }

  private PhysicalPlan convert(final LogicalPlan plan) throws OptimizerException {
    if (logger.isDebugEnabled()) {
      logger.debug("Converting logical plan {}.", plan.toJsonStringSafe(queryContext.getConfig()));
    }
    return new BasicOptimizer(queryContext, initiatingClient)
        .optimize(new BasicOptimizer.BasicOptimizationContext(queryContext), plan);
  }

  public QueryId getQueryId() {
    return queryId;
  }

  /**
   * Set up the root fragment (which will run locally), and submit it for execution.
   *
   * @param rootFragment
   * @param rootOperator
   * @throws ExecutionSetupException
   */
  private void setupRootFragment(final PlanFragment rootFragment, final FragmentRoot rootOperator)
      throws ExecutionSetupException {
    @SuppressWarnings("resource")
    final FragmentContext rootContext =
        new FragmentContext(
            drillbitContext,
            rootFragment,
            queryContext,
            initiatingClient,
            drillbitContext.getFunctionImplementationRegistry());
    @SuppressWarnings("resource")
    final IncomingBuffers buffers = new IncomingBuffers(rootFragment, rootContext);
    rootContext.setBuffers(buffers);

    queryManager.addFragmentStatusTracker(rootFragment, true);

    rootRunner =
        new FragmentExecutor(
            rootContext,
            rootFragment,
            queryManager.newRootStatusHandler(rootContext, drillbitContext),
            rootOperator);
    final RootFragmentManager fragmentManager =
        new RootFragmentManager(rootFragment.getHandle(), buffers, rootRunner);

    if (buffers.isDone()) {
      // if we don't have to wait for any incoming data, start the fragment runner.
      bee.addFragmentRunner(fragmentManager.getRunnable());
    } else {
      // if we do, record the fragment manager in the workBus.
      // TODO aren't we managing our own work? What does this do? It looks like this will never get
      // run
      drillbitContext.getWorkBus().addFragmentManager(fragmentManager);
    }
  }

  /**
   * Set up the non-root fragments for execution. Some may be local, and some may be remote.
   * Messages are sent immediately, so they may start returning data even before we complete this.
   *
   * @param fragments the fragments
   * @throws ForemanException
   */
  private void setupNonRootFragments(final Collection<PlanFragment> fragments)
      throws ForemanException {
    /*
     * We will send a single message to each endpoint, regardless of how many fragments will be
     * executed there. We need to start up the intermediate fragments first so that they will be
     * ready once the leaf fragments start producing data. To satisfy both of these, we will
     * make a pass through the fragments and put them into these two maps according to their
     * leaf/intermediate state, as well as their target drillbit.
     */
    final Multimap<DrillbitEndpoint, PlanFragment> leafFragmentMap = ArrayListMultimap.create();
    final Multimap<DrillbitEndpoint, PlanFragment> intFragmentMap = ArrayListMultimap.create();

    // record all fragments for status purposes.
    for (final PlanFragment planFragment : fragments) {
      logger.trace(
          "Tracking intermediate remote node {} with data {}",
          planFragment.getAssignment(),
          planFragment.getFragmentJson());
      queryManager.addFragmentStatusTracker(planFragment, false);
      if (planFragment.getLeafFragment()) {
        leafFragmentMap.put(planFragment.getAssignment(), planFragment);
      } else {
        intFragmentMap.put(planFragment.getAssignment(), planFragment);
      }
    }

    /*
     * We need to wait for the intermediates to be sent so that they'll be set up by the time
     * the leaves start producing data. We'll use this latch to wait for the responses.
     *
     * However, in order not to hang the process if any of the RPC requests fails, we always
     * count down (see FragmentSubmitFailures), but we count the number of failures so that we'll
     * know if any submissions did fail.
     */
    final int numIntFragments = intFragmentMap.keySet().size();
    final ExtendedLatch endpointLatch = new ExtendedLatch(numIntFragments);
    final FragmentSubmitFailures fragmentSubmitFailures = new FragmentSubmitFailures();

    // send remote intermediate fragments
    for (final DrillbitEndpoint ep : intFragmentMap.keySet()) {
      sendRemoteFragments(ep, intFragmentMap.get(ep), endpointLatch, fragmentSubmitFailures);
    }

    final long timeout = RPC_WAIT_IN_MSECS_PER_FRAGMENT * numIntFragments;
    if (numIntFragments > 0 && !endpointLatch.awaitUninterruptibly(timeout)) {
      long numberRemaining = endpointLatch.getCount();
      throw UserException.connectionError()
          .message(
              "Exceeded timeout (%d) while waiting send intermediate work fragments to remote nodes. "
                  + "Sent %d and only heard response back from %d nodes.",
              timeout, numIntFragments, numIntFragments - numberRemaining)
          .build(logger);
    }

    // if any of the intermediate fragment submissions failed, fail the query
    final List<FragmentSubmitFailures.SubmissionException> submissionExceptions =
        fragmentSubmitFailures.submissionExceptions;
    if (submissionExceptions.size() > 0) {
      Set<DrillbitEndpoint> endpoints = Sets.newHashSet();
      StringBuilder sb = new StringBuilder();
      boolean first = true;

      for (FragmentSubmitFailures.SubmissionException e :
          fragmentSubmitFailures.submissionExceptions) {
        DrillbitEndpoint endpoint = e.drillbitEndpoint;
        if (endpoints.add(endpoint)) {
          if (first) {
            first = false;
          } else {
            sb.append(", ");
          }
          sb.append(endpoint.getAddress());
        }
      }
      throw UserException.connectionError(submissionExceptions.get(0).rpcException)
          .message("Error setting up remote intermediate fragment execution")
          .addContext("Nodes with failures", sb.toString())
          .build(logger);
    }

    injector.injectChecked(
        queryContext.getExecutionControls(), "send-fragments", ForemanException.class);
    /*
     * Send the remote (leaf) fragments; we don't wait for these. Any problems will come in through
     * the regular sendListener event delivery.
     */
    for (final DrillbitEndpoint ep : leafFragmentMap.keySet()) {
      sendRemoteFragments(ep, leafFragmentMap.get(ep), null, null);
    }
  }

  /**
   * Send all the remote fragments belonging to a single target drillbit in one request.
   *
   * @param assignment the drillbit assigned to these fragments
   * @param fragments the set of fragments
   * @param latch the countdown latch used to track the requests to all endpoints
   * @param fragmentSubmitFailures the submission failure counter used to track the requests to all
   *     endpoints
   */
  private void sendRemoteFragments(
      final DrillbitEndpoint assignment,
      final Collection<PlanFragment> fragments,
      final CountDownLatch latch,
      final FragmentSubmitFailures fragmentSubmitFailures) {
    @SuppressWarnings("resource")
    final Controller controller = drillbitContext.getController();
    final InitializeFragments.Builder fb = InitializeFragments.newBuilder();
    for (final PlanFragment planFragment : fragments) {
      fb.addFragment(planFragment);
    }
    final InitializeFragments initFrags = fb.build();

    logger.debug("Sending remote fragments to \nNode:\n{} \n\nData:\n{}", assignment, initFrags);
    final FragmentSubmitListener listener =
        new FragmentSubmitListener(assignment, initFrags, latch, fragmentSubmitFailures);
    controller.getTunnel(assignment).sendFragments(listener, initFrags);
  }

  public QueryState getState() {
    return state;
  }

  /** Used by {@link FragmentSubmitListener} to track the number of submission failures. */
  private static class FragmentSubmitFailures {
    static class SubmissionException {
      final DrillbitEndpoint drillbitEndpoint;
      final RpcException rpcException;

      SubmissionException(
          @SuppressWarnings("unused") final DrillbitEndpoint drillbitEndpoint,
          final RpcException rpcException) {
        this.drillbitEndpoint = drillbitEndpoint;
        this.rpcException = rpcException;
      }
    }

    final List<SubmissionException> submissionExceptions = new LinkedList<>();

    void addFailure(final DrillbitEndpoint drillbitEndpoint, final RpcException rpcException) {
      submissionExceptions.add(new SubmissionException(drillbitEndpoint, rpcException));
    }
  }

  private class FragmentSubmitListener extends EndpointListener<Ack, InitializeFragments> {
    private final CountDownLatch latch;
    private final FragmentSubmitFailures fragmentSubmitFailures;

    /**
     * Constructor.
     *
     * @param endpoint the endpoint for the submission
     * @param value the initialize fragments message
     * @param latch the latch to count down when the status is known; may be null
     * @param fragmentSubmitFailures the counter to use for failures; must be non-null iff latch is
     *     non-null
     */
    public FragmentSubmitListener(
        final DrillbitEndpoint endpoint,
        final InitializeFragments value,
        final CountDownLatch latch,
        final FragmentSubmitFailures fragmentSubmitFailures) {
      super(endpoint, value);
      Preconditions.checkState((latch == null) == (fragmentSubmitFailures == null));
      this.latch = latch;
      this.fragmentSubmitFailures = fragmentSubmitFailures;
    }

    @Override
    public void success(final Ack ack, final ByteBuf byteBuf) {
      if (latch != null) {
        latch.countDown();
      }
    }

    @Override
    public void failed(final RpcException ex) {
      if (latch != null) {
        fragmentSubmitFailures.addFailure(endpoint, ex);
        latch.countDown();
      } else {
        // since this won't be waited on, we can wait to deliver this event once the Foreman is
        // ready
        logger.debug("Failure while sending fragment.  Stopping query.", ex);
        stateListener.moveToState(QueryState.FAILED, ex);
      }
    }

    @Override
    public void interrupted(final InterruptedException e) {
      // Foreman shouldn't get interrupted while waiting for the RPC outcome of fragment submission.
      // Consider the interrupt as failure.
      final String errMsg = "Interrupted while waiting for the RPC outcome of fragment submission.";
      logger.error(errMsg, e);
      failed(new RpcException(errMsg, e));
    }
  }

  /**
   * Provides gated access to state transitions.
   *
   * <p>The StateListener waits on a latch before delivery state transitions to the Foreman. The
   * latch will be tripped when the Foreman is sufficiently set up that it can receive and process
   * external events from other threads.
   */
  public class StateListener {
    /**
     * Move the Foreman to the specified new state.
     *
     * @param newState the state to move to
     * @param ex if moving to a failure state, the exception that led to the failure; used for
     *     reporting to the user
     */
    public void moveToState(final QueryState newState, final Exception ex) {
      acceptExternalEvents.awaitUninterruptibly();

      Foreman.this.moveToState(newState, ex);
    }
  }

  /** Listens for the status of the RPC response sent to the user for the query. */
  private class ResponseSendListener extends BaseRpcOutcomeListener<Ack> {
    @Override
    public void failed(final RpcException ex) {
      logger.info(
          "Failure while trying communicate query result to initiating client. "
              + "This would happen if a client is disconnected before response notice can be sent.",
          ex);
      stateListener.moveToState(QueryState.FAILED, ex);
    }

    @Override
    public void interrupted(final InterruptedException e) {
      logger.warn(
          "Interrupted while waiting for RPC outcome of sending final query result to initiating client.");
      stateListener.moveToState(QueryState.FAILED, e);
    }
  }
}
Exemplo n.º 3
0
/**
 * Responsible for running a single fragment on a single Drillbit. Listens/responds to status
 * request and cancellation messages.
 */
public class FragmentExecutor implements Runnable {
  private static final org.slf4j.Logger logger =
      org.slf4j.LoggerFactory.getLogger(FragmentExecutor.class);
  private static final ControlsInjector injector =
      ControlsInjectorFactory.getInjector(FragmentExecutor.class);

  private final AtomicBoolean hasCloseoutThread = new AtomicBoolean(false);
  private final String fragmentName;
  private final FragmentContext fragmentContext;
  private final FragmentStatusReporter statusReporter;
  private final DeferredException deferredException = new DeferredException();
  private final PlanFragment fragment;
  private final FragmentRoot rootOperator;
  private final ReceiverExecutor receiverExecutor;

  private volatile RootExec root;
  private final AtomicReference<FragmentState> fragmentState =
      new AtomicReference<>(FragmentState.AWAITING_ALLOCATION);
  private final ExtendedLatch acceptExternalEvents = new ExtendedLatch();

  // Thread that is currently executing the Fragment. Value is null if the fragment hasn't started
  // running or finished
  private final AtomicReference<Thread> myThreadRef = new AtomicReference<>(null);

  /**
   * Create a FragmentExecutor where we need to parse and materialize the root operator.
   *
   * @param context
   * @param fragment
   * @param statusReporter
   */
  public FragmentExecutor(
      final FragmentContext context,
      final PlanFragment fragment,
      final FragmentStatusReporter statusReporter) {
    this(context, fragment, statusReporter, null);
  }

  /**
   * Create a FragmentExecutor where we already have a root operator in memory.
   *
   * @param context
   * @param fragment
   * @param statusReporter
   * @param rootOperator
   */
  public FragmentExecutor(
      final FragmentContext context,
      final PlanFragment fragment,
      final FragmentStatusReporter statusReporter,
      final FragmentRoot rootOperator) {
    this.fragmentContext = context;
    this.statusReporter = statusReporter;
    this.fragment = fragment;
    this.rootOperator = rootOperator;
    this.fragmentName = QueryIdHelper.getQueryIdentifier(context.getHandle());
    this.receiverExecutor = new ReceiverExecutor(fragmentName, fragmentContext.getExecutor());

    context.setExecutorState(new ExecutorStateImpl());
  }

  @Override
  public String toString() {
    final StringBuilder builder = new StringBuilder();
    builder.append("FragmentExecutor [fragmentContext=");
    builder.append(fragmentContext);
    builder.append(", fragmentState=");
    builder.append(fragmentState);
    builder.append("]");
    return builder.toString();
  }

  /**
   * Returns the current fragment status if the fragment is running. Otherwise, returns no status.
   *
   * @return FragmentStatus or null.
   */
  public FragmentStatus getStatus() {
    /*
     * If the query is not in a running state, the operator tree is still being constructed and
     * there is no reason to poll for intermediate results.
     *
     * Previously the call to get the operator stats with the AbstractStatusReporter was happening
     * before this check. This caused a concurrent modification exception as the list of operator
     * stats is iterated over while collecting info, and added to while building the operator tree.
     */
    if (fragmentState.get() != FragmentState.RUNNING) {
      return null;
    }

    return statusReporter.getStatus(FragmentState.RUNNING);
  }

  /**
   * Cancel the execution of this fragment is in an appropriate state. Messages come from external.
   * NOTE that this will be called from threads *other* than the one running this runnable(), so we
   * need to be careful about the state transitions that can result.
   */
  public void cancel() {
    final boolean thisIsOnlyThread = hasCloseoutThread.compareAndSet(false, true);

    if (!thisIsOnlyThread) {
      acceptExternalEvents.awaitUninterruptibly();

      /*
       * We set the cancel requested flag but the actual cancellation is managed by the run() loop, if called.
       */
      updateState(FragmentState.CANCELLATION_REQUESTED);

      /*
       * Interrupt the thread so that it exits from any blocking operation it could be executing currently. We
       * synchronize here to ensure we don't accidentally create a race condition where we interrupt the close out
       * procedure of the main thread.
       */
      synchronized (myThreadRef) {
        final Thread myThread = myThreadRef.get();
        if (myThread != null) {
          logger.debug("Interrupting fragment thread {}", myThread.getName());
          myThread.interrupt();
        }
      }
    } else {
      // countdown so receiver fragment finished can proceed.
      acceptExternalEvents.countDown();

      updateState(FragmentState.CANCELLATION_REQUESTED);
      cleanup(FragmentState.FINISHED);
    }
  }

  private void cleanup(FragmentState state) {

    closeOutResources();

    updateState(state);
    // send the final state of the fragment. only the main execution thread can send the final state
    // and it can
    // only be sent once.
    sendFinalState();
  }

  /**
   * Resume all the pauses within the current context. Note that this method will be called from
   * threads *other* than the one running this runnable(). Also, this method can be called multiple
   * times.
   */
  public synchronized void unpause() {
    fragmentContext.getExecutionControls().unpauseAll();
  }

  /**
   * Inform this fragment that one of its downstream partners no longer needs additional records.
   * This is most commonly called in the case that a limit query is executed.
   *
   * @param handle The downstream FragmentHandle of the Fragment that needs no more records from
   *     this Fragment.
   */
  public void receivingFragmentFinished(final FragmentHandle handle) {
    receiverExecutor.submitReceiverFinished(handle);
  }

  @Override
  public void run() {
    // if a cancel thread has already entered this executor, we have not reason to continue.
    if (!hasCloseoutThread.compareAndSet(false, true)) {
      return;
    }

    final Thread myThread = Thread.currentThread();
    myThreadRef.set(myThread);
    final String originalThreadName = myThread.getName();
    final FragmentHandle fragmentHandle = fragmentContext.getHandle();
    final DrillbitContext drillbitContext = fragmentContext.getDrillbitContext();
    final ClusterCoordinator clusterCoordinator = drillbitContext.getClusterCoordinator();
    final DrillbitStatusListener drillbitStatusListener = new FragmentDrillbitStatusListener();
    final String newThreadName = QueryIdHelper.getExecutorThreadName(fragmentHandle);

    try {

      myThread.setName(newThreadName);

      // if we didn't get the root operator when the executor was created, create it now.
      final FragmentRoot rootOperator =
          this.rootOperator != null
              ? this.rootOperator
              : drillbitContext.getPlanReader().readFragmentOperator(fragment.getFragmentJson());

      root = ImplCreator.getExec(fragmentContext, rootOperator);
      if (root == null) {
        return;
      }

      clusterCoordinator.addDrillbitStatusListener(drillbitStatusListener);
      updateState(FragmentState.RUNNING);

      acceptExternalEvents.countDown();
      injector.injectPause(fragmentContext.getExecutionControls(), "fragment-running", logger);

      final DrillbitEndpoint endpoint = drillbitContext.getEndpoint();
      logger.debug(
          "Starting fragment {}:{} on {}:{}",
          fragmentHandle.getMajorFragmentId(),
          fragmentHandle.getMinorFragmentId(),
          endpoint.getAddress(),
          endpoint.getUserPort());

      final UserGroupInformation queryUserUgi =
          fragmentContext.isImpersonationEnabled()
              ? ImpersonationUtil.createProxyUgi(fragmentContext.getQueryUserName())
              : ImpersonationUtil.getProcessUserUGI();

      queryUserUgi.doAs(
          new PrivilegedExceptionAction<Void>() {
            public Void run() throws Exception {
              injector.injectChecked(
                  fragmentContext.getExecutionControls(), "fragment-execution", IOException.class);
              /*
               * Run the query until root.next returns false OR we no longer need to continue.
               */
              while (shouldContinue() && root.next()) {
                // loop
              }

              return null;
            }
          });

    } catch (OutOfMemoryError | OutOfMemoryException e) {
      if (!(e instanceof OutOfMemoryError) || "Direct buffer memory".equals(e.getMessage())) {
        fail(UserException.memoryError(e).build(logger));
      } else {
        // we have a heap out of memory error. The JVM in unstable, exit.
        CatastrophicFailure.exit(
            e, "Unable to handle out of memory condition in FragmentExecutor.", -2);
      }
    } catch (AssertionError | Exception e) {
      fail(e);
    } finally {

      // no longer allow this thread to be interrupted. We synchronize here to make sure that cancel
      // can't set an
      // interruption after we have moved beyond this block.
      synchronized (myThreadRef) {
        myThreadRef.set(null);
        Thread.interrupted();
      }

      // We need to sure we countDown at least once. We'll do it here to guarantee that.
      acceptExternalEvents.countDown();

      // here we could be in FAILED, RUNNING, or CANCELLATION_REQUESTED
      cleanup(FragmentState.FINISHED);

      clusterCoordinator.removeDrillbitStatusListener(drillbitStatusListener);

      myThread.setName(originalThreadName);
    }
  }

  /**
   * Utility method to check where we are in a no terminal state.
   *
   * @return Whether or not execution should continue.
   */
  private boolean shouldContinue() {
    return !isCompleted() && FragmentState.CANCELLATION_REQUESTED != fragmentState.get();
  }

  /**
   * Returns true if the fragment is in a terminal state
   *
   * @return Whether this state is in a terminal state.
   */
  private boolean isCompleted() {
    return isTerminal(fragmentState.get());
  }

  private void sendFinalState() {
    final FragmentState outcome = fragmentState.get();
    if (outcome == FragmentState.FAILED) {
      final FragmentHandle handle = getContext().getHandle();
      final UserException uex =
          UserException.systemError(deferredException.getAndClear())
              .addIdentity(getContext().getIdentity())
              .addContext(
                  "Fragment", handle.getMajorFragmentId() + ":" + handle.getMinorFragmentId())
              .build(logger);
      statusReporter.fail(uex);
    } else {
      statusReporter.stateChanged(outcome);
    }
  }

  private void closeOutResources() {

    // first close the operators and release all memory.
    try {
      // Say executor was cancelled before setup. Now when executor actually runs, root is not
      // initialized, but this
      // method is called in finally. So root can be null.
      if (root != null) {
        root.close();
      }
    } catch (final Exception e) {
      fail(e);
    }

    // then close the fragment context.
    fragmentContext.close();
  }

  private void warnStateChange(final FragmentState current, final FragmentState target) {
    logger.warn(
        fragmentName + ": Ignoring unexpected state transition {} --> {}",
        current.name(),
        target.name());
  }

  private void errorStateChange(final FragmentState current, final FragmentState target) {
    final String msg = "%s: Invalid state transition %s --> %s";
    throw new StateTransitionException(
        String.format(msg, fragmentName, current.name(), target.name()));
  }

  private synchronized boolean updateState(FragmentState target) {
    final FragmentState current = fragmentState.get();
    logger.info(fragmentName + ": State change requested {} --> {}", current, target);
    switch (target) {
      case CANCELLATION_REQUESTED:
        switch (current) {
          case SENDING:
          case AWAITING_ALLOCATION:
          case RUNNING:
            fragmentState.set(target);
            statusReporter.stateChanged(target);
            return true;

          default:
            warnStateChange(current, target);
            return false;
        }

      case FINISHED:
        if (current == FragmentState.CANCELLATION_REQUESTED) {
          target = FragmentState.CANCELLED;
        } else if (current == FragmentState.FAILED) {
          target = FragmentState.FAILED;
        }
        // fall-through
      case FAILED:
        if (!isTerminal(current)) {
          fragmentState.set(target);
          // don't notify reporter until we finalize this terminal state.
          return true;
        } else if (current == FragmentState.FAILED) {
          // no warn since we can call fail multiple times.
          return false;
        } else if (current == FragmentState.CANCELLED && target == FragmentState.FAILED) {
          fragmentState.set(FragmentState.FAILED);
          return true;
        } else {
          warnStateChange(current, target);
          return false;
        }

      case RUNNING:
        if (current == FragmentState.AWAITING_ALLOCATION) {
          fragmentState.set(target);
          statusReporter.stateChanged(target);
          return true;
        } else {
          errorStateChange(current, target);
        }

        // these should never be requested.
      case CANCELLED:
      case SENDING:
      case AWAITING_ALLOCATION:
      default:
        errorStateChange(current, target);
    }

    // errorStateChange() throw should mean this is never executed
    throw new IllegalStateException();
  }

  private boolean isTerminal(final FragmentState state) {
    return state == FragmentState.CANCELLED
        || state == FragmentState.FAILED
        || state == FragmentState.FINISHED;
  }

  /**
   * Capture an exception and add store it. Update state to failed status (if not already there).
   * Does not immediately report status back to Foreman. Only the original thread can return status
   * to the Foreman.
   *
   * @param excep The failure that occurred.
   */
  private void fail(final Throwable excep) {
    deferredException.addThrowable(excep);
    updateState(FragmentState.FAILED);
  }

  public FragmentContext getContext() {
    return fragmentContext;
  }

  private class ExecutorStateImpl implements ExecutorState {
    public boolean shouldContinue() {
      return FragmentExecutor.this.shouldContinue();
    }

    public void fail(final Throwable t) {
      FragmentExecutor.this.fail(t);
    }

    public boolean isFailed() {
      return fragmentState.get() == FragmentState.FAILED;
    }

    public Throwable getFailureCause() {
      return deferredException.getException();
    }
  }

  private class FragmentDrillbitStatusListener implements DrillbitStatusListener {
    @Override
    public void drillbitRegistered(
        final Set<CoordinationProtos.DrillbitEndpoint> registeredDrillbits) {}

    @Override
    public void drillbitUnregistered(
        final Set<CoordinationProtos.DrillbitEndpoint> unregisteredDrillbits) {
      // if the defunct Drillbit was running our Foreman, then cancel the query
      final DrillbitEndpoint foremanEndpoint =
          FragmentExecutor.this.fragmentContext.getForemanEndpoint();
      if (unregisteredDrillbits.contains(foremanEndpoint)) {
        logger.warn(
            "Foreman {} no longer active.  Cancelling fragment {}.",
            foremanEndpoint.getAddress(),
            QueryIdHelper.getQueryIdentifier(fragmentContext.getHandle()));
        FragmentExecutor.this.cancel();
      }
    }
  }

  private class ReceiverExecutor extends SerializedExecutor {

    public ReceiverExecutor(String name, Executor underlyingExecutor) {
      super(name, underlyingExecutor);
    }

    @Override
    protected void runException(Runnable command, Throwable t) {
      logger.error("Failure running with exception of command {}", command, t);
    }

    public void submitReceiverFinished(FragmentHandle handle) {
      execute(new ReceiverFinished(handle));
    }
  }

  private class ReceiverFinished implements Runnable {
    final FragmentHandle handle;

    public ReceiverFinished(FragmentHandle handle) {
      super();
      this.handle = handle;
    }

    @Override
    public void run() {
      acceptExternalEvents.awaitUninterruptibly();

      if (root != null) {
        logger.info(
            "Applying request for early sender termination for {} -> {}.",
            QueryIdHelper.getFragmentId(getContext().getHandle()),
            QueryIdHelper.getFragmentId(handle));
        root.receivingFragmentFinished(handle);
      } else {
        logger.warn(
            "Dropping request for early fragment termination for path {} -> {} as no root exec exists.",
            QueryIdHelper.getFragmentId(getContext().getHandle()),
            QueryIdHelper.getFragmentId(handle));
      }
    }
  }
}