Example #1
0
  /**
   * Chains a StatsTask behind the given MoveTask and flags the producing work for stats gathering.
   *
   * <p>The StatsTask changes the Table/Partition metadata. For atomicity that must not happen
   * before the data is actually in place, which is the MoveTask's job, so the StatsTask is added
   * as a dependent of the MoveTask.
   *
   * @param nd the FileSinkOperator whose results are taken care of by the MoveTask.
   * @param mvTask The MoveTask that moves the FileSinkOperator's results.
   * @param currTask The MapRedTask that the FileSinkOperator belongs to.
   * @param hconf HiveConf
   */
  private void addStatsTask(
      FileSinkOperator nd, MoveTask mvTask, Task<? extends Serializable> currTask, HiveConf hconf) {

    final MoveWork moveWork = mvTask.getWork();

    // The stats work wraps whichever load descriptor the move work carries: the table load is
    // preferred, the file load is the fallback.
    final StatsWork stats;
    if (moveWork.getLoadTableWork() != null) {
      stats = new StatsWork(moveWork.getLoadTableWork());
    } else if (moveWork.getLoadFileWork() != null) {
      stats = new StatsWork(moveWork.getLoadFileWork());
    } else {
      stats = null;
    }
    assert stats != null : "Error when genereting StatsTask";
    stats.setStatsReliable(hconf.getBoolVar(ConfVars.HIVE_STATS_RELIABLE));
    final MapredWork mapredWork = (MapredWork) currTask.getWork();

    // Stats are published under the FileSinkDesc prefix and aggregated under the StatsWork key,
    // so the key is copied from the prefix to keep the two consistent.
    stats.setAggKey(nd.getConf().getStatsAggPrefix());
    final Task<? extends Serializable> statsTask = TaskFactory.get(stats, hconf);

    // Turn on stats gathering for both the file sink and the MapredWork.
    nd.getConf().setGatherStats(true);
    mapredWork.setGatheringStats(true);
    nd.getConf().setStatsReliable(hconf.getBoolVar(ConfVars.HIVE_STATS_RELIABLE));
    nd.getConf()
        .setMaxStatsKeyPrefixLength(hconf.getIntVar(ConfVars.HIVE_STATS_KEY_PREFIX_MAX_LENGTH));

    // The StatsTask runs after the MoveTask and subscribes to its feed, which is how the
    // MoveTask forwards the list of dynamic partitions.
    mvTask.addDependentTask(statsTask);
    statsTask.subscribeFeed(mvTask);
  }
Example #2
0
  /**
   * Rewires the plan for a partial scan analyze command.
   *
   * <p>The resulting plan is a PartialScanTask followed by the StatsTask: the partial scan task
   * takes the place of the current task among the root tasks and the stats task is made dependent
   * on it.
   *
   * @param tableScan the table scan being analyzed; supplies the stats aggregation prefix and the
   *     temporary stats directory
   * @param parseContext parse context whose configuration is used to create and initialize the
   *     partial scan task
   * @param statsWork stats work to flag as a partial scan analyze command
   * @param context Tez processing context holding the root tasks and the current task
   * @param statsTask the stats task to run after the partial scan task
   * @throws SemanticException declared for callers; presumably raised while resolving the input
   *     paths (not visible in this block)
   */
  private void handlePartialScanCommand(
      TableScanOperator tableScan,
      ParseContext parseContext,
      StatsWork statsWork,
      GenTezProcContext context,
      Task<StatsWork> statsTask)
      throws SemanticException {

    // The prefix is handed over in a mutable builder; whatever the builder holds after the call
    // is used as the aggregation key.
    StringBuilder keyBuilder = new StringBuilder(tableScan.getConf().getStatsAggPrefix());
    List<Path> scanPaths = GenMapRedUtils.getInputPathsForPartialScan(tableScan, keyBuilder);

    // Describe the partial scan itself.
    PartialScanWork partialScanWork = new PartialScanWork(scanPaths);
    partialScanWork.setMapperCannotSpanPartns(true);
    partialScanWork.setAggKey(keyBuilder.toString());
    partialScanWork.setStatsTmpDir(tableScan.getConf().getTmpStatsDir(), parseContext.getConf());

    // Let the stats work know it follows a partial scan.
    statsWork.setPartialScanAnalyzeCommand(true);

    // Create and initialize the task that executes the partial scan.
    DriverContext driverContext = new DriverContext();
    Task<PartialScanWork> scanTask = TaskFactory.get(partialScanWork, parseContext.getConf());
    scanTask.initialize(parseContext.getConf(), null, driverContext);
    scanTask.setWork(partialScanWork);
    statsWork.setSourceTask(scanTask);

    // The partial scan task becomes a root task in place of the current task, and the stats task
    // runs after it.
    context.rootTasks.remove(context.currentTask);
    context.rootTasks.add(scanTask);
    scanTask.addDependentTask(statsTask);
  }
  /**
   * Rewires the plan for a partial scan analyze command. The resulting plan is a PartialScanTask
   * followed by the StatsTask.
   *
   * @param op the table scan being analyzed; supplies the stats aggregation prefix
   * @param ctx MapReduce processing context holding the root tasks
   * @param parseCtx parse context whose configuration is used to create and initialize the partial
   *     scan task
   * @param currTask the task to remove from the root tasks in favour of the partial scan task
   * @param statsWork stats work to flag as a partial scan analyze command
   * @param statsTask the stats task to run after the partial scan task
   * @throws SemanticException declared for callers; presumably raised while resolving the input
   *     paths (not visible in this block)
   */
  private void handlePartialScanCommand(
      TableScanOperator op,
      GenMRProcContext ctx,
      ParseContext parseCtx,
      Task<? extends Serializable> currTask,
      StatsWork statsWork,
      Task<StatsWork> statsTask)
      throws SemanticException {
    // The prefix is handed over in a mutable buffer; whatever the buffer holds after the call is
    // used as the aggregation key.
    StringBuffer keyBuffer = new StringBuffer(op.getConf().getStatsAggPrefix());
    List<Path> scanPaths = GenMapRedUtils.getInputPathsForPartialScan(op, keyBuffer);

    // Describe the partial scan itself.
    PartialScanWork partialScanWork = new PartialScanWork(scanPaths);
    partialScanWork.setMapperCannotSpanPartns(true);
    partialScanWork.setAggKey(keyBuffer.toString());

    // Let the stats work know it follows a partial scan.
    statsWork.setPartialScanAnalyzeCommand(true);

    // Create and initialize the task that executes the partial scan.
    DriverContext driverContext = new DriverContext();
    Task<PartialScanWork> scanTask = TaskFactory.get(partialScanWork, parseCtx.getConf());
    scanTask.initialize(parseCtx.getConf(), null, driverContext);
    scanTask.setWork(partialScanWork);

    // The partial scan task becomes a root task in place of the current task, and the stats task
    // runs after it.
    ctx.getRootTasks().remove(currTask);
    ctx.getRootTasks().add(scanTask);
    scanTask.addDependentTask(statsTask);

    // The partial scan task is set as the one and only parent of the stats task.
    List<Task<? extends Serializable>> statsParents =
        new ArrayList<Task<? extends Serializable>>();
    statsParents.add(scanTask);
    statsTask.setParentTasks(statsParents);
  }
  /**
   * Chains a StatsTask behind the given MoveTask and flags the producing work for stats gathering.
   *
   * <p>The StatsTask changes the Table/Partition metadata. For atomicity that must not happen
   * before the data is actually in place, which is the MoveTask's job, so the StatsTask is added
   * as a dependent of the MoveTask.
   *
   * @param nd the FileSinkOperator whose results are taken care of by the MoveTask.
   * @param mvTask The MoveTask that moves the FileSinkOperator's results.
   * @param currTask The MapRedTask that the FileSinkOperator belongs to.
   * @param hconf HiveConf
   */
  private void addStatsTask(
      FileSinkOperator nd, MoveTask mvTask, Task<? extends Serializable> currTask, HiveConf hconf) {

    final MoveWork moveWork = mvTask.getWork();
    final StatsWork stats = new StatsWork(moveWork.getLoadTableWork());
    final MapredWork mapredWork = (MapredWork) currTask.getWork();

    // Stats are published under the FileSinkDesc prefix and aggregated under the StatsWork key,
    // so the key is copied from the prefix to keep the two consistent.
    stats.setAggKey(nd.getConf().getStatsAggPrefix());
    final Task<? extends Serializable> statsTask = TaskFactory.get(stats, hconf);

    // Turn on stats gathering for both the file sink and the MapredWork.
    nd.getConf().setGatherStats(true);
    mapredWork.setGatheringStats(true);

    // The StatsTask runs after the MoveTask and subscribes to its feed, which is how the
    // MoveTask forwards the list of dynamic partitions.
    mvTask.addDependentTask(statsTask);
    statsTask.subscribeFeed(mvTask);
  }
Example #5
0
  /**
   * Plans an ANALYZE TABLE ... COMPUTE STATISTICS command for the visited table scan.
   *
   * <p>For ORC tables a single StatsNoJobTask replaces the current task among the root tasks. For
   * every other input format a StatsTask is made dependent on the current task; the noscan and
   * partialscan variants then adjust the root tasks accordingly, and the map work created for the
   * scan is flagged as gathering stats.
   *
   * @param nd the table scan operator being visited
   * @param stack operator stack of the walk (unused here)
   * @param procContext the Tez processing context
   * @param nodeOutputs outputs of previously processed nodes (unused here)
   * @return {@code true} when the query is an analyze command, {@code null} otherwise
   * @throws SemanticException if handling of the partial scan command fails
   */
  @SuppressWarnings("unchecked")
  @Override
  public Object process(
      Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs)
      throws SemanticException {

    GenTezProcContext context = (GenTezProcContext) procContext;
    TableScanOperator tableScan = (TableScanOperator) nd;
    ParseContext parseContext = context.parseContext;
    Class<? extends InputFormat> inputFormat =
        tableScan.getConf().getTableMetadata().getInputFormatClass();

    // Only analyze commands are handled by this processor.
    if (!parseContext.getQueryProperties().isAnalyzeCommand()) {
      return null;
    }

    // The scan of an analyze command is expected to have no child operators.
    assert tableScan.getChildOperators() == null || tableScan.getChildOperators().isEmpty();

    // Sanity check: the scan must be registered as a top operator under some alias.
    String alias = null;
    for (String candidate : parseContext.getTopOps().keySet()) {
      if (parseContext.getTopOps().get(candidate) == tableScan) {
        alias = candidate;
      }
    }
    assert alias != null;

    TezWork tezWork = context.currentTask.getWork();

    if (inputFormat.equals(OrcInputFormat.class)) {
      // For ORC, all the following statements are the same
      // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS
      // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan;
      // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;

      // No Tez job is placed above this task.
      StatsNoJobWork noJobWork =
          new StatsNoJobWork(tableScan.getConf().getTableMetadata().getTableSpec());
      noJobWork.setStatsReliable(
          parseContext.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));

      // When partitions were specified, pass the pruned partition list along.
      Set<Partition> orcPartitions = GenMapRedUtils.getConfirmedPartitionsForScan(tableScan);
      if (!orcPartitions.isEmpty()) {
        Table sourceTable = tableScan.getConf().getTableMetadata();
        List<String> partitionColumns = GenMapRedUtils.getPartitionColumns(tableScan);
        noJobWork.setPrunedPartitionList(
            new PrunedPartitionList(sourceTable, orcPartitions, partitionColumns, false));
      }

      // The no-job stats task replaces the current task as a root task.
      Task<StatsNoJobWork> noJobTask = TaskFactory.get(noJobWork, parseContext.getConf());
      noJobTask.setParentTasks(null);
      context.rootTasks.remove(context.currentTask);
      context.rootTasks.add(noJobTask);
      return true;
    }

    // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS;
    // The plan consists of a simple TezTask followed by a StatsTask.
    // The Tez task is just a simple TableScanOperator
    StatsWork statsWork = new StatsWork(tableScan.getConf().getTableMetadata().getTableSpec());
    statsWork.setAggKey(tableScan.getConf().getStatsAggPrefix());
    statsWork.setStatsTmpDir(tableScan.getConf().getTmpStatsDir());
    statsWork.setSourceTask(context.currentTask);
    statsWork.setStatsReliable(
        parseContext.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
    Task<StatsWork> statsTask = TaskFactory.get(statsWork, parseContext.getConf());
    context.currentTask.addDependentTask(statsTask);

    // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;
    // The plan consists of a StatsTask only.
    if (parseContext.getQueryProperties().isNoScanAnalyzeCommand()) {
      statsTask.setParentTasks(null);
      statsWork.setNoScanAnalyzeCommand(true);
      context.rootTasks.remove(context.currentTask);
      context.rootTasks.add(statsTask);
    }

    // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan;
    if (parseContext.getQueryProperties().isPartialScanAnalyzeCommand()) {
      handlePartialScanCommand(tableScan, parseContext, statsWork, context, statsTask);
    }

    // NOTE: here we should use the new partition predicate pushdown API to get a list of pruned
    // list, and pass it to setTaskPlan as the last parameter
    Set<Partition> scanPartitions = GenMapRedUtils.getConfirmedPartitionsForScan(tableScan);
    PrunedPartitionList prunedPartitions = null;
    if (!scanPartitions.isEmpty()) {
      Table sourceTable = tableScan.getConf().getTableMetadata();
      List<String> partitionColumns = GenMapRedUtils.getPartitionColumns(tableScan);
      prunedPartitions =
          new PrunedPartitionList(sourceTable, scanPartitions, partitionColumns, false);
    }

    // Create the map work for the scan and flag it as gathering stats.
    MapWork mapWork = utils.createMapWork(context, tableScan, tezWork, prunedPartitions);
    mapWork.setGatheringStats(true);

    return true;
  }
  /**
   * Analyzes a LOAD DATA statement and builds its task plan.
   *
   * <p>The plan is an optional copy task (when the LOCAL keyword is given) followed by a move task
   * that loads the data into the destination table or partition. When stats auto-gathering is
   * enabled a stats task is appended, and when index auto-update is enabled the index update
   * tasks are placed between the move task and the stats task.
   *
   * <p>The stats task is always attached to the plan when it was created: if index auto-update
   * yields no update tasks, or generating them fails, the stats task depends directly on the move
   * task.
   *
   * @param ast root of the LOAD statement; child 0 is the source path, child 1 the destination
   *     table, and the remaining children are the optional LOCAL / OVERWRITE keywords
   * @throws SemanticException if the source path is invalid, the destination is offline, a view,
   *     non-native or stored as sub-directories, a partitioned table is targeted without a
   *     partition spec, or the destination partition cannot be looked up
   */
  @Override
  public void analyzeInternal(ASTNode ast) throws SemanticException {
    isLocal = false;
    isOverWrite = false;
    Tree fromTree = ast.getChild(0);
    Tree tableTree = ast.getChild(1);

    // Four children: both LOCAL and OVERWRITE were given.
    if (ast.getChildCount() == 4) {
      isLocal = true;
      isOverWrite = true;
    }

    // Three children: exactly one keyword was given. The comparison is case-insensitive and
    // independent of the default locale.
    if (ast.getChildCount() == 3) {
      if (ast.getChild(2).getText().equalsIgnoreCase("local")) {
        isLocal = true;
      } else {
        isOverWrite = true;
      }
    }

    // initialize load path
    URI fromURI;
    try {
      String fromPath = stripQuotes(fromTree.getText());
      fromURI = initializeFromURI(fromPath);
    } catch (IOException e) {
      throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(fromTree, e.getMessage()), e);
    } catch (URISyntaxException e) {
      throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(fromTree, e.getMessage()), e);
    }

    // initialize destination table/partition
    tableSpec ts = new tableSpec(db, conf, (ASTNode) tableTree);

    if (ts.tableHandle.isOffline()) {
      throw new SemanticException(
          ErrorMsg.OFFLINE_TABLE_OR_PARTITION.getMsg(":Table " + ts.tableName));
    }

    if (ts.tableHandle.isView()) {
      throw new SemanticException(ErrorMsg.DML_AGAINST_VIEW.getMsg());
    }
    if (ts.tableHandle.isNonNative()) {
      throw new SemanticException(ErrorMsg.LOAD_INTO_NON_NATIVE.getMsg());
    }

    if (ts.tableHandle.isStoredAsSubDirectories()) {
      throw new SemanticException(ErrorMsg.LOAD_INTO_STORED_AS_DIR.getMsg());
    }

    URI toURI =
        (ts.partHandle != null)
            ? ts.partHandle.getDataLocation()
            : ts.tableHandle.getDataLocation();

    // A partitioned table can only be loaded through an explicit partition spec.
    List<FieldSchema> parts = ts.tableHandle.getPartitionKeys();
    if ((parts != null && parts.size() > 0) && (ts.partSpec == null || ts.partSpec.size() == 0)) {
      throw new SemanticException(ErrorMsg.NEED_PARTITION_ERROR.getMsg());
    }

    // make sure the arguments make sense
    applyConstraints(fromURI, toURI, fromTree, isLocal);

    Task<? extends Serializable> rTask = null;

    // create copy work
    if (isLocal) {
      // if the local keyword is specified - we will always make a copy. this
      // might seem redundant in the case
      // that the hive warehouse is also located in the local file system - but
      // that's just a test case.
      String copyURIStr = ctx.getExternalTmpFileURI(toURI);
      URI copyURI = URI.create(copyURIStr);
      rTask = TaskFactory.get(new CopyWork(fromURI.toString(), copyURIStr), conf);
      fromURI = copyURI;
    }

    // create final load/move work

    String loadTmpPath = ctx.getExternalTmpFileURI(toURI);
    Map<String, String> partSpec = ts.getPartSpec();
    if (partSpec == null) {
      partSpec = new LinkedHashMap<String, String>();
      outputs.add(new WriteEntity(ts.tableHandle));
    } else {
      try {
        Partition part = Hive.get().getPartition(ts.tableHandle, partSpec, false);
        if (part != null) {
          if (part.isOffline()) {
            throw new SemanticException(
                ErrorMsg.OFFLINE_TABLE_OR_PARTITION.getMsg(ts.tableName + ":" + part.getName()));
          }
          outputs.add(new WriteEntity(part));
        } else {
          // The partition does not exist yet, so the table is recorded as the write entity.
          outputs.add(new WriteEntity(ts.tableHandle));
        }
      } catch (HiveException e) {
        throw new SemanticException(e);
      }
    }

    LoadTableDesc loadTableWork =
        new LoadTableDesc(
            fromURI.toString(),
            loadTmpPath,
            Utilities.getTableDesc(ts.tableHandle),
            partSpec,
            isOverWrite);

    // The move task follows the copy task when there is one; otherwise it is the root.
    Task<? extends Serializable> childTask =
        TaskFactory.get(new MoveWork(getInputs(), getOutputs(), loadTableWork, null, true), conf);
    if (rTask != null) {
      rTask.addDependentTask(childTask);
    } else {
      rTask = childTask;
    }

    rootTasks.add(rTask);

    // The user asked for stats to be collected.
    // Some stats like number of rows require a scan of the data
    // However, some other stats, like number of files, do not require a complete scan
    // Update the stats which do not require a complete scan.
    Task<? extends Serializable> statTask = null;
    if (conf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
      StatsWork statDesc = new StatsWork(loadTableWork);
      statDesc.setNoStatsAggregator(true);
      statDesc.setClearAggregatorStats(true);
      statDesc.setStatsReliable(conf.getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
      statTask = TaskFactory.get(statDesc, conf);
    }

    // Tracks whether the stats task has been hooked behind at least one index update task.
    boolean statTaskAttached = false;

    // HIVE-3334 has been filed for load file with index auto update
    if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVEINDEXAUTOUPDATE)) {
      IndexUpdater indexUpdater = new IndexUpdater(loadTableWork, getInputs(), conf);
      try {
        List<Task<? extends Serializable>> indexUpdateTasks = indexUpdater.generateUpdateTasks();

        for (Task<? extends Serializable> updateTask : indexUpdateTasks) {
          // LOAD DATA will either have a copy & move or just a move,
          // we always want the update to be dependent on the move
          childTask.addDependentTask(updateTask);
          if (statTask != null) {
            updateTask.addDependentTask(statTask);
            statTaskAttached = true;
          }
        }
      } catch (HiveException e) {
        // Best effort: the load itself still proceeds, but the indexes are left stale.
        console.printInfo(
            "WARNING: could not auto-update stale indexes, indexes are now out of sync: "
                + e.getMessage());
      }
    }

    // Without index update tasks to hang it on (auto-update disabled, no tasks generated, or
    // generation failed) the stats task runs directly after the move task instead of being
    // dropped from the plan.
    if (statTask != null && !statTaskAttached) {
      childTask.addDependentTask(statTask);
    }
  }
  /**
   * Table scan encountered.
   *
   * <p>Creates a new MapRedTask for the scan and registers it, together with the scan and its
   * alias, in the processing context. For analyze commands the plan is additionally extended with
   * a StatsNoJobTask (ORC) or a StatsTask (all other input formats).
   *
   * @param nd the table scan operator encountered
   * @param stack operator stack of the walk (unused here)
   * @param opProcCtx context
   * @param nodeOutputs outputs of previously processed nodes (unused here)
   * @return {@code true} once the scan has been matched against a top operator; {@code null} if
   *     no top operator matches, which is treated as a programming error (see the assert)
   * @throws SemanticException if handling of the partial scan command fails
   */
  public Object process(
      Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs)
      throws SemanticException {
    TableScanOperator op = (TableScanOperator) nd;
    GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
    ParseContext parseCtx = ctx.getParseCtx();
    Class<? extends InputFormat> inputFormat =
        op.getConf().getTableMetadata().getInputFormatClass();
    Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();

    // create a dummy MapReduce task
    MapredWork currWork = GenMapRedUtils.getMapRedWork(parseCtx);
    MapRedTask currTask = (MapRedTask) TaskFactory.get(currWork, parseCtx.getConf());
    Operator<? extends OperatorDesc> currTopOp = op;
    ctx.setCurrTask(currTask);
    ctx.setCurrTopOp(currTopOp);

    // Find the alias under which this scan is registered as a top operator; the first match
    // ends the method.
    for (String alias : parseCtx.getTopOps().keySet()) {
      Operator<? extends OperatorDesc> currOp = parseCtx.getTopOps().get(alias);
      if (currOp == op) {
        String currAliasId = alias;
        ctx.setCurrAliasId(currAliasId);
        mapCurrCtx.put(op, new GenMapRedCtx(currTask, currAliasId));

        if (parseCtx.getQueryProperties().isAnalyzeCommand()) {
          boolean partialScan = parseCtx.getQueryProperties().isPartialScanAnalyzeCommand();
          boolean noScan = parseCtx.getQueryProperties().isNoScanAnalyzeCommand();
          if (inputFormat.equals(OrcInputFormat.class)) {
            // For ORC, all the following statements are the same
            // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS
            // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan;
            // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;

            // There will not be any MR or Tez job above this task
            StatsNoJobWork snjWork =
                new StatsNoJobWork(op.getConf().getTableMetadata().getTableSpec());
            snjWork.setStatsReliable(
                parseCtx.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
            // If partition is specified, get pruned partition list
            Set<Partition> confirmedParts = GenMapRedUtils.getConfirmedPartitionsForScan(op);
            if (confirmedParts.size() > 0) {
              Table source = op.getConf().getTableMetadata();
              List<String> partCols = GenMapRedUtils.getPartitionColumns(op);
              PrunedPartitionList partList =
                  new PrunedPartitionList(source, confirmedParts, partCols, false);
              snjWork.setPrunedPartitionList(partList);
            }
            // The no-job stats task becomes the current task and the only root task; the
            // current top operator is cleared.
            Task<StatsNoJobWork> snjTask = TaskFactory.get(snjWork, parseCtx.getConf());
            ctx.setCurrTask(snjTask);
            ctx.setCurrTopOp(null);
            ctx.getRootTasks().clear();
            ctx.getRootTasks().add(snjTask);
          } else {
            // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS;
            // The plan consists of a simple MapRedTask followed by a StatsTask.
            // The MR task is just a simple TableScanOperator

            StatsWork statsWork = new StatsWork(op.getConf().getTableMetadata().getTableSpec());
            statsWork.setAggKey(op.getConf().getStatsAggPrefix());
            statsWork.setSourceTask(currTask);
            statsWork.setStatsReliable(
                parseCtx.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
            Task<StatsWork> statsTask = TaskFactory.get(statsWork, parseCtx.getConf());
            currTask.addDependentTask(statsTask);
            // Register the MR task as a root task unless it already is one.
            if (!ctx.getRootTasks().contains(currTask)) {
              ctx.getRootTasks().add(currTask);
            }

            // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;
            // The plan consists of a StatsTask only.
            if (noScan) {
              statsTask.setParentTasks(null);
              statsWork.setNoScanAnalyzeCommand(true);
              ctx.getRootTasks().remove(currTask);
              ctx.getRootTasks().add(statsTask);
            }

            // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan;
            if (partialScan) {
              handlePartialScanCommand(op, ctx, parseCtx, currTask, statsWork, statsTask);
            }

            // Flag the map side, and the reduce side when present, as gathering stats.
            currWork.getMapWork().setGatheringStats(true);
            if (currWork.getReduceWork() != null) {
              currWork.getReduceWork().setGatheringStats(true);
            }

            // NOTE: here we should use the new partition predicate pushdown API to get a list of
            // pruned list,
            // and pass it to setTaskPlan as the last parameter
            Set<Partition> confirmedPartns = GenMapRedUtils.getConfirmedPartitionsForScan(op);
            if (confirmedPartns.size() > 0) {
              Table source = op.getConf().getTableMetadata();
              List<String> partCols = GenMapRedUtils.getPartitionColumns(op);
              PrunedPartitionList partList =
                  new PrunedPartitionList(source, confirmedPartns, partCols, false);
              GenMapRedUtils.setTaskPlan(currAliasId, currTopOp, currTask, false, ctx, partList);
            } else { // non-partitioned table
              GenMapRedUtils.setTaskPlan(currAliasId, currTopOp, currTask, false, ctx);
            }
          }
        }

        return true;
      }
    }
    // Reaching this point means the scan is not among the top operators. With assertions
    // enabled this fails fast; otherwise null is returned.
    assert false;
    return null;
  }