/**
 * Construct a conditional task given the current leaf task, the MoveWork and the MapredWork.
 *
 * @param conf HiveConf
 * @param currTask current leaf task
 * @param mvWork MoveWork for the move task
 * @param mergeWork MapredWork for the merge task
 * @param inputPath the input directory of the merge/move task
 * @return the conditional task
 */
private ConditionalTask createCondTask(
    HiveConf conf,
    Task<? extends Serializable> currTask,
    MoveWork mvWork,
    MapredWork mergeWork,
    String inputPath) {

  Task<? extends Serializable> mergeTask = TaskFactory.get(mergeWork, conf);
  Task<? extends Serializable> moveTask = TaskFactory.get(mvWork, conf);

  List<Serializable> listWorks = new ArrayList<Serializable>();
  listWorks.add(mvWork);
  listWorks.add(mergeWork);
  ConditionalWork cndWork = new ConditionalWork(listWorks);

  List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>();
  listTasks.add(moveTask);
  listTasks.add(mergeTask);

  ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork, conf);
  cndTsk.setListTasks(listTasks);

  // create resolver
  cndTsk.setResolver(new ConditionalResolverMergeFiles());
  ConditionalResolverMergeFilesCtx mrCtx =
      new ConditionalResolverMergeFilesCtx(listTasks, inputPath);
  cndTsk.setResolverCtx(mrCtx);

  // make the conditional task the child of the current leaf task
  currTask.addDependentTask(cndTsk);

  return cndTsk;
}
/**
 * Construct a conditional task given the current leaf task, the MoveWork and the MapredWork.
 *
 * @param conf HiveConf
 * @param currTask current leaf task
 * @param mvWork MoveWork for the move task
 * @param mergeWork MapredWork for the merge task
 * @param inputPath the input directory of the merge/move task
 * @return the conditional task
 */
private ConditionalTask createCondTask(
    HiveConf conf,
    Task<? extends Serializable> currTask,
    MoveWork mvWork,
    MapredWork mergeWork,
    String inputPath) {

  // There are three options for this ConditionalTask:
  // 1) Merge the partitions.
  // 2) Move the partitions (i.e. don't merge the partitions).
  // 3) Merge some partitions and move the others. In this case the merge is done first,
  //    followed by the move, to prevent conflicts.
  Task<? extends Serializable> mergeOnlyMergeTask = TaskFactory.get(mergeWork, conf);
  Task<? extends Serializable> moveOnlyMoveTask = TaskFactory.get(mvWork, conf);
  Task<? extends Serializable> mergeAndMoveMergeTask = TaskFactory.get(mergeWork, conf);
  Task<? extends Serializable> mergeAndMoveMoveTask = TaskFactory.get(mvWork, conf);

  // NOTE: it is necessary that the merge task is the parent of the move task, and not
  // the other way around, for the execute method of ConditionalTask to work properly.
  mergeAndMoveMergeTask.addDependentTask(mergeAndMoveMoveTask);

  List<Serializable> listWorks = new ArrayList<Serializable>();
  listWorks.add(mvWork);
  listWorks.add(mergeWork);
  ConditionalWork cndWork = new ConditionalWork(listWorks);

  List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>();
  listTasks.add(moveOnlyMoveTask);
  listTasks.add(mergeOnlyMergeTask);
  listTasks.add(mergeAndMoveMergeTask);

  ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork, conf);
  cndTsk.setListTasks(listTasks);

  // create resolver
  cndTsk.setResolver(new ConditionalResolverMergeFiles());
  ConditionalResolverMergeFilesCtx mrCtx =
      new ConditionalResolverMergeFilesCtx(listTasks, inputPath);
  cndTsk.setResolverCtx(mrCtx);

  // make the conditional task the child of the current leaf task
  currTask.addDependentTask(cndTsk);

  return cndTsk;
}
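
// A minimal, self-contained sketch of the pattern createCondTask sets up: several candidate
// task chains are registered up front and one of them is chosen at run time. SimpleTask,
// chooseBranch and the size threshold are hypothetical stand-ins for illustration only; they
// are not Hive classes and do not reproduce ConditionalResolverMergeFiles' actual logic.
import java.util.Arrays;
import java.util.List;

public class ConditionalBranchSketch {

  /** A trivially simple task: just a name and an optional dependent task. */
  static class SimpleTask {
    final String name;
    SimpleTask dependent;

    SimpleTask(String name) { this.name = name; }

    SimpleTask addDependent(SimpleTask t) { this.dependent = t; return t; }

    void run() {
      System.out.println("running " + name);
      if (dependent != null) {
        dependent.run();
      }
    }
  }

  /** Pick a branch index the way a merge-files resolver conceptually would. */
  static int chooseBranch(long[] partitionSizes, long smallFileThreshold) {
    boolean anySmall = false;
    boolean anyLarge = false;
    for (long size : partitionSizes) {
      if (size < smallFileThreshold) { anySmall = true; } else { anyLarge = true; }
    }
    if (!anySmall) return 0;  // nothing to merge: move only
    if (!anyLarge) return 1;  // everything small: merge only
    return 2;                 // mixed: merge first, then move
  }

  public static void main(String[] args) {
    SimpleTask moveOnly = new SimpleTask("move");
    SimpleTask mergeOnly = new SimpleTask("merge");
    SimpleTask mergeThenMove = new SimpleTask("merge");
    mergeThenMove.addDependent(new SimpleTask("move"));  // merge is the parent of move

    List<SimpleTask> branches = Arrays.asList(moveOnly, mergeOnly, mergeThenMove);
    long[] partitionSizes = {10L, 5_000L};               // one small, one large partition
    branches.get(chooseBranch(partitionSizes, 1_000L)).run();
  }
}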
/**
 * Add the StatsTask as a dependent task of the MoveTask, because the StatsTask will change the
 * Table/Partition metadata. For atomicity, we should not change the metadata before the data is
 * actually in place, which is what the MoveTask guarantees.
 *
 * @param nd the FileSinkOperator whose results are taken care of by the MoveTask
 * @param mvTask the MoveTask that moves the FileSinkOperator's results
 * @param currTask the MapRedTask that the FileSinkOperator belongs to
 * @param hconf HiveConf
 */
private void addStatsTask(
    FileSinkOperator nd, MoveTask mvTask, Task<? extends Serializable> currTask, HiveConf hconf) {

  MoveWork mvWork = ((MoveTask) mvTask).getWork();
  StatsWork statsWork = null;
  if (mvWork.getLoadTableWork() != null) {
    statsWork = new StatsWork(mvWork.getLoadTableWork());
  } else if (mvWork.getLoadFileWork() != null) {
    statsWork = new StatsWork(mvWork.getLoadFileWork());
  }
  assert statsWork != null : "Error when generating StatsTask";

  statsWork.setStatsReliable(hconf.getBoolVar(ConfVars.HIVE_STATS_RELIABLE));
  MapredWork mrWork = (MapredWork) currTask.getWork();

  // AggKey in StatsWork is used for stats aggregation, while StatsAggPrefix
  // in FileSinkDesc is used for stats publishing. They should be consistent.
  statsWork.setAggKey(((FileSinkOperator) nd).getConf().getStatsAggPrefix());
  Task<? extends Serializable> statsTask = TaskFactory.get(statsWork, hconf);

  // mark the MapredWork and FileSinkOperator for gathering stats
  nd.getConf().setGatherStats(true);
  mrWork.setGatheringStats(true);
  nd.getConf().setStatsReliable(hconf.getBoolVar(ConfVars.HIVE_STATS_RELIABLE));
  nd.getConf()
      .setMaxStatsKeyPrefixLength(hconf.getIntVar(ConfVars.HIVE_STATS_KEY_PREFIX_MAX_LENGTH));
  // mrWork.addDestinationTable(nd.getConf().getTableInfo().getTableName());

  // subscribe feeds from the MoveTask so that the MoveTask can forward the list
  // of dynamic partitions to the StatsTask
  mvTask.addDependentTask(statsTask);
  statsTask.subscribeFeed(mvTask);
}
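
// The "subscribe feeds" call above is what lets the MoveTask tell the StatsTask which dynamic
// partitions were actually created before any statistics are published. A hedged sketch of
// that idea follows; MoveStep and StatsStep are invented names, not Hive APIs.
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class FeedSubscriptionSketch {

  interface FeedSubscriber {
    void receiveFeed(List<String> dynamicPartitions);
  }

  static class MoveStep {
    private final List<FeedSubscriber> subscribers = new ArrayList<>();

    void subscribe(FeedSubscriber s) { subscribers.add(s); }

    void run() {
      // Pretend the move produced two dynamic partitions.
      List<String> created = Arrays.asList("ds=2024-01-01", "ds=2024-01-02");
      System.out.println("moved data for " + created);
      for (FeedSubscriber s : subscribers) {
        s.receiveFeed(created);  // stats sees the partitions only after the move
      }
    }
  }

  static class StatsStep implements FeedSubscriber {
    @Override
    public void receiveFeed(List<String> dynamicPartitions) {
      System.out.println("publishing stats for " + dynamicPartitions);
    }
  }

  public static void main(String[] args) {
    MoveStep move = new MoveStep();
    move.subscribe(new StatsStep());
    move.run();
  }
}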
/**
 * handle partial scan command.
 *
 * <p>It is composed of PartialScanTask followed by StatsTask.
 */
private void handlePartialScanCommand(
    TableScanOperator tableScan,
    ParseContext parseContext,
    StatsWork statsWork,
    GenTezProcContext context,
    Task<StatsWork> statsTask)
    throws SemanticException {

  String aggregationKey = tableScan.getConf().getStatsAggPrefix();
  StringBuilder aggregationKeyBuffer = new StringBuilder(aggregationKey);
  List<Path> inputPaths =
      GenMapRedUtils.getInputPathsForPartialScan(tableScan, aggregationKeyBuffer);
  aggregationKey = aggregationKeyBuffer.toString();

  // scan work
  PartialScanWork scanWork = new PartialScanWork(inputPaths);
  scanWork.setMapperCannotSpanPartns(true);
  scanWork.setAggKey(aggregationKey);
  scanWork.setStatsTmpDir(tableScan.getConf().getTmpStatsDir(), parseContext.getConf());

  // stats work
  statsWork.setPartialScanAnalyzeCommand(true);

  // partial scan task
  DriverContext driverCxt = new DriverContext();
  Task<PartialScanWork> partialScanTask = TaskFactory.get(scanWork, parseContext.getConf());
  partialScanTask.initialize(parseContext.getConf(), null, driverCxt);
  partialScanTask.setWork(scanWork);
  statsWork.setSourceTask(partialScanTask);

  // task dependency
  context.rootTasks.remove(context.currentTask);
  context.rootTasks.add(partialScanTask);
  partialScanTask.addDependentTask(statsTask);
}
/**
 * Table scan encountered.
 *
 * @param nd the table scan operator encountered
 * @param opProcCtx context
 */
public Object process(
    Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs)
    throws SemanticException {

  TableScanOperator op = (TableScanOperator) nd;
  GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
  ParseContext parseCtx = ctx.getParseCtx();
  Map<Operator<? extends Serializable>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();

  // create a dummy task
  Task<? extends Serializable> currTask =
      TaskFactory.get(GenMapRedUtils.getMapRedWork(), parseCtx.getConf());
  Operator<? extends Serializable> currTopOp = op;
  ctx.setCurrTask(currTask);
  ctx.setCurrTopOp(currTopOp);

  for (String alias : parseCtx.getTopOps().keySet()) {
    Operator<? extends Serializable> currOp = parseCtx.getTopOps().get(alias);
    if (currOp == op) {
      String currAliasId = alias;
      ctx.setCurrAliasId(currAliasId);
      mapCurrCtx.put(op, new GenMapRedCtx(currTask, currTopOp, currAliasId));
      return null;
    }
  }
  assert false;
  return null;
}
/**
 * Handle the partial scan command. It is composed of a PartialScanTask followed by a StatsTask.
 *
 * @param op the table scan operator being analyzed
 * @param ctx processing context
 * @param parseCtx parse context
 * @param currTask current task
 * @param statsWork stats work for the command
 * @param statsTask stats task that consumes the partial scan results
 * @throws SemanticException
 */
private void handlePartialScanCommand(
    TableScanOperator op,
    GenMRProcContext ctx,
    ParseContext parseCtx,
    Task<? extends Serializable> currTask,
    StatsWork statsWork,
    Task<StatsWork> statsTask)
    throws SemanticException {

  String aggregationKey = op.getConf().getStatsAggPrefix();
  StringBuffer aggregationKeyBuffer = new StringBuffer(aggregationKey);
  List<Path> inputPaths = GenMapRedUtils.getInputPathsForPartialScan(op, aggregationKeyBuffer);
  aggregationKey = aggregationKeyBuffer.toString();

  // scan work
  PartialScanWork scanWork = new PartialScanWork(inputPaths);
  scanWork.setMapperCannotSpanPartns(true);
  scanWork.setAggKey(aggregationKey);

  // stats work
  statsWork.setPartialScanAnalyzeCommand(true);

  // partial scan task
  DriverContext driverCxt = new DriverContext();
  Task<PartialScanWork> psTask = TaskFactory.get(scanWork, parseCtx.getConf());
  psTask.initialize(parseCtx.getConf(), null, driverCxt);
  psTask.setWork(scanWork);

  // task dependency
  ctx.getRootTasks().remove(currTask);
  ctx.getRootTasks().add(psTask);
  psTask.addDependentTask(statsTask);

  List<Task<? extends Serializable>> parentTasks = new ArrayList<Task<? extends Serializable>>();
  parentTasks.add(psTask);
  statsTask.setParentTasks(parentTasks);
}
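
// Both partial-scan handlers (the Tez variant earlier and the MapReduce variant above) end with
// the same rewiring: the original root task is dropped, the partial-scan task becomes the new
// root, and the stats task hangs off it. A tiny illustration with plain collections follows;
// the task names are placeholders, not Hive objects.
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class RootRewiringSketch {

  public static void main(String[] args) {
    List<String> rootTasks = new ArrayList<>(Arrays.asList("currentTask"));
    Map<String, List<String>> children = new LinkedHashMap<>();

    // mirrors: ctx.getRootTasks().remove(currTask); ctx.getRootTasks().add(psTask);
    rootTasks.remove("currentTask");
    rootTasks.add("partialScanTask");

    // mirrors: psTask.addDependentTask(statsTask);
    children.computeIfAbsent("partialScanTask", k -> new ArrayList<>()).add("statsTask");

    System.out.println("roots:    " + rootTasks);  // [partialScanTask]
    System.out.println("children: " + children);   // {partialScanTask=[statsTask]}
  }
}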
/**
 * Returns dependencyTaskForMultiInsert, initializing it if necessary.
 *
 * <p>dependencyTaskForMultiInsert serves as a mutual dependency for the final move tasks in a
 * multi-insert query.
 *
 * @return the shared DependencyCollectionTask
 */
public DependencyCollectionTask getDependencyTaskForMultiInsert() {
  if (dependencyTaskForMultiInsert == null) {
    dependencyTaskForMultiInsert =
        (DependencyCollectionTask) TaskFactory.get(new DependencyCollectionWork(), conf);
  }
  return dependencyTaskForMultiInsert;
}
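
// A hedged sketch of why getDependencyTaskForMultiInsert hands out one lazily created task:
// every final move task of a multi-insert query attaches to the same barrier node, and sharing
// that single instance is what provides the mutual dependency. Node and getBarrier are invented
// stand-ins, not Hive's task classes.
import java.util.ArrayList;
import java.util.List;

public class SharedBarrierSketch {

  static class Node {
    final String name;
    final List<Node> dependents = new ArrayList<>();
    Node(String name) { this.name = name; }
    void addDependent(Node d) { dependents.add(d); }
  }

  private static Node barrier;  // created on first use, then shared

  static Node getBarrier() {
    if (barrier == null) {
      barrier = new Node("dependencyCollection");
    }
    return barrier;
  }

  public static void main(String[] args) {
    // Two INSERT branches of the same query share one barrier node.
    Node moveA = new Node("move-branch-A");
    Node moveB = new Node("move-branch-B");
    getBarrier().addDependent(moveA);
    getBarrier().addDependent(moveB);
    System.out.println("same barrier instance: " + (getBarrier() == barrier));
    System.out.println("barrier fans out to " + getBarrier().dependents.size() + " move tasks");
  }
}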
/**
 * Add the StatsTask as a dependent task of the MoveTask, because the StatsTask will change the
 * Table/Partition metadata. For atomicity, we should not change the metadata before the data is
 * actually in place, which is what the MoveTask guarantees.
 *
 * @param nd the FileSinkOperator whose results are taken care of by the MoveTask
 * @param mvTask the MoveTask that moves the FileSinkOperator's results
 * @param currTask the MapRedTask that the FileSinkOperator belongs to
 * @param hconf HiveConf
 */
private void addStatsTask(
    FileSinkOperator nd, MoveTask mvTask, Task<? extends Serializable> currTask, HiveConf hconf) {

  MoveWork mvWork = ((MoveTask) mvTask).getWork();
  StatsWork statsWork = new StatsWork(mvWork.getLoadTableWork());
  MapredWork mrWork = (MapredWork) currTask.getWork();

  // AggKey in StatsWork is used for stats aggregation, while StatsAggPrefix
  // in FileSinkDesc is used for stats publishing. They should be consistent.
  statsWork.setAggKey(((FileSinkOperator) nd).getConf().getStatsAggPrefix());
  Task<? extends Serializable> statsTask = TaskFactory.get(statsWork, hconf);

  // mark the MapredWork and FileSinkOperator for gathering stats
  nd.getConf().setGatherStats(true);
  mrWork.setGatheringStats(true);
  // mrWork.addDestinationTable(nd.getConf().getTableInfo().getTableName());

  // subscribe feeds from the MoveTask so that the MoveTask can forward the list
  // of dynamic partitions to the StatsTask
  mvTask.addDependentTask(statsTask);
  statsTask.subscribeFeed(mvTask);
}
/**
 * Split the current plan by creating a temporary destination.
 *
 * @param op the reduce sink operator encountered
 * @param opProcCtx processing context
 */
public static void splitPlan(ReduceSinkOperator op, GenMRProcContext opProcCtx)
    throws SemanticException {

  // Generate a new task
  ParseContext parseCtx = opProcCtx.getParseCtx();
  MapredWork cplan = getMapRedWork(parseCtx.getConf());
  Task<? extends Serializable> redTask = TaskFactory.get(cplan, parseCtx.getConf());
  Operator<? extends Serializable> reducer = op.getChildOperators().get(0);

  // Add the reducer
  cplan.setReducer(reducer);
  ReduceSinkDesc desc = op.getConf();
  cplan.setNumReduceTasks(new Integer(desc.getNumReducers()));

  HashMap<Operator<? extends Serializable>, Task<? extends Serializable>> opTaskMap =
      opProcCtx.getOpTaskMap();
  opTaskMap.put(reducer, redTask);

  Task<? extends Serializable> currTask = opProcCtx.getCurrTask();
  splitTasks(op, currTask, redTask, opProcCtx, true, false, 0);
  opProcCtx.getRootOps().add(op);
}
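
// splitPlan cuts the plan at a shuffle boundary by introducing a temporary destination between
// the two resulting tasks. The sketch below illustrates only that handoff idea with plain
// java.nio files; the directory layout and row format are invented for the example.
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;

public class TempDestinationSketch {

  public static void main(String[] args) throws IOException {
    Path scratch = Files.createTempDirectory("split-plan-");
    Path handoff = scratch.resolve("part-00000");

    // "Map stage": write intermediate rows to the temporary destination.
    Files.write(handoff, Arrays.asList("k1\t1", "k2\t2"), StandardCharsets.UTF_8);

    // "Reduce stage": started only after the writer finished, reads the same path.
    for (String row : Files.readAllLines(handoff, StandardCharsets.UTF_8)) {
      System.out.println("reduce input: " + row);
    }
  }
}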
public static void mergeMapJoinUnion(UnionOperator union, GenMRProcContext ctx, int pos)
    throws SemanticException {

  ParseContext parseCtx = ctx.getParseCtx();
  UnionProcContext uCtx = parseCtx.getUCtx();

  UnionParseContext uPrsCtx = uCtx.getUnionParseContext(union);
  assert uPrsCtx != null;

  Task<? extends Serializable> currTask = ctx.getCurrTask();

  GenMRUnionCtx uCtxTask = ctx.getUnionTask(union);
  Task<? extends Serializable> uTask = null;

  union.getParentOperators().get(pos);
  MapredWork uPlan = null;

  // union is encountered for the first time
  if (uCtxTask == null) {
    uCtxTask = new GenMRUnionCtx();
    uPlan = GenMapRedUtils.getMapRedWork(parseCtx.getConf());
    uTask = TaskFactory.get(uPlan, parseCtx.getConf());
    uCtxTask.setUTask(uTask);
    ctx.setUnionTask(union, uCtxTask);
  } else {
    uTask = uCtxTask.getUTask();
    uPlan = (MapredWork) uTask.getWork();
  }

  // If there is a mapjoin at position 'pos'
  if (uPrsCtx.getMapJoinSubq(pos)) {
    GenMRMapJoinCtx mjCtx = ctx.getMapJoinCtx(ctx.getCurrMapJoinOp());
    String taskTmpDir = mjCtx.getTaskTmpDir();
    if (uPlan.getPathToAliases().get(taskTmpDir) == null) {
      uPlan.getPathToAliases().put(taskTmpDir, new ArrayList<String>());
      uPlan.getPathToAliases().get(taskTmpDir).add(taskTmpDir);
      uPlan.getPathToPartitionInfo().put(taskTmpDir, new PartitionDesc(mjCtx.getTTDesc(), null));
      uPlan.getAliasToWork().put(taskTmpDir, mjCtx.getRootMapJoinOp());
    }

    for (Task t : currTask.getParentTasks()) {
      t.addDependentTask(uTask);
    }

    // Detach currTask from its parents. removeDependentTask mutates the parent list while it
    // is being iterated; if that raises a ConcurrentModificationException it is swallowed
    // below, leaving any remaining parent links as they are.
    try {
      boolean notDone = true;
      while (notDone) {
        for (Task t : currTask.getParentTasks()) {
          t.removeDependentTask(currTask);
        }
        notDone = false;
      }
    } catch (ConcurrentModificationException e) {
      // intentionally ignored
    }
  } else {
    setTaskPlan(ctx.getCurrAliasId(), ctx.getCurrTopOp(), uPlan, false, ctx);
  }

  ctx.setCurrTask(uTask);
  ctx.setCurrAliasId(null);
  ctx.setCurrTopOp(null);
  ctx.setCurrMapJoinOp(null);

  ctx.getMapCurrCtx().put(union, new GenMapRedCtx(ctx.getCurrTask(), null, null));
}
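
// The try/catch above tolerates a ConcurrentModificationException because removeDependentTask
// shrinks the very parent list being iterated. A common alternative, sketched here with a
// hypothetical Node class rather than Hive's Task, is to iterate over a defensive copy so the
// removal can never invalidate the iterator.
import java.util.ArrayList;
import java.util.List;

public class UnlinkSketch {

  static class Node {
    final String name;
    final List<Node> parents = new ArrayList<>();
    final List<Node> children = new ArrayList<>();
    Node(String name) { this.name = name; }

    void addChild(Node c) { children.add(c); c.parents.add(this); }

    void removeChild(Node c) { children.remove(c); c.parents.remove(this); }
  }

  public static void main(String[] args) {
    Node p1 = new Node("parent1");
    Node p2 = new Node("parent2");
    Node child = new Node("currTask");
    p1.addChild(child);
    p2.addChild(child);

    // Iterate over a snapshot; removeChild may freely mutate child.parents.
    for (Node parent : new ArrayList<>(child.parents)) {
      parent.removeChild(child);
    }
    System.out.println("remaining parents: " + child.parents.size());  // 0
  }
}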
@SuppressWarnings("unchecked")
@Override
public Object process(
    Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs)
    throws SemanticException {

  GenTezProcContext context = (GenTezProcContext) procContext;

  TableScanOperator tableScan = (TableScanOperator) nd;

  ParseContext parseContext = context.parseContext;
  Class<? extends InputFormat> inputFormat =
      tableScan.getConf().getTableMetadata().getInputFormatClass();

  if (parseContext.getQueryProperties().isAnalyzeCommand()) {
    assert tableScan.getChildOperators() == null || tableScan.getChildOperators().size() == 0;

    String alias = null;
    for (String a : parseContext.getTopOps().keySet()) {
      if (tableScan == parseContext.getTopOps().get(a)) {
        alias = a;
      }
    }
    assert alias != null;

    TezWork tezWork = context.currentTask.getWork();
    if (inputFormat.equals(OrcInputFormat.class)) {
      // For ORC, all the following statements are the same:
      // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS
      // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan;
      // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;
      // There will not be any Tez job above this task.
      StatsNoJobWork snjWork =
          new StatsNoJobWork(tableScan.getConf().getTableMetadata().getTableSpec());
      snjWork.setStatsReliable(
          parseContext.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));

      // If a partition is specified, get the pruned partition list
      Set<Partition> confirmedParts = GenMapRedUtils.getConfirmedPartitionsForScan(tableScan);
      if (confirmedParts.size() > 0) {
        Table source = tableScan.getConf().getTableMetadata();
        List<String> partCols = GenMapRedUtils.getPartitionColumns(tableScan);
        PrunedPartitionList partList =
            new PrunedPartitionList(source, confirmedParts, partCols, false);
        snjWork.setPrunedPartitionList(partList);
      }
      Task<StatsNoJobWork> snjTask = TaskFactory.get(snjWork, parseContext.getConf());
      snjTask.setParentTasks(null);
      context.rootTasks.remove(context.currentTask);
      context.rootTasks.add(snjTask);
      return true;
    } else {
      // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS;
      // The plan consists of a simple TezTask followed by a StatsTask.
      // The Tez task is just a simple TableScanOperator.
      StatsWork statsWork = new StatsWork(tableScan.getConf().getTableMetadata().getTableSpec());
      statsWork.setAggKey(tableScan.getConf().getStatsAggPrefix());
      statsWork.setStatsTmpDir(tableScan.getConf().getTmpStatsDir());
      statsWork.setSourceTask(context.currentTask);
      statsWork.setStatsReliable(
          parseContext.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
      Task<StatsWork> statsTask = TaskFactory.get(statsWork, parseContext.getConf());
      context.currentTask.addDependentTask(statsTask);

      // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;
      // The plan consists of a StatsTask only.
      if (parseContext.getQueryProperties().isNoScanAnalyzeCommand()) {
        statsTask.setParentTasks(null);
        statsWork.setNoScanAnalyzeCommand(true);
        context.rootTasks.remove(context.currentTask);
        context.rootTasks.add(statsTask);
      }

      // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan;
      if (parseContext.getQueryProperties().isPartialScanAnalyzeCommand()) {
        handlePartialScanCommand(tableScan, parseContext, statsWork, context, statsTask);
      }

      // NOTE: here we should use the new partition predicate pushdown API to get the list of
      // pruned partitions, and pass it to setTaskPlan as the last parameter.
      Set<Partition> confirmedPartns = GenMapRedUtils.getConfirmedPartitionsForScan(tableScan);
      PrunedPartitionList partitions = null;
      if (confirmedPartns.size() > 0) {
        Table source = tableScan.getConf().getTableMetadata();
        List<String> partCols = GenMapRedUtils.getPartitionColumns(tableScan);
        partitions = new PrunedPartitionList(source, confirmedPartns, partCols, false);
      }

      MapWork w = utils.createMapWork(context, tableScan, tezWork, partitions);
      w.setGatheringStats(true);

      return true;
    }
  }

  return null;
}
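
// For reference, the ANALYZE TABLE plan shapes handled above (and by the MapReduce-path
// processor later in this section) can be summarized as a small decision helper. The enum and
// method below are illustrative only; the real choice is driven by the query properties and
// the table's input format.
public class AnalyzePlanSketch {

  enum PlanShape { METADATA_ONLY, SCAN_THEN_STATS, STATS_ONLY, PARTIAL_SCAN_THEN_STATS }

  static PlanShape planFor(boolean orcInput, boolean noScan, boolean partialScan) {
    if (orcInput) {
      // ORC keeps row counts and sizes in its file footers, so no scan job is needed.
      return PlanShape.METADATA_ONLY;
    }
    if (noScan) {
      return PlanShape.STATS_ONLY;
    }
    if (partialScan) {
      return PlanShape.PARTIAL_SCAN_THEN_STATS;
    }
    return PlanShape.SCAN_THEN_STATS;
  }

  public static void main(String[] args) {
    System.out.println(planFor(true, false, false));   // METADATA_ONLY
    System.out.println(planFor(false, true, false));   // STATS_ONLY
    System.out.println(planFor(false, false, true));   // PARTIAL_SCAN_THEN_STATS
    System.out.println(planFor(false, false, false));  // SCAN_THEN_STATS
  }
}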
@Override
public void analyzeInternal(ASTNode ast) throws SemanticException {
  isLocal = false;
  isOverWrite = false;
  Tree fromTree = ast.getChild(0);
  Tree tableTree = ast.getChild(1);

  if (ast.getChildCount() == 4) {
    isLocal = true;
    isOverWrite = true;
  }

  if (ast.getChildCount() == 3) {
    if (ast.getChild(2).getText().toLowerCase().equals("local")) {
      isLocal = true;
    } else {
      isOverWrite = true;
    }
  }

  // initialize load path
  URI fromURI;
  try {
    String fromPath = stripQuotes(fromTree.getText());
    fromURI = initializeFromURI(fromPath);
  } catch (IOException e) {
    throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(fromTree, e.getMessage()), e);
  } catch (URISyntaxException e) {
    throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(fromTree, e.getMessage()), e);
  }

  // initialize destination table/partition
  tableSpec ts = new tableSpec(db, conf, (ASTNode) tableTree);

  if (ts.tableHandle.isOffline()) {
    throw new SemanticException(
        ErrorMsg.OFFLINE_TABLE_OR_PARTITION.getMsg(":Table " + ts.tableName));
  }

  if (ts.tableHandle.isView()) {
    throw new SemanticException(ErrorMsg.DML_AGAINST_VIEW.getMsg());
  }
  if (ts.tableHandle.isNonNative()) {
    throw new SemanticException(ErrorMsg.LOAD_INTO_NON_NATIVE.getMsg());
  }

  if (ts.tableHandle.isStoredAsSubDirectories()) {
    throw new SemanticException(ErrorMsg.LOAD_INTO_STORED_AS_DIR.getMsg());
  }

  URI toURI =
      (ts.partHandle != null)
          ? ts.partHandle.getDataLocation()
          : ts.tableHandle.getDataLocation();

  List<FieldSchema> parts = ts.tableHandle.getPartitionKeys();
  if ((parts != null && parts.size() > 0) && (ts.partSpec == null || ts.partSpec.size() == 0)) {
    throw new SemanticException(ErrorMsg.NEED_PARTITION_ERROR.getMsg());
  }

  // make sure the arguments make sense
  applyConstraints(fromURI, toURI, fromTree, isLocal);

  Task<? extends Serializable> rTask = null;

  // create copy work
  if (isLocal) {
    // if the local keyword is specified - we will always make a copy. this might seem
    // redundant in the case that the hive warehouse is also located in the local file
    // system - but that's just a test case.
    String copyURIStr = ctx.getExternalTmpFileURI(toURI);
    URI copyURI = URI.create(copyURIStr);
    rTask = TaskFactory.get(new CopyWork(fromURI.toString(), copyURIStr), conf);
    fromURI = copyURI;
  }

  // create final load/move work
  String loadTmpPath = ctx.getExternalTmpFileURI(toURI);
  Map<String, String> partSpec = ts.getPartSpec();
  if (partSpec == null) {
    partSpec = new LinkedHashMap<String, String>();
    outputs.add(new WriteEntity(ts.tableHandle));
  } else {
    try {
      Partition part = Hive.get().getPartition(ts.tableHandle, partSpec, false);
      if (part != null) {
        if (part.isOffline()) {
          throw new SemanticException(
              ErrorMsg.OFFLINE_TABLE_OR_PARTITION.getMsg(ts.tableName + ":" + part.getName()));
        }
        outputs.add(new WriteEntity(part));
      } else {
        outputs.add(new WriteEntity(ts.tableHandle));
      }
    } catch (HiveException e) {
      throw new SemanticException(e);
    }
  }

  LoadTableDesc loadTableWork =
      new LoadTableDesc(
          fromURI.toString(),
          loadTmpPath,
          Utilities.getTableDesc(ts.tableHandle),
          partSpec,
          isOverWrite);

  Task<? extends Serializable> childTask =
      TaskFactory.get(new MoveWork(getInputs(), getOutputs(), loadTableWork, null, true), conf);
  if (rTask != null) {
    rTask.addDependentTask(childTask);
  } else {
    rTask = childTask;
  }

  rootTasks.add(rTask);

  // The user asked for stats to be collected.
  // Some stats like number of rows require a scan of the data.
  // However, some other stats, like number of files, do not require a complete scan.
  // Update the stats which do not require a complete scan.
  Task<? extends Serializable> statTask = null;
  if (conf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
    StatsWork statDesc = new StatsWork(loadTableWork);
    statDesc.setNoStatsAggregator(true);
    statDesc.setClearAggregatorStats(true);
    statDesc.setStatsReliable(conf.getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
    statTask = TaskFactory.get(statDesc, conf);
  }

  // HIVE-3334 has been filed for load file with index auto update
  if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVEINDEXAUTOUPDATE)) {
    IndexUpdater indexUpdater = new IndexUpdater(loadTableWork, getInputs(), conf);
    try {
      List<Task<? extends Serializable>> indexUpdateTasks = indexUpdater.generateUpdateTasks();
      for (Task<? extends Serializable> updateTask : indexUpdateTasks) {
        // LOAD DATA will either have a copy & move or just a move;
        // we always want the update to be dependent on the move
        childTask.addDependentTask(updateTask);
        if (statTask != null) {
          updateTask.addDependentTask(statTask);
        }
      }
    } catch (HiveException e) {
      console.printInfo(
          "WARNING: could not auto-update stale indexes, indexes are not out of sync");
    }
  } else if (statTask != null) {
    childTask.addDependentTask(statTask);
  }
}
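
// The "initialize load path" step above unquotes the path from the parser and resolves it to a
// URI, with a missing scheme defaulting to the client's local file system for LOAD DATA LOCAL.
// A hedged sketch of that resolution follows; stripQuotes and toLoadUri are simplified
// stand-ins, and the hdfs default for the non-local case is an assumption, not Hive's exact
// behavior.
import java.net.URI;
import java.net.URISyntaxException;

public class LoadPathSketch {

  static String stripQuotes(String raw) {
    if (raw.length() >= 2 && (raw.startsWith("'") || raw.startsWith("\""))) {
      return raw.substring(1, raw.length() - 1);
    }
    return raw;
  }

  static URI toLoadUri(String quotedPath, boolean isLocal) throws URISyntaxException {
    URI uri = new URI(stripQuotes(quotedPath));
    if (uri.getScheme() == null) {
      // No scheme in the statement: LOCAL means the client's file system; otherwise the
      // warehouse's default file system would be assumed (shown here as plain "hdfs").
      String scheme = isLocal ? "file" : "hdfs";
      uri = new URI(scheme, uri.getAuthority(), uri.getPath(), null, null);
    }
    return uri;
  }

  public static void main(String[] args) throws URISyntaxException {
    System.out.println(toLoadUri("'/tmp/kv1.txt'", true));  // file:/tmp/kv1.txt
    System.out.println(toLoadUri("'/user/hive/kv1.txt'", false));
  }
}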
/**
 * Table scan encountered.
 *
 * @param nd the table scan operator encountered
 * @param opProcCtx context
 */
public Object process(
    Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs)
    throws SemanticException {

  TableScanOperator op = (TableScanOperator) nd;
  GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
  ParseContext parseCtx = ctx.getParseCtx();
  Class<? extends InputFormat> inputFormat =
      op.getConf().getTableMetadata().getInputFormatClass();
  Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();

  // create a dummy MapReduce task
  MapredWork currWork = GenMapRedUtils.getMapRedWork(parseCtx);
  MapRedTask currTask = (MapRedTask) TaskFactory.get(currWork, parseCtx.getConf());
  Operator<? extends OperatorDesc> currTopOp = op;
  ctx.setCurrTask(currTask);
  ctx.setCurrTopOp(currTopOp);

  for (String alias : parseCtx.getTopOps().keySet()) {
    Operator<? extends OperatorDesc> currOp = parseCtx.getTopOps().get(alias);
    if (currOp == op) {
      String currAliasId = alias;
      ctx.setCurrAliasId(currAliasId);
      mapCurrCtx.put(op, new GenMapRedCtx(currTask, currAliasId));

      if (parseCtx.getQueryProperties().isAnalyzeCommand()) {
        boolean partialScan = parseCtx.getQueryProperties().isPartialScanAnalyzeCommand();
        boolean noScan = parseCtx.getQueryProperties().isNoScanAnalyzeCommand();
        if (inputFormat.equals(OrcInputFormat.class)) {
          // For ORC, all the following statements are the same:
          // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS
          // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan;
          // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;
          // There will not be any MR or Tez job above this task.
          StatsNoJobWork snjWork =
              new StatsNoJobWork(op.getConf().getTableMetadata().getTableSpec());
          snjWork.setStatsReliable(
              parseCtx.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));

          // If a partition is specified, get the pruned partition list
          Set<Partition> confirmedParts = GenMapRedUtils.getConfirmedPartitionsForScan(op);
          if (confirmedParts.size() > 0) {
            Table source = op.getConf().getTableMetadata();
            List<String> partCols = GenMapRedUtils.getPartitionColumns(op);
            PrunedPartitionList partList =
                new PrunedPartitionList(source, confirmedParts, partCols, false);
            snjWork.setPrunedPartitionList(partList);
          }
          Task<StatsNoJobWork> snjTask = TaskFactory.get(snjWork, parseCtx.getConf());
          ctx.setCurrTask(snjTask);
          ctx.setCurrTopOp(null);
          ctx.getRootTasks().clear();
          ctx.getRootTasks().add(snjTask);
        } else {
          // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS;
          // The plan consists of a simple MapRedTask followed by a StatsTask.
          // The MR task is just a simple TableScanOperator.
          StatsWork statsWork = new StatsWork(op.getConf().getTableMetadata().getTableSpec());
          statsWork.setAggKey(op.getConf().getStatsAggPrefix());
          statsWork.setSourceTask(currTask);
          statsWork.setStatsReliable(
              parseCtx.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
          Task<StatsWork> statsTask = TaskFactory.get(statsWork, parseCtx.getConf());
          currTask.addDependentTask(statsTask);
          if (!ctx.getRootTasks().contains(currTask)) {
            ctx.getRootTasks().add(currTask);
          }

          // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;
          // The plan consists of a StatsTask only.
          if (noScan) {
            statsTask.setParentTasks(null);
            statsWork.setNoScanAnalyzeCommand(true);
            ctx.getRootTasks().remove(currTask);
            ctx.getRootTasks().add(statsTask);
          }

          // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan;
          if (partialScan) {
            handlePartialScanCommand(op, ctx, parseCtx, currTask, statsWork, statsTask);
          }

          currWork.getMapWork().setGatheringStats(true);
          if (currWork.getReduceWork() != null) {
            currWork.getReduceWork().setGatheringStats(true);
          }

          // NOTE: here we should use the new partition predicate pushdown API to get the list
          // of pruned partitions, and pass it to setTaskPlan as the last parameter.
          Set<Partition> confirmedPartns = GenMapRedUtils.getConfirmedPartitionsForScan(op);
          if (confirmedPartns.size() > 0) {
            Table source = op.getConf().getTableMetadata();
            List<String> partCols = GenMapRedUtils.getPartitionColumns(op);
            PrunedPartitionList partList =
                new PrunedPartitionList(source, confirmedPartns, partCols, false);
            GenMapRedUtils.setTaskPlan(currAliasId, currTopOp, currTask, false, ctx, partList);
          } else { // non-partitioned table
            GenMapRedUtils.setTaskPlan(currAliasId, currTopOp, currTask, false, ctx);
          }
        }
      }
      return true;
    }
  }
  assert false;
  return null;
}