@Override public Object dispatch(Node nd, Stack<Node> stack, Object... nodeOutputs) throws SemanticException { Task<? extends Serializable> task = (Task<? extends Serializable>) nd; if (!task.isMapRedTask() || task instanceof ConditionalTask || ((MapredWork) task.getWork()).getReducer() == null) { return null; } SkewJoinProcCtx skewJoinProcContext = new SkewJoinProcCtx(task, physicalContext.getParseContext()); Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>(); opRules.put( new RuleRegExp("R1", CommonJoinOperator.getOperatorName() + "%"), SkewJoinProcFactory.getJoinProc()); // The dispatcher fires the processor corresponding to the closest // matching rule and passes the context along Dispatcher disp = new DefaultRuleDispatcher( SkewJoinProcFactory.getDefaultProc(), opRules, skewJoinProcContext); GraphWalker ogw = new DefaultGraphWalker(disp); // iterate over the reducer operator tree ArrayList<Node> topNodes = new ArrayList<Node>(); topNodes.add(((MapredWork) task.getWork()).getReducer()); ogw.startWalking(topNodes, null); return null; }
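// A minimal, self-contained sketch (not Hive's Rule/Dispatcher/GraphWalker API) of the
// rule-dispatch idea used above: each regex rule maps to a processor, a node whose
// operator name matches a rule is handled by that rule's processor, and unmatched nodes
// fall back to a default processor. The names NodeProcessor and dispatch are illustrative.
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Pattern;

public class RuleDispatchSketch {
  interface NodeProcessor {
    void process(String operatorName);
  }

  static void dispatch(String operatorName,
                       Map<Pattern, NodeProcessor> rules,
                       NodeProcessor defaultProc) {
    for (Map.Entry<Pattern, NodeProcessor> e : rules.entrySet()) {
      if (e.getKey().matcher(operatorName).find()) {
        e.getValue().process(operatorName);  // first matching rule wins in this sketch
        return;
      }
    }
    defaultProc.process(operatorName);       // no rule matched
  }

  public static void main(String[] args) {
    Map<Pattern, NodeProcessor> rules = new LinkedHashMap<Pattern, NodeProcessor>();
    rules.put(Pattern.compile("JOIN"), name -> System.out.println("skew-join processor: " + name));
    NodeProcessor defaultProc = name -> System.out.println("default processor: " + name);
    dispatch("JOIN_5", rules, defaultProc);  // handled by the join rule
    dispatch("SEL_2", rules, defaultProc);   // handled by the default processor
  }
}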
/* * Multiple file sink descriptors are linked. * Use the task created by the first linked file descriptor */ private void processLinkedFileDesc(GenMRProcContext ctx, Task<? extends Serializable> childTask) throws SemanticException { Operator<? extends OperatorDesc> currTopOp = ctx.getCurrTopOp(); String currAliasId = ctx.getCurrAliasId(); List<Operator<? extends OperatorDesc>> seenOps = ctx.getSeenOps(); List<Task<? extends Serializable>> rootTasks = ctx.getRootTasks(); Task<? extends Serializable> currTask = ctx.getCurrTask(); if (currTopOp != null) { if (!seenOps.contains(currTopOp)) { seenOps.add(currTopOp); GenMapRedUtils.setTaskPlan( currAliasId, currTopOp, (MapredWork) currTask.getWork(), false, ctx); } if (!rootTasks.contains(currTask) && (currTask.getParentTasks() == null || currTask.getParentTasks().isEmpty())) { rootTasks.add(currTask); } } if (childTask != null) { currTask.addDependentTask(childTask); } }
/* * It is an idempotent function to add various intermediate files as the source * for the union. The plan has already been created. */ public static void initUnionPlan( GenMRProcContext opProcCtx, Task<? extends Serializable> currTask, boolean local) { MapredWork plan = (MapredWork) currTask.getWork(); UnionOperator currUnionOp = opProcCtx.getCurrUnionOp(); assert currUnionOp != null; GenMRUnionCtx uCtx = opProcCtx.getUnionTask(currUnionOp); assert uCtx != null; List<String> taskTmpDirLst = uCtx.getTaskTmpDir(); List<TableDesc> tt_descLst = uCtx.getTTDesc(); assert !taskTmpDirLst.isEmpty() && !tt_descLst.isEmpty(); assert taskTmpDirLst.size() == tt_descLst.size(); int size = taskTmpDirLst.size(); assert local == false; for (int pos = 0; pos < size; pos++) { String taskTmpDir = taskTmpDirLst.get(pos); TableDesc tt_desc = tt_descLst.get(pos); if (plan.getPathToAliases().get(taskTmpDir) == null) { plan.getPathToAliases().put(taskTmpDir, new ArrayList<String>()); plan.getPathToAliases().get(taskTmpDir).add(taskTmpDir); plan.getPathToPartitionInfo().put(taskTmpDir, new PartitionDesc(tt_desc, null)); plan.getAliasToWork().put(taskTmpDir, currUnionOp); } } }
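// Self-contained sketch (plain maps instead of MapredWork) of why initUnionPlan above is
// idempotent: an intermediate directory is registered in pathToAliases only the first
// time it is seen, so repeated calls for the same union leave the plan unchanged.
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class UnionPlanIdempotenceSketch {
  static void addUnionSource(Map<String, List<String>> pathToAliases, String taskTmpDir) {
    if (pathToAliases.get(taskTmpDir) == null) {      // skip directories already registered
      pathToAliases.put(taskTmpDir, new ArrayList<String>());
      pathToAliases.get(taskTmpDir).add(taskTmpDir);  // the tmp dir doubles as its own alias
    }
  }

  public static void main(String[] args) {
    Map<String, List<String>> pathToAliases = new LinkedHashMap<String, List<String>>();
    addUnionSource(pathToAliases, "/tmp/hive/union-src-0");
    addUnionSource(pathToAliases, "/tmp/hive/union-src-0"); // second call is a no-op
    System.out.println(pathToAliases); // {/tmp/hive/union-src-0=[/tmp/hive/union-src-0]}
  }
}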
/** * Add the StatsTask as a dependent task of the MoveTask because StatsTask will change the * Table/Partition metadata. For atomicity, we should not change it before the data has actually * been moved by the MoveTask. * * @param nd the FileSinkOperator whose results are taken care of by the MoveTask. * @param mvTask The MoveTask that moves the FileSinkOperator's results. * @param currTask The MapRedTask that the FileSinkOperator belongs to. * @param hconf HiveConf */ private void addStatsTask( FileSinkOperator nd, MoveTask mvTask, Task<? extends Serializable> currTask, HiveConf hconf) { MoveWork mvWork = ((MoveTask) mvTask).getWork(); StatsWork statsWork = null; if (mvWork.getLoadTableWork() != null) { statsWork = new StatsWork(mvWork.getLoadTableWork()); } else if (mvWork.getLoadFileWork() != null) { statsWork = new StatsWork(mvWork.getLoadFileWork()); } assert statsWork != null : "Error when generating StatsTask"; statsWork.setStatsReliable(hconf.getBoolVar(ConfVars.HIVE_STATS_RELIABLE)); MapredWork mrWork = (MapredWork) currTask.getWork(); // AggKey in StatsWork is used for stats aggregation while StatsAggPrefix // in FileSinkDesc is used for stats publishing. They should be consistent. statsWork.setAggKey(((FileSinkOperator) nd).getConf().getStatsAggPrefix()); Task<? extends Serializable> statsTask = TaskFactory.get(statsWork, hconf); // mark the MapredWork and FileSinkOperator for gathering stats nd.getConf().setGatherStats(true); mrWork.setGatheringStats(true); nd.getConf().setStatsReliable(hconf.getBoolVar(ConfVars.HIVE_STATS_RELIABLE)); nd.getConf() .setMaxStatsKeyPrefixLength(hconf.getIntVar(ConfVars.HIVE_STATS_KEY_PREFIX_MAX_LENGTH)); // mrWork.addDestinationTable(nd.getConf().getTableInfo().getTableName()); // subscribe feeds from the MoveTask so that MoveTask can forward the list // of dynamic partitions to the StatsTask mvTask.addDependentTask(statsTask); statsTask.subscribeFeed(mvTask); }
/** * File Sink Operator encountered. * * @param nd the file sink operator encountered * @param opProcCtx context */ public Object process( Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException { GenMRProcContext ctx = (GenMRProcContext) opProcCtx; ParseContext parseCtx = ctx.getParseCtx(); boolean chDir = false; Task<? extends Serializable> currTask = ctx.getCurrTask(); FileSinkOperator fsOp = (FileSinkOperator) nd; boolean isInsertTable = // is INSERT OVERWRITE TABLE fsOp.getConf().getTableInfo().getTableName() != null && parseCtx.getQB().getParseInfo().isInsertToTable(); HiveConf hconf = parseCtx.getConf(); // Has the user enabled merging of files for map-only jobs or for all jobs if ((ctx.getMvTask() != null) && (!ctx.getMvTask().isEmpty())) { List<Task<? extends Serializable>> mvTasks = ctx.getMvTask(); // In case of unions or map-joins, it is possible that the file has // already been seen. // So, no need to attempt to merge the files again. if ((ctx.getSeenFileSinkOps() == null) || (!ctx.getSeenFileSinkOps().contains(nd))) { // no need of merging if the move is to a local file system MoveTask mvTask = (MoveTask) findMoveTask(mvTasks, fsOp); if (isInsertTable && hconf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) { addStatsTask(fsOp, mvTask, currTask, parseCtx.getConf()); } if ((mvTask != null) && !mvTask.isLocal()) { // There are separate configuration parameters to control whether to // merge for a map-only job // or for a map-reduce job MapredWork currWork = (MapredWork) currTask.getWork(); boolean mergeMapOnly = hconf.getBoolVar(HiveConf.ConfVars.HIVEMERGEMAPFILES) && currWork.getReducer() == null; boolean mergeMapRed = hconf.getBoolVar(HiveConf.ConfVars.HIVEMERGEMAPREDFILES) && currWork.getReducer() != null; if (mergeMapOnly || mergeMapRed) { chDir = true; } } } } String finalName = processFS(nd, stack, opProcCtx, chDir); // need to merge the files in the destination table/partitions if (chDir && (finalName != null)) { createMergeJob((FileSinkOperator) nd, ctx, finalName); } return null; }
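// Sketch of the merge decision at the end of the handler above, with plain booleans
// standing in for the HiveConf lookups: a map-only job merges its output files when
// hive.merge.mapfiles is enabled, and a job with a reducer merges when
// hive.merge.mapredfiles is enabled.
public class MergeDecisionSketch {
  static boolean shouldMerge(boolean mergeMapFiles, boolean mergeMapRedFiles, boolean hasReducer) {
    boolean mergeMapOnly = mergeMapFiles && !hasReducer;   // HIVEMERGEMAPFILES and no reducer
    boolean mergeMapRed = mergeMapRedFiles && hasReducer;  // HIVEMERGEMAPREDFILES and a reducer
    return mergeMapOnly || mergeMapRed;
  }

  public static void main(String[] args) {
    System.out.println(shouldMerge(true, false, false)); // true: map-only job, map merge enabled
    System.out.println(shouldMerge(true, false, true));  // false: has reducer, map-red merge disabled
  }
}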
private Task<MoveWork> findMoveTask(List<Task<MoveWork>> mvTasks, FileSinkOperator fsOp) { // find the move task for (Task<MoveWork> mvTsk : mvTasks) { MoveWork mvWork = mvTsk.getWork(); String srcDir = null; if (mvWork.getLoadFileWork() != null) { srcDir = mvWork.getLoadFileWork().getSourceDir(); } else if (mvWork.getLoadTableWork() != null) { srcDir = mvWork.getLoadTableWork().getSourceDir(); } String fsOpDirName = fsOp.getConf().getFinalDirName(); if ((srcDir != null) && (srcDir.equalsIgnoreCase(fsOpDirName))) { return mvTsk; } } return null; }
private Task<? extends Serializable> findMoveTask( List<Task<? extends Serializable>> mvTasks, FileSinkOperator fsOp) { // find the move task for (Task<? extends Serializable> mvTsk : mvTasks) { MoveWork mvWork = (MoveWork) mvTsk.getWork(); String srcDir = null; if (mvWork.getLoadFileWork() != null) { srcDir = mvWork.getLoadFileWork().getSourceDir(); } else if (mvWork.getLoadTableWork() != null) { srcDir = mvWork.getLoadTableWork().getSourceDir(); } if ((srcDir != null) && (srcDir.equalsIgnoreCase(fsOp.getConf().getDirName()))) { return mvTsk; } } return null; }
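// Self-contained sketch (a hypothetical MoveSpec class, not Hive's MoveWork) of the matching
// rule both findMoveTask variants use: a move task claims a FileSinkOperator when the move's
// source directory equals the sink's output directory, compared case-insensitively.
import java.util.Arrays;
import java.util.List;

public class FindMoveTaskSketch {
  static class MoveSpec {
    final String sourceDir;             // analogous to the LoadFileDesc/LoadTableDesc source dir
    MoveSpec(String sourceDir) { this.sourceDir = sourceDir; }
  }

  static MoveSpec findMove(List<MoveSpec> moves, String sinkDir) {
    for (MoveSpec m : moves) {
      if (m.sourceDir != null && m.sourceDir.equalsIgnoreCase(sinkDir)) {
        return m;                       // first move that reads from the sink's directory
      }
    }
    return null;                        // no move task consumes this sink's output
  }

  public static void main(String[] args) {
    List<MoveSpec> moves = Arrays.asList(new MoveSpec("/tmp/hive/OUT1"), new MoveSpec("/tmp/hive/out2"));
    System.out.println(findMove(moves, "/tmp/hive/out1").sourceDir); // /tmp/hive/OUT1
  }
}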
/** * Initialize the current plan by adding it to root tasks. * * @param op the reduce sink operator encountered * @param opProcCtx processing context */ public static void initPlan(ReduceSinkOperator op, GenMRProcContext opProcCtx) throws SemanticException { Operator<? extends Serializable> reducer = op.getChildOperators().get(0); Map<Operator<? extends Serializable>, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx(); GenMapRedCtx mapredCtx = mapCurrCtx.get(op.getParentOperators().get(0)); Task<? extends Serializable> currTask = mapredCtx.getCurrTask(); MapredWork plan = (MapredWork) currTask.getWork(); HashMap<Operator<? extends Serializable>, Task<? extends Serializable>> opTaskMap = opProcCtx.getOpTaskMap(); Operator<? extends Serializable> currTopOp = opProcCtx.getCurrTopOp(); opTaskMap.put(reducer, currTask); plan.setReducer(reducer); ReduceSinkDesc desc = op.getConf(); plan.setNumReduceTasks(desc.getNumReducers()); List<Task<? extends Serializable>> rootTasks = opProcCtx.getRootTasks(); if (!rootTasks.contains(currTask)) { rootTasks.add(currTask); } if (reducer.getClass() == JoinOperator.class) { plan.setNeedsTagging(true); } assert currTopOp != null; List<Operator<? extends Serializable>> seenOps = opProcCtx.getSeenOps(); String currAliasId = opProcCtx.getCurrAliasId(); if (!seenOps.contains(currTopOp)) { seenOps.add(currTopOp); setTaskPlan(currAliasId, currTopOp, plan, false, opProcCtx); } currTopOp = null; currAliasId = null; opProcCtx.setCurrTask(currTask); opProcCtx.setCurrTopOp(currTopOp); opProcCtx.setCurrAliasId(currAliasId); }
/** * Adds the dependencyTaskForMultiInsert in ctx as a dependent of parentTask. If mvTask is a load * table, and HIVE_MULTI_INSERT_MOVE_TASKS_SHARE_DEPENDENCIES is set, adds mvTask as a dependent of * dependencyTaskForMultiInsert in ctx, otherwise adds mvTask as a dependent of parentTask as * well. * * @param ctx * @param mvTask * @param parentTask */ private void addDependentMoveTasks( GenMRProcContext ctx, Task<MoveWork> mvTask, Task<? extends Serializable> parentTask) { if (mvTask != null) { if (ctx.getConf().getBoolVar(ConfVars.HIVE_MULTI_INSERT_MOVE_TASKS_SHARE_DEPENDENCIES)) { DependencyCollectionTask dependencyTask = ctx.getDependencyTaskForMultiInsert(); parentTask.addDependentTask(dependencyTask); if (mvTask.getWork().getLoadTableWork() != null) { // Moving tables/partitions depend on the dependencyTask dependencyTask.addDependentTask(mvTask); } else { // Moving files depends on the parentTask (we still want the dependencyTask to depend // on the parentTask) parentTask.addDependentTask(mvTask); } } else { parentTask.addDependentTask(mvTask); } } }
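// Minimal sketch (a toy task with a child list, not Hive's Task hierarchy) of the wiring in
// addDependentMoveTasks: with shared dependencies enabled, table/partition loads hang off the
// shared dependency-collection task, while plain file moves still hang off the parent task.
import java.util.ArrayList;
import java.util.List;

public class DependentMoveTaskSketch {
  static class ToyTask {
    final String name;
    final List<ToyTask> children = new ArrayList<ToyTask>();
    ToyTask(String name) { this.name = name; }
    void addDependentTask(ToyTask t) { children.add(t); }
  }

  static void wire(ToyTask parent, ToyTask dependencyCollection, ToyTask move,
                   boolean shareDependencies, boolean isLoadTable) {
    if (shareDependencies) {
      parent.addDependentTask(dependencyCollection);
      if (isLoadTable) {
        dependencyCollection.addDependentTask(move); // table/partition moves wait for all inserts
      } else {
        parent.addDependentTask(move);               // file moves only wait for their own parent
      }
    } else {
      parent.addDependentTask(move);
    }
  }

  public static void main(String[] args) {
    ToyTask parent = new ToyTask("mapred"), dep = new ToyTask("dep"), mv = new ToyTask("move");
    wire(parent, dep, mv, true, true);
    System.out.println(parent.children.size() + " " + dep.children.size()); // 1 1
  }
}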
/** * Initialize the current union plan. * * @param op the reduce sink operator encountered * @param opProcCtx processing context */ public static void initUnionPlan(ReduceSinkOperator op, GenMRProcContext opProcCtx) throws SemanticException { Operator<? extends Serializable> reducer = op.getChildOperators().get(0); Map<Operator<? extends Serializable>, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx(); GenMapRedCtx mapredCtx = mapCurrCtx.get(op.getParentOperators().get(0)); Task<? extends Serializable> currTask = mapredCtx.getCurrTask(); MapredWork plan = (MapredWork) currTask.getWork(); HashMap<Operator<? extends Serializable>, Task<? extends Serializable>> opTaskMap = opProcCtx.getOpTaskMap(); opTaskMap.put(reducer, currTask); plan.setReducer(reducer); ReduceSinkDesc desc = op.getConf(); plan.setNumReduceTasks(desc.getNumReducers()); if (reducer.getClass() == JoinOperator.class) { plan.setNeedsTagging(true); } initUnionPlan(opProcCtx, currTask, false); }
/** * Add the StatsTask as a dependent task of the MoveTask because StatsTask will change the * Table/Partition metadata. For atomicity, we should not change it before the data has actually * been moved by the MoveTask. * * @param nd the FileSinkOperator whose results are taken care of by the MoveTask. * @param mvTask The MoveTask that moves the FileSinkOperator's results. * @param currTask The MapRedTask that the FileSinkOperator belongs to. * @param hconf HiveConf */ private void addStatsTask( FileSinkOperator nd, MoveTask mvTask, Task<? extends Serializable> currTask, HiveConf hconf) { MoveWork mvWork = ((MoveTask) mvTask).getWork(); StatsWork statsWork = new StatsWork(mvWork.getLoadTableWork()); MapredWork mrWork = (MapredWork) currTask.getWork(); // AggKey in StatsWork is used for stats aggregation while StatsAggPrefix // in FileSinkDesc is used for stats publishing. They should be consistent. statsWork.setAggKey(((FileSinkOperator) nd).getConf().getStatsAggPrefix()); Task<? extends Serializable> statsTask = TaskFactory.get(statsWork, hconf); // mark the MapredWork and FileSinkOperator for gathering stats nd.getConf().setGatherStats(true); mrWork.setGatheringStats(true); // mrWork.addDestinationTable(nd.getConf().getTableInfo().getTableName()); // subscribe feeds from the MoveTask so that MoveTask can forward the list // of dynamic partitions to the StatsTask mvTask.addDependentTask(statsTask); statsTask.subscribeFeed(mvTask); }
/** * Process the FileSink operator to generate a MoveTask if necessary. * * @param nd current FileSink operator * @param stack parent operators * @param opProcCtx * @param chDir whether the operator should be first output to a tmp dir and then merged to the * final dir later * @return the final file name to which the FileSinkOperator should store. * @throws SemanticException */ private String processFS(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, boolean chDir) throws SemanticException { // Is it the dummy file sink after the mapjoin? FileSinkOperator fsOp = (FileSinkOperator) nd; if ((fsOp.getParentOperators().size() == 1) && (fsOp.getParentOperators().get(0) instanceof MapJoinOperator)) { return null; } GenMRProcContext ctx = (GenMRProcContext) opProcCtx; List<FileSinkOperator> seenFSOps = ctx.getSeenFileSinkOps(); if (seenFSOps == null) { seenFSOps = new ArrayList<FileSinkOperator>(); } if (!seenFSOps.contains(fsOp)) { seenFSOps.add(fsOp); } ctx.setSeenFileSinkOps(seenFSOps); Task<? extends Serializable> currTask = ctx.getCurrTask(); // If the directory needs to be changed, send the new directory String dest = null; if (chDir) { dest = fsOp.getConf().getDirName(); // generate the temporary file // it must be on the same file system as the current destination ParseContext parseCtx = ctx.getParseCtx(); Context baseCtx = parseCtx.getContext(); String tmpDir = baseCtx.getExternalTmpFileURI((new Path(dest)).toUri()); fsOp.getConf().setDirName(tmpDir); } Task<? extends Serializable> mvTask = null; if (!chDir) { mvTask = findMoveTask(ctx.getMvTask(), fsOp); } Operator<? extends Serializable> currTopOp = ctx.getCurrTopOp(); String currAliasId = ctx.getCurrAliasId(); HashMap<Operator<? extends Serializable>, Task<? extends Serializable>> opTaskMap = ctx.getOpTaskMap(); List<Operator<? extends Serializable>> seenOps = ctx.getSeenOps(); List<Task<? extends Serializable>> rootTasks = ctx.getRootTasks(); // Set the move task to be dependent on the current task if (mvTask != null) { currTask.addDependentTask(mvTask); } // In case of multi-table insert, the path to alias mapping is needed for // all the sources. Since there is no // reducer, treat it as a plan with null reducer // If it is a map-only job, the task needs to be processed if (currTopOp != null) { Task<? extends Serializable> mapTask = opTaskMap.get(null); if (mapTask == null) { assert (!seenOps.contains(currTopOp)); seenOps.add(currTopOp); GenMapRedUtils.setTaskPlan( currAliasId, currTopOp, (MapredWork) currTask.getWork(), false, ctx); opTaskMap.put(null, currTask); rootTasks.add(currTask); } else { if (!seenOps.contains(currTopOp)) { seenOps.add(currTopOp); GenMapRedUtils.setTaskPlan( currAliasId, currTopOp, (MapredWork) mapTask.getWork(), false, ctx); } // mapTask and currTask should be merged by a join/union operator // (e.g., GenMRUnion1) which has multiple topOps. assert mapTask == currTask : "mapTask.id = " + mapTask.getId() + "; currTask.id = " + currTask.getId(); } return dest; } UnionOperator currUnionOp = ctx.getCurrUnionOp(); if (currUnionOp != null) { opTaskMap.put(null, currTask); GenMapRedUtils.initUnionPlan(ctx, currTask, false); return dest; } AbstractMapJoinOperator<? extends MapJoinDesc> currMapJoinOp = ctx.getCurrMapJoinOp(); if (currMapJoinOp != null) { opTaskMap.put(null, currTask); GenMRMapJoinCtx mjCtx = ctx.getMapJoinCtx(currMapJoinOp); MapredWork plan = (MapredWork) currTask.getWork(); String taskTmpDir = mjCtx.getTaskTmpDir(); TableDesc tt_desc = mjCtx.getTTDesc(); assert plan.getPathToAliases().get(taskTmpDir) == null; plan.getPathToAliases().put(taskTmpDir, new ArrayList<String>()); plan.getPathToAliases().get(taskTmpDir).add(taskTmpDir); plan.getPathToPartitionInfo().put(taskTmpDir, new PartitionDesc(tt_desc, null)); plan.getAliasToWork().put(taskTmpDir, mjCtx.getRootMapJoinOp()); return dest; } return dest; }
public static void mergeMapJoinUnion(UnionOperator union, GenMRProcContext ctx, int pos) throws SemanticException { ParseContext parseCtx = ctx.getParseCtx(); UnionProcContext uCtx = parseCtx.getUCtx(); UnionParseContext uPrsCtx = uCtx.getUnionParseContext(union); assert uPrsCtx != null; Task<? extends Serializable> currTask = ctx.getCurrTask(); GenMRUnionCtx uCtxTask = ctx.getUnionTask(union); Task<? extends Serializable> uTask = null; union.getParentOperators().get(pos); MapredWork uPlan = null; // union is encountered for the first time if (uCtxTask == null) { uCtxTask = new GenMRUnionCtx(); uPlan = GenMapRedUtils.getMapRedWork(parseCtx.getConf()); uTask = TaskFactory.get(uPlan, parseCtx.getConf()); uCtxTask.setUTask(uTask); ctx.setUnionTask(union, uCtxTask); } else { uTask = uCtxTask.getUTask(); uPlan = (MapredWork) uTask.getWork(); } // If there is a mapjoin at position 'pos' if (uPrsCtx.getMapJoinSubq(pos)) { GenMRMapJoinCtx mjCtx = ctx.getMapJoinCtx(ctx.getCurrMapJoinOp()); String taskTmpDir = mjCtx.getTaskTmpDir(); if (uPlan.getPathToAliases().get(taskTmpDir) == null) { uPlan.getPathToAliases().put(taskTmpDir, new ArrayList<String>()); uPlan.getPathToAliases().get(taskTmpDir).add(taskTmpDir); uPlan.getPathToPartitionInfo().put(taskTmpDir, new PartitionDesc(mjCtx.getTTDesc(), null)); uPlan.getAliasToWork().put(taskTmpDir, mjCtx.getRootMapJoinOp()); } for (Task t : currTask.getParentTasks()) { t.addDependentTask(uTask); } try { boolean notDone = true; while (notDone) { for (Task t : currTask.getParentTasks()) { t.removeDependentTask(currTask); } notDone = false; } } catch (ConcurrentModificationException e) { } } else { setTaskPlan(ctx.getCurrAliasId(), ctx.getCurrTopOp(), uPlan, false, ctx); } ctx.setCurrTask(uTask); ctx.setCurrAliasId(null); ctx.setCurrTopOp(null); ctx.setCurrMapJoinOp(null); ctx.getMapCurrCtx().put(union, new GenMapRedCtx(ctx.getCurrTask(), null, null)); }
@SuppressWarnings("nls") /** * Merge the tasks - by creating a temporary file between them. * * @param op reduce sink operator being processed * @param oldTask the parent task * @param task the child task * @param opProcCtx context * @param setReducer does the reducer needs to be set * @param pos position of the parent */ public static void splitTasks( Operator<? extends Serializable> op, Task<? extends Serializable> parentTask, Task<? extends Serializable> childTask, GenMRProcContext opProcCtx, boolean setReducer, boolean local, int posn) throws SemanticException { childTask.getWork(); Operator<? extends Serializable> currTopOp = opProcCtx.getCurrTopOp(); ParseContext parseCtx = opProcCtx.getParseCtx(); parentTask.addDependentTask(childTask); // Root Task cannot depend on any other task, therefore childTask cannot be // a root Task List<Task<? extends Serializable>> rootTasks = opProcCtx.getRootTasks(); if (rootTasks.contains(childTask)) { rootTasks.remove(childTask); } // generate the temporary file Context baseCtx = parseCtx.getContext(); String taskTmpDir = baseCtx.getMRTmpFileURI(); Operator<? extends Serializable> parent = op.getParentOperators().get(posn); TableDesc tt_desc = PlanUtils.getIntermediateFileTableDesc( PlanUtils.getFieldSchemasFromRowSchema(parent.getSchema(), "temporarycol")); // Create a file sink operator for this file name boolean compressIntermediate = parseCtx.getConf().getBoolVar(HiveConf.ConfVars.COMPRESSINTERMEDIATE); FileSinkDesc desc = new FileSinkDesc(taskTmpDir, tt_desc, compressIntermediate); if (compressIntermediate) { desc.setCompressCodec(parseCtx.getConf().getVar(HiveConf.ConfVars.COMPRESSINTERMEDIATECODEC)); desc.setCompressType(parseCtx.getConf().getVar(HiveConf.ConfVars.COMPRESSINTERMEDIATETYPE)); } Operator<? extends Serializable> fs_op = putOpInsertMap(OperatorFactory.get(desc, parent.getSchema()), null, parseCtx); // replace the reduce child with this operator List<Operator<? extends Serializable>> childOpList = parent.getChildOperators(); for (int pos = 0; pos < childOpList.size(); pos++) { if (childOpList.get(pos) == op) { childOpList.set(pos, fs_op); break; } } List<Operator<? extends Serializable>> parentOpList = new ArrayList<Operator<? extends Serializable>>(); parentOpList.add(parent); fs_op.setParentOperators(parentOpList); // create a dummy tableScan operator on top of op // TableScanOperator is implicitly created here for each MapOperator RowResolver rowResolver = opProcCtx.getParseCtx().getOpParseCtx().get(parent).getRowResolver(); Operator<? extends Serializable> ts_op = putOpInsertMap( OperatorFactory.get(TableScanDesc.class, parent.getSchema()), rowResolver, parseCtx); childOpList = new ArrayList<Operator<? extends Serializable>>(); childOpList.add(op); ts_op.setChildOperators(childOpList); op.getParentOperators().set(posn, ts_op); Map<Operator<? extends Serializable>, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx(); mapCurrCtx.put(ts_op, new GenMapRedCtx(childTask, null, null)); String streamDesc = taskTmpDir; MapredWork cplan = (MapredWork) childTask.getWork(); if (setReducer) { Operator<? 
extends Serializable> reducer = op.getChildOperators().get(0); if (reducer.getClass() == JoinOperator.class) { String origStreamDesc; streamDesc = "$INTNAME"; origStreamDesc = streamDesc; int pos = 0; while (cplan.getAliasToWork().get(streamDesc) != null) { streamDesc = origStreamDesc.concat(String.valueOf(++pos)); } } // TODO: Allocate work to remove the temporary files and make that // dependent on the redTask if (reducer.getClass() == JoinOperator.class) { cplan.setNeedsTagging(true); } } // Add the path to alias mapping setTaskPlan(taskTmpDir, streamDesc, ts_op, cplan, local, tt_desc); // This can be cleaned up as a function table in future if (op instanceof AbstractMapJoinOperator<?>) { AbstractMapJoinOperator<? extends MapJoinDesc> mjOp = (AbstractMapJoinOperator<? extends MapJoinDesc>) op; opProcCtx.setCurrMapJoinOp(mjOp); GenMRMapJoinCtx mjCtx = opProcCtx.getMapJoinCtx(mjOp); if (mjCtx == null) { mjCtx = new GenMRMapJoinCtx(taskTmpDir, tt_desc, ts_op, null); } else { mjCtx.setTaskTmpDir(taskTmpDir); mjCtx.setTTDesc(tt_desc); mjCtx.setRootMapJoinOp(ts_op); } opProcCtx.setMapJoinCtx(mjOp, mjCtx); opProcCtx.getMapCurrCtx().put(parent, new GenMapRedCtx(childTask, null, null)); setupBucketMapJoinInfo(cplan, mjOp, false); } currTopOp = null; String currAliasId = null; opProcCtx.setCurrTopOp(currTopOp); opProcCtx.setCurrAliasId(currAliasId); opProcCtx.setCurrTask(childTask); }
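// Sketch of the alias-collision handling inside splitTasks: when the reducer is a join, the
// intermediate stream is named "$INTNAME", and a numeric suffix is appended until the name is
// free in aliasToWork. Plain strings stand in for the operator values in the real map.
import java.util.HashMap;
import java.util.Map;

public class StreamAliasSketch {
  static String uniqueStreamName(Map<String, String> aliasToWork, String base) {
    String name = base;
    int pos = 0;
    while (aliasToWork.get(name) != null) {        // keep appending 1, 2, ... until unused
      name = base.concat(String.valueOf(++pos));
    }
    return name;
  }

  public static void main(String[] args) {
    Map<String, String> aliasToWork = new HashMap<String, String>();
    aliasToWork.put("$INTNAME", "existing-branch");
    System.out.println(uniqueStreamName(aliasToWork, "$INTNAME")); // $INTNAME1
  }
}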
/** * Merge the current task with the task for the current reducer. * * @param op operator being processed * @param oldTask the old task for the current reducer * @param task the current task for the current reducer * @param opProcCtx processing context * @param pos position of the parent in the stack */ public static void joinPlan( Operator<? extends Serializable> op, Task<? extends Serializable> oldTask, Task<? extends Serializable> task, GenMRProcContext opProcCtx, int pos, boolean split, boolean readMapJoinData, boolean readUnionData, boolean createLocalWork) throws SemanticException { Task<? extends Serializable> currTask = task; MapredWork plan = (MapredWork) currTask.getWork(); Operator<? extends Serializable> currTopOp = opProcCtx.getCurrTopOp(); List<Task<? extends Serializable>> parTasks = null; // terminate the old task and make current task dependent on it if (split) { assert oldTask != null; splitTasks(op, oldTask, currTask, opProcCtx, true, false, 0); } else { if ((oldTask != null) && (oldTask.getParentTasks() != null) && !oldTask.getParentTasks().isEmpty()) { parTasks = new ArrayList<Task<? extends Serializable>>(); parTasks.addAll(oldTask.getParentTasks()); Object[] parTaskArr = parTasks.toArray(); for (Object element : parTaskArr) { ((Task<? extends Serializable>) element).removeDependentTask(oldTask); } } } if (currTopOp != null) { List<Operator<? extends Serializable>> seenOps = opProcCtx.getSeenOps(); String currAliasId = opProcCtx.getCurrAliasId(); if (!seenOps.contains(currTopOp)) { seenOps.add(currTopOp); boolean local = false; if (pos != -1) { local = (pos == ((MapJoinDesc) op.getConf()).getPosBigTable()) ? false : true; } setTaskPlan(currAliasId, currTopOp, plan, local, opProcCtx); if (op instanceof AbstractMapJoinOperator) { setupBucketMapJoinInfo( plan, (AbstractMapJoinOperator<? extends MapJoinDesc>) op, createLocalWork); } } currTopOp = null; opProcCtx.setCurrTopOp(currTopOp); } else if (opProcCtx.getCurrMapJoinOp() != null) { AbstractMapJoinOperator<? extends MapJoinDesc> mjOp = opProcCtx.getCurrMapJoinOp(); if (readUnionData) { initUnionPlan(opProcCtx, currTask, false); } else { GenMRMapJoinCtx mjCtx = opProcCtx.getMapJoinCtx(mjOp); // In case of map-join followed by map-join, the file needs to be // obtained from the old map join AbstractMapJoinOperator<? extends MapJoinDesc> oldMapJoin = mjCtx.getOldMapJoin(); String taskTmpDir = null; TableDesc tt_desc = null; Operator<? extends Serializable> rootOp = null; boolean local = ((pos == -1) || (pos == (mjOp.getConf()).getPosBigTable())) ? false : true; if (oldMapJoin == null) { if (opProcCtx.getParseCtx().getListMapJoinOpsNoReducer().contains(mjOp) || local || (oldTask != null) && (parTasks != null)) { taskTmpDir = mjCtx.getTaskTmpDir(); tt_desc = mjCtx.getTTDesc(); rootOp = mjCtx.getRootMapJoinOp(); } } else { GenMRMapJoinCtx oldMjCtx = opProcCtx.getMapJoinCtx(oldMapJoin); assert oldMjCtx != null; taskTmpDir = oldMjCtx.getTaskTmpDir(); tt_desc = oldMjCtx.getTTDesc(); rootOp = oldMjCtx.getRootMapJoinOp(); } setTaskPlan(taskTmpDir, taskTmpDir, rootOp, plan, local, tt_desc); setupBucketMapJoinInfo(plan, oldMapJoin, createLocalWork); } opProcCtx.setCurrMapJoinOp(null); if ((oldTask != null) && (parTasks != null)) { for (Task<? extends Serializable> parTask : parTasks) { parTask.addDependentTask(currTask); if (opProcCtx.getRootTasks().contains(currTask)) { opProcCtx.getRootTasks().remove(currTask); } } } } opProcCtx.setCurrTask(currTask); }
private void outputPlan(Task<? extends Serializable> task) { if (task == null) return; out.printf("Stage: %s\n", task.getId()); // real output Serializable work = task.getWork(); if (work == null) return; if (work instanceof FetchWork) { out.println("Fetch"); output(((FetchWork) work).getSource()); } else if (work instanceof MapredLocalWork) { out.println("MapredLocalWork"); // fetch try { out.println("Fetch Part"); Collection<FetchWork> fetchWorkCollect = ((MapredLocalWork) work).getAliasToFetchWork().values(); for (FetchWork f : fetchWorkCollect) { output(f.getSource()); } } catch (Exception e) { out.println("Exception 1"); } // others try { out.println("Other Parts"); Collection<Operator<? extends OperatorDesc>> collect = ((MapredLocalWork) work).getAliasToWork().values(); for (Operator<? extends OperatorDesc> c : collect) { output(c); } } catch (Exception e) { out.println("Exception 2"); } } else if (work instanceof MapredWork) { out.println("MapredWork"); try { Collection<Operator<? extends OperatorDesc>> collect = ((MapredWork) work).getAllOperators(); for (Operator<? extends OperatorDesc> c : collect) { // out.println(1); output(c); break; // the first operator will give out all the info } } catch (Exception e) { out.println("Exception 3"); } } else { output(work); } // -------other cases-------------------- if (task instanceof ConditionalTask && ((ConditionalTask) task).getListTasks() != null) { for (Task<? extends Serializable> con : ((ConditionalTask) task).getListTasks()) { outputPlan(con); } } if (task.getChildTasks() != null) { for (Task<? extends Serializable> child : task.getChildTasks()) { outputPlan(child); } } }
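// Self-contained sketch (a toy task node, not Hive's Task/ConditionalTask classes) of the
// traversal order in outputPlan above: print the current stage, then recurse into any
// conditional sub-tasks, then into the child tasks.
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class OutputPlanSketch {
  static class ToyTask {
    final String id;
    List<ToyTask> listTasks = Collections.emptyList();  // conditional sub-tasks
    List<ToyTask> childTasks = Collections.emptyList(); // dependent tasks
    ToyTask(String id) { this.id = id; }
  }

  static void outputPlan(ToyTask task) {
    if (task == null) return;
    System.out.printf("Stage: %s%n", task.id);
    for (ToyTask sub : task.listTasks) outputPlan(sub);
    for (ToyTask child : task.childTasks) outputPlan(child);
  }

  public static void main(String[] args) {
    ToyTask root = new ToyTask("Stage-1");
    root.childTasks = new ArrayList<ToyTask>();
    root.childTasks.add(new ToyTask("Stage-2"));
    outputPlan(root); // prints Stage-1, then Stage-2
  }
}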
/** Post analyze hook that invokes hive auth bindings */ @Override public void postAnalyze( HiveSemanticAnalyzerHookContext context, List<Task<? extends Serializable>> rootTasks) throws SemanticException { HiveOperation stmtOperation = getCurrentHiveStmtOp(); HiveAuthzPrivileges stmtAuthObject; stmtAuthObject = HiveAuthzPrivilegesMap.getHiveAuthzPrivileges(stmtOperation); // must occur above the null check on stmtAuthObject // since GRANT/REVOKE/etc are not authorized by binding layer at present Subject subject = getCurrentSubject(context); Set<String> subjectGroups = hiveAuthzBinding.getGroups(subject); for (Task<? extends Serializable> task : rootTasks) { if (task instanceof SentryGrantRevokeTask) { SentryGrantRevokeTask sentryTask = (SentryGrantRevokeTask) task; sentryTask.setHiveAuthzBinding(hiveAuthzBinding); sentryTask.setAuthzConf(authzConf); sentryTask.setSubject(subject); sentryTask.setSubjectGroups(subjectGroups); sentryTask.setIpAddress(context.getIpAddress()); sentryTask.setOperation(stmtOperation); } } try { if (stmtAuthObject == null) { // We don't handle authorizing this statement return; } /** * Replace DDLTask with the SentryFilterDDLTask for protection; for example, "show columns" * should only show columns that the user can access. SENTRY-847 */ for (int i = 0; i < rootTasks.size(); i++) { Task<? extends Serializable> task = rootTasks.get(i); if (task instanceof DDLTask) { SentryFilterDDLTask filterTask = new SentryFilterDDLTask(hiveAuthzBinding, subject, stmtOperation); filterTask.setWork((DDLWork) task.getWork()); rootTasks.set(i, filterTask); } } authorizeWithHiveBindings(context, stmtAuthObject, stmtOperation); } catch (AuthorizationException e) { executeOnFailureHooks(context, stmtOperation, e); String permsRequired = ""; for (String perm : hiveAuthzBinding.getLastQueryPrivilegeErrors()) { permsRequired += perm + ";"; } SessionState.get().getConf().set(HiveAuthzConf.HIVE_SENTRY_AUTH_ERRORS, permsRequired); String msgForLog = HiveAuthzConf.HIVE_SENTRY_PRIVILEGE_ERROR_MESSAGE + "\n Required privileges for this query: " + permsRequired; String msgForConsole = HiveAuthzConf.HIVE_SENTRY_PRIVILEGE_ERROR_MESSAGE + "\n " + e.getMessage() + "\n The required privileges: " + permsRequired; // AuthorizationException is not a real exception, use the info level to record this. LOG.info(msgForLog); throw new SemanticException(msgForConsole, e); } finally { hiveAuthzBinding.close(); } if ("true" .equalsIgnoreCase(context.getConf().get(HiveAuthzConf.HIVE_SENTRY_MOCK_COMPILATION))) { throw new SemanticException( HiveAuthzConf.HIVE_SENTRY_MOCK_ERROR + " Mock query compilation aborted. Set " + HiveAuthzConf.HIVE_SENTRY_MOCK_COMPILATION + " to 'false' for normal query processing"); } }
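// Simplified sketch (strings instead of Hive Task objects) of the SENTRY-847 loop above: each
// DDL entry in rootTasks is replaced in place by a filtering wrapper, keeping its position so
// the rest of the plan is untouched.
import java.util.ArrayList;
import java.util.List;

public class FilterDdlTaskSketch {
  public static void main(String[] args) {
    List<String> rootTasks = new ArrayList<String>();
    rootTasks.add("DDLTask:show columns");
    rootTasks.add("MapRedTask:select");
    for (int i = 0; i < rootTasks.size(); i++) {
      if (rootTasks.get(i).startsWith("DDLTask")) {
        rootTasks.set(i, "SentryFilterDDLTask(" + rootTasks.get(i) + ")"); // wrap, same slot
      }
    }
    System.out.println(rootTasks); // [SentryFilterDDLTask(DDLTask:show columns), MapRedTask:select]
  }
}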
/** * Initialize the current plan by adding it to root tasks. * * @param op the map join operator encountered * @param opProcCtx processing context * @param pos position of the parent */ public static void initMapJoinPlan( Operator<? extends Serializable> op, GenMRProcContext opProcCtx, boolean readInputMapJoin, boolean readInputUnion, boolean setReducer, int pos, boolean createLocalPlan) throws SemanticException { Map<Operator<? extends Serializable>, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx(); assert (((pos == -1) && (readInputMapJoin)) || (pos != -1)); int parentPos = (pos == -1) ? 0 : pos; GenMapRedCtx mapredCtx = mapCurrCtx.get(op.getParentOperators().get(parentPos)); Task<? extends Serializable> currTask = mapredCtx.getCurrTask(); MapredWork plan = (MapredWork) currTask.getWork(); HashMap<Operator<? extends Serializable>, Task<? extends Serializable>> opTaskMap = opProcCtx.getOpTaskMap(); Operator<? extends Serializable> currTopOp = opProcCtx.getCurrTopOp(); // The mapjoin has already been encountered. Some context must be stored // about that if (readInputMapJoin) { AbstractMapJoinOperator<? extends MapJoinDesc> currMapJoinOp = opProcCtx.getCurrMapJoinOp(); assert currMapJoinOp != null; boolean local = ((pos == -1) || (pos == (currMapJoinOp.getConf()).getPosBigTable())) ? false : true; if (setReducer) { Operator<? extends Serializable> reducer = op.getChildOperators().get(0); plan.setReducer(reducer); opTaskMap.put(reducer, currTask); if (reducer.getClass() == JoinOperator.class) { plan.setNeedsTagging(true); } ReduceSinkDesc desc = (ReduceSinkDesc) op.getConf(); plan.setNumReduceTasks(desc.getNumReducers()); } else { opTaskMap.put(op, currTask); } if (!readInputUnion) { GenMRMapJoinCtx mjCtx = opProcCtx.getMapJoinCtx(currMapJoinOp); String taskTmpDir; TableDesc tt_desc; Operator<? extends Serializable> rootOp; if (mjCtx.getOldMapJoin() == null || setReducer) { taskTmpDir = mjCtx.getTaskTmpDir(); tt_desc = mjCtx.getTTDesc(); rootOp = mjCtx.getRootMapJoinOp(); } else { GenMRMapJoinCtx oldMjCtx = opProcCtx.getMapJoinCtx(mjCtx.getOldMapJoin()); taskTmpDir = oldMjCtx.getTaskTmpDir(); tt_desc = oldMjCtx.getTTDesc(); rootOp = oldMjCtx.getRootMapJoinOp(); } setTaskPlan(taskTmpDir, taskTmpDir, rootOp, plan, local, tt_desc); setupBucketMapJoinInfo(plan, currMapJoinOp, createLocalPlan); } else { initUnionPlan(opProcCtx, currTask, false); } opProcCtx.setCurrMapJoinOp(null); } else { MapJoinDesc desc = (MapJoinDesc) op.getConf(); // The map is overloaded to keep track of mapjoins also opTaskMap.put(op, currTask); List<Task<? extends Serializable>> rootTasks = opProcCtx.getRootTasks(); rootTasks.add(currTask); assert currTopOp != null; List<Operator<? extends Serializable>> seenOps = opProcCtx.getSeenOps(); String currAliasId = opProcCtx.getCurrAliasId(); seenOps.add(currTopOp); boolean local = (pos == desc.getPosBigTable()) ? false : true; setTaskPlan(currAliasId, currTopOp, plan, local, opProcCtx); setupBucketMapJoinInfo( plan, (AbstractMapJoinOperator<? extends MapJoinDesc>) op, createLocalPlan); } opProcCtx.setCurrTask(currTask); opProcCtx.setCurrTopOp(null); opProcCtx.setCurrAliasId(null); }
/** * File Sink Operator encountered. * * @param nd the file sink operator encountered * @param opProcCtx context */ public Object process( Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException { GenMRProcContext ctx = (GenMRProcContext) opProcCtx; ParseContext parseCtx = ctx.getParseCtx(); boolean chDir = false; Task<? extends Serializable> currTask = ctx.getCurrTask(); FileSinkOperator fsOp = (FileSinkOperator) nd; boolean isInsertTable = // is INSERT OVERWRITE TABLE fsOp.getConf().getTableInfo().getTableName() != null && parseCtx.getQB().getParseInfo().isInsertToTable(); HiveConf hconf = parseCtx.getConf(); // Mark this task as a final map reduce task (ignoring the optional merge task) ((MapredWork) currTask.getWork()).setFinalMapRed(true); // If this file sink desc has been processed due to a linked file sink desc, // use that task Map<FileSinkDesc, Task<? extends Serializable>> fileSinkDescs = ctx.getLinkedFileDescTasks(); if (fileSinkDescs != null) { Task<? extends Serializable> childTask = fileSinkDescs.get(fsOp.getConf()); processLinkedFileDesc(ctx, childTask); return null; } // Has the user enabled merging of files for map-only jobs or for all jobs if ((ctx.getMvTask() != null) && (!ctx.getMvTask().isEmpty())) { List<Task<MoveWork>> mvTasks = ctx.getMvTask(); // In case of unions or map-joins, it is possible that the file has // already been seen. // So, no need to attempt to merge the files again. if ((ctx.getSeenFileSinkOps() == null) || (!ctx.getSeenFileSinkOps().contains(nd))) { // no need of merging if the move is to a local file system MoveTask mvTask = (MoveTask) findMoveTask(mvTasks, fsOp); if (isInsertTable && hconf.getBoolVar(ConfVars.HIVESTATSAUTOGATHER)) { addStatsTask(fsOp, mvTask, currTask, parseCtx.getConf()); } if ((mvTask != null) && !mvTask.isLocal() && fsOp.getConf().canBeMerged()) { if (fsOp.getConf().isLinkedFileSink()) { // If the user has HIVEMERGEMAPREDFILES set to false, the assumption was that the // number of reducers is small, so the number of files is small anyway. // However, with this optimization, we are increasing the number of files // possibly by a big margin. So, merge aggressively. if (hconf.getBoolVar(ConfVars.HIVEMERGEMAPFILES) || hconf.getBoolVar(ConfVars.HIVEMERGEMAPREDFILES)) { chDir = true; } } else { // There are separate configuration parameters to control whether to // merge for a map-only job // or for a map-reduce job MapredWork currWork = (MapredWork) currTask.getWork(); boolean mergeMapOnly = hconf.getBoolVar(ConfVars.HIVEMERGEMAPFILES) && currWork.getReducer() == null; boolean mergeMapRed = hconf.getBoolVar(ConfVars.HIVEMERGEMAPREDFILES) && currWork.getReducer() != null; if (mergeMapOnly || mergeMapRed) { chDir = true; } } } } } String finalName = processFS(fsOp, stack, opProcCtx, chDir); if (chDir) { // Merge the files in the destination table/partitions by creating a map-only merge job. // If the underlying data is RCFile or OrcFile, a BlockMerge task will be created. LOG.info("using CombineHiveInputformat for the merge job"); createMRWorkForMergingFiles(fsOp, ctx, finalName); } FileSinkDesc fileSinkDesc = fsOp.getConf(); if (fileSinkDesc.isLinkedFileSink()) { Map<FileSinkDesc, Task<? extends Serializable>> linkedFileDescTasks = ctx.getLinkedFileDescTasks(); if (linkedFileDescTasks == null) { linkedFileDescTasks = new HashMap<FileSinkDesc, Task<? extends Serializable>>(); ctx.setLinkedFileDescTasks(linkedFileDescTasks); } // The child tasks may be null in case of a select if ((currTask.getChildTasks() != null) && (currTask.getChildTasks().size() == 1)) { for (FileSinkDesc fileDesc : fileSinkDesc.getLinkedFileSinkDesc()) { linkedFileDescTasks.put(fileDesc, currTask.getChildTasks().get(0)); } } } return null; }
/** * Process the FileSink operator to generate a MoveTask if necessary. * * @param fsOp current FileSink operator * @param stack parent operators * @param opProcCtx * @param chDir whether the operator should be first output to a tmp dir and then merged to the * final dir later * @return the final file name to which the FileSinkOperator should store. * @throws SemanticException */ private String processFS( FileSinkOperator fsOp, Stack<Node> stack, NodeProcessorCtx opProcCtx, boolean chDir) throws SemanticException { GenMRProcContext ctx = (GenMRProcContext) opProcCtx; List<FileSinkOperator> seenFSOps = ctx.getSeenFileSinkOps(); if (seenFSOps == null) { seenFSOps = new ArrayList<FileSinkOperator>(); } if (!seenFSOps.contains(fsOp)) { seenFSOps.add(fsOp); } ctx.setSeenFileSinkOps(seenFSOps); Task<? extends Serializable> currTask = ctx.getCurrTask(); // If the directory needs to be changed, send the new directory String dest = null; if (chDir) { dest = fsOp.getConf().getFinalDirName(); // generate the temporary file // it must be on the same file system as the current destination ParseContext parseCtx = ctx.getParseCtx(); Context baseCtx = parseCtx.getContext(); String tmpDir = baseCtx.getExternalTmpFileURI((new Path(dest)).toUri()); FileSinkDesc fileSinkDesc = fsOp.getConf(); // Change all the linked file sink descriptors if (fileSinkDesc.isLinkedFileSink()) { for (FileSinkDesc fsConf : fileSinkDesc.getLinkedFileSinkDesc()) { String fileName = Utilities.getFileNameFromDirName(fsConf.getDirName()); fsConf.setParentDir(tmpDir); fsConf.setDirName(tmpDir + Path.SEPARATOR + fileName); } } else { fileSinkDesc.setDirName(tmpDir); } } Task<MoveWork> mvTask = null; if (!chDir) { mvTask = findMoveTask(ctx.getMvTask(), fsOp); } Operator<? extends OperatorDesc> currTopOp = ctx.getCurrTopOp(); String currAliasId = ctx.getCurrAliasId(); HashMap<Operator<? extends OperatorDesc>, Task<? extends Serializable>> opTaskMap = ctx.getOpTaskMap(); List<Operator<? extends OperatorDesc>> seenOps = ctx.getSeenOps(); List<Task<? extends Serializable>> rootTasks = ctx.getRootTasks(); // Set the move task to be dependent on the current task if (mvTask != null) { addDependentMoveTasks(ctx, mvTask, currTask); } // In case of multi-table insert, the path to alias mapping is needed for // all the sources. Since there is no // reducer, treat it as a plan with null reducer // If it is a map-only job, the task needs to be processed if (currTopOp != null) { Task<? extends Serializable> mapTask = opTaskMap.get(null); if (mapTask == null) { if (!seenOps.contains(currTopOp)) { seenOps.add(currTopOp); GenMapRedUtils.setTaskPlan( currAliasId, currTopOp, (MapredWork) currTask.getWork(), false, ctx); } opTaskMap.put(null, currTask); if (!rootTasks.contains(currTask) && (currTask.getParentTasks() == null || currTask.getParentTasks().isEmpty())) { rootTasks.add(currTask); } } else { if (!seenOps.contains(currTopOp)) { seenOps.add(currTopOp); GenMapRedUtils.setTaskPlan( currAliasId, currTopOp, (MapredWork) mapTask.getWork(), false, ctx); } else { UnionOperator currUnionOp = ctx.getCurrUnionOp(); if (currUnionOp != null) { opTaskMap.put(null, currTask); ctx.setCurrTopOp(null); GenMapRedUtils.initUnionPlan(ctx, currUnionOp, currTask, false); return dest; } } // mapTask and currTask should be merged by a join/union operator // (e.g., GenMRUnion1) which has multiple topOps. // assert mapTask == currTask : "mapTask.id = " + mapTask.getId() // + "; currTask.id = " + currTask.getId(); } return dest; } UnionOperator currUnionOp = ctx.getCurrUnionOp(); if (currUnionOp != null) { opTaskMap.put(null, currTask); GenMapRedUtils.initUnionPlan(ctx, currUnionOp, currTask, false); return dest; } return dest; }