/** * Construct a conditional task given the current leaf task, the MoveWork and the MapredWork. * * @param conf HiveConf * @param currTask current leaf task * @param mvWork MoveWork for the move task * @param mergeWork MapredWork for the merge task. * @param inputPath the input directory of the merge/move task * @return The conditional task */ private ConditionalTask createCondTask( HiveConf conf, Task<? extends Serializable> currTask, MoveWork mvWork, MapredWork mergeWork, String inputPath) { // There are 3 options for this ConditionalTask: // 1) Merge the partitions // 2) Move the partitions (i.e. don't merge the partitions) // 3) Merge some partitions and move other partitions (i.e. merge some partitions and don't // merge others) in this case the merge is done first followed by the move to prevent // conflicts. Task<? extends Serializable> mergeOnlyMergeTask = TaskFactory.get(mergeWork, conf); Task<? extends Serializable> moveOnlyMoveTask = TaskFactory.get(mvWork, conf); Task<? extends Serializable> mergeAndMoveMergeTask = TaskFactory.get(mergeWork, conf); Task<? extends Serializable> mergeAndMoveMoveTask = TaskFactory.get(mvWork, conf); // NOTE! It is necessary merge task is the parent of the move task, and not // the other way around, for the proper execution of the execute method of // ConditionalTask mergeAndMoveMergeTask.addDependentTask(mergeAndMoveMoveTask); List<Serializable> listWorks = new ArrayList<Serializable>(); listWorks.add(mvWork); listWorks.add(mergeWork); ConditionalWork cndWork = new ConditionalWork(listWorks); List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? 
extends Serializable>>(); listTasks.add(moveOnlyMoveTask); listTasks.add(mergeOnlyMergeTask); listTasks.add(mergeAndMoveMergeTask); ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork, conf); cndTsk.setListTasks(listTasks); // create resolver cndTsk.setResolver(new ConditionalResolverMergeFiles()); ConditionalResolverMergeFilesCtx mrCtx = new ConditionalResolverMergeFilesCtx(listTasks, inputPath); cndTsk.setResolverCtx(mrCtx); // make the conditional task as the child of the current leaf task currTask.addDependentTask(cndTsk); return cndTsk; }
/** * Construct a conditional task given the current leaf task, the MoveWork and the MapredWork. * * @param conf HiveConf * @param currTask current leaf task * @param mvWork MoveWork for the move task * @param mergeWork MapredWork for the merge task. * @param inputPath the input directory of the merge/move task * @return The conditional task */ private ConditionalTask createCondTask( HiveConf conf, Task<? extends Serializable> currTask, MoveWork mvWork, MapredWork mergeWork, String inputPath) { Task<? extends Serializable> mergeTask = TaskFactory.get(mergeWork, conf); Task<? extends Serializable> moveTask = TaskFactory.get(mvWork, conf); List<Serializable> listWorks = new ArrayList<Serializable>(); listWorks.add(mvWork); listWorks.add(mergeWork); ConditionalWork cndWork = new ConditionalWork(listWorks); List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>(); listTasks.add(moveTask); listTasks.add(mergeTask); ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork, conf); cndTsk.setListTasks(listTasks); // create resolver cndTsk.setResolver(new ConditionalResolverMergeFiles()); ConditionalResolverMergeFilesCtx mrCtx = new ConditionalResolverMergeFilesCtx(listTasks, inputPath); cndTsk.setResolverCtx(mrCtx); // make the conditional task as the child of the current leaf task currTask.addDependentTask(cndTsk); return cndTsk; }
/**
 * Handles a file sink whose descriptor is linked to others: the task created for the first
 * linked file descriptor is reused.
 *
 * <p>If a current top operator is set, it is recorded as seen and added to the current task's
 * plan (once), and the current task is registered as a root task when it has no parents. The
 * given child task, if any, is then made dependent on the current task.
 *
 * @param ctx the map-reduce plan generation context
 * @param childTask task to run after the current task; may be {@code null}
 * @throws SemanticException declared by the method; presumably propagated from
 *     {@code GenMapRedUtils.setTaskPlan}
 */
private void processLinkedFileDesc(GenMRProcContext ctx, Task<? extends Serializable> childTask)
    throws SemanticException {
  Task<? extends Serializable> leafTask = ctx.getCurrTask();
  Operator<? extends OperatorDesc> topOp = ctx.getCurrTopOp();

  if (topOp != null) {
    List<Operator<? extends OperatorDesc>> visitedOps = ctx.getSeenOps();
    if (!visitedOps.contains(topOp)) {
      visitedOps.add(topOp);
      GenMapRedUtils.setTaskPlan(
          ctx.getCurrAliasId(), topOp, (MapredWork) leafTask.getWork(), false, ctx);
    }

    // A task without parents that is not yet registered becomes a root of the plan.
    List<Task<? extends Serializable>> roots = ctx.getRootTasks();
    if (!roots.contains(leafTask)) {
      boolean parentless =
          leafTask.getParentTasks() == null || leafTask.getParentTasks().isEmpty();
      if (parentless) {
        roots.add(leafTask);
      }
    }
  }

  if (childTask != null) {
    leafTask.addDependentTask(childTask);
  }
}
/** * handle partial scan command. * * <p>It is composed of PartialScanTask followed by StatsTask. */ private void handlePartialScanCommand( TableScanOperator tableScan, ParseContext parseContext, StatsWork statsWork, GenTezProcContext context, Task<StatsWork> statsTask) throws SemanticException { String aggregationKey = tableScan.getConf().getStatsAggPrefix(); StringBuilder aggregationKeyBuffer = new StringBuilder(aggregationKey); List<Path> inputPaths = GenMapRedUtils.getInputPathsForPartialScan(tableScan, aggregationKeyBuffer); aggregationKey = aggregationKeyBuffer.toString(); // scan work PartialScanWork scanWork = new PartialScanWork(inputPaths); scanWork.setMapperCannotSpanPartns(true); scanWork.setAggKey(aggregationKey); scanWork.setStatsTmpDir(tableScan.getConf().getTmpStatsDir(), parseContext.getConf()); // stats work statsWork.setPartialScanAnalyzeCommand(true); // partial scan task DriverContext driverCxt = new DriverContext(); Task<PartialScanWork> partialScanTask = TaskFactory.get(scanWork, parseContext.getConf()); partialScanTask.initialize(parseContext.getConf(), null, driverCxt); partialScanTask.setWork(scanWork); statsWork.setSourceTask(partialScanTask); // task dependency context.rootTasks.remove(context.currentTask); context.rootTasks.add(partialScanTask); partialScanTask.addDependentTask(statsTask); }
/** * handle partial scan command. It is composed of PartialScanTask followed by StatsTask . * * @param op * @param ctx * @param parseCtx * @param currTask * @param parseInfo * @param statsWork * @param statsTask * @throws SemanticException */ private void handlePartialScanCommand( TableScanOperator op, GenMRProcContext ctx, ParseContext parseCtx, Task<? extends Serializable> currTask, StatsWork statsWork, Task<StatsWork> statsTask) throws SemanticException { String aggregationKey = op.getConf().getStatsAggPrefix(); StringBuffer aggregationKeyBuffer = new StringBuffer(aggregationKey); List<Path> inputPaths = GenMapRedUtils.getInputPathsForPartialScan(op, aggregationKeyBuffer); aggregationKey = aggregationKeyBuffer.toString(); // scan work PartialScanWork scanWork = new PartialScanWork(inputPaths); scanWork.setMapperCannotSpanPartns(true); scanWork.setAggKey(aggregationKey); // stats work statsWork.setPartialScanAnalyzeCommand(true); // partial scan task DriverContext driverCxt = new DriverContext(); Task<PartialScanWork> psTask = TaskFactory.get(scanWork, parseCtx.getConf()); psTask.initialize(parseCtx.getConf(), null, driverCxt); psTask.setWork(scanWork); // task dependency ctx.getRootTasks().remove(currTask); ctx.getRootTasks().add(psTask); psTask.addDependentTask(statsTask); List<Task<? extends Serializable>> parentTasks = new ArrayList<Task<? extends Serializable>>(); parentTasks.add(psTask); statsTask.setParentTasks(parentTasks); }
/** * Adds the dependencyTaskForMultiInsert in ctx as a dependent of parentTask. If mvTask is a load * table, and HIVE_MULTI_INSERT_ATOMIC_OUTPUTS is set, adds mvTask as a dependent of * dependencyTaskForMultiInsert in ctx, otherwise adds mvTask as a dependent of parentTask as * well. * * @param ctx * @param mvTask * @param parentTask */ private void addDependentMoveTasks( GenMRProcContext ctx, Task<MoveWork> mvTask, Task<? extends Serializable> parentTask) { if (mvTask != null) { if (ctx.getConf().getBoolVar(ConfVars.HIVE_MULTI_INSERT_MOVE_TASKS_SHARE_DEPENDENCIES)) { DependencyCollectionTask dependencyTask = ctx.getDependencyTaskForMultiInsert(); parentTask.addDependentTask(dependencyTask); if (mvTask.getWork().getLoadTableWork() != null) { // Moving tables/partitions depend on the dependencyTask dependencyTask.addDependentTask(mvTask); } else { // Moving files depends on the parentTask (we still want the dependencyTask to depend // on the parentTask) parentTask.addDependentTask(mvTask); } } else { parentTask.addDependentTask(mvTask); } } }
/**
 * Makes the move task that belongs to {@code newOutput} a dependent of every task listed in the
 * conditional task.
 *
 * <p>The move task is looked up with {@code findMoveTask} among the move tasks held by
 * {@code ctx}; when none is found, nothing is linked.
 *
 * @param ctx map-reduce plan generation context holding the known move tasks
 * @param newOutput file sink operator whose move task is wanted
 * @param cndTsk conditional task whose listed tasks become parents of the move task
 */
private void LinkMoveTask(
    GenMRProcContext ctx, FileSinkOperator newOutput, ConditionalTask cndTsk) {
  Task<? extends Serializable> moveTask = findMoveTask(ctx.getMvTask(), newOutput);
  if (moveTask == null) {
    return;
  }

  for (Task<? extends Serializable> alternative : cndTsk.getListTasks()) {
    alternative.addDependentTask(moveTask);
  }
}
/** * Process the FileSink operator to generate a MoveTask if necessary. * * @param nd current FileSink operator * @param stack parent operators * @param opProcCtx * @param chDir whether the operator should be first output to a tmp dir and then merged to the * final dir later * @return the final file name to which the FileSinkOperator should store. * @throws SemanticException */ private String processFS(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, boolean chDir) throws SemanticException { // Is it the dummy file sink after the mapjoin FileSinkOperator fsOp = (FileSinkOperator) nd; if ((fsOp.getParentOperators().size() == 1) && (fsOp.getParentOperators().get(0) instanceof MapJoinOperator)) { return null; } GenMRProcContext ctx = (GenMRProcContext) opProcCtx; List<FileSinkOperator> seenFSOps = ctx.getSeenFileSinkOps(); if (seenFSOps == null) { seenFSOps = new ArrayList<FileSinkOperator>(); } if (!seenFSOps.contains(fsOp)) { seenFSOps.add(fsOp); } ctx.setSeenFileSinkOps(seenFSOps); Task<? extends Serializable> currTask = ctx.getCurrTask(); // If the directory needs to be changed, send the new directory String dest = null; if (chDir) { dest = fsOp.getConf().getDirName(); // generate the temporary file // it must be on the same file system as the current destination ParseContext parseCtx = ctx.getParseCtx(); Context baseCtx = parseCtx.getContext(); String tmpDir = baseCtx.getExternalTmpFileURI((new Path(dest)).toUri()); fsOp.getConf().setDirName(tmpDir); } Task<? extends Serializable> mvTask = null; if (!chDir) { mvTask = findMoveTask(ctx.getMvTask(), fsOp); } Operator<? extends Serializable> currTopOp = ctx.getCurrTopOp(); String currAliasId = ctx.getCurrAliasId(); HashMap<Operator<? extends Serializable>, Task<? extends Serializable>> opTaskMap = ctx.getOpTaskMap(); List<Operator<? extends Serializable>> seenOps = ctx.getSeenOps(); List<Task<? 
extends Serializable>> rootTasks = ctx.getRootTasks(); // Set the move task to be dependent on the current task if (mvTask != null) { currTask.addDependentTask(mvTask); } // In case of multi-table insert, the path to alias mapping is needed for // all the sources. Since there is no // reducer, treat it as a plan with null reducer // If it is a map-only job, the task needs to be processed if (currTopOp != null) { Task<? extends Serializable> mapTask = opTaskMap.get(null); if (mapTask == null) { assert (!seenOps.contains(currTopOp)); seenOps.add(currTopOp); GenMapRedUtils.setTaskPlan( currAliasId, currTopOp, (MapredWork) currTask.getWork(), false, ctx); opTaskMap.put(null, currTask); rootTasks.add(currTask); } else { if (!seenOps.contains(currTopOp)) { seenOps.add(currTopOp); GenMapRedUtils.setTaskPlan( currAliasId, currTopOp, (MapredWork) mapTask.getWork(), false, ctx); } // mapTask and currTask should be merged by and join/union operator // (e.g., GenMRUnion1j) which has multiple topOps. assert mapTask == currTask : "mapTask.id = " + mapTask.getId() + "; currTask.id = " + currTask.getId(); } return dest; } UnionOperator currUnionOp = ctx.getCurrUnionOp(); if (currUnionOp != null) { opTaskMap.put(null, currTask); GenMapRedUtils.initUnionPlan(ctx, currTask, false); return dest; } AbstractMapJoinOperator<? extends MapJoinDesc> currMapJoinOp = ctx.getCurrMapJoinOp(); if (currMapJoinOp != null) { opTaskMap.put(null, currTask); GenMRMapJoinCtx mjCtx = ctx.getMapJoinCtx(currMapJoinOp); MapredWork plan = (MapredWork) currTask.getWork(); String taskTmpDir = mjCtx.getTaskTmpDir(); TableDesc tt_desc = mjCtx.getTTDesc(); assert plan.getPathToAliases().get(taskTmpDir) == null; plan.getPathToAliases().put(taskTmpDir, new ArrayList<String>()); plan.getPathToAliases().get(taskTmpDir).add(taskTmpDir); plan.getPathToPartitionInfo().put(taskTmpDir, new PartitionDesc(tt_desc, null)); plan.getAliasToWork().put(taskTmpDir, mjCtx.getRootMapJoinOp()); return dest; } return dest; }
public static void mergeMapJoinUnion(UnionOperator union, GenMRProcContext ctx, int pos) throws SemanticException { ParseContext parseCtx = ctx.getParseCtx(); UnionProcContext uCtx = parseCtx.getUCtx(); UnionParseContext uPrsCtx = uCtx.getUnionParseContext(union); assert uPrsCtx != null; Task<? extends Serializable> currTask = ctx.getCurrTask(); GenMRUnionCtx uCtxTask = ctx.getUnionTask(union); Task<? extends Serializable> uTask = null; union.getParentOperators().get(pos); MapredWork uPlan = null; // union is encountered for the first time if (uCtxTask == null) { uCtxTask = new GenMRUnionCtx(); uPlan = GenMapRedUtils.getMapRedWork(parseCtx.getConf()); uTask = TaskFactory.get(uPlan, parseCtx.getConf()); uCtxTask.setUTask(uTask); ctx.setUnionTask(union, uCtxTask); } else { uTask = uCtxTask.getUTask(); uPlan = (MapredWork) uTask.getWork(); } // If there is a mapjoin at position 'pos' if (uPrsCtx.getMapJoinSubq(pos)) { GenMRMapJoinCtx mjCtx = ctx.getMapJoinCtx(ctx.getCurrMapJoinOp()); String taskTmpDir = mjCtx.getTaskTmpDir(); if (uPlan.getPathToAliases().get(taskTmpDir) == null) { uPlan.getPathToAliases().put(taskTmpDir, new ArrayList<String>()); uPlan.getPathToAliases().get(taskTmpDir).add(taskTmpDir); uPlan.getPathToPartitionInfo().put(taskTmpDir, new PartitionDesc(mjCtx.getTTDesc(), null)); uPlan.getAliasToWork().put(taskTmpDir, mjCtx.getRootMapJoinOp()); } for (Task t : currTask.getParentTasks()) { t.addDependentTask(uTask); } try { boolean notDone = true; while (notDone) { for (Task t : currTask.getParentTasks()) { t.removeDependentTask(currTask); } notDone = false; } } catch (ConcurrentModificationException e) { } } else { setTaskPlan(ctx.getCurrAliasId(), ctx.getCurrTopOp(), uPlan, false, ctx); } ctx.setCurrTask(uTask); ctx.setCurrAliasId(null); ctx.setCurrTopOp(null); ctx.setCurrMapJoinOp(null); ctx.getMapCurrCtx().put(union, new GenMapRedCtx(ctx.getCurrTask(), null, null)); }
@SuppressWarnings("nls") /** * Merge the tasks - by creating a temporary file between them. * * @param op reduce sink operator being processed * @param oldTask the parent task * @param task the child task * @param opProcCtx context * @param setReducer does the reducer needs to be set * @param pos position of the parent */ public static void splitTasks( Operator<? extends Serializable> op, Task<? extends Serializable> parentTask, Task<? extends Serializable> childTask, GenMRProcContext opProcCtx, boolean setReducer, boolean local, int posn) throws SemanticException { childTask.getWork(); Operator<? extends Serializable> currTopOp = opProcCtx.getCurrTopOp(); ParseContext parseCtx = opProcCtx.getParseCtx(); parentTask.addDependentTask(childTask); // Root Task cannot depend on any other task, therefore childTask cannot be // a root Task List<Task<? extends Serializable>> rootTasks = opProcCtx.getRootTasks(); if (rootTasks.contains(childTask)) { rootTasks.remove(childTask); } // generate the temporary file Context baseCtx = parseCtx.getContext(); String taskTmpDir = baseCtx.getMRTmpFileURI(); Operator<? extends Serializable> parent = op.getParentOperators().get(posn); TableDesc tt_desc = PlanUtils.getIntermediateFileTableDesc( PlanUtils.getFieldSchemasFromRowSchema(parent.getSchema(), "temporarycol")); // Create a file sink operator for this file name boolean compressIntermediate = parseCtx.getConf().getBoolVar(HiveConf.ConfVars.COMPRESSINTERMEDIATE); FileSinkDesc desc = new FileSinkDesc(taskTmpDir, tt_desc, compressIntermediate); if (compressIntermediate) { desc.setCompressCodec(parseCtx.getConf().getVar(HiveConf.ConfVars.COMPRESSINTERMEDIATECODEC)); desc.setCompressType(parseCtx.getConf().getVar(HiveConf.ConfVars.COMPRESSINTERMEDIATETYPE)); } Operator<? extends Serializable> fs_op = putOpInsertMap(OperatorFactory.get(desc, parent.getSchema()), null, parseCtx); // replace the reduce child with this operator List<Operator<? 
extends Serializable>> childOpList = parent.getChildOperators(); for (int pos = 0; pos < childOpList.size(); pos++) { if (childOpList.get(pos) == op) { childOpList.set(pos, fs_op); break; } } List<Operator<? extends Serializable>> parentOpList = new ArrayList<Operator<? extends Serializable>>(); parentOpList.add(parent); fs_op.setParentOperators(parentOpList); // create a dummy tableScan operator on top of op // TableScanOperator is implicitly created here for each MapOperator RowResolver rowResolver = opProcCtx.getParseCtx().getOpParseCtx().get(parent).getRowResolver(); Operator<? extends Serializable> ts_op = putOpInsertMap( OperatorFactory.get(TableScanDesc.class, parent.getSchema()), rowResolver, parseCtx); childOpList = new ArrayList<Operator<? extends Serializable>>(); childOpList.add(op); ts_op.setChildOperators(childOpList); op.getParentOperators().set(posn, ts_op); Map<Operator<? extends Serializable>, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx(); mapCurrCtx.put(ts_op, new GenMapRedCtx(childTask, null, null)); String streamDesc = taskTmpDir; MapredWork cplan = (MapredWork) childTask.getWork(); if (setReducer) { Operator<? extends Serializable> reducer = op.getChildOperators().get(0); if (reducer.getClass() == JoinOperator.class) { String origStreamDesc; streamDesc = "$INTNAME"; origStreamDesc = streamDesc; int pos = 0; while (cplan.getAliasToWork().get(streamDesc) != null) { streamDesc = origStreamDesc.concat(String.valueOf(++pos)); } } // TODO: Allocate work to remove the temporary files and make that // dependent on the redTask if (reducer.getClass() == JoinOperator.class) { cplan.setNeedsTagging(true); } } // Add the path to alias mapping setTaskPlan(taskTmpDir, streamDesc, ts_op, cplan, local, tt_desc); // This can be cleaned up as a function table in future if (op instanceof AbstractMapJoinOperator<?>) { AbstractMapJoinOperator<? extends MapJoinDesc> mjOp = (AbstractMapJoinOperator<? 
extends MapJoinDesc>) op; opProcCtx.setCurrMapJoinOp(mjOp); GenMRMapJoinCtx mjCtx = opProcCtx.getMapJoinCtx(mjOp); if (mjCtx == null) { mjCtx = new GenMRMapJoinCtx(taskTmpDir, tt_desc, ts_op, null); } else { mjCtx.setTaskTmpDir(taskTmpDir); mjCtx.setTTDesc(tt_desc); mjCtx.setRootMapJoinOp(ts_op); } opProcCtx.setMapJoinCtx(mjOp, mjCtx); opProcCtx.getMapCurrCtx().put(parent, new GenMapRedCtx(childTask, null, null)); setupBucketMapJoinInfo(cplan, mjOp, false); } currTopOp = null; String currAliasId = null; opProcCtx.setCurrTopOp(currTopOp); opProcCtx.setCurrAliasId(currAliasId); opProcCtx.setCurrTask(childTask); }
/** * Merge the current task with the task for the current reducer. * * @param op operator being processed * @param oldTask the old task for the current reducer * @param task the current task for the current reducer * @param opProcCtx processing context * @param pos position of the parent in the stack */ public static void joinPlan( Operator<? extends Serializable> op, Task<? extends Serializable> oldTask, Task<? extends Serializable> task, GenMRProcContext opProcCtx, int pos, boolean split, boolean readMapJoinData, boolean readUnionData, boolean createLocalWork) throws SemanticException { Task<? extends Serializable> currTask = task; MapredWork plan = (MapredWork) currTask.getWork(); Operator<? extends Serializable> currTopOp = opProcCtx.getCurrTopOp(); List<Task<? extends Serializable>> parTasks = null; // terminate the old task and make current task dependent on it if (split) { assert oldTask != null; splitTasks(op, oldTask, currTask, opProcCtx, true, false, 0); } else { if ((oldTask != null) && (oldTask.getParentTasks() != null) && !oldTask.getParentTasks().isEmpty()) { parTasks = new ArrayList<Task<? extends Serializable>>(); parTasks.addAll(oldTask.getParentTasks()); Object[] parTaskArr = parTasks.toArray(); for (Object element : parTaskArr) { ((Task<? extends Serializable>) element).removeDependentTask(oldTask); } } } if (currTopOp != null) { List<Operator<? extends Serializable>> seenOps = opProcCtx.getSeenOps(); String currAliasId = opProcCtx.getCurrAliasId(); if (!seenOps.contains(currTopOp)) { seenOps.add(currTopOp); boolean local = false; if (pos != -1) { local = (pos == ((MapJoinDesc) op.getConf()).getPosBigTable()) ? false : true; } setTaskPlan(currAliasId, currTopOp, plan, local, opProcCtx); if (op instanceof AbstractMapJoinOperator) { setupBucketMapJoinInfo( plan, (AbstractMapJoinOperator<? 
extends MapJoinDesc>) op, createLocalWork); } } currTopOp = null; opProcCtx.setCurrTopOp(currTopOp); } else if (opProcCtx.getCurrMapJoinOp() != null) { AbstractMapJoinOperator<? extends MapJoinDesc> mjOp = opProcCtx.getCurrMapJoinOp(); if (readUnionData) { initUnionPlan(opProcCtx, currTask, false); } else { GenMRMapJoinCtx mjCtx = opProcCtx.getMapJoinCtx(mjOp); // In case of map-join followed by map-join, the file needs to be // obtained from the old map join AbstractMapJoinOperator<? extends MapJoinDesc> oldMapJoin = mjCtx.getOldMapJoin(); String taskTmpDir = null; TableDesc tt_desc = null; Operator<? extends Serializable> rootOp = null; boolean local = ((pos == -1) || (pos == (mjOp.getConf()).getPosBigTable())) ? false : true; if (oldMapJoin == null) { if (opProcCtx.getParseCtx().getListMapJoinOpsNoReducer().contains(mjOp) || local || (oldTask != null) && (parTasks != null)) { taskTmpDir = mjCtx.getTaskTmpDir(); tt_desc = mjCtx.getTTDesc(); rootOp = mjCtx.getRootMapJoinOp(); } } else { GenMRMapJoinCtx oldMjCtx = opProcCtx.getMapJoinCtx(oldMapJoin); assert oldMjCtx != null; taskTmpDir = oldMjCtx.getTaskTmpDir(); tt_desc = oldMjCtx.getTTDesc(); rootOp = oldMjCtx.getRootMapJoinOp(); } setTaskPlan(taskTmpDir, taskTmpDir, rootOp, plan, local, tt_desc); setupBucketMapJoinInfo(plan, oldMapJoin, createLocalWork); } opProcCtx.setCurrMapJoinOp(null); if ((oldTask != null) && (parTasks != null)) { for (Task<? extends Serializable> parTask : parTasks) { parTask.addDependentTask(currTask); if (opProcCtx.getRootTasks().contains(currTask)) { opProcCtx.getRootTasks().remove(currTask); } } } } opProcCtx.setCurrTask(currTask); }
@Override public void analyzeInternal(ASTNode ast) throws SemanticException { isLocal = false; isOverWrite = false; Tree fromTree = ast.getChild(0); Tree tableTree = ast.getChild(1); if (ast.getChildCount() == 4) { isLocal = true; isOverWrite = true; } if (ast.getChildCount() == 3) { if (ast.getChild(2).getText().toLowerCase().equals("local")) { isLocal = true; } else { isOverWrite = true; } } // initialize load path URI fromURI; try { String fromPath = stripQuotes(fromTree.getText()); fromURI = initializeFromURI(fromPath); } catch (IOException e) { throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(fromTree, e.getMessage()), e); } catch (URISyntaxException e) { throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(fromTree, e.getMessage()), e); } // initialize destination table/partition tableSpec ts = new tableSpec(db, conf, (ASTNode) tableTree); if (ts.tableHandle.isOffline()) { throw new SemanticException( ErrorMsg.OFFLINE_TABLE_OR_PARTITION.getMsg(":Table " + ts.tableName)); } if (ts.tableHandle.isView()) { throw new SemanticException(ErrorMsg.DML_AGAINST_VIEW.getMsg()); } if (ts.tableHandle.isNonNative()) { throw new SemanticException(ErrorMsg.LOAD_INTO_NON_NATIVE.getMsg()); } if (ts.tableHandle.isStoredAsSubDirectories()) { throw new SemanticException(ErrorMsg.LOAD_INTO_STORED_AS_DIR.getMsg()); } URI toURI = (ts.partHandle != null) ? ts.partHandle.getDataLocation() : ts.tableHandle.getDataLocation(); List<FieldSchema> parts = ts.tableHandle.getPartitionKeys(); if ((parts != null && parts.size() > 0) && (ts.partSpec == null || ts.partSpec.size() == 0)) { throw new SemanticException(ErrorMsg.NEED_PARTITION_ERROR.getMsg()); } // make sure the arguments make sense applyConstraints(fromURI, toURI, fromTree, isLocal); Task<? extends Serializable> rTask = null; // create copy work if (isLocal) { // if the local keyword is specified - we will always make a copy. 
this // might seem redundant in the case // that the hive warehouse is also located in the local file system - but // that's just a test case. String copyURIStr = ctx.getExternalTmpFileURI(toURI); URI copyURI = URI.create(copyURIStr); rTask = TaskFactory.get(new CopyWork(fromURI.toString(), copyURIStr), conf); fromURI = copyURI; } // create final load/move work String loadTmpPath = ctx.getExternalTmpFileURI(toURI); Map<String, String> partSpec = ts.getPartSpec(); if (partSpec == null) { partSpec = new LinkedHashMap<String, String>(); outputs.add(new WriteEntity(ts.tableHandle)); } else { try { Partition part = Hive.get().getPartition(ts.tableHandle, partSpec, false); if (part != null) { if (part.isOffline()) { throw new SemanticException( ErrorMsg.OFFLINE_TABLE_OR_PARTITION.getMsg(ts.tableName + ":" + part.getName())); } outputs.add(new WriteEntity(part)); } else { outputs.add(new WriteEntity(ts.tableHandle)); } } catch (HiveException e) { throw new SemanticException(e); } } LoadTableDesc loadTableWork = new LoadTableDesc( fromURI.toString(), loadTmpPath, Utilities.getTableDesc(ts.tableHandle), partSpec, isOverWrite); Task<? extends Serializable> childTask = TaskFactory.get(new MoveWork(getInputs(), getOutputs(), loadTableWork, null, true), conf); if (rTask != null) { rTask.addDependentTask(childTask); } else { rTask = childTask; } rootTasks.add(rTask); // The user asked for stats to be collected. // Some stats like number of rows require a scan of the data // However, some other stats, like number of files, do not require a complete scan // Update the stats which do not require a complete scan. Task<? 
extends Serializable> statTask = null; if (conf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) { StatsWork statDesc = new StatsWork(loadTableWork); statDesc.setNoStatsAggregator(true); statDesc.setClearAggregatorStats(true); statDesc.setStatsReliable(conf.getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE)); statTask = TaskFactory.get(statDesc, conf); } // HIVE-3334 has been filed for load file with index auto update if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVEINDEXAUTOUPDATE)) { IndexUpdater indexUpdater = new IndexUpdater(loadTableWork, getInputs(), conf); try { List<Task<? extends Serializable>> indexUpdateTasks = indexUpdater.generateUpdateTasks(); for (Task<? extends Serializable> updateTask : indexUpdateTasks) { // LOAD DATA will either have a copy & move or just a move, // we always want the update to be dependent on the move childTask.addDependentTask(updateTask); if (statTask != null) { updateTask.addDependentTask(statTask); } } } catch (HiveException e) { console.printInfo( "WARNING: could not auto-update stale indexes, indexes are not out of sync"); } } else if (statTask != null) { childTask.addDependentTask(statTask); } }