/**
 * Construct a conditional task given the current leaf task, the MoveWork and the MapredWork.
 *
 * @param conf HiveConf
 * @param currTask current leaf task
 * @param mvWork MoveWork for the move task
 * @param mergeWork MapredWork for the merge task
 * @param inputPath the input directory of the merge/move task
 * @return the conditional task
 */
private ConditionalTask createCondTask(HiveConf conf,
    Task<? extends Serializable> currTask, MoveWork mvWork,
    MapredWork mergeWork, String inputPath) {

  Task<? extends Serializable> mergeTask = TaskFactory.get(mergeWork, conf);
  Task<? extends Serializable> moveTask = TaskFactory.get(mvWork, conf);

  List<Serializable> listWorks = new ArrayList<Serializable>();
  listWorks.add(mvWork);
  listWorks.add(mergeWork);
  ConditionalWork cndWork = new ConditionalWork(listWorks);

  List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>();
  listTasks.add(moveTask);
  listTasks.add(mergeTask);

  ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork, conf);
  cndTsk.setListTasks(listTasks);

  // create resolver
  cndTsk.setResolver(new ConditionalResolverMergeFiles());
  ConditionalResolverMergeFilesCtx mrCtx =
      new ConditionalResolverMergeFilesCtx(listTasks, inputPath);
  cndTsk.setResolverCtx(mrCtx);

  // make the conditional task the child of the current leaf task
  currTask.addDependentTask(cndTsk);

  return cndTsk;
}
/**
 * Construct a conditional task given the current leaf task, the MoveWork and the MapredWork.
 *
 * @param conf HiveConf
 * @param currTask current leaf task
 * @param mvWork MoveWork for the move task
 * @param mergeWork MapredWork for the merge task
 * @param inputPath the input directory of the merge/move task
 * @return the conditional task
 */
private ConditionalTask createCondTask(HiveConf conf,
    Task<? extends Serializable> currTask, MoveWork mvWork,
    MapredWork mergeWork, String inputPath) {

  // There are three options for this ConditionalTask:
  // 1) Merge the partitions
  // 2) Move the partitions (i.e. don't merge the partitions)
  // 3) Merge some partitions and move the others (i.e. merge some partitions and don't
  //    merge the rest). In this case the merge is done first, followed by the move, to
  //    prevent conflicts.
  Task<? extends Serializable> mergeOnlyMergeTask = TaskFactory.get(mergeWork, conf);
  Task<? extends Serializable> moveOnlyMoveTask = TaskFactory.get(mvWork, conf);
  Task<? extends Serializable> mergeAndMoveMergeTask = TaskFactory.get(mergeWork, conf);
  Task<? extends Serializable> mergeAndMoveMoveTask = TaskFactory.get(mvWork, conf);

  // NOTE! It is necessary that the merge task is the parent of the move task, and not
  // the other way around, for the proper execution of ConditionalTask's execute method.
  mergeAndMoveMergeTask.addDependentTask(mergeAndMoveMoveTask);

  List<Serializable> listWorks = new ArrayList<Serializable>();
  listWorks.add(mvWork);
  listWorks.add(mergeWork);
  ConditionalWork cndWork = new ConditionalWork(listWorks);

  List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>();
  listTasks.add(moveOnlyMoveTask);
  listTasks.add(mergeOnlyMergeTask);
  listTasks.add(mergeAndMoveMergeTask);

  ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork, conf);
  cndTsk.setListTasks(listTasks);

  // create resolver
  cndTsk.setResolver(new ConditionalResolverMergeFiles());
  ConditionalResolverMergeFilesCtx mrCtx =
      new ConditionalResolverMergeFilesCtx(listTasks, inputPath);
  cndTsk.setResolverCtx(mrCtx);

  // make the conditional task the child of the current leaf task
  currTask.addDependentTask(cndTsk);

  return cndTsk;
}
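// The resolver wired in above (ConditionalResolverMergeFiles) decides at run time which of the
// listed tasks is actually executed. The sketch below is a simplified illustration of that
// decision, assuming a flat input directory and a caller-supplied size threshold (the real
// resolver reads hive.merge.smallfiles.avgsize and also handles the dynamic-partition case,
// where some partitions are merged and others only moved). The class MergeOrMoveSketch, the
// helper chooseMergeOrMove and the threshold parameter are illustrative only, not Hive APIs.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public final class MergeOrMoveSketch {

  /**
   * Returns true if the files under inputPath are, on average, smaller than the given
   * threshold, i.e. the merge branch of the ConditionalTask should run; false means the
   * plain move branch suffices.
   */
  public static boolean chooseMergeOrMove(Configuration conf, String inputPath,
      long avgSizeThreshold) throws IOException {
    Path dir = new Path(inputPath);
    FileSystem fs = dir.getFileSystem(conf);
    if (!fs.exists(dir)) {
      return false; // nothing to merge
    }
    long totalSize = 0;
    int numFiles = 0;
    for (FileStatus stat : fs.listStatus(dir)) {
      if (!stat.isDir()) {
        totalSize += stat.getLen();
        numFiles++;
      }
    }
    // Merge only when there is more than one file and the average file size is small.
    return numFiles > 1 && (totalSize / numFiles) < avgSizeThreshold;
  }

  private MergeOrMoveSketch() {
  }
}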
/**
 * Make the move task in the GenMRProcContext following the FileSinkOperator a dependent of all
 * possible subtrees branching from the ConditionalTask.
 *
 * @param ctx
 * @param newOutput
 * @param cndTsk
 */
private void linkMoveTask(GenMRProcContext ctx, FileSinkOperator newOutput,
    ConditionalTask cndTsk) {

  List<Task<MoveWork>> mvTasks = ctx.getMvTask();
  Task<MoveWork> mvTask = findMoveTask(mvTasks, newOutput);

  for (Task<? extends Serializable> tsk : cndTsk.getListTasks()) {
    linkMoveTask(ctx, mvTask, tsk);
  }
}
private void LinkMoveTask(GenMRProcContext ctx, FileSinkOperator newOutput,
    ConditionalTask cndTsk) {

  List<Task<? extends Serializable>> mvTasks = ctx.getMvTask();
  Task<? extends Serializable> mvTask = findMoveTask(mvTasks, newOutput);

  if (mvTask != null) {
    for (Task<? extends Serializable> tsk : cndTsk.getListTasks()) {
      tsk.addDependentTask(mvTask);
    }
  }
}
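// The first variant above delegates to an overloaded linkMoveTask(ctx, mvTask, tsk) that is not
// shown in this section. Below is a minimal sketch of what such a helper could look like,
// assuming the intent stated in the javadoc (attach the move task to every leaf of each subtree
// under the ConditionalTask); the actual Hive helper may route through additional bookkeeping,
// so treat this body as illustrative rather than the real implementation.
private void linkMoveTask(GenMRProcContext ctx, Task<MoveWork> mvTask,
    Task<? extends Serializable> task) {
  if (task.getChildTasks() == null || task.getChildTasks().isEmpty()) {
    // leaf of the subtree: make the move task depend on it
    if (mvTask != null) {
      task.addDependentTask(mvTask);
    }
  } else {
    // otherwise recurse into every child task
    for (Task<? extends Serializable> childTask : task.getChildTasks()) {
      linkMoveTask(ctx, mvTask, childTask);
    }
  }
}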
/**
 * Create a Map-only merge job with the following operators:
 *
 * @param fsInput
 * @param ctx
 * @param finalName
 *
 *          MR job J0:
 *          ...
 *          |
 *          v
 *          FileSinkOperator_1 (fsInput)
 *          |
 *          v
 *          Merge job J1:
 *          |
 *          v
 *          TableScan (using CombineHiveInputFormat) (tsMerge)
 *          |
 *          v
 *          FileSinkOperator (fsMerge)
 *
 *          Here the pathToPartitionInfo & pathToAlias will remain the same, which means the
 *          paths do not contain the dynamic partitions (their parent). So after the dynamic
 *          partitions are created (after the first job finishes, before the moveTask or
 *          ConditionalTask starts), we need to change the pathToPartitionInfo & pathToAlias
 *          to include the dynamic partition directories.
 */
private void createMap4Merge(FileSinkOperator fsInput, GenMRProcContext ctx, String finalName) {

  //
  // 1. create the operator tree
  //
  ParseContext parseCtx = ctx.getParseCtx();
  FileSinkDesc fsInputDesc = fsInput.getConf();

  // Create a TableScan operator
  RowSchema inputRS = fsInput.getSchema();
  Operator<? extends Serializable> tsMerge = OperatorFactory.get(TableScanDesc.class, inputRS);

  // Create a FileSink operator
  TableDesc ts = (TableDesc) fsInputDesc.getTableInfo().clone();
  FileSinkDesc fsOutputDesc = new FileSinkDesc(finalName, ts,
      parseCtx.getConf().getBoolVar(HiveConf.ConfVars.COMPRESSRESULT));
  FileSinkOperator fsOutput = (FileSinkOperator) OperatorFactory.getAndMakeChild(
      fsOutputDesc, inputRS, tsMerge);

  // If the input FileSinkOperator is dynamic-partition enabled, the tsMerge input schema
  // needs to include the partition columns, and the fsOutput should have a
  // DynamicPartitionCtx to indicate that it needs to be dynamically partitioned.
  DynamicPartitionCtx dpCtx = fsInputDesc.getDynPartCtx();
  if (dpCtx != null && dpCtx.getNumDPCols() > 0) {
    // adding DP ColumnInfo to the RowSchema signature
    ArrayList<ColumnInfo> signature = inputRS.getSignature();
    String tblAlias = fsInputDesc.getTableInfo().getTableName();
    LinkedHashMap<String, String> colMap = new LinkedHashMap<String, String>();
    StringBuilder partCols = new StringBuilder();
    for (String dpCol : dpCtx.getDPColNames()) {
      ColumnInfo colInfo = new ColumnInfo(dpCol,
          TypeInfoFactory.stringTypeInfo, // all partition column types should be string
          tblAlias, true); // partition column is a virtual column
      signature.add(colInfo);
      colMap.put(dpCol, dpCol); // input and output have the same column name
      partCols.append(dpCol).append('/');
    }
    partCols.setLength(partCols.length() - 1); // remove the last '/'
    inputRS.setSignature(signature);

    // create another DynamicPartitionCtx, which has a different input-to-DP column mapping
    DynamicPartitionCtx dpCtx2 = new DynamicPartitionCtx(dpCtx);
    dpCtx2.setInputToDPCols(colMap);
    fsOutputDesc.setDynPartCtx(dpCtx2);

    // update the FileSinkOperator to include partition columns
    fsInputDesc.getTableInfo().getProperties().setProperty(
        org.apache.hadoop.hive.metastore.api.Constants.META_TABLE_PARTITION_COLUMNS,
        partCols.toString()); // list of dynamic partition column names
  } else {
    // non-partitioned table
    fsInputDesc.getTableInfo().getProperties().remove(
        org.apache.hadoop.hive.metastore.api.Constants.META_TABLE_PARTITION_COLUMNS);
  }

  //
  // 2. Constructing a conditional task consisting of a move task and a map-reduce task
  //
  MapRedTask currTask = (MapRedTask) ctx.getCurrTask();
  MoveWork dummyMv = new MoveWork(null, null, null,
      new LoadFileDesc(fsInputDesc.getDirName(), finalName, true, null, null), false);
  MapredWork cplan = createMergeTask(ctx.getConf(), tsMerge, fsInputDesc);
  // use CombineHiveInputFormat for map-only merging
  cplan.setInputformat("org.apache.hadoop.hive.ql.io.CombineHiveInputFormat");
  // NOTE: we should gather stats in MR1 rather than MR2 at the merge job, since we don't
  // know whether merge MR2 will be triggered at execution time
  ConditionalTask cndTsk = createCondTask(ctx.getConf(), ctx.getCurrTask(), dummyMv, cplan,
      fsInputDesc.getDirName());

  // keep the dynamic partition context in the conditional task resolver context
  ConditionalResolverMergeFilesCtx mrCtx =
      (ConditionalResolverMergeFilesCtx) cndTsk.getResolverCtx();
  mrCtx.setDPCtx(fsInputDesc.getDynPartCtx());

  //
  // 3. add the moveTask as a child of the conditional task
  //
  LinkMoveTask(ctx, fsOutput, cndTsk);
}
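// For orientation, the task graph produced by steps 2 and 3 above can be sketched as follows.
// This diagram is derived from the code in this section, not copied from Hive documentation;
// depending on which createCondTask variant is used, the ConditionalTask lists either two
// branches (move, merge) or three (move only, merge only, merge followed by move):
//
//   currTask (MR job writing fsInput's output)
//       |
//       v
//   ConditionalTask -- resolver picks one branch at run time -->
//       |-- moveTask                       (move only)
//       |-- mergeTask                      (map-only CombineHiveInputFormat merge)
//       '-- mergeTask -> moveTask          (three-way variant only)
//              |
//              v
//   moveTask found via findMoveTask(ctx.getMvTask(), fsOutput), linked to every branch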
/**
 * @param fsInput The FileSink operator.
 * @param ctx The MR processing context.
 * @param finalName the final destination path the merge job should output.
 * @throws SemanticException
 *
 *           Create a Map-only merge job using CombineHiveInputFormat for all partitions with
 *           the following operators:
 *
 *           MR job J0:
 *           ...
 *           |
 *           v
 *           FileSinkOperator_1 (fsInput)
 *           |
 *           v
 *           Merge job J1:
 *           |
 *           v
 *           TableScan (using CombineHiveInputFormat) (tsMerge)
 *           |
 *           v
 *           FileSinkOperator (fsMerge)
 *
 *           Here the pathToPartitionInfo & pathToAlias will remain the same, which means the
 *           paths do not contain the dynamic partitions (their parent). So after the dynamic
 *           partitions are created (after the first job finishes, before the moveTask or
 *           ConditionalTask starts), we need to change the pathToPartitionInfo & pathToAlias
 *           to include the dynamic partition directories.
 */
private void createMRWorkForMergingFiles(FileSinkOperator fsInput,
    GenMRProcContext ctx, String finalName) throws SemanticException {

  //
  // 1. create the operator tree
  //
  HiveConf conf = ctx.getParseCtx().getConf();
  FileSinkDesc fsInputDesc = fsInput.getConf();

  // Create a TableScan operator
  RowSchema inputRS = fsInput.getSchema();
  Operator<? extends OperatorDesc> tsMerge = OperatorFactory.get(TableScanDesc.class, inputRS);

  // Create a FileSink operator
  TableDesc ts = (TableDesc) fsInputDesc.getTableInfo().clone();
  FileSinkDesc fsOutputDesc = new FileSinkDesc(finalName, ts,
      conf.getBoolVar(ConfVars.COMPRESSRESULT));
  FileSinkOperator fsOutput = (FileSinkOperator) OperatorFactory.getAndMakeChild(
      fsOutputDesc, inputRS, tsMerge);

  // If the input FileSinkOperator is dynamic-partition enabled, the tsMerge input schema
  // needs to include the partition columns, and the fsOutput should have a
  // DynamicPartitionCtx to indicate that it needs to be dynamically partitioned.
  DynamicPartitionCtx dpCtx = fsInputDesc.getDynPartCtx();
  if (dpCtx != null && dpCtx.getNumDPCols() > 0) {
    // adding DP ColumnInfo to the RowSchema signature
    ArrayList<ColumnInfo> signature = inputRS.getSignature();
    String tblAlias = fsInputDesc.getTableInfo().getTableName();
    LinkedHashMap<String, String> colMap = new LinkedHashMap<String, String>();
    StringBuilder partCols = new StringBuilder();
    for (String dpCol : dpCtx.getDPColNames()) {
      ColumnInfo colInfo = new ColumnInfo(dpCol,
          TypeInfoFactory.stringTypeInfo, // all partition column types should be string
          tblAlias, true); // partition column is a virtual column
      signature.add(colInfo);
      colMap.put(dpCol, dpCol); // input and output have the same column name
      partCols.append(dpCol).append('/');
    }
    partCols.setLength(partCols.length() - 1); // remove the last '/'
    inputRS.setSignature(signature);

    // create another DynamicPartitionCtx, which has a different input-to-DP column mapping
    DynamicPartitionCtx dpCtx2 = new DynamicPartitionCtx(dpCtx);
    dpCtx2.setInputToDPCols(colMap);
    fsOutputDesc.setDynPartCtx(dpCtx2);

    // update the FileSinkOperator to include partition columns
    fsInputDesc.getTableInfo().getProperties().setProperty(
        org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS,
        partCols.toString()); // list of dynamic partition column names
  } else {
    // non-partitioned table
    fsInputDesc.getTableInfo().getProperties().remove(
        org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS);
  }

  //
  // 2. Constructing a conditional task consisting of a move task and a map-reduce task
  //
  MoveWork dummyMv = new MoveWork(null, null, null,
      new LoadFileDesc(fsInputDesc.getFinalDirName(), finalName, true, null, null), false);
  MapredWork cplan;

  if (conf.getBoolVar(ConfVars.HIVEMERGERCFILEBLOCKLEVEL)
      && fsInputDesc.getTableInfo().getInputFileFormatClass().equals(RCFileInputFormat.class)) {

    // Check if InputFormatClass is valid
    String inputFormatClass = conf.getVar(ConfVars.HIVEMERGERCFILEINPUTFORMATBLOCKLEVEL);
    try {
      Class c = (Class<? extends InputFormat>) Class.forName(inputFormatClass);

      LOG.info("RCFile format- Using block level merge");
      cplan = createBlockMergeTask(fsInputDesc, finalName,
          dpCtx != null && dpCtx.getNumDPCols() > 0,
          RCFileMergeMapper.class, RCFileInputFormat.class,
          RCFileBlockMergeInputFormat.class);
    } catch (ClassNotFoundException e) {
      String msg = "Illegal input format class: " + inputFormatClass;
      throw new SemanticException(msg);
    }

  } else if (conf.getBoolVar(ConfVars.HIVEMERGEORCBLOCKLEVEL)
      && fsInputDesc.getTableInfo().getInputFileFormatClass().equals(OrcInputFormat.class)) {

    // Check if InputFormatClass is valid
    String inputFormatClass = conf.getVar(ConfVars.HIVEMERGEORCINPUTFORMATBLOCKLEVEL);
    try {
      Class c = (Class<? extends InputFormat>) Class.forName(inputFormatClass);

      LOG.info("ORCFile format- Using block level merge");
      cplan = createBlockMergeTask(fsInputDesc, finalName,
          dpCtx != null && dpCtx.getNumDPCols() > 0,
          OrcMergeMapper.class, OrcInputFormat.class,
          OrcBlockMergeInputFormat.class);
    } catch (ClassNotFoundException e) {
      String msg = "Illegal input format class: " + inputFormatClass;
      throw new SemanticException(msg);
    }

  } else {
    cplan = createMRWorkForMergingFiles(conf, tsMerge, fsInputDesc);
    // use CombineHiveInputFormat for map-only merging
  }
  cplan.setInputformat("org.apache.hadoop.hive.ql.io.CombineHiveInputFormat");
  // NOTE: we should gather stats in MR1 rather than MR2 at the merge job, since we don't
  // know whether merge MR2 will be triggered at execution time
  ConditionalTask cndTsk = createCondTask(conf, ctx.getCurrTask(), dummyMv, cplan,
      fsInputDesc.getFinalDirName());

  // keep the dynamic partition context in the conditional task resolver context
  ConditionalResolverMergeFilesCtx mrCtx =
      (ConditionalResolverMergeFilesCtx) cndTsk.getResolverCtx();
  mrCtx.setDPCtx(fsInputDesc.getDynPartCtx());
  mrCtx.setLbCtx(fsInputDesc.getLbCtx());

  //
  // 3. add the moveTask as a child of the conditional task
  //
  linkMoveTask(ctx, fsOutput, cndTsk);
}
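// The two block-level merge branches above repeat the same pattern of loading and validating
// the configured input format class. Below is a hedged sketch of how that check could be
// factored out using only JDK reflection; the helper name loadInputFormatClass is illustrative
// and is not part of Hive's API.
private static Class<? extends InputFormat> loadInputFormatClass(String inputFormatClass)
    throws SemanticException {
  try {
    // asSubclass also rejects classes that do not implement InputFormat, which the
    // unchecked cast in the code above would silently accept
    return Class.forName(inputFormatClass).asSubclass(InputFormat.class);
  } catch (ClassNotFoundException e) {
    throw new SemanticException("Illegal input format class: " + inputFormatClass);
  } catch (ClassCastException e) {
    throw new SemanticException("Illegal input format class: " + inputFormatClass);
  }
}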