/**
 * The pruning needs to preserve the order of columns in the input schema.
 *
 * @param op
 * @param cols
 * @return
 * @throws SemanticException
 */
private static List<String> preserveColumnOrder(
    Operator<? extends OperatorDesc> op, List<String> cols) throws SemanticException {
  RowSchema inputSchema = op.getSchema();
  if (inputSchema != null) {
    ArrayList<String> rs = new ArrayList<String>();
    ArrayList<ColumnInfo> inputCols = inputSchema.getSignature();
    for (ColumnInfo i : inputCols) {
      if (cols.contains(i.getInternalName())) {
        rs.add(i.getInternalName());
      }
    }
    return rs;
  } else {
    return cols;
  }
}
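// Illustration only (not part of the Hive code above): a minimal, self-contained sketch of
// the same pruning idea over plain String lists. Only the requested columns are kept, but
// they are emitted in schema order rather than request order. All names here are hypothetical.
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class ColumnOrderDemo {

  // Returns the subset of schemaColumns that also appears in requestedColumns,
  // preserving the schema's order (mirrors preserveColumnOrder above).
  static List<String> pruneInSchemaOrder(List<String> schemaColumns, List<String> requestedColumns) {
    Set<String> requested = new HashSet<>(requestedColumns); // O(1) membership checks
    List<String> result = new ArrayList<>();
    for (String col : schemaColumns) {
      if (requested.contains(col)) {
        result.add(col);
      }
    }
    return result;
  }

  public static void main(String[] args) {
    List<String> schema = List.of("_col0", "_col1", "_col2", "_col3");
    List<String> wanted = List.of("_col3", "_col1");
    // Prints [_col1, _col3]: schema order wins, not request order.
    System.out.println(pruneInSchemaOrder(schema, wanted));
  }
}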
// insert filter operator between target (child) and input (parent)
private Operator<FilterDesc> createFilter(
    Operator<?> target, Operator<?> parent, RowSchema parentRS, ExprNodeDesc filterExpr) {
  Operator<FilterDesc> filter =
      OperatorFactory.get(
          new FilterDesc(filterExpr, false), new RowSchema(parentRS.getSignature()));
  filter.getParentOperators().add(parent);
  filter.getChildOperators().add(target);
  parent.replaceChild(target, filter);
  target.replaceParent(parent, filter);
  return filter;
}
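// Illustration only (not the Hive Operator API): the wiring pattern above -- point the new
// node at both neighbours, then repoint each neighbour at the new node -- shown with a
// hypothetical, self-contained Node class that just keeps parent/child lists.
import java.util.ArrayList;
import java.util.List;

public class InsertBetweenDemo {

  // Hypothetical stand-in for an operator with parent/child bookkeeping.
  static class Node {
    final String name;
    final List<Node> parents = new ArrayList<>();
    final List<Node> children = new ArrayList<>();

    Node(String name) { this.name = name; }

    void replaceChild(Node oldChild, Node newChild) {
      children.set(children.indexOf(oldChild), newChild);
    }

    void replaceParent(Node oldParent, Node newParent) {
      parents.set(parents.indexOf(oldParent), newParent);
    }
  }

  // Mirrors createFilter: wire the inserted node first, then fix up both neighbours.
  static Node insertBetween(Node parent, Node target, Node inserted) {
    inserted.parents.add(parent);
    inserted.children.add(target);
    parent.replaceChild(target, inserted);
    target.replaceParent(parent, inserted);
    return inserted;
  }

  public static void main(String[] args) {
    Node scan = new Node("TS");
    Node sink = new Node("FS");
    Node filter = new Node("FIL");
    scan.children.add(sink);
    sink.parents.add(scan);
    insertBetween(scan, sink, filter);
    // Both neighbours now point at FIL.
    System.out.println(scan.children.get(0).name + " / " + sink.parents.get(0).name);
  }
}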
private List<Integer> getPartitionPositions(DynamicPartitionCtx dpCtx, RowSchema schema) {
  int numPartCols = dpCtx.getNumDPCols();
  int numCols = schema.getSignature().size();
  List<Integer> partPos = Lists.newArrayList();

  // partition columns are always at the end of the schema
  for (int i = numCols - numPartCols; i < numCols; i++) {
    partPos.add(i);
  }
  return partPos;
}
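// Illustration only: a quick standalone check of the index arithmetic above. Assuming the
// dynamic partition columns sit at the tail of the schema, the positions are simply the
// last numPartCols indices.
import java.util.ArrayList;
import java.util.List;

public class PartitionPositionsDemo {

  // Same arithmetic as getPartitionPositions, over plain ints.
  static List<Integer> trailingPositions(int numCols, int numPartCols) {
    List<Integer> positions = new ArrayList<>();
    for (int i = numCols - numPartCols; i < numCols; i++) {
      positions.add(i);
    }
    return positions;
  }

  public static void main(String[] args) {
    // 6 columns, last 2 are dynamic partition columns -> [4, 5]
    System.out.println(trailingPositions(6, 2));
  }
}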
public int getPosition(String internalName) {
  int pos = -1;
  for (ColumnInfo var : rowSchema.getSignature()) {
    ++pos;
    if (var.getInternalName().equals(internalName)) {
      return pos;
    }
  }
  return -1;
}
/**
 * Create a Map-only merge job with the following operators:
 *
 *   MR job J0:
 *     ...
 *     |
 *     v
 *   FileSinkOperator_1 (fsInput)
 *     |
 *     v
 *   Merge job J1:
 *     |
 *     v
 *   TableScan (using CombineHiveInputFormat) (tsMerge)
 *     |
 *     v
 *   FileSinkOperator (fsMerge)
 *
 * <p>Here the pathToPartitionInfo & pathToAlias will remain the same, which means the paths
 * do not contain the dynamic partitions (their parent). So after the dynamic partitions are
 * created (after the first job finishes, before the moveTask or ConditionalTask starts), we
 * need to change the pathToPartitionInfo & pathToAlias to include the dynamic partition
 * directories.
 *
 * @param fsInput
 * @param ctx
 * @param finalName
 */
private void createMap4Merge(FileSinkOperator fsInput, GenMRProcContext ctx, String finalName) {
  //
  // 1. create the operator tree
  //
  ParseContext parseCtx = ctx.getParseCtx();
  FileSinkDesc fsInputDesc = fsInput.getConf();

  // Create a TableScan operator
  RowSchema inputRS = fsInput.getSchema();
  Operator<? extends Serializable> tsMerge = OperatorFactory.get(TableScanDesc.class, inputRS);

  // Create a FileSink operator
  TableDesc ts = (TableDesc) fsInputDesc.getTableInfo().clone();
  FileSinkDesc fsOutputDesc =
      new FileSinkDesc(
          finalName, ts, parseCtx.getConf().getBoolVar(HiveConf.ConfVars.COMPRESSRESULT));
  FileSinkOperator fsOutput =
      (FileSinkOperator) OperatorFactory.getAndMakeChild(fsOutputDesc, inputRS, tsMerge);

  // If the input FileSinkOperator is dynamic partition enabled, the tsMerge input schema
  // needs to include the partition columns, and the fsOutput should have a
  // DynamicPartitionCtx to indicate that it needs to be dynamically partitioned.
  DynamicPartitionCtx dpCtx = fsInputDesc.getDynPartCtx();
  if (dpCtx != null && dpCtx.getNumDPCols() > 0) {
    // add the DP ColumnInfo to the RowSchema signature
    ArrayList<ColumnInfo> signature = inputRS.getSignature();
    String tblAlias = fsInputDesc.getTableInfo().getTableName();
    LinkedHashMap<String, String> colMap = new LinkedHashMap<String, String>();
    StringBuilder partCols = new StringBuilder();
    for (String dpCol : dpCtx.getDPColNames()) {
      ColumnInfo colInfo =
          new ColumnInfo(
              dpCol,
              TypeInfoFactory.stringTypeInfo, // all partition column types should be string
              tblAlias,
              true); // partition column is a virtual column
      signature.add(colInfo);
      colMap.put(dpCol, dpCol); // input and output have the same column name
      partCols.append(dpCol).append('/');
    }
    partCols.setLength(partCols.length() - 1); // remove the last '/'
    inputRS.setSignature(signature);

    // create another DynamicPartitionCtx, which has a different input-to-DP column mapping
    DynamicPartitionCtx dpCtx2 = new DynamicPartitionCtx(dpCtx);
    dpCtx2.setInputToDPCols(colMap);
    fsOutputDesc.setDynPartCtx(dpCtx2);

    // update the FileSinkOperator to include partition columns
    fsInputDesc
        .getTableInfo()
        .getProperties()
        .setProperty(
            org.apache.hadoop.hive.metastore.api.Constants.META_TABLE_PARTITION_COLUMNS,
            partCols.toString()); // list of dynamic partition column names
  } else {
    // non-partitioned table
    fsInputDesc
        .getTableInfo()
        .getProperties()
        .remove(org.apache.hadoop.hive.metastore.api.Constants.META_TABLE_PARTITION_COLUMNS);
  }

  //
  // 2. Construct a conditional task consisting of a move task and a map-reduce task
  //
  MapRedTask currTask = (MapRedTask) ctx.getCurrTask();
  MoveWork dummyMv =
      new MoveWork(
          null,
          null,
          null,
          new LoadFileDesc(fsInputDesc.getDirName(), finalName, true, null, null),
          false);
  MapredWork cplan = createMergeTask(ctx.getConf(), tsMerge, fsInputDesc);
  // use CombineHiveInputFormat for map-only merging
  cplan.setInputformat("org.apache.hadoop.hive.ql.io.CombineHiveInputFormat");
  // NOTE: we should gather stats in MR1 rather than MR2 (the merge job) since we don't
  // know if merge MR2 will be triggered at execution time
  ConditionalTask cndTsk =
      createCondTask(ctx.getConf(), ctx.getCurrTask(), dummyMv, cplan, fsInputDesc.getDirName());

  // keep the dynamic partition context in the conditional task resolver context
  ConditionalResolverMergeFilesCtx mrCtx =
      (ConditionalResolverMergeFilesCtx) cndTsk.getResolverCtx();
  mrCtx.setDPCtx(fsInputDesc.getDynPartCtx());

  //
  // 3. add the moveTask as the child of the conditional task
  //
  LinkMoveTask(ctx, fsOutput, cndTsk);
}
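// Illustration only: the ConditionalTask built above defers the merge-vs-move decision to
// run time. As a rough, standalone sketch of the assumed decision (cf.
// hive.merge.smallfiles.avgsize; this is NOT the actual ConditionalResolverMergeFiles
// implementation, and the threshold parameter is hypothetical): merge first only if the
// average output file size is small.
import java.io.File;

public class MergeDecisionSketch {

  // Assumed logic, simplified: compute the average size of the files in the output
  // directory; if it is below the threshold, the merge job is worth running before the
  // final move, otherwise a plain move suffices.
  static boolean shouldMerge(File outputDir, long avgSizeThreshold) {
    File[] files = outputDir.listFiles(File::isFile);
    if (files == null || files.length == 0) {
      return false;
    }
    long total = 0;
    for (File f : files) {
      total += f.length();
    }
    return (total / files.length) < avgSizeThreshold;
  }

  public static void main(String[] args) {
    File dir = new File(args.length > 0 ? args[0] : ".");
    System.out.println("merge first? " + shouldMerge(dir, 16 * 1024 * 1024L));
  }
}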
private void createMapReduce4Merge(FileSinkOperator fsOp, GenMRProcContext ctx, String finalName)
    throws SemanticException {
  Task<? extends Serializable> currTask = ctx.getCurrTask();
  RowSchema inputRS = fsOp.getSchema();

  // create a ReduceSink operator - the key is the first column
  ArrayList<ExprNodeDesc> keyCols = new ArrayList<ExprNodeDesc>();
  keyCols.add(TypeCheckProcFactory.DefaultExprProcessor.getFuncExprNodeDesc("rand"));

  // the value is all the columns in the FileSink operator input
  ArrayList<ExprNodeDesc> valueCols = new ArrayList<ExprNodeDesc>();
  for (ColumnInfo ci : inputRS.getSignature()) {
    valueCols.add(
        new ExprNodeColumnDesc(
            ci.getType(), ci.getInternalName(), ci.getTabAlias(), ci.getIsVirtualCol()));
  }

  // create a dummy TableScan operator
  Operator<? extends Serializable> tsMerge = OperatorFactory.get(TableScanDesc.class, inputRS);

  ArrayList<String> outputColumns = new ArrayList<String>();
  for (int i = 0; i < valueCols.size(); i++) {
    outputColumns.add(SemanticAnalyzer.getColumnInternalName(i));
  }

  ReduceSinkDesc rsDesc =
      PlanUtils.getReduceSinkDesc(
          new ArrayList<ExprNodeDesc>(), valueCols, outputColumns, false, -1, -1, -1);
  OperatorFactory.getAndMakeChild(rsDesc, inputRS, tsMerge);
  ParseContext parseCtx = ctx.getParseCtx();
  FileSinkDesc fsConf = fsOp.getConf();

  // Add the extract operator to get the value fields
  RowResolver out_rwsch = new RowResolver();
  RowResolver interim_rwsch = ctx.getParseCtx().getOpParseCtx().get(fsOp).getRowResolver();
  Integer pos = Integer.valueOf(0);
  for (ColumnInfo colInfo : interim_rwsch.getColumnInfos()) {
    String[] info = interim_rwsch.reverseLookup(colInfo.getInternalName());
    out_rwsch.put(
        info[0],
        info[1],
        new ColumnInfo(
            pos.toString(),
            colInfo.getType(),
            info[0],
            colInfo.getIsVirtualCol(),
            colInfo.isHiddenVirtualCol()));
    pos = Integer.valueOf(pos.intValue() + 1);
  }

  Operator<ExtractDesc> extract =
      OperatorFactory.getAndMakeChild(
          new ExtractDesc(
              new ExprNodeColumnDesc(
                  TypeInfoFactory.stringTypeInfo,
                  Utilities.ReduceField.VALUE.toString(),
                  "",
                  false)),
          new RowSchema(out_rwsch.getColumnInfos()));

  TableDesc ts = (TableDesc) fsConf.getTableInfo().clone();
  fsConf
      .getTableInfo()
      .getProperties()
      .remove(org.apache.hadoop.hive.metastore.api.Constants.META_TABLE_PARTITION_COLUMNS);

  FileSinkDesc newFSD =
      new FileSinkDesc(
          finalName, ts, parseCtx.getConf().getBoolVar(HiveConf.ConfVars.COMPRESSRESULT));
  FileSinkOperator newOutput =
      (FileSinkOperator) OperatorFactory.getAndMakeChild(newFSD, inputRS, extract);

  HiveConf conf = parseCtx.getConf();
  MapredWork cplan = createMergeTask(conf, tsMerge, fsConf);
  cplan.setReducer(extract);

  // NOTE: we should gather stats in MR1 (rather than the merge MR job)
  // since it is unknown if the merge MR will be triggered at execution time.
  MoveWork dummyMv =
      new MoveWork(
          null,
          null,
          null,
          new LoadFileDesc(fsConf.getDirName(), finalName, true, null, null),
          false);
  ConditionalTask cndTsk = createCondTask(conf, currTask, dummyMv, cplan, fsConf.getDirName());

  LinkMoveTask(ctx, newOutput, cndTsk);
}
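// Illustration only: keying the ReduceSink on rand() above simply spreads rows uniformly
// across reducers so the merge job can rewrite the data into fewer, larger files. The
// distribution idea in isolation, with a hypothetical row count and reducer count:
import java.util.Random;

public class RandomShuffleDemo {
  public static void main(String[] args) {
    int numReducers = 4;
    int[] rowsPerReducer = new int[numReducers];
    Random rand = new Random(42); // fixed seed so the run is reproducible

    // Each row gets a random key; the reducer is derived from that key, so rows end up
    // roughly evenly spread regardless of their content.
    for (int row = 0; row < 1_000; row++) {
      int reducer = Math.floorMod(Double.hashCode(rand.nextDouble()), numReducers);
      rowsPerReducer[reducer]++;
    }
    for (int r = 0; r < numReducers; r++) {
      System.out.println("reducer " + r + ": " + rowsPerReducer[r] + " rows");
    }
  }
}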
/**
 * @param fsInput The FileSink operator.
 * @param ctx The MR processing context.
 * @param finalName the final destination path the merge job should output.
 * @throws SemanticException
 *
 * <p>Create a Map-only merge job using CombineHiveInputFormat for all partitions with the
 * following operators:
 *
 *   MR job J0:
 *     ...
 *     |
 *     v
 *   FileSinkOperator_1 (fsInput)
 *     |
 *     v
 *   Merge job J1:
 *     |
 *     v
 *   TableScan (using CombineHiveInputFormat) (tsMerge)
 *     |
 *     v
 *   FileSinkOperator (fsMerge)
 *
 * <p>Here the pathToPartitionInfo & pathToAlias will remain the same, which means the paths
 * do not contain the dynamic partitions (their parent). So after the dynamic partitions are
 * created (after the first job finishes, before the moveTask or ConditionalTask starts), we
 * need to change the pathToPartitionInfo & pathToAlias to include the dynamic partition
 * directories.
 */
private void createMRWorkForMergingFiles(
    FileSinkOperator fsInput, GenMRProcContext ctx, String finalName) throws SemanticException {

  //
  // 1. create the operator tree
  //
  HiveConf conf = ctx.getParseCtx().getConf();
  FileSinkDesc fsInputDesc = fsInput.getConf();

  // Create a TableScan operator
  RowSchema inputRS = fsInput.getSchema();
  Operator<? extends OperatorDesc> tsMerge = OperatorFactory.get(TableScanDesc.class, inputRS);

  // Create a FileSink operator
  TableDesc ts = (TableDesc) fsInputDesc.getTableInfo().clone();
  FileSinkDesc fsOutputDesc =
      new FileSinkDesc(finalName, ts, conf.getBoolVar(ConfVars.COMPRESSRESULT));
  FileSinkOperator fsOutput =
      (FileSinkOperator) OperatorFactory.getAndMakeChild(fsOutputDesc, inputRS, tsMerge);

  // If the input FileSinkOperator is dynamic partition enabled, the tsMerge input schema
  // needs to include the partition columns, and the fsOutput should have a
  // DynamicPartitionCtx to indicate that it needs to be dynamically partitioned.
  DynamicPartitionCtx dpCtx = fsInputDesc.getDynPartCtx();
  if (dpCtx != null && dpCtx.getNumDPCols() > 0) {
    // add the DP ColumnInfo to the RowSchema signature
    ArrayList<ColumnInfo> signature = inputRS.getSignature();
    String tblAlias = fsInputDesc.getTableInfo().getTableName();
    LinkedHashMap<String, String> colMap = new LinkedHashMap<String, String>();
    StringBuilder partCols = new StringBuilder();
    for (String dpCol : dpCtx.getDPColNames()) {
      ColumnInfo colInfo =
          new ColumnInfo(
              dpCol,
              TypeInfoFactory.stringTypeInfo, // all partition column types should be string
              tblAlias,
              true); // partition column is a virtual column
      signature.add(colInfo);
      colMap.put(dpCol, dpCol); // input and output have the same column name
      partCols.append(dpCol).append('/');
    }
    partCols.setLength(partCols.length() - 1); // remove the last '/'
    inputRS.setSignature(signature);

    // create another DynamicPartitionCtx, which has a different input-to-DP column mapping
    DynamicPartitionCtx dpCtx2 = new DynamicPartitionCtx(dpCtx);
    dpCtx2.setInputToDPCols(colMap);
    fsOutputDesc.setDynPartCtx(dpCtx2);

    // update the FileSinkOperator to include partition columns
    fsInputDesc
        .getTableInfo()
        .getProperties()
        .setProperty(
            org.apache.hadoop.hive.metastore.api.hive_metastoreConstants
                .META_TABLE_PARTITION_COLUMNS,
            partCols.toString()); // list of dynamic partition column names
  } else {
    // non-partitioned table
    fsInputDesc
        .getTableInfo()
        .getProperties()
        .remove(
            org.apache.hadoop.hive.metastore.api.hive_metastoreConstants
                .META_TABLE_PARTITION_COLUMNS);
  }

  //
  // 2. Construct a conditional task consisting of a move task and a map-reduce task
  //
  MoveWork dummyMv =
      new MoveWork(
          null,
          null,
          null,
          new LoadFileDesc(fsInputDesc.getFinalDirName(), finalName, true, null, null),
          false);
  MapredWork cplan;

  if (conf.getBoolVar(ConfVars.HIVEMERGERCFILEBLOCKLEVEL)
      && fsInputDesc.getTableInfo().getInputFileFormatClass().equals(RCFileInputFormat.class)) {

    // Check if InputFormatClass is valid
    String inputFormatClass = conf.getVar(ConfVars.HIVEMERGERCFILEINPUTFORMATBLOCKLEVEL);
    try {
      Class c = (Class<? extends InputFormat>) Class.forName(inputFormatClass);

      LOG.info("RCFile format - using block level merge");
      cplan =
          createBlockMergeTask(
              fsInputDesc,
              finalName,
              dpCtx != null && dpCtx.getNumDPCols() > 0,
              RCFileMergeMapper.class,
              RCFileInputFormat.class,
              RCFileBlockMergeInputFormat.class);
    } catch (ClassNotFoundException e) {
      String msg = "Illegal input format class: " + inputFormatClass;
      throw new SemanticException(msg);
    }
  } else if (conf.getBoolVar(ConfVars.HIVEMERGEORCBLOCKLEVEL)
      && fsInputDesc.getTableInfo().getInputFileFormatClass().equals(OrcInputFormat.class)) {

    // Check if InputFormatClass is valid
    String inputFormatClass = conf.getVar(ConfVars.HIVEMERGEORCINPUTFORMATBLOCKLEVEL);
    try {
      Class c = (Class<? extends InputFormat>) Class.forName(inputFormatClass);

      LOG.info("ORC format - using block level merge");
      cplan =
          createBlockMergeTask(
              fsInputDesc,
              finalName,
              dpCtx != null && dpCtx.getNumDPCols() > 0,
              OrcMergeMapper.class,
              OrcInputFormat.class,
              OrcBlockMergeInputFormat.class);
    } catch (ClassNotFoundException e) {
      String msg = "Illegal input format class: " + inputFormatClass;
      throw new SemanticException(msg);
    }
  } else {
    cplan = createMRWorkForMergingFiles(conf, tsMerge, fsInputDesc);
    // use CombineHiveInputFormat for map-only merging
  }
  cplan.setInputformat("org.apache.hadoop.hive.ql.io.CombineHiveInputFormat");
  // NOTE: we should gather stats in MR1 rather than MR2 (the merge job) since we don't
  // know if merge MR2 will be triggered at execution time
  ConditionalTask cndTsk =
      createCondTask(conf, ctx.getCurrTask(), dummyMv, cplan, fsInputDesc.getFinalDirName());

  // keep the dynamic partition context in the conditional task resolver context
  ConditionalResolverMergeFilesCtx mrCtx =
      (ConditionalResolverMergeFilesCtx) cndTsk.getResolverCtx();
  mrCtx.setDPCtx(fsInputDesc.getDynPartCtx());
  mrCtx.setLbCtx(fsInputDesc.getLbCtx());

  //
  // 3. add the moveTask as the child of the conditional task
  //
  linkMoveTask(ctx, fsOutput, cndTsk);
}
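// Illustration only: the try/catch around Class.forName above is a plain "validate a
// configured class name eagerly" pattern, so a bad configuration fails at planning time
// rather than at execution time. A minimal standalone version (the class-name strings
// below are just example values):
public class ClassNameCheckDemo {

  // Resolve the configured class or fail loudly with the offending name.
  static Class<?> resolveOrFail(String className) {
    try {
      return Class.forName(className);
    } catch (ClassNotFoundException e) {
      throw new IllegalArgumentException("Illegal input format class: " + className, e);
    }
  }

  public static void main(String[] args) {
    System.out.println(resolveOrFail("java.lang.String").getName()); // resolves fine
    try {
      resolveOrFail("com.example.MissingFormat"); // hypothetical class, will fail
    } catch (IllegalArgumentException e) {
      System.out.println(e.getMessage());
    }
  }
}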
// main work
@SuppressWarnings("unchecked")
public static void analyzeHelper(Operator sinkOp, int level) {

  println(level, sinkOp.getClass());

  if (sinkOp instanceof TableScanOperator) {
    // System.out.println("=========== " +
    //     opParseCtx.get(sinkOp).getRowResolver().tableOriginalName);
    // System.out.println("========= " + ((TableScanOperator) sinkOp).getNeededColumnIDs());
    // System.out.println("========= " + ((TableScanOperator) sinkOp).getNeededColumns());
    // System.out.println("======Table Desc " + ((TableScanOperator) sinkOp).getTableDesc());
    // System.out.println(qb.getTabNameForAlias("a"));
    // System.out.println(qb.getTabNameForAlias("b"));
  }

  println(level, "Column Expr Map: ");
  Map<String, ExprNodeDesc> map = sinkOp.getColumnExprMap();
  if (map != null && map.entrySet() != null) {
    for (Entry<String, ExprNodeDesc> entry : map.entrySet()) {
      if (entry.getValue() instanceof ExprNodeColumnDesc) {
        println(
            level,
            entry.getKey()
                + ": "
                + ((ExprNodeColumnDesc) entry.getValue()).getTabAlias()
                + ((ExprNodeColumnDesc) entry.getValue()).getCols());
      } else if (entry.getValue() instanceof ExprNodeConstantDesc) {
        println(
            level,
            entry.getKey()
                + ":: "
                + ((ExprNodeConstantDesc) entry.getValue()).getExprString());
        // + ((ExprNodeConstantDesc) entry.getValue()).getCols());
      } else {
        println(level, entry.getValue().getExprString());
        // throw new RuntimeException("ExprNode type is not supported!");
      }
    }
  }

  println(level, "Schema: ");
  RowSchema schema = sinkOp.getSchema();
  for (ColumnInfo info : schema.getSignature()) {
    println(level, info.getTabAlias() + "[" + info.getInternalName() + "]");
  }

  if (sinkOp instanceof JoinOperator) {
    // println(level, ((JoinOperator) sinkOp).getPosToAliasMap());
    // println(level, "Reversed Mapping: " + ((JoinOperator) sinkOp).getConf().getReversedExprs());
    // println(level, ((JoinOperator) sinkOp).getConf());
    // for (ExprNodeDesc nodeDesc : ((JoinOperator) sinkOp).getConf().getExprs()) {}
    // println(level, ((JoinOperator) sinkOp).getColumnExprMap());
    // for exprs
    /*
    for (List<ExprNodeDesc> lst : ((JoinOperator) sinkOp).getConf().getExprs().values()) {
      printLevel(level);
      for (ExprNodeDesc desc : lst) {
        print(((ExprNodeColumnDesc) desc).getTabAlias() + " "
            + ((ExprNodeColumnDesc) desc).getCols());
      }
      println();
    }

    // for filters
    for (List<ExprNodeDesc> lst : ((JoinOperator) sinkOp).getConf().getFilters().values()) {
      printLevel(level);
      // print(((JoinOperator) sinkOp).getConf().getFilters());
      for (ExprNodeDesc desc : lst) {
        print(desc.getClass() + " ");
        // print(((ExprNodeColumnDesc) desc).getTabAlias() + " "
        //     + ((ExprNodeColumnDesc) desc).getCols());
      }
      println();
    }

    println(level, "output");
    println(level, ((JoinOperator) sinkOp).getConf().getOutputColumnNames());
    */
    // println(level, ((JoinOperator) sinkOp).getConf().getExprsStringMap());
  }

  if (sinkOp instanceof ReduceSinkOperator) {
    // println(level, ((ReduceSinkOperator) sinkOp).getConf().getOutputKeyColumnNames());
    /*
    for (ExprNodeDesc desc : ((ReduceSinkOperator) sinkOp).getConf().getValueCols()) {
      println(level, ((ExprNodeColumnDesc) desc).getTabAlias() + " "
          + ((ExprNodeColumnDesc) desc).getCols());
    }
    */
  }

  if (sinkOp instanceof SelectOperator) {
    /*
    for (ExprNodeDesc desc : ((SelectOperator) sinkOp).getConf().getColList()) {
      println(level, ((ExprNodeColumnDesc) desc).getTabAlias() + " "
          + ((ExprNodeColumnDesc) desc).getCols());
    }
    */
    // println(level, ((SelectOperator) sinkOp).getConf().getColList());
    // println(level, ((SelectOperator) sinkOp).getConf().getOutputColumnNames());
  }

  if (sinkOp instanceof TableScanOperator) {
    // TableScanDesc desc = ((TableScanOperator) sinkOp).getConf();
    // println(level, desc.getAlias());
    // println(level, desc.getFilterExpr());
    // println(level, desc.getBucketFileNameMapping());
    // println(level, desc.getVirtualCols());
    // println(level, desc.getPartColumns());
  }

  if (sinkOp instanceof FilterOperator) {
    println(level, ((FilterOperator) sinkOp).getConf().getPredicate().getExprString());
    // ExprNodeDesc desc = ((FilterOperator) sinkOp).getConf().getPredicate();
    // (ExprNodeGenericFuncDesc) ((FilterOperator) sinkOp).getConf().getPredicate()
    // println(level, ((ExprNodeGenericFuncDesc) desc).getExprString());
    // println(level, ((ExprNodeGenericFuncDesc) desc).getCols());
  }

  if (sinkOp instanceof LimitOperator) {
    println(level, ((LimitOperator) sinkOp).getConf().getClass());
    // ExprNodeDesc desc = ((FilterOperator) sinkOp).getConf().getPredicate();
    // (ExprNodeGenericFuncDesc) ((FilterOperator) sinkOp).getConf().getPredicate()
    // println(level, ((ExprNodeGenericFuncDesc) desc).getExprString());
    // println(level, ((ExprNodeGenericFuncDesc) desc).getCols());
  }

  List<Operator> lst = sinkOp.getParentOperators();
  if (lst != null) {
    for (Operator l : lst) {
      analyzeHelper(l, level + 1);
    }
  }
}
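// Illustration only: analyzeHelper is an indentation-based recursive dump of the operator
// DAG, walking parent links. The traversal skeleton, stripped of the Hive-specific printing
// and using a hypothetical Node type:
import java.util.Arrays;
import java.util.List;

public class TreeDumpDemo {

  // Hypothetical node with a label and parent links, standing in for an Operator.
  static class Node {
    final String label;
    final List<Node> parents;

    Node(String label, Node... parents) {
      this.label = label;
      this.parents = Arrays.asList(parents);
    }
  }

  // Same shape as analyzeHelper: print this node at the current level,
  // then recurse into each parent one level deeper.
  static void dump(Node node, int level) {
    System.out.println("  ".repeat(level) + node.label);
    for (Node parent : node.parents) {
      dump(parent, level + 1);
    }
  }

  public static void main(String[] args) {
    Node ts = new Node("TS");
    Node fil = new Node("FIL", ts);
    Node sel = new Node("SEL", fil);
    Node fs = new Node("FS", sel);
    dump(fs, 0); // prints FS, SEL, FIL, TS with increasing indentation
  }
}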
public void put(String tab_alias, String col_alias, ColumnInfo colInfo) {
  if (!addMappingOnly(tab_alias, col_alias, colInfo)) {
    rowSchema.getSignature().add(colInfo);
  }
}
public ArrayList<ColumnInfo> getColumnInfos() {
  return rowSchema.getSignature();
}
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs)
    throws SemanticException {

  // introduce RS and EX before FS. If the operator tree already contains an
  // RS then the ReduceSinkDeDuplication optimization should merge them
  FileSinkOperator fsOp = (FileSinkOperator) nd;

  LOG.info("Sorted dynamic partitioning optimization kicked in..");

  // if not dynamic partitioning then bail out
  if (fsOp.getConf().getDynPartCtx() == null) {
    LOG.debug(
        "Bailing out of sort dynamic partition optimization as dynamic partitioning context is null");
    return null;
  }

  // if list bucketing then bail out
  ListBucketingCtx lbCtx = fsOp.getConf().getLbCtx();
  if (lbCtx != null
      && !lbCtx.getSkewedColNames().isEmpty()
      && !lbCtx.getSkewedColValues().isEmpty()) {
    LOG.debug("Bailing out of sort dynamic partition optimization as list bucketing is enabled");
    return null;
  }

  Table destTable = fsOp.getConf().getTable();
  if (destTable == null) {
    LOG.debug("Bailing out of sort dynamic partition optimization as destination table is null");
    return null;
  }

  // unlink connection between FS and its parent
  Operator<? extends OperatorDesc> fsParent = fsOp.getParentOperators().get(0);

  // if all dp columns got constant folded then disable this optimization
  if (allStaticPartitions(fsParent, fsOp.getConf().getDynPartCtx())) {
    LOG.debug(
        "Bailing out of sorted dynamic partition optimizer as all dynamic partition"
            + " columns got constant folded (static partitioning)");
    return null;
  }

  // If an RS was inserted by enforce bucketing or sorting, we need to remove it, since
  // ReduceSinkDeDuplication will not merge it with our RS into a single RS.
  // An RS inserted by enforce bucketing/sorting has the bucketing column in its
  // reduce sink key, whereas the RS inserted by this optimization has
  // partition columns followed by the bucket number followed by sort columns in
  // its reduce sink key. Since neither key column list is a prefix of the other,
  // ReduceSinkDeDuplication will not merge them, resulting in 2 MR jobs.
  // To avoid that we remove the RS (and EX) inserted by enforce bucketing/sorting.
  if (!removeRSInsertedByEnforceBucketing(fsOp)) {
    LOG.debug(
        "Bailing out of sort dynamic partition optimization as some partition columns "
            + "got constant folded.");
    return null;
  }

  // unlink connection between FS and its parent
  fsParent = fsOp.getParentOperators().get(0);
  fsParent.getChildOperators().clear();

  DynamicPartitionCtx dpCtx = fsOp.getConf().getDynPartCtx();
  int numBuckets = destTable.getNumBuckets();

  // if enforce bucketing/sorting is disabled numBuckets will not be set.
  // set the number of buckets here to ensure creation of empty buckets
  dpCtx.setNumBuckets(numBuckets);

  // Get the positions for partition, bucket and sort columns
  List<Integer> bucketPositions =
      getBucketPositions(destTable.getBucketCols(), destTable.getCols());
  List<Integer> sortPositions = null;
  List<Integer> sortOrder = null;
  ArrayList<ExprNodeDesc> bucketColumns;
  if (fsOp.getConf().getWriteType() == AcidUtils.Operation.UPDATE
      || fsOp.getConf().getWriteType() == AcidUtils.Operation.DELETE) {
    // When doing updates and deletes we always want to sort on the rowid because the ACID
    // reader will expect this sort order when doing reads. So
    // ignore whatever comes from the table and enforce this sort order instead.
    sortPositions = Arrays.asList(0);
    sortOrder = Arrays.asList(1); // 1 means asc, could really use enum here in the thrift if
    bucketColumns = new ArrayList<>();
    // The bucketing column is already present in ROW__ID, which is specially
    // handled in ReduceSink
  } else {
    if (!destTable.getSortCols().isEmpty()) {
      // Sort columns specified by table
      sortPositions = getSortPositions(destTable.getSortCols(), destTable.getCols());
      sortOrder = getSortOrders(destTable.getSortCols(), destTable.getCols());
    } else {
      // Infer sort columns from operator tree
      sortPositions = Lists.newArrayList();
      sortOrder = Lists.newArrayList();
      inferSortPositions(fsParent, sortPositions, sortOrder);
    }
    List<ColumnInfo> colInfos = fsParent.getSchema().getSignature();
    bucketColumns = getPositionsToExprNodes(bucketPositions, colInfos);
  }
  List<Integer> sortNullOrder = new ArrayList<Integer>();
  for (int order : sortOrder) {
    sortNullOrder.add(order == 1 ? 0 : 1); // for asc, nulls first; for desc, nulls last
  }
  LOG.debug("Got sort order");
  for (int i : sortPositions) LOG.debug("sort position " + i);
  for (int i : sortOrder) LOG.debug("sort order " + i);
  for (int i : sortNullOrder) LOG.debug("sort null order " + i);
  List<Integer> partitionPositions = getPartitionPositions(dpCtx, fsParent.getSchema());

  // update the file sink descriptor
  fsOp.getConf().setMultiFileSpray(false);
  fsOp.getConf().setNumFiles(1);
  fsOp.getConf().setTotalFiles(1);

  ArrayList<ColumnInfo> parentCols = Lists.newArrayList(fsParent.getSchema().getSignature());
  ArrayList<ExprNodeDesc> allRSCols = Lists.newArrayList();
  for (ColumnInfo ci : parentCols) {
    allRSCols.add(new ExprNodeColumnDesc(ci));
  }

  // Create ReduceSink operator
  ReduceSinkOperator rsOp =
      getReduceSinkOp(
          partitionPositions,
          sortPositions,
          sortOrder,
          sortNullOrder,
          allRSCols,
          bucketColumns,
          numBuckets,
          fsParent,
          fsOp.getConf().getWriteType());

  List<ExprNodeDesc> descs = new ArrayList<ExprNodeDesc>(allRSCols.size());
  List<String> colNames = new ArrayList<String>();
  String colName;
  for (int i = 0; i < allRSCols.size(); i++) {
    ExprNodeDesc col = allRSCols.get(i);
    colName = col.getExprString();
    colNames.add(colName);
    if (partitionPositions.contains(i) || sortPositions.contains(i)) {
      descs.add(
          new ExprNodeColumnDesc(
              col.getTypeInfo(), ReduceField.KEY.toString() + "." + colName, null, false));
    } else {
      descs.add(
          new ExprNodeColumnDesc(
              col.getTypeInfo(), ReduceField.VALUE.toString() + "." + colName, null, false));
    }
  }
  RowSchema selRS = new RowSchema(fsParent.getSchema());
  if (!bucketColumns.isEmpty()
      || fsOp.getConf().getWriteType() == Operation.DELETE
      || fsOp.getConf().getWriteType() == Operation.UPDATE) {
    descs.add(
        new ExprNodeColumnDesc(
            TypeInfoFactory.stringTypeInfo,
            ReduceField.KEY.toString() + ".'" + BUCKET_NUMBER_COL_NAME + "'",
            null,
            false));
    colNames.add("'" + BUCKET_NUMBER_COL_NAME + "'");
    ColumnInfo ci =
        new ColumnInfo(
            BUCKET_NUMBER_COL_NAME,
            TypeInfoFactory.stringTypeInfo,
            selRS.getSignature().get(0).getTabAlias(),
            true,
            true);
    selRS.getSignature().add(ci);
    fsParent.getSchema().getSignature().add(ci);
  }

  // Create SelectDesc
  SelectDesc selConf = new SelectDesc(descs, colNames);

  // Create Select Operator
  SelectOperator selOp = (SelectOperator) OperatorFactory.getAndMakeChild(selConf, selRS, rsOp);

  // link SEL to FS
  fsOp.getParentOperators().clear();
  fsOp.getParentOperators().add(selOp);
  selOp.getChildOperators().add(fsOp);

  // Set whether partition sorted or partition bucket sorted
  fsOp.getConf().setDpSortState(FileSinkDesc.DPSortState.PARTITION_SORTED);
  if (bucketColumns.size() > 0
      || fsOp.getConf().getWriteType() == Operation.DELETE
      || fsOp.getConf().getWriteType() == Operation.UPDATE) {
    fsOp.getConf().setDpSortState(FileSinkDesc.DPSortState.PARTITION_BUCKET_SORTED);
  }

  // update partition column info in the FS descriptor
  fsOp.getConf().setPartitionCols(rsOp.getConf().getPartitionCols());

  LOG.info(
      "Inserted "
          + rsOp.getOperatorId()
          + " and "
          + selOp.getOperatorId()
          + " as parent of "
          + fsOp.getOperatorId()
          + " and child of "
          + fsParent.getOperatorId());

  parseCtx.setReduceSinkAddedBySortedDynPartition(true);
  return null;
}
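// Illustration only: a standalone sanity check of the key layout and null-ordering logic
// above (partition columns first, then the bucket number, then the sort columns; ascending
// implies nulls first). The column names below are hypothetical.
import java.util.ArrayList;
import java.util.List;

public class KeyLayoutDemo {
  public static void main(String[] args) {
    // Hypothetical pieces that end up in the ReduceSink key.
    List<String> partitionCols = List.of("ds", "hr");
    String bucketNumberCol = "_bucket_number";
    List<String> sortCols = List.of("id", "name");
    List<Integer> sortOrder = List.of(1, 0); // 1 = asc, 0 = desc (example values)

    // Key = partition columns + bucket number + sort columns,
    // matching the ordering this optimization enforces for the ReduceSink key.
    List<String> keyCols = new ArrayList<>(partitionCols);
    keyCols.add(bucketNumberCol);
    keyCols.addAll(sortCols);
    System.out.println("key columns: " + keyCols);

    // Null order derived the same way as in the optimizer:
    // asc (1) -> nulls first (0), anything else -> nulls last (1).
    List<Integer> sortNullOrder = new ArrayList<>();
    for (int order : sortOrder) {
      sortNullOrder.add(order == 1 ? 0 : 1);
    }
    System.out.println("null order: " + sortNullOrder);
  }
}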