private static void pruneReduceSinkOperator(boolean[] retainFlags,
    ReduceSinkOperator reduce, ColumnPrunerProcCtx cppCtx) throws SemanticException {
  ReduceSinkDesc reduceConf = reduce.getConf();
  Map<String, ExprNodeDesc> oldMap = reduce.getColumnExprMap();
  LOG.info("RS " + reduce.getIdentifier() + " oldColExprMap: " + oldMap);
  RowResolver oldRR = cppCtx.getOpToParseCtxMap().get(reduce).getRowResolver();
  ArrayList<ColumnInfo> oldSignature = oldRR.getRowSchema().getSignature();
  ArrayList<ColumnInfo> signature = new ArrayList<ColumnInfo>(oldSignature);

  List<String> valueColNames = reduceConf.getOutputValueColumnNames();
  ArrayList<String> newValueColNames = new ArrayList<String>();

  List<ExprNodeDesc> keyExprs = reduceConf.getKeyCols();
  List<ExprNodeDesc> valueExprs = reduceConf.getValueCols();
  ArrayList<ExprNodeDesc> newValueExprs = new ArrayList<ExprNodeDesc>();

  for (int i = 0; i < retainFlags.length; i++) {
    String outputCol = valueColNames.get(i);
    ExprNodeDesc outputColExpr = valueExprs.get(i);
    if (!retainFlags[i]) {
      String[] nm = oldRR.reverseLookup(outputCol);
      if (nm == null) {
        outputCol = Utilities.ReduceField.VALUE.toString() + "." + outputCol;
        nm = oldRR.reverseLookup(outputCol);
      }

      // If multiple columns reference the same column name, we don't resolve the
      // row again, because the ColumnInfo in the row resolver has already been removed.
      if (nm == null) {
        continue;
      }

      // Only remove a column's information if it is not a key, i.e. the column
      // does not appear in the keyExprs of the RS.
      if (ExprNodeDescUtils.indexOf(outputColExpr, keyExprs) == -1) {
        ColumnInfo colInfo = oldRR.getFieldMap(nm[0]).remove(nm[1]);
        oldRR.getInvRslvMap().remove(colInfo.getInternalName());
        oldMap.remove(outputCol);
        signature.remove(colInfo);
      }
    } else {
      newValueColNames.add(outputCol);
      newValueExprs.add(outputColExpr);
    }
  }

  oldRR.getRowSchema().setSignature(signature);
  reduce.getSchema().setSignature(signature);
  reduceConf.setOutputValueColumnNames(newValueColNames);
  reduceConf.setValueCols(newValueExprs);
  TableDesc newValueTable = PlanUtils.getReduceValueTableDesc(
      PlanUtils.getFieldSchemasFromColumnList(reduceConf.getValueCols(),
          newValueColNames, 0, ""));
  reduceConf.setValueSerializeInfo(newValueTable);
  LOG.info("RS " + reduce.getIdentifier() + " newColExprMap: " + oldMap);
}
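// Illustrative sketch (not part of Hive): the retain-flag pattern used above,
// reduced to its essentials. Two parallel lists (output column names and value
// expressions) are filtered by a boolean mask so that index i in one list always
// corresponds to index i in the other. All names and types here are hypothetical.
//
// import java.util.ArrayList;
// import java.util.Arrays;
// import java.util.List;
//
// class RetainFlagSketch {
//   // Keep only the entries whose flag is true; the output lists stay aligned.
//   static void prune(boolean[] retainFlags, List<String> names, List<String> exprs,
//       List<String> outNames, List<String> outExprs) {
//     for (int i = 0; i < retainFlags.length; i++) {
//       if (retainFlags[i]) {
//         outNames.add(names.get(i));
//         outExprs.add(exprs.get(i));
//       }
//     }
//   }
//
//   public static void main(String[] args) {
//     List<String> newNames = new ArrayList<>();
//     List<String> newExprs = new ArrayList<>();
//     prune(new boolean[] {true, false, true},
//         Arrays.asList("_col0", "_col1", "_col2"),
//         Arrays.asList("key", "value", "ds"),
//         newNames, newExprs);
//     System.out.println(newNames + " / " + newExprs); // [_col0, _col2] / [key, ds]
//   }
// }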
/**
 * The current RSDedup removes/replaces the child RS. For key columns, sorting order, and the
 * number of reducers, copy the more specific parts of the child RS's configuration to the
 * parent RS. For partitioning columns, if both the child RS and the parent RS have been
 * assigned partitioning columns, choose the more general ones. If the parent RS has not been
 * assigned any partitioning column, use the partitioning columns (if they exist) of the
 * child RS.
 */
protected boolean merge(ReduceSinkOperator cRS, ReduceSinkOperator pRS, int minReducer)
    throws SemanticException {
  int[] result = checkStatus(cRS, pRS, minReducer);
  if (result == null) {
    return false;
  }

  if (result[0] > 0) {
    // The sorting columns of the child RS are more specific than those of the
    // parent RS. Assign the sorting columns of the child RS to the parent RS.
    List<ExprNodeDesc> childKCs = cRS.getConf().getKeyCols();
    pRS.getConf().setKeyCols(ExprNodeDescUtils.backtrack(childKCs, cRS, pRS));
  }

  if (result[1] < 0) {
    // The partitioning columns of the parent RS are more specific than
    // those of the child RS.
    List<ExprNodeDesc> childPCs = cRS.getConf().getPartitionCols();
    if (childPCs != null && !childPCs.isEmpty()) {
      // If the partitioning columns of the child RS are assigned,
      // assign them to the parent RS as well.
      pRS.getConf().setPartitionCols(ExprNodeDescUtils.backtrack(childPCs, cRS, pRS));
    }
  } else if (result[1] > 0) {
    // The partitioning columns of the child RS are more specific than
    // those of the parent RS.
    List<ExprNodeDesc> parentPCs = pRS.getConf().getPartitionCols();
    if (parentPCs == null || parentPCs.isEmpty()) {
      // If the partitioning columns of the parent RS are not assigned,
      // assign the partitioning columns of the child RS to the parent RS.
      ArrayList<ExprNodeDesc> childPCs = cRS.getConf().getPartitionCols();
      pRS.getConf().setPartitionCols(ExprNodeDescUtils.backtrack(childPCs, cRS, pRS));
    }
  }

  if (result[2] > 0) {
    // The sorting order of the child RS is more specific than that of the parent RS.
    // Assign the sorting order of the child RS to the parent RS.
    if (result[0] <= 0) {
      // The sorting columns of the parent RS are more specific than those of the
      // child RS, but the sorting order of the child RS is more specific than
      // that of the parent RS: the two cannot be reconciled.
      throw new SemanticException("Sorting columns and order don't match. " +
          "Try set " + HiveConf.ConfVars.HIVEOPTREDUCEDEDUPLICATION + "=false;");
    }
    pRS.getConf().setOrder(cRS.getConf().getOrder());
  }

  if (result[3] > 0) {
    // The number of reducers of the child RS is more specific than that of the
    // parent RS. Assign the number of reducers of the child RS to the parent RS.
    pRS.getConf().setNumReducers(cRS.getConf().getNumReducers());
  }

  if (result[4] > 0) {
    // This case happens only when the pRS key is empty, in which case we can use
    // the number of distribution keys and the key serialization info from the cRS.
    pRS.getConf().setNumDistributionKeys(cRS.getConf().getNumDistributionKeys());
    List<FieldSchema> fields = PlanUtils.getFieldSchemasFromColumnList(
        pRS.getConf().getKeyCols(), "reducesinkkey");
    TableDesc keyTable = PlanUtils.getReduceKeyTableDesc(fields, pRS.getConf().getOrder());
    ArrayList<String> outputKeyCols = Lists.newArrayList();
    for (int i = 0; i < fields.size(); i++) {
      outputKeyCols.add(fields.get(i).getName());
    }
    pRS.getConf().setOutputKeyColumnNames(outputKeyCols);
    pRS.getConf().setKeySerializeInfo(keyTable);
  }
  return true;
}
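// Illustrative sketch (not part of Hive): the sign convention that merge() reads
// out of the result[] array from checkStatus. A positive entry means the child
// RS's setting is more specific, a negative entry means the parent's is, and 0
// means they are equivalent. Here "more specific" is modeled, as an assumption,
// by a prefix relationship between two sort-order strings such as "+" and "++"
// (the child sorts by one more column than the parent). All names are hypothetical.
//
// class SpecificitySketch {
//   // Returns >0 if child extends parent, <0 if parent extends child,
//   // 0 if equal; throws if neither is a prefix of the other (incompatible).
//   static int compareOrder(String parent, String child) {
//     if (parent.equals(child)) {
//       return 0;
//     }
//     if (child.startsWith(parent)) {
//       return 1;   // child is more specific: keep the child's order
//     }
//     if (parent.startsWith(child)) {
//       return -1;  // parent is more specific: keep the parent's order
//     }
//     throw new IllegalStateException("orders conflict: " + parent + " vs " + child);
//   }
//
//   public static void main(String[] args) {
//     System.out.println(compareOrder("+", "++"));  // 1
//     System.out.println(compareOrder("++", "+"));  // -1
//     System.out.println(compareOrder("+", "+"));   // 0
//   }
// }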
public ReduceSinkOperator getReduceSinkOp(List<Integer> partitionPositions,
    List<Integer> sortPositions, List<Integer> sortOrder, List<Integer> sortNullOrder,
    ArrayList<ExprNodeDesc> allCols, ArrayList<ExprNodeDesc> bucketColumns, int numBuckets,
    Operator<? extends OperatorDesc> parent, AcidUtils.Operation writeType)
    throws SemanticException {

  // Order of KEY columns:
  // 1) Partition columns
  // 2) Bucket number column
  // 3) Sort columns
  Set<Integer> keyColsPosInVal = Sets.newLinkedHashSet();
  ArrayList<ExprNodeDesc> keyCols = Lists.newArrayList();
  List<Integer> newSortOrder = Lists.newArrayList();
  List<Integer> newSortNullOrder = Lists.newArrayList();
  int numPartAndBuck = partitionPositions.size();

  keyColsPosInVal.addAll(partitionPositions);
  if (!bucketColumns.isEmpty()
      || writeType == Operation.DELETE || writeType == Operation.UPDATE) {
    keyColsPosInVal.add(-1);
    numPartAndBuck += 1;
  }
  keyColsPosInVal.addAll(sortPositions);

  // By default, partition and bucket columns are sorted in ascending order.
  Integer order = 1;
  if (sortOrder != null && !sortOrder.isEmpty()) {
    if (sortOrder.get(0).intValue() == 0) {
      order = 0;
    }
  }
  for (int i = 0; i < numPartAndBuck; i++) {
    newSortOrder.add(order);
  }
  newSortOrder.addAll(sortOrder);

  String orderStr = "";
  for (Integer i : newSortOrder) {
    if (i.intValue() == 1) {
      orderStr += "+";
    } else {
      orderStr += "-";
    }
  }

  // If partition and bucket columns are sorted in ascending order, by default
  // nulls come first; otherwise nulls come last.
  Integer nullOrder = order == 1 ? 0 : 1;
  if (sortNullOrder != null && !sortNullOrder.isEmpty()) {
    if (sortNullOrder.get(0).intValue() == 0) {
      nullOrder = 0;
    } else {
      nullOrder = 1;
    }
  }
  for (int i = 0; i < numPartAndBuck; i++) {
    newSortNullOrder.add(nullOrder);
  }
  newSortNullOrder.addAll(sortNullOrder);

  String nullOrderStr = "";
  for (Integer i : newSortNullOrder) {
    if (i.intValue() == 0) {
      nullOrderStr += "a";
    } else {
      nullOrderStr += "z";
    }
  }

  Map<String, ExprNodeDesc> colExprMap = Maps.newHashMap();
  ArrayList<ExprNodeDesc> partCols = Lists.newArrayList();

  // Clone the columns here, as the RS will replace the bucket column key with the
  // corresponding bucket number and hence change their OIs.
  for (Integer idx : keyColsPosInVal) {
    if (idx < 0) {
      ExprNodeConstantDesc bucketNumCol =
          new ExprNodeConstantDesc(TypeInfoFactory.stringTypeInfo, BUCKET_NUMBER_COL_NAME);
      keyCols.add(bucketNumCol);
      colExprMap.put(
          Utilities.ReduceField.KEY + ".'" + BUCKET_NUMBER_COL_NAME + "'", bucketNumCol);
    } else {
      keyCols.add(allCols.get(idx).clone());
    }
  }

  ArrayList<ExprNodeDesc> valCols = Lists.newArrayList();
  for (int i = 0; i < allCols.size(); i++) {
    if (!keyColsPosInVal.contains(i)) {
      valCols.add(allCols.get(i).clone());
    }
  }

  for (Integer idx : partitionPositions) {
    partCols.add(allCols.get(idx).clone());
  }

  // In the absence of a SORTED BY clause, the sorted dynamic partition insert
  // should honor the ordering of records provided by the ORDER BY in the SELECT statement.
  ReduceSinkOperator parentRSOp =
      OperatorUtils.findSingleOperatorUpstream(parent, ReduceSinkOperator.class);
  if (parentRSOp != null && parseCtx.getQueryProperties().hasOuterOrderBy()) {
    String parentRSOpOrder = parentRSOp.getConf().getOrder();
    String parentRSOpNullOrder = parentRSOp.getConf().getNullOrder();
    if (parentRSOpOrder != null && !parentRSOpOrder.isEmpty() && sortPositions.isEmpty()) {
      keyCols.addAll(parentRSOp.getConf().getKeyCols());
      orderStr += parentRSOpOrder;
      nullOrderStr += parentRSOpNullOrder;
    }
  }

  // Map _col0 to KEY._col0, etc.
  Map<String, String> nameMapping = new HashMap<>();
  ArrayList<String> keyColNames = Lists.newArrayList();
  for (ExprNodeDesc keyCol : keyCols) {
    String keyColName = keyCol.getExprString();
    keyColNames.add(keyColName);
    colExprMap.put(Utilities.ReduceField.KEY + "." + keyColName, keyCol);
    nameMapping.put(keyColName, Utilities.ReduceField.KEY + "." + keyColName);
  }
  ArrayList<String> valColNames = Lists.newArrayList();
  for (ExprNodeDesc valCol : valCols) {
    String colName = valCol.getExprString();
    valColNames.add(colName);
    colExprMap.put(Utilities.ReduceField.VALUE + "." + colName, valCol);
    nameMapping.put(colName, Utilities.ReduceField.VALUE + "." + colName);
  }

  // Create the Key/Value TableDescs. When the operator plan is split into MR tasks,
  // the reduce operator will initialize the Extract operator with information
  // from the Key and Value TableDescs.
  List<FieldSchema> fields =
      PlanUtils.getFieldSchemasFromColumnList(keyCols, keyColNames, 0, "");
  TableDesc keyTable = PlanUtils.getReduceKeyTableDesc(fields, orderStr, nullOrderStr);
  List<FieldSchema> valFields =
      PlanUtils.getFieldSchemasFromColumnList(valCols, valColNames, 0, "");
  TableDesc valueTable = PlanUtils.getReduceValueTableDesc(valFields);
  List<List<Integer>> distinctColumnIndices = Lists.newArrayList();

  // The number of reducers is set to the default (-1).
  ReduceSinkDesc rsConf = new ReduceSinkDesc(keyCols, keyCols.size(), valCols,
      keyColNames, distinctColumnIndices, valColNames, -1, partCols, -1, keyTable,
      valueTable, writeType);
  rsConf.setBucketCols(bucketColumns);
  rsConf.setNumBuckets(numBuckets);

  ArrayList<ColumnInfo> signature = new ArrayList<>();
  for (int index = 0; index < parent.getSchema().getSignature().size(); index++) {
    ColumnInfo colInfo = new ColumnInfo(parent.getSchema().getSignature().get(index));
    colInfo.setInternalName(nameMapping.get(colInfo.getInternalName()));
    signature.add(colInfo);
  }
  ReduceSinkOperator op = (ReduceSinkOperator) OperatorFactory.getAndMakeChild(
      rsConf, new RowSchema(signature), parent);
  op.setColumnExprMap(colExprMap);
  return op;
}
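// Illustrative sketch (not part of Hive): how the orderStr/nullOrderStr built
// above encode sort metadata, one character per key column: '+' for ascending,
// '-' for descending, 'a' for nulls-first, 'z' for nulls-last. The class and
// method names here are hypothetical.
//
// import java.util.Arrays;
// import java.util.List;
//
// class OrderStringSketch {
//   static String encode(List<Integer> sortOrder, List<Integer> nullOrder) {
//     StringBuilder order = new StringBuilder();
//     StringBuilder nulls = new StringBuilder();
//     for (int i = 0; i < sortOrder.size(); i++) {
//       order.append(sortOrder.get(i) == 1 ? '+' : '-');  // 1 = ascending
//       nulls.append(nullOrder.get(i) == 0 ? 'a' : 'z');  // 0 = nulls first
//     }
//     return order + " / " + nulls;
//   }
//
//   public static void main(String[] args) {
//     // Two ascending keys with nulls first, one descending key with nulls last.
//     System.out.println(encode(Arrays.asList(1, 1, 0), Arrays.asList(0, 0, 1)));
//     // prints: ++- / aaz
//   }
// }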
private static void pruneJoinOperator(NodeProcessorCtx ctx, CommonJoinOperator op,
    JoinDesc conf, Map<String, ExprNodeDesc> columnExprMap,
    Map<Byte, List<Integer>> retainMap, boolean mapJoin) throws SemanticException {
  ColumnPrunerProcCtx cppCtx = (ColumnPrunerProcCtx) ctx;
  List<Operator<? extends OperatorDesc>> childOperators = op.getChildOperators();

  LOG.info("JOIN " + op.getIdentifier() + " oldExprs: " + conf.getExprs());
  List<String> childColLists = cppCtx.genColLists(op);
  if (childColLists == null) {
    return;
  }

  Map<Byte, List<String>> prunedColLists = new HashMap<Byte, List<String>>();
  for (byte tag : conf.getTagOrder()) {
    prunedColLists.put(tag, new ArrayList<String>());
  }

  // Add the columns referenced in the join filters.
  Set<Map.Entry<Byte, List<ExprNodeDesc>>> filters = conf.getFilters().entrySet();
  Iterator<Map.Entry<Byte, List<ExprNodeDesc>>> iter = filters.iterator();
  while (iter.hasNext()) {
    Map.Entry<Byte, List<ExprNodeDesc>> entry = iter.next();
    Byte tag = entry.getKey();
    for (ExprNodeDesc desc : entry.getValue()) {
      List<String> cols = prunedColLists.get(tag);
      cols = Utilities.mergeUniqElems(cols, desc.getCols());
      prunedColLists.put(tag, cols);
    }
  }

  RowResolver joinRR = cppCtx.getOpToParseCtxMap().get(op).getRowResolver();
  RowResolver newJoinRR = new RowResolver();
  ArrayList<String> outputCols = new ArrayList<String>();
  ArrayList<ColumnInfo> rs = new ArrayList<ColumnInfo>();
  Map<String, ExprNodeDesc> newColExprMap = new HashMap<String, ExprNodeDesc>();

  for (int i = 0; i < conf.getOutputColumnNames().size(); i++) {
    String internalName = conf.getOutputColumnNames().get(i);
    ExprNodeDesc desc = columnExprMap.get(internalName);
    Byte tag = conf.getReversedExprs().get(internalName);
    if (!childColLists.contains(internalName)) {
      int index = conf.getExprs().get(tag).indexOf(desc);
      if (index < 0) {
        continue;
      }
      conf.getExprs().get(tag).remove(desc);
      if (retainMap != null) {
        retainMap.get(tag).remove(index);
      }
    } else {
      List<String> prunedRSList = prunedColLists.get(tag);
      if (prunedRSList == null) {
        prunedRSList = new ArrayList<String>();
        prunedColLists.put(tag, prunedRSList);
      }
      prunedRSList = Utilities.mergeUniqElems(prunedRSList, desc.getCols());
      outputCols.add(internalName);
      newColExprMap.put(internalName, desc);
    }
  }

  if (mapJoin) {
    // Regenerate the valueTableDescs.
    List<TableDesc> valueTableDescs = new ArrayList<TableDesc>();
    for (int pos = 0; pos < op.getParentOperators().size(); pos++) {
      List<ExprNodeDesc> valueCols = conf.getExprs().get(Byte.valueOf((byte) pos));
      StringBuilder keyOrder = new StringBuilder();
      for (int i = 0; i < valueCols.size(); i++) {
        keyOrder.append("+");
      }
      TableDesc valueTableDesc = PlanUtils.getMapJoinValueTableDesc(
          PlanUtils.getFieldSchemasFromColumnList(valueCols, "mapjoinvalue"));
      valueTableDescs.add(valueTableDesc);
    }
    ((MapJoinDesc) conf).setValueTblDescs(valueTableDescs);

    Set<Map.Entry<Byte, List<ExprNodeDesc>>> exprs = ((MapJoinDesc) conf).getKeys().entrySet();
    Iterator<Map.Entry<Byte, List<ExprNodeDesc>>> iters = exprs.iterator();
    while (iters.hasNext()) {
      Map.Entry<Byte, List<ExprNodeDesc>> entry = iters.next();
      List<ExprNodeDesc> lists = entry.getValue();
      for (int j = 0; j < lists.size(); j++) {
        ExprNodeDesc desc = lists.get(j);
        Byte tag = entry.getKey();
        List<String> cols = prunedColLists.get(tag);
        cols = Utilities.mergeUniqElems(cols, desc.getCols());
        prunedColLists.put(tag, cols);
      }
    }
  }

  for (Operator<? extends OperatorDesc> child : childOperators) {
    if (child instanceof ReduceSinkOperator) {
      boolean[] flags =
          getPruneReduceSinkOpRetainFlags(childColLists, (ReduceSinkOperator) child);
      pruneReduceSinkOperator(flags, (ReduceSinkOperator) child, cppCtx);
    }
  }

  for (int i = 0; i < outputCols.size(); i++) {
    String internalName = outputCols.get(i);
    String[] nm = joinRR.reverseLookup(internalName);
    ColumnInfo col = joinRR.get(nm[0], nm[1]);
    newJoinRR.put(nm[0], nm[1], col);
    rs.add(col);
  }

  LOG.info("JOIN " + op.getIdentifier() + " newExprs: " + conf.getExprs());
  op.setColumnExprMap(newColExprMap);
  conf.setOutputColumnNames(outputCols);
  op.getSchema().setSignature(rs);
  cppCtx.getOpToParseCtxMap().get(op).setRowResolver(newJoinRR);
  cppCtx.getJoinPrunedColLists().put(op, prunedColLists);
}
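// Illustrative sketch (not part of Hive): the mergeUniqElems pattern used
// throughout pruneJoinOperator, assuming it accumulates column names per join
// input (tag) while skipping duplicates and preserving insertion order. The
// class and helper names here are hypothetical.
//
// import java.util.ArrayList;
// import java.util.HashMap;
// import java.util.List;
// import java.util.Map;
//
// class PrunedColListsSketch {
//   // Merge src into dest without duplicates; create dest lazily if null.
//   static List<String> mergeUniq(List<String> dest, List<String> src) {
//     if (dest == null) {
//       dest = new ArrayList<>();
//     }
//     for (String col : src) {
//       if (!dest.contains(col)) {
//         dest.add(col);
//       }
//     }
//     return dest;
//   }
//
//   public static void main(String[] args) {
//     Map<Byte, List<String>> prunedColLists = new HashMap<>();
//     prunedColLists.put((byte) 0,
//         mergeUniq(prunedColLists.get((byte) 0), List.of("key", "value")));
//     prunedColLists.put((byte) 0,
//         mergeUniq(prunedColLists.get((byte) 0), List.of("value", "ds")));
//     System.out.println(prunedColLists); // {0=[key, value, ds]}
//   }
// }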