/** * Current RSDedup remove/replace child RS. For key columns, sorting order, and the number of * reducers, copy more specific part of configurations of child RS to that of parent RS. For * partitioning columns, if both child RS and parent RS have been assigned partitioning columns, * we will choose the more general partitioning columns. If parent RS has not been assigned any * partitioning column, we will use partitioning columns (if exist) of child RS. */ protected boolean merge(ReduceSinkOperator cRS, ReduceSinkOperator pRS, int minReducer) throws SemanticException { int[] result = checkStatus(cRS, pRS, minReducer); if (result == null) { return false; } if (result[0] > 0) { // The sorting columns of the child RS are more specific than // those of the parent RS. Assign sorting columns of the child RS // to the parent RS. List<ExprNodeDesc> childKCs = cRS.getConf().getKeyCols(); pRS.getConf().setKeyCols(ExprNodeDescUtils.backtrack(childKCs, cRS, pRS)); } if (result[1] < 0) { // The partitioning columns of the parent RS are more specific than // those of the child RS. List<ExprNodeDesc> childPCs = cRS.getConf().getPartitionCols(); if (childPCs != null && !childPCs.isEmpty()) { // If partitioning columns of the child RS are assigned, // assign these to the partitioning columns of the parent RS. pRS.getConf().setPartitionCols(ExprNodeDescUtils.backtrack(childPCs, cRS, pRS)); } } else if (result[1] > 0) { // The partitioning columns of the child RS are more specific than // those of the parent RS. List<ExprNodeDesc> parentPCs = pRS.getConf().getPartitionCols(); if (parentPCs == null || parentPCs.isEmpty()) { // If partitioning columns of the parent RS are not assigned, // assign partitioning columns of the child RS to the parent RS. ArrayList<ExprNodeDesc> childPCs = cRS.getConf().getPartitionCols(); pRS.getConf().setPartitionCols(ExprNodeDescUtils.backtrack(childPCs, cRS, pRS)); } } if (result[2] > 0) { // The sorting order of the child RS is more specific than // that of the parent RS. Assign the sorting order of the child RS // to the parent RS. if (result[0] <= 0) { // Sorting columns of the parent RS are more specific than those of the // child RS but Sorting order of the child RS is more specific than // that of the parent RS. throw new SemanticException( "Sorting columns and order don't match. " + "Try set " + HiveConf.ConfVars.HIVEOPTREDUCEDEDUPLICATION + "=false;"); } pRS.getConf().setOrder(cRS.getConf().getOrder()); } if (result[3] > 0) { // The number of reducers of the child RS is more specific than // that of the parent RS. Assign the number of reducers of the child RS // to the parent RS. pRS.getConf().setNumReducers(cRS.getConf().getNumReducers()); } if (result[4] > 0) { // This case happens only when pRS key is empty in which case we can use // number of distribution keys and key serialization info from cRS pRS.getConf().setNumDistributionKeys(cRS.getConf().getNumDistributionKeys()); List<FieldSchema> fields = PlanUtils.getFieldSchemasFromColumnList(pRS.getConf().getKeyCols(), "reducesinkkey"); TableDesc keyTable = PlanUtils.getReduceKeyTableDesc(fields, pRS.getConf().getOrder()); ArrayList<String> outputKeyCols = Lists.newArrayList(); for (int i = 0; i < fields.size(); i++) { outputKeyCols.add(fields.get(i).getName()); } pRS.getConf().setOutputKeyColumnNames(outputKeyCols); pRS.getConf().setKeySerializeInfo(keyTable); } return true; }
public ReduceSinkOperator getReduceSinkOp( List<Integer> partitionPositions, List<Integer> sortPositions, List<Integer> sortOrder, List<Integer> sortNullOrder, ArrayList<ExprNodeDesc> allCols, ArrayList<ExprNodeDesc> bucketColumns, int numBuckets, Operator<? extends OperatorDesc> parent, AcidUtils.Operation writeType) throws SemanticException { // Order of KEY columns // 1) Partition columns // 2) Bucket number column // 3) Sort columns Set<Integer> keyColsPosInVal = Sets.newLinkedHashSet(); ArrayList<ExprNodeDesc> keyCols = Lists.newArrayList(); List<Integer> newSortOrder = Lists.newArrayList(); List<Integer> newSortNullOrder = Lists.newArrayList(); int numPartAndBuck = partitionPositions.size(); keyColsPosInVal.addAll(partitionPositions); if (!bucketColumns.isEmpty() || writeType == Operation.DELETE || writeType == Operation.UPDATE) { keyColsPosInVal.add(-1); numPartAndBuck += 1; } keyColsPosInVal.addAll(sortPositions); // by default partition and bucket columns are sorted in ascending order Integer order = 1; if (sortOrder != null && !sortOrder.isEmpty()) { if (sortOrder.get(0).intValue() == 0) { order = 0; } } for (int i = 0; i < numPartAndBuck; i++) { newSortOrder.add(order); } newSortOrder.addAll(sortOrder); String orderStr = ""; for (Integer i : newSortOrder) { if (i.intValue() == 1) { orderStr += "+"; } else { orderStr += "-"; } } // if partition and bucket columns are sorted in ascending order, by default // nulls come first; otherwise nulls come last Integer nullOrder = order == 1 ? 0 : 1; if (sortNullOrder != null && !sortNullOrder.isEmpty()) { if (sortNullOrder.get(0).intValue() == 0) { nullOrder = 0; } else { nullOrder = 1; } } for (int i = 0; i < numPartAndBuck; i++) { newSortNullOrder.add(nullOrder); } newSortNullOrder.addAll(sortNullOrder); String nullOrderStr = ""; for (Integer i : newSortNullOrder) { if (i.intValue() == 0) { nullOrderStr += "a"; } else { nullOrderStr += "z"; } } Map<String, ExprNodeDesc> colExprMap = Maps.newHashMap(); ArrayList<ExprNodeDesc> partCols = Lists.newArrayList(); // we will clone here as RS will update bucket column key with its // corresponding with bucket number and hence their OIs for (Integer idx : keyColsPosInVal) { if (idx < 0) { ExprNodeConstantDesc bucketNumCol = new ExprNodeConstantDesc(TypeInfoFactory.stringTypeInfo, BUCKET_NUMBER_COL_NAME); keyCols.add(bucketNumCol); colExprMap.put( Utilities.ReduceField.KEY + ".'" + BUCKET_NUMBER_COL_NAME + "'", bucketNumCol); } else { keyCols.add(allCols.get(idx).clone()); } } ArrayList<ExprNodeDesc> valCols = Lists.newArrayList(); for (int i = 0; i < allCols.size(); i++) { if (!keyColsPosInVal.contains(i)) { valCols.add(allCols.get(i).clone()); } } for (Integer idx : partitionPositions) { partCols.add(allCols.get(idx).clone()); } // in the absence of SORTED BY clause, the sorted dynamic partition insert // should honor the ordering of records provided by ORDER BY in SELECT statement ReduceSinkOperator parentRSOp = OperatorUtils.findSingleOperatorUpstream(parent, ReduceSinkOperator.class); if (parentRSOp != null && parseCtx.getQueryProperties().hasOuterOrderBy()) { String parentRSOpOrder = parentRSOp.getConf().getOrder(); String parentRSOpNullOrder = parentRSOp.getConf().getNullOrder(); if (parentRSOpOrder != null && !parentRSOpOrder.isEmpty() && sortPositions.isEmpty()) { keyCols.addAll(parentRSOp.getConf().getKeyCols()); orderStr += parentRSOpOrder; nullOrderStr += parentRSOpNullOrder; } } // map _col0 to KEY._col0, etc Map<String, String> nameMapping = new HashMap<>(); ArrayList<String> keyColNames = Lists.newArrayList(); for (ExprNodeDesc keyCol : keyCols) { String keyColName = keyCol.getExprString(); keyColNames.add(keyColName); colExprMap.put(Utilities.ReduceField.KEY + "." + keyColName, keyCol); nameMapping.put(keyColName, Utilities.ReduceField.KEY + "." + keyColName); } ArrayList<String> valColNames = Lists.newArrayList(); for (ExprNodeDesc valCol : valCols) { String colName = valCol.getExprString(); valColNames.add(colName); colExprMap.put(Utilities.ReduceField.VALUE + "." + colName, valCol); nameMapping.put(colName, Utilities.ReduceField.VALUE + "." + colName); } // Create Key/Value TableDesc. When the operator plan is split into MR tasks, // the reduce operator will initialize Extract operator with information // from Key and Value TableDesc List<FieldSchema> fields = PlanUtils.getFieldSchemasFromColumnList(keyCols, keyColNames, 0, ""); TableDesc keyTable = PlanUtils.getReduceKeyTableDesc(fields, orderStr, nullOrderStr); List<FieldSchema> valFields = PlanUtils.getFieldSchemasFromColumnList(valCols, valColNames, 0, ""); TableDesc valueTable = PlanUtils.getReduceValueTableDesc(valFields); List<List<Integer>> distinctColumnIndices = Lists.newArrayList(); // Number of reducers is set to default (-1) ReduceSinkDesc rsConf = new ReduceSinkDesc( keyCols, keyCols.size(), valCols, keyColNames, distinctColumnIndices, valColNames, -1, partCols, -1, keyTable, valueTable, writeType); rsConf.setBucketCols(bucketColumns); rsConf.setNumBuckets(numBuckets); ArrayList<ColumnInfo> signature = new ArrayList<>(); for (int index = 0; index < parent.getSchema().getSignature().size(); index++) { ColumnInfo colInfo = new ColumnInfo(parent.getSchema().getSignature().get(index)); colInfo.setInternalName(nameMapping.get(colInfo.getInternalName())); signature.add(colInfo); } ReduceSinkOperator op = (ReduceSinkOperator) OperatorFactory.getAndMakeChild(rsConf, new RowSchema(signature), parent); op.setColumnExprMap(colExprMap); return op; }