/**
 * Compares a child and a parent ReduceSink on four mergeability criteria: sort
 * order, number of reducers, key columns, and partition columns (plus the number
 * of distribution keys). Returns null as soon as any category is not mergeable.
 *
 * <p>Each slot of the returned array holds -1, 0, or 1:
 * 0 means the two configurations agree on that category, -1 means the parent RS
 * is the more specific one, and 1 means the child RS is the more specific one.
 */
private int[] checkStatus(ReduceSinkOperator cRS, ReduceSinkOperator pRS, int minReducer)
    throws SemanticException {
  ReduceSinkDesc childConf = cRS.getConf();
  ReduceSinkDesc parentConf = pRS.getConf();

  Integer orderDirection = checkOrder(childConf.getOrder(), parentConf.getOrder());
  if (orderDirection == null) {
    return null;
  }

  Integer reducerNumDirection =
      checkNumReducer(childConf.getNumReducers(), parentConf.getNumReducers());
  // Reject outright incompatibility, or a child reducer count below the floor
  // when the child's count would win.
  if (reducerNumDirection == null
      || (reducerNumDirection > 0 && childConf.getNumReducers() < minReducer)) {
    return null;
  }

  Integer keyColDirection =
      checkExprs(childConf.getKeyCols(), parentConf.getKeyCols(), cRS, pRS);
  if (keyColDirection == null) {
    return null;
  }

  Integer partColDirection =
      checkExprs(childConf.getPartitionCols(), parentConf.getPartitionCols(), cRS, pRS);
  if (partColDirection == null) {
    return null;
  }

  Integer distKeyDirection =
      checkNumDistributionKey(
          childConf.getNumDistributionKeys(), parentConf.getNumDistributionKeys());

  return new int[] {
    keyColDirection, partColDirection, orderDirection, reducerNumDirection, distKeyDirection
  };
}
// for JOIN-RS case, it's not possible generally to merge if child has // less key/partition columns than parents protected boolean merge(ReduceSinkOperator cRS, JoinOperator pJoin, int minReducer) throws SemanticException { List<Operator<?>> parents = pJoin.getParentOperators(); ReduceSinkOperator[] pRSs = parents.toArray(new ReduceSinkOperator[parents.size()]); ReduceSinkDesc cRSc = cRS.getConf(); ReduceSinkDesc pRS0c = pRSs[0].getConf(); if (cRSc.getKeyCols().size() < pRS0c.getKeyCols().size()) { return false; } if (cRSc.getPartitionCols().size() != pRS0c.getPartitionCols().size()) { return false; } Integer moveReducerNumTo = checkNumReducer(cRSc.getNumReducers(), pRS0c.getNumReducers()); if (moveReducerNumTo == null || moveReducerNumTo > 0 && cRSc.getNumReducers() < minReducer) { return false; } Integer moveRSOrderTo = checkOrder(cRSc.getOrder(), pRS0c.getOrder()); if (moveRSOrderTo == null) { return false; } boolean[] sorted = CorrelationUtilities.getSortedTags(pJoin); int cKeySize = cRSc.getKeyCols().size(); for (int i = 0; i < cKeySize; i++) { ExprNodeDesc cexpr = cRSc.getKeyCols().get(i); ExprNodeDesc[] pexprs = new ExprNodeDesc[pRSs.length]; for (int tag = 0; tag < pRSs.length; tag++) { pexprs[tag] = pRSs[tag].getConf().getKeyCols().get(i); } int found = CorrelationUtilities.indexOf(cexpr, pexprs, cRS, pRSs, sorted); if (found != i) { return false; } } int cPartSize = cRSc.getPartitionCols().size(); for (int i = 0; i < cPartSize; i++) { ExprNodeDesc cexpr = cRSc.getPartitionCols().get(i); ExprNodeDesc[] pexprs = new ExprNodeDesc[pRSs.length]; for (int tag = 0; tag < pRSs.length; tag++) { pexprs[tag] = pRSs[tag].getConf().getPartitionCols().get(i); } int found = CorrelationUtilities.indexOf(cexpr, pexprs, cRS, pRSs, sorted); if (found != i) { return false; } } if (moveReducerNumTo > 0) { for (ReduceSinkOperator pRS : pRSs) { pRS.getConf().setNumReducers(cRS.getConf().getNumReducers()); } } return true; }
/**
 * Drops the value columns of a ReduceSink that are not marked for retention,
 * rewriting the operator's value descriptors, column-expression map, row resolver
 * and row schema in place, then rebuilding the value serialization TableDesc.
 *
 * @param retainFlags one flag per output value column; false means the column is
 *     pruned (unless its expression is also a key expression)
 * @param reduce the ReduceSink operator being pruned (mutated)
 * @param cppCtx pruning context used to look up the operator's RowResolver
 * @throws SemanticException on resolver errors
 */
private static void pruneReduceSinkOperator(
    boolean[] retainFlags, ReduceSinkOperator reduce, ColumnPrunerProcCtx cppCtx)
    throws SemanticException {
  ReduceSinkDesc reduceConf = reduce.getConf();
  Map<String, ExprNodeDesc> oldMap = reduce.getColumnExprMap();
  LOG.info("RS " + reduce.getIdentifier() + " oldColExprMap: " + oldMap);
  RowResolver oldRR = cppCtx.getOpToParseCtxMap().get(reduce).getRowResolver();
  // Work on a copy of the signature; pruned ColumnInfos are removed from it and the
  // trimmed copy is installed on both the resolver and the operator schema at the end.
  ArrayList<ColumnInfo> old_signature = oldRR.getRowSchema().getSignature();
  ArrayList<ColumnInfo> signature = new ArrayList<ColumnInfo>(old_signature);
  List<String> valueColNames = reduceConf.getOutputValueColumnNames();
  ArrayList<String> newValueColNames = new ArrayList<String>();
  List<ExprNodeDesc> keyExprs = reduceConf.getKeyCols();
  List<ExprNodeDesc> valueExprs = reduceConf.getValueCols();
  ArrayList<ExprNodeDesc> newValueExprs = new ArrayList<ExprNodeDesc>();
  for (int i = 0; i < retainFlags.length; i++) {
    String outputCol = valueColNames.get(i);
    ExprNodeDesc outputColExpr = valueExprs.get(i);
    if (!retainFlags[i]) {
      // Resolve the internal name; retry with the VALUE. prefix if the bare
      // name is not in the resolver.
      String[] nm = oldRR.reverseLookup(outputCol);
      if (nm == null) {
        outputCol = Utilities.ReduceField.VALUE.toString() + "." + outputCol;
        nm = oldRR.reverseLookup(outputCol);
      }
      // In case there are multiple columns referenced to the same column name, we won't
      // do row resolve once more because the ColumnInfo in row resolver is already removed
      if (nm == null) {
        continue;
      }
      // Only remove information of a column if it is not a key, i.e. this column is
      // not appearing in keyExprs of the RS.
      if (ExprNodeDescUtils.indexOf(outputColExpr, keyExprs) == -1) {
        ColumnInfo colInfo = oldRR.getFieldMap(nm[0]).remove(nm[1]);
        oldRR.getInvRslvMap().remove(colInfo.getInternalName());
        oldMap.remove(outputCol);
        signature.remove(colInfo);
      }
    } else {
      // Retained column: carried over into the rebuilt value lists.
      newValueColNames.add(outputCol);
      newValueExprs.add(outputColExpr);
    }
  }
  oldRR.getRowSchema().setSignature(signature);
  reduce.getSchema().setSignature(signature);
  reduceConf.setOutputValueColumnNames(newValueColNames);
  reduceConf.setValueCols(newValueExprs);
  // Value columns changed, so the value serialization descriptor must be rebuilt.
  TableDesc newValueTable =
      PlanUtils.getReduceValueTableDesc(
          PlanUtils.getFieldSchemasFromColumnList(
              reduceConf.getValueCols(), newValueColNames, 0, ""));
  reduceConf.setValueSerializeInfo(newValueTable);
  LOG.info("RS " + reduce.getIdentifier() + " newColExprMap: " + oldMap);
}
@Override public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx, Object... nodeOutputs) throws SemanticException { ReduceSinkOperator op = (ReduceSinkOperator) nd; ColumnPrunerProcCtx cppCtx = (ColumnPrunerProcCtx) ctx; RowResolver resolver = cppCtx.getOpToParseCtxMap().get(op).getRowResolver(); ReduceSinkDesc conf = op.getConf(); List<String> colLists = new ArrayList<String>(); ArrayList<ExprNodeDesc> keys = conf.getKeyCols(); LOG.debug("Reduce Sink Operator " + op.getIdentifier() + " key:" + keys); for (ExprNodeDesc key : keys) { colLists = Utilities.mergeUniqElems(colLists, key.getCols()); } assert op.getNumChild() == 1; Operator<? extends OperatorDesc> child = op.getChildOperators().get(0); List<String> childCols; if (child instanceof CommonJoinOperator) { childCols = cppCtx.getJoinPrunedColLists().get(child).get((byte) conf.getTag()); } else { childCols = cppCtx.getPrunedColList(child); } List<ExprNodeDesc> valCols = conf.getValueCols(); List<String> valColNames = conf.getOutputValueColumnNames(); if (childCols != null) { boolean[] flags = new boolean[valCols.size()]; for (String childCol : childCols) { int index = valColNames.indexOf(Utilities.removeValueTag(childCol)); if (index < 0) { continue; } flags[index] = true; colLists = Utilities.mergeUniqElems(colLists, valCols.get(index).getCols()); } Collections.sort(colLists); pruneReduceSinkOperator(flags, op, cppCtx); cppCtx.getPrunedColLists().put(op, colLists); return null; } // Reduce Sink contains the columns needed - no need to aggregate from // children for (ExprNodeDesc val : valCols) { colLists = Utilities.mergeUniqElems(colLists, val.getCols()); } cppCtx.getPrunedColLists().put(op, colLists); return null; }
/** * Returns the skewed values in all the tables which are going to be scanned. If the join is on * columns c1, c2 and c3 on tables T1 and T2, T1 is skewed on c1 and c4 with the skew values * ((1,2),(3,4)), whereas T2 is skewed on c1, c2 with skew values ((5,6),(7,8)), the resulting * map would be: <(c1) -> ((1), (3)), (c1,c2) -> ((5,6),(7,8))> * * @param op The join operator being optimized * @param tableScanOpsForJoin table scan operators which are parents of the join operator * @return map<join keys intersection skewedkeys, list of skewed values>. */ private Map<List<ExprNodeDesc>, List<List<String>>> getSkewedValues( Operator<? extends OperatorDesc> op, List<TableScanOperator> tableScanOpsForJoin) { Map<List<ExprNodeDesc>, List<List<String>>> skewDataReturn = new HashMap<List<ExprNodeDesc>, List<List<String>>>(); Map<List<ExprNodeDescEqualityWrapper>, List<List<String>>> skewData = new HashMap<List<ExprNodeDescEqualityWrapper>, List<List<String>>>(); // The join keys are available in the reduceSinkOperators before join for (Operator<? extends OperatorDesc> reduceSinkOp : op.getParentOperators()) { ReduceSinkDesc rsDesc = ((ReduceSinkOperator) reduceSinkOp).getConf(); if (rsDesc.getKeyCols() != null) { Table table = null; // Find the skew information corresponding to the table List<String> skewedColumns = null; List<List<String>> skewedValueList = null; // The join columns which are also skewed List<ExprNodeDescEqualityWrapper> joinKeysSkewedCols = new ArrayList<ExprNodeDescEqualityWrapper>(); // skewed Keys which intersect with join keys List<Integer> positionSkewedKeys = new ArrayList<Integer>(); // Update the joinKeys appropriately. for (ExprNodeDesc keyColDesc : rsDesc.getKeyCols()) { ExprNodeColumnDesc keyCol = null; // If the key column is not a column, then dont apply this optimization. // This will be fixed as part of https://issues.apache.org/jira/browse/HIVE-3445 // for type conversion UDFs. 
if (keyColDesc instanceof ExprNodeColumnDesc) { keyCol = (ExprNodeColumnDesc) keyColDesc; if (table == null) { table = getTable(parseContext, reduceSinkOp, tableScanOpsForJoin); skewedColumns = table == null ? null : table.getSkewedColNames(); // No skew on the table to take care of if ((skewedColumns == null) || (skewedColumns.isEmpty())) { continue; } skewedValueList = table == null ? null : table.getSkewedColValues(); } int pos = skewedColumns.indexOf(keyCol.getColumn()); if ((pos >= 0) && (!positionSkewedKeys.contains(pos))) { positionSkewedKeys.add(pos); ExprNodeColumnDesc keyColClone = (ExprNodeColumnDesc) keyCol.clone(); keyColClone.setTabAlias(null); joinKeysSkewedCols.add(new ExprNodeDescEqualityWrapper(keyColClone)); } } } // If the skew keys match the join keys, then add it to the list if ((skewedColumns != null) && (!skewedColumns.isEmpty())) { if (!joinKeysSkewedCols.isEmpty()) { // If the join keys matches the skewed keys, use the table skewed keys List<List<String>> skewedJoinValues; if (skewedColumns.size() == positionSkewedKeys.size()) { skewedJoinValues = skewedValueList; } else { skewedJoinValues = getSkewedJoinValues(skewedValueList, positionSkewedKeys); } List<List<String>> oldSkewedJoinValues = skewData.get(joinKeysSkewedCols); if (oldSkewedJoinValues == null) { oldSkewedJoinValues = new ArrayList<List<String>>(); } for (List<String> skewValue : skewedJoinValues) { if (!oldSkewedJoinValues.contains(skewValue)) { oldSkewedJoinValues.add(skewValue); } } skewData.put(joinKeysSkewedCols, oldSkewedJoinValues); } } } } // convert skewData to contain ExprNodeDesc in the keys for (Map.Entry<List<ExprNodeDescEqualityWrapper>, List<List<String>>> mapEntry : skewData.entrySet()) { List<ExprNodeDesc> skewedKeyJoinCols = new ArrayList<ExprNodeDesc>(); for (ExprNodeDescEqualityWrapper key : mapEntry.getKey()) { skewedKeyJoinCols.add(key.getExprNodeDesc()); } skewDataReturn.put(skewedKeyJoinCols, mapEntry.getValue()); } return skewDataReturn; }