/** * Returns the skewed values in all the tables which are going to be scanned. If the join is on * columns c1, c2 and c3 on tables T1 and T2, T1 is skewed on c1 and c4 with the skew values * ((1,2),(3,4)), whereas T2 is skewed on c1, c2 with skew values ((5,6),(7,8)), the resulting * map would be: <(c1) -> ((1), (3)), (c1,c2) -> ((5,6),(7,8))> * * @param op The join operator being optimized * @param tableScanOpsForJoin table scan operators which are parents of the join operator * @return map<join keys intersection skewedkeys, list of skewed values>. */ private Map<List<ExprNodeDesc>, List<List<String>>> getSkewedValues( Operator<? extends OperatorDesc> op, List<TableScanOperator> tableScanOpsForJoin) { Map<List<ExprNodeDesc>, List<List<String>>> skewDataReturn = new HashMap<List<ExprNodeDesc>, List<List<String>>>(); Map<List<ExprNodeDescEqualityWrapper>, List<List<String>>> skewData = new HashMap<List<ExprNodeDescEqualityWrapper>, List<List<String>>>(); // The join keys are available in the reduceSinkOperators before join for (Operator<? extends OperatorDesc> reduceSinkOp : op.getParentOperators()) { ReduceSinkDesc rsDesc = ((ReduceSinkOperator) reduceSinkOp).getConf(); if (rsDesc.getKeyCols() != null) { Table table = null; // Find the skew information corresponding to the table List<String> skewedColumns = null; List<List<String>> skewedValueList = null; // The join columns which are also skewed List<ExprNodeDescEqualityWrapper> joinKeysSkewedCols = new ArrayList<ExprNodeDescEqualityWrapper>(); // skewed Keys which intersect with join keys List<Integer> positionSkewedKeys = new ArrayList<Integer>(); // Update the joinKeys appropriately. for (ExprNodeDesc keyColDesc : rsDesc.getKeyCols()) { ExprNodeColumnDesc keyCol = null; // If the key column is not a column, then dont apply this optimization. // This will be fixed as part of https://issues.apache.org/jira/browse/HIVE-3445 // for type conversion UDFs. if (keyColDesc instanceof ExprNodeColumnDesc) { keyCol = (ExprNodeColumnDesc) keyColDesc; if (table == null) { table = getTable(parseContext, reduceSinkOp, tableScanOpsForJoin); skewedColumns = table == null ? null : table.getSkewedColNames(); // No skew on the table to take care of if ((skewedColumns == null) || (skewedColumns.isEmpty())) { continue; } skewedValueList = table == null ? null : table.getSkewedColValues(); } int pos = skewedColumns.indexOf(keyCol.getColumn()); if ((pos >= 0) && (!positionSkewedKeys.contains(pos))) { positionSkewedKeys.add(pos); ExprNodeColumnDesc keyColClone = (ExprNodeColumnDesc) keyCol.clone(); keyColClone.setTabAlias(null); joinKeysSkewedCols.add(new ExprNodeDescEqualityWrapper(keyColClone)); } } } // If the skew keys match the join keys, then add it to the list if ((skewedColumns != null) && (!skewedColumns.isEmpty())) { if (!joinKeysSkewedCols.isEmpty()) { // If the join keys matches the skewed keys, use the table skewed keys List<List<String>> skewedJoinValues; if (skewedColumns.size() == positionSkewedKeys.size()) { skewedJoinValues = skewedValueList; } else { skewedJoinValues = getSkewedJoinValues(skewedValueList, positionSkewedKeys); } List<List<String>> oldSkewedJoinValues = skewData.get(joinKeysSkewedCols); if (oldSkewedJoinValues == null) { oldSkewedJoinValues = new ArrayList<List<String>>(); } for (List<String> skewValue : skewedJoinValues) { if (!oldSkewedJoinValues.contains(skewValue)) { oldSkewedJoinValues.add(skewValue); } } skewData.put(joinKeysSkewedCols, oldSkewedJoinValues); } } } } // convert skewData to contain ExprNodeDesc in the keys for (Map.Entry<List<ExprNodeDescEqualityWrapper>, List<List<String>>> mapEntry : skewData.entrySet()) { List<ExprNodeDesc> skewedKeyJoinCols = new ArrayList<ExprNodeDesc>(); for (ExprNodeDescEqualityWrapper key : mapEntry.getKey()) { skewedKeyJoinCols.add(key.getExprNodeDesc()); } skewDataReturn.put(skewedKeyJoinCols, mapEntry.getValue()); } return skewDataReturn; }