// Prune unused join columns: delegate to pruneJoinOperator for the matched JoinOperator.
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
    Object... nodeOutputs) throws SemanticException {
  JoinOperator op = (JoinOperator) nd;
  pruneJoinOperator(ctx, op, op.getConf(), op.getColumnExprMap(), null, false);
  return null;
}
/**
 * Find all big tables from STREAMTABLE hints.
 *
 * @param joinCtx the join context
 * @return the set of all big tables
 */
private Set<String> getBigTables(ParseContext joinCtx) {
  Set<String> bigTables = new HashSet<String>();

  for (JoinOperator joinOp : joinCtx.getJoinOps()) {
    if (joinOp.getConf().getStreamAliases() != null) {
      bigTables.addAll(joinOp.getConf().getStreamAliases());
    }
  }

  return bigTables;
}
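// Illustration (not part of the original source): given the hinted query below, alias
// "a" would end up in the set returned by getBigTables, because the STREAMTABLE hint is
// recorded as a stream alias on the join descriptor.
//
//   SELECT /*+ STREAMTABLE(a) */ a.val, b.val FROM a JOIN b ON (a.key = b.key);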
/*
 * Get the list of table scan operators for this join. An interface,
 * supportSkewJoinOptimization, has been provided. Currently, it is only enabled for
 * simple filters and selects.
 */
private boolean getTableScanOpsForJoin(JoinOperator op, List<TableScanOperator> tsOps) {
  for (Operator<? extends OperatorDesc> parent : op.getParentOperators()) {
    if (!getTableScanOps(parent, tsOps)) {
      return false;
    }
  }
  return true;
}
/**
 * Reorder the tables in a join operator appropriately (by reordering the tags of the
 * reduce sinks).
 *
 * @param joinOp the join operator to be processed
 * @param bigTables set of all big tables
 */
private void reorder(JoinOperator joinOp, Set<String> bigTables) {
  int count = joinOp.getParentOperators().size();

  // Find the biggest reduce sink
  int biggestPos = count - 1;
  int biggestSize = getOutputSize(joinOp.getParentOperators().get(biggestPos), bigTables);
  for (int i = 0; i < count - 1; i++) {
    int currSize = getOutputSize(joinOp.getParentOperators().get(i), bigTables);
    if (currSize > biggestSize) {
      biggestSize = currSize;
      biggestPos = i;
    }
  }

  // Reorder tags if need be
  if (biggestPos != (count - 1)) {
    Byte[] tagOrder = joinOp.getConf().getTagOrder();
    Byte temp = tagOrder[biggestPos];
    tagOrder[biggestPos] = tagOrder[count - 1];
    tagOrder[count - 1] = temp;

    // Update tags of reduce sinks
    ((ReduceSinkOperator) joinOp.getParentOperators().get(biggestPos)).getConf()
        .setTag(count - 1);
    ((ReduceSinkOperator) joinOp.getParentOperators().get(count - 1)).getConf()
        .setTag(biggestPos);
  }
}
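// Illustration (not part of the original source): for a three-way join whose biggest
// input sits at position 0, tagOrder [0, 1, 2] becomes [2, 1, 0]; the reduce sink at
// position 0 is retagged to 2 and the one at position 2 to 0, so the biggest table
// becomes the last (streamed) input of the join.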
@Override
public ParseContext transform(ParseContext pctx) throws SemanticException {
  pGraphContext = pctx;

  Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
  opRules.put(new RuleRegExp("R1",
      "(" + FilterOperator.getOperatorName() + "%"
          + ReduceSinkOperator.getOperatorName() + "%"
          + JoinOperator.getOperatorName() + "%)"),
      new JoinTransitive());

  // The dispatcher fires the processor corresponding to the closest matching
  // rule and passes the context along
  TransitiveContext context = new TransitiveContext();
  Dispatcher disp = new DefaultRuleDispatcher(null, opRules, context);
  GraphWalker ogw = new LevelOrderWalker(disp, 2);

  // Create a list of top op nodes
  List<Node> topNodes = new ArrayList<Node>();
  topNodes.addAll(pGraphContext.getTopOps().values());
  ogw.startWalking(topNodes, null);

  Map<ReduceSinkOperator, List<ExprNodeDesc>> newFilters = context.getNewfilters();

  // Insert each new filter between the RS and the parent of the RS
  for (Map.Entry<ReduceSinkOperator, List<ExprNodeDesc>> entry : newFilters.entrySet()) {
    ReduceSinkOperator reducer = entry.getKey();
    Operator<?> parent = reducer.getParentOperators().get(0);

    List<ExprNodeDesc> exprs = entry.getValue();
    if (parent instanceof FilterOperator) {
      exprs = ExprNodeDescUtils.split(((FilterOperator) parent).getConf().getPredicate(), exprs);
      ExprNodeDesc merged = ExprNodeDescUtils.mergePredicates(exprs);
      ((FilterOperator) parent).getConf().setPredicate(merged);
    } else {
      ExprNodeDesc merged = ExprNodeDescUtils.mergePredicates(exprs);
      RowSchema parentRS = parent.getSchema();
      Operator<FilterDesc> newFilter = createFilter(reducer, parent, parentRS, merged);
    }
  }

  return pGraphContext;
}
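// Illustration (not part of the original source): rule R1 matches FIL-RS-JOIN chains.
// For "... FROM a JOIN b ON (a.key = b.key) WHERE a.key > 10", the predicate
// "a.key > 10" can be propagated across the equi-join condition, so a filter
// "b.key > 10" is merged into (or created in front of) the parent of b's reduce sink.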
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
    Object... nodeOutputs) throws SemanticException {
  // We should have a tree that looks like this:
  //  TS -> * -> RS -
  //                  \
  //                   -> JOIN -> ..
  //                  /
  //  TS -> * -> RS -
  //
  // We are at the join operator now.

  SkewJoinOptProcCtx ctx = (SkewJoinOptProcCtx) procCtx;
  parseContext = ctx.getpGraphContext();

  // This join has already been processed
  JoinOperator joinOp = (JoinOperator) nd;
  if (ctx.getDoneJoins().contains(joinOp)) {
    return null;
  }
  ctx.getDoneJoins().add(joinOp);

  Operator<? extends OperatorDesc> currOp = joinOp;
  boolean processSelect = false;

  // Is there a select following the join? Clone the select as well; it is useful for a
  // follow-on optimization where the union followed by a select star is completely
  // removed.
  if ((joinOp.getChildOperators().size() == 1)
      && (joinOp.getChildOperators().get(0) instanceof SelectOperator)) {
    currOp = joinOp.getChildOperators().get(0);
    processSelect = true;
  }

  List<TableScanOperator> tableScanOpsForJoin = new ArrayList<TableScanOperator>();
  if (!getTableScanOpsForJoin(joinOp, tableScanOpsForJoin)) {
    return null;
  }

  if ((tableScanOpsForJoin == null) || (tableScanOpsForJoin.isEmpty())) {
    return null;
  }

  // Get the skewed values in all the tables
  Map<List<ExprNodeDesc>, List<List<String>>> skewedValues =
      getSkewedValues(joinOp, tableScanOpsForJoin);

  // If there are no skewed values, nothing needs to be done
  if (skewedValues == null || skewedValues.size() == 0) {
    return null;
  }

  // After this optimization, the tree should look like:
  //  TS -> (FIL "skewed rows") * -> RS -
  //                                      \
  //                                       -> JOIN
  //                                      /        \
  //  TS -> (FIL "skewed rows") * -> RS -           \
  //                                                 \
  //                                                  -> UNION -> ..
  //                                                 /
  //  TS -> (FIL "no skewed rows") * -> RS -        /
  //                                         \     /
  //                                          -> JOIN
  //                                         /
  //  TS -> (FIL "no skewed rows") * -> RS -

  // Create a clone of the operator tree
  Operator<? extends OperatorDesc> currOpClone;
  try {
    currOpClone = currOp.clone();
    insertRowResolvers(currOp, currOpClone, ctx);
  } catch (CloneNotSupportedException e) {
    LOG.debug("Operator tree could not be cloned");
    return null;
  }

  JoinOperator joinOpClone;
  if (processSelect) {
    joinOpClone = (JoinOperator) (currOpClone.getParentOperators().get(0));
  } else {
    joinOpClone = (JoinOperator) currOpClone;
  }
  joinOpClone.getConf().cloneQBJoinTreeProps(joinOp.getConf());
  parseContext.getJoinOps().add(joinOpClone);

  List<TableScanOperator> tableScanCloneOpsForJoin = new ArrayList<TableScanOperator>();
  if (!getTableScanOpsForJoin(joinOpClone, tableScanCloneOpsForJoin)) {
    LOG.debug("Operator tree not properly cloned!");
    return null;
  }

  // Put the filter "skewed column = skewed keys" in the original tree
  // and "skewed columns != skewed keys" in the clone
  insertSkewFilter(tableScanOpsForJoin, skewedValues, true);
  insertSkewFilter(tableScanCloneOpsForJoin, skewedValues, false);

  // Update the topOps appropriately
  Map<String, Operator<? extends OperatorDesc>> topOps = getTopOps(joinOpClone);
  Map<String, Operator<? extends OperatorDesc>> origTopOps = parseContext.getTopOps();

  for (Entry<String, Operator<? extends OperatorDesc>> topOp : topOps.entrySet()) {
    TableScanOperator tso = (TableScanOperator) topOp.getValue();
    String tabAlias = tso.getConf().getAlias();
    int initCnt = 1;
    String newAlias = "subquery" + initCnt + ":" + tabAlias;
    while (origTopOps.containsKey(newAlias)) {
      initCnt++;
      newAlias = "subquery" + initCnt + ":" + tabAlias;
    }

    parseContext.getTopOps().put(newAlias, tso);
    setUpAlias(joinOp, joinOpClone, tabAlias, newAlias, tso);
  }

  // Now do a union of the two operators: currOp and currOpClone.
  // Store the operators that follow the select after the join; we will be
  // adding these as children to the union later.
  List<Operator<? extends OperatorDesc>> finalOps = currOp.getChildOperators();
  currOp.setChildOperators(null);
  currOpClone.setChildOperators(null);

  // Make the union operator
  List<Operator<? extends OperatorDesc>> oplist =
      new ArrayList<Operator<? extends OperatorDesc>>();
  oplist.add(currOp);
  oplist.add(currOpClone);
  Operator<? extends OperatorDesc> unionOp = OperatorFactory.getAndMakeChild(
      new UnionDesc(), new RowSchema(currOp.getSchema().getSignature()), oplist);

  // Introduce a select after the union
  List<Operator<? extends OperatorDesc>> unionList =
      new ArrayList<Operator<? extends OperatorDesc>>();
  unionList.add(unionOp);
  Operator<? extends OperatorDesc> selectUnionOp = OperatorFactory.getAndMakeChild(
      new SelectDesc(true), new RowSchema(unionOp.getSchema().getSignature()), unionList);

  // Add the final operators after the union
  selectUnionOp.setChildOperators(finalOps);

  // Replace the original currOp in the parents of the final operators with selectUnionOp
  for (Operator<? extends OperatorDesc> finalOp : finalOps) {
    finalOp.replaceParent(currOp, selectUnionOp);
  }
  return null;
}
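// Illustration (not part of the original source): if a.key is declared skewed on the
// value '1', insertSkewFilter places a predicate of the form (key = '1') on top of the
// table scans feeding the original join and (key <> '1') on top of the cloned ones, so
// the two branches of the union process disjoint subsets of rows.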
/** Set the alias in the cloned join tree. */
private static void setUpAlias(JoinOperator origin, JoinOperator cloned, String origAlias,
    String newAlias, Operator<? extends OperatorDesc> topOp) {
  cloned.getConf().getAliasToOpInfo().remove(origAlias);
  cloned.getConf().getAliasToOpInfo().put(newAlias, topOp);
  if (origin.getConf().getLeftAlias().equals(origAlias)) {
    cloned.getConf().setLeftAlias(null);
    cloned.getConf().setLeftAlias(newAlias);
  }
  replaceAlias(origin.getConf().getLeftAliases(),
      cloned.getConf().getLeftAliases(), origAlias, newAlias);
  replaceAlias(origin.getConf().getRightAliases(),
      cloned.getConf().getRightAliases(), origAlias, newAlias);
  replaceAlias(origin.getConf().getBaseSrc(),
      cloned.getConf().getBaseSrc(), origAlias, newAlias);
  replaceAlias(origin.getConf().getMapAliases(),
      cloned.getConf().getMapAliases(), origAlias, newAlias);
  replaceAlias(origin.getConf().getStreamAliases(),
      cloned.getConf().getStreamAliases(), origAlias, newAlias);
}
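// Illustration (not part of the original source): when the clone of table scan "a" is
// registered under "subquery1:a" (see the alias generation above), setUpAlias rewrites
// every occurrence of "a" in the cloned join tree's alias lists to "subquery1:a".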
/**
 * If two reduce sink operators share the same partition/sort columns and order, they
 * can be merged. This should happen after map join optimization because map join
 * optimization will remove reduce sink operators.
 *
 * <p>This optimizer removes/replaces the child RS (not the parent), which is the safer
 * way for the DefaultGraphWalker.
 */
public class ReduceSinkDeDuplication extends Transform {

  private static final String RS = ReduceSinkOperator.getOperatorName();
  private static final String GBY = GroupByOperator.getOperatorName();
  private static final String JOIN = JoinOperator.getOperatorName();

  protected ParseContext pGraphContext;

  @Override
  public ParseContext transform(ParseContext pctx) throws SemanticException {
    pGraphContext = pctx;

    // Build the deduplication processor context
    ReduceSinkDeduplicateProcCtx cppCtx = new ReduceSinkDeduplicateProcCtx(pGraphContext);

    // For auto-converted map joins, it is not safe to dedup here (TODO)
    boolean mergeJoins = !pctx.getConf().getBoolVar(HIVECONVERTJOIN)
        && !pctx.getConf().getBoolVar(HIVECONVERTJOINNOCONDITIONALTASK)
        && !pctx.getConf().getBoolVar(ConfVars.HIVE_CONVERT_JOIN_BUCKET_MAPJOIN_TEZ)
        && !pctx.getConf().getBoolVar(ConfVars.HIVEDYNAMICPARTITIONHASHJOIN);

    // If multiple rules can be matched with the same cost, the last rule will be chosen
    // as the processor; see DefaultRuleDispatcher#dispatch()
    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
    opRules.put(new RuleRegExp("R1", RS + "%.*%" + RS + "%"),
        ReduceSinkDeduplicateProcFactory.getReducerReducerProc());
    opRules.put(new RuleRegExp("R2", RS + "%" + GBY + "%.*%" + RS + "%"),
        ReduceSinkDeduplicateProcFactory.getGroupbyReducerProc());
    if (mergeJoins) {
      opRules.put(new RuleRegExp("R3", JOIN + "%.*%" + RS + "%"),
          ReduceSinkDeduplicateProcFactory.getJoinReducerProc());
    }
    // TODO RS+JOIN

    // The dispatcher fires the processor corresponding to the closest matching
    // rule and passes the context along
    Dispatcher disp = new DefaultRuleDispatcher(
        ReduceSinkDeduplicateProcFactory.getDefaultProc(), opRules, cppCtx);
    GraphWalker ogw = new DefaultGraphWalker(disp);

    // Create a list of top op nodes
    ArrayList<Node> topNodes = new ArrayList<Node>();
    topNodes.addAll(pGraphContext.getTopOps().values());
    ogw.startWalking(topNodes, null);

    return pGraphContext;
  }

  protected class ReduceSinkDeduplicateProcCtx extends AbstractCorrelationProcCtx {
    public ReduceSinkDeduplicateProcCtx(ParseContext pctx) {
      super(pctx);
    }
  }

  static class ReduceSinkDeduplicateProcFactory {

    public static NodeProcessor getReducerReducerProc() {
      return new ReducerReducerProc();
    }

    public static NodeProcessor getGroupbyReducerProc() {
      return new GroupbyReducerProc();
    }

    public static NodeProcessor getJoinReducerProc() {
      return new JoinReducerProc();
    }

    public static NodeProcessor getDefaultProc() {
      return new DefaultProc();
    }
  }

  /*
   * Do nothing.
   */
  static class DefaultProc implements NodeProcessor {
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      return null;
    }
  }

  public abstract static class AbstractReducerReducerProc implements NodeProcessor {

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      ReduceSinkDeduplicateProcCtx dedupCtx = (ReduceSinkDeduplicateProcCtx) procCtx;
      if (dedupCtx.hasBeenRemoved((Operator<?>) nd)) {
        return false;
      }
      ReduceSinkOperator cRS = (ReduceSinkOperator) nd;
      Operator<?> child = CorrelationUtilities.getSingleChild(cRS);
      if (child instanceof JoinOperator) {
        return false; // not supported
      }
      if (child instanceof GroupByOperator) {
        GroupByOperator cGBY = (GroupByOperator) child;
        if (!CorrelationUtilities.hasGroupingSet(cRS)
            && !cGBY.getConf().isGroupingSetsPresent()) {
          return process(cRS, cGBY, dedupCtx);
        }
        return false;
      }
      if (child instanceof SelectOperator) {
        return process(cRS, dedupCtx);
      }
      return false;
    }

    protected abstract Object process(ReduceSinkOperator cRS,
        ReduceSinkDeduplicateProcCtx dedupCtx) throws SemanticException;

    protected abstract Object process(ReduceSinkOperator cRS, GroupByOperator cGBY,
        ReduceSinkDeduplicateProcCtx dedupCtx) throws SemanticException;

    // For the JOIN-RS case, it is generally not possible to merge if the child has
    // fewer key/partition columns than the parents
    protected boolean merge(ReduceSinkOperator cRS, JoinOperator pJoin, int minReducer)
        throws SemanticException {
      List<Operator<?>> parents = pJoin.getParentOperators();
      ReduceSinkOperator[] pRSs = parents.toArray(new ReduceSinkOperator[parents.size()]);
      ReduceSinkDesc cRSc = cRS.getConf();
      ReduceSinkDesc pRS0c = pRSs[0].getConf();
      if (cRSc.getKeyCols().size() < pRS0c.getKeyCols().size()) {
        return false;
      }
      if (cRSc.getPartitionCols().size() != pRS0c.getPartitionCols().size()) {
        return false;
      }
      Integer moveReducerNumTo = checkNumReducer(cRSc.getNumReducers(), pRS0c.getNumReducers());
      if (moveReducerNumTo == null
          || moveReducerNumTo > 0 && cRSc.getNumReducers() < minReducer) {
        return false;
      }
      Integer moveRSOrderTo = checkOrder(cRSc.getOrder(), pRS0c.getOrder());
      if (moveRSOrderTo == null) {
        return false;
      }
      boolean[] sorted = CorrelationUtilities.getSortedTags(pJoin);
      int cKeySize = cRSc.getKeyCols().size();
      for (int i = 0; i < cKeySize; i++) {
        ExprNodeDesc cexpr = cRSc.getKeyCols().get(i);
        ExprNodeDesc[] pexprs = new ExprNodeDesc[pRSs.length];
        for (int tag = 0; tag < pRSs.length; tag++) {
          pexprs[tag] = pRSs[tag].getConf().getKeyCols().get(i);
        }
        int found = CorrelationUtilities.indexOf(cexpr, pexprs, cRS, pRSs, sorted);
        if (found != i) {
          return false;
        }
      }
      int cPartSize = cRSc.getPartitionCols().size();
      for (int i = 0; i < cPartSize; i++) {
        ExprNodeDesc cexpr = cRSc.getPartitionCols().get(i);
        ExprNodeDesc[] pexprs = new ExprNodeDesc[pRSs.length];
        for (int tag = 0; tag < pRSs.length; tag++) {
          pexprs[tag] = pRSs[tag].getConf().getPartitionCols().get(i);
        }
        int found = CorrelationUtilities.indexOf(cexpr, pexprs, cRS, pRSs, sorted);
        if (found != i) {
          return false;
        }
      }
      if (moveReducerNumTo > 0) {
        for (ReduceSinkOperator pRS : pRSs) {
          pRS.getConf().setNumReducers(cRS.getConf().getNumReducers());
        }
      }
      return true;
    }

    /**
     * The current RSDedup removes/replaces the child RS. For key columns, sorting
     * order, and the number of reducers, copy the more specific part of the child RS's
     * configuration to the parent RS. For partitioning columns, if both the child RS
     * and the parent RS have been assigned partitioning columns, choose the more
     * general ones. If the parent RS has not been assigned any partitioning column,
     * use the partitioning columns (if they exist) of the child RS.
     */
    protected boolean merge(ReduceSinkOperator cRS, ReduceSinkOperator pRS, int minReducer)
        throws SemanticException {
      int[] result = checkStatus(cRS, pRS, minReducer);
      if (result == null) {
        return false;
      }

      if (result[0] > 0) {
        // The sorting columns of the child RS are more specific than those of the
        // parent RS. Assign the sorting columns of the child RS to the parent RS.
        List<ExprNodeDesc> childKCs = cRS.getConf().getKeyCols();
        pRS.getConf().setKeyCols(ExprNodeDescUtils.backtrack(childKCs, cRS, pRS));
      }

      if (result[1] < 0) {
        // The partitioning columns of the parent RS are more specific than those of
        // the child RS.
        List<ExprNodeDesc> childPCs = cRS.getConf().getPartitionCols();
        if (childPCs != null && !childPCs.isEmpty()) {
          // If the partitioning columns of the child RS are assigned,
          // assign them to the parent RS.
          pRS.getConf().setPartitionCols(ExprNodeDescUtils.backtrack(childPCs, cRS, pRS));
        }
      } else if (result[1] > 0) {
        // The partitioning columns of the child RS are more specific than those of
        // the parent RS.
        List<ExprNodeDesc> parentPCs = pRS.getConf().getPartitionCols();
        if (parentPCs == null || parentPCs.isEmpty()) {
          // If the partitioning columns of the parent RS are not assigned,
          // assign the partitioning columns of the child RS to the parent RS.
          ArrayList<ExprNodeDesc> childPCs = cRS.getConf().getPartitionCols();
          pRS.getConf().setPartitionCols(ExprNodeDescUtils.backtrack(childPCs, cRS, pRS));
        }
      }

      if (result[2] > 0) {
        // The sorting order of the child RS is more specific than that of the parent
        // RS. Assign the sorting order of the child RS to the parent RS.
        if (result[0] <= 0) {
          // The sorting columns of the parent RS are more specific than those of the
          // child RS, but the sorting order of the child RS is more specific than
          // that of the parent RS.
          throw new SemanticException("Sorting columns and order don't match. "
              + "Try set " + HiveConf.ConfVars.HIVEOPTREDUCEDEDUPLICATION + "=false;");
        }
        pRS.getConf().setOrder(cRS.getConf().getOrder());
      }

      if (result[3] > 0) {
        // The number of reducers of the child RS is more specific than that of the
        // parent RS. Assign it to the parent RS.
        pRS.getConf().setNumReducers(cRS.getConf().getNumReducers());
      }

      if (result[4] > 0) {
        // This case happens only when the pRS key is empty, in which case we can use
        // the number of distribution keys and the key serialization info from the cRS.
        pRS.getConf().setNumDistributionKeys(cRS.getConf().getNumDistributionKeys());
        List<FieldSchema> fields = PlanUtils.getFieldSchemasFromColumnList(
            pRS.getConf().getKeyCols(), "reducesinkkey");
        TableDesc keyTable = PlanUtils.getReduceKeyTableDesc(fields, pRS.getConf().getOrder());
        ArrayList<String> outputKeyCols = Lists.newArrayList();
        for (int i = 0; i < fields.size(); i++) {
          outputKeyCols.add(fields.get(i).getName());
        }
        pRS.getConf().setOutputKeyColumnNames(outputKeyCols);
        pRS.getConf().setKeySerializeInfo(keyTable);
      }
      return true;
    }

    /**
     * Returns merge directions between two RSs for each criterion (ordering, number of
     * reducers, reducer keys, partition keys). Returns null if any category is not
     * mergeable.
     *
     * <p>The value at each index can be -1, 0, or 1:
     * <ol>
     *   <li>0 means the two configurations in the category are the same;</li>
     *   <li>-1 means the configuration of the parent RS is more specific than that of
     *       the child RS;</li>
     *   <li>1 means the configuration of the child RS is more specific than that of
     *       the parent RS.</li>
     * </ol>
     */
    private int[] checkStatus(ReduceSinkOperator cRS, ReduceSinkOperator pRS, int minReducer)
        throws SemanticException {
      ReduceSinkDesc cConf = cRS.getConf();
      ReduceSinkDesc pConf = pRS.getConf();
      Integer moveRSOrderTo = checkOrder(cConf.getOrder(), pConf.getOrder());
      if (moveRSOrderTo == null) {
        return null;
      }
      Integer moveReducerNumTo = checkNumReducer(cConf.getNumReducers(), pConf.getNumReducers());
      if (moveReducerNumTo == null
          || moveReducerNumTo > 0 && cConf.getNumReducers() < minReducer) {
        return null;
      }
      List<ExprNodeDesc> ckeys = cConf.getKeyCols();
      List<ExprNodeDesc> pkeys = pConf.getKeyCols();
      Integer moveKeyColTo = checkExprs(ckeys, pkeys, cRS, pRS);
      if (moveKeyColTo == null) {
        return null;
      }
      List<ExprNodeDesc> cpars = cConf.getPartitionCols();
      List<ExprNodeDesc> ppars = pConf.getPartitionCols();
      Integer movePartitionColTo = checkExprs(cpars, ppars, cRS, pRS);
      if (movePartitionColTo == null) {
        return null;
      }
      Integer moveNumDistKeyTo = checkNumDistributionKey(
          cConf.getNumDistributionKeys(), pConf.getNumDistributionKeys());
      return new int[] {moveKeyColTo, movePartitionColTo, moveRSOrderTo,
          moveReducerNumTo, moveNumDistKeyTo};
    }

    private Integer checkNumDistributionKey(int cnd, int pnd) {
      // The number of distribution keys of the cRS is chosen only when the numDistKeys
      // of the pRS is 0 or less. In all other cases, distribution of the keys is based
      // on the pRS, which is more generic than the cRS.
      // Examples:
      // Case 1: if the pRS sort key is (a, b) and the cRS sort key is (a, b, c), with
      // 2 and 3 distribution keys respectively, then after the merge the sort key will
      // be (a, b, c) while the number of distribution keys will be 2.
      // Case 2: if the pRS sort key is empty with 0 distribution keys, and the cRS
      // sort key is (a, b) with 2 distribution keys, then after the merge the new sort
      // key will be (a, b) and the number of distribution keys will be 2.
      if (pnd <= 0) {
        return 1;
      }
      return 0;
    }

    /**
     * The overlapping part of the keys should be the same between parent and child.
     * And if the child has more keys than the parent, the non-overlapping part of the
     * keys should be backtrackable to the parent.
     */
    private Integer checkExprs(List<ExprNodeDesc> ckeys, List<ExprNodeDesc> pkeys,
        ReduceSinkOperator cRS, ReduceSinkOperator pRS) throws SemanticException {
      Integer moveKeyColTo = 0;
      if (ckeys == null || ckeys.isEmpty()) {
        if (pkeys != null && !pkeys.isEmpty()) {
          moveKeyColTo = -1;
        }
      } else {
        if (pkeys == null || pkeys.isEmpty()) {
          for (ExprNodeDesc ckey : ckeys) {
            if (ExprNodeDescUtils.backtrack(ckey, cRS, pRS) == null) {
              // cKey is not present in the parent
              return null;
            }
          }
          moveKeyColTo = 1;
        } else {
          moveKeyColTo = sameKeys(ckeys, pkeys, cRS, pRS);
        }
      }
      return moveKeyColTo;
    }

    // Backtrack the key exprs of the child to the parent and compare them with the
    // parent's own
    protected Integer sameKeys(List<ExprNodeDesc> cexprs, List<ExprNodeDesc> pexprs,
        Operator<?> child, Operator<?> parent) throws SemanticException {
      int common = Math.min(cexprs.size(), pexprs.size());
      int limit = Math.max(cexprs.size(), pexprs.size());
      int i = 0;
      for (; i < common; i++) {
        ExprNodeDesc pexpr = pexprs.get(i);
        ExprNodeDesc cexpr = ExprNodeDescUtils.backtrack(cexprs.get(i), child, parent);
        if (cexpr == null || !pexpr.isSame(cexpr)) {
          return null;
        }
      }
      for (; i < limit; i++) {
        if (cexprs.size() > pexprs.size()) {
          if (ExprNodeDescUtils.backtrack(cexprs.get(i), child, parent) == null) {
            // cKey is not present in the parent
            return null;
          }
        }
      }
      return Integer.valueOf(cexprs.size()).compareTo(pexprs.size());
    }

    // The order of the overlapping keys should be exactly the same
    protected Integer checkOrder(String corder, String porder) {
      if (corder == null || corder.trim().equals("")) {
        if (porder == null || porder.trim().equals("")) {
          return 0;
        }
        return -1;
      }
      if (porder == null || porder.trim().equals("")) {
        return 1;
      }
      corder = corder.trim();
      porder = porder.trim();
      int target = Math.min(corder.length(), porder.length());
      if (!corder.substring(0, target).equals(porder.substring(0, target))) {
        return null;
      }
      return Integer.valueOf(corder.length()).compareTo(porder.length());
    }

    /**
     * If the number of reducers for an RS is -1, the RS can have any number of
     * reducers. This is generally true, except for order-by and forced-bucketing
     * cases. If neither number of reducers is -1, the two numbers should be the same.
     */
    protected Integer checkNumReducer(int creduce, int preduce) {
      if (creduce < 0) {
        if (preduce < 0) {
          return 0;
        }
        return -1;
      }
      if (preduce < 0) {
        return 1;
      }
      if (creduce != preduce) {
        return null;
      }
      return 0;
    }
  }

  static class GroupbyReducerProc extends AbstractReducerReducerProc {

    // pRS-pGBY-cRS
    @Override
    public Object process(ReduceSinkOperator cRS, ReduceSinkDeduplicateProcCtx dedupCtx)
        throws SemanticException {
      GroupByOperator pGBY = CorrelationUtilities.findPossibleParent(
          cRS, GroupByOperator.class, dedupCtx.trustScript());
      if (pGBY == null) {
        return false;
      }
      ReduceSinkOperator pRS = CorrelationUtilities.findPossibleParent(
          pGBY, ReduceSinkOperator.class, dedupCtx.trustScript());
      if (pRS != null && merge(cRS, pRS, dedupCtx.minReducer())) {
        CorrelationUtilities.replaceReduceSinkWithSelectOperator(
            cRS, dedupCtx.getPctx(), dedupCtx);
        pRS.getConf().setDeduplicated(true);
        return true;
      }
      return false;
    }

    // pRS-pGBY-cRS-cGBY
    @Override
    public Object process(ReduceSinkOperator cRS, GroupByOperator cGBY,
        ReduceSinkDeduplicateProcCtx dedupCtx) throws SemanticException {
      Operator<?> start = CorrelationUtilities.getStartForGroupBy(cRS, dedupCtx);
      GroupByOperator pGBY = CorrelationUtilities.findPossibleParent(
          start, GroupByOperator.class, dedupCtx.trustScript());
      if (pGBY == null) {
        return false;
      }
      ReduceSinkOperator pRS =
          CorrelationUtilities.getSingleParent(pGBY, ReduceSinkOperator.class);
      if (pRS != null && merge(cRS, pRS, dedupCtx.minReducer())) {
        CorrelationUtilities.removeReduceSinkForGroupBy(
            cRS, cGBY, dedupCtx.getPctx(), dedupCtx);
        pRS.getConf().setDeduplicated(true);
        return true;
      }
      return false;
    }
  }

  static class JoinReducerProc extends AbstractReducerReducerProc {

    // pRS-pJOIN-cRS
    @Override
    public Object process(ReduceSinkOperator cRS, ReduceSinkDeduplicateProcCtx dedupCtx)
        throws SemanticException {
      JoinOperator pJoin = CorrelationUtilities.findPossibleParent(
          cRS, JoinOperator.class, dedupCtx.trustScript());
      if (pJoin != null && merge(cRS, pJoin, dedupCtx.minReducer())) {
        pJoin.getConf().setFixedAsSorted(true);
        CorrelationUtilities.replaceReduceSinkWithSelectOperator(
            cRS, dedupCtx.getPctx(), dedupCtx);
        ReduceSinkOperator pRS = CorrelationUtilities.findPossibleParent(
            pJoin, ReduceSinkOperator.class, dedupCtx.trustScript());
        if (pRS != null) {
          pRS.getConf().setDeduplicated(true);
        }
        return true;
      }
      return false;
    }

    // pRS-pJOIN-cRS-cGBY
    @Override
    public Object process(ReduceSinkOperator cRS, GroupByOperator cGBY,
        ReduceSinkDeduplicateProcCtx dedupCtx) throws SemanticException {
      Operator<?> start = CorrelationUtilities.getStartForGroupBy(cRS, dedupCtx);
      JoinOperator pJoin = CorrelationUtilities.findPossibleParent(
          start, JoinOperator.class, dedupCtx.trustScript());
      if (pJoin != null && merge(cRS, pJoin, dedupCtx.minReducer())) {
        pJoin.getConf().setFixedAsSorted(true);
        CorrelationUtilities.removeReduceSinkForGroupBy(
            cRS, cGBY, dedupCtx.getPctx(), dedupCtx);
        ReduceSinkOperator pRS = CorrelationUtilities.findPossibleParent(
            pJoin, ReduceSinkOperator.class, dedupCtx.trustScript());
        if (pRS != null) {
          pRS.getConf().setDeduplicated(true);
        }
        return true;
      }
      return false;
    }
  }

  static class ReducerReducerProc extends AbstractReducerReducerProc {

    // pRS-cRS
    @Override
    public Object process(ReduceSinkOperator cRS, ReduceSinkDeduplicateProcCtx dedupCtx)
        throws SemanticException {
      ReduceSinkOperator pRS = CorrelationUtilities.findPossibleParent(
          cRS, ReduceSinkOperator.class, dedupCtx.trustScript());
      if (pRS != null && merge(cRS, pRS, dedupCtx.minReducer())) {
        CorrelationUtilities.replaceReduceSinkWithSelectOperator(
            cRS, dedupCtx.getPctx(), dedupCtx);
        pRS.getConf().setDeduplicated(true);
        return true;
      }
      return false;
    }

    // pRS-cRS-cGBY
    @Override
    public Object process(ReduceSinkOperator cRS, GroupByOperator cGBY,
        ReduceSinkDeduplicateProcCtx dedupCtx) throws SemanticException {
      Operator<?> start = CorrelationUtilities.getStartForGroupBy(cRS, dedupCtx);
      ReduceSinkOperator pRS = CorrelationUtilities.findPossibleParent(
          start, ReduceSinkOperator.class, dedupCtx.trustScript());
      if (pRS != null && merge(cRS, pRS, dedupCtx.minReducer())) {
        if (dedupCtx.getPctx().getConf().getBoolVar(HiveConf.ConfVars.HIVEGROUPBYSKEW)) {
          return false;
        }
        CorrelationUtilities.removeReduceSinkForGroupBy(
            cRS, cGBY, dedupCtx.getPctx(), dedupCtx);
        pRS.getConf().setDeduplicated(true);
        return true;
      }
      return false;
    }
  }
}
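// Usage sketch (an assumption based on how Hive wires up optimizer transforms; not
// part of this excerpt): the transform is registered and applied against the parse
// context like any other, guarded by hive.optimize.reducededuplication:
//
//   if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTREDUCEDEDUPLICATION)) {
//     transformations.add(new ReduceSinkDeDuplication());
//   }
//   for (Transform t : transformations) {
//     pctx = t.transform(pctx);
//   }
//
// Illustration of the merge contract above: for a pRS-cRS pair with parent sort key
// (a, b), order "++", and child sort key (a, b, c), order "+++", checkOrder returns 1
// and the child's more specific keys/order are copied up to the parent, after which
// the child reduce sink is replaced with a select.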