/**
 * Collects the stream aliases of every join operator in the given context. These aliases
 * originate from STREAMTABLE hints and identify the "big" tables.
 *
 * @param joinCtx The join context whose join operators are inspected
 * @return A newly created set holding every stream alias found; empty when no join operator
 *     declares any
 */
private Set<String> getBigTables(ParseContext joinCtx) {
  Set<String> streamed = new HashSet<String>();

  for (JoinOperator join : joinCtx.getJoinOps()) {
    // Joins without stream aliases contribute nothing.
    if (join.getConf().getStreamAliases() == null) {
      continue;
    }
    streamed.addAll(join.getConf().getStreamAliases());
  }

  return streamed;
}
/** * Reorder the tables in a join operator appropriately (by reordering the tags of the reduces * sinks). * * @param joinOp The join operator to be processed * @param bigTables Set of all big tables */ private void reorder(JoinOperator joinOp, Set<String> bigTables) { int count = joinOp.getParentOperators().size(); // Find the biggest reduce sink int biggestPos = count - 1; int biggestSize = getOutputSize(joinOp.getParentOperators().get(biggestPos), bigTables); for (int i = 0; i < count - 1; i++) { int currSize = getOutputSize(joinOp.getParentOperators().get(i), bigTables); if (currSize > biggestSize) { biggestSize = currSize; biggestPos = i; } } // Reorder tags if need be if (biggestPos != (count - 1)) { Byte[] tagOrder = joinOp.getConf().getTagOrder(); Byte temp = tagOrder[biggestPos]; tagOrder[biggestPos] = tagOrder[count - 1]; tagOrder[count - 1] = temp; // Update tags of reduce sinks ((ReduceSinkOperator) joinOp.getParentOperators().get(biggestPos)) .getConf() .setTag(count - 1); ((ReduceSinkOperator) joinOp.getParentOperators().get(count - 1)) .getConf() .setTag(biggestPos); } }
/**
 * Handles a join node by delegating to {@code pruneJoinOperator} with the join's own
 * descriptor and column expression map; the last two arguments are fixed to {@code null} and
 * {@code false}.
 *
 * @param nd the node being visited; it is cast to {@link JoinOperator}
 * @param stack the walker's node stack (not used here)
 * @param ctx the processor context, forwarded to {@code pruneJoinOperator}
 * @param nodeOutputs outputs of previously processed nodes (not used here)
 * @return always {@code null}
 * @throws SemanticException declared by the processor contract
 */
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx, Object... nodeOutputs)
    throws SemanticException {
  final JoinOperator joinOp = (JoinOperator) nd;
  pruneJoinOperator(
      ctx,
      joinOp,
      joinOp.getConf(),
      joinOp.getColumnExprMap(),
      null,
      false);
  return null;
}
// pRS-pJOIN-cRS @Override public Object process(ReduceSinkOperator cRS, ReduceSinkDeduplicateProcCtx dedupCtx) throws SemanticException { JoinOperator pJoin = CorrelationUtilities.findPossibleParent(cRS, JoinOperator.class, dedupCtx.trustScript()); if (pJoin != null && merge(cRS, pJoin, dedupCtx.minReducer())) { pJoin.getConf().setFixedAsSorted(true); CorrelationUtilities.replaceReduceSinkWithSelectOperator(cRS, dedupCtx.getPctx(), dedupCtx); ReduceSinkOperator pRS = CorrelationUtilities.findPossibleParent( pJoin, ReduceSinkOperator.class, dedupCtx.trustScript()); if (pRS != null) { pRS.getConf().setDeduplicated(true); } return true; } return false; }
// pRS-pJOIN-cRS-cGBY @Override public Object process( ReduceSinkOperator cRS, GroupByOperator cGBY, ReduceSinkDeduplicateProcCtx dedupCtx) throws SemanticException { Operator<?> start = CorrelationUtilities.getStartForGroupBy(cRS, dedupCtx); JoinOperator pJoin = CorrelationUtilities.findPossibleParent( start, JoinOperator.class, dedupCtx.trustScript()); if (pJoin != null && merge(cRS, pJoin, dedupCtx.minReducer())) { pJoin.getConf().setFixedAsSorted(true); CorrelationUtilities.removeReduceSinkForGroupBy(cRS, cGBY, dedupCtx.getPctx(), dedupCtx); ReduceSinkOperator pRS = CorrelationUtilities.findPossibleParent( pJoin, ReduceSinkOperator.class, dedupCtx.trustScript()); if (pRS != null) { pRS.getConf().setDeduplicated(true); } return true; } return false; }
@Override public Object process( Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException { // We should be having a tree which looks like this // TS -> * -> RS - // \ // -> JOIN -> .. // / // TS -> * -> RS - // // We are in the join operator now. SkewJoinOptProcCtx ctx = (SkewJoinOptProcCtx) procCtx; parseContext = ctx.getpGraphContext(); JoinOperator joinOp = (JoinOperator) nd; // This join has already been processed if (ctx.getDoneJoins().contains(joinOp)) { return null; } ctx.getDoneJoins().add(joinOp); Operator<? extends OperatorDesc> currOp = joinOp; boolean processSelect = false; // Is there a select following // Clone the select also. It is useful for a follow-on optimization where the union // followed by a select star is completely removed. if ((joinOp.getChildOperators().size() == 1) && (joinOp.getChildOperators().get(0) instanceof SelectOperator)) { currOp = joinOp.getChildOperators().get(0); processSelect = true; } List<TableScanOperator> tableScanOpsForJoin = new ArrayList<TableScanOperator>(); if (!getTableScanOpsForJoin(joinOp, tableScanOpsForJoin)) { return null; } if ((tableScanOpsForJoin == null) || (tableScanOpsForJoin.isEmpty())) { return null; } // Get the skewed values in all the tables Map<List<ExprNodeDesc>, List<List<String>>> skewedValues = getSkewedValues(joinOp, tableScanOpsForJoin); // If there are no skewed values, nothing needs to be done if (skewedValues == null || skewedValues.size() == 0) { return null; } // After this optimization, the tree should be like: // TS -> (FIL "skewed rows") * -> RS - // \ // -> JOIN // / \ // TS -> (FIL "skewed rows") * -> RS - \ // \ // -> UNION -> .. // / // TS -> (FIL "no skewed rows") * -> RS - / // \ / // -> JOIN // / // TS -> (FIL "no skewed rows") * -> RS - // // Create a clone of the operator Operator<? 
extends OperatorDesc> currOpClone; try { currOpClone = currOp.clone(); insertRowResolvers(currOp, currOpClone, ctx); } catch (CloneNotSupportedException e) { LOG.debug("Operator tree could not be cloned"); return null; } JoinOperator joinOpClone; if (processSelect) { joinOpClone = (JoinOperator) (currOpClone.getParentOperators().get(0)); } else { joinOpClone = (JoinOperator) currOpClone; } joinOpClone.getConf().cloneQBJoinTreeProps(joinOp.getConf()); parseContext.getJoinOps().add(joinOpClone); List<TableScanOperator> tableScanCloneOpsForJoin = new ArrayList<TableScanOperator>(); if (!getTableScanOpsForJoin(joinOpClone, tableScanCloneOpsForJoin)) { LOG.debug("Operator tree not properly cloned!"); return null; } // Put the filter "skewed column = skewed keys" in op // and "skewed columns != skewed keys" in selectOpClone insertSkewFilter(tableScanOpsForJoin, skewedValues, true); insertSkewFilter(tableScanCloneOpsForJoin, skewedValues, false); // Update the topOps appropriately Map<String, Operator<? extends OperatorDesc>> topOps = getTopOps(joinOpClone); Map<String, Operator<? extends OperatorDesc>> origTopOps = parseContext.getTopOps(); for (Entry<String, Operator<? extends OperatorDesc>> topOp : topOps.entrySet()) { TableScanOperator tso = (TableScanOperator) topOp.getValue(); String tabAlias = tso.getConf().getAlias(); int initCnt = 1; String newAlias = "subquery" + initCnt + ":" + tabAlias; while (origTopOps.containsKey(newAlias)) { initCnt++; newAlias = "subquery" + initCnt + ":" + tabAlias; } parseContext.getTopOps().put(newAlias, tso); setUpAlias(joinOp, joinOpClone, tabAlias, newAlias, tso); } // Now do a union of the select operators: selectOp and selectOpClone // Store the operator that follows the select after the join, we will be // adding this as a child to the Union later List<Operator<? 
extends OperatorDesc>> finalOps = currOp.getChildOperators(); currOp.setChildOperators(null); currOpClone.setChildOperators(null); // Make the union operator List<Operator<? extends OperatorDesc>> oplist = new ArrayList<Operator<? extends OperatorDesc>>(); oplist.add(currOp); oplist.add(currOpClone); Operator<? extends OperatorDesc> unionOp = OperatorFactory.getAndMakeChild( new UnionDesc(), new RowSchema(currOp.getSchema().getSignature()), oplist); // Introduce a select after the union List<Operator<? extends OperatorDesc>> unionList = new ArrayList<Operator<? extends OperatorDesc>>(); unionList.add(unionOp); Operator<? extends OperatorDesc> selectUnionOp = OperatorFactory.getAndMakeChild( new SelectDesc(true), new RowSchema(unionOp.getSchema().getSignature()), unionList); // add the finalOp after the union selectUnionOp.setChildOperators(finalOps); // replace the original selectOp in the parents with selectUnionOp for (Operator<? extends OperatorDesc> finalOp : finalOps) { finalOp.replaceParent(currOp, selectUnionOp); } return null; }
/**
 * Set alias in the cloned join tree.
 *
 * <p>The clone's alias-to-operator entry is re-keyed from {@code origAlias} to {@code newAlias},
 * its left alias is switched to {@code newAlias} when the original join's left alias equals
 * {@code origAlias}, and the left, right, base-source, map and stream alias collections of both
 * joins are handed to {@code replaceAlias}.
 *
 * @param origin the original join operator, used as the reference for alias positions
 * @param cloned the cloned join operator whose descriptor is updated
 * @param origAlias the table alias used by the original join
 * @param newAlias the alias that replaces {@code origAlias} in the clone
 * @param topOp the operator registered under {@code newAlias} in the clone's alias map
 */
private static void setUpAlias(
    JoinOperator origin,
    JoinOperator cloned,
    String origAlias,
    String newAlias,
    Operator<? extends OperatorDesc> topOp) {
  cloned.getConf().getAliasToOpInfo().remove(origAlias);
  cloned.getConf().getAliasToOpInfo().put(newAlias, topOp);

  // Null-safe comparison: the left alias is treated as optional, so a join that has none is
  // simply left alone instead of failing with a NullPointerException.
  String leftAlias = origin.getConf().getLeftAlias();
  if (leftAlias != null && leftAlias.equals(origAlias)) {
    cloned.getConf().setLeftAlias(newAlias);
  }

  replaceAlias(
      origin.getConf().getLeftAliases(),
      cloned.getConf().getLeftAliases(),
      origAlias,
      newAlias);
  replaceAlias(
      origin.getConf().getRightAliases(),
      cloned.getConf().getRightAliases(),
      origAlias,
      newAlias);
  replaceAlias(
      origin.getConf().getBaseSrc(), cloned.getConf().getBaseSrc(), origAlias, newAlias);
  replaceAlias(
      origin.getConf().getMapAliases(),
      cloned.getConf().getMapAliases(),
      origAlias,
      newAlias);
  replaceAlias(
      origin.getConf().getStreamAliases(),
      cloned.getConf().getStreamAliases(),
      origAlias,
      newAlias);
}