@Override public Object process( Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException { // If the reduce sink has not been introduced due to bucketing/sorting, ignore it FileSinkOperator fsOp = (FileSinkOperator) nd; ReduceSinkOperator rsOp = (ReduceSinkOperator) fsOp.getParentOperators().get(0).getParentOperators().get(0); List<ReduceSinkOperator> rsOps = pGraphContext.getReduceSinkOperatorsAddedByEnforceBucketingSorting(); // nothing to do if ((rsOps != null) && (!rsOps.contains(rsOp))) { return null; } // Don't do this optimization with updates or deletes if (pGraphContext.getContext().getAcidOperation() == AcidUtils.Operation.UPDATE || pGraphContext.getContext().getAcidOperation() == AcidUtils.Operation.DELETE) { return null; } if (stack.get(0) instanceof TableScanOperator) { TableScanOperator tso = ((TableScanOperator) stack.get(0)); if (SemanticAnalyzer.isAcidTable(tso.getConf().getTableMetadata())) { /*ACID tables have complex directory layout and require merging of delta files * on read thus we should not try to read bucket files directly*/ return null; } } // Support for dynamic partitions can be added later if (fsOp.getConf().getDynPartCtx() != null) { return null; } // No conversion is possible for the reduce keys for (ExprNodeDesc keyCol : rsOp.getConf().getKeyCols()) { if (!(keyCol instanceof ExprNodeColumnDesc)) { return null; } } Table destTable = fsOp.getConf().getTable(); if (destTable == null) { return null; } int numBucketsDestination = destTable.getNumBuckets(); // Get the positions for sorted and bucketed columns // For sorted columns, also get the order (ascending/descending) - that should // also match for this to be converted to a map-only job. // Get the positions for sorted and bucketed columns // For sorted columns, also get the order (ascending/descending) - that should // also match for this to be converted to a map-only job. List<Integer> bucketPositions = getBucketPositions(destTable.getBucketCols(), destTable.getCols()); ObjectPair<List<Integer>, List<Integer>> sortOrderPositions = getSortPositionsOrder(destTable.getSortCols(), destTable.getCols()); List<Integer> sortPositions = sortOrderPositions.getFirst(); List<Integer> sortOrder = sortOrderPositions.getSecond(); boolean useBucketSortPositions = true; // Only selects and filters are allowed Operator<? extends OperatorDesc> op = rsOp; // TableScan will also be followed by a Select Operator. Find the expressions for the // bucketed/sorted columns for the destination table List<ExprNodeColumnDesc> sourceTableBucketCols = new ArrayList<ExprNodeColumnDesc>(); List<ExprNodeColumnDesc> sourceTableSortCols = new ArrayList<ExprNodeColumnDesc>(); op = op.getParentOperators().get(0); while (true) { if (!(op instanceof TableScanOperator) && !(op instanceof FilterOperator) && !(op instanceof SelectOperator) && !(op instanceof SMBMapJoinOperator)) { return null; } if (op instanceof SMBMapJoinOperator) { // Bucketing and sorting keys should exactly match if (!(bucketPositions.equals(sortPositions))) { return null; } SMBMapJoinOperator smbOp = (SMBMapJoinOperator) op; SMBJoinDesc smbJoinDesc = smbOp.getConf(); int posBigTable = smbJoinDesc.getPosBigTable(); // join keys dont match the bucketing keys List<ExprNodeDesc> keysBigTable = smbJoinDesc.getKeys().get((byte) posBigTable); if (keysBigTable.size() != bucketPositions.size()) { return null; } if (!validateSMBJoinKeys( smbJoinDesc, sourceTableBucketCols, sourceTableSortCols, sortOrder)) { return null; } sourceTableBucketCols.clear(); sourceTableSortCols.clear(); useBucketSortPositions = false; for (ExprNodeDesc keyBigTable : keysBigTable) { if (!(keyBigTable instanceof ExprNodeColumnDesc)) { return null; } sourceTableBucketCols.add((ExprNodeColumnDesc) keyBigTable); sourceTableSortCols.add((ExprNodeColumnDesc) keyBigTable); } // since it is a sort-merge join, only follow the big table op = op.getParentOperators().get(posBigTable); } else { // nothing to be done for filters - the output schema does not change. if (op instanceof TableScanOperator) { assert !useBucketSortPositions; TableScanOperator ts = (TableScanOperator) op; Table srcTable = ts.getConf().getTableMetadata(); // Find the positions of the bucketed columns in the table corresponding // to the select list. // Consider the following scenario: // T1(key, value1, value2) bucketed/sorted by key into 2 buckets // T2(dummy, key, value1, value2) bucketed/sorted by key into 2 buckets // A query like: insert overwrite table T2 select 1, key, value1, value2 from T1 // should be optimized. // Start with the destination: T2, bucketed/sorted position is [1] // At the source T1, the column corresponding to that position is [key], which // maps to column [0] of T1, which is also bucketed/sorted into the same // number of buckets List<Integer> newBucketPositions = new ArrayList<Integer>(); for (int pos = 0; pos < bucketPositions.size(); pos++) { ExprNodeColumnDesc col = sourceTableBucketCols.get(pos); String colName = col.getColumn(); int bucketPos = findColumnPosition(srcTable.getCols(), colName); if (bucketPos < 0) { return null; } newBucketPositions.add(bucketPos); } // Find the positions/order of the sorted columns in the table corresponding // to the select list. List<Integer> newSortPositions = new ArrayList<Integer>(); for (int pos = 0; pos < sortPositions.size(); pos++) { ExprNodeColumnDesc col = sourceTableSortCols.get(pos); String colName = col.getColumn(); int sortPos = findColumnPosition(srcTable.getCols(), colName); if (sortPos < 0) { return null; } newSortPositions.add(sortPos); } if (srcTable.isPartitioned()) { PrunedPartitionList prunedParts = pGraphContext.getPrunedPartitions(srcTable.getTableName(), ts); List<Partition> partitions = prunedParts.getNotDeniedPartns(); // Support for dynamic partitions can be added later // The following is not optimized: // insert overwrite table T1(ds='1', hr) select key, value, hr from T2 where ds = '1'; // where T1 and T2 are bucketed by the same keys and partitioned by ds. hr if ((partitions == null) || (partitions.isEmpty()) || (partitions.size() > 1)) { return null; } for (Partition partition : partitions) { if (!checkPartition( partition, newBucketPositions, newSortPositions, sortOrder, numBucketsDestination)) { return null; } } removeReduceSink( rsOp, (TableScanOperator) op, fsOp, partitions.get(0).getSortedPaths()); return null; } else { if (!checkTable( srcTable, newBucketPositions, newSortPositions, sortOrder, numBucketsDestination)) { return null; } removeReduceSink(rsOp, (TableScanOperator) op, fsOp, srcTable.getSortedPaths()); return null; } } // None of the operators is changing the positions else if (op instanceof SelectOperator) { SelectOperator selectOp = (SelectOperator) op; SelectDesc selectDesc = selectOp.getConf(); // Iterate backwards, from the destination table to the top of the tree // Based on the output column names, get the new columns. if (!useBucketSortPositions) { bucketPositions.clear(); sortPositions.clear(); List<String> outputColumnNames = selectDesc.getOutputColumnNames(); for (ExprNodeColumnDesc col : sourceTableBucketCols) { String colName = col.getColumn(); int colPos = outputColumnNames.indexOf(colName); if (colPos < 0) { return null; } bucketPositions.add(colPos); } for (ExprNodeColumnDesc col : sourceTableSortCols) { String colName = col.getColumn(); int colPos = outputColumnNames.indexOf(colName); if (colPos < 0) { return null; } sortPositions.add(colPos); } } // There may be multiple selects - chose the one closest to the table sourceTableBucketCols.clear(); sourceTableSortCols.clear(); // Only columns can be selected for both sorted and bucketed positions for (int pos : bucketPositions) { ExprNodeDesc selectColList = selectDesc.getColList().get(pos); if (!(selectColList instanceof ExprNodeColumnDesc)) { return null; } sourceTableBucketCols.add((ExprNodeColumnDesc) selectColList); } for (int pos : sortPositions) { ExprNodeDesc selectColList = selectDesc.getColList().get(pos); if (!(selectColList instanceof ExprNodeColumnDesc)) { return null; } sourceTableSortCols.add((ExprNodeColumnDesc) selectColList); } useBucketSortPositions = false; } op = op.getParentOperators().get(0); } } }
@Override public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx, Object... nodeOutputs) throws SemanticException { SelectOperator op = (SelectOperator) nd; ColumnPrunerProcCtx cppCtx = (ColumnPrunerProcCtx) ctx; if (op.getChildOperators() != null) { for (Operator<? extends OperatorDesc> child : op.getChildOperators()) { // UDTF is not handled yet, so the parent SelectOp of UDTF should just assume // all columns. if ((child instanceof UDTFOperator)) { cppCtx.getPrunedColLists().put(op, cppCtx.getColsFromSelectExpr(op)); return null; } } } LateralViewJoinOperator lvJoin = null; if (op.getConf().isSelStarNoCompute()) { assert op.getNumChild() == 1; Operator<? extends OperatorDesc> child = op.getChildOperators().get(0); if (child instanceof LateralViewJoinOperator) { // this SEL is SEL(*) // for LV lvJoin = (LateralViewJoinOperator) child; } } List<String> cols = cppCtx.genColLists(op); SelectDesc conf = op.getConf(); if (lvJoin != null) { // get columns for SEL(*) from LVJ if (cols != null) { RowResolver rr = cppCtx.getOpToParseCtxMap().get(op).getRowResolver(); cppCtx.getPrunedColLists().put(op, cppCtx.getSelectColsFromLVJoin(rr, cols)); } return null; } // The input to the select does not matter. Go over the expressions // and return the ones which have a marked column cppCtx.getPrunedColLists().put(op, cppCtx.getSelectColsFromChildren(op, cols)); if (cols == null || conf.isSelStarNoCompute()) { return null; } // do we need to prune the select operator? List<ExprNodeDesc> originalColList = op.getConf().getColList(); // by now, 'prunedCols' are columns used by child operators, and 'columns' // are columns used by this select operator. List<String> originalOutputColumnNames = conf.getOutputColumnNames(); if (cols.size() < originalOutputColumnNames.size()) { ArrayList<ExprNodeDesc> newColList = new ArrayList<ExprNodeDesc>(); ArrayList<String> newOutputColumnNames = new ArrayList<String>(); ArrayList<ColumnInfo> rs_oldsignature = op.getSchema().getSignature(); ArrayList<ColumnInfo> rs_newsignature = new ArrayList<ColumnInfo>(); RowResolver old_rr = cppCtx.getOpToParseCtxMap().get(op).getRowResolver(); RowResolver new_rr = new RowResolver(); for (String col : cols) { int index = originalOutputColumnNames.indexOf(col); newOutputColumnNames.add(col); newColList.add(originalColList.get(index)); rs_newsignature.add(rs_oldsignature.get(index)); String[] tabcol = old_rr.reverseLookup(col); ColumnInfo columnInfo = old_rr.get(tabcol[0], tabcol[1]); new_rr.put(tabcol[0], tabcol[1], columnInfo); } cppCtx.getOpToParseCtxMap().get(op).setRowResolver(new_rr); op.getSchema().setSignature(rs_newsignature); conf.setColList(newColList); conf.setOutputColumnNames(newOutputColumnNames); handleChildren(op, cols, cppCtx); } return null; }