// The output columns for the destination table should match with the join keys // This is to handle queries of the form: // insert overwrite table T3 // select T1.key, T1.key2, UDF(T1.value, T2.value) // from T1 join T2 on T1.key = T2.key and T1.key2 = T2.key2 // where T1, T2 and T3 are bucketized/sorted on key and key2 // Assuming T1 is the table on which the mapper is run, the following is true: // . The number of buckets for T1 and T3 should be same // . The bucketing/sorting columns for T1, T2 and T3 should be same // . The sort order of T1 should match with the sort order for T3. // . If T1 is partitioned, only a single partition of T1 can be selected. // . The select list should contain with (T1.key, T1.key2) or (T2.key, T2.key2) // . After the join, only selects and filters are allowed. private boolean validateSMBJoinKeys( SMBJoinDesc smbJoinDesc, List<ExprNodeColumnDesc> sourceTableBucketCols, List<ExprNodeColumnDesc> sourceTableSortCols, List<Integer> sortOrder) { // The sort-merge join creates the output sorted and bucketized by the same columns. // This can be relaxed in the future if there is a requirement. if (!sourceTableBucketCols.equals(sourceTableSortCols)) { return false; } // Get the total number of columns selected, and for each output column, store the // base table it points to. For // insert overwrite table T3 // select T1.key, T1.key2, UDF(T1.value, T2.value) // from T1 join T2 on T1.key = T2.key and T1.key2 = T2.key2 // the following arrays are created // [0, 0, 0, 1] --> [T1, T1, T1, T2] (table mapping) // [0, 1, 2, 0] --> [T1.0, T1.1, T1.2, T2.0] (table columns mapping) Byte[] tagOrder = smbJoinDesc.getTagOrder(); Map<Byte, List<Integer>> retainList = smbJoinDesc.getRetainList(); int totalNumberColumns = 0; for (Byte tag : tagOrder) { totalNumberColumns += retainList.get(tag).size(); } byte[] columnTableMappings = new byte[totalNumberColumns]; int[] columnNumberMappings = new int[totalNumberColumns]; int currentColumnPosition = 0; for (Byte tag : tagOrder) { for (int pos = 0; pos < retainList.get(tag).size(); pos++) { columnTableMappings[currentColumnPosition] = tag; columnNumberMappings[currentColumnPosition] = pos; currentColumnPosition++; } } // All output columns used for bucketing/sorting of the destination table should // belong to the same input table // insert overwrite table T3 // select T1.key, T2.key2, UDF(T1.value, T2.value) // from T1 join T2 on T1.key = T2.key and T1.key2 = T2.key2 // is not optimized, whereas the insert is optimized if the select list is either changed to // (T1.key, T1.key2, UDF(T1.value, T2.value)) or (T2.key, T2.key2, UDF(T1.value, T2.value)) // Get the input table and make sure the keys match List<String> outputColumnNames = smbJoinDesc.getOutputColumnNames(); byte tableTag = -1; int[] columnNumbersExprList = new int[sourceTableBucketCols.size()]; int currentColPosition = 0; for (ExprNodeColumnDesc bucketCol : sourceTableBucketCols) { String colName = bucketCol.getColumn(); int colNumber = outputColumnNames.indexOf(colName); if (colNumber < 0) { return false; } if (tableTag < 0) { tableTag = columnTableMappings[colNumber]; } else if (tableTag != columnTableMappings[colNumber]) { return false; } columnNumbersExprList[currentColPosition++] = columnNumberMappings[colNumber]; } List<ExprNodeDesc> allExprs = smbJoinDesc.getExprs().get(tableTag); List<ExprNodeDesc> keysSelectedTable = smbJoinDesc.getKeys().get(tableTag); currentColPosition = 0; for (ExprNodeDesc keySelectedTable : keysSelectedTable) { if (!(keySelectedTable instanceof ExprNodeColumnDesc)) { return false; } if (!allExprs.get(columnNumbersExprList[currentColPosition++]).isSame(keySelectedTable)) { return false; } } return true; }