Example #1
 /**
  * Converts the skewedValue, available as a string in the metadata, to an object of the
  * appropriate type, using the type of the column from the join key.
  *
  * @param skewedValue the skewed value as stored (as a string) in the table metadata
  * @param keyCol the join key column whose type determines the conversion
  * @return an expression node descriptor of the appropriate constant
  */
 private ExprNodeConstantDesc createConstDesc(String skewedValue, ExprNodeColumnDesc keyCol) {
   ObjectInspector inputOI =
       TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(TypeInfoFactory.stringTypeInfo);
   ObjectInspector outputOI =
       TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(keyCol.getTypeInfo());
   Converter converter = ObjectInspectorConverters.getConverter(inputOI, outputOI);
   Object skewedValueObject = converter.convert(skewedValue);
   return new ExprNodeConstantDesc(keyCol.getTypeInfo(), skewedValueObject);
 }
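
A minimal stand-alone sketch of the same converter pattern, assuming hive-serde (and its dependencies) on the classpath; the class name and the int target type are hypothetical, chosen for illustration:

import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

public class ConverterSketch {
  public static void main(String[] args) {
    // Skewed values are stored as strings in the metastore; the target type comes
    // from the join key column (int here, purely for illustration).
    ObjectInspector inputOI =
        TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(TypeInfoFactory.stringTypeInfo);
    ObjectInspector outputOI =
        TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(TypeInfoFactory.intTypeInfo);
    Converter converter = ObjectInspectorConverters.getConverter(inputOI, outputOI);
    System.out.println(converter.convert("42")); // prints 42
  }
}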
Example #2
 private String getColumnName(String alias, ExprNodeDesc exprNode, int colIdx) {
   if (alias != null) {
     return alias;
   } else if (exprNode instanceof ExprNodeColumnDesc) {
     ExprNodeColumnDesc colDesc = (ExprNodeColumnDesc) exprNode;
     return colDesc.getColumn();
   }
   return "matchpath_col_" + colIdx;
 }
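
A hypothetical driver for the fallback chain (explicit alias, then the underlying column name, then a generated matchpath_col_N name); it copies the method above into a runnable class, assuming hive-exec on the classpath:

import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

public class ColumnNameSketch {
  // Copy of the fallback chain above, for illustration.
  static String getColumnName(String alias, ExprNodeDesc exprNode, int colIdx) {
    if (alias != null) {
      return alias;
    } else if (exprNode instanceof ExprNodeColumnDesc) {
      return ((ExprNodeColumnDesc) exprNode).getColumn();
    }
    return "matchpath_col_" + colIdx;
  }

  public static void main(String[] args) {
    ExprNodeColumnDesc col =
        new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, "value", "t1", false);
    System.out.println(getColumnName("v", col, 0));   // v: explicit alias wins
    System.out.println(getColumnName(null, col, 0));  // value: underlying column name
    System.out.println(getColumnName(null, null, 3)); // matchpath_col_3: generated fallback
  }
}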
Example #3
 protected RexNode convert(ExprNodeColumnDesc col) throws SemanticException {
   InputCtx ic = getInputCtx(col);
   int pos = ic.hiveNameToPosMap.get(col.getColumn());
   return cluster
       .getRexBuilder()
       .makeInputRef(
           ic.calciteInpDataType.getFieldList().get(pos).getType(),
           pos + ic.offsetInCalciteSchema);
 }
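
Calcite plumbing aside, the heart of this conversion is positional arithmetic: look the column up in the input's name-to-position map and shift it by the input's offset in the combined Calcite row type. A plain-Java sketch of just that arithmetic (all names and values hypothetical):

import java.util.Map;

public class InputRefSketch {
  public static void main(String[] args) {
    // Suppose this is the second join input and it starts at offset 2
    // in the combined Calcite schema.
    Map<String, Integer> hiveNameToPosMap = Map.of("key", 0, "value", 1);
    int offsetInCalciteSchema = 2;
    int pos = hiveNameToPosMap.get("value");
    // The input-ref index that makeInputRef would receive:
    System.out.println("$" + (pos + offsetInCalciteSchema)); // prints $3
  }
}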
Example #4
  private InputCtx getInputCtx(ExprNodeColumnDesc col) throws SemanticException {
    InputCtx ctxLookingFor = null;

    if (inputCtxs.size() == 1) {
      ctxLookingFor = inputCtxs.get(0);
    } else {
      String tableAlias = col.getTabAlias();
      String colAlias = col.getColumn();
      int noInp = 0;
      for (InputCtx ic : inputCtxs) {
        if (tableAlias == null || ic.hiveRR.hasTableAlias(tableAlias)) {
          if (ic.hiveRR.getPosition(colAlias) >= 0) {
            ctxLookingFor = ic;
            noInp++;
          }
        }
      }

      if (noInp > 1) {
        throw new SemanticException("Ambiguous column mapping");
      }
    }

    return ctxLookingFor;
  }
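
The resolution rule generalizes beyond Hive's RowResolver: look the column up in every candidate input, and treat more than one hit as an ambiguity. A self-contained sketch of that rule over plain maps (types and names hypothetical):

import java.util.List;
import java.util.Map;

public class ResolveSketch {
  // Returns the single input schema that defines colName, null if none does,
  // and throws if more than one does - mirroring getInputCtx above.
  static Map<String, Integer> resolve(List<Map<String, Integer>> inputs, String colName) {
    Map<String, Integer> found = null;
    for (Map<String, Integer> input : inputs) {
      if (input.containsKey(colName)) {
        if (found != null) {
          throw new RuntimeException("Ambiguous column mapping: " + colName);
        }
        found = input;
      }
    }
    return found;
  }

  public static void main(String[] args) {
    List<Map<String, Integer>> inputs =
        List.of(Map.of("key", 0), Map.of("key", 0, "value", 1));
    System.out.println(resolve(inputs, "value")); // unique hit in the second input
    try {
      resolve(inputs, "key"); // defined in both inputs
    } catch (RuntimeException e) {
      System.out.println(e.getMessage()); // Ambiguous column mapping: key
    }
  }
}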
Example #5
  private ExprNodeDesc analyzeExpr(
      ExprNodeGenericFuncDesc expr,
      List<IndexSearchCondition> searchConditions,
      Object... nodeOutputs) {

    if (FunctionRegistry.isOpAnd(expr)) {
      assert (nodeOutputs.length == 2);
      ExprNodeDesc residual1 = (ExprNodeDesc) nodeOutputs[0];
      ExprNodeDesc residual2 = (ExprNodeDesc) nodeOutputs[1];
      if (residual1 == null) {
        return residual2;
      }
      if (residual2 == null) {
        return residual1;
      }
      List<ExprNodeDesc> residuals = new ArrayList<ExprNodeDesc>();
      residuals.add(residual1);
      residuals.add(residual2);
      return new ExprNodeGenericFuncDesc(
          TypeInfoFactory.booleanTypeInfo, FunctionRegistry.getGenericUDFForAnd(), residuals);
    }

    GenericUDF genericUDF = expr.getGenericUDF();
    if (!(genericUDF instanceof GenericUDFBaseCompare)) {
      return expr;
    }
    ExprNodeDesc expr1 = (ExprNodeDesc) nodeOutputs[0];
    ExprNodeDesc expr2 = (ExprNodeDesc) nodeOutputs[1];
    // We may need to peel off the GenericUDFBridge that is added by CBO or user
    if (expr1.getTypeInfo().equals(expr2.getTypeInfo())) {
      expr1 = getColumnExpr(expr1);
      expr2 = getColumnExpr(expr2);
    }

    ExprNodeDesc[] extracted = ExprNodeDescUtils.extractComparePair(expr1, expr2);
    if (extracted == null || (extracted.length > 2 && !acceptsFields)) {
      return expr;
    }

    ExprNodeColumnDesc columnDesc;
    ExprNodeConstantDesc constantDesc;
    if (extracted[0] instanceof ExprNodeConstantDesc) {
      genericUDF = genericUDF.flip();
      columnDesc = (ExprNodeColumnDesc) extracted[1];
      constantDesc = (ExprNodeConstantDesc) extracted[0];
    } else {
      columnDesc = (ExprNodeColumnDesc) extracted[0];
      constantDesc = (ExprNodeConstantDesc) extracted[1];
    }

    String udfName = genericUDF.getUdfName();
    if (!udfNames.contains(udfName)) {
      return expr;
    }

    if (!allowedColumnNames.contains(columnDesc.getColumn())) {
      return expr;
    }

    String[] fields = null;
    if (extracted.length > 2) {
      ExprNodeFieldDesc fieldDesc = (ExprNodeFieldDesc) extracted[2];
      if (!isValidField(fieldDesc)) {
        return expr;
      }
      fields = ExprNodeDescUtils.extractFields(fieldDesc);
    }

    // We also need to update the expr so that the index query can be generated.
    // Note that Hive does not support UDFToDouble etc. in the query text.
    List<ExprNodeDesc> list = new ArrayList<ExprNodeDesc>();
    list.add(expr1);
    list.add(expr2);
    expr = new ExprNodeGenericFuncDesc(expr.getTypeInfo(), expr.getGenericUDF(), list);

    searchConditions.add(new IndexSearchCondition(columnDesc, udfName, constantDesc, expr, fields));

    // We converted the expression to a search condition; if no struct fields are
    // involved, remove it from the residual predicate. With fields, keep the
    // expression in the residual as well.
    return fields == null ? null : expr;
  }
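
The AND branch implements a small residual algebra: a null child means it was fully absorbed into a search condition, so the surviving predicate is the AND of whatever children remain. A generic sketch of that rule (names hypothetical):

import java.util.function.BinaryOperator;

public class ResidualSketch {
  // Combine two residual predicates; null means "fully pushed down".
  static <T> T combineResiduals(T r1, T r2, BinaryOperator<T> and) {
    if (r1 == null) {
      return r2;
    }
    if (r2 == null) {
      return r1;
    }
    return and.apply(r1, r2);
  }

  public static void main(String[] args) {
    BinaryOperator<String> and = (l, r) -> "(" + l + " AND " + r + ")";
    System.out.println(combineResiduals("a > 1", null, and));    // a > 1
    System.out.println(combineResiduals("a > 1", "b < 2", and)); // (a > 1 AND b < 2)
    System.out.println(combineResiduals(null, null, and));       // null: everything pushed down
  }
}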
Example #6
    /**
     * Returns the skewed values in all the tables which are going to be scanned. If the join is on
     * columns c1, c2 and c3 of tables T1 and T2, where T1 is skewed on (c1, c4) with skew values
     * ((1,2),(3,4)) and T2 is skewed on (c1, c2) with skew values ((5,6),(7,8)), the resulting
     * map is: <(c1) -> ((1), (3)), (c1,c2) -> ((5,6),(7,8))>
     *
     * @param op the join operator being optimized
     * @param tableScanOpsForJoin table scan operators which are parents of the join operator
     * @return a map from the intersection of join keys and skewed keys to the list of skewed values
     */
    private Map<List<ExprNodeDesc>, List<List<String>>> getSkewedValues(
        Operator<? extends OperatorDesc> op, List<TableScanOperator> tableScanOpsForJoin) {

      Map<List<ExprNodeDesc>, List<List<String>>> skewDataReturn =
          new HashMap<List<ExprNodeDesc>, List<List<String>>>();

      Map<List<ExprNodeDescEqualityWrapper>, List<List<String>>> skewData =
          new HashMap<List<ExprNodeDescEqualityWrapper>, List<List<String>>>();

      // The join keys are available in the reduceSinkOperators before join
      for (Operator<? extends OperatorDesc> reduceSinkOp : op.getParentOperators()) {
        ReduceSinkDesc rsDesc = ((ReduceSinkOperator) reduceSinkOp).getConf();

        if (rsDesc.getKeyCols() != null) {
          Table table = null;
          // Find the skew information corresponding to the table
          List<String> skewedColumns = null;
          List<List<String>> skewedValueList = null;

          // The join columns which are also skewed
          List<ExprNodeDescEqualityWrapper> joinKeysSkewedCols =
              new ArrayList<ExprNodeDescEqualityWrapper>();

          // skewed Keys which intersect with join keys
          List<Integer> positionSkewedKeys = new ArrayList<Integer>();

          // Update the joinKeys appropriately.
          for (ExprNodeDesc keyColDesc : rsDesc.getKeyCols()) {
            ExprNodeColumnDesc keyCol = null;

            // If the key column is not a column, then don't apply this optimization.
            // This will be fixed as part of https://issues.apache.org/jira/browse/HIVE-3445
            // for type conversion UDFs.
            if (keyColDesc instanceof ExprNodeColumnDesc) {
              keyCol = (ExprNodeColumnDesc) keyColDesc;
              if (table == null) {
                table = getTable(parseContext, reduceSinkOp, tableScanOpsForJoin);
                skewedColumns = table == null ? null : table.getSkewedColNames();
                // No skew on the table to take care of
                if ((skewedColumns == null) || (skewedColumns.isEmpty())) {
                  continue;
                }

                skewedValueList = table == null ? null : table.getSkewedColValues();
              }
              int pos = skewedColumns.indexOf(keyCol.getColumn());
              if ((pos >= 0) && (!positionSkewedKeys.contains(pos))) {
                positionSkewedKeys.add(pos);
                ExprNodeColumnDesc keyColClone = (ExprNodeColumnDesc) keyCol.clone();
                keyColClone.setTabAlias(null);
                joinKeysSkewedCols.add(new ExprNodeDescEqualityWrapper(keyColClone));
              }
            }
          }

          // If the skew keys match the join keys, then add it to the list
          if ((skewedColumns != null) && (!skewedColumns.isEmpty())) {
            if (!joinKeysSkewedCols.isEmpty()) {
              // If the join keys match all the skewed keys, use the table's skewed values directly
              List<List<String>> skewedJoinValues;
              if (skewedColumns.size() == positionSkewedKeys.size()) {
                skewedJoinValues = skewedValueList;
              } else {
                skewedJoinValues = getSkewedJoinValues(skewedValueList, positionSkewedKeys);
              }

              List<List<String>> oldSkewedJoinValues = skewData.get(joinKeysSkewedCols);
              if (oldSkewedJoinValues == null) {
                oldSkewedJoinValues = new ArrayList<List<String>>();
              }
              for (List<String> skewValue : skewedJoinValues) {
                if (!oldSkewedJoinValues.contains(skewValue)) {
                  oldSkewedJoinValues.add(skewValue);
                }
              }

              skewData.put(joinKeysSkewedCols, oldSkewedJoinValues);
            }
          }
        }
      }

      // convert skewData to contain ExprNodeDesc in the keys
      for (Map.Entry<List<ExprNodeDescEqualityWrapper>, List<List<String>>> mapEntry :
          skewData.entrySet()) {
        List<ExprNodeDesc> skewedKeyJoinCols = new ArrayList<ExprNodeDesc>();
        for (ExprNodeDescEqualityWrapper key : mapEntry.getKey()) {
          skewedKeyJoinCols.add(key.getExprNodeDesc());
        }
        skewDataReturn.put(skewedKeyJoinCols, mapEntry.getValue());
      }

      return skewDataReturn;
    }
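
When only some of a table's skewed columns appear among the join keys, the skewed value tuples are projected onto those positions (the getSkewedJoinValues call above). A stand-alone sketch of that projection, reproducing the javadoc example (T1 skewed on (c1, c4), with only c1 a join key):

import java.util.ArrayList;
import java.util.List;

public class SkewProjectionSketch {
  // Project each skewed value tuple onto the given column positions.
  static List<List<String>> project(List<List<String>> skewedValues, List<Integer> positions) {
    List<List<String>> result = new ArrayList<List<String>>();
    for (List<String> tuple : skewedValues) {
      List<String> projected = new ArrayList<String>();
      for (int pos : positions) {
        projected.add(tuple.get(pos));
      }
      result.add(projected);
    }
    return result;
  }

  public static void main(String[] args) {
    List<List<String>> skewedValues = List.of(List.of("1", "2"), List.of("3", "4"));
    System.out.println(project(skewedValues, List.of(0))); // [[1], [3]]
  }
}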
Example #7
    @Override
    public Object process(
        Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs)
        throws SemanticException {

      // If the reduce sink has not been introduced due to bucketing/sorting, ignore it
      FileSinkOperator fsOp = (FileSinkOperator) nd;
      ReduceSinkOperator rsOp =
          (ReduceSinkOperator) fsOp.getParentOperators().get(0).getParentOperators().get(0);

      List<ReduceSinkOperator> rsOps =
          pGraphContext.getReduceSinkOperatorsAddedByEnforceBucketingSorting();
      // nothing to do
      if ((rsOps != null) && (!rsOps.contains(rsOp))) {
        return null;
      }

      // Don't do this optimization with updates or deletes
      if (pGraphContext.getContext().getAcidOperation() == AcidUtils.Operation.UPDATE
          || pGraphContext.getContext().getAcidOperation() == AcidUtils.Operation.DELETE) {
        return null;
      }

      if (stack.get(0) instanceof TableScanOperator) {
        TableScanOperator tso = ((TableScanOperator) stack.get(0));
        if (SemanticAnalyzer.isAcidTable(tso.getConf().getTableMetadata())) {
          /* ACID tables have a complex directory layout and require merging of delta
           * files on read, so we should not try to read bucket files directly. */
          return null;
        }
      }
      // Support for dynamic partitions can be added later
      if (fsOp.getConf().getDynPartCtx() != null) {
        return null;
      }

      // No conversion is possible for the reduce keys
      for (ExprNodeDesc keyCol : rsOp.getConf().getKeyCols()) {
        if (!(keyCol instanceof ExprNodeColumnDesc)) {
          return null;
        }
      }

      Table destTable = fsOp.getConf().getTable();
      if (destTable == null) {
        return null;
      }
      int numBucketsDestination = destTable.getNumBuckets();

      // Get the positions for sorted and bucketed columns
      // For sorted columns, also get the order (ascending/descending) - that should
      // also match for this to be converted to a map-only job.
      List<Integer> bucketPositions =
          getBucketPositions(destTable.getBucketCols(), destTable.getCols());
      ObjectPair<List<Integer>, List<Integer>> sortOrderPositions =
          getSortPositionsOrder(destTable.getSortCols(), destTable.getCols());
      List<Integer> sortPositions = sortOrderPositions.getFirst();
      List<Integer> sortOrder = sortOrderPositions.getSecond();
      boolean useBucketSortPositions = true;

      // Only selects and filters are allowed
      Operator<? extends OperatorDesc> op = rsOp;
      // TableScan will also be followed by a Select Operator. Find the expressions for the
      // bucketed/sorted columns for the destination table
      List<ExprNodeColumnDesc> sourceTableBucketCols = new ArrayList<ExprNodeColumnDesc>();
      List<ExprNodeColumnDesc> sourceTableSortCols = new ArrayList<ExprNodeColumnDesc>();
      op = op.getParentOperators().get(0);

      while (true) {
        if (!(op instanceof TableScanOperator)
            && !(op instanceof FilterOperator)
            && !(op instanceof SelectOperator)
            && !(op instanceof SMBMapJoinOperator)) {
          return null;
        }

        if (op instanceof SMBMapJoinOperator) {
          // Bucketing and sorting keys should exactly match
          if (!(bucketPositions.equals(sortPositions))) {
            return null;
          }
          SMBMapJoinOperator smbOp = (SMBMapJoinOperator) op;
          SMBJoinDesc smbJoinDesc = smbOp.getConf();
          int posBigTable = smbJoinDesc.getPosBigTable();

          // join keys don't match the bucketing keys
          List<ExprNodeDesc> keysBigTable = smbJoinDesc.getKeys().get((byte) posBigTable);
          if (keysBigTable.size() != bucketPositions.size()) {
            return null;
          }

          if (!validateSMBJoinKeys(
              smbJoinDesc, sourceTableBucketCols, sourceTableSortCols, sortOrder)) {
            return null;
          }

          sourceTableBucketCols.clear();
          sourceTableSortCols.clear();
          useBucketSortPositions = false;

          for (ExprNodeDesc keyBigTable : keysBigTable) {
            if (!(keyBigTable instanceof ExprNodeColumnDesc)) {
              return null;
            }
            sourceTableBucketCols.add((ExprNodeColumnDesc) keyBigTable);
            sourceTableSortCols.add((ExprNodeColumnDesc) keyBigTable);
          }

          // since it is a sort-merge join, only follow the big table
          op = op.getParentOperators().get(posBigTable);
        } else {
          // nothing to be done for filters - the output schema does not change.
          if (op instanceof TableScanOperator) {
            assert !useBucketSortPositions;
            TableScanOperator ts = (TableScanOperator) op;
            Table srcTable = ts.getConf().getTableMetadata();

            // Find the positions of the bucketed columns in the table corresponding
            // to the select list.
            // Consider the following scenario:
            // T1(key, value1, value2) bucketed/sorted by key into 2 buckets
            // T2(dummy, key, value1, value2) bucketed/sorted by key into 2 buckets
            // A query like: insert overwrite table T2 select 1, key, value1, value2 from T1
            // should be optimized.

            // Start with the destination: T2, bucketed/sorted position is [1]
            // At the source T1, the column corresponding to that position is [key], which
            // maps to column [0] of T1, which is also bucketed/sorted into the same
            // number of buckets
            List<Integer> newBucketPositions = new ArrayList<Integer>();
            for (int pos = 0; pos < bucketPositions.size(); pos++) {
              ExprNodeColumnDesc col = sourceTableBucketCols.get(pos);
              String colName = col.getColumn();
              int bucketPos = findColumnPosition(srcTable.getCols(), colName);
              if (bucketPos < 0) {
                return null;
              }
              newBucketPositions.add(bucketPos);
            }

            // Find the positions/order of the sorted columns in the table corresponding
            // to the select list.
            List<Integer> newSortPositions = new ArrayList<Integer>();
            for (int pos = 0; pos < sortPositions.size(); pos++) {
              ExprNodeColumnDesc col = sourceTableSortCols.get(pos);
              String colName = col.getColumn();
              int sortPos = findColumnPosition(srcTable.getCols(), colName);
              if (sortPos < 0) {
                return null;
              }
              newSortPositions.add(sortPos);
            }

            if (srcTable.isPartitioned()) {
              PrunedPartitionList prunedParts =
                  pGraphContext.getPrunedPartitions(srcTable.getTableName(), ts);
              List<Partition> partitions = prunedParts.getNotDeniedPartns();

              // Support for dynamic partitions can be added later
              // The following is not optimized:
              // insert overwrite table T1(ds='1', hr) select key, value, hr from T2 where ds = '1';
              // where T1 and T2 are bucketed by the same keys and partitioned by ds and hr
              if ((partitions == null) || (partitions.isEmpty()) || (partitions.size() > 1)) {
                return null;
              }
              for (Partition partition : partitions) {
                if (!checkPartition(
                    partition,
                    newBucketPositions,
                    newSortPositions,
                    sortOrder,
                    numBucketsDestination)) {
                  return null;
                }
              }

              removeReduceSink(
                  rsOp, (TableScanOperator) op, fsOp, partitions.get(0).getSortedPaths());
              return null;
            } else {
              if (!checkTable(
                  srcTable,
                  newBucketPositions,
                  newSortPositions,
                  sortOrder,
                  numBucketsDestination)) {
                return null;
              }

              removeReduceSink(rsOp, (TableScanOperator) op, fsOp, srcTable.getSortedPaths());
              return null;
            }
          }
          // None of the operators is changing the positions
          else if (op instanceof SelectOperator) {
            SelectOperator selectOp = (SelectOperator) op;
            SelectDesc selectDesc = selectOp.getConf();

            // Iterate backwards, from the destination table to the top of the tree
            // Based on the output column names, get the new columns.
            if (!useBucketSortPositions) {
              bucketPositions.clear();
              sortPositions.clear();
              List<String> outputColumnNames = selectDesc.getOutputColumnNames();

              for (ExprNodeColumnDesc col : sourceTableBucketCols) {
                String colName = col.getColumn();
                int colPos = outputColumnNames.indexOf(colName);
                if (colPos < 0) {
                  return null;
                }
                bucketPositions.add(colPos);
              }

              for (ExprNodeColumnDesc col : sourceTableSortCols) {
                String colName = col.getColumn();
                int colPos = outputColumnNames.indexOf(colName);
                if (colPos < 0) {
                  return null;
                }
                sortPositions.add(colPos);
              }
            }

            // There may be multiple selects - choose the one closest to the table
            sourceTableBucketCols.clear();
            sourceTableSortCols.clear();

            // Only columns can be selected for both sorted and bucketed positions
            for (int pos : bucketPositions) {
              ExprNodeDesc selectColList = selectDesc.getColList().get(pos);
              if (!(selectColList instanceof ExprNodeColumnDesc)) {
                return null;
              }
              sourceTableBucketCols.add((ExprNodeColumnDesc) selectColList);
            }

            for (int pos : sortPositions) {
              ExprNodeDesc selectColList = selectDesc.getColList().get(pos);
              if (!(selectColList instanceof ExprNodeColumnDesc)) {
                return null;
              }
              sourceTableSortCols.add((ExprNodeColumnDesc) selectColList);
            }

            useBucketSortPositions = false;
          }
          op = op.getParentOperators().get(0);
        }
      }
    }
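
Several helpers referenced above (getBucketPositions, findColumnPosition) boil down to mapping column names onto positions in a schema. A minimal sketch of that lookup over plain string lists (names hypothetical), using the T2(dummy, key, value1, value2) scenario from the comments:

import java.util.ArrayList;
import java.util.List;

public class BucketPositionSketch {
  // Map each bucketed/sorted column name to its position in the table schema; -1 if absent.
  static List<Integer> getPositions(List<String> cols, List<String> tableCols) {
    List<Integer> positions = new ArrayList<Integer>();
    for (String col : cols) {
      positions.add(tableCols.indexOf(col));
    }
    return positions;
  }

  public static void main(String[] args) {
    // T2(dummy, key, value1, value2) bucketed by key -> position [1]
    System.out.println(getPositions(List.of("key"), List.of("dummy", "key", "value1", "value2")));
  }
}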
Example #8
    // The output columns for the destination table should match the join keys
    // This is to handle queries of the form:
    // insert overwrite table T3
    // select T1.key, T1.key2, UDF(T1.value, T2.value)
    // from T1 join T2 on T1.key = T2.key and T1.key2 = T2.key2
    // where T1, T2 and T3 are bucketized/sorted on key and key2
    // Assuming T1 is the table on which the mapper is run, the following is true:
    // . The number of buckets for T1 and T3 should be the same
    // . The bucketing/sorting columns for T1, T2 and T3 should be the same
    // . The sort order of T1 should match the sort order for T3.
    // . If T1 is partitioned, only a single partition of T1 can be selected.
    // . The select list should contain (T1.key, T1.key2) or (T2.key, T2.key2)
    // . After the join, only selects and filters are allowed.
    private boolean validateSMBJoinKeys(
        SMBJoinDesc smbJoinDesc,
        List<ExprNodeColumnDesc> sourceTableBucketCols,
        List<ExprNodeColumnDesc> sourceTableSortCols,
        List<Integer> sortOrder) {
      // The sort-merge join creates the output sorted and bucketized by the same columns.
      // This can be relaxed in the future if there is a requirement.
      if (!sourceTableBucketCols.equals(sourceTableSortCols)) {
        return false;
      }

      // Get the total number of columns selected, and for each output column, store the
      // base table it points to. For
      // insert overwrite table T3
      // select T1.key, T1.key2, UDF(T1.value, T2.value)
      // from T1 join T2 on T1.key = T2.key and T1.key2 = T2.key2
      // the following arrays are created
      // [0, 0, 0, 1] --> [T1, T1, T1, T2] (table mapping)
      // [0, 1, 2, 0] --> [T1.0, T1.1, T1.2, T2.0] (table columns mapping)
      Byte[] tagOrder = smbJoinDesc.getTagOrder();
      Map<Byte, List<Integer>> retainList = smbJoinDesc.getRetainList();
      int totalNumberColumns = 0;
      for (Byte tag : tagOrder) {
        totalNumberColumns += retainList.get(tag).size();
      }

      byte[] columnTableMappings = new byte[totalNumberColumns];
      int[] columnNumberMappings = new int[totalNumberColumns];
      int currentColumnPosition = 0;
      for (Byte tag : tagOrder) {
        for (int pos = 0; pos < retainList.get(tag).size(); pos++) {
          columnTableMappings[currentColumnPosition] = tag;
          columnNumberMappings[currentColumnPosition] = pos;
          currentColumnPosition++;
        }
      }

      // All output columns used for bucketing/sorting of the destination table should
      // belong to the same input table
      //   insert overwrite table T3
      //   select T1.key, T2.key2, UDF(T1.value, T2.value)
      //   from T1 join T2 on T1.key = T2.key and T1.key2 = T2.key2
      // is not optimized, whereas the insert is optimized if the select list is either changed to
      // (T1.key, T1.key2, UDF(T1.value, T2.value)) or (T2.key, T2.key2, UDF(T1.value, T2.value))
      // Get the input table and make sure the keys match
      List<String> outputColumnNames = smbJoinDesc.getOutputColumnNames();
      byte tableTag = -1;
      int[] columnNumbersExprList = new int[sourceTableBucketCols.size()];
      int currentColPosition = 0;
      for (ExprNodeColumnDesc bucketCol : sourceTableBucketCols) {
        String colName = bucketCol.getColumn();
        int colNumber = outputColumnNames.indexOf(colName);
        if (colNumber < 0) {
          return false;
        }
        if (tableTag < 0) {
          tableTag = columnTableMappings[colNumber];
        } else if (tableTag != columnTableMappings[colNumber]) {
          return false;
        }
        columnNumbersExprList[currentColPosition++] = columnNumberMappings[colNumber];
      }

      List<ExprNodeDesc> allExprs = smbJoinDesc.getExprs().get(tableTag);
      List<ExprNodeDesc> keysSelectedTable = smbJoinDesc.getKeys().get(tableTag);
      currentColPosition = 0;
      for (ExprNodeDesc keySelectedTable : keysSelectedTable) {
        if (!(keySelectedTable instanceof ExprNodeColumnDesc)) {
          return false;
        }
        if (!allExprs.get(columnNumbersExprList[currentColPosition++]).isSame(keySelectedTable)) {
          return false;
        }
      }

      return true;
    }
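
The two mapping arrays built above flatten the per-tag retain lists into one combined output schema. A self-contained sketch of that flattening, using the [0, 0, 0, 1] / [0, 1, 2, 0] example from the comments (inputs hypothetical):

import java.util.Arrays;
import java.util.List;
import java.util.Map;

public class TagMappingSketch {
  public static void main(String[] args) {
    // Two join inputs: tag 0 (T1) retains three columns, tag 1 (T2) retains one.
    List<Byte> tagOrder = List.of((byte) 0, (byte) 1);
    Map<Byte, List<Integer>> retainList =
        Map.of((byte) 0, List.of(0, 1, 2), (byte) 1, List.of(0));

    int totalNumberColumns = 0;
    for (Byte tag : tagOrder) {
      totalNumberColumns += retainList.get(tag).size();
    }

    byte[] columnTableMappings = new byte[totalNumberColumns];
    int[] columnNumberMappings = new int[totalNumberColumns];
    int current = 0;
    for (Byte tag : tagOrder) {
      for (int pos = 0; pos < retainList.get(tag).size(); pos++) {
        columnTableMappings[current] = tag;  // which input the output column comes from
        columnNumberMappings[current] = pos; // its position within that input
        current++;
      }
    }
    System.out.println(Arrays.toString(columnTableMappings));  // [0, 0, 0, 1]
    System.out.println(Arrays.toString(columnNumberMappings)); // [0, 1, 2, 0]
  }
}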