Java ReduceSinkDesc 예제들, org.apache.hadoop.hive.ql.plan.ReduceSinkDesc Java 예제들

예제 #1

0

파일 보기

파일: ColumnPrunerProcFactory.java 프로젝트: cschenyuan/hive-hack

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx, Object... nodeOutputs)
        throws SemanticException {
      ReduceSinkOperator op = (ReduceSinkOperator) nd;
      ColumnPrunerProcCtx cppCtx = (ColumnPrunerProcCtx) ctx;
      RowResolver resolver = cppCtx.getOpToParseCtxMap().get(op).getRowResolver();
      ReduceSinkDesc conf = op.getConf();

      List<String> colLists = new ArrayList<String>();
      ArrayList<ExprNodeDesc> keys = conf.getKeyCols();
      LOG.debug("Reduce Sink Operator " + op.getIdentifier() + " key:" + keys);
      for (ExprNodeDesc key : keys) {
        colLists = Utilities.mergeUniqElems(colLists, key.getCols());
      }

      assert op.getNumChild() == 1;

      Operator<? extends OperatorDesc> child = op.getChildOperators().get(0);

      List<String> childCols;
      if (child instanceof CommonJoinOperator) {
        childCols = cppCtx.getJoinPrunedColLists().get(child).get((byte) conf.getTag());
      } else {
        childCols = cppCtx.getPrunedColList(child);
      }
      List<ExprNodeDesc> valCols = conf.getValueCols();
      List<String> valColNames = conf.getOutputValueColumnNames();

      if (childCols != null) {
        boolean[] flags = new boolean[valCols.size()];

        for (String childCol : childCols) {
          int index = valColNames.indexOf(Utilities.removeValueTag(childCol));
          if (index < 0) {
            continue;
          }
          flags[index] = true;
          colLists = Utilities.mergeUniqElems(colLists, valCols.get(index).getCols());
        }

        Collections.sort(colLists);
        pruneReduceSinkOperator(flags, op, cppCtx);
        cppCtx.getPrunedColLists().put(op, colLists);
        return null;
      }

      // Reduce Sink contains the columns needed - no need to aggregate from
      // children
      for (ExprNodeDesc val : valCols) {
        colLists = Utilities.mergeUniqElems(colLists, val.getCols());
      }

      cppCtx.getPrunedColLists().put(op, colLists);
      return null;
    }

예제 #2

0

파일 보기

파일: ColumnPrunerProcFactory.java 프로젝트: cschenyuan/hive-hack

  private static void pruneReduceSinkOperator(
      boolean[] retainFlags, ReduceSinkOperator reduce, ColumnPrunerProcCtx cppCtx)
      throws SemanticException {
    ReduceSinkDesc reduceConf = reduce.getConf();
    Map<String, ExprNodeDesc> oldMap = reduce.getColumnExprMap();
    LOG.info("RS " + reduce.getIdentifier() + " oldColExprMap: " + oldMap);
    RowResolver oldRR = cppCtx.getOpToParseCtxMap().get(reduce).getRowResolver();
    ArrayList<ColumnInfo> old_signature = oldRR.getRowSchema().getSignature();
    ArrayList<ColumnInfo> signature = new ArrayList<ColumnInfo>(old_signature);

    List<String> valueColNames = reduceConf.getOutputValueColumnNames();
    ArrayList<String> newValueColNames = new ArrayList<String>();

    List<ExprNodeDesc> keyExprs = reduceConf.getKeyCols();
    List<ExprNodeDesc> valueExprs = reduceConf.getValueCols();
    ArrayList<ExprNodeDesc> newValueExprs = new ArrayList<ExprNodeDesc>();

    for (int i = 0; i < retainFlags.length; i++) {
      String outputCol = valueColNames.get(i);
      ExprNodeDesc outputColExpr = valueExprs.get(i);
      if (!retainFlags[i]) {
        String[] nm = oldRR.reverseLookup(outputCol);
        if (nm == null) {
          outputCol = Utilities.ReduceField.VALUE.toString() + "." + outputCol;
          nm = oldRR.reverseLookup(outputCol);
        }

        // In case there are multiple columns referenced to the same column name, we won't
        // do row resolve once more because the ColumnInfo in row resolver is already removed
        if (nm == null) {
          continue;
        }

        // Only remove information of a column if it is not a key,
        // i.e. this column is not appearing in keyExprs of the RS
        if (ExprNodeDescUtils.indexOf(outputColExpr, keyExprs) == -1) {
          ColumnInfo colInfo = oldRR.getFieldMap(nm[0]).remove(nm[1]);
          oldRR.getInvRslvMap().remove(colInfo.getInternalName());
          oldMap.remove(outputCol);
          signature.remove(colInfo);
        }

      } else {
        newValueColNames.add(outputCol);
        newValueExprs.add(outputColExpr);
      }
    }

    oldRR.getRowSchema().setSignature(signature);
    reduce.getSchema().setSignature(signature);
    reduceConf.setOutputValueColumnNames(newValueColNames);
    reduceConf.setValueCols(newValueExprs);
    TableDesc newValueTable =
        PlanUtils.getReduceValueTableDesc(
            PlanUtils.getFieldSchemasFromColumnList(
                reduceConf.getValueCols(), newValueColNames, 0, ""));
    reduceConf.setValueSerializeInfo(newValueTable);
    LOG.info("RS " + reduce.getIdentifier() + " newColExprMap: " + oldMap);
  }

예제 #3

0

파일 보기

파일: GenMapRedUtils.java 프로젝트: Carlie20083/hive-0.7.0

  /**
   * Initialize the current plan by adding it to root tasks.
   *
   * @param op the reduce sink operator encountered
   * @param opProcCtx processing context
   */
  public static void initPlan(ReduceSinkOperator op, GenMRProcContext opProcCtx)
      throws SemanticException {
    Operator<? extends Serializable> reducer = op.getChildOperators().get(0);
    Map<Operator<? extends Serializable>, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx();
    GenMapRedCtx mapredCtx = mapCurrCtx.get(op.getParentOperators().get(0));
    Task<? extends Serializable> currTask = mapredCtx.getCurrTask();
    MapredWork plan = (MapredWork) currTask.getWork();
    HashMap<Operator<? extends Serializable>, Task<? extends Serializable>> opTaskMap =
        opProcCtx.getOpTaskMap();
    Operator<? extends Serializable> currTopOp = opProcCtx.getCurrTopOp();

    opTaskMap.put(reducer, currTask);
    plan.setReducer(reducer);
    ReduceSinkDesc desc = op.getConf();

    plan.setNumReduceTasks(desc.getNumReducers());

    List<Task<? extends Serializable>> rootTasks = opProcCtx.getRootTasks();

    if (!rootTasks.contains(currTask)) {
      rootTasks.add(currTask);
    }
    if (reducer.getClass() == JoinOperator.class) {
      plan.setNeedsTagging(true);
    }

    assert currTopOp != null;
    List<Operator<? extends Serializable>> seenOps = opProcCtx.getSeenOps();
    String currAliasId = opProcCtx.getCurrAliasId();

    if (!seenOps.contains(currTopOp)) {
      seenOps.add(currTopOp);
      setTaskPlan(currAliasId, currTopOp, plan, false, opProcCtx);
    }

    currTopOp = null;
    currAliasId = null;

    opProcCtx.setCurrTask(currTask);
    opProcCtx.setCurrTopOp(currTopOp);
    opProcCtx.setCurrAliasId(currAliasId);
  }

예제 #4

0

파일 보기

파일: ColumnPrunerProcFactory.java 프로젝트: cschenyuan/hive-hack

 private static boolean[] getPruneReduceSinkOpRetainFlags(
     List<String> retainedParentOpOutputCols, ReduceSinkOperator reduce) {
   ReduceSinkDesc reduceConf = reduce.getConf();
   java.util.ArrayList<ExprNodeDesc> originalValueEval = reduceConf.getValueCols();
   boolean[] flags = new boolean[originalValueEval.size()];
   for (int i = 0; i < originalValueEval.size(); i++) {
     flags[i] = false;
     List<String> current = originalValueEval.get(i).getCols();
     if (current == null || current.size() == 0) {
       flags[i] = true;
     } else {
       for (int j = 0; j < current.size(); j++) {
         if (retainedParentOpOutputCols.contains(current.get(j))) {
           flags[i] = true;
           break;
         }
       }
     }
   }
   return flags;
 }

예제 #5

0

파일 보기

파일: GenMapRedUtils.java 프로젝트: Carlie20083/hive-0.7.0

  /**
   * Split the current plan by creating a temporary destination.
   *
   * @param op the reduce sink operator encountered
   * @param opProcCtx processing context
   */
  public static void splitPlan(ReduceSinkOperator op, GenMRProcContext opProcCtx)
      throws SemanticException {
    // Generate a new task
    ParseContext parseCtx = opProcCtx.getParseCtx();
    MapredWork cplan = getMapRedWork(parseCtx.getConf());
    Task<? extends Serializable> redTask = TaskFactory.get(cplan, parseCtx.getConf());
    Operator<? extends Serializable> reducer = op.getChildOperators().get(0);

    // Add the reducer
    cplan.setReducer(reducer);
    ReduceSinkDesc desc = op.getConf();

    cplan.setNumReduceTasks(new Integer(desc.getNumReducers()));

    HashMap<Operator<? extends Serializable>, Task<? extends Serializable>> opTaskMap =
        opProcCtx.getOpTaskMap();
    opTaskMap.put(reducer, redTask);
    Task<? extends Serializable> currTask = opProcCtx.getCurrTask();

    splitTasks(op, currTask, redTask, opProcCtx, true, false, 0);
    opProcCtx.getRootOps().add(op);
  }

예제 #6

0

파일 보기

파일: GenMapRedUtils.java 프로젝트: Carlie20083/hive-0.7.0

  /**
   * Initialize the current union plan.
   *
   * @param op the reduce sink operator encountered
   * @param opProcCtx processing context
   */
  public static void initUnionPlan(ReduceSinkOperator op, GenMRProcContext opProcCtx)
      throws SemanticException {
    Operator<? extends Serializable> reducer = op.getChildOperators().get(0);
    Map<Operator<? extends Serializable>, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx();
    GenMapRedCtx mapredCtx = mapCurrCtx.get(op.getParentOperators().get(0));
    Task<? extends Serializable> currTask = mapredCtx.getCurrTask();
    MapredWork plan = (MapredWork) currTask.getWork();
    HashMap<Operator<? extends Serializable>, Task<? extends Serializable>> opTaskMap =
        opProcCtx.getOpTaskMap();

    opTaskMap.put(reducer, currTask);
    plan.setReducer(reducer);
    ReduceSinkDesc desc = op.getConf();

    plan.setNumReduceTasks(desc.getNumReducers());

    if (reducer.getClass() == JoinOperator.class) {
      plan.setNeedsTagging(true);
    }

    initUnionPlan(opProcCtx, currTask, false);
  }

예제 #7

0

파일 보기

파일: GenMapRedUtils.java 프로젝트: Carlie20083/hive-0.7.0

  /**
   * Initialize the current plan by adding it to root tasks.
   *
   * @param op the map join operator encountered
   * @param opProcCtx processing context
   * @param pos position of the parent
   */
  public static void initMapJoinPlan(
      Operator<? extends Serializable> op,
      GenMRProcContext opProcCtx,
      boolean readInputMapJoin,
      boolean readInputUnion,
      boolean setReducer,
      int pos,
      boolean createLocalPlan)
      throws SemanticException {
    Map<Operator<? extends Serializable>, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx();
    assert (((pos == -1) && (readInputMapJoin)) || (pos != -1));
    int parentPos = (pos == -1) ? 0 : pos;
    GenMapRedCtx mapredCtx = mapCurrCtx.get(op.getParentOperators().get(parentPos));
    Task<? extends Serializable> currTask = mapredCtx.getCurrTask();
    MapredWork plan = (MapredWork) currTask.getWork();
    HashMap<Operator<? extends Serializable>, Task<? extends Serializable>> opTaskMap =
        opProcCtx.getOpTaskMap();
    Operator<? extends Serializable> currTopOp = opProcCtx.getCurrTopOp();

    // The mapjoin has already been encountered. Some context must be stored
    // about that
    if (readInputMapJoin) {
      AbstractMapJoinOperator<? extends MapJoinDesc> currMapJoinOp = opProcCtx.getCurrMapJoinOp();
      assert currMapJoinOp != null;
      boolean local =
          ((pos == -1) || (pos == (currMapJoinOp.getConf()).getPosBigTable())) ? false : true;

      if (setReducer) {
        Operator<? extends Serializable> reducer = op.getChildOperators().get(0);
        plan.setReducer(reducer);
        opTaskMap.put(reducer, currTask);
        if (reducer.getClass() == JoinOperator.class) {
          plan.setNeedsTagging(true);
        }
        ReduceSinkDesc desc = (ReduceSinkDesc) op.getConf();
        plan.setNumReduceTasks(desc.getNumReducers());
      } else {
        opTaskMap.put(op, currTask);
      }

      if (!readInputUnion) {
        GenMRMapJoinCtx mjCtx = opProcCtx.getMapJoinCtx(currMapJoinOp);
        String taskTmpDir;
        TableDesc tt_desc;
        Operator<? extends Serializable> rootOp;

        if (mjCtx.getOldMapJoin() == null || setReducer) {
          taskTmpDir = mjCtx.getTaskTmpDir();
          tt_desc = mjCtx.getTTDesc();
          rootOp = mjCtx.getRootMapJoinOp();
        } else {
          GenMRMapJoinCtx oldMjCtx = opProcCtx.getMapJoinCtx(mjCtx.getOldMapJoin());
          taskTmpDir = oldMjCtx.getTaskTmpDir();
          tt_desc = oldMjCtx.getTTDesc();
          rootOp = oldMjCtx.getRootMapJoinOp();
        }

        setTaskPlan(taskTmpDir, taskTmpDir, rootOp, plan, local, tt_desc);
        setupBucketMapJoinInfo(plan, currMapJoinOp, createLocalPlan);
      } else {
        initUnionPlan(opProcCtx, currTask, false);
      }

      opProcCtx.setCurrMapJoinOp(null);
    } else {
      MapJoinDesc desc = (MapJoinDesc) op.getConf();

      // The map is overloaded to keep track of mapjoins also
      opTaskMap.put(op, currTask);

      List<Task<? extends Serializable>> rootTasks = opProcCtx.getRootTasks();
      rootTasks.add(currTask);

      assert currTopOp != null;
      List<Operator<? extends Serializable>> seenOps = opProcCtx.getSeenOps();
      String currAliasId = opProcCtx.getCurrAliasId();

      seenOps.add(currTopOp);
      boolean local = (pos == desc.getPosBigTable()) ? false : true;
      setTaskPlan(currAliasId, currTopOp, plan, local, opProcCtx);
      setupBucketMapJoinInfo(
          plan, (AbstractMapJoinOperator<? extends MapJoinDesc>) op, createLocalPlan);
    }

    opProcCtx.setCurrTask(currTask);
    opProcCtx.setCurrTopOp(null);
    opProcCtx.setCurrAliasId(null);
  }

예제 #8

0

파일 보기

파일: SkewJoinOptimizer.java 프로젝트: Jude7/hive

    /**
     * Returns the skewed values in all the tables which are going to be scanned. If the join is on
     * columns c1, c2 and c3 on tables T1 and T2, T1 is skewed on c1 and c4 with the skew values
     * ((1,2),(3,4)), whereas T2 is skewed on c1, c2 with skew values ((5,6),(7,8)), the resulting
     * map would be: <(c1) -> ((1), (3)), (c1,c2) -> ((5,6),(7,8))>
     *
     * @param op The join operator being optimized
     * @param tableScanOpsForJoin table scan operators which are parents of the join operator
     * @return map<join keys intersection skewedkeys, list of skewed values>.
     */
    private Map<List<ExprNodeDesc>, List<List<String>>> getSkewedValues(
        Operator<? extends OperatorDesc> op, List<TableScanOperator> tableScanOpsForJoin) {

      Map<List<ExprNodeDesc>, List<List<String>>> skewDataReturn =
          new HashMap<List<ExprNodeDesc>, List<List<String>>>();

      Map<List<ExprNodeDescEqualityWrapper>, List<List<String>>> skewData =
          new HashMap<List<ExprNodeDescEqualityWrapper>, List<List<String>>>();

      // The join keys are available in the reduceSinkOperators before join
      for (Operator<? extends OperatorDesc> reduceSinkOp : op.getParentOperators()) {
        ReduceSinkDesc rsDesc = ((ReduceSinkOperator) reduceSinkOp).getConf();

        if (rsDesc.getKeyCols() != null) {
          Table table = null;
          // Find the skew information corresponding to the table
          List<String> skewedColumns = null;
          List<List<String>> skewedValueList = null;

          // The join columns which are also skewed
          List<ExprNodeDescEqualityWrapper> joinKeysSkewedCols =
              new ArrayList<ExprNodeDescEqualityWrapper>();

          // skewed Keys which intersect with join keys
          List<Integer> positionSkewedKeys = new ArrayList<Integer>();

          // Update the joinKeys appropriately.
          for (ExprNodeDesc keyColDesc : rsDesc.getKeyCols()) {
            ExprNodeColumnDesc keyCol = null;

            // If the key column is not a column, then dont apply this optimization.
            // This will be fixed as part of https://issues.apache.org/jira/browse/HIVE-3445
            // for type conversion UDFs.
            if (keyColDesc instanceof ExprNodeColumnDesc) {
              keyCol = (ExprNodeColumnDesc) keyColDesc;
              if (table == null) {
                table = getTable(parseContext, reduceSinkOp, tableScanOpsForJoin);
                skewedColumns = table == null ? null : table.getSkewedColNames();
                // No skew on the table to take care of
                if ((skewedColumns == null) || (skewedColumns.isEmpty())) {
                  continue;
                }

                skewedValueList = table == null ? null : table.getSkewedColValues();
              }
              int pos = skewedColumns.indexOf(keyCol.getColumn());
              if ((pos >= 0) && (!positionSkewedKeys.contains(pos))) {
                positionSkewedKeys.add(pos);
                ExprNodeColumnDesc keyColClone = (ExprNodeColumnDesc) keyCol.clone();
                keyColClone.setTabAlias(null);
                joinKeysSkewedCols.add(new ExprNodeDescEqualityWrapper(keyColClone));
              }
            }
          }

          // If the skew keys match the join keys, then add it to the list
          if ((skewedColumns != null) && (!skewedColumns.isEmpty())) {
            if (!joinKeysSkewedCols.isEmpty()) {
              // If the join keys matches the skewed keys, use the table skewed keys
              List<List<String>> skewedJoinValues;
              if (skewedColumns.size() == positionSkewedKeys.size()) {
                skewedJoinValues = skewedValueList;
              } else {
                skewedJoinValues = getSkewedJoinValues(skewedValueList, positionSkewedKeys);
              }

              List<List<String>> oldSkewedJoinValues = skewData.get(joinKeysSkewedCols);
              if (oldSkewedJoinValues == null) {
                oldSkewedJoinValues = new ArrayList<List<String>>();
              }
              for (List<String> skewValue : skewedJoinValues) {
                if (!oldSkewedJoinValues.contains(skewValue)) {
                  oldSkewedJoinValues.add(skewValue);
                }
              }

              skewData.put(joinKeysSkewedCols, oldSkewedJoinValues);
            }
          }
        }
      }

      // convert skewData to contain ExprNodeDesc in the keys
      for (Map.Entry<List<ExprNodeDescEqualityWrapper>, List<List<String>>> mapEntry :
          skewData.entrySet()) {
        List<ExprNodeDesc> skewedKeyJoinCols = new ArrayList<ExprNodeDesc>();
        for (ExprNodeDescEqualityWrapper key : mapEntry.getKey()) {
          skewedKeyJoinCols.add(key.getExprNodeDesc());
        }
        skewDataReturn.put(skewedKeyJoinCols, mapEntry.getValue());
      }

      return skewDataReturn;
    }

예제 #9

0

파일 보기

파일: ReduceSinkDeDuplication.java 프로젝트: phoenixhadoop/hive

 /**
  * Returns merge directions between two RSs for criterias (ordering, number of reducers, reducer
  * keys, partition keys). Returns null if any of categories is not mergeable.
  *
  * <p>Values for each index can be -1, 0, 1 1. 0 means two configuration in the category is the
  * same 2. for -1, configuration of parent RS is more specific than child RS 3. for 1,
  * configuration of child RS is more specific than parent RS
  */
 private int[] checkStatus(ReduceSinkOperator cRS, ReduceSinkOperator pRS, int minReducer)
     throws SemanticException {
   ReduceSinkDesc cConf = cRS.getConf();
   ReduceSinkDesc pConf = pRS.getConf();
   Integer moveRSOrderTo = checkOrder(cConf.getOrder(), pConf.getOrder());
   if (moveRSOrderTo == null) {
     return null;
   }
   Integer moveReducerNumTo = checkNumReducer(cConf.getNumReducers(), pConf.getNumReducers());
   if (moveReducerNumTo == null || moveReducerNumTo > 0 && cConf.getNumReducers() < minReducer) {
     return null;
   }
   List<ExprNodeDesc> ckeys = cConf.getKeyCols();
   List<ExprNodeDesc> pkeys = pConf.getKeyCols();
   Integer moveKeyColTo = checkExprs(ckeys, pkeys, cRS, pRS);
   if (moveKeyColTo == null) {
     return null;
   }
   List<ExprNodeDesc> cpars = cConf.getPartitionCols();
   List<ExprNodeDesc> ppars = pConf.getPartitionCols();
   Integer movePartitionColTo = checkExprs(cpars, ppars, cRS, pRS);
   if (movePartitionColTo == null) {
     return null;
   }
   Integer moveNumDistKeyTo =
       checkNumDistributionKey(cConf.getNumDistributionKeys(), pConf.getNumDistributionKeys());
   return new int[] {
     moveKeyColTo, movePartitionColTo, moveRSOrderTo, moveReducerNumTo, moveNumDistKeyTo
   };
 }

예제 #10

0

파일 보기

파일: ReduceSinkDeDuplication.java 프로젝트: phoenixhadoop/hive

    // for JOIN-RS case, it's not possible generally to merge if child has
    // less key/partition columns than parents
    protected boolean merge(ReduceSinkOperator cRS, JoinOperator pJoin, int minReducer)
        throws SemanticException {
      List<Operator<?>> parents = pJoin.getParentOperators();
      ReduceSinkOperator[] pRSs = parents.toArray(new ReduceSinkOperator[parents.size()]);
      ReduceSinkDesc cRSc = cRS.getConf();
      ReduceSinkDesc pRS0c = pRSs[0].getConf();
      if (cRSc.getKeyCols().size() < pRS0c.getKeyCols().size()) {
        return false;
      }
      if (cRSc.getPartitionCols().size() != pRS0c.getPartitionCols().size()) {
        return false;
      }
      Integer moveReducerNumTo = checkNumReducer(cRSc.getNumReducers(), pRS0c.getNumReducers());
      if (moveReducerNumTo == null || moveReducerNumTo > 0 && cRSc.getNumReducers() < minReducer) {
        return false;
      }

      Integer moveRSOrderTo = checkOrder(cRSc.getOrder(), pRS0c.getOrder());
      if (moveRSOrderTo == null) {
        return false;
      }

      boolean[] sorted = CorrelationUtilities.getSortedTags(pJoin);

      int cKeySize = cRSc.getKeyCols().size();
      for (int i = 0; i < cKeySize; i++) {
        ExprNodeDesc cexpr = cRSc.getKeyCols().get(i);
        ExprNodeDesc[] pexprs = new ExprNodeDesc[pRSs.length];
        for (int tag = 0; tag < pRSs.length; tag++) {
          pexprs[tag] = pRSs[tag].getConf().getKeyCols().get(i);
        }
        int found = CorrelationUtilities.indexOf(cexpr, pexprs, cRS, pRSs, sorted);
        if (found != i) {
          return false;
        }
      }
      int cPartSize = cRSc.getPartitionCols().size();
      for (int i = 0; i < cPartSize; i++) {
        ExprNodeDesc cexpr = cRSc.getPartitionCols().get(i);
        ExprNodeDesc[] pexprs = new ExprNodeDesc[pRSs.length];
        for (int tag = 0; tag < pRSs.length; tag++) {
          pexprs[tag] = pRSs[tag].getConf().getPartitionCols().get(i);
        }
        int found = CorrelationUtilities.indexOf(cexpr, pexprs, cRS, pRSs, sorted);
        if (found != i) {
          return false;
        }
      }

      if (moveReducerNumTo > 0) {
        for (ReduceSinkOperator pRS : pRSs) {
          pRS.getConf().setNumReducers(cRS.getConf().getNumReducers());
        }
      }
      return true;
    }

예제 #11

0

파일 보기

파일: SortedDynPartitionOptimizer.java 프로젝트: hadoop-zuiwanyuan/hive

    public ReduceSinkOperator getReduceSinkOp(
        List<Integer> partitionPositions,
        List<Integer> sortPositions,
        List<Integer> sortOrder,
        List<Integer> sortNullOrder,
        ArrayList<ExprNodeDesc> allCols,
        ArrayList<ExprNodeDesc> bucketColumns,
        int numBuckets,
        Operator<? extends OperatorDesc> parent,
        AcidUtils.Operation writeType)
        throws SemanticException {

      // Order of KEY columns
      // 1) Partition columns
      // 2) Bucket number column
      // 3) Sort columns
      Set<Integer> keyColsPosInVal = Sets.newLinkedHashSet();
      ArrayList<ExprNodeDesc> keyCols = Lists.newArrayList();
      List<Integer> newSortOrder = Lists.newArrayList();
      List<Integer> newSortNullOrder = Lists.newArrayList();
      int numPartAndBuck = partitionPositions.size();

      keyColsPosInVal.addAll(partitionPositions);
      if (!bucketColumns.isEmpty()
          || writeType == Operation.DELETE
          || writeType == Operation.UPDATE) {
        keyColsPosInVal.add(-1);
        numPartAndBuck += 1;
      }
      keyColsPosInVal.addAll(sortPositions);

      // by default partition and bucket columns are sorted in ascending order
      Integer order = 1;
      if (sortOrder != null && !sortOrder.isEmpty()) {
        if (sortOrder.get(0).intValue() == 0) {
          order = 0;
        }
      }
      for (int i = 0; i < numPartAndBuck; i++) {
        newSortOrder.add(order);
      }
      newSortOrder.addAll(sortOrder);

      String orderStr = "";
      for (Integer i : newSortOrder) {
        if (i.intValue() == 1) {
          orderStr += "+";
        } else {
          orderStr += "-";
        }
      }

      // if partition and bucket columns are sorted in ascending order, by default
      // nulls come first; otherwise nulls come last
      Integer nullOrder = order == 1 ? 0 : 1;
      if (sortNullOrder != null && !sortNullOrder.isEmpty()) {
        if (sortNullOrder.get(0).intValue() == 0) {
          nullOrder = 0;
        } else {
          nullOrder = 1;
        }
      }
      for (int i = 0; i < numPartAndBuck; i++) {
        newSortNullOrder.add(nullOrder);
      }
      newSortNullOrder.addAll(sortNullOrder);

      String nullOrderStr = "";
      for (Integer i : newSortNullOrder) {
        if (i.intValue() == 0) {
          nullOrderStr += "a";
        } else {
          nullOrderStr += "z";
        }
      }

      Map<String, ExprNodeDesc> colExprMap = Maps.newHashMap();
      ArrayList<ExprNodeDesc> partCols = Lists.newArrayList();

      // we will clone here as RS will update bucket column key with its
      // corresponding with bucket number and hence their OIs
      for (Integer idx : keyColsPosInVal) {
        if (idx < 0) {
          ExprNodeConstantDesc bucketNumCol =
              new ExprNodeConstantDesc(TypeInfoFactory.stringTypeInfo, BUCKET_NUMBER_COL_NAME);
          keyCols.add(bucketNumCol);
          colExprMap.put(
              Utilities.ReduceField.KEY + ".'" + BUCKET_NUMBER_COL_NAME + "'", bucketNumCol);
        } else {
          keyCols.add(allCols.get(idx).clone());
        }
      }

      ArrayList<ExprNodeDesc> valCols = Lists.newArrayList();
      for (int i = 0; i < allCols.size(); i++) {
        if (!keyColsPosInVal.contains(i)) {
          valCols.add(allCols.get(i).clone());
        }
      }

      for (Integer idx : partitionPositions) {
        partCols.add(allCols.get(idx).clone());
      }

      // in the absence of SORTED BY clause, the sorted dynamic partition insert
      // should honor the ordering of records provided by ORDER BY in SELECT statement
      ReduceSinkOperator parentRSOp =
          OperatorUtils.findSingleOperatorUpstream(parent, ReduceSinkOperator.class);
      if (parentRSOp != null && parseCtx.getQueryProperties().hasOuterOrderBy()) {
        String parentRSOpOrder = parentRSOp.getConf().getOrder();
        String parentRSOpNullOrder = parentRSOp.getConf().getNullOrder();
        if (parentRSOpOrder != null && !parentRSOpOrder.isEmpty() && sortPositions.isEmpty()) {
          keyCols.addAll(parentRSOp.getConf().getKeyCols());
          orderStr += parentRSOpOrder;
          nullOrderStr += parentRSOpNullOrder;
        }
      }

      // map _col0 to KEY._col0, etc
      Map<String, String> nameMapping = new HashMap<>();
      ArrayList<String> keyColNames = Lists.newArrayList();
      for (ExprNodeDesc keyCol : keyCols) {
        String keyColName = keyCol.getExprString();
        keyColNames.add(keyColName);
        colExprMap.put(Utilities.ReduceField.KEY + "." + keyColName, keyCol);
        nameMapping.put(keyColName, Utilities.ReduceField.KEY + "." + keyColName);
      }
      ArrayList<String> valColNames = Lists.newArrayList();
      for (ExprNodeDesc valCol : valCols) {
        String colName = valCol.getExprString();
        valColNames.add(colName);
        colExprMap.put(Utilities.ReduceField.VALUE + "." + colName, valCol);
        nameMapping.put(colName, Utilities.ReduceField.VALUE + "." + colName);
      }

      // Create Key/Value TableDesc. When the operator plan is split into MR tasks,
      // the reduce operator will initialize Extract operator with information
      // from Key and Value TableDesc
      List<FieldSchema> fields =
          PlanUtils.getFieldSchemasFromColumnList(keyCols, keyColNames, 0, "");
      TableDesc keyTable = PlanUtils.getReduceKeyTableDesc(fields, orderStr, nullOrderStr);
      List<FieldSchema> valFields =
          PlanUtils.getFieldSchemasFromColumnList(valCols, valColNames, 0, "");
      TableDesc valueTable = PlanUtils.getReduceValueTableDesc(valFields);
      List<List<Integer>> distinctColumnIndices = Lists.newArrayList();

      // Number of reducers is set to default (-1)
      ReduceSinkDesc rsConf =
          new ReduceSinkDesc(
              keyCols,
              keyCols.size(),
              valCols,
              keyColNames,
              distinctColumnIndices,
              valColNames,
              -1,
              partCols,
              -1,
              keyTable,
              valueTable,
              writeType);
      rsConf.setBucketCols(bucketColumns);
      rsConf.setNumBuckets(numBuckets);

      ArrayList<ColumnInfo> signature = new ArrayList<>();
      for (int index = 0; index < parent.getSchema().getSignature().size(); index++) {
        ColumnInfo colInfo = new ColumnInfo(parent.getSchema().getSignature().get(index));
        colInfo.setInternalName(nameMapping.get(colInfo.getInternalName()));
        signature.add(colInfo);
      }
      ReduceSinkOperator op =
          (ReduceSinkOperator)
              OperatorFactory.getAndMakeChild(rsConf, new RowSchema(signature), parent);
      op.setColumnExprMap(colExprMap);
      return op;
    }