/**
 * Returns merge directions between two RSs for criteria (ordering, number of reducers,
 * reducer keys, partition keys). Returns null if any of the categories is not mergeable.
 *
 * <p>Values for each index can be -1, 0, 1:
 * 1. 0 means the two configurations in the category are the same
 * 2. for -1, the configuration of the parent RS is more specific than the child RS
 * 3. for 1, the configuration of the child RS is more specific than the parent RS
 */
private int[] checkStatus(ReduceSinkOperator cRS, ReduceSinkOperator pRS, int minReducer)
    throws SemanticException {
  ReduceSinkDesc cConf = cRS.getConf();
  ReduceSinkDesc pConf = pRS.getConf();
  Integer moveRSOrderTo = checkOrder(cConf.getOrder(), pConf.getOrder());
  if (moveRSOrderTo == null) {
    return null;
  }
  Integer moveReducerNumTo = checkNumReducer(cConf.getNumReducers(), pConf.getNumReducers());
  if (moveReducerNumTo == null
      || moveReducerNumTo > 0 && cConf.getNumReducers() < minReducer) {
    return null;
  }
  List<ExprNodeDesc> ckeys = cConf.getKeyCols();
  List<ExprNodeDesc> pkeys = pConf.getKeyCols();
  Integer moveKeyColTo = checkExprs(ckeys, pkeys, cRS, pRS);
  if (moveKeyColTo == null) {
    return null;
  }
  List<ExprNodeDesc> cpars = cConf.getPartitionCols();
  List<ExprNodeDesc> ppars = pConf.getPartitionCols();
  Integer movePartitionColTo = checkExprs(cpars, ppars, cRS, pRS);
  if (movePartitionColTo == null) {
    return null;
  }
  Integer moveNumDistKeyTo =
      checkNumDistributionKey(cConf.getNumDistributionKeys(), pConf.getNumDistributionKeys());
  return new int[] {
    moveKeyColTo, movePartitionColTo, moveRSOrderTo, moveReducerNumTo, moveNumDistKeyTo
  };
}
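// ---------------------------------------------------------------------------
// Illustration only, not Hive source: a minimal, self-contained sketch of how
// a caller could read the direction array returned by checkStatus(). The
// MergeDirections class and the example values below are hypothetical.
public class MergeDirections {
  // Index meanings: 0 = key cols, 1 = partition cols, 2 = sort order,
  // 3 = num reducers, 4 = num distribution keys.
  static String describe(int d) {
    if (d == 0) return "same in both RSs; nothing to copy";
    if (d < 0)  return "parent RS more specific; keep parent's setting";
    return "child RS more specific; copy child's setting to parent";
  }

  public static void main(String[] args) {
    // e.g. child keys (a, b, c) vs parent keys (a, b) => key direction 1;
    // identical partition cols => 0; child order "++-" vs parent "++" => 1.
    int[] result = {1, 0, 1, 0, 0};
    String[] category = {"key cols", "partition cols", "order", "num reducers", "num dist keys"};
    for (int i = 0; i < result.length; i++) {
      System.out.println(category[i] + ": " + describe(result[i]));
    }
  }
}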
// Try to infer possible sort columns in the query,
// i.e. the sequence must be pRS-SEL*-fsParent.
// On failure, sortPositions and sortOrder are left empty.
private void inferSortPositions(
    Operator<? extends OperatorDesc> fsParent,
    List<Integer> sortPositions,
    List<Integer> sortOrder)
    throws SemanticException {
  // If it is not a SEL operator, we bail out
  if (!(fsParent instanceof SelectOperator)) {
    return;
  }
  SelectOperator pSel = (SelectOperator) fsParent;
  Operator<? extends OperatorDesc> parent = pSel;
  while (!(parent instanceof ReduceSinkOperator)) {
    if (parent.getNumParent() != 1 || !(parent instanceof SelectOperator)) {
      return;
    }
    parent = parent.getParentOperators().get(0);
  }
  // Backtrack SEL columns to pRS
  List<ExprNodeDesc> selColsInPRS =
      ExprNodeDescUtils.backtrack(pSel.getConf().getColList(), pSel, parent);
  ReduceSinkOperator pRS = (ReduceSinkOperator) parent;
  for (int i = 0; i < pRS.getConf().getKeyCols().size(); i++) {
    ExprNodeDesc col = pRS.getConf().getKeyCols().get(i);
    int pos = selColsInPRS.indexOf(col);
    if (pos == -1) {
      sortPositions.clear();
      sortOrder.clear();
      return;
    }
    sortPositions.add(pos);
    sortOrder.add(pRS.getConf().getOrder().charAt(i) == '+' ? 1 : 0); // 1 asc, 0 desc
  }
}
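// ---------------------------------------------------------------------------
// Illustration only, not Hive source: the same position inference over plain
// strings. The InferSortDemo class and toy column names are hypothetical.
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class InferSortDemo {
  public static void main(String[] args) {
    List<String> selColsInPRS = Arrays.asList("b", "a"); // SEL output, backtracked to pRS
    List<String> rsKeys = Arrays.asList("a", "b");       // pRS key columns
    String order = "+-";                                 // pRS sort order string
    List<Integer> sortPositions = new ArrayList<>();
    List<Integer> sortOrder = new ArrayList<>();
    for (int i = 0; i < rsKeys.size(); i++) {
      int pos = selColsInPRS.indexOf(rsKeys.get(i));
      if (pos == -1) { // key not projected by the SELECT: inference fails, lists stay empty
        sortPositions.clear();
        sortOrder.clear();
        return;
      }
      sortPositions.add(pos);
      sortOrder.add(order.charAt(i) == '+' ? 1 : 0); // 1 asc, 0 desc
    }
    System.out.println(sortPositions + " " + sortOrder); // [1, 0] [1, 0]
  }
}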
private static void pruneReduceSinkOperator(
    boolean[] retainFlags, ReduceSinkOperator reduce, ColumnPrunerProcCtx cppCtx)
    throws SemanticException {
  ReduceSinkDesc reduceConf = reduce.getConf();
  Map<String, ExprNodeDesc> oldMap = reduce.getColumnExprMap();
  LOG.info("RS " + reduce.getIdentifier() + " oldColExprMap: " + oldMap);
  RowResolver oldRR = cppCtx.getOpToParseCtxMap().get(reduce).getRowResolver();
  ArrayList<ColumnInfo> old_signature = oldRR.getRowSchema().getSignature();
  ArrayList<ColumnInfo> signature = new ArrayList<ColumnInfo>(old_signature);

  List<String> valueColNames = reduceConf.getOutputValueColumnNames();
  ArrayList<String> newValueColNames = new ArrayList<String>();

  List<ExprNodeDesc> keyExprs = reduceConf.getKeyCols();
  List<ExprNodeDesc> valueExprs = reduceConf.getValueCols();
  ArrayList<ExprNodeDesc> newValueExprs = new ArrayList<ExprNodeDesc>();

  for (int i = 0; i < retainFlags.length; i++) {
    String outputCol = valueColNames.get(i);
    ExprNodeDesc outputColExpr = valueExprs.get(i);
    if (!retainFlags[i]) {
      String[] nm = oldRR.reverseLookup(outputCol);
      if (nm == null) {
        outputCol = Utilities.ReduceField.VALUE.toString() + "." + outputCol;
        nm = oldRR.reverseLookup(outputCol);
      }

      // In case multiple columns reference the same column name, we won't
      // resolve the row again because the ColumnInfo in the row resolver is already removed
      if (nm == null) {
        continue;
      }

      // Only remove information of a column if it is not a key,
      // i.e. this column does not appear in the keyExprs of the RS
      if (ExprNodeDescUtils.indexOf(outputColExpr, keyExprs) == -1) {
        ColumnInfo colInfo = oldRR.getFieldMap(nm[0]).remove(nm[1]);
        oldRR.getInvRslvMap().remove(colInfo.getInternalName());
        oldMap.remove(outputCol);
        signature.remove(colInfo);
      }
    } else {
      newValueColNames.add(outputCol);
      newValueExprs.add(outputColExpr);
    }
  }

  oldRR.getRowSchema().setSignature(signature);
  reduce.getSchema().setSignature(signature);
  reduceConf.setOutputValueColumnNames(newValueColNames);
  reduceConf.setValueCols(newValueExprs);
  TableDesc newValueTable =
      PlanUtils.getReduceValueTableDesc(
          PlanUtils.getFieldSchemasFromColumnList(
              reduceConf.getValueCols(), newValueColNames, 0, ""));
  reduceConf.setValueSerializeInfo(newValueTable);
  LOG.info("RS " + reduce.getIdentifier() + " newColExprMap: " + oldMap);
}
// for the JOIN-RS case, it is generally not possible to merge if the child has
// fewer key/partition columns than the parents
protected boolean merge(ReduceSinkOperator cRS, JoinOperator pJoin, int minReducer)
    throws SemanticException {
  List<Operator<?>> parents = pJoin.getParentOperators();
  ReduceSinkOperator[] pRSs = parents.toArray(new ReduceSinkOperator[parents.size()]);
  ReduceSinkDesc cRSc = cRS.getConf();
  ReduceSinkDesc pRS0c = pRSs[0].getConf();
  if (cRSc.getKeyCols().size() < pRS0c.getKeyCols().size()) {
    return false;
  }
  if (cRSc.getPartitionCols().size() != pRS0c.getPartitionCols().size()) {
    return false;
  }
  Integer moveReducerNumTo = checkNumReducer(cRSc.getNumReducers(), pRS0c.getNumReducers());
  if (moveReducerNumTo == null
      || moveReducerNumTo > 0 && cRSc.getNumReducers() < minReducer) {
    return false;
  }
  Integer moveRSOrderTo = checkOrder(cRSc.getOrder(), pRS0c.getOrder());
  if (moveRSOrderTo == null) {
    return false;
  }
  boolean[] sorted = CorrelationUtilities.getSortedTags(pJoin);
  int cKeySize = cRSc.getKeyCols().size();
  for (int i = 0; i < cKeySize; i++) {
    ExprNodeDesc cexpr = cRSc.getKeyCols().get(i);
    ExprNodeDesc[] pexprs = new ExprNodeDesc[pRSs.length];
    for (int tag = 0; tag < pRSs.length; tag++) {
      pexprs[tag] = pRSs[tag].getConf().getKeyCols().get(i);
    }
    int found = CorrelationUtilities.indexOf(cexpr, pexprs, cRS, pRSs, sorted);
    if (found != i) {
      return false;
    }
  }
  int cPartSize = cRSc.getPartitionCols().size();
  for (int i = 0; i < cPartSize; i++) {
    ExprNodeDesc cexpr = cRSc.getPartitionCols().get(i);
    ExprNodeDesc[] pexprs = new ExprNodeDesc[pRSs.length];
    for (int tag = 0; tag < pRSs.length; tag++) {
      pexprs[tag] = pRSs[tag].getConf().getPartitionCols().get(i);
    }
    int found = CorrelationUtilities.indexOf(cexpr, pexprs, cRS, pRSs, sorted);
    if (found != i) {
      return false;
    }
  }
  if (moveReducerNumTo > 0) {
    for (ReduceSinkOperator pRS : pRSs) {
      pRS.getConf().setNumReducers(cRS.getConf().getNumReducers());
    }
  }
  return true;
}
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx, Object... nodeOutputs)
    throws SemanticException {
  ReduceSinkOperator op = (ReduceSinkOperator) nd;
  ColumnPrunerProcCtx cppCtx = (ColumnPrunerProcCtx) ctx;
  RowResolver resolver = cppCtx.getOpToParseCtxMap().get(op).getRowResolver();
  ReduceSinkDesc conf = op.getConf();

  List<String> colLists = new ArrayList<String>();
  ArrayList<ExprNodeDesc> keys = conf.getKeyCols();
  LOG.debug("Reduce Sink Operator " + op.getIdentifier() + " key:" + keys);
  for (ExprNodeDesc key : keys) {
    colLists = Utilities.mergeUniqElems(colLists, key.getCols());
  }

  assert op.getNumChild() == 1;

  Operator<? extends OperatorDesc> child = op.getChildOperators().get(0);

  List<String> childCols;
  if (child instanceof CommonJoinOperator) {
    childCols = cppCtx.getJoinPrunedColLists().get(child).get((byte) conf.getTag());
  } else {
    childCols = cppCtx.getPrunedColList(child);
  }

  List<ExprNodeDesc> valCols = conf.getValueCols();
  List<String> valColNames = conf.getOutputValueColumnNames();

  if (childCols != null) {
    boolean[] flags = new boolean[valCols.size()];

    for (String childCol : childCols) {
      int index = valColNames.indexOf(Utilities.removeValueTag(childCol));
      if (index < 0) {
        continue;
      }
      flags[index] = true;
      colLists = Utilities.mergeUniqElems(colLists, valCols.get(index).getCols());
    }

    Collections.sort(colLists);
    pruneReduceSinkOperator(flags, op, cppCtx);
    cppCtx.getPrunedColLists().put(op, colLists);
    return null;
  }

  // Reduce Sink contains the columns needed - no need to aggregate from children
  for (ExprNodeDesc val : valCols) {
    colLists = Utilities.mergeUniqElems(colLists, val.getCols());
  }

  cppCtx.getPrunedColLists().put(op, colLists);
  return null;
}
// Remove the RS and SEL introduced by the enforce bucketing/sorting config.
// Convert PARENT -> RS -> SEL -> FS to PARENT -> FS.
private boolean removeRSInsertedByEnforceBucketing(FileSinkOperator fsOp) {
  Set<ReduceSinkOperator> reduceSinks =
      OperatorUtils.findOperatorsUpstream(fsOp, ReduceSinkOperator.class);
  Operator<? extends OperatorDesc> rsToRemove = null;
  List<ReduceSinkOperator> rsOps =
      parseCtx.getReduceSinkOperatorsAddedByEnforceBucketingSorting();
  boolean found = false;

  // iterate through all RSs and locate the one introduced by enforce bucketing
  for (ReduceSinkOperator reduceSink : reduceSinks) {
    for (ReduceSinkOperator rsOp : rsOps) {
      if (reduceSink.equals(rsOp)) {
        rsToRemove = reduceSink;
        found = true;
        break;
      }
    }
    if (found) {
      break;
    }
  }

  // if an RS is found, remove it and its child (SEL) and connect its parent
  // and grandchild
  if (found) {
    Operator<? extends OperatorDesc> rsParent = rsToRemove.getParentOperators().get(0);
    Operator<? extends OperatorDesc> rsChild = rsToRemove.getChildOperators().get(0);
    Operator<? extends OperatorDesc> rsGrandChild = rsChild.getChildOperators().get(0);

    if (rsChild instanceof SelectOperator) {
      // if the schema sizes cannot be matched, it could be because constant folding
      // converted a partition column expression to a constant expression. The constant
      // expression will then get pruned by the column pruner since it does not reference
      // any columns.
      if (rsParent.getSchema().getSignature().size()
          != rsChild.getSchema().getSignature().size()) {
        return false;
      }
      rsParent.getChildOperators().clear();
      rsParent.getChildOperators().add(rsGrandChild);
      rsGrandChild.getParentOperators().clear();
      rsGrandChild.getParentOperators().add(rsParent);
      LOG.info(
          "Removed "
              + rsToRemove.getOperatorId()
              + " and "
              + rsChild.getOperatorId()
              + " as it was introduced by enforce bucketing/sorting.");
    }
  }
  return true;
}
// pRS-cRS
@Override
public Object process(ReduceSinkOperator cRS, ReduceSinkDeduplicateProcCtx dedupCtx)
    throws SemanticException {
  ReduceSinkOperator pRS =
      CorrelationUtilities.findPossibleParent(
          cRS, ReduceSinkOperator.class, dedupCtx.trustScript());
  if (pRS != null && merge(cRS, pRS, dedupCtx.minReducer())) {
    CorrelationUtilities.replaceReduceSinkWithSelectOperator(cRS, dedupCtx.getPctx(), dedupCtx);
    pRS.getConf().setDeduplicated(true);
    return true;
  }
  return false;
}
@Override
public ParseContext transform(ParseContext pctx) throws SemanticException {
  pGraphContext = pctx;

  Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
  opRules.put(
      new RuleRegExp(
          "R1",
          "("
              + FilterOperator.getOperatorName()
              + "%"
              + ReduceSinkOperator.getOperatorName()
              + "%"
              + JoinOperator.getOperatorName()
              + "%)"),
      new JoinTransitive());

  // The dispatcher fires the processor corresponding to the closest matching
  // rule and passes the context along
  TransitiveContext context = new TransitiveContext();
  Dispatcher disp = new DefaultRuleDispatcher(null, opRules, context);
  GraphWalker ogw = new LevelOrderWalker(disp, 2);

  // Create a list of top operator nodes
  List<Node> topNodes = new ArrayList<Node>();
  topNodes.addAll(pGraphContext.getTopOps().values());
  ogw.startWalking(topNodes, null);

  Map<ReduceSinkOperator, List<ExprNodeDesc>> newFilters = context.getNewfilters();

  // insert each new filter between an RS and the parent of that RS
  for (Map.Entry<ReduceSinkOperator, List<ExprNodeDesc>> entry : newFilters.entrySet()) {
    ReduceSinkOperator reducer = entry.getKey();
    Operator<?> parent = reducer.getParentOperators().get(0);
    List<ExprNodeDesc> exprs = entry.getValue();
    if (parent instanceof FilterOperator) {
      exprs = ExprNodeDescUtils.split(((FilterOperator) parent).getConf().getPredicate(), exprs);
      ExprNodeDesc merged = ExprNodeDescUtils.mergePredicates(exprs);
      ((FilterOperator) parent).getConf().setPredicate(merged);
    } else {
      ExprNodeDesc merged = ExprNodeDescUtils.mergePredicates(exprs);
      RowSchema parentRS = parent.getSchema();
      Operator<FilterDesc> newFilter = createFilter(reducer, parent, parentRS, merged);
    }
  }
  return pGraphContext;
}
@Override
public ParseContext transform(ParseContext pctx) throws SemanticException {
  Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();

  // process the reduce sink added by hive.enforce.bucketing or hive.enforce.sorting
  opRules.put(
      new RuleRegExp(
          "R1",
          ReduceSinkOperator.getOperatorName()
              + "%"
              + SelectOperator.getOperatorName()
              + "%"
              + FileSinkOperator.getOperatorName()
              + "%"),
      getBucketSortReduceSinkProc(pctx));

  // The dispatcher fires the processor corresponding to the closest matching rule
  Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, null);
  GraphWalker ogw = new DefaultGraphWalker(disp);

  // Create a list of top nodes
  ArrayList<Node> topNodes = new ArrayList<Node>();
  topNodes.addAll(pctx.getTopOps().values());
  ogw.startWalking(topNodes, null);

  return pctx;
}
// pRS-cRS-cGBY
@Override
public Object process(
    ReduceSinkOperator cRS, GroupByOperator cGBY, ReduceSinkDeduplicateProcCtx dedupCtx)
    throws SemanticException {
  Operator<?> start = CorrelationUtilities.getStartForGroupBy(cRS, dedupCtx);
  ReduceSinkOperator pRS =
      CorrelationUtilities.findPossibleParent(
          start, ReduceSinkOperator.class, dedupCtx.trustScript());
  if (pRS != null && merge(cRS, pRS, dedupCtx.minReducer())) {
    if (dedupCtx.getPctx().getConf().getBoolVar(HiveConf.ConfVars.HIVEGROUPBYSKEW)) {
      return false;
    }
    CorrelationUtilities.removeReduceSinkForGroupBy(cRS, cGBY, dedupCtx.getPctx(), dedupCtx);
    pRS.getConf().setDeduplicated(true);
    return true;
  }
  return false;
}
// check whether the same filter exists already
private boolean filterExists(ReduceSinkOperator target, ExprNodeDesc replaced) {
  Operator<?> operator = target.getParentOperators().get(0);
  for (; operator instanceof FilterOperator; operator = operator.getParentOperators().get(0)) {
    ExprNodeDesc predicate = ((FilterOperator) operator).getConf().getPredicate();
    if (ExprNodeDescUtils.containsPredicate(predicate, replaced)) {
      return true;
    }
  }
  return false;
}
/**
 * Initialize the current plan by adding it to root tasks.
 *
 * @param op the reduce sink operator encountered
 * @param opProcCtx processing context
 */
public static void initPlan(ReduceSinkOperator op, GenMRProcContext opProcCtx)
    throws SemanticException {
  Operator<? extends Serializable> reducer = op.getChildOperators().get(0);
  Map<Operator<? extends Serializable>, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx();
  GenMapRedCtx mapredCtx = mapCurrCtx.get(op.getParentOperators().get(0));
  Task<? extends Serializable> currTask = mapredCtx.getCurrTask();
  MapredWork plan = (MapredWork) currTask.getWork();
  HashMap<Operator<? extends Serializable>, Task<? extends Serializable>> opTaskMap =
      opProcCtx.getOpTaskMap();
  Operator<? extends Serializable> currTopOp = opProcCtx.getCurrTopOp();

  opTaskMap.put(reducer, currTask);
  plan.setReducer(reducer);
  ReduceSinkDesc desc = op.getConf();
  plan.setNumReduceTasks(desc.getNumReducers());

  List<Task<? extends Serializable>> rootTasks = opProcCtx.getRootTasks();
  if (!rootTasks.contains(currTask)) {
    rootTasks.add(currTask);
  }
  if (reducer.getClass() == JoinOperator.class) {
    plan.setNeedsTagging(true);
  }

  assert currTopOp != null;
  List<Operator<? extends Serializable>> seenOps = opProcCtx.getSeenOps();
  String currAliasId = opProcCtx.getCurrAliasId();
  if (!seenOps.contains(currTopOp)) {
    seenOps.add(currTopOp);
    setTaskPlan(currAliasId, currTopOp, plan, false, opProcCtx);
  }

  currTopOp = null;
  currAliasId = null;
  opProcCtx.setCurrTask(currTask);
  opProcCtx.setCurrTopOp(currTopOp);
  opProcCtx.setCurrAliasId(currAliasId);
}
// Remove the reduce sink operator.
// Use BucketizedHiveInputFormat so that one mapper processes exactly one file.
private void removeReduceSink(
    ReduceSinkOperator rsOp, TableScanOperator tsOp, FileSinkOperator fsOp) {
  Operator<? extends OperatorDesc> parRSOp = rsOp.getParentOperators().get(0);
  parRSOp.getChildOperators().set(0, fsOp);
  fsOp.getParentOperators().set(0, parRSOp);
  fsOp.getConf().setMultiFileSpray(false);
  fsOp.getConf().setTotalFiles(1);
  fsOp.getConf().setNumFiles(1);
  fsOp.getConf().setRemovedReduceSinkBucketSort(true);
  tsOp.setUseBucketizedHiveInputFormat(true);
}
// pRS-pJOIN-cRS-cGBY
@Override
public Object process(
    ReduceSinkOperator cRS, GroupByOperator cGBY, ReduceSinkDeduplicateProcCtx dedupCtx)
    throws SemanticException {
  Operator<?> start = CorrelationUtilities.getStartForGroupBy(cRS, dedupCtx);
  JoinOperator pJoin =
      CorrelationUtilities.findPossibleParent(start, JoinOperator.class, dedupCtx.trustScript());
  if (pJoin != null && merge(cRS, pJoin, dedupCtx.minReducer())) {
    pJoin.getConf().setFixedAsSorted(true);
    CorrelationUtilities.removeReduceSinkForGroupBy(cRS, cGBY, dedupCtx.getPctx(), dedupCtx);
    ReduceSinkOperator pRS =
        CorrelationUtilities.findPossibleParent(
            pJoin, ReduceSinkOperator.class, dedupCtx.trustScript());
    if (pRS != null) {
      pRS.getConf().setDeduplicated(true);
    }
    return true;
  }
  return false;
}
/**
 * Split the current plan by creating a temporary destination.
 *
 * @param op the reduce sink operator encountered
 * @param opProcCtx processing context
 */
public static void splitPlan(ReduceSinkOperator op, GenMRProcContext opProcCtx)
    throws SemanticException {
  // Generate a new task
  ParseContext parseCtx = opProcCtx.getParseCtx();
  MapredWork cplan = getMapRedWork(parseCtx.getConf());
  Task<? extends Serializable> redTask = TaskFactory.get(cplan, parseCtx.getConf());
  Operator<? extends Serializable> reducer = op.getChildOperators().get(0);

  // Add the reducer
  cplan.setReducer(reducer);
  ReduceSinkDesc desc = op.getConf();
  cplan.setNumReduceTasks(Integer.valueOf(desc.getNumReducers()));

  HashMap<Operator<? extends Serializable>, Task<? extends Serializable>> opTaskMap =
      opProcCtx.getOpTaskMap();
  opTaskMap.put(reducer, redTask);

  Task<? extends Serializable> currTask = opProcCtx.getCurrTask();
  splitTasks(op, currTask, redTask, opProcCtx, true, false, 0);
  opProcCtx.getRootOps().add(op);
}
/**
 * Initialize the current union plan.
 *
 * @param op the reduce sink operator encountered
 * @param opProcCtx processing context
 */
public static void initUnionPlan(ReduceSinkOperator op, GenMRProcContext opProcCtx)
    throws SemanticException {
  Operator<? extends Serializable> reducer = op.getChildOperators().get(0);
  Map<Operator<? extends Serializable>, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx();
  GenMapRedCtx mapredCtx = mapCurrCtx.get(op.getParentOperators().get(0));
  Task<? extends Serializable> currTask = mapredCtx.getCurrTask();
  MapredWork plan = (MapredWork) currTask.getWork();
  HashMap<Operator<? extends Serializable>, Task<? extends Serializable>> opTaskMap =
      opProcCtx.getOpTaskMap();

  opTaskMap.put(reducer, currTask);
  plan.setReducer(reducer);
  ReduceSinkDesc desc = op.getConf();
  plan.setNumReduceTasks(desc.getNumReducers());

  if (reducer.getClass() == JoinOperator.class) {
    plan.setNeedsTagging(true);
  }

  initUnionPlan(opProcCtx, currTask, false);
}
public static void setChildrenCollector(
    List<Operator<? extends OperatorDesc>> childOperators, OutputCollector out) {
  if (childOperators == null) {
    return;
  }
  for (Operator<? extends OperatorDesc> op : childOperators) {
    if (op.getName().equals(ReduceSinkOperator.getOperatorName())) {
      op.setOutputCollector(out);
    } else {
      setChildrenCollector(op.getChildOperators(), out);
    }
  }
}
@Override
public Object process(
    Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs)
    throws SemanticException {
  @SuppressWarnings("unchecked")
  CommonJoinOperator<JoinDesc> join = (CommonJoinOperator) nd;
  ReduceSinkOperator source = (ReduceSinkOperator) stack.get(stack.size() - 2);
  FilterOperator filter = (FilterOperator) stack.get(stack.size() - 3);
  int srcPos = join.getParentOperators().indexOf(source);

  TransitiveContext context = (TransitiveContext) procCtx;
  Map<CommonJoinOperator, int[][]> filterPropagates = context.getFilterPropagates();
  Map<ReduceSinkOperator, List<ExprNodeDesc>> newFilters = context.getNewfilters();

  int[][] targets = filterPropagates.get(join);
  if (targets == null) {
    filterPropagates.put(join, targets = getTargets(join));
  }

  List<Operator<? extends OperatorDesc>> parents = join.getParentOperators();
  for (int targetPos : targets[srcPos]) {
    ReduceSinkOperator target = (ReduceSinkOperator) parents.get(targetPos);
    List<ExprNodeDesc> sourceKeys = source.getConf().getKeyCols();
    List<ExprNodeDesc> targetKeys = target.getConf().getKeyCols();

    ExprNodeDesc predicate = filter.getConf().getPredicate();
    ExprNodeDesc replaced = ExprNodeDescUtils.replace(predicate, sourceKeys, targetKeys);
    if (replaced != null && !filterExists(target, replaced)) {
      List<ExprNodeDesc> prev = newFilters.get(target);
      if (prev == null) {
        newFilters.put(target, ExprNodeDescUtils.split(replaced));
      } else {
        ExprNodeDescUtils.split(replaced, prev);
      }
    }
  }
  return null;
}
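// ---------------------------------------------------------------------------
// Illustration only, not the real ExprNodeDescUtils API: the intuition is that
// a predicate over the source RS's join keys stays valid when rewritten over
// the target RS's join keys, since the keys are equal under the equi-join.
// The TransitiveFilterDemo class and the string-based rewrite are hypothetical.
public class TransitiveFilterDemo {
  public static void main(String[] args) {
    String[] sourceKeys = {"a.k"};
    String[] targetKeys = {"b.k"};
    String predicate = "a.k > 10";
    // substitute each source join key with the positionally matching target key
    String replaced = predicate;
    for (int i = 0; i < sourceKeys.length; i++) {
      replaced = replaced.replace(sourceKeys[i], targetKeys[i]);
    }
    System.out.println(replaced); // b.k > 10, a new filter candidate for the target RS
  }
}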
/**
 * Set the key and value descriptor.
 *
 * @param plan current plan
 * @param topOp current top operator in the path
 */
public static void setKeyAndValueDesc(MapredWork plan, Operator<? extends Serializable> topOp) {
  if (topOp == null) {
    return;
  }

  if (topOp instanceof ReduceSinkOperator) {
    ReduceSinkOperator rs = (ReduceSinkOperator) topOp;
    plan.setKeyDesc(rs.getConf().getKeySerializeInfo());
    int tag = Math.max(0, rs.getConf().getTag());
    List<TableDesc> tagToSchema = plan.getTagToValueDesc();
    while (tag + 1 > tagToSchema.size()) {
      tagToSchema.add(null);
    }
    tagToSchema.set(tag, rs.getConf().getValueSerializeInfo());
  } else {
    List<Operator<? extends Serializable>> children = topOp.getChildOperators();
    if (children != null) {
      for (Operator<? extends Serializable> op : children) {
        setKeyAndValueDesc(plan, op);
      }
    }
  }
}
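// ---------------------------------------------------------------------------
// Illustration only, not Hive source: the tag handling above relies on padding
// the tag-indexed list with nulls until the slot exists. A toy rendering of
// that idiom with plain strings (TagPaddingDemo is a made-up name):
import java.util.ArrayList;
import java.util.List;

public class TagPaddingDemo {
  public static void main(String[] args) {
    List<String> tagToValueDesc = new ArrayList<>();
    int tag = 2;
    // grow the list with null placeholders so index 'tag' is addressable
    while (tag + 1 > tagToValueDesc.size()) {
      tagToValueDesc.add(null);
    }
    tagToValueDesc.set(tag, "valueDescForTag2");
    System.out.println(tagToValueDesc); // [null, null, valueDescForTag2]
  }
}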
@Override
public ParseContext transform(ParseContext pctx) throws SemanticException {
  pGraphContext = pctx;

  // create the context for walking operators
  OpWalkerInfo opWalkerInfo = new OpWalkerInfo(pGraphContext);

  Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
  opRules.put(
      new RuleRegExp("R1", FilterOperator.getOperatorName() + "%"),
      OpProcFactory.getFilterProc());
  opRules.put(
      new RuleRegExp("R2", PTFOperator.getOperatorName() + "%"), OpProcFactory.getPTFProc());
  opRules.put(
      new RuleRegExp("R3", CommonJoinOperator.getOperatorName() + "%"),
      OpProcFactory.getJoinProc());
  opRules.put(
      new RuleRegExp("R4", TableScanOperator.getOperatorName() + "%"),
      OpProcFactory.getTSProc());
  opRules.put(
      new RuleRegExp("R5", ScriptOperator.getOperatorName() + "%"), OpProcFactory.getSCRProc());
  opRules.put(
      new RuleRegExp("R6", LimitOperator.getOperatorName() + "%"), OpProcFactory.getLIMProc());
  opRules.put(
      new RuleRegExp("R7", UDTFOperator.getOperatorName() + "%"), OpProcFactory.getUDTFProc());
  opRules.put(
      new RuleRegExp("R8", LateralViewForwardOperator.getOperatorName() + "%"),
      OpProcFactory.getLVFProc());
  opRules.put(
      new RuleRegExp("R9", LateralViewJoinOperator.getOperatorName() + "%"),
      OpProcFactory.getLVJProc());
  opRules.put(
      new RuleRegExp("R10", ReduceSinkOperator.getOperatorName() + "%"),
      OpProcFactory.getRSProc());

  // The dispatcher fires the processor corresponding to the closest matching
  // rule and passes the context along
  Dispatcher disp =
      new DefaultRuleDispatcher(OpProcFactory.getDefaultProc(), opRules, opWalkerInfo);
  GraphWalker ogw = new DefaultGraphWalker(disp);

  // Create a list of top operator nodes
  ArrayList<Node> topNodes = new ArrayList<Node>();
  topNodes.addAll(pGraphContext.getTopOps().values());
  ogw.startWalking(topNodes, null);

  if (LOG.isDebugEnabled()) {
    LOG.debug("After PPD:\n" + Operator.toString(pctx.getTopOps().values()));
  }
  return pGraphContext;
}
private static boolean[] getPruneReduceSinkOpRetainFlags(
    List<String> retainedParentOpOutputCols, ReduceSinkOperator reduce) {
  ReduceSinkDesc reduceConf = reduce.getConf();
  ArrayList<ExprNodeDesc> originalValueEval = reduceConf.getValueCols();
  boolean[] flags = new boolean[originalValueEval.size()];
  for (int i = 0; i < originalValueEval.size(); i++) {
    flags[i] = false;
    List<String> current = originalValueEval.get(i).getCols();
    if (current == null || current.size() == 0) {
      flags[i] = true;
    } else {
      for (int j = 0; j < current.size(); j++) {
        if (retainedParentOpOutputCols.contains(current.get(j))) {
          flags[i] = true;
          break;
        }
      }
    }
  }
  return flags;
}
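// ---------------------------------------------------------------------------
// Illustration only, not Hive source: note the subtlety that value expressions
// referencing no columns (e.g. constants) are always retained. A toy rendering
// of the flag computation over plain lists (RetainFlagsDemo is a made-up name):
import java.util.Arrays;
import java.util.List;

public class RetainFlagsDemo {
  public static void main(String[] args) {
    // columns referenced by each value expression; an empty list models a constant
    List<List<String>> valueExprCols =
        Arrays.asList(Arrays.asList("a"), Arrays.asList("b"), Arrays.<String>asList());
    List<String> retainedParentCols = Arrays.asList("a");
    boolean[] flags = new boolean[valueExprCols.size()];
    for (int i = 0; i < valueExprCols.size(); i++) {
      List<String> current = valueExprCols.get(i);
      flags[i] = current.isEmpty(); // constants are kept unconditionally
      for (String col : current) {
        if (retainedParentCols.contains(col)) {
          flags[i] = true;
          break;
        }
      }
    }
    System.out.println(Arrays.toString(flags)); // [true, false, true]
  }
}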
public static void setChildrenCollector(
    List<Operator<? extends OperatorDesc>> childOperators, Map<String, OutputCollector> outMap) {
  if (childOperators == null) {
    return;
  }
  for (Operator<? extends OperatorDesc> op : childOperators) {
    if (op.getName().equals(ReduceSinkOperator.getOperatorName())) {
      ReduceSinkOperator rs = ((ReduceSinkOperator) op);
      if (outMap.containsKey(rs.getConf().getOutputName())) {
        LOG.info("Setting output collector: " + rs + " --> " + rs.getConf().getOutputName());
        rs.setOutputCollector(outMap.get(rs.getConf().getOutputName()));
      }
    } else {
      setChildrenCollector(op.getChildOperators(), outMap);
    }
  }
}
/**
 * If two reduce sink operators share the same partition/sort columns and order, they can be
 * merged. This should happen after map join optimization because map join optimization will
 * remove reduce sink operators.
 *
 * <p>This optimizer removes/replaces the child RS (not the parent), which is the safer way for
 * DefaultGraphWalker.
 */
public class ReduceSinkDeDuplication extends Transform {

  private static final String RS = ReduceSinkOperator.getOperatorName();
  private static final String GBY = GroupByOperator.getOperatorName();
  private static final String JOIN = JoinOperator.getOperatorName();

  protected ParseContext pGraphContext;

  @Override
  public ParseContext transform(ParseContext pctx) throws SemanticException {
    pGraphContext = pctx;

    // generate pruned column list for all relevant operators
    ReduceSinkDeduplicateProcCtx cppCtx = new ReduceSinkDeduplicateProcCtx(pGraphContext);

    // for auto-converted map joins, it is not safe to dedup here (todo)
    boolean mergeJoins =
        !pctx.getConf().getBoolVar(HIVECONVERTJOIN)
            && !pctx.getConf().getBoolVar(HIVECONVERTJOINNOCONDITIONALTASK)
            && !pctx.getConf().getBoolVar(ConfVars.HIVE_CONVERT_JOIN_BUCKET_MAPJOIN_TEZ)
            && !pctx.getConf().getBoolVar(ConfVars.HIVEDYNAMICPARTITIONHASHJOIN);

    // If multiple rules can be matched with the same cost, the last rule will be chosen
    // as the processor; see DefaultRuleDispatcher#dispatch()
    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
    opRules.put(
        new RuleRegExp("R1", RS + "%.*%" + RS + "%"),
        ReduceSinkDeduplicateProcFactory.getReducerReducerProc());
    opRules.put(
        new RuleRegExp("R2", RS + "%" + GBY + "%.*%" + RS + "%"),
        ReduceSinkDeduplicateProcFactory.getGroupbyReducerProc());
    if (mergeJoins) {
      opRules.put(
          new RuleRegExp("R3", JOIN + "%.*%" + RS + "%"),
          ReduceSinkDeduplicateProcFactory.getJoinReducerProc());
    }
    // TODO RS+JOIN

    // The dispatcher fires the processor corresponding to the closest matching
    // rule and passes the context along
    Dispatcher disp =
        new DefaultRuleDispatcher(
            ReduceSinkDeduplicateProcFactory.getDefaultProc(), opRules, cppCtx);
    GraphWalker ogw = new DefaultGraphWalker(disp);

    // Create a list of top operator nodes
    ArrayList<Node> topNodes = new ArrayList<Node>();
    topNodes.addAll(pGraphContext.getTopOps().values());
    ogw.startWalking(topNodes, null);
    return pGraphContext;
  }

  protected class ReduceSinkDeduplicateProcCtx extends AbstractCorrelationProcCtx {
    public ReduceSinkDeduplicateProcCtx(ParseContext pctx) {
      super(pctx);
    }
  }

  static class ReduceSinkDeduplicateProcFactory {

    public static NodeProcessor getReducerReducerProc() {
      return new ReducerReducerProc();
    }

    public static NodeProcessor getGroupbyReducerProc() {
      return new GroupbyReducerProc();
    }

    public static NodeProcessor getJoinReducerProc() {
      return new JoinReducerProc();
    }

    public static NodeProcessor getDefaultProc() {
      return new DefaultProc();
    }
  }

  /*
   * do nothing.
   */
  static class DefaultProc implements NodeProcessor {
    @Override
    public Object process(
        Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs)
        throws SemanticException {
      return null;
    }
  }

  public abstract static class AbsctractReducerReducerProc implements NodeProcessor {

    @Override
    public Object process(
        Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs)
        throws SemanticException {
      ReduceSinkDeduplicateProcCtx dedupCtx = (ReduceSinkDeduplicateProcCtx) procCtx;
      if (dedupCtx.hasBeenRemoved((Operator<?>) nd)) {
        return false;
      }
      ReduceSinkOperator cRS = (ReduceSinkOperator) nd;
      Operator<?> child = CorrelationUtilities.getSingleChild(cRS);
      if (child instanceof JoinOperator) {
        return false; // not supported
      }
      if (child instanceof GroupByOperator) {
        GroupByOperator cGBY = (GroupByOperator) child;
        if (!CorrelationUtilities.hasGroupingSet(cRS)
            && !cGBY.getConf().isGroupingSetsPresent()) {
          return process(cRS, cGBY, dedupCtx);
        }
        return false;
      }
      if (child instanceof SelectOperator) {
        return process(cRS, dedupCtx);
      }
      return false;
    }

    protected abstract Object process(
        ReduceSinkOperator cRS, ReduceSinkDeduplicateProcCtx dedupCtx) throws SemanticException;

    protected abstract Object process(
        ReduceSinkOperator cRS, GroupByOperator cGBY, ReduceSinkDeduplicateProcCtx dedupCtx)
        throws SemanticException;

    // for the JOIN-RS case, it is generally not possible to merge if the child has
    // fewer key/partition columns than the parents
    protected boolean merge(ReduceSinkOperator cRS, JoinOperator pJoin, int minReducer)
        throws SemanticException {
      List<Operator<?>> parents = pJoin.getParentOperators();
      ReduceSinkOperator[] pRSs = parents.toArray(new ReduceSinkOperator[parents.size()]);
      ReduceSinkDesc cRSc = cRS.getConf();
      ReduceSinkDesc pRS0c = pRSs[0].getConf();
      if (cRSc.getKeyCols().size() < pRS0c.getKeyCols().size()) {
        return false;
      }
      if (cRSc.getPartitionCols().size() != pRS0c.getPartitionCols().size()) {
        return false;
      }
      Integer moveReducerNumTo = checkNumReducer(cRSc.getNumReducers(), pRS0c.getNumReducers());
      if (moveReducerNumTo == null
          || moveReducerNumTo > 0 && cRSc.getNumReducers() < minReducer) {
        return false;
      }
      Integer moveRSOrderTo = checkOrder(cRSc.getOrder(), pRS0c.getOrder());
      if (moveRSOrderTo == null) {
        return false;
      }
      boolean[] sorted = CorrelationUtilities.getSortedTags(pJoin);
      int cKeySize = cRSc.getKeyCols().size();
      for (int i = 0; i < cKeySize; i++) {
        ExprNodeDesc cexpr = cRSc.getKeyCols().get(i);
        ExprNodeDesc[] pexprs = new ExprNodeDesc[pRSs.length];
        for (int tag = 0; tag < pRSs.length; tag++) {
          pexprs[tag] = pRSs[tag].getConf().getKeyCols().get(i);
        }
        int found = CorrelationUtilities.indexOf(cexpr, pexprs, cRS, pRSs, sorted);
        if (found != i) {
          return false;
        }
      }
      int cPartSize = cRSc.getPartitionCols().size();
      for (int i = 0; i < cPartSize; i++) {
        ExprNodeDesc cexpr = cRSc.getPartitionCols().get(i);
        ExprNodeDesc[] pexprs = new ExprNodeDesc[pRSs.length];
        for (int tag = 0; tag < pRSs.length; tag++) {
          pexprs[tag] = pRSs[tag].getConf().getPartitionCols().get(i);
        }
        int found = CorrelationUtilities.indexOf(cexpr, pexprs, cRS, pRSs, sorted);
        if (found != i) {
          return false;
        }
      }
      if (moveReducerNumTo > 0) {
        for (ReduceSinkOperator pRS : pRSs) {
          pRS.getConf().setNumReducers(cRS.getConf().getNumReducers());
        }
      }
      return true;
    }

    /**
     * Current RSDedup removes/replaces the child RS. For key columns, sorting order, and the
     * number of reducers, copy the more specific parts of the child RS's configuration to the
     * parent RS. For partitioning columns, if both child RS and parent RS have been assigned
     * partitioning columns, we choose the more general partitioning columns. If the parent RS
     * has not been assigned any partitioning column, we use the partitioning columns (if they
     * exist) of the child RS.
     */
    protected boolean merge(ReduceSinkOperator cRS, ReduceSinkOperator pRS, int minReducer)
        throws SemanticException {
      int[] result = checkStatus(cRS, pRS, minReducer);
      if (result == null) {
        return false;
      }

      if (result[0] > 0) {
        // The sorting columns of the child RS are more specific than
        // those of the parent RS. Assign sorting columns of the child RS
        // to the parent RS.
        List<ExprNodeDesc> childKCs = cRS.getConf().getKeyCols();
        pRS.getConf().setKeyCols(ExprNodeDescUtils.backtrack(childKCs, cRS, pRS));
      }

      if (result[1] < 0) {
        // The partitioning columns of the parent RS are more specific than
        // those of the child RS.
        List<ExprNodeDesc> childPCs = cRS.getConf().getPartitionCols();
        if (childPCs != null && !childPCs.isEmpty()) {
          // If partitioning columns of the child RS are assigned,
          // assign these to the partitioning columns of the parent RS.
          pRS.getConf().setPartitionCols(ExprNodeDescUtils.backtrack(childPCs, cRS, pRS));
        }
      } else if (result[1] > 0) {
        // The partitioning columns of the child RS are more specific than
        // those of the parent RS.
        List<ExprNodeDesc> parentPCs = pRS.getConf().getPartitionCols();
        if (parentPCs == null || parentPCs.isEmpty()) {
          // If partitioning columns of the parent RS are not assigned,
          // assign partitioning columns of the child RS to the parent RS.
          ArrayList<ExprNodeDesc> childPCs = cRS.getConf().getPartitionCols();
          pRS.getConf().setPartitionCols(ExprNodeDescUtils.backtrack(childPCs, cRS, pRS));
        }
      }

      if (result[2] > 0) {
        // The sorting order of the child RS is more specific than
        // that of the parent RS. Assign the sorting order of the child RS
        // to the parent RS.
        if (result[0] <= 0) {
          // Sorting columns of the parent RS are more specific than those of the
          // child RS, but the sorting order of the child RS is more specific than
          // that of the parent RS.
          throw new SemanticException(
              "Sorting columns and order don't match. "
                  + "Try set "
                  + HiveConf.ConfVars.HIVEOPTREDUCEDEDUPLICATION
                  + "=false;");
        }
        pRS.getConf().setOrder(cRS.getConf().getOrder());
      }

      if (result[3] > 0) {
        // The number of reducers of the child RS is more specific than
        // that of the parent RS. Assign the number of reducers of the child RS
        // to the parent RS.
        pRS.getConf().setNumReducers(cRS.getConf().getNumReducers());
      }

      if (result[4] > 0) {
        // This case happens only when the pRS key is empty, in which case we can use
        // the number of distribution keys and key serialization info from cRS
        pRS.getConf().setNumDistributionKeys(cRS.getConf().getNumDistributionKeys());
        List<FieldSchema> fields =
            PlanUtils.getFieldSchemasFromColumnList(pRS.getConf().getKeyCols(), "reducesinkkey");
        TableDesc keyTable = PlanUtils.getReduceKeyTableDesc(fields, pRS.getConf().getOrder());
        ArrayList<String> outputKeyCols = Lists.newArrayList();
        for (int i = 0; i < fields.size(); i++) {
          outputKeyCols.add(fields.get(i).getName());
        }
        pRS.getConf().setOutputKeyColumnNames(outputKeyCols);
        pRS.getConf().setKeySerializeInfo(keyTable);
      }
      return true;
    }

    /**
     * Returns merge directions between two RSs for criteria (ordering, number of reducers,
     * reducer keys, partition keys). Returns null if any of the categories is not mergeable.
     *
     * <p>Values for each index can be -1, 0, 1:
     * 1. 0 means the two configurations in the category are the same
     * 2. for -1, the configuration of the parent RS is more specific than the child RS
     * 3. for 1, the configuration of the child RS is more specific than the parent RS
     */
    private int[] checkStatus(ReduceSinkOperator cRS, ReduceSinkOperator pRS, int minReducer)
        throws SemanticException {
      ReduceSinkDesc cConf = cRS.getConf();
      ReduceSinkDesc pConf = pRS.getConf();
      Integer moveRSOrderTo = checkOrder(cConf.getOrder(), pConf.getOrder());
      if (moveRSOrderTo == null) {
        return null;
      }
      Integer moveReducerNumTo = checkNumReducer(cConf.getNumReducers(), pConf.getNumReducers());
      if (moveReducerNumTo == null
          || moveReducerNumTo > 0 && cConf.getNumReducers() < minReducer) {
        return null;
      }
      List<ExprNodeDesc> ckeys = cConf.getKeyCols();
      List<ExprNodeDesc> pkeys = pConf.getKeyCols();
      Integer moveKeyColTo = checkExprs(ckeys, pkeys, cRS, pRS);
      if (moveKeyColTo == null) {
        return null;
      }
      List<ExprNodeDesc> cpars = cConf.getPartitionCols();
      List<ExprNodeDesc> ppars = pConf.getPartitionCols();
      Integer movePartitionColTo = checkExprs(cpars, ppars, cRS, pRS);
      if (movePartitionColTo == null) {
        return null;
      }
      Integer moveNumDistKeyTo =
          checkNumDistributionKey(cConf.getNumDistributionKeys(), pConf.getNumDistributionKeys());
      return new int[] {
        moveKeyColTo, movePartitionColTo, moveRSOrderTo, moveReducerNumTo, moveNumDistKeyTo
      };
    }

    private Integer checkNumDistributionKey(int cnd, int pnd) {
      // The number of distribution keys of cRS is chosen only when numDistKeys of pRS
      // is 0 or less. In all other cases, the distribution of the keys is based on
      // pRS, which is more generic than cRS.
      // Examples:
      // case 1: if the pRS sort key is (a, b) and the cRS sort key is (a, b, c) and the
      // numbers of distribution keys are 2 and 3 respectively, then after the merge the
      // sort keys will be (a, b, c) while the number of distribution keys will be 2.
      // case 2: if the pRS sort key is empty and its number of distribution keys is 0,
      // and if the cRS sort key is (a, b) with 2 distribution keys, then after the merge
      // the new sort key will be (a, b) and the number of distribution keys will be 2.
      if (pnd <= 0) {
        return 1;
      }
      return 0;
    }

    /**
     * The overlapping part of the keys should be the same between parent and child. And if the
     * child has more keys than the parent, the non-overlapping part of the keys should be
     * backtrackable to the parent.
     */
    private Integer checkExprs(
        List<ExprNodeDesc> ckeys,
        List<ExprNodeDesc> pkeys,
        ReduceSinkOperator cRS,
        ReduceSinkOperator pRS)
        throws SemanticException {
      Integer moveKeyColTo = 0;
      if (ckeys == null || ckeys.isEmpty()) {
        if (pkeys != null && !pkeys.isEmpty()) {
          moveKeyColTo = -1;
        }
      } else {
        if (pkeys == null || pkeys.isEmpty()) {
          for (ExprNodeDesc ckey : ckeys) {
            if (ExprNodeDescUtils.backtrack(ckey, cRS, pRS) == null) {
              // cKey is not present in the parent
              return null;
            }
          }
          moveKeyColTo = 1;
        } else {
          moveKeyColTo = sameKeys(ckeys, pkeys, cRS, pRS);
        }
      }
      return moveKeyColTo;
    }

    // backtrack key exprs of the child to the parent and compare them with the parent's
    protected Integer sameKeys(
        List<ExprNodeDesc> cexprs, List<ExprNodeDesc> pexprs, Operator<?> child, Operator<?> parent)
        throws SemanticException {
      int common = Math.min(cexprs.size(), pexprs.size());
      int limit = Math.max(cexprs.size(), pexprs.size());
      int i = 0;
      for (; i < common; i++) {
        ExprNodeDesc pexpr = pexprs.get(i);
        ExprNodeDesc cexpr = ExprNodeDescUtils.backtrack(cexprs.get(i), child, parent);
        if (cexpr == null || !pexpr.isSame(cexpr)) {
          return null;
        }
      }
      for (; i < limit; i++) {
        if (cexprs.size() > pexprs.size()) {
          if (ExprNodeDescUtils.backtrack(cexprs.get(i), child, parent) == null) {
            // cKey is not present in the parent
            return null;
          }
        }
      }
      return Integer.valueOf(cexprs.size()).compareTo(pexprs.size());
    }

    // the order of overlapping keys should be exactly the same
    protected Integer checkOrder(String corder, String porder) {
      if (corder == null || corder.trim().equals("")) {
        if (porder == null || porder.trim().equals("")) {
          return 0;
        }
        return -1;
      }
      if (porder == null || porder.trim().equals("")) {
        return 1;
      }
      corder = corder.trim();
      porder = porder.trim();
      int target = Math.min(corder.length(), porder.length());
      if (!corder.substring(0, target).equals(porder.substring(0, target))) {
        return null;
      }
      return Integer.valueOf(corder.length()).compareTo(porder.length());
    }

    /**
     * If the number of reducers for an RS is -1, the RS can have any number of reducers. This
     * is generally true except for order-by or forced bucketing cases. If both reducer counts
     * are not -1, they should be the same.
     */
    protected Integer checkNumReducer(int creduce, int preduce) {
      if (creduce < 0) {
        if (preduce < 0) {
          return 0;
        }
        return -1;
      }
      if (preduce < 0) {
        return 1;
      }
      if (creduce != preduce) {
        return null;
      }
      return 0;
    }
  }

  static class GroupbyReducerProc extends AbsctractReducerReducerProc {

    // pRS-pGBY-cRS
    @Override
    public Object process(ReduceSinkOperator cRS, ReduceSinkDeduplicateProcCtx dedupCtx)
        throws SemanticException {
      GroupByOperator pGBY =
          CorrelationUtilities.findPossibleParent(
              cRS, GroupByOperator.class, dedupCtx.trustScript());
      if (pGBY == null) {
        return false;
      }
      ReduceSinkOperator pRS =
          CorrelationUtilities.findPossibleParent(
              pGBY, ReduceSinkOperator.class, dedupCtx.trustScript());
      if (pRS != null && merge(cRS, pRS, dedupCtx.minReducer())) {
        CorrelationUtilities.replaceReduceSinkWithSelectOperator(
            cRS, dedupCtx.getPctx(), dedupCtx);
        pRS.getConf().setDeduplicated(true);
        return true;
      }
      return false;
    }

    // pRS-pGBY-cRS-cGBY
    @Override
    public Object process(
        ReduceSinkOperator cRS, GroupByOperator cGBY, ReduceSinkDeduplicateProcCtx dedupCtx)
        throws SemanticException {
      Operator<?> start = CorrelationUtilities.getStartForGroupBy(cRS, dedupCtx);
      GroupByOperator pGBY =
          CorrelationUtilities.findPossibleParent(
              start, GroupByOperator.class, dedupCtx.trustScript());
      if (pGBY == null) {
        return false;
      }
      ReduceSinkOperator pRS =
          CorrelationUtilities.getSingleParent(pGBY, ReduceSinkOperator.class);
      if (pRS != null && merge(cRS, pRS, dedupCtx.minReducer())) {
        CorrelationUtilities.removeReduceSinkForGroupBy(cRS, cGBY, dedupCtx.getPctx(), dedupCtx);
        pRS.getConf().setDeduplicated(true);
        return true;
      }
      return false;
    }
  }

  static class JoinReducerProc extends AbsctractReducerReducerProc {

    // pRS-pJOIN-cRS
    @Override
    public Object process(ReduceSinkOperator cRS, ReduceSinkDeduplicateProcCtx dedupCtx)
        throws SemanticException {
      JoinOperator pJoin =
          CorrelationUtilities.findPossibleParent(
              cRS, JoinOperator.class, dedupCtx.trustScript());
      if (pJoin != null && merge(cRS, pJoin, dedupCtx.minReducer())) {
        pJoin.getConf().setFixedAsSorted(true);
        CorrelationUtilities.replaceReduceSinkWithSelectOperator(
            cRS, dedupCtx.getPctx(), dedupCtx);
        ReduceSinkOperator pRS =
            CorrelationUtilities.findPossibleParent(
                pJoin, ReduceSinkOperator.class, dedupCtx.trustScript());
        if (pRS != null) {
          pRS.getConf().setDeduplicated(true);
        }
        return true;
      }
      return false;
    }

    // pRS-pJOIN-cRS-cGBY
    @Override
    public Object process(
        ReduceSinkOperator cRS, GroupByOperator cGBY, ReduceSinkDeduplicateProcCtx dedupCtx)
        throws SemanticException {
      Operator<?> start = CorrelationUtilities.getStartForGroupBy(cRS, dedupCtx);
      JoinOperator pJoin =
          CorrelationUtilities.findPossibleParent(
              start, JoinOperator.class, dedupCtx.trustScript());
      if (pJoin != null && merge(cRS, pJoin, dedupCtx.minReducer())) {
        pJoin.getConf().setFixedAsSorted(true);
        CorrelationUtilities.removeReduceSinkForGroupBy(cRS, cGBY, dedupCtx.getPctx(), dedupCtx);
        ReduceSinkOperator pRS =
            CorrelationUtilities.findPossibleParent(
                pJoin, ReduceSinkOperator.class, dedupCtx.trustScript());
        if (pRS != null) {
          pRS.getConf().setDeduplicated(true);
        }
        return true;
      }
      return false;
    }
  }

  static class ReducerReducerProc extends AbsctractReducerReducerProc {

    // pRS-cRS
    @Override
    public Object process(ReduceSinkOperator cRS, ReduceSinkDeduplicateProcCtx dedupCtx)
        throws SemanticException {
      ReduceSinkOperator pRS =
          CorrelationUtilities.findPossibleParent(
              cRS, ReduceSinkOperator.class, dedupCtx.trustScript());
      if (pRS != null && merge(cRS, pRS, dedupCtx.minReducer())) {
        CorrelationUtilities.replaceReduceSinkWithSelectOperator(
            cRS, dedupCtx.getPctx(), dedupCtx);
        pRS.getConf().setDeduplicated(true);
        return true;
      }
      return false;
    }

    // pRS-cRS-cGBY
    @Override
    public Object process(
        ReduceSinkOperator cRS, GroupByOperator cGBY, ReduceSinkDeduplicateProcCtx dedupCtx)
        throws SemanticException {
      Operator<?> start = CorrelationUtilities.getStartForGroupBy(cRS, dedupCtx);
      ReduceSinkOperator pRS =
          CorrelationUtilities.findPossibleParent(
              start, ReduceSinkOperator.class, dedupCtx.trustScript());
      if (pRS != null && merge(cRS, pRS, dedupCtx.minReducer())) {
        if (dedupCtx.getPctx().getConf().getBoolVar(HiveConf.ConfVars.HIVEGROUPBYSKEW)) {
          return false;
        }
        CorrelationUtilities.removeReduceSinkForGroupBy(cRS, cGBY, dedupCtx.getPctx(), dedupCtx);
        pRS.getConf().setDeduplicated(true);
        return true;
      }
      return false;
    }
  }
}
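// ---------------------------------------------------------------------------
// Worked illustration, not Hive source: a small standalone harness copying
// checkOrder()/checkNumReducer() from the class above so their contracts can
// be exercised directly. OrderCheckDemo is a made-up name.
public class OrderCheckDemo {
  static Integer checkOrder(String corder, String porder) {
    if (corder == null || corder.trim().equals("")) {
      if (porder == null || porder.trim().equals("")) {
        return 0;
      }
      return -1;
    }
    if (porder == null || porder.trim().equals("")) {
      return 1;
    }
    corder = corder.trim();
    porder = porder.trim();
    int target = Math.min(corder.length(), porder.length());
    if (!corder.substring(0, target).equals(porder.substring(0, target))) {
      return null; // conflicting orders on the overlapping prefix: not mergeable
    }
    return Integer.valueOf(corder.length()).compareTo(porder.length());
  }

  static Integer checkNumReducer(int creduce, int preduce) {
    if (creduce < 0) {
      return preduce < 0 ? 0 : -1; // -1 means "any number of reducers"
    }
    if (preduce < 0) {
      return 1;
    }
    return creduce == preduce ? Integer.valueOf(0) : null;
  }

  public static void main(String[] args) {
    System.out.println(checkOrder("++-", "++")); // 1    (child more specific)
    System.out.println(checkOrder("+", "++"));   // -1   (parent more specific)
    System.out.println(checkOrder("+-", "++"));  // null (conflict)
    System.out.println(checkNumReducer(-1, 4));  // -1   (parent pinned, child flexible)
    System.out.println(checkNumReducer(4, 8));   // null (both pinned, different)
  }
}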
@Override
public Object process(
    Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs)
    throws SemanticException {

  // introduce RS and EX before FS. If the operator tree already contains
  // an RS, then the ReduceSinkDeDuplication optimization should merge them
  FileSinkOperator fsOp = (FileSinkOperator) nd;

  LOG.info("Sorted dynamic partitioning optimization kicked in..");

  // if not dynamic partitioning then bail out
  if (fsOp.getConf().getDynPartCtx() == null) {
    LOG.debug(
        "Bailing out of sort dynamic partition optimization as dynamic partitioning context is null");
    return null;
  }

  // if list bucketing then bail out
  ListBucketingCtx lbCtx = fsOp.getConf().getLbCtx();
  if (lbCtx != null
      && !lbCtx.getSkewedColNames().isEmpty()
      && !lbCtx.getSkewedColValues().isEmpty()) {
    LOG.debug("Bailing out of sort dynamic partition optimization as list bucketing is enabled");
    return null;
  }

  Table destTable = fsOp.getConf().getTable();
  if (destTable == null) {
    LOG.debug("Bailing out of sort dynamic partition optimization as destination table is null");
    return null;
  }

  // unlink connection between FS and its parent
  Operator<? extends OperatorDesc> fsParent = fsOp.getParentOperators().get(0);

  // if all dp columns got constant folded then disable this optimization
  if (allStaticPartitions(fsParent, fsOp.getConf().getDynPartCtx())) {
    LOG.debug(
        "Bailing out of sorted dynamic partition optimizer as all dynamic partition"
            + " columns got constant folded (static partitioning)");
    return null;
  }

  // if the RS was inserted by enforce bucketing or sorting, we need to remove it,
  // since ReduceSinkDeDuplication will not merge the two into a single RS.
  // An RS inserted by enforce bucketing/sorting will have the bucketing column in the
  // reduce sink key, whereas an RS inserted by this optimization will have the
  // partition columns followed by the bucket number followed by the sort columns in
  // the reduce sink key. Since neither key column list is a prefix of the other,
  // ReduceSinkDeDuplication will not merge them together, resulting in 2 MR jobs.
  // To avoid that we will remove the RS (and SEL) inserted by enforce bucketing/sorting.
  if (!removeRSInsertedByEnforceBucketing(fsOp)) {
    LOG.debug(
        "Bailing out of sort dynamic partition optimization as some partition columns "
            + "got constant folded.");
    return null;
  }

  // unlink connection between FS and its parent
  fsParent = fsOp.getParentOperators().get(0);
  fsParent.getChildOperators().clear();

  DynamicPartitionCtx dpCtx = fsOp.getConf().getDynPartCtx();
  int numBuckets = destTable.getNumBuckets();

  // if enforce bucketing/sorting is disabled, numBuckets will not be set.
  // set the number of buckets here to ensure creation of empty buckets
  dpCtx.setNumBuckets(numBuckets);

  // Get the positions for partition, bucket and sort columns
  List<Integer> bucketPositions =
      getBucketPositions(destTable.getBucketCols(), destTable.getCols());
  List<Integer> sortPositions = null;
  List<Integer> sortOrder = null;
  ArrayList<ExprNodeDesc> bucketColumns;
  if (fsOp.getConf().getWriteType() == AcidUtils.Operation.UPDATE
      || fsOp.getConf().getWriteType() == AcidUtils.Operation.DELETE) {
    // When doing updates and deletes we always want to sort on the rowid because the ACID
    // reader will expect this sort order when doing reads. So
    // ignore whatever comes from the table and enforce this sort order instead.
    sortPositions = Arrays.asList(0);
    sortOrder = Arrays.asList(1); // 1 means asc; could really use an enum here in the thrift IDL
    // The bucketing column is already present in ROW__ID, which is specially
    // handled in ReduceSink
    bucketColumns = new ArrayList<>();
  } else {
    if (!destTable.getSortCols().isEmpty()) {
      // Sort columns specified by table
      sortPositions = getSortPositions(destTable.getSortCols(), destTable.getCols());
      sortOrder = getSortOrders(destTable.getSortCols(), destTable.getCols());
    } else {
      // Infer sort columns from operator tree
      sortPositions = Lists.newArrayList();
      sortOrder = Lists.newArrayList();
      inferSortPositions(fsParent, sortPositions, sortOrder);
    }
    List<ColumnInfo> colInfos = fsParent.getSchema().getSignature();
    bucketColumns = getPositionsToExprNodes(bucketPositions, colInfos);
  }
  List<Integer> sortNullOrder = new ArrayList<Integer>();
  for (int order : sortOrder) {
    sortNullOrder.add(order == 1 ? 0 : 1); // for asc, nulls first; for desc, nulls last
  }
  LOG.debug("Got sort order");
  for (int i : sortPositions) {
    LOG.debug("sort position " + i);
  }
  for (int i : sortOrder) {
    LOG.debug("sort order " + i);
  }
  for (int i : sortNullOrder) {
    LOG.debug("sort null order " + i);
  }
  List<Integer> partitionPositions = getPartitionPositions(dpCtx, fsParent.getSchema());

  // update file sink descriptor
  fsOp.getConf().setMultiFileSpray(false);
  fsOp.getConf().setNumFiles(1);
  fsOp.getConf().setTotalFiles(1);

  ArrayList<ColumnInfo> parentCols = Lists.newArrayList(fsParent.getSchema().getSignature());
  ArrayList<ExprNodeDesc> allRSCols = Lists.newArrayList();
  for (ColumnInfo ci : parentCols) {
    allRSCols.add(new ExprNodeColumnDesc(ci));
  }

  // Create ReduceSink operator
  ReduceSinkOperator rsOp =
      getReduceSinkOp(
          partitionPositions,
          sortPositions,
          sortOrder,
          sortNullOrder,
          allRSCols,
          bucketColumns,
          numBuckets,
          fsParent,
          fsOp.getConf().getWriteType());

  List<ExprNodeDesc> descs = new ArrayList<ExprNodeDesc>(allRSCols.size());
  List<String> colNames = new ArrayList<String>();
  String colName;
  for (int i = 0; i < allRSCols.size(); i++) {
    ExprNodeDesc col = allRSCols.get(i);
    colName = col.getExprString();
    colNames.add(colName);
    if (partitionPositions.contains(i) || sortPositions.contains(i)) {
      descs.add(
          new ExprNodeColumnDesc(
              col.getTypeInfo(), ReduceField.KEY.toString() + "." + colName, null, false));
    } else {
      descs.add(
          new ExprNodeColumnDesc(
              col.getTypeInfo(), ReduceField.VALUE.toString() + "." + colName, null, false));
    }
  }
  RowSchema selRS = new RowSchema(fsParent.getSchema());
  if (!bucketColumns.isEmpty()
      || fsOp.getConf().getWriteType() == Operation.DELETE
      || fsOp.getConf().getWriteType() == Operation.UPDATE) {
    descs.add(
        new ExprNodeColumnDesc(
            TypeInfoFactory.stringTypeInfo,
            ReduceField.KEY.toString() + ".'" + BUCKET_NUMBER_COL_NAME + "'",
            null,
            false));
    colNames.add("'" + BUCKET_NUMBER_COL_NAME + "'");
    ColumnInfo ci =
        new ColumnInfo(
            BUCKET_NUMBER_COL_NAME,
            TypeInfoFactory.stringTypeInfo,
            selRS.getSignature().get(0).getTabAlias(),
            true,
            true);
    selRS.getSignature().add(ci);
    fsParent.getSchema().getSignature().add(ci);
  }

  // Create SelectDesc
  SelectDesc selConf = new SelectDesc(descs, colNames);

  // Create Select Operator
  SelectOperator selOp =
      (SelectOperator) OperatorFactory.getAndMakeChild(selConf, selRS, rsOp);

  // link SEL to FS
  fsOp.getParentOperators().clear();
  fsOp.getParentOperators().add(selOp);
  selOp.getChildOperators().add(fsOp);

  // Set if partition sorted or partition bucket sorted
  fsOp.getConf().setDpSortState(FileSinkDesc.DPSortState.PARTITION_SORTED);
  if (bucketColumns.size() > 0
      || fsOp.getConf().getWriteType() == Operation.DELETE
      || fsOp.getConf().getWriteType() == Operation.UPDATE) {
    fsOp.getConf().setDpSortState(FileSinkDesc.DPSortState.PARTITION_BUCKET_SORTED);
  }

  // update partition column info in FS descriptor
  fsOp.getConf().setPartitionCols(rsOp.getConf().getPartitionCols());

  LOG.info(
      "Inserted "
          + rsOp.getOperatorId()
          + " and "
          + selOp.getOperatorId()
          + " as parent of "
          + fsOp.getOperatorId()
          + " and child of "
          + fsParent.getOperatorId());

  parseCtx.setReduceSinkAddedBySortedDynPartition(true);
  return null;
}
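// ---------------------------------------------------------------------------
// Illustration only, not Hive source: the "neither key list is a prefix of the
// other" reasoning from the comment above, checked mechanically. Column names
// are invented; "_bucket_number" stands in for the bucket-number key slot.
import java.util.Arrays;
import java.util.List;

public class KeyPrefixDemo {
  static boolean isPrefix(List<String> shorter, List<String> longer) {
    return shorter.size() <= longer.size()
        && shorter.equals(longer.subList(0, shorter.size()));
  }

  public static void main(String[] args) {
    // RS inserted by enforce bucketing/sorting: bucket column first
    List<String> bucketingKey = Arrays.asList("bucketCol", "sortCol");
    // RS inserted by this optimizer: partition cols, bucket number, then sort cols
    List<String> dynPartKey = Arrays.asList("partCol", "_bucket_number", "sortCol");
    boolean mergeable =
        isPrefix(bucketingKey, dynPartKey) || isPrefix(dynPartKey, bucketingKey);
    System.out.println(mergeable); // false -> dedup keeps both RSs -> two MR jobs
  }
}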
public ReduceSinkOperator getReduceSinkOp(
    List<Integer> partitionPositions,
    List<Integer> sortPositions,
    List<Integer> sortOrder,
    List<Integer> sortNullOrder,
    ArrayList<ExprNodeDesc> allCols,
    ArrayList<ExprNodeDesc> bucketColumns,
    int numBuckets,
    Operator<? extends OperatorDesc> parent,
    AcidUtils.Operation writeType)
    throws SemanticException {

  // Order of KEY columns:
  // 1) Partition columns
  // 2) Bucket number column
  // 3) Sort columns
  Set<Integer> keyColsPosInVal = Sets.newLinkedHashSet();
  ArrayList<ExprNodeDesc> keyCols = Lists.newArrayList();
  List<Integer> newSortOrder = Lists.newArrayList();
  List<Integer> newSortNullOrder = Lists.newArrayList();
  int numPartAndBuck = partitionPositions.size();

  keyColsPosInVal.addAll(partitionPositions);
  if (!bucketColumns.isEmpty()
      || writeType == Operation.DELETE
      || writeType == Operation.UPDATE) {
    keyColsPosInVal.add(-1);
    numPartAndBuck += 1;
  }
  keyColsPosInVal.addAll(sortPositions);

  // by default partition and bucket columns are sorted in ascending order
  Integer order = 1;
  if (sortOrder != null && !sortOrder.isEmpty()) {
    if (sortOrder.get(0).intValue() == 0) {
      order = 0;
    }
  }
  for (int i = 0; i < numPartAndBuck; i++) {
    newSortOrder.add(order);
  }
  newSortOrder.addAll(sortOrder);

  String orderStr = "";
  for (Integer i : newSortOrder) {
    if (i.intValue() == 1) {
      orderStr += "+";
    } else {
      orderStr += "-";
    }
  }

  // if partition and bucket columns are sorted in ascending order, by default
  // nulls come first; otherwise nulls come last
  Integer nullOrder = order == 1 ? 0 : 1;
  if (sortNullOrder != null && !sortNullOrder.isEmpty()) {
    if (sortNullOrder.get(0).intValue() == 0) {
      nullOrder = 0;
    } else {
      nullOrder = 1;
    }
  }
  for (int i = 0; i < numPartAndBuck; i++) {
    newSortNullOrder.add(nullOrder);
  }
  newSortNullOrder.addAll(sortNullOrder);

  String nullOrderStr = "";
  for (Integer i : newSortNullOrder) {
    if (i.intValue() == 0) {
      nullOrderStr += "a";
    } else {
      nullOrderStr += "z";
    }
  }

  Map<String, ExprNodeDesc> colExprMap = Maps.newHashMap();
  ArrayList<ExprNodeDesc> partCols = Lists.newArrayList();

  // we clone here because the RS will update the bucket column key with the
  // corresponding bucket number, and hence its OI
  for (Integer idx : keyColsPosInVal) {
    if (idx < 0) {
      ExprNodeConstantDesc bucketNumCol =
          new ExprNodeConstantDesc(TypeInfoFactory.stringTypeInfo, BUCKET_NUMBER_COL_NAME);
      keyCols.add(bucketNumCol);
      colExprMap.put(
          Utilities.ReduceField.KEY + ".'" + BUCKET_NUMBER_COL_NAME + "'", bucketNumCol);
    } else {
      keyCols.add(allCols.get(idx).clone());
    }
  }

  ArrayList<ExprNodeDesc> valCols = Lists.newArrayList();
  for (int i = 0; i < allCols.size(); i++) {
    if (!keyColsPosInVal.contains(i)) {
      valCols.add(allCols.get(i).clone());
    }
  }

  for (Integer idx : partitionPositions) {
    partCols.add(allCols.get(idx).clone());
  }

  // in the absence of a SORTED BY clause, the sorted dynamic partition insert
  // should honor the ordering of records provided by ORDER BY in the SELECT statement
  ReduceSinkOperator parentRSOp =
      OperatorUtils.findSingleOperatorUpstream(parent, ReduceSinkOperator.class);
  if (parentRSOp != null && parseCtx.getQueryProperties().hasOuterOrderBy()) {
    String parentRSOpOrder = parentRSOp.getConf().getOrder();
    String parentRSOpNullOrder = parentRSOp.getConf().getNullOrder();
    if (parentRSOpOrder != null && !parentRSOpOrder.isEmpty() && sortPositions.isEmpty()) {
      keyCols.addAll(parentRSOp.getConf().getKeyCols());
      orderStr += parentRSOpOrder;
      nullOrderStr += parentRSOpNullOrder;
    }
  }

  // map _col0 to KEY._col0, etc.
  Map<String, String> nameMapping = new HashMap<>();
  ArrayList<String> keyColNames = Lists.newArrayList();
  for (ExprNodeDesc keyCol : keyCols) {
    String keyColName = keyCol.getExprString();
    keyColNames.add(keyColName);
    colExprMap.put(Utilities.ReduceField.KEY + "." + keyColName, keyCol);
    nameMapping.put(keyColName, Utilities.ReduceField.KEY + "." + keyColName);
  }
  ArrayList<String> valColNames = Lists.newArrayList();
  for (ExprNodeDesc valCol : valCols) {
    String colName = valCol.getExprString();
    valColNames.add(colName);
    colExprMap.put(Utilities.ReduceField.VALUE + "." + colName, valCol);
    nameMapping.put(colName, Utilities.ReduceField.VALUE + "." + colName);
  }

  // Create the Key/Value TableDesc. When the operator plan is split into MR tasks,
  // the reduce operator will initialize the Extract operator with information
  // from the Key and Value TableDesc
  List<FieldSchema> fields =
      PlanUtils.getFieldSchemasFromColumnList(keyCols, keyColNames, 0, "");
  TableDesc keyTable = PlanUtils.getReduceKeyTableDesc(fields, orderStr, nullOrderStr);
  List<FieldSchema> valFields =
      PlanUtils.getFieldSchemasFromColumnList(valCols, valColNames, 0, "");
  TableDesc valueTable = PlanUtils.getReduceValueTableDesc(valFields);
  List<List<Integer>> distinctColumnIndices = Lists.newArrayList();

  // Number of reducers is set to default (-1)
  ReduceSinkDesc rsConf =
      new ReduceSinkDesc(
          keyCols,
          keyCols.size(),
          valCols,
          keyColNames,
          distinctColumnIndices,
          valColNames,
          -1,
          partCols,
          -1,
          keyTable,
          valueTable,
          writeType);
  rsConf.setBucketCols(bucketColumns);
  rsConf.setNumBuckets(numBuckets);

  ArrayList<ColumnInfo> signature = new ArrayList<>();
  for (int index = 0; index < parent.getSchema().getSignature().size(); index++) {
    ColumnInfo colInfo = new ColumnInfo(parent.getSchema().getSignature().get(index));
    colInfo.setInternalName(nameMapping.get(colInfo.getInternalName()));
    signature.add(colInfo);
  }
  ReduceSinkOperator op =
      (ReduceSinkOperator)
          OperatorFactory.getAndMakeChild(rsConf, new RowSchema(signature), parent);
  op.setColumnExprMap(colExprMap);
  return op;
}
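// ---------------------------------------------------------------------------
// Illustration only, not Hive source: how the key layout and the order/null-order
// strings built above line up per key column. Column names are invented;
// "_bucket_number" stands in for the bucket-number key slot.
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class KeyLayoutDemo {
  public static void main(String[] args) {
    // KEY column order: partition cols, bucket number, then sort cols
    List<String> keyCols = new ArrayList<>(Arrays.asList("partCol"));
    keyCols.add("_bucket_number");
    keyCols.addAll(Arrays.asList("sortColA", "sortColB"));
    // one '+'/'-' per key col for asc/desc; one 'a'/'z' per key col for nulls first/last
    String orderStr = "++" + "+-";     // partCol and bucket asc by default, sortColB desc
    String nullOrderStr = "aa" + "az"; // asc -> nulls first ('a'), desc -> nulls last ('z')
    System.out.println(keyCols + " order=" + orderStr + " nullOrder=" + nullOrderStr);
  }
}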
/**
 * The current RS deduplication removes/replaces the child RS. For key columns, sorting order,
 * and the number of reducers, the more specific parts of the child RS configuration are copied
 * to the parent RS. For partitioning columns, if both the child RS and the parent RS have been
 * assigned partitioning columns, we choose the more general ones. If the parent RS has not been
 * assigned any partitioning column, we use the partitioning columns (if they exist) of the
 * child RS.
 */
protected boolean merge(ReduceSinkOperator cRS, ReduceSinkOperator pRS, int minReducer)
    throws SemanticException {
  int[] result = checkStatus(cRS, pRS, minReducer);
  if (result == null) {
    return false;
  }

  if (result[0] > 0) {
    // The sorting columns of the child RS are more specific than those of the
    // parent RS. Assign the sorting columns of the child RS to the parent RS.
    List<ExprNodeDesc> childKCs = cRS.getConf().getKeyCols();
    pRS.getConf().setKeyCols(ExprNodeDescUtils.backtrack(childKCs, cRS, pRS));
  }

  if (result[1] < 0) {
    // The partitioning columns of the parent RS are more specific than
    // those of the child RS.
    List<ExprNodeDesc> childPCs = cRS.getConf().getPartitionCols();
    if (childPCs != null && !childPCs.isEmpty()) {
      // If partitioning columns of the child RS are assigned,
      // assign them to the partitioning columns of the parent RS.
      pRS.getConf().setPartitionCols(ExprNodeDescUtils.backtrack(childPCs, cRS, pRS));
    }
  } else if (result[1] > 0) {
    // The partitioning columns of the child RS are more specific than
    // those of the parent RS.
    List<ExprNodeDesc> parentPCs = pRS.getConf().getPartitionCols();
    if (parentPCs == null || parentPCs.isEmpty()) {
      // If partitioning columns of the parent RS are not assigned,
      // assign the partitioning columns of the child RS to the parent RS.
      ArrayList<ExprNodeDesc> childPCs = cRS.getConf().getPartitionCols();
      pRS.getConf().setPartitionCols(ExprNodeDescUtils.backtrack(childPCs, cRS, pRS));
    }
  }

  if (result[2] > 0) {
    // The sorting order of the child RS is more specific than that of the
    // parent RS. Assign the sorting order of the child RS to the parent RS.
    if (result[0] <= 0) {
      // The sorting columns of the parent RS are more specific than those of
      // the child RS, but the sorting order of the child RS is more specific
      // than that of the parent RS.
      throw new SemanticException(
          "Sorting columns and order don't match. "
              + "Try set "
              + HiveConf.ConfVars.HIVEOPTREDUCEDEDUPLICATION
              + "=false;");
    }
    pRS.getConf().setOrder(cRS.getConf().getOrder());
  }

  if (result[3] > 0) {
    // The number of reducers of the child RS is more specific than that of the
    // parent RS. Assign the number of reducers of the child RS to the parent RS.
    pRS.getConf().setNumReducers(cRS.getConf().getNumReducers());
  }

  if (result[4] > 0) {
    // This case happens only when the pRS key is empty, in which case we can use
    // the number of distribution keys and key serialization info from the cRS.
    pRS.getConf().setNumDistributionKeys(cRS.getConf().getNumDistributionKeys());
    List<FieldSchema> fields =
        PlanUtils.getFieldSchemasFromColumnList(pRS.getConf().getKeyCols(), "reducesinkkey");
    TableDesc keyTable = PlanUtils.getReduceKeyTableDesc(fields, pRS.getConf().getOrder());
    ArrayList<String> outputKeyCols = Lists.newArrayList();
    for (int i = 0; i < fields.size(); i++) {
      outputKeyCols.add(fields.get(i).getName());
    }
    pRS.getConf().setOutputKeyColumnNames(outputKeyCols);
    pRS.getConf().setKeySerializeInfo(keyTable);
  }
  return true;
}
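// For reference, the result vector consumed above follows the convention from
// checkStatus: index 0 = key columns, 1 = partitioning columns, 2 = sort order,
// 3 = number of reducers, 4 = number of distribution keys; a negative value
// means the parent RS is more specific, 0 means equal, and a positive value
// means the child RS is more specific. Below is a minimal sketch of one
// plausible "more specific" comparison for order strings; this is an
// assumption for illustration, and the actual checkOrder in Hive may differ
// in its details.
static Integer checkOrderSketch(String corder, String porder) {
  String c = corder == null ? "" : corder.trim();
  String p = porder == null ? "" : porder.trim();
  if (c.startsWith(p)) {
    return c.length() == p.length() ? 0 : 1; // child extends parent (or equal)
  }
  if (p.startsWith(c)) {
    return -1; // parent carries strictly more ordering information
  }
  return null; // conflicting orders: the two RSs cannot be merged
}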
@Override
public Object process(
    Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs)
    throws SemanticException {
  // If the reduce sink has not been introduced due to bucketing/sorting, ignore it
  FileSinkOperator fsOp = (FileSinkOperator) nd;
  ReduceSinkOperator rsOp =
      (ReduceSinkOperator) fsOp.getParentOperators().get(0).getParentOperators().get(0);

  List<ReduceSinkOperator> rsOps =
      pGraphContext.getReduceSinkOperatorsAddedByEnforceBucketingSorting();
  // nothing to do
  if ((rsOps != null) && (!rsOps.contains(rsOp))) {
    return null;
  }

  // Don't do this optimization with updates or deletes
  if (pGraphContext.getContext().getAcidOperation() == AcidUtils.Operation.UPDATE
      || pGraphContext.getContext().getAcidOperation() == AcidUtils.Operation.DELETE) {
    return null;
  }

  if (stack.get(0) instanceof TableScanOperator) {
    TableScanOperator tso = ((TableScanOperator) stack.get(0));
    if (SemanticAnalyzer.isAcidTable(tso.getConf().getTableMetadata())) {
      /* ACID tables have a complex directory layout and require merging of delta
       * files on read; thus we should not try to read bucket files directly. */
      return null;
    }
  }

  // Support for dynamic partitions can be added later
  if (fsOp.getConf().getDynPartCtx() != null) {
    return null;
  }

  // No conversion is possible for the reduce keys
  for (ExprNodeDesc keyCol : rsOp.getConf().getKeyCols()) {
    if (!(keyCol instanceof ExprNodeColumnDesc)) {
      return null;
    }
  }

  Table destTable = fsOp.getConf().getTable();
  if (destTable == null) {
    return null;
  }
  int numBucketsDestination = destTable.getNumBuckets();

  // Get the positions of the sorted and bucketed columns.
  // For sorted columns, also get the order (ascending/descending); that must
  // also match for this to be converted to a map-only job.
  List<Integer> bucketPositions =
      getBucketPositions(destTable.getBucketCols(), destTable.getCols());
  ObjectPair<List<Integer>, List<Integer>> sortOrderPositions =
      getSortPositionsOrder(destTable.getSortCols(), destTable.getCols());
  List<Integer> sortPositions = sortOrderPositions.getFirst();
  List<Integer> sortOrder = sortOrderPositions.getSecond();
  boolean useBucketSortPositions = true;

  // Only selects and filters are allowed
  Operator<? extends OperatorDesc> op = rsOp;
  // TableScan will also be followed by a Select operator. Find the expressions
  // for the bucketed/sorted columns of the destination table.
  List<ExprNodeColumnDesc> sourceTableBucketCols = new ArrayList<ExprNodeColumnDesc>();
  List<ExprNodeColumnDesc> sourceTableSortCols = new ArrayList<ExprNodeColumnDesc>();
  op = op.getParentOperators().get(0);

  while (true) {
    if (!(op instanceof TableScanOperator)
        && !(op instanceof FilterOperator)
        && !(op instanceof SelectOperator)
        && !(op instanceof SMBMapJoinOperator)) {
      return null;
    }

    if (op instanceof SMBMapJoinOperator) {
      // Bucketing and sorting keys should exactly match
      if (!(bucketPositions.equals(sortPositions))) {
        return null;
      }
      SMBMapJoinOperator smbOp = (SMBMapJoinOperator) op;
      SMBJoinDesc smbJoinDesc = smbOp.getConf();
      int posBigTable = smbJoinDesc.getPosBigTable();

      // The join keys don't match the bucketing keys
      List<ExprNodeDesc> keysBigTable = smbJoinDesc.getKeys().get((byte) posBigTable);
      if (keysBigTable.size() != bucketPositions.size()) {
        return null;
      }
      if (!validateSMBJoinKeys(
          smbJoinDesc, sourceTableBucketCols, sourceTableSortCols, sortOrder)) {
        return null;
      }

      sourceTableBucketCols.clear();
      sourceTableSortCols.clear();
      useBucketSortPositions = false;

      for (ExprNodeDesc keyBigTable : keysBigTable) {
        if (!(keyBigTable instanceof ExprNodeColumnDesc)) {
          return null;
        }
        sourceTableBucketCols.add((ExprNodeColumnDesc) keyBigTable);
        sourceTableSortCols.add((ExprNodeColumnDesc) keyBigTable);
      }

      // Since it is a sort-merge join, only follow the big table
      op = op.getParentOperators().get(posBigTable);
    } else {
      // Nothing to be done for filters - the output schema does not change.
      if (op instanceof TableScanOperator) {
        assert !useBucketSortPositions;
        TableScanOperator ts = (TableScanOperator) op;
        Table srcTable = ts.getConf().getTableMetadata();

        // Find the positions of the bucketed columns in the table corresponding
        // to the select list.
        // Consider the following scenario:
        // T1(key, value1, value2) bucketed/sorted by key into 2 buckets
        // T2(dummy, key, value1, value2) bucketed/sorted by key into 2 buckets
        // A query like: insert overwrite table T2 select 1, key, value1, value2 from T1
        // should be optimized.
        // Start with the destination: T2, whose bucketed/sorted position is [1].
        // At the source T1, the column corresponding to that position is [key], which
        // maps to column [0] of T1, which is also bucketed/sorted into the same
        // number of buckets.
        List<Integer> newBucketPositions = new ArrayList<Integer>();
        for (int pos = 0; pos < bucketPositions.size(); pos++) {
          ExprNodeColumnDesc col = sourceTableBucketCols.get(pos);
          String colName = col.getColumn();
          int bucketPos = findColumnPosition(srcTable.getCols(), colName);
          if (bucketPos < 0) {
            return null;
          }
          newBucketPositions.add(bucketPos);
        }

        // Find the positions/order of the sorted columns in the table corresponding
        // to the select list.
        List<Integer> newSortPositions = new ArrayList<Integer>();
        for (int pos = 0; pos < sortPositions.size(); pos++) {
          ExprNodeColumnDesc col = sourceTableSortCols.get(pos);
          String colName = col.getColumn();
          int sortPos = findColumnPosition(srcTable.getCols(), colName);
          if (sortPos < 0) {
            return null;
          }
          newSortPositions.add(sortPos);
        }

        if (srcTable.isPartitioned()) {
          PrunedPartitionList prunedParts =
              pGraphContext.getPrunedPartitions(srcTable.getTableName(), ts);
          List<Partition> partitions = prunedParts.getNotDeniedPartns();

          // Support for dynamic partitions can be added later.
          // The following is not optimized:
          // insert overwrite table T1(ds='1', hr) select key, value, hr from T2 where ds = '1';
          // where T1 and T2 are bucketed by the same keys and partitioned by ds, hr.
          if ((partitions == null) || (partitions.isEmpty()) || (partitions.size() > 1)) {
            return null;
          }
          for (Partition partition : partitions) {
            if (!checkPartition(
                partition,
                newBucketPositions,
                newSortPositions,
                sortOrder,
                numBucketsDestination)) {
              return null;
            }
          }
          removeReduceSink(
              rsOp, (TableScanOperator) op, fsOp, partitions.get(0).getSortedPaths());
          return null;
        } else {
          if (!checkTable(
              srcTable,
              newBucketPositions,
              newSortPositions,
              sortOrder,
              numBucketsDestination)) {
            return null;
          }
          removeReduceSink(rsOp, (TableScanOperator) op, fsOp, srcTable.getSortedPaths());
          return null;
        }
      }
      // None of the operators is changing the positions
      else if (op instanceof SelectOperator) {
        SelectOperator selectOp = (SelectOperator) op;
        SelectDesc selectDesc = selectOp.getConf();

        // Iterate backwards, from the destination table to the top of the tree.
        // Based on the output column names, get the new columns.
        if (!useBucketSortPositions) {
          bucketPositions.clear();
          sortPositions.clear();
          List<String> outputColumnNames = selectDesc.getOutputColumnNames();
          for (ExprNodeColumnDesc col : sourceTableBucketCols) {
            String colName = col.getColumn();
            int colPos = outputColumnNames.indexOf(colName);
            if (colPos < 0) {
              return null;
            }
            bucketPositions.add(colPos);
          }
          for (ExprNodeColumnDesc col : sourceTableSortCols) {
            String colName = col.getColumn();
            int colPos = outputColumnNames.indexOf(colName);
            if (colPos < 0) {
              return null;
            }
            sortPositions.add(colPos);
          }
        }

        // There may be multiple selects - choose the one closest to the table
        sourceTableBucketCols.clear();
        sourceTableSortCols.clear();

        // Only columns can be selected for both sorted and bucketed positions
        for (int pos : bucketPositions) {
          ExprNodeDesc selectColList = selectDesc.getColList().get(pos);
          if (!(selectColList instanceof ExprNodeColumnDesc)) {
            return null;
          }
          sourceTableBucketCols.add((ExprNodeColumnDesc) selectColList);
        }
        for (int pos : sortPositions) {
          ExprNodeDesc selectColList = selectDesc.getColList().get(pos);
          if (!(selectColList instanceof ExprNodeColumnDesc)) {
            return null;
          }
          sourceTableSortCols.add((ExprNodeColumnDesc) selectColList);
        }
        useBucketSortPositions = false;
      }
      op = op.getParentOperators().get(0);
    }
  }
}
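// The walk above bails out whenever a bucketed/sorted column cannot be traced
// back to a physical column of the source table. A minimal sketch of the
// position lookup it relies on; the helper itself is not shown in this
// excerpt, so this is an assumption about its behavior, not the actual
// findColumnPosition implementation.
static int findColumnPositionSketch(List<FieldSchema> cols, String colName) {
  for (int i = 0; i < cols.size(); i++) {
    if (cols.get(i).getName().equals(colName)) {
      return i; // zero-based position within the table schema
    }
  }
  return -1; // absent: the callers above return null and skip the optimization
}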