Java RowSchema 예제들, org.apache.hadoop.hive.ql.exec.RowSchema Java 예제들

예제 #1

0

파일 보기

파일: ColumnPrunerProcFactory.java 프로젝트: cschenyuan/hive-hack

 /**
  * Filters the operator's schema down to the requested columns, keeping the schema's order.
  *
  * <p>Walks the signature of {@code op}'s row schema and collects the internal name of every
  * column that also appears in {@code cols}. When the operator has no schema, {@code cols} is
  * handed back untouched.
  *
  * @param op operator whose schema dictates the output ordering
  * @param cols internal column names to keep
  * @return a new list of the retained internal names in schema order, or {@code cols} itself
  *     when the operator's schema is {@code null}
  * @throws SemanticException declared for callers; no statement in this body throws it directly
  */
 private static List<String> preserveColumnOrder(
     Operator<? extends OperatorDesc> op, List<String> cols) throws SemanticException {
   RowSchema schema = op.getSchema();
   if (schema == null) {
     // nothing to order against: return the caller's list as-is
     return cols;
   }

   ArrayList<String> ordered = new ArrayList<String>();
   for (ColumnInfo column : schema.getSignature()) {
     String internalName = column.getInternalName();
     if (cols.contains(internalName)) {
       ordered.add(internalName);
     }
   }
   return ordered;
 }

예제 #2

0

파일 보기

파일: PredicateTransitivePropagate.java 프로젝트: Leolh/hive

 // Splice a new filter operator into the edge between parent (input) and target (child).
 // The filter's row schema is built from the parent schema's signature.
 private Operator<FilterDesc> createFilter(
     Operator<?> target, Operator<?> parent, RowSchema parentRS, ExprNodeDesc filterExpr) {
   FilterDesc filterDesc = new FilterDesc(filterExpr, false);
   RowSchema filterSchema = new RowSchema(parentRS.getSignature());
   Operator<FilterDesc> filter = OperatorFactory.get(filterDesc, filterSchema);

   // first give the filter its own links ...
   filter.getParentOperators().add(parent);
   filter.getChildOperators().add(target);

   // ... then repoint both neighbours at the filter instead of at each other
   parent.replaceChild(target, filter);
   target.replaceParent(parent, filter);
   return filter;
 }

예제 #3

0

파일 보기

파일: SortedDynPartitionOptimizer.java 프로젝트: hadoop-zuiwanyuan/hive

    /**
     * Lists the schema positions occupied by the dynamic partition columns.
     *
     * @param dpCtx dynamic partition context supplying the number of dynamic partition columns
     * @param schema row schema whose signature size gives the total column count
     * @return the last {@code dpCtx.getNumDPCols()} indexes of the signature, in ascending order
     */
    private List<Integer> getPartitionPositions(DynamicPartitionCtx dpCtx, RowSchema schema) {
      int dpColCount = dpCtx.getNumDPCols();
      int totalCols = schema.getSignature().size();
      List<Integer> positions = Lists.newArrayList();

      // the partition columns are taken to be the trailing dpColCount columns
      int idx = totalCols - dpColCount;
      while (idx < totalCols) {
        positions.add(idx);
        idx++;
      }
      return positions;
    }

예제 #4

0

파일 보기

파일: RowResolver.java 프로젝트: zshao/hive-1.1.0-cdh5.4.1

  /**
   * Finds the zero-based position of a column within the row schema's signature.
   *
   * @param internalName internal column name to search for
   * @return index of the first column whose internal name equals {@code internalName}, or
   *     {@code -1} when no column matches
   */
  public int getPosition(String internalName) {
    int index = 0;
    for (ColumnInfo column : rowSchema.getSignature()) {
      if (column.getInternalName().equals(internalName)) {
        return index;
      }
      index++;
    }
    return -1;
  }

예제 #5

0

파일 보기

파일: GenMRFileSink1.java 프로젝트: Carlie20083/hive-0.7.0

  /**
   * Creates a map-only merge job for the output of {@code fsInput} and wraps it, together with a
   * move-only alternative, in a conditional task.
   *
   * <p>Operator layout, as drawn by the original authors:
   *
   * <pre>
   * MR job J0:
   *   ...
   *     |
   *     v
   *   FileSinkOperator_1 (fsInput)
   *     |
   *     v
   * Merge job J1:
   *     |
   *     v
   *   TableScan (using CombineHiveInputFormat) (tsMerge)
   *     |
   *     v
   *   FileSinkOperator (fsMerge)
   * </pre>
   *
   * <p>Here the pathToPartitionInfo &amp; pathToAlias will remain the same, which means the paths
   * do not contain the dynamic partitions (their parent). So after the dynamic partitions are
   * created (after the first job finished before the moveTask or ConditionalTask start), we need
   * to change the pathToPartitionInfo &amp; pathToAlias to include the dynamic partition
   * directories.
   *
   * @param fsInput the FileSink operator whose output is to be merged
   * @param ctx the MR processing context; supplies the parse context, configuration and current
   *     task
   * @param finalName destination passed to the merge job's FileSinkDesc and to the move work's
   *     LoadFileDesc
   */
  private void createMap4Merge(FileSinkOperator fsInput, GenMRProcContext ctx, String finalName) {

    //
    // 1. create the operator tree
    //
    ParseContext parseCtx = ctx.getParseCtx();
    FileSinkDesc fsInputDesc = fsInput.getConf();

    // Create a TableScan operator
    // inputRS is the schema object returned by fsInput itself, not a copy made here, and it is
    // handed to both new operators below.
    RowSchema inputRS = fsInput.getSchema();
    Operator<? extends Serializable> tsMerge = OperatorFactory.get(TableScanDesc.class, inputRS);

    // Create a FileSink operator
    // The table descriptor is cloned BEFORE the partition-column property is set or removed on
    // fsInputDesc's table info further down.
    // NOTE(review): whether the clone shares its Properties with the original is not visible
    // here - confirm before reordering.
    TableDesc ts = (TableDesc) fsInputDesc.getTableInfo().clone();
    FileSinkDesc fsOutputDesc =
        new FileSinkDesc(
            finalName, ts, parseCtx.getConf().getBoolVar(HiveConf.ConfVars.COMPRESSRESULT));
    FileSinkOperator fsOutput =
        (FileSinkOperator) OperatorFactory.getAndMakeChild(fsOutputDesc, inputRS, tsMerge);

    // If the input FileSinkOperator is a dynamic partition enabled, the tsMerge input schema
    // needs to include the partition column, and the fsOutput should have
    // a DynamicPartitionCtx to indicate that it needs to dynamically partitioned.
    DynamicPartitionCtx dpCtx = fsInputDesc.getDynPartCtx();
    if (dpCtx != null && dpCtx.getNumDPCols() > 0) {
      // adding DP ColumnInfo to the RowSchema signature
      ArrayList<ColumnInfo> signature = inputRS.getSignature();
      String tblAlias = fsInputDesc.getTableInfo().getTableName();
      LinkedHashMap<String, String> colMap = new LinkedHashMap<String, String>();
      StringBuilder partCols = new StringBuilder(); // becomes "col1/col2/..."
      for (String dpCol : dpCtx.getDPColNames()) {
        ColumnInfo colInfo =
            new ColumnInfo(
                dpCol,
                TypeInfoFactory.stringTypeInfo, // all partition column type should be string
                tblAlias,
                true); // partition column is virtual column
        signature.add(colInfo);
        colMap.put(dpCol, dpCol); // input and output have the same column name
        partCols.append(dpCol).append('/');
      }
      // remove the last '/'
      // NOTE(review): assumes getDPColNames() yielded at least one name whenever
      // getNumDPCols() > 0; with an empty builder this setLength(-1) would throw.
      partCols.setLength(partCols.length() - 1);
      inputRS.setSignature(signature);

      // create another DynamicPartitionCtx, which has a different input-to-DP column mapping
      DynamicPartitionCtx dpCtx2 = new DynamicPartitionCtx(dpCtx);
      dpCtx2.setInputToDPCols(colMap);
      fsOutputDesc.setDynPartCtx(dpCtx2);

      // update the FileSinkOperator to include partition columns
      fsInputDesc
          .getTableInfo()
          .getProperties()
          .setProperty(
              org.apache.hadoop.hive.metastore.api.Constants.META_TABLE_PARTITION_COLUMNS,
              partCols.toString()); // list of dynamic partition column names
    } else {
      // non-partitioned table
      fsInputDesc
          .getTableInfo()
          .getProperties()
          .remove(org.apache.hadoop.hive.metastore.api.Constants.META_TABLE_PARTITION_COLUMNS);
    }

    //
    // 2. Constructing a conditional task consisting of a move task and a map reduce task
    //
    // NOTE(review): currTask is never read below (ctx.getCurrTask() is called again for
    // createCondTask); the only observable effect of this line is the MapRedTask cast.
    MapRedTask currTask = (MapRedTask) ctx.getCurrTask();
    // move work that loads fsInput's directory into finalName
    MoveWork dummyMv =
        new MoveWork(
            null,
            null,
            null,
            new LoadFileDesc(fsInputDesc.getDirName(), finalName, true, null, null),
            false);
    MapredWork cplan = createMergeTask(ctx.getConf(), tsMerge, fsInputDesc);
    // use CombineHiveInputFormat for map-only merging
    cplan.setInputformat("org.apache.hadoop.hive.ql.io.CombineHiveInputFormat");
    // NOTE: we should gather stats in MR1 rather than MR2 at merge job since we don't
    // know if merge MR2 will be triggered at execution time
    ConditionalTask cndTsk =
        createCondTask(ctx.getConf(), ctx.getCurrTask(), dummyMv, cplan, fsInputDesc.getDirName());

    // keep the dynamic partition context in conditional task resolver context
    // (this is fsInput's original context, not the dpCtx2 copy given to fsOutputDesc)
    ConditionalResolverMergeFilesCtx mrCtx =
        (ConditionalResolverMergeFilesCtx) cndTsk.getResolverCtx();
    mrCtx.setDPCtx(fsInputDesc.getDynPartCtx());

    //
    // 3. add the moveTask as the children of the conditional task
    //
    LinkMoveTask(ctx, fsOutput, cndTsk);
  }

예제 #6

0

파일 보기

파일: GenMRFileSink1.java 프로젝트: Carlie20083/hive-0.7.0

  /**
   * Creates a map-reduce merge job for the output of {@code fsOp} and wraps it, together with a
   * move-only alternative, in a conditional task.
   *
   * <p>The operators built here are: a dummy TableScan ({@code tsMerge}) with a ReduceSink child
   * on the map side, and an Extract operator with a FileSink child ({@code newOutput}) that is
   * installed as the plan's reducer.
   *
   * @param fsOp the FileSink operator whose output is to be merged
   * @param ctx the MR processing context; supplies the current task and the parse context
   * @param finalName destination passed to the new FileSinkDesc and to the move work's
   *     LoadFileDesc
   * @throws SemanticException declared by this method; presumably raised by the helper calls
   *     below - not thrown explicitly in this body
   */
  private void createMapReduce4Merge(FileSinkOperator fsOp, GenMRProcContext ctx, String finalName)
      throws SemanticException {
    Task<? extends Serializable> currTask = ctx.getCurrTask();
    RowSchema inputRS = fsOp.getSchema();

    // Builds a key list holding a single rand() function expression.
    // NOTE(review): keyCols is never read again in this method - the reduce sink below is
    // created with a fresh, empty key list instead. Confirm whether rand() was meant to be the
    // key before removing or wiring this up.
    ArrayList<ExprNodeDesc> keyCols = new ArrayList<ExprNodeDesc>();
    keyCols.add(TypeCheckProcFactory.DefaultExprProcessor.getFuncExprNodeDesc("rand"));

    // value is all the columns in the FileSink operator input
    ArrayList<ExprNodeDesc> valueCols = new ArrayList<ExprNodeDesc>();
    for (ColumnInfo ci : inputRS.getSignature()) {
      valueCols.add(
          new ExprNodeColumnDesc(
              ci.getType(), ci.getInternalName(), ci.getTabAlias(), ci.getIsVirtualCol()));
    }

    // create a dummy tableScan operator
    Operator<? extends Serializable> tsMerge = OperatorFactory.get(TableScanDesc.class, inputRS);

    // one generated internal column name per value column
    ArrayList<String> outputColumns = new ArrayList<String>();
    for (int i = 0; i < valueCols.size(); i++) {
      outputColumns.add(SemanticAnalyzer.getColumnInternalName(i));
    }

    // reduce sink with an empty key list; it is created as a child of tsMerge and the returned
    // operator is intentionally not kept
    ReduceSinkDesc rsDesc =
        PlanUtils.getReduceSinkDesc(
            new ArrayList<ExprNodeDesc>(), valueCols, outputColumns, false, -1, -1, -1);
    OperatorFactory.getAndMakeChild(rsDesc, inputRS, tsMerge);
    ParseContext parseCtx = ctx.getParseCtx();
    FileSinkDesc fsConf = fsOp.getConf();

    // Add the extract operator to get the value fields
    // Rebuild fsOp's row resolver with columns renamed to their ordinal position ("0", "1", ...)
    RowResolver out_rwsch = new RowResolver();
    RowResolver interim_rwsch = ctx.getParseCtx().getOpParseCtx().get(fsOp).getRowResolver();
    Integer pos = Integer.valueOf(0);
    for (ColumnInfo colInfo : interim_rwsch.getColumnInfos()) {
      // info[0] / info[1] are used as the table alias and column alias respectively
      String[] info = interim_rwsch.reverseLookup(colInfo.getInternalName());
      out_rwsch.put(
          info[0],
          info[1],
          new ColumnInfo(
              pos.toString(),
              colInfo.getType(),
              info[0],
              colInfo.getIsVirtualCol(),
              colInfo.isHiddenVirtualCol()));
      pos = Integer.valueOf(pos.intValue() + 1);
    }

    // The extract operator is created without a parent here; it is attached to the plan via
    // cplan.setReducer(extract) further down.
    Operator<ExtractDesc> extract =
        OperatorFactory.getAndMakeChild(
            new ExtractDesc(
                new ExprNodeColumnDesc(
                    TypeInfoFactory.stringTypeInfo,
                    Utilities.ReduceField.VALUE.toString(),
                    "",
                    false)),
            new RowSchema(out_rwsch.getColumnInfos()));

    // Clone the table descriptor first, then drop the partition-column property from the
    // original descriptor's properties.
    // NOTE(review): whether the clone shares its Properties with the original is not visible
    // here - confirm before reordering these two statements.
    TableDesc ts = (TableDesc) fsConf.getTableInfo().clone();
    fsConf
        .getTableInfo()
        .getProperties()
        .remove(org.apache.hadoop.hive.metastore.api.Constants.META_TABLE_PARTITION_COLUMNS);

    // final file sink, child of the extract operator, writing to finalName
    FileSinkDesc newFSD =
        new FileSinkDesc(
            finalName, ts, parseCtx.getConf().getBoolVar(HiveConf.ConfVars.COMPRESSRESULT));
    FileSinkOperator newOutput =
        (FileSinkOperator) OperatorFactory.getAndMakeChild(newFSD, inputRS, extract);

    HiveConf conf = parseCtx.getConf();
    MapredWork cplan = createMergeTask(conf, tsMerge, fsConf);
    cplan.setReducer(extract);

    // NOTE: we should gather stats in MR1 (rather than the merge MR job)
    // since it is unknown if the merge MR will be triggered at execution time.

    // move work that loads fsOp's directory into finalName
    MoveWork dummyMv =
        new MoveWork(
            null,
            null,
            null,
            new LoadFileDesc(fsConf.getDirName(), finalName, true, null, null),
            false);

    ConditionalTask cndTsk = createCondTask(conf, currTask, dummyMv, cplan, fsConf.getDirName());

    LinkMoveTask(ctx, newOutput, cndTsk);
  }

예제 #7

0

파일 보기

파일: GenMRFileSink1.java 프로젝트: uclaabs/absHive

  /**
   * Creates a map-only merge job for the output of {@code fsInput}, using
   * CombineHiveInputFormat for all partitions, and wraps it together with a move-only
   * alternative in a conditional task.
   *
   * <p>Operator layout, as drawn by the original authors:
   *
   * <pre>
   * MR job J0:
   *   ...
   *     |
   *     v
   *   FileSinkOperator_1 (fsInput)
   *     |
   *     v
   * Merge job J1:
   *     |
   *     v
   *   TableScan (using CombineHiveInputFormat) (tsMerge)
   *     |
   *     v
   *   FileSinkOperator (fsMerge)
   * </pre>
   *
   * <p>Here the pathToPartitionInfo &amp; pathToAlias will remain the same, which means the paths
   * do not contain the dynamic partitions (their parent). So after the dynamic partitions are
   * created (after the first job finished before the moveTask or ConditionalTask start), we need
   * to change the pathToPartitionInfo &amp; pathToAlias to include the dynamic partition
   * directories.
   *
   * <p>When block-level merging is enabled in the configuration and the table's input format is
   * RCFile or ORC, a block merge task is created instead of the generic merge work.
   *
   * @param fsInput The FileSink operator.
   * @param ctx The MR processing context.
   * @param finalName the final destination path the merge job should output.
   * @throws SemanticException if the configured block-level merge input format class cannot be
   *     loaded; the underlying {@link ClassNotFoundException} is attached as the cause
   */
  private void createMRWorkForMergingFiles(
      FileSinkOperator fsInput, GenMRProcContext ctx, String finalName) throws SemanticException {

    //
    // 1. create the operator tree
    //
    HiveConf conf = ctx.getParseCtx().getConf();
    FileSinkDesc fsInputDesc = fsInput.getConf();

    // Create a TableScan operator
    // inputRS is the schema object returned by fsInput itself, not a copy made here.
    RowSchema inputRS = fsInput.getSchema();
    Operator<? extends OperatorDesc> tsMerge = OperatorFactory.get(TableScanDesc.class, inputRS);

    // Create a FileSink operator
    // The table descriptor is cloned before the partition-column property is changed below.
    TableDesc ts = (TableDesc) fsInputDesc.getTableInfo().clone();
    FileSinkDesc fsOutputDesc =
        new FileSinkDesc(finalName, ts, conf.getBoolVar(ConfVars.COMPRESSRESULT));
    FileSinkOperator fsOutput =
        (FileSinkOperator) OperatorFactory.getAndMakeChild(fsOutputDesc, inputRS, tsMerge);

    // If the input FileSinkOperator is a dynamic partition enabled, the tsMerge input schema
    // needs to include the partition column, and the fsOutput should have
    // a DynamicPartitionCtx to indicate that it needs to dynamically partitioned.
    DynamicPartitionCtx dpCtx = fsInputDesc.getDynPartCtx();
    if (dpCtx != null && dpCtx.getNumDPCols() > 0) {
      // adding DP ColumnInfo to the RowSchema signature
      ArrayList<ColumnInfo> signature = inputRS.getSignature();
      String tblAlias = fsInputDesc.getTableInfo().getTableName();
      LinkedHashMap<String, String> colMap = new LinkedHashMap<String, String>();
      StringBuilder partCols = new StringBuilder(); // becomes "col1/col2/..."
      for (String dpCol : dpCtx.getDPColNames()) {
        ColumnInfo colInfo =
            new ColumnInfo(
                dpCol,
                TypeInfoFactory.stringTypeInfo, // all partition column type should be string
                tblAlias,
                true); // partition column is virtual column
        signature.add(colInfo);
        colMap.put(dpCol, dpCol); // input and output have the same column name
        partCols.append(dpCol).append('/');
      }
      partCols.setLength(partCols.length() - 1); // remove the last '/'
      inputRS.setSignature(signature);

      // create another DynamicPartitionCtx, which has a different input-to-DP column mapping
      DynamicPartitionCtx dpCtx2 = new DynamicPartitionCtx(dpCtx);
      dpCtx2.setInputToDPCols(colMap);
      fsOutputDesc.setDynPartCtx(dpCtx2);

      // update the FileSinkOperator to include partition columns
      fsInputDesc
          .getTableInfo()
          .getProperties()
          .setProperty(
              org.apache
                  .hadoop
                  .hive
                  .metastore
                  .api
                  .hive_metastoreConstants
                  .META_TABLE_PARTITION_COLUMNS,
              partCols.toString()); // list of dynamic partition column names
    } else {
      // non-partitioned table
      fsInputDesc
          .getTableInfo()
          .getProperties()
          .remove(
              org.apache
                  .hadoop
                  .hive
                  .metastore
                  .api
                  .hive_metastoreConstants
                  .META_TABLE_PARTITION_COLUMNS);
    }

    //
    // 2. Constructing a conditional task consisting of a move task and a map reduce task
    //
    MoveWork dummyMv =
        new MoveWork(
            null,
            null,
            null,
            new LoadFileDesc(fsInputDesc.getFinalDirName(), finalName, true, null, null),
            false);
    MapredWork cplan;

    if (conf.getBoolVar(ConfVars.HIVEMERGERCFILEBLOCKLEVEL)
        && fsInputDesc.getTableInfo().getInputFileFormatClass().equals(RCFileInputFormat.class)) {

      // Check that the configured input format class can be loaded
      String inputFormatClass = conf.getVar(ConfVars.HIVEMERGERCFILEINPUTFORMATBLOCKLEVEL);
      try {
        // only the load attempt matters; the resulting Class object is not needed
        Class.forName(inputFormatClass);

        LOG.info("RCFile format- Using block level merge");
        cplan =
            createBlockMergeTask(
                fsInputDesc,
                finalName,
                dpCtx != null && dpCtx.getNumDPCols() > 0,
                RCFileMergeMapper.class,
                RCFileInputFormat.class,
                RCFileBlockMergeInputFormat.class);
      } catch (ClassNotFoundException e) {
        String msg = "Illegal input format class: " + inputFormatClass;
        // keep the original exception as the cause so the failing lookup stays diagnosable
        throw new SemanticException(msg, e);
      }

    } else if (conf.getBoolVar(ConfVars.HIVEMERGEORCBLOCKLEVEL)
        && fsInputDesc.getTableInfo().getInputFileFormatClass().equals(OrcInputFormat.class)) {

      // Check that the configured input format class can be loaded
      String inputFormatClass = conf.getVar(ConfVars.HIVEMERGEORCINPUTFORMATBLOCKLEVEL);
      try {
        // only the load attempt matters; the resulting Class object is not needed
        Class.forName(inputFormatClass);

        LOG.info("ORCFile format- Using block level merge");
        cplan =
            createBlockMergeTask(
                fsInputDesc,
                finalName,
                dpCtx != null && dpCtx.getNumDPCols() > 0,
                OrcMergeMapper.class,
                OrcInputFormat.class,
                OrcBlockMergeInputFormat.class);
      } catch (ClassNotFoundException e) {
        String msg = "Illegal input format class: " + inputFormatClass;
        // keep the original exception as the cause so the failing lookup stays diagnosable
        throw new SemanticException(msg, e);
      }

    } else {
      cplan = createMRWorkForMergingFiles(conf, tsMerge, fsInputDesc);
    }
    // The input format is set on the plan for all three branches above, not just the generic
    // one (use CombineHiveInputFormat for map-only merging).
    cplan.setInputformat("org.apache.hadoop.hive.ql.io.CombineHiveInputFormat");
    // NOTE: we should gather stats in MR1 rather than MR2 at merge job since we don't
    // know if merge MR2 will be triggered at execution time
    ConditionalTask cndTsk =
        createCondTask(conf, ctx.getCurrTask(), dummyMv, cplan, fsInputDesc.getFinalDirName());

    // keep the dynamic partition context in conditional task resolver context
    ConditionalResolverMergeFilesCtx mrCtx =
        (ConditionalResolverMergeFilesCtx) cndTsk.getResolverCtx();
    mrCtx.setDPCtx(fsInputDesc.getDynPartCtx());
    mrCtx.setLbCtx(fsInputDesc.getLbCtx());

    //
    // 3. add the moveTask as the children of the conditional task
    //
    linkMoveTask(ctx, fsOutput, cndTsk);
  }

예제 #8

0

파일 보기

파일: ExplainTaskHelper.java 프로젝트: victor2100/hive

  /**
   * Debug helper: prints a description of {@code sinkOp}, then recurses into each of its parent
   * operators with {@code level + 1}.
   *
   * <p>For every operator visited it prints, in order: the operator's class, the entries of its
   * column-expression map (when the map is non-null), each schema column as
   * {@code tabAlias[internalName]}, the predicate expression string if the operator is a
   * {@link FilterOperator}, and the descriptor class if it is a {@link LimitOperator}.
   *
   * @param sinkOp operator to describe; the walk proceeds from it towards its parents
   * @param level value forwarded to {@code println} for every line printed for this operator
   */
  @SuppressWarnings("unchecked")
  public static void analyzeHelper(Operator sinkOp, int level) {
    println(level, sinkOp.getClass());

    println(level, "Column Expr Map: ");
    Map<String, ExprNodeDesc> exprMap = sinkOp.getColumnExprMap();
    if (exprMap != null && exprMap.entrySet() != null) {
      for (Entry<String, ExprNodeDesc> mapping : exprMap.entrySet()) {
        ExprNodeDesc expr = mapping.getValue();
        if (expr instanceof ExprNodeColumnDesc) {
          // column reference: "<key>: <table alias><referenced columns>"
          ExprNodeColumnDesc columnExpr = (ExprNodeColumnDesc) expr;
          println(level, mapping.getKey() + ": " + columnExpr.getTabAlias() + columnExpr.getCols());
        } else if (expr instanceof ExprNodeConstantDesc) {
          // constant: "<key>:: <expression string>"
          println(level, mapping.getKey() + ":: " + ((ExprNodeConstantDesc) expr).getExprString());
        } else {
          // any other expression kind: expression string only, without the key
          println(level, expr.getExprString());
        }
      }
    }

    println(level, "Schema: ");
    RowSchema rowSchema = sinkOp.getSchema();
    for (ColumnInfo column : rowSchema.getSignature()) {
      println(level, column.getTabAlias() + "[" + column.getInternalName() + "]");
    }

    if (sinkOp instanceof FilterOperator) {
      FilterOperator filterOp = (FilterOperator) sinkOp;
      println(level, filterOp.getConf().getPredicate().getExprString());
    }

    if (sinkOp instanceof LimitOperator) {
      LimitOperator limitOp = (LimitOperator) sinkOp;
      println(level, limitOp.getConf().getClass());
    }

    // walk upwards; an operator with a null parent list ends this branch of the recursion
    List<Operator> parents = sinkOp.getParentOperators();
    if (parents == null) {
      return;
    }
    for (Operator parent : parents) {
      analyzeHelper(parent, level + 1);
    }
  }

예제 #9

0

파일 보기

파일: RowResolver.java 프로젝트: zshao/hive-1.1.0-cdh5.4.1

 /**
  * Registers {@code colInfo} under the given table alias and column alias.
  *
  * <p>The alias mapping is delegated to {@code addMappingOnly}; when that call returns
  * {@code false} the column is additionally appended to the row schema's signature.
  *
  * @param tab_alias table alias the column is registered under
  * @param col_alias column alias the column is registered under
  * @param colInfo column description to register
  */
 public void put(String tab_alias, String col_alias, ColumnInfo colInfo) {
   boolean mappingOnly = addMappingOnly(tab_alias, col_alias, colInfo);
   if (mappingOnly) {
     return;
   }
   rowSchema.getSignature().add(colInfo);
 }

예제 #10

0

파일 보기

파일: RowResolver.java 프로젝트: zshao/hive-1.1.0-cdh5.4.1

 /**
  * Returns the column list of the underlying row schema.
  *
  * @return the list object obtained from {@code rowSchema.getSignature()}; no copy is made in
  *     this method
  */
 public ArrayList<ColumnInfo> getColumnInfos() {
   ArrayList<ColumnInfo> signature = rowSchema.getSignature();
   return signature;
 }

예제 #11

0

파일 보기

파일: SortedDynPartitionOptimizer.java 프로젝트: hadoop-zuiwanyuan/hive

    /**
     * Rewrites the plan above a dynamic-partition FileSink so that rows reach it through a newly
     * inserted ReduceSink followed by a Select.
     *
     * <p>The method bails out (returning {@code null} without touching the plan) when the
     * FileSink has no dynamic partition context, when list bucketing is in effect, when the
     * destination table is null, when all dynamic partition columns are static, or when
     * {@code removeRSInsertedByEnforceBucketing} reports failure. Otherwise it detaches the
     * FileSink from its parent, computes partition/bucket/sort positions, builds the ReduceSink
     * and Select operators, links Select to the FileSink, and updates the FileSink descriptor.
     *
     * @param nd the node being visited; cast to {@link FileSinkOperator}
     * @param stack not read by this method
     * @param procCtx not read by this method
     * @param nodeOutputs not read by this method
     * @return always {@code null}
     * @throws SemanticException declared by this method; presumably raised by the helper calls
     *     below - not thrown explicitly in this body
     */
    @Override
    public Object process(
        Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs)
        throws SemanticException {

      // introduce RS and EX before FS. If the operator tree already contains
      // RS then ReduceSinkDeDuplication optimization should merge them
      FileSinkOperator fsOp = (FileSinkOperator) nd;

      // Note: this is logged before any of the bail-out checks below have run.
      LOG.info("Sorted dynamic partitioning optimization kicked in..");

      // if not dynamic partitioning then bail out
      if (fsOp.getConf().getDynPartCtx() == null) {
        LOG.debug(
            "Bailing out of sort dynamic partition optimization as dynamic partitioning context is null");
        return null;
      }

      // if list bucketing then bail out
      ListBucketingCtx lbCtx = fsOp.getConf().getLbCtx();
      if (lbCtx != null
          && !lbCtx.getSkewedColNames().isEmpty()
          && !lbCtx.getSkewedColValues().isEmpty()) {
        LOG.debug(
            "Bailing out of sort dynamic partition optimization as list bucketing is enabled");
        return null;
      }

      Table destTable = fsOp.getConf().getTable();
      if (destTable == null) {
        LOG.debug(
            "Bailing out of sort dynamic partition optimization as destination table is null");
        return null;
      }

      // look up the FS's current (first) parent; nothing is unlinked yet at this point
      Operator<? extends OperatorDesc> fsParent = fsOp.getParentOperators().get(0);
      // if all dp columns got constant folded then disable this optimization
      if (allStaticPartitions(fsParent, fsOp.getConf().getDynPartCtx())) {
        LOG.debug(
            "Bailing out of sorted dynamic partition optimizer as all dynamic partition"
                + " columns got constant folded (static partitioning)");
        return null;
      }

      // if RS is inserted by enforce bucketing or sorting, we need to remove it
      // since ReduceSinkDeDuplication will not merge them to single RS.
      // RS inserted by enforce bucketing/sorting will have bucketing column in
      // reduce sink key whereas RS inserted by this optimization will have
      // partition columns followed by bucket number followed by sort columns in
      // the reduce sink key. Since both key columns are not prefix subset
      // ReduceSinkDeDuplication will not merge them together resulting in 2 MR jobs.
      // To avoid that we will remove the RS (and EX) inserted by enforce bucketing/sorting.
      if (!removeRSInsertedByEnforceBucketing(fsOp)) {
        LOG.debug(
            "Bailing out of sort dynamic partition optimization as some partition columns "
                + "got constant folded.");
        return null;
      }

      // unlink connection between FS and its parent
      // (the parent is fetched again because the removal step above may have changed it)
      fsParent = fsOp.getParentOperators().get(0);
      fsParent.getChildOperators().clear();

      DynamicPartitionCtx dpCtx = fsOp.getConf().getDynPartCtx();
      int numBuckets = destTable.getNumBuckets();

      // if enforce bucketing/sorting is disabled numBuckets will not be set.
      // set the number of buckets here to ensure creation of empty buckets
      dpCtx.setNumBuckets(numBuckets);

      // Get the positions for partition, bucket and sort columns
      List<Integer> bucketPositions =
          getBucketPositions(destTable.getBucketCols(), destTable.getCols());
      List<Integer> sortPositions = null;
      List<Integer> sortOrder = null;
      ArrayList<ExprNodeDesc> bucketColumns;
      if (fsOp.getConf().getWriteType() == AcidUtils.Operation.UPDATE
          || fsOp.getConf().getWriteType() == AcidUtils.Operation.DELETE) {
        // When doing updates and deletes we always want to sort on the rowid because the ACID
        // reader will expect this sort order when doing reads.  So
        // ignore whatever comes from the table and enforce this sort order instead.
        sortPositions = Arrays.asList(0);
        sortOrder = Arrays.asList(1); // 1 means asc (an enum would be clearer than a bare int)
        // Bucketing column is already present in ROW__ID, which is specially
        // handled in ReduceSink, so no explicit bucket columns are needed here
        bucketColumns =
            new ArrayList<>();
      } else {
        if (!destTable.getSortCols().isEmpty()) {
          // Sort columns specified by table
          sortPositions = getSortPositions(destTable.getSortCols(), destTable.getCols());
          sortOrder = getSortOrders(destTable.getSortCols(), destTable.getCols());
        } else {
          // Infer sort columns from operator tree
          sortPositions = Lists.newArrayList();
          sortOrder = Lists.newArrayList();
          inferSortPositions(fsParent, sortPositions, sortOrder);
        }
        List<ColumnInfo> colInfos = fsParent.getSchema().getSignature();
        bucketColumns = getPositionsToExprNodes(bucketPositions, colInfos);
      }
      // derive a null-ordering flag for each sort column from its sort direction
      List<Integer> sortNullOrder = new ArrayList<Integer>();
      for (int order : sortOrder) {
        sortNullOrder.add(order == 1 ? 0 : 1); // for asc, nulls first; for desc, nulls last
      }
      LOG.debug("Got sort order");
      for (int i : sortPositions) LOG.debug("sort position " + i);
      for (int i : sortOrder) LOG.debug("sort order " + i);
      for (int i : sortNullOrder) LOG.debug("sort null order " + i);
      List<Integer> partitionPositions = getPartitionPositions(dpCtx, fsParent.getSchema());

      // update file sink descriptor
      fsOp.getConf().setMultiFileSpray(false);
      fsOp.getConf().setNumFiles(1);
      fsOp.getConf().setTotalFiles(1);

      // every column of the parent's schema is forwarded through the new ReduceSink
      ArrayList<ColumnInfo> parentCols = Lists.newArrayList(fsParent.getSchema().getSignature());
      ArrayList<ExprNodeDesc> allRSCols = Lists.newArrayList();
      for (ColumnInfo ci : parentCols) {
        allRSCols.add(new ExprNodeColumnDesc(ci));
      }

      // Create ReduceSink operator
      ReduceSinkOperator rsOp =
          getReduceSinkOp(
              partitionPositions,
              sortPositions,
              sortOrder,
              sortNullOrder,
              allRSCols,
              bucketColumns,
              numBuckets,
              fsParent,
              fsOp.getConf().getWriteType());

      // Build the Select's expressions: columns at a partition or sort position are read from
      // the reduce KEY, all others from the reduce VALUE.
      List<ExprNodeDesc> descs = new ArrayList<ExprNodeDesc>(allRSCols.size());
      List<String> colNames = new ArrayList<String>();
      String colName;
      for (int i = 0; i < allRSCols.size(); i++) {
        ExprNodeDesc col = allRSCols.get(i);
        colName = col.getExprString();
        colNames.add(colName);
        if (partitionPositions.contains(i) || sortPositions.contains(i)) {
          descs.add(
              new ExprNodeColumnDesc(
                  col.getTypeInfo(), ReduceField.KEY.toString() + "." + colName, null, false));
        } else {
          descs.add(
              new ExprNodeColumnDesc(
                  col.getTypeInfo(), ReduceField.VALUE.toString() + "." + colName, null, false));
        }
      }
      RowSchema selRS = new RowSchema(fsParent.getSchema());
      if (!bucketColumns.isEmpty()
          || fsOp.getConf().getWriteType() == Operation.DELETE
          || fsOp.getConf().getWriteType() == Operation.UPDATE) {
        // expose the bucket number as an extra key column, and add a matching ColumnInfo to
        // both the Select's schema and the parent's schema
        descs.add(
            new ExprNodeColumnDesc(
                TypeInfoFactory.stringTypeInfo,
                ReduceField.KEY.toString() + ".'" + BUCKET_NUMBER_COL_NAME + "'",
                null,
                false));
        colNames.add("'" + BUCKET_NUMBER_COL_NAME + "'");
        // NOTE(review): get(0) assumes the schema has at least one column - confirm.
        ColumnInfo ci =
            new ColumnInfo(
                BUCKET_NUMBER_COL_NAME,
                TypeInfoFactory.stringTypeInfo,
                selRS.getSignature().get(0).getTabAlias(),
                true,
                true);
        selRS.getSignature().add(ci);
        fsParent.getSchema().getSignature().add(ci);
      }
      // Create SelectDesc
      SelectDesc selConf = new SelectDesc(descs, colNames);

      // Create Select Operator
      SelectOperator selOp = (SelectOperator) OperatorFactory.getAndMakeChild(selConf, selRS, rsOp);

      // link SEL to FS
      fsOp.getParentOperators().clear();
      fsOp.getParentOperators().add(selOp);
      selOp.getChildOperators().add(fsOp);

      // Set if partition sorted or partition bucket sorted
      fsOp.getConf().setDpSortState(FileSinkDesc.DPSortState.PARTITION_SORTED);
      if (bucketColumns.size() > 0
          || fsOp.getConf().getWriteType() == Operation.DELETE
          || fsOp.getConf().getWriteType() == Operation.UPDATE) {
        fsOp.getConf().setDpSortState(FileSinkDesc.DPSortState.PARTITION_BUCKET_SORTED);
      }

      // update partition column info in FS descriptor
      fsOp.getConf().setPartitionCols(rsOp.getConf().getPartitionCols());

      LOG.info(
          "Inserted "
              + rsOp.getOperatorId()
              + " and "
              + selOp.getOperatorId()
              + " as parent of "
              + fsOp.getOperatorId()
              + " and child of "
              + fsParent.getOperatorId());

      // record on the parse context that this optimization added a ReduceSink
      parseCtx.setReduceSinkAddedBySortedDynPartition(true);
      return null;
    }