/**
   * Construct a conditional task given the current leaf task, the MoveWork and the MapredWork.
   *
   * @param conf HiveConf
   * @param currTask current leaf task
   * @param mvWork MoveWork for the move task
   * @param mergeWork MapredWork for the merge task.
   * @param inputPath the input directory of the merge/move task
   * @return The conditional task
   */
  private ConditionalTask createCondTask(
      HiveConf conf,
      Task<? extends Serializable> currTask,
      MoveWork mvWork,
      MapredWork mergeWork,
      String inputPath) {

    Task<? extends Serializable> mergeTask = TaskFactory.get(mergeWork, conf);
    Task<? extends Serializable> moveTask = TaskFactory.get(mvWork, conf);
    List<Serializable> listWorks = new ArrayList<Serializable>();
    listWorks.add(mvWork);
    listWorks.add(mergeWork);

    ConditionalWork cndWork = new ConditionalWork(listWorks);

    List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>();
    listTasks.add(moveTask);
    listTasks.add(mergeTask);

    ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork, conf);
    cndTsk.setListTasks(listTasks);

    // create resolver
    cndTsk.setResolver(new ConditionalResolverMergeFiles());
    ConditionalResolverMergeFilesCtx mrCtx =
        new ConditionalResolverMergeFilesCtx(listTasks, inputPath);
    cndTsk.setResolverCtx(mrCtx);

    // make the conditional task as the child of the current leaf task
    currTask.addDependentTask(cndTsk);

    return cndTsk;
  }
Example #2
0
  /**
   * Construct a conditional task given the current leaf task, the MoveWork and the MapredWork.
   *
   * @param conf HiveConf
   * @param currTask current leaf task
   * @param mvWork MoveWork for the move task
   * @param mergeWork MapredWork for the merge task.
   * @param inputPath the input directory of the merge/move task
   * @return The conditional task
   */
  private ConditionalTask createCondTask(
      HiveConf conf,
      Task<? extends Serializable> currTask,
      MoveWork mvWork,
      MapredWork mergeWork,
      String inputPath) {

    // There are 3 options for this ConditionalTask:
    // 1) Merge the partitions
    // 2) Move the partitions (i.e. don't merge the partitions)
    // 3) Merge some partitions and move other partitions (i.e. merge some partitions and don't
    // merge others) in this case the merge is done first followed by the move to prevent
    // conflicts.
    Task<? extends Serializable> mergeOnlyMergeTask = TaskFactory.get(mergeWork, conf);
    Task<? extends Serializable> moveOnlyMoveTask = TaskFactory.get(mvWork, conf);
    Task<? extends Serializable> mergeAndMoveMergeTask = TaskFactory.get(mergeWork, conf);
    Task<? extends Serializable> mergeAndMoveMoveTask = TaskFactory.get(mvWork, conf);

    // NOTE! It is necessary merge task is the parent of the move task, and not
    // the other way around, for the proper execution of the execute method of
    // ConditionalTask
    mergeAndMoveMergeTask.addDependentTask(mergeAndMoveMoveTask);

    List<Serializable> listWorks = new ArrayList<Serializable>();
    listWorks.add(mvWork);
    listWorks.add(mergeWork);

    ConditionalWork cndWork = new ConditionalWork(listWorks);

    List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>();
    listTasks.add(moveOnlyMoveTask);
    listTasks.add(mergeOnlyMergeTask);
    listTasks.add(mergeAndMoveMergeTask);

    ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork, conf);
    cndTsk.setListTasks(listTasks);

    // create resolver
    cndTsk.setResolver(new ConditionalResolverMergeFiles());
    ConditionalResolverMergeFilesCtx mrCtx =
        new ConditionalResolverMergeFilesCtx(listTasks, inputPath);
    cndTsk.setResolverCtx(mrCtx);

    // make the conditional task as the child of the current leaf task
    currTask.addDependentTask(cndTsk);

    return cndTsk;
  }
Example #3
0
  /**
   * Make the move task in the GenMRProcContext following the FileSinkOperator a dependent of all
   * possible subtrees branching from the ConditionalTask.
   *
   * @param ctx
   * @param newOutput
   * @param cndTsk
   */
  private void linkMoveTask(
      GenMRProcContext ctx, FileSinkOperator newOutput, ConditionalTask cndTsk) {

    List<Task<MoveWork>> mvTasks = ctx.getMvTask();
    Task<MoveWork> mvTask = findMoveTask(mvTasks, newOutput);

    for (Task<? extends Serializable> tsk : cndTsk.getListTasks()) {
      linkMoveTask(ctx, mvTask, tsk);
    }
  }
  private void LinkMoveTask(
      GenMRProcContext ctx, FileSinkOperator newOutput, ConditionalTask cndTsk) {

    List<Task<? extends Serializable>> mvTasks = ctx.getMvTask();
    Task<? extends Serializable> mvTask = findMoveTask(mvTasks, newOutput);

    if (mvTask != null) {
      for (Task<? extends Serializable> tsk : cndTsk.getListTasks()) {
        tsk.addDependentTask(mvTask);
      }
    }
  }
  /**
   * create a Map-only merge job with the following operators:
   *
   * @param fsInput
   * @param ctx
   * @param finalName MR job J0: ... | v FileSinkOperator_1 (fsInput) | v Merge job J1: | v
   *     TableScan (using CombineHiveInputFormat) (tsMerge) | v FileSinkOperator (fsMerge)
   *     <p>Here the pathToPartitionInfo & pathToAlias will remain the same, which means the paths
   *     do not contain the dynamic partitions (their parent). So after the dynamic partitions are
   *     created (after the first job finished before the moveTask or ConditionalTask start), we
   *     need to change the pathToPartitionInfo & pathToAlias to include the dynamic partition
   *     directories.
   */
  private void createMap4Merge(FileSinkOperator fsInput, GenMRProcContext ctx, String finalName) {

    //
    // 1. create the operator tree
    //
    ParseContext parseCtx = ctx.getParseCtx();
    FileSinkDesc fsInputDesc = fsInput.getConf();

    // Create a TableScan operator
    RowSchema inputRS = fsInput.getSchema();
    Operator<? extends Serializable> tsMerge = OperatorFactory.get(TableScanDesc.class, inputRS);

    // Create a FileSink operator
    TableDesc ts = (TableDesc) fsInputDesc.getTableInfo().clone();
    FileSinkDesc fsOutputDesc =
        new FileSinkDesc(
            finalName, ts, parseCtx.getConf().getBoolVar(HiveConf.ConfVars.COMPRESSRESULT));
    FileSinkOperator fsOutput =
        (FileSinkOperator) OperatorFactory.getAndMakeChild(fsOutputDesc, inputRS, tsMerge);

    // If the input FileSinkOperator is a dynamic partition enabled, the tsMerge input schema
    // needs to include the partition column, and the fsOutput should have
    // a DynamicPartitionCtx to indicate that it needs to dynamically partitioned.
    DynamicPartitionCtx dpCtx = fsInputDesc.getDynPartCtx();
    if (dpCtx != null && dpCtx.getNumDPCols() > 0) {
      // adding DP ColumnInfo to the RowSchema signature
      ArrayList<ColumnInfo> signature = inputRS.getSignature();
      String tblAlias = fsInputDesc.getTableInfo().getTableName();
      LinkedHashMap<String, String> colMap = new LinkedHashMap<String, String>();
      StringBuilder partCols = new StringBuilder();
      for (String dpCol : dpCtx.getDPColNames()) {
        ColumnInfo colInfo =
            new ColumnInfo(
                dpCol,
                TypeInfoFactory.stringTypeInfo, // all partition column type should be string
                tblAlias,
                true); // partition column is virtual column
        signature.add(colInfo);
        colMap.put(dpCol, dpCol); // input and output have the same column name
        partCols.append(dpCol).append('/');
      }
      partCols.setLength(partCols.length() - 1); // remove the last '/'
      inputRS.setSignature(signature);

      // create another DynamicPartitionCtx, which has a different input-to-DP column mapping
      DynamicPartitionCtx dpCtx2 = new DynamicPartitionCtx(dpCtx);
      dpCtx2.setInputToDPCols(colMap);
      fsOutputDesc.setDynPartCtx(dpCtx2);

      // update the FileSinkOperator to include partition columns
      fsInputDesc
          .getTableInfo()
          .getProperties()
          .setProperty(
              org.apache.hadoop.hive.metastore.api.Constants.META_TABLE_PARTITION_COLUMNS,
              partCols.toString()); // list of dynamic partition column names
    } else {
      // non-partitioned table
      fsInputDesc
          .getTableInfo()
          .getProperties()
          .remove(org.apache.hadoop.hive.metastore.api.Constants.META_TABLE_PARTITION_COLUMNS);
    }

    //
    // 2. Constructing a conditional task consisting of a move task and a map reduce task
    //
    MapRedTask currTask = (MapRedTask) ctx.getCurrTask();
    MoveWork dummyMv =
        new MoveWork(
            null,
            null,
            null,
            new LoadFileDesc(fsInputDesc.getDirName(), finalName, true, null, null),
            false);
    MapredWork cplan = createMergeTask(ctx.getConf(), tsMerge, fsInputDesc);
    // use CombineHiveInputFormat for map-only merging
    cplan.setInputformat("org.apache.hadoop.hive.ql.io.CombineHiveInputFormat");
    // NOTE: we should gather stats in MR1 rather than MR2 at merge job since we don't
    // know if merge MR2 will be triggered at execution time
    ConditionalTask cndTsk =
        createCondTask(ctx.getConf(), ctx.getCurrTask(), dummyMv, cplan, fsInputDesc.getDirName());

    // keep the dynamic partition context in conditional task resolver context
    ConditionalResolverMergeFilesCtx mrCtx =
        (ConditionalResolverMergeFilesCtx) cndTsk.getResolverCtx();
    mrCtx.setDPCtx(fsInputDesc.getDynPartCtx());

    //
    // 3. add the moveTask as the children of the conditional task
    //
    LinkMoveTask(ctx, fsOutput, cndTsk);
  }
Example #6
0
  /**
   * @param fsInput The FileSink operator.
   * @param ctx The MR processing context.
   * @param finalName the final destination path the merge job should output.
   * @throws SemanticException
   *     <p>create a Map-only merge job using CombineHiveInputFormat for all partitions with
   *     following operators: MR job J0: ... | v FileSinkOperator_1 (fsInput) | v Merge job J1: | v
   *     TableScan (using CombineHiveInputFormat) (tsMerge) | v FileSinkOperator (fsMerge)
   *     <p>Here the pathToPartitionInfo & pathToAlias will remain the same, which means the paths
   *     do not contain the dynamic partitions (their parent). So after the dynamic partitions are
   *     created (after the first job finished before the moveTask or ConditionalTask start), we
   *     need to change the pathToPartitionInfo & pathToAlias to include the dynamic partition
   *     directories.
   */
  private void createMRWorkForMergingFiles(
      FileSinkOperator fsInput, GenMRProcContext ctx, String finalName) throws SemanticException {

    //
    // 1. create the operator tree
    //
    HiveConf conf = ctx.getParseCtx().getConf();
    FileSinkDesc fsInputDesc = fsInput.getConf();

    // Create a TableScan operator
    RowSchema inputRS = fsInput.getSchema();
    Operator<? extends OperatorDesc> tsMerge = OperatorFactory.get(TableScanDesc.class, inputRS);

    // Create a FileSink operator
    TableDesc ts = (TableDesc) fsInputDesc.getTableInfo().clone();
    FileSinkDesc fsOutputDesc =
        new FileSinkDesc(finalName, ts, conf.getBoolVar(ConfVars.COMPRESSRESULT));
    FileSinkOperator fsOutput =
        (FileSinkOperator) OperatorFactory.getAndMakeChild(fsOutputDesc, inputRS, tsMerge);

    // If the input FileSinkOperator is a dynamic partition enabled, the tsMerge input schema
    // needs to include the partition column, and the fsOutput should have
    // a DynamicPartitionCtx to indicate that it needs to dynamically partitioned.
    DynamicPartitionCtx dpCtx = fsInputDesc.getDynPartCtx();
    if (dpCtx != null && dpCtx.getNumDPCols() > 0) {
      // adding DP ColumnInfo to the RowSchema signature
      ArrayList<ColumnInfo> signature = inputRS.getSignature();
      String tblAlias = fsInputDesc.getTableInfo().getTableName();
      LinkedHashMap<String, String> colMap = new LinkedHashMap<String, String>();
      StringBuilder partCols = new StringBuilder();
      for (String dpCol : dpCtx.getDPColNames()) {
        ColumnInfo colInfo =
            new ColumnInfo(
                dpCol,
                TypeInfoFactory.stringTypeInfo, // all partition column type should be string
                tblAlias,
                true); // partition column is virtual column
        signature.add(colInfo);
        colMap.put(dpCol, dpCol); // input and output have the same column name
        partCols.append(dpCol).append('/');
      }
      partCols.setLength(partCols.length() - 1); // remove the last '/'
      inputRS.setSignature(signature);

      // create another DynamicPartitionCtx, which has a different input-to-DP column mapping
      DynamicPartitionCtx dpCtx2 = new DynamicPartitionCtx(dpCtx);
      dpCtx2.setInputToDPCols(colMap);
      fsOutputDesc.setDynPartCtx(dpCtx2);

      // update the FileSinkOperator to include partition columns
      fsInputDesc
          .getTableInfo()
          .getProperties()
          .setProperty(
              org.apache
                  .hadoop
                  .hive
                  .metastore
                  .api
                  .hive_metastoreConstants
                  .META_TABLE_PARTITION_COLUMNS,
              partCols.toString()); // list of dynamic partition column names
    } else {
      // non-partitioned table
      fsInputDesc
          .getTableInfo()
          .getProperties()
          .remove(
              org.apache
                  .hadoop
                  .hive
                  .metastore
                  .api
                  .hive_metastoreConstants
                  .META_TABLE_PARTITION_COLUMNS);
    }

    //
    // 2. Constructing a conditional task consisting of a move task and a map reduce task
    //
    MoveWork dummyMv =
        new MoveWork(
            null,
            null,
            null,
            new LoadFileDesc(fsInputDesc.getFinalDirName(), finalName, true, null, null),
            false);
    MapredWork cplan;

    if (conf.getBoolVar(ConfVars.HIVEMERGERCFILEBLOCKLEVEL)
        && fsInputDesc.getTableInfo().getInputFileFormatClass().equals(RCFileInputFormat.class)) {

      // Check if InputFormatClass is valid
      String inputFormatClass = conf.getVar(ConfVars.HIVEMERGERCFILEINPUTFORMATBLOCKLEVEL);
      try {
        Class c = (Class<? extends InputFormat>) Class.forName(inputFormatClass);

        LOG.info("RCFile format- Using block level merge");
        cplan =
            createBlockMergeTask(
                fsInputDesc,
                finalName,
                dpCtx != null && dpCtx.getNumDPCols() > 0,
                RCFileMergeMapper.class,
                RCFileInputFormat.class,
                RCFileBlockMergeInputFormat.class);
      } catch (ClassNotFoundException e) {
        String msg = "Illegal input format class: " + inputFormatClass;
        throw new SemanticException(msg);
      }

    } else if (conf.getBoolVar(ConfVars.HIVEMERGEORCBLOCKLEVEL)
        && fsInputDesc.getTableInfo().getInputFileFormatClass().equals(OrcInputFormat.class)) {

      // Check if InputFormatClass is valid
      String inputFormatClass = conf.getVar(ConfVars.HIVEMERGEORCINPUTFORMATBLOCKLEVEL);
      try {
        Class c = (Class<? extends InputFormat>) Class.forName(inputFormatClass);

        LOG.info("ORCFile format- Using block level merge");
        cplan =
            createBlockMergeTask(
                fsInputDesc,
                finalName,
                dpCtx != null && dpCtx.getNumDPCols() > 0,
                OrcMergeMapper.class,
                OrcInputFormat.class,
                OrcBlockMergeInputFormat.class);
      } catch (ClassNotFoundException e) {
        String msg = "Illegal input format class: " + inputFormatClass;
        throw new SemanticException(msg);
      }

    } else {
      cplan = createMRWorkForMergingFiles(conf, tsMerge, fsInputDesc);
      // use CombineHiveInputFormat for map-only merging
    }
    cplan.setInputformat("org.apache.hadoop.hive.ql.io.CombineHiveInputFormat");
    // NOTE: we should gather stats in MR1 rather than MR2 at merge job since we don't
    // know if merge MR2 will be triggered at execution time
    ConditionalTask cndTsk =
        createCondTask(conf, ctx.getCurrTask(), dummyMv, cplan, fsInputDesc.getFinalDirName());

    // keep the dynamic partition context in conditional task resolver context
    ConditionalResolverMergeFilesCtx mrCtx =
        (ConditionalResolverMergeFilesCtx) cndTsk.getResolverCtx();
    mrCtx.setDPCtx(fsInputDesc.getDynPartCtx());
    mrCtx.setLbCtx(fsInputDesc.getLbCtx());

    //
    // 3. add the moveTask as the children of the conditional task
    //
    linkMoveTask(ctx, fsOutput, cndTsk);
  }