@Override public Object dispatch(Node nd, Stack<Node> stack, Object... nodeOutputs) throws SemanticException { Task<? extends Serializable> task = (Task<? extends Serializable>) nd; if (!task.isMapRedTask() || task instanceof ConditionalTask || ((MapredWork) task.getWork()).getReducer() == null) { return null; } SkewJoinProcCtx skewJoinProcContext = new SkewJoinProcCtx(task, physicalContext.getParseContext()); Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>(); opRules.put( new RuleRegExp("R1", CommonJoinOperator.getOperatorName() + "%"), SkewJoinProcFactory.getJoinProc()); // The dispatcher fires the processor corresponding to the closest // matching rule and passes the context along Dispatcher disp = new DefaultRuleDispatcher( SkewJoinProcFactory.getDefaultProc(), opRules, skewJoinProcContext); GraphWalker ogw = new DefaultGraphWalker(disp); // iterate over the reducer operator tree ArrayList<Node> topNodes = new ArrayList<Node>(); topNodes.add(((MapredWork) task.getWork()).getReducer()); ogw.startWalking(topNodes, null); return null; }
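// A minimal, self-contained sketch (not Hive's Rule/Dispatcher/GraphWalker API) of the
// rule-dispatch idea used above: each regex rule maps to a processor, a node whose
// operator name matches a rule is handled by that rule's processor, and unmatched nodes
// fall back to a default processor. The names NodeProcessor and dispatch are illustrative.
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Pattern;

public class RuleDispatchSketch {
  interface NodeProcessor {
    void process(String operatorName);
  }

  static void dispatch(String operatorName,
                       Map<Pattern, NodeProcessor> rules,
                       NodeProcessor defaultProc) {
    for (Map.Entry<Pattern, NodeProcessor> e : rules.entrySet()) {
      if (e.getKey().matcher(operatorName).find()) {
        e.getValue().process(operatorName);  // first matching rule wins in this sketch
        return;
      }
    }
    defaultProc.process(operatorName);       // no rule matched
  }

  public static void main(String[] args) {
    Map<Pattern, NodeProcessor> rules = new LinkedHashMap<Pattern, NodeProcessor>();
    rules.put(Pattern.compile("JOIN"), name -> System.out.println("skew-join processor: " + name));
    NodeProcessor defaultProc = name -> System.out.println("default processor: " + name);
    dispatch("JOIN_5", rules, defaultProc);  // handled by the join rule
    dispatch("SEL_2", rules, defaultProc);   // handled by the default processor
  }
}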
/* * Multiple file sink descriptors are linked. * Use the task created by the first linked file descriptor */ private void processLinkedFileDesc(GenMRProcContext ctx, Task<? extends Serializable> childTask) throws SemanticException { Operator<? extends OperatorDesc> currTopOp = ctx.getCurrTopOp(); String currAliasId = ctx.getCurrAliasId(); List<Operator<? extends OperatorDesc>> seenOps = ctx.getSeenOps(); List<Task<? extends Serializable>> rootTasks = ctx.getRootTasks(); Task<? extends Serializable> currTask = ctx.getCurrTask(); if (currTopOp != null) { if (!seenOps.contains(currTopOp)) { seenOps.add(currTopOp); GenMapRedUtils.setTaskPlan( currAliasId, currTopOp, (MapredWork) currTask.getWork(), false, ctx); } if (!rootTasks.contains(currTask) && (currTask.getParentTasks() == null || currTask.getParentTasks().isEmpty())) { rootTasks.add(currTask); } } if (childTask != null) { currTask.addDependentTask(childTask); } }
/* * It is an idempotent function to add various intermediate files as the source * for the union. The plan has already been created. */ public static void initUnionPlan( GenMRProcContext opProcCtx, Task<? extends Serializable> currTask, boolean local) { MapredWork plan = (MapredWork) currTask.getWork(); UnionOperator currUnionOp = opProcCtx.getCurrUnionOp(); assert currUnionOp != null; GenMRUnionCtx uCtx = opProcCtx.getUnionTask(currUnionOp); assert uCtx != null; List<String> taskTmpDirLst = uCtx.getTaskTmpDir(); List<TableDesc> tt_descLst = uCtx.getTTDesc(); assert !taskTmpDirLst.isEmpty() && !tt_descLst.isEmpty(); assert taskTmpDirLst.size() == tt_descLst.size(); int size = taskTmpDirLst.size(); assert local == false; for (int pos = 0; pos < size; pos++) { String taskTmpDir = taskTmpDirLst.get(pos); TableDesc tt_desc = tt_descLst.get(pos); if (plan.getPathToAliases().get(taskTmpDir) == null) { plan.getPathToAliases().put(taskTmpDir, new ArrayList<String>()); plan.getPathToAliases().get(taskTmpDir).add(taskTmpDir); plan.getPathToPartitionInfo().put(taskTmpDir, new PartitionDesc(tt_desc, null)); plan.getAliasToWork().put(taskTmpDir, currUnionOp); } } }
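// Self-contained sketch (plain maps instead of MapredWork) of why initUnionPlan above is
// idempotent: an intermediate directory is registered in pathToAliases only the first
// time it is seen, so repeated calls for the same union leave the plan unchanged.
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class UnionPlanIdempotenceSketch {
  static void addUnionSource(Map<String, List<String>> pathToAliases, String taskTmpDir) {
    if (pathToAliases.get(taskTmpDir) == null) {      // skip directories already registered
      pathToAliases.put(taskTmpDir, new ArrayList<String>());
      pathToAliases.get(taskTmpDir).add(taskTmpDir);  // the tmp dir doubles as its own alias
    }
  }

  public static void main(String[] args) {
    Map<String, List<String>> pathToAliases = new LinkedHashMap<String, List<String>>();
    addUnionSource(pathToAliases, "/tmp/hive/union-src-0");
    addUnionSource(pathToAliases, "/tmp/hive/union-src-0"); // second call is a no-op
    System.out.println(pathToAliases); // {/tmp/hive/union-src-0=[/tmp/hive/union-src-0]}
  }
}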
/** * Add the StatsTask as a dependent task of the MoveTask because StatsTask will change the * Table/Partition metadata. For atomicity, we should not change it before the data has actually * been moved by the MoveTask. * * @param nd the FileSinkOperator whose results are taken care of by the MoveTask. * @param mvTask The MoveTask that moves the FileSinkOperator's results. * @param currTask The MapRedTask that the FileSinkOperator belongs to. * @param hconf HiveConf */ private void addStatsTask( FileSinkOperator nd, MoveTask mvTask, Task<? extends Serializable> currTask, HiveConf hconf) { MoveWork mvWork = ((MoveTask) mvTask).getWork(); StatsWork statsWork = null; if (mvWork.getLoadTableWork() != null) { statsWork = new StatsWork(mvWork.getLoadTableWork()); } else if (mvWork.getLoadFileWork() != null) { statsWork = new StatsWork(mvWork.getLoadFileWork()); } assert statsWork != null : "Error when generating StatsTask"; statsWork.setStatsReliable(hconf.getBoolVar(ConfVars.HIVE_STATS_RELIABLE)); MapredWork mrWork = (MapredWork) currTask.getWork(); // AggKey in StatsWork is used for stats aggregation while StatsAggPrefix // in FileSinkDesc is used for stats publishing. They should be consistent. statsWork.setAggKey(((FileSinkOperator) nd).getConf().getStatsAggPrefix()); Task<? extends Serializable> statsTask = TaskFactory.get(statsWork, hconf); // mark the MapredWork and FileSinkOperator for gathering stats nd.getConf().setGatherStats(true); mrWork.setGatheringStats(true); nd.getConf().setStatsReliable(hconf.getBoolVar(ConfVars.HIVE_STATS_RELIABLE)); nd.getConf() .setMaxStatsKeyPrefixLength(hconf.getIntVar(ConfVars.HIVE_STATS_KEY_PREFIX_MAX_LENGTH)); // mrWork.addDestinationTable(nd.getConf().getTableInfo().getTableName()); // subscribe feeds from the MoveTask so that MoveTask can forward the list // of dynamic partitions to the StatsTask mvTask.addDependentTask(statsTask); statsTask.subscribeFeed(mvTask); }
/** * File Sink Operator encountered. * * @param nd the file sink operator encountered * @param opProcCtx context */ public Object process( Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException { GenMRProcContext ctx = (GenMRProcContext) opProcCtx; ParseContext parseCtx = ctx.getParseCtx(); boolean chDir = false; Task<? extends Serializable> currTask = ctx.getCurrTask(); FileSinkOperator fsOp = (FileSinkOperator) nd; boolean isInsertTable = // is INSERT OVERWRITE TABLE fsOp.getConf().getTableInfo().getTableName() != null && parseCtx.getQB().getParseInfo().isInsertToTable(); HiveConf hconf = parseCtx.getConf(); // Has the user enabled merging of files for map-only jobs or for all jobs if ((ctx.getMvTask() != null) && (!ctx.getMvTask().isEmpty())) { List<Task<? extends Serializable>> mvTasks = ctx.getMvTask(); // In case of unions or map-joins, it is possible that the file has // already been seen. // So, no need to attempt to merge the files again. if ((ctx.getSeenFileSinkOps() == null) || (!ctx.getSeenFileSinkOps().contains(nd))) { // no need of merging if the move is to a local file system MoveTask mvTask = (MoveTask) findMoveTask(mvTasks, fsOp); if (isInsertTable && hconf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) { addStatsTask(fsOp, mvTask, currTask, parseCtx.getConf()); } if ((mvTask != null) && !mvTask.isLocal()) { // There are separate configuration parameters to control whether to // merge for a map-only job // or for a map-reduce job MapredWork currWork = (MapredWork) currTask.getWork(); boolean mergeMapOnly = hconf.getBoolVar(HiveConf.ConfVars.HIVEMERGEMAPFILES) && currWork.getReducer() == null; boolean mergeMapRed = hconf.getBoolVar(HiveConf.ConfVars.HIVEMERGEMAPREDFILES) && currWork.getReducer() != null; if (mergeMapOnly || mergeMapRed) { chDir = true; } } } } String finalName = processFS(nd, stack, opProcCtx, chDir); // need to merge the files in the destination table/partitions if (chDir && (finalName != null)) { createMergeJob((FileSinkOperator) nd, ctx, finalName); } return null; }
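// Sketch of the merge decision at the end of the handler above, with plain booleans
// standing in for the HiveConf lookups: a map-only job merges its output files when
// hive.merge.mapfiles is enabled, and a job with a reducer merges when
// hive.merge.mapredfiles is enabled.
public class MergeDecisionSketch {
  static boolean shouldMerge(boolean mergeMapFiles, boolean mergeMapRedFiles, boolean hasReducer) {
    boolean mergeMapOnly = mergeMapFiles && !hasReducer;   // HIVEMERGEMAPFILES and no reducer
    boolean mergeMapRed = mergeMapRedFiles && hasReducer;  // HIVEMERGEMAPREDFILES and a reducer
    return mergeMapOnly || mergeMapRed;
  }

  public static void main(String[] args) {
    System.out.println(shouldMerge(true, false, false)); // true: map-only job, map merge enabled
    System.out.println(shouldMerge(true, false, true));  // false: has reducer, map-red merge disabled
  }
}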
private Task<MoveWork> findMoveTask(List<Task<MoveWork>> mvTasks, FileSinkOperator fsOp) { // find the move task for (Task<MoveWork> mvTsk : mvTasks) { MoveWork mvWork = mvTsk.getWork(); String srcDir = null; if (mvWork.getLoadFileWork() != null) { srcDir = mvWork.getLoadFileWork().getSourceDir(); } else if (mvWork.getLoadTableWork() != null) { srcDir = mvWork.getLoadTableWork().getSourceDir(); } String fsOpDirName = fsOp.getConf().getFinalDirName(); if ((srcDir != null) && (srcDir.equalsIgnoreCase(fsOpDirName))) { return mvTsk; } } return null; }
private Task<? extends Serializable> findMoveTask( List<Task<? extends Serializable>> mvTasks, FileSinkOperator fsOp) { // find the move task for (Task<? extends Serializable> mvTsk : mvTasks) { MoveWork mvWork = (MoveWork) mvTsk.getWork(); String srcDir = null; if (mvWork.getLoadFileWork() != null) { srcDir = mvWork.getLoadFileWork().getSourceDir(); } else if (mvWork.getLoadTableWork() != null) { srcDir = mvWork.getLoadTableWork().getSourceDir(); } if ((srcDir != null) && (srcDir.equalsIgnoreCase(fsOp.getConf().getDirName()))) { return mvTsk; } } return null; }
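// Self-contained sketch (a hypothetical MoveSpec class, not Hive's MoveWork) of the matching
// rule both findMoveTask variants use: a move task claims a FileSinkOperator when the move's
// source directory equals the sink's output directory, compared case-insensitively.
import java.util.Arrays;
import java.util.List;

public class FindMoveTaskSketch {
  static class MoveSpec {
    final String sourceDir;             // analogous to the LoadFileDesc/LoadTableDesc source dir
    MoveSpec(String sourceDir) { this.sourceDir = sourceDir; }
  }

  static MoveSpec findMove(List<MoveSpec> moves, String sinkDir) {
    for (MoveSpec m : moves) {
      if (m.sourceDir != null && m.sourceDir.equalsIgnoreCase(sinkDir)) {
        return m;                       // first move that reads from the sink's directory
      }
    }
    return null;                        // no move task consumes this sink's output
  }

  public static void main(String[] args) {
    List<MoveSpec> moves = Arrays.asList(new MoveSpec("/tmp/hive/OUT1"), new MoveSpec("/tmp/hive/out2"));
    System.out.println(findMove(moves, "/tmp/hive/out1").sourceDir); // /tmp/hive/OUT1
  }
}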
/** * Initialize the current plan by adding it to root tasks. * * @param op the reduce sink operator encountered * @param opProcCtx processing context */ public static void initPlan(ReduceSinkOperator op, GenMRProcContext opProcCtx) throws SemanticException { Operator<? extends Serializable> reducer = op.getChildOperators().get(0); Map<Operator<? extends Serializable>, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx(); GenMapRedCtx mapredCtx = mapCurrCtx.get(op.getParentOperators().get(0)); Task<? extends Serializable> currTask = mapredCtx.getCurrTask(); MapredWork plan = (MapredWork) currTask.getWork(); HashMap<Operator<? extends Serializable>, Task<? extends Serializable>> opTaskMap = opProcCtx.getOpTaskMap(); Operator<? extends Serializable> currTopOp = opProcCtx.getCurrTopOp(); opTaskMap.put(reducer, currTask); plan.setReducer(reducer); ReduceSinkDesc desc = op.getConf(); plan.setNumReduceTasks(desc.getNumReducers()); List<Task<? extends Serializable>> rootTasks = opProcCtx.getRootTasks(); if (!rootTasks.contains(currTask)) { rootTasks.add(currTask); } if (reducer.getClass() == JoinOperator.class) { plan.setNeedsTagging(true); } assert currTopOp != null; List<Operator<? extends Serializable>> seenOps = opProcCtx.getSeenOps(); String currAliasId = opProcCtx.getCurrAliasId(); if (!seenOps.contains(currTopOp)) { seenOps.add(currTopOp); setTaskPlan(currAliasId, currTopOp, plan, false, opProcCtx); } currTopOp = null; currAliasId = null; opProcCtx.setCurrTask(currTask); opProcCtx.setCurrTopOp(currTopOp); opProcCtx.setCurrAliasId(currAliasId); }
/** * Adds the dependencyTaskForMultiInsert in ctx as a dependent of parentTask. If mvTask is a load * table, and HIVE_MULTI_INSERT_MOVE_TASKS_SHARE_DEPENDENCIES is set, adds mvTask as a dependent of * dependencyTaskForMultiInsert in ctx, otherwise adds mvTask as a dependent of parentTask as * well. * * @param ctx * @param mvTask * @param parentTask */ private void addDependentMoveTasks( GenMRProcContext ctx, Task<MoveWork> mvTask, Task<? extends Serializable> parentTask) { if (mvTask != null) { if (ctx.getConf().getBoolVar(ConfVars.HIVE_MULTI_INSERT_MOVE_TASKS_SHARE_DEPENDENCIES)) { DependencyCollectionTask dependencyTask = ctx.getDependencyTaskForMultiInsert(); parentTask.addDependentTask(dependencyTask); if (mvTask.getWork().getLoadTableWork() != null) { // Moving tables/partitions depend on the dependencyTask dependencyTask.addDependentTask(mvTask); } else { // Moving files depends on the parentTask (we still want the dependencyTask to depend // on the parentTask) parentTask.addDependentTask(mvTask); } } else { parentTask.addDependentTask(mvTask); } } }
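// Minimal sketch (a toy task with a child list, not Hive's Task hierarchy) of the wiring in
// addDependentMoveTasks: with shared dependencies enabled, table/partition loads hang off the
// shared dependency-collection task, while plain file moves still hang off the parent task.
import java.util.ArrayList;
import java.util.List;

public class DependentMoveTaskSketch {
  static class ToyTask {
    final String name;
    final List<ToyTask> children = new ArrayList<ToyTask>();
    ToyTask(String name) { this.name = name; }
    void addDependentTask(ToyTask t) { children.add(t); }
  }

  static void wire(ToyTask parent, ToyTask dependencyCollection, ToyTask move,
                   boolean shareDependencies, boolean isLoadTable) {
    if (shareDependencies) {
      parent.addDependentTask(dependencyCollection);
      if (isLoadTable) {
        dependencyCollection.addDependentTask(move); // table/partition moves wait for all inserts
      } else {
        parent.addDependentTask(move);               // file moves only wait for their own parent
      }
    } else {
      parent.addDependentTask(move);
    }
  }

  public static void main(String[] args) {
    ToyTask parent = new ToyTask("mapred"), dep = new ToyTask("dep"), mv = new ToyTask("move");
    wire(parent, dep, mv, true, true);
    System.out.println(parent.children.size() + " " + dep.children.size()); // 1 1
  }
}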
/** * Initialize the current union plan. * * @param op the reduce sink operator encountered * @param opProcCtx processing context */ public static void initUnionPlan(ReduceSinkOperator op, GenMRProcContext opProcCtx) throws SemanticException { Operator<? extends Serializable> reducer = op.getChildOperators().get(0); Map<Operator<? extends Serializable>, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx(); GenMapRedCtx mapredCtx = mapCurrCtx.get(op.getParentOperators().get(0)); Task<? extends Serializable> currTask = mapredCtx.getCurrTask(); MapredWork plan = (MapredWork) currTask.getWork(); HashMap<Operator<? extends Serializable>, Task<? extends Serializable>> opTaskMap = opProcCtx.getOpTaskMap(); opTaskMap.put(reducer, currTask); plan.setReducer(reducer); ReduceSinkDesc desc = op.getConf(); plan.setNumReduceTasks(desc.getNumReducers()); if (reducer.getClass() == JoinOperator.class) { plan.setNeedsTagging(true); } initUnionPlan(opProcCtx, currTask, false); }
/** * Add the StatsTask as a dependent task of the MoveTask because StatsTask will change the * Table/Partition metadata. For atomicity, we should not change it before the data has actually * been moved by the MoveTask. * * @param nd the FileSinkOperator whose results are taken care of by the MoveTask. * @param mvTask The MoveTask that moves the FileSinkOperator's results. * @param currTask The MapRedTask that the FileSinkOperator belongs to. * @param hconf HiveConf */ private void addStatsTask( FileSinkOperator nd, MoveTask mvTask, Task<? extends Serializable> currTask, HiveConf hconf) { MoveWork mvWork = ((MoveTask) mvTask).getWork(); StatsWork statsWork = new StatsWork(mvWork.getLoadTableWork()); MapredWork mrWork = (MapredWork) currTask.getWork(); // AggKey in StatsWork is used for stats aggregation while StatsAggPrefix // in FileSinkDesc is used for stats publishing. They should be consistent. statsWork.setAggKey(((FileSinkOperator) nd).getConf().getStatsAggPrefix()); Task<? extends Serializable> statsTask = TaskFactory.get(statsWork, hconf); // mark the MapredWork and FileSinkOperator for gathering stats nd.getConf().setGatherStats(true); mrWork.setGatheringStats(true); // mrWork.addDestinationTable(nd.getConf().getTableInfo().getTableName()); // subscribe feeds from the MoveTask so that MoveTask can forward the list // of dynamic partitions to the StatsTask mvTask.addDependentTask(statsTask); statsTask.subscribeFeed(mvTask); }
/** * Process the FileSink operator to generate a MoveTask if necessary. * * @param nd current FileSink operator * @param stack parent operators * @param opProcCtx * @param chDir whether the operator should be first output to a tmp dir and then merged to the * final dir later * @return the final file name to which the FileSinkOperator should store. * @throws SemanticException */ private String processFS(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, boolean chDir) throws SemanticException { // Is it the dummy file sink after the mapjoin? FileSinkOperator fsOp = (FileSinkOperator) nd; if ((fsOp.getParentOperators().size() == 1) && (fsOp.getParentOperators().get(0) instanceof MapJoinOperator)) { return null; } GenMRProcContext ctx = (GenMRProcContext) opProcCtx; List<FileSinkOperator> seenFSOps = ctx.getSeenFileSinkOps(); if (seenFSOps == null) { seenFSOps = new ArrayList<FileSinkOperator>(); } if (!seenFSOps.contains(fsOp)) { seenFSOps.add(fsOp); } ctx.setSeenFileSinkOps(seenFSOps); Task<? extends Serializable> currTask = ctx.getCurrTask(); // If the directory needs to be changed, send the new directory String dest = null; if (chDir) { dest = fsOp.getConf().getDirName(); // generate the temporary file // it must be on the same file system as the current destination ParseContext parseCtx = ctx.getParseCtx(); Context baseCtx = parseCtx.getContext(); String tmpDir = baseCtx.getExternalTmpFileURI((new Path(dest)).toUri()); fsOp.getConf().setDirName(tmpDir); } Task<? extends Serializable> mvTask = null; if (!chDir) { mvTask = findMoveTask(ctx.getMvTask(), fsOp); } Operator<? extends Serializable> currTopOp = ctx.getCurrTopOp(); String currAliasId = ctx.getCurrAliasId(); HashMap<Operator<? extends Serializable>, Task<? extends Serializable>> opTaskMap = ctx.getOpTaskMap(); List<Operator<? extends Serializable>> seenOps = ctx.getSeenOps(); List<Task<? extends Serializable>> rootTasks = ctx.getRootTasks(); // Set the move task to be dependent on the current task if (mvTask != null) { currTask.addDependentTask(mvTask); } // In case of multi-table insert, the path to alias mapping is needed for // all the sources. Since there is no // reducer, treat it as a plan with null reducer // If it is a map-only job, the task needs to be processed if (currTopOp != null) { Task<? extends Serializable> mapTask = opTaskMap.get(null); if (mapTask == null) { assert (!seenOps.contains(currTopOp)); seenOps.add(currTopOp); GenMapRedUtils.setTaskPlan( currAliasId, currTopOp, (MapredWork) currTask.getWork(), false, ctx); opTaskMap.put(null, currTask); rootTasks.add(currTask); } else { if (!seenOps.contains(currTopOp)) { seenOps.add(currTopOp); GenMapRedUtils.setTaskPlan( currAliasId, currTopOp, (MapredWork) mapTask.getWork(), false, ctx); } // mapTask and currTask should be merged by a join/union operator // (e.g., GenMRUnion1) which has multiple topOps. assert mapTask == currTask : "mapTask.id = " + mapTask.getId() + "; currTask.id = " + currTask.getId(); } return dest; } UnionOperator currUnionOp = ctx.getCurrUnionOp(); if (currUnionOp != null) { opTaskMap.put(null, currTask); GenMapRedUtils.initUnionPlan(ctx, currTask, false); return dest; } AbstractMapJoinOperator<? extends MapJoinDesc> currMapJoinOp = ctx.getCurrMapJoinOp(); if (currMapJoinOp != null) { opTaskMap.put(null, currTask); GenMRMapJoinCtx mjCtx = ctx.getMapJoinCtx(currMapJoinOp); MapredWork plan = (MapredWork) currTask.getWork(); String taskTmpDir = mjCtx.getTaskTmpDir(); TableDesc tt_desc = mjCtx.getTTDesc(); assert plan.getPathToAliases().get(taskTmpDir) == null; plan.getPathToAliases().put(taskTmpDir, new ArrayList<String>()); plan.getPathToAliases().get(taskTmpDir).add(taskTmpDir); plan.getPathToPartitionInfo().put(taskTmpDir, new PartitionDesc(tt_desc, null)); plan.getAliasToWork().put(taskTmpDir, mjCtx.getRootMapJoinOp()); return dest; } return dest; }
public static void mergeMapJoinUnion(UnionOperator union, GenMRProcContext ctx, int pos) throws SemanticException { ParseContext parseCtx = ctx.getParseCtx(); UnionProcContext uCtx = parseCtx.getUCtx(); UnionParseContext uPrsCtx = uCtx.getUnionParseContext(union); assert uPrsCtx != null; Task<? extends Serializable> currTask = ctx.getCurrTask(); GenMRUnionCtx uCtxTask = ctx.getUnionTask(union); Task<? extends Serializable> uTask = null; union.getParentOperators().get(pos); MapredWork uPlan = null; // union is encountered for the first time if (uCtxTask == null) { uCtxTask = new GenMRUnionCtx(); uPlan = GenMapRedUtils.getMapRedWork(parseCtx.getConf()); uTask = TaskFactory.get(uPlan, parseCtx.getConf()); uCtxTask.setUTask(uTask); ctx.setUnionTask(union, uCtxTask); } else { uTask = uCtxTask.getUTask(); uPlan = (MapredWork) uTask.getWork(); } // If there is a mapjoin at position 'pos' if (uPrsCtx.getMapJoinSubq(pos)) { GenMRMapJoinCtx mjCtx = ctx.getMapJoinCtx(ctx.getCurrMapJoinOp()); String taskTmpDir = mjCtx.getTaskTmpDir(); if (uPlan.getPathToAliases().get(taskTmpDir) == null) { uPlan.getPathToAliases().put(taskTmpDir, new ArrayList<String>()); uPlan.getPathToAliases().get(taskTmpDir).add(taskTmpDir); uPlan.getPathToPartitionInfo().put(taskTmpDir, new PartitionDesc(mjCtx.getTTDesc(), null)); uPlan.getAliasToWork().put(taskTmpDir, mjCtx.getRootMapJoinOp()); } for (Task t : currTask.getParentTasks()) { t.addDependentTask(uTask); } try { boolean notDone = true; while (notDone) { for (Task t : currTask.getParentTasks()) { t.removeDependentTask(currTask); } notDone = false; } } catch (ConcurrentModificationException e) { } } else { setTaskPlan(ctx.getCurrAliasId(), ctx.getCurrTopOp(), uPlan, false, ctx); } ctx.setCurrTask(uTask); ctx.setCurrAliasId(null); ctx.setCurrTopOp(null); ctx.setCurrMapJoinOp(null); ctx.getMapCurrCtx().put(union, new GenMapRedCtx(ctx.getCurrTask(), null, null)); }
@SuppressWarnings("nls") /** * Merge the tasks - by creating a temporary file between them. * * @param op reduce sink operator being processed * @param oldTask the parent task * @param task the child task * @param opProcCtx context * @param setReducer does the reducer needs to be set * @param pos position of the parent */ public static void splitTasks( Operator<? extends Serializable> op, Task<? extends Serializable> parentTask, Task<? extends Serializable> childTask, GenMRProcContext opProcCtx, boolean setReducer, boolean local, int posn) throws SemanticException { childTask.getWork(); Operator<? extends Serializable> currTopOp = opProcCtx.getCurrTopOp(); ParseContext parseCtx = opProcCtx.getParseCtx(); parentTask.addDependentTask(childTask); // Root Task cannot depend on any other task, therefore childTask cannot be // a root Task List<Task<? extends Serializable>> rootTasks = opProcCtx.getRootTasks(); if (rootTasks.contains(childTask)) { rootTasks.remove(childTask); } // generate the temporary file Context baseCtx = parseCtx.getContext(); String taskTmpDir = baseCtx.getMRTmpFileURI(); Operator<? extends Serializable> parent = op.getParentOperators().get(posn); TableDesc tt_desc = PlanUtils.getIntermediateFileTableDesc( PlanUtils.getFieldSchemasFromRowSchema(parent.getSchema(), "temporarycol")); // Create a file sink operator for this file name boolean compressIntermediate = parseCtx.getConf().getBoolVar(HiveConf.ConfVars.COMPRESSINTERMEDIATE); FileSinkDesc desc = new FileSinkDesc(taskTmpDir, tt_desc, compressIntermediate); if (compressIntermediate) { desc.setCompressCodec(parseCtx.getConf().getVar(HiveConf.ConfVars.COMPRESSINTERMEDIATECODEC)); desc.setCompressType(parseCtx.getConf().getVar(HiveConf.ConfVars.COMPRESSINTERMEDIATETYPE)); } Operator<? extends Serializable> fs_op = putOpInsertMap(OperatorFactory.get(desc, parent.getSchema()), null, parseCtx); // replace the reduce child with this operator List<Operator<? extends Serializable>> childOpList = parent.getChildOperators(); for (int pos = 0; pos < childOpList.size(); pos++) { if (childOpList.get(pos) == op) { childOpList.set(pos, fs_op); break; } } List<Operator<? extends Serializable>> parentOpList = new ArrayList<Operator<? extends Serializable>>(); parentOpList.add(parent); fs_op.setParentOperators(parentOpList); // create a dummy tableScan operator on top of op // TableScanOperator is implicitly created here for each MapOperator RowResolver rowResolver = opProcCtx.getParseCtx().getOpParseCtx().get(parent).getRowResolver(); Operator<? extends Serializable> ts_op = putOpInsertMap( OperatorFactory.get(TableScanDesc.class, parent.getSchema()), rowResolver, parseCtx); childOpList = new ArrayList<Operator<? extends Serializable>>(); childOpList.add(op); ts_op.setChildOperators(childOpList); op.getParentOperators().set(posn, ts_op); Map<Operator<? extends Serializable>, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx(); mapCurrCtx.put(ts_op, new GenMapRedCtx(childTask, null, null)); String streamDesc = taskTmpDir; MapredWork cplan = (MapredWork) childTask.getWork(); if (setReducer) { Operator<? 
extends Serializable> reducer = op.getChildOperators().get(0); if (reducer.getClass() == JoinOperator.class) { String origStreamDesc; streamDesc = "$INTNAME"; origStreamDesc = streamDesc; int pos = 0; while (cplan.getAliasToWork().get(streamDesc) != null) { streamDesc = origStreamDesc.concat(String.valueOf(++pos)); } } // TODO: Allocate work to remove the temporary files and make that // dependent on the redTask if (reducer.getClass() == JoinOperator.class) { cplan.setNeedsTagging(true); } } // Add the path to alias mapping setTaskPlan(taskTmpDir, streamDesc, ts_op, cplan, local, tt_desc); // This can be cleaned up as a function table in future if (op instanceof AbstractMapJoinOperator<?>) { AbstractMapJoinOperator<? extends MapJoinDesc> mjOp = (AbstractMapJoinOperator<? extends MapJoinDesc>) op; opProcCtx.setCurrMapJoinOp(mjOp); GenMRMapJoinCtx mjCtx = opProcCtx.getMapJoinCtx(mjOp); if (mjCtx == null) { mjCtx = new GenMRMapJoinCtx(taskTmpDir, tt_desc, ts_op, null); } else { mjCtx.setTaskTmpDir(taskTmpDir); mjCtx.setTTDesc(tt_desc); mjCtx.setRootMapJoinOp(ts_op); } opProcCtx.setMapJoinCtx(mjOp, mjCtx); opProcCtx.getMapCurrCtx().put(parent, new GenMapRedCtx(childTask, null, null)); setupBucketMapJoinInfo(cplan, mjOp, false); } currTopOp = null; String currAliasId = null; opProcCtx.setCurrTopOp(currTopOp); opProcCtx.setCurrAliasId(currAliasId); opProcCtx.setCurrTask(childTask); }
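// Sketch of the alias-collision handling inside splitTasks: when the reducer is a join, the
// intermediate stream is named "$INTNAME", and a numeric suffix is appended until the name is
// free in aliasToWork. Plain strings stand in for the operator values in the real map.
import java.util.HashMap;
import java.util.Map;

public class StreamAliasSketch {
  static String uniqueStreamName(Map<String, String> aliasToWork, String base) {
    String name = base;
    int pos = 0;
    while (aliasToWork.get(name) != null) {        // keep appending 1, 2, ... until unused
      name = base.concat(String.valueOf(++pos));
    }
    return name;
  }

  public static void main(String[] args) {
    Map<String, String> aliasToWork = new HashMap<String, String>();
    aliasToWork.put("$INTNAME", "existing-branch");
    System.out.println(uniqueStreamName(aliasToWork, "$INTNAME")); // $INTNAME1
  }
}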
/** * Merge the current task with the task for the current reducer. * * @param op operator being processed * @param oldTask the old task for the current reducer * @param task the current task for the current reducer * @param opProcCtx processing context * @param pos position of the parent in the stack */ public static void joinPlan( Operator<? extends Serializable> op, Task<? extends Serializable> oldTask, Task<? extends Serializable> task, GenMRProcContext opProcCtx, int pos, boolean split, boolean readMapJoinData, boolean readUnionData, boolean createLocalWork) throws SemanticException { Task<? extends Serializable> currTask = task; MapredWork plan = (MapredWork) currTask.getWork(); Operator<? extends Serializable> currTopOp = opProcCtx.getCurrTopOp(); List<Task<? extends Serializable>> parTasks = null; // terminate the old task and make current task dependent on it if (split) { assert oldTask != null; splitTasks(op, oldTask, currTask, opProcCtx, true, false, 0); } else { if ((oldTask != null) && (oldTask.getParentTasks() != null) && !oldTask.getParentTasks().isEmpty()) { parTasks = new ArrayList<Task<? extends Serializable>>(); parTasks.addAll(oldTask.getParentTasks()); Object[] parTaskArr = parTasks.toArray(); for (Object element : parTaskArr) { ((Task<? extends Serializable>) element).removeDependentTask(oldTask); } } } if (currTopOp != null) { List<Operator<? extends Serializable>> seenOps = opProcCtx.getSeenOps(); String currAliasId = opProcCtx.getCurrAliasId(); if (!seenOps.contains(currTopOp)) { seenOps.add(currTopOp); boolean local = false; if (pos != -1) { local = (pos == ((MapJoinDesc) op.getConf()).getPosBigTable()) ? false : true; } setTaskPlan(currAliasId, currTopOp, plan, local, opProcCtx); if (op instanceof AbstractMapJoinOperator) { setupBucketMapJoinInfo( plan, (AbstractMapJoinOperator<? extends MapJoinDesc>) op, createLocalWork); } } currTopOp = null; opProcCtx.setCurrTopOp(currTopOp); } else if (opProcCtx.getCurrMapJoinOp() != null) { AbstractMapJoinOperator<? extends MapJoinDesc> mjOp = opProcCtx.getCurrMapJoinOp(); if (readUnionData) { initUnionPlan(opProcCtx, currTask, false); } else { GenMRMapJoinCtx mjCtx = opProcCtx.getMapJoinCtx(mjOp); // In case of map-join followed by map-join, the file needs to be // obtained from the old map join AbstractMapJoinOperator<? extends MapJoinDesc> oldMapJoin = mjCtx.getOldMapJoin(); String taskTmpDir = null; TableDesc tt_desc = null; Operator<? extends Serializable> rootOp = null; boolean local = ((pos == -1) || (pos == (mjOp.getConf()).getPosBigTable())) ? false : true; if (oldMapJoin == null) { if (opProcCtx.getParseCtx().getListMapJoinOpsNoReducer().contains(mjOp) || local || (oldTask != null) && (parTasks != null)) { taskTmpDir = mjCtx.getTaskTmpDir(); tt_desc = mjCtx.getTTDesc(); rootOp = mjCtx.getRootMapJoinOp(); } } else { GenMRMapJoinCtx oldMjCtx = opProcCtx.getMapJoinCtx(oldMapJoin); assert oldMjCtx != null; taskTmpDir = oldMjCtx.getTaskTmpDir(); tt_desc = oldMjCtx.getTTDesc(); rootOp = oldMjCtx.getRootMapJoinOp(); } setTaskPlan(taskTmpDir, taskTmpDir, rootOp, plan, local, tt_desc); setupBucketMapJoinInfo(plan, oldMapJoin, createLocalWork); } opProcCtx.setCurrMapJoinOp(null); if ((oldTask != null) && (parTasks != null)) { for (Task<? extends Serializable> parTask : parTasks) { parTask.addDependentTask(currTask); if (opProcCtx.getRootTasks().contains(currTask)) { opProcCtx.getRootTasks().remove(currTask); } } } } opProcCtx.setCurrTask(currTask); }
private void outputPlan(Task<? extends Serializable> task) { if (task == null) return; out.printf("Stage: %s\n", task.getId()); // real output Serializable work = task.getWork(); if (work == null) return; if (work instanceof FetchWork) { out.println("Fetch"); output(((FetchWork) work).getSource()); } else if (work instanceof MapredLocalWork) { out.println("MapredLocalWork"); // fetch try { out.println("Fetch Part"); Collection<FetchWork> fetchWorkCollect = ((MapredLocalWork) work).getAliasToFetchWork().values(); for (FetchWork f : fetchWorkCollect) { output(f.getSource()); } } catch (Exception e) { out.println("Exception 1"); } // others try { out.println("Other Parts"); Collection<Operator<? extends OperatorDesc>> collect = ((MapredLocalWork) work).getAliasToWork().values(); for (Operator<? extends OperatorDesc> c : collect) { output(c); } } catch (Exception e) { out.println("Exception 2"); } } else if (work instanceof MapredWork) { out.println("MapredWork"); try { Collection<Operator<? extends OperatorDesc>> collect = ((MapredWork) work).getAllOperators(); for (Operator<? extends OperatorDesc> c : collect) { // out.println(1); output(c); break; // the first operator will give out all the info } } catch (Exception e) { out.println("Exception 3"); } } else { output(work); } // -------other cases-------------------- if (task instanceof ConditionalTask && ((ConditionalTask) task).getListTasks() != null) { for (Task<? extends Serializable> con : ((ConditionalTask) task).getListTasks()) { outputPlan(con); } } if (task.getChildTasks() != null) { for (Task<? extends Serializable> child : task.getChildTasks()) { outputPlan(child); } } }
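// Self-contained sketch (a toy task node, not Hive's Task/ConditionalTask classes) of the
// traversal order in outputPlan above: print the current stage, then recurse into any
// conditional sub-tasks, then into the child tasks.
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class OutputPlanSketch {
  static class ToyTask {
    final String id;
    List<ToyTask> listTasks = Collections.emptyList();  // conditional sub-tasks
    List<ToyTask> childTasks = Collections.emptyList(); // dependent tasks
    ToyTask(String id) { this.id = id; }
  }

  static void outputPlan(ToyTask task) {
    if (task == null) return;
    System.out.printf("Stage: %s%n", task.id);
    for (ToyTask sub : task.listTasks) outputPlan(sub);
    for (ToyTask child : task.childTasks) outputPlan(child);
  }

  public static void main(String[] args) {
    ToyTask root = new ToyTask("Stage-1");
    root.childTasks = new ArrayList<ToyTask>();
    root.childTasks.add(new ToyTask("Stage-2"));
    outputPlan(root); // prints Stage-1, then Stage-2
  }
}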
/** Post analyze hook that invokes hive auth bindings */ @Override public void postAnalyze( HiveSemanticAnalyzerHookContext context, List<Task<? extends Serializable>> rootTasks) throws SemanticException { HiveOperation stmtOperation = getCurrentHiveStmtOp(); HiveAuthzPrivileges stmtAuthObject; stmtAuthObject = HiveAuthzPrivilegesMap.getHiveAuthzPrivileges(stmtOperation); // must occur above the null check on stmtAuthObject // since GRANT/REVOKE/etc are not authorized by binding layer at present Subject subject = getCurrentSubject(context); Set<String> subjectGroups = hiveAuthzBinding.getGroups(subject); for (Task<? extends Serializable> task : rootTasks) { if (task instanceof SentryGrantRevokeTask) { SentryGrantRevokeTask sentryTask = (SentryGrantRevokeTask) task; sentryTask.setHiveAuthzBinding(hiveAuthzBinding); sentryTask.setAuthzConf(authzConf); sentryTask.setSubject(subject); sentryTask.setSubjectGroups(subjectGroups); sentryTask.setIpAddress(context.getIpAddress()); sentryTask.setOperation(stmtOperation); } } try { if (stmtAuthObject == null) { // We don't handle authorizing this statement return; } /** * Replace DDLTask with the SentryFilterDDLTask for protection; for example, "show columns" * should only show columns that the user can access. SENTRY-847 */ for (int i = 0; i < rootTasks.size(); i++) { Task<? extends Serializable> task = rootTasks.get(i); if (task instanceof DDLTask) { SentryFilterDDLTask filterTask = new SentryFilterDDLTask(hiveAuthzBinding, subject, stmtOperation); filterTask.setWork((DDLWork) task.getWork()); rootTasks.set(i, filterTask); } } authorizeWithHiveBindings(context, stmtAuthObject, stmtOperation); } catch (AuthorizationException e) { executeOnFailureHooks(context, stmtOperation, e); String permsRequired = ""; for (String perm : hiveAuthzBinding.getLastQueryPrivilegeErrors()) { permsRequired += perm + ";"; } SessionState.get().getConf().set(HiveAuthzConf.HIVE_SENTRY_AUTH_ERRORS, permsRequired); String msgForLog = HiveAuthzConf.HIVE_SENTRY_PRIVILEGE_ERROR_MESSAGE + "\n Required privileges for this query: " + permsRequired; String msgForConsole = HiveAuthzConf.HIVE_SENTRY_PRIVILEGE_ERROR_MESSAGE + "\n " + e.getMessage() + "\n The required privileges: " + permsRequired; // AuthorizationException is not a real exception, use the info level to record this. LOG.info(msgForLog); throw new SemanticException(msgForConsole, e); } finally { hiveAuthzBinding.close(); } if ("true" .equalsIgnoreCase(context.getConf().get(HiveAuthzConf.HIVE_SENTRY_MOCK_COMPILATION))) { throw new SemanticException( HiveAuthzConf.HIVE_SENTRY_MOCK_ERROR + " Mock query compilation aborted. Set " + HiveAuthzConf.HIVE_SENTRY_MOCK_COMPILATION + " to 'false' for normal query processing"); } }
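// Simplified sketch (strings instead of Hive Task objects) of the SENTRY-847 loop above: each
// DDL entry in rootTasks is replaced in place by a filtering wrapper, keeping its position so
// the rest of the plan is untouched.
import java.util.ArrayList;
import java.util.List;

public class FilterDdlTaskSketch {
  public static void main(String[] args) {
    List<String> rootTasks = new ArrayList<String>();
    rootTasks.add("DDLTask:show columns");
    rootTasks.add("MapRedTask:select");
    for (int i = 0; i < rootTasks.size(); i++) {
      if (rootTasks.get(i).startsWith("DDLTask")) {
        rootTasks.set(i, "SentryFilterDDLTask(" + rootTasks.get(i) + ")"); // wrap, same slot
      }
    }
    System.out.println(rootTasks); // [SentryFilterDDLTask(DDLTask:show columns), MapRedTask:select]
  }
}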
/** * Initialize the current plan by adding it to root tasks. * * @param op the map join operator encountered * @param opProcCtx processing context * @param pos position of the parent */ public static void initMapJoinPlan( Operator<? extends Serializable> op, GenMRProcContext opProcCtx, boolean readInputMapJoin, boolean readInputUnion, boolean setReducer, int pos, boolean createLocalPlan) throws SemanticException { Map<Operator<? extends Serializable>, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx(); assert (((pos == -1) && (readInputMapJoin)) || (pos != -1)); int parentPos = (pos == -1) ? 0 : pos; GenMapRedCtx mapredCtx = mapCurrCtx.get(op.getParentOperators().get(parentPos)); Task<? extends Serializable> currTask = mapredCtx.getCurrTask(); MapredWork plan = (MapredWork) currTask.getWork(); HashMap<Operator<? extends Serializable>, Task<? extends Serializable>> opTaskMap = opProcCtx.getOpTaskMap(); Operator<? extends Serializable> currTopOp = opProcCtx.getCurrTopOp(); // The mapjoin has already been encountered. Some context must be stored // about that if (readInputMapJoin) { AbstractMapJoinOperator<? extends MapJoinDesc> currMapJoinOp = opProcCtx.getCurrMapJoinOp(); assert currMapJoinOp != null; boolean local = ((pos == -1) || (pos == (currMapJoinOp.getConf()).getPosBigTable())) ? false : true; if (setReducer) { Operator<? extends Serializable> reducer = op.getChildOperators().get(0); plan.setReducer(reducer); opTaskMap.put(reducer, currTask); if (reducer.getClass() == JoinOperator.class) { plan.setNeedsTagging(true); } ReduceSinkDesc desc = (ReduceSinkDesc) op.getConf(); plan.setNumReduceTasks(desc.getNumReducers()); } else { opTaskMap.put(op, currTask); } if (!readInputUnion) { GenMRMapJoinCtx mjCtx = opProcCtx.getMapJoinCtx(currMapJoinOp); String taskTmpDir; TableDesc tt_desc; Operator<? extends Serializable> rootOp; if (mjCtx.getOldMapJoin() == null || setReducer) { taskTmpDir = mjCtx.getTaskTmpDir(); tt_desc = mjCtx.getTTDesc(); rootOp = mjCtx.getRootMapJoinOp(); } else { GenMRMapJoinCtx oldMjCtx = opProcCtx.getMapJoinCtx(mjCtx.getOldMapJoin()); taskTmpDir = oldMjCtx.getTaskTmpDir(); tt_desc = oldMjCtx.getTTDesc(); rootOp = oldMjCtx.getRootMapJoinOp(); } setTaskPlan(taskTmpDir, taskTmpDir, rootOp, plan, local, tt_desc); setupBucketMapJoinInfo(plan, currMapJoinOp, createLocalPlan); } else { initUnionPlan(opProcCtx, currTask, false); } opProcCtx.setCurrMapJoinOp(null); } else { MapJoinDesc desc = (MapJoinDesc) op.getConf(); // The map is overloaded to keep track of mapjoins also opTaskMap.put(op, currTask); List<Task<? extends Serializable>> rootTasks = opProcCtx.getRootTasks(); rootTasks.add(currTask); assert currTopOp != null; List<Operator<? extends Serializable>> seenOps = opProcCtx.getSeenOps(); String currAliasId = opProcCtx.getCurrAliasId(); seenOps.add(currTopOp); boolean local = (pos == desc.getPosBigTable()) ? false : true; setTaskPlan(currAliasId, currTopOp, plan, local, opProcCtx); setupBucketMapJoinInfo( plan, (AbstractMapJoinOperator<? extends MapJoinDesc>) op, createLocalPlan); } opProcCtx.setCurrTask(currTask); opProcCtx.setCurrTopOp(null); opProcCtx.setCurrAliasId(null); }
/** * File Sink Operator encountered. * * @param nd the file sink operator encountered * @param opProcCtx context */ public Object process( Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException { GenMRProcContext ctx = (GenMRProcContext) opProcCtx; ParseContext parseCtx = ctx.getParseCtx(); boolean chDir = false; Task<? extends Serializable> currTask = ctx.getCurrTask(); FileSinkOperator fsOp = (FileSinkOperator) nd; boolean isInsertTable = // is INSERT OVERWRITE TABLE fsOp.getConf().getTableInfo().getTableName() != null && parseCtx.getQB().getParseInfo().isInsertToTable(); HiveConf hconf = parseCtx.getConf(); // Mark this task as a final map reduce task (ignoring the optional merge task) ((MapredWork) currTask.getWork()).setFinalMapRed(true); // If this file sink desc has been processed due to a linked file sink desc, // use that task Map<FileSinkDesc, Task<? extends Serializable>> fileSinkDescs = ctx.getLinkedFileDescTasks(); if (fileSinkDescs != null) { Task<? extends Serializable> childTask = fileSinkDescs.get(fsOp.getConf()); processLinkedFileDesc(ctx, childTask); return null; } // Has the user enabled merging of files for map-only jobs or for all jobs if ((ctx.getMvTask() != null) && (!ctx.getMvTask().isEmpty())) { List<Task<MoveWork>> mvTasks = ctx.getMvTask(); // In case of unions or map-joins, it is possible that the file has // already been seen. // So, no need to attempt to merge the files again. if ((ctx.getSeenFileSinkOps() == null) || (!ctx.getSeenFileSinkOps().contains(nd))) { // no need of merging if the move is to a local file system MoveTask mvTask = (MoveTask) findMoveTask(mvTasks, fsOp); if (isInsertTable && hconf.getBoolVar(ConfVars.HIVESTATSAUTOGATHER)) { addStatsTask(fsOp, mvTask, currTask, parseCtx.getConf()); } if ((mvTask != null) && !mvTask.isLocal() && fsOp.getConf().canBeMerged()) { if (fsOp.getConf().isLinkedFileSink()) { // If the user has HIVEMERGEMAPREDFILES set to false, the assumption was that the // number of reducers is small, so the number of files is small anyway. // However, with this optimization, we are increasing the number of files // possibly by a big margin. So, merge aggressively. if (hconf.getBoolVar(ConfVars.HIVEMERGEMAPFILES) || hconf.getBoolVar(ConfVars.HIVEMERGEMAPREDFILES)) { chDir = true; } } else { // There are separate configuration parameters to control whether to // merge for a map-only job // or for a map-reduce job MapredWork currWork = (MapredWork) currTask.getWork(); boolean mergeMapOnly = hconf.getBoolVar(ConfVars.HIVEMERGEMAPFILES) && currWork.getReducer() == null; boolean mergeMapRed = hconf.getBoolVar(ConfVars.HIVEMERGEMAPREDFILES) && currWork.getReducer() != null; if (mergeMapOnly || mergeMapRed) { chDir = true; } } } } } String finalName = processFS(fsOp, stack, opProcCtx, chDir); if (chDir) { // Merge the files in the destination table/partitions by creating a map-only merge job. // If the underlying data is RCFile or OrcFile, a BlockMerge task will be created. LOG.info("using CombineHiveInputformat for the merge job"); createMRWorkForMergingFiles(fsOp, ctx, finalName); } FileSinkDesc fileSinkDesc = fsOp.getConf(); if (fileSinkDesc.isLinkedFileSink()) { Map<FileSinkDesc, Task<? extends Serializable>> linkedFileDescTasks = ctx.getLinkedFileDescTasks(); if (linkedFileDescTasks == null) { linkedFileDescTasks = new HashMap<FileSinkDesc, Task<? extends Serializable>>(); ctx.setLinkedFileDescTasks(linkedFileDescTasks); } // The child tasks may be null in case of a select if ((currTask.getChildTasks() != null) && (currTask.getChildTasks().size() == 1)) { for (FileSinkDesc fileDesc : fileSinkDesc.getLinkedFileSinkDesc()) { linkedFileDescTasks.put(fileDesc, currTask.getChildTasks().get(0)); } } } return null; }
/** * Process the FileSink operator to generate a MoveTask if necessary. * * @param fsOp current FileSink operator * @param stack parent operators * @param opProcCtx * @param chDir whether the operator should be first output to a tmp dir and then merged to the * final dir later * @return the final file name to which the FileSinkOperator should store. * @throws SemanticException */ private String processFS( FileSinkOperator fsOp, Stack<Node> stack, NodeProcessorCtx opProcCtx, boolean chDir) throws SemanticException { GenMRProcContext ctx = (GenMRProcContext) opProcCtx; List<FileSinkOperator> seenFSOps = ctx.getSeenFileSinkOps(); if (seenFSOps == null) { seenFSOps = new ArrayList<FileSinkOperator>(); } if (!seenFSOps.contains(fsOp)) { seenFSOps.add(fsOp); } ctx.setSeenFileSinkOps(seenFSOps); Task<? extends Serializable> currTask = ctx.getCurrTask(); // If the directory needs to be changed, send the new directory String dest = null; if (chDir) { dest = fsOp.getConf().getFinalDirName(); // generate the temporary file // it must be on the same file system as the current destination ParseContext parseCtx = ctx.getParseCtx(); Context baseCtx = parseCtx.getContext(); String tmpDir = baseCtx.getExternalTmpFileURI((new Path(dest)).toUri()); FileSinkDesc fileSinkDesc = fsOp.getConf(); // Change all the linked file sink descriptors if (fileSinkDesc.isLinkedFileSink()) { for (FileSinkDesc fsConf : fileSinkDesc.getLinkedFileSinkDesc()) { String fileName = Utilities.getFileNameFromDirName(fsConf.getDirName()); fsConf.setParentDir(tmpDir); fsConf.setDirName(tmpDir + Path.SEPARATOR + fileName); } } else { fileSinkDesc.setDirName(tmpDir); } } Task<MoveWork> mvTask = null; if (!chDir) { mvTask = findMoveTask(ctx.getMvTask(), fsOp); } Operator<? extends OperatorDesc> currTopOp = ctx.getCurrTopOp(); String currAliasId = ctx.getCurrAliasId(); HashMap<Operator<? extends OperatorDesc>, Task<? extends Serializable>> opTaskMap = ctx.getOpTaskMap(); List<Operator<? extends OperatorDesc>> seenOps = ctx.getSeenOps(); List<Task<? extends Serializable>> rootTasks = ctx.getRootTasks(); // Set the move task to be dependent on the current task if (mvTask != null) { addDependentMoveTasks(ctx, mvTask, currTask); } // In case of multi-table insert, the path to alias mapping is needed for // all the sources. Since there is no // reducer, treat it as a plan with null reducer // If it is a map-only job, the task needs to be processed if (currTopOp != null) { Task<? extends Serializable> mapTask = opTaskMap.get(null); if (mapTask == null) { if (!seenOps.contains(currTopOp)) { seenOps.add(currTopOp); GenMapRedUtils.setTaskPlan( currAliasId, currTopOp, (MapredWork) currTask.getWork(), false, ctx); } opTaskMap.put(null, currTask); if (!rootTasks.contains(currTask) && (currTask.getParentTasks() == null || currTask.getParentTasks().isEmpty())) { rootTasks.add(currTask); } } else { if (!seenOps.contains(currTopOp)) { seenOps.add(currTopOp); GenMapRedUtils.setTaskPlan( currAliasId, currTopOp, (MapredWork) mapTask.getWork(), false, ctx); } else { UnionOperator currUnionOp = ctx.getCurrUnionOp(); if (currUnionOp != null) { opTaskMap.put(null, currTask); ctx.setCurrTopOp(null); GenMapRedUtils.initUnionPlan(ctx, currUnionOp, currTask, false); return dest; } } // mapTask and currTask should be merged by a join/union operator // (e.g., GenMRUnion1) which has multiple topOps. // assert mapTask == currTask : "mapTask.id = " + mapTask.getId() // + "; currTask.id = " + currTask.getId(); } return dest; } UnionOperator currUnionOp = ctx.getCurrUnionOp(); if (currUnionOp != null) { opTaskMap.put(null, currTask); GenMapRedUtils.initUnionPlan(ctx, currUnionOp, currTask, false); return dest; } return dest; }