Example 1
  /**
   * Construct a conditional task given the current leaf task, the MoveWork and the MapredWork.
   *
   * @param conf HiveConf
   * @param currTask current leaf task
   * @param mvWork MoveWork for the move task
   * @param mergeWork MapredWork for the merge task.
   * @param inputPath the input directory of the merge/move task
   * @return The conditional task
   */
  private ConditionalTask createCondTask(
      HiveConf conf,
      Task<? extends Serializable> currTask,
      MoveWork mvWork,
      MapredWork mergeWork,
      String inputPath) {

    // There are 3 options for this ConditionalTask:
    // 1) Merge the partitions
    // 2) Move the partitions (i.e. don't merge the partitions)
    // 3) Merge some partitions and move the others (i.e. merge some partitions and don't
    // merge others); in this case the merge is done first, followed by the move, to prevent
    // conflicts.
    Task<? extends Serializable> mergeOnlyMergeTask = TaskFactory.get(mergeWork, conf);
    Task<? extends Serializable> moveOnlyMoveTask = TaskFactory.get(mvWork, conf);
    Task<? extends Serializable> mergeAndMoveMergeTask = TaskFactory.get(mergeWork, conf);
    Task<? extends Serializable> mergeAndMoveMoveTask = TaskFactory.get(mvWork, conf);

    // NOTE! It is necessary that the merge task be the parent of the move task, and not
    // the other way around, for the proper execution of the execute method of
    // ConditionalTask
    mergeAndMoveMergeTask.addDependentTask(mergeAndMoveMoveTask);

    List<Serializable> listWorks = new ArrayList<Serializable>();
    listWorks.add(mvWork);
    listWorks.add(mergeWork);

    ConditionalWork cndWork = new ConditionalWork(listWorks);

    List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>();
    listTasks.add(moveOnlyMoveTask);
    listTasks.add(mergeOnlyMergeTask);
    listTasks.add(mergeAndMoveMergeTask);

    ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork, conf);
    cndTsk.setListTasks(listTasks);

    // create resolver
    cndTsk.setResolver(new ConditionalResolverMergeFiles());
    ConditionalResolverMergeFilesCtx mrCtx =
        new ConditionalResolverMergeFilesCtx(listTasks, inputPath);
    cndTsk.setResolverCtx(mrCtx);

    // make the conditional task a child of the current leaf task
    currTask.addDependentTask(cndTsk);

    return cndTsk;
  }
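A hedged usage sketch (the caller context and followUpTask below are placeholders, not part of the original class): once the conditional task is built, any task that must run regardless of which option the resolver picks is typically chained to every branch, exactly as LinkMoveTask does in Example 7.

    // Hypothetical caller fragment: build the conditional task and attach a
    // follow-up task to every branch so it runs whichever option is chosen.
    ConditionalTask cndTsk = createCondTask(conf, currTask, mvWork, mergeWork, inputPath);
    for (Task<? extends Serializable> branch : cndTsk.getListTasks()) {
      branch.addDependentTask(followUpTask); // followUpTask: placeholder follow-up task
    }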
Example 2
  /**
   * Construct a conditional task given the current leaf task, the MoveWork and the MapredWork.
   *
   * @param conf HiveConf
   * @param currTask current leaf task
   * @param mvWork MoveWork for the move task
   * @param mergeWork MapredWork for the merge task.
   * @param inputPath the input directory of the merge/move task
   * @return The conditional task
   */
  private ConditionalTask createCondTask(
      HiveConf conf,
      Task<? extends Serializable> currTask,
      MoveWork mvWork,
      MapredWork mergeWork,
      String inputPath) {

    Task<? extends Serializable> mergeTask = TaskFactory.get(mergeWork, conf);
    Task<? extends Serializable> moveTask = TaskFactory.get(mvWork, conf);
    List<Serializable> listWorks = new ArrayList<Serializable>();
    listWorks.add(mvWork);
    listWorks.add(mergeWork);

    ConditionalWork cndWork = new ConditionalWork(listWorks);

    List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>();
    listTasks.add(moveTask);
    listTasks.add(mergeTask);

    ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork, conf);
    cndTsk.setListTasks(listTasks);

    // create resolver
    cndTsk.setResolver(new ConditionalResolverMergeFiles());
    ConditionalResolverMergeFilesCtx mrCtx =
        new ConditionalResolverMergeFilesCtx(listTasks, inputPath);
    cndTsk.setResolverCtx(mrCtx);

    // make the conditional task a child of the current leaf task
    currTask.addDependentTask(cndTsk);

    return cndTsk;
  }
Example 3
  /**
   * Multiple file sink descriptors are linked.
   * Use the task created by the first linked file sink descriptor.
   */
  private void processLinkedFileDesc(GenMRProcContext ctx, Task<? extends Serializable> childTask)
      throws SemanticException {
    Operator<? extends OperatorDesc> currTopOp = ctx.getCurrTopOp();
    String currAliasId = ctx.getCurrAliasId();
    List<Operator<? extends OperatorDesc>> seenOps = ctx.getSeenOps();
    List<Task<? extends Serializable>> rootTasks = ctx.getRootTasks();
    Task<? extends Serializable> currTask = ctx.getCurrTask();

    if (currTopOp != null) {
      if (!seenOps.contains(currTopOp)) {
        seenOps.add(currTopOp);
        GenMapRedUtils.setTaskPlan(
            currAliasId, currTopOp, (MapredWork) currTask.getWork(), false, ctx);
      }

      if (!rootTasks.contains(currTask)
          && (currTask.getParentTasks() == null || currTask.getParentTasks().isEmpty())) {
        rootTasks.add(currTask);
      }
    }

    if (childTask != null) {
      currTask.addDependentTask(childTask);
    }
  }
Example 4
  /**
   * Handle the partial scan command.
   *
   * <p>It is composed of a PartialScanTask followed by a StatsTask.
   */
  private void handlePartialScanCommand(
      TableScanOperator tableScan,
      ParseContext parseContext,
      StatsWork statsWork,
      GenTezProcContext context,
      Task<StatsWork> statsTask)
      throws SemanticException {

    String aggregationKey = tableScan.getConf().getStatsAggPrefix();
    StringBuilder aggregationKeyBuffer = new StringBuilder(aggregationKey);
    List<Path> inputPaths =
        GenMapRedUtils.getInputPathsForPartialScan(tableScan, aggregationKeyBuffer);
    aggregationKey = aggregationKeyBuffer.toString();

    // scan work
    PartialScanWork scanWork = new PartialScanWork(inputPaths);
    scanWork.setMapperCannotSpanPartns(true);
    scanWork.setAggKey(aggregationKey);
    scanWork.setStatsTmpDir(tableScan.getConf().getTmpStatsDir(), parseContext.getConf());

    // stats work
    statsWork.setPartialScanAnalyzeCommand(true);

    // partial scan task
    DriverContext driverCxt = new DriverContext();
    Task<PartialScanWork> partialScanTask = TaskFactory.get(scanWork, parseContext.getConf());
    partialScanTask.initialize(parseContext.getConf(), null, driverCxt);
    partialScanTask.setWork(scanWork);
    statsWork.setSourceTask(partialScanTask);

    // task dependency
    context.rootTasks.remove(context.currentTask);
    context.rootTasks.add(partialScanTask);
    partialScanTask.addDependentTask(statsTask);
  }
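As an illustration, a hedged post-condition sketch of the task wiring this method leaves behind (getChildTasks() is assumed to exist on Task, mirroring the getParentTasks() used elsewhere in these examples):

    // After handlePartialScanCommand: the partial scan task replaces the
    // current task among the roots, and the stats task runs after it.
    assert context.rootTasks.contains(partialScanTask);
    assert !context.rootTasks.contains(context.currentTask);
    assert partialScanTask.getChildTasks().contains(statsTask); // assumed accessor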
Example 5
  /**
   * Handle the partial scan command. It is composed of a PartialScanTask followed by a StatsTask.
   *
   * @param op the table scan operator
   * @param ctx processing context
   * @param parseCtx parse context
   * @param currTask the current task
   * @param statsWork the stats work
   * @param statsTask the stats task
   * @throws SemanticException
   */
  private void handlePartialScanCommand(
      TableScanOperator op,
      GenMRProcContext ctx,
      ParseContext parseCtx,
      Task<? extends Serializable> currTask,
      StatsWork statsWork,
      Task<StatsWork> statsTask)
      throws SemanticException {
    String aggregationKey = op.getConf().getStatsAggPrefix();
    StringBuffer aggregationKeyBuffer = new StringBuffer(aggregationKey);
    List<Path> inputPaths = GenMapRedUtils.getInputPathsForPartialScan(op, aggregationKeyBuffer);
    aggregationKey = aggregationKeyBuffer.toString();

    // scan work
    PartialScanWork scanWork = new PartialScanWork(inputPaths);
    scanWork.setMapperCannotSpanPartns(true);
    scanWork.setAggKey(aggregationKey);

    // stats work
    statsWork.setPartialScanAnalyzeCommand(true);

    // partial scan task
    DriverContext driverCxt = new DriverContext();
    Task<PartialScanWork> psTask = TaskFactory.get(scanWork, parseCtx.getConf());
    psTask.initialize(parseCtx.getConf(), null, driverCxt);
    psTask.setWork(scanWork);

    // task dependency
    ctx.getRootTasks().remove(currTask);
    ctx.getRootTasks().add(psTask);
    psTask.addDependentTask(statsTask);
    List<Task<? extends Serializable>> parentTasks = new ArrayList<Task<? extends Serializable>>();
    parentTasks.add(psTask);
    statsTask.setParentTasks(parentTasks);
  }
Example 6
  /**
   * If HIVE_MULTI_INSERT_MOVE_TASKS_SHARE_DEPENDENCIES is set, adds the
   * dependencyTaskForMultiInsert in ctx as a dependent of parentTask; mvTask is then added as a
   * dependent of that dependency task when it loads a table/partition, or of parentTask when it
   * only moves files. If the flag is not set, mvTask is simply added as a dependent of parentTask.
   *
   * @param ctx processing context
   * @param mvTask the move task
   * @param parentTask the task the move task should follow
   */
  private void addDependentMoveTasks(
      GenMRProcContext ctx, Task<MoveWork> mvTask, Task<? extends Serializable> parentTask) {

    if (mvTask != null) {
      if (ctx.getConf().getBoolVar(ConfVars.HIVE_MULTI_INSERT_MOVE_TASKS_SHARE_DEPENDENCIES)) {
        DependencyCollectionTask dependencyTask = ctx.getDependencyTaskForMultiInsert();
        parentTask.addDependentTask(dependencyTask);
        if (mvTask.getWork().getLoadTableWork() != null) {
          // Moving tables/partitions depend on the dependencyTask
          dependencyTask.addDependentTask(mvTask);
        } else {
          // Moving files depends on the parentTask (we still want the dependencyTask to depend
          // on the parentTask)
          parentTask.addDependentTask(mvTask);
        }
      } else {
        parentTask.addDependentTask(mvTask);
      }
    }
  }
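A hedged sketch of the resulting dependency chain when the flag is set and the move loads a table (getChildTasks() is assumed, as in the earlier sketch):

    // Expected shape: parentTask -> dependencyTask -> mvTask
    addDependentMoveTasks(ctx, mvTask, parentTask);
    DependencyCollectionTask dep = ctx.getDependencyTaskForMultiInsert();
    assert parentTask.getChildTasks().contains(dep); // assumed accessor
    assert dep.getChildTasks().contains(mvTask);     // assumed accessor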
Example 7
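  /**
   * Make the move task associated with {@code newOutput}, if any, a dependent of every branch of
   * the given conditional task, so it runs whichever branch the resolver selects.
   */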
  private void LinkMoveTask(
      GenMRProcContext ctx, FileSinkOperator newOutput, ConditionalTask cndTsk) {

    List<Task<? extends Serializable>> mvTasks = ctx.getMvTask();
    Task<? extends Serializable> mvTask = findMoveTask(mvTasks, newOutput);

    if (mvTask != null) {
      for (Task<? extends Serializable> tsk : cndTsk.getListTasks()) {
        tsk.addDependentTask(mvTask);
      }
    }
  }
Example 8
  /**
   * Process the FileSink operator to generate a MoveTask if necessary.
   *
   * @param nd current FileSink operator
   * @param stack parent operators
   * @param opProcCtx processing context
   * @param chDir whether the operator's output should first be written to a tmp dir and then
   *     merged into the final dir later
   * @return the final destination to which the FileSinkOperator should write.
   * @throws SemanticException
   */
  private String processFS(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, boolean chDir)
      throws SemanticException {

    // Is it the dummy file sink after the mapjoin
    FileSinkOperator fsOp = (FileSinkOperator) nd;
    if ((fsOp.getParentOperators().size() == 1)
        && (fsOp.getParentOperators().get(0) instanceof MapJoinOperator)) {
      return null;
    }

    GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
    List<FileSinkOperator> seenFSOps = ctx.getSeenFileSinkOps();
    if (seenFSOps == null) {
      seenFSOps = new ArrayList<FileSinkOperator>();
    }
    if (!seenFSOps.contains(fsOp)) {
      seenFSOps.add(fsOp);
    }
    ctx.setSeenFileSinkOps(seenFSOps);

    Task<? extends Serializable> currTask = ctx.getCurrTask();

    // If the directory needs to be changed, send the new directory
    String dest = null;

    if (chDir) {
      dest = fsOp.getConf().getDirName();

      // generate the temporary file
      // it must be on the same file system as the current destination
      ParseContext parseCtx = ctx.getParseCtx();
      Context baseCtx = parseCtx.getContext();
      String tmpDir = baseCtx.getExternalTmpFileURI((new Path(dest)).toUri());

      fsOp.getConf().setDirName(tmpDir);
    }

    Task<? extends Serializable> mvTask = null;

    if (!chDir) {
      mvTask = findMoveTask(ctx.getMvTask(), fsOp);
    }

    Operator<? extends Serializable> currTopOp = ctx.getCurrTopOp();
    String currAliasId = ctx.getCurrAliasId();
    HashMap<Operator<? extends Serializable>, Task<? extends Serializable>> opTaskMap =
        ctx.getOpTaskMap();
    List<Operator<? extends Serializable>> seenOps = ctx.getSeenOps();
    List<Task<? extends Serializable>> rootTasks = ctx.getRootTasks();

    // Set the move task to be dependent on the current task
    if (mvTask != null) {
      currTask.addDependentTask(mvTask);
    }

    // In case of a multi-table insert, the path-to-alias mapping is needed for all the sources.
    // Since there is no reducer, treat it as a plan with a null reducer.
    // If it is a map-only job, the task needs to be processed.
    if (currTopOp != null) {
      Task<? extends Serializable> mapTask = opTaskMap.get(null);
      if (mapTask == null) {
        assert (!seenOps.contains(currTopOp));
        seenOps.add(currTopOp);
        GenMapRedUtils.setTaskPlan(
            currAliasId, currTopOp, (MapredWork) currTask.getWork(), false, ctx);
        opTaskMap.put(null, currTask);
        rootTasks.add(currTask);
      } else {
        if (!seenOps.contains(currTopOp)) {
          seenOps.add(currTopOp);
          GenMapRedUtils.setTaskPlan(
              currAliasId, currTopOp, (MapredWork) mapTask.getWork(), false, ctx);
        }
        // mapTask and currTask should have been merged by a join/union operator
        // (e.g., GenMRUnion1) which has multiple topOps.
        assert mapTask == currTask
            : "mapTask.id = " + mapTask.getId() + "; currTask.id = " + currTask.getId();
      }

      return dest;
    }

    UnionOperator currUnionOp = ctx.getCurrUnionOp();

    if (currUnionOp != null) {
      opTaskMap.put(null, currTask);
      GenMapRedUtils.initUnionPlan(ctx, currTask, false);
      return dest;
    }

    AbstractMapJoinOperator<? extends MapJoinDesc> currMapJoinOp = ctx.getCurrMapJoinOp();

    if (currMapJoinOp != null) {
      opTaskMap.put(null, currTask);
      GenMRMapJoinCtx mjCtx = ctx.getMapJoinCtx(currMapJoinOp);
      MapredWork plan = (MapredWork) currTask.getWork();

      String taskTmpDir = mjCtx.getTaskTmpDir();
      TableDesc tt_desc = mjCtx.getTTDesc();
      assert plan.getPathToAliases().get(taskTmpDir) == null;
      plan.getPathToAliases().put(taskTmpDir, new ArrayList<String>());
      plan.getPathToAliases().get(taskTmpDir).add(taskTmpDir);
      plan.getPathToPartitionInfo().put(taskTmpDir, new PartitionDesc(tt_desc, null));
      plan.getAliasToWork().put(taskTmpDir, mjCtx.getRootMapJoinOp());
      return dest;
    }

    return dest;
  }
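A hedged caller-side sketch of the chDir contract described in the javadoc (fsNode is a placeholder for the FileSink node being visited):

    // With chDir == true the FileSinkOperator is redirected to a temporary
    // directory on the same file system, and the original destination is
    // returned so a later merge/move task (see createCondTask above) can
    // produce the final output there.
    String finalDest = processFS(fsNode, stack, opProcCtx, true);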
Example 9
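  /**
   * Merge the plan feeding the union at position {@code pos} into the map-reduce task generated
   * for the union (creating that task on first encounter). If the input at {@code pos} is a
   * map-join sub-query, its intermediate output is wired into the union plan and the task
   * dependencies are rewired accordingly.
   */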
  public static void mergeMapJoinUnion(UnionOperator union, GenMRProcContext ctx, int pos)
      throws SemanticException {
    ParseContext parseCtx = ctx.getParseCtx();
    UnionProcContext uCtx = parseCtx.getUCtx();

    UnionParseContext uPrsCtx = uCtx.getUnionParseContext(union);
    assert uPrsCtx != null;

    Task<? extends Serializable> currTask = ctx.getCurrTask();

    GenMRUnionCtx uCtxTask = ctx.getUnionTask(union);
    Task<? extends Serializable> uTask = null;

    union.getParentOperators().get(pos);
    MapredWork uPlan = null;

    // union is encountered for the first time
    if (uCtxTask == null) {
      uCtxTask = new GenMRUnionCtx();
      uPlan = GenMapRedUtils.getMapRedWork(parseCtx.getConf());
      uTask = TaskFactory.get(uPlan, parseCtx.getConf());
      uCtxTask.setUTask(uTask);
      ctx.setUnionTask(union, uCtxTask);
    } else {
      uTask = uCtxTask.getUTask();
      uPlan = (MapredWork) uTask.getWork();
    }

    // If there is a mapjoin at position 'pos'
    if (uPrsCtx.getMapJoinSubq(pos)) {
      GenMRMapJoinCtx mjCtx = ctx.getMapJoinCtx(ctx.getCurrMapJoinOp());
      String taskTmpDir = mjCtx.getTaskTmpDir();
      if (uPlan.getPathToAliases().get(taskTmpDir) == null) {
        uPlan.getPathToAliases().put(taskTmpDir, new ArrayList<String>());
        uPlan.getPathToAliases().get(taskTmpDir).add(taskTmpDir);
        uPlan.getPathToPartitionInfo().put(taskTmpDir, new PartitionDesc(mjCtx.getTTDesc(), null));
        uPlan.getAliasToWork().put(taskTmpDir, mjCtx.getRootMapJoinOp());
      }

      for (Task t : currTask.getParentTasks()) {
        t.addDependentTask(uTask);
      }
      try {
        boolean notDone = true;
        while (notDone) {
          for (Task t : currTask.getParentTasks()) {
            t.removeDependentTask(currTask);
          }
          notDone = false;
        }
      } catch (ConcurrentModificationException e) {
      }
    } else {
      setTaskPlan(ctx.getCurrAliasId(), ctx.getCurrTopOp(), uPlan, false, ctx);
    }

    ctx.setCurrTask(uTask);
    ctx.setCurrAliasId(null);
    ctx.setCurrTopOp(null);
    ctx.setCurrMapJoinOp(null);

    ctx.getMapCurrCtx().put(union, new GenMapRedCtx(ctx.getCurrTask(), null, null));
  }
Example 10
  @SuppressWarnings("nls")
  /**
   * Merge the tasks - by creating a temporary file between them.
   *
   * @param op reduce sink operator being processed
   * @param oldTask the parent task
   * @param task the child task
   * @param opProcCtx context
   * @param setReducer does the reducer needs to be set
   * @param pos position of the parent
   */
  public static void splitTasks(
      Operator<? extends Serializable> op,
      Task<? extends Serializable> parentTask,
      Task<? extends Serializable> childTask,
      GenMRProcContext opProcCtx,
      boolean setReducer,
      boolean local,
      int posn)
      throws SemanticException {
    childTask.getWork();
    Operator<? extends Serializable> currTopOp = opProcCtx.getCurrTopOp();

    ParseContext parseCtx = opProcCtx.getParseCtx();
    parentTask.addDependentTask(childTask);

    // Root Task cannot depend on any other task, therefore childTask cannot be
    // a root Task
    List<Task<? extends Serializable>> rootTasks = opProcCtx.getRootTasks();
    if (rootTasks.contains(childTask)) {
      rootTasks.remove(childTask);
    }

    // generate the temporary file
    Context baseCtx = parseCtx.getContext();
    String taskTmpDir = baseCtx.getMRTmpFileURI();

    Operator<? extends Serializable> parent = op.getParentOperators().get(posn);
    TableDesc tt_desc =
        PlanUtils.getIntermediateFileTableDesc(
            PlanUtils.getFieldSchemasFromRowSchema(parent.getSchema(), "temporarycol"));

    // Create a file sink operator for this file name
    boolean compressIntermediate =
        parseCtx.getConf().getBoolVar(HiveConf.ConfVars.COMPRESSINTERMEDIATE);
    FileSinkDesc desc = new FileSinkDesc(taskTmpDir, tt_desc, compressIntermediate);
    if (compressIntermediate) {
      desc.setCompressCodec(parseCtx.getConf().getVar(HiveConf.ConfVars.COMPRESSINTERMEDIATECODEC));
      desc.setCompressType(parseCtx.getConf().getVar(HiveConf.ConfVars.COMPRESSINTERMEDIATETYPE));
    }
    Operator<? extends Serializable> fs_op =
        putOpInsertMap(OperatorFactory.get(desc, parent.getSchema()), null, parseCtx);

    // replace the reduce child with this operator
    List<Operator<? extends Serializable>> childOpList = parent.getChildOperators();
    for (int pos = 0; pos < childOpList.size(); pos++) {
      if (childOpList.get(pos) == op) {
        childOpList.set(pos, fs_op);
        break;
      }
    }

    List<Operator<? extends Serializable>> parentOpList =
        new ArrayList<Operator<? extends Serializable>>();
    parentOpList.add(parent);
    fs_op.setParentOperators(parentOpList);

    // create a dummy tableScan operator on top of op
    // TableScanOperator is implicitly created here for each MapOperator
    RowResolver rowResolver = opProcCtx.getParseCtx().getOpParseCtx().get(parent).getRowResolver();
    Operator<? extends Serializable> ts_op =
        putOpInsertMap(
            OperatorFactory.get(TableScanDesc.class, parent.getSchema()), rowResolver, parseCtx);

    childOpList = new ArrayList<Operator<? extends Serializable>>();
    childOpList.add(op);
    ts_op.setChildOperators(childOpList);
    op.getParentOperators().set(posn, ts_op);

    Map<Operator<? extends Serializable>, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx();
    mapCurrCtx.put(ts_op, new GenMapRedCtx(childTask, null, null));

    String streamDesc = taskTmpDir;
    MapredWork cplan = (MapredWork) childTask.getWork();

    if (setReducer) {
      Operator<? extends Serializable> reducer = op.getChildOperators().get(0);

      if (reducer.getClass() == JoinOperator.class) {
        String origStreamDesc;
        streamDesc = "$INTNAME";
        origStreamDesc = streamDesc;
        int pos = 0;
        while (cplan.getAliasToWork().get(streamDesc) != null) {
          streamDesc = origStreamDesc.concat(String.valueOf(++pos));
        }
      }

      // TODO: Allocate work to remove the temporary files and make that
      // dependent on the redTask
      if (reducer.getClass() == JoinOperator.class) {
        cplan.setNeedsTagging(true);
      }
    }

    // Add the path to alias mapping
    setTaskPlan(taskTmpDir, streamDesc, ts_op, cplan, local, tt_desc);

    // This can be cleaned up as a function table in future
    if (op instanceof AbstractMapJoinOperator<?>) {
      AbstractMapJoinOperator<? extends MapJoinDesc> mjOp =
          (AbstractMapJoinOperator<? extends MapJoinDesc>) op;
      opProcCtx.setCurrMapJoinOp(mjOp);
      GenMRMapJoinCtx mjCtx = opProcCtx.getMapJoinCtx(mjOp);
      if (mjCtx == null) {
        mjCtx = new GenMRMapJoinCtx(taskTmpDir, tt_desc, ts_op, null);
      } else {
        mjCtx.setTaskTmpDir(taskTmpDir);
        mjCtx.setTTDesc(tt_desc);
        mjCtx.setRootMapJoinOp(ts_op);
      }
      opProcCtx.setMapJoinCtx(mjOp, mjCtx);
      opProcCtx.getMapCurrCtx().put(parent, new GenMapRedCtx(childTask, null, null));
      setupBucketMapJoinInfo(cplan, mjOp, false);
    }

    currTopOp = null;
    String currAliasId = null;

    opProcCtx.setCurrTopOp(currTopOp);
    opProcCtx.setCurrAliasId(currAliasId);
    opProcCtx.setCurrTask(childTask);
  }
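For readability, a hedged post-condition sketch of the operator rewiring splitTasks performs (variable names as in the method body):

    // parentTask plan:  ... -> parent -> fs_op  (writes taskTmpDir)
    // childTask  plan:  ts_op (reads taskTmpDir) -> op -> ...
    assert parent.getChildOperators().contains(fs_op);
    assert op.getParentOperators().get(posn) == ts_op;
    assert ts_op.getChildOperators().contains(op);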
Example 11
  /**
   * Merge the current task with the task for the current reducer.
   *
   * @param op operator being processed
   * @param oldTask the old task for the current reducer
   * @param task the current task for the current reducer
   * @param opProcCtx processing context
   * @param pos position of the parent in the stack
   */
  public static void joinPlan(
      Operator<? extends Serializable> op,
      Task<? extends Serializable> oldTask,
      Task<? extends Serializable> task,
      GenMRProcContext opProcCtx,
      int pos,
      boolean split,
      boolean readMapJoinData,
      boolean readUnionData,
      boolean createLocalWork)
      throws SemanticException {
    Task<? extends Serializable> currTask = task;
    MapredWork plan = (MapredWork) currTask.getWork();
    Operator<? extends Serializable> currTopOp = opProcCtx.getCurrTopOp();
    List<Task<? extends Serializable>> parTasks = null;

    // terminate the old task and make current task dependent on it
    if (split) {
      assert oldTask != null;
      splitTasks(op, oldTask, currTask, opProcCtx, true, false, 0);
    } else {
      if ((oldTask != null)
          && (oldTask.getParentTasks() != null)
          && !oldTask.getParentTasks().isEmpty()) {
        parTasks = new ArrayList<Task<? extends Serializable>>();
        parTasks.addAll(oldTask.getParentTasks());

        Object[] parTaskArr = parTasks.toArray();
        for (Object element : parTaskArr) {
          ((Task<? extends Serializable>) element).removeDependentTask(oldTask);
        }
      }
    }

    if (currTopOp != null) {
      List<Operator<? extends Serializable>> seenOps = opProcCtx.getSeenOps();
      String currAliasId = opProcCtx.getCurrAliasId();

      if (!seenOps.contains(currTopOp)) {
        seenOps.add(currTopOp);
        boolean local = false;
        if (pos != -1) {
          local = pos != ((MapJoinDesc) op.getConf()).getPosBigTable();
        }
        setTaskPlan(currAliasId, currTopOp, plan, local, opProcCtx);
        if (op instanceof AbstractMapJoinOperator) {
          setupBucketMapJoinInfo(
              plan, (AbstractMapJoinOperator<? extends MapJoinDesc>) op, createLocalWork);
        }
      }
      currTopOp = null;
      opProcCtx.setCurrTopOp(currTopOp);
    } else if (opProcCtx.getCurrMapJoinOp() != null) {
      AbstractMapJoinOperator<? extends MapJoinDesc> mjOp = opProcCtx.getCurrMapJoinOp();
      if (readUnionData) {
        initUnionPlan(opProcCtx, currTask, false);
      } else {
        GenMRMapJoinCtx mjCtx = opProcCtx.getMapJoinCtx(mjOp);

        // In case of map-join followed by map-join, the file needs to be
        // obtained from the old map join
        AbstractMapJoinOperator<? extends MapJoinDesc> oldMapJoin = mjCtx.getOldMapJoin();
        String taskTmpDir = null;
        TableDesc tt_desc = null;
        Operator<? extends Serializable> rootOp = null;

        boolean local = (pos != -1) && (pos != mjOp.getConf().getPosBigTable());
        if (oldMapJoin == null) {
          if (opProcCtx.getParseCtx().getListMapJoinOpsNoReducer().contains(mjOp)
              || local
              || (oldTask != null) && (parTasks != null)) {
            taskTmpDir = mjCtx.getTaskTmpDir();
            tt_desc = mjCtx.getTTDesc();
            rootOp = mjCtx.getRootMapJoinOp();
          }
        } else {
          GenMRMapJoinCtx oldMjCtx = opProcCtx.getMapJoinCtx(oldMapJoin);
          assert oldMjCtx != null;
          taskTmpDir = oldMjCtx.getTaskTmpDir();
          tt_desc = oldMjCtx.getTTDesc();
          rootOp = oldMjCtx.getRootMapJoinOp();
        }

        setTaskPlan(taskTmpDir, taskTmpDir, rootOp, plan, local, tt_desc);
        setupBucketMapJoinInfo(plan, oldMapJoin, createLocalWork);
      }
      opProcCtx.setCurrMapJoinOp(null);

      if ((oldTask != null) && (parTasks != null)) {
        for (Task<? extends Serializable> parTask : parTasks) {
          parTask.addDependentTask(currTask);
          if (opProcCtx.getRootTasks().contains(currTask)) {
            opProcCtx.getRootTasks().remove(currTask);
          }
        }
      }
    }

    opProcCtx.setCurrTask(currTask);
  }
Example 12
  @Override
  public void analyzeInternal(ASTNode ast) throws SemanticException {
    isLocal = false;
    isOverWrite = false;
    Tree fromTree = ast.getChild(0);
    Tree tableTree = ast.getChild(1);

    if (ast.getChildCount() == 4) {
      isLocal = true;
      isOverWrite = true;
    }

    if (ast.getChildCount() == 3) {
      if (ast.getChild(2).getText().toLowerCase().equals("local")) {
        isLocal = true;
      } else {
        isOverWrite = true;
      }
    }

    // initialize load path
    URI fromURI;
    try {
      String fromPath = stripQuotes(fromTree.getText());
      fromURI = initializeFromURI(fromPath);
    } catch (IOException e) {
      throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(fromTree, e.getMessage()), e);
    } catch (URISyntaxException e) {
      throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(fromTree, e.getMessage()), e);
    }

    // initialize destination table/partition
    tableSpec ts = new tableSpec(db, conf, (ASTNode) tableTree);

    if (ts.tableHandle.isOffline()) {
      throw new SemanticException(
          ErrorMsg.OFFLINE_TABLE_OR_PARTITION.getMsg(":Table " + ts.tableName));
    }

    if (ts.tableHandle.isView()) {
      throw new SemanticException(ErrorMsg.DML_AGAINST_VIEW.getMsg());
    }
    if (ts.tableHandle.isNonNative()) {
      throw new SemanticException(ErrorMsg.LOAD_INTO_NON_NATIVE.getMsg());
    }

    if (ts.tableHandle.isStoredAsSubDirectories()) {
      throw new SemanticException(ErrorMsg.LOAD_INTO_STORED_AS_DIR.getMsg());
    }

    URI toURI =
        (ts.partHandle != null)
            ? ts.partHandle.getDataLocation()
            : ts.tableHandle.getDataLocation();

    List<FieldSchema> parts = ts.tableHandle.getPartitionKeys();
    if ((parts != null && parts.size() > 0) && (ts.partSpec == null || ts.partSpec.size() == 0)) {
      throw new SemanticException(ErrorMsg.NEED_PARTITION_ERROR.getMsg());
    }

    // make sure the arguments make sense
    applyConstraints(fromURI, toURI, fromTree, isLocal);

    Task<? extends Serializable> rTask = null;

    // create copy work
    if (isLocal) {
      // If the LOCAL keyword is specified, we always make a copy. This might seem redundant
      // when the Hive warehouse is also located on the local file system, but that is just a
      // test case.
      String copyURIStr = ctx.getExternalTmpFileURI(toURI);
      URI copyURI = URI.create(copyURIStr);
      rTask = TaskFactory.get(new CopyWork(fromURI.toString(), copyURIStr), conf);
      fromURI = copyURI;
    }

    // create final load/move work

    String loadTmpPath = ctx.getExternalTmpFileURI(toURI);
    Map<String, String> partSpec = ts.getPartSpec();
    if (partSpec == null) {
      partSpec = new LinkedHashMap<String, String>();
      outputs.add(new WriteEntity(ts.tableHandle));
    } else {
      try {
        Partition part = Hive.get().getPartition(ts.tableHandle, partSpec, false);
        if (part != null) {
          if (part.isOffline()) {
            throw new SemanticException(
                ErrorMsg.OFFLINE_TABLE_OR_PARTITION.getMsg(ts.tableName + ":" + part.getName()));
          }
          outputs.add(new WriteEntity(part));
        } else {
          outputs.add(new WriteEntity(ts.tableHandle));
        }
      } catch (HiveException e) {
        throw new SemanticException(e);
      }
    }

    LoadTableDesc loadTableWork =
        new LoadTableDesc(
            fromURI.toString(),
            loadTmpPath,
            Utilities.getTableDesc(ts.tableHandle),
            partSpec,
            isOverWrite);

    Task<? extends Serializable> childTask =
        TaskFactory.get(new MoveWork(getInputs(), getOutputs(), loadTableWork, null, true), conf);
    if (rTask != null) {
      rTask.addDependentTask(childTask);
    } else {
      rTask = childTask;
    }

    rootTasks.add(rTask);

    // The user asked for stats to be collected.
    // Some stats, like the number of rows, require a scan of the data.
    // However, other stats, like the number of files, do not require a complete scan.
    // Update the stats which do not require a complete scan.
    Task<? extends Serializable> statTask = null;
    if (conf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
      StatsWork statDesc = new StatsWork(loadTableWork);
      statDesc.setNoStatsAggregator(true);
      statDesc.setClearAggregatorStats(true);
      statDesc.setStatsReliable(conf.getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
      statTask = TaskFactory.get(statDesc, conf);
    }

    // HIVE-3334 has been filed for load file with index auto update
    if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVEINDEXAUTOUPDATE)) {
      IndexUpdater indexUpdater = new IndexUpdater(loadTableWork, getInputs(), conf);
      try {
        List<Task<? extends Serializable>> indexUpdateTasks = indexUpdater.generateUpdateTasks();

        for (Task<? extends Serializable> updateTask : indexUpdateTasks) {
          // LOAD DATA will either have a copy & move or just a move;
          // we always want the update to be dependent on the move
          childTask.addDependentTask(updateTask);
          if (statTask != null) {
            updateTask.addDependentTask(statTask);
          }
        }
      } catch (HiveException e) {
        console.printInfo(
            "WARNING: could not auto-update stale indexes, indexes are not in sync with the base table");
      }
    } else if (statTask != null) {
      childTask.addDependentTask(statTask);
    }
  }
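A hedged summary sketch of the task graph this analyzer builds when LOCAL is specified, stats auto-gathering is enabled, and index auto-update is disabled (getChildTasks() assumed, as in the earlier sketches):

    // rootTasks: [ CopyTask(fromURI -> copyURI) ]
    // CopyTask -> MoveTask(loadTableWork) -> StatsTask(statDesc)
    assert rootTasks.contains(rTask);                     // the CopyTask
    assert rTask.getChildTasks().contains(childTask);     // the MoveTask
    assert childTask.getChildTasks().contains(statTask);  // the StatsTask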