/**
  * Fetches a preview of a Hive table's rows as comma-separated text.
  *
  * <p>Builds a {@link FetchWork} over the table's data location (or over the location of every
  * partition when the table has partition columns), runs it through a {@link FetchTask}, and
  * rewrites each returned row so that tab characters become commas.
  *
  * @param table table name
  * @param database database the table belongs to
  * @return one string per fetched row, with tabs replaced by commas
  * @throws Exception if the table cannot be looked up or its data cannot be fetched
  */
 private List<String> getTableData(String table, String database) throws Exception {
   // Upper bound on the number of rows requested from the fetch work.
   final int rowLimit = 100;

   HiveConf conf = new HiveConf();
   conf.addResource("hive-site.xml");
   Hive hive = Hive.get(conf);
   org.apache.hadoop.hive.ql.metadata.Table tbl = hive.getTable(database, table);

   FetchWork work;
   if (!tbl.getPartCols().isEmpty()) {
     // Partitioned table: fetch from every partition location, each with its own descriptor.
     List<Partition> partitions = hive.getPartitions(tbl);
     List<PartitionDesc> partDesc = new ArrayList<PartitionDesc>(partitions.size());
     List<String> partLocs = new ArrayList<String>(partitions.size());
     for (Partition part : partitions) {
       partLocs.add(part.getLocation());
       partDesc.add(Utilities.getPartitionDesc(part));
     }
     work = new FetchWork(partLocs, partDesc, Utilities.getTableDesc(tbl));
   } else {
     work = new FetchWork(tbl.getDataLocation().toString(), Utilities.getTableDesc(tbl));
   }
   // Apply the same cap to both layouts so partitioned and unpartitioned tables behave alike.
   work.setLimit(rowLimit);

   FetchTask task = new FetchTask();
   task.setWork(work);
   task.initialize(conf, null, null);

   // FetchTask.fetch takes an ArrayList, so the buffer cannot be declared as List.
   ArrayList<String> rows = new ArrayList<String>();
   try {
     task.fetch(rows);
   } finally {
     // Always release the task's fetch state, even when the fetch itself fails.
     task.clearFetch();
   }

   List<String> results = new ArrayList<String>(rows.size());
   for (String row : rows) {
     results.add(row.replace('\t', ','));
   }
   return results;
 }
Пример #2
0
  /**
   * Registers the work for one table alias in the given map-reduce plan.
   *
   * <p>Resolves the partitions to read for {@code topOp} (using {@code pList} when supplied,
   * otherwise the list cached in the parse context, pruning and caching it on a miss), records
   * each of them as a read entity, and then wires the resulting paths and partition descriptors
   * into either the map work of {@code plan} or its local work.
   *
   * @param alias_id current alias
   * @param topOp the top operator of the stack
   * @param plan current plan
   * @param local {@code true} to add the alias to the plan's local work, {@code false} to add it
   *     to the map-reduce work
   * @param opProcCtx processing context
   * @param pList pruned partition list. If it is null it will be computed on-the-fly.
   * @throws SemanticException if partition pruning or building a partition descriptor fails
   */
  public static void setTaskPlan(
      String alias_id,
      Operator<? extends Serializable> topOp,
      MapredWork plan,
      boolean local,
      GenMRProcContext opProcCtx,
      PrunedPartitionList pList)
      throws SemanticException {
    ParseContext parseCtx = opProcCtx.getParseCtx();
    Set<ReadEntity> inputs = opProcCtx.getInputs();

    // Parallel lists: partDesc.get(i) describes the data found at partDir.get(i).
    ArrayList<Path> partDir = new ArrayList<Path>();
    ArrayList<PartitionDesc> partDesc = new ArrayList<PartitionDesc>();

    // Only set when the alias refers to a non-partitioned table.
    Path tblDir = null;
    TableDesc tblDesc = null;

    PrunedPartitionList partsList = pList;

    if (partsList == null) {
      try {
        // Reuse the list cached for this table scan; prune and cache it on a miss.
        partsList = parseCtx.getOpToPartList().get((TableScanOperator) topOp);
        if (partsList == null) {
          partsList =
              PartitionPruner.prune(
                  parseCtx.getTopToTable().get(topOp),
                  parseCtx.getOpToPartPruner().get(topOp),
                  opProcCtx.getConf(),
                  alias_id,
                  parseCtx.getPrunedPartitions());
          parseCtx.getOpToPartList().put((TableScanOperator) topOp, partsList);
        }
      } catch (SemanticException e) {
        // Rethrown as-is so it is not wrapped a second time by the HiveException handler below.
        throw e;
      } catch (HiveException e) {
        LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
        throw new SemanticException(e.getMessage(), e);
      }
    }

    // Generate the map work for this alias_id.
    // Pass both confirmed and unknown partitions through the map-reduce framework.
    // The union is built in a private copy: partsList is shared through the parse context, so
    // adding the unknown partitions to its confirmed set in place would leak them to every
    // later reader of that set.
    Set<Partition> parts =
        new java.util.LinkedHashSet<Partition>(partsList.getConfirmedPartns());
    parts.addAll(partsList.getUnknownPartns());

    PartitionDesc aliasPartnDesc = null;
    try {
      if (!parts.isEmpty()) {
        aliasPartnDesc = Utilities.getPartitionDesc(parts.iterator().next());
      }
    } catch (HiveException e) {
      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
      throw new SemanticException(e.getMessage(), e);
    }

    // The table does not have any partitions
    if (aliasPartnDesc == null) {
      aliasPartnDesc =
          new PartitionDesc(Utilities.getTableDesc(parseCtx.getTopToTable().get(topOp)), null);
    }

    plan.getAliasToPartnInfo().put(alias_id, aliasPartnDesc);

    for (Partition part : parts) {
      // Record what is being read: the partition itself, or the whole table when unpartitioned.
      if (part.getTable().isPartitioned()) {
        inputs.add(new ReadEntity(part));
      } else {
        inputs.add(new ReadEntity(part.getTable()));
      }

      // Later the properties have to come from the partition as opposed
      // to from the table in order to support versioning.
      Path[] paths;
      sampleDesc sampleDescr = parseCtx.getOpToSamplePruner().get(topOp);

      // With a sample descriptor only the sampled paths are read; otherwise all of them.
      if (sampleDescr != null) {
        paths = SamplePruner.prune(part, sampleDescr);
      } else {
        paths = part.getPath();
      }

      // is it a partitioned table ?
      if (!part.getTable().isPartitioned()) {
        assert ((tblDir == null) && (tblDesc == null));

        tblDir = paths[0];
        tblDesc = Utilities.getTableDesc(part.getTable());
      }

      for (Path p : paths) {
        if (p == null) {
          continue;
        }
        String path = p.toString();
        if (LOG.isDebugEnabled()) {
          LOG.debug("Adding " + path + " of table" + alias_id);
        }

        partDir.add(p);
        try {
          partDesc.add(Utilities.getPartitionDesc(part));
        } catch (HiveException e) {
          LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
          throw new SemanticException(e.getMessage(), e);
        }
      }
    }

    Iterator<Path> iterPath = partDir.iterator();
    Iterator<PartitionDesc> iterPartnDesc = partDesc.iterator();

    if (!local) {
      // Map-reduce work: register every path under this alias together with its descriptor.
      while (iterPath.hasNext()) {
        assert iterPartnDesc.hasNext();
        String path = iterPath.next().toString();

        PartitionDesc prtDesc = iterPartnDesc.next();

        // Add the path to alias mapping
        if (plan.getPathToAliases().get(path) == null) {
          plan.getPathToAliases().put(path, new ArrayList<String>());
        }
        plan.getPathToAliases().get(path).add(alias_id);
        plan.getPathToPartitionInfo().put(path, prtDesc);
        if (LOG.isDebugEnabled()) {
          LOG.debug("Information added for path " + path);
        }
      }

      assert plan.getAliasToWork().get(alias_id) == null;
      plan.getAliasToWork().put(alias_id, topOp);
    } else {
      // populate local work if needed
      MapredLocalWork localPlan = plan.getMapLocalWork();
      if (localPlan == null) {
        localPlan =
            new MapredLocalWork(
                new LinkedHashMap<String, Operator<? extends Serializable>>(),
                new LinkedHashMap<String, FetchWork>());
      }

      assert localPlan.getAliasToWork().get(alias_id) == null;
      assert localPlan.getAliasToFetchWork().get(alias_id) == null;
      localPlan.getAliasToWork().put(alias_id, topOp);
      // Fetch from the collected partition paths, or from the single table directory when the
      // alias turned out to be a non-partitioned table.
      if (tblDir == null) {
        localPlan
            .getAliasToFetchWork()
            .put(alias_id, new FetchWork(FetchWork.convertPathToStringArray(partDir), partDesc));
      } else {
        localPlan.getAliasToFetchWork().put(alias_id, new FetchWork(tblDir.toString(), tblDesc));
      }
      plan.setMapLocalWork(localPlan);
    }
  }
Пример #3
0
  /**
   * Analyzes a LOAD DATA statement and builds the tasks that carry it out.
   *
   * <p>Child 0 of {@code ast} is the source path and child 1 the destination table. With four
   * children the load is both local and overwriting; with three, the third child selects one of
   * the two ("local", compared case-insensitively, means local; anything else means overwrite).
   *
   * <p>The resulting root task is a move into the destination, preceded by a copy when the load
   * is local. Index-update tasks and a stats task are chained after the move when the
   * corresponding configuration flags are enabled.
   *
   * @param ast root node of the statement
   * @throws SemanticException if the source path is invalid, the destination is offline, a view,
   *     non-native or stored as sub-directories, a partitioned table is targeted without a
   *     partition spec, or the destination partition cannot be looked up
   */
  @Override
  public void analyzeInternal(ASTNode ast) throws SemanticException {
    isLocal = false;
    isOverWrite = false;
    Tree fromTree = ast.getChild(0);
    Tree tableTree = ast.getChild(1);

    int childCount = ast.getChildCount();
    if (childCount == 4) {
      isLocal = true;
      isOverWrite = true;
    } else if (childCount == 3) {
      if ("local".equalsIgnoreCase(ast.getChild(2).getText())) {
        isLocal = true;
      } else {
        isOverWrite = true;
      }
    }

    // initialize load path
    URI fromURI;
    try {
      String fromPath = stripQuotes(fromTree.getText());
      fromURI = initializeFromURI(fromPath);
    } catch (IOException e) {
      throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(fromTree, e.getMessage()), e);
    } catch (URISyntaxException e) {
      throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(fromTree, e.getMessage()), e);
    }

    // initialize destination table/partition
    tableSpec ts = new tableSpec(db, conf, (ASTNode) tableTree);

    // Reject destinations that cannot be loaded into.
    if (ts.tableHandle.isOffline()) {
      throw new SemanticException(
          ErrorMsg.OFFLINE_TABLE_OR_PARTITION.getMsg(":Table " + ts.tableName));
    }

    if (ts.tableHandle.isView()) {
      throw new SemanticException(ErrorMsg.DML_AGAINST_VIEW.getMsg());
    }
    if (ts.tableHandle.isNonNative()) {
      throw new SemanticException(ErrorMsg.LOAD_INTO_NON_NATIVE.getMsg());
    }

    if (ts.tableHandle.isStoredAsSubDirectories()) {
      throw new SemanticException(ErrorMsg.LOAD_INTO_STORED_AS_DIR.getMsg());
    }

    URI toURI =
        (ts.partHandle != null)
            ? ts.partHandle.getDataLocation()
            : ts.tableHandle.getDataLocation();

    // A table with partition keys can only be loaded through an explicit partition spec.
    List<FieldSchema> parts = ts.tableHandle.getPartitionKeys();
    if ((parts != null && parts.size() > 0) && (ts.partSpec == null || ts.partSpec.size() == 0)) {
      throw new SemanticException(ErrorMsg.NEED_PARTITION_ERROR.getMsg());
    }

    // make sure the arguments make sense
    applyConstraints(fromURI, toURI, fromTree, isLocal);

    Task<? extends Serializable> rTask = null;

    // create copy work
    if (isLocal) {
      // if the local keyword is specified - we will always make a copy. this
      // might seem redundant in the case
      // that the hive warehouse is also located in the local file system - but
      // that's just a test case.
      String copyURIStr = ctx.getExternalTmpFileURI(toURI);
      URI copyURI = URI.create(copyURIStr);
      rTask = TaskFactory.get(new CopyWork(fromURI.toString(), copyURIStr), conf);
      // The move below then reads from the copy instead of the original source.
      fromURI = copyURI;
    }

    // create final load/move work

    String loadTmpPath = ctx.getExternalTmpFileURI(toURI);
    Map<String, String> partSpec = ts.getPartSpec();
    if (partSpec == null) {
      partSpec = new LinkedHashMap<String, String>();
      outputs.add(new WriteEntity(ts.tableHandle));
    } else {
      try {
        Partition part = Hive.get().getPartition(ts.tableHandle, partSpec, false);
        if (part != null) {
          if (part.isOffline()) {
            throw new SemanticException(
                ErrorMsg.OFFLINE_TABLE_OR_PARTITION.getMsg(ts.tableName + ":" + part.getName()));
          }
          outputs.add(new WriteEntity(part));
        } else {
          // No partition was returned for the spec: record the table as the write target.
          outputs.add(new WriteEntity(ts.tableHandle));
        }
      } catch (HiveException e) {
        throw new SemanticException(e);
      }
    }

    LoadTableDesc loadTableWork =
        new LoadTableDesc(
            fromURI.toString(),
            loadTmpPath,
            Utilities.getTableDesc(ts.tableHandle),
            partSpec,
            isOverWrite);

    Task<? extends Serializable> childTask =
        TaskFactory.get(new MoveWork(getInputs(), getOutputs(), loadTableWork, null, true), conf);
    if (rTask != null) {
      rTask.addDependentTask(childTask);
    } else {
      rTask = childTask;
    }

    rootTasks.add(rTask);

    // The user asked for stats to be collected.
    // Some stats like number of rows require a scan of the data
    // However, some other stats, like number of files, do not require a complete scan
    // Update the stats which do not require a complete scan.
    Task<? extends Serializable> statTask = null;
    if (conf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
      StatsWork statDesc = new StatsWork(loadTableWork);
      statDesc.setNoStatsAggregator(true);
      statDesc.setClearAggregatorStats(true);
      statDesc.setStatsReliable(conf.getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
      statTask = TaskFactory.get(statDesc, conf);
    }

    // Tracks whether the stats task has been chained behind at least one index-update task.
    boolean statTaskChained = false;

    // HIVE-3334 has been filed for load file with index auto update
    if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVEINDEXAUTOUPDATE)) {
      IndexUpdater indexUpdater = new IndexUpdater(loadTableWork, getInputs(), conf);
      try {
        List<Task<? extends Serializable>> indexUpdateTasks = indexUpdater.generateUpdateTasks();

        for (Task<? extends Serializable> updateTask : indexUpdateTasks) {
          // LOAD DATA will either have a copy & move or just a move,
          // we always want the update to be dependent on the move
          childTask.addDependentTask(updateTask);
          if (statTask != null) {
            updateTask.addDependentTask(statTask);
            statTaskChained = true;
          }
        }
      } catch (HiveException e) {
        // Best effort: a failed index refresh must not fail the load itself.
        console.printInfo(
            "WARNING: could not auto-update stale indexes, which are not in sync: "
                + e.getMessage());
      }
    }

    // Without this fallback the stats task would be dropped whenever index auto-update is
    // enabled but yields no update tasks (or fails), because nothing would depend on it.
    if (statTask != null && !statTaskChained) {
      childTask.addDependentTask(statTask);
    }
  }