Example #1
  private void startForward(boolean inputFileChangeSensitive, String bigTableBucket)
      throws Exception {
    for (Operator<?> source : work.getAliasToWork().values()) {
      source.reset();
    }
    if (inputFileChangeSensitive) {
      execContext.setCurrentBigBucketFile(bigTableBucket);
    }
    for (Map.Entry<String, FetchOperator> entry : fetchOperators.entrySet()) {
      String alias = entry.getKey();
      FetchOperator fetchOp = entry.getValue();

      if (inputFileChangeSensitive) {
        fetchOp.clearFetchContext();
        setUpFetchOpContext(fetchOp, alias, bigTableBucket);
      }

      // get the root operator
      Operator<? extends OperatorDesc> forwardOp = work.getAliasToWork().get(alias);
      // walk through the operator tree
      while (!forwardOp.getDone()) {
        InspectableObject row = fetchOp.getNextRow();
        if (row == null) {
          break;
        }
        forwardOp.process(row.o, 0);
      }
      forwardOp.flush();
    }
    for (Operator<?> source : work.getAliasToWork().values()) {
      source.close(false);
    }
  }
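The heart of startForward is a fetch-and-forward loop: rows are pulled from a FetchOperator and pushed into the root of the operator tree until the source runs dry or a downstream operator signals it is done. Below is a minimal, self-contained sketch of that pattern; Source and Sink are hypothetical stand-ins, not Hive APIs.

import java.util.List;

// Hypothetical stand-ins for FetchOperator and Operator, for illustration only.
interface Source { Object nextRow(); }        // returns null when exhausted
interface Sink { boolean isDone(); void process(Object row); void flush(); }

public class ForwardLoop {
  // Push rows from each source into its sink until the source is exhausted
  // or the sink reports that it needs no more input.
  static void forwardAll(List<Source> sources, List<Sink> sinks) {
    for (int i = 0; i < sources.size(); i++) {
      Source src = sources.get(i);
      Sink sink = sinks.get(i);
      while (!sink.isDone()) {
        Object row = src.nextRow();
        if (row == null) {
          break; // source exhausted
        }
        sink.process(row);
      }
      sink.flush(); // let the sink emit any buffered output
    }
  }
}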
Example #2
  private void handleSampling(DriverContext context, MapWork mWork, JobConf job, HiveConf conf)
      throws Exception {
    assert mWork.getAliasToWork().keySet().size() == 1;

    String alias = mWork.getAliases().get(0);
    Operator<?> topOp = mWork.getAliasToWork().get(alias);
    PartitionDesc partDesc = mWork.getAliasToPartnInfo().get(alias);

    ArrayList<String> paths = mWork.getPaths();
    ArrayList<PartitionDesc> parts = mWork.getPartitionDescs();

    List<Path> inputPaths = new ArrayList<Path>(paths.size());
    for (String path : paths) {
      inputPaths.add(new Path(path));
    }

    Path tmpPath = context.getCtx().getExternalTmpPath(inputPaths.get(0));
    Path partitionFile = new Path(tmpPath, ".partitions");
    ShimLoader.getHadoopShims().setTotalOrderPartitionFile(job, partitionFile);
    PartitionKeySampler sampler = new PartitionKeySampler();

    if (mWork.getSamplingType() == MapWork.SAMPLING_ON_PREV_MR) {
      console.printInfo("Use sampling data created in previous MR");
      // merge sampling data from the previous MR and make partition keys for the total sort
      for (Path path : inputPaths) {
        FileSystem fs = path.getFileSystem(job);
        for (FileStatus status : fs.globStatus(new Path(path, ".sampling*"))) {
          sampler.addSampleFile(status.getPath(), job);
        }
      }
    } else if (mWork.getSamplingType() == MapWork.SAMPLING_ON_START) {
      console.printInfo("Creating sampling data..");
      assert topOp instanceof TableScanOperator;
      TableScanOperator ts = (TableScanOperator) topOp;

      FetchWork fetchWork;
      if (!partDesc.isPartitioned()) {
        assert paths.size() == 1;
        fetchWork = new FetchWork(inputPaths.get(0), partDesc.getTableDesc());
      } else {
        fetchWork = new FetchWork(inputPaths, parts, partDesc.getTableDesc());
      }
      fetchWork.setSource(ts);

      // random sampling
      FetchOperator fetcher = PartitionKeySampler.createSampler(fetchWork, conf, job, ts);
      try {
        ts.initialize(conf, new ObjectInspector[] {fetcher.getOutputObjectInspector()});
        OperatorUtils.setChildrenCollector(ts.getChildOperators(), sampler);
        while (fetcher.pushRow()) {} // drain every sampled row through the operator tree into the sampler
      } finally {
        fetcher.clearFetchContext();
      }
    } else {
      throw new IllegalArgumentException("Invalid sampling type " + mWork.getSamplingType());
    }
    sampler.writePartitionKeys(partitionFile, conf, job);
  }
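PartitionKeySampler itself is not shown above, but the idea it implements is simple: gather a sample of keys, sort them, and choose numPartitions - 1 evenly spaced boundaries so that a total-order partitioner can split the key space into balanced ranges. The following is a rough sketch of that boundary selection using plain strings; it illustrates the technique and is not the actual PartitionKeySampler code.

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class BoundaryPicker {
  // Returns numPartitions - 1 sorted boundary keys; partition i receives
  // the keys falling between boundary i - 1 (inclusive) and boundary i.
  static List<String> pickBoundaries(List<String> samples, int numPartitions) {
    List<String> sorted = new ArrayList<>(samples);
    Collections.sort(sorted);
    List<String> boundaries = new ArrayList<>(numPartitions - 1);
    for (int i = 1; i < numPartitions; i++) {
      boundaries.add(sorted.get(i * sorted.size() / numPartitions));
    }
    return boundaries;
  }

  public static void main(String[] args) {
    List<String> samples = List.of("m", "c", "x", "a", "q", "f", "t", "j");
    System.out.println(pickBoundaries(samples, 4)); // prints [f, m, t]
  }
}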
Example #3
  private void initializeOperators(Map<FetchOperator, JobConf> fetchOpJobConfMap)
      throws HiveException {
    for (Map.Entry<String, Operator<? extends OperatorDesc>> entry :
        work.getAliasToWork().entrySet()) {
      LOG.debug(
          "initializeOperators: "
              + entry.getKey()
              + ", children = "
              + entry.getValue().getChildOperators());
    }
    // create a fetch operator for each alias; these are used below to initialize all the operators
    for (Map.Entry<String, FetchWork> entry : work.getAliasToFetchWork().entrySet()) {
      if (entry.getValue() == null) {
        continue;
      }
      JobConf jobClone = new JobConf(job);

      TableScanOperator ts = (TableScanOperator) work.getAliasToWork().get(entry.getKey());
      // push down projections
      ColumnProjectionUtils.appendReadColumns(
          jobClone, ts.getNeededColumnIDs(), ts.getNeededColumns());
      // push down filters
      HiveInputFormat.pushFilters(jobClone, ts);

      // create a fetch operator
      FetchOperator fetchOp = new FetchOperator(entry.getValue(), jobClone);
      fetchOpJobConfMap.put(fetchOp, jobClone);
      fetchOperators.put(entry.getKey(), fetchOp);
      l4j.info("fetchoperator for " + entry.getKey() + " created");
    }
    // initialize all forward operators
    for (Map.Entry<String, FetchOperator> entry : fetchOperators.entrySet()) {
      // get the forward op
      String alias = entry.getKey();
      Operator<? extends OperatorDesc> forwardOp = work.getAliasToWork().get(alias);

      // put the execution context into all the operators
      forwardOp.passExecContext(execContext);
      // All the operators need to be initialized before processing
      FetchOperator fetchOp = entry.getValue();
      JobConf jobConf = fetchOpJobConfMap.get(fetchOp);

      if (jobConf == null) {
        jobConf = job;
      }
      // initialize the forward operator
      ObjectInspector objectInspector = fetchOp.getOutputObjectInspector();
      forwardOp.initialize(jobConf, new ObjectInspector[] {objectInspector});
      l4j.info("fetchoperator for " + entry.getKey() + " initialized");
    }
  }
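Note the two-phase shape of this method: every fetch operator is constructed first, each with its own cloned JobConf, and only then is each forward operator initialized, falling back to the shared job configuration when no per-operator clone is found. Here is a stripped-down sketch of that create-then-initialize pattern with a default fallback; the names are illustrative, not Hive's.

import java.util.LinkedHashMap;
import java.util.Map;

public class TwoPhaseInit {
  interface Component { void initialize(String conf); }

  // Phase 2 of the pattern: initialize every component with its own config
  // if one was registered during construction, otherwise the shared default
  // -- mirroring the jobConf == null fallback above.
  static void initAll(Map<String, Component> components,
                      Map<String, String> perComponentConf,
                      String defaultConf) {
    for (Map.Entry<String, Component> e : components.entrySet()) {
      String conf = perComponentConf.getOrDefault(e.getKey(), defaultConf);
      e.getValue().initialize(conf);
    }
  }

  public static void main(String[] args) {
    Map<String, Component> comps = new LinkedHashMap<>();
    comps.put("a", conf -> System.out.println("a initialized with " + conf));
    comps.put("b", conf -> System.out.println("b initialized with " + conf));
    initAll(comps, Map.of("a", "cloned-conf-a"), "shared-default");
  }
}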
Example #4
  private void setUpFetchOpContext(FetchOperator fetchOp, String alias, String currentInputFile)
      throws Exception {

    BucketMapJoinContext bucketMatcherCxt = this.work.getBucketMapjoinContext();

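    // reflectively instantiate the configured BucketMatcher and hand it the alias-to-bucket-file mapping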
    Class<? extends BucketMatcher> bucketMatcherCls = bucketMatcherCxt.getBucketMatcherClass();
    BucketMatcher bucketMatcher = ReflectionUtils.newInstance(bucketMatcherCls, null);
    bucketMatcher.setAliasBucketFileNameMapping(bucketMatcherCxt.getAliasBucketFileNameMapping());

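    // resolve which bucket files of this alias pair with the big table's current input file, then point the fetch operator at them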
    List<Path> aliasFiles =
        bucketMatcher.getAliasBucketFiles(
            currentInputFile, bucketMatcherCxt.getMapJoinBigTableAlias(), alias);
    fetchOp.setupContext(aliasFiles);
  }
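The bucket matcher above is created reflectively from a configured class object, an instantiate-then-configure pattern in the spirit of Hadoop's ReflectionUtils.newInstance. Below is a self-contained sketch of that pattern; the Matcher interface and DefaultMatcher class are hypothetical stand-ins, not Hive types.

import java.util.Map;

// Hypothetical plug-in interface standing in for BucketMatcher.
interface Matcher { void setMapping(Map<String, String> mapping); }

class DefaultMatcher implements Matcher {
  private Map<String, String> mapping;
  public void setMapping(Map<String, String> mapping) { this.mapping = mapping; }
}

public class ReflectiveFactory {
  // Instantiate the configured implementation through its no-arg
  // constructor, then push configuration in through a setter.
  static Matcher create(Class<? extends Matcher> cls,
                        Map<String, String> mapping) throws Exception {
    Matcher m = cls.getDeclaredConstructor().newInstance();
    m.setMapping(mapping);
    return m;
  }

  public static void main(String[] args) throws Exception {
    Matcher m = create(DefaultMatcher.class, Map.of("bigTable", "bucket_0"));
    System.out.println(m.getClass().getSimpleName()); // prints DefaultMatcher
  }
}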