// Load the hash table when the input file changes (e.g. a new bucket)
  @Override
  public void cleanUpInputFileChangedOp() throws HiveException {
    try {
      if (firstRow) {
        // generate the map metadata
        generateMapMetaData();
        firstRow = false;
      }

      loadHashTable();
    } catch (SerDeException e) {
      e.printStackTrace();
      throw new HiveException(e);
    }
  }

  @Override
  public void processOp(Object row, int tag) throws HiveException {

    try {
      if (firstRow) {
        // generate the map metadata
        generateMapMetaData();
        firstRow = false;
      }

      // get alias
      alias = order[tag];

      if ((lastAlias == null) || (!lastAlias.equals(alias))) {
        nextSz = joinEmitInterval;
      }

      // compute keys and values as StandardObjects
      AbstractMapJoinKey key =
          JoinUtil.computeMapJoinKeys(
              row, joinKeys.get(alias), joinKeysObjectInspectors.get(alias));
      ArrayList<Object> value =
          JoinUtil.computeValues(
              row,
              joinValues.get(alias),
              joinValuesObjectInspectors.get(alias),
              joinFilters.get(alias),
              joinFilterObjectInspectors.get(alias),
              noOuterJoin);

      // Add the value to the ArrayList
      storage.get((byte) tag).add(value);

      // probe the small-table hash maps with this key and stage the matching rows
      for (Byte pos : order) {
        if (pos.intValue() != tag) {

          MapJoinObjectValue o = mapJoinTables.get(pos).get(key);
          MapJoinRowContainer<ArrayList<Object>> rowContainer = rowContainerMap.get(pos);

          // no matching join value, or the join key contains null elements
          if (o == null || key.hasAnyNulls()) {
            if (noOuterJoin) {
              storage.put(pos, emptyList);
            } else {
              storage.put(pos, dummyObjVectors[pos.intValue()]);
            }
          } else {
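            // matches found: expose them through the reusable row container for this alias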
            rowContainer.reset(o.getObj());
            storage.put(pos, rowContainer);
          }
        }
      }

      // generate the output records
      checkAndGenObject();

      // done with the row
      storage.get((byte) tag).clear();

      // drop the references to the small-table row containers
      for (Byte pos : order) {
        if (pos.intValue() != tag) {
          storage.put(pos, null);
        }
      }

    } catch (SerDeException e) {
      e.printStackTrace();
      throw new HiveException(e);
    }
  }
Example #3
  @Override
  protected Collection<Future<?>> initializeOp(Configuration hconf) throws HiveException {
    this.hconf = hconf;
    unwrapContainer = new UnwrapRowContainer[conf.getTagLength()];

    Collection<Future<?>> result = super.initializeOp(hconf);
    if (result == null) {
      result = new HashSet<Future<?>>();
    }

    int tagLen = conf.getTagLength();

    // On Tez only: The hash map might already be cached in the container we run
    // the task in. On MR: The cache is a no-op.
    cacheKey =
        HiveConf.getVar(hconf, HiveConf.ConfVars.HIVEQUERYID)
            + "__HASH_MAP_"
            + this.getOperatorId()
            + "_container";

    cache = ObjectCacheFactory.getCache(hconf);
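    // pick the hash table loader that matches the execution engine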
    loader = getHashTableLoader(hconf);

    hashMapRowGetters = null;

    mapJoinTables = new MapJoinTableContainer[tagLen];
    mapJoinTableSerdes = new MapJoinTableContainerSerDe[tagLen];
    hashTblInitedOnce = false;

    generateMapMetaData();

    final ExecMapperContext mapContext = getExecContext();
    final MapredContext mrContext = MapredContext.get();

    if (!conf.isBucketMapJoin() && !conf.isDynamicPartitionHashJoin()) {
      /*
       * The issue with caching in case of bucket map join is that different tasks
       * process different buckets and if the container is reused to join a different bucket,
       * join results can be incorrect. The cache is keyed on operator id and for bucket map join
       * the operator does not change but data needed is different. For a proper fix, this
       * requires changes in the Tez API with regard to finding bucket id and
       * also ability to schedule tasks to re-use containers that have cached the specific bucket.
       */
      if (isLogInfoEnabled) {
        LOG.info("Not a bucket map join, so the hash table can be cached");
      }

      Future<Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]>> future =
          cache.retrieveAsync(
              cacheKey,
              new Callable<Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]>>() {
                @Override
                public Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]> call()
                    throws HiveException {
                  return loadHashTable(mapContext, mrContext);
                }
              });
      result.add(future);
    } else if (!isInputFileChangeSensitive(mapContext)) {
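      // caching is off, but the table does not vary with the input file either,
      // so the hash table can be loaded once up front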
      loadHashTable(mapContext, mrContext);
      hashTblInitedOnce = true;
    }
    return result;
  }
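
For reference, the per-container cache consulted in initializeOp can be approximated with plain java.util.concurrent primitives. The sketch below is hypothetical (SimpleObjectCache is not a Hive class) and only illustrates the retrieveAsync pattern assumed above: the first caller for a key runs the loader, later callers in the same process reuse the cached result.

import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Future;

// Hypothetical stand-in for the object cache used in initializeOp, keyed by a
// string such as "<query id>__HASH_MAP_<operator id>_container".
final class SimpleObjectCache {
  private final Map<String, CompletableFuture<Object>> cache = new ConcurrentHashMap<>();

  @SuppressWarnings("unchecked")
  public <T> Future<T> retrieveAsync(String key, final Callable<T> loader) {
    return (Future<T>) cache.computeIfAbsent(key, k -> CompletableFuture.supplyAsync(() -> {
      try {
        return (Object) loader.call();   // first caller pays the load cost
      } catch (Exception e) {
        throw new RuntimeException(e);   // later callers observe the same failure
      }
    }));
  }
}

Under this reading, keying the cache on the query id plus operator id is what makes reuse safe only when the loaded hash table is identical for every task of that operator, which is exactly why the bucket map join case above bypasses the cache.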