@SuppressWarnings("unchecked") @Override protected final void completeInitializationOp(Object[] os) throws HiveException { if (os.length != 0) { Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]> pair = (Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]>) os[0]; boolean spilled = false; for (MapJoinTableContainer container : pair.getLeft()) { if (container != null) { spilled = spilled || container.hasSpill(); } } if (!loadCalled && spilled) { // we can't use the cached table because it has spilled. loadHashTable(getExecContext(), MapredContext.get()); } else { // let's use the table from the cache. mapJoinTables = pair.getLeft(); mapJoinTableSerdes = pair.getRight(); } hashTblInitedOnce = true; } if (this.getExecContext() != null) { // reset exec context so that initialization of the map operator happens // properly this.getExecContext().setLastInputPath(null); this.getExecContext().setCurrentInputPath(null); } }
// Load the hash table @Override public void cleanUpInputFileChangedOp() throws HiveException { loadHashTable(getExecContext(), MapredContext.get()); }
@Override protected Collection<Future<?>> initializeOp(Configuration hconf) throws HiveException { this.hconf = hconf; unwrapContainer = new UnwrapRowContainer[conf.getTagLength()]; Collection<Future<?>> result = super.initializeOp(hconf); if (result == null) { result = new HashSet<Future<?>>(); } int tagLen = conf.getTagLength(); // On Tez only: The hash map might already be cached in the container we run // the task in. On MR: The cache is a no-op. cacheKey = HiveConf.getVar(hconf, HiveConf.ConfVars.HIVEQUERYID) + "__HASH_MAP_" + this.getOperatorId() + "_container"; cache = ObjectCacheFactory.getCache(hconf); loader = getHashTableLoader(hconf); hashMapRowGetters = null; mapJoinTables = new MapJoinTableContainer[tagLen]; mapJoinTableSerdes = new MapJoinTableContainerSerDe[tagLen]; hashTblInitedOnce = false; generateMapMetaData(); final ExecMapperContext mapContext = getExecContext(); final MapredContext mrContext = MapredContext.get(); if (!conf.isBucketMapJoin() && !conf.isDynamicPartitionHashJoin()) { /* * The issue with caching in case of bucket map join is that different tasks * process different buckets and if the container is reused to join a different bucket, * join results can be incorrect. The cache is keyed on operator id and for bucket map join * the operator does not change but data needed is different. For a proper fix, this * requires changes in the Tez API with regard to finding bucket id and * also ability to schedule tasks to re-use containers that have cached the specific bucket. */ if (isLogInfoEnabled) { LOG.info("This is not bucket map join, so cache"); } Future<Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]>> future = cache.retrieveAsync( cacheKey, new Callable<Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]>>() { @Override public Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]> call() throws HiveException { return loadHashTable(mapContext, mrContext); } }); result.add(future); } else if (!isInputFileChangeSensitive(mapContext)) { loadHashTable(mapContext, mrContext); hashTblInitedOnce = true; } return result; }
@Nullable public static MapredContext get() { return MapredContext.get(); }