// Load the hash table
@Override
public void cleanUpInputFileChangedOp() throws HiveException {
  try {
    if (firstRow) {
      // generate the map metadata
      generateMapMetaData();
      firstRow = false;
    }
    loadHashTable();
  } catch (SerDeException e) {
    e.printStackTrace();
    throw new HiveException(e);
  }
}
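// A minimal sketch (hypothetical driver code, not part of this class) of when the
// hook above fires: the exec framework notices that the mapper has moved on to a
// new input file (e.g. a new bucket in a bucket map join) and asks the operator
// to reload the small-table hash map for that file. The variable names
// (execContext, op) are assumptions for illustration:
//
//   if (execContext.inputFileChanged()) {
//     op.cleanUpInputFileChangedOp(); // re-reads the hash table for the new file
//   }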
@Override
public void processOp(Object row, int tag) throws HiveException {
  try {
    if (firstRow) {
      // generate the map metadata
      generateMapMetaData();
      firstRow = false;
    }

    // get alias
    alias = order[tag];
    // alias = (byte)tag;

    if ((lastAlias == null) || (!lastAlias.equals(alias))) {
      nextSz = joinEmitInterval;
    }

    // compute keys and values as StandardObjects
    AbstractMapJoinKey key = JoinUtil.computeMapJoinKeys(row, joinKeys.get(alias),
        joinKeysObjectInspectors.get(alias));
    ArrayList<Object> value = JoinUtil.computeValues(row, joinValues.get(alias),
        joinValuesObjectInspectors.get(alias), joinFilters.get(alias),
        joinFilterObjectInspectors.get(alias), noOuterJoin);

    // Add the value to the ArrayList
    storage.get((byte) tag).add(value);

    for (Byte pos : order) {
      if (pos.intValue() != tag) {
        MapJoinObjectValue o = mapJoinTables.get(pos).get(key);
        MapJoinRowContainer<ArrayList<Object>> rowContainer = rowContainerMap.get(pos);

        // no matching join value, or the join key contains null elements
        if (o == null || key.hasAnyNulls()) {
          if (noOuterJoin) {
            storage.put(pos, emptyList);
          } else {
            storage.put(pos, dummyObjVectors[pos.intValue()]);
          }
        } else {
          rowContainer.reset(o.getObj());
          storage.put(pos, rowContainer);
        }
      }
    }

    // generate the output records
    checkAndGenObject();

    // done with the row
    storage.get((byte) tag).clear();

    for (Byte pos : order) {
      if (pos.intValue() != tag) {
        storage.put(pos, null);
      }
    }
  } catch (SerDeException e) {
    e.printStackTrace();
    throw new HiveException(e);
  }
}
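// A minimal sketch (not part of the operator) of the per-row probe flow implemented
// above, with hypothetical names (bigTableRow, smallTablePos, keys, keyOIs); it
// assumes a single small-table alias and elides join filters:
//
//   AbstractMapJoinKey key = JoinUtil.computeMapJoinKeys(bigTableRow, keys, keyOIs);
//   MapJoinObjectValue match = mapJoinTables.get(smallTablePos).get(key);
//   if (match == null || key.hasAnyNulls()) {
//     // inner join: stage an empty list, so no output is produced for this row;
//     // outer join: stage a null-padded dummy row to preserve the big-table side
//   } else {
//     // stage the matched rows so checkAndGenObject() can emit the cross product
//   }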
@Override
protected Collection<Future<?>> initializeOp(Configuration hconf) throws HiveException {
  this.hconf = hconf;
  unwrapContainer = new UnwrapRowContainer[conf.getTagLength()];

  Collection<Future<?>> result = super.initializeOp(hconf);
  if (result == null) {
    result = new HashSet<Future<?>>();
  }

  int tagLen = conf.getTagLength();

  // On Tez only: The hash map might already be cached in the container we run
  // the task in. On MR: The cache is a no-op.
  cacheKey = HiveConf.getVar(hconf, HiveConf.ConfVars.HIVEQUERYID)
      + "__HASH_MAP_" + this.getOperatorId() + "_container";

  cache = ObjectCacheFactory.getCache(hconf);
  loader = getHashTableLoader(hconf);

  hashMapRowGetters = null;

  mapJoinTables = new MapJoinTableContainer[tagLen];
  mapJoinTableSerdes = new MapJoinTableContainerSerDe[tagLen];
  hashTblInitedOnce = false;

  generateMapMetaData();

  final ExecMapperContext mapContext = getExecContext();
  final MapredContext mrContext = MapredContext.get();

  if (!conf.isBucketMapJoin() && !conf.isDynamicPartitionHashJoin()) {
    /*
     * The issue with caching in case of bucket map join is that different tasks
     * process different buckets and if the container is reused to join a different bucket,
     * join results can be incorrect. The cache is keyed on operator id and for bucket map join
     * the operator does not change but data needed is different. For a proper fix, this
     * requires changes in the Tez API with regard to finding bucket id and
     * also ability to schedule tasks to re-use containers that have cached the specific bucket.
     */
    if (isLogInfoEnabled) {
      LOG.info("This is not bucket map join, so cache");
    }

    Future<Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]>> future =
        cache.retrieveAsync(
            cacheKey,
            new Callable<Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]>>() {
              @Override
              public Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]> call()
                  throws HiveException {
                return loadHashTable(mapContext, mrContext);
              }
            });
    result.add(future);
  } else if (!isInputFileChangeSensitive(mapContext)) {
    loadHashTable(mapContext, mrContext);
    hashTblInitedOnce = true;
  }
  return result;
}
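// A minimal sketch (assumed consumer, not shown in this snippet) of how the Future
// registered above might be resolved once asynchronous initialization completes.
// The loop variable asyncInitOperations, the unchecked cast, and the Pair accessors
// getLeft()/getRight() are assumptions for illustration:
//
//   for (Future<?> f : asyncInitOperations) {
//     Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]> pair =
//         (Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]>) f.get();
//     mapJoinTables = pair.getLeft();     // hash tables, one per small-table tag
//     mapJoinTableSerdes = pair.getRight(); // matching serdes for each container
//   }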