@Override
protected void closeOp(boolean abort) throws HiveException {
  if (!abort) {
    reducerHash.flush();
  }
  super.closeOp(abort);
  out = null;
  if (isLogInfoEnabled) {
    LOG.info(toString() + ": records written - " + numRows);
  }
  recordCounter.set(numRows);
}
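/**
 * Processes one input row: evaluates the key, value, partition and bucket expressions,
 * serializes the key into a {@link HiveKey} carrying the hash code used for reducer routing,
 * applies the optional top-N filter, and forwards the resulting key/value pair(s) -
 * one per distinct expression.
 */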
@Override
@SuppressWarnings("unchecked")
public void process(Object row, int tag) throws HiveException {
  try {
    ObjectInspector rowInspector = inputObjInspectors[tag];
    if (firstRow) {
      firstRow = false;
      // TODO: this is fishy - we init object inspectors based on first tag. We
      // should either init for each tag, or if rowInspector doesn't really
      // matter, then we can create this in ctor and get rid of firstRow.
      if (conf.getWriteType() == AcidUtils.Operation.UPDATE ||
          conf.getWriteType() == AcidUtils.Operation.DELETE) {
        assert rowInspector instanceof StructObjectInspector :
            "Expected rowInspector to be instance of StructObjectInspector but it is a " +
                rowInspector.getClass().getName();
        acidRowInspector = (StructObjectInspector) rowInspector;
        // The record identifier is always in the first column
        recIdField = acidRowInspector.getAllStructFieldRefs().get(0);
        recIdInspector = (StructObjectInspector) recIdField.getFieldObjectInspector();
        // The bucket field is in the second position
        bucketField = recIdInspector.getAllStructFieldRefs().get(1);
        bucketInspector = (IntObjectInspector) bucketField.getFieldObjectInspector();
      }

      if (isLogInfoEnabled) {
        LOG.info("keys are " + conf.getOutputKeyColumnNames() + " num distributions: " +
            conf.getNumDistributionKeys());
      }
      keyObjectInspector = initEvaluatorsAndReturnStruct(keyEval,
          distinctColIndices,
          conf.getOutputKeyColumnNames(), numDistributionKeys, rowInspector);
      valueObjectInspector = initEvaluatorsAndReturnStruct(valueEval,
          conf.getOutputValueColumnNames(), rowInspector);
      partitionObjectInspectors = initEvaluators(partitionEval, rowInspector);
      if (bucketEval != null) {
        bucketObjectInspectors = initEvaluators(bucketEval, rowInspector);
      }
      int numKeys = numDistinctExprs > 0 ? numDistinctExprs : 1;
      int keyLen = numDistinctExprs > 0 ? numDistributionKeys + 1 : numDistributionKeys;
      cachedKeys = new Object[numKeys][keyLen];
      cachedValues = new Object[valueEval.length];
    }

    // Determine distKeyLength (w/o distincts), and then add the first if present.
    populateCachedDistributionKeys(row, 0);

    // replace bucketing columns with hashcode % numBuckets
    int bucketNumber = -1;
    if (bucketEval != null) {
      bucketNumber = computeBucketNumber(row, conf.getNumBuckets());
      cachedKeys[0][buckColIdxInKey] = new Text(String.valueOf(bucketNumber));
    } else if (conf.getWriteType() == AcidUtils.Operation.UPDATE ||
        conf.getWriteType() == AcidUtils.Operation.DELETE) {
      // In the non-partitioned case we still want to compute the bucket number for updates and
      // deletes.
      bucketNumber = computeBucketNumber(row, conf.getNumBuckets());
    }

    HiveKey firstKey = toHiveKey(cachedKeys[0], tag, null);
    int distKeyLength = firstKey.getDistKeyLength();
    if (numDistinctExprs > 0) {
      populateCachedDistinctKeys(row, 0);
      firstKey = toHiveKey(cachedKeys[0], tag, distKeyLength);
    }

    final int hashCode;

    // distKeyLength doesn't include tag, but includes buckNum in cachedKeys[0]
    if (useUniformHash && partitionEval.length > 0) {
      hashCode = computeMurmurHash(firstKey);
    } else {
      hashCode = computeHashCode(row, bucketNumber);
    }

    firstKey.setHashCode(hashCode);

    /*
     * in case of TopN for windowing, we need to distinguish between rows with
     * null partition keys and rows with value 0 for partition keys.
     */
    boolean partKeyNull = conf.isPTFReduceSink() && partitionKeysAreNull(row);

    // Try to store the first key. If it's not excluded, we will proceed.
    int firstIndex = reducerHash.tryStoreKey(firstKey, partKeyNull);
    if (firstIndex == TopNHash.EXCLUDE) return; // Nothing to do.

    // Compute value and hashcode - we'd either store or forward them.
    BytesWritable value = makeValueWritable(row);

    if (firstIndex == TopNHash.FORWARD) {
      collect(firstKey, value);
    } else {
      assert firstIndex >= 0;
      reducerHash.storeValue(firstIndex, firstKey.hashCode(), value, false);
    }

    // All other distinct keys will just be forwarded. This could be optimized...
    for (int i = 1; i < numDistinctExprs; i++) {
      System.arraycopy(cachedKeys[0], 0, cachedKeys[i], 0, numDistributionKeys);
      populateCachedDistinctKeys(row, i);
      HiveKey hiveKey = toHiveKey(cachedKeys[i], tag, distKeyLength);
      hiveKey.setHashCode(hashCode);
      collect(hiveKey, value);
    }
  } catch (HiveException e) {
    throw e;
  } catch (Exception e) {
    throw new HiveException(e);
  }
}
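/**
 * Sets up the operator state described by the sink's descriptor ({@code conf}): expression
 * evaluators for the key, value, partition and bucket columns (reusing a key evaluator when a
 * partition or bucket column is also a key column), the key and value serializers, the
 * reduce-side tag, and the optional top-N hash used to drop rows early when a limit applies.
 */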
@Override
protected void initializeOp(Configuration hconf) throws HiveException {
  super.initializeOp(hconf);
  try {
    numRows = 0;
    cntr = 1;
    logEveryNRows = HiveConf.getLongVar(hconf, HiveConf.ConfVars.HIVE_LOG_N_RECORDS);

    statsMap.put(getCounterName(Counter.RECORDS_OUT_INTERMEDIATE, hconf), recordCounter);

    List<ExprNodeDesc> keys = conf.getKeyCols();

    if (isLogDebugEnabled) {
      LOG.debug("keys size is " + keys.size());
      for (ExprNodeDesc k : keys) {
        LOG.debug("Key exprNodeDesc " + k.getExprString());
      }
    }

    keyEval = new ExprNodeEvaluator[keys.size()];
    int i = 0;
    for (ExprNodeDesc e : keys) {
      keyEval[i++] = ExprNodeEvaluatorFactory.get(e);
    }

    numDistributionKeys = conf.getNumDistributionKeys();
    distinctColIndices = conf.getDistinctColumnIndices();
    numDistinctExprs = distinctColIndices.size();

    valueEval = new ExprNodeEvaluator[conf.getValueCols().size()];
    i = 0;
    for (ExprNodeDesc e : conf.getValueCols()) {
      valueEval[i++] = ExprNodeEvaluatorFactory.get(e);
    }

    partitionEval = new ExprNodeEvaluator[conf.getPartitionCols().size()];
    i = 0;
    for (ExprNodeDesc e : conf.getPartitionCols()) {
      int index = ExprNodeDescUtils.indexOf(e, keys);
      partitionEval[i++] = index < 0 ? ExprNodeEvaluatorFactory.get(e) : keyEval[index];
    }

    if (conf.getBucketCols() != null && !conf.getBucketCols().isEmpty()) {
      bucketEval = new ExprNodeEvaluator[conf.getBucketCols().size()];
      i = 0;
      for (ExprNodeDesc e : conf.getBucketCols()) {
        int index = ExprNodeDescUtils.indexOf(e, keys);
        bucketEval[i++] = index < 0 ? ExprNodeEvaluatorFactory.get(e) : keyEval[index];
      }
      buckColIdxInKey = conf.getPartitionCols().size();
    }

    tag = conf.getTag();
    tagByte[0] = (byte) tag;
    skipTag = conf.getSkipTag();
    if (isLogInfoEnabled) {
      LOG.info("Using tag = " + tag);
    }

    TableDesc keyTableDesc = conf.getKeySerializeInfo();
    keySerializer = (Serializer) keyTableDesc.getDeserializerClass().newInstance();
    keySerializer.initialize(null, keyTableDesc.getProperties());
    keyIsText = keySerializer.getSerializedClass().equals(Text.class);

    TableDesc valueTableDesc = conf.getValueSerializeInfo();
    valueSerializer = (Serializer) valueTableDesc.getDeserializerClass().newInstance();
    valueSerializer.initialize(null, valueTableDesc.getProperties());

    int limit = conf.getTopN();
    float memUsage = conf.getTopNMemoryUsage();

    if (limit >= 0 && memUsage > 0) {
      reducerHash = conf.isPTFReduceSink() ? new PTFTopNHash() : reducerHash;
      reducerHash.initialize(limit, memUsage, conf.isMapGroupBy(), this);
    }

    useUniformHash = conf.getReducerTraits().contains(UNIFORM);

    firstRow = true;
  } catch (Exception e) {
    String msg = "Error initializing ReduceSinkOperator: " + e.getMessage();
    LOG.error(msg, e);
    throw new RuntimeException(e);
  }
}
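// The hash code attached to each HiveKey in process() is what ultimately routes the row to a
// reducer. The helper below is an illustrative sketch only - it is not part of this operator
// and not Hive's actual partitioner class - showing how a HashPartitioner-style partitioner
// would typically turn that hash code into a partition index.
private static int exampleReducerFor(HiveKey key, int numReduceTasks) {
  // Mask the sign bit so negative hash codes still map to a valid, non-negative partition.
  return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
}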