public void process(Writable value) throws HiveException {
  // A mapper can span multiple files/partitions.
  // The deserializers need to be reset if the input file changed.
  ExecMapperContext context = getExecContext();
  if (context != null && context.inputFileChanged()) {
    // Let the child operators clean up if the input file has changed.
    cleanUpInputFileChanged();
  }
  int childrenDone = 0;
  for (MapOpCtx current : currentCtxs) {
    Object row = null;
    try {
      row = current.readRow(value, context);
      if (!current.forward(row)) {
        childrenDone++;
      }
    } catch (Exception e) {
      // TODO: policy on deserialization errors
      String message = toErrorMessage(value, row, current.rowObjectInspector);
      if (row == null) {
        deserialize_error_count.set(deserialize_error_count.get() + 1);
        throw new HiveException("Hive Runtime Error while processing writable " + message, e);
      }
      throw new HiveException("Hive Runtime Error while processing row " + message, e);
    }
  }
  rowsForwarded(childrenDone, 1);
}
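To make the control flow above easier to follow in isolation, here is a minimal sketch of the same pattern: each per-partition context deserializes the raw record and forwards the resulting row, exhausted children are counted rather than treated as errors, and a failure while the row is still null is booked as a deserialization error. All names in the sketch (RecordContext, readRow, forward, RowProcessorSketch) are hypothetical stand-ins, not Hive APIs.

// Illustrative sketch only; not part of Hive.
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;

interface RecordContext {
  Object readRow(byte[] raw) throws Exception;   // deserialize one record into a row
  boolean forward(Object row) throws Exception;  // false when this subtree needs no more input
}

class RowProcessorSketch {
  private final AtomicLong deserializeErrors = new AtomicLong();

  int processOne(byte[] raw, List<RecordContext> contexts) {
    int childrenDone = 0;
    for (RecordContext ctx : contexts) {
      Object row = null;
      try {
        row = ctx.readRow(raw);
        if (!ctx.forward(row)) {
          childrenDone++;  // this context's operator subtree is done, not an error
        }
      } catch (Exception e) {
        if (row == null) {
          // failure before any row was produced: count it as a deserialization error
          deserializeErrors.incrementAndGet();
          throw new RuntimeException("error while deserializing record", e);
        }
        throw new RuntimeException("error while processing row", e);
      }
    }
    return childrenDone;
  }
}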
protected boolean isInputFileChangeSensitive(ExecMapperContext mapContext) {
  // Equivalent positive form of the original negated check: the handler is sensitive to
  // input-file changes only when local work exists and explicitly requests it.
  return mapContext != null
      && mapContext.getLocalWork() != null
      && mapContext.getLocalWork().getInputFileChangeSensitive();
}
@Override
@SuppressWarnings("unchecked")
public void init(JobConf job, OutputCollector output, Reporter reporter) throws Exception {
  perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.SPARK_INIT_OPERATORS);
  super.init(job, output, reporter);

  rowObjectInspector = new ObjectInspector[Byte.MAX_VALUE];
  ObjectInspector[] valueObjectInspector = new ObjectInspector[Byte.MAX_VALUE];
  ObjectInspector keyObjectInspector;

  ReduceWork gWork = Utilities.getReduceWork(job);

  reducer = gWork.getReducer();
  vectorized = gWork.getVectorMode();
  reducer.setParentOperators(null); // clear out any parents as reducer is the root
  isTagged = gWork.getNeedsTagging();
  try {
    keyTableDesc = gWork.getKeyDesc();
    inputKeyDeserializer = ReflectionUtils.newInstance(keyTableDesc.getDeserializerClass(), null);
    SerDeUtils.initializeSerDe(inputKeyDeserializer, null, keyTableDesc.getProperties(), null);
    keyObjectInspector = inputKeyDeserializer.getObjectInspector();
    valueTableDesc = new TableDesc[gWork.getTagToValueDesc().size()];

    if (vectorized) {
      final int maxTags = gWork.getTagToValueDesc().size();
      keyStructInspector = (StructObjectInspector) keyObjectInspector;
      batches = new VectorizedRowBatch[maxTags];
      valueStructInspectors = new StructObjectInspector[maxTags];
      valueStringWriters = new List[maxTags];
      keysColumnOffset = keyStructInspector.getAllStructFieldRefs().size();
      buffer = new DataOutputBuffer();
    }

    for (int tag = 0; tag < gWork.getTagToValueDesc().size(); tag++) {
      // We should initialize the SerDe with the TypeInfo when available.
      valueTableDesc[tag] = gWork.getTagToValueDesc().get(tag);
      inputValueDeserializer[tag] =
          ReflectionUtils.newInstance(valueTableDesc[tag].getDeserializerClass(), null);
      SerDeUtils.initializeSerDe(
          inputValueDeserializer[tag], null, valueTableDesc[tag].getProperties(), null);
      valueObjectInspector[tag] = inputValueDeserializer[tag].getObjectInspector();

      ArrayList<ObjectInspector> ois = new ArrayList<ObjectInspector>();

      if (vectorized) {
        /* vectorization only works with struct object inspectors */
        valueStructInspectors[tag] = (StructObjectInspector) valueObjectInspector[tag];

        ObjectPair<VectorizedRowBatch, StandardStructObjectInspector> pair =
            VectorizedBatchUtil.constructVectorizedRowBatch(
                keyStructInspector, valueStructInspectors[tag],
                gWork.getVectorScratchColumnTypeMap());
        batches[tag] = pair.getFirst();
        final int totalColumns =
            keysColumnOffset + valueStructInspectors[tag].getAllStructFieldRefs().size();
        valueStringWriters[tag] = new ArrayList<VectorExpressionWriter>(totalColumns);
        valueStringWriters[tag].addAll(Arrays.asList(
            VectorExpressionWriterFactory.genVectorStructExpressionWritables(
                keyStructInspector)));
        valueStringWriters[tag].addAll(Arrays.asList(
            VectorExpressionWriterFactory.genVectorStructExpressionWritables(
                valueStructInspectors[tag])));

        rowObjectInspector[tag] = pair.getSecond();
      } else {
        ois.add(keyObjectInspector);
        ois.add(valueObjectInspector[tag]);
        // reducer.setGroupKeyObjectInspector(keyObjectInspector);
        rowObjectInspector[tag] = ObjectInspectorFactory.getStandardStructObjectInspector(
            Utilities.reduceFieldNameList, ois);
      }
    }
  } catch (Exception e) {
    throw new RuntimeException(e);
  }

  ExecMapperContext execContext = new ExecMapperContext(job);
  localWork = gWork.getMapRedLocalWork();
  execContext.setJc(jc);
  execContext.setLocalWork(localWork);
  reducer.passExecContext(execContext);

  reducer.setReporter(rp);
  OperatorUtils.setChildrenCollector(
      Arrays.<Operator<? extends OperatorDesc>>asList(reducer), output);

  // initialize reduce operator tree
  try {
    LOG.info(reducer.dump(0));
    reducer.initialize(jc, rowObjectInspector);

    if (localWork != null) {
      for (Operator<? extends OperatorDesc> dummyOp : localWork.getDummyParentOp()) {
        dummyOp.setExecContext(execContext);
        dummyOp.initialize(jc, null);
      }
    }
  } catch (Throwable e) {
    abort = true;
    if (e instanceof OutOfMemoryError) {
      // Don't create a new object if we are already out of memory
      throw (OutOfMemoryError) e;
    } else {
      throw new RuntimeException("Reduce operator initialization failed", e);
    }
  }
  perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.SPARK_INIT_OPERATORS);
}
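The loop in the method above follows a per-tag initialization pattern: the key descriptor is shared by every parent of the reduce side, while each tag gets its own value deserializer instantiated reflectively from its descriptor. A stripped-down sketch of just that pattern, with hypothetical types (SimpleDeserializer, TagDesc, ReduceInitSketch) standing in for Hive's Deserializer and TableDesc, might look like this:

// Illustrative sketch only; not part of Hive.
import java.util.List;
import java.util.Properties;

interface SimpleDeserializer {
  void initialize(Properties props) throws Exception;
  Object deserialize(byte[] raw) throws Exception;
}

class TagDesc {
  final Class<? extends SimpleDeserializer> deserializerClass;
  final Properties properties;

  TagDesc(Class<? extends SimpleDeserializer> cls, Properties props) {
    this.deserializerClass = cls;
    this.properties = props;
  }
}

class ReduceInitSketch {
  // One value deserializer per tag, mirroring the tag-indexed arrays built above.
  SimpleDeserializer[] initValueDeserializers(List<TagDesc> tagToValueDesc) throws Exception {
    SimpleDeserializer[] valueDeserializers = new SimpleDeserializer[tagToValueDesc.size()];
    for (int tag = 0; tag < tagToValueDesc.size(); tag++) {
      TagDesc desc = tagToValueDesc.get(tag);
      // reflective instantiation, analogous to ReflectionUtils.newInstance(...) above
      SimpleDeserializer d = desc.deserializerClass.getDeclaredConstructor().newInstance();
      d.initialize(desc.properties);
      valueDeserializers[tag] = d;
    }
    return valueDeserializers;
  }
}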
public static Object[] populateVirtualColumnValues(ExecMapperContext ctx,
    List<VirtualColumn> vcs, Object[] vcValues, Deserializer deserializer) {
  if (vcs == null) {
    return vcValues;
  }
  if (vcValues == null) {
    vcValues = new Object[vcs.size()];
  }
  for (int i = 0; i < vcs.size(); i++) {
    VirtualColumn vc = vcs.get(i);
    if (vc.equals(VirtualColumn.FILENAME)) {
      if (ctx.inputFileChanged()) {
        vcValues[i] = new Text(ctx.getCurrentInputPath().toString());
      }
    } else if (vc.equals(VirtualColumn.BLOCKOFFSET)) {
      long current = ctx.getIoCxt().getCurrentBlockStart();
      LongWritable old = (LongWritable) vcValues[i];
      if (old == null) {
        old = new LongWritable(current);
        vcValues[i] = old;
        continue;
      }
      if (current != old.get()) {
        old.set(current);
      }
    } else if (vc.equals(VirtualColumn.ROWOFFSET)) {
      long current = ctx.getIoCxt().getCurrentRow();
      LongWritable old = (LongWritable) vcValues[i];
      if (old == null) {
        old = new LongWritable(current);
        vcValues[i] = old;
        continue;
      }
      if (current != old.get()) {
        old.set(current);
      }
    } else if (vc.equals(VirtualColumn.RAWDATASIZE)) {
      long current = 0L;
      SerDeStats stats = deserializer.getSerDeStats();
      if (stats != null) {
        current = stats.getRawDataSize();
      }
      LongWritable old = (LongWritable) vcValues[i];
      if (old == null) {
        old = new LongWritable(current);
        vcValues[i] = old;
        continue;
      }
      if (current != old.get()) {
        old.set(current);
      }
    } else if (vc.equals(VirtualColumn.ROWID)) {
      if (ctx.getIoCxt().getRecordIdentifier() == null) {
        vcValues[i] = null;
      } else {
        if (vcValues[i] == null) {
          vcValues[i] = new Object[RecordIdentifier.Field.values().length];
        }
        RecordIdentifier.StructInfo.toArray(
            ctx.getIoCxt().getRecordIdentifier(), (Object[]) vcValues[i]);
        // so we don't accidentally cache the value; shouldn't happen since the IO layer
        // either knows how to produce ROW__ID or not - but to be safe
        ctx.getIoCxt().setRecordIdentifier(null);
      }
    }
  }
  return vcValues;
}
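The BLOCKOFFSET, ROWOFFSET, and RAWDATASIZE branches all repeat one idiom: the LongWritable already sitting in the slot is reused and mutated only when the value changes, so no object is allocated per row. Below is a minimal sketch of just that idiom; the helper name updateSlot is hypothetical, and only hadoop-common's LongWritable is assumed, which the code above already uses.

// Illustrative sketch only; not part of Hive.
import org.apache.hadoop.io.LongWritable;

final class WritableReuseSketch {
  // Returns the (possibly newly allocated) writable to store back into the virtual-column slot.
  static LongWritable updateSlot(LongWritable slot, long current) {
    if (slot == null) {
      return new LongWritable(current);  // first row: allocate once
    }
    if (current != slot.get()) {
      slot.set(current);                 // later rows: mutate in place, no new object
    }
    return slot;
  }
}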