static void initializeDeserializer(
    Deserializer deserializer, Configuration conf, HCatTableInfo info, HCatSchema schema)
    throws SerDeException {
  Properties props = getSerdeProperties(info, schema);
  LOG.info("Initializing " + deserializer.getClass().getName() + " with properties " + props);
  deserializer.initialize(conf, props);
}
@SuppressWarnings("deprecation") private static void initializeDeserializer(Deserializer deserializer, Properties schema) { try { deserializer.initialize(new Configuration(false), schema); } catch (SerDeException e) { throw new RuntimeException( "error initializing deserializer: " + deserializer.getClass().getName()); } }
/*
 * This is the same as the setChildren method below but for empty tables.
 * It takes care of the following:
 * 1. Create the right object inspector.
 * 2. Set up the childrenOpToOI with the object inspector.
 * So as to ensure that the initialization happens correctly.
 */
public void initEmptyInputChildren(List<Operator<?>> children, Configuration hconf)
    throws SerDeException, Exception {
  setChildOperators(children);
  for (Operator<?> child : children) {
    TableScanOperator tsOp = (TableScanOperator) child;
    StructObjectInspector soi = null;
    PartitionDesc partDesc = conf.getAliasToPartnInfo().get(tsOp.getConf().getAlias());
    Deserializer serde = partDesc.getTableDesc().getDeserializer();
    partDesc.setProperties(partDesc.getProperties());
    MapOpCtx opCtx = new MapOpCtx(tsOp.getConf().getAlias(), child, partDesc);
    StructObjectInspector tableRowOI = (StructObjectInspector) serde.getObjectInspector();
    initObjectInspector(hconf, opCtx, tableRowOI);
    soi = opCtx.rowObjectInspector;
    child.getParentOperators().add(this);
    childrenOpToOI.put(child, soi);
  }
}
public static StructObjectInspector getTableObjectInspector(
    @SuppressWarnings("deprecation") Deserializer deserializer) {
  try {
    ObjectInspector inspector = deserializer.getObjectInspector();
    checkArgument(
        inspector.getCategory() == Category.STRUCT,
        "expected STRUCT: %s",
        inspector.getCategory());
    return (StructObjectInspector) inspector;
  } catch (SerDeException e) {
    throw Throwables.propagate(e);
  }
}
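A hedged usage sketch, not from the source: once the method above has verified the inspector is a STRUCT, a caller would typically walk its fields to discover column names and types.

// Hypothetical caller: list the table's columns via the struct inspector returned above.
StructObjectInspector rowInspector = getTableObjectInspector(deserializer);
for (StructField field : rowInspector.getAllStructFieldRefs()) {
  System.out.println(field.getFieldName() + ": " + field.getFieldObjectInspector().getTypeName());
}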
private Object readRow(Writable value, ExecMapperContext context) throws SerDeException {
  Object deserialized = deserializer.deserialize(value);
  Object row = partTblObjectInspectorConverter.convert(deserialized);
  if (hasVC()) {
    rowWithPartAndVC[0] = row;
    if (context != null) {
      populateVirtualColumnValues(context, vcs, vcValues, deserializer);
    }
    int vcPos = isPartitioned() ? 2 : 1;
    rowWithPartAndVC[vcPos] = vcValues;
    return rowWithPartAndVC;
  } else if (isPartitioned()) {
    rowWithPart[0] = row;
    return rowWithPart;
  }
  return row;
}
@Override
public boolean advanceNextPosition() {
  try {
    if (closed || !recordReader.next(key, value)) {
      close();
      return false;
    }

    // reset loaded flags
    // partition keys are already loaded, but everything else is not
    System.arraycopy(isPartitionColumn, 0, loaded, 0, isPartitionColumn.length);

    // decode value
    rowData = deserializer.deserialize(value);

    return true;
  } catch (IOException | SerDeException | RuntimeException e) {
    closeWithSuppression(e);
    throw new PrestoException(HIVE_CURSOR_ERROR.toErrorCode(), e);
  }
}
/**
 * Traverse all the partitions for a table and get the OI for the table. Note that a conversion
 * is required if any partition OI is different from the table OI. For example, if the query
 * references table T (partitions P1, P2), and P1's schema is the same as T's whereas P2's schema
 * differs from T's, conversion might be needed for both P1 and P2, since a SettableOI might be
 * needed for T.
 */
private Map<TableDesc, StructObjectInspector> getConvertedOI(Configuration hconf)
    throws HiveException {
  Map<TableDesc, StructObjectInspector> tableDescOI =
      new HashMap<TableDesc, StructObjectInspector>();
  Set<TableDesc> identityConverterTableDesc = new HashSet<TableDesc>();
  try {
    Map<ObjectInspector, Boolean> oiSettableProperties = new HashMap<ObjectInspector, Boolean>();

    for (String onefile : conf.getPathToAliases().keySet()) {
      PartitionDesc pd = conf.getPathToPartitionInfo().get(onefile);
      TableDesc tableDesc = pd.getTableDesc();
      Deserializer partDeserializer = pd.getDeserializer(hconf);

      StructObjectInspector partRawRowObjectInspector;
      if (Utilities.isInputFileFormatSelfDescribing(pd)) {
        Deserializer tblDeserializer = tableDesc.getDeserializer(hconf);
        partRawRowObjectInspector = (StructObjectInspector) tblDeserializer.getObjectInspector();
      } else {
        partRawRowObjectInspector = (StructObjectInspector) partDeserializer.getObjectInspector();
      }

      StructObjectInspector tblRawRowObjectInspector = tableDescOI.get(tableDesc);
      if ((tblRawRowObjectInspector == null)
          || (identityConverterTableDesc.contains(tableDesc))) {
        Deserializer tblDeserializer = tableDesc.getDeserializer(hconf);
        tblRawRowObjectInspector =
            (StructObjectInspector) ObjectInspectorConverters.getConvertedOI(
                partRawRowObjectInspector,
                tblDeserializer.getObjectInspector(),
                oiSettableProperties);

        if (identityConverterTableDesc.contains(tableDesc)) {
          if (!partRawRowObjectInspector.equals(tblRawRowObjectInspector)) {
            identityConverterTableDesc.remove(tableDesc);
          }
        } else if (partRawRowObjectInspector.equals(tblRawRowObjectInspector)) {
          identityConverterTableDesc.add(tableDesc);
        }

        tableDescOI.put(tableDesc, tblRawRowObjectInspector);
      }
    }
  } catch (Exception e) {
    throw new HiveException(e);
  }
  return tableDescOI;
}
@Override
public <E> void processRow(Object key, Iterator<E> values) throws IOException {
  if (reducer.getDone()) {
    return;
  }
  try {
    BytesWritable keyWritable = (BytesWritable) key;
    byte tag = 0;
    if (isTagged) {
      // remove the tag from the key coming out of the reducer
      // and store it in a separate variable
      int size = keyWritable.getSize() - 1;
      tag = keyWritable.get()[size];
      keyWritable.setSize(size);
    }

    if (!keyWritable.equals(groupKey)) {
      // If an operator wants to do some work at the beginning of a group
      if (groupKey == null) {
        // the first group
        groupKey = new BytesWritable();
      } else {
        // If an operator wants to do some work at the end of a group
        LOG.trace("End Group");
        reducer.endGroup();
      }

      try {
        keyObject = inputKeyDeserializer.deserialize(keyWritable);
      } catch (Exception e) {
        throw new HiveException(
            "Hive Runtime Error: Unable to deserialize reduce input key from "
                + Utilities.formatBinaryString(keyWritable.get(), 0, keyWritable.getSize())
                + " with properties " + keyTableDesc.getProperties(),
            e);
      }

      groupKey.set(keyWritable.get(), 0, keyWritable.getSize());
      LOG.trace("Start Group");
      reducer.setGroupKeyObject(keyObject);
      reducer.startGroup();
    }
    /* this.keyObject passed via reference */
    if (vectorized) {
      processVectors(values, tag);
    } else {
      processKeyValues(values, tag);
    }
  } catch (Throwable e) {
    abort = true;
    Utilities.setReduceWork(jc, null);
    if (e instanceof OutOfMemoryError) {
      // Don't create a new object if we are already out of memory
      throw (OutOfMemoryError) e;
    } else {
      String msg = "Fatal error: " + e;
      LOG.fatal(msg, e);
      throw new RuntimeException(e);
    }
  }
}
@Override @SuppressWarnings("unchecked") public void init(JobConf job, OutputCollector output, Reporter reporter) throws Exception { perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.SPARK_INIT_OPERATORS); super.init(job, output, reporter); rowObjectInspector = new ObjectInspector[Byte.MAX_VALUE]; ObjectInspector[] valueObjectInspector = new ObjectInspector[Byte.MAX_VALUE]; ObjectInspector keyObjectInspector; ReduceWork gWork = Utilities.getReduceWork(job); reducer = gWork.getReducer(); vectorized = gWork.getVectorMode(); reducer.setParentOperators(null); // clear out any parents as reducer is the // root isTagged = gWork.getNeedsTagging(); try { keyTableDesc = gWork.getKeyDesc(); inputKeyDeserializer = ReflectionUtils.newInstance(keyTableDesc.getDeserializerClass(), null); SerDeUtils.initializeSerDe(inputKeyDeserializer, null, keyTableDesc.getProperties(), null); keyObjectInspector = inputKeyDeserializer.getObjectInspector(); valueTableDesc = new TableDesc[gWork.getTagToValueDesc().size()]; if (vectorized) { final int maxTags = gWork.getTagToValueDesc().size(); keyStructInspector = (StructObjectInspector) keyObjectInspector; batches = new VectorizedRowBatch[maxTags]; valueStructInspectors = new StructObjectInspector[maxTags]; valueStringWriters = new List[maxTags]; keysColumnOffset = keyStructInspector.getAllStructFieldRefs().size(); buffer = new DataOutputBuffer(); } for (int tag = 0; tag < gWork.getTagToValueDesc().size(); tag++) { // We should initialize the SerDe with the TypeInfo when available. valueTableDesc[tag] = gWork.getTagToValueDesc().get(tag); inputValueDeserializer[tag] = ReflectionUtils.newInstance(valueTableDesc[tag].getDeserializerClass(), null); SerDeUtils.initializeSerDe( inputValueDeserializer[tag], null, valueTableDesc[tag].getProperties(), null); valueObjectInspector[tag] = inputValueDeserializer[tag].getObjectInspector(); ArrayList<ObjectInspector> ois = new ArrayList<ObjectInspector>(); if (vectorized) { /* vectorization only works with struct object inspectors */ valueStructInspectors[tag] = (StructObjectInspector) valueObjectInspector[tag]; ObjectPair<VectorizedRowBatch, StandardStructObjectInspector> pair = VectorizedBatchUtil.constructVectorizedRowBatch( keyStructInspector, valueStructInspectors[tag], gWork.getVectorScratchColumnTypeMap()); batches[tag] = pair.getFirst(); final int totalColumns = keysColumnOffset + valueStructInspectors[tag].getAllStructFieldRefs().size(); valueStringWriters[tag] = new ArrayList<VectorExpressionWriter>(totalColumns); valueStringWriters[tag].addAll( Arrays.asList( VectorExpressionWriterFactory.genVectorStructExpressionWritables( keyStructInspector))); valueStringWriters[tag].addAll( Arrays.asList( VectorExpressionWriterFactory.genVectorStructExpressionWritables( valueStructInspectors[tag]))); rowObjectInspector[tag] = pair.getSecond(); } else { ois.add(keyObjectInspector); ois.add(valueObjectInspector[tag]); // reducer.setGroupKeyObjectInspector(keyObjectInspector); rowObjectInspector[tag] = ObjectInspectorFactory.getStandardStructObjectInspector( Utilities.reduceFieldNameList, ois); } } } catch (Exception e) { throw new RuntimeException(e); } ExecMapperContext execContext = new ExecMapperContext(job); localWork = gWork.getMapRedLocalWork(); execContext.setJc(jc); execContext.setLocalWork(localWork); reducer.passExecContext(execContext); reducer.setReporter(rp); OperatorUtils.setChildrenCollector( Arrays.<Operator<? 
extends OperatorDesc>>asList(reducer), output); // initialize reduce operator tree try { LOG.info(reducer.dump(0)); reducer.initialize(jc, rowObjectInspector); if (localWork != null) { for (Operator<? extends OperatorDesc> dummyOp : localWork.getDummyParentOp()) { dummyOp.setExecContext(execContext); dummyOp.initialize(jc, null); } } } catch (Throwable e) { abort = true; if (e instanceof OutOfMemoryError) { // Don't create a new object if we are already out of memory throw (OutOfMemoryError) e; } else { throw new RuntimeException("Reduce operator initialization failed", e); } } perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.SPARK_INIT_OPERATORS); }
public static Object[] populateVirtualColumnValues(
    ExecMapperContext ctx, List<VirtualColumn> vcs, Object[] vcValues, Deserializer deserializer) {
  if (vcs == null) {
    return vcValues;
  }
  if (vcValues == null) {
    vcValues = new Object[vcs.size()];
  }
  for (int i = 0; i < vcs.size(); i++) {
    VirtualColumn vc = vcs.get(i);
    if (vc.equals(VirtualColumn.FILENAME)) {
      if (ctx.inputFileChanged()) {
        vcValues[i] = new Text(ctx.getCurrentInputPath().toString());
      }
    } else if (vc.equals(VirtualColumn.BLOCKOFFSET)) {
      long current = ctx.getIoCxt().getCurrentBlockStart();
      LongWritable old = (LongWritable) vcValues[i];
      if (old == null) {
        old = new LongWritable(current);
        vcValues[i] = old;
        continue;
      }
      if (current != old.get()) {
        old.set(current);
      }
    } else if (vc.equals(VirtualColumn.ROWOFFSET)) {
      long current = ctx.getIoCxt().getCurrentRow();
      LongWritable old = (LongWritable) vcValues[i];
      if (old == null) {
        old = new LongWritable(current);
        vcValues[i] = old;
        continue;
      }
      if (current != old.get()) {
        old.set(current);
      }
    } else if (vc.equals(VirtualColumn.RAWDATASIZE)) {
      long current = 0L;
      SerDeStats stats = deserializer.getSerDeStats();
      if (stats != null) {
        current = stats.getRawDataSize();
      }
      LongWritable old = (LongWritable) vcValues[i];
      if (old == null) {
        old = new LongWritable(current);
        vcValues[i] = old;
        continue;
      }
      if (current != old.get()) {
        old.set(current);
      }
    } else if (vc.equals(VirtualColumn.ROWID)) {
      if (ctx.getIoCxt().getRecordIdentifier() == null) {
        vcValues[i] = null;
      } else {
        if (vcValues[i] == null) {
          vcValues[i] = new Object[RecordIdentifier.Field.values().length];
        }
        RecordIdentifier.StructInfo.toArray(
            ctx.getIoCxt().getRecordIdentifier(), (Object[]) vcValues[i]);
        // Clear the record identifier so we don't accidentally cache the value; this shouldn't
        // happen since the IO layer either knows how to produce ROW__ID or not, but to be safe.
        ctx.getIoCxt().setRecordIdentifier(null);
      }
    }
  }
  return vcValues;
}