public void testRW() throws Exception {
  Configuration conf = new Configuration();

  for (Pair<Properties, HCatRecord> e : getData()) {
    Properties tblProps = e.first;
    HCatRecord r = e.second;

    HCatRecordSerDe hrsd = new HCatRecordSerDe();
    SerDeUtils.initializeSerDe(hrsd, conf, tblProps, null);
    JsonSerDe jsde = new JsonSerDe();
    SerDeUtils.initializeSerDe(jsde, conf, tblProps, null);

    LOG.info("ORIG:{}", r);

    Writable s = hrsd.serialize(r, hrsd.getObjectInspector());
    LOG.info("ONE:{}", s);

    Object o1 = hrsd.deserialize(s);
    StringBuilder msg = new StringBuilder();
    boolean isEqual = HCatDataCheckUtil.recordsEqual(r, (HCatRecord) o1);
    assertTrue(msg.toString(), isEqual);

    Writable s2 = jsde.serialize(o1, hrsd.getObjectInspector());
    LOG.info("TWO:{}", s2);
    Object o2 = jsde.deserialize(s2);
    LOG.info("deserialized TWO : {} ", o2);

    msg.setLength(0);
    isEqual = HCatDataCheckUtil.recordsEqual(r, (HCatRecord) o2, msg);
    assertTrue(msg.toString(), isEqual);
  }
}
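// The Pair-based tests in this listing iterate a getData() fixture that is not shown here.
// A minimal sketch of what such a fixture could look like, assuming Pair exposes public
// first/second fields and that column names/types are wired through the standard serde
// properties (the column layout and values below are hypothetical, for illustration only):
private List<Pair<Properties, HCatRecord>> getData() {
  List<Pair<Properties, HCatRecord>> data = new ArrayList<Pair<Properties, HCatRecord>>();

  Properties props = new Properties();
  props.put(serdeConstants.LIST_COLUMNS, "ti,si,i,bi,d,s");
  props.put(serdeConstants.LIST_COLUMN_TYPES, "tinyint,smallint,int,bigint,double,string");

  List<Object> fields = new ArrayList<Object>(
      Arrays.<Object>asList((byte) 10, (short) 20, 30, 40L, 5.0d, "hcat"));
  data.add(new Pair<Properties, HCatRecord>(props, new DefaultHCatRecord(fields)));
  return data;
}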
public void generateMapMetaData() throws HiveException {
  // generate the meta data for key
  // index for key is -1
  try {
    TableDesc keyTableDesc = conf.getKeyTblDesc();
    SerDe keySerializer =
        (SerDe) ReflectionUtil.newInstance(keyTableDesc.getDeserializerClass(), null);
    SerDeUtils.initializeSerDe(keySerializer, null, keyTableDesc.getProperties(), null);
    MapJoinObjectSerDeContext keyContext = new MapJoinObjectSerDeContext(keySerializer, false);
    for (int pos = 0; pos < order.length; pos++) {
      if (pos == posBigTable) {
        continue;
      }
      TableDesc valueTableDesc;
      if (conf.getNoOuterJoin()) {
        valueTableDesc = conf.getValueTblDescs().get(pos);
      } else {
        valueTableDesc = conf.getValueFilteredTblDescs().get(pos);
      }
      SerDe valueSerDe =
          (SerDe) ReflectionUtil.newInstance(valueTableDesc.getDeserializerClass(), null);
      SerDeUtils.initializeSerDe(valueSerDe, null, valueTableDesc.getProperties(), null);
      MapJoinObjectSerDeContext valueContext =
          new MapJoinObjectSerDeContext(valueSerDe, hasFilter(pos));
      mapJoinTableSerdes[pos] = new MapJoinTableContainerSerDe(keyContext, valueContext);
    }
  } catch (SerDeException e) {
    throw new HiveException(e);
  }
}
static {
  StackTraceElement[] sTrace = new Exception().getStackTrace();
  String className = sTrace[0].getClassName();
  try {
    SerDeUtils.registerSerDe(shortName(), Class.forName(className));
    // For backward compatibility: this class replaces the following class.
    SerDeUtils.registerSerDe("org.apache.hadoop.hive.serde.TestSerDe", Class.forName(className));
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
public void testRW() throws Exception {
  Configuration conf = new Configuration();

  for (Entry<Properties, HCatRecord> e : getData().entrySet()) {
    Properties tblProps = e.getKey();
    HCatRecord r = e.getValue();

    HCatRecordSerDe hrsd = new HCatRecordSerDe();
    SerDeUtils.initializeSerDe(hrsd, conf, tblProps, null);

    LOG.info("ORIG: {}", r);

    Writable s = hrsd.serialize(r, hrsd.getObjectInspector());
    LOG.info("ONE: {}", s);

    HCatRecord r2 = (HCatRecord) hrsd.deserialize(s);
    Assert.assertTrue(HCatDataCheckUtil.recordsEqual(r, r2));

    // If it went through correctly, then s is also a HCatRecord,
    // and also equal to the above, and a deepcopy, and this holds
    // through for multiple levels more of serialization as well.

    Writable s2 = hrsd.serialize(s, hrsd.getObjectInspector());
    LOG.info("TWO: {}", s2);
    Assert.assertTrue(HCatDataCheckUtil.recordsEqual(r, (HCatRecord) s));
    Assert.assertTrue(HCatDataCheckUtil.recordsEqual(r, (HCatRecord) s2));

    // serialize using another serde, and read out that object repr.
    LazySimpleSerDe testSD = new LazySimpleSerDe();
    SerDeUtils.initializeSerDe(testSD, conf, tblProps, null);

    Writable s3 = testSD.serialize(s, hrsd.getObjectInspector());
    LOG.info("THREE: {}", s3);
    Object o3 = testSD.deserialize(s3);
    Assert.assertFalse(r.getClass().equals(o3.getClass()));

    // then serialize again using hrsd, and compare results
    HCatRecord s4 = (HCatRecord) hrsd.serialize(o3, testSD.getObjectInspector());
    LOG.info("FOUR: {}", s4);

    // Test LazyHCatRecord init and read
    LazyHCatRecord s5 = new LazyHCatRecord(o3, testSD.getObjectInspector());
    LOG.info("FIVE: {}", s5);

    LazyHCatRecord s6 = new LazyHCatRecord(s4, hrsd.getObjectInspector());
    LOG.info("SIX: {}", s6);
  }
}
public Writable serialize(Object obj, ObjectInspector objInspector) throws SerDeException {
  if (objInspector.getCategory() != Category.STRUCT) {
    throw new SerDeException(
        getClass().toString()
            + " can only serialize struct types, but we got: "
            + objInspector.getTypeName());
  }
  StructObjectInspector soi = (StructObjectInspector) objInspector;
  List<? extends StructField> fields = soi.getAllStructFieldRefs();

  StringBuilder sb = new StringBuilder();
  for (int i = 0; i < fields.size(); i++) {
    if (i > 0) {
      sb.append(separator);
    }
    Object column = soi.getStructFieldData(obj, fields.get(i));
    if (fields.get(i).getFieldObjectInspector().getCategory() == Category.PRIMITIVE) {
      // For primitive object, serialize to plain string
      sb.append(column == null ? nullString : column.toString());
    } else {
      // For complex object, serialize to JSON format
      sb.append(SerDeUtils.getJSONString(column, fields.get(i).getFieldObjectInspector()));
    }
  }
  serializeCache.set(sb.toString());
  return serializeCache;
}
public void testMapValues() throws Exception {
  Configuration conf = new Configuration();
  Properties props = new Properties();

  props.put(serdeConstants.LIST_COLUMNS, "a,b");
  props.put(serdeConstants.LIST_COLUMN_TYPES, "array<string>,map<string,int>");
  JsonSerDe rjsd = new JsonSerDe();
  SerDeUtils.initializeSerDe(rjsd, conf, props, null);

  Text text1 = new Text("{ \"a\":[\"aaa\"],\"b\":{\"bbb\":1}} ");
  Text text2 = new Text("{\"a\":[\"yyy\"],\"b\":{\"zzz\":123}}");
  Text text3 = new Text("{\"a\":[\"a\"],\"b\":{\"x\":11, \"y\": 22, \"z\": null}}");

  HCatRecord expected1 = new DefaultHCatRecord(
      Arrays.<Object>asList(
          Arrays.<String>asList("aaa"),
          createHashMapStringInteger("bbb", 1)));
  HCatRecord expected2 = new DefaultHCatRecord(
      Arrays.<Object>asList(
          Arrays.<String>asList("yyy"),
          createHashMapStringInteger("zzz", 123)));
  HCatRecord expected3 = new DefaultHCatRecord(
      Arrays.<Object>asList(
          Arrays.<String>asList("a"),
          createHashMapStringInteger("x", 11, "y", 22, "z", null)));

  assertTrue(HCatDataCheckUtil.recordsEqual((HCatRecord) rjsd.deserialize(text1), expected1));
  assertTrue(HCatDataCheckUtil.recordsEqual((HCatRecord) rjsd.deserialize(text2), expected2));
  // text3/expected3 were built above but never checked; complete the test.
  assertTrue(HCatDataCheckUtil.recordsEqual((HCatRecord) rjsd.deserialize(text3), expected3));
}
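// The map-valued expectations above rely on a small varargs helper that is not shown in this
// snippet. A minimal sketch, assuming it simply pairs up alternating key/value arguments (the
// name and shape are taken from the test's usage, not from a confirmed source):
private static HashMap<String, Integer> createHashMapStringInteger(Object... vals) {
  assertTrue(vals.length % 2 == 0);
  HashMap<String, Integer> retval = new HashMap<String, Integer>();
  for (int idx = 0; idx < vals.length; idx += 2) {
    retval.put((String) vals[idx], (Integer) vals[idx + 1]);
  }
  return retval;
}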
/**
 * This test checks that our json deserialization is not too strict, as per HIVE-6166.
 *
 * <p>i.e., if our schema is "s:struct<a:int,b:string>,k:int", and we pass in data that looks
 * like: { "x" : "abc" , "t" : { "a" : "1", "b" : "2", "c" : [ { "x" : 2 , "y" : 3 } , { "x" : 3 ,
 * "y" : 2 } ] } , "s" : { "a" : 2 , "b" : "blah", "c": "woo" } }
 *
 * <p>Then it should still work, ignore the "x" and "t" fields and the "c" subfield of "s", and
 * read k as null.
 */
public void testLooseJsonReadability() throws Exception {
  Configuration conf = new Configuration();
  Properties props = new Properties();

  props.put(serdeConstants.LIST_COLUMNS, "s,k");
  props.put(serdeConstants.LIST_COLUMN_TYPES, "struct<a:int,b:string>,int");
  JsonSerDe rjsd = new JsonSerDe();
  SerDeUtils.initializeSerDe(rjsd, conf, props, null);

  Text jsonText = new Text(
      "{ \"x\" : \"abc\" , "
          + " \"t\" : { \"a\":\"1\", \"b\":\"2\", \"c\":[ { \"x\":2 , \"y\":3 } , { \"x\":3 , \"y\":2 }] } ,"
          + "\"s\" : { \"a\" : 2 , \"b\" : \"blah\", \"c\": \"woo\" } }");

  List<Object> expected = new ArrayList<Object>();
  List<Object> inner = new ArrayList<Object>();
  inner.add(2);
  inner.add("blah");
  expected.add(inner);
  expected.add(null);
  HCatRecord expectedRecord = new DefaultHCatRecord(expected);

  HCatRecord r = (HCatRecord) rjsd.deserialize(jsonText);
  System.err.println("record : " + r.toString());

  assertTrue(HCatDataCheckUtil.recordsEqual(r, expectedRecord));
}
private RowSet decodeFromString(List<Object> rows, RowSet rowSet)
    throws SQLException, SerDeException {
  getSerDe();
  StructObjectInspector soi = (StructObjectInspector) serde.getObjectInspector();
  List<? extends StructField> fieldRefs = soi.getAllStructFieldRefs();

  Object[] deserializedFields = new Object[fieldRefs.size()];
  Object rowObj;
  ObjectInspector fieldOI;

  int protocol = getProtocolVersion().getValue();
  for (Object rowString : rows) {
    try {
      rowObj = serde.deserialize(new BytesWritable(((String) rowString).getBytes("UTF-8")));
    } catch (UnsupportedEncodingException e) {
      throw new SerDeException(e);
    }
    for (int i = 0; i < fieldRefs.size(); i++) {
      StructField fieldRef = fieldRefs.get(i);
      fieldOI = fieldRef.getFieldObjectInspector();
      Object fieldData = soi.getStructFieldData(rowObj, fieldRef);
      deserializedFields[i] = SerDeUtils.toThriftPayload(fieldData, fieldOI, protocol);
    }
    rowSet.addRow(deserializedFields);
  }
  return rowSet;
}
/**
 * This test has been added to account for HCATALOG-436. We write out columns with "internal
 * column names" such as "_col0", but try to read with regular column names.
 */
public void testRobustRead() throws Exception {
  Configuration conf = new Configuration();

  for (Pair<Properties, HCatRecord> e : getData()) {
    Properties tblProps = e.first;
    HCatRecord r = e.second;

    Properties internalTblProps = new Properties();
    for (Map.Entry<Object, Object> pe : tblProps.entrySet()) {
      if (!pe.getKey().equals(serdeConstants.LIST_COLUMNS)) {
        internalTblProps.put(pe.getKey(), pe.getValue());
      } else {
        internalTblProps.put(pe.getKey(), getInternalNames((String) pe.getValue()));
      }
    }

    LOG.info("orig tbl props:{}", tblProps);
    LOG.info("modif tbl props:{}", internalTblProps);

    JsonSerDe wjsd = new JsonSerDe();
    SerDeUtils.initializeSerDe(wjsd, conf, internalTblProps, null);

    JsonSerDe rjsd = new JsonSerDe();
    SerDeUtils.initializeSerDe(rjsd, conf, tblProps, null);

    LOG.info("ORIG:{}", r);

    Writable s = wjsd.serialize(r, wjsd.getObjectInspector());
    LOG.info("ONE:{}", s);

    Object o1 = wjsd.deserialize(s);
    LOG.info("deserialized ONE : {} ", o1);

    Object o2 = rjsd.deserialize(s);
    LOG.info("deserialized TWO : {} ", o2);

    StringBuilder msg = new StringBuilder();
    boolean isEqual = HCatDataCheckUtil.recordsEqual(r, (HCatRecord) o2, msg);
    assertTrue(msg.toString(), isEqual);
  }
}
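// The read-vs-write schema mismatch above depends on a helper that rewrites a comma-separated
// column list into Hive's internal "_colN" names. A minimal sketch, assuming that is all it
// does (the helper is not part of this snippet, so its exact behavior is an assumption):
private static String getInternalNames(String columnNames) {
  if (columnNames == null) {
    return null;
  }
  if (columnNames.isEmpty()) {
    return "";
  }
  StringBuilder internalNames = new StringBuilder("_col0");
  int numColumns = columnNames.split(",").length;
  for (int i = 1; i < numColumns; i++) {
    internalNames.append(",_col").append(i);
  }
  return internalNames.toString();
}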
private String toErrorMessage(Writable value, Object row, ObjectInspector inspector) {
  try {
    if (row != null) {
      return SerDeUtils.getJSONString(row, inspector);
    }
    return String.valueOf(value);
  } catch (Exception e) {
    return "[Error getting row data with exception " + StringUtils.stringifyException(e) + " ]";
  }
}
private ParquetHiveRecord getParquetWritable(
    String columnNames, String columnTypes, ArrayWritable record) throws SerDeException {
  Properties recordProperties = new Properties();
  recordProperties.setProperty("columns", columnNames);
  recordProperties.setProperty("columns.types", columnTypes);

  ParquetHiveSerDe serDe = new ParquetHiveSerDe();
  SerDeUtils.initializeSerDe(serDe, new Configuration(), recordProperties, null);

  return new ParquetHiveRecord(
      serDe.deserialize(record), getObjectInspector(columnNames, columnTypes));
}
/**
 * Convert a LazyObject to a standard Java object in compliance with JDBC 3.0 (see JDBC 3.0
 * Specification, Table B-3: Mapping from JDBC Types to Java Object Types).
 *
 * <p>This method is kept consistent with {@link HiveResultSetMetaData#hiveTypeToSqlType}.
 */
private static Object convertLazyToJava(Object o, ObjectInspector oi) {
  Object obj = ObjectInspectorUtils.copyToStandardObject(o, oi, ObjectInspectorCopyOption.JAVA);

  if (obj == null) {
    return null;
  }
  if (oi.getTypeName().equals(serdeConstants.BINARY_TYPE_NAME)) {
    return new String((byte[]) obj);
  }
  // for now, expose non-primitive as a string
  // TODO: expose non-primitive as a structured object while maintaining JDBC compliance
  if (oi.getCategory() != ObjectInspector.Category.PRIMITIVE) {
    return SerDeUtils.getJSONString(o, oi);
  }
  return obj;
}
protected void initialize(ShapeDetails shp, StructObjectInspector OI) throws HiveException {
  String serdeClassName = shp.getSerdeClassName();
  Properties serDeProps = new Properties();
  Map<String, String> serdePropsMap = new LinkedHashMap<String, String>();
  addOIPropertiestoSerDePropsMap(OI, serdePropsMap);
  for (String serdeName : serdePropsMap.keySet()) {
    serDeProps.setProperty(serdeName, serdePropsMap.get(serdeName));
  }
  try {
    SerDe serDe = (SerDe) SerDeUtils.lookupDeserializer(serdeClassName);
    serDe.initialize(hConf, serDeProps);
    shp.setSerde(serDe);
    shp.setOI((StructObjectInspector) serDe.getObjectInspector());
  } catch (SerDeException se) {
    throw new HiveException(se);
  }
}
public void testUpperCaseKey() throws Exception {
  Configuration conf = new Configuration();
  Properties props = new Properties();

  props.put(serdeConstants.LIST_COLUMNS, "empid,name");
  props.put(serdeConstants.LIST_COLUMN_TYPES, "int,string");
  JsonSerDe rjsd = new JsonSerDe();
  SerDeUtils.initializeSerDe(rjsd, conf, props, null);

  Text text1 = new Text("{ \"empId\" : 123, \"name\" : \"John\" } ");
  Text text2 = new Text("{ \"empId\" : 456, \"name\" : \"Jane\" } ");
  HCatRecord expected1 = new DefaultHCatRecord(Arrays.<Object>asList(123, "John"));
  HCatRecord expected2 = new DefaultHCatRecord(Arrays.<Object>asList(456, "Jane"));

  assertTrue(HCatDataCheckUtil.recordsEqual((HCatRecord) rjsd.deserialize(text1), expected1));
  assertTrue(HCatDataCheckUtil.recordsEqual((HCatRecord) rjsd.deserialize(text2), expected2));
}
/**
 * Convert an Object to a standard Java object in compliance with JDBC 3.0 (see JDBC 3.0
 * Specification, Table B-3: Mapping from JDBC Types to Java Object Types).
 *
 * <p>This method is kept consistent with {@link HiveResultSetMetaData#hiveTypeToSqlType}.
 */
public static Object toThriftPayload(Object val, ObjectInspector valOI, int version) {
  if (valOI.getCategory() == ObjectInspector.Category.PRIMITIVE) {
    if (val == null) {
      return null;
    }
    Object obj = ObjectInspectorUtils.copyToStandardObject(
        val, valOI, ObjectInspectorUtils.ObjectInspectorCopyOption.JAVA);
    // uses string type for binary before HIVE_CLI_SERVICE_PROTOCOL_V6
    if (version < 5
        && ((PrimitiveObjectInspector) valOI).getPrimitiveCategory()
            == PrimitiveObjectInspector.PrimitiveCategory.BINARY) {
      // todo HIVE-5269
      return new String((byte[]) obj);
    }
    return obj;
  }
  // for now, expose non-primitive as a string
  // TODO: expose non-primitive as a structured object while maintaining JDBC compliance
  return SerDeUtils.getJSONString(val, valOI);
}
private SerDe getSerDe() throws SQLException {
  if (serde != null) {
    return serde;
  }
  try {
    List<FieldSchema> fieldSchemas = mResultSchema.getFieldSchemas();
    StringBuilder namesSb = new StringBuilder();
    StringBuilder typesSb = new StringBuilder();

    if (fieldSchemas != null && !fieldSchemas.isEmpty()) {
      for (int pos = 0; pos < fieldSchemas.size(); pos++) {
        if (pos != 0) {
          namesSb.append(",");
          typesSb.append(",");
        }
        namesSb.append(fieldSchemas.get(pos).getName());
        typesSb.append(fieldSchemas.get(pos).getType());
      }
    }
    String names = namesSb.toString();
    String types = typesSb.toString();

    serde = new LazySimpleSerDe();
    Properties props = new Properties();
    if (names.length() > 0) {
      LOG.debug("Column names: " + names);
      props.setProperty(serdeConstants.LIST_COLUMNS, names);
    }
    if (types.length() > 0) {
      LOG.debug("Column types: " + types);
      props.setProperty(serdeConstants.LIST_COLUMN_TYPES, types);
    }
    SerDeUtils.initializeSerDe(serde, new HiveConf(), props, null);
  } catch (Exception ex) {
    ex.printStackTrace();
    throw new SQLException("Could not create ResultSet: " + ex.getMessage(), ex);
  }
  return serde;
}
/**
 * @param values the values for the current reduce key, to be deserialized and fed to the reducer
 * @return true if it is not done and can take more inputs
 */
private <E> boolean processKeyValues(Iterator<E> values, byte tag) throws HiveException {
  while (values.hasNext()) {
    BytesWritable valueWritable = (BytesWritable) values.next();
    try {
      valueObject[tag] = inputValueDeserializer[tag].deserialize(valueWritable);
    } catch (SerDeException e) {
      throw new HiveException(
          "Hive Runtime Error: Unable to deserialize reduce input value (tag="
              + tag
              + ") from "
              + Utilities.formatBinaryString(valueWritable.get(), 0, valueWritable.getSize())
              + " with properties "
              + valueTableDesc[tag].getProperties(),
          e);
    }
    row.clear();
    row.add(keyObject);
    row.add(valueObject[tag]);

    if (isLogInfoEnabled) {
      logMemoryInfo();
    }

    try {
      reducer.process(row, tag);
    } catch (Exception e) {
      String rowString = null;
      try {
        rowString = SerDeUtils.getJSONString(row, rowObjectInspector[tag]);
      } catch (Exception e2) {
        rowString =
            "[Error getting row data with exception " + StringUtils.stringifyException(e2) + " ]";
      }
      throw new HiveException("Error while processing row (tag=" + tag + ") " + rowString, e);
    }
  }
  return true; // give me more
}
private MapOpCtx initObjectInspector(
    Configuration hconf, MapOpCtx opCtx, StructObjectInspector tableRowOI) throws Exception {
  PartitionDesc pd = opCtx.partDesc;
  TableDesc td = pd.getTableDesc();

  // Use table properties in case of unpartitioned tables,
  // and the union of table properties and partition properties, with partition
  // taking precedence, in the case of partitioned tables
  Properties overlayedProps =
      SerDeUtils.createOverlayedProperties(td.getProperties(), pd.getProperties());

  Map<String, String> partSpec = pd.getPartSpec();

  opCtx.tableName = String.valueOf(overlayedProps.getProperty("name"));
  opCtx.partName = String.valueOf(partSpec);
  opCtx.deserializer = pd.getDeserializer(hconf);

  StructObjectInspector partRawRowObjectInspector;
  if (Utilities.isInputFileFormatSelfDescribing(pd)) {
    partRawRowObjectInspector = tableRowOI;
  } else {
    partRawRowObjectInspector = (StructObjectInspector) opCtx.deserializer.getObjectInspector();
  }

  opCtx.partTblObjectInspectorConverter =
      ObjectInspectorConverters.getConverter(partRawRowObjectInspector, tableRowOI);

  // Next check if this table has partitions and if so
  // get the list of partition names as well as allocate
  // the serdes for the partition columns
  String pcols = overlayedProps.getProperty(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS);

  if (pcols != null && pcols.length() > 0) {
    String[] partKeys = pcols.trim().split("/");
    String pcolTypes =
        overlayedProps.getProperty(hive_metastoreConstants.META_TABLE_PARTITION_COLUMN_TYPES);
    String[] partKeyTypes = pcolTypes.trim().split(":");

    if (partKeys.length > partKeyTypes.length) {
      throw new HiveException(
          "Internal error : partKeys length, "
              + partKeys.length
              + " greater than partKeyTypes length, "
              + partKeyTypes.length);
    }

    List<String> partNames = new ArrayList<String>(partKeys.length);
    Object[] partValues = new Object[partKeys.length];
    List<ObjectInspector> partObjectInspectors = new ArrayList<ObjectInspector>(partKeys.length);

    for (int i = 0; i < partKeys.length; i++) {
      String key = partKeys[i];
      partNames.add(key);
      ObjectInspector oi =
          PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(
              TypeInfoFactory.getPrimitiveTypeInfo(partKeyTypes[i]));

      // Partitions do not exist for this table
      if (partSpec == null) {
        // for partitionless table, initialize partValue to null
        partValues[i] = null;
      } else {
        partValues[i] =
            ObjectInspectorConverters.getConverter(
                    PrimitiveObjectInspectorFactory.javaStringObjectInspector, oi)
                .convert(partSpec.get(key));
      }
      partObjectInspectors.add(oi);
    }
    opCtx.rowWithPart = new Object[] {null, partValues};
    opCtx.partObjectInspector =
        ObjectInspectorFactory.getStandardStructObjectInspector(partNames, partObjectInspectors);
  }

  // The op may not be a TableScan for mapjoins
  // Consider the query: select /*+MAPJOIN(a)*/ count(*) FROM T1 a JOIN T2 b ON a.key = b.key;
  // In that case, it will be a Select, but the rowOI need not be amended
  if (opCtx.op instanceof TableScanOperator) {
    TableScanOperator tsOp = (TableScanOperator) opCtx.op;
    TableScanDesc tsDesc = tsOp.getConf();
    if (tsDesc != null && tsDesc.hasVirtualCols()) {
      opCtx.vcs = tsDesc.getVirtualCols();
      opCtx.vcValues = new Object[opCtx.vcs.size()];
      opCtx.vcsObjectInspector = VirtualColumn.getVCSObjectInspector(opCtx.vcs);
      if (opCtx.isPartitioned()) {
        opCtx.rowWithPartAndVC = Arrays.copyOfRange(opCtx.rowWithPart, 0, 3);
      } else {
        opCtx.rowWithPartAndVC = new Object[2];
      }
    }
  }

  if (!opCtx.hasVC() && !opCtx.isPartitioned()) {
    opCtx.rowObjectInspector = tableRowOI;
    return opCtx;
  }

  List<StructObjectInspector> inspectors = new ArrayList<StructObjectInspector>();
  inspectors.add(tableRowOI);
  if (opCtx.isPartitioned()) {
    inspectors.add(opCtx.partObjectInspector);
  }
  if (opCtx.hasVC()) {
    inspectors.add(opCtx.vcsObjectInspector);
  }
  opCtx.rowObjectInspector = ObjectInspectorFactory.getUnionStructObjectInspector(inspectors);
  return opCtx;
}
public void validate() throws SemanticException {
  if ((this.getCols() == null) || (this.getCols().size() == 0)) {
    // for now make sure that serde exists
    if (StringUtils.isEmpty(this.getSerName())
        || !SerDeUtils.shouldGetColsFromSerDe(this.getSerName())) {
      throw new SemanticException(ErrorMsg.INVALID_TBL_DDL_SERDE.getMsg());
    }
    return;
  }

  if (this.getStorageHandler() == null) {
    try {
      Class<?> origin = Class.forName(this.getOutputFormat(), true, JavaUtils.getClassLoader());
      Class<? extends HiveOutputFormat> replaced =
          HiveFileFormatUtils.getOutputFormatSubstitute(origin);
      if (replaced == null) {
        throw new SemanticException(ErrorMsg.INVALID_OUTPUT_FORMAT_TYPE.getMsg());
      }
    } catch (ClassNotFoundException e) {
      throw new SemanticException(ErrorMsg.INVALID_OUTPUT_FORMAT_TYPE.getMsg());
    }
  }

  List<String> colNames = ParseUtils.validateColumnNameUniqueness(this.getCols());

  if (this.getBucketCols() != null) {
    // all columns in cluster and sort are valid columns
    Iterator<String> bucketCols = this.getBucketCols().iterator();
    while (bucketCols.hasNext()) {
      String bucketCol = bucketCols.next();
      boolean found = false;
      Iterator<String> colNamesIter = colNames.iterator();
      while (colNamesIter.hasNext()) {
        String colName = colNamesIter.next();
        if (bucketCol.equalsIgnoreCase(colName)) {
          found = true;
          break;
        }
      }
      if (!found) {
        throw new SemanticException(ErrorMsg.INVALID_COLUMN.getMsg());
      }
    }
  }

  if (this.getSortCols() != null) {
    // all columns in cluster and sort are valid columns
    Iterator<Order> sortCols = this.getSortCols().iterator();
    while (sortCols.hasNext()) {
      String sortCol = sortCols.next().getCol();
      boolean found = false;
      Iterator<String> colNamesIter = colNames.iterator();
      while (colNamesIter.hasNext()) {
        String colName = colNamesIter.next();
        if (sortCol.equalsIgnoreCase(colName)) {
          found = true;
          break;
        }
      }
      if (!found) {
        throw new SemanticException(ErrorMsg.INVALID_COLUMN.getMsg());
      }
    }
  }

  if (this.getPartCols() != null) {
    // there is no overlap between columns and partitioning columns
    Iterator<FieldSchema> partColsIter = this.getPartCols().iterator();
    while (partColsIter.hasNext()) {
      FieldSchema fs = partColsIter.next();
      String partCol = fs.getName();
      PrimitiveObjectInspectorUtils.PrimitiveTypeEntry pte =
          PrimitiveObjectInspectorUtils.getTypeEntryFromTypeName(fs.getType());
      if (null == pte) {
        throw new SemanticException(
            ErrorMsg.PARTITION_COLUMN_NON_PRIMITIVE.getMsg()
                + " Found "
                + partCol
                + " of type: "
                + fs.getType());
      }
      Iterator<String> colNamesIter = colNames.iterator();
      while (colNamesIter.hasNext()) {
        String colName = BaseSemanticAnalyzer.unescapeIdentifier(colNamesIter.next());
        if (partCol.equalsIgnoreCase(colName)) {
          throw new SemanticException(ErrorMsg.COLUMN_REPEATED_IN_PARTITIONING_COLS.getMsg());
        }
      }
    }
  }

  /* Validate skewed information. */
  ValidationUtility.validateSkewedInformation(
      colNames, this.getSkewedColNames(), this.getSkewedColValues());
}
@Override
public void processOp(Object row, int tag) throws HiveException {
  try {
    reportProgress();

    // get alias
    alias = (byte) tag;

    if ((lastAlias == null) || (!lastAlias.equals(alias))) {
      nextSz = joinEmitInterval;
    }

    ArrayList<Object> nr =
        JoinUtil.computeValues(
            row,
            joinValues.get(alias),
            joinValuesObjectInspectors.get(alias),
            joinFilters.get(alias),
            joinFilterObjectInspectors.get(alias),
            noOuterJoin);

    if (handleSkewJoin) {
      skewJoinKeyContext.handleSkew(tag);
    }

    // number of rows for the key in the given table
    int sz = storage.get(alias).size();
    StructObjectInspector soi = (StructObjectInspector) inputObjInspectors[tag];
    StructField sf = soi.getStructFieldRef(Utilities.ReduceField.KEY.toString());
    Object keyObject = soi.getStructFieldData(row, sf);

    // Are we consuming too much memory
    if (alias == numAliases - 1 && !(handleSkewJoin && skewJoinKeyContext.currBigKeyTag >= 0)) {
      if (sz == joinEmitInterval) {
        // The input is sorted by alias, so if we are already in the last join
        // operand, we can emit some results now.
        // Note this has to be done before adding the current row to the
        // storage, to preserve the correctness for outer joins.
        checkAndGenObject();
        storage.get(alias).clear();
      }
    } else {
      if (sz == nextSz) {
        // Output a warning if we reached at least 1000 rows for a join operand.
        // We won't output a warning for the last join operand since the size
        // will never reach joinEmitInterval.
        LOG.warn("table " + alias + " has " + sz + " rows for join key " + keyObject);
        nextSz = getNextSize(nextSz);
      }
    }

    // Add the value to the vector
    storage.get(alias).add(nr);

    // if join-key is null, process each row in a different group.
    if (SerDeUtils.hasAnyNullObject(keyObject, sf.getFieldObjectInspector())) {
      endGroup();
      startGroup();
    }
  } catch (Exception e) {
    e.printStackTrace();
    throw new HiveException(e);
  }
}
@Override
@SuppressWarnings("unchecked")
public void init(JobConf job, OutputCollector output, Reporter reporter) throws Exception {
  perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.SPARK_INIT_OPERATORS);
  super.init(job, output, reporter);

  rowObjectInspector = new ObjectInspector[Byte.MAX_VALUE];
  ObjectInspector[] valueObjectInspector = new ObjectInspector[Byte.MAX_VALUE];
  ObjectInspector keyObjectInspector;

  ReduceWork gWork = Utilities.getReduceWork(job);

  reducer = gWork.getReducer();
  vectorized = gWork.getVectorMode();
  reducer.setParentOperators(null); // clear out any parents as reducer is the root
  isTagged = gWork.getNeedsTagging();
  try {
    keyTableDesc = gWork.getKeyDesc();
    inputKeyDeserializer = ReflectionUtils.newInstance(keyTableDesc.getDeserializerClass(), null);
    SerDeUtils.initializeSerDe(inputKeyDeserializer, null, keyTableDesc.getProperties(), null);
    keyObjectInspector = inputKeyDeserializer.getObjectInspector();
    valueTableDesc = new TableDesc[gWork.getTagToValueDesc().size()];

    if (vectorized) {
      final int maxTags = gWork.getTagToValueDesc().size();
      keyStructInspector = (StructObjectInspector) keyObjectInspector;
      batches = new VectorizedRowBatch[maxTags];
      valueStructInspectors = new StructObjectInspector[maxTags];
      valueStringWriters = new List[maxTags];
      keysColumnOffset = keyStructInspector.getAllStructFieldRefs().size();
      buffer = new DataOutputBuffer();
    }

    for (int tag = 0; tag < gWork.getTagToValueDesc().size(); tag++) {
      // We should initialize the SerDe with the TypeInfo when available.
      valueTableDesc[tag] = gWork.getTagToValueDesc().get(tag);
      inputValueDeserializer[tag] =
          ReflectionUtils.newInstance(valueTableDesc[tag].getDeserializerClass(), null);
      SerDeUtils.initializeSerDe(
          inputValueDeserializer[tag], null, valueTableDesc[tag].getProperties(), null);
      valueObjectInspector[tag] = inputValueDeserializer[tag].getObjectInspector();

      ArrayList<ObjectInspector> ois = new ArrayList<ObjectInspector>();

      if (vectorized) {
        /* vectorization only works with struct object inspectors */
        valueStructInspectors[tag] = (StructObjectInspector) valueObjectInspector[tag];

        ObjectPair<VectorizedRowBatch, StandardStructObjectInspector> pair =
            VectorizedBatchUtil.constructVectorizedRowBatch(
                keyStructInspector,
                valueStructInspectors[tag],
                gWork.getVectorScratchColumnTypeMap());
        batches[tag] = pair.getFirst();
        final int totalColumns =
            keysColumnOffset + valueStructInspectors[tag].getAllStructFieldRefs().size();
        valueStringWriters[tag] = new ArrayList<VectorExpressionWriter>(totalColumns);
        valueStringWriters[tag].addAll(
            Arrays.asList(
                VectorExpressionWriterFactory.genVectorStructExpressionWritables(
                    keyStructInspector)));
        valueStringWriters[tag].addAll(
            Arrays.asList(
                VectorExpressionWriterFactory.genVectorStructExpressionWritables(
                    valueStructInspectors[tag])));

        rowObjectInspector[tag] = pair.getSecond();
      } else {
        ois.add(keyObjectInspector);
        ois.add(valueObjectInspector[tag]);
        // reducer.setGroupKeyObjectInspector(keyObjectInspector);
        rowObjectInspector[tag] =
            ObjectInspectorFactory.getStandardStructObjectInspector(
                Utilities.reduceFieldNameList, ois);
      }
    }
  } catch (Exception e) {
    throw new RuntimeException(e);
  }

  ExecMapperContext execContext = new ExecMapperContext(job);
  localWork = gWork.getMapRedLocalWork();
  execContext.setJc(jc);
  execContext.setLocalWork(localWork);
  reducer.passExecContext(execContext);

  reducer.setReporter(rp);
  OperatorUtils.setChildrenCollector(
      Arrays.<Operator<? extends OperatorDesc>>asList(reducer), output);

  // initialize reduce operator tree
  try {
    LOG.info(reducer.dump(0));
    reducer.initialize(jc, rowObjectInspector);

    if (localWork != null) {
      for (Operator<? extends OperatorDesc> dummyOp : localWork.getDummyParentOp()) {
        dummyOp.setExecContext(execContext);
        dummyOp.initialize(jc, null);
      }
    }
  } catch (Throwable e) {
    abort = true;
    if (e instanceof OutOfMemoryError) {
      // Don't create a new object if we are already out of memory
      throw (OutOfMemoryError) e;
    } else {
      throw new RuntimeException("Reduce operator initialization failed", e);
    }
  }
  perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.SPARK_INIT_OPERATORS);
}