public void generateMapMetaData() throws HiveException {
  // generate the meta data for key
  // index for key is -1
  try {
    TableDesc keyTableDesc = conf.getKeyTblDesc();
    SerDe keySerializer =
        (SerDe) ReflectionUtil.newInstance(keyTableDesc.getDeserializerClass(), null);
    SerDeUtils.initializeSerDe(keySerializer, null, keyTableDesc.getProperties(), null);
    MapJoinObjectSerDeContext keyContext = new MapJoinObjectSerDeContext(keySerializer, false);
    for (int pos = 0; pos < order.length; pos++) {
      if (pos == posBigTable) {
        continue;
      }
      TableDesc valueTableDesc;
      if (conf.getNoOuterJoin()) {
        valueTableDesc = conf.getValueTblDescs().get(pos);
      } else {
        valueTableDesc = conf.getValueFilteredTblDescs().get(pos);
      }
      SerDe valueSerDe =
          (SerDe) ReflectionUtil.newInstance(valueTableDesc.getDeserializerClass(), null);
      SerDeUtils.initializeSerDe(valueSerDe, null, valueTableDesc.getProperties(), null);
      MapJoinObjectSerDeContext valueContext =
          new MapJoinObjectSerDeContext(valueSerDe, hasFilter(pos));
      mapJoinTableSerdes[pos] = new MapJoinTableContainerSerDe(keyContext, valueContext);
    }
  } catch (SerDeException e) {
    throw new HiveException(e);
  }
}
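// The hasFilter(pos) call above is not shown in this section. A minimal sketch, assuming the
// operator keeps a per-alias filterMaps array (a hypothetical field here) and only needs to know
// whether a value filter was generated for that alias:
private boolean hasFilter(int alias) {
  return filterMaps != null && filterMaps[alias] != null;
}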
public void testRW() throws Exception {
  Configuration conf = new Configuration();

  for (Pair<Properties, HCatRecord> e : getData()) {
    Properties tblProps = e.first;
    HCatRecord r = e.second;

    HCatRecordSerDe hrsd = new HCatRecordSerDe();
    SerDeUtils.initializeSerDe(hrsd, conf, tblProps, null);
    JsonSerDe jsde = new JsonSerDe();
    SerDeUtils.initializeSerDe(jsde, conf, tblProps, null);

    LOG.info("ORIG:{}", r);

    Writable s = hrsd.serialize(r, hrsd.getObjectInspector());
    LOG.info("ONE:{}", s);

    Object o1 = hrsd.deserialize(s);
    StringBuilder msg = new StringBuilder();
    boolean isEqual = HCatDataCheckUtil.recordsEqual(r, (HCatRecord) o1, msg);
    assertTrue(msg.toString(), isEqual);

    Writable s2 = jsde.serialize(o1, hrsd.getObjectInspector());
    LOG.info("TWO:{}", s2);
    Object o2 = jsde.deserialize(s2);
    LOG.info("deserialized TWO : {} ", o2);

    msg.setLength(0);
    isEqual = HCatDataCheckUtil.recordsEqual(r, (HCatRecord) o2, msg);
    assertTrue(msg.toString(), isEqual);
  }
}
public void testRW() throws Exception {
  Configuration conf = new Configuration();

  for (Entry<Properties, HCatRecord> e : getData().entrySet()) {
    Properties tblProps = e.getKey();
    HCatRecord r = e.getValue();

    HCatRecordSerDe hrsd = new HCatRecordSerDe();
    SerDeUtils.initializeSerDe(hrsd, conf, tblProps, null);

    LOG.info("ORIG: {}", r);

    Writable s = hrsd.serialize(r, hrsd.getObjectInspector());
    LOG.info("ONE: {}", s);

    HCatRecord r2 = (HCatRecord) hrsd.deserialize(s);
    Assert.assertTrue(HCatDataCheckUtil.recordsEqual(r, r2));

    // If it went through correctly, then s is also an HCatRecord, equal to the above and a
    // deep copy, and this holds through multiple further levels of serialization as well.
    Writable s2 = hrsd.serialize(s, hrsd.getObjectInspector());
    LOG.info("TWO: {}", s2);
    Assert.assertTrue(HCatDataCheckUtil.recordsEqual(r, (HCatRecord) s));
    Assert.assertTrue(HCatDataCheckUtil.recordsEqual(r, (HCatRecord) s2));

    // serialize using another serde, and read out that object repr.
    LazySimpleSerDe testSD = new LazySimpleSerDe();
    SerDeUtils.initializeSerDe(testSD, conf, tblProps, null);

    Writable s3 = testSD.serialize(s, hrsd.getObjectInspector());
    LOG.info("THREE: {}", s3);
    Object o3 = testSD.deserialize(s3);
    Assert.assertFalse(r.getClass().equals(o3.getClass()));

    // then serialize again using hrsd, and compare results
    HCatRecord s4 = (HCatRecord) hrsd.serialize(o3, testSD.getObjectInspector());
    LOG.info("FOUR: {}", s4);

    // Test LazyHCatRecord init and read
    LazyHCatRecord s5 = new LazyHCatRecord(o3, testSD.getObjectInspector());
    LOG.info("FIVE: {}", s5);

    LazyHCatRecord s6 = new LazyHCatRecord(s4, hrsd.getObjectInspector());
    LOG.info("SIX: {}", s6);
  }
}
public void testMapValues() throws Exception {
  Configuration conf = new Configuration();
  Properties props = new Properties();

  props.put(serdeConstants.LIST_COLUMNS, "a,b");
  props.put(serdeConstants.LIST_COLUMN_TYPES, "array<string>,map<string,int>");
  JsonSerDe rjsd = new JsonSerDe();
  SerDeUtils.initializeSerDe(rjsd, conf, props, null);

  Text text1 = new Text("{ \"a\":[\"aaa\"],\"b\":{\"bbb\":1}} ");
  Text text2 = new Text("{\"a\":[\"yyy\"],\"b\":{\"zzz\":123}}");
  Text text3 = new Text("{\"a\":[\"a\"],\"b\":{\"x\":11, \"y\": 22, \"z\": null}}");

  HCatRecord expected1 = new DefaultHCatRecord(Arrays.<Object>asList(
      Arrays.<String>asList("aaa"),
      createHashMapStringInteger("bbb", 1)));
  HCatRecord expected2 = new DefaultHCatRecord(Arrays.<Object>asList(
      Arrays.<String>asList("yyy"),
      createHashMapStringInteger("zzz", 123)));
  HCatRecord expected3 = new DefaultHCatRecord(Arrays.<Object>asList(
      Arrays.<String>asList("a"),
      createHashMapStringInteger("x", 11, "y", 22, "z", null)));

  assertTrue(HCatDataCheckUtil.recordsEqual((HCatRecord) rjsd.deserialize(text1), expected1));
  assertTrue(HCatDataCheckUtil.recordsEqual((HCatRecord) rjsd.deserialize(text2), expected2));
  assertTrue(HCatDataCheckUtil.recordsEqual((HCatRecord) rjsd.deserialize(text3), expected3));
}
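// Hypothetical sketch of the createHashMapStringInteger helper used above, assuming it simply
// pairs up its varargs as (String key, Integer value) entries; null values are allowed.
private static Map<String, Integer> createHashMapStringInteger(Object... values) {
  Map<String, Integer> map = new HashMap<String, Integer>();
  for (int i = 0; i < values.length; i += 2) {
    map.put((String) values[i], (Integer) values[i + 1]);
  }
  return map;
}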
/**
 * This test verifies that our json deserialization is not too strict, as per HIVE-6166.
 *
 * <p>i.e., if our schema is "s:struct<a:int,b:string>,k:int", and we pass in data that looks like:
 * { "x" : "abc" ,
 *   "t" : { "a" : "1", "b" : "2", "c" : [ { "x" : 2 , "y" : 3 } , { "x" : 3 , "y" : 2 } ] } ,
 *   "s" : { "a" : 2 , "b" : "blah", "c" : "woo" } }
 *
 * <p>Then it should still work, ignore the "x" and "t" fields and the "c" subfield of "s", and
 * read k as null.
 */
public void testLooseJsonReadability() throws Exception {
  Configuration conf = new Configuration();
  Properties props = new Properties();

  props.put(serdeConstants.LIST_COLUMNS, "s,k");
  props.put(serdeConstants.LIST_COLUMN_TYPES, "struct<a:int,b:string>,int");
  JsonSerDe rjsd = new JsonSerDe();
  SerDeUtils.initializeSerDe(rjsd, conf, props, null);

  Text jsonText = new Text(
      "{ \"x\" : \"abc\" , "
          + " \"t\" : { \"a\":\"1\", \"b\":\"2\", \"c\":[ { \"x\":2 , \"y\":3 } , { \"x\":3 , \"y\":2 }] } ,"
          + "\"s\" : { \"a\" : 2 , \"b\" : \"blah\", \"c\": \"woo\" } }");

  List<Object> expected = new ArrayList<Object>();
  List<Object> inner = new ArrayList<Object>();
  inner.add(2);
  inner.add("blah");
  expected.add(inner);
  expected.add(null);
  HCatRecord expectedRecord = new DefaultHCatRecord(expected);

  HCatRecord r = (HCatRecord) rjsd.deserialize(jsonText);
  LOG.info("record : {}", r);
  assertTrue(HCatDataCheckUtil.recordsEqual(r, expectedRecord));
}
public void testRobustRead() throws Exception {
  /**
   * This test has been added to account for HCATALOG-436. We write out columns with "internal
   * column names" such as "_col0", but try to read with regular column names.
   */
  Configuration conf = new Configuration();

  for (Pair<Properties, HCatRecord> e : getData()) {
    Properties tblProps = e.first;
    HCatRecord r = e.second;

    Properties internalTblProps = new Properties();
    for (Map.Entry pe : tblProps.entrySet()) {
      if (!pe.getKey().equals(serdeConstants.LIST_COLUMNS)) {
        internalTblProps.put(pe.getKey(), pe.getValue());
      } else {
        internalTblProps.put(pe.getKey(), getInternalNames((String) pe.getValue()));
      }
    }

    LOG.info("orig tbl props: {}", tblProps);
    LOG.info("modified tbl props: {}", internalTblProps);

    JsonSerDe wjsd = new JsonSerDe();
    SerDeUtils.initializeSerDe(wjsd, conf, internalTblProps, null);

    JsonSerDe rjsd = new JsonSerDe();
    SerDeUtils.initializeSerDe(rjsd, conf, tblProps, null);

    LOG.info("ORIG:{}", r);

    Writable s = wjsd.serialize(r, wjsd.getObjectInspector());
    LOG.info("ONE:{}", s);
    Object o1 = wjsd.deserialize(s);
    LOG.info("deserialized ONE : {} ", o1);

    Object o2 = rjsd.deserialize(s);
    LOG.info("deserialized TWO : {} ", o2);

    StringBuilder msg = new StringBuilder();
    boolean isEqual = HCatDataCheckUtil.recordsEqual(r, (HCatRecord) o2, msg);
    assertTrue(msg.toString(), isEqual);
  }
}
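// Hypothetical sketch of the getInternalNames helper assumed by testRobustRead: it rewrites a
// comma-separated column-name list ("a,b,c") into Hive's internal names ("_col0,_col1,_col2").
private String getInternalNames(String columnNames) {
  if (columnNames == null || columnNames.isEmpty()) {
    return columnNames;
  }
  int numColumns = columnNames.split(",").length;
  StringBuilder sb = new StringBuilder("_col0");
  for (int i = 1; i < numColumns; i++) {
    sb.append(",_col").append(i);
  }
  return sb.toString();
}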
private ParquetHiveRecord getParquetWritable(
    String columnNames, String columnTypes, ArrayWritable record) throws SerDeException {
  Properties recordProperties = new Properties();
  recordProperties.setProperty("columns", columnNames);
  recordProperties.setProperty("columns.types", columnTypes);

  ParquetHiveSerDe serDe = new ParquetHiveSerDe();
  SerDeUtils.initializeSerDe(serDe, new Configuration(), recordProperties, null);

  return new ParquetHiveRecord(
      serDe.deserialize(record), getObjectInspector(columnNames, columnTypes));
}
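// Hypothetical sketch of the getObjectInspector(columnNames, columnTypes) helper assumed above.
// It builds a struct inspector over the ArrayWritable-based rows that ParquetHiveSerDe produces;
// the comma-separated name list and the use of ArrayWritableObjectInspector are assumptions, not
// taken from this section.
private StructObjectInspector getObjectInspector(String columnNames, String columnTypes) {
  List<String> names = Arrays.asList(columnNames.split(","));
  List<TypeInfo> types = TypeInfoUtils.getTypeInfosFromTypeString(columnTypes);
  StructTypeInfo rowTypeInfo = (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(names, types);
  return new ArrayWritableObjectInspector(rowTypeInfo);
}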
public void testUpperCaseKey() throws Exception {
  Configuration conf = new Configuration();
  Properties props = new Properties();

  props.put(serdeConstants.LIST_COLUMNS, "empid,name");
  props.put(serdeConstants.LIST_COLUMN_TYPES, "int,string");
  JsonSerDe rjsd = new JsonSerDe();
  SerDeUtils.initializeSerDe(rjsd, conf, props, null);

  Text text1 = new Text("{ \"empId\" : 123, \"name\" : \"John\" } ");
  Text text2 = new Text("{ \"empId\" : 456, \"name\" : \"Jane\" } ");
  HCatRecord expected1 = new DefaultHCatRecord(Arrays.<Object>asList(123, "John"));
  HCatRecord expected2 = new DefaultHCatRecord(Arrays.<Object>asList(456, "Jane"));

  assertTrue(HCatDataCheckUtil.recordsEqual((HCatRecord) rjsd.deserialize(text1), expected1));
  assertTrue(HCatDataCheckUtil.recordsEqual((HCatRecord) rjsd.deserialize(text2), expected2));
}
private SerDe getSerDe() throws SQLException {
  if (serde != null) {
    return serde;
  }
  try {
    List<FieldSchema> fieldSchemas = mResultSchema.getFieldSchemas();
    StringBuilder namesSb = new StringBuilder();
    StringBuilder typesSb = new StringBuilder();

    if (fieldSchemas != null && !fieldSchemas.isEmpty()) {
      for (int pos = 0; pos < fieldSchemas.size(); pos++) {
        if (pos != 0) {
          namesSb.append(",");
          typesSb.append(",");
        }
        namesSb.append(fieldSchemas.get(pos).getName());
        typesSb.append(fieldSchemas.get(pos).getType());
      }
    }
    String names = namesSb.toString();
    String types = typesSb.toString();

    serde = new LazySimpleSerDe();
    Properties props = new Properties();
    if (names.length() > 0) {
      LOG.debug("Column names: " + names);
      props.setProperty(serdeConstants.LIST_COLUMNS, names);
    }
    if (types.length() > 0) {
      LOG.debug("Column types: " + types);
      props.setProperty(serdeConstants.LIST_COLUMN_TYPES, types);
    }
    SerDeUtils.initializeSerDe(serde, new HiveConf(), props, null);
  } catch (Exception ex) {
    ex.printStackTrace();
    throw new SQLException("Could not create ResultSet: " + ex.getMessage(), ex);
  }
  return serde;
}
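// Illustrative only: a sketch of how the serde built above might decode one fetched, delimited
// row into a column value. The method name, rowStr input, and columnIndex parameter are
// assumptions for the example, not part of this section.
private Object getColumnValue(String rowStr, int columnIndex) throws Exception {
  SerDe rowSerDe = getSerDe();
  StructObjectInspector soi = (StructObjectInspector) rowSerDe.getObjectInspector();
  List<? extends StructField> fieldRefs = soi.getAllStructFieldRefs();
  Object row = rowSerDe.deserialize(new BytesWritable(rowStr.getBytes("UTF-8")));
  return soi.getStructFieldData(row, fieldRefs.get(columnIndex));
}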
@Override @SuppressWarnings("unchecked") public void init(JobConf job, OutputCollector output, Reporter reporter) throws Exception { perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.SPARK_INIT_OPERATORS); super.init(job, output, reporter); rowObjectInspector = new ObjectInspector[Byte.MAX_VALUE]; ObjectInspector[] valueObjectInspector = new ObjectInspector[Byte.MAX_VALUE]; ObjectInspector keyObjectInspector; ReduceWork gWork = Utilities.getReduceWork(job); reducer = gWork.getReducer(); vectorized = gWork.getVectorMode(); reducer.setParentOperators(null); // clear out any parents as reducer is the // root isTagged = gWork.getNeedsTagging(); try { keyTableDesc = gWork.getKeyDesc(); inputKeyDeserializer = ReflectionUtils.newInstance(keyTableDesc.getDeserializerClass(), null); SerDeUtils.initializeSerDe(inputKeyDeserializer, null, keyTableDesc.getProperties(), null); keyObjectInspector = inputKeyDeserializer.getObjectInspector(); valueTableDesc = new TableDesc[gWork.getTagToValueDesc().size()]; if (vectorized) { final int maxTags = gWork.getTagToValueDesc().size(); keyStructInspector = (StructObjectInspector) keyObjectInspector; batches = new VectorizedRowBatch[maxTags]; valueStructInspectors = new StructObjectInspector[maxTags]; valueStringWriters = new List[maxTags]; keysColumnOffset = keyStructInspector.getAllStructFieldRefs().size(); buffer = new DataOutputBuffer(); } for (int tag = 0; tag < gWork.getTagToValueDesc().size(); tag++) { // We should initialize the SerDe with the TypeInfo when available. valueTableDesc[tag] = gWork.getTagToValueDesc().get(tag); inputValueDeserializer[tag] = ReflectionUtils.newInstance(valueTableDesc[tag].getDeserializerClass(), null); SerDeUtils.initializeSerDe( inputValueDeserializer[tag], null, valueTableDesc[tag].getProperties(), null); valueObjectInspector[tag] = inputValueDeserializer[tag].getObjectInspector(); ArrayList<ObjectInspector> ois = new ArrayList<ObjectInspector>(); if (vectorized) { /* vectorization only works with struct object inspectors */ valueStructInspectors[tag] = (StructObjectInspector) valueObjectInspector[tag]; ObjectPair<VectorizedRowBatch, StandardStructObjectInspector> pair = VectorizedBatchUtil.constructVectorizedRowBatch( keyStructInspector, valueStructInspectors[tag], gWork.getVectorScratchColumnTypeMap()); batches[tag] = pair.getFirst(); final int totalColumns = keysColumnOffset + valueStructInspectors[tag].getAllStructFieldRefs().size(); valueStringWriters[tag] = new ArrayList<VectorExpressionWriter>(totalColumns); valueStringWriters[tag].addAll( Arrays.asList( VectorExpressionWriterFactory.genVectorStructExpressionWritables( keyStructInspector))); valueStringWriters[tag].addAll( Arrays.asList( VectorExpressionWriterFactory.genVectorStructExpressionWritables( valueStructInspectors[tag]))); rowObjectInspector[tag] = pair.getSecond(); } else { ois.add(keyObjectInspector); ois.add(valueObjectInspector[tag]); // reducer.setGroupKeyObjectInspector(keyObjectInspector); rowObjectInspector[tag] = ObjectInspectorFactory.getStandardStructObjectInspector( Utilities.reduceFieldNameList, ois); } } } catch (Exception e) { throw new RuntimeException(e); } ExecMapperContext execContext = new ExecMapperContext(job); localWork = gWork.getMapRedLocalWork(); execContext.setJc(jc); execContext.setLocalWork(localWork); reducer.passExecContext(execContext); reducer.setReporter(rp); OperatorUtils.setChildrenCollector( Arrays.<Operator<? 
extends OperatorDesc>>asList(reducer), output); // initialize reduce operator tree try { LOG.info(reducer.dump(0)); reducer.initialize(jc, rowObjectInspector); if (localWork != null) { for (Operator<? extends OperatorDesc> dummyOp : localWork.getDummyParentOp()) { dummyOp.setExecContext(execContext); dummyOp.initialize(jc, null); } } } catch (Throwable e) { abort = true; if (e instanceof OutOfMemoryError) { // Don't create a new object if we are already out of memory throw (OutOfMemoryError) e; } else { throw new RuntimeException("Reduce operator initialization failed", e); } } perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.SPARK_INIT_OPERATORS); }