Example #1
0
  public void generateMapMetaData() throws HiveException {
    // generate the meta data for key
    // index for key is -1

    try {
      TableDesc keyTableDesc = conf.getKeyTblDesc();
      SerDe keySerializer =
          (SerDe) ReflectionUtil.newInstance(keyTableDesc.getDeserializerClass(), null);
      SerDeUtils.initializeSerDe(keySerializer, null, keyTableDesc.getProperties(), null);
      MapJoinObjectSerDeContext keyContext = new MapJoinObjectSerDeContext(keySerializer, false);
      for (int pos = 0; pos < order.length; pos++) {
        if (pos == posBigTable) {
          continue;
        }
        TableDesc valueTableDesc;
        if (conf.getNoOuterJoin()) {
          valueTableDesc = conf.getValueTblDescs().get(pos);
        } else {
          valueTableDesc = conf.getValueFilteredTblDescs().get(pos);
        }
        SerDe valueSerDe =
            (SerDe) ReflectionUtil.newInstance(valueTableDesc.getDeserializerClass(), null);
        SerDeUtils.initializeSerDe(valueSerDe, null, valueTableDesc.getProperties(), null);
        MapJoinObjectSerDeContext valueContext =
            new MapJoinObjectSerDeContext(valueSerDe, hasFilter(pos));
        mapJoinTableSerdes[pos] = new MapJoinTableContainerSerDe(keyContext, valueContext);
      }
    } catch (SerDeException e) {
      throw new HiveException(e);
    }
  }
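Example #1 also relies on a hasFilter(pos) helper that is not shown. A minimal sketch, assuming it only consults the join descriptor's filter map for the given table alias (conf.getFilterMap() is an assumption about the descriptor here), could be:

  // Hypothetical sketch: true when value filters are defined for this alias.
  private boolean hasFilter(int pos) {
    int[][] filterMap = conf.getFilterMap();
    return filterMap != null && filterMap[pos] != null;
  }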
Example #2
0
  public void testRW() throws Exception {

    Configuration conf = new Configuration();

    for (Pair<Properties, HCatRecord> e : getData()) {
      Properties tblProps = e.first;
      HCatRecord r = e.second;

      HCatRecordSerDe hrsd = new HCatRecordSerDe();
      SerDeUtils.initializeSerDe(hrsd, conf, tblProps, null);

      JsonSerDe jsde = new JsonSerDe();
      SerDeUtils.initializeSerDe(jsde, conf, tblProps, null);

      LOG.info("ORIG:{}", r);

      Writable s = hrsd.serialize(r, hrsd.getObjectInspector());
      LOG.info("ONE:{}", s);

      Object o1 = hrsd.deserialize(s);
      StringBuilder msg = new StringBuilder();
      boolean isEqual = HCatDataCheckUtil.recordsEqual(r, (HCatRecord) o1, msg);
      assertTrue(msg.toString(), isEqual);

      Writable s2 = jsde.serialize(o1, hrsd.getObjectInspector());
      LOG.info("TWO:{}", s2);
      Object o2 = jsde.deserialize(s2);
      LOG.info("deserialized TWO : {} ", o2);
      msg.setLength(0);
      isEqual = HCatDataCheckUtil.recordsEqual(r, (HCatRecord) o2, msg);
      assertTrue(msg.toString(), isEqual);
    }
  }
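The getData() helper that drives this loop (and Example #6's) is not shown. A minimal sketch, assuming it returns a list of Pair objects with public first/second fields, each pairing table properties with an HCatRecord matching that schema, might look like the following; the column names, types, and values are illustrative only. (Example #3 iterates a Map-based variant instead.)

  // Hypothetical sketch of a getData() provider: each entry pairs table
  // properties (column names/types) with a record matching that schema.
  private List<Pair<Properties, HCatRecord>> getData() {
    List<Pair<Properties, HCatRecord>> data = new ArrayList<Pair<Properties, HCatRecord>>();

    Properties tblProps = new Properties();
    tblProps.put(serdeConstants.LIST_COLUMNS, "empid,name");
    tblProps.put(serdeConstants.LIST_COLUMN_TYPES, "int,string");

    HCatRecord record = new DefaultHCatRecord(Arrays.<Object>asList(123, "John"));
    data.add(new Pair<Properties, HCatRecord>(tblProps, record));

    return data;
  }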
Example #3
0
  public void testRW() throws Exception {

    Configuration conf = new Configuration();

    for (Entry<Properties, HCatRecord> e : getData().entrySet()) {
      Properties tblProps = e.getKey();
      HCatRecord r = e.getValue();

      HCatRecordSerDe hrsd = new HCatRecordSerDe();
      SerDeUtils.initializeSerDe(hrsd, conf, tblProps, null);

      LOG.info("ORIG: {}", r);

      Writable s = hrsd.serialize(r, hrsd.getObjectInspector());
      LOG.info("ONE: {}", s);

      HCatRecord r2 = (HCatRecord) hrsd.deserialize(s);
      Assert.assertTrue(HCatDataCheckUtil.recordsEqual(r, r2));

      // If it went through correctly, then s is also an HCatRecord, equal to
      // the original and a deep copy of it, and this holds through multiple
      // further levels of serialization as well.

      Writable s2 = hrsd.serialize(s, hrsd.getObjectInspector());
      LOG.info("TWO: {}", s2);
      Assert.assertTrue(HCatDataCheckUtil.recordsEqual(r, (HCatRecord) s));
      Assert.assertTrue(HCatDataCheckUtil.recordsEqual(r, (HCatRecord) s2));

      // serialize using another serde, and read out that object repr.
      LazySimpleSerDe testSD = new LazySimpleSerDe();
      SerDeUtils.initializeSerDe(testSD, conf, tblProps, null);

      Writable s3 = testSD.serialize(s, hrsd.getObjectInspector());
      LOG.info("THREE: {}", s3);
      Object o3 = testSD.deserialize(s3);
      Assert.assertFalse(r.getClass().equals(o3.getClass()));

      // then serialize again using hrsd, and compare results
      HCatRecord s4 = (HCatRecord) hrsd.serialize(o3, testSD.getObjectInspector());
      LOG.info("FOUR: {}", s4);

      // Test LazyHCatRecord init and read
      LazyHCatRecord s5 = new LazyHCatRecord(o3, testSD.getObjectInspector());
      LOG.info("FIVE: {}", s5);

      LazyHCatRecord s6 = new LazyHCatRecord(s4, hrsd.getObjectInspector());
      LOG.info("SIX: {}", s6);
    }
  }
Example #4
0
  public void testMapValues() throws Exception {
    Configuration conf = new Configuration();
    Properties props = new Properties();

    props.put(serdeConstants.LIST_COLUMNS, "a,b");
    props.put(serdeConstants.LIST_COLUMN_TYPES, "array<string>,map<string,int>");
    JsonSerDe rjsd = new JsonSerDe();
    SerDeUtils.initializeSerDe(rjsd, conf, props, null);

    Text text1 = new Text("{ \"a\":[\"aaa\"],\"b\":{\"bbb\":1}} ");
    Text text2 = new Text("{\"a\":[\"yyy\"],\"b\":{\"zzz\":123}}");
    Text text3 = new Text("{\"a\":[\"a\"],\"b\":{\"x\":11, \"y\": 22, \"z\": null}}");

    HCatRecord expected1 =
        new DefaultHCatRecord(
            Arrays.<Object>asList(
                Arrays.<String>asList("aaa"), createHashMapStringInteger("bbb", 1)));
    HCatRecord expected2 =
        new DefaultHCatRecord(
            Arrays.<Object>asList(
                Arrays.<String>asList("yyy"), createHashMapStringInteger("zzz", 123)));
    HCatRecord expected3 =
        new DefaultHCatRecord(
            Arrays.<Object>asList(
                Arrays.<String>asList("a"),
                createHashMapStringInteger("x", 11, "y", 22, "z", null)));

    assertTrue(HCatDataCheckUtil.recordsEqual((HCatRecord) rjsd.deserialize(text1), expected1));
    assertTrue(HCatDataCheckUtil.recordsEqual((HCatRecord) rjsd.deserialize(text2), expected2));
    assertTrue(HCatDataCheckUtil.recordsEqual((HCatRecord) rjsd.deserialize(text3), expected3));
  }
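The createHashMapStringInteger helper used to build the expected records is not shown. A minimal sketch, assuming it consumes its varargs as alternating String keys and Integer values (nulls allowed, as in the "z": null entry above), could be:

  // Hypothetical sketch: pairs up varargs as key/value entries of a map.
  private static Map<String, Integer> createHashMapStringInteger(Object... vals) {
    Map<String, Integer> map = new HashMap<String, Integer>();
    for (int i = 0; i + 1 < vals.length; i += 2) {
      map.put((String) vals[i], (Integer) vals[i + 1]);
    }
    return map;
  }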
Example #5
0
  /**
   * Tests that our JSON deserialization is not too strict, as per HIVE-6166.
   *
   * <p>That is, if our schema is "s:struct<a:int,b:string>,k:int" and we pass in data that looks
   * like: { "x" : "abc" , "t" : { "a" : "1", "b" : "2", "c" : [ { "x" : 2 , "y" : 3 } , { "x" : 3 ,
   * "y" : 2 } ] } , "s" : { "a" : 2 , "b" : "blah", "c": "woo" } }
   *
   * <p>Then it should still work, ignoring the "x" and "t" fields and the "c" subfield of "s", and
   * it should read k as null.
   */
  public void testLooseJsonReadability() throws Exception {
    Configuration conf = new Configuration();
    Properties props = new Properties();

    props.put(serdeConstants.LIST_COLUMNS, "s,k");
    props.put(serdeConstants.LIST_COLUMN_TYPES, "struct<a:int,b:string>,int");
    JsonSerDe rjsd = new JsonSerDe();
    SerDeUtils.initializeSerDe(rjsd, conf, props, null);

    Text jsonText =
        new Text(
            "{ \"x\" : \"abc\" , "
                + " \"t\" : { \"a\":\"1\", \"b\":\"2\", \"c\":[ { \"x\":2 , \"y\":3 } , { \"x\":3 , \"y\":2 }] } ,"
                + "\"s\" : { \"a\" : 2 , \"b\" : \"blah\", \"c\": \"woo\" } }");
    List<Object> expected = new ArrayList<Object>();
    List<Object> inner = new ArrayList<Object>();
    inner.add(2);
    inner.add("blah");
    expected.add(inner);
    expected.add(null);
    HCatRecord expectedRecord = new DefaultHCatRecord(expected);

    HCatRecord r = (HCatRecord) rjsd.deserialize(jsonText);
    System.err.println("record : " + r.toString());

    assertTrue(HCatDataCheckUtil.recordsEqual(r, expectedRecord));
  }
Example #6
0
  public void testRobustRead() throws Exception {
    /**
     * This test has been added to account for HCATALOG-436: we write out columns with "internal
     * column names" such as "_col0", but try to read them back with regular column names. (A
     * hypothetical sketch of the getInternalNames helper it relies on appears after this example.)
     */
    Configuration conf = new Configuration();

    for (Pair<Properties, HCatRecord> e : getData()) {
      Properties tblProps = e.first;
      HCatRecord r = e.second;

      Properties internalTblProps = new Properties();
      for (Map.Entry<Object, Object> pe : tblProps.entrySet()) {
        if (!pe.getKey().equals(serdeConstants.LIST_COLUMNS)) {
          internalTblProps.put(pe.getKey(), pe.getValue());
        } else {
          internalTblProps.put(pe.getKey(), getInternalNames((String) pe.getValue()));
        }
      }

      LOG.info("orig tbl props:{}", tblProps);
      LOG.info("modif tbl props:{}", internalTblProps);

      JsonSerDe wjsd = new JsonSerDe();
      SerDeUtils.initializeSerDe(wjsd, conf, internalTblProps, null);

      JsonSerDe rjsd = new JsonSerDe();
      SerDeUtils.initializeSerDe(rjsd, conf, tblProps, null);

      LOG.info("ORIG:{}", r);

      Writable s = wjsd.serialize(r, wjsd.getObjectInspector());
      LOG.info("ONE:{}", s);

      Object o1 = wjsd.deserialize(s);
      LOG.info("deserialized ONE : {} ", o1);

      Object o2 = rjsd.deserialize(s);
      LOG.info("deserialized TWO : {} ", o2);
      StringBuilder msg = new StringBuilder();
      boolean isEqual = HCatDataCheckUtil.recordsEqual(r, (HCatRecord) o2, msg);
      assertTrue(msg.toString(), isEqual);
    }
  }
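A minimal sketch of the getInternalNames helper referenced above, assuming it simply replaces every column name in the comma-separated list with Hive's internal "_colN" naming while preserving the column count:

  // Hypothetical sketch: "empid,name" -> "_col0,_col1".
  private String getInternalNames(String columnNames) {
    int count = columnNames.split(",").length;
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < count; i++) {
      if (i != 0) {
        sb.append(",");
      }
      sb.append("_col").append(i);
    }
    return sb.toString();
  }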
Example #7
0
  private ParquetHiveRecord getParquetWritable(
      String columnNames, String columnTypes, ArrayWritable record) throws SerDeException {
    Properties recordProperties = new Properties();
    recordProperties.setProperty("columns", columnNames);
    recordProperties.setProperty("columns.types", columnTypes);

    ParquetHiveSerDe serDe = new ParquetHiveSerDe();
    SerDeUtils.initializeSerDe(serDe, new Configuration(), recordProperties, null);

    return new ParquetHiveRecord(
        serDe.deserialize(record), getObjectInspector(columnNames, columnTypes));
  }
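The getObjectInspector(columnNames, columnTypes) helper called above is not shown, and the real one may build a Parquet-specific inspector instead. A minimal sketch, assuming a generic struct inspector derived from the comma-separated names and the Hive type string is sufficient, might be:

  // Hypothetical sketch: build a writable struct ObjectInspector from
  // "a,b" style names and an "int,string" style Hive type string.
  private ObjectInspector getObjectInspector(String columnNames, String columnTypes) {
    List<String> names = Arrays.asList(columnNames.split(","));
    List<TypeInfo> types = TypeInfoUtils.getTypeInfosFromTypeString(columnTypes);
    TypeInfo structTypeInfo = TypeInfoFactory.getStructTypeInfo(names, types);
    return TypeInfoUtils.getStandardWritableObjectInspectorFromTypeInfo(structTypeInfo);
  }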
Example #8
0
  public void testUpperCaseKey() throws Exception {
    Configuration conf = new Configuration();
    Properties props = new Properties();

    props.put(serdeConstants.LIST_COLUMNS, "empid,name");
    props.put(serdeConstants.LIST_COLUMN_TYPES, "int,string");
    JsonSerDe rjsd = new JsonSerDe();
    SerDeUtils.initializeSerDe(rjsd, conf, props, null);

    Text text1 = new Text("{ \"empId\" : 123, \"name\" : \"John\" } ");
    Text text2 = new Text("{ \"empId\" : 456, \"name\" : \"Jane\" } ");

    HCatRecord expected1 = new DefaultHCatRecord(Arrays.<Object>asList(123, "John"));
    HCatRecord expected2 = new DefaultHCatRecord(Arrays.<Object>asList(456, "Jane"));

    assertTrue(HCatDataCheckUtil.recordsEqual((HCatRecord) rjsd.deserialize(text1), expected1));
    assertTrue(HCatDataCheckUtil.recordsEqual((HCatRecord) rjsd.deserialize(text2), expected2));
  }
Example #9
0
  private SerDe getSerDe() throws SQLException {
    if (serde != null) {
      return serde;
    }
    try {
      List<FieldSchema> fieldSchemas = mResultSchema.getFieldSchemas();
      StringBuilder namesSb = new StringBuilder();
      StringBuilder typesSb = new StringBuilder();

      if (fieldSchemas != null && !fieldSchemas.isEmpty()) {
        for (int pos = 0; pos < fieldSchemas.size(); pos++) {
          if (pos != 0) {
            namesSb.append(",");
            typesSb.append(",");
          }
          namesSb.append(fieldSchemas.get(pos).getName());
          typesSb.append(fieldSchemas.get(pos).getType());
        }
      }
      String names = namesSb.toString();
      String types = typesSb.toString();

      serde = new LazySimpleSerDe();
      Properties props = new Properties();
      if (names.length() > 0) {
        LOG.debug("Column names: " + names);
        props.setProperty(serdeConstants.LIST_COLUMNS, names);
      }
      if (types.length() > 0) {
        LOG.debug("Column types: " + types);
        props.setProperty(serdeConstants.LIST_COLUMN_TYPES, types);
      }
      SerDeUtils.initializeSerDe(serde, new HiveConf(), props, null);

    } catch (Exception ex) {
      ex.printStackTrace();
      throw new SQLException("Could not create ResultSet: " + ex.getMessage(), ex);
    }
    return serde;
  }
Example #10
0
  @Override
  @SuppressWarnings("unchecked")
  public void init(JobConf job, OutputCollector output, Reporter reporter) throws Exception {
    perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.SPARK_INIT_OPERATORS);
    super.init(job, output, reporter);

    rowObjectInspector = new ObjectInspector[Byte.MAX_VALUE];
    ObjectInspector[] valueObjectInspector = new ObjectInspector[Byte.MAX_VALUE];
    ObjectInspector keyObjectInspector;

    ReduceWork gWork = Utilities.getReduceWork(job);

    reducer = gWork.getReducer();
    vectorized = gWork.getVectorMode();
    reducer.setParentOperators(null); // clear out any parents as reducer is the root
    isTagged = gWork.getNeedsTagging();
    try {
      keyTableDesc = gWork.getKeyDesc();
      inputKeyDeserializer = ReflectionUtils.newInstance(keyTableDesc.getDeserializerClass(), null);
      SerDeUtils.initializeSerDe(inputKeyDeserializer, null, keyTableDesc.getProperties(), null);
      keyObjectInspector = inputKeyDeserializer.getObjectInspector();
      valueTableDesc = new TableDesc[gWork.getTagToValueDesc().size()];

      if (vectorized) {
        final int maxTags = gWork.getTagToValueDesc().size();
        keyStructInspector = (StructObjectInspector) keyObjectInspector;
        batches = new VectorizedRowBatch[maxTags];
        valueStructInspectors = new StructObjectInspector[maxTags];
        valueStringWriters = new List[maxTags];
        keysColumnOffset = keyStructInspector.getAllStructFieldRefs().size();
        buffer = new DataOutputBuffer();
      }

      for (int tag = 0; tag < gWork.getTagToValueDesc().size(); tag++) {
        // We should initialize the SerDe with the TypeInfo when available.
        valueTableDesc[tag] = gWork.getTagToValueDesc().get(tag);
        inputValueDeserializer[tag] =
            ReflectionUtils.newInstance(valueTableDesc[tag].getDeserializerClass(), null);
        SerDeUtils.initializeSerDe(
            inputValueDeserializer[tag], null, valueTableDesc[tag].getProperties(), null);
        valueObjectInspector[tag] = inputValueDeserializer[tag].getObjectInspector();

        ArrayList<ObjectInspector> ois = new ArrayList<ObjectInspector>();

        if (vectorized) {
          /* vectorization only works with struct object inspectors */
          valueStructInspectors[tag] = (StructObjectInspector) valueObjectInspector[tag];

          ObjectPair<VectorizedRowBatch, StandardStructObjectInspector> pair =
              VectorizedBatchUtil.constructVectorizedRowBatch(
                  keyStructInspector,
                  valueStructInspectors[tag],
                  gWork.getVectorScratchColumnTypeMap());
          batches[tag] = pair.getFirst();
          final int totalColumns =
              keysColumnOffset + valueStructInspectors[tag].getAllStructFieldRefs().size();
          valueStringWriters[tag] = new ArrayList<VectorExpressionWriter>(totalColumns);
          valueStringWriters[tag].addAll(
              Arrays.asList(
                  VectorExpressionWriterFactory.genVectorStructExpressionWritables(
                      keyStructInspector)));
          valueStringWriters[tag].addAll(
              Arrays.asList(
                  VectorExpressionWriterFactory.genVectorStructExpressionWritables(
                      valueStructInspectors[tag])));

          rowObjectInspector[tag] = pair.getSecond();
        } else {
          ois.add(keyObjectInspector);
          ois.add(valueObjectInspector[tag]);
          // reducer.setGroupKeyObjectInspector(keyObjectInspector);
          rowObjectInspector[tag] =
              ObjectInspectorFactory.getStandardStructObjectInspector(
                  Utilities.reduceFieldNameList, ois);
        }
      }
    } catch (Exception e) {
      throw new RuntimeException(e);
    }

    ExecMapperContext execContext = new ExecMapperContext(job);
    localWork = gWork.getMapRedLocalWork();
    execContext.setJc(jc);
    execContext.setLocalWork(localWork);
    reducer.passExecContext(execContext);

    reducer.setReporter(rp);
    OperatorUtils.setChildrenCollector(
        Arrays.<Operator<? extends OperatorDesc>>asList(reducer), output);

    // initialize reduce operator tree
    try {
      LOG.info(reducer.dump(0));
      reducer.initialize(jc, rowObjectInspector);

      if (localWork != null) {
        for (Operator<? extends OperatorDesc> dummyOp : localWork.getDummyParentOp()) {
          dummyOp.setExecContext(execContext);
          dummyOp.initialize(jc, null);
        }
      }

    } catch (Throwable e) {
      abort = true;
      if (e instanceof OutOfMemoryError) {
        // Don't create a new object if we are already out of memory
        throw (OutOfMemoryError) e;
      } else {
        throw new RuntimeException("Reduce operator initialization failed", e);
      }
    }
    perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.SPARK_INIT_OPERATORS);
  }