@Override
protected String extractField(Object target) {
    if (target instanceof HiveType) {
        HiveType type = (HiveType) target;
        ObjectInspector inspector = type.getObjectInspector();
        if (inspector instanceof StructObjectInspector) {
            StructObjectInspector soi = (StructObjectInspector) inspector;
            StructField field = soi.getStructFieldRef(fieldName);
            ObjectInspector foi = field.getFieldObjectInspector();
            Assert.isTrue(foi.getCategory() == ObjectInspector.Category.PRIMITIVE,
                    String.format("Field [%s] needs to be a primitive; found [%s]", fieldName, foi.getTypeName()));
            // expecting a Writable - simply do a toString
            Object data = soi.getStructFieldData(type.getObject(), field);
            if (data == null || data instanceof NullWritable) {
                return StringUtils.EMPTY;
            }
            return data.toString();
        }
    }
    return null;
}
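The core of the method above is the standard Hive ObjectInspector lookup sequence: field ref, then field data, then toString. Below is a minimal self-contained sketch of that sequence using only the stock Hive serde factories; the project-specific wrapper types (HiveType, the fieldName member) are not needed, and the row shape here is invented for illustration.

import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class StructFieldLookupSketch {
    public static void main(String[] args) {
        // Build a struct inspector for a row shaped like {name: string, age: string}
        List<String> fieldNames = Arrays.asList("name", "age");
        List<ObjectInspector> fieldOIs = Arrays.<ObjectInspector>asList(
                PrimitiveObjectInspectorFactory.javaStringObjectInspector,
                PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        StructObjectInspector soi =
                ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);

        // A standard struct row is just a List of field values in declaration order
        Object row = Arrays.<Object>asList("alice", "42");

        // Same lookup sequence as extractField: field ref -> field data -> toString
        StructField field = soi.getStructFieldRef("name");
        Object data = soi.getStructFieldData(row, field);
        System.out.println(data == null ? "" : data.toString()); // prints "alice"
    }
}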
/**
 * Builds the assigners from an object inspector and from a list of columns.
 *
 * @param outputBatch The batch to which the assigners are bound
 * @param outputOI The row object inspector
 * @param columnMap Vector column map
 * @param outputColumnNames Column names, used both to find the vector columns in the
 *        column map and to resolve the struct fields in the row object inspector
 * @return the assigners, one per output column
 * @throws HiveException
 */
public static VectorColumnAssign[] buildAssigners(VectorizedRowBatch outputBatch,
        ObjectInspector outputOI, Map<String, Integer> columnMap,
        List<String> outputColumnNames) throws HiveException {
    StructObjectInspector soi = (StructObjectInspector) outputOI;
    VectorColumnAssign[] vcas = new VectorColumnAssign[outputColumnNames.size()];
    for (int i = 0; i < outputColumnNames.size(); ++i) {
        String columnName = outputColumnNames.get(i);
        Integer columnIndex = columnMap.get(columnName);
        StructField columnRef = soi.getStructFieldRef(columnName);
        ObjectInspector valueOI = columnRef.getFieldObjectInspector();
        vcas[i] = buildObjectAssign(outputBatch, columnIndex, valueOI);
    }
    return vcas;
}
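A hedged usage sketch for the method above, assuming it lives on Hive's VectorColumnAssignFactory as in the upstream source. One point worth showing: the generated assigners cast incoming values to Hadoop Writables, so the row values below are LongWritable/Text rather than Long/String, and the object inspectors are the writable flavor. The two-column row shape is invented for illustration.

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorColumnAssign;
import org.apache.hadoop.hive.ql.exec.vector.VectorColumnAssignFactory;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

public class AssignersSketch {
    public static void main(String[] args) throws HiveException {
        // Output batch: column 0 holds longs, column 1 holds strings
        VectorizedRowBatch batch = new VectorizedRowBatch(2);
        batch.cols[0] = new LongColumnVector();
        batch.cols[1] = new BytesColumnVector();

        // Row object inspector matching (id bigint, name string), writable flavor
        StructObjectInspector rowOI = ObjectInspectorFactory.getStandardStructObjectInspector(
                Arrays.asList("id", "name"),
                Arrays.<ObjectInspector>asList(
                        PrimitiveObjectInspectorFactory.writableLongObjectInspector,
                        PrimitiveObjectInspectorFactory.writableStringObjectInspector));

        // Column name -> vector column index
        Map<String, Integer> columnMap = new HashMap<>();
        columnMap.put("id", 0);
        columnMap.put("name", 1);

        VectorColumnAssign[] assigners = VectorColumnAssignFactory.buildAssigners(
                batch, rowOI, columnMap, Arrays.asList("id", "name"));

        // Copy one row into slot 0 of the batch
        List<Object> row = Arrays.<Object>asList(new LongWritable(7L), new Text("seven"));
        for (int i = 0; i < assigners.length; i++) {
            assigners[i].assignObjectValue(row.get(i), 0);
        }
        batch.size = 1;
    }
}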
public GenericHiveRecordCursor(
        RecordReader<K, V> recordReader,
        long totalBytes,
        Properties splitSchema,
        List<HivePartitionKey> partitionKeys,
        List<HiveColumnHandle> columns,
        DateTimeZone hiveStorageTimeZone,
        DateTimeZone sessionTimeZone)
{
    checkNotNull(recordReader, "recordReader is null");
    checkArgument(totalBytes >= 0, "totalBytes is negative");
    checkNotNull(splitSchema, "splitSchema is null");
    checkNotNull(partitionKeys, "partitionKeys is null");
    checkNotNull(columns, "columns is null");
    checkArgument(!columns.isEmpty(), "columns is empty");
    checkNotNull(hiveStorageTimeZone, "hiveStorageTimeZone is null");
    checkNotNull(sessionTimeZone, "sessionTimeZone is null");

    this.recordReader = recordReader;
    this.totalBytes = totalBytes;
    this.key = recordReader.createKey();
    this.value = recordReader.createValue();
    this.hiveStorageTimeZone = hiveStorageTimeZone;
    this.sessionTimeZone = sessionTimeZone;

    this.deserializer = getDeserializer(splitSchema);
    this.rowInspector = getTableObjectInspector(deserializer);

    int size = columns.size();
    String[] names = new String[size];
    this.types = new Type[size];
    this.hiveTypes = new HiveType[size];

    this.structFields = new StructField[size];
    this.fieldInspectors = new ObjectInspector[size];

    this.isPartitionColumn = new boolean[size];

    this.loaded = new boolean[size];
    this.booleans = new boolean[size];
    this.longs = new long[size];
    this.doubles = new double[size];
    this.slices = new Slice[size];
    this.nulls = new boolean[size];

    // initialize data columns
    for (int i = 0; i < columns.size(); i++) {
        HiveColumnHandle column = columns.get(i);

        names[i] = column.getName();
        types[i] = column.getType();
        hiveTypes[i] = column.getHiveType();

        if (!column.isPartitionKey()) {
            StructField field = rowInspector.getStructFieldRef(column.getName());
            structFields[i] = field;
            fieldInspectors[i] = field.getFieldObjectInspector();
        }

        isPartitionColumn[i] = column.isPartitionKey();
    }

    // parse requested partition columns
    Map<String, HivePartitionKey> partitionKeysByName = uniqueIndex(partitionKeys, HivePartitionKey.nameGetter());
    for (int columnIndex = 0; columnIndex < columns.size(); columnIndex++) {
        HiveColumnHandle column = columns.get(columnIndex);
        if (column.isPartitionKey()) {
            HivePartitionKey partitionKey = partitionKeysByName.get(column.getName());
            checkArgument(partitionKey != null, "Unknown partition key %s", column.getName());

            byte[] bytes = partitionKey.getValue().getBytes(Charsets.UTF_8);

            Type type = types[columnIndex];
            if (BOOLEAN.equals(type)) {
                if (isTrue(bytes, 0, bytes.length)) {
                    booleans[columnIndex] = true;
                }
                else if (isFalse(bytes, 0, bytes.length)) {
                    booleans[columnIndex] = false;
                }
                else {
                    String valueString = new String(bytes, Charsets.UTF_8);
                    throw new IllegalArgumentException(String.format(
                            "Invalid partition value '%s' for BOOLEAN partition key %s", valueString, names[columnIndex]));
                }
            }
            else if (BIGINT.equals(type)) {
                if (bytes.length == 0) {
                    throw new IllegalArgumentException(String.format(
                            "Invalid partition value '' for BIGINT partition key %s", names[columnIndex]));
                }
                longs[columnIndex] = parseLong(bytes, 0, bytes.length);
            }
            else if (DOUBLE.equals(type)) {
                if (bytes.length == 0) {
                    throw new IllegalArgumentException(String.format(
                            "Invalid partition value '' for DOUBLE partition key %s", names[columnIndex]));
                }
                doubles[columnIndex] = parseDouble(bytes, 0, bytes.length);
            }
            else if (VARCHAR.equals(type)) {
                slices[columnIndex] = Slices.wrappedBuffer(Arrays.copyOf(bytes, bytes.length));
            }
            else {
                throw new UnsupportedOperationException("Unsupported column type: " + type);
            }
        }
    }
}
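The constructor above pre-parses every partition key value once, at construction time, so per-row reads are plain array lookups. Below is an illustrative, self-contained version of that dispatch-on-type decoding; the enum and parsing calls are stand-ins for Presto's Type machinery and its byte-level parseLong/parseDouble/isTrue helpers, not the real API.

import java.nio.charset.StandardCharsets;

public class PartitionKeySketch {
    enum Kind { BOOLEAN, BIGINT, DOUBLE, VARCHAR }

    static Object decode(Kind kind, String value, String name) {
        switch (kind) {
            case BOOLEAN:
                if (value.equalsIgnoreCase("true")) return Boolean.TRUE;
                if (value.equalsIgnoreCase("false")) return Boolean.FALSE;
                throw new IllegalArgumentException(String.format(
                        "Invalid partition value '%s' for BOOLEAN partition key %s", value, name));
            case BIGINT:
                if (value.isEmpty()) {
                    throw new IllegalArgumentException(String.format(
                            "Invalid partition value '' for BIGINT partition key %s", name));
                }
                return Long.parseLong(value);
            case DOUBLE:
                if (value.isEmpty()) {
                    throw new IllegalArgumentException(String.format(
                            "Invalid partition value '' for DOUBLE partition key %s", name));
                }
                return Double.parseDouble(value);
            case VARCHAR:
                // defensive copy, mirroring Arrays.copyOf(bytes, bytes.length) above
                return value.getBytes(StandardCharsets.UTF_8);
            default:
                throw new UnsupportedOperationException("Unsupported column type: " + kind);
        }
    }

    public static void main(String[] args) {
        System.out.println(decode(Kind.BIGINT, "20140131", "ds"));   // 20140131
        System.out.println(decode(Kind.BOOLEAN, "true", "active")); // true
    }
}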
public GenericHiveRecordCursor(
        RecordReader<K, V> recordReader,
        long totalBytes,
        Properties splitSchema,
        List<HivePartitionKey> partitionKeys,
        List<HiveColumnHandle> columns,
        DateTimeZone hiveStorageTimeZone,
        TypeManager typeManager)
{
    requireNonNull(recordReader, "recordReader is null");
    checkArgument(totalBytes >= 0, "totalBytes is negative");
    requireNonNull(splitSchema, "splitSchema is null");
    requireNonNull(partitionKeys, "partitionKeys is null");
    requireNonNull(columns, "columns is null");
    requireNonNull(hiveStorageTimeZone, "hiveStorageTimeZone is null");

    this.recordReader = recordReader;
    this.totalBytes = totalBytes;
    this.key = recordReader.createKey();
    this.value = recordReader.createValue();
    this.hiveStorageTimeZone = hiveStorageTimeZone;

    this.deserializer = getDeserializer(splitSchema);
    this.rowInspector = getTableObjectInspector(deserializer);

    int size = columns.size();
    String[] names = new String[size];
    this.types = new Type[size];
    this.hiveTypes = new HiveType[size];

    this.structFields = new StructField[size];
    this.fieldInspectors = new ObjectInspector[size];

    this.isPartitionColumn = new boolean[size];

    this.loaded = new boolean[size];
    this.booleans = new boolean[size];
    this.longs = new long[size];
    this.doubles = new double[size];
    this.slices = new Slice[size];
    this.objects = new Object[size];
    this.nulls = new boolean[size];

    // initialize data columns
    for (int i = 0; i < columns.size(); i++) {
        HiveColumnHandle column = columns.get(i);

        names[i] = column.getName();
        types[i] = typeManager.getType(column.getTypeSignature());
        hiveTypes[i] = column.getHiveType();

        if (!column.isPartitionKey()) {
            StructField field = rowInspector.getStructFieldRef(column.getName());
            structFields[i] = field;
            fieldInspectors[i] = field.getFieldObjectInspector();
        }

        isPartitionColumn[i] = column.isPartitionKey();
    }

    // parse requested partition columns
    Map<String, HivePartitionKey> partitionKeysByName = uniqueIndex(partitionKeys, HivePartitionKey::getName);
    for (int columnIndex = 0; columnIndex < columns.size(); columnIndex++) {
        HiveColumnHandle column = columns.get(columnIndex);
        if (column.isPartitionKey()) {
            HivePartitionKey partitionKey = partitionKeysByName.get(column.getName());
            checkArgument(partitionKey != null, "Unknown partition key %s", column.getName());

            byte[] bytes = partitionKey.getValue().getBytes(UTF_8);

            String name = names[columnIndex];
            Type type = types[columnIndex];
            if (HiveUtil.isHiveNull(bytes)) {
                nulls[columnIndex] = true;
            }
            else if (BOOLEAN.equals(type)) {
                booleans[columnIndex] = booleanPartitionKey(partitionKey.getValue(), name);
            }
            else if (BIGINT.equals(type)) {
                longs[columnIndex] = bigintPartitionKey(partitionKey.getValue(), name);
            }
            else if (INTEGER.equals(type)) {
                longs[columnIndex] = integerPartitionKey(partitionKey.getValue(), name);
            }
            else if (SMALLINT.equals(type)) {
                longs[columnIndex] = smallintPartitionKey(partitionKey.getValue(), name);
            }
            else if (TINYINT.equals(type)) {
                longs[columnIndex] = tinyintPartitionKey(partitionKey.getValue(), name);
            }
            else if (DOUBLE.equals(type)) {
                doubles[columnIndex] = doublePartitionKey(partitionKey.getValue(), name);
            }
            else if (isVarcharType(type)) {
                slices[columnIndex] = varcharPartitionKey(partitionKey.getValue(), name, type);
            }
            else if (DATE.equals(type)) {
                longs[columnIndex] = datePartitionKey(partitionKey.getValue(), name);
            }
            else if (TIMESTAMP.equals(type)) {
                longs[columnIndex] = timestampPartitionKey(partitionKey.getValue(), hiveStorageTimeZone, name);
            }
            else if (isShortDecimal(type)) {
                longs[columnIndex] = shortDecimalPartitionKey(partitionKey.getValue(), (DecimalType) type, name);
            }
            else if (isLongDecimal(type)) {
                slices[columnIndex] = longDecimalPartitionKey(partitionKey.getValue(), (DecimalType) type, name);
            }
            else {
                throw new PrestoException(NOT_SUPPORTED, format(
                        "Unsupported column type %s for partition key: %s", type.getDisplayName(), name));
            }
        }
    }
}
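Relative to the older constructor, this version adds a null short-circuit (Hive serializes NULL partition values as "\N" by default) and temporal types decoded to longs: DATE as days since the epoch, TIMESTAMP as millis parsed in the Hive storage time zone. The sketch below restates those three behaviors in plain java.time code; the formatter pattern and method names are assumptions for illustration, not Presto's exact parsing helpers.

import java.nio.charset.StandardCharsets;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;

public class NullAndTemporalKeysSketch {
    // Hive's default serialization of NULL partition values is the two bytes \N
    static boolean isHiveNull(byte[] bytes) {
        return bytes.length == 2 && bytes[0] == '\\' && bytes[1] == 'N';
    }

    // DATE partition keys decode to days since 1970-01-01
    static long dateKey(String value) {
        return LocalDate.parse(value).toEpochDay();
    }

    // TIMESTAMP partition keys are interpreted in the Hive storage time zone
    static long timestampKey(String value, ZoneId hiveStorageZone) {
        DateTimeFormatter f = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
        return LocalDateTime.parse(value, f).atZone(hiveStorageZone).toInstant().toEpochMilli();
    }

    public static void main(String[] args) {
        System.out.println(isHiveNull("\\N".getBytes(StandardCharsets.UTF_8))); // true
        System.out.println(dateKey("2014-01-31"));                              // 16101
        System.out.println(timestampKey("2014-01-31 00:00:00", ZoneId.of("UTC")));
    }
}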
@Override
public void processOp(Object row, int tag) throws HiveException {
    try {
        reportProgress();

        // get alias
        alias = (byte) tag;

        if ((lastAlias == null) || (!lastAlias.equals(alias))) {
            nextSz = joinEmitInterval;
        }

        ArrayList<Object> nr = JoinUtil.computeValues(row, joinValues.get(alias),
                joinValuesObjectInspectors.get(alias), joinFilters.get(alias),
                joinFilterObjectInspectors.get(alias), noOuterJoin);

        if (handleSkewJoin) {
            skewJoinKeyContext.handleSkew(tag);
        }

        // number of rows for the key in the given table
        int sz = storage.get(alias).size();
        StructObjectInspector soi = (StructObjectInspector) inputObjInspectors[tag];
        StructField sf = soi.getStructFieldRef(Utilities.ReduceField.KEY.toString());
        Object keyObject = soi.getStructFieldData(row, sf);

        // Are we consuming too much memory?
        if (alias == numAliases - 1 && !(handleSkewJoin && skewJoinKeyContext.currBigKeyTag >= 0)) {
            if (sz == joinEmitInterval) {
                // The input is sorted by alias, so if we are already in the last join
                // operand, we can emit some results now.
                // Note this has to be done before adding the current row to the
                // storage, to preserve the correctness for outer joins.
                checkAndGenObject();
                storage.get(alias).clear();
            }
        } else {
            if (sz == nextSz) {
                // Output a warning if we reached at least 1000 rows for a join operand.
                // We won't output a warning for the last join operand since its size
                // will never reach joinEmitInterval.
                LOG.warn("table " + alias + " has " + sz + " rows for join key " + keyObject);
                nextSz = getNextSize(nextSz);
            }
        }

        // Add the value to the vector
        storage.get(alias).add(nr);

        // if the join key is null, process each row in a different group.
        if (SerDeUtils.hasAnyNullObject(keyObject, sf.getFieldObjectInspector())) {
            endGroup();
            startGroup();
        }
    } catch (Exception e) {
        e.printStackTrace();
        throw new HiveException(e);
    }
}
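The memory-safety trick in processOp is worth isolating: rows arrive grouped by join key and sorted by alias, so once the operator is on the last alias it can flush partial cross-products every joinEmitInterval rows instead of buffering the entire key group. Below is a simplified, self-contained sketch of that buffering invariant; the class and method names are illustrative stand-ins, not Hive's internals.

import java.util.ArrayList;
import java.util.List;

public class JoinBufferSketch {
    static final int JOIN_EMIT_INTERVAL = 1000;
    final List<List<Object>> storage = new ArrayList<>(); // one buffer per alias
    final int numAliases;

    JoinBufferSketch(int numAliases) {
        this.numAliases = numAliases;
        for (int i = 0; i < numAliases; i++) {
            storage.add(new ArrayList<>());
        }
    }

    void process(Object row, int tag) {
        List<Object> buffer = storage.get(tag);
        // Flush BEFORE adding the current row, as the original comment notes,
        // so outer-join semantics over the already-buffered rows stay correct.
        if (tag == numAliases - 1 && buffer.size() == JOIN_EMIT_INTERVAL) {
            emitPartialResults();
            buffer.clear();
        }
        buffer.add(row);
    }

    void emitPartialResults() {
        // stand-in for checkAndGenObject(): join the buffered rows of earlier
        // aliases against the buffered slice of the last alias
    }
}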