/**
 * Writes a LIST type and its array elements to the Parquet RecordConsumer.
 * Called when writeValue() detects a LIST original type.
 *
 * @param array The list of array values that contains the repeated array group type
 * @param type Type that contains information about the group schema
 */
private void writeArray(final ArrayWritable array, final GroupType type) {
  GroupType repeatedType = type.getType(0).asGroupType();
  ArrayWritable repeatedValue = (ArrayWritable) array.get()[0];

  recordConsumer.startGroup();
  recordConsumer.startField(repeatedType.getName(), 0);

  Writable[] arrayValues = repeatedValue.get();
  for (int record = 0; record < arrayValues.length; record++) {
    recordConsumer.startGroup(); // Null values must be wrapped into startGroup/endGroup
    Writable element = arrayValues[record];
    if (element != null) {
      // Iterate over the fields of the repeated group, not of the outer
      // LIST group, since the elements are written with the repeated type.
      for (int i = 0; i < repeatedType.getFieldCount(); i++) {
        Type fieldType = repeatedType.getType(i);
        String fieldName = fieldType.getName();
        recordConsumer.startField(fieldName, i);
        writeValue(element, fieldType);
        recordConsumer.endField(fieldName, i);
      }
    }
    recordConsumer.endGroup();
  }

  recordConsumer.endField(repeatedType.getName(), 0);
  recordConsumer.endGroup();
}
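// A minimal, hypothetical sketch (schema, field names, and class name are
// assumed, not taken from the original source) of the single-level LIST
// layout writeArray() expects, with the RecordConsumer event sequence it
// emits shown as comments.
import parquet.schema.GroupType;
import parquet.schema.MessageType;
import parquet.schema.MessageTypeParser;

class WriteArraySketch {
  public static void main(String[] args) {
    MessageType schema = MessageTypeParser.parseMessageType(
        "message hive_record {\n"
            + "  optional group my_list (LIST) {\n"
            + "    repeated group bag {\n"
            + "      optional binary array_element (UTF8);\n"
            + "    }\n"
            + "  }\n"
            + "}");
    GroupType listType = schema.getType("my_list").asGroupType();
    System.out.println(listType);
    // For the value ["a", null], writeArray(value, listType) emits:
    //   startGroup
    //     startField("bag", 0)
    //       startGroup                       // element "a"
    //         startField("array_element", 0)
    //           addBinary("a")
    //         endField("array_element", 0)
    //       endGroup
    //       startGroup                       // null element: empty group
    //       endGroup
    //     endField("bag", 0)
    //   endGroup
  }
}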
public ParquetListConverter(Type prestoType, String columnName, GroupType listType) {
  checkArgument(
      listType.getFieldCount() == 1,
      "Expected LIST column '%s' to only have one field, but has %s fields",
      columnName,
      listType.getFieldCount());
  checkArgument(ARRAY.equals(prestoType.getTypeSignature().getBase()));

  this.arrayType = prestoType;

  // The Parquet specification requires that the element value of a
  // LIST type be wrapped in an inner repeated group, like so:
  //
  //     optional group listField (LIST) {
  //       repeated group list {
  //         optional int32 element;
  //       }
  //     }
  //
  // However, some Parquet libraries don't follow this spec. The
  // compatibility rules used here are specified in the Parquet
  // documentation at http://git.io/vOpNz.
  parquet.schema.Type elementType = listType.getType(0);
  if (isElementType(elementType, listType.getName())) {
    elementConverter = createConverter(
        prestoType.getTypeParameters().get(0), columnName + ".element", elementType);
  } else {
    elementConverter = new ParquetListEntryConverter(
        prestoType.getTypeParameters().get(0), columnName, elementType.asGroupType());
  }
}
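// A sketch (not the verbatim source) of the isElementType() check used
// above, following the backward-compatibility rules from the Parquet
// documentation: the repeated field IS the element when it is a primitive,
// when it has more than one field, or when it carries a legacy writer's
// naming convention.
private static boolean isElementType(parquet.schema.Type repeatedType, String parentName) {
  if (repeatedType.isPrimitive()) {
    return true;
  }
  if (repeatedType.asGroupType().getFieldCount() > 1) {
    return true;
  }
  // Legacy writers name the repeated group "array" or "<parent>_tuple"
  // when it is itself the element type rather than a wrapper.
  return repeatedType.getName().equals("array")
      || repeatedType.getName().equals(parentName + "_tuple");
}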
/**
 * Writes a MAP type and its key-value pairs to the Parquet RecordConsumer.
 * Called when writeValue() detects a MAP original type.
 *
 * @param value The list of map values that contains the repeated MAP_KEY_VALUE group type
 * @param type Type that contains information about the group schema
 */
private void writeMap(final ArrayWritable value, final GroupType type) {
  GroupType repeatedType = type.getType(0).asGroupType();
  ArrayWritable repeatedValue = (ArrayWritable) value.get()[0];

  recordConsumer.startGroup();
  recordConsumer.startField(repeatedType.getName(), 0);

  Writable[] mapValues = repeatedValue.get();
  for (int record = 0; record < mapValues.length; record++) {
    Writable keyValuePair = mapValues[record];
    if (keyValuePair == null) {
      throw new RuntimeException("Map key-value pair is null on record " + record);
    }
    // Hive wraps each map key-value pair in an ArrayWritable
    if (!(keyValuePair instanceof ArrayWritable)) {
      throw new RuntimeException(
          "Map key-value pair is not an ArrayWritable object on record " + record);
    }
    writeGroup((ArrayWritable) keyValuePair, repeatedType);
  }

  recordConsumer.endField(repeatedType.getName(), 0);
  recordConsumer.endGroup();
}
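// A hypothetical sketch (schema and class name assumed) of the MAP layout
// writeMap() expects: a single repeated key/value group under the
// MAP-annotated group. Each ArrayWritable entry holds {key, value} and is
// written by writeGroup() against the repeated type.
import parquet.schema.GroupType;
import parquet.schema.MessageType;
import parquet.schema.MessageTypeParser;

class WriteMapSketch {
  public static void main(String[] args) {
    MessageType schema = MessageTypeParser.parseMessageType(
        "message hive_record {\n"
            + "  optional group my_map (MAP) {\n"
            + "    repeated group map (MAP_KEY_VALUE) {\n"
            + "      required binary key (UTF8);\n"
            + "      optional int32 value;\n"
            + "    }\n"
            + "  }\n"
            + "}");
    GroupType mapType = schema.getType("my_map").asGroupType();
    // repeatedType in writeMap() resolves to the inner "map" group:
    System.out.println(mapType.getType(0));
  }
}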
/**
 * Creates a new TajoRecordConverter.
 *
 * @param parquetSchema The Parquet schema of the projection.
 * @param tajoReadSchema The Tajo schema of the table.
 * @param projectionMap An array mapping the projection column to the column index in the table.
 */
public TajoRecordConverter(GroupType parquetSchema, Schema tajoReadSchema, int[] projectionMap) {
  this.parquetSchema = parquetSchema;
  this.tajoReadSchema = tajoReadSchema;
  this.projectionMap = projectionMap;
  this.tupleSize = tajoReadSchema.size();

  // The projectionMap.length does not match parquetSchema.getFieldCount()
  // when the projection contains NULL_TYPE columns. We will skip over the
  // NULL_TYPE columns when we construct the converters and populate the
  // NULL_TYPE columns with NullDatums in start().
  int index = 0;
  this.converters = new Converter[parquetSchema.getFieldCount()];
  for (int i = 0; i < projectionMap.length; ++i) {
    final int projectionIndex = projectionMap[i];
    Column column = tajoReadSchema.getColumn(projectionIndex);
    if (column.getDataType().getType() == TajoDataTypes.Type.NULL_TYPE) {
      continue;
    }

    Type type = parquetSchema.getType(index);
    final int writeIndex = i;
    converters[index] = newConverter(column, type, new ParentValueContainer() {
      @Override
      void add(Object value) {
        TajoRecordConverter.this.set(writeIndex, value);
      }
    });
    ++index;
  }
}
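// A hedged sketch (not the verbatim Tajo source; VTuple, NullDatum, and the
// tuple indexing are assumed) of the start() behavior the comment above
// refers to: NULL_TYPE columns never receive a converter, so their tuple
// slots are pre-filled with NullDatum before the converters write values.
@Override
public void start() {
  currentTuple = new VTuple(tupleSize);
  for (int i = 0; i < projectionMap.length; ++i) {
    Column column = tajoReadSchema.getColumn(projectionMap[i]);
    if (column.getDataType().getType() == TajoDataTypes.Type.NULL_TYPE) {
      currentTuple.put(projectionMap[i], NullDatum.get());
    }
  }
}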
public ParquetMapEntryConverter(Type prestoType, String columnName, GroupType entryType) {
  checkArgument(StandardTypes.MAP.equals(prestoType.getTypeSignature().getBase()));
  // The original version of Parquet used a null entry type due to a bug
  if (entryType.getOriginalType() != null) {
    checkArgument(
        entryType.getOriginalType() == MAP_KEY_VALUE,
        "Expected MAP column '%s' field to be type %s, but is %s",
        columnName,
        MAP_KEY_VALUE,
        entryType);
  }

  GroupType entryGroupType = entryType.asGroupType();
  checkArgument(
      entryGroupType.getFieldCount() == 2,
      "Expected MAP column '%s' entry to have two fields, but has %s fields",
      columnName,
      entryGroupType.getFieldCount());
  checkArgument(
      entryGroupType.getFieldName(0).equals("key"),
      "Expected MAP column '%s' entry field 0 to be named 'key', but is named %s",
      columnName,
      entryGroupType.getFieldName(0));
  checkArgument(
      entryGroupType.getFieldName(1).equals("value"),
      "Expected MAP column '%s' entry field 1 to be named 'value', but is named %s",
      columnName,
      entryGroupType.getFieldName(1));
  checkArgument(
      entryGroupType.getType(0).isPrimitive(),
      "Expected MAP column '%s' entry field 0 to be primitive, but is %s",
      columnName,
      entryGroupType.getType(0));

  keyConverter = createConverter(
      prestoType.getTypeParameters().get(0), columnName + ".key",
      entryGroupType.getFields().get(0));
  valueConverter = createConverter(
      prestoType.getTypeParameters().get(1), columnName + ".value",
      entryGroupType.getFields().get(1));
}
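// A hypothetical sketch (schema and class name assumed) of an entry group
// that satisfies every check above: exactly two fields, field 0 a primitive
// named "key", field 1 named "value". A group-typed key or a misnamed field
// would trip the corresponding checkArgument.
class MapEntryShapeSketch {
  public static void main(String[] args) {
    System.out.println(parquet.schema.MessageTypeParser.parseMessageType(
        "message m {\n"
            + "  optional group my_map (MAP) {\n"
            + "    repeated group map (MAP_KEY_VALUE) {\n"
            + "      required binary key (UTF8);\n"
            + "      optional int64 value;\n"
            + "    }\n"
            + "  }\n"
            + "}"));
  }
}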
private boolean contains(GroupType group, String[] path, int index) {
  if (index == path.length) {
    return false;
  }
  if (group.containsField(path[index])) {
    Type type = group.getType(path[index]);
    if (type.isPrimitive()) {
      return index + 1 == path.length;
    }
    return contains(type.asGroupType(), path, index + 1);
  }
  return false;
}
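// A hypothetical usage sketch (schema and class name assumed): contains()
// returns true only when the full dotted path resolves to a primitive leaf
// of the schema. The calls are shown as comments because the method is
// private in its declaring class.
import parquet.schema.MessageType;
import parquet.schema.MessageTypeParser;

class ContainsSketch {
  public static void main(String[] args) {
    MessageType schema = MessageTypeParser.parseMessageType(
        "message doc {\n"
            + "  optional group address {\n"
            + "    optional binary city (UTF8);\n"
            + "  }\n"
            + "}");
    // contains(schema, new String[] {"address", "city"}, 0) -> true
    // contains(schema, new String[] {"address"}, 0)         -> false (group, not a leaf)
    // contains(schema, new String[] {"zip"}, 0)             -> false (field absent)
    System.out.println(schema);
  }
}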
public ParquetListEntryConverter(Type prestoType, String columnName, GroupType elementType) {
  checkArgument(
      elementType.getOriginalType() == null,
      "Expected LIST column '%s' field to be type STRUCT, but is %s",
      columnName,
      elementType);
  checkArgument(
      elementType.getFieldCount() == 1,
      "Expected LIST column '%s' element to have one field, but has %s fields",
      columnName,
      elementType.getFieldCount());

  elementConverter =
      createConverter(prestoType, columnName + ".element", elementType.getType(0));
}
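// A hypothetical sketch (schema and class name assumed) of the
// spec-compliant wrapped element this converter handles: elementType is the
// inner repeated group, and the converter descends into its single field.
class ListEntryShapeSketch {
  public static void main(String[] args) {
    parquet.schema.MessageType schema = parquet.schema.MessageTypeParser.parseMessageType(
        "message m {\n"
            + "  optional group my_list (LIST) {\n"
            + "    repeated group list {\n"
            + "      optional int32 element;\n"
            + "    }\n"
            + "  }\n"
            + "}");
    // elementType here would be the "list" group; its getType(0) is "element".
    System.out.println(schema.getType("my_list").asGroupType().getType(0));
  }
}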
MapConverter(List<TProtocol> parentEvents, GroupType parquetSchema, ThriftField field) {
  this.parentEvents = parentEvents;
  if (parquetSchema.getFieldCount() != 1) {
    throw new IllegalArgumentException(
        "Maps must have exactly one field. " + parquetSchema
            + " size = " + parquetSchema.getFieldCount());
  }
  Type nestedType = parquetSchema.getType(0);
  final ThriftField key = ((MapType) field.getType()).getKey();
  keyType = key.getType().getType().getThriftType();
  final ThriftField value = ((MapType) field.getType()).getValue();
  valueType = value.getType().getType().getThriftType();
  child = new GroupCounter(new MapKeyValueConverter(mapEvents, nestedType, key, value));
}
/**
 * Writes all the fields contained inside a group to the RecordConsumer.
 *
 * @param value The list of values contained in the group.
 * @param type Type that contains information about the group schema.
 */
public void writeGroupFields(final ArrayWritable value, final GroupType type) {
  if (value != null) {
    for (int i = 0; i < type.getFieldCount(); i++) {
      Type fieldType = type.getType(i);
      String fieldName = fieldType.getName();
      Writable fieldValue = value.get()[i];
      // Parquet does not write null elements
      if (fieldValue != null) {
        recordConsumer.startField(fieldName, i);
        writeValue(fieldValue, fieldType);
        recordConsumer.endField(fieldName, i);
      }
    }
  }
}
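// A hedged sketch (assumed, not verbatim from the surrounding class) of how
// writeGroupFields() is typically driven: nested groups wrap it in
// startGroup/endGroup, while the top-level record would instead be framed
// by startMessage/endMessage on the RecordConsumer.
private void writeGroup(final ArrayWritable value, final GroupType type) {
  recordConsumer.startGroup();
  writeGroupFields(value, type);
  recordConsumer.endGroup();
}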
private StructConverter(List<TProtocol> events, GroupType parquetSchema, ThriftField field) {
  this.events = events;
  this.name = field.getName();
  this.tStruct = new TStruct(name);
  this.thriftType = (StructType) field.getType();
  this.schemaSize = parquetSchema.getFieldCount();
  this.converters = new Converter[this.schemaSize];
  List<ThriftField> thriftChildren = thriftType.getChildren();
  for (int i = 0; i < schemaSize; i++) {
    Type schemaType = parquetSchema.getType(i);
    String fieldName = schemaType.getName();
    ThriftField matchingThrift = null;
    for (ThriftField childField : thriftChildren) {
      String thriftChildName = childField.getName();
      if (thriftChildName != null && thriftChildName.equalsIgnoreCase(fieldName)) {
        matchingThrift = childField;
        break;
      }
    }
    if (matchingThrift == null) {
      // The file did not contain this field, so it will never be populated
      // in this instance; other files might populate it.
      continue;
    }
    if (schemaType.isPrimitive()) {
      converters[i] = new PrimitiveFieldHandler(
          newConverter(events, schemaType, matchingThrift).asPrimitiveConverter(),
          matchingThrift,
          events);
    } else {
      converters[i] = new GroupFieldhandler(
          newConverter(events, schemaType, matchingThrift).asGroupConverter(),
          matchingThrift,
          events);
    }
  }
}
CollectionConverter(List<TProtocol> parentEvents, GroupType parquetSchema, ThriftField values) {
  this.parentEvents = parentEvents;
  if (parquetSchema.getFieldCount() != 1) {
    throw new IllegalArgumentException(
        "Lists must have exactly one field. " + parquetSchema
            + " size = " + parquetSchema.getFieldCount());
  }
  nestedType = parquetSchema.getType(0);
  valuesType = values.getType().getType();
  if (nestedType.isPrimitive()) {
    PrimitiveCounter counter = new PrimitiveCounter(
        newConverter(listEvents, nestedType, values).asPrimitiveConverter());
    child = counter;
    childCounter = counter;
  } else {
    GroupCounter counter =
        new GroupCounter(newConverter(listEvents, nestedType, values).asGroupConverter());
    child = counter;
    childCounter = counter;
  }
}
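// An illustrative sketch (IDL, schema, and class name assumed) of the legacy
// parquet-thrift list layout this converter reads. For a Thrift field like
// "1: list<i64> scores", the repeated child is the element itself, named with
// the "_tuple" suffix that the compatibility rules above account for, so
// nestedType is primitive and the PrimitiveCounter branch is taken.
class CollectionShapeSketch {
  public static void main(String[] args) {
    System.out.println(parquet.schema.MessageTypeParser.parseMessageType(
        "message m {\n"
            + "  optional group scores (LIST) {\n"
            + "    repeated int64 scores_tuple;\n"
            + "  }\n"
            + "}"));
  }
}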