Ejemplo n.º 1
0
    /**
     * Builds the converter for a Parquet LIST column.
     *
     * <p>The arguments are rejected via {@code checkArgument} unless {@code listType} has exactly
     * one field and the base of {@code prestoType}'s signature equals {@code ARRAY}.
     *
     * @param prestoType the array type; its first type parameter is used for the elements
     * @param columnName column name used in error messages and to derive the element name
     * @param listType the Parquet group that describes the list
     */
    public ParquetListConverter(Type prestoType, String columnName, GroupType listType) {
      int fieldCount = listType.getFieldCount();
      checkArgument(
          fieldCount == 1,
          "Expected LIST column '%s' to only have one field, but has %s fields",
          columnName,
          fieldCount);
      checkArgument(ARRAY.equals(prestoType.getTypeSignature().getBase()));

      this.arrayType = prestoType;

      // Per the Parquet specification, a LIST element is wrapped in an inner
      // repeated group:
      //
      // optional group listField (LIST) {
      //   repeated group list {
      //     optional int element
      //   }
      // }
      //
      // Not every Parquet writer honours that layout. The compatibility rules
      // applied below are the ones described in the Parquet documentation at
      // http://git.io/vOpNz.
      parquet.schema.Type repeatedField = listType.getType(0);
      boolean fieldIsElement = isElementType(repeatedField, listType.getName());
      Type elementPrestoType = prestoType.getTypeParameters().get(0);
      if (!fieldIsElement) {
        // The single field is the wrapper group; unwrap it with an entry converter.
        elementConverter =
            new ParquetListEntryConverter(
                elementPrestoType, columnName, repeatedField.asGroupType());
      } else {
        // The single field is itself the element.
        elementConverter =
            createConverter(elementPrestoType, columnName + ".element", repeatedField);
      }
    }
Ejemplo n.º 2
0
  /**
   * Emits a MAP value, one key/value group per entry, to the Parquet RecordConsumer. Invoked when
   * writeValue() detects the original type (MAP).
   *
   * @param value wrapper whose first element is the ArrayWritable holding the repeated
   *     key/value entries
   * @param type group schema of the map; its first field is the repeated key/value group
   * @throws RuntimeException if an entry is null or is not an ArrayWritable
   */
  private void writeMap(final ArrayWritable value, final GroupType type) {
    final GroupType repeatedType = type.getType(0).asGroupType();
    final ArrayWritable repeatedValue = (ArrayWritable) value.get()[0];
    final String repeatedName = repeatedType.getName();

    recordConsumer.startGroup();
    recordConsumer.startField(repeatedName, 0);

    final Writable[] entries = repeatedValue.get();
    for (int record = 0; record < entries.length; record++) {
      final Writable entry = entries[record];
      if (entry == null) {
        throw new RuntimeException("Map key-value pair is null on record " + record);
      }
      // Hive hands each key/value pair over wrapped in an ArrayWritable
      if (!(entry instanceof ArrayWritable)) {
        throw new RuntimeException(
            "Map key-value pair is not an ArrayWritable object on record " + record);
      }
      writeGroup((ArrayWritable) entry, repeatedType);
    }

    recordConsumer.endField(repeatedName, 0);
    recordConsumer.endGroup();
  }
Ejemplo n.º 3
0
  /**
   * Creates a new TajoRecordConverter.
   *
   * <p>One converter is created per Parquet field. Projected columns of type NULL_TYPE get no
   * converter and do not consume a Parquet field index.
   *
   * @param parquetSchema The Parquet schema of the projection.
   * @param tajoReadSchema The Tajo schema of the table.
   * @param projectionMap An array mapping the projection column to the column index in the table.
   */
  public TajoRecordConverter(GroupType parquetSchema, Schema tajoReadSchema, int[] projectionMap) {
    this.parquetSchema = parquetSchema;
    this.tajoReadSchema = tajoReadSchema;
    this.projectionMap = projectionMap;
    this.tupleSize = tajoReadSchema.size();
    this.converters = new Converter[parquetSchema.getFieldCount()];

    // projectionMap.length differs from parquetSchema.getFieldCount() whenever
    // the projection includes NULL_TYPE columns. Those columns are skipped
    // here, so the Parquet field position is tracked separately from the
    // projection position; they are filled with NullDatums in start().
    int parquetIndex = 0;
    for (int i = 0; i < projectionMap.length; ++i) {
      final Column column = tajoReadSchema.getColumn(projectionMap[i]);
      if (column.getDataType().getType() != TajoDataTypes.Type.NULL_TYPE) {
        final int tupleIndex = i;
        final Type fieldType = parquetSchema.getType(parquetIndex);
        // Values are stored at the projection position, not the Parquet field position.
        final ParentValueContainer container =
            new ParentValueContainer() {
              @Override
              void add(Object value) {
                TajoRecordConverter.this.set(tupleIndex, value);
              }
            };
        converters[parquetIndex] = newConverter(column, fieldType, container);
        parquetIndex++;
      }
    }
  }
Ejemplo n.º 4
0
 /**
  * Checks whether {@code path}, read from position {@code index}, names a chain of nested fields
  * under {@code group} that ends on a primitive field at the last path element.
  *
  * <p>Returns {@code false} when the path is exhausted, when a name is not a field of the current
  * group, when a primitive is reached before the last element, or when the last element names a
  * group.
  *
  * @param group the group searched at this level
  * @param path the field names, outermost first
  * @param index position in {@code path} to resolve against {@code group}
  * @return {@code true} only if the remaining path ends exactly on a primitive field
  */
 private boolean contains(GroupType group, String[] path, int index) {
   if (index == path.length || !group.containsField(path[index])) {
     return false;
   }
   Type field = group.getType(path[index]);
   return field.isPrimitive()
       ? index + 1 == path.length
       : contains(field.asGroupType(), path, index + 1);
 }
Ejemplo n.º 5
0
    /**
     * Builds the converter for a Parquet MAP column.
     *
     * <p>The map group must contain exactly one field, which is treated as the entry group and
     * handed to a {@code ParquetMapEntryConverter}.
     *
     * @param type the type stored as this converter's map type and passed to the entry converter
     * @param columnName column name used to derive the entry converter's name
     * @param mapType the Parquet group that describes the map
     */
    public ParquetMapConverter(Type type, String columnName, GroupType mapType) {
      int fieldCount = mapType.getFieldCount();
      checkArgument(
          fieldCount == 1,
          "Expected MAP column '%s' to only have one field, but has %s fields",
          mapType.getName(),
          fieldCount);

      this.mapType = type;

      String entryName = columnName + ".entry";
      GroupType entryGroup = mapType.getFields().get(0).asGroupType();
      entryConverter = new ParquetMapEntryConverter(type, entryName, entryGroup);
    }
Ejemplo n.º 6
0
    public ParquetListEntryConverter(Type prestoType, String columnName, GroupType elementType) {
      checkArgument(
          elementType.getOriginalType() == null,
          "Expected LIST column '%s' field to be type STRUCT, but is %s",
          columnName,
          elementType);

      checkArgument(
          elementType.getFieldCount() == 1,
          "Expected LIST column '%s' element to have one field, but has %s fields",
          columnName,
          elementType.getFieldCount());

      elementConverter =
          createConverter(prestoType, columnName + ".element", elementType.getType(0));
    }
Ejemplo n.º 7
0
 /**
  * Builds the converter for a Thrift map field.
  *
  * @param parentEvents the event list stored as this converter's parent events
  * @param parquetSchema the Parquet group for the map; must have exactly one field
  * @param field the Thrift field, whose type is cast to {@code MapType}
  * @throws IllegalArgumentException if {@code parquetSchema} does not have exactly one field
  */
 MapConverter(List<TProtocol> parentEvents, GroupType parquetSchema, ThriftField field) {
   this.parentEvents = parentEvents;

   final int fieldCount = parquetSchema.getFieldCount();
   if (fieldCount != 1) {
     throw new IllegalArgumentException(
         "maps have only one field. " + parquetSchema + " size = " + fieldCount);
   }

   final Type entryType = parquetSchema.getType(0);
   final MapType thriftMap = (MapType) field.getType();

   final ThriftField key = thriftMap.getKey();
   keyType = key.getType().getType().getThriftType();

   final ThriftField value = thriftMap.getValue();
   valueType = value.getType().getType().getThriftType();

   // Key/value events are collected in mapEvents, separately from parentEvents.
   child = new GroupCounter(new MapKeyValueConverter(mapEvents, entryType, key, value));
 }
Ejemplo n.º 8
0
  /**
   * Writes each non-null field of a group to the RecordConsumer, in schema order. Does nothing
   * when {@code value} is null.
   *
   * @param value The values of the group, indexed by field position.
   * @param type Type that contains information about the group schema.
   */
  public void writeGroupFields(final ArrayWritable value, final GroupType type) {
    if (value == null) {
      return;
    }

    final int fieldCount = type.getFieldCount();
    for (int index = 0; index < fieldCount; index++) {
      final Type fieldType = type.getType(index);
      final String fieldName = fieldType.getName();
      final Writable fieldValue = value.get()[index];

      // Null fields are omitted entirely: no startField/endField is emitted for them
      if (fieldValue == null) {
        continue;
      }

      recordConsumer.startField(fieldName, index);
      writeValue(fieldValue, fieldType);
      recordConsumer.endField(fieldName, index);
    }
  }
Ejemplo n.º 9
0
 /**
  * Builds the converter for a Thrift struct field.
  *
  * <p>For every field of {@code parquetSchema}, the Thrift child whose name matches
  * case-insensitively is looked up and a field handler is installed at the same index. Parquet
  * fields with no matching Thrift child keep a {@code null} slot in {@code converters}.
  *
  * @param events the event list passed to every child converter and handler
  * @param parquetSchema the Parquet group describing the struct
  * @param field the Thrift field, whose type is cast to {@code StructType}
  */
 private StructConverter(List<TProtocol> events, GroupType parquetSchema, ThriftField field) {
   this.events = events;
   this.name = field.getName();
   this.tStruct = new TStruct(name);
   this.thriftType = (StructType) field.getType();
   this.schemaSize = parquetSchema.getFieldCount();
   this.converters = new Converter[this.schemaSize];

   final List<ThriftField> thriftChildren = thriftType.getChildren();
   for (int i = 0; i < schemaSize; i++) {
     final Type parquetField = parquetSchema.getType(i);
     final String parquetName = parquetField.getName();

     // First Thrift child whose (non-null) name matches, ignoring case.
     ThriftField match = null;
     for (ThriftField candidate : thriftChildren) {
       final String candidateName = candidate.getName();
       if (candidateName == null || !candidateName.equalsIgnoreCase(parquetName)) {
         continue;
       }
       match = candidate;
       break;
     }

     if (match != null) {
       final Converter delegate = newConverter(events, parquetField, match);
       if (parquetField.isPrimitive()) {
         converters[i] = new PrimitiveFieldHandler(delegate.asPrimitiveConverter(), match, events);
       } else {
         converters[i] = new GroupFieldhandler(delegate.asGroupConverter(), match, events);
       }
     }
     // Otherwise no Thrift child matches this Parquet field and the slot is left unset.
   }
 }
Ejemplo n.º 10
0
  /**
   * Writes a LIST value and its elements to the Parquet RecordConsumer. This is called when the
   * original type (LIST) is detected by writeValue().
   *
   * <p>Every element gets its own startGroup/endGroup pair. A null element produces an empty
   * group.
   *
   * @param array wrapper whose first element is the ArrayWritable holding the list elements
   * @param type group schema of the list; its first field is the repeated group
   */
  private void writeArray(final ArrayWritable array, final GroupType type) {
    final GroupType repeatedType = type.getType(0).asGroupType();
    final ArrayWritable repeatedValue = (ArrayWritable) array.get()[0];
    final String repeatedName = repeatedType.getName();

    recordConsumer.startGroup();
    recordConsumer.startField(repeatedName, 0);

    // The fields written per element are those of the repeated group, so the bound must come
    // from repeatedType. Bounding by the enclosing LIST group (as before) only worked because
    // both groups normally have exactly one field.
    final int elementFieldCount = repeatedType.getFieldCount();
    for (final Writable element : repeatedValue.get()) {
      recordConsumer.startGroup();

      // Null values must still be wrapped into startGroup/endGroup
      if (element != null) {
        for (int i = 0; i < elementFieldCount; i++) {
          final Type fieldType = repeatedType.getType(i);
          final String fieldName = fieldType.getName();

          recordConsumer.startField(fieldName, i);
          writeValue(element, fieldType);
          recordConsumer.endField(fieldName, i);
        }
      }

      recordConsumer.endGroup();
    }

    recordConsumer.endField(repeatedName, 0);
    recordConsumer.endGroup();
  }
Ejemplo n.º 11
0
 CollectionConverter(List<TProtocol> parentEvents, GroupType parquetSchema, ThriftField values) {
   this.parentEvents = parentEvents;
   if (parquetSchema.getFieldCount() != 1) {
     throw new IllegalArgumentException(
         "lists have only one field. "
             + parquetSchema
             + " size = "
             + parquetSchema.getFieldCount());
   }
   nestedType = parquetSchema.getType(0);
   valuesType = values.getType().getType();
   if (nestedType.isPrimitive()) {
     PrimitiveCounter counter =
         new PrimitiveCounter(
             newConverter(listEvents, nestedType, values).asPrimitiveConverter());
     child = counter;
     childCounter = counter;
   } else {
     GroupCounter counter =
         new GroupCounter(newConverter(listEvents, nestedType, values).asGroupConverter());
     child = counter;
     childCounter = counter;
   }
 }
Ejemplo n.º 12
0
    /**
     * Builds the converter for a Parquet struct (ROW) column.
     *
     * <p>The base of {@code prestoType}'s signature must equal {@code ROW}, and it must have as
     * many type parameters as {@code entryType} has fields. One child converter is created per
     * field, pairing type parameters and fields by position.
     *
     * @param prestoType the row type
     * @param columnName column name used as the prefix of each child converter's name
     * @param entryType the Parquet group describing the struct
     */
    public ParquetStructConverter(Type prestoType, String columnName, GroupType entryType) {
      checkArgument(ROW.equals(prestoType.getTypeSignature().getBase()));
      List<Type> parameterTypes = prestoType.getTypeParameters();
      List<parquet.schema.Type> parquetFields = entryType.getFields();
      checkArgument(parameterTypes.size() == parquetFields.size());

      this.rowType = prestoType;

      ImmutableList.Builder<BlockConverter> builder = ImmutableList.builder();
      int position = 0;
      for (Type parameterType : parameterTypes) {
        parquet.schema.Type parquetField = parquetFields.get(position);
        String childName = columnName + "." + parquetField.getName();
        builder.add(createConverter(parameterType, childName, parquetField));
        position++;
      }
      this.converters = builder.build();
    }
Ejemplo n.º 13
0
    /**
     * Builds the read support and one converter per readable column.
     *
     * <p>Partition-key columns and columns for which {@code getParquetType} returns {@code null}
     * get no converter. Primitive columns get a {@code ParquetPrimitiveColumnConverter}; group
     * columns get a {@code ParquetColumnConverter} wrapping a list, map or struct converter chosen
     * by the base of the column's type signature.
     *
     * @param useParquetColumnNames flag stored on this instance
     * @param columns the columns to read
     * @param messageType the Parquet message schema the columns are resolved against
     * @throws IllegalArgumentException if a group column's base type is not array, map or row
     */
    public PrestoReadSupport(
        boolean useParquetColumnNames, List<HiveColumnHandle> columns, MessageType messageType) {
      this.columns = columns;
      this.useParquetColumnNames = useParquetColumnNames;

      ImmutableList.Builder<Converter> builder = ImmutableList.builder();
      for (int i = 0; i < columns.size(); i++) {
        HiveColumnHandle column = columns.get(i);
        if (column.isPartitionKey()) {
          continue;
        }

        parquet.schema.Type parquetType = getParquetType(column, messageType);
        if (parquetType == null) {
          continue;
        }

        if (parquetType.isPrimitive()) {
          builder.add(new ParquetPrimitiveColumnConverter(i));
          continue;
        }

        GroupType groupType = parquetType.asGroupType();
        String groupName = groupType.getName();
        switch (column.getTypeSignature().getBase()) {
          case ARRAY:
            builder.add(
                new ParquetColumnConverter(
                    new ParquetListConverter(types[i], groupName, groupType), i));
            break;
          case StandardTypes.MAP:
            builder.add(
                new ParquetColumnConverter(
                    new ParquetMapConverter(types[i], groupName, groupType), i));
            break;
          case ROW:
            builder.add(
                new ParquetColumnConverter(
                    new ParquetStructConverter(types[i], groupName, groupType), i));
            break;
          default:
            throw new IllegalArgumentException(
                "Group column "
                    + groupName
                    + " type "
                    + groupType.getOriginalType()
                    + " not supported");
        }
      }
      this.converters = builder.build();
    }
Ejemplo n.º 14
0
    /**
     * Builds the converter for the key/value entry group of a MAP column.
     *
     * <p>The arguments are rejected via {@code checkArgument} unless the base of
     * {@code prestoType}'s signature equals {@code StandardTypes.MAP}, the entry's original type
     * is either absent or {@code MAP_KEY_VALUE}, and the entry group has exactly two fields named
     * {@code key} and {@code value}, the first of which is primitive.
     *
     * @param prestoType the map type; its type parameters 0 and 1 are the key and value types
     * @param columnName column name used in error messages and to derive child converter names
     * @param entryType the Parquet group describing one map entry
     */
    public ParquetMapEntryConverter(Type prestoType, String columnName, GroupType entryType) {
      checkArgument(StandardTypes.MAP.equals(prestoType.getTypeSignature().getBase()));
      // Early Parquet versions wrote no original type on the entry group because of a bug, so a
      // missing annotation is accepted and only a present one is validated.
      if (entryType.getOriginalType() != null) {
        checkArgument(
            entryType.getOriginalType() == MAP_KEY_VALUE,
            "Expected MAP column '%s' field to be type %s, but is %s",
            columnName,
            MAP_KEY_VALUE,
            entryType);
      }

      // entryType is already a GroupType, so it is used directly.
      int fieldCount = entryType.getFieldCount();
      checkArgument(
          fieldCount == 2,
          "Expected MAP column '%s' entry to have two fields, but has %s fields",
          columnName,
          fieldCount);
      checkArgument(
          entryType.getFieldName(0).equals("key"),
          "Expected MAP column '%s' entry field 0 to be named 'key', but is named %s",
          columnName,
          entryType.getFieldName(0));
      checkArgument(
          entryType.getFieldName(1).equals("value"),
          "Expected MAP column '%s' entry field 1 to be named 'value', but is named %s",
          columnName,
          entryType.getFieldName(1));
      // This check reports the offending field type, not a name, so the message says "is".
      checkArgument(
          entryType.getType(0).isPrimitive(),
          "Expected MAP column '%s' entry field 0 to be primitive, but is %s",
          columnName,
          entryType.getType(0));

      keyConverter =
          createConverter(
              prestoType.getTypeParameters().get(0), columnName + ".key", entryType.getType(0));
      valueConverter =
          createConverter(
              prestoType.getTypeParameters().get(1), columnName + ".value", entryType.getType(1));
    }