Code Example #1
 @Override
 public void addBinary(Binary value) {
   // A Parquet INTERVAL value arrives as a fixed 12-byte little-endian payload:
   // four bytes of months, four bytes of days, four bytes of milliseconds.
   final byte[] input = value.getBytes();
   holder.months = ParquetReaderUtility.getIntFromLEBytes(input, 0);
   holder.days = ParquetReaderUtility.getIntFromLEBytes(input, 4);
   holder.milliseconds = ParquetReaderUtility.getIntFromLEBytes(input, 8);
   writer.write(holder);
 }
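
For reference, the little-endian decode used above can be sketched as a stand-alone helper. This is a minimal illustration of what ParquetReaderUtility.getIntFromLEBytes is assumed to do, not the actual Drill implementation:

 // Minimal sketch (assumption): reassemble a little-endian int from the four
 // bytes starting at `start`, mirroring ParquetReaderUtility.getIntFromLEBytes.
 static int getIntLE(byte[] input, int start) {
   return (input[start] & 0xFF)
       | (input[start + 1] & 0xFF) << 8
       | (input[start + 2] & 0xFF) << 16
       | (input[start + 3] & 0xFF) << 24;
 }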
Code Example #2
  @Override
  public void setup(OperatorContext operatorContext, OutputMutator output)
      throws ExecutionSetupException {
    this.operatorContext = operatorContext;
    if (!isStarQuery()) {
      columnsFound = new boolean[getColumns().size()];
      nullFilledVectors = new ArrayList<>();
    }
    columnStatuses = new ArrayList<>();
    List<ColumnDescriptor> columns = footer.getFileMetaData().getSchema().getColumns();
    allFieldsFixedLength = true;
    ColumnDescriptor column;
    ColumnChunkMetaData columnChunkMetaData;
    int columnsToScan = 0;
    mockRecordsRead = 0;

    MaterializedField field;

    logger.debug(
        "Reading row group({}) with {} records in file {}.",
        rowGroupIndex,
        footer.getBlocks().get(rowGroupIndex).getRowCount(),
        hadoopPath.toUri().getPath());
    totalRecordsRead = 0;

    // TODO - revisit this once nested reading is added; note also where this
    // map is used below.
    // Build a map from column name to its SchemaElement so that converted
    // types can be looked up when they are non-null.
    Map<String, SchemaElement> schemaElements =
        ParquetReaderUtility.getColNameToSchemaElementMapping(footer);

    // loop to add up the length of the fixed width columns and build the schema
    for (int i = 0; i < columns.size(); ++i) {
      column = columns.get(i);
      SchemaElement se = schemaElements.get(column.getPath()[0]);
      MajorType mt =
          ParquetToDrillTypeConverter.toMajorType(
              column.getType(),
              se.getType_length(),
              getDataMode(column),
              se,
              fragmentContext.getOptions());
      field = MaterializedField.create(toFieldName(column.getPath()), mt);
      if (!fieldSelected(field)) {
        continue;
      }
      columnsToScan++;
      int dataTypeLength = getDataTypeLength(column, se);
      if (dataTypeLength == -1) {
        allFieldsFixedLength = false;
      } else {
        bitWidthAllFixedFields += dataTypeLength;
      }
    }

    // For all-fixed-width schemas, size each batch by the total bit width of a
    // row, capped by the row group's value count and the 64K vector limit.
    if (columnsToScan != 0 && allFieldsFixedLength) {
      recordsPerBatch =
          (int)
              Math.min(
                  Math.min(
                      batchSize / bitWidthAllFixedFields,
                      footer.getBlocks().get(rowGroupIndex).getColumns().get(0).getValueCount()),
                  65535);
    } else {
      recordsPerBatch = DEFAULT_RECORDS_TO_READ_IF_NOT_FIXED_WIDTH;
    }

    try {
      ValueVector vector;
      SchemaElement schemaElement;
      final ArrayList<VarLengthColumn<? extends ValueVector>> varLengthColumns = new ArrayList<>();
      // initialize all of the column read status objects
      boolean fieldFixedLength;
      // The column chunk metadata is not guaranteed to be in the same order as
      // the columns in the schema, so a map is built for fast access to the
      // columnChunkMetaData entry that corresponds to each schema element.
      Map<String, Integer> columnChunkMetadataPositionsInList = new HashMap<>();
      BlockMetaData rowGroupMetadata = footer.getBlocks().get(rowGroupIndex);

      int colChunkIndex = 0;
      for (ColumnChunkMetaData colChunk : rowGroupMetadata.getColumns()) {
        columnChunkMetadataPositionsInList.put(
            Arrays.toString(colChunk.getPath().toArray()), colChunkIndex);
        colChunkIndex++;
      }
      for (int i = 0; i < columns.size(); ++i) {
        column = columns.get(i);
        columnChunkMetaData =
            rowGroupMetadata
                .getColumns()
                .get(columnChunkMetadataPositionsInList.get(Arrays.toString(column.getPath())));
        schemaElement = schemaElements.get(column.getPath()[0]);
        MajorType type =
            ParquetToDrillTypeConverter.toMajorType(
                column.getType(),
                schemaElement.getType_length(),
                getDataMode(column),
                schemaElement,
                fragmentContext.getOptions());
        field = MaterializedField.create(toFieldName(column.getPath()), type);
        // skip fields that were not requested to be read
        if (!fieldSelected(field)) {
          continue;
        }

        fieldFixedLength = column.getType() != PrimitiveType.PrimitiveTypeName.BINARY;
        vector =
            output.addField(
                field,
                (Class<? extends ValueVector>)
                    TypeHelper.getValueVectorClass(type.getMinorType(), type.getMode()));
        if (fieldFixedLength) {
          if (column.getMaxRepetitionLevel() > 0) {
            final RepeatedValueVector repeatedVector = RepeatedValueVector.class.cast(vector);
            ColumnReader<?> dataReader =
                ColumnReaderFactory.createFixedColumnReader(
                    this,
                    fieldFixedLength,
                    column,
                    columnChunkMetaData,
                    recordsPerBatch,
                    repeatedVector.getDataVector(),
                    schemaElement);
            varLengthColumns.add(
                new FixedWidthRepeatedReader(
                    this,
                    dataReader,
                    getTypeLengthInBits(column.getType()),
                    -1,
                    column,
                    columnChunkMetaData,
                    false,
                    repeatedVector,
                    schemaElement));
          } else {
            columnStatuses.add(
                ColumnReaderFactory.createFixedColumnReader(
                    this,
                    fieldFixedLength,
                    column,
                    columnChunkMetaData,
                    recordsPerBatch,
                    vector,
                    schemaElement));
          }
        } else {
          // variable-length (BINARY) column: create a reader and add it to the var-length list
          varLengthColumns.add(
              ColumnReaderFactory.getReader(
                  this, -1, column, columnChunkMetaData, false, vector, schemaElement));
        }
      }
      varLengthReader = new VarLenBinaryReader(this, varLengthColumns);

      if (!isStarQuery()) {
        List<SchemaPath> projectedColumns = Lists.newArrayList(getColumns());
        SchemaPath col;
        for (int i = 0; i < columnsFound.length; i++) {
          col = projectedColumns.get(i);
          assert col != null;
          if (!columnsFound[i] && !col.equals(STAR_COLUMN)) {
            nullFilledVectors.add(
                (NullableIntVector)
                    output.addField(
                        MaterializedField.create(
                            col.getAsUnescapedPath(), Types.optional(TypeProtos.MinorType.INT)),
                        (Class<? extends ValueVector>)
                            TypeHelper.getValueVectorClass(
                                TypeProtos.MinorType.INT, DataMode.OPTIONAL)));
          }
        }
      }
    } catch (Exception e) {
      handleAndRaise("Failure in setting up reader", e);
    }
  }
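
To see the batch sizing rule from setup() in isolation: rows per batch is the space-limited count, capped by the row group's value count and by the 65535 vector limit. The helper below is a hypothetical distillation (the name, parameters, and the assumption that batchSize and bitsPerRow share units are not from the original):

  // Hypothetical distillation of the sizing rule above, for illustration only.
  static int computeRecordsPerBatch(long batchSize, long bitsPerRow, long valueCount) {
    return (int) Math.min(Math.min(batchSize / bitsPerRow, valueCount), 65535);
  }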
Code Example #3
  private PrimitiveConverter getConverterForType(String name, PrimitiveType type) {

    switch (type.getPrimitiveTypeName()) {
      case INT32:
        {
          if (type.getOriginalType() == null) {
            IntWriter writer =
                type.getRepetition() == Repetition.REPEATED
                    ? mapWriter.list(name).integer()
                    : mapWriter.integer(name);
            return new DrillIntConverter(writer);
          }
          switch (type.getOriginalType()) {
            case DECIMAL:
              {
                ParquetReaderUtility.checkDecimalTypeEnabled(options);
                Decimal9Writer writer =
                    type.getRepetition() == Repetition.REPEATED
                        ? mapWriter.list(name).decimal9()
                        : mapWriter.decimal9(name);
                return new DrillDecimal9Converter(
                    writer,
                    type.getDecimalMetadata().getPrecision(),
                    type.getDecimalMetadata().getScale());
              }
            case DATE:
              {
                DateWriter writer =
                    type.getRepetition() == Repetition.REPEATED
                        ? mapWriter.list(name).date()
                        : mapWriter.date(name);
                return new DrillDateConverter(writer);
              }
            case TIME_MILLIS:
              {
                TimeWriter writer =
                    type.getRepetition() == Repetition.REPEATED
                        ? mapWriter.list(name).time()
                        : mapWriter.time(name);
                return new DrillTimeConverter(writer);
              }
            default:
              {
                throw new UnsupportedOperationException(
                    "Unsupported type: " + type.getOriginalType());
              }
          }
        }
      case INT64:
        {
          if (type.getOriginalType() == null) {
            BigIntWriter writer =
                type.getRepetition() == Repetition.REPEATED
                    ? mapWriter.list(name).bigInt()
                    : mapWriter.bigInt(name);
            return new DrillBigIntConverter(writer);
          }
          switch (type.getOriginalType()) {
            case DECIMAL:
              {
                ParquetReaderUtility.checkDecimalTypeEnabled(options);
                Decimal18Writer writer =
                    type.getRepetition() == Repetition.REPEATED
                        ? mapWriter.list(name).decimal18()
                        : mapWriter.decimal18(name);
                return new DrillDecimal18Converter(
                    writer,
                    type.getDecimalMetadata().getPrecision(),
                    type.getDecimalMetadata().getScale());
              }
            case TIMESTAMP_MILLIS:
              {
                TimeStampWriter writer =
                    type.getRepetition() == Repetition.REPEATED
                        ? mapWriter.list(name).timeStamp()
                        : mapWriter.timeStamp(name);
                return new DrillTimeStampConverter(writer);
              }
            default:
              {
                throw new UnsupportedOperationException(
                    "Unsupported type: " + type.getOriginalType());
              }
          }
        }
      case INT96:
        {
          if (type.getOriginalType() == null) {
            VarBinaryWriter writer =
                type.getRepetition() == Repetition.REPEATED
                    ? mapWriter.list(name).varBinary()
                    : mapWriter.varBinary(name);
            return new DrillFixedBinaryToVarbinaryConverter(
                writer,
                ParquetRecordReader.getTypeLengthInBits(type.getPrimitiveTypeName()) / 8,
                mutator.getManagedBuffer());
          }
          // Without this throw, execution would fall through into the FLOAT
          // case; INT96 has no supported original type annotation.
          throw new UnsupportedOperationException(
              "Unsupported type: " + type.getOriginalType());
        }
      case FLOAT:
        {
          Float4Writer writer =
              type.getRepetition() == Repetition.REPEATED
                  ? mapWriter.list(name).float4()
                  : mapWriter.float4(name);
          return new DrillFloat4Converter(writer);
        }
      case DOUBLE:
        {
          Float8Writer writer =
              type.getRepetition() == Repetition.REPEATED
                  ? mapWriter.list(name).float8()
                  : mapWriter.float8(name);
          return new DrillFloat8Converter(writer);
        }
      case BOOLEAN:
        {
          BitWriter writer =
              type.getRepetition() == Repetition.REPEATED
                  ? mapWriter.list(name).bit()
                  : mapWriter.bit(name);
          return new DrillBoolConverter(writer);
        }
      case BINARY:
        {
          if (type.getOriginalType() == null) {
            VarBinaryWriter writer =
                type.getRepetition() == Repetition.REPEATED
                    ? mapWriter.list(name).varBinary()
                    : mapWriter.varBinary(name);
            return new DrillVarBinaryConverter(writer, mutator.getManagedBuffer());
          }
          switch (type.getOriginalType()) {
            case UTF8:
              {
                VarCharWriter writer =
                    type.getRepetition() == Repetition.REPEATED
                        ? mapWriter.list(name).varChar()
                        : mapWriter.varChar(name);
                return new DrillVarCharConverter(writer, mutator.getManagedBuffer());
              }
              // TODO not sure if BINARY/DECIMAL is actually supported
            case DECIMAL:
              {
                ParquetReaderUtility.checkDecimalTypeEnabled(options);
                DecimalMetadata metadata = type.getDecimalMetadata();
                if (metadata.getPrecision() <= 28) {
                  Decimal28SparseWriter writer =
                      type.getRepetition() == Repetition.REPEATED
                          ? mapWriter.list(name).decimal28Sparse()
                          : mapWriter.decimal28Sparse(
                              name, metadata.getScale(), metadata.getPrecision());
                  return new DrillBinaryToDecimal28Converter(
                      writer,
                      metadata.getPrecision(),
                      metadata.getScale(),
                      mutator.getManagedBuffer());
                } else {
                  Decimal38SparseWriter writer =
                      type.getRepetition() == Repetition.REPEATED
                          ? mapWriter.list(name).decimal38Sparse()
                          : mapWriter.decimal38Sparse(
                              name, metadata.getScale(), metadata.getPrecision());
                  return new DrillBinaryToDecimal38Converter(
                      writer,
                      metadata.getPrecision(),
                      metadata.getScale(),
                      mutator.getManagedBuffer());
                }
              }
            default:
              {
                throw new UnsupportedOperationException(
                    "Unsupported type: " + type.getOriginalType());
              }
          }
        }
      case FIXED_LEN_BYTE_ARRAY:
        if (type.getOriginalType() == OriginalType.DECIMAL) {
          ParquetReaderUtility.checkDecimalTypeEnabled(options);
          DecimalMetadata metadata = type.getDecimalMetadata();
          if (metadata.getPrecision() <= 28) {
            Decimal28SparseWriter writer =
                type.getRepetition() == Repetition.REPEATED
                    ? mapWriter.list(name).decimal28Sparse()
                    : mapWriter.decimal28Sparse(name, metadata.getScale(), metadata.getPrecision());
            return new DrillBinaryToDecimal28Converter(
                writer, metadata.getPrecision(), metadata.getScale(), mutator.getManagedBuffer());
          } else {
            Decimal38SparseWriter writer =
                type.getRepetition() == Repetition.REPEATED
                    ? mapWriter.list(name).decimal38Sparse()
                    : mapWriter.decimal38Sparse(name, metadata.getScale(), metadata.getPrecision());
            return new DrillBinaryToDecimal38Converter(
                writer, metadata.getPrecision(), metadata.getScale(), mutator.getManagedBuffer());
          }
        } else if (type.getOriginalType() == OriginalType.INTERVAL) {
          IntervalWriter writer =
              type.getRepetition() == Repetition.REPEATED
                  ? mapWriter.list(name).interval()
                  : mapWriter.interval(name);
          return new DrillFixedLengthByteArrayToInterval(writer);

        } else {
          VarBinaryWriter writer =
              type.getRepetition() == Repetition.REPEATED
                  ? mapWriter.list(name).varBinary()
                  : mapWriter.varBinary(name);
          return new DrillFixedBinaryToVarbinaryConverter(
              writer, type.getTypeLength(), mutator.getManagedBuffer());
        }
      default:
        throw new UnsupportedOperationException("Unsupported type: " + type.getPrimitiveTypeName());
    }
  }
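
Every branch above repeats the same repetition-aware writer selection: a REPEATED field gets its writer from a list under the field name, anything else gets a named scalar writer on the map. As a sketch only (a hypothetical refactoring, not part of Drill), that pattern could be factored into one helper per writer type:

  // Hypothetical helper showing the recurring selection pattern above; the
  // INT32 branch would become `new DrillIntConverter(intWriterFor(name, type))`.
  private IntWriter intWriterFor(String name, PrimitiveType type) {
    return type.getRepetition() == Repetition.REPEATED
        ? mapWriter.list(name).integer()
        : mapWriter.integer(name);
  }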