@Override
public void addBinary(Binary value) {
  // Parquet's INTERVAL logical type is a 12-byte fixed-length binary holding three
  // little-endian 32-bit integers: months, days, milliseconds.
  final byte[] input = value.getBytes();
  holder.months = ParquetReaderUtility.getIntFromLEBytes(input, 0);
  holder.days = ParquetReaderUtility.getIntFromLEBytes(input, 4);
  holder.milliseconds = ParquetReaderUtility.getIntFromLEBytes(input, 8);
  writer.write(holder);
}
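// For reference, a minimal self-contained sketch of the 12-byte INTERVAL decode above.
// The getIntFromLEBytes below is an assumed equivalent of the ParquetReaderUtility helper,
// written here for illustration only; it is not the actual Drill implementation.
public final class IntervalDecodeSketch {

  // Assumed little-endian int decode, mirroring what getIntFromLEBytes is expected to do.
  static int getIntFromLEBytes(byte[] input, int start) {
    return (input[start] & 0xFF)
        | (input[start + 1] & 0xFF) << 8
        | (input[start + 2] & 0xFF) << 16
        | (input[start + 3] & 0xFF) << 24;
  }

  public static void main(String[] args) {
    // 1 month, 2 days, 3000 milliseconds, little-endian as Parquet writes it.
    byte[] raw = {1, 0, 0, 0, 2, 0, 0, 0, (byte) 0xB8, 0x0B, 0, 0};
    System.out.printf("months=%d days=%d millis=%d%n",
        getIntFromLEBytes(raw, 0), getIntFromLEBytes(raw, 4), getIntFromLEBytes(raw, 8));
  }
}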
@Override
public void setup(OperatorContext operatorContext, OutputMutator output) throws ExecutionSetupException {
  this.operatorContext = operatorContext;
  if (!isStarQuery()) {
    columnsFound = new boolean[getColumns().size()];
    nullFilledVectors = new ArrayList<>();
  }
  columnStatuses = new ArrayList<>();
  List<ColumnDescriptor> columns = footer.getFileMetaData().getSchema().getColumns();
  allFieldsFixedLength = true;
  ColumnDescriptor column;
  ColumnChunkMetaData columnChunkMetaData;
  int columnsToScan = 0;
  mockRecordsRead = 0;
  MaterializedField field;

  logger.debug("Reading row group({}) with {} records in file {}.",
      rowGroupIndex, footer.getBlocks().get(rowGroupIndex).getRowCount(),
      hadoopPath.toUri().getPath());
  totalRecordsRead = 0;

  // TODO - figure out how to deal with this better once we add nested reading; note also
  // where this map is used below.
  // Store a map from column name to converted types if they are non-null.
  Map<String, SchemaElement> schemaElements =
      ParquetReaderUtility.getColNameToSchemaElementMapping(footer);

  // Loop to add up the length of the fixed-width columns and build the schema.
  for (int i = 0; i < columns.size(); ++i) {
    column = columns.get(i);
    SchemaElement se = schemaElements.get(column.getPath()[0]);
    MajorType mt = ParquetToDrillTypeConverter.toMajorType(column.getType(), se.getType_length(),
        getDataMode(column), se, fragmentContext.getOptions());
    field = MaterializedField.create(toFieldName(column.getPath()), mt);
    if (!fieldSelected(field)) {
      continue;
    }
    columnsToScan++;
    int dataTypeLength = getDataTypeLength(column, se);
    if (dataTypeLength == -1) {
      allFieldsFixedLength = false;
    } else {
      bitWidthAllFixedFields += dataTypeLength;
    }
  }

  if (columnsToScan != 0 && allFieldsFixedLength) {
    // Clamp the batch to the memory-derived row count, the value count of the first
    // column chunk (note: taken from the first row group, not rowGroupIndex), and 65535.
    recordsPerBatch = (int) Math.min(Math.min(batchSize / bitWidthAllFixedFields,
        footer.getBlocks().get(0).getColumns().get(0).getValueCount()), 65535);
  } else {
    recordsPerBatch = DEFAULT_RECORDS_TO_READ_IF_NOT_FIXED_WIDTH;
  }

  try {
    ValueVector vector;
    SchemaElement schemaElement;
    final ArrayList<VarLengthColumn<? extends ValueVector>> varLengthColumns = new ArrayList<>();
    // Initialize all of the column read status objects.
    boolean fieldFixedLength;
    // The column chunk metadata is not guaranteed to be in the same order as the columns
    // in the schema, so a map is constructed for fast access to the columnChunkMetaData
    // that corresponds to an element in the schema.
    Map<String, Integer> columnChunkMetadataPositionsInList = new HashMap<>();
    BlockMetaData rowGroupMetadata = footer.getBlocks().get(rowGroupIndex);
    int colChunkIndex = 0;
    for (ColumnChunkMetaData colChunk : rowGroupMetadata.getColumns()) {
      columnChunkMetadataPositionsInList.put(
          Arrays.toString(colChunk.getPath().toArray()), colChunkIndex);
      colChunkIndex++;
    }
    for (int i = 0; i < columns.size(); ++i) {
      column = columns.get(i);
      columnChunkMetaData = rowGroupMetadata.getColumns()
          .get(columnChunkMetadataPositionsInList.get(Arrays.toString(column.getPath())));
      schemaElement = schemaElements.get(column.getPath()[0]);
      MajorType type = ParquetToDrillTypeConverter.toMajorType(column.getType(),
          schemaElement.getType_length(), getDataMode(column), schemaElement,
          fragmentContext.getOptions());
      field = MaterializedField.create(toFieldName(column.getPath()), type);
      // Skip fields that were not requested to be read.
      if (!fieldSelected(field)) {
        continue;
      }
      fieldFixedLength = column.getType() != PrimitiveType.PrimitiveTypeName.BINARY;
      vector = output.addField(field,
          (Class<? extends ValueVector>) TypeHelper.getValueVectorClass(
              type.getMinorType(), type.getMode()));
      if (column.getType() != PrimitiveType.PrimitiveTypeName.BINARY) {
        if (column.getMaxRepetitionLevel() > 0) {
          final RepeatedValueVector repeatedVector = RepeatedValueVector.class.cast(vector);
          ColumnReader<?> dataReader = ColumnReaderFactory.createFixedColumnReader(this,
              fieldFixedLength, column, columnChunkMetaData, recordsPerBatch,
              repeatedVector.getDataVector(), schemaElement);
          varLengthColumns.add(new FixedWidthRepeatedReader(this, dataReader,
              getTypeLengthInBits(column.getType()), -1, column, columnChunkMetaData, false,
              repeatedVector, schemaElement));
        } else {
          columnStatuses.add(ColumnReaderFactory.createFixedColumnReader(this,
              fieldFixedLength, column, columnChunkMetaData, recordsPerBatch, vector,
              schemaElement));
        }
      } else {
        // Create a variable-length reader and add it to the appropriate list.
        varLengthColumns.add(ColumnReaderFactory.getReader(this, -1, column,
            columnChunkMetaData, false, vector, schemaElement));
      }
    }
    varLengthReader = new VarLenBinaryReader(this, varLengthColumns);
    if (!isStarQuery()) {
      List<SchemaPath> projectedColumns = Lists.newArrayList(getColumns());
      SchemaPath col;
      for (int i = 0; i < columnsFound.length; i++) {
        col = projectedColumns.get(i);
        assert col != null;
        if (!columnsFound[i] && !col.equals(STAR_COLUMN)) {
          // Back-fill projected columns missing from the file with all-null nullable INT vectors.
          nullFilledVectors.add((NullableIntVector) output.addField(
              MaterializedField.create(col.getAsUnescapedPath(),
                  Types.optional(TypeProtos.MinorType.INT)),
              (Class<? extends ValueVector>) TypeHelper.getValueVectorClass(
                  TypeProtos.MinorType.INT, DataMode.OPTIONAL)));
        }
      }
    }
  } catch (Exception e) {
    handleAndRaise("Failure in setting up reader", e);
  }
}
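// The recordsPerBatch clamp in setup() takes the minimum of a memory-derived limit
// (batchSize / bitWidthAllFixedFields), the value count of the first column chunk, and a
// 65535 hard cap. A self-contained sketch of the same arithmetic; the inputs below are
// made-up illustrative values, not numbers read from a real Parquet footer.
public final class BatchSizeSketch {
  public static void main(String[] args) {
    long batchSize = 256 * 1024;           // assumed per-batch budget
    long bitWidthAllFixedFields = 96;      // e.g. one 32-bit plus one 64-bit fixed-width column
    long firstChunkValueCount = 1_000_000; // rows available in the first column chunk

    // Same shape as the reader: memory limit, then row-count limit, then the 65535 cap.
    int recordsPerBatch = (int) Math.min(
        Math.min(batchSize / bitWidthAllFixedFields, firstChunkValueCount), 65535);
    System.out.println(recordsPerBatch); // 2730 -- the memory budget binds first
  }
}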
private PrimitiveConverter getConverterForType(String name, PrimitiveType type) {
  switch (type.getPrimitiveTypeName()) {
    case INT32: {
      if (type.getOriginalType() == null) {
        IntWriter writer = type.getRepetition() == Repetition.REPEATED
            ? mapWriter.list(name).integer() : mapWriter.integer(name);
        return new DrillIntConverter(writer);
      }
      switch (type.getOriginalType()) {
        case DECIMAL: {
          ParquetReaderUtility.checkDecimalTypeEnabled(options);
          Decimal9Writer writer = type.getRepetition() == Repetition.REPEATED
              ? mapWriter.list(name).decimal9() : mapWriter.decimal9(name);
          return new DrillDecimal9Converter(writer,
              type.getDecimalMetadata().getPrecision(), type.getDecimalMetadata().getScale());
        }
        case DATE: {
          DateWriter writer = type.getRepetition() == Repetition.REPEATED
              ? mapWriter.list(name).date() : mapWriter.date(name);
          return new DrillDateConverter(writer);
        }
        case TIME_MILLIS: {
          TimeWriter writer = type.getRepetition() == Repetition.REPEATED
              ? mapWriter.list(name).time() : mapWriter.time(name);
          return new DrillTimeConverter(writer);
        }
        default: {
          throw new UnsupportedOperationException("Unsupported type: " + type.getOriginalType());
        }
      }
    }
    case INT64: {
      if (type.getOriginalType() == null) {
        BigIntWriter writer = type.getRepetition() == Repetition.REPEATED
            ? mapWriter.list(name).bigInt() : mapWriter.bigInt(name);
        return new DrillBigIntConverter(writer);
      }
      switch (type.getOriginalType()) {
        case DECIMAL: {
          ParquetReaderUtility.checkDecimalTypeEnabled(options);
          Decimal18Writer writer = type.getRepetition() == Repetition.REPEATED
              ? mapWriter.list(name).decimal18() : mapWriter.decimal18(name);
          return new DrillDecimal18Converter(writer,
              type.getDecimalMetadata().getPrecision(), type.getDecimalMetadata().getScale());
        }
        case TIMESTAMP_MILLIS: {
          TimeStampWriter writer = type.getRepetition() == Repetition.REPEATED
              ? mapWriter.list(name).timeStamp() : mapWriter.timeStamp(name);
          return new DrillTimeStampConverter(writer);
        }
        default: {
          throw new UnsupportedOperationException("Unsupported type: " + type.getOriginalType());
        }
      }
    }
    case INT96: {
      if (type.getOriginalType() == null) {
        VarBinaryWriter writer = type.getRepetition() == Repetition.REPEATED
            ? mapWriter.list(name).varBinary() : mapWriter.varBinary(name);
        return new DrillFixedBinaryToVarbinaryConverter(writer,
            ParquetRecordReader.getTypeLengthInBits(type.getPrimitiveTypeName()) / 8,
            mutator.getManagedBuffer());
      }
      // Throw explicitly: previously an INT96 column with a non-null original type fell
      // through to the FLOAT case below.
      throw new UnsupportedOperationException("Unsupported type: " + type.getOriginalType());
    }
    case FLOAT: {
      Float4Writer writer = type.getRepetition() == Repetition.REPEATED
          ? mapWriter.list(name).float4() : mapWriter.float4(name);
      return new DrillFloat4Converter(writer);
    }
    case DOUBLE: {
      Float8Writer writer = type.getRepetition() == Repetition.REPEATED
          ? mapWriter.list(name).float8() : mapWriter.float8(name);
      return new DrillFloat8Converter(writer);
    }
    case BOOLEAN: {
      BitWriter writer = type.getRepetition() == Repetition.REPEATED
          ? mapWriter.list(name).bit() : mapWriter.bit(name);
      return new DrillBoolConverter(writer);
    }
    case BINARY: {
      if (type.getOriginalType() == null) {
        VarBinaryWriter writer = type.getRepetition() == Repetition.REPEATED
            ? mapWriter.list(name).varBinary() : mapWriter.varBinary(name);
        return new DrillVarBinaryConverter(writer, mutator.getManagedBuffer());
      }
      switch (type.getOriginalType()) {
        case UTF8: {
          VarCharWriter writer = type.getRepetition() == Repetition.REPEATED
              ? mapWriter.list(name).varChar() : mapWriter.varChar(name);
          return new DrillVarCharConverter(writer, mutator.getManagedBuffer());
        }
        // TODO - not sure if BINARY/DECIMAL is actually supported
        case DECIMAL: {
          ParquetReaderUtility.checkDecimalTypeEnabled(options);
          DecimalMetadata metadata = type.getDecimalMetadata();
          if (metadata.getPrecision() <= 28) {
            Decimal28SparseWriter writer = type.getRepetition() == Repetition.REPEATED
                ? mapWriter.list(name).decimal28Sparse()
                : mapWriter.decimal28Sparse(name, metadata.getScale(), metadata.getPrecision());
            return new DrillBinaryToDecimal28Converter(writer, metadata.getPrecision(),
                metadata.getScale(), mutator.getManagedBuffer());
          } else {
            Decimal38SparseWriter writer = type.getRepetition() == Repetition.REPEATED
                ? mapWriter.list(name).decimal38Sparse()
                : mapWriter.decimal38Sparse(name, metadata.getScale(), metadata.getPrecision());
            return new DrillBinaryToDecimal38Converter(writer, metadata.getPrecision(),
                metadata.getScale(), mutator.getManagedBuffer());
          }
        }
        default: {
          throw new UnsupportedOperationException("Unsupported type: " + type.getOriginalType());
        }
      }
    }
    case FIXED_LEN_BYTE_ARRAY:
      if (type.getOriginalType() == OriginalType.DECIMAL) {
        ParquetReaderUtility.checkDecimalTypeEnabled(options);
        DecimalMetadata metadata = type.getDecimalMetadata();
        if (metadata.getPrecision() <= 28) {
          Decimal28SparseWriter writer = type.getRepetition() == Repetition.REPEATED
              ? mapWriter.list(name).decimal28Sparse()
              : mapWriter.decimal28Sparse(name, metadata.getScale(), metadata.getPrecision());
          return new DrillBinaryToDecimal28Converter(writer, metadata.getPrecision(),
              metadata.getScale(), mutator.getManagedBuffer());
        } else {
          Decimal38SparseWriter writer = type.getRepetition() == Repetition.REPEATED
              ? mapWriter.list(name).decimal38Sparse()
              : mapWriter.decimal38Sparse(name, metadata.getScale(), metadata.getPrecision());
          return new DrillBinaryToDecimal38Converter(writer, metadata.getPrecision(),
              metadata.getScale(), mutator.getManagedBuffer());
        }
      } else if (type.getOriginalType() == OriginalType.INTERVAL) {
        IntervalWriter writer = type.getRepetition() == Repetition.REPEATED
            ? mapWriter.list(name).interval() : mapWriter.interval(name);
        return new DrillFixedLengthByteArrayToInterval(writer);
      } else {
        VarBinaryWriter writer = type.getRepetition() == Repetition.REPEATED
            ? mapWriter.list(name).varBinary() : mapWriter.varBinary(name);
        return new DrillFixedBinaryToVarbinaryConverter(writer, type.getTypeLength(),
            mutator.getManagedBuffer());
      }
    default:
      throw new UnsupportedOperationException("Unsupported type: " + type.getPrimitiveTypeName());
  }
}
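// getConverterForType above is one large two-level dispatch: first on the Parquet primitive
// type, then on its original (logical) type annotation. A minimal sketch of that same shape,
// returning description strings instead of Drill writer/converter objects; the class name and
// output strings are illustrative, and the org.apache.parquet coordinates are assumed.
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;

public final class ConverterDispatchSketch {
  static String describe(PrimitiveTypeName primitive, OriginalType original) {
    switch (primitive) {
      case INT32:
        if (original == null) {
          return "INT -> DrillIntConverter";
        }
        switch (original) {
          case DECIMAL:     return "DECIMAL(9) -> DrillDecimal9Converter";
          case DATE:        return "DATE -> DrillDateConverter";
          case TIME_MILLIS: return "TIME -> DrillTimeConverter";
          default: throw new UnsupportedOperationException("Unsupported type: " + original);
        }
      case BINARY:
        return original == OriginalType.UTF8
            ? "VARCHAR -> DrillVarCharConverter"
            : "VARBINARY -> DrillVarBinaryConverter";
      default:
        throw new UnsupportedOperationException("Unsupported type: " + primitive);
    }
  }

  public static void main(String[] args) {
    // An INT32 annotated as DATE resolves to the date converter.
    System.out.println(describe(PrimitiveTypeName.INT32, OriginalType.DATE));
  }
}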