/**
 * Runs {@code testVectors} with a verifier that looks up, by the runtime class of each vector,
 * the verifier registered for that class and delegates to it.
 */
@Test
public void testVectorMetadataIsAccurate() throws Exception {
  final VectorVerifier withoutChildren = new ChildVerifier();
  final VectorVerifier withOffsetChild = new ChildVerifier(UInt4Holder.TYPE);

  // One verifier per concrete vector class exercised by this test.
  final ImmutableMap<Class, VectorVerifier> verifiersByClass =
      ImmutableMap.<Class, VectorVerifier>builder()
          .put(UInt4Vector.class, withoutChildren)
          .put(BitVector.class, withoutChildren)
          .put(VarCharVector.class, withOffsetChild)
          .put(
              NullableVarCharVector.class,
              new ChildVerifier(
                  UInt1Holder.TYPE, Types.optional(TypeProtos.MinorType.VARCHAR)))
          .put(
              RepeatedListVector.class,
              new ChildVerifier(UInt4Holder.TYPE, Types.LATE_BIND_TYPE))
          .put(MapVector.class, withoutChildren)
          .put(RepeatedMapVector.class, withOffsetChild)
          .build();

  testVectors(
      new VectorVerifier() {
        @Override
        public void verify(ValueVector vector) throws Exception {
          // Dispatch on the vector's runtime class.
          verifiersByClass.get(vector.getClass()).verify(vector);
        }
      });
}
/**
 * Merges several batch schemas into a single schema containing the union of all their columns.
 *
 * <p>Columns are matched by {@link SchemaPath}, and the merged schema lists them in the order
 * in which each path was first encountered. When the same path is seen with more than one minor
 * type, the merged column is a UNION whose subtypes are all of the types observed; the subtypes
 * of an incoming UNION column are flattened into that set. Every merged column is created with
 * OPTIONAL mode.
 *
 * @param schemas the schemas to merge; at least one schema is required
 * @return the merged schema, using the selection vector mode of the first input schema
 * @throws IllegalArgumentException if {@code schemas} is null or empty
 * @throws RuntimeException if any column has minor type MAP or LIST
 */
public static BatchSchema mergeSchemas(BatchSchema... schemas) {
  // Fail fast with a clear message instead of an ArrayIndexOutOfBoundsException at schemas[0].
  if (schemas == null || schemas.length == 0) {
    throw new IllegalArgumentException("At least one schema is required to merge schemas");
  }

  // Insertion-ordered so merged columns keep the order in which their paths were first seen.
  Map<SchemaPath, Set<MinorType>> typeSetMap = Maps.newLinkedHashMap();
  for (BatchSchema schema : schemas) {
    for (MaterializedField field : schema) {
      MinorType newType = field.getType().getMinorType();
      if (newType == MinorType.MAP || newType == MinorType.LIST) {
        throw new RuntimeException(
            "Schema change not currently supported for schemas with complex types");
      }

      SchemaPath path = field.getPath();
      Set<MinorType> currentTypes = typeSetMap.get(path);
      if (currentTypes == null) {
        currentTypes = Sets.newHashSet();
        typeSetMap.put(path, currentTypes);
      }

      if (newType == MinorType.UNION) {
        // Flatten an incoming union so the merged column never nests a UNION subtype.
        for (MinorType subType : field.getType().getSubTypeList()) {
          currentTypes.add(subType);
        }
      } else {
        currentTypes.add(newType);
      }
    }
  }

  List<MaterializedField> fields = Lists.newArrayList();
  for (Map.Entry<SchemaPath, Set<MinorType>> entry : typeSetMap.entrySet()) {
    SchemaPath path = entry.getKey();
    Set<MinorType> types = entry.getValue();
    if (types.size() > 1) {
      // Conflicting types for the same path: expose the column as a UNION of all of them.
      MajorType.Builder builder =
          MajorType.newBuilder().setMinorType(MinorType.UNION).setMode(DataMode.OPTIONAL);
      for (MinorType t : types) {
        builder.addSubType(t);
      }
      fields.add(MaterializedField.create(path, builder.build()));
    } else {
      fields.add(MaterializedField.create(path, Types.optional(types.iterator().next())));
    }
  }

  return new SchemaBuilder()
      .addFields(fields)
      .setSelectionVectorMode(schemas[0].getSelectionVectorMode())
      .build();
}
/**
 * Builds the single function definition exposed by this provider. It is created from the
 * strings {@code "bytesubstring"} and {@code "byte_substr"}, takes a VARBINARY argument followed
 * by BIGINT {@code offset} and {@code length} arguments, and uses
 * {@code OutputTypeDeterminer.SameAsFirstInput} to determine its output type.
 *
 * @return a one-element array holding that definition
 */
@Override
public FunctionDefinition[] getFunctionDefintions() {
  final Arg source =
      new Arg(
          Types.required(TypeProtos.MinorType.VARBINARY),
          Types.optional(TypeProtos.MinorType.VARBINARY));
  final Arg offset = new Arg(false, false, "offset", TypeProtos.MinorType.BIGINT);
  final Arg length = new Arg(false, false, "length", TypeProtos.MinorType.BIGINT);

  final FunctionDefinition byteSubstring =
      FunctionDefinition.simple(
          "bytesubstring",
          new BasicArgumentValidator(source, offset, length),
          new OutputTypeDeterminer.SameAsFirstInput(),
          "byte_substr");

  return new FunctionDefinition[] {byteSubstring};
}
/**
 * Creates one output vector per supported column of {@code schema} and stores the resulting
 * column descriptors in {@code projectedCols}.
 *
 * <p>A column whose Kudu type has no entry in {@code TYPES} is logged as a warning and skipped.
 * For every other column a field is added to {@code output} (OPTIONAL when the column is
 * nullable, REQUIRED otherwise), its vector is allocated, and a {@code ProjectedColumnInfo}
 * records the vector, the Kudu column and the column's index in the schema.
 *
 * @param schema the Kudu schema whose columns are to be projected
 * @throws SchemaChangeException declared by this method's signature
 */
private void initCols(Schema schema) throws SchemaChangeException {
  final ImmutableList.Builder<ProjectedColumnInfo> projected = ImmutableList.builder();

  for (int index = 0; index < schema.getColumnCount(); index++) {
    final ColumnSchema kuduColumn = schema.getColumnByIndex(index);
    final String name = kuduColumn.getName();
    final Type kuduType = kuduColumn.getType();

    final MinorType minorType = TYPES.get(kuduType);
    if (minorType == null) {
      // No mapping for this Kudu type: report it and leave the column out of the projection.
      logger.warn(
          "Ignoring column that is unsupported.",
          UserException.unsupportedError()
              .message(
                  "A column you queried has a data type that is not currently supported by the Kudu storage plugin. "
                      + "The column's name was %s and its Kudu data type was %s. ",
                  name,
                  kuduType.toString())
              .addContext("column Name", name)
              .addContext("plugin", "kudu")
              .build(logger));
      continue;
    }

    final MajorType majorType =
        kuduColumn.isNullable() ? Types.optional(minorType) : Types.required(minorType);
    final MaterializedField field = MaterializedField.create(name, majorType);
    final Class<? extends ValueVector> vectorClass =
        (Class<? extends ValueVector>)
            TypeHelper.getValueVectorClass(minorType, majorType.getMode());

    final ValueVector vector = output.addField(field, vectorClass);
    vector.allocateNew();

    final ProjectedColumnInfo info = new ProjectedColumnInfo();
    info.vv = vector;
    info.kuduColumn = kuduColumn;
    info.index = index;
    projected.add(info);
  }

  projectedCols = projected.build();
}
/**
 * Returns the writer to use for values of the given minor type.
 *
 * <p>In the UNION state the current writer is returned unchanged. In the UNTYPED state a null
 * {@code type} yields {@code null}; otherwise a vector of the optional form of {@code type} is
 * obtained from {@code listVector}, allocated, installed through {@code setWriter}, and the
 * writer is moved to the current position. Finally, if {@code type} differs from
 * {@code this.type}, the writer is promoted to a union writer.
 *
 * @param type the minor type about to be written; may be null
 * @return the writer to use, or {@code null} when untyped and {@code type} is null
 */
protected FieldWriter getWriter(MinorType type) {
  if (state == State.UNION) {
    return writer;
  }

  if (state == State.UNTYPED) {
    if (type == null) {
      return null;
    }
    final VectorDescriptor descriptor = new VectorDescriptor(Types.optional(type));
    final ValueVector created = listVector.addOrGetVector(descriptor).getVector();
    created.allocateNew();
    setWriter(created);
    writer.setPosition(position);
  }

  // Reference comparison, as in the original; a mismatch triggers promotion to a union.
  return type == this.type ? writer : promoteToUnion();
}
/**
 * Replaces the current single-type vector with a union vector and switches this writer to the
 * UNION state.
 *
 * <p>The existing vector's contents are transferred into a new vector named after the
 * lower-cased minor type. A union vector is obtained from {@code parentContainer} or, failing
 * that, from {@code listVector}, and the transferred vector is added to it. Every index before
 * the current one is then marked with the original minor type.
 *
 * @return the new union writer, positioned at the current index
 */
private FieldWriter promoteToUnion() {
  final String name = vector.getField().getLastName();
  final MinorType originalType = vector.getField().getType().getMinorType();

  // Locale.ROOT keeps the name stable regardless of the JVM default locale; a Turkish locale,
  // for example, would otherwise lower-case 'I' to a dotless 'ı'.
  final TransferPair tp =
      vector.getTransferPair(
          originalType.name().toLowerCase(java.util.Locale.ROOT), vector.getAllocator());
  tp.transfer();

  if (parentContainer != null) {
    unionVector =
        parentContainer.addOrGet(name, Types.optional(MinorType.UNION), UnionVector.class);
  } else if (listVector != null) {
    unionVector = listVector.promoteToUnion();
  }
  unionVector.addVector(tp.getTo());

  writer = new UnionWriter(unionVector);
  final int currentIndex = idx();
  writer.setPosition(currentIndex);

  // Tag every index before the current one with the pre-promotion type.
  for (int i = 0; i < currentIndex; i++) {
    unionVector.getMutator().setType(i, originalType);
  }

  vector = null;
  state = State.UNION;
  return writer;
}
/**
 * Reports the vector type for the given column. This implementation ignores both arguments and
 * always answers with an optional VARCHAR type.
 *
 * @param column the column being asked about (unused)
 * @param plannerSettings the planner settings (unused)
 * @return the optional VARCHAR major type
 */
@Override
public TypeProtos.MajorType getVectorType(SchemaPath column, PlannerSettings plannerSettings) {
  final TypeProtos.MajorType optionalVarchar = Types.optional(TypeProtos.MinorType.VARCHAR);
  return optionalVarchar;
}
/**
 * Prepares this reader for the row group at {@code rowGroupIndex}.
 *
 * <p>The method works in three phases:
 *
 * <ol>
 *   <li>It walks the columns of the file schema, counts the selected ones, and adds up the
 *       length of the fixed-width ones to decide how many records to read per batch.
 *   <li>It adds an output vector for each selected column and creates a column reader for it.
 *       Non-BINARY, non-repeated columns go to {@code columnStatuses}; all others are collected
 *       and handed to a {@code VarLenBinaryReader}.
 *   <li>For non-star queries, it adds a nullable INT vector for every projected column whose
 *       {@code columnsFound} flag is still false.
 * </ol>
 *
 * <p>Exceptions thrown in the second and third phases are passed to {@code handleAndRaise};
 * the first phase runs outside the try block.
 *
 * @param operatorContext the operator context, stored on this reader
 * @param output the mutator that the output vectors are added to
 * @throws ExecutionSetupException declared by this method's signature
 */
@Override
public void setup(OperatorContext operatorContext, OutputMutator output)
    throws ExecutionSetupException {
  this.operatorContext = operatorContext;
  if (!isStarQuery()) {
    // NOTE(review): columnsFound is only read in this method; presumably fieldSelected() sets
    // its entries - confirm.
    columnsFound = new boolean[getColumns().size()];
    nullFilledVectors = new ArrayList<>();
  }
  columnStatuses = new ArrayList<>();
  // totalRecords = footer.getBlocks().get(rowGroupIndex).getRowCount();
  List<ColumnDescriptor> columns = footer.getFileMetaData().getSchema().getColumns();
  allFieldsFixedLength = true;
  ColumnDescriptor column;
  ColumnChunkMetaData columnChunkMetaData;
  int columnsToScan = 0;
  mockRecordsRead = 0;

  MaterializedField field;
  // ParquetMetadataConverter metaConverter = new ParquetMetadataConverter();
  // NOTE(review): fileMetaData is declared but never used in this method.
  FileMetaData fileMetaData;

  logger.debug(
      "Reading row group({}) with {} records in file {}.",
      rowGroupIndex,
      footer.getBlocks().get(rowGroupIndex).getRowCount(),
      hadoopPath.toUri().getPath());
  totalRecordsRead = 0;

  // TODO - figure out how to deal with this better once we add nested reading, note also look
  // where this map is used below
  // store a map from column name to converted types if they are non-null
  Map<String, SchemaElement> schemaElements =
      ParquetReaderUtility.getColNameToSchemaElementMapping(footer);

  // loop to add up the length of the fixed width columns and build the schema
  for (int i = 0; i < columns.size(); ++i) {
    column = columns.get(i);
    // The schema element is looked up by the first segment of the column path only.
    SchemaElement se = schemaElements.get(column.getPath()[0]);
    MajorType mt =
        ParquetToDrillTypeConverter.toMajorType(
            column.getType(),
            se.getType_length(),
            getDataMode(column),
            se,
            fragmentContext.getOptions());
    field = MaterializedField.create(toFieldName(column.getPath()), mt);
    if (!fieldSelected(field)) {
      continue;
    }
    columnsToScan++;
    int dataTypeLength = getDataTypeLength(column, se);
    // A length of -1 marks the column as not fixed-length.
    if (dataTypeLength == -1) {
      allFieldsFixedLength = false;
    } else {
      // NOTE(review): bitWidthAllFixedFields is accumulated but not reset in this method;
      // presumably it starts at zero - confirm.
      bitWidthAllFixedFields += dataTypeLength;
    }
  }
  // rowGroupOffset =
  // footer.getBlocks().get(rowGroupIndex).getColumns().get(0).getFirstDataPageOffset();

  if (columnsToScan != 0 && allFieldsFixedLength) {
    // Batch size is limited by the configured batch size, by the value count of the first
    // column, and by a hard cap of 65535.
    // NOTE(review): the value count is read from row group 0, not rowGroupIndex - confirm this
    // is intended.
    recordsPerBatch =
        (int)
            Math.min(
                Math.min(
                    batchSize / bitWidthAllFixedFields,
                    footer.getBlocks().get(0).getColumns().get(0).getValueCount()),
                65535);
  } else {
    recordsPerBatch = DEFAULT_RECORDS_TO_READ_IF_NOT_FIXED_WIDTH;
  }

  try {
    ValueVector vector;
    SchemaElement schemaElement;
    final ArrayList<VarLengthColumn<? extends ValueVector>> varLengthColumns = new ArrayList<>();
    // initialize all of the column read status objects
    boolean fieldFixedLength;
    // the column chunk meta-data is not guaranteed to be in the same order as the columns in the
    // schema
    // a map is constructed for fast access to the correct columnChunkMetadata to correspond
    // to an element in the schema
    Map<String, Integer> columnChunkMetadataPositionsInList = new HashMap<>();
    BlockMetaData rowGroupMetadata = footer.getBlocks().get(rowGroupIndex);

    // Key each column chunk position by the string form of its path.
    int colChunkIndex = 0;
    for (ColumnChunkMetaData colChunk : rowGroupMetadata.getColumns()) {
      columnChunkMetadataPositionsInList.put(
          Arrays.toString(colChunk.getPath().toArray()), colChunkIndex);
      colChunkIndex++;
    }

    for (int i = 0; i < columns.size(); ++i) {
      column = columns.get(i);
      columnChunkMetaData =
          rowGroupMetadata
              .getColumns()
              .get(columnChunkMetadataPositionsInList.get(Arrays.toString(column.getPath())));
      schemaElement = schemaElements.get(column.getPath()[0]);
      MajorType type =
          ParquetToDrillTypeConverter.toMajorType(
              column.getType(),
              schemaElement.getType_length(),
              getDataMode(column),
              schemaElement,
              fragmentContext.getOptions());
      field = MaterializedField.create(toFieldName(column.getPath()), type);
      // the field was not requested to be read
      if (!fieldSelected(field)) {
        continue;
      }

      // Every primitive type other than BINARY is treated as fixed length here.
      fieldFixedLength = column.getType() != PrimitiveType.PrimitiveTypeName.BINARY;
      vector =
          output.addField(
              field,
              (Class<? extends ValueVector>)
                  TypeHelper.getValueVectorClass(type.getMinorType(), type.getMode()));
      if (column.getType() != PrimitiveType.PrimitiveTypeName.BINARY) {
        if (column.getMaxRepetitionLevel() > 0) {
          // Repeated fixed-width column: the fixed reader writes into the repeated vector's data
          // vector and is wrapped in a FixedWidthRepeatedReader, which joins the var-length list.
          final RepeatedValueVector repeatedVector = RepeatedValueVector.class.cast(vector);
          ColumnReader<?> dataReader =
              ColumnReaderFactory.createFixedColumnReader(
                  this,
                  fieldFixedLength,
                  column,
                  columnChunkMetaData,
                  recordsPerBatch,
                  repeatedVector.getDataVector(),
                  schemaElement);
          varLengthColumns.add(
              new FixedWidthRepeatedReader(
                  this,
                  dataReader,
                  getTypeLengthInBits(column.getType()),
                  -1,
                  column,
                  columnChunkMetaData,
                  false,
                  repeatedVector,
                  schemaElement));
        } else {
          columnStatuses.add(
              ColumnReaderFactory.createFixedColumnReader(
                  this,
                  fieldFixedLength,
                  column,
                  columnChunkMetaData,
                  recordsPerBatch,
                  vector,
                  schemaElement));
        }
      } else {
        // create a reader and add it to the appropriate list
        varLengthColumns.add(
            ColumnReaderFactory.getReader(
                this, -1, column, columnChunkMetaData, false, vector, schemaElement));
      }
    }
    varLengthReader = new VarLenBinaryReader(this, varLengthColumns);

    if (!isStarQuery()) {
      List<SchemaPath> projectedColumns = Lists.newArrayList(getColumns());
      SchemaPath col;
      for (int i = 0; i < columnsFound.length; i++) {
        col = projectedColumns.get(i);
        assert col != null;
        // A projected column that was not found (and is not the star column) gets a nullable
        // INT vector, which is kept in nullFilledVectors.
        if (!columnsFound[i] && !col.equals(STAR_COLUMN)) {
          nullFilledVectors.add(
              (NullableIntVector)
                  output.addField(
                      MaterializedField.create(
                          col.getAsUnescapedPath(), Types.optional(TypeProtos.MinorType.INT)),
                      (Class<? extends ValueVector>)
                          TypeHelper.getValueVectorClass(
                              TypeProtos.MinorType.INT, DataMode.OPTIONAL)));
        }
      }
    }
  } catch (Exception e) {
    handleAndRaise("Failure in setting up reader", e);
  }
}
@Test public void testHashFunctionResolution(@Injectable DrillConfig config) throws JClassAlreadyExistsException, IOException { FunctionImplementationRegistry registry = new FunctionImplementationRegistry(config); // test required vs nullable Int input resolveHash( config, new TypedNullConstant(Types.optional(TypeProtos.MinorType.INT)), Types.optional(TypeProtos.MinorType.INT), Types.required(TypeProtos.MinorType.INT), TypeProtos.DataMode.OPTIONAL, registry); resolveHash( config, new ValueExpressions.IntExpression(1, ExpressionPosition.UNKNOWN), Types.required(TypeProtos.MinorType.INT), Types.required(TypeProtos.MinorType.INT), TypeProtos.DataMode.REQUIRED, registry); // test required vs nullable float input resolveHash( config, new TypedNullConstant(Types.optional(TypeProtos.MinorType.FLOAT4)), Types.optional(TypeProtos.MinorType.FLOAT4), Types.required(TypeProtos.MinorType.FLOAT4), TypeProtos.DataMode.OPTIONAL, registry); resolveHash( config, new ValueExpressions.FloatExpression(5.0f, ExpressionPosition.UNKNOWN), Types.required(TypeProtos.MinorType.FLOAT4), Types.required(TypeProtos.MinorType.FLOAT4), TypeProtos.DataMode.REQUIRED, registry); // test required vs nullable long input resolveHash( config, new TypedNullConstant(Types.optional(TypeProtos.MinorType.BIGINT)), Types.optional(TypeProtos.MinorType.BIGINT), Types.required(TypeProtos.MinorType.BIGINT), TypeProtos.DataMode.OPTIONAL, registry); resolveHash( config, new ValueExpressions.LongExpression(100L, ExpressionPosition.UNKNOWN), Types.required(TypeProtos.MinorType.BIGINT), Types.required(TypeProtos.MinorType.BIGINT), TypeProtos.DataMode.REQUIRED, registry); // test required vs nullable double input resolveHash( config, new TypedNullConstant(Types.optional(TypeProtos.MinorType.FLOAT8)), Types.optional(TypeProtos.MinorType.FLOAT8), Types.required(TypeProtos.MinorType.FLOAT8), TypeProtos.DataMode.OPTIONAL, registry); resolveHash( config, new ValueExpressions.DoubleExpression(100.0, 
ExpressionPosition.UNKNOWN), Types.required(TypeProtos.MinorType.FLOAT8), Types.required(TypeProtos.MinorType.FLOAT8), TypeProtos.DataMode.REQUIRED, registry); }