Example #1
  @Test
  public void testVectorMetadataIsAccurate() throws Exception {
    final VectorVerifier noChild = new ChildVerifier();
    final VectorVerifier offsetChild = new ChildVerifier(UInt4Holder.TYPE);

    final ImmutableMap.Builder<Class, VectorVerifier> builder = ImmutableMap.builder();
    builder.put(UInt4Vector.class, noChild);
    builder.put(BitVector.class, noChild);
    builder.put(VarCharVector.class, offsetChild);
    builder.put(
        NullableVarCharVector.class,
        new ChildVerifier(UInt1Holder.TYPE, Types.optional(TypeProtos.MinorType.VARCHAR)));
    builder.put(
        RepeatedListVector.class, new ChildVerifier(UInt4Holder.TYPE, Types.LATE_BIND_TYPE));
    builder.put(MapVector.class, noChild);
    builder.put(RepeatedMapVector.class, offsetChild);
    final ImmutableMap<Class, VectorVerifier> children = builder.build();

    testVectors(
        new VectorVerifier() {

          @Override
          public void verify(ValueVector vector) throws Exception {

            final Class klazz = vector.getClass();
            final VectorVerifier verifier = children.get(klazz);
            verifier.verify(vector);
          }
        });
  }
Example #2
  /**
   * Returns the merged schema. The merged schema contains the union of all columns from the input
   * schemas. If columns with the same SchemaPath have conflicting types, the merged schema
   * represents that column with a UNION type.
   *
   * @param schemas the schemas to merge
   * @return the merged schema
   */
  public static BatchSchema mergeSchemas(BatchSchema... schemas) {
    Map<SchemaPath, Set<MinorType>> typeSetMap = Maps.newLinkedHashMap();

    for (BatchSchema s : schemas) {
      for (MaterializedField field : s) {
        SchemaPath path = field.getPath();
        Set<MinorType> currentTypes = typeSetMap.get(path);
        if (currentTypes == null) {
          currentTypes = Sets.newHashSet();
          typeSetMap.put(path, currentTypes);
        }
        MinorType newType = field.getType().getMinorType();
        if (newType == MinorType.MAP || newType == MinorType.LIST) {
          throw new RuntimeException(
              "Schema change not currently supported for schemas with complex types");
        }
        if (newType == MinorType.UNION) {
          for (MinorType subType : field.getType().getSubTypeList()) {
            currentTypes.add(subType);
          }
        } else {
          currentTypes.add(newType);
        }
      }
    }

    List<MaterializedField> fields = Lists.newArrayList();

    for (SchemaPath path : typeSetMap.keySet()) {
      Set<MinorType> types = typeSetMap.get(path);
      if (types.size() > 1) {
        MajorType.Builder builder =
            MajorType.newBuilder().setMinorType(MinorType.UNION).setMode(DataMode.OPTIONAL);
        for (MinorType t : types) {
          builder.addSubType(t);
        }
        MaterializedField field = MaterializedField.create(path, builder.build());
        fields.add(field);
      } else {
        MaterializedField field =
            MaterializedField.create(path, Types.optional(types.iterator().next()));
        fields.add(field);
      }
    }

    SchemaBuilder schemaBuilder = new SchemaBuilder();
    BatchSchema s =
        schemaBuilder
            .addFields(fields)
            .setSelectionVectorMode(schemas[0].getSelectionVectorMode())
            .build();
    return s;
  }
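The Javadoc above states the merge rules; as a rough illustration (not taken from the Drill sources), the sketch below builds two single-column schemas that disagree on the type of column "a" and merges them. It reuses only classes already visible in these examples (SchemaBuilder, MaterializedField, SchemaPath, Types) and assumes mergeSchemas is statically imported from its enclosing class, which the snippet does not show; the class name MergeSchemasSketch is made up for the example.

// Hypothetical usage sketch, not part of the original example. mergeSchemas is
// the static helper shown above and is assumed to be statically imported from
// its (unshown) enclosing class.
import org.apache.drill.common.expression.SchemaPath;
import org.apache.drill.common.types.TypeProtos.MinorType;
import org.apache.drill.common.types.Types;
import org.apache.drill.exec.record.BatchSchema;
import org.apache.drill.exec.record.MaterializedField;
import org.apache.drill.exec.record.SchemaBuilder;

import com.google.common.collect.ImmutableList;

public class MergeSchemasSketch {
  public static void main(String[] args) {
    // First input: column "a" is a required INT.
    BatchSchema first = new SchemaBuilder()
        .addFields(ImmutableList.of(MaterializedField.create(
            SchemaPath.getSimplePath("a"), Types.required(MinorType.INT))))
        .setSelectionVectorMode(BatchSchema.SelectionVectorMode.NONE)
        .build();

    // Second input: the same column "a" arrives as an optional VARCHAR.
    BatchSchema second = new SchemaBuilder()
        .addFields(ImmutableList.of(MaterializedField.create(
            SchemaPath.getSimplePath("a"), Types.optional(MinorType.VARCHAR))))
        .setSelectionVectorMode(BatchSchema.SelectionVectorMode.NONE)
        .build();

    // Because the inputs disagree on the minor type of "a", the merged schema
    // should expose "a" as an OPTIONAL UNION whose sub-type list holds INT and
    // VARCHAR; a column with one consistent type would come back as OPTIONAL.
    BatchSchema merged = mergeSchemas(first, second);
    System.out.println(merged);
  }
}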
 @Override
 public FunctionDefinition[] getFunctionDefintions() {
   return new FunctionDefinition[] {
     FunctionDefinition.simple(
         "bytesubstring",
         new BasicArgumentValidator(
             new Arg(
                 Types.required(TypeProtos.MinorType.VARBINARY),
                 Types.optional(TypeProtos.MinorType.VARBINARY)),
             new Arg(false, false, "offset", TypeProtos.MinorType.BIGINT),
             new Arg(false, false, "length", TypeProtos.MinorType.BIGINT)),
         new OutputTypeDeterminer.SameAsFirstInput(),
         "byte_substr")
   };
 }
Example #4
  private void initCols(Schema schema) throws SchemaChangeException {
    ImmutableList.Builder<ProjectedColumnInfo> pciBuilder = ImmutableList.builder();

    for (int i = 0; i < schema.getColumnCount(); i++) {
      ColumnSchema col = schema.getColumnByIndex(i);

      final String name = col.getName();
      final Type kuduType = col.getType();
      MinorType minorType = TYPES.get(kuduType);
      if (minorType == null) {
        logger.warn(
            "Ignoring column that is unsupported.",
            UserException.unsupportedError()
                .message(
                    "A column you queried has a data type that is not currently supported by the Kudu storage plugin. "
                        + "The column's name was %s and its Kudu data type was %s. ",
                    name, kuduType.toString())
                .addContext("column Name", name)
                .addContext("plugin", "kudu")
                .build(logger));

        continue;
      }
      MajorType majorType;
      if (col.isNullable()) {
        majorType = Types.optional(minorType);
      } else {
        majorType = Types.required(minorType);
      }
      MaterializedField field = MaterializedField.create(name, majorType);
      final Class<? extends ValueVector> clazz =
          (Class<? extends ValueVector>)
              TypeHelper.getValueVectorClass(minorType, majorType.getMode());
      ValueVector vector = output.addField(field, clazz);
      vector.allocateNew();

      ProjectedColumnInfo pci = new ProjectedColumnInfo();
      pci.vv = vector;
      pci.kuduColumn = col;
      pci.index = i;
      pciBuilder.add(pci);
    }

    projectedCols = pciBuilder.build();
  }
Example #5
 protected FieldWriter getWriter(MinorType type) {
   if (state == State.UNION) {
     return writer;
   }
   if (state == State.UNTYPED) {
     if (type == null) {
       return null;
     }
     ValueVector v =
         listVector.addOrGetVector(new VectorDescriptor(Types.optional(type))).getVector();
     v.allocateNew();
     setWriter(v);
     writer.setPosition(position);
   }
   if (type != this.type) {
     return promoteToUnion();
   }
   return writer;
 }
Example #6
 private FieldWriter promoteToUnion() {
   String name = vector.getField().getLastName();
   TransferPair tp =
       vector.getTransferPair(
           vector.getField().getType().getMinorType().name().toLowerCase(), vector.getAllocator());
   tp.transfer();
   if (parentContainer != null) {
     unionVector =
         parentContainer.addOrGet(name, Types.optional(MinorType.UNION), UnionVector.class);
   } else if (listVector != null) {
     unionVector = listVector.promoteToUnion();
   }
   unionVector.addVector(tp.getTo());
   writer = new UnionWriter(unionVector);
   writer.setPosition(idx());
   for (int i = 0; i < idx(); i++) {
     unionVector.getMutator().setType(i, vector.getField().getType().getMinorType());
   }
   vector = null;
   state = State.UNION;
   return writer;
 }
 @Override
 public TypeProtos.MajorType getVectorType(SchemaPath column, PlannerSettings plannerSettings) {
   return Types.optional(TypeProtos.MinorType.VARCHAR);
 }
  @Override
  public void setup(OperatorContext operatorContext, OutputMutator output)
      throws ExecutionSetupException {
    this.operatorContext = operatorContext;
    if (!isStarQuery()) {
      columnsFound = new boolean[getColumns().size()];
      nullFilledVectors = new ArrayList<>();
    }
    columnStatuses = new ArrayList<>();
    //    totalRecords = footer.getBlocks().get(rowGroupIndex).getRowCount();
    List<ColumnDescriptor> columns = footer.getFileMetaData().getSchema().getColumns();
    allFieldsFixedLength = true;
    ColumnDescriptor column;
    ColumnChunkMetaData columnChunkMetaData;
    int columnsToScan = 0;
    mockRecordsRead = 0;

    MaterializedField field;
    //    ParquetMetadataConverter metaConverter = new ParquetMetadataConverter();
    FileMetaData fileMetaData;

    logger.debug(
        "Reading row group({}) with {} records in file {}.",
        rowGroupIndex,
        footer.getBlocks().get(rowGroupIndex).getRowCount(),
        hadoopPath.toUri().getPath());
    totalRecordsRead = 0;

    // TODO - figure out how to deal with this better once we add nested reading, note also look
    // where this map is used below
    // store a map from column name to converted types if they are non-null
    Map<String, SchemaElement> schemaElements =
        ParquetReaderUtility.getColNameToSchemaElementMapping(footer);

    // loop to add up the length of the fixed width columns and build the schema
    for (int i = 0; i < columns.size(); ++i) {
      column = columns.get(i);
      SchemaElement se = schemaElements.get(column.getPath()[0]);
      MajorType mt =
          ParquetToDrillTypeConverter.toMajorType(
              column.getType(),
              se.getType_length(),
              getDataMode(column),
              se,
              fragmentContext.getOptions());
      field = MaterializedField.create(toFieldName(column.getPath()), mt);
      if (!fieldSelected(field)) {
        continue;
      }
      columnsToScan++;
      int dataTypeLength = getDataTypeLength(column, se);
      if (dataTypeLength == -1) {
        allFieldsFixedLength = false;
      } else {
        bitWidthAllFixedFields += dataTypeLength;
      }
    }
    //    rowGroupOffset =
    // footer.getBlocks().get(rowGroupIndex).getColumns().get(0).getFirstDataPageOffset();

    if (columnsToScan != 0 && allFieldsFixedLength) {
      recordsPerBatch =
          (int)
              Math.min(
                  Math.min(
                      batchSize / bitWidthAllFixedFields,
                      footer.getBlocks().get(0).getColumns().get(0).getValueCount()),
                  65535);
    } else {
      recordsPerBatch = DEFAULT_RECORDS_TO_READ_IF_NOT_FIXED_WIDTH;
    }

    try {
      ValueVector vector;
      SchemaElement schemaElement;
      final ArrayList<VarLengthColumn<? extends ValueVector>> varLengthColumns = new ArrayList<>();
      // initialize all of the column read status objects
      boolean fieldFixedLength;
      // the column chunk meta-data is not guaranteed to be in the same order as the columns in the
      // schema
      // a map is constructed for fast access to the correct columnChunkMetadata to correspond
      // to an element in the schema
      Map<String, Integer> columnChunkMetadataPositionsInList = new HashMap<>();
      BlockMetaData rowGroupMetadata = footer.getBlocks().get(rowGroupIndex);

      int colChunkIndex = 0;
      for (ColumnChunkMetaData colChunk : rowGroupMetadata.getColumns()) {
        columnChunkMetadataPositionsInList.put(
            Arrays.toString(colChunk.getPath().toArray()), colChunkIndex);
        colChunkIndex++;
      }
      for (int i = 0; i < columns.size(); ++i) {
        column = columns.get(i);
        columnChunkMetaData =
            rowGroupMetadata
                .getColumns()
                .get(columnChunkMetadataPositionsInList.get(Arrays.toString(column.getPath())));
        schemaElement = schemaElements.get(column.getPath()[0]);
        MajorType type =
            ParquetToDrillTypeConverter.toMajorType(
                column.getType(),
                schemaElement.getType_length(),
                getDataMode(column),
                schemaElement,
                fragmentContext.getOptions());
        field = MaterializedField.create(toFieldName(column.getPath()), type);
        // the field was not requested to be read
        if (!fieldSelected(field)) {
          continue;
        }

        fieldFixedLength = column.getType() != PrimitiveType.PrimitiveTypeName.BINARY;
        vector =
            output.addField(
                field,
                (Class<? extends ValueVector>)
                    TypeHelper.getValueVectorClass(type.getMinorType(), type.getMode()));
        if (column.getType() != PrimitiveType.PrimitiveTypeName.BINARY) {
          if (column.getMaxRepetitionLevel() > 0) {
            final RepeatedValueVector repeatedVector = RepeatedValueVector.class.cast(vector);
            ColumnReader<?> dataReader =
                ColumnReaderFactory.createFixedColumnReader(
                    this,
                    fieldFixedLength,
                    column,
                    columnChunkMetaData,
                    recordsPerBatch,
                    repeatedVector.getDataVector(),
                    schemaElement);
            varLengthColumns.add(
                new FixedWidthRepeatedReader(
                    this,
                    dataReader,
                    getTypeLengthInBits(column.getType()),
                    -1,
                    column,
                    columnChunkMetaData,
                    false,
                    repeatedVector,
                    schemaElement));
          } else {
            columnStatuses.add(
                ColumnReaderFactory.createFixedColumnReader(
                    this,
                    fieldFixedLength,
                    column,
                    columnChunkMetaData,
                    recordsPerBatch,
                    vector,
                    schemaElement));
          }
        } else {
          // create a reader and add it to the appropriate list
          varLengthColumns.add(
              ColumnReaderFactory.getReader(
                  this, -1, column, columnChunkMetaData, false, vector, schemaElement));
        }
      }
      varLengthReader = new VarLenBinaryReader(this, varLengthColumns);

      if (!isStarQuery()) {
        List<SchemaPath> projectedColumns = Lists.newArrayList(getColumns());
        SchemaPath col;
        for (int i = 0; i < columnsFound.length; i++) {
          col = projectedColumns.get(i);
          assert col != null;
          if (!columnsFound[i] && !col.equals(STAR_COLUMN)) {
            nullFilledVectors.add(
                (NullableIntVector)
                    output.addField(
                        MaterializedField.create(
                            col.getAsUnescapedPath(), Types.optional(TypeProtos.MinorType.INT)),
                        (Class<? extends ValueVector>)
                            TypeHelper.getValueVectorClass(
                                TypeProtos.MinorType.INT, DataMode.OPTIONAL)));
          }
        }
      }
    } catch (Exception e) {
      handleAndRaise("Failure in setting up reader", e);
    }
  }
  @Test
  public void testHashFunctionResolution(@Injectable DrillConfig config)
      throws JClassAlreadyExistsException, IOException {
    FunctionImplementationRegistry registry = new FunctionImplementationRegistry(config);
    // test required vs nullable Int input
    resolveHash(
        config,
        new TypedNullConstant(Types.optional(TypeProtos.MinorType.INT)),
        Types.optional(TypeProtos.MinorType.INT),
        Types.required(TypeProtos.MinorType.INT),
        TypeProtos.DataMode.OPTIONAL,
        registry);

    resolveHash(
        config,
        new ValueExpressions.IntExpression(1, ExpressionPosition.UNKNOWN),
        Types.required(TypeProtos.MinorType.INT),
        Types.required(TypeProtos.MinorType.INT),
        TypeProtos.DataMode.REQUIRED,
        registry);

    // test required vs nullable float input
    resolveHash(
        config,
        new TypedNullConstant(Types.optional(TypeProtos.MinorType.FLOAT4)),
        Types.optional(TypeProtos.MinorType.FLOAT4),
        Types.required(TypeProtos.MinorType.FLOAT4),
        TypeProtos.DataMode.OPTIONAL,
        registry);

    resolveHash(
        config,
        new ValueExpressions.FloatExpression(5.0f, ExpressionPosition.UNKNOWN),
        Types.required(TypeProtos.MinorType.FLOAT4),
        Types.required(TypeProtos.MinorType.FLOAT4),
        TypeProtos.DataMode.REQUIRED,
        registry);

    // test required vs nullable long input
    resolveHash(
        config,
        new TypedNullConstant(Types.optional(TypeProtos.MinorType.BIGINT)),
        Types.optional(TypeProtos.MinorType.BIGINT),
        Types.required(TypeProtos.MinorType.BIGINT),
        TypeProtos.DataMode.OPTIONAL,
        registry);

    resolveHash(
        config,
        new ValueExpressions.LongExpression(100L, ExpressionPosition.UNKNOWN),
        Types.required(TypeProtos.MinorType.BIGINT),
        Types.required(TypeProtos.MinorType.BIGINT),
        TypeProtos.DataMode.REQUIRED,
        registry);

    // test required vs nullable double input
    resolveHash(
        config,
        new TypedNullConstant(Types.optional(TypeProtos.MinorType.FLOAT8)),
        Types.optional(TypeProtos.MinorType.FLOAT8),
        Types.required(TypeProtos.MinorType.FLOAT8),
        TypeProtos.DataMode.OPTIONAL,
        registry);

    resolveHash(
        config,
        new ValueExpressions.DoubleExpression(100.0, ExpressionPosition.UNKNOWN),
        Types.required(TypeProtos.MinorType.FLOAT8),
        Types.required(TypeProtos.MinorType.FLOAT8),
        TypeProtos.DataMode.REQUIRED,
        registry);
  }