Example #1
  private void readPageV1(DataPageV1 page) throws IOException {
    this.pageValueCount = page.getValueCount();
    ValuesReader rlReader = page.getRlEncoding().getValuesReader(descriptor, REPETITION_LEVEL);
    ValuesReader dlReader;

    // Initialize the decoders.
    if (page.getDlEncoding() != Encoding.RLE && descriptor.getMaxDefinitionLevel() != 0) {
      throw new UnsupportedOperationException("Unsupported encoding: " + page.getDlEncoding());
    }
    int bitWidth = BytesUtils.getWidthFromMaxInt(descriptor.getMaxDefinitionLevel());
    this.defColumn = new VectorizedRleValuesReader(bitWidth);
    dlReader = this.defColumn;
    this.repetitionLevelColumn = new ValuesReaderIntIterator(rlReader);
    this.definitionLevelColumn = new ValuesReaderIntIterator(dlReader);
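    // A V1 data page lays out repetition levels, definition levels and values back-to-back in
    // a single buffer, so each reader's getNextOffset() marks where the next section begins.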
    try {
      byte[] bytes = page.getBytes().toByteArray();
      rlReader.initFromPage(pageValueCount, bytes, 0);
      int next = rlReader.getNextOffset();
      dlReader.initFromPage(pageValueCount, bytes, next);
      next = dlReader.getNextOffset();
      initDataReader(page.getValueEncoding(), bytes, next);
    } catch (IOException e) {
      throw new IOException("could not read page " + page + " in col " + descriptor, e);
    }
  }
Example #2
 /** Maps a column's Parquet repetition/definition levels to the corresponding Drill data mode. */
 private TypeProtos.DataMode getDataMode(ColumnDescriptor column) {
   if (column.getMaxRepetitionLevel() > 0) {
     return DataMode.REPEATED;
   } else if (column.getMaxDefinitionLevel() == 0) {
     return TypeProtos.DataMode.REQUIRED;
   } else {
     return TypeProtos.DataMode.OPTIONAL;
   }
 }
Example #3
 /**
  * Returns the data type length for a given {@link ColumnDescriptor} and its corresponding
  * {@link SchemaElement}. Neither alone is enough information, as the max repetition level
  * (indicating whether it is an array type) is in the ColumnDescriptor and the length of a
  * fixed-width field is stored at the schema level.
  *
  * @return the length in bits if fixed width, else -1
  */
 private int getDataTypeLength(ColumnDescriptor column, SchemaElement se) {
   if (column.getType() == PrimitiveType.PrimitiveTypeName.BINARY) {
     return -1; // variable-width field
   }
   if (column.getMaxRepetitionLevel() > 0) {
     return -1; // repeated (array) field, treated as variable width
   }
   if (column.getType() == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
     return se.getType_length() * 8; // type_length is stored in bytes
   }
   return getTypeLengthInBits(column.getType());
 }
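A quick worked reading of the branches above (the numbers are hypothetical):

 // Non-repeated FIXED_LEN_BYTE_ARRAY with se.getType_length() == 12 (bytes): 12 * 8 == 96 bits.
 // BINARY, or any column with maxRepetitionLevel > 0 (an array type): -1, i.e. variable width.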
Example #4
  private void readPageV2(DataPageV2 page) throws IOException {
    this.pageValueCount = page.getValueCount();
    this.repetitionLevelColumn =
        createRLEIterator(
            descriptor.getMaxRepetitionLevel(), page.getRepetitionLevels(), descriptor);

    int bitWidth = BytesUtils.getWidthFromMaxInt(descriptor.getMaxDefinitionLevel());
    this.defColumn = new VectorizedRleValuesReader(bitWidth);
    this.definitionLevelColumn = new ValuesReaderIntIterator(this.defColumn);
    this.defColumn.initFromBuffer(this.pageValueCount, page.getDefinitionLevels().toByteArray());
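    // Unlike V1, a V2 page keeps the repetition and definition levels in separate buffers,
    // which is why the data reader below starts at offset 0 of its own byte array.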
    try {
      initDataReader(page.getDataEncoding(), page.getData().toByteArray(), 0);
    } catch (IOException e) {
      throw new IOException("could not read page " + page + " in col " + descriptor, e);
    }
  }
Example #5
 private PrimitiveConverter getPrimitiveConverter(ColumnDescriptor path) {
   Type currentType = schema;
   Converter currentConverter = recordConverter;
   // Descend the schema and the converter tree in lockstep, one path segment at a time.
   for (String fieldName : path.getPath()) {
     final GroupType groupType = currentType.asGroupType();
     int fieldIndex = groupType.getFieldIndex(fieldName);
     currentType = groupType.getType(fieldName);
     currentConverter = currentConverter.asGroupConverter().getConverter(fieldIndex);
   }
   // A column path always ends at a primitive type, so this cast is safe here.
   return currentConverter.asPrimitiveConverter();
 }
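For context, here is a minimal, self-contained sketch of the same path walk over the parquet-mr schema API (the schema string and field names are made up for illustration; the converter tree is left out):

 import org.apache.parquet.column.ColumnDescriptor;
 import org.apache.parquet.schema.MessageType;
 import org.apache.parquet.schema.MessageTypeParser;
 import org.apache.parquet.schema.Type;

 public class PathWalkSketch {
   public static void main(String[] args) {
     // A made-up two-level schema; its single column "a.b" has the path ["a", "b"].
     MessageType schema = MessageTypeParser.parseMessageType(
         "message m { required group a { required int32 b; } }");
     ColumnDescriptor column = schema.getColumns().get(0);

     // The same descent getPrimitiveConverter performs, but over types only.
     Type current = schema;
     for (String fieldName : column.getPath()) {
       current = current.asGroupType().getType(fieldName); // one level per path segment
     }
     System.out.println(current); // prints the primitive leaf: required int32 b
   }
 }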
Example #6
  public VectorizedColumnReader(ColumnDescriptor descriptor, PageReader pageReader)
      throws IOException {
    this.descriptor = descriptor;
    this.pageReader = pageReader;
    this.maxDefLevel = descriptor.getMaxDefinitionLevel();

    DictionaryPage dictionaryPage = pageReader.readDictionaryPage();
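    // The dictionary page, if present, precedes all data pages in a Parquet column chunk,
    // which is why it can be decoded once up front before any data page is read.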
    if (dictionaryPage != null) {
      try {
        this.dictionary = dictionaryPage.getEncoding().initDictionary(descriptor, dictionaryPage);
        this.isCurrentPageDictionaryEncoded = true;
      } catch (IOException e) {
        throw new IOException("could not decode the dictionary for " + descriptor, e);
      }
    } else {
      this.dictionary = null;
      this.isCurrentPageDictionaryEncoded = false;
    }
    this.totalValueCount = pageReader.getTotalValueCount();
    if (totalValueCount == 0) {
      throw new IOException("totalValueCount == 0");
    }
  }
Example #7
  /** Reads `num` values into column, decoding the values from `dictionaryIds` and `dictionary`. */
  private void decodeDictionaryIds(
      int rowId, int num, ColumnVector column, ColumnVector dictionaryIds) {
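    // Dispatch on the Parquet physical type; within each case, column.dataType() (the Catalyst
    // type) decides how the decoded value is widened, narrowed, or stored.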
    switch (descriptor.getType()) {
      case INT32:
        if (column.dataType() == DataTypes.IntegerType
            || DecimalType.is32BitDecimalType(column.dataType())) {
          for (int i = rowId; i < rowId + num; ++i) {
            column.putInt(i, dictionary.decodeToInt(dictionaryIds.getInt(i)));
          }
        } else if (column.dataType() == DataTypes.ByteType) {
          for (int i = rowId; i < rowId + num; ++i) {
            column.putByte(i, (byte) dictionary.decodeToInt(dictionaryIds.getInt(i)));
          }
        } else if (column.dataType() == DataTypes.ShortType) {
          for (int i = rowId; i < rowId + num; ++i) {
            column.putShort(i, (short) dictionary.decodeToInt(dictionaryIds.getInt(i)));
          }
        } else {
          throw new UnsupportedOperationException("Unimplemented type: " + column.dataType());
        }
        break;

      case INT64:
        if (column.dataType() == DataTypes.LongType
            || DecimalType.is64BitDecimalType(column.dataType())) {
          for (int i = rowId; i < rowId + num; ++i) {
            column.putLong(i, dictionary.decodeToLong(dictionaryIds.getInt(i)));
          }
        } else {
          throw new UnsupportedOperationException("Unimplemented type: " + column.dataType());
        }
        break;

      case FLOAT:
        for (int i = rowId; i < rowId + num; ++i) {
          column.putFloat(i, dictionary.decodeToFloat(dictionaryIds.getInt(i)));
        }
        break;

      case DOUBLE:
        for (int i = rowId; i < rowId + num; ++i) {
          column.putDouble(i, dictionary.decodeToDouble(dictionaryIds.getInt(i)));
        }
        break;
      case INT96:
        if (column.dataType() == DataTypes.TimestampType) {
          for (int i = rowId; i < rowId + num; ++i) {
            // TODO: Convert dictionary of Binaries to dictionary of Longs
            Binary v = dictionary.decodeToBinary(dictionaryIds.getInt(i));
            column.putLong(i, ParquetRowConverter.binaryToSQLTimestamp(v));
          }
        } else {
          throw new UnsupportedOperationException();
        }
        break;
      case BINARY:
        // TODO: this is incredibly inefficient as it blows up the dictionary right here. We
        // need to do this better. We should probably add the dictionary data to the ColumnVector
        // and reuse it across batches. This should mean adding a ByteArray would just update
        // the length and offset.
        for (int i = rowId; i < rowId + num; ++i) {
          Binary v = dictionary.decodeToBinary(dictionaryIds.getInt(i));
          column.putByteArray(i, v.getBytes());
        }
        break;
      case FIXED_LEN_BYTE_ARRAY:
        // DecimalType written in the legacy mode
        if (DecimalType.is32BitDecimalType(column.dataType())) {
          for (int i = rowId; i < rowId + num; ++i) {
            Binary v = dictionary.decodeToBinary(dictionaryIds.getInt(i));
            column.putInt(i, (int) ParquetRowConverter.binaryToUnscaledLong(v));
          }
        } else if (DecimalType.is64BitDecimalType(column.dataType())) {
          for (int i = rowId; i < rowId + num; ++i) {
            Binary v = dictionary.decodeToBinary(dictionaryIds.getInt(i));
            column.putLong(i, ParquetRowConverter.binaryToUnscaledLong(v));
          }
        } else if (DecimalType.isByteArrayDecimalType(column.dataType())) {
          for (int i = rowId; i < rowId + num; ++i) {
            Binary v = dictionary.decodeToBinary(dictionaryIds.getInt(i));
            column.putByteArray(i, v.getBytes());
          }
        } else {
          throw new UnsupportedOperationException();
        }
        break;

      default:
        throw new UnsupportedOperationException("Unsupported type: " + descriptor.getType());
    }
  }
Example #8
  /** Reads `total` values from this columnReader into column. */
  void readBatch(int total, ColumnVector column) throws IOException {
    int rowId = 0;
    ColumnVector dictionaryIds = null;
    if (dictionary != null) {
      // SPARK-16334: We only maintain a single dictionary per row batch, so that it can be used to
      // decode all previous dictionary encoded pages if we ever encounter a non-dictionary encoded
      // page.
      dictionaryIds = column.reserveDictionaryIds(total);
    }
    while (total > 0) {
      // Compute the number of values we want to read in this page.
      int leftInPage = (int) (endOfPageValueCount - valuesRead);
      if (leftInPage == 0) {
        readPage();
        leftInPage = (int) (endOfPageValueCount - valuesRead);
      }
      int num = Math.min(total, leftInPage);
      if (isCurrentPageDictionaryEncoded) {
        // Read and decode dictionary ids.
        defColumn.readIntegers(
            num, dictionaryIds, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
        if (column.hasDictionary()
            || (rowId == 0
                && (descriptor.getType() == PrimitiveType.PrimitiveTypeName.INT32
                    || descriptor.getType() == PrimitiveType.PrimitiveTypeName.INT64
                    || descriptor.getType() == PrimitiveType.PrimitiveTypeName.FLOAT
                    || descriptor.getType() == PrimitiveType.PrimitiveTypeName.DOUBLE
                    || descriptor.getType() == PrimitiveType.PrimitiveTypeName.BINARY))) {
          // Column vector supports lazy decoding of dictionary values so just set the dictionary.
          // We can't do this if rowId != 0 AND the column doesn't have a dictionary (i.e. some
          // non-dictionary encoded values have already been added).
          column.setDictionary(dictionary);
        } else {
          decodeDictionaryIds(rowId, num, column, dictionaryIds);
        }
      } else {
        if (column.hasDictionary() && rowId != 0) {
          // This batch already has dictionary encoded values but this new page is not. The batch
          // does not support a mix of dictionary and not so we will decode the dictionary.
          decodeDictionaryIds(0, rowId, column, column.getDictionaryIds());
        }
        column.setDictionary(null);
        switch (descriptor.getType()) {
          case BOOLEAN:
            readBooleanBatch(rowId, num, column);
            break;
          case INT32:
            readIntBatch(rowId, num, column);
            break;
          case INT64:
            readLongBatch(rowId, num, column);
            break;
          case INT96:
            readBinaryBatch(rowId, num, column);
            break;
          case FLOAT:
            readFloatBatch(rowId, num, column);
            break;
          case DOUBLE:
            readDoubleBatch(rowId, num, column);
            break;
          case BINARY:
            readBinaryBatch(rowId, num, column);
            break;
          case FIXED_LEN_BYTE_ARRAY:
            readFixedLenByteArrayBatch(rowId, num, column, descriptor.getTypeLength());
            break;
          default:
            throw new IOException("Unsupported type: " + descriptor.getType());
        }
      }

      valuesRead += num;
      rowId += num;
      total -= num;
    }
  }
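The loop above is a general page-chunking pattern: decode at most what remains in the current page, pull the next page once it is exhausted, and advance the counters in lockstep. A stripped-down model of that pattern, detached from the Parquet types (all names here are hypothetical):

   // Hypothetical model: `pageSize` stands in for readPage(), and the elided decode step
   // stands in for whichever read*Batch / decodeDictionaryIds call applies.
   class PageChunkingSketch {
     private int valuesLeftInPage = 0;
     private final int pageSize = 1000;

     void readBatch(int total) {
       int rowId = 0;
       while (total > 0) {
         if (valuesLeftInPage == 0) {
           valuesLeftInPage = pageSize;                 // stand-in for readPage()
         }
         int num = Math.min(total, valuesLeftInPage);   // never decode past a page boundary
         // ... decode `num` values into the output column starting at `rowId` ...
         valuesLeftInPage -= num;
         rowId += num;                                  // position in the output column
         total -= num;                                  // values still requested
       }
     }
   }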
  @Override
  public void setup(OperatorContext operatorContext, OutputMutator output)
      throws ExecutionSetupException {
    this.operatorContext = operatorContext;
    if (!isStarQuery()) {
      columnsFound = new boolean[getColumns().size()];
      nullFilledVectors = new ArrayList<>();
    }
    columnStatuses = new ArrayList<>();
    //    totalRecords = footer.getBlocks().get(rowGroupIndex).getRowCount();
    List<ColumnDescriptor> columns = footer.getFileMetaData().getSchema().getColumns();
    allFieldsFixedLength = true;
    ColumnDescriptor column;
    ColumnChunkMetaData columnChunkMetaData;
    int columnsToScan = 0;
    mockRecordsRead = 0;

    MaterializedField field;
    //    ParquetMetadataConverter metaConverter = new ParquetMetadataConverter();
    FileMetaData fileMetaData;

    logger.debug(
        "Reading row group({}) with {} records in file {}.",
        rowGroupIndex,
        footer.getBlocks().get(rowGroupIndex).getRowCount(),
        hadoopPath.toUri().getPath());
    totalRecordsRead = 0;

    // TODO - figure out how to deal with this better once we add nested reading, note also look
    // where this map is used below
    // store a map from column name to converted types if they are non-null
    Map<String, SchemaElement> schemaElements =
        ParquetReaderUtility.getColNameToSchemaElementMapping(footer);

    // loop to add up the length of the fixed width columns and build the schema
    for (int i = 0; i < columns.size(); ++i) {
      column = columns.get(i);
      SchemaElement se = schemaElements.get(column.getPath()[0]);
      MajorType mt =
          ParquetToDrillTypeConverter.toMajorType(
              column.getType(),
              se.getType_length(),
              getDataMode(column),
              se,
              fragmentContext.getOptions());
      field = MaterializedField.create(toFieldName(column.getPath()), mt);
      if (!fieldSelected(field)) {
        continue;
      }
      columnsToScan++;
      int dataTypeLength = getDataTypeLength(column, se);
      if (dataTypeLength == -1) {
        allFieldsFixedLength = false;
      } else {
        bitWidthAllFixedFields += dataTypeLength;
      }
    }
    //    rowGroupOffset = footer.getBlocks().get(rowGroupIndex).getColumns().get(0).getFirstDataPageOffset();

    if (columnsToScan != 0 && allFieldsFixedLength) {
      recordsPerBatch =
          (int)
              Math.min(
                  Math.min(
                      batchSize / bitWidthAllFixedFields,
                      footer.getBlocks().get(0).getColumns().get(0).getValueCount()),
                  65535);
    } else {
      recordsPerBatch = DEFAULT_RECORDS_TO_READ_IF_NOT_FIXED_WIDTH;
    }

    try {
      ValueVector vector;
      SchemaElement schemaElement;
      final ArrayList<VarLengthColumn<? extends ValueVector>> varLengthColumns = new ArrayList<>();
      // initialize all of the column read status objects
      boolean fieldFixedLength;
      // the column chunk meta-data is not guaranteed to be in the same order as the columns in the
      // schema
      // a map is constructed for fast access to the correct columnChunkMetadata to correspond
      // to an element in the schema
      Map<String, Integer> columnChunkMetadataPositionsInList = new HashMap<>();
      BlockMetaData rowGroupMetadata = footer.getBlocks().get(rowGroupIndex);

      int colChunkIndex = 0;
      for (ColumnChunkMetaData colChunk : rowGroupMetadata.getColumns()) {
        columnChunkMetadataPositionsInList.put(
            Arrays.toString(colChunk.getPath().toArray()), colChunkIndex);
        colChunkIndex++;
      }
      for (int i = 0; i < columns.size(); ++i) {
        column = columns.get(i);
        columnChunkMetaData =
            rowGroupMetadata
                .getColumns()
                .get(columnChunkMetadataPositionsInList.get(Arrays.toString(column.getPath())));
        schemaElement = schemaElements.get(column.getPath()[0]);
        MajorType type =
            ParquetToDrillTypeConverter.toMajorType(
                column.getType(),
                schemaElement.getType_length(),
                getDataMode(column),
                schemaElement,
                fragmentContext.getOptions());
        field = MaterializedField.create(toFieldName(column.getPath()), type);
        // the field was not requested to be read
        if (!fieldSelected(field)) {
          continue;
        }

        fieldFixedLength = column.getType() != PrimitiveType.PrimitiveTypeName.BINARY;
        vector =
            output.addField(
                field,
                (Class<? extends ValueVector>)
                    TypeHelper.getValueVectorClass(type.getMinorType(), type.getMode()));
        if (column.getType() != PrimitiveType.PrimitiveTypeName.BINARY) {
          if (column.getMaxRepetitionLevel() > 0) {
            final RepeatedValueVector repeatedVector = RepeatedValueVector.class.cast(vector);
            ColumnReader<?> dataReader =
                ColumnReaderFactory.createFixedColumnReader(
                    this,
                    fieldFixedLength,
                    column,
                    columnChunkMetaData,
                    recordsPerBatch,
                    repeatedVector.getDataVector(),
                    schemaElement);
            varLengthColumns.add(
                new FixedWidthRepeatedReader(
                    this,
                    dataReader,
                    getTypeLengthInBits(column.getType()),
                    -1,
                    column,
                    columnChunkMetaData,
                    false,
                    repeatedVector,
                    schemaElement));
          } else {
            columnStatuses.add(
                ColumnReaderFactory.createFixedColumnReader(
                    this,
                    fieldFixedLength,
                    column,
                    columnChunkMetaData,
                    recordsPerBatch,
                    vector,
                    schemaElement));
          }
        } else {
          // create a reader and add it to the appropriate list
          varLengthColumns.add(
              ColumnReaderFactory.getReader(
                  this, -1, column, columnChunkMetaData, false, vector, schemaElement));
        }
      }
      varLengthReader = new VarLenBinaryReader(this, varLengthColumns);

      if (!isStarQuery()) {
        List<SchemaPath> projectedColumns = Lists.newArrayList(getColumns());
        SchemaPath col;
        for (int i = 0; i < columnsFound.length; i++) {
          col = projectedColumns.get(i);
          assert col != null;
          if (!columnsFound[i] && !col.equals(STAR_COLUMN)) {
            nullFilledVectors.add(
                (NullableIntVector)
                    output.addField(
                        MaterializedField.create(
                            col.getAsUnescapedPath(), Types.optional(TypeProtos.MinorType.INT)),
                        (Class<? extends ValueVector>)
                            TypeHelper.getValueVectorClass(
                                TypeProtos.MinorType.INT, DataMode.OPTIONAL)));
          }
        }
      }
    } catch (Exception e) {
      handleAndRaise("Failure in setting up reader", e);
    }
  }