private void readFixedLenByteArrayBatch(
    int rowId, int num, ColumnVector column, int arrayLen) throws IOException {
  VectorizedValuesReader data = (VectorizedValuesReader) dataColumn;
  // This is where we implement support for the valid type conversions.
  // TODO: implement remaining type conversions
  if (DecimalType.is32BitDecimalType(column.dataType())) {
    for (int i = 0; i < num; i++) {
      if (defColumn.readInteger() == maxDefLevel) {
        column.putInt(rowId + i,
          (int) ParquetRowConverter.binaryToUnscaledLong(data.readBinary(arrayLen)));
      } else {
        column.putNull(rowId + i);
      }
    }
  } else if (DecimalType.is64BitDecimalType(column.dataType())) {
    for (int i = 0; i < num; i++) {
      if (defColumn.readInteger() == maxDefLevel) {
        column.putLong(rowId + i,
          ParquetRowConverter.binaryToUnscaledLong(data.readBinary(arrayLen)));
      } else {
        column.putNull(rowId + i);
      }
    }
  } else if (DecimalType.isByteArrayDecimalType(column.dataType())) {
    for (int i = 0; i < num; i++) {
      if (defColumn.readInteger() == maxDefLevel) {
        column.putByteArray(rowId + i, data.readBinary(arrayLen).getBytes());
      } else {
        column.putNull(rowId + i);
      }
    }
  } else {
    throw new UnsupportedOperationException("Unimplemented type: " + column.dataType());
  }
}
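// Illustrative note for readFixedLenByteArrayBatch above (assumed example, not from this
// file): under Parquet's legacy decimal encoding, a DECIMAL(9, 2) value such as 1234.56 is
// stored as the big-endian two's-complement bytes of its unscaled value 123456;
// ParquetRowConverter.binaryToUnscaledLong reassembles those bytes into the long 123456,
// which is then written with putInt, putLong, or putByteArray depending on the decimal's
// precision. The defColumn.readInteger() == maxDefLevel check reads the value's definition
// level; a value is non-null only when its definition level equals the column's maximum.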
private void readFloatBatch(int rowId, int num, ColumnVector column) throws IOException {
  // This is where we implement support for the valid type conversions.
  // TODO: support implicit cast to double?
  if (column.dataType() == DataTypes.FloatType) {
    defColumn.readFloats(num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
  } else {
    throw new UnsupportedOperationException("Unsupported conversion to: " + column.dataType());
  }
}
private void readDoubleBatch(int rowId, int num, ColumnVector column) throws IOException {
  // This is where we implement support for the valid type conversions.
  // TODO: implement remaining type conversions
  if (column.dataType() == DataTypes.DoubleType) {
    defColumn.readDoubles(num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
  } else {
    throw new UnsupportedOperationException("Unimplemented type: " + column.dataType());
  }
}
private void readLongBatch(int rowId, int num, ColumnVector column) throws IOException {
  // This is where we implement support for the valid type conversions.
  if (column.dataType() == DataTypes.LongType ||
      DecimalType.is64BitDecimalType(column.dataType())) {
    defColumn.readLongs(num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
  } else {
    throw new UnsupportedOperationException("Unsupported conversion to: " + column.dataType());
  }
}
private void readBinaryBatch(int rowId, int num, ColumnVector column) throws IOException {
  // This is where we implement support for the valid type conversions.
  // TODO: implement remaining type conversions
  VectorizedValuesReader data = (VectorizedValuesReader) dataColumn;
  if (column.isArray()) {
    defColumn.readBinarys(num, column, rowId, maxDefLevel, data);
  } else if (column.dataType() == DataTypes.TimestampType) {
    for (int i = 0; i < num; i++) {
      if (defColumn.readInteger() == maxDefLevel) {
        // Read 12 bytes for INT96
        column.putLong(rowId + i,
          ParquetRowConverter.binaryToSQLTimestamp(data.readBinary(12)));
      } else {
        column.putNull(rowId + i);
      }
    }
  } else {
    throw new UnsupportedOperationException("Unimplemented type: " + column.dataType());
  }
}
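// Illustrative note for readBinaryBatch above: an INT96 timestamp is a 12-byte value whose
// first 8 bytes hold the nanoseconds within the day and whose last 4 bytes hold the Julian
// day number; ParquetRowConverter.binaryToSQLTimestamp collapses that pair into a single
// long (microseconds since the Unix epoch), which is why a TimestampType column is filled
// with putLong rather than putByteArray.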
private void readIntBatch(int rowId, int num, ColumnVector column) throws IOException {
  // This is where we implement support for the valid type conversions.
  // TODO: implement remaining type conversions
  if (column.dataType() == DataTypes.IntegerType || column.dataType() == DataTypes.DateType ||
      DecimalType.is32BitDecimalType(column.dataType())) {
    defColumn.readIntegers(
      num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
  } else if (column.dataType() == DataTypes.ByteType) {
    defColumn.readBytes(num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
  } else if (column.dataType() == DataTypes.ShortType) {
    defColumn.readShorts(num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
  } else {
    throw new UnsupportedOperationException("Unimplemented type: " + column.dataType());
  }
}
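// Note for readIntBatch above: Parquet has no 8- or 16-bit physical type, so ByteType and
// ShortType columns arrive as INT32 pages annotated with a logical type; the readBytes and
// readShorts paths narrow each 32-bit value as it is copied. DateType likewise rides on
// INT32 as a count of days since the Unix epoch.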
/**
 * For all the read*Batch functions, reads `num` values from this columnReader into column. It
 * is guaranteed that num is smaller than the number of values left in the current page.
 */
private void readBooleanBatch(int rowId, int num, ColumnVector column) throws IOException {
  assert (column.dataType() == DataTypes.BooleanType);
  defColumn.readBooleans(num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
}
/**
 * Reads `num` values into column, decoding the values from `dictionaryIds` and `dictionary`.
 */
private void decodeDictionaryIds(
    int rowId, int num, ColumnVector column, ColumnVector dictionaryIds) {
  switch (descriptor.getType()) {
    case INT32:
      if (column.dataType() == DataTypes.IntegerType ||
          DecimalType.is32BitDecimalType(column.dataType())) {
        for (int i = rowId; i < rowId + num; ++i) {
          column.putInt(i, dictionary.decodeToInt(dictionaryIds.getInt(i)));
        }
      } else if (column.dataType() == DataTypes.ByteType) {
        for (int i = rowId; i < rowId + num; ++i) {
          column.putByte(i, (byte) dictionary.decodeToInt(dictionaryIds.getInt(i)));
        }
      } else if (column.dataType() == DataTypes.ShortType) {
        for (int i = rowId; i < rowId + num; ++i) {
          column.putShort(i, (short) dictionary.decodeToInt(dictionaryIds.getInt(i)));
        }
      } else {
        throw new UnsupportedOperationException("Unimplemented type: " + column.dataType());
      }
      break;

    case INT64:
      if (column.dataType() == DataTypes.LongType ||
          DecimalType.is64BitDecimalType(column.dataType())) {
        for (int i = rowId; i < rowId + num; ++i) {
          column.putLong(i, dictionary.decodeToLong(dictionaryIds.getInt(i)));
        }
      } else {
        throw new UnsupportedOperationException("Unimplemented type: " + column.dataType());
      }
      break;

    case FLOAT:
      for (int i = rowId; i < rowId + num; ++i) {
        column.putFloat(i, dictionary.decodeToFloat(dictionaryIds.getInt(i)));
      }
      break;

    case DOUBLE:
      for (int i = rowId; i < rowId + num; ++i) {
        column.putDouble(i, dictionary.decodeToDouble(dictionaryIds.getInt(i)));
      }
      break;

    case INT96:
      if (column.dataType() == DataTypes.TimestampType) {
        for (int i = rowId; i < rowId + num; ++i) {
          // TODO: Convert dictionary of Binaries to dictionary of Longs
          Binary v = dictionary.decodeToBinary(dictionaryIds.getInt(i));
          column.putLong(i, ParquetRowConverter.binaryToSQLTimestamp(v));
        }
      } else {
        throw new UnsupportedOperationException();
      }
      break;

    case BINARY:
      // TODO: this is incredibly inefficient as it blows up the dictionary right here. We
      // need to do this better. We should probably add the dictionary data to the ColumnVector
      // and reuse it across batches. This should mean adding a ByteArray would just update
      // the length and offset.
      for (int i = rowId; i < rowId + num; ++i) {
        Binary v = dictionary.decodeToBinary(dictionaryIds.getInt(i));
        column.putByteArray(i, v.getBytes());
      }
      break;

    case FIXED_LEN_BYTE_ARRAY:
      // DecimalType written in the legacy mode
      if (DecimalType.is32BitDecimalType(column.dataType())) {
        for (int i = rowId; i < rowId + num; ++i) {
          Binary v = dictionary.decodeToBinary(dictionaryIds.getInt(i));
          column.putInt(i, (int) ParquetRowConverter.binaryToUnscaledLong(v));
        }
      } else if (DecimalType.is64BitDecimalType(column.dataType())) {
        for (int i = rowId; i < rowId + num; ++i) {
          Binary v = dictionary.decodeToBinary(dictionaryIds.getInt(i));
          column.putLong(i, ParquetRowConverter.binaryToUnscaledLong(v));
        }
      } else if (DecimalType.isByteArrayDecimalType(column.dataType())) {
        for (int i = rowId; i < rowId + num; ++i) {
          Binary v = dictionary.decodeToBinary(dictionaryIds.getInt(i));
          column.putByteArray(i, v.getBytes());
        }
      } else {
        throw new UnsupportedOperationException();
      }
      break;

    default:
      throw new UnsupportedOperationException("Unsupported type: " + descriptor.getType());
  }
}
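// Illustrative example for decodeDictionaryIds above: for a BINARY column whose dictionary
// is ["ab", "cd"] and whose ids vector holds [0, 1, 1, 0], the BINARY branch materializes
// the values "ab", "cd", "cd", "ab" by looking each id up in the dictionary and copying the
// decoded bytes into the column.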
/**
 * Reads `total` values from this columnReader into column.
 */
void readBatch(int total, ColumnVector column) throws IOException {
  int rowId = 0;
  ColumnVector dictionaryIds = null;
  if (dictionary != null) {
    // SPARK-16334: We only maintain a single dictionary per row batch, so that it can be used to
    // decode all previous dictionary encoded pages if we ever encounter a non-dictionary encoded
    // page.
    dictionaryIds = column.reserveDictionaryIds(total);
  }
  while (total > 0) {
    // Compute the number of values we want to read in this page.
    int leftInPage = (int) (endOfPageValueCount - valuesRead);
    if (leftInPage == 0) {
      readPage();
      leftInPage = (int) (endOfPageValueCount - valuesRead);
    }
    int num = Math.min(total, leftInPage);
    if (isCurrentPageDictionaryEncoded) {
      // Read and decode dictionary ids.
      defColumn.readIntegers(
        num, dictionaryIds, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
      if (column.hasDictionary() || (rowId == 0 &&
          (descriptor.getType() == PrimitiveType.PrimitiveTypeName.INT32 ||
          descriptor.getType() == PrimitiveType.PrimitiveTypeName.INT64 ||
          descriptor.getType() == PrimitiveType.PrimitiveTypeName.FLOAT ||
          descriptor.getType() == PrimitiveType.PrimitiveTypeName.DOUBLE ||
          descriptor.getType() == PrimitiveType.PrimitiveTypeName.BINARY))) {
        // Column vector supports lazy decoding of dictionary values so just set the dictionary.
        // We can't do this if rowId != 0 AND the column doesn't have a dictionary (i.e. some
        // non-dictionary encoded values have already been added).
        column.setDictionary(dictionary);
      } else {
        decodeDictionaryIds(rowId, num, column, dictionaryIds);
      }
    } else {
      if (column.hasDictionary() && rowId != 0) {
        // This batch already has dictionary encoded values but this new page is not. The batch
        // does not support a mix of dictionary and not so we will decode the dictionary.
        decodeDictionaryIds(0, rowId, column, column.getDictionaryIds());
      }
      column.setDictionary(null);
      switch (descriptor.getType()) {
        case BOOLEAN:
          readBooleanBatch(rowId, num, column);
          break;
        case INT32:
          readIntBatch(rowId, num, column);
          break;
        case INT64:
          readLongBatch(rowId, num, column);
          break;
        case INT96:
          readBinaryBatch(rowId, num, column);
          break;
        case FLOAT:
          readFloatBatch(rowId, num, column);
          break;
        case DOUBLE:
          readDoubleBatch(rowId, num, column);
          break;
        case BINARY:
          readBinaryBatch(rowId, num, column);
          break;
        case FIXED_LEN_BYTE_ARRAY:
          readFixedLenByteArrayBatch(rowId, num, column, descriptor.getTypeLength());
          break;
        default:
          throw new IOException("Unsupported type: " + descriptor.getType());
      }
    }
    valuesRead += num;
    rowId += num;
    total -= num;
  }
}
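// Usage sketch for readBatch above (illustrative; the caller lives outside this excerpt): a
// batch-oriented driver such as Spark's VectorizedParquetRecordReader typically asks each
// column reader to fill one vector of the output batch per iteration, along the lines of:
//
//   for (int i = 0; i < columnReaders.length; ++i) {
//     columnReaders[i].readBatch(num, columnarBatch.column(i));
//   }
//
// where `columnReaders` and `columnarBatch` are the caller's own state.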