private void readFixedLenByteArrayBatch(
    int rowId, int num, ColumnVector column, int arrayLen) throws IOException {
  VectorizedValuesReader data = (VectorizedValuesReader) dataColumn;
  // This is where we implement support for the valid type conversions.
  // TODO: implement remaining type conversions
  if (DecimalType.is32BitDecimalType(column.dataType())) {
    for (int i = 0; i < num; i++) {
      if (defColumn.readInteger() == maxDefLevel) {
        column.putInt(rowId + i,
          (int) ParquetRowConverter.binaryToUnscaledLong(data.readBinary(arrayLen)));
      } else {
        column.putNull(rowId + i);
      }
    }
  } else if (DecimalType.is64BitDecimalType(column.dataType())) {
    for (int i = 0; i < num; i++) {
      if (defColumn.readInteger() == maxDefLevel) {
        column.putLong(rowId + i,
          ParquetRowConverter.binaryToUnscaledLong(data.readBinary(arrayLen)));
      } else {
        column.putNull(rowId + i);
      }
    }
  } else if (DecimalType.isByteArrayDecimalType(column.dataType())) {
    for (int i = 0; i < num; i++) {
      if (defColumn.readInteger() == maxDefLevel) {
        column.putByteArray(rowId + i, data.readBinary(arrayLen).getBytes());
      } else {
        column.putNull(rowId + i);
      }
    }
  } else {
    throw new UnsupportedOperationException("Unimplemented type: " + column.dataType());
  }
}
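// Illustrative note for readFixedLenByteArrayBatch above (assumed example, not from this
// file): under Parquet's legacy decimal encoding, a DECIMAL(9, 2) value such as 1234.56 is
// stored as the big-endian two's-complement bytes of its unscaled value 123456;
// ParquetRowConverter.binaryToUnscaledLong reassembles those bytes into the long 123456,
// which is then written with putInt, putLong, or putByteArray depending on the decimal's
// precision. The defColumn.readInteger() == maxDefLevel check reads the value's definition
// level; a value is non-null only when its definition level equals the column's maximum.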
private void readFloatBatch(int rowId, int num, ColumnVector column) throws IOException {
  // This is where we implement support for the valid type conversions.
  // TODO: support implicit cast to double?
  if (column.dataType() == DataTypes.FloatType) {
    defColumn.readFloats(num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
  } else {
    throw new UnsupportedOperationException("Unsupported conversion to: " + column.dataType());
  }
}
private void readDoubleBatch(int rowId, int num, ColumnVector column) throws IOException {
  // This is where we implement support for the valid type conversions.
  // TODO: implement remaining type conversions
  if (column.dataType() == DataTypes.DoubleType) {
    defColumn.readDoubles(num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
  } else {
    throw new UnsupportedOperationException("Unimplemented type: " + column.dataType());
  }
}
private void readLongBatch(int rowId, int num, ColumnVector column) throws IOException {
  // This is where we implement support for the valid type conversions.
  if (column.dataType() == DataTypes.LongType ||
      DecimalType.is64BitDecimalType(column.dataType())) {
    defColumn.readLongs(num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
  } else {
    throw new UnsupportedOperationException("Unsupported conversion to: " + column.dataType());
  }
}
private void readBinaryBatch(int rowId, int num, ColumnVector column) throws IOException {
  // This is where we implement support for the valid type conversions.
  // TODO: implement remaining type conversions
  VectorizedValuesReader data = (VectorizedValuesReader) dataColumn;
  if (column.isArray()) {
    defColumn.readBinarys(num, column, rowId, maxDefLevel, data);
  } else if (column.dataType() == DataTypes.TimestampType) {
    for (int i = 0; i < num; i++) {
      if (defColumn.readInteger() == maxDefLevel) {
        // Read 12 bytes for INT96
        column.putLong(rowId + i,
          ParquetRowConverter.binaryToSQLTimestamp(data.readBinary(12)));
      } else {
        column.putNull(rowId + i);
      }
    }
  } else {
    throw new UnsupportedOperationException("Unimplemented type: " + column.dataType());
  }
}
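// Illustrative note for readBinaryBatch above: an INT96 timestamp is a 12-byte value whose
// first 8 bytes hold the nanoseconds within the day and whose last 4 bytes hold the Julian
// day number; ParquetRowConverter.binaryToSQLTimestamp collapses that pair into a single
// long (microseconds since the Unix epoch), which is why a TimestampType column is filled
// with putLong rather than putByteArray.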
private void readIntBatch(int rowId, int num, ColumnVector column) throws IOException {
  // This is where we implement support for the valid type conversions.
  // TODO: implement remaining type conversions
  if (column.dataType() == DataTypes.IntegerType || column.dataType() == DataTypes.DateType ||
      DecimalType.is32BitDecimalType(column.dataType())) {
    defColumn.readIntegers(
      num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
  } else if (column.dataType() == DataTypes.ByteType) {
    defColumn.readBytes(num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
  } else if (column.dataType() == DataTypes.ShortType) {
    defColumn.readShorts(num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
  } else {
    throw new UnsupportedOperationException("Unimplemented type: " + column.dataType());
  }
}
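// Note for readIntBatch above: Parquet has no 8- or 16-bit physical type, so ByteType and
// ShortType columns arrive as INT32 pages annotated with a logical type; the readBytes and
// readShorts paths narrow each 32-bit value as it is copied. DateType likewise rides on
// INT32 as a count of days since the Unix epoch.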
/**
 * For all the read*Batch functions, reads `num` values from this columnReader into column. It
 * is guaranteed that num is smaller than the number of values left in the current page.
 */
private void readBooleanBatch(int rowId, int num, ColumnVector column) throws IOException {
  assert (column.dataType() == DataTypes.BooleanType);
  defColumn.readBooleans(num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
}
/**
 * Reads `num` values into column, decoding the values from `dictionaryIds` and `dictionary`.
 */
private void decodeDictionaryIds(
    int rowId, int num, ColumnVector column, ColumnVector dictionaryIds) {
  switch (descriptor.getType()) {
    case INT32:
      if (column.dataType() == DataTypes.IntegerType ||
          DecimalType.is32BitDecimalType(column.dataType())) {
        for (int i = rowId; i < rowId + num; ++i) {
          column.putInt(i, dictionary.decodeToInt(dictionaryIds.getInt(i)));
        }
      } else if (column.dataType() == DataTypes.ByteType) {
        for (int i = rowId; i < rowId + num; ++i) {
          column.putByte(i, (byte) dictionary.decodeToInt(dictionaryIds.getInt(i)));
        }
      } else if (column.dataType() == DataTypes.ShortType) {
        for (int i = rowId; i < rowId + num; ++i) {
          column.putShort(i, (short) dictionary.decodeToInt(dictionaryIds.getInt(i)));
        }
      } else {
        throw new UnsupportedOperationException("Unimplemented type: " + column.dataType());
      }
      break;

    case INT64:
      if (column.dataType() == DataTypes.LongType ||
          DecimalType.is64BitDecimalType(column.dataType())) {
        for (int i = rowId; i < rowId + num; ++i) {
          column.putLong(i, dictionary.decodeToLong(dictionaryIds.getInt(i)));
        }
      } else {
        throw new UnsupportedOperationException("Unimplemented type: " + column.dataType());
      }
      break;

    case FLOAT:
      for (int i = rowId; i < rowId + num; ++i) {
        column.putFloat(i, dictionary.decodeToFloat(dictionaryIds.getInt(i)));
      }
      break;

    case DOUBLE:
      for (int i = rowId; i < rowId + num; ++i) {
        column.putDouble(i, dictionary.decodeToDouble(dictionaryIds.getInt(i)));
      }
      break;

    case INT96:
      if (column.dataType() == DataTypes.TimestampType) {
        for (int i = rowId; i < rowId + num; ++i) {
          // TODO: Convert dictionary of Binaries to dictionary of Longs
          Binary v = dictionary.decodeToBinary(dictionaryIds.getInt(i));
          column.putLong(i, ParquetRowConverter.binaryToSQLTimestamp(v));
        }
      } else {
        throw new UnsupportedOperationException();
      }
      break;

    case BINARY:
      // TODO: this is incredibly inefficient as it blows up the dictionary right here. We
      // need to do this better. We should probably add the dictionary data to the ColumnVector
      // and reuse it across batches. This should mean adding a ByteArray would just update
      // the length and offset.
      for (int i = rowId; i < rowId + num; ++i) {
        Binary v = dictionary.decodeToBinary(dictionaryIds.getInt(i));
        column.putByteArray(i, v.getBytes());
      }
      break;

    case FIXED_LEN_BYTE_ARRAY:
      // DecimalType written in the legacy mode
      if (DecimalType.is32BitDecimalType(column.dataType())) {
        for (int i = rowId; i < rowId + num; ++i) {
          Binary v = dictionary.decodeToBinary(dictionaryIds.getInt(i));
          column.putInt(i, (int) ParquetRowConverter.binaryToUnscaledLong(v));
        }
      } else if (DecimalType.is64BitDecimalType(column.dataType())) {
        for (int i = rowId; i < rowId + num; ++i) {
          Binary v = dictionary.decodeToBinary(dictionaryIds.getInt(i));
          column.putLong(i, ParquetRowConverter.binaryToUnscaledLong(v));
        }
      } else if (DecimalType.isByteArrayDecimalType(column.dataType())) {
        for (int i = rowId; i < rowId + num; ++i) {
          Binary v = dictionary.decodeToBinary(dictionaryIds.getInt(i));
          column.putByteArray(i, v.getBytes());
        }
      } else {
        throw new UnsupportedOperationException();
      }
      break;

    default:
      throw new UnsupportedOperationException("Unsupported type: " + descriptor.getType());
  }
}
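// Illustrative example for decodeDictionaryIds above: for a BINARY column whose dictionary
// is ["ab", "cd"] and whose ids vector holds [0, 1, 1, 0], the BINARY branch materializes
// the values "ab", "cd", "cd", "ab" by looking each id up in the dictionary and copying the
// decoded bytes into the column.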
/**
 * Reads `total` values from this columnReader into column.
 */
void readBatch(int total, ColumnVector column) throws IOException {
  int rowId = 0;
  ColumnVector dictionaryIds = null;
  if (dictionary != null) {
    // SPARK-16334: We only maintain a single dictionary per row batch, so that it can be used to
    // decode all previous dictionary encoded pages if we ever encounter a non-dictionary encoded
    // page.
    dictionaryIds = column.reserveDictionaryIds(total);
  }
  while (total > 0) {
    // Compute the number of values we want to read in this page.
    int leftInPage = (int) (endOfPageValueCount - valuesRead);
    if (leftInPage == 0) {
      readPage();
      leftInPage = (int) (endOfPageValueCount - valuesRead);
    }
    int num = Math.min(total, leftInPage);
    if (isCurrentPageDictionaryEncoded) {
      // Read and decode dictionary ids.
      defColumn.readIntegers(
        num, dictionaryIds, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
      if (column.hasDictionary() || (rowId == 0 &&
          (descriptor.getType() == PrimitiveType.PrimitiveTypeName.INT32 ||
          descriptor.getType() == PrimitiveType.PrimitiveTypeName.INT64 ||
          descriptor.getType() == PrimitiveType.PrimitiveTypeName.FLOAT ||
          descriptor.getType() == PrimitiveType.PrimitiveTypeName.DOUBLE ||
          descriptor.getType() == PrimitiveType.PrimitiveTypeName.BINARY))) {
        // Column vector supports lazy decoding of dictionary values so just set the dictionary.
        // We can't do this if rowId != 0 AND the column doesn't have a dictionary (i.e. some
        // non-dictionary encoded values have already been added).
        column.setDictionary(dictionary);
      } else {
        decodeDictionaryIds(rowId, num, column, dictionaryIds);
      }
    } else {
      if (column.hasDictionary() && rowId != 0) {
        // This batch already has dictionary encoded values but this new page is not. The batch
        // does not support a mix of dictionary and not so we will decode the dictionary.
        decodeDictionaryIds(0, rowId, column, column.getDictionaryIds());
      }
      column.setDictionary(null);
      switch (descriptor.getType()) {
        case BOOLEAN:
          readBooleanBatch(rowId, num, column);
          break;
        case INT32:
          readIntBatch(rowId, num, column);
          break;
        case INT64:
          readLongBatch(rowId, num, column);
          break;
        case INT96:
          readBinaryBatch(rowId, num, column);
          break;
        case FLOAT:
          readFloatBatch(rowId, num, column);
          break;
        case DOUBLE:
          readDoubleBatch(rowId, num, column);
          break;
        case BINARY:
          readBinaryBatch(rowId, num, column);
          break;
        case FIXED_LEN_BYTE_ARRAY:
          readFixedLenByteArrayBatch(rowId, num, column, descriptor.getTypeLength());
          break;
        default:
          throw new IOException("Unsupported type: " + descriptor.getType());
      }
    }
    valuesRead += num;
    rowId += num;
    total -= num;
  }
}
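// Usage sketch for readBatch above (illustrative; the caller lives outside this excerpt): a
// batch-oriented driver such as Spark's VectorizedParquetRecordReader typically asks each
// column reader to fill one vector of the output batch per iteration, along the lines of:
//
//   for (int i = 0; i < columnReaders.length; ++i) {
//     columnReaders[i].readBatch(num, columnarBatch.column(i));
//   }
//
// where `columnReaders` and `columnarBatch` are the caller's own state.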