@Override
public void addBinary(Binary value) {
  // Grow the scratch buffer if the incoming value does not fit, copy the raw
  // bytes in, and point the holder at the populated [0, length) range.
  buf = buf.reallocIfNeeded(value.length());
  holder.buffer = buf;
  buf.setBytes(0, value.toByteBuffer());
  holder.start = 0;
  holder.end = value.length();
  writer.write(holder);
}
@Override
public void addBinary(Binary value) {
  // Decode the unscaled big-endian bytes into a BigDecimal at the holder's
  // scale, then re-encode it into the sparse decimal representation in `buf`.
  final BigDecimal decoded =
      DecimalUtility.getBigDecimalFromByteArray(value.getBytes(), 0, value.length(), holder.scale);
  DecimalUtility.getSparseFromBigDecimal(
      decoded, buf, 0, holder.scale, holder.precision, Decimal38SparseHolder.nDecimalDigits);
  holder.buffer = buf;
  writer.write(holder);
}
@Override
public void addBinary(Binary value) {
  // Parquet INTERVAL values are 12 little-endian bytes laid out as three
  // ints: months, days, milliseconds.
  final byte[] intervalBytes = value.getBytes();
  holder.months = ParquetReaderUtility.getIntFromLEBytes(intervalBytes, 0);
  holder.days = ParquetReaderUtility.getIntFromLEBytes(intervalBytes, 4);
  holder.milliseconds = ParquetReaderUtility.getIntFromLEBytes(intervalBytes, 8);
  writer.write(holder);
}
private void writeValue(Column column, Tuple tuple, int index) { switch (column.getDataType().getType()) { case BOOLEAN: recordConsumer.addBoolean(tuple.getBool(index)); break; case BIT: case INT2: case INT4: recordConsumer.addInteger(tuple.getInt4(index)); break; case INT8: recordConsumer.addLong(tuple.getInt8(index)); break; case FLOAT4: recordConsumer.addFloat(tuple.getFloat4(index)); break; case FLOAT8: recordConsumer.addDouble(tuple.getFloat8(index)); break; case CHAR: if (tuple.size(index) > column.getDataType().getLength()) { throw new ValueTooLongForTypeCharactersException(column.getDataType().getLength()); } recordConsumer.addBinary(Binary.fromByteArray(tuple.getTextBytes(index))); break; case TEXT: recordConsumer.addBinary(Binary.fromByteArray(tuple.getTextBytes(index))); break; case DATE: // Parquet DATE type is based on Unix Epoch(Jan 1, 1970). recordConsumer.addInteger(tuple.getInt4(index) - DateTimeConstants.UNIX_EPOCH_JDATE); break; case PROTOBUF: case BLOB: recordConsumer.addBinary(Binary.fromByteArray(tuple.getBytes(index))); break; default: break; } }
// this method is called by its superclass during a read loop @Override protected void readField(long recordsToReadInThisPass) { recordsReadInThisIteration = Math.min( pageReader.currentPageCount - pageReader.valuesRead, recordsToReadInThisPass - valuesReadInCurrentPass); readLengthInBits = recordsReadInThisIteration * dataTypeLengthInBits; readLength = (int) Math.ceil(readLengthInBits / 8.0); if (usingDictionary) { VarBinaryVector.Mutator mutator = valueVec.getMutator(); Binary currDictValToWrite = null; for (int i = 0; i < recordsReadInThisIteration; i++) { currDictValToWrite = pageReader.dictionaryValueReader.readBytes(); mutator.setSafe( valuesReadInCurrentPass + i, currDictValToWrite.toByteBuffer(), 0, currDictValToWrite.length()); } // Set the write Index. The next page that gets read might be a page that does not use // dictionary encoding // and we will go into the else condition below. The readField method of the parent class // requires the // writer index to be set correctly. int writerIndex = valueVec.getBuffer().writerIndex(); valueVec.getBuffer().setIndex(0, writerIndex + (int) readLength); } else { super.readField(recordsToReadInThisPass); } // TODO - replace this with fixed binary type in drill // now we need to write the lengths of each value int byteLength = dataTypeLengthInBits / 8; for (int i = 0; i < recordsToReadInThisPass; i++) { valueVec.getMutator().setValueLengthSafe(valuesReadInCurrentPass + i, byteLength); } }
/** Reads `num` values into column, decoding the values from `dictionaryIds` and `dictionary`. */
private void decodeDictionaryIds(
    int rowId, int num, ColumnVector column, ColumnVector dictionaryIds) {
  // Each branch decodes the dictionary ids for rows [rowId, rowId + num) into
  // `column`, dispatching on the column's physical Parquet type.
  switch (descriptor.getType()) {
    case INT32:
      // INT32 backs Spark ints, bytes, shorts, and 32-bit decimals; narrow the
      // decoded int where the Spark type requires it.
      if (column.dataType() == DataTypes.IntegerType
          || DecimalType.is32BitDecimalType(column.dataType())) {
        for (int i = rowId; i < rowId + num; ++i) {
          column.putInt(i, dictionary.decodeToInt(dictionaryIds.getInt(i)));
        }
      } else if (column.dataType() == DataTypes.ByteType) {
        for (int i = rowId; i < rowId + num; ++i) {
          column.putByte(i, (byte) dictionary.decodeToInt(dictionaryIds.getInt(i)));
        }
      } else if (column.dataType() == DataTypes.ShortType) {
        for (int i = rowId; i < rowId + num; ++i) {
          column.putShort(i, (short) dictionary.decodeToInt(dictionaryIds.getInt(i)));
        }
      } else {
        throw new UnsupportedOperationException("Unimplemented type: " + column.dataType());
      }
      break;
    case INT64:
      if (column.dataType() == DataTypes.LongType
          || DecimalType.is64BitDecimalType(column.dataType())) {
        for (int i = rowId; i < rowId + num; ++i) {
          column.putLong(i, dictionary.decodeToLong(dictionaryIds.getInt(i)));
        }
      } else {
        throw new UnsupportedOperationException("Unimplemented type: " + column.dataType());
      }
      break;
    case FLOAT:
      for (int i = rowId; i < rowId + num; ++i) {
        column.putFloat(i, dictionary.decodeToFloat(dictionaryIds.getInt(i)));
      }
      break;
    case DOUBLE:
      for (int i = rowId; i < rowId + num; ++i) {
        column.putDouble(i, dictionary.decodeToDouble(dictionaryIds.getInt(i)));
      }
      break;
    case INT96:
      // Legacy 12-byte timestamps; each binary is converted to a SQL
      // timestamp stored as a long.
      if (column.dataType() == DataTypes.TimestampType) {
        for (int i = rowId; i < rowId + num; ++i) {
          // TODO: Convert dictionary of Binaries to dictionary of Longs
          Binary v = dictionary.decodeToBinary(dictionaryIds.getInt(i));
          column.putLong(i, ParquetRowConverter.binaryToSQLTimestamp(v));
        }
      } else {
        throw new UnsupportedOperationException();
      }
      break;
    case BINARY:
      // TODO: this is incredibly inefficient as it blows up the dictionary right here. We
      // need to do this better. We should probably add the dictionary data to the ColumnVector
      // and reuse it across batches. This should mean adding a ByteArray would just update
      // the length and offset.
      for (int i = rowId; i < rowId + num; ++i) {
        Binary v = dictionary.decodeToBinary(dictionaryIds.getInt(i));
        column.putByteArray(i, v.getBytes());
      }
      break;
    case FIXED_LEN_BYTE_ARRAY:
      // DecimalType written in the legacy mode
      // The fixed-width binary holds an unscaled value; store it in the
      // narrowest representation the Spark decimal type allows.
      if (DecimalType.is32BitDecimalType(column.dataType())) {
        for (int i = rowId; i < rowId + num; ++i) {
          Binary v = dictionary.decodeToBinary(dictionaryIds.getInt(i));
          column.putInt(i, (int) ParquetRowConverter.binaryToUnscaledLong(v));
        }
      } else if (DecimalType.is64BitDecimalType(column.dataType())) {
        for (int i = rowId; i < rowId + num; ++i) {
          Binary v = dictionary.decodeToBinary(dictionaryIds.getInt(i));
          column.putLong(i, ParquetRowConverter.binaryToUnscaledLong(v));
        }
      } else if (DecimalType.isByteArrayDecimalType(column.dataType())) {
        for (int i = rowId; i < rowId + num; ++i) {
          Binary v = dictionary.decodeToBinary(dictionaryIds.getInt(i));
          column.putByteArray(i, v.getBytes());
        }
      } else {
        throw new UnsupportedOperationException();
      }
      break;
    default:
      throw new UnsupportedOperationException("Unsupported type: " + descriptor.getType());
  }
}
/**
 * Splits {@code arrayString} on {@code delim} and appends each element to
 * {@code pqGroup} at {@code index}, converting the text according to the
 * Parquet primitive type of {@code field} and the GPDB column type.
 *
 * @throws IOException if the primitive type is not one of the handled cases
 */
private void decodeArrayString(
    int index, Type field, Group pqGroup, String arrayString, char delim) throws IOException {
  // For Parquet we only support one-dimensional arrays.
  // Annotation support: decimal, time, timestamp.
  String[] splits = FormatHandlerUtil.getArraySplits(arrayString.toCharArray(), delim);
  for (String elementString : splits) {
    switch (field.asPrimitiveType().getPrimitiveTypeName()) {
      case BOOLEAN:
        pqGroup.add(index, Boolean.parseBoolean(elementString));
        break;
      case INT32:
        // NOTE(review): semantics of the pattern strings ("yyyy-mm-dd",
        // "mm:hh:ss") depend entirely on FormatHandlerUtil.getTimeDiff;
        // lowercase 'mm' would mean minutes under java.text conventions —
        // confirm these patterns are what getTimeDiff expects.
        if (columnSchemas.get(index).getType() == GPDBWritable.DATE) {
          // Days since the Unix epoch (difference divided by ms-per-day).
          pqGroup.add(
              index,
              (int)
                  FormatHandlerUtil.getTimeDiff(
                      elementString, "1970-01-01", "yyyy-mm-dd", 24 * 60 * 60 * 1000));
        } else if (columnSchemas.get(index).getType() == GPDBWritable.TIME) {
          // Milliseconds since midnight.
          pqGroup.add(
              index,
              (int) FormatHandlerUtil.getTimeDiff(elementString, "00:00:00", "mm:hh:ss", 1));
        } else {
          pqGroup.add(index, Integer.parseInt(elementString));
        }
        break;
      case INT64:
        if (columnSchemas.get(index).getType() == GPDBWritable.TIMESTAMP) {
          // Milliseconds since the Unix epoch.
          pqGroup.add(
              index,
              FormatHandlerUtil.getTimeDiff(
                  elementString, "1970-01-01 00:00:00", "yyyy-mm-dd mm:hh:ss", 1));
        } else {
          pqGroup.add(index, Long.parseLong(elementString));
        }
        break;
      case FLOAT:
        pqGroup.add(index, Float.parseFloat(elementString));
        break;
      case DOUBLE:
        pqGroup.add(index, Double.parseDouble(elementString));
        break;
      case INT96:
      case BINARY:
      case FIXED_LEN_BYTE_ARRAY:
        // Binary-backed types dispatch on the logical (original) type.
        OriginalType type = field.getOriginalType();
        if (type == OriginalType.UTF8 || type == OriginalType.JSON) {
          pqGroup.add(index, elementString);
        } else if (type == OriginalType.DECIMAL) {
          // NOTE(review): uses the platform default charset for the decimal
          // text bytes — confirm this matches the reader's expectation.
          pqGroup.add(index, Binary.fromByteArray(elementString.getBytes()));
        } else if (type == OriginalType.INTERVAL) {
          pqGroup.add(
              index, Binary.fromByteArray(FormatHandlerUtil.getParquetInterval(elementString)));
        } else {
          // No logical annotation: treat the text as an octal-escaped byte
          // string.
          pqGroup.add(
              index,
              Binary.fromByteArray(FormatHandlerUtil.octString2byteArray(elementString).array()));
        }
        break;
      default:
        throw new IOException(
            "internal error, you should not be here, pqtype:"
                + field.asPrimitiveType().getPrimitiveTypeName());
    }
  }
}
/**
 * Copies one field from the incoming {@code GPDBWritable} row into
 * {@code pqGroup} at {@code index}, converting from the GPDB column type
 * ({@code colType}) to the Parquet type described by {@code field}.
 *
 * @throws IOException if {@code colType} is not a supported GPDB type
 */
private void fillElement(int index, int colType, Group pqGroup, GPDBWritable gw, Type field)
    throws IOException {
  switch (colType) {
    case GPDBWritable.BPCHAR:
    case GPDBWritable.CHAR:
    case GPDBWritable.DATE:
    case GPDBWritable.NUMERIC:
    case GPDBWritable.TIME:
    case GPDBWritable.TIMESTAMP:
    case GPDBWritable.VARCHAR:
    case GPDBWritable.TEXT:
      // utf8 or array
      if (field.getRepetition() == Repetition.REPEATED) {
        // Array-valued column: delegate element-by-element decoding.
        decodeArrayString(
            index, field, pqGroup, gw.getString(index), columnSchemas.get(index).getDelim());
      } else {
        // Scalar text-transported value: pick the conversion from the
        // (GPDB type, Parquet physical/logical type) pair.
        int gpdbType = columnSchemas.get(index).getType();
        PrimitiveTypeName priType = field.asPrimitiveType().getPrimitiveTypeName();
        OriginalType originalType = field.getOriginalType();
        if (gpdbType == GPDBWritable.NUMERIC && priType == PrimitiveTypeName.INT32) {
          pqGroup.add(index, Integer.parseInt(gw.getString(index)));
        } else if (gpdbType == GPDBWritable.NUMERIC && priType == PrimitiveTypeName.INT64) {
          pqGroup.add(index, Long.parseLong(gw.getString(index)));
        } else if (gpdbType == GPDBWritable.DATE && priType == PrimitiveTypeName.INT32) {
          // Days since the Unix epoch.
          // NOTE(review): pattern strings like "yyyy-mm-dd" / "mm:hh:ss" use
          // lowercase 'mm' — semantics depend on FormatHandlerUtil.getTimeDiff;
          // confirm they are what that helper expects.
          pqGroup.add(
              index,
              (int)
                  FormatHandlerUtil.getTimeDiff(
                      gw.getString(index), "1970-01-01", "yyyy-mm-dd", 24 * 60 * 60 * 1000));
        } else if (gpdbType == GPDBWritable.TIME && priType == PrimitiveTypeName.INT32) {
          // Milliseconds since midnight.
          pqGroup.add(
              index,
              (int) FormatHandlerUtil.getTimeDiff(gw.getString(index), "00:00:00", "mm:hh:ss", 1));
        } else if (gpdbType == GPDBWritable.TIMESTAMP && priType == PrimitiveTypeName.INT64) {
          // Milliseconds since the Unix epoch.
          pqGroup.add(
              index,
              FormatHandlerUtil.getTimeDiff(
                  gw.getString(index), "1970-01-01 00:00:00", "yyyy-mm-dd mm:hh:ss", 1));
        } else if (gpdbType == GPDBWritable.INTERVAL && originalType == OriginalType.INTERVAL) {
          // interval is complex, we just use string; for now we only support
          // 'postgres'-style intervals, e.g.:
          // 1 year 2 mons -3 days +04:05:06.00901
          byte[] interval = FormatHandlerUtil.getParquetInterval(gw.getString(index));
          pqGroup.add(index, Binary.fromByteArray(interval));
        } else {
          pqGroup.add(index, gw.getString(index));
        }
      }
      break;
    case GPDBWritable.BYTEA:
      pqGroup.add(index, Binary.fromByteArray(gw.getBytes(index)));
      break;
    case GPDBWritable.REAL:
      pqGroup.add(index, gw.getFloat(index));
      break;
    case GPDBWritable.BIGINT:
      pqGroup.add(index, gw.getLong(index));
      break;
    case GPDBWritable.BOOLEAN:
      pqGroup.add(index, gw.getBoolean(index));
      break;
    case GPDBWritable.FLOAT8:
      pqGroup.add(index, gw.getDouble(index));
      break;
    case GPDBWritable.INTEGER:
      pqGroup.add(index, gw.getInt(index));
      break;
    case GPDBWritable.SMALLINT:
      pqGroup.add(index, gw.getShort(index));
      break;
    default:
      throw new IOException("internal error, not support type, typeId:" + colType);
  }
}
@Override
public void addBinary(Binary value) {
  // Copy the value's bytes into the holder's buffer at offset 0, then emit
  // the holder. NOTE(review): no realloc or start/end bookkeeping here —
  // presumably the holder's buffer is pre-sized for this fixed-width type.
  final java.nio.ByteBuffer valueBytes = value.toByteBuffer();
  holder.buffer.setBytes(0, valueBytes);
  writer.write(holder);
}