@Override
public void addBinary(Binary value) {
  // Grow the scratch buffer if the incoming value does not fit, copy the raw
  // bytes in, and point the holder at the populated [0, length) range.
  buf = buf.reallocIfNeeded(value.length());
  holder.buffer = buf;
  buf.setBytes(0, value.toByteBuffer());
  holder.start = 0;
  holder.end = value.length();
  writer.write(holder);
}
@Override
public void addBinary(Binary value) {
  // Decode the unscaled big-endian bytes into a BigDecimal at the holder's
  // scale, then re-encode it into the sparse decimal representation in `buf`.
  final BigDecimal decoded =
      DecimalUtility.getBigDecimalFromByteArray(value.getBytes(), 0, value.length(), holder.scale);
  DecimalUtility.getSparseFromBigDecimal(
      decoded, buf, 0, holder.scale, holder.precision, Decimal38SparseHolder.nDecimalDigits);
  holder.buffer = buf;
  writer.write(holder);
}
@Override
public void addBinary(Binary value) {
  // Parquet INTERVAL values are 12 little-endian bytes laid out as three
  // ints: months, days, milliseconds.
  final byte[] intervalBytes = value.getBytes();
  holder.months = ParquetReaderUtility.getIntFromLEBytes(intervalBytes, 0);
  holder.days = ParquetReaderUtility.getIntFromLEBytes(intervalBytes, 4);
  holder.milliseconds = ParquetReaderUtility.getIntFromLEBytes(intervalBytes, 8);
  writer.write(holder);
}
private void writeValue(Column column, Tuple tuple, int index) { switch (column.getDataType().getType()) { case BOOLEAN: recordConsumer.addBoolean(tuple.getBool(index)); break; case BIT: case INT2: case INT4: recordConsumer.addInteger(tuple.getInt4(index)); break; case INT8: recordConsumer.addLong(tuple.getInt8(index)); break; case FLOAT4: recordConsumer.addFloat(tuple.getFloat4(index)); break; case FLOAT8: recordConsumer.addDouble(tuple.getFloat8(index)); break; case CHAR: if (tuple.size(index) > column.getDataType().getLength()) { throw new ValueTooLongForTypeCharactersException(column.getDataType().getLength()); } recordConsumer.addBinary(Binary.fromByteArray(tuple.getTextBytes(index))); break; case TEXT: recordConsumer.addBinary(Binary.fromByteArray(tuple.getTextBytes(index))); break; case DATE: // Parquet DATE type is based on Unix Epoch(Jan 1, 1970). recordConsumer.addInteger(tuple.getInt4(index) - DateTimeConstants.UNIX_EPOCH_JDATE); break; case PROTOBUF: case BLOB: recordConsumer.addBinary(Binary.fromByteArray(tuple.getBytes(index))); break; default: break; } }
// this method is called by its superclass during a read loop @Override protected void readField(long recordsToReadInThisPass) { recordsReadInThisIteration = Math.min( pageReader.currentPageCount - pageReader.valuesRead, recordsToReadInThisPass - valuesReadInCurrentPass); readLengthInBits = recordsReadInThisIteration * dataTypeLengthInBits; readLength = (int) Math.ceil(readLengthInBits / 8.0); if (usingDictionary) { VarBinaryVector.Mutator mutator = valueVec.getMutator(); Binary currDictValToWrite = null; for (int i = 0; i < recordsReadInThisIteration; i++) { currDictValToWrite = pageReader.dictionaryValueReader.readBytes(); mutator.setSafe( valuesReadInCurrentPass + i, currDictValToWrite.toByteBuffer(), 0, currDictValToWrite.length()); } // Set the write Index. The next page that gets read might be a page that does not use // dictionary encoding // and we will go into the else condition below. The readField method of the parent class // requires the // writer index to be set correctly. int writerIndex = valueVec.getBuffer().writerIndex(); valueVec.getBuffer().setIndex(0, writerIndex + (int) readLength); } else { super.readField(recordsToReadInThisPass); } // TODO - replace this with fixed binary type in drill // now we need to write the lengths of each value int byteLength = dataTypeLengthInBits / 8; for (int i = 0; i < recordsToReadInThisPass; i++) { valueVec.getMutator().setValueLengthSafe(valuesReadInCurrentPass + i, byteLength); } }
/** Reads `num` values into column, decoding the values from `dictionaryIds` and `dictionary`. */
private void decodeDictionaryIds(
    int rowId, int num, ColumnVector column, ColumnVector dictionaryIds) {
  // Each branch decodes the dictionary ids for rows [rowId, rowId + num) into
  // `column`, dispatching on the column's physical Parquet type.
  switch (descriptor.getType()) {
    case INT32:
      // INT32 backs Spark ints, bytes, shorts, and 32-bit decimals; narrow the
      // decoded int where the Spark type requires it.
      if (column.dataType() == DataTypes.IntegerType
          || DecimalType.is32BitDecimalType(column.dataType())) {
        for (int i = rowId; i < rowId + num; ++i) {
          column.putInt(i, dictionary.decodeToInt(dictionaryIds.getInt(i)));
        }
      } else if (column.dataType() == DataTypes.ByteType) {
        for (int i = rowId; i < rowId + num; ++i) {
          column.putByte(i, (byte) dictionary.decodeToInt(dictionaryIds.getInt(i)));
        }
      } else if (column.dataType() == DataTypes.ShortType) {
        for (int i = rowId; i < rowId + num; ++i) {
          column.putShort(i, (short) dictionary.decodeToInt(dictionaryIds.getInt(i)));
        }
      } else {
        throw new UnsupportedOperationException("Unimplemented type: " + column.dataType());
      }
      break;
    case INT64:
      if (column.dataType() == DataTypes.LongType
          || DecimalType.is64BitDecimalType(column.dataType())) {
        for (int i = rowId; i < rowId + num; ++i) {
          column.putLong(i, dictionary.decodeToLong(dictionaryIds.getInt(i)));
        }
      } else {
        throw new UnsupportedOperationException("Unimplemented type: " + column.dataType());
      }
      break;
    case FLOAT:
      for (int i = rowId; i < rowId + num; ++i) {
        column.putFloat(i, dictionary.decodeToFloat(dictionaryIds.getInt(i)));
      }
      break;
    case DOUBLE:
      for (int i = rowId; i < rowId + num; ++i) {
        column.putDouble(i, dictionary.decodeToDouble(dictionaryIds.getInt(i)));
      }
      break;
    case INT96:
      // Legacy 12-byte timestamps; each binary is converted to a SQL
      // timestamp stored as a long.
      if (column.dataType() == DataTypes.TimestampType) {
        for (int i = rowId; i < rowId + num; ++i) {
          // TODO: Convert dictionary of Binaries to dictionary of Longs
          Binary v = dictionary.decodeToBinary(dictionaryIds.getInt(i));
          column.putLong(i, ParquetRowConverter.binaryToSQLTimestamp(v));
        }
      } else {
        throw new UnsupportedOperationException();
      }
      break;
    case BINARY:
      // TODO: this is incredibly inefficient as it blows up the dictionary right here. We
      // need to do this better. We should probably add the dictionary data to the ColumnVector
      // and reuse it across batches. This should mean adding a ByteArray would just update
      // the length and offset.
      for (int i = rowId; i < rowId + num; ++i) {
        Binary v = dictionary.decodeToBinary(dictionaryIds.getInt(i));
        column.putByteArray(i, v.getBytes());
      }
      break;
    case FIXED_LEN_BYTE_ARRAY:
      // DecimalType written in the legacy mode
      // The fixed-width binary holds an unscaled value; store it in the
      // narrowest representation the Spark decimal type allows.
      if (DecimalType.is32BitDecimalType(column.dataType())) {
        for (int i = rowId; i < rowId + num; ++i) {
          Binary v = dictionary.decodeToBinary(dictionaryIds.getInt(i));
          column.putInt(i, (int) ParquetRowConverter.binaryToUnscaledLong(v));
        }
      } else if (DecimalType.is64BitDecimalType(column.dataType())) {
        for (int i = rowId; i < rowId + num; ++i) {
          Binary v = dictionary.decodeToBinary(dictionaryIds.getInt(i));
          column.putLong(i, ParquetRowConverter.binaryToUnscaledLong(v));
        }
      } else if (DecimalType.isByteArrayDecimalType(column.dataType())) {
        for (int i = rowId; i < rowId + num; ++i) {
          Binary v = dictionary.decodeToBinary(dictionaryIds.getInt(i));
          column.putByteArray(i, v.getBytes());
        }
      } else {
        throw new UnsupportedOperationException();
      }
      break;
    default:
      throw new UnsupportedOperationException("Unsupported type: " + descriptor.getType());
  }
}
/**
 * Splits {@code arrayString} on {@code delim} and appends each element to
 * {@code pqGroup} at {@code index}, converting the text according to the
 * Parquet primitive type of {@code field} and the GPDB column type.
 *
 * @throws IOException if the primitive type is not one of the handled cases
 */
private void decodeArrayString(
    int index, Type field, Group pqGroup, String arrayString, char delim) throws IOException {
  // For Parquet we only support one-dimensional arrays.
  // Annotation support: decimal, time, timestamp.
  String[] splits = FormatHandlerUtil.getArraySplits(arrayString.toCharArray(), delim);
  for (String elementString : splits) {
    switch (field.asPrimitiveType().getPrimitiveTypeName()) {
      case BOOLEAN:
        pqGroup.add(index, Boolean.parseBoolean(elementString));
        break;
      case INT32:
        // NOTE(review): semantics of the pattern strings ("yyyy-mm-dd",
        // "mm:hh:ss") depend entirely on FormatHandlerUtil.getTimeDiff;
        // lowercase 'mm' would mean minutes under java.text conventions —
        // confirm these patterns are what getTimeDiff expects.
        if (columnSchemas.get(index).getType() == GPDBWritable.DATE) {
          // Days since the Unix epoch (difference divided by ms-per-day).
          pqGroup.add(
              index,
              (int)
                  FormatHandlerUtil.getTimeDiff(
                      elementString, "1970-01-01", "yyyy-mm-dd", 24 * 60 * 60 * 1000));
        } else if (columnSchemas.get(index).getType() == GPDBWritable.TIME) {
          // Milliseconds since midnight.
          pqGroup.add(
              index,
              (int) FormatHandlerUtil.getTimeDiff(elementString, "00:00:00", "mm:hh:ss", 1));
        } else {
          pqGroup.add(index, Integer.parseInt(elementString));
        }
        break;
      case INT64:
        if (columnSchemas.get(index).getType() == GPDBWritable.TIMESTAMP) {
          // Milliseconds since the Unix epoch.
          pqGroup.add(
              index,
              FormatHandlerUtil.getTimeDiff(
                  elementString, "1970-01-01 00:00:00", "yyyy-mm-dd mm:hh:ss", 1));
        } else {
          pqGroup.add(index, Long.parseLong(elementString));
        }
        break;
      case FLOAT:
        pqGroup.add(index, Float.parseFloat(elementString));
        break;
      case DOUBLE:
        pqGroup.add(index, Double.parseDouble(elementString));
        break;
      case INT96:
      case BINARY:
      case FIXED_LEN_BYTE_ARRAY:
        // Binary-backed types dispatch on the logical (original) type.
        OriginalType type = field.getOriginalType();
        if (type == OriginalType.UTF8 || type == OriginalType.JSON) {
          pqGroup.add(index, elementString);
        } else if (type == OriginalType.DECIMAL) {
          // NOTE(review): uses the platform default charset for the decimal
          // text bytes — confirm this matches the reader's expectation.
          pqGroup.add(index, Binary.fromByteArray(elementString.getBytes()));
        } else if (type == OriginalType.INTERVAL) {
          pqGroup.add(
              index, Binary.fromByteArray(FormatHandlerUtil.getParquetInterval(elementString)));
        } else {
          // No logical annotation: treat the text as an octal-escaped byte
          // string.
          pqGroup.add(
              index,
              Binary.fromByteArray(FormatHandlerUtil.octString2byteArray(elementString).array()));
        }
        break;
      default:
        throw new IOException(
            "internal error, you should not be here, pqtype:"
                + field.asPrimitiveType().getPrimitiveTypeName());
    }
  }
}
/**
 * Copies one field from the incoming {@code GPDBWritable} row into
 * {@code pqGroup} at {@code index}, converting from the GPDB column type
 * ({@code colType}) to the Parquet type described by {@code field}.
 *
 * @throws IOException if {@code colType} is not a supported GPDB type
 */
private void fillElement(int index, int colType, Group pqGroup, GPDBWritable gw, Type field)
    throws IOException {
  switch (colType) {
    case GPDBWritable.BPCHAR:
    case GPDBWritable.CHAR:
    case GPDBWritable.DATE:
    case GPDBWritable.NUMERIC:
    case GPDBWritable.TIME:
    case GPDBWritable.TIMESTAMP:
    case GPDBWritable.VARCHAR:
    case GPDBWritable.TEXT:
      // utf8 or array
      if (field.getRepetition() == Repetition.REPEATED) {
        // Array-valued column: delegate element-by-element decoding.
        decodeArrayString(
            index, field, pqGroup, gw.getString(index), columnSchemas.get(index).getDelim());
      } else {
        // Scalar text-transported value: pick the conversion from the
        // (GPDB type, Parquet physical/logical type) pair.
        int gpdbType = columnSchemas.get(index).getType();
        PrimitiveTypeName priType = field.asPrimitiveType().getPrimitiveTypeName();
        OriginalType originalType = field.getOriginalType();
        if (gpdbType == GPDBWritable.NUMERIC && priType == PrimitiveTypeName.INT32) {
          pqGroup.add(index, Integer.parseInt(gw.getString(index)));
        } else if (gpdbType == GPDBWritable.NUMERIC && priType == PrimitiveTypeName.INT64) {
          pqGroup.add(index, Long.parseLong(gw.getString(index)));
        } else if (gpdbType == GPDBWritable.DATE && priType == PrimitiveTypeName.INT32) {
          // Days since the Unix epoch.
          // NOTE(review): pattern strings like "yyyy-mm-dd" / "mm:hh:ss" use
          // lowercase 'mm' — semantics depend on FormatHandlerUtil.getTimeDiff;
          // confirm they are what that helper expects.
          pqGroup.add(
              index,
              (int)
                  FormatHandlerUtil.getTimeDiff(
                      gw.getString(index), "1970-01-01", "yyyy-mm-dd", 24 * 60 * 60 * 1000));
        } else if (gpdbType == GPDBWritable.TIME && priType == PrimitiveTypeName.INT32) {
          // Milliseconds since midnight.
          pqGroup.add(
              index,
              (int) FormatHandlerUtil.getTimeDiff(gw.getString(index), "00:00:00", "mm:hh:ss", 1));
        } else if (gpdbType == GPDBWritable.TIMESTAMP && priType == PrimitiveTypeName.INT64) {
          // Milliseconds since the Unix epoch.
          pqGroup.add(
              index,
              FormatHandlerUtil.getTimeDiff(
                  gw.getString(index), "1970-01-01 00:00:00", "yyyy-mm-dd mm:hh:ss", 1));
        } else if (gpdbType == GPDBWritable.INTERVAL && originalType == OriginalType.INTERVAL) {
          // interval is complex, we just use string; for now we only support
          // 'postgres'-style intervals, e.g.:
          // 1 year 2 mons -3 days +04:05:06.00901
          byte[] interval = FormatHandlerUtil.getParquetInterval(gw.getString(index));
          pqGroup.add(index, Binary.fromByteArray(interval));
        } else {
          pqGroup.add(index, gw.getString(index));
        }
      }
      break;
    case GPDBWritable.BYTEA:
      pqGroup.add(index, Binary.fromByteArray(gw.getBytes(index)));
      break;
    case GPDBWritable.REAL:
      pqGroup.add(index, gw.getFloat(index));
      break;
    case GPDBWritable.BIGINT:
      pqGroup.add(index, gw.getLong(index));
      break;
    case GPDBWritable.BOOLEAN:
      pqGroup.add(index, gw.getBoolean(index));
      break;
    case GPDBWritable.FLOAT8:
      pqGroup.add(index, gw.getDouble(index));
      break;
    case GPDBWritable.INTEGER:
      pqGroup.add(index, gw.getInt(index));
      break;
    case GPDBWritable.SMALLINT:
      pqGroup.add(index, gw.getShort(index));
      break;
    default:
      throw new IOException("internal error, not support type, typeId:" + colType);
  }
}
@Override
public void addBinary(Binary value) {
  // Copy the value's bytes into the holder's buffer at offset 0, then emit
  // the holder. NOTE(review): no realloc or start/end bookkeeping here —
  // presumably the holder's buffer is pre-sized for this fixed-width type.
  final java.nio.ByteBuffer valueBytes = value.toByteBuffer();
  holder.buffer.setBytes(0, valueBytes);
  writer.write(holder);
}