Example #1
  private PqElementType generateElementType(int colType, int notNull, int ndims) {
    PqElementType eType = new PqElementType();

    if (GPDBWritable.isArrayType(colType, ndims)) {
      eType.isArray = true;
      colType = GPDBWritable.getElementTypeFromArrayType(colType);
    }

    switch (colType) {
      case GPDBWritable.BOOLEAN:
        eType.primitiveType = PrimitiveTypeName.BOOLEAN;
        break;

      case GPDBWritable.BYTEA:
        eType.primitiveType = PrimitiveTypeName.BINARY;
        break;

      case GPDBWritable.BIGINT:
        eType.primitiveType = PrimitiveTypeName.INT64;
        break;

      case GPDBWritable.SMALLINT:
      case GPDBWritable.INTEGER:
        eType.primitiveType = PrimitiveTypeName.INT32;
        break;

      case GPDBWritable.REAL:
        eType.primitiveType = PrimitiveTypeName.FLOAT;
        break;

      case GPDBWritable.FLOAT8:
        eType.primitiveType = PrimitiveTypeName.DOUBLE;
        break;

      case GPDBWritable.CHAR:
      case GPDBWritable.VARCHAR:
      case GPDBWritable.BPCHAR:
      case GPDBWritable.NUMERIC:
      case GPDBWritable.DATE:
      case GPDBWritable.TIME:
      case GPDBWritable.TIMESTAMP:
      case GPDBWritable.TEXT:
      default:
        // all remaining types (possibly UDTs) are treated as text
        eType.primitiveType = PrimitiveTypeName.BINARY;

        if (colType == GPDBWritable.NUMERIC) {
          eType.originalType = OriginalType.DECIMAL;
        } else {
          eType.originalType = OriginalType.UTF8;
        }
    }

    return eType;
  }
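
For context, a minimal sketch of how such an element descriptor might be turned into a Parquet schema field. The PqElementType stand-in and the helper below are reconstructed for illustration only (they are not part of the original source), and the builder calls use the org.apache.parquet.schema.Types API; older parquet-mr releases used the parquet.* package prefix instead.

import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type;
import org.apache.parquet.schema.Types;

// Minimal stand-in for the PqElementType used above (assumed fields only).
class PqElementType {
  boolean isArray;
  PrimitiveTypeName primitiveType;
  OriginalType originalType;
}

class PqSchemaHelper {
  // Arrays map to REPEATED fields, scalars to OPTIONAL fields; the original
  // type (UTF8, DECIMAL, ...) is attached as an annotation.
  static Type toParquetField(PqElementType e, String name) {
    Types.PrimitiveBuilder<PrimitiveType> builder =
        e.isArray ? Types.repeated(e.primitiveType) : Types.optional(e.primitiveType);
    if (e.originalType == OriginalType.DECIMAL) {
      // DECIMAL requires precision/scale; these values are placeholders.
      builder = builder.as(OriginalType.DECIMAL).precision(38).scale(18);
    } else if (e.originalType != null) {
      builder = builder.as(e.originalType);
    }
    return builder.named(name);
  }
}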
Example #2
  /**
   * Fills a Parquet group with the column values of one GPDBWritable row.
   *
   * @throws IOException if a column has an unsupported type
   */
  private void fillRecord(Group pqGroup, GPDBWritable gw, MessageType schema) throws IOException {
    int[] colType = gw.getColumnType();
    List<Type> fields = schema.getFields();

    for (int i = 0; i < colType.length; i++) {
      fillElement(i, colType[i], pqGroup, gw, fields.get(i));
    }
  }
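
As a quick illustration of the Group API that fillRecord and fillElement target, a self-contained sketch (the schema and values are invented for the demo):

import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class GroupFillDemo {
  public static void main(String[] args) {
    MessageType schema = MessageTypeParser.parseMessageType(
        "message row { optional int32 id; optional binary name (UTF8); }");
    Group g = new SimpleGroupFactory(schema).newGroup();
    // fillElement above uses the positional add(int, value) overloads;
    // the name-based overloads here resolve to the same fields.
    g.add("id", 42);
    g.add("name", "alice");
    System.out.println(g);
  }
}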
Example #3
  private void fillElement(int index, int colType, Group pqGroup, GPDBWritable gw, Type field)
      throws IOException {
    switch (colType) {
      case GPDBWritable.BPCHAR:
      case GPDBWritable.CHAR:
      case GPDBWritable.DATE:
      case GPDBWritable.NUMERIC:
      case GPDBWritable.TIME:
      case GPDBWritable.TIMESTAMP:
      case GPDBWritable.VARCHAR:
      case GPDBWritable.TEXT:
        // either a UTF8 scalar or an array (REPEATED) field
        if (field.getRepetition() == Repetition.REPEATED) {
          decodeArrayString(
              index, field, pqGroup, gw.getString(index), columnSchemas.get(index).getDelim());
        } else {
          int gpdbType = columnSchemas.get(index).getType();
          PrimitiveTypeName priType = field.asPrimitiveType().getPrimitiveTypeName();
          OriginalType originalType = field.getOriginalType();

          if (gpdbType == GPDBWritable.NUMERIC && priType == PrimitiveTypeName.INT32) {
            pqGroup.add(index, Integer.parseInt(gw.getString(index)));
          } else if (gpdbType == GPDBWritable.NUMERIC && priType == PrimitiveTypeName.INT64) {
            pqGroup.add(index, Long.parseLong(gw.getString(index)));
          } else if (gpdbType == GPDBWritable.DATE && priType == PrimitiveTypeName.INT32) {
            pqGroup.add(
                index,
                (int)
                    FormatHandlerUtil.getTimeDiff(
                        gw.getString(index), "1970-01-01", "yyyy-mm-dd", 24 * 60 * 60 * 1000));
          } else if (gpdbType == GPDBWritable.TIME && priType == PrimitiveTypeName.INT32) {
            pqGroup.add(
                index,
                (int)
                    FormatHandlerUtil.getTimeDiff(gw.getString(index), "00:00:00", "mm:hh:ss", 1));
          } else if (gpdbType == GPDBWritable.TIMESTAMP && priType == PrimitiveTypeName.INT64) {
            pqGroup.add(
                index,
                FormatHandlerUtil.getTimeDiff(
                    gw.getString(index), "1970-01-01 00:00:00", "yyyy-mm-dd mm:hh:ss", 1));
          } else if (gpdbType == GPDBWritable.INTERVAL && originalType == OriginalType.INTERVAL) {
            // Intervals are complex, so we store them as strings; for now we only
            // support the 'postgres' interval style, e.g.
            // "1 year 2 mons -3 days +04:05:06.00901".
            byte[] interval = FormatHandlerUtil.getParquetInterval(gw.getString(index));
            pqGroup.add(index, Binary.fromByteArray(interval));
          } else {
            pqGroup.add(index, gw.getString(index));
          }
        }
        break;

      case GPDBWritable.BYTEA:
        pqGroup.add(index, Binary.fromByteArray(gw.getBytes(index)));
        break;

      case GPDBWritable.REAL:
        pqGroup.add(index, gw.getFloat(index));
        break;

      case GPDBWritable.BIGINT:
        pqGroup.add(index, gw.getLong(index));
        break;

      case GPDBWritable.BOOLEAN:
        pqGroup.add(index, gw.getBoolean(index));
        break;

      case GPDBWritable.FLOAT8:
        pqGroup.add(index, gw.getDouble(index));
        break;

      case GPDBWritable.INTEGER:
        pqGroup.add(index, gw.getInt(index));
        break;

      case GPDBWritable.SMALLINT:
        pqGroup.add(index, gw.getShort(index));
        break;

      default:
        throw new IOException("internal error, unsupported type, typeId: " + colType);
    }
  }
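
Note that the DATE branch above stores the value as an INT32 count of days since 1970-01-01, which matches Parquet's DATE convention, with getTimeDiff presumably returning the difference between the two values in units of the last argument (here, one day in milliseconds). A java.time equivalent of that computation, under that assumption:

import java.time.LocalDate;
import java.time.temporal.ChronoUnit;

public class EpochDaysDemo {
  public static void main(String[] args) {
    // Equivalent of getTimeDiff(value, "1970-01-01", ..., 24 * 60 * 60 * 1000):
    // the number of whole days between the epoch and the column value.
    long days = ChronoUnit.DAYS.between(
        LocalDate.parse("1970-01-01"), LocalDate.parse("2000-01-02"));
    System.out.println(days); // prints 10958
  }
}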
Example #4
  /**
   * Reads GPDBWritable rows from GPDB and writes them to HDFS as Parquet.
   *
   * @throws IOException when reading the input stream or writing the file fails
   */
  public void doWrite() throws IOException {
    // if the user did not provide a schema file, the auto-generated schema below is used
    ParquetWriter<Group> dataFileWriter = null;
    DataInputStream dis = new DataInputStream(System.in);
    try {
      MessageType schema = null;
      SimpleGroupFactory groupFactory = null;

      // read the table structure info and auto-generate a Parquet schema
      schema = autoGenSchema(dis);
      // int total = dis.readInt(); // skip the original 4-byte VARHDRSZ

      if (parquetSchemaFile != null) {
        // if the user supplied a schema file, read the schema from it
        String schemaString = readSchemaFile(parquetSchemaFile);
        schema = MessageTypeParser.parseMessageType(schemaString);
      }

      GroupWriteSupport.setSchema(schema, conf);

      CompressionCodecName codecName = CompressionCodecName.UNCOMPRESSED;
      if (isCompressed) {
        if (compressCodec.equals(LZO_COMPRESS)) {
          codecName = CompressionCodecName.LZO;
        } else if (compressCodec.equals(SNAPPY_COMPRESS)) {
          codecName = CompressionCodecName.SNAPPY;
        } else if (compressCodec.equals(GZIP_COMPRESS)) {
          codecName = CompressionCodecName.GZIP;
        } else {
          throw new IOException("compression method not supported, codec: " + compressCodec);
        }
      }

      dataFileWriter =
          new ParquetWriter<Group>(
              new Path(outputPath),
              new GroupWriteSupport(),
              codecName,
              rowGroupSize,
              pageSize,
              dicPageSize,
              dicEnable,
              false,
              parquetVersion,
              conf);

      groupFactory = new SimpleGroupFactory(schema);

      while (true) {
        GPDBWritable gw = new GPDBWritable();
        gw.readFields(dis);

        Group pqGroup = groupFactory.newGroup();

        fillRecord(pqGroup, gw, schema);

        dataFileWriter.write(pqGroup);
      }
    } catch (EOFException e) {
      // expected: we reached the end of the input stream; keep the call below for debugging
      // e.printStackTrace();
    } finally {
      if (dataFileWriter != null) {
        dataFileWriter.close();
      }

      dis.close();
    }
  }
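
For reference, a self-contained sketch of the same ParquetWriter setup with the arguments spelled out. The output path and schema are invented for the demo, and the ten-argument constructor mirrors the (since-deprecated) one used above; package names assume org.apache.parquet, whereas older parquet-mr used the parquet.* prefix.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.GroupWriteSupport;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class MinimalParquetWrite {
  public static void main(String[] args) throws Exception {
    MessageType schema = MessageTypeParser.parseMessageType(
        "message row { optional int64 id; }");
    Configuration conf = new Configuration();
    GroupWriteSupport.setSchema(schema, conf);

    ParquetWriter<Group> writer = new ParquetWriter<Group>(
        new Path("/tmp/minimal.parquet"),    // hypothetical output path
        new GroupWriteSupport(),
        CompressionCodecName.UNCOMPRESSED,
        ParquetWriter.DEFAULT_BLOCK_SIZE,    // row group size
        ParquetWriter.DEFAULT_PAGE_SIZE,
        ParquetWriter.DEFAULT_PAGE_SIZE,     // dictionary page size
        true,                                // enable dictionary encoding
        false,                               // disable write-time validation
        ParquetProperties.WriterVersion.PARQUET_1_0,
        conf);
    try {
      Group g = new SimpleGroupFactory(schema).newGroup();
      g.add("id", 1L);
      writer.write(g);
    } finally {
      writer.close();
    }
  }
}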