private PqElementType generateElementType(int colType, int notNull, int ndims) {
    PqElementType eType = new PqElementType();

    if (GPDBWritable.isArrayType(colType, ndims)) {
        eType.isArray = true;
        colType = GPDBWritable.getElementTypeFromArrayType(colType);
    }

    switch (colType) {
        case GPDBWritable.BOOLEAN:
            eType.primitiveType = PrimitiveTypeName.BOOLEAN;
            break;
        case GPDBWritable.BYTEA:
            eType.primitiveType = PrimitiveTypeName.BINARY;
            break;
        case GPDBWritable.BIGINT:
            eType.primitiveType = PrimitiveTypeName.INT64;
            break;
        case GPDBWritable.SMALLINT:
        case GPDBWritable.INTEGER:
            eType.primitiveType = PrimitiveTypeName.INT32;
            break;
        case GPDBWritable.REAL:
            eType.primitiveType = PrimitiveTypeName.FLOAT;
            break;
        case GPDBWritable.FLOAT8:
            eType.primitiveType = PrimitiveTypeName.DOUBLE;
            break;
        case GPDBWritable.CHAR:
        case GPDBWritable.VARCHAR:
        case GPDBWritable.BPCHAR:
        case GPDBWritable.NUMERIC:
        case GPDBWritable.DATE:
        case GPDBWritable.TIME:
        case GPDBWritable.TIMESTAMP:
        case GPDBWritable.TEXT:
        default:
            // Everything else (including possible UDTs) is stored as text:
            // BINARY with a DECIMAL annotation for NUMERIC, UTF8 otherwise.
            eType.primitiveType = PrimitiveTypeName.BINARY;
            if (colType == GPDBWritable.NUMERIC) {
                eType.originalType = OriginalType.DECIMAL;
            } else {
                eType.originalType = OriginalType.UTF8;
            }
    }

    return eType;
}
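// Illustrative sketch (not part of the original flow): how a PqElementType
// produced by generateElementType() could be turned into a concrete Parquet
// field when assembling the MessageType. The buildField name is hypothetical;
// it assumes REPEATED repetition for arrays and OPTIONAL otherwise, matching
// the repetition check in fillElement() below.
private Type buildField(String name, PqElementType eType) {
    Repetition rep = eType.isArray ? Repetition.REPEATED : Repetition.OPTIONAL;
    if (eType.originalType == null) {
        return new PrimitiveType(rep, eType.primitiveType, name);
    }
    return new PrimitiveType(rep, eType.primitiveType, name, eType.originalType);
}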
/**
 * Fill a Parquet group from a GPDBWritable row, one column at a time.
 *
 * @throws IOException if a column value cannot be converted
 */
private void fillRecord(Group pqGroup, GPDBWritable gw, MessageType schema) throws IOException {
    int[] colType = gw.getColumnType();
    List<Type> fields = schema.getFields();

    for (int i = 0; i < colType.length; i++) {
        fillElement(i, colType[i], pqGroup, gw, fields.get(i));
    }
}
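// Defensive variant (illustrative only; checkSchemaMatch is not called by the
// original code): fillRecord() assumes a strict positional, one-to-one match
// between GPDB columns and Parquet schema fields, so a mismatch would
// otherwise surface as an index error deep inside fillElement().
private void checkSchemaMatch(GPDBWritable gw, MessageType schema) throws IOException {
    if (gw.getColumnType().length != schema.getFieldCount()) {
        throw new IOException("column count " + gw.getColumnType().length
            + " does not match Parquet schema field count " + schema.getFieldCount());
    }
}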
private void fillElement(int index, int colType, Group pqGroup, GPDBWritable gw, Type field)
        throws IOException {
    switch (colType) {
        case GPDBWritable.BPCHAR:
        case GPDBWritable.CHAR:
        case GPDBWritable.DATE:
        case GPDBWritable.NUMERIC:
        case GPDBWritable.TIME:
        case GPDBWritable.TIMESTAMP:
        case GPDBWritable.VARCHAR:
        case GPDBWritable.TEXT:
            // utf8 or array
            if (field.getRepetition() == Repetition.REPEATED) {
                decodeArrayString(index, field, pqGroup, gw.getString(index),
                    columnSchemas.get(index).getDelim());
            } else {
                int gpdbType = columnSchemas.get(index).getType();
                PrimitiveTypeName priType = field.asPrimitiveType().getPrimitiveTypeName();
                OriginalType originalType = field.getOriginalType();
                if (gpdbType == GPDBWritable.NUMERIC && priType == PrimitiveTypeName.INT32) {
                    pqGroup.add(index, Integer.parseInt(gw.getString(index)));
                } else if (gpdbType == GPDBWritable.NUMERIC && priType == PrimitiveTypeName.INT64) {
                    pqGroup.add(index, Long.parseLong(gw.getString(index)));
                } else if (gpdbType == GPDBWritable.DATE && priType == PrimitiveTypeName.INT32) {
                    // days since the Unix epoch
                    pqGroup.add(index, (int) FormatHandlerUtil.getTimeDiff(
                        gw.getString(index), "1970-01-01", "yyyy-mm-dd", 24 * 60 * 60 * 1000));
                } else if (gpdbType == GPDBWritable.TIME && priType == PrimitiveTypeName.INT32) {
                    // milliseconds since midnight
                    pqGroup.add(index, (int) FormatHandlerUtil.getTimeDiff(
                        gw.getString(index), "00:00:00", "mm:hh:ss", 1));
                } else if (gpdbType == GPDBWritable.TIMESTAMP && priType == PrimitiveTypeName.INT64) {
                    // milliseconds since the Unix epoch
                    pqGroup.add(index, FormatHandlerUtil.getTimeDiff(
                        gw.getString(index), "1970-01-01 00:00:00", "yyyy-mm-dd mm:hh:ss", 1));
                } else if (gpdbType == GPDBWritable.INTERVAL && originalType == OriginalType.INTERVAL) {
                    // Intervals are complex; for now only the 'postgres' output style
                    // is supported, e.g. "1 year 2 mons -3 days +04:05:06.00901".
                    byte[] interval = FormatHandlerUtil.getParquetInterval(gw.getString(index));
                    pqGroup.add(index, Binary.fromByteArray(interval));
                } else {
                    pqGroup.add(index, gw.getString(index));
                }
            }
            break;
        case GPDBWritable.BYTEA:
            pqGroup.add(index, Binary.fromByteArray(gw.getBytes(index)));
            break;
        case GPDBWritable.REAL:
            pqGroup.add(index, gw.getFloat(index));
            break;
        case GPDBWritable.BIGINT:
            pqGroup.add(index, gw.getLong(index));
            break;
        case GPDBWritable.BOOLEAN:
            pqGroup.add(index, gw.getBoolean(index));
            break;
        case GPDBWritable.FLOAT8:
            pqGroup.add(index, gw.getDouble(index));
            break;
        case GPDBWritable.INTEGER:
            pqGroup.add(index, gw.getInt(index));
            break;
        case GPDBWritable.SMALLINT:
            pqGroup.add(index, gw.getShort(index));
            break;
        default:
            throw new IOException("internal error: unsupported type, typeId: " + colType);
    }
}
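// Hedged sketch of the epoch-offset arithmetic getTimeDiff() is assumed to
// perform: parse both strings with the given pattern and divide the
// millisecond difference by the requested unit (e.g. 86400000 for days since
// 1970-01-01). SimpleDateFormat is an assumption here; the real
// FormatHandlerUtil may interpret its pattern argument differently.
private static long timeDiffSketch(String value, String epoch, String pattern, long unitMillis)
        throws java.text.ParseException {
    java.text.SimpleDateFormat fmt = new java.text.SimpleDateFormat(pattern);
    return (fmt.parse(value).getTime() - fmt.parse(epoch).getTime()) / unitMillis;
}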
/**
 * Read GPDBWritable rows from gpdb (via stdin) and write them to HDFS as Parquet.
 *
 * @throws IOException when reading the input stream or writing the Parquet file fails
 */
public void doWrite() throws IOException {
    ParquetWriter<Group> dataFileWriter = null;
    DataInputStream dis = new DataInputStream(System.in);

    try {
        // Auto-generate a Parquet schema from the table structure sent by gpdb;
        // a user-supplied schema file, if any, takes precedence below.
        MessageType schema = autoGenSchema(dis);
        // int total = dis.readInt(); // skip the original 4-byte VARHDSZ
        if (parquetSchemaFile != null) {
            String schemaString = readSchemaFile(parquetSchemaFile);
            schema = MessageTypeParser.parseMessageType(schemaString);
        }
        GroupWriteSupport.setSchema(schema, conf);

        CompressionCodecName codecName = CompressionCodecName.UNCOMPRESSED;
        if (isCompressed) {
            if (compressCodec.equals(LZO_COMPRESS)) {
                codecName = CompressionCodecName.LZO;
            } else if (compressCodec.equals(SNAPPY_COMPRESS)) {
                codecName = CompressionCodecName.SNAPPY;
            } else if (compressCodec.equals(GZIP_COMPRESS)) {
                codecName = CompressionCodecName.GZIP;
            } else {
                throw new IOException("unsupported compression codec: " + compressCodec);
            }
        }

        dataFileWriter = new ParquetWriter<Group>(
            new Path(outputPath),
            new GroupWriteSupport(),
            codecName,
            rowGroupSize,
            pageSize,
            dicPageSize,
            dicEnable,
            false,
            parquetVersion,
            conf);

        SimpleGroupFactory groupFactory = new SimpleGroupFactory(schema);
        while (true) {
            GPDBWritable gw = new GPDBWritable();
            gw.readFields(dis);
            Group pqGroup = groupFactory.newGroup();
            fillRecord(pqGroup, gw, schema);
            dataFileWriter.write(pqGroup);
        }
    } catch (EOFException e) {
        // Expected: reached the end of the input stream.
        // Keep the stack trace handy for testing:
        // e.printStackTrace();
    } finally {
        if (dataFileWriter != null) {
            dataFileWriter.close();
        }
        dis.close();
    }
}
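// Verification sketch (hypothetical; nothing in the original calls this):
// reads the file written by doWrite() back as Groups via the old parquet-mr
// ParquetReader/GroupReadSupport API, matching the ParquetWriter constructor
// style used above. Assumes parquet.hadoop.example.GroupReadSupport and
// parquet.hadoop.ParquetReader are imported.
private void dumpParquetFile(String path) throws IOException {
    ParquetReader<Group> reader =
        new ParquetReader<Group>(new Path(path), new GroupReadSupport());
    try {
        Group g;
        while ((g = reader.read()) != null) {
            System.out.println(g);
        }
    } finally {
        reader.close();
    }
}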