/**
 * Writes out whatever is still buffered for this column: the pending page (if any values were
 * written since the last page) followed by the dictionary page, when the data writer produces
 * one.
 *
 * @throws ParquetEncodingException if the dictionary page cannot be written
 */
public void flush() {
  // Emit the partially filled page first so no buffered values are lost.
  if (valueCount > 0) {
    writePage();
  }

  final DictionaryPage dictionaryPage = dataColumn.toDictPageAndClose();
  if (dictionaryPage == null) {
    // The data writer produced no dictionary page: nothing more to write.
    return;
  }

  if (DEBUG) {
    LOG.debug("write dictionary");
  }
  try {
    pageWriter.writeDictionaryPage(dictionaryPage);
  } catch (IOException e) {
    throw new ParquetEncodingException("could not write dictionary page for " + path, e);
  }
  dataColumn.resetDictionary();
}
public static void writeIntLittleEndian(OutputStream out, int v) throws IOException { // TODO: this is duplicated code in LittleEndianDataOutputStream out.write((v >>> 0) & 0xFF); out.write((v >>> 8) & 0xFF); out.write((v >>> 16) & 0xFF); out.write((v >>> 24) & 0xFF); if (Log.DEBUG) LOG.debug( "write le int: " + v + " => " + ((v >>> 0) & 0xFF) + " " + ((v >>> 8) & 0xFF) + " " + ((v >>> 16) & 0xFF) + " " + ((v >>> 24) & 0xFF)); }
/**
 * Hands the currently buffered repetition levels, definition levels and values to the page
 * writer as one page, then clears all per-page state (buffers, value count, statistics).
 *
 * @throws ParquetEncodingException if the page writer fails with an {@link IOException}
 */
private void writePage() {
  if (DEBUG) {
    LOG.debug("write page");
  }

  try {
    // The page payload is the three buffers back to back: repetition, definition, data.
    pageWriter.writePage(
        concat(
            repetitionLevelColumn.getBytes(),
            definitionLevelColumn.getBytes(),
            dataColumn.getBytes()),
        valueCount,
        statistics,
        repetitionLevelColumn.getEncoding(),
        definitionLevelColumn.getEncoding(),
        dataColumn.getEncoding());
  } catch (IOException e) {
    throw new ParquetEncodingException("could not write page for " + path, e);
  }

  // Start the next page from a clean slate.
  repetitionLevelColumn.reset();
  definitionLevelColumn.reset();
  dataColumn.reset();
  valueCount = 0;
  resetStatistics();
}
/**
 * Emits a debug line describing one written triplet for this column.
 *
 * @param value the value being written (may be {@code null})
 * @param r the repetition level
 * @param d the definition level
 */
private void log(Object value, int r, int d) {
  final String message = path + " " + value + " r:" + r + " d:" + d;
  LOG.debug(message);
}
/** * Writes (repetition level, definition level, value) triplets and deals with writing pages to the * underlying layer. * * @author Julien Le Dem */ final class ColumnWriterV1 implements ColumnWriter { private static final Log LOG = Log.getLog(ColumnWriterV1.class); private static final boolean DEBUG = Log.DEBUG; private final ColumnDescriptor path; private final PageWriter pageWriter; private final ParquetProperties props; private ValuesWriter repetitionLevelColumn; private ValuesWriter definitionLevelColumn; private ValuesWriter dataColumn; private int valueCount; private int valueCountForNextSizeCheck; private Statistics statistics; public ColumnWriterV1(ColumnDescriptor path, PageWriter pageWriter, ParquetProperties props) { this.path = path; this.pageWriter = pageWriter; this.props = props; // initial check of memory usage. So that we have enough data to make an initial prediction this.valueCountForNextSizeCheck = props.getMinRowCountForPageSizeCheck(); resetStatistics(); this.repetitionLevelColumn = props.newRepetitionLevelWriter(path); this.definitionLevelColumn = props.newDefinitionLevelWriter(path); this.dataColumn = props.newValuesWriter(path); } private void log(Object value, int r, int d) { LOG.debug(path + " " + value + " r:" + r + " d:" + d); } private void resetStatistics() { this.statistics = Statistics.getStatsBasedOnType(this.path.getType()); } /** * Counts how many values have been written and checks the memory usage to flush the page when we * reach the page threshold. * * <p>We measure the memory used when we reach the mid point toward our estimated count. We then * update the estimate and flush the page if we reached the threshold. * * <p>That way we check the memory size log2(n) times. 
*/ private void accountForValueWritten() { ++valueCount; if (valueCount > valueCountForNextSizeCheck) { // not checking the memory used for every value long memSize = repetitionLevelColumn.getBufferedSize() + definitionLevelColumn.getBufferedSize() + dataColumn.getBufferedSize(); if (memSize > props.getPageSizeThreshold()) { // we will write the current page and check again the size at the predicted middle of next // page if (props.estimateNextSizeCheck()) { valueCountForNextSizeCheck = valueCount / 2; } else { valueCountForNextSizeCheck = props.getMinRowCountForPageSizeCheck(); } writePage(); } else if (props.estimateNextSizeCheck()) { // not reached the threshold, will check again midway valueCountForNextSizeCheck = (int) (valueCount + ((float) valueCount * props.getPageSizeThreshold() / memSize)) / 2 + 1; } else { valueCountForNextSizeCheck += props.getMinRowCountForPageSizeCheck(); } } } private void updateStatisticsNumNulls() { statistics.incrementNumNulls(); } private void updateStatistics(int value) { statistics.updateStats(value); } private void updateStatistics(long value) { statistics.updateStats(value); } private void updateStatistics(float value) { statistics.updateStats(value); } private void updateStatistics(double value) { statistics.updateStats(value); } private void updateStatistics(Binary value) { statistics.updateStats(value); } private void updateStatistics(boolean value) { statistics.updateStats(value); } private void writePage() { if (DEBUG) LOG.debug("write page"); try { pageWriter.writePage( concat( repetitionLevelColumn.getBytes(), definitionLevelColumn.getBytes(), dataColumn.getBytes()), valueCount, statistics, repetitionLevelColumn.getEncoding(), definitionLevelColumn.getEncoding(), dataColumn.getEncoding()); } catch (IOException e) { throw new ParquetEncodingException("could not write page for " + path, e); } repetitionLevelColumn.reset(); definitionLevelColumn.reset(); dataColumn.reset(); valueCount = 0; resetStatistics(); } @Override 
public void writeNull(int repetitionLevel, int definitionLevel) { if (DEBUG) log(null, repetitionLevel, definitionLevel); repetitionLevelColumn.writeInteger(repetitionLevel); definitionLevelColumn.writeInteger(definitionLevel); updateStatisticsNumNulls(); accountForValueWritten(); } @Override public void write(double value, int repetitionLevel, int definitionLevel) { if (DEBUG) log(value, repetitionLevel, definitionLevel); repetitionLevelColumn.writeInteger(repetitionLevel); definitionLevelColumn.writeInteger(definitionLevel); dataColumn.writeDouble(value); updateStatistics(value); accountForValueWritten(); } @Override public void write(float value, int repetitionLevel, int definitionLevel) { if (DEBUG) log(value, repetitionLevel, definitionLevel); repetitionLevelColumn.writeInteger(repetitionLevel); definitionLevelColumn.writeInteger(definitionLevel); dataColumn.writeFloat(value); updateStatistics(value); accountForValueWritten(); } @Override public void write(Binary value, int repetitionLevel, int definitionLevel) { if (DEBUG) log(value, repetitionLevel, definitionLevel); repetitionLevelColumn.writeInteger(repetitionLevel); definitionLevelColumn.writeInteger(definitionLevel); dataColumn.writeBytes(value); updateStatistics(value); accountForValueWritten(); } @Override public void write(boolean value, int repetitionLevel, int definitionLevel) { if (DEBUG) log(value, repetitionLevel, definitionLevel); repetitionLevelColumn.writeInteger(repetitionLevel); definitionLevelColumn.writeInteger(definitionLevel); dataColumn.writeBoolean(value); updateStatistics(value); accountForValueWritten(); } @Override public void write(int value, int repetitionLevel, int definitionLevel) { if (DEBUG) log(value, repetitionLevel, definitionLevel); repetitionLevelColumn.writeInteger(repetitionLevel); definitionLevelColumn.writeInteger(definitionLevel); dataColumn.writeInteger(value); updateStatistics(value); accountForValueWritten(); } @Override public void write(long value, int 
repetitionLevel, int definitionLevel) { if (DEBUG) log(value, repetitionLevel, definitionLevel); repetitionLevelColumn.writeInteger(repetitionLevel); definitionLevelColumn.writeInteger(definitionLevel); dataColumn.writeLong(value); updateStatistics(value); accountForValueWritten(); } public void flush() { if (valueCount > 0) { writePage(); } final DictionaryPage dictionaryPage = dataColumn.toDictPageAndClose(); if (dictionaryPage != null) { if (DEBUG) LOG.debug("write dictionary"); try { pageWriter.writeDictionaryPage(dictionaryPage); } catch (IOException e) { throw new ParquetEncodingException("could not write dictionary page for " + path, e); } dataColumn.resetDictionary(); } } @Override public void close() { flush(); // Close the Values writers. repetitionLevelColumn.close(); definitionLevelColumn.close(); dataColumn.close(); } @Override public long getBufferedSizeInMemory() { return repetitionLevelColumn.getBufferedSize() + definitionLevelColumn.getBufferedSize() + dataColumn.getBufferedSize() + pageWriter.getMemSize(); } public long allocatedSize() { return repetitionLevelColumn.getAllocatedSize() + definitionLevelColumn.getAllocatedSize() + dataColumn.getAllocatedSize() + pageWriter.allocatedSize(); } public String memUsageString(String indent) { StringBuilder b = new StringBuilder(indent).append(path).append(" {\n"); b.append(repetitionLevelColumn.memUsageString(indent + " r:")).append("\n"); b.append(definitionLevelColumn.memUsageString(indent + " d:")).append("\n"); b.append(dataColumn.memUsageString(indent + " data:")).append("\n"); b.append(pageWriter.memUsageString(indent + " pages:")).append("\n"); b.append(indent) .append(String.format(" total: %,d/%,d", getBufferedSizeInMemory(), allocatedSize())) .append("\n"); b.append(indent).append("}\n"); return b.toString(); } }
/** * utility methods to deal with bytes * * @author Julien Le Dem */ public class BytesUtils { private static final Log LOG = Log.getLog(BytesUtils.class); public static final Charset UTF8 = Charset.forName("UTF-8"); /** * give the number of bits needed to encode an int given the max value * * @param bound max int that we want to encode * @return the number of bits required */ public static int getWidthFromMaxInt(int bound) { return 32 - Integer.numberOfLeadingZeros(bound); } /** * reads an int in little endian at the given position * * @param in * @param offset * @return * @throws IOException */ public static int readIntLittleEndian(ByteBuffer in, int offset) throws IOException { int ch4 = in.get(offset) & 0xff; int ch3 = in.get(offset + 1) & 0xff; int ch2 = in.get(offset + 2) & 0xff; int ch1 = in.get(offset + 3) & 0xff; return ((ch1 << 24) + (ch2 << 16) + (ch3 << 8) + (ch4 << 0)); } /** * reads an int in little endian at the given position * * @param in * @param offset * @return * @throws IOException */ public static int readIntLittleEndian(byte[] in, int offset) throws IOException { int ch4 = in[offset] & 0xff; int ch3 = in[offset + 1] & 0xff; int ch2 = in[offset + 2] & 0xff; int ch1 = in[offset + 3] & 0xff; return ((ch1 << 24) + (ch2 << 16) + (ch3 << 8) + (ch4 << 0)); } public static int readIntLittleEndian(InputStream in) throws IOException { // TODO: this is duplicated code in LittleEndianDataInputStream int ch1 = in.read(); int ch2 = in.read(); int ch3 = in.read(); int ch4 = in.read(); if ((ch1 | ch2 | ch3 | ch4) < 0) { throw new EOFException(); } return ((ch4 << 24) + (ch3 << 16) + (ch2 << 8) + (ch1 << 0)); } public static int readIntLittleEndianOnOneByte(InputStream in) throws IOException { int ch1 = in.read(); if (ch1 < 0) { throw new EOFException(); } return ch1; } public static int readIntLittleEndianOnTwoBytes(InputStream in) throws IOException { int ch1 = in.read(); int ch2 = in.read(); if ((ch1 | ch2) < 0) { throw new EOFException(); } return ((ch2 
<< 8) + (ch1 << 0)); } public static int readIntLittleEndianOnThreeBytes(InputStream in) throws IOException { int ch1 = in.read(); int ch2 = in.read(); int ch3 = in.read(); if ((ch1 | ch2 | ch3) < 0) { throw new EOFException(); } return ((ch3 << 16) + (ch2 << 8) + (ch1 << 0)); } public static int readIntLittleEndianPaddedOnBitWidth(InputStream in, int bitWidth) throws IOException { int bytesWidth = paddedByteCountFromBits(bitWidth); switch (bytesWidth) { case 0: return 0; case 1: return BytesUtils.readIntLittleEndianOnOneByte(in); case 2: return BytesUtils.readIntLittleEndianOnTwoBytes(in); case 3: return BytesUtils.readIntLittleEndianOnThreeBytes(in); case 4: return BytesUtils.readIntLittleEndian(in); default: throw new IOException( String.format("Encountered bitWidth (%d) that requires more than 4 bytes", bitWidth)); } } public static void writeIntLittleEndianOnOneByte(OutputStream out, int v) throws IOException { out.write((v >>> 0) & 0xFF); } public static void writeIntLittleEndianOnTwoBytes(OutputStream out, int v) throws IOException { out.write((v >>> 0) & 0xFF); out.write((v >>> 8) & 0xFF); } public static void writeIntLittleEndianOnThreeBytes(OutputStream out, int v) throws IOException { out.write((v >>> 0) & 0xFF); out.write((v >>> 8) & 0xFF); out.write((v >>> 16) & 0xFF); } public static void writeIntLittleEndian(OutputStream out, int v) throws IOException { // TODO: this is duplicated code in LittleEndianDataOutputStream out.write((v >>> 0) & 0xFF); out.write((v >>> 8) & 0xFF); out.write((v >>> 16) & 0xFF); out.write((v >>> 24) & 0xFF); if (Log.DEBUG) LOG.debug( "write le int: " + v + " => " + ((v >>> 0) & 0xFF) + " " + ((v >>> 8) & 0xFF) + " " + ((v >>> 16) & 0xFF) + " " + ((v >>> 24) & 0xFF)); } /** Write a little endian int to out, using the the number of bytes required by bit width */ public static void writeIntLittleEndianPaddedOnBitWidth(OutputStream out, int v, int bitWidth) throws IOException { int bytesWidth = paddedByteCountFromBits(bitWidth); 
switch (bytesWidth) { case 0: break; case 1: writeIntLittleEndianOnOneByte(out, v); break; case 2: writeIntLittleEndianOnTwoBytes(out, v); break; case 3: writeIntLittleEndianOnThreeBytes(out, v); break; case 4: writeIntLittleEndian(out, v); break; default: throw new IOException( String.format("Encountered value (%d) that requires more than 4 bytes", v)); } } public static int readUnsignedVarInt(InputStream in) throws IOException { int value = 0; int i = 0; int b; while (((b = in.read()) & 0x80) != 0) { value |= (b & 0x7F) << i; i += 7; } return value | (b << i); } /** * uses a trick mentioned in https://developers.google.com/protocol-buffers/docs/encoding to read * zigZag encoded data * * @param in * @return * @throws IOException */ public static int readZigZagVarInt(InputStream in) throws IOException { int raw = readUnsignedVarInt(in); int temp = (((raw << 31) >> 31) ^ raw) >> 1; return temp ^ (raw & (1 << 31)); } public static void writeUnsignedVarInt(int value, OutputStream out) throws IOException { while ((value & 0xFFFFFF80) != 0L) { out.write((value & 0x7F) | 0x80); value >>>= 7; } out.write(value & 0x7F); } public static void writeUnsignedVarInt(int value, ByteBuffer dest) throws IOException { while ((value & 0xFFFFFF80) != 0L) { dest.putInt((value & 0x7F) | 0x80); value >>>= 7; } dest.putInt(value & 0x7F); } public static void writeZigZagVarInt(int intValue, OutputStream out) throws IOException { writeUnsignedVarInt((intValue << 1) ^ (intValue >> 31), out); } /** * uses a trick mentioned in https://developers.google.com/protocol-buffers/docs/encoding to read * zigZag encoded data TODO: the implementation is compatible with readZigZagVarInt. Is there a * need for different functions? 
* * @param in * @return * @throws IOException */ public static long readZigZagVarLong(InputStream in) throws IOException { long raw = readUnsignedVarLong(in); long temp = (((raw << 63) >> 63) ^ raw) >> 1; return temp ^ (raw & (1L << 63)); } public static long readUnsignedVarLong(InputStream in) throws IOException { long value = 0; int i = 0; long b; while (((b = in.read()) & 0x80) != 0) { value |= (b & 0x7F) << i; i += 7; } return value | (b << i); } public static void writeUnsignedVarLong(long value, OutputStream out) throws IOException { while ((value & 0xFFFFFFFFFFFFFF80L) != 0L) { out.write((int) ((value & 0x7F) | 0x80)); value >>>= 7; } out.write((int) (value & 0x7F)); } public static void writeZigZagVarLong(long longValue, OutputStream out) throws IOException { writeUnsignedVarLong((longValue << 1) ^ (longValue >> 63), out); } /** * @param bitLength a count of bits * @return the corresponding byte count padded to the next byte */ public static int paddedByteCountFromBits(int bitLength) { return (bitLength + 7) / 8; } public static byte[] intToBytes(int value) { byte[] outBuffer = new byte[4]; outBuffer[3] = (byte) (value >>> 24); outBuffer[2] = (byte) (value >>> 16); outBuffer[1] = (byte) (value >>> 8); outBuffer[0] = (byte) (value >>> 0); return outBuffer; } public static int bytesToInt(byte[] bytes) { return ((int) (bytes[3] & 255) << 24) + ((int) (bytes[2] & 255) << 16) + ((int) (bytes[1] & 255) << 8) + ((int) (bytes[0] & 255) << 0); } public static byte[] longToBytes(long value) { byte[] outBuffer = new byte[8]; outBuffer[7] = (byte) (value >>> 56); outBuffer[6] = (byte) (value >>> 48); outBuffer[5] = (byte) (value >>> 40); outBuffer[4] = (byte) (value >>> 32); outBuffer[3] = (byte) (value >>> 24); outBuffer[2] = (byte) (value >>> 16); outBuffer[1] = (byte) (value >>> 8); outBuffer[0] = (byte) (value >>> 0); return outBuffer; } public static long bytesToLong(byte[] bytes) { return (((long) bytes[7] << 56) + ((long) (bytes[6] & 255) << 48) + ((long) 
(bytes[5] & 255) << 40) + ((long) (bytes[4] & 255) << 32) + ((long) (bytes[3] & 255) << 24) + ((long) (bytes[2] & 255) << 16) + ((long) (bytes[1] & 255) << 8) + ((long) (bytes[0] & 255) << 0)); } public static byte[] booleanToBytes(boolean value) { byte[] outBuffer = new byte[1]; outBuffer[0] = (byte) (value ? 1 : 0); return outBuffer; } public static boolean bytesToBool(byte[] bytes) { return ((int) (bytes[0] & 255) != 0); } }