@Override
public void writePageV2(
    int rowCount,
    int nullCount,
    int valueCount,
    BytesInput repetitionLevels,
    BytesInput definitionLevels,
    Encoding dataEncoding,
    BytesInput data,
    Statistics<?> statistics) throws IOException {
  int rlByteLength = toIntWithCheck(repetitionLevels.size());
  int dlByteLength = toIntWithCheck(definitionLevels.size());
  int uncompressedSize = toIntWithCheck(
      data.size() + repetitionLevels.size() + definitionLevels.size());
  // TODO: decide if we compress
  // in the V2 page layout only the values are compressed; repetition and
  // definition levels are written as-is right after the page header
  BytesInput compressedData = compressor.compress(data);
  int compressedSize = toIntWithCheck(
      compressedData.size() + repetitionLevels.size() + definitionLevels.size());
  tempOutputStream.reset();
  parquetMetadataConverter.writeDataPageV2Header(
      uncompressedSize, compressedSize,
      valueCount, nullCount, rowCount,
      statistics,
      dataEncoding,
      rlByteLength, dlByteLength,
      tempOutputStream);
  this.uncompressedLength += uncompressedSize;
  this.compressedLength += compressedSize;
  this.totalValueCount += valueCount;
  this.pageCount += 1;
  this.totalStatistics.mergeStatistics(statistics);
  // by concatenating before collecting instead of collecting twice,
  // we only allocate one buffer to copy into instead of multiple.
  buf.collect(
      BytesInput.concat(
          BytesInput.from(tempOutputStream),
          repetitionLevels,
          definitionLevels,
          compressedData));
  dataEncodings.add(dataEncoding);
}
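// Hedged sketch (not part of this class): how a column writer might hand a finished
// V2 page to this writer. "pageWriter", "encodedRepetitionLevels", "encodedDefinitionLevels",
// "encodedValues" and "stats" are illustrative placeholders; the Statistics instance is
// assumed to have been built and populated by the caller.
//
//   BytesInput repLevelBytes = BytesInput.from(encodedRepetitionLevels);
//   BytesInput defLevelBytes = BytesInput.from(encodedDefinitionLevels);
//   BytesInput valueBytes = BytesInput.from(encodedValues);
//   pageWriter.writePageV2(
//       rowCount, nullCount, valueCount,
//       repLevelBytes, defLevelBytes,
//       Encoding.PLAIN,   // encoding of the values only; V2 levels are always RLE
//       valueBytes,
//       stats);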
@Override
public void writePage(
    BytesInput bytes,
    int valueCount,
    Statistics statistics,
    Encoding rlEncoding,
    Encoding dlEncoding,
    Encoding valuesEncoding) throws IOException {
  long uncompressedSize = bytes.size();
  if (uncompressedSize > Integer.MAX_VALUE) {
    throw new ParquetEncodingException(
        "Cannot write page larger than Integer.MAX_VALUE bytes: " + uncompressedSize);
  }
  // in the V1 page layout the encoded levels are already part of "bytes",
  // so the whole page body is compressed together
  BytesInput compressedBytes = compressor.compress(bytes);
  long compressedSize = compressedBytes.size();
  if (compressedSize > Integer.MAX_VALUE) {
    throw new ParquetEncodingException(
        "Cannot write compressed page larger than Integer.MAX_VALUE bytes: " + compressedSize);
  }
  tempOutputStream.reset();
  parquetMetadataConverter.writeDataPageHeader(
      (int) uncompressedSize,
      (int) compressedSize,
      valueCount,
      statistics,
      rlEncoding,
      dlEncoding,
      valuesEncoding,
      tempOutputStream);
  this.uncompressedLength += uncompressedSize;
  this.compressedLength += compressedSize;
  this.totalValueCount += valueCount;
  this.pageCount += 1;
  this.totalStatistics.mergeStatistics(statistics);
  // by concatenating before collecting instead of collecting twice,
  // we only allocate one buffer to copy into instead of multiple.
  buf.collect(BytesInput.concat(BytesInput.from(tempOutputStream), compressedBytes));
  rlEncodings.add(rlEncoding);
  dlEncodings.add(dlEncoding);
  dataEncodings.add(valuesEncoding);
}
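// Hedged sketch (illustrative only): a V1 caller concatenates the already encoded
// repetition levels, definition levels and values into a single BytesInput before
// calling writePage. "pageWriter", "repLevelBytes", "defLevelBytes", "valueBytes"
// and "stats" are placeholder names.
//
//   BytesInput pageBytes = BytesInput.concat(repLevelBytes, defLevelBytes, valueBytes);
//   pageWriter.writePage(
//       pageBytes,
//       valueCount,
//       stats,
//       Encoding.RLE,     // repetition level encoding
//       Encoding.RLE,     // definition level encoding
//       Encoding.PLAIN);  // value encoding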
public void writeToFileWriter(ParquetFileWriter writer) throws IOException {
  writer.startColumn(path, totalValueCount, compressor.getCodecName());
  if (dictionaryPage != null) {
    writer.writeDictionaryPage(dictionaryPage);
    // tracking the dictionary encoding is handled in writeDictionaryPage
  }
  writer.writeDataPages(
      buf,
      uncompressedLength,
      compressedLength,
      totalStatistics,
      rlEncodings,
      dlEncodings,
      dataEncodings);
  writer.endColumn();
  if (LOG.isDebugEnabled()) {
    LOG.debug(
        String.format(
            "written %,dB for %s: %,d values, %,dB raw, %,dB comp, %d pages, encodings: %s",
            buf.size(),
            path,
            totalValueCount,
            uncompressedLength,
            compressedLength,
            pageCount,
            new HashSet<Encoding>(dataEncodings))
        + (dictionaryPage != null
            ? String.format(
                ", dic { %,d entries, %,dB raw, %,dB comp}",
                dictionaryPage.getDictionarySize(),
                dictionaryPage.getUncompressedSize(),
                // fix: report the compressed size, not the entry count a second time
                dictionaryPage.getCompressedSize())
            : ""));
  }
  rlEncodings.clear();
  dlEncodings.clear();
  dataEncodings.clear();
  pageCount = 0;
}
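// Hedged sketch of how a row group gets flushed (simplified, not a verbatim copy of the
// enclosing store): the file writer opens a block, every per-column page writer dumps its
// buffered headers and pages, and the block is closed. "fileWriter", "recordCount" and
// "writers" are assumed to exist in the enclosing ColumnChunkPageWriteStore.
//
//   fileWriter.startBlock(recordCount);
//   for (ColumnChunkPageWriter pageWriter : writers.values()) {
//     pageWriter.writeToFileWriter(fileWriter); // startColumn .. writeDataPages .. endColumn
//   }
//   fileWriter.endBlock();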
@Override
public String memUsageString(String prefix) {
  return buf.memUsageString(prefix + " ColumnChunkPageWriter");
}

@Override
public long allocatedSize() {
  return buf.size();
}

@Override
public long getMemSize() {
  return buf.size();
}
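// Hedged sketch (illustrative, threshold name made up): the accessors above all report the
// bytes currently buffered in "buf", which lets an enclosing writer decide when the buffered
// row group is large enough to flush. "targetRowGroupSizeBytes" is a placeholder.
//
//   long buffered = pageWriter.getMemSize();
//   if (buffered >= targetRowGroupSizeBytes) {
//     pageWriter.writeToFileWriter(fileWriter);
//   }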