@Override public void writeBytes(Binary v) { if (!dictionaryTooBig) { writeBytesUsingDict(v); if (dictionaryByteSize > maxDictionaryByteSize || dict.size() > MAX_DICTIONARY_ENTRIES) { // if the dictionary reaches the max byte size or the values can not be encoded on two bytes // anymore. if (DEBUG) LOG.debug( "dictionary is now too big, falling back to plain: " + dictionaryByteSize + "B and " + dict.size() + " entries"); dictionaryTooBig = true; if (lastUsedDictionarySize == 0) { // if we never used the dictionary // we free dictionary encoded data dict = null; dictionaryByteSize = 0; out = null; } } } // write also to plain encoding if we need to fall back plainValuesWriter.writeBytes(v); }
@Override
public void reset() {
  // A fresh index buffer is only needed while dictionary encoding is still
  // active; after fallback freed it, it stays null.
  out = (out == null) ? null : new IntList();
  plainValuesWriter.reset();
}
@Override
public Encoding getEncoding() {
  // Report the dictionary encoding only while it is active and non-empty;
  // otherwise defer to the plain writer.
  boolean usingDictionary = !dictionaryTooBig && dict.size() > 0;
  return usingDictionary ? PLAIN_DICTIONARY : plainValuesWriter.getEncoding();
}
@Override public BytesInput getBytes() { if (!dictionaryTooBig && dict.size() > 0) { // remember size of dictionary when we last wrote a page lastUsedDictionarySize = dict.size(); lastUsedDictionaryByteSize = dictionaryByteSize; int maxDicId = dict.size() - 1; if (DEBUG) LOG.debug("max dic id " + maxDicId); int bitWidth = BytesUtils.getWidthFromMaxInt(maxDicId); // TODO: what is a good initialCapacity? final RunLengthBitPackingHybridEncoder encoder = new RunLengthBitPackingHybridEncoder(BytesUtils.getWidthFromMaxInt(maxDicId), 64 * 1024); IntIterator iterator = out.iterator(); try { while (iterator.hasNext()) { encoder.writeInt(iterator.next()); } // encodes the bit width byte[] bytesHeader = new byte[] {(byte) bitWidth}; BytesInput rleEncodedBytes = encoder.toBytes(); if (DEBUG) LOG.debug("rle encoded bytes " + rleEncodedBytes.size()); return concat(BytesInput.from(bytesHeader), rleEncodedBytes); } catch (IOException e) { throw new ParquetEncodingException("could not encode the values", e); } } return plainValuesWriter.getBytes(); }
@Override public long getAllocatedSize() { // size used in memory return (out == null ? 0 : out.size() * 4) + dictionaryByteSize + plainValuesWriter.getAllocatedSize(); }
@Override
public String memUsageString(String prefix) {
  // Guard against out == null: after falling back to plain encoding the id
  // buffer is freed (see writeBytes), and the unguarded out.size() would NPE.
  // Mirrors the null check already used by getAllocatedSize().
  // 4L keeps the per-entry byte count in long arithmetic.
  return String.format(
      "%s DictionaryValuesWriter{\n%s\n%s\n%s\n%s}\n",
      prefix,
      plainValuesWriter.memUsageString(prefix + " plain:"),
      prefix + " dict:" + dictionaryByteSize,
      prefix + " values:" + (out == null ? 0 : out.size() * 4L),
      prefix);
}
@Override
public DictionaryPage createDictionaryPage() {
  // Emit a dictionary page only if at least one data page actually used the
  // dictionary (lastUsedDictionarySize is recorded in getBytes()).
  if (lastUsedDictionarySize > 0) {
    // return a dictionary only if we actually used it
    try {
      // Size the buffer from the byte size recorded at the last page flush.
      CapacityByteArrayOutputStream dictBuf =
          new CapacityByteArrayOutputStream(lastUsedDictionaryByteSize);
      LittleEndianDataOutputStream dictOut = new LittleEndianDataOutputStream(dictBuf);
      // NOTE(review): correctness relies on dict.keySet() iterating in the
      // same order dictionary ids were assigned (e.g. an insertion-ordered
      // map) — the map's declaration is not visible here; confirm.
      Iterator<Binary> entryIterator = dict.keySet().iterator();
      // write only the part of the dict that we used: entries added after the
      // last page flush are intentionally excluded.
      for (int i = 0; i < lastUsedDictionarySize; i++) {
        Binary entry = entryIterator.next();
        // PLAIN encoding of each entry: little-endian length prefix, then bytes.
        dictOut.writeInt(entry.length());
        entry.writeTo(dictOut);
      }
      return new DictionaryPage(
          BytesInput.from(dictBuf), lastUsedDictionarySize, PLAIN_DICTIONARY);
    } catch (IOException e) {
      throw new ParquetEncodingException("Could not generate dictionary Page", e);
    }
  }
  // Dictionary never used: delegate (typically yields no dictionary page).
  return plainValuesWriter.createDictionaryPage();
}
@Override public long getBufferedSize() { // size that will be written to a page // not including the dictionary size return dictionaryTooBig ? plainValuesWriter.getBufferedSize() : out.size() * 4; }