/** Read checksums and feed compressed block data into decompressor. */ @Override protected int getCompressedData() throws IOException { checkStream(); // Get the size of the compressed chunk int compressedLen = readInt(in, buf, 4); noCompressedBytes += 4; // Get the checksum of the compressed chunk int checksum = readInt(in, buf, 4); noCompressedBytes += 4; if (compressedLen > FourMcCodec.FOURMC_MAX_BLOCK_SIZE) { throw new IOException( "Compressed length " + compressedLen + " exceeds max block size " + FourMcCodec.FOURMC_MAX_BLOCK_SIZE); } Lz4Decompressor lz4dec = (Lz4Decompressor) decompressor; // if compressed len == uncompressedBlockSize, 4mc wrote data w/o compression lz4dec.setCurrentBlockUncompressed(compressedLen >= uncompressedBlockSize); // Read len bytes from underlying stream if (compressedLen > buffer.length) { buffer = new byte[compressedLen]; } readFully(in, buffer, 0, compressedLen); noCompressedBytes += compressedLen; // checksum check if (checksum != Lz4Decompressor.xxhash32(buffer, 0, compressedLen, 0)) { if (lz4dec.isCurrentBlockUncompressed()) { throw new IOException("Corrupted uncompressed block (invalid checksum)"); } else { throw new IOException("Corrupted compressed block (invalid checksum)"); } } // Send the read data to the decompressor. lz4dec.setInput(buffer, 0, compressedLen); return compressedLen; }
// Class initialization: report (log only, never throw) when the native 4mc library is unusable.
static {
  if (!FourMcNativeCodeLoader.isNativeCodeLoaded()) {
    LOG.error("Cannot load native-4mc without native-hadoop");
  } else if (!Lz4Decompressor.isNativeLoaded()) {
    // The loader reported success but the decompressor could not initialize its native side.
    LOG.error("Failed to load/initialize native-4mc library");
  }
}
@Override public void close() throws IOException { if (decompressor == null) { return; } byte[] b = new byte[4096]; while (!decompressor.finished()) { decompressor.decompress(b, 0, b.length); } super.close(); // force release direct buffers of decompressor ((Lz4Decompressor) decompressor).releaseDirectBuffers(); decompressor = null; }
/**
 * Reads the 12-byte 4mc header into {@code buf} and validates it.
 *
 * <p>The header is checked in this order: magic (bytes 0-3), version (bytes 4-7), and an
 * xxhash32 checksum (bytes 8-11) computed over the first 8 bytes.
 *
 * @param in stream positioned at the start of the header
 * @throws IOException if the header cannot be read or any of the three checks fails
 */
protected void readHeader(InputStream in) throws IOException {
  readFully(in, buf, 0, 12);

  if (getInt(buf, 0) != FourMcCodec.FOURMC_MAGIC) {
    throw new IOException("Invalid 4mc header (wrong magic)");
  }

  if (getInt(buf, 4) != FourMcCodec.FOURMC_VERSION) {
    throw new IOException("Invalid 4mc header (wrong version)");
  }

  // The stored checksum covers magic + version only.
  final int storedChecksum = getInt(buf, 8);
  final int computedChecksum = Lz4Decompressor.xxhash32(buf, 0, 8, 0);
  if (computedChecksum != storedChecksum) {
    throw new IOException("Invalid 4mc header (invalid checksum)");
  }
}
/** * Reads blocks index at tail of file. * * @param fs filesystem * @param file path to 4mc file * @return block index * @throws IOException */ public static FourMcBlockIndex readIndex(FileSystem fs, Path file) throws IOException { long fileSize = fs.getFileStatus(file).getLen(); if (fileSize < (12 + 20)) { // file too small return new FourMcBlockIndex(); } FSDataInputStream indexIn = fs.open(file); /* 4mc Footer: Footer size: 4 bytes Footer version: 4 byte (1) Block index offset: 4 bytes delta offset for each stored block, the delta between offset between previous file position and next block Footer size: 4 bytes (repeated to be able to read from end of file) MAGIC SIGNATURE: 4 bytes: "4MC\0" Footer checksum: 4 bytes (always in XXHASH32) */ /** * jump to file tail and read-ahead last 4KB of file which should be enough in most cases * Improvement: we could estimate a best case compression factor of 10% and calc forecast based * on filesize and blocksize, to see if better to read-head more. */ int readTailSize = 4 * 1024; if (readTailSize > (fileSize - 12)) readTailSize = (int) (fileSize - 12); indexIn.seek(fileSize - readTailSize); byte[] buf = new byte[readTailSize]; readFully(indexIn, buf, 0, buf.length); int footerSize = getInt(buf, buf.length - 12); int magic = getInt(buf, buf.length - 8); int checksum = getInt(buf, buf.length - 4); if (magic != FourMcCodec.FOURMC_MAGIC) { throw new IOException("Invalid 4mc footer magic"); } if (footerSize >= (fileSize - 12)) { throw new IOException("Invalid 4mc footer checksum"); } // very rare case: read head was not enough! 
seek back and read it all if (footerSize > readTailSize) { readTailSize = footerSize; indexIn.seek(fileSize - readTailSize); buf = new byte[readTailSize]; readFully(indexIn, buf, 0, buf.length); } indexIn.close(); int startFooterOffset = readTailSize - footerSize; if (getInt(buf, startFooterOffset) != footerSize) { // size again throw new IOException("Invalid 4mc footer size"); } if (getInt(buf, startFooterOffset + 4) != FourMcCodec.FOURMC_VERSION) { // version throw new IOException( "Invalid 4mc footer version (" + getInt(buf, startFooterOffset + 4) + ")"); } if (checksum != Lz4Decompressor.xxhash32(buf, startFooterOffset, footerSize - 4, 0)) { throw new IOException("Invalid 4mc footer checksum"); } int totalBlocks = (footerSize - 20) / 4; FourMcBlockIndex index = new FourMcBlockIndex(totalBlocks); long curOffset = 0; for (int i = 0; i < totalBlocks; ++i) { curOffset += getInt(buf, startFooterOffset + 8 + (i * 4)); index.set(i, curOffset); } return index; }