/** * Checks that the data starting at startLocRecord looks like a local file record header. * * @param channel the channel * @param startLocRecord offset into channel of the start of the local record * @param compressedSize expected compressed size of the file, or -1 to indicate this isn't known */ private static boolean validateLocalFileRecord( FileChannel channel, long startLocRecord, long compressedSize) throws IOException { ByteBuffer lfhBuffer = getByteBuffer(LOCLEN); read(lfhBuffer, channel, startLocRecord); if (lfhBuffer.limit() < LOCLEN || getUnsignedInt(lfhBuffer, 0) != LOCSIG) { return false; } if (compressedSize == -1) { // We can't further evaluate return true; } int fnLen = getUnsignedShort(lfhBuffer, LOC_FILENAMELEN); int extFieldLen = getUnsignedShort(lfhBuffer, LOC_EXTFLDLEN); long nextSigPos = startLocRecord + LOCLEN + compressedSize + fnLen + extFieldLen; read(lfhBuffer, channel, nextSigPos); long header = getUnsignedInt(lfhBuffer, 0); return header == LOCSIG || header == EXTSIG || header == CENSIG; }
/** * Boyer Moore scan that proceeds backwards from the end of the file looking for endsig * * @param file the file being checked * @param channel the channel * @param pattern the search pattern * @param byteSkip the bad bytes skip table * @param endSig the end of central dir signature * @return * @throws IOException */ private static long scanForEndSig( final File file, final FileChannel channel, final byte[] pattern, final int[] byteSkip, final long endSig) throws IOException { // TODO Consider just reading in MAX_REVERSE_SCAN bytes -- increased peak memory cost but less // complex ByteBuffer bb = getByteBuffer(CHUNK_SIZE); long start = channel.size(); long end = Math.max(0, start - MAX_REVERSE_SCAN); long channelPos = Math.max(0, start - CHUNK_SIZE); long lastChannelPos = channelPos; while (lastChannelPos >= end) { read(bb, channel, channelPos); int actualRead = bb.limit(); int bufferPos = actualRead - 1; while (bufferPos >= SIG_PATTERN_LENGTH) { // Following is based on the Boyer Moore algorithm but simplified to reflect // a) the pattern is static // b) the pattern has no repeating bytes int patternPos; for (patternPos = SIG_PATTERN_LENGTH - 1; patternPos >= 0 && pattern[patternPos] == bb.get(bufferPos - patternPos); --patternPos) { // empty loop while bytes match } // Switch gives same results as checking the "good suffix array" in the Boyer Moore // algorithm switch (patternPos) { case -1: { // Pattern matched. Confirm is this is the start of a valid end of central dir record long startEndRecord = channelPos + bufferPos - SIG_PATTERN_LENGTH + 1; if (validateEndRecord(file, channel, startEndRecord, endSig)) { return startEndRecord; } // wasn't a valid end record; continue scan bufferPos -= 4; break; } case 3: { // No bytes matched; the common case. // With our pattern, this is the only case where the Boyer Moore algorithm's "bad char // array" may // produce a shift greater than the "good suffix array" (which would shift 1 byte) int idx = bb.get(bufferPos - patternPos) - Byte.MIN_VALUE; bufferPos -= byteSkip[idx]; break; } default: // 1 or more bytes matched bufferPos -= 4; } } // Move back a full chunk. If we didn't read a full chunk, that's ok, // it means we read all data and the outer while loop will terminate if (channelPos <= bufferPos) { break; } lastChannelPos = channelPos; channelPos -= Math.min(channelPos - bufferPos, CHUNK_SIZE - bufferPos); } return -1; }
/** * Boyer Moore scan that proceeds forwards from the end of the file looking for the first LOCSIG */ private static long scanForLocSig(FileChannel channel) throws IOException { channel.position(0); ByteBuffer bb = getByteBuffer(CHUNK_SIZE); long end = channel.size(); while (channel.position() <= end) { read(bb, channel); int bufferPos = 0; while (bufferPos <= bb.limit() - SIG_PATTERN_LENGTH) { // Following is based on the Boyer Moore algorithm but simplified to reflect // a) the size of the pattern is static // b) the pattern is static and has no repeating bytes int patternPos; for (patternPos = SIG_PATTERN_LENGTH - 1; patternPos >= 0 && LOCSIG_PATTERN[patternPos] == bb.get(bufferPos + patternPos); --patternPos) { // empty loop while bytes match } // Outer switch gives same results as checking the "good suffix array" in the Boyer Moore // algorithm switch (patternPos) { case -1: { // Pattern matched. Confirm is this is the start of a valid local file record long startLocRecord = channel.position() - bb.limit() + bufferPos; long currentPos = channel.position(); if (validateLocalFileRecord(channel, startLocRecord, -1)) { return startLocRecord; } // Restore position in case it shifted channel.position(currentPos); // wasn't a valid local file record; continue scan bufferPos += 4; break; } case 3: { // No bytes matched; the common case. // With our pattern, this is the only case where the Boyer Moore algorithm's "bad char // array" may // produce a shift greater than the "good suffix array" (which would shift 1 byte) int idx = bb.get(bufferPos + patternPos) - Byte.MIN_VALUE; bufferPos += LOC_BAD_BYTE_SKIP[idx]; break; } default: // 1 or more bytes matched bufferPos += 4; } } } return -1; }
/** * Validates that the data structure at position startEndRecord has a field in the expected * position that points to the start of the first central directory file, and, if so, that the * file has a complete end of central directory record comment at the end. * * @param file the file being checked * @param channel the channel * @param startEndRecord the start of the end of central directory record * @param endSig the end of central dir signature * @return true if it can be confirmed that the end of directory record points to a central * directory file and a complete comment is present, false otherwise * @throws java.io.IOException */ private static boolean validateEndRecord( File file, FileChannel channel, long startEndRecord, long endSig) throws IOException { try { channel.position(startEndRecord); final ByteBuffer endDirHeader = getByteBuffer(ENDLEN); read(endDirHeader, channel); if (endDirHeader.limit() < ENDLEN) { // Couldn't read the full end of central directory record header return false; } else if (getUnsignedInt(endDirHeader, 0) != endSig) { return false; } long pos = getUnsignedInt(endDirHeader, END_CENSTART); // TODO deal with Zip64 if (pos == ZIP64_MARKER) { return false; } ByteBuffer cdfhBuffer = getByteBuffer(CENLEN); read(cdfhBuffer, channel, pos); long header = getUnsignedInt(cdfhBuffer, 0); if (header == CENSIG) { long firstLoc = getUnsignedInt(cdfhBuffer, CEN_LOC_OFFSET); long firstSize = getUnsignedInt(cdfhBuffer, CENSIZ); if (firstLoc == 0) { // normal case -- first bytes are the first local file if (!validateLocalFileRecord(channel, 0, firstSize)) { return false; } } else { // confirm that firstLoc is indeed the first local file long fileFirstLoc = scanForLocSig(channel); if (firstLoc != fileFirstLoc) { if (fileFirstLoc == 0) { return false; } else { // scanForLocSig() found a LOCSIG, but not at position zero and not // at the expected position. // With a file like this, we can't tell if we're in a nested zip // or we're in an outer zip and had the bad luck to find random bytes // that look like LOCSIG. return false; } } } // At this point, endDirHeader points to the correct end of central dir record. // Just need to validate the record is complete, including any comment int commentLen = getUnsignedShort(endDirHeader, END_COMMENTLEN); long commentEnd = startEndRecord + ENDLEN + commentLen; return commentEnd <= channel.size(); } return false; } catch (EOFException eof) { // pos or firstLoc weren't really positions and moved us to an invalid location return false; } }