public static Pair<byte[], Integer> retrieveChunksAsBytes(
        ZooKeeper zk, String path, String prefix, boolean getCRC) throws Exception {
    TreeSet<String> chunks = new TreeSet<String>();
    while (true) {
        boolean allUploadsComplete = true;
        if (!chunks.contains(path + "/" + prefix + "_complete")) {
            allUploadsComplete = false;
        }
        if (allUploadsComplete) {
            break;
        }

        // Re-list the children and keep only the chunk nodes for this prefix.
        // getChildren() returns bare node names, so prepend the parent path
        // before filtering; otherwise the completion check above never passes.
        TreeSet<String> newChunks = new TreeSet<String>();
        for (String child : zk.getChildren(path, false)) {
            String fullPath = path + "/" + child;
            if (fullPath.startsWith(path + "/" + prefix)) {
                newChunks.add(fullPath);
            }
        }
        chunks = newChunks;
    }

    // One buffer per chunk node; the _complete marker is skipped below
    byte resultBuffers[][] = new byte[chunks.size() - 1][];
    int ii = 0;
    PureJavaCrc32 crc = getCRC ? new PureJavaCrc32() : null;
    for (String chunk : chunks) {
        if (chunk.endsWith("_complete")) continue;
        resultBuffers[ii] = zk.getData(chunk, false, null);
        if (crc != null) {
            crc.update(resultBuffers[ii]);
        }
        ii++;
    }
    return Pair.of(decompressBytes(resultBuffers),
            crc != null ? (int) crc.getValue() : null);
}
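/*
 * A minimal usage sketch for the method above (illustrative only: the
 * exampleFetch name, the "/db/topology" path, and the "catalog" prefix are
 * hypothetical, and this assumes the Pair type exposes getFirst()/getSecond()).
 * The call blocks until the "<prefix>_complete" marker node appears, then
 * stitches the chunk nodes back together in lexical order.
 */
private static byte[] exampleFetch(ZooKeeper zk) throws Exception {
    Pair<byte[], Integer> result =
            retrieveChunksAsBytes(zk, "/db/topology", "catalog", true);
    Integer crcOfCompressedChunks = result.getSecond(); // null when getCRC is false
    assert crcOfCompressedChunks != null;
    return result.getFirst();
}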
public DefaultSnapshotDataTarget(
        final File file,
        final int hostId,
        final String clusterName,
        final String databaseName,
        final String tableName,
        final int numPartitions,
        final boolean isReplicated,
        final List<Integer> partitionIds,
        final VoltTable schemaTable,
        final long txnId,
        final long timestamp,
        int version[]) throws IOException {
    String hostname = CoreUtils.getHostnameOrAddress();
    m_file = file;
    m_tableName = tableName;
    m_fos = new FileOutputStream(file);
    m_channel = m_fos.getChannel();
    m_needsFinalClose = !isReplicated;
    final FastSerializer fs = new FastSerializer();
    fs.writeInt(0); // CRC
    fs.writeInt(0); // Header length placeholder
    // Indicate the snapshot was not completed; set to 1 (true) for the CRC
    // calculation, zeroed out later until the snapshot actually finishes
    fs.writeByte(1);
    for (int ii = 0; ii < 4; ii++) {
        fs.writeInt(version[ii]); // version
    }
    JSONStringer stringer = new JSONStringer();
    byte jsonBytes[] = null;
    try {
        stringer.object();
        stringer.key("txnId").value(txnId);
        stringer.key("hostId").value(hostId);
        stringer.key("hostname").value(hostname);
        stringer.key("clusterName").value(clusterName);
        stringer.key("databaseName").value(databaseName);
        stringer.key("tableName").value(tableName.toUpperCase());
        stringer.key("isReplicated").value(isReplicated);
        stringer.key("isCompressed").value(true);
        stringer.key("checksumType").value("CRC32C");
        stringer.key("timestamp").value(timestamp);
        /*
         * The timestamp string is for human consumption, automated stuff should use
         * the actual timestamp
         */
        stringer.key("timestampString").value(SnapshotUtil.formatHumanReadableDate(timestamp));
        if (!isReplicated) {
            stringer.key("partitionIds").array();
            for (int partitionId : partitionIds) {
                stringer.value(partitionId);
            }
            stringer.endArray();
            stringer.key("numPartitions").value(numPartitions);
        }
        stringer.endObject();
        String jsonString = stringer.toString();
        JSONObject jsonObj = new JSONObject(jsonString);
        jsonString = jsonObj.toString(4);
        jsonBytes = jsonString.getBytes("UTF-8");
    } catch (Exception e) {
        throw new IOException(e);
    }
    fs.writeInt(jsonBytes.length);
    fs.write(jsonBytes);
    final BBContainer container = fs.getBBContainer();
    container.b.position(4);
    container.b.putInt(container.b.remaining() - 4);
    container.b.position(0);

    final byte schemaBytes[] = PrivateVoltTableFactory.getSchemaBytes(schemaTable);

    final PureJavaCrc32 crc = new PureJavaCrc32();
    ByteBuffer aggregateBuffer =
            ByteBuffer.allocate(container.b.remaining() + schemaBytes.length);
    aggregateBuffer.put(container.b);
    aggregateBuffer.put(schemaBytes);
    aggregateBuffer.flip();
    crc.update(aggregateBuffer.array(), 4, aggregateBuffer.capacity() - 4);

    final int crcValue = (int) crc.getValue();
    aggregateBuffer.putInt(crcValue).position(8);
    aggregateBuffer.put((byte) 0).position(0); // Haven't actually finished writing file

    if (m_simulateFullDiskWritingHeader) {
        m_writeException = new IOException("Disk full");
        m_writeFailed = true;
        m_fos.close();
        throw m_writeException;
    }

    /*
     * Be completely sure the write succeeded. If it didn't
     * the disk is probably full or the path is bunk etc.
     */
    m_acceptOneWrite = true;
    ListenableFuture<?> writeFuture =
            write(Callables.returning((BBContainer) DBBPool.wrapBB(aggregateBuffer)), false);
    try {
        writeFuture.get();
    } catch (InterruptedException e) {
        m_fos.close();
        throw new java.io.InterruptedIOException();
    } catch (ExecutionException e) {
        m_fos.close();
        throw m_writeException;
    }
    if (m_writeFailed) {
        m_fos.close();
        throw m_writeException;
    }

    ScheduledFuture<?> syncTask = null;
    syncTask = m_syncService.scheduleAtFixedRate(
            new Runnable() {
                @Override
                public void run() {
                    // Only sync once at least 4 megabytes of data have accumulated,
                    // enough to amortize the cost of seeking on ye olden platters.
                    // Since we are appending to a file it's actually 2 seeks.
                    while (m_bytesWrittenSinceLastSync.get() > (1024 * 1024 * 4)) {
                        final int bytesSinceLastSync = m_bytesWrittenSinceLastSync.getAndSet(0);
                        try {
                            m_channel.force(false);
                        } catch (IOException e) {
                            if (!(e instanceof java.nio.channels.AsynchronousCloseException)) {
                                SNAP_LOG.error("Error syncing snapshot", e);
                            } else {
                                SNAP_LOG.debug(
                                        "Asynchronous close while syncing snapshot data, presumably graceful",
                                        e);
                            }
                        }
                        m_bytesAllowedBeforeSync.release(bytesSinceLastSync);
                    }
                }
            },
            SNAPSHOT_SYNC_FREQUENCY,
            SNAPSHOT_SYNC_FREQUENCY,
            TimeUnit.MILLISECONDS);
    m_syncTask = syncTask;
}
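/*
 * For reference, the header layout this constructor produces (reconstructed
 * from the writes above; offsets are in bytes):
 *
 *   [0..3]    CRC32C covering everything after this field, schema included
 *   [4..7]    length of the save/restore header that follows (completed byte
 *             through the JSON blob; the schema bytes are not counted)
 *   [8]       "completed" byte, set to 1 for the CRC calculation but written
 *             as 0 until the snapshot actually finishes
 *   [9..24]   four version ints
 *   [25..28]  length of the JSON metadata blob
 *   [29.. ]   UTF-8 JSON metadata, then the VoltTable schema bytes
 */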
// XXX maybe consider an IOException subclass at some point
public TableSaveFile(
        FileChannel dataIn,
        int readAheadChunks,
        Integer[] relevantPartitionIds,
        boolean continueOnCorruptedChunk) throws IOException {
    try {
        EELibraryLoader.loadExecutionEngineLibrary(true);
        if (relevantPartitionIds == null) {
            m_relevantPartitionIds = null;
        } else {
            m_relevantPartitionIds = new HashSet<Integer>();
            for (Integer i : relevantPartitionIds) {
                m_relevantPartitionIds.add(i);
            }
        }
        m_chunkReads = new Semaphore(readAheadChunks);
        m_saveFile = dataIn;
        m_continueOnCorruptedChunk = continueOnCorruptedChunk;

        final PureJavaCrc32 crc = new PureJavaCrc32();
        /*
         * Second CRC that assumes the completed byte is 1, used in case the
         * CRC check fails only because the file wasn't completed
         */
        final PureJavaCrc32 secondCRC = new PureJavaCrc32();

        /*
         * Get the header with the save restore specific information
         */
        final ByteBuffer lengthBuffer = ByteBuffer.allocate(8);
        while (lengthBuffer.hasRemaining()) {
            final int read = m_saveFile.read(lengthBuffer);
            if (read == -1) {
                throw new EOFException();
            }
        }
        lengthBuffer.flip();
        final int originalCRC = lengthBuffer.getInt();
        int length = lengthBuffer.getInt();
        crc.update(lengthBuffer.array(), 4, 4);
        secondCRC.update(lengthBuffer.array(), 4, 4);

        if (length < 0) {
            throw new IOException("Corrupted save file has negative header length");
        }
        if (length > 2097152) {
            throw new IOException("Corrupted save file has unreasonable header length > 2 megs");
        }

        final ByteBuffer saveRestoreHeader = ByteBuffer.allocate(length);
        while (saveRestoreHeader.hasRemaining()) {
            // A partial read is fine here; the loop keeps reading until the
            // buffer is full, so only end-of-stream is an error
            final int read = m_saveFile.read(saveRestoreHeader);
            if (read == -1) {
                throw new EOFException();
            }
        }
        saveRestoreHeader.flip();
        crc.update(saveRestoreHeader.array());
        secondCRC.update(new byte[] {1});
        secondCRC.update(saveRestoreHeader.array(), 1, saveRestoreHeader.array().length - 1);

        /*
         * Get the template for the VoltTable serialization header.
         * It will have an extra length value prepended to it so that
         * it can be sucked straight into a buffer. This will not
         * contain a row count since that varies from chunk to chunk
         * and is supplied by the chunk
         */
        lengthBuffer.clear();
        lengthBuffer.limit(4);
        /*
         * Why this stupidity and no while loop?
         * Because java is broken and complains about a random final
         * elsewhere if you do.
         */
        {
            final int read = m_saveFile.read(lengthBuffer);
            if (read == -1) {
                throw new EOFException();
            }
        }
        crc.update(lengthBuffer.array(), 0, 4);
        secondCRC.update(lengthBuffer.array(), 0, 4);
        lengthBuffer.flip();
        length = lengthBuffer.getInt();

        if (length < 4) {
            throw new IOException(
                    "Corrupted save file has negative or too-small VoltTable header length");
        }
        if (length > 2097152) {
            throw new IOException(
                    "Corrupted save file has unreasonable VoltTable header length > 2 megs");
        }

        m_tableHeader = ByteBuffer.allocate(length + 4);
        m_tableHeader.putInt(length);
        while (m_tableHeader.hasRemaining()) {
            final int read = m_saveFile.read(m_tableHeader);
            if (read == -1) {
                throw new EOFException();
            }
        }
        crc.update(m_tableHeader.array(), 4, length);
        secondCRC.update(m_tableHeader.array(), 4, length);

        boolean failedCRCDueToNotCompleted = false;
        final int actualCRC = (int) crc.getValue();
        if (originalCRC != actualCRC) {
            /*
             * Check if the CRC mismatch is due to the snapshot not being completed
             */
            final int secondCRCValue = (int) secondCRC.getValue();
            if (secondCRCValue == originalCRC) {
                failedCRCDueToNotCompleted = true;
            } else {
                throw new IOException("Checksum mismatch");
            }
        }

        FastDeserializer fd = new FastDeserializer(saveRestoreHeader);
        byte completedByte = fd.readByte();
        m_completed = !failedCRCDueToNotCompleted && completedByte == 1;
        for (int ii = 0; ii < 4; ii++) {
            m_versionNum[ii] = fd.readInt();
        }

        /*
         * Support the original pre 1.3 header format as well as a new JSON format.
         * JSON will make it possible to add info to a snapshot header without
         * breaking backwards compatibility.
         */
        if (m_versionNum[3] == 0) {
            m_txnId = fd.readLong();
            m_timestamp = TransactionIdManager.getTimestampFromTransactionId(m_txnId);
            m_hostId = fd.readInt();
            m_hostname = fd.readString();
            m_clusterName = fd.readString();
            m_databaseName = fd.readString();
            m_tableName = fd.readString();
            m_isReplicated = fd.readBoolean();
            m_isCompressed = false;
            m_checksumType = ChecksumType.CRC32;
            if (!m_isReplicated) {
                m_partitionIds = (int[]) fd.readArray(int.class);
                if (!m_completed) {
                    for (Integer partitionId : m_partitionIds) {
                        m_corruptedPartitions.add(partitionId);
                    }
                }
                m_totalPartitions = fd.readInt();
            } else {
                m_partitionIds = new int[] {0};
                m_totalPartitions = 1;
                if (!m_completed) {
                    m_corruptedPartitions.add(0);
                }
            }
            m_hasVersion2FormatChunks = false;
        } else {
            assert (m_versionNum[3] == 1 || m_versionNum[3] == 2);
            m_hasVersion2FormatChunks = (m_versionNum[3] >= 2);
            int numJSONBytes = fd.readInt();
            byte jsonBytes[] = new byte[numJSONBytes];
            fd.readFully(jsonBytes);
            String jsonString = new String(jsonBytes, "UTF-8");
            JSONObject obj = new JSONObject(jsonString);
            m_txnId = obj.getLong("txnId");
            // Timestamp field added for 3.0, might not be there
            if (obj.has("timestamp")) {
                m_timestamp = obj.getLong("timestamp");
            } else {
                // Pre 3.0/IV2 the timestamp was in the transaction id
                m_timestamp = TransactionIdManager.getTimestampFromTransactionId(m_txnId);
            }
            m_hostId = obj.getInt("hostId");
            m_hostname = obj.getString("hostname");
            m_clusterName = obj.getString("clusterName");
            m_databaseName = obj.getString("databaseName");
            m_tableName = obj.getString("tableName");
            m_isReplicated = obj.getBoolean("isReplicated");
            m_isCompressed = obj.optBoolean("isCompressed", false);
            m_checksumType = ChecksumType.valueOf(obj.optString("checksumType", "CRC32"));
            if (!m_isReplicated) {
                JSONArray partitionIds = obj.getJSONArray("partitionIds");
                m_partitionIds = new int[partitionIds.length()];
                for (int ii = 0; ii < m_partitionIds.length; ii++) {
                    m_partitionIds[ii] = partitionIds.getInt(ii);
                }
                if (!m_completed) {
                    for (Integer partitionId : m_partitionIds) {
                        m_corruptedPartitions.add(partitionId);
                    }
                }
                m_totalPartitions = obj.getInt("numPartitions");
            } else {
                m_partitionIds = new int[] {0};
                m_totalPartitions = 1;
                if (!m_completed) {
                    m_corruptedPartitions.add(0);
                }
            }
        }
        /*
         * Several runtime exceptions can be thrown in valid failure cases where
         * a corrupt save file is being detected.
         */
    } catch (BufferUnderflowException e) {
        throw new IOException(e);
    } catch (BufferOverflowException e) {
        throw new IOException(e);
    } catch (IndexOutOfBoundsException e) {
        throw new IOException(e);
    } catch (JSONException e) {
        throw new IOException(e);
    }
}
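/*
 * A minimal usage sketch for the constructor above (illustrative only: the
 * exampleInspect name and the readAheadChunks value of 3 are hypothetical,
 * and the getTableName()/getCompleted() accessors are assumed to exist on
 * this class). Passing null for relevantPartitionIds reads all partitions;
 * the constructor itself validates the header CRC.
 */
private static void exampleInspect(java.io.File saveFile) throws IOException {
    java.io.FileInputStream fis = new java.io.FileInputStream(saveFile);
    try {
        TableSaveFile tsf = new TableSaveFile(fis.getChannel(), 3, null, false);
        System.out.println("table=" + tsf.getTableName()
                + " completed=" + tsf.getCompleted());
    } finally {
        fis.close();
    }
}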