public class DefaultSnapshotDataTarget implements SnapshotDataTarget { /* * Make it possible for test code to block a write and thus snapshot completion */ public static volatile CountDownLatch m_simulateBlockedWrite = null; public static volatile boolean m_simulateFullDiskWritingHeader = false; public static volatile boolean m_simulateFullDiskWritingChunk = false; private final File m_file; private final FileChannel m_channel; private final FileOutputStream m_fos; private static final VoltLogger SNAP_LOG = new VoltLogger("SNAPSHOT"); private Runnable m_onCloseHandler = null; /* * If a write fails then this snapshot is hosed. * Set the flag so all writes return immediately. The system still * needs to scan all the tables to clear the dirty bits * so the process continues as if the writes are succeeding. * A more efficient failure mode would do the scan but not the * extra serialization work. */ private volatile boolean m_writeFailed = false; private volatile IOException m_writeException = null; private volatile long m_bytesWritten = 0; private static final Semaphore m_bytesAllowedBeforeSync = new Semaphore((1024 * 1024) * 256); private final AtomicInteger m_bytesWrittenSinceLastSync = new AtomicInteger(0); private final ScheduledFuture<?> m_syncTask; /* * Accept a single write even though simulating a full disk is enabled; */ private volatile boolean m_acceptOneWrite = false; private boolean m_needsFinalClose = true; @SuppressWarnings("unused") private final String m_tableName; private final AtomicInteger m_outstandingWriteTasks = new AtomicInteger(0); private final ReentrantLock m_outstandingWriteTasksLock = new ReentrantLock(); private final Condition m_noMoreOutstandingWriteTasksCondition = m_outstandingWriteTasksLock.newCondition(); private static final ListeningExecutorService m_es = CoreUtils.getSingleThreadExecutor("Snapshot write service "); static final ListeningScheduledExecutorService m_syncService = MoreExecutors.listeningDecorator( Executors.newSingleThreadScheduledExecutor( CoreUtils.getThreadFactory("Snapshot sync service"))); public static final int SNAPSHOT_SYNC_FREQUENCY = Integer.getInteger("SNAPSHOT_SYNC_FREQUENCY", 500); public DefaultSnapshotDataTarget( final File file, final int hostId, final String clusterName, final String databaseName, final String tableName, final int numPartitions, final boolean isReplicated, final List<Integer> partitionIds, final VoltTable schemaTable, final long txnId, final long timestamp) throws IOException { this( file, hostId, clusterName, databaseName, tableName, numPartitions, isReplicated, partitionIds, schemaTable, txnId, timestamp, new int[] {0, 0, 0, 2}); } public DefaultSnapshotDataTarget( final File file, final int hostId, final String clusterName, final String databaseName, final String tableName, final int numPartitions, final boolean isReplicated, final List<Integer> partitionIds, final VoltTable schemaTable, final long txnId, final long timestamp, int version[]) throws IOException { String hostname = CoreUtils.getHostnameOrAddress(); m_file = file; m_tableName = tableName; m_fos = new FileOutputStream(file); m_channel = m_fos.getChannel(); m_needsFinalClose = !isReplicated; final FastSerializer fs = new FastSerializer(); fs.writeInt(0); // CRC fs.writeInt(0); // Header length placeholder fs.writeByte( 1); // Indicate the snapshot was not completed, set to true for the CRC calculation, false // later for (int ii = 0; ii < 4; ii++) { fs.writeInt(version[ii]); // version } JSONStringer stringer = new JSONStringer(); byte jsonBytes[] = null; try { stringer.object(); stringer.key("txnId").value(txnId); stringer.key("hostId").value(hostId); stringer.key("hostname").value(hostname); stringer.key("clusterName").value(clusterName); stringer.key("databaseName").value(databaseName); stringer.key("tableName").value(tableName.toUpperCase()); stringer.key("isReplicated").value(isReplicated); stringer.key("isCompressed").value(true); stringer.key("checksumType").value("CRC32C"); stringer.key("timestamp").value(timestamp); /* * The timestamp string is for human consumption, automated stuff should use * the actual timestamp */ stringer.key("timestampString").value(SnapshotUtil.formatHumanReadableDate(timestamp)); if (!isReplicated) { stringer.key("partitionIds").array(); for (int partitionId : partitionIds) { stringer.value(partitionId); } stringer.endArray(); stringer.key("numPartitions").value(numPartitions); } stringer.endObject(); String jsonString = stringer.toString(); JSONObject jsonObj = new JSONObject(jsonString); jsonString = jsonObj.toString(4); jsonBytes = jsonString.getBytes("UTF-8"); } catch (Exception e) { throw new IOException(e); } fs.writeInt(jsonBytes.length); fs.write(jsonBytes); final BBContainer container = fs.getBBContainer(); container.b.position(4); container.b.putInt(container.b.remaining() - 4); container.b.position(0); final byte schemaBytes[] = PrivateVoltTableFactory.getSchemaBytes(schemaTable); final PureJavaCrc32 crc = new PureJavaCrc32(); ByteBuffer aggregateBuffer = ByteBuffer.allocate(container.b.remaining() + schemaBytes.length); aggregateBuffer.put(container.b); aggregateBuffer.put(schemaBytes); aggregateBuffer.flip(); crc.update(aggregateBuffer.array(), 4, aggregateBuffer.capacity() - 4); final int crcValue = (int) crc.getValue(); aggregateBuffer.putInt(crcValue).position(8); aggregateBuffer.put((byte) 0).position(0); // Haven't actually finished writing file if (m_simulateFullDiskWritingHeader) { m_writeException = new IOException("Disk full"); m_writeFailed = true; m_fos.close(); throw m_writeException; } /* * Be completely sure the write succeeded. If it didn't * the disk is probably full or the path is bunk etc. */ m_acceptOneWrite = true; ListenableFuture<?> writeFuture = write(Callables.returning((BBContainer) DBBPool.wrapBB(aggregateBuffer)), false); try { writeFuture.get(); } catch (InterruptedException e) { m_fos.close(); throw new java.io.InterruptedIOException(); } catch (ExecutionException e) { m_fos.close(); throw m_writeException; } if (m_writeFailed) { m_fos.close(); throw m_writeException; } ScheduledFuture<?> syncTask = null; syncTask = m_syncService.scheduleAtFixedRate( new Runnable() { @Override public void run() { // Only sync for at least 4 megabyte of data, enough to amortize the cost of seeking // on ye olden platters. Since we are appending to a file it's actually 2 seeks. while (m_bytesWrittenSinceLastSync.get() > (1024 * 1024 * 4)) { final int bytesSinceLastSync = m_bytesWrittenSinceLastSync.getAndSet(0); try { m_channel.force(false); } catch (IOException e) { if (!(e instanceof java.nio.channels.AsynchronousCloseException)) { SNAP_LOG.error("Error syncing snapshot", e); } else { SNAP_LOG.debug( "Asynchronous close syncing snasphot data, presumably graceful", e); } } m_bytesAllowedBeforeSync.release(bytesSinceLastSync); } } }, SNAPSHOT_SYNC_FREQUENCY, SNAPSHOT_SYNC_FREQUENCY, TimeUnit.MILLISECONDS); m_syncTask = syncTask; } @Override public boolean needsFinalClose() { return m_needsFinalClose; } @Override public void close() throws IOException, InterruptedException { try { m_outstandingWriteTasksLock.lock(); try { while (m_outstandingWriteTasks.get() > 0) { m_noMoreOutstandingWriteTasksCondition.await(); } } finally { m_outstandingWriteTasksLock.unlock(); } m_syncTask.cancel(false); m_channel.force(false); } finally { m_bytesAllowedBeforeSync.release(m_bytesWrittenSinceLastSync.getAndSet(0)); } m_channel.position(8); ByteBuffer completed = ByteBuffer.allocate(1); if (m_writeFailed) { completed.put((byte) 0).flip(); } else { completed.put((byte) 1).flip(); } m_channel.write(completed); m_channel.force(false); m_channel.close(); if (m_onCloseHandler != null) { m_onCloseHandler.run(); } } @Override public int getHeaderSize() { return 0; } /* * Prepend length is basically synonymous with writing actual tuple data and not * the header. */ private ListenableFuture<?> write( final Callable<BBContainer> tupleDataC, final boolean prependLength) { /* * Unwrap the data to be written. For the traditional * snapshot data target this should be a noop. */ BBContainer tupleDataTemp; try { tupleDataTemp = tupleDataC.call(); /* * Can be null if the dedupe filter nulled out the buffer */ if (tupleDataTemp == null) { return Futures.immediateFuture(null); } } catch (Throwable t) { return Futures.immediateFailedFuture(t); } final BBContainer tupleData = tupleDataTemp; if (m_writeFailed) { tupleData.discard(); return null; } m_outstandingWriteTasks.incrementAndGet(); Future<BBContainer> compressionTask = null; if (prependLength) { BBContainer cont = DBBPool.allocateDirectAndPool(SnapshotSiteProcessor.m_snapshotBufferCompressedLen); // Skip 4-bytes so the partition ID is not compressed // That way if we detect a corruption we know what partition is bad tupleData.b.position(tupleData.b.position() + 4); /* * Leave 12 bytes, it's going to be a 4-byte length prefix, a 4-byte partition id, * and a 4-byte CRC32C of just the header bytes, in addition to the compressed payload CRC * that is 16 bytes, but 4 of those are done by CompressionService */ cont.b.position(12); compressionTask = CompressionService.compressAndCRC32cBufferAsync(tupleData.b, cont); } final Future<BBContainer> compressionTaskFinal = compressionTask; ListenableFuture<?> writeTask = m_es.submit( new Callable<Object>() { @Override public Object call() throws Exception { try { if (m_acceptOneWrite) { m_acceptOneWrite = false; } else { if (m_simulateBlockedWrite != null) { m_simulateBlockedWrite.await(); } if (m_simulateFullDiskWritingChunk) { throw new IOException("Disk full"); } } int totalWritten = 0; if (prependLength) { BBContainer payloadContainer = compressionTaskFinal.get(); try { final ByteBuffer payloadBuffer = payloadContainer.b; payloadBuffer.position(0); ByteBuffer lengthPrefix = ByteBuffer.allocate(12); m_bytesAllowedBeforeSync.acquire(payloadBuffer.remaining()); // Length prefix does not include 4 header items, just compressd payload // that follows lengthPrefix.putInt(payloadBuffer.remaining() - 16); // length prefix lengthPrefix.putInt(tupleData.b.getInt(0)); // partitionId /* * Checksum the header and put it in the payload buffer */ PureJavaCrc32C crc = new PureJavaCrc32C(); crc.update(lengthPrefix.array(), 0, 8); lengthPrefix.putInt((int) crc.getValue()); lengthPrefix.flip(); payloadBuffer.put(lengthPrefix); payloadBuffer.position(0); /* * Write payload to file */ while (payloadBuffer.hasRemaining()) { totalWritten += m_channel.write(payloadBuffer); } } finally { payloadContainer.discard(); } } else { while (tupleData.b.hasRemaining()) { totalWritten += m_channel.write(tupleData.b); } } m_bytesWritten += totalWritten; m_bytesWrittenSinceLastSync.addAndGet(totalWritten); } catch (IOException e) { m_writeException = e; SNAP_LOG.error( "Error while attempting to write snapshot data to file " + m_file, e); m_writeFailed = true; throw e; } finally { try { tupleData.discard(); } finally { m_outstandingWriteTasksLock.lock(); try { if (m_outstandingWriteTasks.decrementAndGet() == 0) { m_noMoreOutstandingWriteTasksCondition.signalAll(); } } finally { m_outstandingWriteTasksLock.unlock(); } } } return null; } }); return writeTask; } @Override public ListenableFuture<?> write(final Callable<BBContainer> tupleData, int tableId) { return write(tupleData, true); } @Override public long getBytesWritten() { return m_bytesWritten; } @Override public void setOnCloseHandler(Runnable onClose) { m_onCloseHandler = onClose; } @Override public IOException getLastWriteException() { return m_writeException; } @Override public SnapshotFormat getFormat() { return SnapshotFormat.NATIVE; } /** * Get the row count if any, of the content wrapped in the given {@link BBContainer} * * @param tupleData * @return the numbers of tuple data rows contained within a container */ @Override public int getInContainerRowCount(BBContainer tupleData) { return SnapshotDataTarget.ROW_COUNT_UNSUPPORTED; } @Override public String toString() { return m_file.toString(); } }
/** * Tracker monitors and provides snapshots of a single ZK node's children. The children data objects * must be JSONObjects. */ public class MapCache { // // API // public MapCache(ZooKeeper zk, String rootNode) { m_zk = zk; m_rootNode = rootNode; } public void start(boolean block) throws InterruptedException, ExecutionException { Future<?> task = m_es.submit(new ParentEvent(null)); if (block) { task.get(); } } public void shutdown() throws InterruptedException { m_shutdown.set(true); m_es.shutdown(); m_es.awaitTermination(356, TimeUnit.DAYS); } public ImmutableMap<String, JSONObject> pointInTimeCache() { if (m_shutdown.get()) { throw new RuntimeException("Requested cache from shutdown MapCache."); } return m_publicCache.get(); } // // Implementation // private final ZooKeeper m_zk; private final AtomicBoolean m_shutdown = new AtomicBoolean(false); // the children of this node are observed. private final String m_rootNode; // All watch processing is run serially in this thread. private final ListeningExecutorService m_es = MoreExecutors.listeningDecorator( Executors.newSingleThreadExecutor( CoreUtils.getThreadFactory("Mailbox tracker", 1024 * 128))); // previous children snapshot for internal use. private Set<String> m_lastChildren = new HashSet<String>(); // the cache exposed to the public. Start empty. private AtomicReference<ImmutableMap<String, JSONObject>> m_publicCache = new AtomicReference<ImmutableMap<String, JSONObject>>(); // parent (root node) sees new or deleted child private class ParentEvent implements Runnable { private final WatchedEvent m_event; public ParentEvent(WatchedEvent event) { m_event = event; } @Override public void run() { try { processParentEvent(m_event); } catch (Exception e) { // ignore post-shutdown session termination exceptions. if (!m_shutdown.get()) { org.voltdb.VoltDB.crashLocalVoltDB("Unexpected failure in MapCache.", true, e); } } } } // child node sees modification or deletion private class ChildEvent implements Runnable { private final WatchedEvent m_event; public ChildEvent(WatchedEvent event) { m_event = event; } @Override public void run() { try { processChildEvent(m_event); } catch (Exception e) { // ignore post-shutdown session termination exceptions. if (!m_shutdown.get()) { org.voltdb.VoltDB.crashLocalVoltDB("Unexpected failure in MapCache.", true, e); } } } } // Boilerplate to forward zookeeper watches to the executor service private final Watcher m_parentWatch = new Watcher() { @Override public void process(final WatchedEvent event) { try { if (!m_shutdown.get()) { m_es.submit(new ParentEvent(event)); } } catch (RejectedExecutionException e) { if (m_es.isShutdown()) { return; } else { org.voltdb.VoltDB.crashLocalVoltDB( "Unexpected rejected execution exception", false, e); } } } }; // Boilerplate to forward zookeeper watches to the executor service private final Watcher m_childWatch = new Watcher() { @Override public void process(final WatchedEvent event) { try { if (!m_shutdown.get()) { m_es.submit(new ChildEvent(event)); } } catch (RejectedExecutionException e) { if (m_es.isShutdown()) { return; } else { org.voltdb.VoltDB.crashLocalVoltDB( "Unexpected rejected execution exception", false, e); } } } }; /** * Rebuild the point-in-time snapshot of the children objects and set watches on new * children. @Param event may be null on the first initialization. */ private void processParentEvent(WatchedEvent event) throws Exception { // get current children snapshot and reset this watch. Set<String> children = new TreeSet<String>(m_zk.getChildren(m_rootNode, m_parentWatch)); // intersect to get newChildren and update m_lastChildren to the current set. Set<String> newChildren = new HashSet<String>(children); newChildren.removeAll(m_lastChildren); m_lastChildren = children; List<ByteArrayCallback> callbacks = new ArrayList<ByteArrayCallback>(); for (String child : children) { ByteArrayCallback cb = new ByteArrayCallback(); // set watches on new children. if (newChildren.contains(child)) { m_zk.getData(ZKUtil.joinZKPath(m_rootNode, child), m_childWatch, cb, null); } else { m_zk.getData(ZKUtil.joinZKPath(m_rootNode, child), false, cb, null); } callbacks.add(cb); } HashMap<String, JSONObject> cache = new HashMap<String, JSONObject>(); for (ByteArrayCallback callback : callbacks) { try { byte payload[] = callback.getData(); JSONObject jsObj = new JSONObject(new String(payload, "UTF-8")); cache.put(callback.getPath(), jsObj); } catch (KeeperException.NoNodeException e) { // child may have been deleted between the parent trigger and getData. } } m_publicCache.set(ImmutableMap.copyOf(cache)); } /** * Update a modified child and republish a new snapshot. This may indicate a deleted child or a * child with modified data. */ private void processChildEvent(WatchedEvent event) throws Exception { HashMap<String, JSONObject> cacheCopy = new HashMap<String, JSONObject>(m_publicCache.get()); ByteArrayCallback cb = new ByteArrayCallback(); m_zk.getData(event.getPath(), m_childWatch, cb, null); try { byte payload[] = cb.getData(); JSONObject jsObj = new JSONObject(new String(payload, "UTF-8")); cacheCopy.put(cb.getPath(), jsObj); } catch (KeeperException.NoNodeException e) { cacheCopy.remove(event.getPath()); } m_publicCache.set(ImmutableMap.copyOf(cacheCopy)); } }