/**
 * Writes a trace record for a fragment task message received by a site.
 *
 * <p>Does nothing unless trace is enabled on {@code iv2log}. While tracing, if the message
 * carries an SP handle other than {@code Long.MIN_VALUE} that differs from the locally held
 * one, the conflict is reported at error level before the trace line is written.
 *
 * @param ftask the fragment task message that was received
 * @param localHSId HSId of the receiving site
 * @param spHandle SP handle held locally for this work
 * @param borrow when {@code true} the record is labelled "rxBrrwMsg", otherwise "rxFragMsg"
 */
public static void logFragmentTaskMessage(
        FragmentTaskMessage ftask, long localHSId, long spHandle, boolean borrow) {
    if (!iv2log.isTraceEnabled()) {
        return;
    }

    final String tag = borrow ? "rxBrrwMsg" : "rxFragMsg";

    final boolean messageHasHandle = ftask.getSpHandle() != Long.MIN_VALUE;
    if (messageHasHandle && ftask.getSpHandle() != spHandle) {
        iv2log.error(
                "FragmentTaskMessage SP HANDLE conflict. Message: "
                        + ftask.getSpHandle()
                        + ", locally held: "
                        + spHandle);
    }

    final String receiver = CoreUtils.hsIdToString(localHSId);
    final String sender = CoreUtils.hsIdToString(ftask.m_sourceHSId);
    iv2log.trace(
            String.format(
                    "%s %s from %s txnId %s spHandle %s trunc %s",
                    tag,
                    receiver,
                    sender,
                    txnIdToString(ftask.getTxnId()),
                    txnIdToString(spHandle),
                    txnIdToString(ftask.getTruncationHandle())));
}
@Override public void run() { initialize(); try { while (m_shouldContinue) { // Normal operation blocks the site thread on the sitetasker queue. SiteTasker task = m_scheduler.take();; } } catch (OutOfMemoryError e) { // Even though OOM should be caught by the Throwable section below, // it sadly needs to be handled seperately. The goal here is to make // sure VoltDB crashes. String errmsg = "Site: " + org.voltcore.utils.CoreUtils.hsIdToString(m_siteId) + " ran out of Java memory. " + "This node will shut down."; VoltDB.crashLocalVoltDB(errmsg, true, e); } catch (Throwable t) { String errmsg = "Site: " + org.voltcore.utils.CoreUtils.hsIdToString(m_siteId) + " encountered an " + "unexpected error and will die, taking this VoltDB node down."; VoltDB.crashLocalVoltDB(errmsg, true, t); } shutdown(); }
/**
 * Logs a state dump for this site at warn level on {@code hostLog}.
 *
 * <p>The dump covers the partition id and leadership flag, the most recent SP handle, the
 * outstanding transactions, the pending task queue and any duplicate counters. When this site
 * is the leader it also logs its replicas and, if {@code m_sendToHSIds} is non-empty, sends a
 * {@code DumpMessage} to those HSIds.
 */
public void handleDumpMessage() {
    final String who = CoreUtils.hsIdToString(m_mailbox.getHSId());
    // Every detail line starts with "<site>: ".
    final String prefix = who + ": ";

    hostLog.warn("State dump for site: " + who);
    hostLog.warn(prefix + "partition: " + m_partitionId + ", isLeader: " + m_isLeader);

    if (m_isLeader) {
        hostLog.warn(prefix + "replicas: " + CoreUtils.hsIdCollectionToString(m_replicaHSIds));
        if (m_sendToHSIds.length > 0) {
            m_mailbox.send(m_sendToHSIds, new DumpMessage());
        }
    }

    hostLog.warn(
            prefix
                    + "most recent SP handle: "
                    + getCurrentTxnId()
                    + " "
                    + TxnEgo.txnIdToString(getCurrentTxnId()));
    hostLog.warn(
            prefix
                    + "outstanding txns: "
                    + m_outstandingTxns.keySet()
                    + " "
                    + TxnEgo.txnIdCollectionToString(m_outstandingTxns.keySet()));
    hostLog.warn(prefix + "TransactionTaskQueue: " + m_pendingTasks.toString());

    if (m_duplicateCounters.size() <= 0) {
        return;
    }
    hostLog.warn(prefix + "duplicate counters: ");
    for (Entry<DuplicateCounterKey, DuplicateCounter> counter : m_duplicateCounters.entrySet()) {
        hostLog.warn(
                "\t" + prefix + counter.getKey().toString() + ": " + counter.getValue().toString());
    }
}
/**
 * Writes a trace record for an initiate task message received by a site.
 *
 * <p>Does nothing unless trace is enabled on {@code iv2log}. While tracing, a transaction id
 * or SP handle carried by the message that is not {@code Long.MIN_VALUE} and differs from the
 * locally held value is reported at error level before the trace line is written.
 *
 * @param itask the initiate task message that was received
 * @param localHSId HSId of the receiving site
 * @param txnid transaction id held locally
 * @param spHandle SP handle held locally
 */
public static void logIv2InitiateTaskMessage(
        Iv2InitiateTaskMessage itask, long localHSId, long txnid, long spHandle) {
    if (!iv2log.isTraceEnabled()) {
        return;
    }

    final boolean txnIdConflict =
            itask.getTxnId() != Long.MIN_VALUE && itask.getTxnId() != txnid;
    if (txnIdConflict) {
        iv2log.error(
                "Iv2InitiateTaskMessage TXN ID conflict. Message: "
                        + itask.getTxnId()
                        + ", locally held: "
                        + txnid);
    }

    final boolean spHandleConflict =
            itask.getSpHandle() != Long.MIN_VALUE && itask.getSpHandle() != spHandle;
    if (spHandleConflict) {
        iv2log.error(
                "Iv2InitiateTaskMessage SP HANDLE conflict. Message: "
                        + itask.getSpHandle()
                        + ", locally held: "
                        + spHandle);
    }

    final String receiver = CoreUtils.hsIdToString(localHSId);
    final String sender = CoreUtils.hsIdToString(itask.m_sourceHSId);
    final String ciHandle =
            ClientInterfaceHandleManager.handleToString(itask.getClientInterfaceHandle());
    iv2log.trace(
            String.format(
                    "rxInitMsg %s from %s ciHandle %s txnId %s spHandle %s trunc %s",
                    receiver,
                    sender,
                    ciHandle,
                    txnIdToString(txnid),
                    txnIdToString(spHandle),
                    txnIdToString(itask.getTruncationHandle())));
}
/**
 * Writes a trace record for a response message received by an initiator.
 *
 * <p>Only {@code InitiateResponseMessage} and {@code FragmentResponseMessage} produce output;
 * any other message type is ignored. Nothing is logged unless trace is enabled on
 * {@code iv2log}.
 *
 * @param msg the received message
 * @param localHSId HSId of the receiving initiator
 */
public static void logInitiatorRxMsg(VoltMessage msg, long localHSId) {
    if (!iv2log.isTraceEnabled()) {
        return;
    }

    if (msg instanceof InitiateResponseMessage) {
        final InitiateResponseMessage initResponse = (InitiateResponseMessage) msg;
        iv2log.trace(
                String.format(
                        "rxInitRsp %s from %s ciHandle %s txnId %s spHandle %s status %s",
                        CoreUtils.hsIdToString(localHSId),
                        CoreUtils.hsIdToString(initResponse.m_sourceHSId),
                        ClientInterfaceHandleManager.handleToString(
                                initResponse.getClientInterfaceHandle()),
                        txnIdToString(initResponse.getTxnId()),
                        txnIdToString(initResponse.getSpHandle()),
                        respStatusToString(initResponse.getClientResponseData().getStatus())));
        return;
    }

    if (msg instanceof FragmentResponseMessage) {
        final FragmentResponseMessage fragResponse = (FragmentResponseMessage) msg;
        iv2log.trace(
                String.format(
                        "rxFragRsp %s from %s txnId %s spHandle %s status %s",
                        CoreUtils.hsIdToString(localHSId),
                        CoreUtils.hsIdToString(fragResponse.m_sourceHSId),
                        txnIdToString(fragResponse.getTxnId()),
                        txnIdToString(fragResponse.getSpHandle()),
                        fragStatusToString(fragResponse.getStatusCode())));
    }
}
/** Runs when the RejoinCoordinator decides this site should start rejoin. */ void doInitiation(RejoinMessage message) { m_coordinatorHsId = message.m_sourceHSId; m_streamSnapshotMb = VoltDB.instance().getHostMessenger().createMailbox(); m_rejoinSiteProcessor = new StreamSnapshotSink(m_streamSnapshotMb); // MUST choose the leader as the source. long sourceSite = m_mailbox.getMasterHsId(m_partitionId); long hsId = m_rejoinSiteProcessor.initialize( message.getSnapshotSourceCount(), message.getSnapshotBufferPool()); REJOINLOG.debug( m_whoami + "received INITIATION message. Doing rejoin" + ". Source site is: " + CoreUtils.hsIdToString(sourceSite) + " and destination rejoin processor is: " + CoreUtils.hsIdToString(hsId) + " and snapshot nonce is: " + message.getSnapshotNonce()); registerSnapshotMonitor(message.getSnapshotNonce()); // Tell the RejoinCoordinator everything it will need to know to get us our snapshot stream. RejoinMessage initResp = new RejoinMessage(m_mailbox.getHSId(), sourceSite, hsId); m_mailbox.send(m_coordinatorHsId, initResp); // Start waiting for snapshot data m_taskQueue.offer(this); }
/**
 * Writes a trace record describing a partition's leader and replicas.
 *
 * <p>Nothing is logged unless trace is enabled on {@code iv2log}.
 *
 * @param leaderHSId HSId of the partition leader
 * @param replicas HSIds of the partition's replicas
 * @param partitionId id of the partition being described
 */
public static void logTopology(long leaderHSId, List<Long> replicas, int partitionId) {
    if (!iv2log.isTraceEnabled()) {
        return;
    }
    final String leader = CoreUtils.hsIdToString(leaderHSId);
    final String replicaList = CoreUtils.hsIdCollectionToString(replicas);
    iv2log.trace(
            String.format(
                    "topology partition %d leader %s replicas (%s)",
                    partitionId, leader, replicaList));
}
/** Create a native VoltDB execution engine */ ExecutionEngine initializeEE(String serializedCatalog, final long timestamp) { String hostname = CoreUtils.getHostnameOrAddress(); ExecutionEngine eeTemp = null; try { if (m_backend == BackendTarget.NATIVE_EE_JNI) { eeTemp = new ExecutionEngineJNI( m_context.cluster.getRelativeIndex(), m_siteId, m_partitionId, CoreUtils.getHostIdFromHSId(m_siteId), hostname, m_context .cluster .getDeployment() .get("deployment") .getSystemsettings() .get("systemsettings") .getMaxtemptablesize(), m_numberOfPartitions); eeTemp.loadCatalog(timestamp, serializedCatalog); } else { // set up the EE over IPC eeTemp = new ExecutionEngineIPC( m_context.cluster.getRelativeIndex(), m_siteId, m_partitionId, CoreUtils.getHostIdFromHSId(m_siteId), hostname, m_context .cluster .getDeployment() .get("deployment") .getSystemsettings() .get("systemsettings") .getMaxtemptablesize(), m_backend, VoltDB.instance().getConfig().m_ipcPorts.remove(0), m_numberOfPartitions); eeTemp.loadCatalog(timestamp, serializedCatalog); } } // just print error info an bail if we run into an error here catch (final Exception ex) { hostLog.l7dlog( Level.FATAL,, new Object[] {m_siteId, m_siteIndex}, ex); VoltDB.crashLocalVoltDB(ex.getMessage(), true, ex); } return eeTemp; }
/**
 * Writes a trace record for the completion of a transaction.
 *
 * <p>Nothing is logged unless trace is enabled on {@code iv2log}.
 *
 * @param msg the initiate response that finishes the transaction
 * @param localHSId HSId of the site logging the event
 */
public static void logFinishTransaction(InitiateResponseMessage msg, long localHSId) {
    if (!iv2log.isTraceEnabled()) {
        return;
    }
    final String site = CoreUtils.hsIdToString(localHSId);
    final String ciHandle =
            ClientInterfaceHandleManager.handleToString(msg.getClientInterfaceHandle());
    final String coordinator = CoreUtils.hsIdToString(msg.getCoordinatorHSId());
    final String status = respStatusToString(msg.getClientResponseData().getStatus());
    iv2log.trace(
            String.format(
                    "finishTxn %s ciHandle %s initHSId %s status %s",
                    site, ciHandle, coordinator, status));
}
/** Process a new repair log response */ @Override public void deliver(VoltMessage message) { if (message instanceof Iv2RepairLogResponseMessage) { Iv2RepairLogResponseMessage response = (Iv2RepairLogResponseMessage) message; if (response.getRequestId() != m_requestId) { tmLog.debug( m_whoami + "rejecting stale repair response." + " Current request id is: " + m_requestId + " Received response for request id: " + response.getRequestId()); return; } ReplicaRepairStruct rrs = m_replicaRepairStructs.get(response.m_sourceHSId); if (rrs.m_expectedResponses < 0) { tmLog.debug( m_whoami + "collecting " + response.getOfTotal() + " repair log entries from " + CoreUtils.hsIdToString(response.m_sourceHSId)); } // Long.MAX_VALUE has rejoin semantics if (response.getHandle() != Long.MAX_VALUE) { m_maxSeenTxnId = Math.max(m_maxSeenTxnId, response.getHandle()); } if (response.getPayload() != null) { m_repairLogUnion.add(response); if (tmLog.isTraceEnabled()) { tmLog.trace( m_whoami + " collected from " + CoreUtils.hsIdToString(response.m_sourceHSId) + ", message: " + response.getPayload()); } } if (rrs.update(response)) { tmLog.debug( m_whoami + "collected " + rrs.m_receivedResponses + " responses for " + rrs.m_expectedResponses + " repair log entries from " + CoreUtils.hsIdToString(response.m_sourceHSId)); if (areRepairLogsComplete()) { repairSurvivors(); } } } }
/**
 * Writes a trace record for a multi-partition sentinel message received by a site.
 *
 * <p>Nothing is logged unless trace is enabled on {@code iv2log}.
 *
 * @param message the sentinel message that was received
 * @param localHSId HSId of the receiving site
 * @param txnId transaction id to report in the record
 */
public static void logIv2MultipartSentinel(
        MultiPartitionParticipantMessage message, long localHSId, long txnId) {
    if (!iv2log.isTraceEnabled()) {
        return;
    }
    final String receiver = CoreUtils.hsIdToString(localHSId);
    final String sender = CoreUtils.hsIdToString(message.m_sourceHSId);
    iv2log.trace(
            String.format("rxSntlMsg %s from %s txnId %s", receiver, sender, txnIdToString(txnId)));
}
/**
 * Writes a trace record for the creation of a transaction.
 *
 * <p>Nothing is logged unless trace is enabled on {@code iv2log}.
 *
 * @param msg the initiate task message that creates the transaction
 */
public static void logCreateTransaction(Iv2InitiateTaskMessage msg) {
    if (!iv2log.isTraceEnabled()) {
        return;
    }
    final String initiator = CoreUtils.hsIdToString(msg.getInitiatorHSId());
    final String ciHandle =
            ClientInterfaceHandleManager.handleToString(msg.getClientInterfaceHandle());
    final String coordinator = CoreUtils.hsIdToString(msg.getCoordinatorHSId());
    iv2log.trace(
            String.format(
                    "createTxn %s ciHandle %s initHSId %s proc %s",
                    initiator,
                    ciHandle,
                    coordinator,
                    msg.getStoredProcedureInvocation().getProcName()));
}
public long getBuddySiteForMPI(long hsid) { int host = CoreUtils.getHostIdFromHSId(hsid); // We'll be lazy and get the map we'd feed to SiteTracker's // constructor, then go looking for a matching host ID. List<MailboxNodeContent> sitesList = getMailboxNodeContentList(); for (MailboxNodeContent site : sitesList) { if (site.partitionId != MpInitiator.MP_INIT_PID && host == CoreUtils.getHostIdFromHSId(site.HSId)) { return site.HSId; } } throw new RuntimeException( "Unable to find a buddy initiator for MPI with HSID: " + CoreUtils.hsIdToString(hsid)); }
/**
 * Writes a trace record for a complete-transaction message received by a site.
 *
 * <p>The record notes whether the message is a rollback or a commit and whether it is a
 * restart. Nothing is logged unless trace is enabled on {@code iv2log}.
 *
 * @param ctask the complete-transaction message that was received
 * @param localHSId HSId of the receiving site
 */
public static void logCompleteTransactionMessage(
        CompleteTransactionMessage ctask, long localHSId) {
    if (!iv2log.isTraceEnabled()) {
        return;
    }
    final String receiver = CoreUtils.hsIdToString(localHSId);
    final String sender = CoreUtils.hsIdToString(ctask.m_sourceHSId);
    final String txn = txnIdToString(ctask.getTxnId());
    final String outcome = ctask.isRollback() ? "ROLLBACK" : "COMMIT";
    final String restart = ctask.isRestart() ? "RESTART" : "";
    iv2log.trace(
            String.format("rxCompMsg %s from %s txnId %s %s %s", receiver, sender, txn, outcome, restart));
}
private long assignLeader(int partitionId, List<Long> children) { // We used masterHostId = -1 as a way to force the leader choice to be // the first replica in the list, if we don't have some other mechanism // which has successfully overridden it. int masterHostId = -1; if (m_state.get() == AppointerState.CLUSTER_START) { try { // find master in topo JSONArray parts = m_topo.getJSONArray("partitions"); for (int p = 0; p < parts.length(); p++) { JSONObject aPartition = parts.getJSONObject(p); int pid = aPartition.getInt("partition_id"); if (pid == partitionId) { masterHostId = aPartition.getInt("master"); } } } catch (JSONException jse) { tmLog.error("Failed to find master for partition " + partitionId + ", defaulting to 0"); jse.printStackTrace(); masterHostId = -1; // stupid default } } else { // For now, if we're appointing a new leader as a result of a // failure, just pick the first replica in the children list. // Could eventually do something more complex here to try to keep a // semi-balance, but it's unclear that this has much utility until // we add rebalancing on rejoin as well. masterHostId = -1; } long masterHSId = children.get(0); for (Long child : children) { if (CoreUtils.getHostIdFromHSId(child) == masterHostId) { masterHSId = child; break; } } "Appointing HSId " + CoreUtils.hsIdToString(masterHSId) + " as leader for partition " + partitionId); try { m_iv2appointees.put(partitionId, masterHSId); } catch (Exception e) { VoltDB.crashLocalVoltDB("Unable to appoint new master for partition " + partitionId, true, e); } return masterHSId; }
/**
 * Fills the "Partition", "Sites" and "Leader" columns of a statistics row.
 *
 * <p>For the MP initiator partition the only site listed is the multi-partition initiator,
 * which is also the leader. For any other partition the leader comes from the point-in-time
 * master cache and the sites are that partition's replicas.
 *
 * @param rowKey the partition id identifying the row
 * @param rowValues the row to populate, indexed via {@code columnNameToIndex}
 */
@Override
protected void updateStatsRow(Object rowKey, Object[] rowValues) {
    final List<Long> members = new ArrayList<Long>();
    final long leader;
    if (rowKey.equals(MpInitiator.MP_INIT_PID)) {
        leader = getHSIdForMultiPartitionInitiator();
        members.add(leader);
    } else {
        leader = m_iv2Masters.pointInTimeCache().get((Integer) rowKey);
        members.addAll(getReplicasForPartition((Integer) rowKey));
    }

    final String siteList = CoreUtils.hsIdCollectionToString(members);
    final String leaderName = CoreUtils.hsIdToString(leader);
    rowValues[columnNameToIndex.get("Partition")] = rowKey;
    rowValues[columnNameToIndex.get("Sites")] = siteList;
    rowValues[columnNameToIndex.get("Leader")] = leaderName;
}
@Override public void run() { Thread.currentThread().setName("Iv2ExecutionSite: " + CoreUtils.hsIdToString(m_siteId)); initialize(m_startupConfig.m_serializedCatalog, m_startupConfig.m_timestamp); m_startupConfig = null; // release the serializedCatalog bytes. try { while (m_shouldContinue) { if (m_rejoinState == kStateRunning) { // Normal operation blocks the site thread on the sitetasker queue. SiteTasker task = m_scheduler.take(); if (task instanceof TransactionTask) { m_currentTxnId = ((TransactionTask) task).getTxnId(); m_lastTxnTime = EstTime.currentTimeMillis(); }; } else { // Rejoin operation poll and try to do some catchup work. Tasks // are responsible for logging any rejoin work they might have. SiteTasker task = m_scheduler.poll(); if (task != null) { task.runForRejoin(getSiteProcedureConnection(), m_rejoinTaskLog); } replayFromTaskLog(); } } } catch (OutOfMemoryError e) { // Even though OOM should be caught by the Throwable section below, // it sadly needs to be handled seperately. The goal here is to make // sure VoltDB crashes. String errmsg = "Site: " + org.voltcore.utils.CoreUtils.hsIdToString(m_siteId) + " ran out of Java memory. " + "This node will shut down."; VoltDB.crashLocalVoltDB(errmsg, true, e); } catch (Throwable t) { String errmsg = "Site: " + org.voltcore.utils.CoreUtils.hsIdToString(m_siteId) + " encountered an " + "unexpected error and will die, taking this VoltDB node down."; VoltDB.crashLocalVoltDB(errmsg, true, t); } shutdown(); }
/**
 * Describes this task by its transaction id, SP handle and the HSId of its initiator.
 *
 * @return a single-line description beginning with "MpProcedureTask:"
 */
@Override
public String toString() {
    final String txn = TxnEgo.txnIdToString(getTxnId());
    final String handle = TxnEgo.txnIdToString(getSpHandle());
    final String site = CoreUtils.hsIdToString(m_initiator.getHSId());
    return "MpProcedureTask:"
            + " TXN ID: "
            + txn
            + " SP HANDLE ID: "
            + handle
            + " ON HSID: "
            + site;
}
/** Send missed-messages to survivors. */ public void repairSurvivors() { // cancel() and repair() must be synchronized by the caller (the deliver lock, // currently). If cancelled and the last repair message arrives, don't send // out corrections! if (this.m_promotionResult.isCancelled()) { tmLog.debug(m_whoami + "Skipping repair message creation for cancelled Term."); return; } int queued = 0; tmLog.debug(m_whoami + "received all repair logs and is repairing surviving replicas."); for (Iv2RepairLogResponseMessage li : m_repairLogUnion) { List<Long> needsRepair = new ArrayList<Long>(5); for (Entry<Long, ReplicaRepairStruct> entry : m_replicaRepairStructs.entrySet()) { if (entry.getValue().needs(li.getHandle())) { ++queued; tmLog.debug( m_whoami + "repairing " + CoreUtils.hsIdToString(entry.getKey()) + ". Max seen " + entry.getValue().m_maxSpHandleSeen + ". Repairing with " + li.getHandle()); needsRepair.add(entry.getKey()); } } if (!needsRepair.isEmpty()) { if (tmLog.isTraceEnabled()) { tmLog.trace( m_whoami + "repairing: " + CoreUtils.hsIdCollectionToString(needsRepair) + " with message: " + li.getPayload()); } m_mailbox.repairReplicasWith(needsRepair, li.getPayload()); } } tmLog.debug(m_whoami + "finished queuing " + queued + " replica repair messages."); m_promotionResult.done(m_maxSeenTxnId); }
/**
 * Notify the coordinator that this site has received the first fragment message.
 *
 * <p>Sends a {@code FIRST_FRAGMENT_RECEIVED} rejoin message to {@code m_coordinatorHsId} and
 * then sets {@code m_firstFragResponseSent}.
 */
private void sendFirstFragResponse() {
    if (JOINLOG.isDebugEnabled()) {
        final String coordinator = CoreUtils.hsIdToString(m_coordinatorHsId);
        JOINLOG.debug(
                "P" + m_partitionId + " sending first fragment response to coordinator " + coordinator);
    }
    m_mailbox.send(
            m_coordinatorHsId,
            new RejoinMessage(m_mailbox.getHSId(), RejoinMessage.Type.FIRST_FRAGMENT_RECEIVED));
    m_firstFragResponseSent = true;
}
/**
 * Tells the rejoin coordinator that replay has finished.
 *
 * <p>Sends a {@code REPLAY_FINISHED} rejoin message to {@code m_coordinatorHsId}, clears
 * {@code m_currentlyRejoining} and decrements {@code SnapshotSaveAPI.recoveringSiteCount}.
 */
@Override
public void run() {
    final String coordinator = CoreUtils.hsIdToString(m_coordinatorHsId);
    REJOINLOG.debug(m_whoami + "informing rejoinCoordinator " + coordinator + " of REPLAY_FINISHED");

    final RejoinMessage replayFinished =
            new RejoinMessage(m_mailbox.getHSId(), RejoinMessage.Type.REPLAY_FINISHED);
    m_mailbox.send(m_coordinatorHsId, replayFinished);

    m_currentlyRejoining.set(false);
    SnapshotSaveAPI.recoveringSiteCount.decrementAndGet();
}
// This message used to be sent by the SP or MP initiator when they accepted a promotion. // For dev speed, we'll detect mastership changes here and construct and send this message to the // local client interface so we can keep the CIs implementation private void sendLeaderChangeNotify(long hsId, int partitionId) { try { JSONStringer stringer = new JSONStringer(); stringer.object(); stringer.key(JSON_PARTITION_ID).value(partitionId); stringer.key(JSON_INITIATOR_HSID).value(hsId); stringer.endObject(); BinaryPayloadMessage bpm = new BinaryPayloadMessage(new byte[0], stringer.toString().getBytes("UTF-8")); int hostId = m_hostMessenger.getHostId(); m_hostMessenger.send( CoreUtils.getHSIdFromHostAndSite(hostId, HostMessenger.CLIENT_INTERFACE_SITE_ID), bpm); } catch (Exception e) { VoltDB.crashLocalVoltDB("Unable to propogate leader promotion to client interface.", true, e); } }
/** Start fixing survivors: setup scoreboard and request repair logs. */ void prepareForFaultRecovery() { for (Long hsid : m_survivors) { m_replicaRepairStructs.put(hsid, new ReplicaRepairStruct()); } m_whoami + "found (including self) " + m_survivors.size() + " surviving replicas to repair. " + " Survivors: " + CoreUtils.hsIdCollectionToString(m_survivors)); VoltMessage logRequest = new Iv2RepairLogRequestMessage(m_requestId, Iv2RepairLogRequestMessage.SPREQUEST); m_mailbox.send(, logRequest); }
public void dump(long hsId) { final String who = CoreUtils.hsIdToString(hsId); String.format( "%s: REPLAY SEQUENCER DUMP, LAST POLLED FRAGMENT %d (%s), LAST SEEN TXNID %d (%s), %s%s", who, m_lastPolledFragmentTxnId, TxnEgo.txnIdToString(m_lastPolledFragmentTxnId), m_lastSeenTxnId, TxnEgo.txnIdToString(m_lastSeenTxnId), m_mpiEOLReached ? "MPI EOL, " : "", m_mustDrain ? "MUST DRAIN" : "")); for (Entry<Long, ReplayEntry> e : m_replayEntries.entrySet()) {"%s: REPLAY ENTRY %s: %s", who, e.getKey(), e.getValue())); } }
public SimpleFileSnapshotDataTarget(File file, boolean needsFinalClose) throws IOException { m_file = file; m_tempFile = new File(m_file.getParentFile(), m_file.getName() + ".incomplete"); m_ras = new RandomAccessFile(m_tempFile, "rw"); m_fc = m_ras.getChannel(); m_needsFinalClose = needsFinalClose; m_es = CoreUtils.getListeningSingleThreadExecutor("Snapshot write thread for " + m_file); ScheduledFuture<?> syncTask = null; syncTask = DefaultSnapshotDataTarget.m_syncService.scheduleAtFixedRate( new Runnable() { private long syncedBytes = 0; @Override public void run() { // Only sync for at least 4 megabyte of data, enough to amortize the cost of seeking // on ye olden platters. Since we are appending to a file it's actually 2 seeks. while (m_bytesSinceLastSync.get() > 1024 * 1024 * 4) { try { final long syncStart = syncedBytes; syncedBytes = Bits.sync_file_range( SNAP_LOG, m_ras.getFD(), m_fc, syncStart, m_fc.position()); } catch (IOException e) { if (!(e instanceof java.nio.channels.AsynchronousCloseException)) { SNAP_LOG.error("Error syncing snapshot", e); } else { SNAP_LOG.debug( "Asynchronous close syncing snapshot data, presumably graceful", e); } } // Blind setting to 0 means we could technically write more than // 256 megabytes at a time but 512 is the worst case and that is fine m_bytesSinceLastSync.set(0); } } }, DefaultSnapshotDataTarget.SNAPSHOT_SYNC_FREQUENCY, DefaultSnapshotDataTarget.SNAPSHOT_SYNC_FREQUENCY, TimeUnit.MILLISECONDS); m_syncTask = syncTask; }
@Override public Callable<Boolean> createSetup( String file_path, String file_nonce, long txnId, Map<Integer, Long> partitionTransactionIds, JSONObject jsData, SystemProcedureExecutionContext context, VoltTable result, Map<String, Map<Integer, Pair<Long, Long>>> exportSequenceNumbers, SiteTracker tracker, HashinatorSnapshotData hashinatorData, long timestamp) { assert SnapshotSiteProcessor.ExecutionSitesCurrentlySnapshotting.isEmpty(); final IndexSnapshotRequestConfig config = new IndexSnapshotRequestConfig(jsData, context.getDatabase()); final Map<Integer, Long> pidToLocalHSIds = findLocalSources(config.partitionRanges, tracker); // mark snapshot start in registry final AtomicInteger numTables = new AtomicInteger(config.tables.length); m_snapshotRecord = SnapshotRegistry.startSnapshot( txnId, context.getHostId(), file_path, file_nonce, SnapshotFormat.INDEX, config.tables); // create table tasks for (Table table : config.tables) { createTasksForTable( table, config.partitionRanges, pidToLocalHSIds, numTables, m_snapshotRecord); result.addRow( context.getHostId(), CoreUtils.getHostnameOrAddress(), table.getTypeName(), "SUCCESS", ""); } return null; }
/**
 * Renders this message as multi-line text: the coordinator HSId, transaction id, flags and
 * hash, followed by one line for each of the rollback, requires-ack and restart conditions
 * that holds.
 *
 * @return the textual description of this message
 */
@Override
public String toString() {
    final StringBuilder text =
            new StringBuilder()
                    .append("COMPLETE_TRANSACTION (FROM COORD: ")
                    .append(CoreUtils.hsIdToString(m_coordinatorHSId))
                    .append(") FOR TXN ")
                    .append(m_txnId)
                    .append("\n FLAGS: ")
                    .append(m_flags)
                    .append("\n HASH: " + String.valueOf(m_hash));

    if (isRollback()) {
        text.append("\n THIS IS AN ROLLBACK REQUEST");
    }
    if (requiresAck()) {
        text.append("\n THIS MESSAGE REQUIRES AN ACK");
    }
    if (isRestart()) {
        text.append("\n THIS IS A TRANSACTION RESTART");
    }
    return text.toString();
}
/** * LeaderAppointer handles centralized appointment of partition leaders across the partition. This * is primarily so that the leaders can be evenly distributed throughout the cluster, reducing * bottlenecks (at least at startup). As a side-effect, this service also controls the initial * startup of the cluster, blocking operation until each partition has a k-safe set of replicas, * each partition has a leader, and the MPI has started. */ public class LeaderAppointer implements Promotable { private static final VoltLogger tmLog = new VoltLogger("TM"); private enum AppointerState { INIT, // Initial start state, used to inhibit ZK callback actions CLUSTER_START, // indicates that we're doing the initial cluster startup DONE // indicates normal running conditions, including repair } private final HostMessenger m_hostMessenger; private final ZooKeeper m_zk; private final int m_partitionCount; private final BabySitter[] m_partitionWatchers; private final LeaderCache m_iv2appointees; private final LeaderCache m_iv2masters; private final PartitionCallback[] m_callbacks; private final int m_kfactor; private final JSONObject m_topo; private final MpInitiator m_MPI; private final AtomicReference<AppointerState> m_state = new AtomicReference<AppointerState>(AppointerState.INIT); private CountDownLatch m_startupLatch = null; private final boolean m_partitionDetectionEnabled; private boolean m_partitionDetected = false; private boolean m_usingCommandLog = false; private final AtomicBoolean m_replayComplete = new AtomicBoolean(false); // Provide a single single-threaded executor service to all the BabySitters for each partition. // This will guarantee that the ordering of events generated by ZooKeeper is preserved in the // handling of callbacks in LeaderAppointer. 
private final ExecutorService m_es = CoreUtils.getCachedSingleThreadExecutor("LeaderAppointer-Babysitters", 15000); private final SnapshotSchedule m_partSnapshotSchedule; private final SnapshotResponseHandler m_snapshotHandler = new SnapshotResponseHandler() { @Override public void handleResponse(ClientResponse resp) { if (resp == null) { VoltDB.crashLocalVoltDB( "Received a null response to a snapshot initiation request. " + "This should be impossible.", true, null); } else if (resp.getStatus() != ClientResponse.SUCCESS) { "Failed to complete partition detection snapshot, status: " + resp.getStatus() + ", reason: " + resp.getStatusString());"Retrying partition detection snapshot..."); SnapshotUtil.requestSnapshot( 0L, m_partSnapshotSchedule.getPath(), m_partSnapshotSchedule.getPrefix() + System.currentTimeMillis(), true, SnapshotFormat.NATIVE, null, m_snapshotHandler, true); } else if (!SnapshotUtil.didSnapshotRequestSucceed(resp.getResults())) { VoltDB.crashGlobalVoltDB( "Unable to complete partition detection snapshot: " + resp.getResults()[0], false, null); } else { VoltDB.crashGlobalVoltDB( "Partition detection snapshot completed. Shutting down.", false, null); } } }; private class PartitionCallback extends BabySitter.Callback { final int m_partitionId; final Set<Long> m_replicas; long m_currentLeader; /** Constructor used when we know (or think we know) who the leader for this partition is */ PartitionCallback(int partitionId, long currentLeader) { this(partitionId); // Try to be clever for repair. Create ourselves with the current leader set to // whatever is in the LeaderCache, and claim that replica exists, then let the // first run() call fix the world. 
m_currentLeader = currentLeader; m_replicas.add(currentLeader); } /** Constructor used at startup when there is no leader */ PartitionCallback(int partitionId) { m_partitionId = partitionId; // A bit of a hack, but we should never end up with an HSID as Long.MAX_VALUE m_currentLeader = Long.MAX_VALUE; m_replicas = new HashSet<Long>(); } @Override public void run(List<String> children) { List<Long> updatedHSIds = VoltZK.childrenToReplicaHSIds(children); // compute previously unseen HSId set in the callback list Set<Long> newHSIds = new HashSet<Long>(updatedHSIds); newHSIds.removeAll(m_replicas); tmLog.debug("Newly seen replicas: " + CoreUtils.hsIdCollectionToString(newHSIds)); // compute previously seen but now vanished from the callback list HSId set Set<Long> missingHSIds = new HashSet<Long>(m_replicas); missingHSIds.removeAll(updatedHSIds); tmLog.debug("Newly dead replicas: " + CoreUtils.hsIdCollectionToString(missingHSIds)); tmLog.debug( "Handling babysitter callback for partition " + m_partitionId + ": children: " + CoreUtils.hsIdCollectionToString(updatedHSIds)); if (m_state.get() == AppointerState.CLUSTER_START) { // We can't yet tolerate a host failure during startup. Crash it all if (missingHSIds.size() > 0) { VoltDB.crashGlobalVoltDB("Node failure detected during startup.", false, null); } // ENG-3166: Eventually we would like to get rid of the extra replicas beyond k_factor, // but for now we just look to see how many replicas of this partition we actually expect // and gate leader assignment on that many copies showing up. 
int replicaCount = m_kfactor + 1; JSONArray parts; try { parts = m_topo.getJSONArray("partitions"); for (int p = 0; p < parts.length(); p++) { JSONObject aPartition = parts.getJSONObject(p); int pid = aPartition.getInt("partition_id"); if (pid == m_partitionId) { replicaCount = aPartition.getJSONArray("replicas").length(); } } } catch (JSONException e) { // Ignore and just assume the normal number of replicas } if (children.size() == replicaCount) { m_currentLeader = assignLeader(m_partitionId, updatedHSIds); } else { "Waiting on " + ((m_kfactor + 1) - children.size()) + " more nodes " + "for k-safety before startup"); } } else { // Check for k-safety if (!isClusterKSafe()) { VoltDB.crashGlobalVoltDB( "Some partitions have no replicas. Cluster has become unviable.", false, null); } // Check if replay has completed if (m_replayComplete.get() == false) { VoltDB.crashGlobalVoltDB( "Detected node failure during command log replay. Cluster will shut down.", false, null); } // Check to see if there's been a possible network partition and we're not already handling // it if (m_partitionDetectionEnabled && !m_partitionDetected) { doPartitionDetectionActivities(); } // If we survived the above gauntlet of fail, appoint a new leader for this partition. 
if (missingHSIds.contains(m_currentLeader)) { m_currentLeader = assignLeader(m_partitionId, updatedHSIds); } } m_replicas.clear(); m_replicas.addAll(updatedHSIds); } } /* We'll use this callback purely for startup so we can discover when all * the leaders we have appointed have completed their promotions and * published themselves to Zookeeper */ LeaderCache.Callback m_masterCallback = new LeaderCache.Callback() { @Override public void run(ImmutableMap<Integer, Long> cache) { Set<Long> currentLeaders = new HashSet<Long>(cache.values()); tmLog.debug("Updated leaders: " + currentLeaders); if (m_state.get() == AppointerState.CLUSTER_START) { if (currentLeaders.size() == m_partitionCount) { tmLog.debug("Leader appointment complete, promoting MPI and unblocking."); m_state.set(AppointerState.DONE); m_MPI.acceptPromotion(); m_startupLatch.countDown(); } } } }; public LeaderAppointer( HostMessenger hm, int numberOfPartitions, int kfactor, boolean partitionDetectionEnabled, SnapshotSchedule partitionSnapshotSchedule, boolean usingCommandLog, JSONObject topology, MpInitiator mpi) { m_hostMessenger = hm; m_zk = hm.getZK(); m_kfactor = kfactor; m_topo = topology; m_MPI = mpi; m_partitionCount = numberOfPartitions; m_callbacks = new PartitionCallback[m_partitionCount]; m_partitionWatchers = new BabySitter[m_partitionCount]; m_iv2appointees = new LeaderCache(m_zk, VoltZK.iv2appointees); m_iv2masters = new LeaderCache(m_zk, VoltZK.iv2masters, m_masterCallback); m_partitionDetectionEnabled = partitionDetectionEnabled; m_partSnapshotSchedule = partitionSnapshotSchedule; m_usingCommandLog = usingCommandLog; } @Override public void acceptPromotion() throws InterruptedException, ExecutionException, KeeperException { // Crank up the leader caches. Use blocking startup so that we'll have valid point-in-time // caches later. m_iv2appointees.start(true); m_iv2masters.start(true); // Figure out what conditions we assumed leadership under. 
if (m_iv2appointees.pointInTimeCache().size() == 0) { tmLog.debug("LeaderAppointer in startup"); m_state.set(AppointerState.CLUSTER_START); } else if ((m_iv2appointees.pointInTimeCache().size() != m_partitionCount) || (m_iv2masters.pointInTimeCache().size() != m_partitionCount)) { // If we are promoted and the appointees or masters set is partial, the previous appointer // failed // during startup (at least for now, until we add add/remove a partition on the fly). VoltDB.crashGlobalVoltDB("Detected failure during startup, unable to start", false, null); } else { tmLog.debug("LeaderAppointer in repair"); m_state.set(AppointerState.DONE); } if (m_state.get() == AppointerState.CLUSTER_START) { // Need to block the return of acceptPromotion until after the MPI is promoted. Wait for this // latch // to countdown after appointing all the partition leaders. The // LeaderCache callback will count it down once it has seen all the // appointed leaders publish themselves as the actual leaders. m_startupLatch = new CountDownLatch(1); writeKnownLiveNodes(m_hostMessenger.getLiveHostIds()); for (int i = 0; i < m_partitionCount; i++) { String dir = LeaderElector.electionDirForPartition(i); // Race along with all of the replicas for this partition to create the ZK parent node try { m_zk.create(dir, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT); } catch (KeeperException.NodeExistsException e) { // expected on all nodes that don't start() first. } m_callbacks[i] = new PartitionCallback(i); Pair<BabySitter, List<String>> sitterstuff = BabySitter.blockingFactory(m_zk, dir, m_callbacks[i], m_es); m_partitionWatchers[i] = sitterstuff.getFirst(); } m_startupLatch.await(); } else { // If we're taking over for a failed LeaderAppointer, we know when // we get here that every partition had a leader at some point in // time. We'll seed each of the PartitionCallbacks for each // partition with the HSID of the last published leader. 
The // blocking startup of the BabySitter watching that partition will // call our callback, get the current full set of replicas, and // appoint a new leader if the seeded one has actually failed Map<Integer, Long> masters = m_iv2masters.pointInTimeCache();"LeaderAppointer repairing with master set: " + masters); for (Entry<Integer, Long> master : masters.entrySet()) { int partId = master.getKey(); String dir = LeaderElector.electionDirForPartition(partId); m_callbacks[partId] = new PartitionCallback(partId, master.getValue()); Pair<BabySitter, List<String>> sitterstuff = BabySitter.blockingFactory(m_zk, dir, m_callbacks[partId], m_es); m_partitionWatchers[partId] = sitterstuff.getFirst(); } // just go ahead and promote our MPI m_MPI.acceptPromotion(); } } private long assignLeader(int partitionId, List<Long> children) { // We used masterHostId = -1 as a way to force the leader choice to be // the first replica in the list, if we don't have some other mechanism // which has successfully overridden it. int masterHostId = -1; if (m_state.get() == AppointerState.CLUSTER_START) { try { // find master in topo JSONArray parts = m_topo.getJSONArray("partitions"); for (int p = 0; p < parts.length(); p++) { JSONObject aPartition = parts.getJSONObject(p); int pid = aPartition.getInt("partition_id"); if (pid == partitionId) { masterHostId = aPartition.getInt("master"); } } } catch (JSONException jse) { tmLog.error("Failed to find master for partition " + partitionId + ", defaulting to 0"); jse.printStackTrace(); masterHostId = -1; // stupid default } } else { // For now, if we're appointing a new leader as a result of a // failure, just pick the first replica in the children list. // Could eventually do something more complex here to try to keep a // semi-balance, but it's unclear that this has much utility until // we add rebalancing on rejoin as well. 
masterHostId = -1; } long masterHSId = children.get(0); for (Long child : children) { if (CoreUtils.getHostIdFromHSId(child) == masterHostId) { masterHSId = child; break; } } "Appointing HSId " + CoreUtils.hsIdToString(masterHSId) + " as leader for partition " + partitionId); try { m_iv2appointees.put(partitionId, masterHSId); } catch (Exception e) { VoltDB.crashLocalVoltDB("Unable to appoint new master for partition " + partitionId, true, e); } return masterHSId; } private void writeKnownLiveNodes(List<Integer> liveNodes) { try { if (m_zk.exists(VoltZK.lastKnownLiveNodes, null) == null) { // VoltZK.createPersistentZKNodes should have done this m_zk.create(VoltZK.lastKnownLiveNodes, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT); } JSONStringer stringer = new JSONStringer(); stringer.object(); stringer.key("liveNodes").array(); for (Integer node : liveNodes) { stringer.value(node); } stringer.endArray(); stringer.endObject(); JSONObject obj = new JSONObject(stringer.toString()); tmLog.debug("Writing live nodes to ZK: " + obj.toString(4)); m_zk.setData(VoltZK.lastKnownLiveNodes, obj.toString(4).getBytes("UTF-8"), -1); } catch (Exception e) { VoltDB.crashLocalVoltDB( "Unable to update known live nodes at ZK path: " + VoltZK.lastKnownLiveNodes, true, e); } } private Set<Integer> readPriorKnownLiveNodes() { Set<Integer> nodes = new HashSet<Integer>(); try { byte[] data = m_zk.getData(VoltZK.lastKnownLiveNodes, false, null); String jsonString = new String(data, "UTF-8"); tmLog.debug("Read prior known live nodes: " + jsonString); JSONObject jsObj = new JSONObject(jsonString); JSONArray jsonNodes = jsObj.getJSONArray("liveNodes"); for (int ii = 0; ii < jsonNodes.length(); ii++) { nodes.add(jsonNodes.getInt(ii)); } } catch (Exception e) { VoltDB.crashLocalVoltDB( "Unable to read prior known live nodes at ZK path: " + VoltZK.lastKnownLiveNodes, true, e); } return nodes; } /** * Given a set of the known host IDs before a fault, and the known host IDs in the post-fault * 
cluster, determine whether or not we think a network partition may have happened. NOTE: this * assumes that we have already done the k-safety validation for every partition and already * failed if we weren't a viable cluster. ALSO NOTE: not private so it may be unit-tested. */ static boolean makePPDDecision(Set<Integer> previousHosts, Set<Integer> currentHosts) { // Real partition detection stuff would go here // find the lowest hostId between the still-alive hosts and the // failed hosts. Which set contains the lowest hostId? int blessedHostId = Integer.MAX_VALUE; boolean blessedHostIdInFailedSet = true; // This should be all the pre-partition hosts IDs. Any new host IDs // (say, if this was triggered by rejoin), will be greater than any surviving // host ID, so don't worry about including it in this search. for (Integer hostId : previousHosts) { if (hostId < blessedHostId) { blessedHostId = hostId; } } for (Integer hostId : currentHosts) { if (hostId.equals(blessedHostId)) { blessedHostId = hostId; blessedHostIdInFailedSet = false; } } // Evaluate PPD triggers. boolean partitionDetectionTriggered = false; // Exact 50-50 splits. The set with the lowest survivor host doesn't trigger PPD // If the blessed host is in the failure set, this set is not blessed. if (currentHosts.size() * 2 == previousHosts.size()) { if (blessedHostIdInFailedSet) { "Partition detection triggered for 50/50 cluster failure. " + "This survivor set is shutting down."); partitionDetectionTriggered = true; } else { "Partition detected for 50/50 failure. " + "This survivor set is continuing execution."); } } // A strict, viable minority is always a partition. if (currentHosts.size() * 2 < previousHosts.size()) { "Partition detection triggered. 
" + "This minority survivor set is shutting down."); partitionDetectionTriggered = true; } return partitionDetectionTriggered; } private void doPartitionDetectionActivities() { // We should never re-enter here once we've decided we're partitioned and doomed assert (!m_partitionDetected); // After everything is resolved, write the new surviving set to ZK List<Integer> currentNodes = null; try { currentNodes = m_hostMessenger.getLiveHostIds(); } catch (Exception e) { } Set<Integer> currentHosts = new HashSet<Integer>(currentNodes); Set<Integer> previousHosts = readPriorKnownLiveNodes(); boolean partitionDetectionTriggered = makePPDDecision(previousHosts, currentHosts); if (partitionDetectionTriggered) { m_partitionDetected = true; if (m_usingCommandLog) { // Just shut down immediately VoltDB.crashGlobalVoltDB( "Use of command logging detected, no additional database snapshot will " + "be generated. Please use the 'recover' action to restore the database if necessary.", false, null); } else { SnapshotUtil.requestSnapshot( 0L, m_partSnapshotSchedule.getPath(), m_partSnapshotSchedule.getPrefix() + System.currentTimeMillis(), true, SnapshotFormat.NATIVE, null, m_snapshotHandler, true); } } // If the cluster host set has changed, then write the new set to ZK // NOTE: we don't want to update the known live nodes if we've decided that our subcluster is // dying, otherwise a poorly timed subsequent failure might reverse this decision. Any future // promoted // LeaderAppointer should make their partition detection decision based on the pre-partition // cluster state. 
else if (!currentHosts.equals(previousHosts)) { writeKnownLiveNodes(currentNodes); } } private boolean isClusterKSafe() { boolean retval = true; for (int i = 0; i < m_partitionCount; i++) { String dir = LeaderElector.electionDirForPartition(i); try { List<String> replicas = m_zk.getChildren(dir, null, null); if (replicas.isEmpty()) { tmLog.fatal("K-Safety violation: No replicas found for partition: " + i); retval = false; } } catch (Exception e) { VoltDB.crashLocalVoltDB("Unable to read replicas in ZK dir: " + dir, true, e); } } return retval; } public void onReplayCompletion() { m_replayComplete.set(true); } public void shutdown() { try { m_iv2appointees.shutdown(); m_iv2masters.shutdown(); for (BabySitter watcher : m_partitionWatchers) { watcher.shutdown(); } } catch (Exception e) { // don't care, we're going down } } }
@Override public void run(List<String> children) { List<Long> updatedHSIds = VoltZK.childrenToReplicaHSIds(children); // compute previously unseen HSId set in the callback list Set<Long> newHSIds = new HashSet<Long>(updatedHSIds); newHSIds.removeAll(m_replicas); tmLog.debug("Newly seen replicas: " + CoreUtils.hsIdCollectionToString(newHSIds)); // compute previously seen but now vanished from the callback list HSId set Set<Long> missingHSIds = new HashSet<Long>(m_replicas); missingHSIds.removeAll(updatedHSIds); tmLog.debug("Newly dead replicas: " + CoreUtils.hsIdCollectionToString(missingHSIds)); tmLog.debug( "Handling babysitter callback for partition " + m_partitionId + ": children: " + CoreUtils.hsIdCollectionToString(updatedHSIds)); if (m_state.get() == AppointerState.CLUSTER_START) { // We can't yet tolerate a host failure during startup. Crash it all if (missingHSIds.size() > 0) { VoltDB.crashGlobalVoltDB("Node failure detected during startup.", false, null); } // ENG-3166: Eventually we would like to get rid of the extra replicas beyond k_factor, // but for now we just look to see how many replicas of this partition we actually expect // and gate leader assignment on that many copies showing up. int replicaCount = m_kfactor + 1; JSONArray parts; try { parts = m_topo.getJSONArray("partitions"); for (int p = 0; p < parts.length(); p++) { JSONObject aPartition = parts.getJSONObject(p); int pid = aPartition.getInt("partition_id"); if (pid == m_partitionId) { replicaCount = aPartition.getJSONArray("replicas").length(); } } } catch (JSONException e) { // Ignore and just assume the normal number of replicas } if (children.size() == replicaCount) { m_currentLeader = assignLeader(m_partitionId, updatedHSIds); } else { "Waiting on " + ((m_kfactor + 1) - children.size()) + " more nodes " + "for k-safety before startup"); } } else { // Check for k-safety if (!isClusterKSafe()) { VoltDB.crashGlobalVoltDB( "Some partitions have no replicas. 
Cluster has become unviable.", false, null); } // Check if replay has completed if (m_replayComplete.get() == false) { VoltDB.crashGlobalVoltDB( "Detected node failure during command log replay. Cluster will shut down.", false, null); } // Check to see if there's been a possible network partition and we're not already handling // it if (m_partitionDetectionEnabled && !m_partitionDetected) { doPartitionDetectionActivities(); } // If we survived the above gauntlet of fail, appoint a new leader for this partition. if (missingHSIds.contains(m_currentLeader)) { m_currentLeader = assignLeader(m_partitionId, updatedHSIds); } } m_replicas.clear(); m_replicas.addAll(updatedHSIds); }
/** Export data from a single catalog version and database instance. */ public class ExportGeneration { /** Processors also log using this facility. */ private static final VoltLogger exportLog = new VoltLogger("EXPORT"); public Long m_timestamp; public final File m_directory; private String m_leadersZKPath; private String m_mailboxesZKPath; /** * Data sources, one per table per site, provide the interface to poll() and ack() Export data * from the execution engines. Data sources are configured by the Export manager at initialization * time. partitionid : <tableid : datasource>. */ public final HashMap<Integer, HashMap<String, ExportDataSource>> m_dataSourcesByPartition = new HashMap<Integer, HashMap<String, ExportDataSource>>(); private int m_numSources = 0; private final AtomicInteger m_drainedSources = new AtomicInteger(0); private final Runnable m_onAllSourcesDrained; private final Runnable m_onSourceDrained = new Runnable() { @Override public void run() { int numSourcesDrained = m_drainedSources.incrementAndGet(); "Drained source in generation " + m_timestamp + " with " + numSourcesDrained + " of " + m_numSources + " drained"); if (numSourcesDrained == m_numSources) { if (m_partitionLeaderZKName.isEmpty()) {; } else { ListenableFuture<?> removeLeadership = m_childUpdatingThread.submit( new Runnable() { @Override public void run() { for (Map.Entry<Integer, String> entry : m_partitionLeaderZKName.entrySet()) { m_zk.delete( m_leadersZKPath + "/" + entry.getKey() + "/" + entry.getValue(), -1, new AsyncCallback.VoidCallback() { @Override public void processResult(int rc, String path, Object ctx) { KeeperException.Code code = KeeperException.Code.get(rc); if (code != KeeperException.Code.OK) { VoltDB.crashLocalVoltDB( "Error in export leader election giving up leadership of " + path, true, KeeperException.create(code)); } } }, null); } } }, null); removeLeadership.addListener( m_onAllSourcesDrained, MoreExecutors.sameThreadExecutor()); } ; } } }; private Mailbox 
m_mbox; private ZooKeeper m_zk; private volatile boolean shutdown = false; private static final ListeningExecutorService m_childUpdatingThread = CoreUtils.getListeningExecutorService("Export ZK Watcher", 1); private final Map<Integer, String> m_partitionLeaderZKName = new HashMap<Integer, String>(); private final Set<Integer> m_partitionsIKnowIAmTheLeader = new HashSet<Integer>(); /* * Set to true if this export generation was initialized from disk * instead of being fed data from the current live system */ private boolean m_diskBased = false; /** * Constructor to create a new generation of export data * * @param exportOverflowDirectory * @throws IOException */ public ExportGeneration(long txnId, Runnable onAllSourcesDrained, File exportOverflowDirectory) throws IOException { m_onAllSourcesDrained = onAllSourcesDrained; m_timestamp = txnId; m_directory = new File(exportOverflowDirectory, Long.toString(txnId)); if (!m_directory.mkdirs()) { throw new IOException("Could not create " + m_directory); }"Creating new export generation " + m_timestamp); } /** * Constructor to create a generation based on one that has been persisted to disk * * @param generationDirectory * @param generationTimestamp * @throws IOException */ public ExportGeneration(Runnable onAllSourcesDrained, File generationDirectory) throws IOException { m_onAllSourcesDrained = onAllSourcesDrained; m_directory = generationDirectory; } public boolean isDiskBased() { return m_diskBased; } boolean initializeGenerationFromDisk(final Connector conn, HostMessenger messenger) { m_diskBased = true; Set<Integer> partitions = new HashSet<Integer>(); /* * Find all the advertisements. Once one is found, extract the nonce * and check for any data files related to the advertisement. If no data files * exist ignore the advertisement. 
*/ boolean hadValidAd = false; for (File f : m_directory.listFiles()) { if (f.getName().endsWith(".ad")) { boolean haveDataFiles = false; String nonce = f.getName().substring(0, f.getName().length() - 3); for (File dataFile : m_directory.listFiles()) { if (dataFile.getName().startsWith(nonce) && !dataFile.getName().equals(f.getName())) { haveDataFiles = true; break; } } if (haveDataFiles) { try { addDataSource(f, partitions); hadValidAd = true; } catch (IOException e) { VoltDB.crashLocalVoltDB("Error intializing export datasource " + f, true, e); } } else { // Delete ads that have no data f.delete(); } } } createAndRegisterAckMailboxes(partitions, messenger);"Restoring export generation " + m_timestamp); return hadValidAd; } /* * Run a leader election for every partition to determine who will * start consuming the export data. * */ public void kickOffLeaderElection() { m_childUpdatingThread.submit( new Runnable() { @Override public void run() { try { /* * The path where leaders will register for this generation */ m_leadersZKPath = VoltZK.exportGenerations + "/" + m_timestamp + "/" + "leaders"; /* * Create a directory for each partition */ for (Integer partition : m_dataSourcesByPartition.keySet()) { ZKUtil.asyncMkdirs(m_zk, m_leadersZKPath + "/" + partition); } /* * Queue the creation of our ephemeral sequential and then queue * a task to retrieve the children to find the result of the election */ List<ZKUtil.ChildrenCallback> callbacks = new ArrayList<ZKUtil.ChildrenCallback>(); for (final Integer partition : m_dataSourcesByPartition.keySet()) { m_zk.create( m_leadersZKPath + "/" + partition + "/leader", null, Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL_SEQUENTIAL, new org.apache.zookeeper_voltpatches.AsyncCallback.StringCallback() { @Override public void processResult(int rc, String path, Object ctx, String name) { KeeperException.Code code = KeeperException.Code.get(rc); if (code != KeeperException.Code.OK) { VoltDB.crashLocalVoltDB( "Error in export leader 
election", true, KeeperException.create(code)); } String splitName[] = name.split("/"); m_partitionLeaderZKName.put(partition, splitName[splitName.length - 1]); } }, null); ZKUtil.ChildrenCallback cb = new ZKUtil.ChildrenCallback(); callbacks.add(cb); m_zk.getChildren( m_leadersZKPath + "/" + partition, constructLeaderChildWatcher(partition), cb, null); } /* * Process the result of the per partition elections. * No worries about ordering with the watcher because the watcher tasks * all get funneled through this thread */ Iterator<ZKUtil.ChildrenCallback> iter = callbacks.iterator(); for (Integer partition : m_dataSourcesByPartition.keySet()) { ZKUtil.ChildrenCallback cb =; handleLeaderChildrenUpdate(partition, cb.getChildren()); } } catch (Throwable t) { VoltDB.crashLocalVoltDB("Error in export leader election", true, t); } } }); } private Watcher constructLeaderChildWatcher(final Integer partition) { return new Watcher() { @Override public void process(final WatchedEvent event) { final Runnable processRunnable = new Runnable() { @Override public void run() { if (m_drainedSources.get() == m_numSources) { return; } final AsyncCallback.ChildrenCallback childrenCallback = new org.apache.zookeeper_voltpatches.AsyncCallback.ChildrenCallback() { @Override public void processResult( final int rc, final String path, Object ctx, final List<String> children) { KeeperException.Code code = KeeperException.Code.get(rc); if (code != KeeperException.Code.OK) { VoltDB.crashLocalVoltDB( "Error in export leader election", true, KeeperException.create(code)); } m_childUpdatingThread.execute( new Runnable() { @Override public void run() { try { handleLeaderChildrenUpdate(partition, children); } catch (Throwable t) { VoltDB.crashLocalVoltDB( "Error in export leader election", true, t); } } }); } }; m_zk.getChildren( m_leadersZKPath + "/" + partition, constructLeaderChildWatcher(partition), childrenCallback, null); } }; m_childUpdatingThread.execute(processRunnable); } }; } private void 
handleLeaderChildrenUpdate(Integer partition, List<String> children) { if (m_drainedSources.get() == m_numSources || children.isEmpty()) { return; } String leader = Collections.min(children); if (m_partitionLeaderZKName.get(partition).equals(leader)) { if (m_partitionsIKnowIAmTheLeader.add(partition)) { for (ExportDataSource eds : m_dataSourcesByPartition.get(partition).values()) { try { eds.acceptMastership(); } catch (Exception e) { exportLog.error("Unable to start exporting", e); } } } } } void initializeGenerationFromCatalog( final Connector conn, int hostId, HostMessenger messenger, List<Pair<Integer, Long>> partitions) { /* * Now create datasources based on the catalog */ Iterator<ConnectorTableInfo> tableInfoIt = conn.getTableinfo().iterator(); // Only populate partitions in use if export is actually happening Set<Integer> partitionsInUse = new HashSet<Integer>(); while (tableInfoIt.hasNext()) { ConnectorTableInfo next =; Table table = next.getTable(); addDataSources(table, hostId, partitions); for (Pair<Integer, Long> p : partitions) { partitionsInUse.add(p.getFirst()); } } createAndRegisterAckMailboxes(partitionsInUse, messenger); } private void createAndRegisterAckMailboxes( final Set<Integer> localPartitions, HostMessenger messenger) { m_zk = messenger.getZK(); m_mailboxesZKPath = VoltZK.exportGenerations + "/" + m_timestamp + "/" + "mailboxes"; m_mbox = new LocalMailbox(messenger) { @Override public void deliver(VoltMessage message) { if (message instanceof BinaryPayloadMessage) { BinaryPayloadMessage bpm = (BinaryPayloadMessage) message; ByteBuffer buf = ByteBuffer.wrap(bpm.m_payload); final int partition = buf.getInt(); final int length = buf.getInt(); byte stringBytes[] = new byte[length]; buf.get(stringBytes); String signature = new String(stringBytes, Constants.UTF8ENCODING); final long ackUSO = buf.getLong(); final HashMap<String, ExportDataSource> partitionSources = m_dataSourcesByPartition.get(partition); if (partitionSources == null) { 
exportLog.error( "Received an export ack for partition " + partition + " which does not exist on this node"); return; } final ExportDataSource eds = partitionSources.get(signature); if (eds == null) { exportLog.error( "Received an export ack for partition " + partition + " source signature " + signature + " which does not exist on this node"); return; } try { eds.ack(ackUSO); } catch (RejectedExecutionException ignoreIt) { // ignore it: as it is already shutdown } } else { exportLog.error("Receive unexpected message " + message + " in export subsystem"); } } }; messenger.createMailbox(null, m_mbox); for (Integer partition : localPartitions) { final String partitionDN = m_mailboxesZKPath + "/" + partition; ZKUtil.asyncMkdirs(m_zk, partitionDN); ZKUtil.StringCallback cb = new ZKUtil.StringCallback(); m_zk.create( partitionDN + "/" + m_mbox.getHSId(), null, Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL, cb, null); } ListenableFuture<?> fut = m_childUpdatingThread.submit( new Runnable() { @Override public void run() { List<Pair<Integer, ZKUtil.ChildrenCallback>> callbacks = new ArrayList<Pair<Integer, ZKUtil.ChildrenCallback>>(); for (Integer partition : localPartitions) { ZKUtil.ChildrenCallback callback = new ZKUtil.ChildrenCallback(); m_zk.getChildren( m_mailboxesZKPath + "/" + partition, constructMailboxChildWatcher(), callback, null); callbacks.add(Pair.of(partition, callback)); } for (Pair<Integer, ZKUtil.ChildrenCallback> p : callbacks) { final Integer partition = p.getFirst(); List<String> children = null; try { children = p.getSecond().getChildren(); } catch (InterruptedException e) { Throwables.propagate(e); } catch (KeeperException e) { Throwables.propagate(e); } ImmutableList.Builder<Long> mailboxes = ImmutableList.builder(); for (String child : children) { if (child.equals(Long.toString(m_mbox.getHSId()))) continue; mailboxes.add(Long.valueOf(child)); } ImmutableList<Long> mailboxHsids =; for (ExportDataSource eds : 
m_dataSourcesByPartition.get(partition).values()) { eds.updateAckMailboxes(Pair.of(m_mbox, mailboxHsids)); } } } }); try { fut.get(); } catch (Throwable t) { Throwables.propagate(t); } } private Watcher constructMailboxChildWatcher() { return new Watcher() { @Override public void process(final WatchedEvent event) { m_childUpdatingThread.submit( new Runnable() { @Override public void run() { try { handleChildUpdate(event); } catch (Throwable t) { VoltDB.crashLocalVoltDB("Error in export ack handling", true, t); } } }); } }; } private void handleChildUpdate(final WatchedEvent event) { m_zk.getChildren( event.getPath(), constructMailboxChildWatcher(), constructChildRetrievalCallback(), null); } private AsyncCallback.ChildrenCallback constructChildRetrievalCallback() { return new AsyncCallback.ChildrenCallback() { @Override public void processResult( final int rc, final String path, Object ctx, final List<String> children) { m_childUpdatingThread.submit( new Runnable() { @Override public void run() { try { if (shutdown) return; KeeperException.Code code = KeeperException.Code.get(rc); if (code != KeeperException.Code.OK) { throw KeeperException.create(code); } final String split[] = path.split("/"); final int partition = Integer.valueOf(split[split.length - 1]); ImmutableList.Builder<Long> mailboxes = ImmutableList.builder(); for (String child : children) { if (child.equals(Long.toString(m_mbox.getHSId()))) continue; mailboxes.add(Long.valueOf(child)); } ImmutableList<Long> mailboxHsids =; for (ExportDataSource eds : m_dataSourcesByPartition.get(partition).values()) { eds.updateAckMailboxes(Pair.of(m_mbox, mailboxHsids)); } } catch (Throwable t) { VoltDB.crashLocalVoltDB("Error in export ack handling", true, t); } } }); } }; } public long getQueuedExportBytes(int partitionId, String signature) { // assert(m_dataSourcesByPartition.containsKey(partitionId)); // assert(m_dataSourcesByPartition.get(partitionId).containsKey(delegateId)); HashMap<String, ExportDataSource> 
sources = m_dataSourcesByPartition.get(partitionId); if (sources == null) { /* * This is fine. If the table is dropped it won't have an entry in the generation created * after the table was dropped. */ // exportLog.error("Could not find export data sources for generation " + // m_timestamp + " partition " // + partitionId); return 0; } ExportDataSource source = sources.get(signature); if (source == null) { /* * This is fine. If the table is dropped it won't have an entry in the generation created * after the table was dropped. */ // exportLog.error("Could not find export data source for generation " + m_timestamp + " // partition " + partitionId + // " signature " + signature); return 0; } return source.sizeInBytes(); } /* * Create a datasource based on an ad file */ private void addDataSource(File adFile, Set<Integer> partitions) throws IOException { m_numSources++; ExportDataSource source = new ExportDataSource(m_onSourceDrained, adFile); partitions.add(source.getPartitionId()); m_timestamp = source.getGeneration(); "Creating ExportDataSource for " + adFile + " table " + source.getTableName() + " signature " + source.getSignature() + " partition id " + source.getPartitionId() + " bytes " + source.sizeInBytes()); HashMap<String, ExportDataSource> dataSourcesForPartition = m_dataSourcesByPartition.get(source.getPartitionId()); if (dataSourcesForPartition == null) { dataSourcesForPartition = new HashMap<String, ExportDataSource>(); m_dataSourcesByPartition.put(source.getPartitionId(), dataSourcesForPartition); } dataSourcesForPartition.put(source.getSignature(), source); } /* * An unfortunate test only method for supplying a mock source */ public void addDataSource(ExportDataSource source) { HashMap<String, ExportDataSource> dataSourcesForPartition = m_dataSourcesByPartition.get(source.getPartitionId()); if (dataSourcesForPartition == null) { dataSourcesForPartition = new HashMap<String, ExportDataSource>(); m_dataSourcesByPartition.put(source.getPartitionId(), 
dataSourcesForPartition); } dataSourcesForPartition.put(source.getSignature(), source); } // silly helper to add datasources for a table catalog object private void addDataSources(Table table, int hostId, List<Pair<Integer, Long>> partitions) { for (Pair<Integer, Long> p : partitions) { Integer partition = p.getFirst(); Long site = p.getSecond(); /* * IOException can occur if there is a problem * with the persistent aspects of the datasource storage */ try { HashMap<String, ExportDataSource> dataSourcesForPartition = m_dataSourcesByPartition.get(partition); if (dataSourcesForPartition == null) { dataSourcesForPartition = new HashMap<String, ExportDataSource>(); m_dataSourcesByPartition.put(partition, dataSourcesForPartition); } ExportDataSource exportDataSource = new ExportDataSource( m_onSourceDrained, "database", table.getTypeName(), partition, site, table.getSignature(), m_timestamp, table.getColumns(), m_directory.getPath()); m_numSources++; "Creating ExportDataSource for table " + table.getTypeName() + " signature " + table.getSignature() + " partition id " + partition); dataSourcesForPartition.put(table.getSignature(), exportDataSource); } catch (IOException e) { VoltDB.crashLocalVoltDB( "Error creating datasources for table " + table.getTypeName() + " host id " + hostId, true, e); } } } public void pushExportBuffer( int partitionId, String signature, long uso, long bufferPtr, ByteBuffer buffer, boolean sync, boolean endOfStream) { // System.out.println("In generation " + m_timestamp + " partition " + partitionId + " // signature " + signature + (buffer == null ? 
" null buffer " : (" buffer length " + // buffer.remaining()))); // for (Integer i : m_dataSourcesByPartition.keySet()) { // System.out.println("Have partition " + i); // } assert (m_dataSourcesByPartition.containsKey(partitionId)); assert (m_dataSourcesByPartition.get(partitionId).containsKey(signature)); HashMap<String, ExportDataSource> sources = m_dataSourcesByPartition.get(partitionId); if (sources == null) { exportLog.error( "Could not find export data sources for partition " + partitionId + " generation " + m_timestamp + " the export data is being discarded"); DBBPool.deleteCharArrayMemory(bufferPtr); return; } ExportDataSource source = sources.get(signature); if (source == null) { exportLog.error( "Could not find export data source for partition " + partitionId + " signature " + signature + " generation " + m_timestamp + " the export data is being discarded"); DBBPool.deleteCharArrayMemory(bufferPtr); return; } source.pushExportBuffer(uso, bufferPtr, buffer, sync, endOfStream); } public void closeAndDelete() throws IOException { List<ListenableFuture<?>> tasks = new ArrayList<ListenableFuture<?>>(); for (HashMap<String, ExportDataSource> map : m_dataSourcesByPartition.values()) { for (ExportDataSource source : map.values()) { tasks.add(source.closeAndDelete()); } } try { Futures.allAsList(tasks).get(); } catch (Exception e) { Throwables.propagateIfPossible(e, IOException.class); } shutdown = true; VoltFile.recursivelyDelete(m_directory); } /* * Returns true if the generatino was completely truncated away */ public boolean truncateExportToTxnId(long txnId, long[] perPartitionTxnIds) { // create an easy partitionId:txnId lookup. HashMap<Integer, Long> partitionToTxnId = new HashMap<Integer, Long>(); for (long tid : perPartitionTxnIds) { partitionToTxnId.put(TxnEgo.getPartitionId(tid), tid); } List<ListenableFuture<?>> tasks = new ArrayList<ListenableFuture<?>>(); // pre-iv2, the truncation point is the snapshot transaction id. 
// In iv2, truncation at the per-partition txn id recorded in the snapshot. for (HashMap<String, ExportDataSource> dataSources : m_dataSourcesByPartition.values()) { for (ExportDataSource source : dataSources.values()) { if (VoltDB.instance().isIV2Enabled()) { Long truncationPoint = partitionToTxnId.get(source.getPartitionId()); if (truncationPoint == null) { exportLog.error( "Snapshot " + txnId + " does not include truncation point for partition " + source.getPartitionId()); } else { tasks.add(source.truncateExportToTxnId(truncationPoint)); } } else { tasks.add(source.truncateExportToTxnId(txnId)); } } } try { Futures.allAsList(tasks).get(); } catch (Exception e) { VoltDB.crashLocalVoltDB( "Unexpected exception truncating export data during snapshot restore. " + "You can back up export overflow data and start the " + "DB without it to get past this error", true, e); } return m_drainedSources.get() == m_numSources; } public void close() { List<ListenableFuture<?>> tasks = new ArrayList<ListenableFuture<?>>(); for (HashMap<String, ExportDataSource> sources : m_dataSourcesByPartition.values()) { for (ExportDataSource source : sources.values()) { tasks.add(source.close()); } } try { Futures.allAsList(tasks).get(); } catch (Exception e) { // Logging of errors is done inside the tasks so nothing to do here // intentionally not failing if there is an issue with close exportLog.error("Error closing export data sources", e); } shutdown = true; } /** * Indicate to all associated {@link ExportDataSource}to assume mastership role for the given * partition id * * @param partitionId */ public void acceptMastershipTask(int partitionId) { HashMap<String, ExportDataSource> partitionDataSourceMap = m_dataSourcesByPartition.get(partitionId); "Export generation " + m_timestamp + " accepting mastership for partition " + partitionId); for (ExportDataSource eds : partitionDataSourceMap.values()) { try { eds.acceptMastership(); } catch (Exception e) { exportLog.error("Unable to start 
exporting", e); } } } @Override public String toString() { return "Export Generation - " + m_timestamp.toString(); } }