/**
 * Seeds the initiator mailbox with the transaction ids handed over during restore.
 *
 * <p>Every id is classified by the partition id encoded in it. An id belonging to this
 * site's partition is passed to {@code setMaxLastSeenTxnId}; an id belonging to the
 * multi-partition initiator ({@code MpInitiator.MP_INIT_PID}) is passed to
 * {@code setMaxLastSeenMultipartTxnId}. Seeing a second id of either kind, or finishing
 * the scan without a multipart id, results in a call to {@code VoltDB.crashLocalVoltDB}.
 *
 * @param perPartitionTxnIds the transaction ids to scan
 */
@Override
public void setPerPartitionTxnIds(long[] perPartitionTxnIds) {
    boolean sawSinglePartId = false;
    boolean sawMultiPartId = false;

    for (final long restoredTxnId : perPartitionTxnIds) {
        // Decode the owning partition once; both checks below look at the same value.
        final int owningPartition = TxnEgo.getPartitionId(restoredTxnId);

        if (owningPartition == m_partitionId) {
            if (sawSinglePartId) {
                VoltDB.crashLocalVoltDB(
                    "Found multiple transactions ids during restore for a partition", false, null);
            }
            sawSinglePartId = true;
            m_initiatorMailbox.setMaxLastSeenTxnId(restoredTxnId);
        }

        // Deliberately not an else-if: the two checks are independent of each other.
        if (owningPartition == MpInitiator.MP_INIT_PID) {
            if (sawMultiPartId) {
                VoltDB.crashLocalVoltDB(
                    "Found multiple transactions ids during restore for a multipart txnid",
                    false,
                    null);
            }
            sawMultiPartId = true;
            m_initiatorMailbox.setMaxLastSeenMultipartTxnId(restoredTxnId);
        }
    }

    if (!sawMultiPartId) {
        VoltDB.crashLocalVoltDB("Didn't find a multipart txnid on restore", false, null);
    }
}
/**
 * Writes a state dump for this site to the host log at WARN level.
 *
 * <p>The dump covers the site identity, partition id and leadership flag, the most recent
 * SP handle, the outstanding transactions, the pending task queue and any duplicate
 * counters. A leader additionally logs its replicas and, when it has hosts to send to,
 * forwards a fresh {@code DumpMessage} to them.
 */
public void handleDumpMessage() {
    final String who = CoreUtils.hsIdToString(m_mailbox.getHSId());

    hostLog.warn("State dump for site: " + who);
    hostLog.warn(who + ": partition: " + m_partitionId + ", isLeader: " + m_isLeader);

    if (m_isLeader) {
        hostLog.warn(who + ": replicas: " + CoreUtils.hsIdCollectionToString(m_replicaHSIds));
        // Ask the replicas to dump their state as well.
        if (m_sendToHSIds.length != 0) {
            m_mailbox.send(m_sendToHSIds, new DumpMessage());
        }
    }

    hostLog.warn(
        who
            + ": most recent SP handle: "
            + getCurrentTxnId()
            + " "
            + TxnEgo.txnIdToString(getCurrentTxnId()));
    hostLog.warn(
        who
            + ": outstanding txns: "
            + m_outstandingTxns.keySet()
            + " "
            + TxnEgo.txnIdCollectionToString(m_outstandingTxns.keySet()));
    hostLog.warn(who + ": TransactionTaskQueue: " + m_pendingTasks.toString());

    if (!m_duplicateCounters.isEmpty()) {
        hostLog.warn(who + ": duplicate counters: ");
        for (Entry<DuplicateCounterKey, DuplicateCounter> counterEntry :
                m_duplicateCounters.entrySet()) {
            hostLog.warn(
                "\t"
                    + who
                    + ": "
                    + counterEntry.getKey().toString()
                    + ": "
                    + counterEntry.getValue().toString());
        }
    }
}
/**
 * Describes this task by its transaction id, SP handle and the HSId of its initiator.
 *
 * @return a single-line description starting with {@code "MpProcedureTask:"}
 */
@Override
public String toString() {
    return new StringBuilder()
        .append("MpProcedureTask:")
        .append(" TXN ID: ")
        .append(TxnEgo.txnIdToString(getTxnId()))
        .append(" SP HANDLE ID: ")
        .append(TxnEgo.txnIdToString(getSpHandle()))
        .append(" ON HSID: ")
        .append(CoreUtils.hsIdToString(m_initiator.getHSId()))
        .toString();
}
/**
 * Describes this task by its transaction id, SP handle, the begin undo token of its
 * transaction state, and the completion message it carries.
 *
 * @return a single-line description starting with {@code "CompleteTransactionTask:"}
 */
@Override
public String toString() {
    final String txnIdText = TxnEgo.txnIdToString(getTxnId());
    final String spHandleText = TxnEgo.txnIdToString(getSpHandle());
    return "CompleteTransactionTask:"
        + " TXN ID: "
        + txnIdText
        + " SP HANDLE: "
        + spHandleText
        + " UNDO TOKEN: "
        + m_txnState.getBeginUndoToken()
        + " MSG: "
        + m_completeMsg.toString();
}
// SpSchedulers will see FragmentTaskMessage for: // - The scatter fragment(s) of a multi-part transaction (normal or sysproc) // - Borrow tasks to do the local fragment work if this partition is the // buddy of the MPI. Borrow tasks may include input dependency tables for // aggregation fragments, or not, if it's a replicated table read. // For multi-batch MP transactions, we'll need to look up the transaction state // that gets created when the first batch arrives. // During command log replay a new SP handle is going to be generated, but it really // doesn't matter, it isn't going to be used for anything. void handleFragmentTaskMessage(FragmentTaskMessage message) { FragmentTaskMessage msg = message; long newSpHandle; if (m_isLeader) { // Quick hack to make progress...we need to copy the FragmentTaskMessage // before we start mucking with its state (SPHANDLE). We need to revisit // all the messaging mess at some point. msg = new FragmentTaskMessage( message.getInitiatorHSId(), message.getCoordinatorHSId(), message); // Not going to use the timestamp from the new Ego because the multi-part timestamp is what // should be used TxnEgo ego = advanceTxnEgo(); newSpHandle = ego.getTxnId(); msg.setSpHandle(newSpHandle); if (msg.getInitiateTask() != null) { msg.getInitiateTask().setSpHandle(newSpHandle); // set the handle msg.setInitiateTask( msg.getInitiateTask()); // Trigger reserialization so the new handle is used } /* * If there a replicas to send it to, forward it! * Unless... it's read only AND not a sysproc. Read only sysprocs may expect to be sent * everywhere. 
* In that case don't propagate it to avoid a determinism check and extra messaging overhead */ if (m_sendToHSIds.length > 0 && (!msg.isReadOnly() || msg.isSysProcTask())) { FragmentTaskMessage replmsg = new FragmentTaskMessage(m_mailbox.getHSId(), m_mailbox.getHSId(), msg); m_mailbox.send(m_sendToHSIds, replmsg); DuplicateCounter counter; /* * Non-determinism should be impossible to happen with MP fragments. * if you see "MP_DETERMINISM_ERROR" as procedure name in the crash logs * something has horribly gone wrong. */ if (message.getFragmentTaskType() != FragmentTaskMessage.SYS_PROC_PER_SITE) { counter = new DuplicateCounter( msg.getCoordinatorHSId(), msg.getTxnId(), m_replicaHSIds, "MP_DETERMINISM_ERROR"); } else { counter = new SysProcDuplicateCounter( msg.getCoordinatorHSId(), msg.getTxnId(), m_replicaHSIds, "MP_DETERMINISM_ERROR"); } m_duplicateCounters.put(new DuplicateCounterKey(msg.getTxnId(), newSpHandle), counter); } } else { newSpHandle = msg.getSpHandle(); setMaxSeenTxnId(newSpHandle); } Iv2Trace.logFragmentTaskMessage(message, m_mailbox.getHSId(), newSpHandle, false); doLocalFragmentOffer(msg); }
/* * Inherit the per partition txnid from the long since gone * partition that existed in the past */ private long[] fetchPerPartitionTxnId() { ZooKeeper zk = VoltDB.instance().getHostMessenger().getZK(); byte partitionTxnIdsBytes[] = null; try { partitionTxnIdsBytes = zk.getData(VoltZK.perPartitionTxnIds, false, null); } catch (KeeperException.NoNodeException e) { return null; } // Can be no node if the cluster was never restored catch (Exception e) { VoltDB.crashLocalVoltDB("Error retrieving per partition txn ids", true, e); } ByteBuffer buf = ByteBuffer.wrap(partitionTxnIdsBytes); int count = buf.getInt(); Long partitionTxnId = null; long partitionTxnIds[] = new long[count]; for (int ii = 0; ii < count; ii++) { long txnId = buf.getLong(); partitionTxnIds[ii] = txnId; int partitionId = TxnEgo.getPartitionId(txnId); if (partitionId == m_partitionId) { partitionTxnId = txnId; continue; } } if (partitionTxnId != null) { return partitionTxnIds; } return null; }
/**
 * Logs the state of the replay sequencer at INFO level.
 *
 * <p>The first line reports the last polled fragment txn id, the last seen txn id, and the
 * MPI-EOL / must-drain flags; it is followed by one line per replay entry.
 *
 * @param hsId HSId used to label every log line
 */
public void dump(long hsId) {
    final String who = CoreUtils.hsIdToString(hsId);
    final String lastPolledText = TxnEgo.txnIdToString(m_lastPolledFragmentTxnId);
    final String lastSeenText = TxnEgo.txnIdToString(m_lastSeenTxnId);
    final String eolMarker = m_mpiEOLReached ? "MPI EOL, " : "";
    final String drainMarker = m_mustDrain ? "MUST DRAIN" : "";

    final String summary =
        String.format(
            "%s: REPLAY SEQUENCER DUMP, LAST POLLED FRAGMENT %d (%s), LAST SEEN TXNID %d (%s), %s%s",
            who,
            m_lastPolledFragmentTxnId,
            lastPolledText,
            m_lastSeenTxnId,
            lastSeenText,
            eolMarker,
            drainMarker);
    tmLog.info(summary);

    for (Entry<Long, ReplayEntry> replay : m_replayEntries.entrySet()) {
        final String entryLine =
            String.format("%s: REPLAY ENTRY %s: %s", who, replay.getKey(), replay.getValue());
        tmLog.info(entryLine);
    }
}
/**
 * Formats a txn id for display, treating {@link Long#MIN_VALUE} as "not set".
 *
 * @param txnId the txn id to format
 * @return {@code "UNUSED"} for {@code Long.MIN_VALUE}, otherwise the result of
 *     {@code TxnEgo.txnIdToString(txnId)}
 */
private static String txnIdToString(long txnId) {
    return txnId == Long.MIN_VALUE ? "UNUSED" : TxnEgo.txnIdToString(txnId);
}
/**
 * Sets up a new promotion algorithm without taking any action to take responsibility.
 *
 * <p>The constructor only stores its arguments and initializes the max seen txn id to the
 * zero txn id of the given partition.
 *
 * @param survivors the surviving sites, stored as given (not copied)
 * @param mailbox the initiator mailbox to keep
 * @param whoami label identifying this instance
 * @param partitionId partition whose zero txn id seeds the max seen txn id
 */
public SpPromoteAlgo(
        List<Long> survivors, InitiatorMailbox mailbox, String whoami, int partitionId) {
    this.m_survivors = survivors;
    this.m_mailbox = mailbox;
    this.m_whoami = whoami;
    this.m_maxSeenTxnId = TxnEgo.makeZero(partitionId).getTxnId();
}
/**
 * Describes this entry: the sentinel txn id (raw and decoded), the number of blocked
 * messages, whether a fragment has been served, and on a second line the first fragment.
 *
 * @return the two-line description
 */
@Override
public String toString() {
    final String sentinelText = TxnEgo.txnIdToString(m_sentinalTxnId);
    final String servedMarker = m_servedFragment ? "SERVED FRAGMENT" : "";
    return String.format(
        "(SENTINEL TXNID: %d (%s), %d BLOCKED MESSAGES, %s)\n%s",
        m_sentinalTxnId,
        sentinelText,
        m_blockedMessages.size(),
        servedMarker,
        m_firstFragment);
}
/**
 * Creates a new execution site and the corresponding EE.
 *
 * <p>The constructor itself only records configuration and registers stats sources; the
 * startup configuration built from the serialized catalog is kept for later, when running
 * in the final thread.
 *
 * @param scheduler task queue for this site
 * @param siteId id of this site
 * @param backend backend target
 * @param context catalog context; its {@code m_timestamp} goes into the startup config
 * @param serializedCatalog serialized catalog kept in the startup config
 * @param txnId not referenced by this constructor
 * @param partitionId partition id; also seeds the last committed txn id and SP handle with
 *     the partition's zero txn id
 * @param numPartitions number of partitions
 * @param startAction start action; decides between the rejoining and running states
 * @param snapshotPriority snapshot priority
 * @param initiatorMailbox initiator mailbox for this site
 * @param agent stats agent used to register the table and index stats, or {@code null} to
 *     skip stats tracking (the MPI doesn't need to track these stats)
 * @param memStats memory stats holder; only kept when {@code agent} is non-null
 */
public Site(
        SiteTaskerQueue scheduler,
        long siteId,
        BackendTarget backend,
        CatalogContext context,
        String serializedCatalog,
        long txnId,
        int partitionId,
        int numPartitions,
        VoltDB.START_ACTION startAction,
        int snapshotPriority,
        InitiatorMailbox initiatorMailbox,
        StatsAgent agent,
        MemoryStats memStats) {
    m_siteId = siteId;
    m_context = context;
    m_partitionId = partitionId;
    m_numberOfPartitions = numPartitions;
    m_scheduler = scheduler;
    m_backend = backend;
    m_startAction = startAction;
    if (VoltDB.createForRejoin(startAction)) {
        m_rejoinState = kStateRejoining;
    } else {
        m_rejoinState = kStateRunning;
    }
    m_snapshotPriority = snapshotPriority;

    // need this later when running in the final thread.
    m_startupConfig = new StartupConfig(serializedCatalog, context.m_timestamp);

    m_lastCommittedTxnId = TxnEgo.makeZero(partitionId).getTxnId();
    m_lastCommittedSpHandle = TxnEgo.makeZero(partitionId).getTxnId();
    m_currentTxnId = Long.MIN_VALUE;
    m_initiatorMailbox = initiatorMailbox;

    if (agent == null) {
        // MPI doesn't need to track these stats
        m_tableStats = null;
        m_indexStats = null;
        m_memStats = null;
    } else {
        m_tableStats = new TableStats(m_siteId);
        agent.registerStatsSource(SysProcSelector.TABLE, m_siteId, m_tableStats);
        m_indexStats = new IndexStats(m_siteId);
        agent.registerStatsSource(SysProcSelector.INDEX, m_siteId, m_indexStats);
        m_memStats = memStats;
    }
}
/**
 * Finishes a transaction's undo work, either rolling it back or committing it.
 *
 * <p>On rollback the EE is asked to undo {@code beginUndoToken} and nothing else changes.
 * On commit the EE releases {@code latestUndoToken} if it is past {@code beginUndoToken},
 * and the last committed txn id and SP handle are updated. If the partition id encoded in
 * the new SP handle differs from the one in the previously committed SP handle,
 * {@code VoltDB.crashLocalVoltDB} is called.
 *
 * @param rollback {@code true} to undo, {@code false} to commit
 * @param beginUndoToken undo token marking the start of the transaction's work
 * @param txnId txn id recorded as last committed on commit
 * @param spHandle SP handle recorded as last committed on commit
 */
@Override
public void truncateUndoLog(boolean rollback, long beginUndoToken, long txnId, long spHandle) {
    if (rollback) {
        m_ee.undoUndoToken(beginUndoToken);
        return;
    }

    assert (latestUndoToken != Site.kInvalidUndoToken);
    assert (latestUndoToken >= beginUndoToken);
    // Equal tokens mean nothing was generated past the begin token, so nothing to release.
    if (latestUndoToken > beginUndoToken) {
        m_ee.releaseUndoToken(latestUndoToken);
    }
    m_lastCommittedTxnId = txnId;

    final int committedPartition = TxnEgo.getPartitionId(m_lastCommittedSpHandle);
    final int incomingPartition = TxnEgo.getPartitionId(spHandle);
    if (committedPartition != incomingPartition) {
        VoltDB.crashLocalVoltDB(
            "Mismatch SpHandle partitiond id " + committedPartition + ", " + incomingPartition,
            true,
            null);
    }
    m_lastCommittedSpHandle = spHandle;
}
// SpScheduler expects to see InitiateTaskMessages corresponding to single-partition // procedures only. public void handleIv2InitiateTaskMessage(Iv2InitiateTaskMessage message) { if (!message.isSinglePartition()) { throw new RuntimeException( "SpScheduler.handleIv2InitiateTaskMessage " + "should never receive multi-partition initiations."); } final String procedureName = message.getStoredProcedureName(); long newSpHandle; long uniqueId = Long.MIN_VALUE; Iv2InitiateTaskMessage msg = message; if (m_isLeader || message.isReadOnly()) { /* * A short circuit read is a read where the client interface is local to * this node. The CI will let a replica perform a read in this case and * it does looser tracking of client handles since it can't be * partitioned from the local replica. */ if (!m_isLeader && CoreUtils.getHostIdFromHSId(msg.getInitiatorHSId()) != CoreUtils.getHostIdFromHSId(m_mailbox.getHSId())) { VoltDB.crashLocalVoltDB("Only allowed to do short circuit reads locally", true, null); } /* * If this is for CL replay or DR, update the unique ID generator */ if (message.isForReplay()) { uniqueId = message.getUniqueId(); try { m_uniqueIdGenerator.updateMostRecentlyGeneratedUniqueId(uniqueId); } catch (Exception e) { hostLog.fatal(e.getMessage()); hostLog.fatal("Invocation: " + message); VoltDB.crashLocalVoltDB(e.getMessage(), true, e); } } else if (message.isForDR()) { uniqueId = message.getStoredProcedureInvocation().getOriginalUniqueId(); // @LoadSinglepartitionTable does not have a valid uid if (UniqueIdGenerator.getPartitionIdFromUniqueId(uniqueId) == m_partitionId) { m_uniqueIdGenerator.updateMostRecentlyGeneratedUniqueId(uniqueId); } } /* * If this is CL replay use the txnid from the CL and also * update the txnid to match the one from the CL */ if (message.isForReplay()) { newSpHandle = message.getTxnId(); setMaxSeenTxnId(newSpHandle); } else if (m_isLeader) { TxnEgo ego = advanceTxnEgo(); newSpHandle = ego.getTxnId(); uniqueId = 
m_uniqueIdGenerator.getNextUniqueId(); } else { /* * The short circuit read case. Since we are not a master * we can't create new transaction IDs, so reuse the last seen * txnid. For a timestamp, might as well give a reasonable one * for a read heavy workload so time isn't bursty. */ uniqueId = UniqueIdGenerator.makeIdFromComponents( Math.max(System.currentTimeMillis(), m_uniqueIdGenerator.lastUsedTime), 0, m_uniqueIdGenerator.partitionId); // Don't think it wise to make a new one for a short circuit read newSpHandle = getCurrentTxnId(); } // Need to set the SP handle on the received message // Need to copy this or the other local sites handling // the same initiate task message will overwrite each // other's memory -- the message isn't copied on delivery // to other local mailboxes. msg = new Iv2InitiateTaskMessage( message.getInitiatorHSId(), message.getCoordinatorHSId(), m_repairLogTruncationHandle, message.getTxnId(), message.getUniqueId(), message.isReadOnly(), message.isSinglePartition(), message.getStoredProcedureInvocation(), message.getClientInterfaceHandle(), message.getConnectionId(), message.isForReplay()); msg.setSpHandle(newSpHandle); // Also, if this is a vanilla single-part procedure, make the TXNID // be the SpHandle (for now) // Only system procedures are every-site, so we'll check through the SystemProcedureCatalog if (SystemProcedureCatalog.listing.get(procedureName) == null || !SystemProcedureCatalog.listing.get(procedureName).getEverysite()) { msg.setTxnId(newSpHandle); msg.setUniqueId(uniqueId); } // Don't replicate reads, this really assumes that DML validation // is going to be integrated soonish if (m_isLeader && !msg.isReadOnly() && m_sendToHSIds.length > 0) { Iv2InitiateTaskMessage replmsg = new Iv2InitiateTaskMessage( m_mailbox.getHSId(), m_mailbox.getHSId(), m_repairLogTruncationHandle, msg.getTxnId(), msg.getUniqueId(), msg.isReadOnly(), msg.isSinglePartition(), msg.getStoredProcedureInvocation(), msg.getClientInterfaceHandle(), 
msg.getConnectionId(), msg.isForReplay()); // Update the handle in the copy since the constructor doesn't set it replmsg.setSpHandle(newSpHandle); m_mailbox.send(m_sendToHSIds, replmsg); DuplicateCounter counter = new DuplicateCounter( msg.getInitiatorHSId(), msg.getTxnId(), m_replicaHSIds, msg.getStoredProcedureName()); m_duplicateCounters.put(new DuplicateCounterKey(msg.getTxnId(), newSpHandle), counter); } } else { setMaxSeenTxnId(msg.getSpHandle()); newSpHandle = msg.getSpHandle(); uniqueId = msg.getUniqueId(); } Iv2Trace.logIv2InitiateTaskMessage(message, m_mailbox.getHSId(), msg.getTxnId(), newSpHandle); doLocalInitiateOffer(msg); return; }