// SpSchedulers will see FragmentTaskMessage for: // - The scatter fragment(s) of a multi-part transaction (normal or sysproc) // - Borrow tasks to do the local fragment work if this partition is the // buddy of the MPI. Borrow tasks may include input dependency tables for // aggregation fragments, or not, if it's a replicated table read. // For multi-batch MP transactions, we'll need to look up the transaction state // that gets created when the first batch arrives. // During command log replay a new SP handle is going to be generated, but it really // doesn't matter, it isn't going to be used for anything. void handleFragmentTaskMessage(FragmentTaskMessage message) { FragmentTaskMessage msg = message; long newSpHandle; if (m_isLeader) { // Quick hack to make progress...we need to copy the FragmentTaskMessage // before we start mucking with its state (SPHANDLE). We need to revisit // all the messaging mess at some point. msg = new FragmentTaskMessage( message.getInitiatorHSId(), message.getCoordinatorHSId(), message); // Not going to use the timestamp from the new Ego because the multi-part timestamp is what // should be used TxnEgo ego = advanceTxnEgo(); newSpHandle = ego.getTxnId(); msg.setSpHandle(newSpHandle); if (msg.getInitiateTask() != null) { msg.getInitiateTask().setSpHandle(newSpHandle); // set the handle msg.setInitiateTask( msg.getInitiateTask()); // Trigger reserialization so the new handle is used } /* * If there a replicas to send it to, forward it! * Unless... it's read only AND not a sysproc. Read only sysprocs may expect to be sent * everywhere. 
* In that case don't propagate it to avoid a determinism check and extra messaging overhead */ if (m_sendToHSIds.length > 0 && (!msg.isReadOnly() || msg.isSysProcTask())) { FragmentTaskMessage replmsg = new FragmentTaskMessage(m_mailbox.getHSId(), m_mailbox.getHSId(), msg); m_mailbox.send(m_sendToHSIds, replmsg); DuplicateCounter counter; /* * Non-determinism should be impossible to happen with MP fragments. * if you see "MP_DETERMINISM_ERROR" as procedure name in the crash logs * something has horribly gone wrong. */ if (message.getFragmentTaskType() != FragmentTaskMessage.SYS_PROC_PER_SITE) { counter = new DuplicateCounter( msg.getCoordinatorHSId(), msg.getTxnId(), m_replicaHSIds, "MP_DETERMINISM_ERROR"); } else { counter = new SysProcDuplicateCounter( msg.getCoordinatorHSId(), msg.getTxnId(), m_replicaHSIds, "MP_DETERMINISM_ERROR"); } m_duplicateCounters.put(new DuplicateCounterKey(msg.getTxnId(), newSpHandle), counter); } } else { newSpHandle = msg.getSpHandle(); setMaxSeenTxnId(newSpHandle); } Iv2Trace.logFragmentTaskMessage(message, m_mailbox.getHSId(), newSpHandle, false); doLocalFragmentOffer(msg); }
/** * Do the work necessary to turn the FragmentTaskMessage into a TransactionTask which can be * queued to the TransactionTaskQueue. This is reused by both the normal message handling path and * the repair path, and assumes that the caller has dealt with or ensured that the necessary ID, * SpHandles, and replication issues are resolved. */ private void doLocalFragmentOffer(FragmentTaskMessage msg) { TransactionState txn = m_outstandingTxns.get(msg.getTxnId()); boolean logThis = false; // bit of a hack...we will probably not want to create and // offer FragmentTasks for txn ids that don't match if we have // something in progress already if (txn == null) { txn = new ParticipantTransactionState(msg.getSpHandle(), msg); m_outstandingTxns.put(msg.getTxnId(), txn); // Only want to send things to the command log if it satisfies this predicate // AND we've never seen anything for this transaction before. We can't // actually log until we create a TransactionTask, though, so just keep track // of whether it needs to be done. logThis = (msg.getInitiateTask() != null && !msg.getInitiateTask().isReadOnly()); } // Check to see if this is the final task for this txn, and if so, if we can close it out early // Right now, this just means read-only. // NOTE: this overlaps slightly with CompleteTransactionMessage handling completion. 
It's so // tiny // that for now, meh, but if this scope grows then it should get refactored out if (msg.isFinalTask() && txn.isReadOnly()) { m_outstandingTxns.remove(msg.getTxnId()); } TransactionTask task; if (msg.isSysProcTask()) { task = new SysprocFragmentTask( m_mailbox, (ParticipantTransactionState) txn, m_pendingTasks, msg, null); } else { task = new FragmentTask(m_mailbox, (ParticipantTransactionState) txn, m_pendingTasks, msg, null); } if (logThis) { if (!m_cl.log(msg.getInitiateTask(), msg.getSpHandle(), m_durabilityListener, task)) { m_pendingTasks.offer(task); } else { /* Getting here means that the task is the first fragment of an MP txn and * synchronous command logging is on, so create a backlog for future tasks of * this MP arrived before it's marked durable. * * This is important for synchronous command logging and MP txn restart. Without * this, a restarted MP txn may not be gated by logging of the first fragment. */ assert !m_mpsPendingDurability.containsKey(task.getTxnId()); m_mpsPendingDurability.put(task.getTxnId(), new ArrayDeque<TransactionTask>()); } } else { queueOrOfferMPTask(task); } }
/**
 * Emits a trace line for a received fragment task message. Does nothing unless trace logging
 * is enabled on {@code iv2log}.
 *
 * <p>While tracing, an error is also logged when the message already carries an SP handle
 * (anything other than {@code Long.MIN_VALUE}) that differs from {@code spHandle}.
 *
 * @param ftask the fragment task message that was received
 * @param localHSId HSId of the site logging the message
 * @param spHandle SP handle held locally for this fragment
 * @param borrow {@code true} to label the entry as a borrow message rather than a fragment
 */
public static void logFragmentTaskMessage(
        FragmentTaskMessage ftask, long localHSId, long spHandle, boolean borrow) {
    if (!iv2log.isTraceEnabled()) {
        return;
    }

    final String label = borrow ? "rxBrrwMsg" : "rxFragMsg";

    final boolean handleAssigned = ftask.getSpHandle() != Long.MIN_VALUE;
    if (handleAssigned && ftask.getSpHandle() != spHandle) {
        iv2log.error(
                "FragmentTaskMessage SP HANDLE conflict. Message: "
                        + ftask.getSpHandle()
                        + ", locally held: "
                        + spHandle);
    }

    final String receiver = CoreUtils.hsIdToString(localHSId);
    final String sender = CoreUtils.hsIdToString(ftask.m_sourceHSId);
    iv2log.trace(
            String.format(
                    "%s %s from %s txnId %s spHandle %s trunc %s",
                    label,
                    receiver,
                    sender,
                    txnIdToString(ftask.getTxnId()),
                    txnIdToString(spHandle),
                    txnIdToString(ftask.getTruncationHandle())));
}
private void handleFragmentTaskMessageRepair( List<Long> needsRepair, FragmentTaskMessage message) { // set up duplicate counter. expect exactly the responses corresponding // to needsRepair. These may, or may not, include the local site. List<Long> expectedHSIds = new ArrayList<Long>(needsRepair); DuplicateCounter counter = new DuplicateCounter( message.getCoordinatorHSId(), // Assume that the MPI's HSID hasn't changed message.getTxnId(), expectedHSIds, "MP_DETERMINISM_ERROR"); m_duplicateCounters.put( new DuplicateCounterKey(message.getTxnId(), message.getSpHandle()), counter); // is local repair necessary? if (needsRepair.contains(m_mailbox.getHSId())) { // Sanity check that we really need repair. if (m_outstandingTxns.get(message.getTxnId()) != null) { hostLog.warn( "SPI repair attempted to repair a fragment which it has already seen. " + "This shouldn't be possible."); // Not sure what to do in this event. Crash for now throw new RuntimeException("Attempted to repair with a fragment we've already seen."); } needsRepair.remove(m_mailbox.getHSId()); // make a copy because handleIv2 non-repair case does? FragmentTaskMessage localWork = new FragmentTaskMessage( message.getInitiatorHSId(), message.getCoordinatorHSId(), message); doLocalFragmentOffer(localWork); } // is remote repair necessary? if (!needsRepair.isEmpty()) { FragmentTaskMessage replmsg = new FragmentTaskMessage(m_mailbox.getHSId(), m_mailbox.getHSId(), message); m_mailbox.send(com.google.common.primitives.Longs.toArray(needsRepair), replmsg); } }
// Offer a new message to the repair log. This will truncate // the repairLog if the message includes a truncation hint. public void deliver(VoltMessage msg) { if (!m_isLeader && msg instanceof Iv2InitiateTaskMessage) { final Iv2InitiateTaskMessage m = (Iv2InitiateTaskMessage) msg; // We can't repair read only SP transactions. Just don't log them to the repair log. if (m.isReadOnly()) { return; } m_lastSpHandle = m.getSpHandle(); truncate(m.getTruncationHandle(), IS_SP); m_logSP.add(new Item(IS_SP, m, m.getSpHandle(), m.getTxnId())); } else if (msg instanceof FragmentTaskMessage) { final FragmentTaskMessage m = (FragmentTaskMessage) msg; // We can't repair read only SP transactions. Just don't log them to the repair log. if (m.isReadOnly()) { return; } truncate(m.getTruncationHandle(), IS_MP); // only log the first fragment of a procedure (and handle 1st case) if (m.getTxnId() > m_lastMpHandle || m_lastMpHandle == Long.MAX_VALUE) { m_logMP.add(new Item(IS_MP, m, m.getSpHandle(), m.getTxnId())); m_lastMpHandle = m.getTxnId(); m_lastSpHandle = m.getSpHandle(); } } else if (msg instanceof CompleteTransactionMessage) { // a CompleteTransactionMessage which indicates restart is not the end of the // transaction. We don't want to log it in the repair log. CompleteTransactionMessage ctm = (CompleteTransactionMessage) msg; // We can't repair read only SP transactions. Just don't log them to the repair log. // Restart transaction do not need to be repaired here, don't log them as well. 
if (ctm.isReadOnly() || ctm.isRestart()) { return; } truncate(ctm.getTruncationHandle(), IS_MP); m_logMP.add(new Item(IS_MP, ctm, ctm.getSpHandle(), ctm.getTxnId())); // Restore will send a complete transaction message with a lower mp transaction id because // the restore transaction precedes the loading of the right mp transaction id from the // snapshot // Hence Math.max m_lastMpHandle = Math.max(m_lastMpHandle, ctm.getTxnId()); m_lastSpHandle = ctm.getSpHandle(); } else if (msg instanceof DumpMessage) { String who = CoreUtils.hsIdToString(m_HSId); tmLog.warn( "Repair log dump for site: " + who + ", isLeader: " + m_isLeader + ", " + who + ": lastSpHandle: " + m_lastSpHandle + ", lastMpHandle: " + m_lastMpHandle); for (Iv2RepairLogResponseMessage il : contents(0l, false)) { tmLog.warn("[Repair log contents]" + who + ": msg: " + il); } } else if (msg instanceof RepairLogTruncationMessage) { final RepairLogTruncationMessage truncateMsg = (RepairLogTruncationMessage) msg; truncate(truncateMsg.getHandle(), IS_SP); } }
void replayFromTaskLog() throws IOException { // not yet time to catch-up. if (m_rejoinState != kStateReplayingRejoin) { return; } // replay 10:1 in favor of replay for (int i = 0; i < 10; ++i) { if (m_rejoinTaskLog.isEmpty()) { break; } TransactionInfoBaseMessage tibm = m_rejoinTaskLog.getNextMessage(); if (tibm == null) { break; } // Apply the readonly / sysproc filter. With Iv2 read optimizations, // reads should not reach here; the cost of post-filtering shouldn't // be particularly high (vs pre-filtering). if (filter(tibm)) { continue; } if (tibm instanceof Iv2InitiateTaskMessage) { Iv2InitiateTaskMessage m = (Iv2InitiateTaskMessage) tibm; SpProcedureTask t = new SpProcedureTask(m_initiatorMailbox, m.getStoredProcedureName(), null, m, null); t.runFromTaskLog(this); } else if (tibm instanceof FragmentTaskMessage) { FragmentTaskMessage m = (FragmentTaskMessage) tibm; if (global_replay_mpTxn == null) { global_replay_mpTxn = new ParticipantTransactionState(m.getTxnId(), m); } else if (global_replay_mpTxn.txnId != m.getTxnId()) { VoltDB.crashLocalVoltDB( "Started a MP transaction during replay before completing " + " open transaction.", false, null); } FragmentTask t = new FragmentTask(m_initiatorMailbox, m, global_replay_mpTxn); t.runFromTaskLog(this); } else if (tibm instanceof CompleteTransactionMessage) { // Needs improvement: completes for sysprocs aren't filterable as sysprocs. // Only complete transactions that are open... if (global_replay_mpTxn != null) { CompleteTransactionMessage m = (CompleteTransactionMessage) tibm; CompleteTransactionTask t = new CompleteTransactionTask(global_replay_mpTxn, null, m, null); if (!m.isRestart()) { global_replay_mpTxn = null; } t.runFromTaskLog(this); } } else { VoltDB.crashLocalVoltDB( "Can not replay message type " + tibm + " during live rejoin. Unexpected error.", false, null); } } // exit replay being careful not to exit in the middle of a multi-partititon // transaction. 
The SPScheduler doesn't have a valid transaction state for a // partially replayed MP txn and in case of rollback the scheduler's undo token // is wrong. Run MP txns fully kStateRejoining or fully kStateRunning. if (m_rejoinTaskLog.isEmpty() && global_replay_mpTxn == null) { setReplayRejoinComplete(); } }