/*
 * Multi-partition/non-replicated fragment with collector and aggregator.
 */
void addFragmentPair(
        int index,
        PlanFragment collectorFragment,
        PlanFragment aggregatorFragment,
        ByteBuffer params) {
    assert (index >= 0);
    assert (index < m_batchSize);
    assert (collectorFragment != null);
    assert (aggregatorFragment != null);
    assert (collectorFragment.getHasdependencies() == false);
    assert (aggregatorFragment.getHasdependencies() == true);

    // Fragments with no dependencies are usually collector fragments that go to all partitions.
    long distributedFragId = CatalogUtil.getUniqueIdForFragment(collectorFragment);
    long localFragId = CatalogUtil.getUniqueIdForFragment(aggregatorFragment);

    // If any fragment is non-transactional, mark the whole local task as such.
    if (aggregatorFragment.getNontransactional() == true) {
        m_localFragsAreNonTransactional = true;
    }
    int outputDepId =
            m_txnState.getNextDependencyId() | DtxnConstants.MULTIPARTITION_DEPENDENCY;
    m_depsForLocalTask[index] = outputDepId;

    // Add local and distributed fragments.
    m_localTask.addFragment(localFragId, m_depsToResume[index], params);
    m_distributedTask.addFragment(distributedFragId, outputDepId, params);
}
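/*
 * Illustrative sketch (not part of the original source): the method above
 * marks each distributed output dependency by OR-ing a flag bit into the
 * dependency id. The standalone class below shows that bit-flag pattern in
 * isolation. The class name and the constant's value are hypothetical; the
 * real DtxnConstants.MULTIPARTITION_DEPENDENCY may use a different bit.
 */
public class DependencyIdFlagSketch {
    // Hypothetical high bit marking a dependency as multi-partition.
    static final int MULTIPARTITION_DEPENDENCY = 0x40000000;

    // Set the marker bit on a plain dependency id.
    static int markMultiPartition(int depId) {
        return depId | MULTIPARTITION_DEPENDENCY;
    }

    // Test whether a dependency id carries the marker bit.
    static boolean isMultiPartition(int depId) {
        return (depId & MULTIPARTITION_DEPENDENCY) != 0;
    }

    // Strip the marker bit to recover the base id.
    static int baseId(int depId) {
        return depId & ~MULTIPARTITION_DEPENDENCY;
    }

    public static void main(String[] args) {
        int depId = markMultiPartition(7);
        // Prints: depId=0x40000007, multiPartition=true, base=7
        System.out.println("depId=0x" + Integer.toHexString(depId)
                + ", multiPartition=" + isMultiPartition(depId)
                + ", base=" + baseId(depId));
    }
}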
@Override
public void createLocalFragmentWork(FragmentTaskMessage task, boolean nonTransactional) {
    if (task.getFragmentCount() <= 0) {
        return;
    }

    WorkUnit w =
            new WorkUnit(
                    m_site.getSiteTracker(),
                    task,
                    task.getAllUnorderedInputDepIds(),
                    m_hsId,
                    m_nonCoordinatingSites,
                    false,
                    m_allowMismatchedResults);
    w.nonTransactional = nonTransactional;

    // Register each of the task's input dependencies as missing so the work
    // unit is held back until all of them arrive.
    for (int i = 0; i < task.getFragmentCount(); i++) {
        ArrayList<Integer> inputDepIds = task.getInputDepIds(i);
        if (inputDepIds == null) {
            continue;
        }
        for (int inputDepId : inputDepIds) {
            if (m_missingDependencies == null) {
                m_missingDependencies = new HashMap<Integer, WorkUnit>();
            }
            assert (!m_missingDependencies.containsKey(inputDepId));
            m_missingDependencies.put(inputDepId, w);
        }
    }

    if (w.allDependenciesSatisfied()) {
        m_readyWorkUnits.add(w);
    }
}
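/*
 * Illustrative sketch (not part of the original source): the pattern above
 * maps every unsatisfied input dependency id to the WorkUnit that needs it,
 * so the unit can be released once the last dependency arrives. The minimal
 * tracker below shows the same idea with hypothetical names (Work, track,
 * satisfy); it is not the actual WorkUnit implementation.
 */
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

public class DependencyTrackerSketch {
    static final class Work {
        final Set<Integer> missing = new HashSet<>();
        boolean ready() { return missing.isEmpty(); }
    }

    private final Map<Integer, Work> waitingOn = new HashMap<>();

    // Register a work unit as blocked on the given input dependency.
    void track(Work w, int inputDepId) {
        w.missing.add(inputDepId);
        waitingOn.put(inputDepId, w);
    }

    // Called when a dependency's result arrives; returns the work unit if
    // it just became runnable, else null.
    Work satisfy(int inputDepId) {
        Work w = waitingOn.remove(inputDepId);
        if (w == null) {
            return null;
        }
        w.missing.remove(inputDepId);
        return w.ready() ? w : null;
    }
}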
private void logToDR(PartitionDRGateway drGateway) {
    // Log the invocation to DR.
    if (drGateway != null
            && !m_txnState.isForReplay()
            && !m_txnState.isReadOnly()
            && !m_completeMsg.isRollback()) {
        FragmentTaskMessage fragment = (FragmentTaskMessage) m_txnState.getNotice();
        Iv2InitiateTaskMessage initiateTask = fragment.getInitiateTask();
        assert (initiateTask != null);
        if (initiateTask == null) {
            hostLog.error(
                    "Unable to log MP transaction to DR because of missing InitiateTaskMessage, "
                            + "fragment: " + fragment.toString());
            // Bail out rather than dereferencing the missing task below.
            return;
        }
        StoredProcedureInvocation invocation =
                initiateTask.getStoredProcedureInvocation().getShallowCopy();
        drGateway.onSuccessfulMPCall(
                m_txnState.m_spHandle,
                m_txnState.txnId,
                m_txnState.uniqueId,
                m_completeMsg.getHash(),
                invocation,
                m_txnState.getResults());
    }
}
public static void logFragmentTaskMessage(
        FragmentTaskMessage ftask, long localHSId, long spHandle, boolean borrow) {
    if (iv2log.isTraceEnabled()) {
        String label = "rxFragMsg";
        if (borrow) {
            label = "rxBrrwMsg";
        }

        if (ftask.getSpHandle() != Long.MIN_VALUE && ftask.getSpHandle() != spHandle) {
            iv2log.error(
                    "FragmentTaskMessage SP HANDLE conflict. Message: "
                            + ftask.getSpHandle() + ", locally held: " + spHandle);
        }

        String logmsg = "%s %s from %s txnId %s spHandle %s trunc %s";
        iv2log.trace(
                String.format(
                        logmsg,
                        label,
                        CoreUtils.hsIdToString(localHSId),
                        CoreUtils.hsIdToString(ftask.m_sourceHSId),
                        txnIdToString(ftask.getTxnId()),
                        txnIdToString(spHandle),
                        txnIdToString(ftask.getTruncationHandle())));
    }
}
/**
 * Do the work necessary to turn the FragmentTaskMessage into a TransactionTask which can be
 * queued to the TransactionTaskQueue. This is reused by both the normal message handling path
 * and the repair path, and assumes that the caller has dealt with or ensured that the
 * necessary ID, SpHandle, and replication issues are resolved.
 */
private void doLocalFragmentOffer(FragmentTaskMessage msg) {
    TransactionState txn = m_outstandingTxns.get(msg.getTxnId());
    boolean logThis = false;
    // Bit of a hack... we probably don't want to create and offer FragmentTasks
    // for txn ids that don't match if we already have something in progress.
    if (txn == null) {
        txn = new ParticipantTransactionState(msg.getSpHandle(), msg);
        m_outstandingTxns.put(msg.getTxnId(), txn);
        // Only send things to the command log if they satisfy this predicate
        // AND we've never seen anything for this transaction before. We can't
        // actually log until we create a TransactionTask, though, so just keep
        // track of whether it needs to be done.
        logThis = (msg.getInitiateTask() != null && !msg.getInitiateTask().isReadOnly());
    }

    // Check to see if this is the final task for this txn, and if so, whether we can close it
    // out early. Right now, this just means read-only.
    // NOTE: this overlaps slightly with CompleteTransactionMessage handling completion. The
    // overlap is tiny enough to live with for now, but if this scope grows it should get
    // refactored out.
    if (msg.isFinalTask() && txn.isReadOnly()) {
        m_outstandingTxns.remove(msg.getTxnId());
    }

    TransactionTask task;
    if (msg.isSysProcTask()) {
        task =
                new SysprocFragmentTask(
                        m_mailbox, (ParticipantTransactionState) txn, m_pendingTasks, msg, null);
    } else {
        task =
                new FragmentTask(
                        m_mailbox, (ParticipantTransactionState) txn, m_pendingTasks, msg, null);
    }

    if (logThis) {
        if (!m_cl.log(msg.getInitiateTask(), msg.getSpHandle(), m_durabilityListener, task)) {
            m_pendingTasks.offer(task);
        } else {
            /* Getting here means that the task is the first fragment of an MP txn and
             * synchronous command logging is on, so create a backlog for future tasks of
             * this MP that arrive before it's marked durable.
             *
             * This is important for synchronous command logging and MP txn restart. Without
             * this, a restarted MP txn may not be gated by logging of the first fragment.
             */
            assert !m_mpsPendingDurability.containsKey(task.getTxnId());
            m_mpsPendingDurability.put(task.getTxnId(), new ArrayDeque<TransactionTask>());
        }
    } else {
        queueOrOfferMPTask(task);
    }
}
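/*
 * Illustrative sketch (not part of the original source): the durability gate
 * created above (m_mpsPendingDurability) backlogs later fragment tasks of an
 * MP transaction until the command log marks the first fragment durable.
 * This standalone class sketches that gate under assumed names (Task, offer,
 * gate, markDurable); it is not the actual scheduler code.
 */
import java.util.ArrayDeque;
import java.util.HashMap;
import java.util.Map;
import java.util.Queue;

public class DurabilityGateSketch {
    interface Task { void run(); }

    private final Map<Long, Queue<Task>> pendingByTxn = new HashMap<>();
    private final Queue<Task> readyQueue = new ArrayDeque<>();

    // Open a gate when the first (logged) fragment of an MP txn arrives.
    void gate(long txnId) {
        pendingByTxn.putIfAbsent(txnId, new ArrayDeque<>());
    }

    // Queue a task: hold it back if its txn is still waiting on durability.
    void offer(long txnId, Task task) {
        Queue<Task> backlog = pendingByTxn.get(txnId);
        if (backlog != null) {
            backlog.add(task);      // first fragment not yet durable
        } else {
            readyQueue.add(task);   // no gate; runnable immediately
        }
    }

    // Durability callback: release everything queued behind the gate.
    void markDurable(long txnId) {
        Queue<Task> backlog = pendingByTxn.remove(txnId);
        if (backlog != null) {
            readyQueue.addAll(backlog);
        }
    }
}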
/*
 * Multi-partition/non-replicated custom fragment with collector and aggregator.
 */
void addCustomFragmentPair(
        int index, byte[] collectorFragment, byte[] aggregatorFragment, ByteBuffer params) {
    assert (index >= 0);
    assert (index < m_batchSize);
    assert (collectorFragment != null);
    assert (aggregatorFragment != null);

    int outputDepId =
            m_txnState.getNextDependencyId() | DtxnConstants.MULTIPARTITION_DEPENDENCY;
    m_depsForLocalTask[index] = outputDepId;

    // Add the aggregator and collector fragments.
    m_localTask.addCustomFragment(m_depsToResume[index], params, aggregatorFragment);
    m_distributedTask.addCustomFragment(outputDepId, params, collectorFragment);
}
/*
 * Replicated custom fragment.
 */
void addCustomFragment(int index, byte[] aggregatorFragment, ByteBuffer params) {
    assert (index >= 0);
    assert (index < m_batchSize);
    assert (aggregatorFragment != null);

    m_depsForLocalTask[index] = -1;
    m_localTask.addCustomFragment(m_depsToResume[index], params, aggregatorFragment);
}
private void processRejoiningFragmentWork(
        FragmentTaskMessage ftask, HashMap<Integer, List<VoltTable>> dependencies) {
    assert (ftask.getFragmentCount() > 0);
    assert (m_rejoinState == RejoinState.REJOINING);

    FragmentResponseMessage response = new FragmentResponseMessage(ftask, m_hsId);
    response.setRecovering(true);
    response.setStatus(FragmentResponseMessage.SUCCESS, null);

    // Log the work done for replay.
    if (!ftask.isReadOnly() && !ftask.isSysProcTask()) {
        assert (m_notice.isReadOnly() == false);
        assert (m_loggedFragments != null);
        m_loggedFragments.appendFragmentTask(ftask);
    }

    // Add a dummy table for each of the expected dependency ids.
    for (int i = 0; i < ftask.getFragmentCount(); i++) {
        response.addDependency(
                ftask.getOutputDepId(i),
                new VoltTable(new VoltTable.ColumnInfo("DUMMY", VoltType.BIGINT)));
    }
    m_mbox.send(response.getDestinationSiteId(), response);

    // If we're not the coordinator, the transaction is read-only,
    // and this was the final task, then we can try to move on after
    // we've finished this work.
    if (!isCoordinator() && isReadOnly() && ftask.isFinalTask()) {
        m_done = true;
    }
}
/*
 * Replicated fragment.
 */
void addFragment(int index, PlanFragment frag, ByteBuffer params) {
    assert (index >= 0);
    assert (index < m_batchSize);
    assert (frag != null);
    assert (frag.getHasdependencies() == false);

    // If any fragment is non-transactional, mark the whole local task as such.
    if (frag.getNontransactional() == true) {
        m_localFragsAreNonTransactional = true;
    }
    long localFragId = CatalogUtil.getUniqueIdForFragment(frag);
    m_depsForLocalTask[index] = -1;

    // Add the local fragment data.
    m_localTask.addFragment(localFragId, m_depsToResume[index], params);
}
void processFragmentWork(
        FragmentTaskMessage ftask, HashMap<Integer, List<VoltTable>> dependencies) {
    assert (ftask.getFragmentCount() > 0);

    FragmentResponseMessage response = m_site.processFragmentTask(this, dependencies, ftask);
    if (response.getStatusCode() != FragmentResponseMessage.SUCCESS) {
        if (m_missingDependencies != null) {
            m_missingDependencies.clear();
        }
        m_readyWorkUnits.clear();

        if (m_isCoordinator) {
            // Throw an exception that unwinds the runtime all the way back to
            // the stored procedure invocation call, triggering undo at that point.
            if (response.getException() != null) {
                throw response.getException();
            } else {
                throw new FragmentFailureException();
            }
        } else {
            m_needsRollback = true;
            m_done = true;
        }
    }

    if (m_isCoordinator && (response.getDestinationSiteId() == response.getExecutorSiteId())) {
        processFragmentResponseDependencies(response);
    } else {
        m_mbox.send(response.getDestinationSiteId(), response);
        // If we're not the coordinator, the transaction is read-only,
        // and this was the final task, then we can try to move on after
        // we've finished this work.
        if (!isCoordinator() && isReadOnly() && ftask.isFinalTask()) {
            m_done = true;
        }
    }
}
@Test
public void testTruncationHandleForwarding() throws IOException {
    long truncPt = 100L;
    Iv2InitiateTaskMessage taskmsg =
            new Iv2InitiateTaskMessage(
                    0,
                    0,
                    truncPt,
                    101L,
                    System.currentTimeMillis(),
                    true,
                    false,
                    new StoredProcedureInvocation(),
                    0,
                    0,
                    false);
    assertEquals(truncPt, taskmsg.getTruncationHandle());

    FragmentTaskMessage localFrag = mock(FragmentTaskMessage.class);
    FragmentTaskMessage remoteFrag = mock(FragmentTaskMessage.class);
    when(remoteFrag.getFragmentCount()).thenReturn(1);

    buddyHSId = 0;
    Mailbox mailbox = mock(Mailbox.class);
    MpTransactionState dut =
            new MpTransactionState(mailbox, taskmsg, allHsids, buddyHSId, false);

    // Create local work and verify that the created local work has the
    // expected truncation point.
    dut.createLocalFragmentWork(localFrag, false);
    verify(dut.m_localWork).setTruncationHandle(truncPt);

    // Same for the participating work.
    dut.createAllParticipatingFragmentWork(remoteFrag);
    verify(dut.m_remoteWork).setTruncationHandle(truncPt);
}
private void handleFragmentTaskMessageRepair(
        List<Long> needsRepair, FragmentTaskMessage message) {
    // Set up the duplicate counter. Expect exactly the responses corresponding
    // to needsRepair. These may, or may not, include the local site.
    List<Long> expectedHSIds = new ArrayList<Long>(needsRepair);
    DuplicateCounter counter =
            new DuplicateCounter(
                    message.getCoordinatorHSId(), // Assume the MPI's HSID hasn't changed
                    message.getTxnId(),
                    expectedHSIds,
                    "MP_DETERMINISM_ERROR");
    m_duplicateCounters.put(
            new DuplicateCounterKey(message.getTxnId(), message.getSpHandle()), counter);

    // Is local repair necessary?
    if (needsRepair.contains(m_mailbox.getHSId())) {
        // Sanity check that we really need repair.
        if (m_outstandingTxns.get(message.getTxnId()) != null) {
            hostLog.warn(
                    "SPI repair attempted to repair a fragment which it has already seen. "
                            + "This shouldn't be possible.");
            // Not sure what to do in this event. Crash for now.
            throw new RuntimeException("Attempted to repair with a fragment we've already seen.");
        }
        needsRepair.remove(m_mailbox.getHSId());
        // Make a copy because the handleIv2 non-repair case does?
        FragmentTaskMessage localWork =
                new FragmentTaskMessage(
                        message.getInitiatorHSId(), message.getCoordinatorHSId(), message);
        doLocalFragmentOffer(localWork);
    }

    // Is remote repair necessary?
    if (!needsRepair.isEmpty()) {
        FragmentTaskMessage replmsg =
                new FragmentTaskMessage(m_mailbox.getHSId(), m_mailbox.getHSId(), message);
        m_mailbox.send(com.google.common.primitives.Longs.toArray(needsRepair), replmsg);
    }
}
// SpSchedulers will see FragmentTaskMessages for:
// - The scatter fragment(s) of a multi-part transaction (normal or sysproc)
// - Borrow tasks to do the local fragment work if this partition is the
//   buddy of the MPI. Borrow tasks may include input dependency tables for
//   aggregation fragments, or not, if it's a replicated table read.
// For multi-batch MP transactions, we'll need to look up the transaction state
// that gets created when the first batch arrives.
// During command log replay a new SP handle is going to be generated, but it
// really doesn't matter; it isn't going to be used for anything.
void handleFragmentTaskMessage(FragmentTaskMessage message) {
    FragmentTaskMessage msg = message;
    long newSpHandle;
    if (m_isLeader) {
        // Quick hack to make progress... we need to copy the FragmentTaskMessage
        // before we start mucking with its state (SPHANDLE). We need to revisit
        // all the messaging mess at some point.
        msg =
                new FragmentTaskMessage(
                        message.getInitiatorHSId(), message.getCoordinatorHSId(), message);
        // Don't use the timestamp from the new TxnEgo; the multi-part timestamp
        // is what should be used.
        TxnEgo ego = advanceTxnEgo();
        newSpHandle = ego.getTxnId();
        msg.setSpHandle(newSpHandle);
        if (msg.getInitiateTask() != null) {
            msg.getInitiateTask().setSpHandle(newSpHandle); // set the handle
            // Trigger reserialization so the new handle is used.
            msg.setInitiateTask(msg.getInitiateTask());
        }

        /*
         * If there are replicas to send it to, forward it!
         * Unless... it's read-only AND not a sysproc. Read-only sysprocs may expect
         * to be sent everywhere. In that case, don't propagate it, to avoid a
         * determinism check and extra messaging overhead.
         */
        if (m_sendToHSIds.length > 0 && (!msg.isReadOnly() || msg.isSysProcTask())) {
            FragmentTaskMessage replmsg =
                    new FragmentTaskMessage(m_mailbox.getHSId(), m_mailbox.getHSId(), msg);
            m_mailbox.send(m_sendToHSIds, replmsg);
            DuplicateCounter counter;
            /*
             * Non-determinism should be impossible with MP fragments. If you see
             * "MP_DETERMINISM_ERROR" as the procedure name in the crash logs,
             * something has gone horribly wrong.
             */
            if (message.getFragmentTaskType() != FragmentTaskMessage.SYS_PROC_PER_SITE) {
                counter =
                        new DuplicateCounter(
                                msg.getCoordinatorHSId(),
                                msg.getTxnId(),
                                m_replicaHSIds,
                                "MP_DETERMINISM_ERROR");
            } else {
                counter =
                        new SysProcDuplicateCounter(
                                msg.getCoordinatorHSId(),
                                msg.getTxnId(),
                                m_replicaHSIds,
                                "MP_DETERMINISM_ERROR");
            }
            m_duplicateCounters.put(new DuplicateCounterKey(msg.getTxnId(), newSpHandle), counter);
        }
    } else {
        newSpHandle = msg.getSpHandle();
        setMaxSeenTxnId(newSpHandle);
    }
    Iv2Trace.logFragmentTaskMessage(message, m_mailbox.getHSId(), newSpHandle, false);
    doLocalFragmentOffer(msg);
}
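/*
 * Illustrative sketch (not part of the original source): a DuplicateCounter,
 * as used above, waits for one response per expected replica and treats
 * divergent results as a determinism failure. The sketch below shows the core
 * bookkeeping under assumed names and an assumed hash-comparison scheme; the
 * real class is considerably richer.
 */
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class DuplicateCounterSketch {
    enum Result { WAITING, DONE, MISMATCH }

    private final Set<Long> expectedHSIds;
    private Long referenceHash; // hash of the first response seen

    DuplicateCounterSketch(List<Long> expectedHSIds) {
        this.expectedHSIds = new HashSet<>(expectedHSIds);
    }

    // Offer one replica's response hash; returns DONE once all expected
    // replicas have answered with matching hashes.
    Result offerResponse(long sourceHSId, long responseHash) {
        if (!expectedHSIds.remove(sourceHSId)) {
            return Result.WAITING; // unexpected or duplicate source; ignore
        }
        if (referenceHash == null) {
            referenceHash = responseHash;
        } else if (referenceHash != responseHash) {
            return Result.MISMATCH; // replicas diverged: "MP_DETERMINISM_ERROR"
        }
        return expectedHSIds.isEmpty() ? Result.DONE : Result.WAITING;
    }
}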
// Offer a new message to the repair log. This will truncate
// the repair log if the message includes a truncation hint.
public void deliver(VoltMessage msg) {
    if (!m_isLeader && msg instanceof Iv2InitiateTaskMessage) {
        final Iv2InitiateTaskMessage m = (Iv2InitiateTaskMessage) msg;
        // We can't repair read-only SP transactions. Just don't log them to the repair log.
        if (m.isReadOnly()) {
            return;
        }

        m_lastSpHandle = m.getSpHandle();
        truncate(m.getTruncationHandle(), IS_SP);
        m_logSP.add(new Item(IS_SP, m, m.getSpHandle(), m.getTxnId()));
    } else if (msg instanceof FragmentTaskMessage) {
        final FragmentTaskMessage m = (FragmentTaskMessage) msg;

        // We can't repair read-only MP transactions. Just don't log them to the repair log.
        if (m.isReadOnly()) {
            return;
        }

        truncate(m.getTruncationHandle(), IS_MP);
        // Only log the first fragment of a procedure (and handle the first-ever case).
        if (m.getTxnId() > m_lastMpHandle || m_lastMpHandle == Long.MAX_VALUE) {
            m_logMP.add(new Item(IS_MP, m, m.getSpHandle(), m.getTxnId()));
            m_lastMpHandle = m.getTxnId();
            m_lastSpHandle = m.getSpHandle();
        }
    } else if (msg instanceof CompleteTransactionMessage) {
        // A CompleteTransactionMessage that indicates a restart is not the end of the
        // transaction. We don't want to log it in the repair log.
        CompleteTransactionMessage ctm = (CompleteTransactionMessage) msg;
        // We can't repair read-only MP transactions, and restarted transactions don't
        // need to be repaired here, so don't log either of them.
        if (ctm.isReadOnly() || ctm.isRestart()) {
            return;
        }

        truncate(ctm.getTruncationHandle(), IS_MP);
        m_logMP.add(new Item(IS_MP, ctm, ctm.getSpHandle(), ctm.getTxnId()));
        // Restore will send a complete transaction message with a lower MP transaction id
        // because the restore transaction precedes the loading of the right MP transaction
        // id from the snapshot. Hence Math.max.
        m_lastMpHandle = Math.max(m_lastMpHandle, ctm.getTxnId());
        m_lastSpHandle = ctm.getSpHandle();
    } else if (msg instanceof DumpMessage) {
        String who = CoreUtils.hsIdToString(m_HSId);
        tmLog.warn(
                "Repair log dump for site: "
                        + who
                        + ", isLeader: "
                        + m_isLeader
                        + ", "
                        + who
                        + ": lastSpHandle: "
                        + m_lastSpHandle
                        + ", lastMpHandle: "
                        + m_lastMpHandle);
        for (Iv2RepairLogResponseMessage il : contents(0L, false)) {
            tmLog.warn("[Repair log contents]" + who + ": msg: " + il);
        }
    } else if (msg instanceof RepairLogTruncationMessage) {
        final RepairLogTruncationMessage truncateMsg = (RepairLogTruncationMessage) msg;
        truncate(truncateMsg.getHandle(), IS_SP);
    }
}
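/*
 * Illustrative sketch (not part of the original source): truncation drops
 * repair-log entries at or below a handle that is known to be durable and
 * replicated everywhere, keeping the log bounded. A minimal version of that
 * idea, with hypothetical names (Item, add, truncate), might look like this.
 */
import java.util.ArrayDeque;
import java.util.Deque;

public class RepairLogSketch {
    static final class Item {
        final long handle;
        Item(long handle) { this.handle = handle; }
    }

    // Entries are appended in handle order, so truncation pops from the front.
    private final Deque<Item> log = new ArrayDeque<>();

    void add(long handle) {
        log.add(new Item(handle));
    }

    // Drop every item whose handle is <= the truncation point.
    void truncate(long truncationHandle) {
        while (!log.isEmpty() && log.peekFirst().handle <= truncationHandle) {
            log.pollFirst();
        }
    }
}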
void replayFromTaskLog() throws IOException {
    // Not yet time to catch up.
    if (m_rejoinState != kStateReplayingRejoin) {
        return;
    }

    // Replay 10:1 in favor of replay.
    for (int i = 0; i < 10; ++i) {
        if (m_rejoinTaskLog.isEmpty()) {
            break;
        }

        TransactionInfoBaseMessage tibm = m_rejoinTaskLog.getNextMessage();
        if (tibm == null) {
            break;
        }

        // Apply the read-only / sysproc filter. With IV2 read optimizations,
        // reads should not reach here; the cost of post-filtering shouldn't
        // be particularly high (vs. pre-filtering).
        if (filter(tibm)) {
            continue;
        }

        if (tibm instanceof Iv2InitiateTaskMessage) {
            Iv2InitiateTaskMessage m = (Iv2InitiateTaskMessage) tibm;
            SpProcedureTask t =
                    new SpProcedureTask(
                            m_initiatorMailbox, m.getStoredProcedureName(), null, m, null);
            t.runFromTaskLog(this);
        } else if (tibm instanceof FragmentTaskMessage) {
            FragmentTaskMessage m = (FragmentTaskMessage) tibm;
            if (global_replay_mpTxn == null) {
                global_replay_mpTxn = new ParticipantTransactionState(m.getTxnId(), m);
            } else if (global_replay_mpTxn.txnId != m.getTxnId()) {
                VoltDB.crashLocalVoltDB(
                        "Started an MP transaction during replay before completing "
                                + "the open transaction.",
                        false,
                        null);
            }

            FragmentTask t = new FragmentTask(m_initiatorMailbox, m, global_replay_mpTxn);
            t.runFromTaskLog(this);
        } else if (tibm instanceof CompleteTransactionMessage) {
            // Needs improvement: completes for sysprocs aren't filterable as sysprocs.
            // Only complete transactions that are open...
            if (global_replay_mpTxn != null) {
                CompleteTransactionMessage m = (CompleteTransactionMessage) tibm;
                CompleteTransactionTask t =
                        new CompleteTransactionTask(global_replay_mpTxn, null, m, null);
                if (!m.isRestart()) {
                    global_replay_mpTxn = null;
                }
                t.runFromTaskLog(this);
            }
        } else {
            VoltDB.crashLocalVoltDB(
                    "Cannot replay message type " + tibm + " during live rejoin. Unexpected error.",
                    false,
                    null);
        }
    }

    // Exit replay, being careful not to exit in the middle of a multi-partition
    // transaction. The SpScheduler doesn't have a valid transaction state for a
    // partially replayed MP txn, and in case of rollback the scheduler's undo
    // token is wrong. Run MP txns fully in kStateRejoining or fully in kStateRunning.
    if (m_rejoinTaskLog.isEmpty() && global_replay_mpTxn == null) {
        setReplayRejoinComplete();
    }
}