/** Log a human-readable dump of this site's state (role, replicas, outstanding work) for debugging. */
public void handleDumpMessage() {
    String who = CoreUtils.hsIdToString(m_mailbox.getHSId());
    hostLog.warn("State dump for site: " + who);
    hostLog.warn("" + who + ": partition: " + m_partitionId + ", isLeader: " + m_isLeader);
    if (m_isLeader) {
        hostLog.warn("" + who + ": replicas: " + CoreUtils.hsIdCollectionToString(m_replicaHSIds));
        if (m_sendToHSIds.length > 0) {
            m_mailbox.send(m_sendToHSIds, new DumpMessage());
        }
    }
    hostLog.warn("" + who + ": most recent SP handle: " + getCurrentTxnId() + " "
            + TxnEgo.txnIdToString(getCurrentTxnId()));
    hostLog.warn("" + who + ": outstanding txns: " + m_outstandingTxns.keySet() + " "
            + TxnEgo.txnIdCollectionToString(m_outstandingTxns.keySet()));
    hostLog.warn("" + who + ": TransactionTaskQueue: " + m_pendingTasks.toString());
    if (m_duplicateCounters.size() > 0) {
        hostLog.warn("" + who + ": duplicate counters: ");
        for (Entry<DuplicateCounterKey, DuplicateCounter> e : m_duplicateCounters.entrySet()) {
            hostLog.warn("\t" + who + ": " + e.getKey().toString() + ": " + e.getValue().toString());
        }
    }
}
/** Trace the topology (leader and replicas) for a partition when IV2 trace logging is enabled. */
public static void logTopology(long leaderHSId, List<Long> replicas, int partitionId) {
    if (iv2log.isTraceEnabled()) {
        String logmsg = "topology partition %d leader %s replicas (%s)";
        iv2log.trace(String.format(logmsg,
                partitionId,
                CoreUtils.hsIdToString(leaderHSId),
                CoreUtils.hsIdCollectionToString(replicas)));
    }
}
/** Populate one statistics row with the partition id, its member sites, and its current leader. */
@Override
protected void updateStatsRow(Object rowKey, Object[] rowValues) {
    long leader;
    List<Long> sites = new ArrayList<Long>();
    if (rowKey.equals(MpInitiator.MP_INIT_PID)) {
        leader = getHSIdForMultiPartitionInitiator();
        sites.add(leader);
    } else {
        leader = m_iv2Masters.pointInTimeCache().get((Integer) rowKey);
        sites.addAll(getReplicasForPartition((Integer) rowKey));
    }
    rowValues[columnNameToIndex.get("Partition")] = rowKey;
    rowValues[columnNameToIndex.get("Sites")] = CoreUtils.hsIdCollectionToString(sites);
    rowValues[columnNameToIndex.get("Leader")] = CoreUtils.hsIdToString(leader);
}
/** Start fixing survivors: setup scoreboard and request repair logs. */
void prepareForFaultRecovery() {
    for (Long hsid : m_survivors) {
        m_replicaRepairStructs.put(hsid, new ReplicaRepairStruct());
    }
    tmLog.info(m_whoami + "found (including self) " + m_survivors.size()
            + " surviving replicas to repair. "
            + " Survivors: " + CoreUtils.hsIdCollectionToString(m_survivors));
    VoltMessage logRequest =
            new Iv2RepairLogRequestMessage(m_requestId, Iv2RepairLogRequestMessage.SPREQUEST);
    m_mailbox.send(com.google.common.primitives.Longs.toArray(m_survivors), logRequest);
}
/** Send missed-messages to survivors. */
public void repairSurvivors() {
    // cancel() and repair() must be synchronized by the caller (the deliver lock,
    // currently). If cancelled and the last repair message arrives, don't send
    // out corrections!
    if (this.m_promotionResult.isCancelled()) {
        tmLog.debug(m_whoami + "Skipping repair message creation for cancelled Term.");
        return;
    }

    int queued = 0;
    tmLog.debug(m_whoami + "received all repair logs and is repairing surviving replicas.");
    for (Iv2RepairLogResponseMessage li : m_repairLogUnion) {
        List<Long> needsRepair = new ArrayList<Long>(5);
        for (Entry<Long, ReplicaRepairStruct> entry : m_replicaRepairStructs.entrySet()) {
            if (entry.getValue().needs(li.getHandle())) {
                ++queued;
                tmLog.debug(m_whoami + "repairing " + CoreUtils.hsIdToString(entry.getKey())
                        + ". Max seen " + entry.getValue().m_maxSpHandleSeen
                        + ". Repairing with " + li.getHandle());
                needsRepair.add(entry.getKey());
            }
        }
        if (!needsRepair.isEmpty()) {
            if (tmLog.isTraceEnabled()) {
                tmLog.trace(m_whoami + "repairing: " + CoreUtils.hsIdCollectionToString(needsRepair)
                        + " with message: " + li.getPayload());
            }
            m_mailbox.repairReplicasWith(needsRepair, li.getPayload());
        }
    }
    tmLog.debug(m_whoami + "finished queuing " + queued + " replica repair messages.");
    m_promotionResult.done(m_maxSeenTxnId);
}
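// The repair pass above is essentially a scoreboard walk: each surviving replica records the
// highest handle it has seen, and every entry in the unioned repair log is re-sent only to the
// replicas whose high-water mark is behind that entry. A minimal, self-contained sketch of the
// same idea follows; LogEntry, ReplicaScore, and Transport are hypothetical stand-ins used for
// illustration only (not VoltDB classes), and the entries are assumed to be ordered by handle.
class RepairScoreboardSketch {
    static final class LogEntry {
        final long handle;
        final Object payload;
        LogEntry(long handle, Object payload) { this.handle = handle; this.payload = payload; }
    }

    static final class ReplicaScore {
        final long maxHandleSeen;
        ReplicaScore(long maxHandleSeen) { this.maxHandleSeen = maxHandleSeen; }
        boolean needs(long handle) { return handle > maxHandleSeen; }
    }

    interface Transport {
        void repairReplicasWith(java.util.List<Long> replicaHSIds, Object payload);
    }

    /** Walk the unioned log and re-send each entry to the replicas that are missing it. */
    static int repair(Iterable<LogEntry> repairLogUnion,
                      java.util.Map<Long, ReplicaScore> replicas,
                      Transport transport) {
        int queued = 0;
        for (LogEntry entry : repairLogUnion) {
            java.util.List<Long> needsRepair = new java.util.ArrayList<Long>();
            for (java.util.Map.Entry<Long, ReplicaScore> e : replicas.entrySet()) {
                if (e.getValue().needs(entry.handle)) {
                    needsRepair.add(e.getKey());
                    queued++;
                }
            }
            if (!needsRepair.isEmpty()) {
                // One send covers every replica that is behind for this entry.
                transport.repairReplicasWith(needsRepair, entry.payload);
            }
        }
        return queued;
    }
}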
/** Babysitter callback: recompute replica membership for this partition and (re)assign a leader as needed. */
@Override
public void run(List<String> children) {
    List<Long> updatedHSIds = VoltZK.childrenToReplicaHSIds(children);
    // compute previously unseen HSId set in the callback list
    Set<Long> newHSIds = new HashSet<Long>(updatedHSIds);
    newHSIds.removeAll(m_replicas);
    tmLog.debug("Newly seen replicas: " + CoreUtils.hsIdCollectionToString(newHSIds));
    // compute previously seen but now vanished from the callback list HSId set
    Set<Long> missingHSIds = new HashSet<Long>(m_replicas);
    missingHSIds.removeAll(updatedHSIds);
    tmLog.debug("Newly dead replicas: " + CoreUtils.hsIdCollectionToString(missingHSIds));

    tmLog.debug("Handling babysitter callback for partition " + m_partitionId
            + ": children: " + CoreUtils.hsIdCollectionToString(updatedHSIds));
    if (m_state.get() == AppointerState.CLUSTER_START) {
        // We can't yet tolerate a host failure during startup. Crash it all
        if (missingHSIds.size() > 0) {
            VoltDB.crashGlobalVoltDB("Node failure detected during startup.", false, null);
        }
        // ENG-3166: Eventually we would like to get rid of the extra replicas beyond k_factor,
        // but for now we just look to see how many replicas of this partition we actually expect
        // and gate leader assignment on that many copies showing up.
        int replicaCount = m_kfactor + 1;
        JSONArray parts;
        try {
            parts = m_topo.getJSONArray("partitions");
            for (int p = 0; p < parts.length(); p++) {
                JSONObject aPartition = parts.getJSONObject(p);
                int pid = aPartition.getInt("partition_id");
                if (pid == m_partitionId) {
                    replicaCount = aPartition.getJSONArray("replicas").length();
                }
            }
        } catch (JSONException e) {
            // Ignore and just assume the normal number of replicas
        }
        if (children.size() == replicaCount) {
            m_currentLeader = assignLeader(m_partitionId, updatedHSIds);
        } else {
            tmLog.info("Waiting on " + ((m_kfactor + 1) - children.size()) + " more nodes "
                    + "for k-safety before startup");
        }
    } else {
        // Check for k-safety
        if (!isClusterKSafe()) {
            VoltDB.crashGlobalVoltDB(
                    "Some partitions have no replicas. Cluster has become unviable.", false, null);
        }
        // Check if replay has completed
        if (m_replayComplete.get() == false) {
            VoltDB.crashGlobalVoltDB(
                    "Detected node failure during command log replay. Cluster will shut down.",
                    false, null);
        }
        // Check to see if there's been a possible network partition and we're not already handling it
        if (m_partitionDetectionEnabled && !m_partitionDetected) {
            doPartitionDetectionActivities();
        }
        // If we survived the above gauntlet of fail, appoint a new leader for this partition.
        if (missingHSIds.contains(m_currentLeader)) {
            m_currentLeader = assignLeader(m_partitionId, updatedHSIds);
        }
    }
    m_replicas.clear();
    m_replicas.addAll(updatedHSIds);
}
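// The membership bookkeeping at the top of that callback is a plain two-way set difference:
// newly seen replicas are the updated set minus the previously known set, and vanished replicas
// are the previous set minus the updated set. A tiny self-contained illustration follows;
// MembershipDiff is a hypothetical helper written for this sketch, not part of VoltDB.
final class MembershipDiff {
    final java.util.Set<Long> added;    // present in the update, not seen before
    final java.util.Set<Long> removed;  // seen before, missing from the update

    MembershipDiff(java.util.Set<Long> previous, java.util.Collection<Long> updated) {
        added = new java.util.HashSet<Long>(updated);
        added.removeAll(previous);
        removed = new java.util.HashSet<Long>(previous);
        removed.removeAll(updated);
    }
}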
/**
 * Build the snapshot write plan for the requested format, hand the deferred setup work to the
 * snapshot IO thread, and publish the per-site task lists.
 */
private void createSetupIv2(
        final String file_path, final String file_nonce, SnapshotFormat format, final long txnId,
        final Map<Integer, Long> partitionTransactionIds, String data,
        final SystemProcedureExecutionContext context, final VoltTable result,
        Map<String, Map<Integer, Pair<Long, Long>>> exportSequenceNumbers,
        SiteTracker tracker, HashinatorSnapshotData hashinatorData, long timestamp) {
    JSONObject jsData = null;
    if (data != null && !data.isEmpty()) {
        try {
            jsData = new JSONObject(data);
        } catch (JSONException e) {
            SNAP_LOG.error(String.format("JSON exception on snapshot data \"%s\".", data), e);
        }
    }

    SnapshotWritePlan plan;
    if (format == SnapshotFormat.NATIVE) {
        plan = new NativeSnapshotWritePlan();
    } else if (format == SnapshotFormat.CSV) {
        plan = new CSVSnapshotWritePlan();
    } else if (format == SnapshotFormat.STREAM) {
        plan = new StreamSnapshotWritePlan();
    } else if (format == SnapshotFormat.INDEX) {
        plan = new IndexSnapshotWritePlan();
    } else {
        throw new RuntimeException("BAD BAD BAD");
    }
    final Callable<Boolean> deferredSetup = plan.createSetup(
            file_path, file_nonce, txnId, partitionTransactionIds, jsData, context, result,
            exportSequenceNumbers, tracker, hashinatorData, timestamp);
    m_deferredSetupFuture = VoltDB.instance().submitSnapshotIOWork(
            new DeferredSnapshotSetup(plan, deferredSetup, txnId, partitionTransactionIds));

    synchronized (m_createLock) {
        // Seems like this should be cleared out just in case
        // Log if there is actually anything to clear since it is unexpected
        if (!m_taskListsForHSIds.isEmpty()) {
            SNAP_LOG.warn("Found lingering snapshot tasks while setting up a snapshot");
        }
        m_taskListsForHSIds.clear();
        m_createSuccess.set(true);
        m_createResult.set(result);
        m_taskListsForHSIds.putAll(plan.getTaskListsForHSIds());

        // HACK HACK HACK. If the task list is empty, this host has no work to do for
        // this snapshot. We're going to create an empty list of tasks for one of the sites to do
        // so that we'll have a SnapshotSiteProcessor which will do the logSnapshotCompleteToZK.
        if (m_taskListsForHSIds.isEmpty()) {
            SNAP_LOG.debug("Node had no snapshot work to do. Creating a null task to drive completion.");
            m_taskListsForHSIds.put(context.getSiteId(), new ArrayDeque<SnapshotTableTask>());
        }
        SNAP_LOG.debug("Planned tasks: "
                + CoreUtils.hsIdCollectionToString(plan.getTaskListsForHSIds().keySet()));
        SNAP_LOG.debug("Created tasks for HSIds: "
                + CoreUtils.hsIdCollectionToString(m_taskListsForHSIds.keySet()));
    }
}
/**
 * The only public method: do all the work to start a snapshot. Assumes that a snapshot is
 * feasible, that the caller has validated it can be accomplished, and that the caller knows this
 * is a consistent or useful transaction point at which to snapshot.
 *
 * @param file_path
 * @param file_nonce
 * @param format
 * @param block
 * @param multiPartTxnId
 * @param partitionTxnId
 * @param legacyPerPartitionTxnIds
 * @param data
 * @param context
 * @param hostname
 * @param hashinatorData
 * @param timestamp
 * @return VoltTable describing the results of the snapshot attempt
 */
public VoltTable startSnapshotting(
        final String file_path, final String file_nonce, final SnapshotFormat format,
        final byte block, final long multiPartTxnId, final long partitionTxnId,
        final long legacyPerPartitionTxnIds[], final String data,
        final SystemProcedureExecutionContext context, final String hostname,
        final HashinatorSnapshotData hashinatorData, final long timestamp) {
    TRACE_LOG.trace("Creating snapshot target and handing to EEs");
    final VoltTable result = SnapshotUtil.constructNodeResultsTable();
    final int numLocalSites =
            context.getCluster().getDeployment().get("deployment").getSitesperhost();

    // One site wins the race to create the snapshot targets, populating
    // m_taskListsForSites for the other sites and creating an appropriate
    // number of snapshot permits.
    synchronized (SnapshotSiteProcessor.m_snapshotCreateLock) {
        SnapshotSiteProcessor.m_snapshotCreateSetupBarrierActualAction.set(new Runnable() {
            @Override
            public void run() {
                Map<Integer, Long> partitionTransactionIds = new HashMap<Integer, Long>();
                partitionTransactionIds = m_partitionLastSeenTransactionIds;
                SNAP_LOG.debug("Last seen partition transaction ids " + partitionTransactionIds);
                m_partitionLastSeenTransactionIds = new HashMap<Integer, Long>();
                partitionTransactionIds.put(TxnEgo.getPartitionId(multiPartTxnId), multiPartTxnId);

                /*
                 * Do a quick sanity check that the provided IDs
                 * don't conflict with currently active partitions. If they do
                 * it isn't fatal we can just skip it.
                 */
                for (long txnId : legacyPerPartitionTxnIds) {
                    final int legacyPartition = TxnEgo.getPartitionId(txnId);
                    if (partitionTransactionIds.containsKey(legacyPartition)) {
                        SNAP_LOG.warn("While saving a snapshot and propagating legacy "
                                + "transaction ids found an id that matches currently active partition"
                                + partitionTransactionIds.get(legacyPartition));
                    } else {
                        partitionTransactionIds.put(legacyPartition, txnId);
                    }
                }

                exportSequenceNumbers = SnapshotSiteProcessor.getExportSequenceNumbers();
                createSetupIv2(
                        file_path, file_nonce, format, multiPartTxnId, partitionTransactionIds,
                        data, context, result, exportSequenceNumbers,
                        context.getSiteTrackerForSnapshot(), hashinatorData, timestamp);
            }
        });

        // Create a barrier to use with the current number of sites to wait for
        // or if the barrier is already set up check if it is broken and reset if necessary
        SnapshotSiteProcessor.readySnapshotSetupBarriers(numLocalSites);

        // From within this EE, record the sequence numbers as of the start of the snapshot (now)
        // so that the info can be put in the digest.
        SnapshotSiteProcessor.populateExportSequenceNumbersForExecutionSite(context);
        SNAP_LOG.debug("Registering transaction id " + partitionTxnId + " for "
                + TxnEgo.getPartitionId(partitionTxnId));
        m_partitionLastSeenTransactionIds.put(TxnEgo.getPartitionId(partitionTxnId), partitionTxnId);
    }

    boolean runPostTasks = false;
    VoltTable earlyResultTable = null;
    try {
        SnapshotSiteProcessor.m_snapshotCreateSetupBarrier.await();
        try {
            synchronized (m_createLock) {
                SNAP_LOG.debug("Found tasks for HSIds: "
                        + CoreUtils.hsIdCollectionToString(m_taskListsForHSIds.keySet()));
                SNAP_LOG.debug("Looking for local HSID: " + CoreUtils.hsIdToString(context.getSiteId()));
                Deque<SnapshotTableTask> taskList = m_taskListsForHSIds.remove(context.getSiteId());
                // If createSetup failed, then the first site to reach here is going
                // to send the results table generated by createSetup, and then empty out the table.
                // All other sites to reach here will send the appropriate empty table.
                // If createSetup was a success but the taskList is null, then we'll use the block
                // switch to figure out what flavor of empty SnapshotSave result table to return.
                if (!m_createSuccess.get()) {
                    // There shouldn't be any work for any site if we failed
                    assert (m_taskListsForHSIds.isEmpty());
                    VoltTable finalresult = m_createResult.get();
                    if (finalresult != null) {
                        m_createResult.set(null);
                        earlyResultTable = finalresult;
                    } else {
                        // We returned a non-empty NodeResultsTable with the failures in it,
                        // every other site needs to return a NodeResultsTable as well.
                        earlyResultTable = SnapshotUtil.constructNodeResultsTable();
                    }
                } else if (taskList == null) {
                    SNAP_LOG.debug("No task for this site, block " + block);
                    // This node is participating in the snapshot but this site has nothing to do.
                    // Send back an appropriate empty table based on the block flag
                    if (block != 0) {
                        runPostTasks = true;
                        earlyResultTable = SnapshotUtil.constructPartitionResultsTable();
                        earlyResultTable.addRow(context.getHostId(), hostname,
                                CoreUtils.getSiteIdFromHSId(context.getSiteId()), "SUCCESS", "");
                    } else {
                        earlyResultTable = SnapshotUtil.constructNodeResultsTable();
                    }
                } else {
                    context.getSiteSnapshotConnection()
                            .initiateSnapshots(format, taskList, multiPartTxnId, exportSequenceNumbers);
                }

                if (m_deferredSetupFuture != null) {
                    // Add a listener to the deferred setup so that it can kick off the snapshot
                    // task once the setup is done.
                    m_deferredSetupFuture.addListener(new Runnable() {
                        @Override
                        public void run() {
                            DeferredSnapshotSetup deferredSnapshotSetup = null;
                            try {
                                deferredSnapshotSetup = m_deferredSetupFuture.get();
                            } catch (Exception e) {
                                // it doesn't throw
                            }
                            assert deferredSnapshotSetup != null;
                            context.getSiteSnapshotConnection().startSnapshotWithTargets(
                                    deferredSnapshotSetup.getPlan().getSnapshotDataTargets());
                        }
                    }, CoreUtils.SAMETHREADEXECUTOR);
                }
            }
        } finally {
            SnapshotSiteProcessor.m_snapshotCreateFinishBarrier.await(120, TimeUnit.SECONDS);
        }
    } catch (TimeoutException e) {
        VoltDB.crashLocalVoltDB(
                "Timed out waiting 120 seconds for all threads to arrive and start snapshot",
                true, null);
    } catch (InterruptedException e) {
        result.addRow(context.getHostId(), hostname, "", "FAILURE", CoreUtils.throwableToString(e));
        earlyResultTable = result;
    } catch (BrokenBarrierException e) {
        result.addRow(context.getHostId(), hostname, "", "FAILURE", CoreUtils.throwableToString(e));
        earlyResultTable = result;
    }

    // If earlyResultTable is set, return here
    if (earlyResultTable != null) {
        if (runPostTasks) {
            // Need to run post-snapshot tasks before finishing
            SnapshotSiteProcessor.runPostSnapshotTasks(context);
        }
        return earlyResultTable;
    }

    if (block != 0) {
        HashSet<Exception> failures = Sets.newHashSet();
        String status = "SUCCESS";
        String err = "";
        try {
            // For a blocking snapshot, propagate the error from deferred setup back to the client
            final DeferredSnapshotSetup deferredSnapshotSetup = m_deferredSetupFuture.get();
            if (deferredSnapshotSetup != null && deferredSnapshotSetup.getError() != null) {
                status = "FAILURE";
                err = deferredSnapshotSetup.getError().toString();
                failures.add(deferredSnapshotSetup.getError());
            }
            failures.addAll(context.getSiteSnapshotConnection().completeSnapshotWork());
            SnapshotSiteProcessor.runPostSnapshotTasks(context);
        } catch (Exception e) {
            status = "FAILURE";
            err = e.toString();
            failures.add(e);
        }

        final VoltTable blockingResult = SnapshotUtil.constructPartitionResultsTable();
        if (failures.isEmpty()) {
            blockingResult.addRow(context.getHostId(), hostname,
                    CoreUtils.getSiteIdFromHSId(context.getSiteId()), status, err);
        } else {
            status = "FAILURE";
            for (Exception e : failures) {
                err = e.toString();
            }
            blockingResult.addRow(context.getHostId(), hostname,
                    CoreUtils.getSiteIdFromHSId(context.getSiteId()), status, err);
        }
        return blockingResult;
    }
    return result;
}
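// startSnapshotting() relies on a "one site wins the setup race" idiom: every local site installs
// the same setup Runnable and then waits on a shared barrier, and the setup runs exactly once
// before any site proceeds to its own task list. The class below is a minimal, hypothetical sketch
// of that coordination pattern using only java.util.concurrent; it illustrates the idiom and is
// not the actual SnapshotSiteProcessor barrier plumbing.
class SetupBarrierSketch {
    private final java.util.concurrent.atomic.AtomicReference<Runnable> setupAction =
            new java.util.concurrent.atomic.AtomicReference<Runnable>();
    private final java.util.concurrent.CyclicBarrier setupBarrier;

    SetupBarrierSketch(int numLocalSites) {
        // The barrier action runs on the last thread to arrive, before any thread is released,
        // so the stored setup work executes exactly once per snapshot round.
        setupBarrier = new java.util.concurrent.CyclicBarrier(numLocalSites, new Runnable() {
            @Override
            public void run() {
                setupAction.getAndSet(null).run();
            }
        });
    }

    void runSnapshotSite(Runnable setupWork, Runnable perSiteWork) throws Exception {
        setupAction.set(setupWork);  // every site installs equivalent work; the last write wins
        setupBarrier.await();        // rendezvous: setup is guaranteed complete past this point
        perSiteWork.run();           // each site then handles its own portion of the snapshot
    }
}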