private long assignLeader(int partitionId, List<Long> children) {
    // We used masterHostId = -1 as a way to force the leader choice to be
    // the first replica in the list, if we don't have some other mechanism
    // which has successfully overridden it.
    int masterHostId = -1;
    if (m_state.get() == AppointerState.CLUSTER_START) {
        try {
            // find the master for this partition in the topology
            JSONArray parts = m_topo.getJSONArray("partitions");
            for (int p = 0; p < parts.length(); p++) {
                JSONObject aPartition = parts.getJSONObject(p);
                int pid = aPartition.getInt("partition_id");
                if (pid == partitionId) {
                    masterHostId = aPartition.getInt("master");
                }
            }
        }
        catch (JSONException jse) {
            tmLog.error("Failed to find master for partition " + partitionId +
                    ", defaulting to the first replica", jse);
            masterHostId = -1; // fall back to the first replica in the list
        }
    }
    else {
        // For now, if we're appointing a new leader as a result of a
        // failure, just pick the first replica in the children list.
        // Could eventually do something more complex here to try to keep a
        // semi-balance, but it's unclear that this has much utility until
        // we add rebalancing on rejoin as well.
        masterHostId = -1;
    }

    long masterHSId = children.get(0);
    for (Long child : children) {
        if (CoreUtils.getHostIdFromHSId(child) == masterHostId) {
            masterHSId = child;
            break;
        }
    }
    tmLog.info("Appointing HSId " + CoreUtils.hsIdToString(masterHSId) +
            " as leader for partition " + partitionId);
    try {
        m_iv2appointees.put(partitionId, masterHSId);
    }
    catch (Exception e) {
        VoltDB.crashLocalVoltDB("Unable to appoint new master for partition " + partitionId,
                true, e);
    }
    return masterHSId;
}
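
// Illustrative sketch only (not part of the original class): assignLeader() above is
// assumed to parse a topology JSON shaped roughly like the object built here. The key
// names "partitions", "partition_id", "master", and "replicas" match the keys read
// above; the host ids, single-partition layout, and the helper name exampleTopology
// are hypothetical and exist purely for illustration. Assumes the same JSON classes
// (JSONObject / JSONArray / JSONException) already imported by this file.
private static JSONObject exampleTopology() throws JSONException {
    JSONObject topo = new JSONObject();
    JSONArray partitions = new JSONArray();

    JSONObject p0 = new JSONObject();
    p0.put("partition_id", 0);
    p0.put("master", 1);          // host id expected to lead partition 0 at cluster start
    JSONArray replicas = new JSONArray();
    replicas.put(0);              // host ids hosting a copy of partition 0
    replicas.put(1);
    p0.put("replicas", replicas);

    partitions.put(p0);
    topo.put("partitions", partitions);
    return topo;
}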
@Override
public void run(List<String> children) {
    List<Long> updatedHSIds = VoltZK.childrenToReplicaHSIds(children);
    // compute HSIds in the callback list that we haven't seen before
    Set<Long> newHSIds = new HashSet<Long>(updatedHSIds);
    newHSIds.removeAll(m_replicas);
    tmLog.debug("Newly seen replicas: " + CoreUtils.hsIdCollectionToString(newHSIds));
    // compute HSIds we had seen before that have vanished from the callback list
    Set<Long> missingHSIds = new HashSet<Long>(m_replicas);
    missingHSIds.removeAll(updatedHSIds);
    tmLog.debug("Newly dead replicas: " + CoreUtils.hsIdCollectionToString(missingHSIds));

    tmLog.debug("Handling babysitter callback for partition " + m_partitionId +
            ": children: " + CoreUtils.hsIdCollectionToString(updatedHSIds));
    if (m_state.get() == AppointerState.CLUSTER_START) {
        // We can't yet tolerate a host failure during startup.  Crash it all
        if (missingHSIds.size() > 0) {
            VoltDB.crashGlobalVoltDB("Node failure detected during startup.", false, null);
        }
        // ENG-3166: Eventually we would like to get rid of the extra replicas beyond k_factor,
        // but for now we just look to see how many replicas of this partition we actually expect
        // and gate leader assignment on that many copies showing up.
        int replicaCount = m_kfactor + 1;
        JSONArray parts;
        try {
            parts = m_topo.getJSONArray("partitions");
            for (int p = 0; p < parts.length(); p++) {
                JSONObject aPartition = parts.getJSONObject(p);
                int pid = aPartition.getInt("partition_id");
                if (pid == m_partitionId) {
                    replicaCount = aPartition.getJSONArray("replicas").length();
                }
            }
        }
        catch (JSONException e) {
            // If the topology can't be parsed, fall back to the k-factor default above
        }
        if (children.size() == replicaCount) {
            m_currentLeader = assignLeader(m_partitionId, updatedHSIds);
        }
        else {
            tmLog.info("Waiting on " + (replicaCount - children.size()) + " more nodes " +
                    "for k-safety before startup");
        }
    }
    else {
        // Check for k-safety
        if (!isClusterKSafe()) {
            VoltDB.crashGlobalVoltDB("Some partitions have no replicas.  Cluster has become unviable.",
                    false, null);
        }
        // Check if replay has completed
        if (!m_replayComplete.get()) {
            VoltDB.crashGlobalVoltDB("Detected node failure during command log replay. Cluster will shut down.",
                    false, null);
        }
        // Check to see if there's been a possible network partition and we're not already handling it
        if (m_partitionDetectionEnabled && !m_partitionDetected) {
            doPartitionDetectionActivities();
        }
        // If we survived the above gauntlet of fail, appoint a new leader for this partition.
        if (missingHSIds.contains(m_currentLeader)) {
            m_currentLeader = assignLeader(m_partitionId, updatedHSIds);
        }
    }
    m_replicas.clear();
    m_replicas.addAll(updatedHSIds);
}
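
// Illustrative sketch only (assumption, not the original implementation): the
// isClusterKSafe() call above is assumed to boil down to verifying that every known
// partition still has at least one live replica. The method name isClusterKSafeSketch
// and the replicasByPartition parameter are hypothetical stand-ins for whatever
// per-partition state the real class tracks. Assumes java.util.Map and java.util.Set
// are available.
private boolean isClusterKSafeSketch(Map<Integer, Set<Long>> replicasByPartition) {
    for (Map.Entry<Integer, Set<Long>> entry : replicasByPartition.entrySet()) {
        if (entry.getValue().isEmpty()) {
            tmLog.error("Partition " + entry.getKey() + " has no remaining replicas");
            return false;
        }
    }
    return true;
}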