@Override
public void configure(BackendTarget backend, String serializedCatalog,
                      CatalogContext catalogContext, int kfactor, CatalogSpecificPlanner csp,
                      int numberOfPartitions, VoltDB.START_ACTION startAction,
                      StatsAgent agent, MemoryStats memStats, CommandLog cl,
                      NodeDRGateway nodeDRGateway, String coreBindIds)
    throws KeeperException, InterruptedException, ExecutionException
{
    try {
        m_leaderCache.start(true);
    } catch (Exception e) {
        VoltDB.crashLocalVoltDB("Unable to configure SpInitiator.", true, e);
    }

    // configure DR
    PartitionDRGateway drGateway =
        PartitionDRGateway.getInstance(m_partitionId, nodeDRGateway,
                                       true, VoltDB.createForRejoin(startAction));
    ((SpScheduler) m_scheduler).setDRGateway(drGateway);

    super.configureCommon(backend, serializedCatalog, catalogContext,
                          csp, numberOfPartitions, startAction, agent, memStats, cl,
                          coreBindIds, drGateway);

    m_tickProducer.start();

    // add ourselves to the ephemeral node list which BabySitters will watch
    // for this partition
    LeaderElector.createParticipantNode(m_messenger.getZK(),
                                        LeaderElector.electionDirForPartition(m_partitionId),
                                        Long.toString(getInitiatorHSId()), null);
}
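/*
 * For context, a rough sketch of what the participant registration above could look like
 * against the raw ZooKeeper API. This is an illustration only: LeaderElector.createParticipantNode
 * is not shown in this section, so the ephemeral-sequential create below is an assumption,
 * inferred from the "<HSID>_<suffix>" child names parsed elsewhere in this code.
 */
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.ZooDefs.Ids;
import org.apache.zookeeper.ZooKeeper;

class ParticipantNodeSketch {
    static String register(ZooKeeper zk, String electionDir, long hsId)
            throws KeeperException, InterruptedException {
        // Ephemeral: the znode vanishes if this site's session dies, so watchers see the loss.
        // Sequential: ZK appends a sequence number, yielding children named "<HSID>_<seq>".
        return zk.create(electionDir + "/" + Long.toString(hsId) + "_",
                         null,
                         Ids.OPEN_ACL_UNSAFE,
                         CreateMode.EPHEMERAL_SEQUENTIAL);
    }
}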
/**
 * Given a set of partition IDs, return a map of partition to a list of HSIDs of all the sites
 * with copies of each partition.
 */
public Map<Integer, List<Long>> getReplicasForPartitions(Collection<Integer> partitions) {
    Map<Integer, List<Long>> retval = new HashMap<Integer, List<Long>>();
    List<Pair<Integer, ZKUtil.ChildrenCallback>> callbacks =
        new ArrayList<Pair<Integer, ZKUtil.ChildrenCallback>>();

    // Fan out one asynchronous getChildren() per partition first...
    for (Integer partition : partitions) {
        String zkpath = LeaderElector.electionDirForPartition(partition);
        ZKUtil.ChildrenCallback cb = new ZKUtil.ChildrenCallback();
        callbacks.add(Pair.of(partition, cb));
        m_zk.getChildren(zkpath, false, cb, null);
    }

    // ...then block harvesting the results, so the ZK round trips overlap.
    for (Pair<Integer, ZKUtil.ChildrenCallback> p : callbacks) {
        final Integer partition = p.getFirst();
        try {
            List<String> children = p.getSecond().getChildren();
            List<Long> sites = new ArrayList<Long>();
            for (String child : children) {
                sites.add(Long.valueOf(child.split("_")[0]));
            }
            retval.put(partition, sites);
        } catch (KeeperException ke) {
            org.voltdb.VoltDB.crashLocalVoltDB(
                    "KeeperException getting replicas for partition: " + partition, true, ke);
        } catch (InterruptedException ie) {
            org.voltdb.VoltDB.crashLocalVoltDB(
                    "InterruptedException getting replicas for partition: " + partition, true, ie);
        }
    }
    return retval;
}
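/*
 * A minimal, self-contained sketch of the same fan-out pattern using only the stock ZooKeeper
 * async API instead of VoltDB's ZKUtil.ChildrenCallback: dispatch every getChildren() call
 * first, then block once for all of the results so the round trips overlap. The class name
 * and the "/illustrative/election/" path below are hypothetical, not VoltDB's.
 */
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CountDownLatch;
import org.apache.zookeeper.AsyncCallback;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.ZooKeeper;

class ReplicaFanOutSketch {
    static Map<Integer, List<String>> childrenForPartitions(ZooKeeper zk, Collection<Integer> partitions)
            throws InterruptedException {
        final Map<Integer, List<String>> result = new ConcurrentHashMap<Integer, List<String>>();
        final CountDownLatch done = new CountDownLatch(partitions.size());
        for (Integer partition : partitions) {
            String path = "/illustrative/election/" + partition;
            // Async dispatch: the callback runs on ZK's event thread when the reply arrives.
            zk.getChildren(path, false, new AsyncCallback.ChildrenCallback() {
                @Override
                public void processResult(int rc, String p, Object ctx, List<String> children) {
                    if (rc == KeeperException.Code.OK.intValue()) {
                        result.put((Integer) ctx, children);
                    }
                    done.countDown();
                }
            }, partition);
        }
        // Harvest: wait until every outstanding callback has fired.
        done.await();
        return result;
    }
}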
void shutdown() {
    try {
        m_leaderElector.shutdown();
    } catch (Exception e) {
        VoltDB.crashLocalVoltDB("Error shutting down GlobalServiceElector's LeaderElector", true, e);
    }
}
/**
 * Returns the IDs of the partitions currently in the cluster.
 *
 * @return A list of partition IDs
 */
public static List<Integer> getPartitions(ZooKeeper zk) {
    List<Integer> partitions = new ArrayList<Integer>();
    try {
        List<String> children = zk.getChildren(VoltZK.leaders_initiators, null);
        for (String child : children) {
            partitions.add(LeaderElector.getPartitionFromElectionDir(child));
        }
    } catch (KeeperException e) {
        VoltDB.crashLocalVoltDB("Failed to get partition IDs from ZK", true, e);
    } catch (InterruptedException e) {
        VoltDB.crashLocalVoltDB("Failed to get partition IDs from ZK", true, e);
    }
    return partitions;
}
private boolean isClusterKSafe() {
    boolean retval = true;
    for (int i = 0; i < m_partitionCount; i++) {
        String dir = LeaderElector.electionDirForPartition(i);
        try {
            List<String> replicas = m_zk.getChildren(dir, null, null);
            if (replicas.isEmpty()) {
                tmLog.fatal("K-Safety violation: No replicas found for partition: " + i);
                retval = false;
            }
        } catch (Exception e) {
            VoltDB.crashLocalVoltDB("Unable to read replicas in ZK dir: " + dir, true, e);
        }
    }
    return retval;
}
/** Given a partition ID, return a list of HSIDs of all the sites with copies of that partition */
public List<Long> getReplicasForPartition(int partition) {
    String zkpath = LeaderElector.electionDirForPartition(partition);
    List<Long> retval = new ArrayList<Long>();
    try {
        List<String> children = m_zk.getChildren(zkpath, null);
        for (String child : children) {
            // Child znodes are named "<HSID>_<suffix>"; the HSID is the part before the underscore.
            retval.add(Long.valueOf(child.split("_")[0]));
        }
    } catch (KeeperException ke) {
        org.voltdb.VoltDB.crashLocalVoltDB(
                "KeeperException getting replicas for partition: " + partition, true, ke);
    } catch (InterruptedException ie) {
        org.voltdb.VoltDB.crashLocalVoltDB(
                "InterruptedException getting replicas for partition: " + partition, true, ie);
    }
    return retval;
}
@Override
public void acceptPromotion() throws InterruptedException, ExecutionException, KeeperException {
    // Crank up the leader caches. Use blocking startup so that we'll have valid
    // point-in-time caches later.
    m_iv2appointees.start(true);
    m_iv2masters.start(true);

    // Figure out what conditions we assumed leadership under.
    if (m_iv2appointees.pointInTimeCache().size() == 0) {
        tmLog.debug("LeaderAppointer in startup");
        m_state.set(AppointerState.CLUSTER_START);
    }
    else if ((m_iv2appointees.pointInTimeCache().size() != m_partitionCount) ||
             (m_iv2masters.pointInTimeCache().size() != m_partitionCount)) {
        // If we are promoted and the appointees or masters set is partial, the previous
        // appointer failed during startup (at least for now, until we support adding and
        // removing partitions on the fly).
        VoltDB.crashGlobalVoltDB("Detected failure during startup, unable to start", false, null);
    }
    else {
        tmLog.debug("LeaderAppointer in repair");
        m_state.set(AppointerState.DONE);
    }

    if (m_state.get() == AppointerState.CLUSTER_START) {
        // Need to block the return of acceptPromotion until after the MPI is promoted.
        // Wait for this latch to count down after appointing all the partition leaders.
        // The LeaderCache callback will count it down once it has seen all the appointed
        // leaders publish themselves as the actual leaders.
        m_startupLatch = new CountDownLatch(1);
        writeKnownLiveNodes(m_hostMessenger.getLiveHostIds());
        for (int i = 0; i < m_partitionCount; i++) {
            String dir = LeaderElector.electionDirForPartition(i);
            // Race along with all of the replicas for this partition to create the ZK parent node
            try {
                m_zk.create(dir, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
            } catch (KeeperException.NodeExistsException e) {
                // expected on all nodes that don't start() first.
            }
            m_callbacks[i] = new PartitionCallback(i);
            Pair<BabySitter, List<String>> sitterstuff =
                BabySitter.blockingFactory(m_zk, dir, m_callbacks[i], m_es);
            m_partitionWatchers[i] = sitterstuff.getFirst();
        }
        m_startupLatch.await();
    }
    else {
        // If we're taking over for a failed LeaderAppointer, we know when we get here that
        // every partition had a leader at some point in time. We'll seed each of the
        // PartitionCallbacks for each partition with the HSID of the last published leader.
        // The blocking startup of the BabySitter watching that partition will call our
        // callback, get the current full set of replicas, and appoint a new leader if the
        // seeded one has actually failed.
        Map<Integer, Long> masters = m_iv2masters.pointInTimeCache();
        tmLog.info("LeaderAppointer repairing with master set: " + masters);
        for (Entry<Integer, Long> master : masters.entrySet()) {
            int partId = master.getKey();
            String dir = LeaderElector.electionDirForPartition(partId);
            m_callbacks[partId] = new PartitionCallback(partId, master.getValue());
            Pair<BabySitter, List<String>> sitterstuff =
                BabySitter.blockingFactory(m_zk, dir, m_callbacks[partId], m_es);
            m_partitionWatchers[partId] = sitterstuff.getFirst();
        }
        // Just go ahead and promote our MPI.
        m_MPI.acceptPromotion();
    }
}
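/*
 * A minimal sketch of the startup barrier used above, under the behavior described in the
 * comments: acceptPromotion() blocks on a CountDownLatch(1), and the leader-cache callback
 * counts it down once it has seen every appointed leader publish itself as the actual leader.
 * The class and method names here are illustrative, not VoltDB's.
 */
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.atomic.AtomicInteger;

class StartupBarrierSketch {
    private final CountDownLatch m_startupLatch = new CountDownLatch(1);
    private final AtomicInteger m_publishedLeaders = new AtomicInteger(0);
    private final int m_partitionCount;

    StartupBarrierSketch(int partitionCount) {
        m_partitionCount = partitionCount;
    }

    // Called by the (hypothetical) leader-cache callback each time an appointed leader
    // publishes itself as the actual leader for its partition.
    void onLeaderPublished() {
        if (m_publishedLeaders.incrementAndGet() == m_partitionCount) {
            m_startupLatch.countDown();   // releases the thread blocked in awaitAllLeaders()
        }
    }

    // Mirrors the tail of the CLUSTER_START branch: block until every partition has a leader.
    void awaitAllLeaders() throws InterruptedException {
        m_startupLatch.await();
    }
}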
/**
 * Kick off the leader election. Blocks until the acceptPromotion() calls to all of the
 * services have returned on the initial leader.
 */
void start() throws KeeperException, InterruptedException, ExecutionException {
    m_leaderElector.start(true);
}