@After
public void tearDown() throws Exception {
  for (HostMessenger hm : createdMessengers) {
    hm.shutdown();
  }
  createdMessengers.clear();
}
private HostMessenger createHostMessenger(int index, boolean start) throws Exception {
  HostMessenger.Config config = new HostMessenger.Config();
  config.internalPort = config.internalPort + index;
  config.zkInterface = "127.0.0.1:" + (2181 + index);
  HostMessenger hm = new HostMessenger(config);
  createdMessengers.add(hm);
  if (start) {
    hm.start();
  }
  return hm;
}
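// The tests below (testSingleHost, testMultiHost) also call a one-argument
// createHostMessenger(index) that is not shown in this section. A minimal sketch,
// assuming it simply delegates to the two-argument factory with start = true:
private HostMessenger createHostMessenger(int index) throws Exception {
  // Hypothetical convenience overload; starts the messenger immediately.
  return createHostMessenger(index, true);
}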
// This message used to be sent by the SP or MP initiator when they accepted a promotion.
// For dev speed, we detect mastership changes here and construct and send this message to the
// local client interface so we can keep the CI's implementation private.
private void sendLeaderChangeNotify(long hsId, int partitionId) {
  try {
    JSONStringer stringer = new JSONStringer();
    stringer.object();
    stringer.key(JSON_PARTITION_ID).value(partitionId);
    stringer.key(JSON_INITIATOR_HSID).value(hsId);
    stringer.endObject();
    BinaryPayloadMessage bpm =
        new BinaryPayloadMessage(new byte[0], stringer.toString().getBytes("UTF-8"));
    int hostId = m_hostMessenger.getHostId();
    m_hostMessenger.send(
        CoreUtils.getHSIdFromHostAndSite(hostId, HostMessenger.CLIENT_INTERFACE_SITE_ID), bpm);
  } catch (Exception e) {
    VoltDB.crashLocalVoltDB("Unable to propagate leader promotion to client interface.", true, e);
  }
}
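// For reference, a hedged sketch of how a receiver could decode this notification. It assumes
// the payload is exactly the UTF-8 JSON built above and reuses the same JSON_PARTITION_ID /
// JSON_INITIATOR_HSID keys; the actual ClientInterface handler is not part of this section.
private void handleLeaderChangeNotify(BinaryPayloadMessage bpm) {
  try {
    JSONObject obj = new JSONObject(new String(bpm.m_payload, "UTF-8"));
    int partitionId = obj.getInt(JSON_PARTITION_ID);
    long initiatorHSId = obj.getLong(JSON_INITIATOR_HSID);
    // React to the new SP/MP master for this partition, e.g. update routing state.
  } catch (Exception e) {
    // A malformed notification is unexpected; surface it rather than swallow it.
    throw new RuntimeException(e);
  }
}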
public MockVoltDB(int clientPort, int adminPort, int httpPort, int drPort) {
  try {
    JSONObject obj = new JSONObject();
    JSONArray jsonArray = new JSONArray();
    jsonArray.put("127.0.0.1");
    obj.put("interfaces", jsonArray);
    obj.put("clientPort", clientPort);
    obj.put("adminPort", adminPort);
    obj.put("httpPort", httpPort);
    obj.put("drPort", drPort);
    m_localMetadata = obj.toString(4);

    m_catalog = new Catalog();
    m_catalog.execute("add / clusters " + m_clusterName);
    m_catalog.execute(
        "add "
            + m_catalog.getClusters().get(m_clusterName).getPath()
            + " databases "
            + m_databaseName);
    Cluster cluster = m_catalog.getClusters().get(m_clusterName);
    assert (cluster != null);
    // Set a sane default for TestMessaging (at least)
    cluster.setHeartbeattimeout(10000);

    try {
      m_hostMessenger.start();
    } catch (Exception e) {
      throw new RuntimeException(e);
    }

    VoltZK.createPersistentZKNodes(m_hostMessenger.getZK());
    m_hostMessenger
        .getZK()
        .create(
            VoltZK.cluster_metadata + "/" + m_hostMessenger.getHostId(),
            getLocalMetadata().getBytes("UTF-8"),
            Ids.OPEN_ACL_UNSAFE,
            CreateMode.EPHEMERAL);

    m_hostMessenger.generateMailboxId(
        m_hostMessenger.getHSIdForLocalSite(HostMessenger.STATS_SITE_ID));
    m_statsAgent = new StatsAgent();
    m_statsAgent.registerMailbox(
        m_hostMessenger, m_hostMessenger.getHSIdForLocalSite(HostMessenger.STATS_SITE_ID));

    for (MailboxType type : MailboxType.values()) {
      m_mailboxMap.put(type, new LinkedList<MailboxNodeContent>());
    }
    m_mailboxMap
        .get(MailboxType.StatsAgent)
        .add(
            new MailboxNodeContent(
                m_hostMessenger.getHSIdForLocalSite(HostMessenger.STATS_SITE_ID), null));
    m_siteTracker = new SiteTracker(m_hostId, m_mailboxMap);
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
@Test
public void testSingleHost() throws Exception {
  HostMessenger hm = createHostMessenger(0);

  Mailbox m1 = hm.createMailbox();

  SiteMailbox sm = new SiteMailbox(hm, (-2L << 32));
  hm.createMailbox(sm.getHSId(), sm);

  sm.send(m1.getHSId(), new LocalObjectMessage(null));
  m1.send(sm.getHSId(), new LocalObjectMessage(null));

  LocalObjectMessage lom = (LocalObjectMessage) m1.recv();
  assertEquals(lom.m_sourceHSId, sm.getHSId());

  lom = (LocalObjectMessage) sm.recv();
  assertEquals(lom.m_sourceHSId, m1.getHSId());
}
public Cartographer(HostMessenger hostMessenger) {
  super(false);
  m_hostMessenger = hostMessenger;
  m_zk = hostMessenger.getZK();
  m_iv2Masters = new LeaderCache(m_zk, VoltZK.iv2masters, m_SPIMasterCallback);
  m_iv2Mpi = new LeaderCache(m_zk, VoltZK.iv2mpi, m_MPICallback);
  try {
    m_iv2Masters.start(true);
    m_iv2Mpi.start(true);
  } catch (Exception e) {
    VoltDB.crashLocalVoltDB("Cartographer failed to start the iv2 leader caches", true, e);
  }
}
@Override
public boolean shutdown(Thread mainSiteThread) throws InterruptedException {
  if (m_faultDistributor != null) {
    m_faultDistributor.shutDown();
  }
  VoltDB.wasCrashCalled = false;
  VoltDB.crashMessage = null;
  m_snapshotCompletionMonitor.shutdown();
  m_es.shutdown();
  m_es.awaitTermination(1, TimeUnit.DAYS);
  m_statsAgent.shutdown();
  m_hostMessenger.shutdown();
  return true;
}
private void doPartitionDetectionActivities() {
  // We should never re-enter here once we've decided we're partitioned and doomed
  assert (!m_partitionDetected);
  // After everything is resolved, write the new surviving set to ZK
  List<Integer> currentNodes = null;
  try {
    currentNodes = m_hostMessenger.getLiveHostIds();
  } catch (Exception e) {
    // Without the live host set we can't make a sane partition decision; log and bail out.
    tmLog.error("Unable to retrieve live host IDs for partition detection", e);
    return;
  }
  Set<Integer> currentHosts = new HashSet<Integer>(currentNodes);
  Set<Integer> previousHosts = readPriorKnownLiveNodes();

  boolean partitionDetectionTriggered = makePPDDecision(previousHosts, currentHosts);

  if (partitionDetectionTriggered) {
    m_partitionDetected = true;
    if (m_usingCommandLog) {
      // Just shut down immediately
      VoltDB.crashGlobalVoltDB(
          "Use of command logging detected, no additional database snapshot will "
              + "be generated. Please use the 'recover' action to restore the database if necessary.",
          false,
          null);
    } else {
      SnapshotUtil.requestSnapshot(
          0L,
          m_partSnapshotSchedule.getPath(),
          m_partSnapshotSchedule.getPrefix() + System.currentTimeMillis(),
          true,
          SnapshotFormat.NATIVE,
          null,
          m_snapshotHandler,
          true);
    }
  }
  // If the cluster host set has changed, then write the new set to ZK
  // NOTE: we don't want to update the known live nodes if we've decided that our subcluster is
  // dying, otherwise a poorly timed subsequent failure might reverse this decision. Any future
  // promoted LeaderAppointer should make its partition detection decision based on the
  // pre-partition cluster state.
  else if (!currentHosts.equals(previousHosts)) {
    writeKnownLiveNodes(currentNodes);
  }
}
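// makePPDDecision() is not shown in this section. Below is a hedged sketch of one plausible
// majority-based rule, purely for illustration: the surviving set is treated as partitioned
// unless it holds a strict majority of the previously known hosts, with the lowest previously
// known host ID breaking an exact 50/50 tie. The real implementation may differ.
// (Uses java.util.Collections.)
private boolean makePPDDecisionSketch(Set<Integer> previousHosts, Set<Integer> currentHosts) {
  if (previousHosts.isEmpty()) {
    return false; // nothing known about the prior cluster; assume no partition
  }
  int survivors = currentHosts.size();
  int previous = previousHosts.size();
  if (2 * survivors > previous) {
    return false; // a strict majority survived
  }
  if (2 * survivors == previous) {
    // Exact half: let the half containing the lowest previously known host ID survive.
    int lowest = Collections.min(previousHosts);
    return !currentHosts.contains(lowest);
  }
  return true; // minority partition, trigger partition detection
}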
public SpInitiator(
    HostMessenger messenger,
    Integer partition,
    StatsAgent agent,
    SnapshotCompletionMonitor snapMonitor,
    VoltDB.START_ACTION startAction) {
  super(
      VoltZK.iv2masters,
      messenger,
      partition,
      new SpScheduler(partition, new SiteTaskerQueue(), snapMonitor),
      "SP",
      agent,
      startAction);
  m_leaderCache = new LeaderCache(messenger.getZK(), VoltZK.iv2appointees, m_leadersChangeHandler);
  m_tickProducer = new TickProducer(m_scheduler.m_tasks);
}
public LeaderAppointer(
    HostMessenger hm,
    int numberOfPartitions,
    int kfactor,
    boolean partitionDetectionEnabled,
    SnapshotSchedule partitionSnapshotSchedule,
    boolean usingCommandLog,
    JSONObject topology,
    MpInitiator mpi) {
  m_hostMessenger = hm;
  m_zk = hm.getZK();
  m_kfactor = kfactor;
  m_topo = topology;
  m_MPI = mpi;
  m_partitionCount = numberOfPartitions;
  m_callbacks = new PartitionCallback[m_partitionCount];
  m_partitionWatchers = new BabySitter[m_partitionCount];
  m_iv2appointees = new LeaderCache(m_zk, VoltZK.iv2appointees);
  m_iv2masters = new LeaderCache(m_zk, VoltZK.iv2masters, m_masterCallback);
  m_partitionDetectionEnabled = partitionDetectionEnabled;
  m_partSnapshotSchedule = partitionSnapshotSchedule;
  m_usingCommandLog = usingCommandLog;
}
@Override
public void acceptPromotion() throws InterruptedException, ExecutionException, KeeperException {
  // Crank up the leader caches. Use blocking startup so that we'll have valid point-in-time
  // caches later.
  m_iv2appointees.start(true);
  m_iv2masters.start(true);

  // Figure out what conditions we assumed leadership under.
  if (m_iv2appointees.pointInTimeCache().size() == 0) {
    tmLog.debug("LeaderAppointer in startup");
    m_state.set(AppointerState.CLUSTER_START);
  } else if ((m_iv2appointees.pointInTimeCache().size() != m_partitionCount)
      || (m_iv2masters.pointInTimeCache().size() != m_partitionCount)) {
    // If we are promoted and the appointees or masters set is partial, the previous appointer
    // failed during startup (at least for now, until we support adding/removing partitions on
    // the fly).
    VoltDB.crashGlobalVoltDB("Detected failure during startup, unable to start", false, null);
  } else {
    tmLog.debug("LeaderAppointer in repair");
    m_state.set(AppointerState.DONE);
  }

  if (m_state.get() == AppointerState.CLUSTER_START) {
    // Need to block the return of acceptPromotion until after the MPI is promoted. Wait for
    // this latch to count down after appointing all the partition leaders. The LeaderCache
    // callback will count it down once it has seen all the appointed leaders publish themselves
    // as the actual leaders.
    m_startupLatch = new CountDownLatch(1);
    writeKnownLiveNodes(m_hostMessenger.getLiveHostIds());
    for (int i = 0; i < m_partitionCount; i++) {
      String dir = LeaderElector.electionDirForPartition(i);
      // Race along with all of the replicas for this partition to create the ZK parent node
      try {
        m_zk.create(dir, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
      } catch (KeeperException.NodeExistsException e) {
        // expected on all nodes that don't start() first.
      }
      m_callbacks[i] = new PartitionCallback(i);
      Pair<BabySitter, List<String>> sitterstuff =
          BabySitter.blockingFactory(m_zk, dir, m_callbacks[i], m_es);
      m_partitionWatchers[i] = sitterstuff.getFirst();
    }
    m_startupLatch.await();
  } else {
    // If we're taking over for a failed LeaderAppointer, we know when we get here that every
    // partition had a leader at some point in time. We'll seed each of the PartitionCallbacks
    // for each partition with the HSID of the last published leader. The blocking startup of
    // the BabySitter watching that partition will call our callback, get the current full set
    // of replicas, and appoint a new leader if the seeded one has actually failed.
    Map<Integer, Long> masters = m_iv2masters.pointInTimeCache();
    tmLog.info("LeaderAppointer repairing with master set: " + masters);
    for (Entry<Integer, Long> master : masters.entrySet()) {
      int partId = master.getKey();
      String dir = LeaderElector.electionDirForPartition(partId);
      m_callbacks[partId] = new PartitionCallback(partId, master.getValue());
      Pair<BabySitter, List<String>> sitterstuff =
          BabySitter.blockingFactory(m_zk, dir, m_callbacks[partId], m_es);
      m_partitionWatchers[partId] = sitterstuff.getFirst();
    }
    // just go ahead and promote our MPI
    m_MPI.acceptPromotion();
  }
}
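// writeKnownLiveNodes() and readPriorKnownLiveNodes() (also used in
// doPartitionDetectionActivities above) are not shown here. A minimal sketch of the write side,
// assuming the host-ID set is persisted as JSON under a single ZooKeeper node; the node path
// and the "liveNodes" key below are placeholder assumptions, not the actual implementation.
private static final String LAST_KNOWN_LIVE_NODES = "/db/last_known_live_nodes"; // hypothetical

private void writeKnownLiveNodesSketch(List<Integer> liveNodes) {
  try {
    JSONStringer stringer = new JSONStringer();
    stringer.object();
    stringer.key("liveNodes").array();
    for (Integer node : liveNodes) {
      stringer.value(node);
    }
    stringer.endArray();
    stringer.endObject();
    byte[] data = stringer.toString().getBytes("UTF-8");
    // Create the node on first use, otherwise overwrite it unconditionally (version -1).
    if (m_zk.exists(LAST_KNOWN_LIVE_NODES, false) == null) {
      m_zk.create(LAST_KNOWN_LIVE_NODES, data, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
    } else {
      m_zk.setData(LAST_KNOWN_LIVE_NODES, data, -1);
    }
  } catch (Exception e) {
    VoltDB.crashLocalVoltDB("Unable to update known live nodes in ZK", true, e);
  }
}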
private void createAndRegisterAckMailboxes(
    final Set<Integer> localPartitions, HostMessenger messenger) {
  m_zk = messenger.getZK();
  m_mailboxesZKPath = VoltZK.exportGenerations + "/" + m_timestamp + "/" + "mailboxes";

  m_mbox =
      new LocalMailbox(messenger) {
        @Override
        public void deliver(VoltMessage message) {
          if (message instanceof BinaryPayloadMessage) {
            BinaryPayloadMessage bpm = (BinaryPayloadMessage) message;
            ByteBuffer buf = ByteBuffer.wrap(bpm.m_payload);
            final int partition = buf.getInt();
            final int length = buf.getInt();
            byte stringBytes[] = new byte[length];
            buf.get(stringBytes);
            String signature = new String(stringBytes, Constants.UTF8ENCODING);
            final long ackUSO = buf.getLong();

            final HashMap<String, ExportDataSource> partitionSources =
                m_dataSourcesByPartition.get(partition);
            if (partitionSources == null) {
              exportLog.error(
                  "Received an export ack for partition "
                      + partition
                      + " which does not exist on this node");
              return;
            }

            final ExportDataSource eds = partitionSources.get(signature);
            if (eds == null) {
              exportLog.error(
                  "Received an export ack for partition "
                      + partition
                      + " source signature "
                      + signature
                      + " which does not exist on this node");
              return;
            }

            try {
              eds.ack(ackUSO);
            } catch (RejectedExecutionException ignoreIt) {
              // Ignore it: the data source has already been shut down.
            }
          } else {
            exportLog.error("Received unexpected message " + message + " in export subsystem");
          }
        }
      };
  messenger.createMailbox(null, m_mbox);

  for (Integer partition : localPartitions) {
    final String partitionDN = m_mailboxesZKPath + "/" + partition;
    ZKUtil.asyncMkdirs(m_zk, partitionDN);
    ZKUtil.StringCallback cb = new ZKUtil.StringCallback();
    m_zk.create(
        partitionDN + "/" + m_mbox.getHSId(),
        null,
        Ids.OPEN_ACL_UNSAFE,
        CreateMode.EPHEMERAL,
        cb,
        null);
  }

  ListenableFuture<?> fut =
      m_childUpdatingThread.submit(
          new Runnable() {
            @Override
            public void run() {
              List<Pair<Integer, ZKUtil.ChildrenCallback>> callbacks =
                  new ArrayList<Pair<Integer, ZKUtil.ChildrenCallback>>();
              for (Integer partition : localPartitions) {
                ZKUtil.ChildrenCallback callback = new ZKUtil.ChildrenCallback();
                m_zk.getChildren(
                    m_mailboxesZKPath + "/" + partition,
                    constructMailboxChildWatcher(),
                    callback,
                    null);
                callbacks.add(Pair.of(partition, callback));
              }
              for (Pair<Integer, ZKUtil.ChildrenCallback> p : callbacks) {
                final Integer partition = p.getFirst();
                List<String> children = null;
                try {
                  children = p.getSecond().getChildren();
                } catch (InterruptedException e) {
                  Throwables.propagate(e);
                } catch (KeeperException e) {
                  Throwables.propagate(e);
                }
                ImmutableList.Builder<Long> mailboxes = ImmutableList.builder();
                for (String child : children) {
                  if (child.equals(Long.toString(m_mbox.getHSId()))) continue;
                  mailboxes.add(Long.valueOf(child));
                }
                ImmutableList<Long> mailboxHsids = mailboxes.build();
                for (ExportDataSource eds : m_dataSourcesByPartition.get(partition).values()) {
                  eds.updateAckMailboxes(Pair.of(m_mbox, mailboxHsids));
                }
              }
            }
          });
  try {
    fut.get();
  } catch (Throwable t) {
    Throwables.propagate(t);
  }
}
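// The wire format the deliver() override above expects is fully visible: partition (int),
// signature length (int), UTF-8 signature bytes, then the acked USO (long). A hedged sketch of
// how a sender could build a matching ack; the actual sending code is not part of this section.
private static BinaryPayloadMessage buildExportAck(int partition, String signature, long ackUSO) {
  byte[] signatureBytes = signature.getBytes(Constants.UTF8ENCODING);
  ByteBuffer buf = ByteBuffer.allocate(4 + 4 + signatureBytes.length + 8);
  buf.putInt(partition);
  buf.putInt(signatureBytes.length);
  buf.put(signatureBytes);
  buf.putLong(ackUSO);
  // Empty metadata array, payload carries the ack, mirroring the decode above.
  return new BinaryPayloadMessage(new byte[0], buf.array());
}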
/** Discard the mailbox. */
public void close() {
  m_messenger.removeMailbox(getHSId());
}
public JoinCoordinator(HostMessenger hostMessenger) {
  super(hostMessenger, hostMessenger.generateMailboxId(null));
  m_messenger = hostMessenger;
}
@Test
public void testMultiHost() throws Exception {
  HostMessenger hm1 = createHostMessenger(0);

  final HostMessenger hm2 = createHostMessenger(1, false);
  final HostMessenger hm3 = createHostMessenger(2, false);

  final AtomicReference<Exception> exception = new AtomicReference<Exception>();
  Thread hm2Start =
      new Thread() {
        @Override
        public void run() {
          try {
            hm2.start();
          } catch (Exception e) {
            e.printStackTrace();
            exception.set(e);
          }
        }
      };
  Thread hm3Start =
      new Thread() {
        @Override
        public void run() {
          try {
            hm3.start();
          } catch (Exception e) {
            e.printStackTrace();
            exception.set(e);
          }
        }
      };

  hm2Start.start();
  hm3Start.start();
  hm2Start.join();
  System.out.println(hm2.getZK().getChildren(CoreZK.hostids, false));
  hm3Start.join();

  if (exception.get() != null) {
    fail(exception.get().toString());
  }

  List<String> root1 = hm1.getZK().getChildren("/", false);
  List<String> root2 = hm2.getZK().getChildren("/", false);
  List<String> root3 = hm3.getZK().getChildren("/", false);
  System.out.println(root1);
  System.out.println(root2);
  System.out.println(root3);
  assertTrue(root1.equals(root2));
  assertTrue(root2.equals(root3));

  List<String> hostids1 = hm1.getZK().getChildren(CoreZK.hostids, false);
  List<String> hostids2 = hm2.getZK().getChildren(CoreZK.hostids, false);
  List<String> hostids3 = hm3.getZK().getChildren(CoreZK.hostids, false);
  System.out.println(hostids1);
  System.out.println(hostids2);
  System.out.println(hostids3);
  assertTrue(hostids1.equals(hostids2));
  assertTrue(hostids2.equals(hostids3));

  List<String> hosts3;
  List<String> hosts1;
  hm2.shutdown();
  boolean success = false;
  // Poll for up to 200 ms (40 iterations of 5 ms) for the dead host to drop out of ZK.
  for (int ii = 0; ii < (200 / 5); ii++) {
    hosts3 = hm3.getZK().getChildren(CoreZK.hosts, false);
    hosts1 = hm1.getZK().getChildren(CoreZK.hosts, false);
    if (hosts3.size() == 2 && hosts1.size() == 2 && hosts1.equals(hosts3)) {
      success = true;
      break;
    }
    Thread.sleep(5);
  }
  assertTrue(success);

  hm1.waitForGroupJoin(2);
  hm3.waitForGroupJoin(2);
}