/**
 * Seeds the given ZooKeeper subtree for a test/demo configuration: creates the root
 * node (empty payload) and three persistent children "/0", "/1", "/2", each holding a
 * hard-coded numeric id rendered as a UTF-8 decimal string.
 *
 * @param root absolute ZK path to create; must not already exist
 * @param zk   connected ZooKeeper handle
 * @throws Exception on any ZooKeeper error (including NodeExists if already configured)
 */
void configure(String root, ZooKeeper zk) throws Exception {
  // Primitive values — the original boxed Longs existed only to call toString().
  final long[] childIds = {12345678L, 87654321L, 11223344L};
  zk.create(root, new byte[] {}, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
  for (int i = 0; i < childIds.length; i++) {
    // Explicit charset: the bare getBytes() depended on the platform default encoding.
    zk.create(
        root + "/" + i,
        Long.toString(childIds[i]).getBytes("UTF-8"),
        Ids.OPEN_ACL_UNSAFE,
        CreateMode.PERSISTENT);
  }
}
/**
 * Asynchronously creates every node along the absolute path {@code dirDN}, analogous to
 * {@code mkdir -p}. Intermediate nodes are created with no data; the final node receives
 * {@code payload}. All creates are fired without waiting; the callback attached to the
 * deepest node is returned so the caller can block on it if desired.
 *
 * @param zk      connected ZooKeeper handle
 * @param dirDN   absolute path (must start with '/', must not be '/' itself or blank)
 * @param payload data for the final path component; may be null
 * @return the StringCallback registered for the deepest create, or null if the path had
 *     no components
 */
public static ZKUtil.StringCallback asyncMkdirs(ZooKeeper zk, String dirDN, byte payload[]) {
  Preconditions.checkArgument(
      dirDN != null && !dirDN.trim().isEmpty() && !"/".equals(dirDN) && dirDN.startsWith("/"));
  StringBuilder path = new StringBuilder(128);
  ZKUtil.StringCallback deepestCallback = null;
  try {
    String[] components = dirDN.substring(1).split("/");
    int lastIdx = components.length - 1;
    for (int idx = 0; idx <= lastIdx; idx++) {
      path.append('/').append(components[idx]);
      deepestCallback = new ZKUtil.StringCallback();
      // Only the deepest node carries the payload; parents are data-less.
      byte[] data = (idx == lastIdx) ? payload : null;
      zk.create(
          path.toString(), data, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT,
          deepestCallback, null);
    }
  } catch (Throwable t) {
    Throwables.propagate(t);
  }
  return deepestCallback;
}
/**
 * Compresses {@code payload} and uploads it to ZooKeeper as a series of sequential child
 * nodes of at most 1MB each (ZK has a node-size limit), then creates a
 * {@code <node>_complete} marker so readers know the upload finished.
 *
 * @param zk        connected ZooKeeper handle
 * @param node      base path used for the sequential chunk nodes
 * @param payload   raw bytes to compress and upload
 * @param ephemeral whether chunk and marker nodes should be ephemeral
 * @throws Exception on ZooKeeper or compression failure
 */
public static void uploadBytesAsChunks(
    ZooKeeper zk, String node, byte payload[], boolean ephemeral) throws Exception {
  final CreateMode chunkMode =
      ephemeral ? CreateMode.EPHEMERAL_SEQUENTIAL : CreateMode.PERSISTENT_SEQUENTIAL;
  ByteBuffer compressed = ByteBuffer.wrap(compressBytes(payload));
  while (compressed.hasRemaining()) {
    // Carve off up to 1MB at a time via a bulk relative get.
    byte[] chunk = new byte[Math.min(1024 * 1024, compressed.remaining())];
    compressed.get(chunk);
    zk.create(node, chunk, Ids.OPEN_ACL_UNSAFE, chunkMode);
  }
  // Marker node (non-sequential) signals that all chunks are present.
  zk.create(
      node + "_complete",
      null,
      Ids.OPEN_ACL_UNSAFE,
      ephemeral ? CreateMode.EPHEMERAL : CreateMode.PERSISTENT);
}
@Override public void run() { try { JSONStringer js = new JSONStringer(); js.object(); js.key("role").value(m_config.m_replicationRole.ordinal()); js.key("active").value(m_rvdb.getReplicationActive()); js.endObject(); ZooKeeper zk = m_rvdb.getHostMessenger().getZK(); // rejoining nodes figure out the replication role from other nodes if (!m_isRejoin) { try { zk.create( VoltZK.replicationconfig, js.toString().getBytes("UTF-8"), Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT); } catch (KeeperException.NodeExistsException e) { } String discoveredReplicationConfig = new String(zk.getData(VoltZK.replicationconfig, false, null), "UTF-8"); JSONObject discoveredjsObj = new JSONObject(discoveredReplicationConfig); ReplicationRole discoveredRole = ReplicationRole.get((byte) discoveredjsObj.getLong("role")); if (!discoveredRole.equals(m_config.m_replicationRole)) { VoltDB.crashGlobalVoltDB( "Discovered replication role " + discoveredRole + " doesn't match locally specified replication role " + m_config.m_replicationRole, true, null); } // See if we should bring the server up in WAN replication mode m_rvdb.setReplicationRole(discoveredRole); } else { String discoveredReplicationConfig = new String(zk.getData(VoltZK.replicationconfig, false, null), "UTF-8"); JSONObject discoveredjsObj = new JSONObject(discoveredReplicationConfig); ReplicationRole discoveredRole = ReplicationRole.get((byte) discoveredjsObj.getLong("role")); boolean replicationActive = discoveredjsObj.getBoolean("active"); // See if we should bring the server up in WAN replication mode m_rvdb.setReplicationRole(discoveredRole); m_rvdb.setReplicationActive(replicationActive); } } catch (Exception e) { VoltDB.crashGlobalVoltDB("Error discovering replication role", false, e); } }
private void writeKnownLiveNodes(List<Integer> liveNodes) { try { if (m_zk.exists(VoltZK.lastKnownLiveNodes, null) == null) { // VoltZK.createPersistentZKNodes should have done this m_zk.create(VoltZK.lastKnownLiveNodes, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT); } JSONStringer stringer = new JSONStringer(); stringer.object(); stringer.key("liveNodes").array(); for (Integer node : liveNodes) { stringer.value(node); } stringer.endArray(); stringer.endObject(); JSONObject obj = new JSONObject(stringer.toString()); tmLog.debug("Writing live nodes to ZK: " + obj.toString(4)); m_zk.setData(VoltZK.lastKnownLiveNodes, obj.toString(4).getBytes("UTF-8"), -1); } catch (Exception e) { VoltDB.crashLocalVoltDB( "Unable to update known live nodes at ZK path: " + VoltZK.lastKnownLiveNodes, true, e); } }
/**
 * Called when this node is promoted to be the cluster's LeaderAppointer.
 *
 * <p>Starts the appointee/master leader caches, decides whether we are in a fresh
 * cluster start or repairing after a previous appointer's failure, and then either
 * appoints initial partition leaders (blocking until all publish themselves) or seeds
 * per-partition callbacks with the last known masters and promotes the MPI.
 *
 * @throws InterruptedException if blocked startup/await is interrupted
 * @throws ExecutionException   from blocking cache/BabySitter startup
 * @throws KeeperException      on ZooKeeper errors other than the expected NodeExists
 */
@Override
public void acceptPromotion() throws InterruptedException, ExecutionException, KeeperException {
  // Crank up the leader caches. Use blocking startup so that we'll have valid
  // point-in-time caches later.
  m_iv2appointees.start(true);
  m_iv2masters.start(true);
  // Figure out what conditions we assumed leadership under.
  if (m_iv2appointees.pointInTimeCache().size() == 0) {
    // No appointees at all: this is a brand-new cluster start.
    tmLog.debug("LeaderAppointer in startup");
    m_state.set(AppointerState.CLUSTER_START);
  } else if ((m_iv2appointees.pointInTimeCache().size() != m_partitionCount)
      || (m_iv2masters.pointInTimeCache().size() != m_partitionCount)) {
    // If we are promoted and the appointees or masters set is partial, the previous
    // appointer failed
    // during startup (at least for now, until we add add/remove a partition on the fly).
    VoltDB.crashGlobalVoltDB("Detected failure during startup, unable to start", false, null);
  } else {
    // Full appointee and master sets: we're taking over a running cluster (repair).
    tmLog.debug("LeaderAppointer in repair");
    m_state.set(AppointerState.DONE);
  }
  if (m_state.get() == AppointerState.CLUSTER_START) {
    // Need to block the return of acceptPromotion until after the MPI is promoted.
    // Wait for this latch
    // to countdown after appointing all the partition leaders. The
    // LeaderCache callback will count it down once it has seen all the
    // appointed leaders publish themselves as the actual leaders.
    m_startupLatch = new CountDownLatch(1);
    writeKnownLiveNodes(m_hostMessenger.getLiveHostIds());
    for (int i = 0; i < m_partitionCount; i++) {
      String dir = LeaderElector.electionDirForPartition(i);
      // Race along with all of the replicas for this partition to create the ZK parent node
      try {
        m_zk.create(dir, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
      } catch (KeeperException.NodeExistsException e) {
        // expected on all nodes that don't start() first.
      }
      // Watch this partition's election dir; the callback appoints/publishes leaders.
      m_callbacks[i] = new PartitionCallback(i);
      Pair<BabySitter, List<String>> sitterstuff =
          BabySitter.blockingFactory(m_zk, dir, m_callbacks[i], m_es);
      m_partitionWatchers[i] = sitterstuff.getFirst();
    }
    // Block until every appointed leader has published itself (see latch note above).
    m_startupLatch.await();
  } else {
    // If we're taking over for a failed LeaderAppointer, we know when
    // we get here that every partition had a leader at some point in
    // time. We'll seed each of the PartitionCallbacks for each
    // partition with the HSID of the last published leader. The
    // blocking startup of the BabySitter watching that partition will
    // call our callback, get the current full set of replicas, and
    // appoint a new leader if the seeded one has actually failed
    Map<Integer, Long> masters = m_iv2masters.pointInTimeCache();
    tmLog.info("LeaderAppointer repairing with master set: " + masters);
    for (Entry<Integer, Long> master : masters.entrySet()) {
      int partId = master.getKey();
      String dir = LeaderElector.electionDirForPartition(partId);
      m_callbacks[partId] = new PartitionCallback(partId, master.getValue());
      Pair<BabySitter, List<String>> sitterstuff =
          BabySitter.blockingFactory(m_zk, dir, m_callbacks[partId], m_es);
      m_partitionWatchers[partId] = sitterstuff.getFirst();
    }
    // just go ahead and promote our MPI
    m_MPI.acceptPromotion();
  }
}
/**
 * Creates this node's export-ack mailbox, registers it in ZK under the per-generation
 * mailboxes path for each local partition, and discovers all peer mailboxes for those
 * partitions so each ExportDataSource knows where to send/receive acks.
 *
 * <p>Incoming ack messages are binary payloads of the form:
 * int partition, int signatureLength, byte[] signature (UTF-8), long ackUSO.
 *
 * @param localPartitions partitions hosted on this node
 * @param messenger       host messenger providing ZK and mailbox services
 */
private void createAndRegisterAckMailboxes(
    final Set<Integer> localPartitions, HostMessenger messenger) {
  m_zk = messenger.getZK();
  m_mailboxesZKPath = VoltZK.exportGenerations + "/" + m_timestamp + "/" + "mailboxes";
  // Mailbox that receives export acks from peers and forwards them to the matching
  // local ExportDataSource.
  m_mbox =
      new LocalMailbox(messenger) {
        @Override
        public void deliver(VoltMessage message) {
          if (message instanceof BinaryPayloadMessage) {
            BinaryPayloadMessage bpm = (BinaryPayloadMessage) message;
            // Decode: partition id, signature length, signature bytes, ack USO.
            ByteBuffer buf = ByteBuffer.wrap(bpm.m_payload);
            final int partition = buf.getInt();
            final int length = buf.getInt();
            byte stringBytes[] = new byte[length];
            buf.get(stringBytes);
            String signature = new String(stringBytes, Constants.UTF8ENCODING);
            final long ackUSO = buf.getLong();
            final HashMap<String, ExportDataSource> partitionSources =
                m_dataSourcesByPartition.get(partition);
            if (partitionSources == null) {
              exportLog.error(
                  "Received an export ack for partition "
                      + partition
                      + " which does not exist on this node");
              return;
            }
            final ExportDataSource eds = partitionSources.get(signature);
            if (eds == null) {
              exportLog.error(
                  "Received an export ack for partition "
                      + partition
                      + " source signature "
                      + signature
                      + " which does not exist on this node");
              return;
            }
            try {
              eds.ack(ackUSO);
            } catch (RejectedExecutionException ignoreIt) {
              // ignore it: as it is already shutdown
            }
          } else {
            exportLog.error("Receive unexpected message " + message + " in export subsystem");
          }
        }
      };
  messenger.createMailbox(null, m_mbox);
  // Advertise our mailbox HSId in ZK under each local partition (ephemeral, so it
  // disappears if this node dies). Parent dirs are created asynchronously first.
  for (Integer partition : localPartitions) {
    final String partitionDN = m_mailboxesZKPath + "/" + partition;
    ZKUtil.asyncMkdirs(m_zk, partitionDN);
    ZKUtil.StringCallback cb = new ZKUtil.StringCallback();
    m_zk.create(
        partitionDN + "/" + m_mbox.getHSId(),
        null,
        Ids.OPEN_ACL_UNSAFE,
        CreateMode.EPHEMERAL,
        cb,
        null);
  }
  // On the child-updating thread: fetch peer mailboxes for every local partition
  // (with a watcher for future membership changes) and push the peer set into each
  // ExportDataSource.
  ListenableFuture<?> fut =
      m_childUpdatingThread.submit(
          new Runnable() {
            @Override
            public void run() {
              List<Pair<Integer, ZKUtil.ChildrenCallback>> callbacks =
                  new ArrayList<Pair<Integer, ZKUtil.ChildrenCallback>>();
              // Fire all getChildren calls first so they run in parallel...
              for (Integer partition : localPartitions) {
                ZKUtil.ChildrenCallback callback = new ZKUtil.ChildrenCallback();
                m_zk.getChildren(
                    m_mailboxesZKPath + "/" + partition,
                    constructMailboxChildWatcher(),
                    callback,
                    null);
                callbacks.add(Pair.of(partition, callback));
              }
              // ...then harvest the results.
              for (Pair<Integer, ZKUtil.ChildrenCallback> p : callbacks) {
                final Integer partition = p.getFirst();
                List<String> children = null;
                try {
                  children = p.getSecond().getChildren();
                } catch (InterruptedException e) {
                  Throwables.propagate(e);
                } catch (KeeperException e) {
                  Throwables.propagate(e);
                }
                ImmutableList.Builder<Long> mailboxes = ImmutableList.builder();
                // Peers only: skip our own mailbox's HSId.
                for (String child : children) {
                  if (child.equals(Long.toString(m_mbox.getHSId()))) continue;
                  mailboxes.add(Long.valueOf(child));
                }
                ImmutableList<Long> mailboxHsids = mailboxes.build();
                for (ExportDataSource eds : m_dataSourcesByPartition.get(partition).values()) {
                  eds.updateAckMailboxes(Pair.of(m_mbox, mailboxHsids));
                }
              }
            }
          });
  // Block until the peer discovery pass completes.
  try {
    fut.get();
  } catch (Throwable t) {
    Throwables.propagate(t);
  }
}