Beispiel #1
0
  /**
   * Publishes two execution-site mailboxes through two different clients, closes the client
   * that published the first one, and checks that the tracker ends up reporting only the
   * second mailbox.
   */
  @Test
  public void testUpdate() throws Exception {
    final ZooKeeper trackerClient = getClient(0);
    final ZooKeeper shortLivedClient = getClient(0);
    final MailboxTracker tracker = new MailboxTracker(trackerClient, handler);
    final MailboxPublisher firstPublisher = new MailboxPublisher(VoltZK.mailboxes + "/1");

    VoltZK.createPersistentZKNodes(trackerClient);

    firstPublisher.registerMailbox(MailboxType.ExecutionSite, new MailboxNodeContent(1L, 0));
    firstPublisher.publish(shortLivedClient);

    final MailboxPublisher secondPublisher = new MailboxPublisher(VoltZK.mailboxes + "/2");
    secondPublisher.registerMailbox(MailboxType.ExecutionSite, new MailboxNodeContent(2L, 1));
    secondPublisher.publish(trackerClient);

    tracker.start();

    // The ephemeral node just created will disappear and we should get an update
    shortLivedClient.close();
    while (handler.m_handleCount.get() < 2) {
      Thread.sleep(1);
    }

    final Map<MailboxType, List<MailboxNodeContent>> published = handler.m_mailboxes;
    assertTrue(published.containsKey(MailboxType.ExecutionSite));
    final List<MailboxNodeContent> sites = published.get(MailboxType.ExecutionSite);
    assertEquals(1, sites.size());
    final MailboxNodeContent survivor = sites.get(0);
    assertEquals(2, survivor.HSId.longValue());
    assertEquals(1, survivor.partitionId.intValue());
    tracker.shutdown();
  }
Beispiel #2
0
  /**
   * Refreshes the published snapshot from the current children of the root node. The parent
   * watch is passed again on the child listing, the child watch is attached to children that
   * were not present on the previous pass, and an immutable map of path to parsed JSON is
   * published.
   *
   * @param event the triggering event; not consulted here, and may be null on the first
   *     initialization
   */
  private void processParentEvent(WatchedEvent event) throws Exception {
    // List the children, handing the parent watch back to ZooKeeper in the same call.
    final Set<String> current = new TreeSet<String>(m_zk.getChildren(m_rootNode, m_parentWatch));

    // Newly appeared children are the set difference: current minus the previous listing.
    final Set<String> added = new HashSet<String>(current);
    added.removeAll(m_lastChildren);
    m_lastChildren = current;

    // Fire off every read first; the results are collected in the loop below.
    final List<ByteArrayCallback> pending = new ArrayList<ByteArrayCallback>();
    for (String name : current) {
      final ByteArrayCallback reply = new ByteArrayCallback();
      final String childPath = ZKUtil.joinZKPath(m_rootNode, name);
      if (!added.contains(name)) {
        // child seen on the previous pass: read without passing the child watch
        m_zk.getData(childPath, false, reply, null);
      } else {
        // new child: read and attach the child watch
        m_zk.getData(childPath, m_childWatch, reply, null);
      }
      pending.add(reply);
    }

    final HashMap<String, JSONObject> snapshot = new HashMap<String, JSONObject>();
    for (ByteArrayCallback reply : pending) {
      try {
        final byte[] raw = reply.getData();
        snapshot.put(reply.getPath(), new JSONObject(new String(raw, "UTF-8")));
      } catch (KeeperException.NoNodeException e) {
        // child may have been deleted between the parent trigger and getData.
      }
    }

    m_publicCache.set(ImmutableMap.copyOf(snapshot));
  }
  /*
   * Inherit the per partition txnid from the long since gone
   * partition that existed in the past.
   *
   * Returns every txn id stored at VoltZK.perPartitionTxnIds, but only when one of
   * them belongs to this partition; returns null when the node is absent or when no
   * stored id maps to m_partitionId.
   */
  private long[] fetchPerPartitionTxnId() {
    final ZooKeeper zk = VoltDB.instance().getHostMessenger().getZK();
    byte[] serialized = null;
    try {
      serialized = zk.getData(VoltZK.perPartitionTxnIds, false, null);
    } catch (KeeperException.NoNodeException e) {
      // Can be no node if the cluster was never restored
      return null;
    } catch (Exception e) {
      VoltDB.crashLocalVoltDB("Error retrieving per partition txn ids", true, e);
    }

    // Layout read below: an int count followed by that many long txn ids.
    final ByteBuffer buf = ByteBuffer.wrap(serialized);
    final int count = buf.getInt();
    final long[] txnIds = new long[count];
    boolean sawThisPartition = false;
    for (int idx = 0; idx < count; idx++) {
      final long txnId = buf.getLong();
      txnIds[idx] = txnId;
      int partitionId = TxnEgo.getPartitionId(txnId);
      if (partitionId == m_partitionId) {
        sawThisPartition = true;
      }
    }
    return sawThisPartition ? txnIds : null;
  }
Beispiel #4
0
  /**
   * Deletes one of the three configured children and waits for the callback-delivered cache
   * to shrink, then checks the two remaining entries.
   */
  @Test
  public void testDeleteChildWithCallback() throws Exception {
    final ZooKeeper zk = getClient(0);
    configure("/cache02", zk);

    final TestCallback cb = new TestCallback();
    final LeaderCache dut = new LeaderCache(zk, "/cache02", cb);
    dut.start(true);
    Map<Integer, Long> snapshot = cb.m_cache;
    assertEquals("3 items cached.", 3, snapshot.size());

    zk.delete("/cache02/1", -1);
    // Poll the latest callback-delivered map until it no longer holds three entries.
    snapshot = cb.m_cache;
    while (snapshot.size() == 3) {
      Thread.sleep(1);
      snapshot = cb.m_cache;
    }
    assertEquals("Item removed", 2, snapshot.size());
    assertEquals(null, snapshot.get(1));
    assertEquals(12345678, snapshot.get(0).longValue());
    assertEquals(11223344, snapshot.get(2).longValue());

    dut.shutdown();
    zk.close();
  }
Beispiel #5
0
  /**
   * Overwrites child 0 through {@code LeaderCache.put} and waits until the callback-delivered
   * cache shows the new value, then checks all three entries.
   */
  @Test
  public void testModifyChildWithCallback() throws Exception {
    ZooKeeper zk = getClient(0);
    configure("/cache03", zk);

    TestCallback cb = new TestCallback();
    LeaderCache dut = new LeaderCache(zk, "/cache03", cb);
    dut.start(true);
    Map<Integer, Long> cache = cb.m_cache;

    assertEquals("3 items cached.", 3, cache.size());
    assertEquals(12345678, cache.get(0).longValue());

    dut.put(0, 23456789);
    final Long expected = 23456789L;
    while (true) {
      cache = cb.m_cache;
      // equals() on the expected value avoids unboxing a null entry.
      if (expected.equals(cache.get(0))) {
        break;
      }
      // Yield between polls, as the sibling tests do, instead of spinning a core.
      Thread.sleep(1);
    }
    cache = cb.m_cache;
    assertEquals("3 items cached.", 3, cache.size());
    assertEquals(23456789, cache.get(0).longValue());
    assertEquals(87654321, cache.get(1).longValue());
    assertEquals(11223344, cache.get(2).longValue());

    dut.shutdown();
    zk.close();
  }
Beispiel #6
0
  /**
   * Overwrites child 0 directly in ZooKeeper and waits until the cache under test reports the
   * new value, then checks all three entries.
   */
  @Test
  public void testModifyChild() throws Exception {
    ZooKeeper zk = getClient(0);
    configure("/cache03", zk);

    LeaderCache dut = new LeaderCache(zk, "/cache03");
    dut.start(true);
    Map<Integer, Long> cache = dut.pointInTimeCache();

    assertEquals("3 items cached.", 3, cache.size());
    assertEquals(12345678, dut.get(0).longValue());

    zk.setData("/cache03/0", Long.toString(23456789).getBytes(), -1);
    while (true) {
      // Read once per poll and guard against null before unboxing.
      Long current = dut.get(0);
      if (current != null && current.longValue() == 23456789L) {
        break;
      }
      // Yield between polls, as the sibling tests do, instead of spinning a core.
      Thread.sleep(1);
    }
    // Re-read the snapshot so the size check covers the state after the write rather than
    // the map captured before it.
    cache = dut.pointInTimeCache();
    assertEquals("3 items cached.", 3, cache.size());
    assertEquals(23456789L, dut.get(0).longValue());
    assertEquals(87654321L, dut.get(1).longValue());
    assertEquals(11223344L, dut.get(2).longValue());

    dut.shutdown();
    zk.close();
  }
Beispiel #7
0
 /**
  * Deletes {@code dir} and everything beneath it, children first, ignoring node versions
  * (version argument -1).
  *
  * @param zk client used for the listing and the deletes
  * @param dir path of the node to remove
  */
 public static void deleteRecursively(ZooKeeper zk, String dir)
     throws KeeperException, InterruptedException {
   // Depth-first: each subtree is removed before its parent.
   for (String name : zk.getChildren(dir, false)) {
     deleteRecursively(zk, joinZKPath(dir, name));
   }
   zk.delete(dir, -1);
 }
Beispiel #8
0
 /**
  * Creates the persistent node {@code root} with an empty payload and three persistent
  * children "/0", "/1" and "/2" whose payloads are the decimal strings of the fixture values.
  */
 void configure(String root, ZooKeeper zk) throws Exception {
   final long[] fixtureValues = {12345678L, 87654321L, 11223344L};
   zk.create(root, new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
   for (int child = 0; child < fixtureValues.length; child++) {
     final byte[] payload = Long.toString(fixtureValues[child]).getBytes();
     zk.create(root + "/" + child, payload, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
   }
 }
Beispiel #9
0
    /**
     * Publishes (unless rejoining) and then reads back the replication configuration stored at
     * VoltZK.replicationconfig, and applies the discovered role to the server. A rejoining
     * node also adopts the stored "active" flag; a non-rejoining node instead crashes the
     * cluster when the stored role differs from its local configuration.
     */
    @Override
    public void run() {
      try {
        final JSONStringer localConfig = new JSONStringer();
        localConfig.object();
        localConfig.key("role").value(m_config.m_replicationRole.ordinal());
        localConfig.key("active").value(m_rvdb.getReplicationActive());
        localConfig.endObject();

        final ZooKeeper zk = m_rvdb.getHostMessenger().getZK();
        final boolean rejoining = m_isRejoin;

        // rejoining nodes figure out the replication role from other nodes
        if (!rejoining) {
          try {
            zk.create(
                VoltZK.replicationconfig,
                localConfig.toString().getBytes("UTF-8"),
                Ids.OPEN_ACL_UNSAFE,
                CreateMode.PERSISTENT);
          } catch (KeeperException.NodeExistsException e) {
            // the node is already there; the stored value is read back below
          }
        }

        // Both paths read back whatever is stored and decode the role from it.
        final byte[] stored = zk.getData(VoltZK.replicationconfig, false, null);
        final JSONObject discovered = new JSONObject(new String(stored, "UTF-8"));
        final ReplicationRole discoveredRole =
            ReplicationRole.get((byte) discovered.getLong("role"));

        if (rejoining) {
          final boolean replicationActive = discovered.getBoolean("active");
          // See if we should bring the server up in WAN replication mode
          m_rvdb.setReplicationRole(discoveredRole);
          m_rvdb.setReplicationActive(replicationActive);
        } else {
          if (!discoveredRole.equals(m_config.m_replicationRole)) {
            VoltDB.crashGlobalVoltDB(
                "Discovered replication role "
                    + discoveredRole
                    + " doesn't match locally specified replication role "
                    + m_config.m_replicationRole,
                true,
                null);
          }

          // See if we should bring the server up in WAN replication mode
          m_rvdb.setReplicationRole(discoveredRole);
        }
      } catch (Exception e) {
        VoltDB.crashGlobalVoltDB("Error discovering replication role", false, e);
      }
    }
Beispiel #10
0
  /**
   * Issues one asynchronous persistent create per path component of {@code dirDN}, from the
   * top down. Only the last component is created with {@code payload}; the others get null
   * data.
   *
   * @param zk client used to issue the creates
   * @param dirDN absolute path; must be non-null, non-blank, start with "/" and not be "/"
   * @param payload data for the last node in the path
   * @return the callback attached to the create of the last component
   */
  public static ZKUtil.StringCallback asyncMkdirs(ZooKeeper zk, String dirDN, byte payload[]) {
    Preconditions.checkArgument(
        dirDN != null && !dirDN.trim().isEmpty() && !"/".equals(dirDN) && dirDN.startsWith("/"));

    final StringBuilder pathSoFar = new StringBuilder(128);
    ZKUtil.StringCallback finalCallback = null;
    try {
      final String[] segments = dirDN.substring(1).split("/");
      final int leafIndex = segments.length - 1;
      int depth = 0;
      for (String segment : segments) {
        finalCallback = new ZKUtil.StringCallback();
        pathSoFar.append('/').append(segment);
        final byte[] nodeData = (depth == leafIndex) ? payload : null;
        zk.create(
            pathSoFar.toString(),
            nodeData,
            Ids.OPEN_ACL_UNSAFE,
            CreateMode.PERSISTENT,
            finalCallback,
            null);
        depth++;
      }
    } catch (Throwable t) {
      Throwables.propagate(t);
    }
    return finalCallback;
  }
Beispiel #11
0
  /**
   * Looks up, for each requested partition, the HSIDs of the sites listed under that
   * partition's election directory. All child listings are requested asynchronously before
   * any result is awaited.
   *
   * @param partitions the partition IDs to look up
   * @return map from partition ID to the HSIDs parsed from the child names (the text before
   *     the first "_")
   */
  public Map<Integer, List<Long>> getReplicasForPartitions(Collection<Integer> partitions) {
    final List<Pair<Integer, ZKUtil.ChildrenCallback>> pending =
        new ArrayList<Pair<Integer, ZKUtil.ChildrenCallback>>();

    // Phase 1: issue every listing without blocking.
    for (Integer partition : partitions) {
      final ZKUtil.ChildrenCallback listing = new ZKUtil.ChildrenCallback();
      final String electionDir = LeaderElector.electionDirForPartition(partition);
      pending.add(Pair.of(partition, listing));
      m_zk.getChildren(electionDir, false, listing, null);
    }

    // Phase 2: collect the results in request order.
    final Map<Integer, List<Long>> replicasByPartition = new HashMap<Integer, List<Long>>();
    for (Pair<Integer, ZKUtil.ChildrenCallback> request : pending) {
      final Integer partition = request.getFirst();
      try {
        final List<Long> hsids = new ArrayList<Long>();
        for (String nodeName : request.getSecond().getChildren()) {
          hsids.add(Long.valueOf(nodeName.split("_")[0]));
        }
        replicasByPartition.put(partition, hsids);
      } catch (KeeperException ke) {
        org.voltdb.VoltDB.crashLocalVoltDB(
            "KeeperException getting replicas for partition: " + partition, true, ke);
      } catch (InterruptedException ie) {
        org.voltdb.VoltDB.crashLocalVoltDB(
            "InterruptedException getting replicas for partition: " + partition, true, ie);
      }
    }
    return replicasByPartition;
  }
Beispiel #12
0
  /**
   * Starts a callback-backed cache over three configured children and checks that the
   * callback-delivered map holds all three fixture values.
   */
  @Test
  public void testInitialCacheWithCallback() throws Exception {
    final String root = "/cache01";
    final long[] expected = {12345678, 87654321, 11223344};

    final ZooKeeper zk = getClient(0);
    configure(root, zk);

    final TestCallback cb = new TestCallback();
    final LeaderCache dut = new LeaderCache(zk, root, cb);
    dut.start(true);

    assertEquals("3 items cached.", 3, cb.m_cache.size());
    for (int key = 0; key < expected.length; key++) {
      assertEquals(expected[key], cb.m_cache.get(key).longValue());
    }

    dut.shutdown();
    zk.close();
  }
Beispiel #13
0
  /**
   * Starts a cache over three configured children and checks the snapshot size and each
   * value returned by {@code get}.
   */
  @Test
  public void testInitialCache() throws Exception {
    final String root = "/cache01";
    final long[] expected = {12345678L, 87654321L, 11223344L};

    final ZooKeeper zk = getClient(0);
    configure(root, zk);

    final LeaderCache dut = new LeaderCache(zk, root);
    dut.start(true);
    final Map<Integer, Long> snapshot = dut.pointInTimeCache();

    assertEquals("3 items cached.", 3, snapshot.size());
    for (int key = 0; key < expected.length; key++) {
      assertEquals(expected[key], dut.get(key).longValue());
    }

    dut.shutdown();
    zk.close();
  }
Beispiel #14
0
  /**
   * Records the number of participating hosts on the snapshot's completion node so that
   * SnapshotCompletionMonitor can check the node to decide whether the snapshot has finished.
   *
   * <p>Call this only once all participants have responded. Hosts may finish their part of
   * the snapshot before the coordinator gets here, in which case the stored count has already
   * been decremented. The stored count is therefore adjusted by adding participant count + 1
   * rather than overwritten. The update is a versioned write that is retried from the read
   * whenever the version check fails.
   *
   * @param txnId The snapshot txnId
   * @param participantCount The number of hosts participating in this snapshot
   */
  public static void logParticipatingHostCount(long txnId, int participantCount) {
    final ZooKeeper zk = VoltDB.instance().getHostMessenger().getZK();
    final String snapshotPath = VoltZK.completed_snapshots + "/" + txnId;

    while (true) {
      final Stat stat = new Stat();
      byte[] data = null;
      try {
        data = zk.getData(snapshotPath, false, stat);
      } catch (KeeperException e) {
        if (e.code() == KeeperException.Code.NONODE) {
          // If snapshot creation failed for some reason, the node won't exist. ignore
          return;
        }
        VoltDB.crashLocalVoltDB("Failed to get snapshot completion node", true, e);
      } catch (InterruptedException e) {
        VoltDB.crashLocalVoltDB("Interrupted getting snapshot completion node", true, e);
      }
      if (data == null) {
        VoltDB.crashLocalVoltDB("Data should not be null if the node exists", false, null);
      }

      try {
        final JSONObject completionNode = new JSONObject(new String(data, Charsets.UTF_8));
        if (completionNode.getLong("txnId") != txnId) {
          VoltDB.crashLocalVoltDB("TxnId should match", false, null);
        }

        final int storedCount = completionNode.getInt("hostCount");
        // +1 because hostCount was initialized to -1
        completionNode.put("hostCount", storedCount + participantCount + 1);
        zk.setData(
            snapshotPath,
            completionNode.toString(4).getBytes(Charsets.UTF_8),
            stat.getVersion());
      } catch (KeeperException.BadVersionException e) {
        // The node changed between the read and the write; start over from the read.
        continue;
      } catch (Exception e) {
        VoltDB.crashLocalVoltDB("This ZK call should never fail", true, e);
      }

      break;
    }
  }
Beispiel #15
0
 /**
  * Compresses {@code payload} and stores it as a series of sequential nodes created from
  * {@code node}, each holding at most 1 MiB, followed by a {@code node + "_complete"} marker
  * node with null data.
  *
  * @param zk client used for the creates
  * @param node path used for every chunk create; the marker path is this plus "_complete"
  * @param payload uncompressed bytes to upload
  * @param ephemeral whether the chunks and the marker are ephemeral rather than persistent
  */
 public static void uploadBytesAsChunks(
     ZooKeeper zk, String node, byte payload[], boolean ephemeral) throws Exception {
   final ByteBuffer compressed = ByteBuffer.wrap(compressBytes(payload));
   final CreateMode chunkMode =
       ephemeral ? CreateMode.EPHEMERAL_SEQUENTIAL : CreateMode.PERSISTENT_SEQUENTIAL;
   final CreateMode markerMode = ephemeral ? CreateMode.EPHEMERAL : CreateMode.PERSISTENT;

   while (compressed.hasRemaining()) {
     // Copy out the next slice; get(byte[]) advances the buffer position by chunk.length.
     final byte[] chunk = new byte[Math.min(1024 * 1024, compressed.remaining())];
     compressed.get(chunk);
     zk.create(node, chunk, Ids.OPEN_ACL_UNSAFE, chunkMode);
   }
   zk.create(node + "_complete", null, Ids.OPEN_ACL_UNSAFE, markerMode);
 }
Beispiel #16
0
  /**
   * Adds a fourth child through {@code LeaderCache.put}, waits for the callback-delivered
   * cache to grow, then modifies that child and waits for the new value to appear.
   */
  @Test
  public void testAddChildWithPutWithCallback() throws Exception {
    ZooKeeper zk = getClient(0);
    configure("/cache04", zk);

    TestCallback cb = new TestCallback();
    LeaderCache dut = new LeaderCache(zk, "/cache04", cb);
    dut.start(true);
    Map<Integer, Long> cache = cb.m_cache;

    dut.put(3, 88776655);

    while (true) {
      cache = cb.m_cache;
      if (cache.size() == 3) {
        Thread.sleep(1);
      } else {
        break;
      }
    }
    assertEquals("Item added", 4, cache.size());
    assertEquals(12345678, cache.get(0).longValue());
    assertEquals(87654321, cache.get(1).longValue());
    assertEquals(11223344, cache.get(2).longValue());
    assertEquals(88776655, cache.get(3).longValue());

    // modify the new child and make sure it has a watch set.
    dut.put(3, 99887766);
    final Long updated = 99887766L;
    while (true) {
      cache = cb.m_cache;
      // equals() on the expected value avoids unboxing a null entry.
      if (updated.equals(cache.get(3))) {
        break;
      }
      // Yield between polls, like the first wait loop, instead of spinning a core.
      Thread.sleep(1);
    }
    assertEquals("Items accounted for.", 4, cache.size());
    assertEquals(99887766, cache.get(3).longValue());

    dut.shutdown();
    zk.close();
  }
Beispiel #17
0
 /**
  * Stores the given host IDs at VoltZK.lastKnownLiveNodes as a JSON object with a single
  * "liveNodes" array, creating the node first if it does not exist. Any failure crashes the
  * local node.
  */
 private void writeKnownLiveNodes(List<Integer> liveNodes) {
   try {
     final boolean nodeMissing = m_zk.exists(VoltZK.lastKnownLiveNodes, null) == null;
     if (nodeMissing) {
       // VoltZK.createPersistentZKNodes should have done this
       m_zk.create(VoltZK.lastKnownLiveNodes, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
     }

     final JSONStringer json = new JSONStringer();
     json.object();
     json.key("liveNodes").array();
     for (Integer hostId : liveNodes) {
       json.value(hostId);
     }
     json.endArray();
     json.endObject();

     // Round-trip through JSONObject to get the indented form that is logged and stored.
     final String pretty = new JSONObject(json.toString()).toString(4);
     tmLog.debug("Writing live nodes to ZK: " + pretty);
     m_zk.setData(VoltZK.lastKnownLiveNodes, pretty.getBytes("UTF-8"), -1);
   } catch (Exception e) {
     VoltDB.crashLocalVoltDB(
         "Unable to update known live nodes at ZK path: " + VoltZK.lastKnownLiveNodes, true, e);
   }
 }
Beispiel #18
0
 /**
  * Re-reads the child named by the event and publishes a new snapshot: the entry is replaced
  * with the freshly parsed JSON, or removed when the node no longer exists. The child watch
  * is passed again on the read.
  */
 private void processChildEvent(WatchedEvent event) throws Exception {
   // Work on a private copy; the published map is replaced wholesale at the end.
   final HashMap<String, JSONObject> updated =
       new HashMap<String, JSONObject>(m_publicCache.get());
   final String changedPath = event.getPath();

   final ByteArrayCallback reply = new ByteArrayCallback();
   m_zk.getData(changedPath, m_childWatch, reply, null);
   try {
     final byte[] raw = reply.getData();
     updated.put(reply.getPath(), new JSONObject(new String(raw, "UTF-8")));
   } catch (KeeperException.NoNodeException e) {
     // the child is gone, so drop its entry
     updated.remove(changedPath);
   }
   m_publicCache.set(ImmutableMap.copyOf(updated));
 }
Beispiel #19
0
 /**
  * Lists the partition IDs found under VoltZK.leaders_initiators, one per child node, by
  * converting each child name with LeaderElector.getPartitionFromElectionDir.
  *
  * @param zk client used for the listing
  * @return the partition IDs, in the order the children were returned
  */
 public static List<Integer> getPartitions(ZooKeeper zk) {
   final List<Integer> partitionIds = new ArrayList<Integer>();
   try {
     for (String electionDir : zk.getChildren(VoltZK.leaders_initiators, null)) {
       partitionIds.add(LeaderElector.getPartitionFromElectionDir(electionDir));
     }
   } catch (KeeperException e) {
     VoltDB.crashLocalVoltDB("Failed to get partition IDs from ZK", true, e);
   } catch (InterruptedException e) {
     VoltDB.crashLocalVoltDB("Failed to get partition IDs from ZK", true, e);
   }
   return partitionIds;
 }
Beispiel #20
0
  /**
   * Polls the children of {@code path} until a completion marker is seen, then reads every
   * non-marker entry, optionally folds the raw chunk bytes into a CRC32, and returns the
   * decompressed concatenation.
   *
   * @param zk client used for the listing and the reads
   * @param path parent node whose children are listed
   * @param prefix chunk name prefix; the marker looked for is
   *     {@code path + "/" + prefix + "_complete"}
   * @param getCRC whether to compute a CRC over the (still compressed) chunk bytes
   * @return pair of the decompressed bytes and the CRC, or a null CRC when {@code getCRC} is
   *     false
   */
  public static Pair<byte[], Integer> retrieveChunksAsBytes(
      ZooKeeper zk, String path, String prefix, boolean getCRC) throws Exception {
    TreeSet<String> chunks = new TreeSet<String>();
    // Busy-polls (no sleep between listings) until the marker string is present in the set.
    while (true) {
      boolean allUploadsComplete = true;
      // NOTE(review): this looks for the marker as a FULL path, while the set is filled
      // straight from getChildren() below. If getChildren() returns bare child names (as the
      // ZooKeeper API conventionally does), this check can never succeed - confirm.
      if (!chunks.contains(path + "/" + prefix + "_complete")) {
        allUploadsComplete = false;
      }
      if (allUploadsComplete) {
        break;
      }

      chunks = new TreeSet<String>(zk.getChildren(path, false));
      // NOTE(review): this nested loop only re-adds elements that are already in the set, so
      // it leaves the set unchanged. It presumably was meant to keep only entries matching
      // the prefix; as written, children that do not match are not filtered out.
      for (String chunk : chunks) {
        for (int ii = 0; ii < chunks.size(); ii++) {
          if (chunk.startsWith(path + "/" + prefix)) {
            chunks.add(chunk);
          }
        }
      }
    }

    // Sized for every entry except one, i.e. assumes exactly one entry (the marker) is skipped
    // below; any other "_complete" entry would leave trailing null slots.
    byte resultBuffers[][] = new byte[chunks.size() - 1][];
    int ii = 0;
    PureJavaCrc32 crc = getCRC ? new PureJavaCrc32() : null;
    // TreeSet iteration is in sorted string order, which fixes the chunk concatenation order.
    for (String chunk : chunks) {
      if (chunk.endsWith("_complete")) continue;
      resultBuffers[ii] = zk.getData(chunk, false, null);
      if (crc != null) {
        crc.update(resultBuffers[ii]);
      }
      ii++;
    }

    return Pair.of(decompressBytes(resultBuffers), crc != null ? (int) crc.getValue() : null);
  }
Beispiel #21
0
 /**
  * Checks every partition's election directory for at least one child. Each empty directory
  * is logged as a K-safety violation; all partitions are checked even after a violation is
  * found.
  *
  * @return true when no partition had an empty election directory
  */
 private boolean isClusterKSafe() {
   boolean safe = true;
   for (int partition = 0; partition < m_partitionCount; partition++) {
     final String electionDir = LeaderElector.electionDirForPartition(partition);
     try {
       if (m_zk.getChildren(electionDir, null, null).isEmpty()) {
         tmLog.fatal("K-Safety violation: No replicas found for partition: " + partition);
         safe = false;
       }
     } catch (Exception e) {
       VoltDB.crashLocalVoltDB("Unable to read replicas in ZK dir: " + electionDir, true, e);
     }
   }
   return safe;
 }
Beispiel #22
0
 /**
  * Lists the HSIDs of the sites registered under the given partition's election directory.
  * Each HSID is parsed from the child name's text before the first "_".
  *
  * @param partition the partition ID to look up
  * @return the parsed HSIDs, in the order the children were returned
  */
 public List<Long> getReplicasForPartition(int partition) {
   final List<Long> hsids = new ArrayList<Long>();
   final String electionDir = LeaderElector.electionDirForPartition(partition);
   try {
     for (String nodeName : m_zk.getChildren(electionDir, null)) {
       hsids.add(Long.valueOf(nodeName.split("_")[0]));
     }
   } catch (KeeperException ke) {
     org.voltdb.VoltDB.crashLocalVoltDB(
         "KeeperException getting replicas for partition: " + partition, true, ke);
   } catch (InterruptedException ie) {
     org.voltdb.VoltDB.crashLocalVoltDB(
         "InterruptedException getting replicas for partition: " + partition, true, ie);
   }
   return hsids;
 }
Beispiel #23
0
 /**
  * Reads the JSON stored at VoltZK.lastKnownLiveNodes and returns the integers in its
  * "liveNodes" array. Any failure crashes the local node.
  */
 private Set<Integer> readPriorKnownLiveNodes() {
   final Set<Integer> liveNodes = new HashSet<Integer>();
   try {
     final byte[] raw = m_zk.getData(VoltZK.lastKnownLiveNodes, false, null);
     final String json = new String(raw, "UTF-8");
     tmLog.debug("Read prior known live nodes: " + json);
     final JSONArray hostIds = new JSONObject(json).getJSONArray("liveNodes");
     for (int idx = 0; idx < hostIds.length(); idx++) {
       liveNodes.add(hostIds.getInt(idx));
     }
   } catch (Exception e) {
     VoltDB.crashLocalVoltDB(
         "Unable to read prior known live nodes at ZK path: " + VoltZK.lastKnownLiveNodes,
         true,
         e);
   }
   return liveNodes;
 }
Beispiel #24
0
  /**
   * Takes over as leader appointer. Starts both leader caches, decides from their contents
   * whether this is a fresh cluster start or a takeover from a failed appointer, and then sets
   * up one BabySitter per partition accordingly.
   *
   * <p>On cluster start this call blocks on {@code m_startupLatch} before returning. On
   * repair it seeds each partition callback with the last published master and promotes the
   * MPI directly. A partially populated appointee or master set crashes the cluster.
   */
  @Override
  public void acceptPromotion() throws InterruptedException, ExecutionException, KeeperException {
    // Crank up the leader caches.  Use blocking startup so that we'll have valid point-in-time
    // caches later.
    m_iv2appointees.start(true);
    m_iv2masters.start(true);
    // Figure out what conditions we assumed leadership under.
    if (m_iv2appointees.pointInTimeCache().size() == 0) {
      // No appointees at all: nothing has been appointed yet, treat as initial startup.
      tmLog.debug("LeaderAppointer in startup");
      m_state.set(AppointerState.CLUSTER_START);
    } else if ((m_iv2appointees.pointInTimeCache().size() != m_partitionCount)
        || (m_iv2masters.pointInTimeCache().size() != m_partitionCount)) {
      // If we are promoted and the appointees or masters set is partial, the previous
      // appointer failed during startup (at least for now, until we add add/remove a
      // partition on the fly).
      VoltDB.crashGlobalVoltDB("Detected failure during startup, unable to start", false, null);
    } else {
      // Both caches cover every partition: taking over from a previous appointer.
      tmLog.debug("LeaderAppointer in repair");
      m_state.set(AppointerState.DONE);
    }

    if (m_state.get() == AppointerState.CLUSTER_START) {
      // Need to block the return of acceptPromotion until after the MPI is promoted.  Wait
      // for this latch to countdown after appointing all the partition leaders.  The
      // LeaderCache callback will count it down once it has seen all the
      // appointed leaders publish themselves as the actual leaders.
      m_startupLatch = new CountDownLatch(1);
      writeKnownLiveNodes(m_hostMessenger.getLiveHostIds());
      for (int i = 0; i < m_partitionCount; i++) {
        String dir = LeaderElector.electionDirForPartition(i);
        // Race along with all of the replicas for this partition to create the ZK parent node
        try {
          m_zk.create(dir, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
        } catch (KeeperException.NodeExistsException e) {
          // expected on all nodes that don't start() first.
        }
        m_callbacks[i] = new PartitionCallback(i);
        // Only the BabySitter is kept; the initial child list in the pair is not used here.
        Pair<BabySitter, List<String>> sitterstuff =
            BabySitter.blockingFactory(m_zk, dir, m_callbacks[i], m_es);
        m_partitionWatchers[i] = sitterstuff.getFirst();
      }
      m_startupLatch.await();
    } else {
      // If we're taking over for a failed LeaderAppointer, we know when
      // we get here that every partition had a leader at some point in
      // time.  We'll seed each of the PartitionCallbacks for each
      // partition with the HSID of the last published leader.  The
      // blocking startup of the BabySitter watching that partition will
      // call our callback, get the current full set of replicas, and
      // appoint a new leader if the seeded one has actually failed
      Map<Integer, Long> masters = m_iv2masters.pointInTimeCache();
      tmLog.info("LeaderAppointer repairing with master set: " + masters);
      for (Entry<Integer, Long> master : masters.entrySet()) {
        int partId = master.getKey();
        String dir = LeaderElector.electionDirForPartition(partId);
        m_callbacks[partId] = new PartitionCallback(partId, master.getValue());
        Pair<BabySitter, List<String>> sitterstuff =
            BabySitter.blockingFactory(m_zk, dir, m_callbacks[partId], m_es);
        m_partitionWatchers[partId] = sitterstuff.getFirst();
      }
      // just go ahead and promote our MPI
      m_MPI.acceptPromotion();
    }
  }
 /**
  * Responds to a child event by asynchronously re-listing the children of the event's path.
  * A freshly constructed mailbox child watcher is passed on the listing, and the result is
  * delivered to a freshly constructed child-retrieval callback.
  */
 private void handleChildUpdate(final WatchedEvent event) {
   m_zk.getChildren(
       event.getPath(), constructMailboxChildWatcher(), constructChildRetrievalCallback(), null);
 }
  /**
   * Creates this generation's export-ack mailbox, registers it in ZooKeeper under every local
   * partition, and hands the other registered mailbox HSIDs to each local export data source.
   *
   * <p>Blocks until the initial mailbox discovery task submitted to
   * {@code m_childUpdatingThread} has finished; any failure from that task is propagated.
   *
   * @param localPartitions partition IDs to register the mailbox under
   * @param messenger supplies the ZooKeeper client and hosts the mailbox
   */
  private void createAndRegisterAckMailboxes(
      final Set<Integer> localPartitions, HostMessenger messenger) {
    m_zk = messenger.getZK();
    m_mailboxesZKPath = VoltZK.exportGenerations + "/" + m_timestamp + "/" + "mailboxes";

    m_mbox =
        new LocalMailbox(messenger) {
          /**
           * Handles an export ack. The payload is read as: int partition, int signature
           * length, signature bytes, long ack USO. Acks for unknown partitions or signatures
           * are logged and dropped.
           */
          @Override
          public void deliver(VoltMessage message) {
            if (message instanceof BinaryPayloadMessage) {
              BinaryPayloadMessage bpm = (BinaryPayloadMessage) message;
              ByteBuffer buf = ByteBuffer.wrap(bpm.m_payload);
              final int partition = buf.getInt();
              final int length = buf.getInt();
              byte stringBytes[] = new byte[length];
              buf.get(stringBytes);
              String signature = new String(stringBytes, Constants.UTF8ENCODING);
              final long ackUSO = buf.getLong();

              final HashMap<String, ExportDataSource> partitionSources =
                  m_dataSourcesByPartition.get(partition);
              if (partitionSources == null) {
                exportLog.error(
                    "Received an export ack for partition "
                        + partition
                        + " which does not exist on this node");
                return;
              }

              final ExportDataSource eds = partitionSources.get(signature);
              if (eds == null) {
                exportLog.error(
                    "Received an export ack for partition "
                        + partition
                        + " source signature "
                        + signature
                        + " which does not exist on this node");
                return;
              }

              try {
                eds.ack(ackUSO);
              } catch (RejectedExecutionException ignoreIt) {
                // ignore it: as it is already shutdown
              }
            } else {
              exportLog.error("Receive unexpected message " + message + " in export subsystem");
            }
          }
        };
    messenger.createMailbox(null, m_mbox);

    // Register the mailbox under each local partition as an ephemeral node named by its HSId.
    for (Integer partition : localPartitions) {
      final String partitionDN = m_mailboxesZKPath + "/" + partition;
      ZKUtil.asyncMkdirs(m_zk, partitionDN);

      // The create is asynchronous and its callback result is never awaited in this method.
      ZKUtil.StringCallback cb = new ZKUtil.StringCallback();
      m_zk.create(
          partitionDN + "/" + m_mbox.getHSId(),
          null,
          Ids.OPEN_ACL_UNSAFE,
          CreateMode.EPHEMERAL,
          cb,
          null);
    }

    // Initial discovery of the other mailboxes, run on m_childUpdatingThread.
    ListenableFuture<?> fut =
        m_childUpdatingThread.submit(
            new Runnable() {
              @Override
              public void run() {
                // Issue all child listings (with a watcher on each) before awaiting any.
                List<Pair<Integer, ZKUtil.ChildrenCallback>> callbacks =
                    new ArrayList<Pair<Integer, ZKUtil.ChildrenCallback>>();
                for (Integer partition : localPartitions) {
                  ZKUtil.ChildrenCallback callback = new ZKUtil.ChildrenCallback();
                  m_zk.getChildren(
                      m_mailboxesZKPath + "/" + partition,
                      constructMailboxChildWatcher(),
                      callback,
                      null);
                  callbacks.add(Pair.of(partition, callback));
                }
                for (Pair<Integer, ZKUtil.ChildrenCallback> p : callbacks) {
                  final Integer partition = p.getFirst();
                  List<String> children = null;
                  try {
                    children = p.getSecond().getChildren();
                  } catch (InterruptedException e) {
                    Throwables.propagate(e);
                  } catch (KeeperException e) {
                    Throwables.propagate(e);
                  }
                  ImmutableList.Builder<Long> mailboxes = ImmutableList.builder();

                  // Collect every registered HSId except this node's own mailbox.
                  for (String child : children) {
                    if (child.equals(Long.toString(m_mbox.getHSId()))) continue;
                    mailboxes.add(Long.valueOf(child));
                  }
                  ImmutableList<Long> mailboxHsids = mailboxes.build();

                  // NOTE(review): unlike deliver() above, the per-partition map is not
                  // null-checked here; presumably every local partition has an entry - confirm.
                  for (ExportDataSource eds : m_dataSourcesByPartition.get(partition).values()) {
                    eds.updateAckMailboxes(Pair.of(m_mbox, mailboxHsids));
                  }
                }
              }
            });
    // Wait for the discovery task so the ack mailboxes are set before returning.
    try {
      fut.get();
    } catch (Throwable t) {
      Throwables.propagate(t);
    }
  }