Ejemplo n.º 1
0
 /**
  * Creates a persistent node at {@code root} with an empty payload, followed by three persistent
  * children ({@code root/0}, {@code root/1}, {@code root/2}) whose payloads are the decimal text
  * of fixed sample values.
  *
  * @param root ZooKeeper path of the parent node to create
  * @param zk client used to issue the create calls
  * @throws Exception if any create call fails or the UTF-8 charset is unavailable
  */
 void configure(String root, ZooKeeper zk) throws Exception {
   final long[] values = {12345678L, 87654321L, 11223344L};
   zk.create(root, new byte[] {}, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
   for (int child = 0; child < values.length; child++) {
     // Encode explicitly as UTF-8 so the stored bytes do not depend on the platform charset.
     byte[] payload = Long.toString(values[child]).getBytes("UTF-8");
     zk.create(root + "/" + child, payload, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
   }
 }
Ejemplo n.º 2
0
  /**
   * Issues one asynchronous persistent {@code create} per path component of {@code dirDN},
   * starting at the top-most component. Only the final component is created with {@code payload};
   * every intermediate component is created with a {@code null} payload.
   *
   * @param zk client used to issue the asynchronous create calls
   * @param dirDN path to create; must be non-null, non-blank, start with "/" and not be "/" itself
   * @param payload data for the final path component (passed through as given)
   * @return the callback registered for the create of the final path component
   */
  public static ZKUtil.StringCallback asyncMkdirs(ZooKeeper zk, String dirDN, byte payload[]) {
    Preconditions.checkArgument(
        dirDN != null && !dirDN.trim().isEmpty() && !"/".equals(dirDN) && dirDN.startsWith("/"));

    final StringBuilder pathSoFar = new StringBuilder(128);
    ZKUtil.StringCallback callback = null;
    try {
      final String[] segments = dirDN.substring(1).split("/");
      int remaining = segments.length;
      for (String segment : segments) {
        remaining--;
        callback = new ZKUtil.StringCallback();
        pathSoFar.append('/').append(segment);

        // Only the deepest node carries the caller's payload.
        byte[] data = null;
        if (remaining == 0) {
          data = payload;
        }
        zk.create(
            pathSoFar.toString(),
            data,
            Ids.OPEN_ACL_UNSAFE,
            CreateMode.PERSISTENT,
            callback,
            null);
      }
    } catch (Throwable t) {
      Throwables.propagate(t);
    }
    return callback;
  }
Ejemplo n.º 3
0
 /**
  * Passes {@code payload} through {@code compressBytes} and stores the result in ZooKeeper as a
  * series of sequential nodes created at {@code node}, each holding at most 1024 * 1024 bytes.
  * Once every chunk has been created, a {@code node + "_complete"} node with a {@code null}
  * payload is created.
  *
  * @param zk client used to create the nodes
  * @param node path used for the sequential chunk nodes and as prefix of the completion node
  * @param payload bytes to compress and upload
  * @param ephemeral whether the chunk and completion nodes are ephemeral rather than persistent
  * @throws Exception if compression or any create call fails
  */
 public static void uploadBytesAsChunks(
     ZooKeeper zk, String node, byte payload[], boolean ephemeral) throws Exception {
   final ByteBuffer pending = ByteBuffer.wrap(compressBytes(payload));
   final CreateMode chunkMode =
       ephemeral ? CreateMode.EPHEMERAL_SEQUENTIAL : CreateMode.PERSISTENT_SEQUENTIAL;
   final CreateMode markerMode = ephemeral ? CreateMode.EPHEMERAL : CreateMode.PERSISTENT;

   while (pending.hasRemaining()) {
     // Drain the next slice straight into an exactly-sized array.
     final byte[] chunk = new byte[Math.min(1024 * 1024, pending.remaining())];
     pending.get(chunk);
     zk.create(node, chunk, Ids.OPEN_ACL_UNSAFE, chunkMode);
   }

   zk.create(node + "_complete", null, Ids.OPEN_ACL_UNSAFE, markerMode);
 }
Ejemplo n.º 4
0
    /**
     * Publishes or discovers the replication configuration stored at {@code
     * VoltZK.replicationconfig} and applies it to {@code m_rvdb}.
     *
     * <p>A non-rejoining node first tries to create the config node from its local settings, then
     * reads back whatever is stored and triggers a global crash if the stored role differs from
     * the locally configured one. A rejoining node only reads the stored config and adopts both
     * the role and the "active" flag. Any exception triggers a global crash.
     */
    @Override
    public void run() {
      try {
        // Local settings serialized as a JSON object with "role" (ordinal) and "active" keys.
        JSONStringer localConfig = new JSONStringer();
        localConfig.object();
        localConfig.key("role").value(m_config.m_replicationRole.ordinal());
        localConfig.key("active").value(m_rvdb.getReplicationActive());
        localConfig.endObject();

        ZooKeeper zk = m_rvdb.getHostMessenger().getZK();
        final boolean rejoining = m_isRejoin;

        // rejoining nodes figure out the replication role from other nodes
        if (!rejoining) {
          try {
            zk.create(
                VoltZK.replicationconfig,
                localConfig.toString().getBytes("UTF-8"),
                Ids.OPEN_ACL_UNSAFE,
                CreateMode.PERSISTENT);
          } catch (KeeperException.NodeExistsException e) {
            // The config node is already there; the stored copy is read back below.
          }
        }

        // Both paths read and decode the stored configuration the same way.
        final byte[] storedBytes = zk.getData(VoltZK.replicationconfig, false, null);
        final JSONObject storedConfig = new JSONObject(new String(storedBytes, "UTF-8"));
        final ReplicationRole storedRole =
            ReplicationRole.get((byte) storedConfig.getLong("role"));

        if (rejoining) {
          final boolean storedActive = storedConfig.getBoolean("active");
          // See if we should bring the server up in WAN replication mode
          m_rvdb.setReplicationRole(storedRole);
          m_rvdb.setReplicationActive(storedActive);
          return;
        }

        if (!storedRole.equals(m_config.m_replicationRole)) {
          VoltDB.crashGlobalVoltDB(
              "Discovered replication role "
                  + storedRole
                  + " doesn't match locally specified replication role "
                  + m_config.m_replicationRole,
              true,
              null);
        }

        // See if we should bring the server up in WAN replication mode
        m_rvdb.setReplicationRole(storedRole);
      } catch (Exception e) {
        VoltDB.crashGlobalVoltDB("Error discovering replication role", false, e);
      }
    }
Ejemplo n.º 5
0
 /**
  * Writes {@code liveNodes} to the ZooKeeper node {@code VoltZK.lastKnownLiveNodes} as an indented
  * JSON object of the form {@code {"liveNodes": [...]}}, creating the node first if it does not
  * exist. Any failure crashes the local VoltDB instance.
  *
  * @param liveNodes ids to record under the "liveNodes" key
  */
 private void writeKnownLiveNodes(List<Integer> liveNodes) {
   try {
     final boolean nodeMissing = m_zk.exists(VoltZK.lastKnownLiveNodes, null) == null;
     if (nodeMissing) {
       // VoltZK.createPersistentZKNodes should have done this
       m_zk.create(VoltZK.lastKnownLiveNodes, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
     }

     JSONStringer json = new JSONStringer();
     json.object();
     json.key("liveNodes").array();
     for (Integer liveNode : liveNodes) {
       json.value(liveNode);
     }
     json.endArray();
     json.endObject();

     // Round-trip through JSONObject to get the indented rendering; render it only once.
     final String prettyJson = new JSONObject(json.toString()).toString(4);
     tmLog.debug("Writing live nodes to ZK: " + prettyJson);
     m_zk.setData(VoltZK.lastKnownLiveNodes, prettyJson.getBytes("UTF-8"), -1);
   } catch (Exception e) {
     VoltDB.crashLocalVoltDB(
         "Unable to update known live nodes at ZK path: " + VoltZK.lastKnownLiveNodes, true, e);
   }
 }
Ejemplo n.º 6
0
  /**
   * Handles promotion of this leader appointer.
   *
   * <p>Starts the appointee and master leader caches, then uses the sizes of their point-in-time
   * views to classify the situation: empty appointees means cluster startup, a partial set of
   * appointees or masters triggers a global crash, and anything else is treated as a repair.
   *
   * <p>On cluster startup it records the live host ids in ZooKeeper, creates the election
   * directory and a watcher for every partition, and blocks on {@code m_startupLatch}. On repair
   * it installs a watcher per known master, seeded with that master's value, and then promotes
   * {@code m_MPI}.
   *
   * @throws InterruptedException declared by this method; the visible code blocks on
   *     {@code m_startupLatch.await()}
   * @throws ExecutionException declared by this method; NOTE(review): presumably from the blocking
   *     cache/BabySitter startup -- confirm
   * @throws KeeperException if a ZooKeeper operation fails (a pre-existing election directory is
   *     tolerated)
   */
  @Override
  public void acceptPromotion() throws InterruptedException, ExecutionException, KeeperException {
    // Crank up the leader caches.  Use blocking startup so that we'll have valid point-in-time
    // caches later.
    m_iv2appointees.start(true);
    m_iv2masters.start(true);
    // Figure out what conditions we assumed leadership under.
    if (m_iv2appointees.pointInTimeCache().size() == 0) {
      tmLog.debug("LeaderAppointer in startup");
      m_state.set(AppointerState.CLUSTER_START);
    } else if ((m_iv2appointees.pointInTimeCache().size() != m_partitionCount)
        || (m_iv2masters.pointInTimeCache().size() != m_partitionCount)) {
      // If we are promoted and the appointees or masters set is partial, the previous appointer
      // failed during startup (at least for now, until we add add/remove a partition on the
      // fly).
      VoltDB.crashGlobalVoltDB("Detected failure during startup, unable to start", false, null);
    } else {
      tmLog.debug("LeaderAppointer in repair");
      m_state.set(AppointerState.DONE);
    }

    if (m_state.get() == AppointerState.CLUSTER_START) {
      // Need to block the return of acceptPromotion until after the MPI is promoted.  Wait for
      // this latch to countdown after appointing all the partition leaders.  The
      // LeaderCache callback will count it down once it has seen all the
      // appointed leaders publish themselves as the actual leaders.
      // The latch is assigned before any callback or watcher below is created.
      m_startupLatch = new CountDownLatch(1);
      writeKnownLiveNodes(m_hostMessenger.getLiveHostIds());
      for (int i = 0; i < m_partitionCount; i++) {
        String dir = LeaderElector.electionDirForPartition(i);
        // Race along with all of the replicas for this partition to create the ZK parent node
        try {
          m_zk.create(dir, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
        } catch (KeeperException.NodeExistsException e) {
          // expected on all nodes that don't start() first.
        }
        m_callbacks[i] = new PartitionCallback(i);
        // Only the BabySitter half of the returned pair is kept; the list is not used here.
        Pair<BabySitter, List<String>> sitterstuff =
            BabySitter.blockingFactory(m_zk, dir, m_callbacks[i], m_es);
        m_partitionWatchers[i] = sitterstuff.getFirst();
      }
      m_startupLatch.await();
    } else {
      // If we're taking over for a failed LeaderAppointer, we know when
      // we get here that every partition had a leader at some point in
      // time.  We'll seed each of the PartitionCallbacks for each
      // partition with the HSID of the last published leader.  The
      // blocking startup of the BabySitter watching that partition will
      // call our callback, get the current full set of replicas, and
      // appoint a new leader if the seeded one has actually failed
      Map<Integer, Long> masters = m_iv2masters.pointInTimeCache();
      tmLog.info("LeaderAppointer repairing with master set: " + masters);
      for (Entry<Integer, Long> master : masters.entrySet()) {
        int partId = master.getKey();
        String dir = LeaderElector.electionDirForPartition(partId);
        m_callbacks[partId] = new PartitionCallback(partId, master.getValue());
        Pair<BabySitter, List<String>> sitterstuff =
            BabySitter.blockingFactory(m_zk, dir, m_callbacks[partId], m_es);
        m_partitionWatchers[partId] = sitterstuff.getFirst();
      }
      // just go ahead and promote our MPI
      m_MPI.acceptPromotion();
    }
  }
Ejemplo n.º 7
0
  /**
   * Creates this node's export ack mailbox, advertises it in ZooKeeper for every local partition,
   * and hands each partition's export data sources the list of peer mailboxes.
   *
   * <p>Steps, in order:
   *
   * <ol>
   *   <li>Builds a {@link LocalMailbox} whose {@code deliver} decodes an ack message (int
   *       partition, int signature length, signature bytes, long ack USO) and forwards the USO to
   *       the matching {@code ExportDataSource}.
   *   <li>For each local partition, asynchronously creates the partition's mailbox directory and
   *       an ephemeral child named after this mailbox's HSID. The results of these asynchronous
   *       creates are not checked here.
   *   <li>On {@code m_childUpdatingThread}, reads the children of every partition directory
   *       (installing a child watcher), drops this node's own HSID, and passes the remaining
   *       HSIDs to every data source of that partition.
   * </ol>
   *
   * <p>The method blocks until step 3 has finished. Failures are rethrown unchecked; if the wait
   * or the children fetch is interrupted, the thread's interrupt status is restored first.
   *
   * @param localPartitions ids of the partitions hosted on this node
   * @param messenger source of the ZooKeeper client and registry for the new mailbox
   */
  private void createAndRegisterAckMailboxes(
      final Set<Integer> localPartitions, HostMessenger messenger) {
    m_zk = messenger.getZK();
    m_mailboxesZKPath = VoltZK.exportGenerations + "/" + m_timestamp + "/" + "mailboxes";

    m_mbox =
        new LocalMailbox(messenger) {
          @Override
          public void deliver(VoltMessage message) {
            if (message instanceof BinaryPayloadMessage) {
              BinaryPayloadMessage bpm = (BinaryPayloadMessage) message;
              // Payload layout: partition (int), signature length (int), signature, ack USO (long)
              ByteBuffer buf = ByteBuffer.wrap(bpm.m_payload);
              final int partition = buf.getInt();
              final int length = buf.getInt();
              byte stringBytes[] = new byte[length];
              buf.get(stringBytes);
              String signature = new String(stringBytes, Constants.UTF8ENCODING);
              final long ackUSO = buf.getLong();

              final HashMap<String, ExportDataSource> partitionSources =
                  m_dataSourcesByPartition.get(partition);
              if (partitionSources == null) {
                exportLog.error(
                    "Received an export ack for partition "
                        + partition
                        + " which does not exist on this node");
                return;
              }

              final ExportDataSource eds = partitionSources.get(signature);
              if (eds == null) {
                exportLog.error(
                    "Received an export ack for partition "
                        + partition
                        + " source signature "
                        + signature
                        + " which does not exist on this node");
                return;
              }

              try {
                eds.ack(ackUSO);
              } catch (RejectedExecutionException ignoreIt) {
                // ignore it: as it is already shutdown
              }
            } else {
              exportLog.error("Receive unexpected message " + message + " in export subsystem");
            }
          }
        };
    messenger.createMailbox(null, m_mbox);

    for (Integer partition : localPartitions) {
      final String partitionDN = m_mailboxesZKPath + "/" + partition;
      ZKUtil.asyncMkdirs(m_zk, partitionDN);

      // Advertise this mailbox under the partition directory; the node is ephemeral.
      ZKUtil.StringCallback cb = new ZKUtil.StringCallback();
      m_zk.create(
          partitionDN + "/" + m_mbox.getHSId(),
          null,
          Ids.OPEN_ACL_UNSAFE,
          CreateMode.EPHEMERAL,
          cb,
          null);
    }

    ListenableFuture<?> fut =
        m_childUpdatingThread.submit(
            new Runnable() {
              @Override
              public void run() {
                // Fire off all children requests first, then collect the results.
                List<Pair<Integer, ZKUtil.ChildrenCallback>> callbacks =
                    new ArrayList<Pair<Integer, ZKUtil.ChildrenCallback>>();
                for (Integer partition : localPartitions) {
                  ZKUtil.ChildrenCallback callback = new ZKUtil.ChildrenCallback();
                  m_zk.getChildren(
                      m_mailboxesZKPath + "/" + partition,
                      constructMailboxChildWatcher(),
                      callback,
                      null);
                  callbacks.add(Pair.of(partition, callback));
                }
                for (Pair<Integer, ZKUtil.ChildrenCallback> p : callbacks) {
                  final Integer partition = p.getFirst();
                  List<String> children = null;
                  try {
                    children = p.getSecond().getChildren();
                  } catch (InterruptedException e) {
                    // Keep the interrupt visible to this thread's owner before rethrowing
                    // unchecked.
                    Thread.currentThread().interrupt();
                    Throwables.propagate(e);
                  } catch (KeeperException e) {
                    Throwables.propagate(e);
                  }
                  ImmutableList.Builder<Long> mailboxes = ImmutableList.builder();

                  for (String child : children) {
                    // Skip our own mailbox; only peers are reported to the data sources.
                    if (child.equals(Long.toString(m_mbox.getHSId()))) continue;
                    mailboxes.add(Long.valueOf(child));
                  }
                  ImmutableList<Long> mailboxHsids = mailboxes.build();

                  for (ExportDataSource eds : m_dataSourcesByPartition.get(partition).values()) {
                    eds.updateAckMailboxes(Pair.of(m_mbox, mailboxHsids));
                  }
                }
              }
            });
    try {
      fut.get();
    } catch (InterruptedException e) {
      // Restore the interrupt status that would otherwise be lost when wrapping the exception.
      Thread.currentThread().interrupt();
      Throwables.propagate(e);
    } catch (Throwable t) {
      Throwables.propagate(t);
    }
  }