Beispiel #1
0
  private long assignLeader(int partitionId, List<Long> children) {
    // We used masterHostId = -1 as a way to force the leader choice to be
    // the first replica in the list, if we don't have some other mechanism
    // which has successfully overridden it.
    int masterHostId = -1;
    if (m_state.get() == AppointerState.CLUSTER_START) {
      try {
        // find master in topo
        JSONArray parts = m_topo.getJSONArray("partitions");
        for (int p = 0; p < parts.length(); p++) {
          JSONObject aPartition = parts.getJSONObject(p);
          int pid = aPartition.getInt("partition_id");
          if (pid == partitionId) {
            masterHostId = aPartition.getInt("master");
          }
        }
      } catch (JSONException jse) {
        tmLog.error("Failed to find master for partition " + partitionId + ", defaulting to 0");
        jse.printStackTrace();
        masterHostId = -1; // stupid default
      }
    } else {
      // For now, if we're appointing a new leader as a result of a
      // failure, just pick the first replica in the children list.
      // Could eventually do something more complex here to try to keep a
      // semi-balance, but it's unclear that this has much utility until
      // we add rebalancing on rejoin as well.
      masterHostId = -1;
    }

    long masterHSId = children.get(0);
    for (Long child : children) {
      if (CoreUtils.getHostIdFromHSId(child) == masterHostId) {
        masterHSId = child;
        break;
      }
    }
    tmLog.info(
        "Appointing HSId "
            + CoreUtils.hsIdToString(masterHSId)
            + " as leader for partition "
            + partitionId);
    try {
      m_iv2appointees.put(partitionId, masterHSId);
    } catch (Exception e) {
      VoltDB.crashLocalVoltDB("Unable to appoint new master for partition " + partitionId, true, e);
    }
    return masterHSId;
  }
Beispiel #2
0
    @Override
    public void run(List<String> children) {
      List<Long> updatedHSIds = VoltZK.childrenToReplicaHSIds(children);
      // compute previously unseen HSId set in the callback list
      Set<Long> newHSIds = new HashSet<Long>(updatedHSIds);
      newHSIds.removeAll(m_replicas);
      tmLog.debug("Newly seen replicas: " + CoreUtils.hsIdCollectionToString(newHSIds));
      // compute previously seen but now vanished from the callback list HSId set
      Set<Long> missingHSIds = new HashSet<Long>(m_replicas);
      missingHSIds.removeAll(updatedHSIds);
      tmLog.debug("Newly dead replicas: " + CoreUtils.hsIdCollectionToString(missingHSIds));

      tmLog.debug(
          "Handling babysitter callback for partition "
              + m_partitionId
              + ": children: "
              + CoreUtils.hsIdCollectionToString(updatedHSIds));
      if (m_state.get() == AppointerState.CLUSTER_START) {
        // We can't yet tolerate a host failure during startup.  Crash it all
        if (missingHSIds.size() > 0) {
          VoltDB.crashGlobalVoltDB("Node failure detected during startup.", false, null);
        }
        // ENG-3166: Eventually we would like to get rid of the extra replicas beyond k_factor,
        // but for now we just look to see how many replicas of this partition we actually expect
        // and gate leader assignment on that many copies showing up.
        int replicaCount = m_kfactor + 1;
        JSONArray parts;
        try {
          parts = m_topo.getJSONArray("partitions");
          for (int p = 0; p < parts.length(); p++) {
            JSONObject aPartition = parts.getJSONObject(p);
            int pid = aPartition.getInt("partition_id");
            if (pid == m_partitionId) {
              replicaCount = aPartition.getJSONArray("replicas").length();
            }
          }
        } catch (JSONException e) {
          // Ignore and just assume the normal number of replicas
        }
        if (children.size() == replicaCount) {
          m_currentLeader = assignLeader(m_partitionId, updatedHSIds);
        } else {
          tmLog.info(
              "Waiting on "
                  + ((m_kfactor + 1) - children.size())
                  + " more nodes "
                  + "for k-safety before startup");
        }
      } else {
        // Check for k-safety
        if (!isClusterKSafe()) {
          VoltDB.crashGlobalVoltDB(
              "Some partitions have no replicas.  Cluster has become unviable.", false, null);
        }
        // Check if replay has completed
        if (m_replayComplete.get() == false) {
          VoltDB.crashGlobalVoltDB(
              "Detected node failure during command log replay. Cluster will shut down.",
              false,
              null);
        }
        // Check to see if there's been a possible network partition and we're not already handling
        // it
        if (m_partitionDetectionEnabled && !m_partitionDetected) {
          doPartitionDetectionActivities();
        }
        // If we survived the above gauntlet of fail, appoint a new leader for this partition.
        if (missingHSIds.contains(m_currentLeader)) {
          m_currentLeader = assignLeader(m_partitionId, updatedHSIds);
        }
      }
      m_replicas.clear();
      m_replicas.addAll(updatedHSIds);
    }