/**
   * Creates one AUTO-mode ideal state per resource, referencing the "MasterSlave" state model, and
   * writes each of them through {@code accessor}.
   *
   * <p>Partition {@code p} of every resource gets a list field named {@code <resource>_<p>} with
   * {@code replicas} entries, where entry {@code r} is
   * {@code "localhost_" + ((p + r + 1) % nodes)}.
   *
   * @param nodes number of nodes used as the modulus when picking instance names; must be non-zero
   *     whenever at least one list entry is generated
   * @param resources names of the resources to create ideal states for
   * @param partitions number of partitions per resource
   * @param replicas number of list entries generated per partition
   * @return the created ideal states, in the order of {@code resources}
   */
  protected List<IdealState> setupIdealState(
      int nodes, String[] resources, int partitions, int replicas) {
    List<IdealState> idealStates = new ArrayList<IdealState>();

    for (String resourceName : resources) {
      ZNRecord record = new ZNRecord(resourceName);
      for (int p = 0; p < partitions; p++) {
        List<String> value = new ArrayList<String>();
        for (int r = 0; r < replicas; r++) {
          // % binds tighter than +, so the index is reduced modulo nodes before concatenation
          value.add("localhost_" + (p + r + 1) % nodes);
        }
        record.setListField(resourceName + "_" + p, value);
      }

      IdealState idealState = new IdealState(record);
      idealState.setStateModelDefRef("MasterSlave");
      idealState.setIdealStateMode(IdealStateModeProperty.AUTO.toString());
      idealState.setNumPartitions(partitions);
      idealStates.add(idealState);

      Builder keyBuilder = accessor.keyBuilder();
      accessor.setProperty(keyBuilder.idealStates(resourceName), idealState);
    }
    return idealStates;
  }
// Example #2
// 0
  /**
   * Collects every instance that appears in the instance set of any partition of a resource.
   *
   * @param idealState ideal state of the resource whose instances are wanted
   * @return the distinct instance names found across all partitions of the given resource
   */
  public static Set<String> getAllInstancesForResource(IdealState idealState) {
    final Set<String> allInstances = new HashSet<String>();
    for (final String partitionName : idealState.getPartitionSet()) {
      allInstances.addAll(idealState.getInstanceSet(partitionName));
    }
    return allInstances;
  }
  /**
   * Removes from the event's resource config map every resource whose ideal state matches none of
   * the configured ideal state rules, or whose state model definition is absent from the cluster.
   *
   * <p>Resources without an ideal state are left untouched. When no rules are configured (null or
   * empty rule map) only the state model check is applied.
   *
   * @param event cluster event that must carry the "Cluster" attribute and the resources attribute
   * @throws StageException if the cluster or the resource config map is missing from the event
   */
  @Override
  public void process(ClusterEvent event) throws Exception {
    Cluster cluster = event.getAttribute("Cluster");
    if (cluster == null) {
      throw new StageException("Missing attributes in event:" + event + ". Requires Cluster");
    }

    Map<ResourceId, ResourceConfig> resourceConfigMap =
        event.getAttribute(AttributeName.RESOURCES.toString());
    if (resourceConfigMap == null) {
      throw new StageException("Resources must be computed prior to validation!");
    }

    Map<ResourceId, Resource> resourceMap = cluster.getResourceMap();
    Map<String, Map<String, String>> idealStateRuleMap =
        event.getAttribute(AttributeName.IDEAL_STATE_RULES.toString());

    for (Map.Entry<ResourceId, Resource> resourceEntry : resourceMap.entrySet()) {
      ResourceId resourceId = resourceEntry.getKey();
      IdealState idealState = resourceEntry.getValue().getIdealState();
      if (idealState == null) {
        continue;
      }

      // Rule check: the ideal state has to satisfy at least one rule, when rules exist at all.
      if (idealStateRuleMap != null && !idealStateRuleMap.isEmpty()) {
        boolean matched = false;
        for (Map<String, String> rule : idealStateRuleMap.values()) {
          if (idealStateMatchesRule(idealState, rule)) {
            matched = true;
            break;
          }
        }
        if (!matched) {
          LOG.warn("Resource " + resourceId + " does not have a valid ideal state!");
          resourceConfigMap.remove(resourceId);
        }
      }

      // State model check: the referenced definition has to be known to the cluster.
      StateModelDefinitionId stateModelDefId = idealState.getStateModelDefId();
      if (cluster.getStateModelMap().get(stateModelDefId) == null) {
        LOG.warn(
            "Resource "
                + resourceId
                + " uses state model "
                + stateModelDefId
                + ", but it is not on the cluster!");
        resourceConfigMap.remove(resourceId);
      }
    }
  }
  /**
   * Runs the segment status checker against an external view that lacks partitions present in the
   * ideal state.
   *
   * <p>The ideal state lists four segments; the external view only reports two of them, with one
   * replica in ERROR. The test expects SEGMENTS_IN_ERROR_STATE to be 1 and NUMBER_OF_REPLICAS to
   * be 0 for the table.
   */
  @Test
  public void missingEVPartitionTest() throws Exception {
    final String tableName = "myTable";

    // Ideal state: two segments ONLINE on three servers, two more segments on pinot3 only.
    IdealState idealState = new IdealState(tableName);
    for (String segment : new String[] {"myTable_0", "myTable_1"}) {
      for (String server : new String[] {"pinot1", "pinot2", "pinot3"}) {
        idealState.setPartitionState(segment, server, "ONLINE");
      }
    }
    idealState.setPartitionState("myTable_2", "pinot3", "OFFLINE");
    idealState.setPartitionState("myTable_3", "pinot3", "ONLINE");
    idealState.setReplicas("2");
    idealState.setRebalanceMode(IdealState.RebalanceMode.CUSTOMIZED);

    // External view: only the first two segments are reported, one replica is in ERROR.
    ExternalView externalView = new ExternalView(tableName);
    externalView.setState("myTable_0", "pinot1", "ONLINE");
    externalView.setState("myTable_0", "pinot2", "ONLINE");
    externalView.setState("myTable_1", "pinot1", "ERROR");
    externalView.setState("myTable_1", "pinot2", "ONLINE");

    HelixAdmin helixAdmin = mock(HelixAdmin.class);
    when(helixAdmin.getResourceIdealState("StatusChecker", "myTable")).thenReturn(idealState);
    when(helixAdmin.getResourceExternalView("StatusChecker", "myTable")).thenReturn(externalView);

    List<String> allTableNames = new ArrayList<String>();
    allTableNames.add(tableName);
    helixResourceManager = mock(PinotHelixResourceManager.class);
    when(helixResourceManager.isLeader()).thenReturn(true);
    when(helixResourceManager.getAllPinotTableNames()).thenReturn(allTableNames);
    when(helixResourceManager.getHelixClusterName()).thenReturn("StatusChecker");
    when(helixResourceManager.getHelixAdmin()).thenReturn(helixAdmin);

    config = mock(ControllerConf.class);
    when(config.getStatusControllerFrequencyInSeconds()).thenReturn(300);

    metricsRegistry = new MetricsRegistry();
    controllerMetrics = new ControllerMetrics(metricsRegistry);
    segmentStatusChecker = new SegmentStatusChecker(helixResourceManager, config);
    segmentStatusChecker.setMetricsRegistry(controllerMetrics);
    segmentStatusChecker.runSegmentMetrics();

    Assert.assertEquals(
        controllerMetrics.getValueOfTableGauge(
            externalView.getId(), ControllerGauge.SEGMENTS_IN_ERROR_STATE),
        1);
    Assert.assertEquals(
        controllerMetrics.getValueOfTableGauge(
            externalView.getId(), ControllerGauge.NUMBER_OF_REPLICAS),
        0);
    segmentStatusChecker.stop();
  }
    /**
     * Rewrites the given ideal state in place so that every partition keeps a single instance.
     *
     * <p>For each partition, the first entry of its preference list becomes the only entry, and the
     * partition's instance-state map is reset to map that instance to {@code "MASTER"}. The replica
     * count is then set to {@code "1"}. The call is also recorded in
     * {@code testRebalancerInvoked}.
     *
     * @param resourceName not read by this implementation
     * @param currentIdealState ideal state to modify; this same object is returned
     * @param currentStateOutput not read by this implementation
     * @param clusterData not read by this implementation
     * @return {@code currentIdealState} after modification
     */
    @Override
    public IdealState computeNewIdealState(
        String resourceName,
        IdealState currentIdealState,
        CurrentStateOutput currentStateOutput,
        ClusterDataCache clusterData) {
      testRebalancerInvoked = true;
      for (String partition : currentIdealState.getPartitionSet()) {
        // Remember the first preferred instance before the list is emptied.
        String instance = currentIdealState.getPreferenceList(partition).get(0);
        // NOTE(review): the clear/add pairs below only take effect if these getters return the
        // live backing collections rather than copies - confirm against IdealState.
        currentIdealState.getPreferenceList(partition).clear();
        currentIdealState.getPreferenceList(partition).add(instance);

        currentIdealState.getInstanceStateMap(partition).clear();
        currentIdealState.getInstanceStateMap(partition).put(instance, "MASTER");
      }
      currentIdealState.setReplicas("1");
      return currentIdealState;
    }
 /**
  * Tests whether an ideal state satisfies a single rule.
  *
  * @param idealState the ideal state to check
  * @param rule simple-field names mapped to the values they are required to have
  * @return true if every rule entry is present with an equal value among the ideal state's
  *     simple fields, false otherwise
  */
 private boolean idealStateMatchesRule(IdealState idealState, Map<String, String> rule) {
   Map<String, String> actualFields = idealState.getRecord().getSimpleFields();
   for (Map.Entry<String, String> required : rule.entrySet()) {
     String fieldName = required.getKey();
     boolean satisfied =
         actualFields.containsKey(fieldName)
             && required.getValue().equals(actualFields.get(fieldName));
     if (!satisfied) {
       return false;
     }
   }
   return true;
 }
  /**
   * Writes a test resource, one live instance and the requested version settings, then runs
   * {@code ReadClusterDataStage} on {@code event}.
   *
   * @param controllerVersion version to set on the mock manager, or null to leave it unchanged
   * @param participantVersion Helix version recorded on the live instance, or null to omit it
   * @param minSupportedParticipantVersion value stored under
   *     {@code minimum_supported_version.participant}, or null to leave the property unset
   */
  private void prepare(
      String controllerVersion, String participantVersion, String minSupportedParticipantVersion) {
    PropertyKeyBuilder keyBuilder = accessor.keyBuilder();

    // ideal state for "testResource": 10 partitions, 1 replica, computed for five instances
    String resourceName = "testResource";
    List<String> instanceNames =
        Arrays.asList("localhost_0", "localhost_1", "localhost_2", "localhost_3", "localhost_4");
    ZNRecord idealStateRecord =
        DefaultTwoStateStrategy.calculateIdealState(
            instanceNames, 10, 1, resourceName, "MASTER", "SLAVE");
    IdealState idealState = new IdealState(idealStateRecord);
    idealState.setStateModelDefId(StateModelDefinitionId.from("MasterSlave"));
    accessor.setProperty(keyBuilder.idealStates(resourceName), idealState);

    // a single live instance, optionally tagged with the participant version
    ZNRecord liveInstanceRecord = new ZNRecord("localhost_0");
    if (participantVersion != null) {
      liveInstanceRecord.setSimpleField(
          LiveInstanceProperty.HELIX_VERSION.toString(), participantVersion);
    }
    LiveInstance liveInstance = new LiveInstance(liveInstanceRecord);
    liveInstance.setSessionId("session_0");
    accessor.setProperty(keyBuilder.liveInstance("localhost_0"), liveInstance);

    InstanceConfig instanceConfig = new InstanceConfig(liveInstance.getInstanceName());
    accessor.setProperty(
        keyBuilder.instanceConfig(instanceConfig.getInstanceName()), instanceConfig);

    // version settings on the controller side
    if (controllerVersion != null) {
      ((Mocks.MockManager) manager).setVersion(controllerVersion);
    }
    if (minSupportedParticipantVersion != null) {
      manager
          .getProperties()
          .getProperties()
          .put("minimum_supported_version.participant", minSupportedParticipantVersion);
    }

    event.addAttribute("helixmanager", manager);
    runStage(event, new ReadClusterDataStage());
  }
  /**
   * Configures a resource with a user-defined rebalancer class and checks that the rebalancer was
   * created and invoked.
   *
   * <p>After the cluster settles, the external view is expected to hold 60 partitions with one
   * instance each, while the stored ideal state still has three instances per partition.
   */
  @Test
  public void testCustomizedIdealStateRebalancer() throws InterruptedException {
    final int partitionCount = 60;
    final int replicaCount = 3;

    _setupTool.addResourceToCluster(CLUSTER_NAME, db2, partitionCount, "MasterSlave");
    _setupTool.addResourceProperty(
        CLUSTER_NAME,
        db2,
        IdealStateProperty.REBALANCER_CLASS_NAME.toString(),
        TestCustomizedIdealStateRebalancer.TestRebalancer.class.getName());
    _setupTool.addResourceProperty(
        CLUSTER_NAME,
        db2,
        IdealStateProperty.REBALANCE_MODE.toString(),
        RebalanceMode.USER_DEFINED.toString());
    _setupTool.rebalanceStorageCluster(CLUSTER_NAME, db2, replicaCount);

    Assert.assertTrue(
        ClusterStateVerifier.verifyByZkCallback(
            new ExternalViewBalancedVerifier(_gZkClient, CLUSTER_NAME, db2)));
    Thread.sleep(1000);

    HelixDataAccessor dataAccessor =
        new ZKHelixDataAccessor(CLUSTER_NAME, new ZkBaseDataAccessor<ZNRecord>(_gZkClient));
    Builder keys = dataAccessor.keyBuilder();

    // external view: the custom rebalancer leaves exactly one instance per partition
    ExternalView externalView = dataAccessor.getProperty(keys.externalView(db2));
    Assert.assertEquals(externalView.getPartitionSet().size(), partitionCount);
    for (String partitionName : externalView.getPartitionSet()) {
      Assert.assertEquals(externalView.getStateMap(partitionName).size(), 1);
    }

    // stored ideal state: still three instances per partition
    IdealState storedIdealState = dataAccessor.getProperty(keys.idealStates(db2));
    for (String partitionName : storedIdealState.getPartitionSet()) {
      Assert.assertEquals(storedIdealState.getPreferenceList(partitionName).size(), replicaCount);
      Assert.assertEquals(storedIdealState.getInstanceStateMap(partitionName).size(), replicaCount);
    }

    Assert.assertTrue(testRebalancerCreated);
    Assert.assertTrue(testRebalancerInvoked);
  }
  /**
   * Computes, for each state of a state model, the upper bound on the number of participants that
   * may be in that state.
   *
   * <p>The per-state count from the state model definition is interpreted as follows: {@code "N"}
   * means the number of live participants; {@code "R"} means the ideal state's replica count (or
   * the number of live participants when the replica count is the ANY_LIVEINSTANCE token);
   * anything else is parsed as an integer. States whose bound cannot be determined get no entry.
   *
   * <p>TODO: This code is duplicate in multiple places. Can we do it in to one place in the
   * beginning and compute the stateConstraint instance once and re use at other places. Each
   * IdealState must have a constraint object associated with it
   *
   * @param stateModelDefinition state model whose states are bounded
   * @param idealState ideal state supplying the replica count; if null, {@code "R"} cannot be
   *     evaluated and such states get no constraint
   * @param cluster cluster snapshot supplying the live participants
   * @return map from state to bounds {@code [0, max]}; unconstrained states are omitted
   * @throws NumberFormatException if a state is bounded by {@code "R"} and the ideal state's
   *     replica count is neither the ANY_LIVEINSTANCE token nor an integer
   */
  private Map<State, Bounds> computeStateConstraints(
      StateModelDefinition stateModelDefinition, IdealState idealState, Cluster cluster) {
    Map<State, Bounds> stateConstraints = new HashMap<State, Bounds>();

    List<State> statePriorityList = stateModelDefinition.getTypedStatesPriorityList();
    for (State state : statePriorityList) {
      String numInstancesPerState = stateModelDefinition.getNumParticipantsPerState(state);
      // -1 marks "no constraint"
      int max = -1;
      if ("N".equals(numInstancesPerState)) {
        max = cluster.getLiveParticipantMap().size();
      } else if ("R".equals(numInstancesPerState)) {
        // idealState is null when resource has been dropped,
        // R can't be evaluated and ignore state constraints
        if (idealState != null) {
          String replicas = idealState.getReplicas();
          if (replicas.equals(StateModelToken.ANY_LIVEINSTANCE.toString())) {
            max = cluster.getLiveParticipantMap().size();
          } else {
            max = Integer.parseInt(replicas);
          }
        }
      } else {
        try {
          max = Integer.parseInt(numInstancesPerState);
        } catch (NumberFormatException ignored) {
          // not a number (or null): leave max at -1 so the state stays unconstrained
        }
      }

      if (max > -1) {
        // if state has no constraint, will not put in map
        stateConstraints.put(state, new Bounds(0, max));
      }
    }

    return stateConstraints;
  }
  /**
   * Builds a relay ideal state and checks its shape and balance.
   *
   * <p>Checks that list fields and map fields have the same number of entries, that every list has
   * exactly {@code replica} entries matching the keys of the corresponding map, and that no single
   * value occurs more than {@code partitions * replica / nodes + 1} times across all lists.
   *
   * @param partitions number of storage node names to generate
   * @param nodes number of relay names to generate
   * @param replica expected number of entries per list field
   */
  public void testEspressoStorageClusterIdealState(int partitions, int nodes, int replica)
      throws Exception {
    List<String> storageNodes = new ArrayList<String>();
    for (int s = 0; s < partitions; s++) {
      storageNodes.add("localhost:123" + s);
    }
    List<String> relays = new ArrayList<String>();
    for (int r = 0; r < nodes; r++) {
      relays.add("relay:123" + r);
    }

    IdealState idealstate =
        IdealStateCalculatorForEspressoRelay.calculateRelayIdealState(
            storageNodes, relays, "TEST", replica, "Leader", "Standby", "LeaderStandby");

    Map<String, List<String>> listFields = idealstate.getRecord().getListFields();
    Map<String, Map<String, String>> mapFields = idealstate.getRecord().getMapFields();
    Assert.assertEquals(listFields.size(), mapFields.size());

    // number of times each value shows up across all list fields
    Map<String, Integer> occurrences = new TreeMap<String, Integer>();
    for (Map.Entry<String, List<String>> field : listFields.entrySet()) {
      List<String> list = field.getValue();
      Map<String, String> map = mapFields.get(field.getKey());
      Assert.assertEquals(list.size(), map.size());
      Assert.assertEquals(list.size(), replica);
      for (String val : list) {
        Integer seen = occurrences.get(val);
        occurrences.put(val, seen == null ? 1 : seen + 1);
        Assert.assertTrue(map.containsKey(val));
      }
    }

    for (Integer count : occurrences.values()) {
      Assert.assertTrue(count <= partitions * replica / nodes + 1);
    }
    System.out.println();
  }
// Example #11
// 0
  /**
   * Checks that a partition whose preference list is the ANY_LIVEINSTANCE token ends up in MASTER
   * state on every participant, including one that joins after the others.
   *
   * <p>The cluster is set up with one single-partition resource using the
   * "STORAGE_DEFAULT_SM_SCHEMATA" state model and no initial rebalance. Participants 1..n-1 are
   * started first, then participant 0, with a cluster-state verification after each step.
   */
  @Test
  public void testSchemataSM() throws Exception {
    String className = TestHelper.getTestClassName();
    String methodName = TestHelper.getTestMethodName();
    String clusterName = className + "_" + methodName;
    int n = 5;

    MockParticipant[] participants = new MockParticipant[n];

    System.out.println("START " + clusterName + " at " + new Date(System.currentTimeMillis()));

    TestHelper.setupCluster(
        clusterName,
        _zkaddr,
        12918, // participant start port
        "localhost", // participant name prefix
        "TestSchemata", // resource name prefix
        1, // resources
        1, // partitions per resource
        n, // number of nodes
        0, // replicas
        "STORAGE_DEFAULT_SM_SCHEMATA",
        false); // don't rebalance

    // rebalance ideal-state to use ANY_LIVEINSTANCE for preference list
    ZKHelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseAccessor);
    PropertyKey.Builder keyBuilder = accessor.keyBuilder();
    PropertyKey key = keyBuilder.idealStates("TestSchemata0");
    IdealState idealState = accessor.getProperty(key);
    // both the replica count and the single partition's list field are set to the token
    idealState.setReplicas(HelixConstants.StateModelToken.ANY_LIVEINSTANCE.toString());
    idealState
        .getRecord()
        .setListField(
            "TestSchemata0_0",
            Arrays.asList(HelixConstants.StateModelToken.ANY_LIVEINSTANCE.toString()));
    accessor.setProperty(key, idealState);

    MockController controller = new MockController(_zkaddr, clusterName, "controller");
    controller.syncStart();

    // start n-1 participants
    // (index 0 is deliberately left out here and started after the first verification)
    for (int i = 1; i < n; i++) {
      String instanceName = "localhost_" + (12918 + i);

      participants[i] = new MockParticipant(_zkaddr, clusterName, instanceName);
      participants[i].syncStart();
    }

    boolean result =
        ClusterStateVerifier.verifyByZkCallback(
            new BestPossAndExtViewZkVerifier(_zkaddr, clusterName));
    Assert.assertTrue(result);

    // start the remaining 1 participant
    participants[0] = new MockParticipant(_zkaddr, clusterName, "localhost_12918");
    participants[0].syncStart();

    // make sure we have all participants in MASTER state
    result =
        ClusterStateVerifier.verifyByZkCallback(
            new BestPossAndExtViewZkVerifier(_zkaddr, clusterName));
    Assert.assertTrue(result);
    // the key variable is reused: from here on it addresses the external view
    key = keyBuilder.externalView("TestSchemata0");
    ExternalView externalView = accessor.getProperty(key);
    Map<String, String> stateMap = externalView.getStateMap("TestSchemata0_0");
    Assert.assertNotNull(stateMap);
    Assert.assertEquals(stateMap.size(), n, "all " + n + " participants should be in Master state");
    for (int i = 0; i < n; i++) {
      String instanceName = "localhost_" + (12918 + i);
      Assert.assertNotNull(stateMap.get(instanceName));
      Assert.assertEquals(stateMap.get(instanceName), "MASTER");
    }

    // clean up
    // NOTE(review): this is only reached when every assertion above passes; on failure the
    // controller and participants are left running.
    controller.syncStop();
    for (int i = 0; i < n; i++) {
      participants[i].syncStop();
    }

    System.out.println("END " + clusterName + " at " + new Date(System.currentTimeMillis()));
  }
  /**
   * Recomputes the master/slave assignment of the MySQL master-slave resource and writes it back
   * if it changed.
   *
   * <p>For every partition the current master is kept as long as it is a live instance with an
   * enabled instance config; otherwise {@code findNewMaster} is asked to pick a replacement. All
   * other instances of the partition are set to SLAVE. FINALIZE notifications and a resource
   * whose ideal state does not exist yet are logged and otherwise ignored.
   *
   * @param context notification context supplying the notification type and the Helix manager
   */
  @Override
  public void onCallback(NotificationContext context) {
    LOG.info(
        "START: MasterSlaveRebalancer.onCallback running at "
            + _context.getHelixManager().getInstanceName());

    if (context.getType().equals(NotificationContext.Type.FINALIZE)) {
      LOG.info(
          "END: MasterSlaveRebalancer.onCallback FINALIZE callback invoked. Likely lost connection to Helix");
      return;
    }

    HelixManager manager = context.getManager();
    String clusterName = manager.getClusterName();
    HelixAdmin helixAdmin = manager.getClusterManagmentTool();
    IdealState idealState =
        helixAdmin.getResourceIdealState(clusterName, MySQLConstants.MASTER_SLAVE_RESOURCE_NAME);

    if (idealState == null) {
      LOG.info(
          "END: MasterSlaveRebalancer.onCallback. "
              + MySQLConstants.MASTER_SLAVE_RESOURCE_NAME
              + " is not yet created");
      // Nothing to rebalance yet; continuing would dereference the null ideal state below.
      return;
    }

    PropertyKey.Builder builder = new PropertyKey.Builder(clusterName);
    Map<String, LiveInstance> liveInstancesMap =
        manager.getHelixDataAccessor().getChildValuesMap(builder.liveInstances());

    Map<String, InstanceConfig> instanceConfigs =
        manager.getHelixDataAccessor().getChildValuesMap(builder.instanceConfigs());

    IdealState newIdealState = new IdealState(idealState.getId());
    newIdealState.getRecord().setSimpleFields(idealState.getRecord().getSimpleFields());
    newIdealState.getRecord().setListFields(idealState.getRecord().getListFields());
    for (String partition : idealState.getPartitionSet()) {
      Map<String, String> instanceStateMap = idealState.getInstanceStateMap(partition);
      String currMaster = null;
      Set<String> slaveSet = new TreeSet<String>();
      for (Map.Entry<String, String> instanceState : instanceStateMap.entrySet()) {
        if ("MASTER".equalsIgnoreCase(instanceState.getValue())) {
          currMaster = instanceState.getKey();
        }
        if ("SLAVE".equalsIgnoreCase(instanceState.getValue())) {
          slaveSet.add(instanceState.getKey());
        }
      }

      // The current master stays only if it exists, is live, and has an enabled config. A missing
      // master or missing config is treated as unusable rather than causing a NullPointerException.
      boolean masterUsable = false;
      if (currMaster != null && liveInstancesMap.containsKey(currMaster)) {
        InstanceConfig masterConfig = instanceConfigs.get(currMaster);
        masterUsable = masterConfig != null && masterConfig.getInstanceEnabled();
      }

      String newMaster = currMaster;
      if (!masterUsable) {
        // need to find a new master.
        newMaster = findNewMaster(liveInstancesMap, instanceConfigs, currMaster, slaveSet);
      }
      for (String instance : instanceStateMap.keySet()) {
        if (instance.equalsIgnoreCase(newMaster)) {
          newIdealState.setPartitionState(partition, instance, "MASTER");
        } else {
          newIdealState.setPartitionState(partition, instance, "SLAVE");
        }
      }
    }
    if (!idealState.equals(newIdealState)) {
      LOG.info("New idealstate computed.");
      LOG.info(newIdealState.toString());
      manager
          .getClusterManagmentTool()
          .setResourceIdealState(
              clusterName, MySQLConstants.MASTER_SLAVE_RESOURCE_NAME, newIdealState);
    } else {
      LOG.info("No change in IdealState");
    }
    LOG.info("END: MasterSlaveRebalancer.onCallback");
  }
  /**
   * Creates new realtime segments for server instances of realtime tables that need one.
   *
   * <p>For each realtime table the current ideal state is read. If it has no partitions, a new
   * segment name is generated for every server instance of the table. Otherwise a new segment
   * name is generated only for server instances that do not appear as the instance of a segment
   * whose metadata status is IN_PROGRESS. Each generated segment that is not already in the ideal
   * state gets IN_PROGRESS metadata written to the property store and is added to the ideal
   * state with retries.
   *
   * <p>Tables for which no ideal state is returned are skipped with a warning.
   */
  private synchronized void assignRealtimeSegmentsToServerInstancesIfNecessary()
      throws JSONException, IOException {
    // Fetch current ideal state snapshot
    Map<String, IdealState> idealStateMap = new HashMap<String, IdealState>();

    for (String resource : _pinotHelixResourceManager.getAllRealtimeTables()) {
      idealStateMap.put(
          resource,
          _pinotHelixResourceManager
              .getHelixAdmin()
              .getResourceIdealState(_pinotHelixResourceManager.getHelixClusterName(), resource));
    }

    List<String> listOfSegmentsToAdd = new ArrayList<String>();

    for (Map.Entry<String, IdealState> idealStateEntry : idealStateMap.entrySet()) {
      String resource = idealStateEntry.getKey();
      IdealState state = idealStateEntry.getValue();

      if (state == null) {
        // Without an ideal state there is nothing to inspect or update for this table.
        LOGGER.warn(
            "No ideal state found for resource {}, skipping realtime segment assignment", resource);
        continue;
      }

      // Are there any partitions?
      if (state.getPartitionSet().size() == 0) {
        // No, this is a brand new ideal state, so we will add one new segment to every partition
        // and replica
        List<String> instancesInResource = new ArrayList<String>();
        try {
          instancesInResource.addAll(
              _pinotHelixResourceManager.getServerInstancesForTable(resource, TableType.REALTIME));
        } catch (Exception e) {
          // Best effort: on failure the instance list stays empty and no segment is generated.
          LOGGER.error("Caught exception while fetching instances for resource {}", resource, e);
        }

        // Assign a new segment to all server instances
        appendNewRealtimeSegmentNames(resource, instancesInResource, listOfSegmentsToAdd);
      } else {
        // Add all server instances to the list of instances for which to assign a realtime segment
        Set<String> instancesToAssignRealtimeSegment = new HashSet<String>();
        instancesToAssignRealtimeSegment.addAll(
            _pinotHelixResourceManager.getServerInstancesForTable(resource, TableType.REALTIME));

        // Remove server instances that are currently processing a segment
        for (String partition : state.getPartitionSet()) {
          RealtimeSegmentZKMetadata realtimeSegmentZKMetadata =
              ZKMetadataProvider.getRealtimeSegmentZKMetadata(
                  _pinotHelixResourceManager.getPropertyStore(),
                  SegmentNameBuilder.Realtime.extractTableName(partition),
                  partition);
          if (realtimeSegmentZKMetadata.getStatus() == Status.IN_PROGRESS) {
            String instanceName = SegmentNameBuilder.Realtime.extractInstanceName(partition);
            instancesToAssignRealtimeSegment.remove(instanceName);
          }
        }

        // Assign a new segment to the server instances not currently processing this segment
        appendNewRealtimeSegmentNames(
            resource, instancesToAssignRealtimeSegment, listOfSegmentsToAdd);
      }
    }

    LOGGER.info(
        "Computed list of new segments to add : " + Arrays.toString(listOfSegmentsToAdd.toArray()));

    // Add the new segments to the server instances
    for (String segmentId : listOfSegmentsToAdd) {
      String resourceName = SegmentNameBuilder.Realtime.extractTableName(segmentId);
      String instanceName = SegmentNameBuilder.Realtime.extractInstanceName(segmentId);

      // Does the ideal state already contain this segment?
      if (!idealStateMap.get(resourceName).getPartitionSet().contains(segmentId)) {
        // No, add it
        // Create the realtime segment metadata
        RealtimeSegmentZKMetadata realtimeSegmentMetadataToAdd = new RealtimeSegmentZKMetadata();
        realtimeSegmentMetadataToAdd.setTableName(
            TableNameBuilder.extractRawTableName(resourceName));
        realtimeSegmentMetadataToAdd.setSegmentType(SegmentType.REALTIME);
        realtimeSegmentMetadataToAdd.setStatus(Status.IN_PROGRESS);
        realtimeSegmentMetadataToAdd.setSegmentName(segmentId);

        // Add the new metadata to the property store
        ZKMetadataProvider.setRealtimeSegmentZKMetadata(
            _pinotHelixResourceManager.getPropertyStore(), realtimeSegmentMetadataToAdd);

        // Update the ideal state to add the new realtime segment
        HelixHelper.updateIdealState(
            _pinotHelixResourceManager.getHelixZkManager(),
            resourceName,
            idealState ->
                PinotTableIdealStateBuilder.addNewRealtimeSegmentToIdealState(
                    segmentId, idealState, instanceName),
            RetryPolicies.exponentialBackoffRetryPolicy(5, 500L, 2.0f));
      }
    }
  }

  /**
   * Appends one newly built realtime segment name per instance to {@code segmentNames}, using the
   * instance's group id and partition for {@code resource} and the current time in milliseconds.
   */
  private void appendNewRealtimeSegmentNames(
      String resource, Iterable<String> instanceIds, List<String> segmentNames)
      throws JSONException, IOException {
    for (String instanceId : instanceIds) {
      InstanceZKMetadata instanceZKMetadata =
          _pinotHelixResourceManager.getInstanceZKMetadata(instanceId);
      String groupId = instanceZKMetadata.getGroupId(resource);
      String partitionId = instanceZKMetadata.getPartition(resource);
      segmentNames.add(
          SegmentNameBuilder.Realtime.build(
              resource,
              instanceId,
              groupId,
              partitionId,
              String.valueOf(System.currentTimeMillis())));
    }
  }
  /**
   * Checks that an ideal state update issued while ZooKeeper is down is applied once ZooKeeper
   * comes back.
   *
   * <p>The test starts its own ZooKeeper server, a controller and one participant, stops the
   * server, schedules a restart two seconds later and then writes the ideal state. It expects the
   * participant's state model latch to be released within ten seconds and the cluster to verify
   * as stable afterwards.
   */
  @Test
  public void testZKReconnect() throws Exception {
    final AtomicReference<ZkServer> zkServerRef = new AtomicReference<ZkServer>();
    final int zkPort = TestHelper.getRandomPort();
    final String zkAddr = String.format("localhost:%d", zkPort);
    zkServerRef.set(TestHelper.startZkServer(zkAddr));

    String className = TestHelper.getTestClassName();
    String methodName = TestHelper.getTestMethodName();
    String clusterName = className + "_" + methodName;

    // Setup cluster
    LOG.info("Setup clusters");
    ClusterSetup clusterSetup = new ClusterSetup(zkAddr);
    clusterSetup.addCluster(clusterName, true);

    // Registers and starts controller
    LOG.info("Starts controller");
    HelixManager controller =
        HelixManagerFactory.getZKHelixManager(clusterName, null, InstanceType.CONTROLLER, zkAddr);
    controller.connect();

    // Registers and starts participant
    LOG.info("Starts participant");
    String hostname = "localhost";
    String instanceId = String.format("%s_%d", hostname, 1);
    clusterSetup.addInstanceToCluster(clusterName, instanceId);
    HelixManager participant =
        HelixManagerFactory.getZKHelixManager(
            clusterName, instanceId, InstanceType.PARTICIPANT, zkAddr);
    participant.connect();

    LOG.info("Register state machine");
    final CountDownLatch latch = new CountDownLatch(1);
    participant
        .getStateMachineEngine()
        .registerStateModelFactory(
            "OnlineOffline",
            new StateModelFactory<StateModel>() {
              @Override
              public StateModel createNewStateModel(String stateUnitKey) {
                return new SimpleStateModel(latch);
              }
            },
            "test");

    String resourceName = "test-resource";
    LOG.info("Ideal state assignment");
    HelixAdmin helixAdmin = participant.getClusterManagmentTool();
    helixAdmin.addResource(
        clusterName,
        resourceName,
        1,
        "OnlineOffline",
        IdealState.RebalanceMode.CUSTOMIZED.toString());

    IdealState idealState = helixAdmin.getResourceIdealState(clusterName, resourceName);
    idealState.setReplicas("1");
    idealState.setStateModelFactoryName("test");
    idealState.setPartitionState(resourceName + "_0", instanceId, "ONLINE");

    LOG.info("Shutdown ZK server");
    TestHelper.stopZkServer(zkServerRef.get());

    // Keep a handle on the executor so its worker thread can be released in the finally block.
    // Fully qualified because the type is not otherwise referenced by this method.
    java.util.concurrent.ScheduledExecutorService restartExecutor =
        Executors.newSingleThreadScheduledExecutor();
    restartExecutor.schedule(
        new Runnable() {

          @Override
          public void run() {
            try {
              LOG.info("Restart ZK server");
              zkServerRef.set(TestHelper.startZkServer(zkAddr, null, false));
            } catch (Exception e) {
              LOG.error(e.getMessage(), e);
            }
          }
        },
        2L,
        TimeUnit.SECONDS);

    LOG.info("Before update ideal state");
    helixAdmin.setResourceIdealState(clusterName, resourceName, idealState);
    LOG.info("After update ideal state");

    LOG.info("Wait for OFFLINE->ONLINE state transition");
    try {
      Assert.assertTrue(latch.await(10, TimeUnit.SECONDS));

      // wait until stable state
      boolean result =
          ClusterStateVerifier.verifyByZkCallback(
              new BestPossAndExtViewZkVerifier(zkAddr, clusterName));
      Assert.assertTrue(result);

    } finally {
      // Disconnect both managers before the server goes away, then release the executor thread.
      participant.disconnect();
      controller.disconnect();
      // shutdown() (not shutdownNow()) so a restart task that is still pending is not interrupted
      restartExecutor.shutdown();
      zkServerRef.get().shutdown();
    }
  }
  /**
   * Adds five instances to the cluster, runs the {@code -expandCluster} command and compares the
   * ideal states of three resources before and after.
   *
   * <p>Expectations checked below: the rebalanced resources keep the same partition keys and
   * roughly half of their masters (ratio strictly between 0.49 and 0.51), and the resource that
   * was never rebalanced still has no list fields.
   */
  @Test
  public void testExpandCluster() throws Exception {
    String DB2 = "TestDB2";
    int partitions = 100;
    int replica = 3;
    _setupTool.addResourceToCluster(CLUSTER_NAME, DB2, partitions, STATE_MODEL);
    _setupTool.rebalanceStorageCluster(CLUSTER_NAME, DB2, replica, "keyX");

    String DB3 = "TestDB3";

    // DB3 is added but not rebalanced in this method
    _setupTool.addResourceToCluster(CLUSTER_NAME, DB3, partitions, STATE_MODEL);

    // snapshot of the ideal states before the cluster is expanded
    IdealState testDB0 =
        _setupTool.getClusterManagementTool().getResourceIdealState(CLUSTER_NAME, TEST_DB);
    IdealState testDB2 =
        _setupTool.getClusterManagementTool().getResourceIdealState(CLUSTER_NAME, DB2);
    IdealState testDB3 =
        _setupTool.getClusterManagementTool().getResourceIdealState(CLUSTER_NAME, DB3);

    for (int i = 0; i < 5; i++) {
      String storageNodeName = "localhost_" + (27960 + i);
      _setupTool.addInstanceToCluster(CLUSTER_NAME, storageNodeName);
    }
    // NOTE(review): the ZooKeeper address is hard-coded here - confirm it matches the address
    // _setupTool is connected to.
    String command = "-zkSvr localhost:2183 -expandCluster " + CLUSTER_NAME;
    ClusterSetup.processCommandLineArgs(command.split(" "));

    // snapshot of the same ideal states after the expansion
    IdealState testDB0_1 =
        _setupTool.getClusterManagementTool().getResourceIdealState(CLUSTER_NAME, TEST_DB);
    IdealState testDB2_1 =
        _setupTool.getClusterManagementTool().getResourceIdealState(CLUSTER_NAME, DB2);
    IdealState testDB3_1 =
        _setupTool.getClusterManagementTool().getResourceIdealState(CLUSTER_NAME, DB3);

    Map<String, Object> resultOld2 = RebalanceUtil.buildInternalIdealState(testDB2);
    Map<String, Object> result2 = RebalanceUtil.buildInternalIdealState(testDB2_1);

    TestEspressoStorageClusterIdealState.Verify(result2, partitions, replica - 1);

    // NOTE(review): slaveKeepRatio is assigned below but never asserted on.
    Double masterKeepRatio = 0.0, slaveKeepRatio = 0.0;
    double[] result = TestEspressoStorageClusterIdealState.compareResult(resultOld2, result2);
    masterKeepRatio = result[0];
    slaveKeepRatio = result[1];
    Assert.assertTrue(masterKeepRatio > 0.49 && masterKeepRatio < 0.51);

    Assert.assertTrue(testDB3_1.getRecord().getListFields().size() == 0);

    // partitions should stay as same
    Assert.assertTrue(
        testDB0_1
            .getRecord()
            .getListFields()
            .keySet()
            .containsAll(testDB0.getRecord().getListFields().keySet()));
    Assert.assertTrue(
        testDB0_1.getRecord().getListFields().size() == testDB0.getRecord().getListFields().size());
    Assert.assertTrue(
        testDB2_1
            .getRecord()
            .getMapFields()
            .keySet()
            .containsAll(testDB2.getRecord().getMapFields().keySet()));
    Assert.assertTrue(
        testDB2_1.getRecord().getMapFields().size() == testDB2.getRecord().getMapFields().size());
    Assert.assertTrue(
        testDB3_1
            .getRecord()
            .getMapFields()
            .keySet()
            .containsAll(testDB3.getRecord().getMapFields().keySet()));
    Assert.assertTrue(
        testDB3_1.getRecord().getMapFields().size() == testDB3.getRecord().getMapFields().size());

    // same master-retention check for the pre-existing TEST_DB resource
    Map<String, Object> resultOld = RebalanceUtil.buildInternalIdealState(testDB0);
    Map<String, Object> resultNew = RebalanceUtil.buildInternalIdealState(testDB0_1);

    result = TestEspressoStorageClusterIdealState.compareResult(resultOld, resultNew);
    masterKeepRatio = result[0];
    slaveKeepRatio = result[1];
    Assert.assertTrue(masterKeepRatio > 0.49 && masterKeepRatio < 0.51);
  }
  /**
   * Builds an ideal state that assigns contiguous batches of partitions to groups of instances.
   *
   * <p>Both input lists are sorted in place. The sorted instances are cut into {@code
   * instances.size() / replica} groups of {@code replica} consecutive instances, and the sorted
   * partitions are cut into the same number of contiguous batches (the first {@code
   * partitions.size() % groups} batches hold one extra partition). Each partition batch is paired
   * with one instance group through {@code calculateSubIdealState}, whose result becomes the list
   * fields of the returned ideal state. For every list field a map field is then derived in which
   * the first instance of the list gets {@code firstValue} and every other instance gets {@code
   * restValue}.
   *
   * @param partitions partition names; sorted in place by this method
   * @param instances instance names; sorted in place by this method, must be non-empty and its
   *     size must be a multiple of {@code replica}
   * @param resultRecordName name given to the returned ideal state
   * @param replica number of instances per group; must be positive
   * @param firstValue state stored for the first instance of each partition's list
   * @param restValue state stored for all remaining instances of each partition's list
   * @param stateModelName state model definition name recorded on the ideal state
   * @return the populated ideal state
   * @throws HelixException if {@code replica} is not positive, {@code instances} is empty, or the
   *     number of instances is not a multiple of {@code replica}
   */
  public static IdealState calculateRelayIdealState(
      List<String> partitions,
      List<String> instances,
      String resultRecordName,
      int replica,
      String firstValue,
      String restValue,
      String stateModelName) {
    // A zero replica would otherwise surface as an ArithmeticException from the modulo below.
    if (replica <= 0) {
      throw new HelixException("Replica must be positive, but was " + replica);
    }

    Collections.sort(partitions);
    Collections.sort(instances);

    // Zero instances passes the divisibility check but yields zero groups, which would divide by
    // zero when the partitions are split across groups.
    if (instances.isEmpty()) {
      throw new HelixException("Instances must not be empty");
    }
    if (instances.size() % replica != 0) {
      throw new HelixException("Instances must be divided by replica");
    }

    IdealState result = new IdealState(resultRecordName);
    result.setNumPartitions(partitions.size());
    result.setReplicas(Integer.toString(replica));
    result.setStateModelDefId(StateModelDefinitionId.from(stateModelName));

    // The instance count is an exact multiple of replica, so every group has exactly `replica`
    // members and no instance is left over.
    int groups = instances.size() / replica;
    int partitionsPerGroup = partitions.size() / groups;
    int groupsWithExtraPartition = partitions.size() % groups;

    for (int i = 0; i < groups; i++) {
      int relayStart = replica * i;
      int relayEnd = relayStart + replica;

      // The first `groupsWithExtraPartition` batches are one partition larger than the rest, so
      // batch i starts after i regular batches plus one extra partition per larger batch before it.
      int storageNodeStart = partitionsPerGroup * i + Math.min(i, groupsWithExtraPartition);
      int storageNodeEnd =
          storageNodeStart + partitionsPerGroup + (i < groupsWithExtraPartition ? 1 : 0);

      List<String> snBatch = partitions.subList(storageNodeStart, storageNodeEnd);
      List<String> relayBatch = instances.subList(relayStart, relayEnd);

      Map<String, List<String>> sublistFields =
          calculateSubIdealState(snBatch, relayBatch, replica);

      result.getRecord().getListFields().putAll(sublistFields);
    }

    // Derive the map fields from the list fields: first listed instance vs. all the others.
    for (Map.Entry<String, List<String>> entry : result.getRecord().getListFields().entrySet()) {
      List<String> relayCandidates = entry.getValue();
      Map<String, String> mapField = new TreeMap<String, String>();
      mapField.put(relayCandidates.get(0), firstValue);
      for (int i = 1; i < relayCandidates.size(); i++) {
        mapField.put(relayCandidates.get(i), restValue);
      }
      result.getRecord().getMapFields().put(entry.getKey(), mapField);
    }
    return result;
  }
  /**
   * Checks that messages for a resource whose state model factory is not yet registered pile up
   * unprocessed, and that the cluster converges once the factory is registered.
   *
   * <p>Steps: set up a cluster with one rebalanced "TestDB" resource, start a controller and
   * {@code n} participants, and verify convergence. Then add "TestDB1" with the factory name
   * "TestDB1_Factory" (which no participant has registered), rebalance it, and poll until the
   * expected number of pending messages is seen. Finally register the factory on every participant
   * and verify convergence again.
   *
   * <p>The controller and all participants that were created are stopped in a {@code finally}
   * block so that a failed assertion does not leave them running.
   *
   * @throws Exception if cluster setup, ZooKeeper access, or the polling sleep fails
   */
  @Test
  public void testBasic() throws Exception {
    String className = TestHelper.getTestClassName();
    String methodName = TestHelper.getTestMethodName();
    String clusterName = className + "_" + methodName;
    final int n = 5;
    final int newResourcePartitions = 16;
    final int newResourceReplicas = 3;
    // partition# x replicas
    final int expectedMsgs = newResourcePartitions * newResourceReplicas;

    System.out.println("START " + clusterName + " at " + new Date(System.currentTimeMillis()));

    MockParticipantManager[] participants = new MockParticipantManager[n];

    TestHelper.setupCluster(
        clusterName,
        ZK_ADDR,
        12918, // participant port
        "localhost", // participant name prefix
        "TestDB", // resource name prefix
        1, // resources
        10, // partitions per resource
        n, // number of nodes
        3, // replicas
        "MasterSlave",
        true); // do rebalance

    ClusterControllerManager controller =
        new ClusterControllerManager(ZK_ADDR, clusterName, "controller_0");
    controller.syncStart();

    try {
      // start participants
      for (int i = 0; i < n; i++) {
        String instanceName = "localhost_" + (12918 + i);

        participants[i] = new MockParticipantManager(ZK_ADDR, clusterName, instanceName);
        participants[i].syncStart();
      }

      boolean result =
          ClusterStateVerifier.verifyByZkCallback(
              new BestPossAndExtViewZkVerifier(ZK_ADDR, clusterName));
      Assert.assertTrue(result);

      // add a new idealState without registering message handling factory
      ClusterSetup setupTool = new ClusterSetup(ZK_ADDR);
      setupTool.addResourceToCluster(
          clusterName, "TestDB1", newResourcePartitions, "MasterSlave");

      ZkBaseDataAccessor<ZNRecord> baseAccessor = new ZkBaseDataAccessor<ZNRecord>(_gZkClient);
      ZKHelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, baseAccessor);
      Builder keyBuilder = accessor.keyBuilder();
      IdealState idealState = accessor.getProperty(keyBuilder.idealStates("TestDB1"));
      idealState.setStateModelFactoryName("TestDB1_Factory");
      accessor.setProperty(keyBuilder.idealStates("TestDB1"), idealState);
      setupTool.rebalanceStorageCluster(clusterName, "TestDB1", newResourceReplicas);

      // assert that we have received OFFLINE->SLAVE messages for all partitions
      int totalMsgs = 0;
      for (int retry = 0; retry < 5; retry++) {
        Thread.sleep(100);
        totalMsgs = 0;
        for (int i = 0; i < n; i++) {
          List<Message> msgs =
              accessor.getChildValues(keyBuilder.messages(participants[i].getInstanceName()));
          totalMsgs += msgs.size();
        }

        if (totalMsgs == expectedMsgs) {
          break;
        }
      }

      Assert.assertEquals(
          totalMsgs,
          expectedMsgs,
          "Should accumulated "
              + expectedMsgs
              + " unprocessed messages (1 O->S per partition per replica) because TestDB1 is added without state-model-factory but was "
              + totalMsgs);

      // register "TestDB1_Factory" state model factory
      for (int i = 0; i < n; i++) {
        participants[i]
            .getStateMachineEngine()
            .registerStateModelFactory("MasterSlave", new MockMSModelFactory(), "TestDB1_Factory");
      }

      result =
          ClusterStateVerifier.verifyByZkCallback(
              new BestPossAndExtViewZkVerifier(ZK_ADDR, clusterName));
      Assert.assertTrue(result);
    } finally {
      // clean up
      // wait for all zk callbacks done
      try {
        controller.syncStop();
      } finally {
        for (MockParticipantManager participant : participants) {
          // Slots stay null if the test failed before that participant was created.
          if (participant != null) {
            participant.syncStop();
          }
        }
      }
    }

    System.out.println("END " + clusterName + " at " + new Date(System.currentTimeMillis()));
  }