예제 #1
0
  /**
   * make assignments for a topology The nimbus core function, this function has been totally
   * rewrite
   *
   * @param nimbusData NimbusData
   * @param topologyId String
   * @param isScratch Boolean: isScratch is false unless rebalancing the topology
   * @throws Exception
   */
  public Assignment mkAssignment(TopologyAssignEvent event) throws Exception {
    String topologyId = event.getTopologyId();

    LOG.info("Determining assignment for " + topologyId);

    TopologyAssignContext context = prepareTopologyAssign(event);

    Set<ResourceWorkerSlot> assignments = null;

    if (!StormConfig.local_mode(nimbusData.getConf())) {

      IToplogyScheduler scheduler = schedulers.get(DEFAULT_SCHEDULER_NAME);

      assignments = scheduler.assignTasks(context);

    } else {
      assignments = mkLocalAssignment(context);
    }
    Assignment assignment = null;

    Map<String, String> nodeHost =
        getTopologyNodeHost(context.getCluster(), context.getOldAssignment(), assignments);

    Map<Integer, Integer> startTimes =
        getTaskStartTimes(context, nimbusData, topologyId, context.getOldAssignment(), assignments);

    String codeDir = StormConfig.masterStormdistRoot(nimbusData.getConf(), topologyId);

    assignment = new Assignment(codeDir, assignments, nodeHost, startTimes);

    StormClusterState stormClusterState = nimbusData.getStormClusterState();

    stormClusterState.set_assignment(topologyId, assignment);

    // update task heartbeat's start time
    NimbusUtils.updateTaskHbStartTime(nimbusData, assignment, topologyId);

    // Update metrics information in ZK when rebalance or reassignment
    // Only update metrics monitor status when creating topology
    if (context.getAssignType() == TopologyAssignContext.ASSIGN_TYPE_REBALANCE
        || context.getAssignType() == TopologyAssignContext.ASSIGN_TYPE_MONITOR)
      NimbusUtils.updateMetricsInfo(nimbusData, topologyId, assignment);
    else metricsMonitor(event);

    LOG.info("Successfully make assignment for topology id " + topologyId + ": " + assignment);

    return assignment;
  }
예제 #2
0
 /**
  * set topology status as active
  *
  * @param topologyname
  */
 @Override
 public void activate(String topologyName) throws NotAliveException, TException {
   try {
     NimbusUtils.transitionName(data, topologyName, true, StatusType.activate);
   } catch (NotAliveException e) {
     String errMsg = "Activate Error, no this topology " + topologyName;
     LOG.error(errMsg, e);
     throw new NotAliveException(errMsg);
   } catch (Exception e) {
     String errMsg = "Failed to active topology " + topologyName;
     LOG.error(errMsg, e);
     throw new TException(errMsg);
   }
 }
예제 #3
0
  /**
   * find all alived taskid Does not assume that clocks are synchronized. Task heartbeat is only
   * used so that nimbus knows when it's received a new heartbeat. All timing is done by nimbus and
   * tracked through task-heartbeat-cache 获取topology下的所有alive的task
   *
   * @param conf
   * @param topologyId
   * @param stormClusterState
   * @param taskIds
   * @param taskStartTimes
   * @param taskHeartbeatsCache --Map<topologyId, Map<taskid, Map<tkHbCacheTime, time>>>
   * @return Set<Integer> : taskid
   * @throws Exception
   */
  public Set<Integer> getAliveTasks(String topologyId, Set<Integer> taskIds) throws Exception {

    Set<Integer> aliveTasks = new HashSet<Integer>();

    // taskIds is the list from ZK /ZK-DIR/tasks/topologyId
    for (int taskId : taskIds) {

      boolean isDead = NimbusUtils.isTaskDead(nimbusData, topologyId, taskId);
      if (isDead == false) {
        aliveTasks.add(taskId);
      }
    }

    return aliveTasks;
  }
예제 #4
0
  /**
   * Get TopologyInfo, it contain all data of the topology running status
   *
   * @return TopologyInfo
   */
  @Override
  public TopologyInfo getTopologyInfo(String topologyId) throws NotAliveException, TException {

    TopologyInfo topologyInfo = new TopologyInfo();

    StormClusterState stormClusterState = data.getStormClusterState();

    try {

      // get topology's StormBase
      StormBase base = stormClusterState.storm_base(topologyId, null);
      if (base == null) {
        throw new NotAliveException("No topology of " + topologyId);
      }
      topologyInfo.set_id(topologyId);
      topologyInfo.set_name(base.getStormName());
      topologyInfo.set_uptime_secs(TimeUtils.time_delta(base.getLanchTimeSecs()));
      topologyInfo.set_status(base.getStatusString());

      // get topology's Assignment
      Assignment assignment = stormClusterState.assignment_info(topologyId, null);
      if (assignment == null) {
        throw new TException("Failed to get StormBase from ZK of " + topologyId);
      }

      // get topology's map<taskId, componentId>
      Map<Integer, String> taskInfo = Cluster.topology_task_info(stormClusterState, topologyId);

      List<TaskSummary> tasks =
          NimbusUtils.mkTaskSummary(stormClusterState, assignment, taskInfo, topologyId);
      topologyInfo.set_tasks(tasks);

      return topologyInfo;
    } catch (TException e) {
      LOG.info("Failed to get topologyInfo " + topologyId, e);
      throw e;
    } catch (Exception e) {
      LOG.info("Failed to get topologyInfo " + topologyId, e);
      throw new TException("Failed to get topologyInfo" + topologyId);
    }
  }
예제 #5
0
  @Override
  public void killTopologyWithOpts(String topologyName, KillOptions options)
      throws NotAliveException, TException {
    try {

      checkTopologyActive(data, topologyName, true);
      Integer wait_amt = null;
      if (options.is_set_wait_secs()) {
        wait_amt = options.get_wait_secs();
      }
      NimbusUtils.transitionName(data, topologyName, true, StatusType.kill, wait_amt);
    } catch (NotAliveException e) {
      String errMsg = "KillTopology Error, no this topology " + topologyName;
      LOG.error(errMsg, e);
      throw new NotAliveException(errMsg);
    } catch (Exception e) {
      String errMsg = "Failed to kill topology " + topologyName;
      LOG.error(errMsg, e);
      throw new TException(errMsg);
    }
  }
예제 #6
0
  /**
   * rebalance one topology @@@ rebalance options hasn't implements
   *
   * <p>It is used to let workers wait several seconds to finish jobs
   *
   * @param topologyname String
   * @param options RebalanceOptions
   */
  @Override
  public void rebalance(String topologyName, RebalanceOptions options)
      throws NotAliveException, TException, InvalidTopologyException {

    try {

      checkTopologyActive(data, topologyName, true);
      Integer wait_amt = null;
      if (options != null && options.is_set_wait_secs()) {
        wait_amt = options.get_wait_secs();
      }

      NimbusUtils.transitionName(data, topologyName, true, StatusType.rebalance, wait_amt);
    } catch (NotAliveException e) {
      String errMsg = "Rebalance Error, no this topology " + topologyName;
      LOG.error(errMsg, e);
      throw new NotAliveException(errMsg);
    } catch (Exception e) {
      String errMsg = "Failed to rebalance topology " + topologyName;
      LOG.error(errMsg, e);
      throw new TException(errMsg);
    }
  }
  @Override
  public <T> Object execute(T... args) {
    boolean isSetTaskInfo = false;
    try {
      Boolean reassign = (Boolean) args[1];
      Map<Object, Object> conf = (Map<Object, Object>) args[2]; // args[0]:
      // delay,
      // args[1]:
      // reassign_flag,
      // args[2]:
      // conf
      if (conf != null) {
        boolean isConfUpdate = false;
        Map stormConf = data.getConf();

        // Update topology code
        Map topoConf = StormConfig.read_nimbus_topology_conf(stormConf, topologyid);
        StormTopology rawOldTopology = StormConfig.read_nimbus_topology_code(stormConf, topologyid);
        StormTopology rawNewTopology = NimbusUtils.normalizeTopology(conf, rawOldTopology, true);
        StormTopology sysOldTopology = rawOldTopology.deepCopy();
        StormTopology sysNewTopology = rawNewTopology.deepCopy();
        if (conf.get(Config.TOPOLOGY_ACKER_EXECUTORS) != null) {
          Common.add_acker(topoConf, sysOldTopology);
          Common.add_acker(conf, sysNewTopology);
          int ackerNum = JStormUtils.parseInt(conf.get(Config.TOPOLOGY_ACKER_EXECUTORS));
          int oldAckerNum = JStormUtils.parseInt(topoConf.get(Config.TOPOLOGY_ACKER_EXECUTORS));
          LOG.info("Update acker from oldAckerNum=" + oldAckerNum + " to ackerNum=" + ackerNum);
          topoConf.put(Config.TOPOLOGY_ACKER_EXECUTORS, ackerNum);
          isConfUpdate = true;
        }

        // If scale-out, setup task info for new added tasks
        setTaskInfo(sysOldTopology, sysNewTopology);
        isSetTaskInfo = true;

        // If everything is OK, write topology code into disk
        StormConfig.write_nimbus_topology_code(
            stormConf, topologyid, Utils.serialize(rawNewTopology));

        // Update topology conf if worker num has been updated
        Set<Object> keys = conf.keySet();
        Integer workerNum = JStormUtils.parseInt(conf.get(Config.TOPOLOGY_WORKERS));
        if (workerNum != null) {
          Integer oldWorkerNum = JStormUtils.parseInt(topoConf.get(Config.TOPOLOGY_WORKERS));
          topoConf.put(Config.TOPOLOGY_WORKERS, workerNum);
          isConfUpdate = true;

          LOG.info("Update worker num from " + oldWorkerNum + " to " + workerNum);
        }

        if (keys.contains(Config.ISOLATION_SCHEDULER_MACHINES)) {
          topoConf.put(
              Config.ISOLATION_SCHEDULER_MACHINES, conf.get(Config.ISOLATION_SCHEDULER_MACHINES));
        }

        if (isConfUpdate) {
          StormConfig.write_nimbus_topology_conf(stormConf, topologyid, topoConf);
        }
      }

      TopologyAssignEvent event = new TopologyAssignEvent();

      event.setTopologyId(topologyid);
      event.setScratch(true);
      event.setOldStatus(oldStatus);
      event.setReassign(reassign);
      if (conf != null) event.setScaleTopology(true);
      TopologyAssign.push(event);
      event.waitFinish();
    } catch (Exception e) {
      LOG.error("do-rebalance error!", e);
      // Rollback the changes on ZK
      if (isSetTaskInfo) {
        try {
          StormClusterState clusterState = data.getStormClusterState();
          clusterState.remove_task(topologyid, newTasks);
        } catch (Exception e1) {
          LOG.error("Failed to rollback the changes on ZK for task-" + newTasks, e);
        }
      }
    }

    DelayStatusTransitionCallback delayCallback =
        new DelayStatusTransitionCallback(
            data, topologyid, oldStatus, StatusType.rebalancing, StatusType.done_rebalance);
    return delayCallback.execute();
  }
예제 #8
0
  @Override
  public SupervisorWorkers getSupervisorWorkers(String host) throws NotAliveException, TException {
    try {
      StormClusterState stormClusterState = data.getStormClusterState();

      String supervisorId = null;
      SupervisorInfo supervisorInfo = null;

      String ip = NetWorkUtils.host2Ip(host);
      String hostName = NetWorkUtils.ip2Host(host);

      // all supervisors
      Map<String, SupervisorInfo> supervisorInfos =
          Cluster.allSupervisorInfo(stormClusterState, null);

      for (Entry<String, SupervisorInfo> entry : supervisorInfos.entrySet()) {

        SupervisorInfo info = entry.getValue();
        if (info.getHostName().equals(hostName) || info.getHostName().equals(ip)) {
          supervisorId = entry.getKey();
          supervisorInfo = info;
          break;
        }
      }

      if (supervisorId == null) {
        throw new TException("No supervisor of " + host);
      }

      Map<String, Assignment> assignments = new HashMap<String, Assignment>();

      // get all active topology's StormBase
      Map<String, StormBase> bases = Cluster.topology_bases(stormClusterState);
      for (Entry<String, StormBase> entry : bases.entrySet()) {

        String topologyId = entry.getKey();
        StormBase base = entry.getValue();

        Assignment assignment = stormClusterState.assignment_info(topologyId, null);
        if (assignment == null) {
          LOG.error("Failed to get assignment of " + topologyId);
          continue;
        }
        assignments.put(topologyId, assignment);
      }

      Map<Integer, WorkerSummary> portWorkerSummarys = new TreeMap<Integer, WorkerSummary>();
      for (Entry<String, Assignment> entry : assignments.entrySet()) {
        String topologyId = entry.getKey();
        Assignment assignment = entry.getValue();

        Map<Integer, String> taskToComponent =
            Cluster.topology_task_info(stormClusterState, topologyId);

        Map<Integer, ResourceAssignment> taskToResource = assignment.getTaskToResource();

        for (Entry<Integer, ResourceAssignment> resourceEntry : taskToResource.entrySet()) {
          Integer taskId = resourceEntry.getKey();
          ResourceAssignment resourceAssignment = resourceEntry.getValue();

          if (supervisorId.equals(resourceAssignment.getSupervisorId()) == false) {
            continue;
          }

          supervisorInfo.allocResource(resourceAssignment);

          Integer port = resourceAssignment.getPort();
          WorkerSummary workerSummary = portWorkerSummarys.get(port);
          if (workerSummary == null) {
            workerSummary = new WorkerSummary();
            workerSummary.set_port(port);
            workerSummary.set_topology(topologyId);
            workerSummary.set_tasks(new ArrayList<TaskSummary>());

            portWorkerSummarys.put(port, workerSummary);
          }

          String componentName = taskToComponent.get(taskId);
          int uptime = TimeUtils.time_delta(assignment.getTaskStartTimeSecs().get(taskId));
          List<TaskSummary> tasks = workerSummary.get_tasks();

          TaskSummary taskSummary =
              NimbusUtils.mkSimpleTaskSummary(
                  resourceAssignment, taskId, componentName, host, uptime);

          tasks.add(taskSummary);
        }
      }

      List<WorkerSummary> wokersList = new ArrayList<WorkerSummary>();
      wokersList.addAll(portWorkerSummarys.values());

      SupervisorSummary supervisorSummary =
          NimbusUtils.mkSupervisorSummary(supervisorInfo, supervisorId);
      return new SupervisorWorkers(supervisorSummary, wokersList);

    } catch (TException e) {
      LOG.info("Failed to get ClusterSummary ", e);
      throw e;
    } catch (Exception e) {
      LOG.info("Failed to get ClusterSummary ", e);
      throw new TException(e);
    }
  }
예제 #9
0
  /**
   * get cluster's summary, it will contain SupervisorSummary and TopologySummary
   *
   * @return ClusterSummary
   */
  @Override
  public ClusterSummary getClusterInfo() throws TException {

    try {

      StormClusterState stormClusterState = data.getStormClusterState();

      Map<String, Assignment> assignments = new HashMap<String, Assignment>();

      // get nimbus running time
      int uptime = data.uptime();

      // get TopologySummary
      List<TopologySummary> topologySummaries = new ArrayList<TopologySummary>();

      // get all active topology's StormBase
      Map<String, StormBase> bases = Cluster.topology_bases(stormClusterState);
      for (Entry<String, StormBase> entry : bases.entrySet()) {

        String topologyId = entry.getKey();
        StormBase base = entry.getValue();

        Assignment assignment = stormClusterState.assignment_info(topologyId, null);
        if (assignment == null) {
          LOG.error("Failed to get assignment of " + topologyId);
          continue;
        }
        assignments.put(topologyId, assignment);
        String group = "default";
        if (data.isGroupMode()) group = base.getGroup();
        if (group == null) group = "default";

        TopologySummary topology =
            NimbusUtils.mkTopologySummary(
                assignment,
                topologyId,
                base.getStormName(),
                base.getStatusString(),
                TimeUtils.time_delta(base.getLanchTimeSecs()),
                group);

        topologySummaries.add(topology);
      }

      // all supervisors
      Map<String, SupervisorInfo> supervisorInfos =
          Cluster.allSupervisorInfo(stormClusterState, null);

      // generate SupervisorSummaries
      List<SupervisorSummary> supervisorSummaries =
          NimbusUtils.mkSupervisorSummaries(supervisorInfos, assignments);

      return new ClusterSummary(
          supervisorSummaries,
          uptime,
          topologySummaries,
          data.getGroupToTopology(),
          data.getGroupToResource(),
          data.getGroupToUsedResource(),
          data.isGroupMode());

    } catch (TException e) {
      LOG.info("Failed to get ClusterSummary ", e);
      throw e;
    } catch (Exception e) {
      LOG.info("Failed to get ClusterSummary ", e);
      throw new TException(e);
    }
  }
예제 #10
0
  /**
   * Submit one Topology
   *
   * @param topologyname String: topology name
   * @param uploadedJarLocation String: already uploaded jar path
   * @param jsonConf String: jsonConf serialize all toplogy configuration to Json
   * @param topology StormTopology: topology Object
   */
  @SuppressWarnings("unchecked")
  @Override
  public void submitTopologyWithOpts(
      String topologyname,
      String uploadedJarLocation,
      String jsonConf,
      StormTopology topology,
      SubmitOptions options)
      throws AlreadyAliveException, InvalidTopologyException, TopologyAssignException, TException {
    LOG.info("Receive " + topologyname + ", uploadedJarLocation:" + uploadedJarLocation);
    // @@@ Move validate topologyname in client code
    try {
      checkTopologyActive(data, topologyname, false);
    } catch (AlreadyAliveException e) {
      LOG.info(topologyname + " is already exist ");
      throw e;
    } catch (Exception e) {
      LOG.info("Failed to check whether topology is alive or not", e);
      throw new TException(e);
    }

    int counter = data.getSubmittedCount().incrementAndGet();
    String topologyId = topologyname + "-" + counter + "-" + TimeUtils.current_time_secs();

    Map<Object, Object> serializedConf = (Map<Object, Object>) JStormUtils.from_json(jsonConf);
    if (serializedConf == null) {
      LOG.warn("Failed to serialized Configuration");
      throw new InvalidTopologyException("Failed to serilaze topology configuration");
    }

    serializedConf.put(Config.TOPOLOGY_ID, topologyId);
    serializedConf.put(Config.TOPOLOGY_NAME, topologyname);

    try {
      Map<Object, Object> stormConf;

      stormConf = NimbusUtils.normalizeConf(conf, serializedConf, topology);

      Map<Object, Object> totalStormConf = new HashMap<Object, Object>(conf);
      totalStormConf.putAll(stormConf);

      StormTopology normalizedTopology = NimbusUtils.normalizeTopology(stormConf, topology);

      // this validates the structure of the topology
      Common.validate_basic(normalizedTopology, totalStormConf, topologyId);
      // don't need generate real topology, so skip Common.system_topology
      // Common.system_topology(totalStormConf, topology);

      StormClusterState stormClusterState = data.getStormClusterState();

      // create /local-dir/nimbus/topologyId/xxxx files
      setupStormCode(conf, topologyId, uploadedJarLocation, stormConf, normalizedTopology);

      // generate TaskInfo for every bolt or spout in ZK
      // /ZK/tasks/topoologyId/xxx
      setupZkTaskInfo(conf, topologyId, stormClusterState);

      // make assignments for a topology
      TopologyAssignEvent assignEvent = new TopologyAssignEvent();
      assignEvent.setTopologyId(topologyId);
      assignEvent.setScratch(false);
      assignEvent.setTopologyName(topologyname);
      assignEvent.setOldStatus(
          Thrift.topologyInitialStatusToStormStatus(options.get_initial_status()));

      TopologyAssign.push(assignEvent);
      LOG.info("Submit for " + topologyname + " with conf " + serializedConf);

      boolean isSuccess = assignEvent.waitFinish();
      if (isSuccess == true) {
        LOG.info("Finish submit for " + topologyname);
      } else {
        throw new FailedAssignTopologyException(assignEvent.getErrorMsg());
      }

    } catch (FailedAssignTopologyException e) {
      StringBuilder sb = new StringBuilder();
      sb.append("Fail to sumbit topology, Root cause:");
      if (e.getMessage() == null) {
        sb.append("submit timeout");
      } else {
        sb.append(e.getMessage());
      }

      sb.append("\n\n");
      sb.append("topologyId:" + topologyId);
      sb.append(", uploadedJarLocation:" + uploadedJarLocation + "\n");
      LOG.error(sb.toString(), e);
      throw new TopologyAssignException(sb.toString());
    } catch (InvalidParameterException e) {
      StringBuilder sb = new StringBuilder();
      sb.append("Fail to sumbit topology ");
      sb.append(e.getMessage());
      sb.append(", cause:" + e.getCause());
      sb.append("\n\n");
      sb.append("topologyId:" + topologyId);
      sb.append(", uploadedJarLocation:" + uploadedJarLocation + "\n");
      LOG.error(sb.toString(), e);
      throw new InvalidParameterException(sb.toString());
    } catch (Throwable e) {
      StringBuilder sb = new StringBuilder();
      sb.append("Fail to sumbit topology ");
      sb.append(e.getMessage());
      sb.append(", cause:" + e.getCause());
      sb.append("\n\n");
      sb.append("topologyId:" + topologyId);
      sb.append(", uploadedJarLocation:" + uploadedJarLocation + "\n");
      LOG.error(sb.toString(), e);
      throw new TopologyAssignException(sb.toString());
    }
  }