/** * make assignments for a topology The nimbus core function, this function has been totally * rewrite * * @param nimbusData NimbusData * @param topologyId String * @param isScratch Boolean: isScratch is false unless rebalancing the topology * @throws Exception */ public Assignment mkAssignment(TopologyAssignEvent event) throws Exception { String topologyId = event.getTopologyId(); LOG.info("Determining assignment for " + topologyId); TopologyAssignContext context = prepareTopologyAssign(event); Set<ResourceWorkerSlot> assignments = null; if (!StormConfig.local_mode(nimbusData.getConf())) { IToplogyScheduler scheduler = schedulers.get(DEFAULT_SCHEDULER_NAME); assignments = scheduler.assignTasks(context); } else { assignments = mkLocalAssignment(context); } Assignment assignment = null; Map<String, String> nodeHost = getTopologyNodeHost(context.getCluster(), context.getOldAssignment(), assignments); Map<Integer, Integer> startTimes = getTaskStartTimes(context, nimbusData, topologyId, context.getOldAssignment(), assignments); String codeDir = StormConfig.masterStormdistRoot(nimbusData.getConf(), topologyId); assignment = new Assignment(codeDir, assignments, nodeHost, startTimes); StormClusterState stormClusterState = nimbusData.getStormClusterState(); stormClusterState.set_assignment(topologyId, assignment); // update task heartbeat's start time NimbusUtils.updateTaskHbStartTime(nimbusData, assignment, topologyId); // Update metrics information in ZK when rebalance or reassignment // Only update metrics monitor status when creating topology if (context.getAssignType() == TopologyAssignContext.ASSIGN_TYPE_REBALANCE || context.getAssignType() == TopologyAssignContext.ASSIGN_TYPE_MONITOR) NimbusUtils.updateMetricsInfo(nimbusData, topologyId, assignment); else metricsMonitor(event); LOG.info("Successfully make assignment for topology id " + topologyId + ": " + assignment); return assignment; }
/**
 * Requests the {@code activate} status transition for the named topology.
 *
 * @param topologyName name of the topology to activate
 * @throws NotAliveException if the transition reports that the topology is not alive
 * @throws TException if the transition fails for any other reason
 */
@Override
public void activate(String topologyName) throws NotAliveException, TException {
  try {
    NimbusUtils.transitionName(data, topologyName, true, StatusType.activate);
  } catch (NotAliveException notAlive) {
    final String message = "Activate Error, no this topology " + topologyName;
    LOG.error(message, notAlive);
    throw new NotAliveException(message);
  } catch (Exception failure) {
    final String message = "Failed to active topology " + topologyName;
    LOG.error(message, failure);
    throw new TException(message);
  }
}
/** * find all alived taskid Does not assume that clocks are synchronized. Task heartbeat is only * used so that nimbus knows when it's received a new heartbeat. All timing is done by nimbus and * tracked through task-heartbeat-cache 获取topology下的所有alive的task * * @param conf * @param topologyId * @param stormClusterState * @param taskIds * @param taskStartTimes * @param taskHeartbeatsCache --Map<topologyId, Map<taskid, Map<tkHbCacheTime, time>>> * @return Set<Integer> : taskid * @throws Exception */ public Set<Integer> getAliveTasks(String topologyId, Set<Integer> taskIds) throws Exception { Set<Integer> aliveTasks = new HashSet<Integer>(); // taskIds is the list from ZK /ZK-DIR/tasks/topologyId for (int taskId : taskIds) { boolean isDead = NimbusUtils.isTaskDead(nimbusData, topologyId, taskId); if (isDead == false) { aliveTasks.add(taskId); } } return aliveTasks; }
/** * Get TopologyInfo, it contain all data of the topology running status * * @return TopologyInfo */ @Override public TopologyInfo getTopologyInfo(String topologyId) throws NotAliveException, TException { TopologyInfo topologyInfo = new TopologyInfo(); StormClusterState stormClusterState = data.getStormClusterState(); try { // get topology's StormBase StormBase base = stormClusterState.storm_base(topologyId, null); if (base == null) { throw new NotAliveException("No topology of " + topologyId); } topologyInfo.set_id(topologyId); topologyInfo.set_name(base.getStormName()); topologyInfo.set_uptime_secs(TimeUtils.time_delta(base.getLanchTimeSecs())); topologyInfo.set_status(base.getStatusString()); // get topology's Assignment Assignment assignment = stormClusterState.assignment_info(topologyId, null); if (assignment == null) { throw new TException("Failed to get StormBase from ZK of " + topologyId); } // get topology's map<taskId, componentId> Map<Integer, String> taskInfo = Cluster.topology_task_info(stormClusterState, topologyId); List<TaskSummary> tasks = NimbusUtils.mkTaskSummary(stormClusterState, assignment, taskInfo, topologyId); topologyInfo.set_tasks(tasks); return topologyInfo; } catch (TException e) { LOG.info("Failed to get topologyInfo " + topologyId, e); throw e; } catch (Exception e) { LOG.info("Failed to get topologyInfo " + topologyId, e); throw new TException("Failed to get topologyInfo" + topologyId); } }
/**
 * Requests the {@code kill} status transition for the named topology.
 *
 * @param topologyName name of the topology to kill
 * @param options kill options; when {@code null} or when no wait time is set, no wait time is
 *     passed to the transition
 * @throws NotAliveException if the topology is reported as not alive
 * @throws TException if the transition fails for any other reason
 */
@Override
public void killTopologyWithOpts(String topologyName, KillOptions options)
    throws NotAliveException, TException {
  try {
    checkTopologyActive(data, topologyName, true);

    Integer wait_amt = null;
    // Guard against null options, as rebalance() does, so a missing options object does not
    // surface as a generic "Failed to kill topology" error.
    if (options != null && options.is_set_wait_secs()) {
      wait_amt = options.get_wait_secs();
    }

    NimbusUtils.transitionName(data, topologyName, true, StatusType.kill, wait_amt);
  } catch (NotAliveException e) {
    String errMsg = "KillTopology Error, no this topology " + topologyName;
    LOG.error(errMsg, e);
    throw new NotAliveException(errMsg);
  } catch (Exception e) {
    String errMsg = "Failed to kill topology " + topologyName;
    LOG.error(errMsg, e);
    throw new TException(errMsg);
  }
}
/**
 * Requests the {@code rebalance} status transition for the named topology.
 *
 * <p>Only the wait time of the options is read here; it is passed on to the transition.
 *
 * @param topologyName name of the topology to rebalance
 * @param options rebalance options; may be {@code null}, in which case no wait time is passed
 * @throws NotAliveException if the topology is reported as not alive
 * @throws TException if the transition fails for any other reason
 * @throws InvalidTopologyException declared by the interface; not thrown directly by this method
 */
@Override
public void rebalance(String topologyName, RebalanceOptions options)
    throws NotAliveException, TException, InvalidTopologyException {
  try {
    checkTopologyActive(data, topologyName, true);

    Integer waitSecs =
        (options != null && options.is_set_wait_secs()) ? options.get_wait_secs() : null;

    NimbusUtils.transitionName(data, topologyName, true, StatusType.rebalance, waitSecs);
  } catch (NotAliveException notAlive) {
    final String message = "Rebalance Error, no this topology " + topologyName;
    LOG.error(message, notAlive);
    throw new NotAliveException(message);
  } catch (Exception failure) {
    final String message = "Failed to rebalance topology " + topologyName;
    LOG.error(message, failure);
    throw new TException(message);
  }
}
@Override public <T> Object execute(T... args) { boolean isSetTaskInfo = false; try { Boolean reassign = (Boolean) args[1]; Map<Object, Object> conf = (Map<Object, Object>) args[2]; // args[0]: // delay, // args[1]: // reassign_flag, // args[2]: // conf if (conf != null) { boolean isConfUpdate = false; Map stormConf = data.getConf(); // Update topology code Map topoConf = StormConfig.read_nimbus_topology_conf(stormConf, topologyid); StormTopology rawOldTopology = StormConfig.read_nimbus_topology_code(stormConf, topologyid); StormTopology rawNewTopology = NimbusUtils.normalizeTopology(conf, rawOldTopology, true); StormTopology sysOldTopology = rawOldTopology.deepCopy(); StormTopology sysNewTopology = rawNewTopology.deepCopy(); if (conf.get(Config.TOPOLOGY_ACKER_EXECUTORS) != null) { Common.add_acker(topoConf, sysOldTopology); Common.add_acker(conf, sysNewTopology); int ackerNum = JStormUtils.parseInt(conf.get(Config.TOPOLOGY_ACKER_EXECUTORS)); int oldAckerNum = JStormUtils.parseInt(topoConf.get(Config.TOPOLOGY_ACKER_EXECUTORS)); LOG.info("Update acker from oldAckerNum=" + oldAckerNum + " to ackerNum=" + ackerNum); topoConf.put(Config.TOPOLOGY_ACKER_EXECUTORS, ackerNum); isConfUpdate = true; } // If scale-out, setup task info for new added tasks setTaskInfo(sysOldTopology, sysNewTopology); isSetTaskInfo = true; // If everything is OK, write topology code into disk StormConfig.write_nimbus_topology_code( stormConf, topologyid, Utils.serialize(rawNewTopology)); // Update topology conf if worker num has been updated Set<Object> keys = conf.keySet(); Integer workerNum = JStormUtils.parseInt(conf.get(Config.TOPOLOGY_WORKERS)); if (workerNum != null) { Integer oldWorkerNum = JStormUtils.parseInt(topoConf.get(Config.TOPOLOGY_WORKERS)); topoConf.put(Config.TOPOLOGY_WORKERS, workerNum); isConfUpdate = true; LOG.info("Update worker num from " + oldWorkerNum + " to " + workerNum); } if (keys.contains(Config.ISOLATION_SCHEDULER_MACHINES)) { topoConf.put( 
Config.ISOLATION_SCHEDULER_MACHINES, conf.get(Config.ISOLATION_SCHEDULER_MACHINES)); } if (isConfUpdate) { StormConfig.write_nimbus_topology_conf(stormConf, topologyid, topoConf); } } TopologyAssignEvent event = new TopologyAssignEvent(); event.setTopologyId(topologyid); event.setScratch(true); event.setOldStatus(oldStatus); event.setReassign(reassign); if (conf != null) event.setScaleTopology(true); TopologyAssign.push(event); event.waitFinish(); } catch (Exception e) { LOG.error("do-rebalance error!", e); // Rollback the changes on ZK if (isSetTaskInfo) { try { StormClusterState clusterState = data.getStormClusterState(); clusterState.remove_task(topologyid, newTasks); } catch (Exception e1) { LOG.error("Failed to rollback the changes on ZK for task-" + newTasks, e); } } } DelayStatusTransitionCallback delayCallback = new DelayStatusTransitionCallback( data, topologyid, oldStatus, StatusType.rebalancing, StatusType.done_rebalance); return delayCallback.execute(); }
@Override public SupervisorWorkers getSupervisorWorkers(String host) throws NotAliveException, TException { try { StormClusterState stormClusterState = data.getStormClusterState(); String supervisorId = null; SupervisorInfo supervisorInfo = null; String ip = NetWorkUtils.host2Ip(host); String hostName = NetWorkUtils.ip2Host(host); // all supervisors Map<String, SupervisorInfo> supervisorInfos = Cluster.allSupervisorInfo(stormClusterState, null); for (Entry<String, SupervisorInfo> entry : supervisorInfos.entrySet()) { SupervisorInfo info = entry.getValue(); if (info.getHostName().equals(hostName) || info.getHostName().equals(ip)) { supervisorId = entry.getKey(); supervisorInfo = info; break; } } if (supervisorId == null) { throw new TException("No supervisor of " + host); } Map<String, Assignment> assignments = new HashMap<String, Assignment>(); // get all active topology's StormBase Map<String, StormBase> bases = Cluster.topology_bases(stormClusterState); for (Entry<String, StormBase> entry : bases.entrySet()) { String topologyId = entry.getKey(); StormBase base = entry.getValue(); Assignment assignment = stormClusterState.assignment_info(topologyId, null); if (assignment == null) { LOG.error("Failed to get assignment of " + topologyId); continue; } assignments.put(topologyId, assignment); } Map<Integer, WorkerSummary> portWorkerSummarys = new TreeMap<Integer, WorkerSummary>(); for (Entry<String, Assignment> entry : assignments.entrySet()) { String topologyId = entry.getKey(); Assignment assignment = entry.getValue(); Map<Integer, String> taskToComponent = Cluster.topology_task_info(stormClusterState, topologyId); Map<Integer, ResourceAssignment> taskToResource = assignment.getTaskToResource(); for (Entry<Integer, ResourceAssignment> resourceEntry : taskToResource.entrySet()) { Integer taskId = resourceEntry.getKey(); ResourceAssignment resourceAssignment = resourceEntry.getValue(); if (supervisorId.equals(resourceAssignment.getSupervisorId()) == false) { continue; 
} supervisorInfo.allocResource(resourceAssignment); Integer port = resourceAssignment.getPort(); WorkerSummary workerSummary = portWorkerSummarys.get(port); if (workerSummary == null) { workerSummary = new WorkerSummary(); workerSummary.set_port(port); workerSummary.set_topology(topologyId); workerSummary.set_tasks(new ArrayList<TaskSummary>()); portWorkerSummarys.put(port, workerSummary); } String componentName = taskToComponent.get(taskId); int uptime = TimeUtils.time_delta(assignment.getTaskStartTimeSecs().get(taskId)); List<TaskSummary> tasks = workerSummary.get_tasks(); TaskSummary taskSummary = NimbusUtils.mkSimpleTaskSummary( resourceAssignment, taskId, componentName, host, uptime); tasks.add(taskSummary); } } List<WorkerSummary> wokersList = new ArrayList<WorkerSummary>(); wokersList.addAll(portWorkerSummarys.values()); SupervisorSummary supervisorSummary = NimbusUtils.mkSupervisorSummary(supervisorInfo, supervisorId); return new SupervisorWorkers(supervisorSummary, wokersList); } catch (TException e) { LOG.info("Failed to get ClusterSummary ", e); throw e; } catch (Exception e) { LOG.info("Failed to get ClusterSummary ", e); throw new TException(e); } }
/** * get cluster's summary, it will contain SupervisorSummary and TopologySummary * * @return ClusterSummary */ @Override public ClusterSummary getClusterInfo() throws TException { try { StormClusterState stormClusterState = data.getStormClusterState(); Map<String, Assignment> assignments = new HashMap<String, Assignment>(); // get nimbus running time int uptime = data.uptime(); // get TopologySummary List<TopologySummary> topologySummaries = new ArrayList<TopologySummary>(); // get all active topology's StormBase Map<String, StormBase> bases = Cluster.topology_bases(stormClusterState); for (Entry<String, StormBase> entry : bases.entrySet()) { String topologyId = entry.getKey(); StormBase base = entry.getValue(); Assignment assignment = stormClusterState.assignment_info(topologyId, null); if (assignment == null) { LOG.error("Failed to get assignment of " + topologyId); continue; } assignments.put(topologyId, assignment); String group = "default"; if (data.isGroupMode()) group = base.getGroup(); if (group == null) group = "default"; TopologySummary topology = NimbusUtils.mkTopologySummary( assignment, topologyId, base.getStormName(), base.getStatusString(), TimeUtils.time_delta(base.getLanchTimeSecs()), group); topologySummaries.add(topology); } // all supervisors Map<String, SupervisorInfo> supervisorInfos = Cluster.allSupervisorInfo(stormClusterState, null); // generate SupervisorSummaries List<SupervisorSummary> supervisorSummaries = NimbusUtils.mkSupervisorSummaries(supervisorInfos, assignments); return new ClusterSummary( supervisorSummaries, uptime, topologySummaries, data.getGroupToTopology(), data.getGroupToResource(), data.getGroupToUsedResource(), data.isGroupMode()); } catch (TException e) { LOG.info("Failed to get ClusterSummary ", e); throw e; } catch (Exception e) { LOG.info("Failed to get ClusterSummary ", e); throw new TException(e); } }
/** * Submit one Topology * * @param topologyname String: topology name * @param uploadedJarLocation String: already uploaded jar path * @param jsonConf String: jsonConf serialize all toplogy configuration to Json * @param topology StormTopology: topology Object */ @SuppressWarnings("unchecked") @Override public void submitTopologyWithOpts( String topologyname, String uploadedJarLocation, String jsonConf, StormTopology topology, SubmitOptions options) throws AlreadyAliveException, InvalidTopologyException, TopologyAssignException, TException { LOG.info("Receive " + topologyname + ", uploadedJarLocation:" + uploadedJarLocation); // @@@ Move validate topologyname in client code try { checkTopologyActive(data, topologyname, false); } catch (AlreadyAliveException e) { LOG.info(topologyname + " is already exist "); throw e; } catch (Exception e) { LOG.info("Failed to check whether topology is alive or not", e); throw new TException(e); } int counter = data.getSubmittedCount().incrementAndGet(); String topologyId = topologyname + "-" + counter + "-" + TimeUtils.current_time_secs(); Map<Object, Object> serializedConf = (Map<Object, Object>) JStormUtils.from_json(jsonConf); if (serializedConf == null) { LOG.warn("Failed to serialized Configuration"); throw new InvalidTopologyException("Failed to serilaze topology configuration"); } serializedConf.put(Config.TOPOLOGY_ID, topologyId); serializedConf.put(Config.TOPOLOGY_NAME, topologyname); try { Map<Object, Object> stormConf; stormConf = NimbusUtils.normalizeConf(conf, serializedConf, topology); Map<Object, Object> totalStormConf = new HashMap<Object, Object>(conf); totalStormConf.putAll(stormConf); StormTopology normalizedTopology = NimbusUtils.normalizeTopology(stormConf, topology); // this validates the structure of the topology Common.validate_basic(normalizedTopology, totalStormConf, topologyId); // don't need generate real topology, so skip Common.system_topology // Common.system_topology(totalStormConf, topology); 
StormClusterState stormClusterState = data.getStormClusterState(); // create /local-dir/nimbus/topologyId/xxxx files setupStormCode(conf, topologyId, uploadedJarLocation, stormConf, normalizedTopology); // generate TaskInfo for every bolt or spout in ZK // /ZK/tasks/topoologyId/xxx setupZkTaskInfo(conf, topologyId, stormClusterState); // make assignments for a topology TopologyAssignEvent assignEvent = new TopologyAssignEvent(); assignEvent.setTopologyId(topologyId); assignEvent.setScratch(false); assignEvent.setTopologyName(topologyname); assignEvent.setOldStatus( Thrift.topologyInitialStatusToStormStatus(options.get_initial_status())); TopologyAssign.push(assignEvent); LOG.info("Submit for " + topologyname + " with conf " + serializedConf); boolean isSuccess = assignEvent.waitFinish(); if (isSuccess == true) { LOG.info("Finish submit for " + topologyname); } else { throw new FailedAssignTopologyException(assignEvent.getErrorMsg()); } } catch (FailedAssignTopologyException e) { StringBuilder sb = new StringBuilder(); sb.append("Fail to sumbit topology, Root cause:"); if (e.getMessage() == null) { sb.append("submit timeout"); } else { sb.append(e.getMessage()); } sb.append("\n\n"); sb.append("topologyId:" + topologyId); sb.append(", uploadedJarLocation:" + uploadedJarLocation + "\n"); LOG.error(sb.toString(), e); throw new TopologyAssignException(sb.toString()); } catch (InvalidParameterException e) { StringBuilder sb = new StringBuilder(); sb.append("Fail to sumbit topology "); sb.append(e.getMessage()); sb.append(", cause:" + e.getCause()); sb.append("\n\n"); sb.append("topologyId:" + topologyId); sb.append(", uploadedJarLocation:" + uploadedJarLocation + "\n"); LOG.error(sb.toString(), e); throw new InvalidParameterException(sb.toString()); } catch (Throwable e) { StringBuilder sb = new StringBuilder(); sb.append("Fail to sumbit topology "); sb.append(e.getMessage()); sb.append(", cause:" + e.getCause()); sb.append("\n\n"); sb.append("topologyId:" + topologyId); 
sb.append(", uploadedJarLocation:" + uploadedJarLocation + "\n"); LOG.error(sb.toString(), e); throw new TopologyAssignException(sb.toString()); } }