/**
 * Check whether a topology is active, looked up by topology name.
 *
 * @param stormClusterState see Cluster_clj
 * @param topologyName      name of the topology
 * @return true if the topology is active, otherwise false
 * @throws Exception
 */
public boolean isTopologyActive(StormClusterState stormClusterState, String topologyName) throws Exception {
    boolean rtn = false;
    if (Cluster.get_topology_id(stormClusterState, topologyName) != null) {
        rtn = true;
    }
    return rtn;
}
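// Hedged usage sketch (added, not part of the original source): how isTopologyActive() might be
// used to reject a duplicate submission. "conf" is assumed to be a storm configuration Map;
// Cluster.mk_storm_cluster_state(conf) is taken from its use in mkSupervisor() below, and the
// helper name is illustrative.
private void rejectDuplicateSubmit(Map conf, String topologyName) throws Exception {
    StormClusterState stormClusterState = Cluster.mk_storm_cluster_state(conf);
    if (isTopologyActive(stormClusterState, topologyName)) {
        // an active topology with the same name already exists, so refuse the new submission
        throw new RuntimeException(topologyName + " is already active");
    }
}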
/**
 * Initialize the per-worker lookup maps:
 * private ConcurrentHashMap<Integer, WorkerSlot> taskNodeport;
 * private HashMap<Integer, String> tasksToComponent;
 * private Map<String, List<Integer>> componentToSortedTasks;
 * private Map<String, Map<String, Fields>> componentToStreamToFields;
 * private Map<String, Object> defaultResources;
 * private Map<String, Object> userResources;
 * private Map<String, Object> executorData;
 * private Map registeredMetrics;
 *
 * @throws Exception
 */
private void generateMaps() throws Exception {
    this.tasksToComponent = Cluster.topology_task_info(zkCluster, topologyId);
    LOG.info("Map<taskId, component>:" + tasksToComponent);

    this.componentToSortedTasks = JStormUtils.reverse_map(tasksToComponent);
    for (java.util.Map.Entry<String, List<Integer>> entry : componentToSortedTasks.entrySet()) {
        List<Integer> tasks = entry.getValue();
        Collections.sort(tasks);
    }

    this.defaultResources = new HashMap<String, Object>();
    this.userResources = new HashMap<String, Object>();
    this.executorData = new HashMap<String, Object>();
    this.registeredMetrics = new HashMap();
}
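// Illustration (added, not original): JStormUtils.reverse_map in generateMaps() turns the
// task->component map, e.g. {1=spout, 2=bolt, 3=bolt}, into the component->tasks map
// {spout=[1], bolt=[2, 3]}; the loop in generateMaps() then sorts each task list in place.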
/**
 * Get TopologyInfo, which contains all data about the topology's running status.
 *
 * @return TopologyInfo
 */
@Override
public TopologyInfo getTopologyInfo(String topologyId) throws NotAliveException, TException {
    TopologyInfo topologyInfo = new TopologyInfo();

    StormClusterState stormClusterState = data.getStormClusterState();

    try {
        // get topology's StormBase
        StormBase base = stormClusterState.storm_base(topologyId, null);
        if (base == null) {
            throw new NotAliveException("No topology of " + topologyId);
        }
        topologyInfo.set_id(topologyId);
        topologyInfo.set_name(base.getStormName());
        topologyInfo.set_uptime_secs(TimeUtils.time_delta(base.getLanchTimeSecs()));
        topologyInfo.set_status(base.getStatusString());

        // get topology's Assignment
        Assignment assignment = stormClusterState.assignment_info(topologyId, null);
        if (assignment == null) {
            throw new TException("Failed to get Assignment from ZK of " + topologyId);
        }

        // get topology's map<taskId, componentId>
        Map<Integer, String> taskInfo = Cluster.topology_task_info(stormClusterState, topologyId);

        List<TaskSummary> tasks = NimbusUtils.mkTaskSummary(stormClusterState, assignment, taskInfo, topologyId);
        topologyInfo.set_tasks(tasks);

        return topologyInfo;
    } catch (TException e) {
        LOG.info("Failed to get topologyInfo " + topologyId, e);
        throw e;
    } catch (Exception e) {
        LOG.info("Failed to get topologyInfo " + topologyId, e);
        throw new TException("Failed to get topologyInfo " + topologyId);
    }
}
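// Hedged usage sketch (added): consuming the TopologyInfo returned above from a Thrift client.
// It assumes the standard Nimbus.Client stub exposes getTopologyInfo(), as the @Override above
// suggests, and that the get_* accessors mirror the set_* calls used in getTopologyInfo().
private static void printTopologyInfo(Nimbus.Client client, String topologyId) throws Exception {
    TopologyInfo info = client.getTopologyInfo(topologyId);
    System.out.println("topology " + info.get_name() + " [" + info.get_status() + "] up "
            + info.get_uptime_secs() + "s with " + info.get_tasks().size() + " tasks");
}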
/**
 * Get free slots: remove every already-assigned worker port from the supervisors' port lists,
 * so that only free resources remain.
 *
 * @param supervisorInfos
 * @param stormClusterState
 * @throws Exception
 */
public static void getFreeSlots(Map<String, SupervisorInfo> supervisorInfos,
        StormClusterState stormClusterState) throws Exception {

    Map<String, Assignment> assignments = Cluster.get_all_assignment(stormClusterState, null);

    for (Entry<String, Assignment> entry : assignments.entrySet()) {
        String topologyId = entry.getKey();
        Assignment assignment = entry.getValue();

        Set<ResourceWorkerSlot> workers = assignment.getWorkers();

        for (ResourceWorkerSlot worker : workers) {
            SupervisorInfo supervisorInfo = supervisorInfos.get(worker.getNodeId());
            if (supervisorInfo == null) {
                // the supervisor is dead
                continue;
            }
            supervisorInfo.getWorkerPorts().remove(worker.getPort());
        }
    }
}
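// Hedged usage sketch (added): counting the slots that remain free after getFreeSlots() has
// removed every assigned port. Cluster.allSupervisorInfo() is taken from its use elsewhere in
// this code; the counting helper itself is illustrative.
public static int countFreeSlots(StormClusterState stormClusterState) throws Exception {
    Map<String, SupervisorInfo> supervisorInfos = Cluster.allSupervisorInfo(stormClusterState, null);
    getFreeSlots(supervisorInfos, stormClusterState);

    int freeSlotNum = 0;
    for (SupervisorInfo supervisorInfo : supervisorInfos.values()) {
        // after getFreeSlots(), getWorkerPorts() only holds ports with no worker assigned
        freeSlotNum += supervisorInfo.getWorkerPorts().size();
    }
    return freeSlotNum;
}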
/**
 * Backup the topology's Assignment to ZK.
 *
 * @@@ Question: do we need to do the backup operation every time?
 *
 * @param assignment
 * @param event
 */
public void backupAssignment(Assignment assignment, TopologyAssignEvent event) {
    String topologyId = event.getTopologyId();
    String topologyName = event.getTopologyName();
    try {
        StormClusterState zkClusterState = nimbusData.getStormClusterState();

        // one little problem: the tasks are fetched twice when assigning one topology
        HashMap<Integer, String> tasks = Cluster.topology_task_info(zkClusterState, topologyId);

        Map<String, List<Integer>> componentTasks = JStormUtils.reverse_map(tasks);

        for (Entry<String, List<Integer>> entry : componentTasks.entrySet()) {
            List<Integer> keys = entry.getValue();
            Collections.sort(keys);
        }

        AssignmentBak assignmentBak = new AssignmentBak(componentTasks, assignment);
        zkClusterState.backup_assignment(topologyName, assignmentBak);
    } catch (Exception e) {
        LOG.warn("Failed to backup " + topologyId + " assignment " + assignment, e);
    }
}
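// Hedged sketch of the read side (added): prepareTopologyAssign() below restores this backup via
// assignment_bak(topologyName) when a topology is submitted with no existing assignment.
// The helper name is illustrative.
private Assignment restoreBackupAssignment(StormClusterState zkClusterState, String topologyName)
        throws Exception {
    AssignmentBak assignmentBak = zkClusterState.assignment_bak(topologyName);
    if (assignmentBak == null) {
        // no backup was ever written for this topology name
        return null;
    }
    return assignmentBak.getAssignment();
}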
@SuppressWarnings({"rawtypes", "unchecked"}) public WorkerData( Map conf, IContext context, String topology_id, String supervisor_id, int port, String worker_id, String jar_path) throws Exception { this.conf = conf; this.context = context; this.topologyId = topology_id; this.supervisorId = supervisor_id; this.port = port; this.workerId = worker_id; this.active = new AtomicBoolean(true); this.topologyStatus = StatusType.active; if (StormConfig.cluster_mode(conf).equals("distributed")) { String pidDir = StormConfig.worker_pids_root(conf, worker_id); JStormServerUtils.createPid(pidDir); } // create zk interface this.zkClusterstate = ZkTool.mk_distributed_cluster_state(conf); this.zkCluster = Cluster.mk_storm_cluster_state(zkClusterstate); Map rawConf = StormConfig.read_supervisor_topology_conf(conf, topology_id); this.stormConf = new HashMap<Object, Object>(); this.stormConf.putAll(conf); this.stormConf.putAll(rawConf); LOG.info("Worker Configuration " + stormConf); try { boolean enableClassloader = ConfigExtension.isEnableTopologyClassLoader(stormConf); boolean enableDebugClassloader = ConfigExtension.isEnableClassloaderDebug(stormConf); if (jar_path == null && enableClassloader == true) { LOG.error("enable classloader, but not app jar"); throw new InvalidParameterException(); } URL[] urlArray = new URL[0]; if (jar_path != null) { String[] paths = jar_path.split(":"); Set<URL> urls = new HashSet<URL>(); for (String path : paths) { if (StringUtils.isBlank(path)) continue; URL url = new URL("File:" + path); urls.add(url); } urlArray = urls.toArray(new URL[0]); } WorkerClassLoader.mkInstance( urlArray, ClassLoader.getSystemClassLoader(), ClassLoader.getSystemClassLoader().getParent(), enableClassloader, enableDebugClassloader); } catch (Exception e) { // TODO Auto-generated catch block LOG.error("init jarClassLoader error!", e); throw new InvalidParameterException(); } if (this.context == null) { this.context = TransportFactory.makeContext(stormConf); } boolean disruptorUseSleep = ConfigExtension.isDisruptorUseSleep(stormConf); DisruptorQueue.setUseSleep(disruptorUseSleep); boolean isLimited = ConfigExtension.getTopologyBufferSizeLimited(stormConf); DisruptorQueue.setLimited(isLimited); LOG.info("Disruptor use sleep:" + disruptorUseSleep + ", limited size:" + isLimited); // this.transferQueue = new LinkedBlockingQueue<TransferData>(); int buffer_size = Utils.getInt(conf.get(Config.TOPOLOGY_TRANSFER_BUFFER_SIZE)); WaitStrategy waitStrategy = (WaitStrategy) Utils.newInstance((String) conf.get(Config.TOPOLOGY_DISRUPTOR_WAIT_STRATEGY)); this.transferQueue = DisruptorQueue.mkInstance("TotalTransfer", ProducerType.MULTI, buffer_size, waitStrategy); this.transferQueue.consumerStarted(); this.sendingQueue = DisruptorQueue.mkInstance("TotalSending", ProducerType.MULTI, buffer_size, waitStrategy); this.sendingQueue.consumerStarted(); this.nodeportSocket = new ConcurrentHashMap<WorkerSlot, IConnection>(); this.taskNodeport = new ConcurrentHashMap<Integer, WorkerSlot>(); this.workerToResource = new ConcurrentSkipListSet<ResourceWorkerSlot>(); this.innerTaskTransfer = new ConcurrentHashMap<Integer, DisruptorQueue>(); this.deserializeQueues = new ConcurrentHashMap<Integer, DisruptorQueue>(); Assignment assignment = zkCluster.assignment_info(topologyId, null); if (assignment == null) { String errMsg = "Failed to get Assignment of " + topologyId; LOG.error(errMsg); throw new RuntimeException(errMsg); } workerToResource.addAll(assignment.getWorkers()); // get current worker's task list this.taskids = 
assignment.getCurrentWorkerTasks(supervisorId, port); if (taskids.size() == 0) { throw new RuntimeException("No tasks running current workers"); } LOG.info("Current worker taskList:" + taskids); // deserialize topology code from local dir rawTopology = StormConfig.read_supervisor_topology_code(conf, topology_id); sysTopology = Common.system_topology(stormConf, rawTopology); generateMaps(); contextMaker = new ContextMaker(this); metricReporter = new MetricReporter(this); outTaskStatus = new HashMap<Integer, Boolean>(); threadPool = Executors.newScheduledThreadPool(THREAD_POOL_NUM); TimerTrigger.setScheduledExecutorService(threadPool); LOG.info("Successfully create WorkerData"); }
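// Hedged construction sketch (added, illustration only): how a worker launcher might build
// WorkerData. All literal values are placeholders; in JStorm the supervisor passes the real
// topology/supervisor/worker ids, the assigned port, and the downloaded jar path.
// Utils.readStormConfig() is assumed from storm's standard utilities.
public static WorkerData mkWorkerDataSketch() throws Exception {
    Map conf = Utils.readStormConfig();
    return new WorkerData(conf, null /* context is created from stormConf when null */,
            "sequence-test-1-1", "supervisor-id", 6800, "worker-id", null /* no user jar */);
}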
@Override
public SupervisorWorkers getSupervisorWorkers(String host) throws NotAliveException, TException {
    try {
        StormClusterState stormClusterState = data.getStormClusterState();

        String supervisorId = null;
        SupervisorInfo supervisorInfo = null;

        String ip = NetWorkUtils.host2Ip(host);
        String hostName = NetWorkUtils.ip2Host(host);

        // all supervisors
        Map<String, SupervisorInfo> supervisorInfos = Cluster.allSupervisorInfo(stormClusterState, null);

        for (Entry<String, SupervisorInfo> entry : supervisorInfos.entrySet()) {
            SupervisorInfo info = entry.getValue();
            if (info.getHostName().equals(hostName) || info.getHostName().equals(ip)) {
                supervisorId = entry.getKey();
                supervisorInfo = info;
                break;
            }
        }

        if (supervisorId == null) {
            throw new TException("No supervisor of " + host);
        }

        Map<String, Assignment> assignments = new HashMap<String, Assignment>();

        // get every active topology's StormBase
        Map<String, StormBase> bases = Cluster.topology_bases(stormClusterState);

        for (Entry<String, StormBase> entry : bases.entrySet()) {
            String topologyId = entry.getKey();
            StormBase base = entry.getValue();

            Assignment assignment = stormClusterState.assignment_info(topologyId, null);
            if (assignment == null) {
                LOG.error("Failed to get assignment of " + topologyId);
                continue;
            }
            assignments.put(topologyId, assignment);
        }

        Map<Integer, WorkerSummary> portWorkerSummarys = new TreeMap<Integer, WorkerSummary>();
        for (Entry<String, Assignment> entry : assignments.entrySet()) {
            String topologyId = entry.getKey();
            Assignment assignment = entry.getValue();

            Map<Integer, String> taskToComponent = Cluster.topology_task_info(stormClusterState, topologyId);

            Map<Integer, ResourceAssignment> taskToResource = assignment.getTaskToResource();

            for (Entry<Integer, ResourceAssignment> resourceEntry : taskToResource.entrySet()) {
                Integer taskId = resourceEntry.getKey();
                ResourceAssignment resourceAssignment = resourceEntry.getValue();

                if (supervisorId.equals(resourceAssignment.getSupervisorId()) == false) {
                    continue;
                }

                supervisorInfo.allocResource(resourceAssignment);

                Integer port = resourceAssignment.getPort();
                WorkerSummary workerSummary = portWorkerSummarys.get(port);
                if (workerSummary == null) {
                    workerSummary = new WorkerSummary();
                    workerSummary.set_port(port);
                    workerSummary.set_topology(topologyId);
                    workerSummary.set_tasks(new ArrayList<TaskSummary>());

                    portWorkerSummarys.put(port, workerSummary);
                }

                String componentName = taskToComponent.get(taskId);
                int uptime = TimeUtils.time_delta(assignment.getTaskStartTimeSecs().get(taskId));
                List<TaskSummary> tasks = workerSummary.get_tasks();

                TaskSummary taskSummary =
                        NimbusUtils.mkSimpleTaskSummary(resourceAssignment, taskId, componentName, host, uptime);

                tasks.add(taskSummary);
            }
        }

        List<WorkerSummary> workersList = new ArrayList<WorkerSummary>();
        workersList.addAll(portWorkerSummarys.values());

        SupervisorSummary supervisorSummary = NimbusUtils.mkSupervisorSummary(supervisorInfo, supervisorId);

        return new SupervisorWorkers(supervisorSummary, workersList);
    } catch (TException e) {
        LOG.info("Failed to get SupervisorWorkers of " + host, e);
        throw e;
    } catch (Exception e) {
        LOG.info("Failed to get SupervisorWorkers of " + host, e);
        throw new TException(e);
    }
}
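// Hedged usage sketch (added): walking the SupervisorWorkers result for one host. It assumes
// getSupervisorWorkers() is exposed on the Nimbus.Iface thrift interface, as the @Override above
// suggests, and that the get_workers()/get_port()/get_tasks()/get_topology() accessors mirror the
// constructor and set_* calls used in the method.
private static void printSupervisorWorkers(Nimbus.Iface nimbus, String host) throws Exception {
    SupervisorWorkers supervisorWorkers = nimbus.getSupervisorWorkers(host);
    for (WorkerSummary workerSummary : supervisorWorkers.get_workers()) {
        System.out.println("port " + workerSummary.get_port() + " runs "
                + workerSummary.get_tasks().size() + " tasks of " + workerSummary.get_topology());
    }
}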
/**
 * Get the cluster summary, containing the SupervisorSummary and TopologySummary lists.
 *
 * @return ClusterSummary
 */
@Override
public ClusterSummary getClusterInfo() throws TException {
    try {
        StormClusterState stormClusterState = data.getStormClusterState();

        Map<String, Assignment> assignments = new HashMap<String, Assignment>();

        // get nimbus running time
        int uptime = data.uptime();

        // get TopologySummary
        List<TopologySummary> topologySummaries = new ArrayList<TopologySummary>();

        // get every active topology's StormBase
        Map<String, StormBase> bases = Cluster.topology_bases(stormClusterState);
        for (Entry<String, StormBase> entry : bases.entrySet()) {
            String topologyId = entry.getKey();
            StormBase base = entry.getValue();

            Assignment assignment = stormClusterState.assignment_info(topologyId, null);
            if (assignment == null) {
                LOG.error("Failed to get assignment of " + topologyId);
                continue;
            }
            assignments.put(topologyId, assignment);

            String group = "default";
            if (data.isGroupMode())
                group = base.getGroup();
            if (group == null)
                group = "default";

            TopologySummary topology = NimbusUtils.mkTopologySummary(assignment, topologyId,
                    base.getStormName(), base.getStatusString(),
                    TimeUtils.time_delta(base.getLanchTimeSecs()), group);

            topologySummaries.add(topology);
        }

        // all supervisors
        Map<String, SupervisorInfo> supervisorInfos = Cluster.allSupervisorInfo(stormClusterState, null);

        // generate SupervisorSummaries
        List<SupervisorSummary> supervisorSummaries =
                NimbusUtils.mkSupervisorSummaries(supervisorInfos, assignments);

        return new ClusterSummary(supervisorSummaries, uptime, topologySummaries,
                data.getGroupToTopology(), data.getGroupToResource(), data.getGroupToUsedResource(),
                data.isGroupMode());
    } catch (TException e) {
        LOG.info("Failed to get ClusterSummary ", e);
        throw e;
    } catch (Exception e) {
        LOG.info("Failed to get ClusterSummary ", e);
        throw new TException(e);
    }
}
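// Hedged usage sketch (added): reading the ClusterSummary returned above through a Nimbus client.
// The accessor names are assumed to mirror the constructor arguments (supervisors, nimbus uptime,
// topologies); only the standard thrift-style getters are used here.
private static void printClusterInfo(Nimbus.Iface nimbus) throws Exception {
    ClusterSummary clusterSummary = nimbus.getClusterInfo();
    System.out.println("nimbus up " + clusterSummary.get_nimbus_uptime_secs() + "s, "
            + clusterSummary.get_supervisors().size() + " supervisors, "
            + clusterSummary.get_topologies().size() + " topologies");
}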
protected TopologyAssignContext prepareTopologyAssign(TopologyAssignEvent event) throws Exception {
    TopologyAssignContext ret = new TopologyAssignContext();

    String topologyId = event.getTopologyId();

    /** Read stormconf.ser and stormcode.ser from the local directory */
    Map<Object, Object> nimbusConf = nimbusData.getConf();
    Map<Object, Object> topologyConf = StormConfig.read_nimbus_topology_conf(nimbusConf, topologyId);

    StormTopology rawTopology = StormConfig.read_nimbus_topology_code(nimbusConf, topologyId);
    ret.setRawTopology(rawTopology);

    Map stormConf = new HashMap();
    stormConf.putAll(nimbusConf);
    stormConf.putAll(topologyConf);
    ret.setStormConf(stormConf);

    StormClusterState stormClusterState = nimbusData.getStormClusterState();

    // get all running supervisors; no callback is needed to watch them
    /** Get every running supervisor and its SupervisorInfo */
    Map<String, SupervisorInfo> supInfos = Cluster.allSupervisorInfo(stormClusterState, null);
    if (supInfos.size() == 0) {
        throw new FailedAssignTopologyException(
                "Failed to make assignment " + topologyId + ", due to no alive supervisor");
    }

    /** Get all tasks of this topologyId */
    Map<Integer, String> taskToComponent = Cluster.topology_task_info(stormClusterState, topologyId);
    ret.setTaskToComponent(taskToComponent);

    // get task ids from /ZK/tasks/topologyId
    Set<Integer> allTaskIds = taskToComponent.keySet();
    if (allTaskIds == null || allTaskIds.size() == 0) {
        String errMsg = "Failed to get all task ID list from /ZK-dir/tasks/" + topologyId;
        LOG.warn(errMsg);
        throw new IOException(errMsg);
    }
    ret.setAllTaskIds(allTaskIds);

    Set<Integer> aliveTasks = new HashSet<Integer>();
    // unstoppedTasks are tasks that are still alive although their supervisor is dead
    /** Unstopped tasks: the supervisor has died, but the tasks have not finished */
    Set<Integer> unstoppedTasks = new HashSet<Integer>();
    Set<Integer> deadTasks = new HashSet<Integer>();
    Set<ResourceWorkerSlot> unstoppedWorkers = new HashSet<ResourceWorkerSlot>();

    Assignment existingAssignment = stormClusterState.assignment_info(topologyId, null);
    if (existingAssignment != null) {
        aliveTasks = getAliveTasks(topologyId, allTaskIds);
        unstoppedTasks = getUnstoppedSlots(aliveTasks, supInfos, existingAssignment);

        deadTasks.addAll(allTaskIds);
        deadTasks.removeAll(aliveTasks);
    }

    ret.setDeadTaskIds(deadTasks);
    ret.setUnstoppedTaskIds(unstoppedTasks);

    // Step 2: get all slot resources: free slots / alive slots / unstopped slots
    getFreeSlots(supInfos, stormClusterState);
    ret.setCluster(supInfos);

    if (existingAssignment == null) {
        ret.setAssignType(TopologyAssignContext.ASSIGN_TYPE_NEW);

        try {
            AssignmentBak lastAssignment = stormClusterState.assignment_bak(event.getTopologyName());
            if (lastAssignment != null) {
                ret.setOldAssignment(lastAssignment.getAssignment());
            }
        } catch (Exception e) {
            LOG.warn("Failed to get old assignment", e);
        }
    } else {
        ret.setOldAssignment(existingAssignment);
        if (event.isScratch()) {
            ret.setAssignType(TopologyAssignContext.ASSIGN_TYPE_REBALANCE);
            unstoppedWorkers = getUnstoppedWorkers(unstoppedTasks, existingAssignment);
            ret.setUnstoppedWorkers(unstoppedWorkers);
        } else {
            ret.setAssignType(TopologyAssignContext.ASSIGN_TYPE_MONITOR);
            unstoppedWorkers = getUnstoppedWorkers(aliveTasks, existingAssignment);
            ret.setUnstoppedWorkers(unstoppedWorkers);
        }
    }

    return ret;
}
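// Hedged follow-up sketch (added): how a scheduler might branch on the prepared context.
// The get* accessors are assumed to mirror the set* calls used in prepareTopologyAssign();
// the helper itself is illustrative.
private void describeAssignType(TopologyAssignContext context) {
    int assignType = context.getAssignType();
    if (assignType == TopologyAssignContext.ASSIGN_TYPE_NEW) {
        // fresh submit: the old assignment, if any, only comes from the ZK backup
        LOG.info("New assignment, dead tasks: " + context.getDeadTaskIds());
    } else if (assignType == TopologyAssignContext.ASSIGN_TYPE_REBALANCE) {
        // rebalance: keep the workers whose supervisor is gone but whose tasks are still alive
        LOG.info("Rebalance, unstopped workers: " + context.getUnstoppedWorkers());
    } else {
        // ASSIGN_TYPE_MONITOR: only the tasks found dead need to be given new slots
        LOG.info("Monitor, dead tasks: " + context.getDeadTaskIds());
    }
}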
/**
 * Create and start one supervisor.
 *
 * @param conf          : configuration (default.yaml and storm.yaml)
 * @param sharedContext : null (right now)
 * @return SupervisorManger, which is used to shut down the supervisor and all of its workers
 */
@SuppressWarnings("rawtypes")
public SupervisorManger mkSupervisor(Map conf, IContext sharedContext) throws Exception {

    LOG.info("Starting Supervisor with conf " + conf);

    active = new AtomicBoolean(true);

    /** Step 1: clean up all files in /storm-local-dir/supervisor/tmp */
    String path = StormConfig.supervisorTmpDir(conf);
    FileUtils.cleanDirectory(new File(path));

    /*
     * Step 2: create the ZK operation instance StormClusterState
     */
    StormClusterState stormClusterState = Cluster.mk_storm_cluster_state(conf);

    /*
     * Step 3: create LocalState (a local KV database)
     * 3.1 create the LocalState instance
     * 3.2 get supervisorId; if there is no supervisorId, create one
     */
    LocalState localState = StormConfig.supervisorState(conf);

    String supervisorId = (String) localState.get(Common.LS_ID);
    if (supervisorId == null) {
        supervisorId = UUID.randomUUID().toString();
        localState.put(Common.LS_ID, supervisorId);
    }

    Vector<SmartThread> threads = new Vector<SmartThread>();

    // Step 4: create the heartbeat thread
    // every supervisor.heartbeat.frequency.secs, write SupervisorInfo to ZK
    String myHostName = null;
    myHostName = ConfigExtension.getSupervisorHost(conf);
    if (myHostName == null) {
        myHostName = NetWorkUtils.hostname();
    }
    Heartbeat hb = new Heartbeat(conf, stormClusterState, supervisorId, myHostName, active);
    hb.update();
    AsyncLoopThread heartbeat = new AsyncLoopThread(hb, false, null, Thread.MIN_PRIORITY, true);
    threads.add(heartbeat);

    // Step 5: create and start the SyncSupervisor thread
    // every supervisor.monitor.frequency.secs seconds, run SyncSupervisorEvent
    EventManager processEventManager = new EventManagerImp(false);
    ConcurrentHashMap<String, String> workerThreadPids = new ConcurrentHashMap<String, String>();
    SyncProcessEvent syncProcessEvent =
            new SyncProcessEvent(supervisorId, conf, localState, workerThreadPids, sharedContext);

    EventManager syncSupEventManager = new EventManagerImp(false);
    SyncSupervisorEvent syncSupervisorEvent = new SyncSupervisorEvent(supervisorId, conf,
            processEventManager, syncSupEventManager, stormClusterState, localState, syncProcessEvent);

    int syncFrequency = JStormUtils.parseInt(conf.get(Config.SUPERVISOR_MONITOR_FREQUENCY_SECS));
    EventManagerPusher syncSupervisorPusher =
            new EventManagerPusher(syncSupEventManager, syncSupervisorEvent, active, syncFrequency);
    AsyncLoopThread syncSupervisorThread = new AsyncLoopThread(syncSupervisorPusher);
    threads.add(syncSupervisorThread);

    // Step 6: the SyncProcess thread is skipped, because nimbus already checks whether
    // a worker is dead or not and, if dead, reassigns a new worker
    //
    // int syncProcessFrequence = syncFrequence/2;
    // EventManagerPusher syncProcessPusher = new EventManagerPusher(
    // processEventManager, syncProcessEvent, active,
    // syncProcessFrequence);
    // AsyncLoopThread syncProcessThread = new AsyncLoopThread(syncProcessPusher);
    // threads.add(syncProcessThread);

    // Step 7: start the http server
    Httpserver httpserver = new Httpserver(conf);
    httpserver.start();

    LOG.info("Starting supervisor with id " + supervisorId + " at host " + myHostName);

    // SupervisorManger can shut down the supervisor and all of its workers
    return new SupervisorManger(conf, supervisorId, active, threads, syncSupEventManager,
            processEventManager, httpserver, stormClusterState, workerThreadPids);
}
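// Hedged launch sketch (added): how a supervisor daemon might call mkSupervisor().
// Utils.readStormConfig() is assumed from storm's standard utilities; the shutdown behaviour of
// the returned SupervisorManger is only described, not exercised, here.
public void runSupervisorSketch() throws Exception {
    Map conf = Utils.readStormConfig();
    SupervisorManger supervisorManager = mkSupervisor(conf, null);
    // supervisorManager keeps the threads created above and can later shut down
    // the supervisor together with all of its workers
    LOG.info("Supervisor started: " + supervisorManager);
}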