/** * Copy jar to /local-dir/nimbus/topologyId/stormjar.jar * * @param conf * @param tmpJarLocation * @param stormroot * @throws IOException */ private void setupJar(Map<Object, Object> conf, String tmpJarLocation, String stormroot) throws IOException { if (!StormConfig.local_mode(conf)) { File srcFile = new File(tmpJarLocation); if (!srcFile.exists()) { throw new IllegalArgumentException( tmpJarLocation + " to copy to " + stormroot + " does not exist!"); } String path = StormConfig.stormjar_path(stormroot); File destFile = new File(path); FileUtils.copyFile(srcFile, destFile); } }
/** * make assignments for a topology The nimbus core function, this function has been totally * rewrite * * @param nimbusData NimbusData * @param topologyId String * @param isScratch Boolean: isScratch is false unless rebalancing the topology * @throws Exception */ public Assignment mkAssignment(TopologyAssignEvent event) throws Exception { String topologyId = event.getTopologyId(); LOG.info("Determining assignment for " + topologyId); TopologyAssignContext context = prepareTopologyAssign(event); Set<ResourceWorkerSlot> assignments = null; if (!StormConfig.local_mode(nimbusData.getConf())) { IToplogyScheduler scheduler = schedulers.get(DEFAULT_SCHEDULER_NAME); assignments = scheduler.assignTasks(context); } else { assignments = mkLocalAssignment(context); } Assignment assignment = null; Map<String, String> nodeHost = getTopologyNodeHost(context.getCluster(), context.getOldAssignment(), assignments); Map<Integer, Integer> startTimes = getTaskStartTimes(context, nimbusData, topologyId, context.getOldAssignment(), assignments); String codeDir = StormConfig.masterStormdistRoot(nimbusData.getConf(), topologyId); assignment = new Assignment(codeDir, assignments, nodeHost, startTimes); StormClusterState stormClusterState = nimbusData.getStormClusterState(); stormClusterState.set_assignment(topologyId, assignment); // update task heartbeat's start time NimbusUtils.updateTaskHbStartTime(nimbusData, assignment, topologyId); // Update metrics information in ZK when rebalance or reassignment // Only update metrics monitor status when creating topology if (context.getAssignType() == TopologyAssignContext.ASSIGN_TYPE_REBALANCE || context.getAssignType() == TopologyAssignContext.ASSIGN_TYPE_MONITOR) NimbusUtils.updateMetricsInfo(nimbusData, topologyId, assignment); else metricsMonitor(event); LOG.info("Successfully make assignment for topology id " + topologyId + ": " + assignment); return assignment; }
/** * cleanup the topologies which are not in ZK /topology, but in other place * * @param nimbusData * @param active_topologys * @throws Exception */ public void cleanupDisappearedTopology() throws Exception { StormClusterState clusterState = nimbusData.getStormClusterState(); List<String> active_topologys = clusterState.active_storms(); if (active_topologys == null) { return; } Set<String> cleanupIds = get_cleanup_ids(clusterState, active_topologys); for (String topologyId : cleanupIds) { LOG.info("Cleaning up " + topologyId); clusterState.try_remove_storm(topologyId); // nimbusData.getTaskHeartbeatsCache().remove(topologyId); // get /nimbus/stormdist/topologyId String master_stormdist_root = StormConfig.masterStormdistRoot(nimbusData.getConf(), topologyId); try { // delete topologyId local dir PathUtils.rmr(master_stormdist_root); } catch (IOException e) { LOG.warn("Failed to delete " + master_stormdist_root + ",", e); } } }
/** * get StormTopology throw deserialize local files * * @param id String: topology id * @return StormTopology */ @Override public StormTopology getTopology(String id) throws NotAliveException, TException { StormTopology topology = null; try { StormTopology stormtopology = StormConfig.read_nimbus_topology_code(conf, id); if (stormtopology == null) { throw new TException("topology:" + id + "is null"); } Map<Object, Object> topologyConf = (Map<Object, Object>) StormConfig.read_nimbus_topology_conf(conf, id); topology = Common.system_topology(topologyConf, stormtopology); } catch (Exception e) { LOG.error("Failed to get topology " + id + ",", e); throw new TException("Failed to get system_topology"); } return topology; }
/** * generate a taskid(Integer) for every task * * @param conf * @param topologyid * @return Map<Integer, String>: from taskid to componentid * @throws IOException * @throws InvalidTopologyException */ public Map<Integer, String> mkTaskComponentAssignments( Map<Object, Object> conf, String topologyid) throws IOException, InvalidTopologyException { // @@@ here exist a little problem, // we can directly pass stormConf from Submit method Map<Object, Object> stormConf = StormConfig.read_nimbus_topology_conf(conf, topologyid); StormTopology stopology = StormConfig.read_nimbus_topology_code(conf, topologyid); // use TreeMap to make task as sequence Map<Integer, String> rtn = new TreeMap<Integer, String>(); StormTopology topology = Common.system_topology(stormConf, stopology); Integer count = 0; count = mkTaskMaker(stormConf, topology.get_bolts(), rtn, count); count = mkTaskMaker(stormConf, topology.get_spouts(), rtn, count); count = mkTaskMaker(stormConf, topology.get_state_spouts(), rtn, count); return rtn; }
/** * get topology configuration * * @param id String: topology id * @return String */ @Override public String getTopologyConf(String id) throws NotAliveException, TException { String rtn; try { Map<Object, Object> topologyConf = StormConfig.read_nimbus_topology_conf(conf, id); rtn = JStormUtils.to_json(topologyConf); } catch (IOException e) { // TODO Auto-generated catch block LOG.info("Failed to get configuration of " + id, e); throw new TException(e); } return rtn; }
/** * create local topology files /local-dir/nimbus/topologyId/stormjar.jar * /local-dir/nimbus/topologyId/stormcode.ser /local-dir/nimbus/topologyId/stormconf.ser * * @param conf * @param topologyId * @param tmpJarLocation * @param stormConf * @param topology * @throws IOException */ private void setupStormCode( Map<Object, Object> conf, String topologyId, String tmpJarLocation, Map<Object, Object> stormConf, StormTopology topology) throws IOException { // local-dir/nimbus/stormdist/topologyId String stormroot = StormConfig.masterStormdistRoot(conf, topologyId); FileUtils.forceMkdir(new File(stormroot)); FileUtils.cleanDirectory(new File(stormroot)); // copy jar to /local-dir/nimbus/topologyId/stormjar.jar setupJar(conf, tmpJarLocation, stormroot); // serialize to file /local-dir/nimbus/topologyId/stormcode.ser FileUtils.writeByteArrayToFile( new File(StormConfig.stormcode_path(stormroot)), Utils.serialize(topology)); // serialize to file /local-dir/nimbus/topologyId/stormconf.ser FileUtils.writeByteArrayToFile( new File(StormConfig.sotrmconf_path(stormroot)), Utils.serialize(stormConf)); }
/** * prepare to uploading topology jar, return the file location * * @throws */ @Override public String beginFileUpload() throws TException { String fileLoc = null; try { fileLoc = StormConfig.masterInbox(conf) + "/stormjar-" + UUID.randomUUID() + ".jar"; data.getUploaders().put(fileLoc, Channels.newChannel(new FileOutputStream(fileLoc))); LOG.info("Uploading file from client to " + fileLoc); } catch (FileNotFoundException e) { LOG.error(" file not found " + fileLoc); throw new TException(e); } catch (IOException e) { LOG.error(" IOException " + fileLoc, e); throw new TException(e); } return fileLoc; }
/** * get topology ids which need to be cleanup * * @param clusterState * @return * @throws Exception */ private Set<String> get_cleanup_ids(StormClusterState clusterState, List<String> active_topologys) throws Exception { List<String> task_ids = clusterState.task_storms(); List<String> heartbeat_ids = clusterState.heartbeat_storms(); List<String> error_ids = clusterState.task_error_storms(); List<String> assignment_ids = clusterState.assignments(null); List<String> monitor_ids = clusterState.monitors(); String master_stormdist_root = StormConfig.masterStormdistRoot(nimbusData.getConf()); // listdir /local-dir/nimbus/stormdist List<String> code_ids = PathUtils.read_dir_contents(master_stormdist_root); // Set<String> assigned_ids = // JStormUtils.listToSet(clusterState.active_storms()); Set<String> to_cleanup_ids = new HashSet<String>(); if (task_ids != null) { to_cleanup_ids.addAll(task_ids); } if (heartbeat_ids != null) { to_cleanup_ids.addAll(heartbeat_ids); } if (error_ids != null) { to_cleanup_ids.addAll(error_ids); } if (assignment_ids != null) { to_cleanup_ids.addAll(assignment_ids); } if (monitor_ids != null) { to_cleanup_ids.addAll(monitor_ids); } if (code_ids != null) { to_cleanup_ids.addAll(code_ids); } if (active_topologys != null) { to_cleanup_ids.removeAll(active_topologys); } return to_cleanup_ids; }
/** start supervisor */ public void run() { SupervisorManger supervisorManager = null; try { Map<Object, Object> conf = Utils.readStormConfig(); StormConfig.validate_distributed_mode(conf); supervisorManager = mkSupervisor(conf, null); } catch (Exception e) { LOG.error("Failed to start supervisor\n", e); System.exit(1); } while (supervisorManager.isFinishShutdown() == false) { try { Thread.sleep(1000); } catch (InterruptedException e) { } } }
@Override public <T> Object execute(T... args) { boolean isSetTaskInfo = false; try { Boolean reassign = (Boolean) args[1]; Map<Object, Object> conf = (Map<Object, Object>) args[2]; // args[0]: // delay, // args[1]: // reassign_flag, // args[2]: // conf if (conf != null) { boolean isConfUpdate = false; Map stormConf = data.getConf(); // Update topology code Map topoConf = StormConfig.read_nimbus_topology_conf(stormConf, topologyid); StormTopology rawOldTopology = StormConfig.read_nimbus_topology_code(stormConf, topologyid); StormTopology rawNewTopology = NimbusUtils.normalizeTopology(conf, rawOldTopology, true); StormTopology sysOldTopology = rawOldTopology.deepCopy(); StormTopology sysNewTopology = rawNewTopology.deepCopy(); if (conf.get(Config.TOPOLOGY_ACKER_EXECUTORS) != null) { Common.add_acker(topoConf, sysOldTopology); Common.add_acker(conf, sysNewTopology); int ackerNum = JStormUtils.parseInt(conf.get(Config.TOPOLOGY_ACKER_EXECUTORS)); int oldAckerNum = JStormUtils.parseInt(topoConf.get(Config.TOPOLOGY_ACKER_EXECUTORS)); LOG.info("Update acker from oldAckerNum=" + oldAckerNum + " to ackerNum=" + ackerNum); topoConf.put(Config.TOPOLOGY_ACKER_EXECUTORS, ackerNum); isConfUpdate = true; } // If scale-out, setup task info for new added tasks setTaskInfo(sysOldTopology, sysNewTopology); isSetTaskInfo = true; // If everything is OK, write topology code into disk StormConfig.write_nimbus_topology_code( stormConf, topologyid, Utils.serialize(rawNewTopology)); // Update topology conf if worker num has been updated Set<Object> keys = conf.keySet(); Integer workerNum = JStormUtils.parseInt(conf.get(Config.TOPOLOGY_WORKERS)); if (workerNum != null) { Integer oldWorkerNum = JStormUtils.parseInt(topoConf.get(Config.TOPOLOGY_WORKERS)); topoConf.put(Config.TOPOLOGY_WORKERS, workerNum); isConfUpdate = true; LOG.info("Update worker num from " + oldWorkerNum + " to " + workerNum); } if (keys.contains(Config.ISOLATION_SCHEDULER_MACHINES)) { topoConf.put( Config.ISOLATION_SCHEDULER_MACHINES, conf.get(Config.ISOLATION_SCHEDULER_MACHINES)); } if (isConfUpdate) { StormConfig.write_nimbus_topology_conf(stormConf, topologyid, topoConf); } } TopologyAssignEvent event = new TopologyAssignEvent(); event.setTopologyId(topologyid); event.setScratch(true); event.setOldStatus(oldStatus); event.setReassign(reassign); if (conf != null) event.setScaleTopology(true); TopologyAssign.push(event); event.waitFinish(); } catch (Exception e) { LOG.error("do-rebalance error!", e); // Rollback the changes on ZK if (isSetTaskInfo) { try { StormClusterState clusterState = data.getStormClusterState(); clusterState.remove_task(topologyid, newTasks); } catch (Exception e1) { LOG.error("Failed to rollback the changes on ZK for task-" + newTasks, e); } } } DelayStatusTransitionCallback delayCallback = new DelayStatusTransitionCallback( data, topologyid, oldStatus, StatusType.rebalancing, StatusType.done_rebalance); return delayCallback.execute(); }
@SuppressWarnings({"rawtypes", "unchecked"}) public WorkerData( Map conf, IContext context, String topology_id, String supervisor_id, int port, String worker_id, String jar_path) throws Exception { this.conf = conf; this.context = context; this.topologyId = topology_id; this.supervisorId = supervisor_id; this.port = port; this.workerId = worker_id; this.active = new AtomicBoolean(true); this.topologyStatus = StatusType.active; if (StormConfig.cluster_mode(conf).equals("distributed")) { String pidDir = StormConfig.worker_pids_root(conf, worker_id); JStormServerUtils.createPid(pidDir); } // create zk interface this.zkClusterstate = ZkTool.mk_distributed_cluster_state(conf); this.zkCluster = Cluster.mk_storm_cluster_state(zkClusterstate); Map rawConf = StormConfig.read_supervisor_topology_conf(conf, topology_id); this.stormConf = new HashMap<Object, Object>(); this.stormConf.putAll(conf); this.stormConf.putAll(rawConf); LOG.info("Worker Configuration " + stormConf); try { boolean enableClassloader = ConfigExtension.isEnableTopologyClassLoader(stormConf); boolean enableDebugClassloader = ConfigExtension.isEnableClassloaderDebug(stormConf); if (jar_path == null && enableClassloader == true) { LOG.error("enable classloader, but not app jar"); throw new InvalidParameterException(); } URL[] urlArray = new URL[0]; if (jar_path != null) { String[] paths = jar_path.split(":"); Set<URL> urls = new HashSet<URL>(); for (String path : paths) { if (StringUtils.isBlank(path)) continue; URL url = new URL("File:" + path); urls.add(url); } urlArray = urls.toArray(new URL[0]); } WorkerClassLoader.mkInstance( urlArray, ClassLoader.getSystemClassLoader(), ClassLoader.getSystemClassLoader().getParent(), enableClassloader, enableDebugClassloader); } catch (Exception e) { // TODO Auto-generated catch block LOG.error("init jarClassLoader error!", e); throw new InvalidParameterException(); } if (this.context == null) { this.context = TransportFactory.makeContext(stormConf); } boolean disruptorUseSleep = ConfigExtension.isDisruptorUseSleep(stormConf); DisruptorQueue.setUseSleep(disruptorUseSleep); boolean isLimited = ConfigExtension.getTopologyBufferSizeLimited(stormConf); DisruptorQueue.setLimited(isLimited); LOG.info("Disruptor use sleep:" + disruptorUseSleep + ", limited size:" + isLimited); // this.transferQueue = new LinkedBlockingQueue<TransferData>(); int buffer_size = Utils.getInt(conf.get(Config.TOPOLOGY_TRANSFER_BUFFER_SIZE)); WaitStrategy waitStrategy = (WaitStrategy) Utils.newInstance((String) conf.get(Config.TOPOLOGY_DISRUPTOR_WAIT_STRATEGY)); this.transferQueue = DisruptorQueue.mkInstance("TotalTransfer", ProducerType.MULTI, buffer_size, waitStrategy); this.transferQueue.consumerStarted(); this.sendingQueue = DisruptorQueue.mkInstance("TotalSending", ProducerType.MULTI, buffer_size, waitStrategy); this.sendingQueue.consumerStarted(); this.nodeportSocket = new ConcurrentHashMap<WorkerSlot, IConnection>(); this.taskNodeport = new ConcurrentHashMap<Integer, WorkerSlot>(); this.workerToResource = new ConcurrentSkipListSet<ResourceWorkerSlot>(); this.innerTaskTransfer = new ConcurrentHashMap<Integer, DisruptorQueue>(); this.deserializeQueues = new ConcurrentHashMap<Integer, DisruptorQueue>(); Assignment assignment = zkCluster.assignment_info(topologyId, null); if (assignment == null) { String errMsg = "Failed to get Assignment of " + topologyId; LOG.error(errMsg); throw new RuntimeException(errMsg); } workerToResource.addAll(assignment.getWorkers()); // get current worker's task list this.taskids = assignment.getCurrentWorkerTasks(supervisorId, port); if (taskids.size() == 0) { throw new RuntimeException("No tasks running current workers"); } LOG.info("Current worker taskList:" + taskids); // deserialize topology code from local dir rawTopology = StormConfig.read_supervisor_topology_code(conf, topology_id); sysTopology = Common.system_topology(stormConf, rawTopology); generateMaps(); contextMaker = new ContextMaker(this); metricReporter = new MetricReporter(this); outTaskStatus = new HashMap<Integer, Boolean>(); threadPool = Executors.newScheduledThreadPool(THREAD_POOL_NUM); TimerTrigger.setScheduledExecutorService(threadPool); LOG.info("Successfully create WorkerData"); }
protected TopologyAssignContext prepareTopologyAssign(TopologyAssignEvent event) throws Exception { TopologyAssignContext ret = new TopologyAssignContext(); String topologyId = event.getTopologyId(); /** 读取本地目录下的stormconf.ser和stormcode.ser */ Map<Object, Object> nimbusConf = nimbusData.getConf(); Map<Object, Object> topologyConf = StormConfig.read_nimbus_topology_conf(nimbusConf, topologyId); StormTopology rawTopology = StormConfig.read_nimbus_topology_code(nimbusConf, topologyId); ret.setRawTopology(rawTopology); Map stormConf = new HashMap(); stormConf.putAll(nimbusConf); stormConf.putAll(topologyConf); ret.setStormConf(stormConf); StormClusterState stormClusterState = nimbusData.getStormClusterState(); // get all running supervisor, don't need callback to watch supervisor /** 获取所有的运行的supervisor,以及supervisorInfo */ Map<String, SupervisorInfo> supInfos = Cluster.allSupervisorInfo(stormClusterState, null); if (supInfos.size() == 0) { throw new FailedAssignTopologyException( "Failed to make assignment " + topologyId + ", due to no alive supervisor"); } /** 获取topologyId下的所有tasks */ Map<Integer, String> taskToComponent = Cluster.topology_task_info(stormClusterState, topologyId); ret.setTaskToComponent(taskToComponent); // get taskids /ZK/tasks/topologyId Set<Integer> allTaskIds = taskToComponent.keySet(); if (allTaskIds == null || allTaskIds.size() == 0) { String errMsg = "Failed to get all task ID list from /ZK-dir/tasks/" + topologyId; LOG.warn(errMsg); throw new IOException(errMsg); } ret.setAllTaskIds(allTaskIds); Set<Integer> aliveTasks = new HashSet<Integer>(); // unstoppedTasks are tasks which are alive on no supervisor's(dead) // machine /** 未完成的任务,supervisor已死,任务没完成 */ Set<Integer> unstoppedTasks = new HashSet<Integer>(); Set<Integer> deadTasks = new HashSet<Integer>(); Set<ResourceWorkerSlot> unstoppedWorkers = new HashSet<ResourceWorkerSlot>(); Assignment existingAssignment = stormClusterState.assignment_info(topologyId, null); if (existingAssignment != null) { aliveTasks = getAliveTasks(topologyId, allTaskIds); unstoppedTasks = getUnstoppedSlots(aliveTasks, supInfos, existingAssignment); deadTasks.addAll(allTaskIds); deadTasks.removeAll(aliveTasks); } ret.setDeadTaskIds(deadTasks); ret.setUnstoppedTaskIds(unstoppedTasks); // Step 2: get all slots resource, free slots/ alive slots/ unstopped // slots getFreeSlots(supInfos, stormClusterState); ret.setCluster(supInfos); if (existingAssignment == null) { ret.setAssignType(TopologyAssignContext.ASSIGN_TYPE_NEW); try { AssignmentBak lastAssignment = stormClusterState.assignment_bak(event.getTopologyName()); if (lastAssignment != null) { ret.setOldAssignment(lastAssignment.getAssignment()); } } catch (Exception e) { LOG.warn("Fail to get old assignment", e); } } else { ret.setOldAssignment(existingAssignment); if (event.isScratch()) { ret.setAssignType(TopologyAssignContext.ASSIGN_TYPE_REBALANCE); unstoppedWorkers = getUnstoppedWorkers(unstoppedTasks, existingAssignment); ret.setUnstoppedWorkers(unstoppedWorkers); } else { ret.setAssignType(TopologyAssignContext.ASSIGN_TYPE_MONITOR); unstoppedWorkers = getUnstoppedWorkers(aliveTasks, existingAssignment); ret.setUnstoppedWorkers(unstoppedWorkers); } } return ret; }
/** * create and start one supervisor * * @param conf : configurationdefault.yaml storm.yaml * @param sharedContext : null (right now) * @return SupervisorManger: which is used to shutdown all workers and supervisor */ @SuppressWarnings("rawtypes") public SupervisorManger mkSupervisor(Map conf, IContext sharedContext) throws Exception { LOG.info("Starting Supervisor with conf " + conf); active = new AtomicBoolean(true); /** Step 1: cleanup all files in /storm-local-dir/supervisor/tmp */ String path = StormConfig.supervisorTmpDir(conf); FileUtils.cleanDirectory(new File(path)); /* * Step 2: create ZK operation instance StromClusterState */ StormClusterState stormClusterState = Cluster.mk_storm_cluster_state(conf); /* * Step 3, create LocalStat LocalStat is one KV database 4.1 create * LocalState instance 4.2 get supervisorId, if no supervisorId, create * one */ LocalState localState = StormConfig.supervisorState(conf); String supervisorId = (String) localState.get(Common.LS_ID); if (supervisorId == null) { supervisorId = UUID.randomUUID().toString(); localState.put(Common.LS_ID, supervisorId); } Vector<SmartThread> threads = new Vector<SmartThread>(); // Step 5 create HeartBeat // every supervisor.heartbeat.frequency.secs, write SupervisorInfo to ZK String myHostName = null; myHostName = ConfigExtension.getSupervisorHost(conf); if (myHostName == null) { myHostName = NetWorkUtils.hostname(); } Heartbeat hb = new Heartbeat(conf, stormClusterState, supervisorId, myHostName, active); hb.update(); AsyncLoopThread heartbeat = new AsyncLoopThread(hb, false, null, Thread.MIN_PRIORITY, true); threads.add(heartbeat); // Step 6 create and start sync Supervisor thread // every supervisor.monitor.frequency.secs second run SyncSupervisor EventManager processEventManager = new EventManagerImp(false); ConcurrentHashMap<String, String> workerThreadPids = new ConcurrentHashMap<String, String>(); SyncProcessEvent syncProcessEvent = new SyncProcessEvent(supervisorId, conf, localState, workerThreadPids, sharedContext); EventManager syncSupEventManager = new EventManagerImp(false); SyncSupervisorEvent syncSupervisorEvent = new SyncSupervisorEvent( supervisorId, conf, processEventManager, syncSupEventManager, stormClusterState, localState, syncProcessEvent); int syncFrequence = JStormUtils.parseInt(conf.get(Config.SUPERVISOR_MONITOR_FREQUENCY_SECS)); EventManagerPusher syncSupervisorPusher = new EventManagerPusher(syncSupEventManager, syncSupervisorEvent, active, syncFrequence); AsyncLoopThread syncSupervisorThread = new AsyncLoopThread(syncSupervisorPusher); threads.add(syncSupervisorThread); // Step 7 start sync process thread // every supervisor.monitor.frequency.secs run SyncProcesses // skip thread to do syncProcess, due to nimbus will check whether // worker is dead or not, if dead, it will reassign a new worker // // int syncProcessFrequence = syncFrequence/2; // EventManagerPusher syncProcessPusher = new EventManagerPusher( // processEventManager, syncProcessEvent, active, // syncProcessFrequence); // AsyncLoopThread syncProcessThread = new // AsyncLoopThread(syncProcessPusher); // threads.add(syncProcessThread); // Step 7 start httpserver Httpserver httpserver = new Httpserver(conf); httpserver.start(); LOG.info("Starting supervisor with id " + supervisorId + " at host " + myHostName); // SupervisorManger which can shutdown all supervisor and workers return new SupervisorManger( conf, supervisorId, active, threads, syncSupEventManager, processEventManager, httpserver, stormClusterState, workerThreadPids); }