/**
 * Creates the scheduler's working context from a generic assignment context.
 *
 * <p>After delegating to the superclass constructor, this populates the derived fields in
 * order: {@code sysTopology}, {@code sidToHostname}, {@code hostToSid}, {@code oldWorkers},
 * {@code componentTasks}, {@code totalWorkerNum} and {@code unstoppedWorkerNum}.
 *
 * @param context the assignment context this instance is built from
 * @throws FailedAssignTopologyException if {@code Common.system_topology} throws; the
 *     original exception is attached as the cause
 */
public DefaultTopologyAssignContext(TopologyAssignContext context) {
    super(context);

    try {
        sysTopology = Common.system_topology(stormConf, rawTopology);
    } catch (Exception e) {
        // Attach the underlying failure so the root cause is not lost when this surfaces.
        FailedAssignTopologyException failure =
                new FailedAssignTopologyException("Failed to generate system topology");
        failure.initCause(e);
        throw failure;
    }

    sidToHostname = generateSidToHost();
    hostToSid = JStormUtils.reverse_map(sidToHostname);

    // Fall back to an empty worker set when there is no previous assignment (or it has no
    // workers), so later code can use oldWorkers without null checks.
    if (oldAssignment != null && oldAssignment.getWorkers() != null) {
        oldWorkers = oldAssignment.getWorkers();
    } else {
        oldWorkers = new HashSet<ResourceWorkerSlot>();
    }

    refineDeadTasks();

    // Invert task -> component into component -> task list, then sort every task list
    // in place into ascending task-id order.
    componentTasks = JStormUtils.reverse_map(context.getTaskToComponent());
    for (Entry<String, List<Integer>> entry : componentTasks.entrySet()) {
        Collections.sort(entry.getValue());
    }

    totalWorkerNum = computeWorkerNum();
    unstoppedWorkerNum = computeUnstoppedAssignments();
}
/**
 * Builds the assignment used when the scheduler is bypassed: one worker slot, on the only
 * supervisor in the cluster, holding every task id of the topology.
 *
 * <p>The worker uses the first port returned by iterating the supervisor's worker ports.
 *
 * @param context assignment context supplying the cluster map and all task ids
 * @return a set containing exactly one {@link ResourceWorkerSlot}
 * @throws RuntimeException if the cluster does not contain exactly one supervisor
 */
private static Set<ResourceWorkerSlot> mkLocalAssignment(TopologyAssignContext context) {
    Map<String, SupervisorInfo> cluster = context.getCluster();
    if (cluster.size() != 1) {
        throw new RuntimeException(
                "Local assignment requires exactly one supervisor, but the cluster has "
                        + cluster.size());
    }

    // Exactly one entry exists at this point, so take it directly.
    Entry<String, SupervisorInfo> onlyEntry = cluster.entrySet().iterator().next();
    String supervisorId = onlyEntry.getKey();
    SupervisorInfo localSupervisor = onlyEntry.getValue();

    int port = localSupervisor.getWorkerPorts().iterator().next();

    ResourceWorkerSlot worker = new ResourceWorkerSlot(supervisorId, port);
    worker.setTasks(new HashSet<Integer>(context.getAllTaskIds()));
    worker.setHostname(localSupervisor.getHostName());

    Set<ResourceWorkerSlot> result = new HashSet<ResourceWorkerSlot>();
    result.add(worker);
    return result;
}
/**
 * Returns the resources held by every task that is not "unstopped" back to the pools of
 * the supervisor recorded in the old assignment.
 *
 * <p>For each such task the CPU, memory, disk and net (port) pools of its supervisor are
 * asked to free the slot recorded in the task's old {@link ResourceAssignment}. Tasks with
 * no old assignment are logged and skipped; tasks whose supervisor is no longer in the
 * cluster map are skipped silently.
 *
 * <p>NOTE (carried over from the original author): there may be a problem here, since
 * some dead slots may already have been freed.
 *
 * @param context assignment context providing task ids, the cluster and the old assignment
 */
protected void freeUsed(TopologyAssignContext context) {
    // Everything except the unstopped tasks is eligible for release.
    Set<Integer> releasable = new HashSet<Integer>(context.getAllTaskIds());
    releasable.removeAll(context.getUnstoppedTaskIds());

    Map<String, SupervisorInfo> supervisors = context.getCluster();
    Map<Integer, ResourceAssignment> previous = context.getOldAssignment().getTaskToResource();

    for (Integer taskId : releasable) {
        ResourceAssignment held = previous.get(taskId);
        if (held == null) {
            LOG.warn("When free rebalance resource, no ResourceAssignment of task " + taskId);
            continue;
        }

        SupervisorInfo owner = supervisors.get(held.getSupervisorId());
        if (owner != null) {
            owner.getCpuPool().free(held.getCpuSlotNum(), context);
            owner.getMemPool().free(held.getMemSlotNum(), context);
            owner.getDiskPool().free(held.getDiskSlot(), context);
            owner.getNetPool().free(held.getPort(), context);
        }
    }
}
/**
 * Computes the start time (in seconds) of every task for a freshly computed assignment.
 *
 * <p>For a new assignment ({@code ASSIGN_TYPE_NEW}) every task of every worker gets the
 * current time. Otherwise the start times recorded in {@code existingAssignment} are kept,
 * and only the tasks reported by {@code getChangeTaskIds} are stamped with the current
 * time; the task heartbeat of each of those tasks is removed from the cluster state.
 *
 * @param context assignment context; only its assign type is read here
 * @param nimbusData source of the {@link StormClusterState}
 * @param topologyId id of the topology whose task heartbeats may be removed
 * @param existingAssignment the previous assignment, may be {@code null}
 * @param workers the newly assigned workers
 * @return map from task id to start time in seconds, ordered by task id
 * @throws Exception if a called cluster-state operation fails
 */
public static Map<Integer, Integer> getTaskStartTimes(
        TopologyAssignContext context,
        NimbusData nimbusData,
        String topologyId,
        Assignment existingAssignment,
        Set<ResourceWorkerSlot> workers)
        throws Exception {
    Map<Integer, Integer> startTimes = new TreeMap<Integer, Integer>();

    if (context.getAssignType() == TopologyAssignContext.ASSIGN_TYPE_NEW) {
        // Brand-new topology: every task starts now.
        int now = TimeUtils.current_time_secs();
        for (ResourceWorkerSlot slot : workers) {
            for (Integer taskId : slot.getTasks()) {
                startTimes.put(taskId, now);
            }
        }
        return startTimes;
    }

    Set<ResourceWorkerSlot> previousWorkers = new HashSet<ResourceWorkerSlot>();
    if (existingAssignment != null) {
        Map<Integer, Integer> recorded = existingAssignment.getTaskStartTimeSecs();
        if (recorded != null) {
            startTimes.putAll(recorded);
        }

        Set<ResourceWorkerSlot> assignedBefore = existingAssignment.getWorkers();
        if (assignedBefore != null) {
            previousWorkers = assignedBefore;
        }
    }

    StormClusterState clusterState = nimbusData.getStormClusterState();
    Set<Integer> movedTasks = getChangeTaskIds(previousWorkers, workers);

    // Tasks that moved restart their clock and have their heartbeat removed.
    int now = TimeUtils.current_time_secs();
    for (Integer taskId : movedTasks) {
        startTimes.put(taskId, now);
        clusterState.remove_task_heartbeat(topologyId, taskId);
    }

    LOG.info("Task assignment has been changed " + movedTasks);
    return startTimes;
}
/** * make assignments for a topology The nimbus core function, this function has been totally * rewrite * * @param nimbusData NimbusData * @param topologyId String * @param isScratch Boolean: isScratch is false unless rebalancing the topology * @throws Exception */ public Assignment mkAssignment(TopologyAssignEvent event) throws Exception { String topologyId = event.getTopologyId(); LOG.info("Determining assignment for " + topologyId); TopologyAssignContext context = prepareTopologyAssign(event); Set<ResourceWorkerSlot> assignments = null; if (!StormConfig.local_mode(nimbusData.getConf())) { IToplogyScheduler scheduler = schedulers.get(DEFAULT_SCHEDULER_NAME); assignments = scheduler.assignTasks(context); } else { assignments = mkLocalAssignment(context); } Assignment assignment = null; Map<String, String> nodeHost = getTopologyNodeHost(context.getCluster(), context.getOldAssignment(), assignments); Map<Integer, Integer> startTimes = getTaskStartTimes(context, nimbusData, topologyId, context.getOldAssignment(), assignments); String codeDir = StormConfig.masterStormdistRoot(nimbusData.getConf(), topologyId); assignment = new Assignment(codeDir, assignments, nodeHost, startTimes); StormClusterState stormClusterState = nimbusData.getStormClusterState(); stormClusterState.set_assignment(topologyId, assignment); // update task heartbeat's start time NimbusUtils.updateTaskHbStartTime(nimbusData, assignment, topologyId); // Update metrics information in ZK when rebalance or reassignment // Only update metrics monitor status when creating topology if (context.getAssignType() == TopologyAssignContext.ASSIGN_TYPE_REBALANCE || context.getAssignType() == TopologyAssignContext.ASSIGN_TYPE_MONITOR) NimbusUtils.updateMetricsInfo(nimbusData, topologyId, assignment); else metricsMonitor(event); LOG.info("Successfully make assignment for topology id " + topologyId + ": " + assignment); return assignment; }
protected TopologyAssignContext prepareTopologyAssign(TopologyAssignEvent event) throws Exception { TopologyAssignContext ret = new TopologyAssignContext(); String topologyId = event.getTopologyId(); /** 读取本地目录下的stormconf.ser和stormcode.ser */ Map<Object, Object> nimbusConf = nimbusData.getConf(); Map<Object, Object> topologyConf = StormConfig.read_nimbus_topology_conf(nimbusConf, topologyId); StormTopology rawTopology = StormConfig.read_nimbus_topology_code(nimbusConf, topologyId); ret.setRawTopology(rawTopology); Map stormConf = new HashMap(); stormConf.putAll(nimbusConf); stormConf.putAll(topologyConf); ret.setStormConf(stormConf); StormClusterState stormClusterState = nimbusData.getStormClusterState(); // get all running supervisor, don't need callback to watch supervisor /** 获取所有的运行的supervisor,以及supervisorInfo */ Map<String, SupervisorInfo> supInfos = Cluster.allSupervisorInfo(stormClusterState, null); if (supInfos.size() == 0) { throw new FailedAssignTopologyException( "Failed to make assignment " + topologyId + ", due to no alive supervisor"); } /** 获取topologyId下的所有tasks */ Map<Integer, String> taskToComponent = Cluster.topology_task_info(stormClusterState, topologyId); ret.setTaskToComponent(taskToComponent); // get taskids /ZK/tasks/topologyId Set<Integer> allTaskIds = taskToComponent.keySet(); if (allTaskIds == null || allTaskIds.size() == 0) { String errMsg = "Failed to get all task ID list from /ZK-dir/tasks/" + topologyId; LOG.warn(errMsg); throw new IOException(errMsg); } ret.setAllTaskIds(allTaskIds); Set<Integer> aliveTasks = new HashSet<Integer>(); // unstoppedTasks are tasks which are alive on no supervisor's(dead) // machine /** 未完成的任务,supervisor已死,任务没完成 */ Set<Integer> unstoppedTasks = new HashSet<Integer>(); Set<Integer> deadTasks = new HashSet<Integer>(); Set<ResourceWorkerSlot> unstoppedWorkers = new HashSet<ResourceWorkerSlot>(); Assignment existingAssignment = stormClusterState.assignment_info(topologyId, null); if (existingAssignment != 
null) { aliveTasks = getAliveTasks(topologyId, allTaskIds); unstoppedTasks = getUnstoppedSlots(aliveTasks, supInfos, existingAssignment); deadTasks.addAll(allTaskIds); deadTasks.removeAll(aliveTasks); } ret.setDeadTaskIds(deadTasks); ret.setUnstoppedTaskIds(unstoppedTasks); // Step 2: get all slots resource, free slots/ alive slots/ unstopped // slots getFreeSlots(supInfos, stormClusterState); ret.setCluster(supInfos); if (existingAssignment == null) { ret.setAssignType(TopologyAssignContext.ASSIGN_TYPE_NEW); try { AssignmentBak lastAssignment = stormClusterState.assignment_bak(event.getTopologyName()); if (lastAssignment != null) { ret.setOldAssignment(lastAssignment.getAssignment()); } } catch (Exception e) { LOG.warn("Fail to get old assignment", e); } } else { ret.setOldAssignment(existingAssignment); if (event.isScratch()) { ret.setAssignType(TopologyAssignContext.ASSIGN_TYPE_REBALANCE); unstoppedWorkers = getUnstoppedWorkers(unstoppedTasks, existingAssignment); ret.setUnstoppedWorkers(unstoppedWorkers); } else { ret.setAssignType(TopologyAssignContext.ASSIGN_TYPE_MONITOR); unstoppedWorkers = getUnstoppedWorkers(aliveTasks, existingAssignment); ret.setUnstoppedWorkers(unstoppedWorkers); } } return ret; }
@Override public Map<Integer, ResourceAssignment> assignTasks(TopologyAssignContext context) throws FailedAssignTopologyException { int assignType = context.getAssignType(); if (TopologyAssignContext.isAssignTypeValid(assignType) == false) { throw new FailedAssignTopologyException("Invalide Assign Type " + assignType); } DefaultTopologyAssignContext defaultContext = new DefaultTopologyAssignContext(context); if (assignType == TopologyAssignContext.ASSIGN_TYPE_REBALANCE) { freeUsed(defaultContext); } LOG.info("Dead tasks:" + defaultContext.getDeadTaskIds()); LOG.info("Unstopped tasks:" + defaultContext.getUnstoppedTaskIds()); Set<Integer> needAssignTasks = getNeedAssignTasks(defaultContext); Map<Integer, ResourceAssignment> keepAssigns = getKeepAssign(defaultContext, needAssignTasks); // please use tree map to make task sequence Map<Integer, ResourceAssignment> ret = new TreeMap<Integer, ResourceAssignment>(); ret.putAll(keepAssigns); ret.putAll(defaultContext.getUnstoppedAssignments()); Map<WorkerSlot, List<Integer>> keepAssignWorkers = Assignment.getWorkerTasks(keepAssigns); int allocWorkerNum = defaultContext.getTotalWorkerNum() - defaultContext.getUnstoppedWorkerNum() - keepAssignWorkers.size(); if (allocWorkerNum <= 0) { LOG.warn( "Don't need assign workers, all workers are fine " + defaultContext.toDetailString()); throw new FailedAssignTopologyException("Don't need assign worker, all workers are fine "); } Set<String> outputConfigComponents = new HashSet<String>(); Map<ComponentAssignType, Pair<Set<Integer>, IPreassignTask>> typeHandler = registerPreAssignHandler(defaultContext, needAssignTasks); Map<Integer, ResourceAssignment> newAssigns = new HashMap<Integer, ResourceAssignment>(); Set<String> usedSupervisorIds = new HashSet<String>(); List<Integer> lastFailed = new ArrayList<Integer>(); for (Entry<ComponentAssignType, Pair<Set<Integer>, IPreassignTask>> entry : typeHandler.entrySet()) { ComponentAssignType type = entry.getKey(); Set<Integer> tasks = 
entry.getValue().getFirst(); IPreassignTask handler = entry.getValue().getSecond(); tasks.addAll(lastFailed); lastFailed.clear(); List<Integer> sortedTasks = sortAssignTasks(defaultContext, tasks); StormTopology sysTopology = defaultContext.getSysTopology(); for (Integer task : sortedTasks) { Set<String> canUsedSupervisorIds = getCanUsedSupervisors(defaultContext, usedSupervisorIds, allocWorkerNum); String componentName = defaultContext.getTaskToComponent().get(task); ComponentCommon componentCommon = ThriftTopologyUtils.getComponentCommon(sysTopology, componentName); Map componentMap = (Map) JStormUtils.from_json(componentCommon.get_json_conf()); if (componentMap == null) { componentMap = Maps.newHashMap(); } if (outputConfigComponents.contains(componentName) == false) { LOG.info("Component map of " + componentName + "\n" + componentMap); outputConfigComponents.add(componentName); } ResourceAssignment preAssignment = handler.preAssign( task, defaultContext, componentMap, componentName, canUsedSupervisorIds, ret, newAssigns); if (preAssignment == null) { // pre assign fail lastFailed.add(task); } else { // sucess to do preAssign SupervisorInfo supervisorInfo = defaultContext.getCluster().get(preAssignment.getSupervisorId()); LOG.info("Task " + task + " had been assigned to " + supervisorInfo.getHostName()); newAssigns.put(task, preAssignment); ret.put(task, preAssignment); usedSupervisorIds.add(preAssignment.getSupervisorId()); } } } if (lastFailed.isEmpty() == false) { throw new FailedAssignTopologyException("Failed to assign tasks " + lastFailed); } // Here just hardcode IPostAssignTask postAssignHandler = new PostAssignTaskPort(); postAssignHandler.postAssign(defaultContext, newAssigns, allocWorkerNum); LOG.info("Keep Alive slots:" + keepAssigns); LOG.info("Unstopped slots:" + defaultContext.getUnstoppedAssignments()); LOG.info("New assign slots:" + newAssigns); return ret; }