/** * 根据停止的节点修复死锁 * * @param node */ public void fixedDeadNodeJob(Node node) { try { // 1. 判断这个节点的channel是否存在 ChannelWrapper channelWrapper = application .getChannelManager() .getChannel(node.getGroup(), node.getNodeType(), node.getIdentity()); if (channelWrapper == null || channelWrapper.getChannel() == null || channelWrapper.isClosed()) { // List<JobPo> jobPos = // application.getExecutingJobQueue().getJobs(node.getIdentity()); // if (CollectionUtils.isNotEmpty(jobPos)) { // for (JobPo jobPo : jobPos) { // fixedDeadJob(jobPo); // } // } List<TaskExecutingBean> executingTasks = InjectorHolder.getInstance(DTaskProvider.class) .loadTaskExecutingTasksBytaskTrackerIdentity(node.getIdentity()); if (CollectionUtils.isNotEmpty(executingTasks)) { for (TaskExecutingBean taskExectuing : executingTasks) { fixedDeadJob(taskExectuing); } } } } catch (Exception t) { LOGGER.error(t.getMessage(), t); } }
private void fix() throws RemotingSendException { // 查询出所有死掉的任务 (其实可以直接在数据库中fix的, 查询出来主要是为了日志打印) // 一般来说这个是没有多大的,我就不分页去查询了 List<JobPo> jobPos = new ArrayList<JobPo>(); ; // application.getExecutingJobQueue().getDeadJobs(SystemClock.now() - MAX_DEAD_CHECK_TIME); List<TaskExecutingBean> taskExecutingList = InjectorHolder.getInstance(DTaskProvider.class) .loadDeadTaskExecutingTasks(before(MAX_DEAD_CHECK_TIME)); if (CollectionUtils.isNotEmpty(taskExecutingList)) { for (TaskExecutingBean taskExecuting : taskExecutingList) { jobPos.add(JobDomainConverter.convertTaskExecutingToJobpo(taskExecuting)); } } if (jobPos != null && jobPos.size() > 0) { List<Node> nodes = application.getSubscribedNodeManager().getNodeList(NodeType.TASK_TRACKER); HashSet<String /*identity*/> identities = new HashSet<String>(); if (CollectionUtils.isNotEmpty(nodes)) { for (Node node : nodes) { identities.add(node.getIdentity()); } } Map<TaskTrackerNode /*执行的TaskTracker节点 identity*/, List<JobPo /*jobId*/>> timeoutMap = new HashMap<TaskTrackerNode, List<JobPo>>(); for (JobPo jobPo : jobPos) { if (!identities.contains(jobPo.getTaskTrackerIdentity())) { fixedDeadJob(jobPo); } else { // 如果节点存在,并且超时了, 那么去主动询问taskTracker 这个任务是否在执行中 if (SystemClock.now() - jobPo.getGmtModified() > MAX_TIME_OUT) { TaskTrackerNode taskTrackerNode = new TaskTrackerNode( jobPo.getTaskTrackerIdentity(), jobPo.getTaskTrackerNodeGroup()); List<JobPo> jobPosList = timeoutMap.get(taskTrackerNode); if (jobPosList == null) { jobPosList = new ArrayList<JobPo>(); timeoutMap.put(taskTrackerNode, jobPosList); } jobPosList.add(jobPo); } } } if (CollectionUtils.isNotEmpty(timeoutMap)) { RemotingServerDelegate remotingServer = application.getRemotingServer(); for (Map.Entry<TaskTrackerNode, List<JobPo>> entry : timeoutMap.entrySet()) { TaskTrackerNode taskTrackerNode = entry.getKey(); ChannelWrapper channelWrapper = application .getChannelManager() .getChannel( taskTrackerNode.getNodeGroup(), NodeType.TASK_TRACKER, taskTrackerNode.getIdentity()); if (channelWrapper != null && channelWrapper.getChannel() != null && channelWrapper.isOpen()) { JobAskRequest requestBody = application.getCommandBodyWrapper().wrapper(new JobAskRequest()); final List<JobPo> jobPoList = entry.getValue(); List<String> jobIds = new ArrayList<String>(jobPoList.size()); for (JobPo jobPo : jobPoList) { jobIds.add(jobPo.getJobId()); } requestBody.setJobIds(jobIds); RemotingCommand request = RemotingCommand.createRequestCommand( JobProtos.RequestCode.JOB_ASK.code(), requestBody); remotingServer.invokeAsync( channelWrapper.getChannel(), request, new InvokeCallback() { @Override public void operationComplete(ResponseFuture responseFuture) { RemotingCommand response = responseFuture.getResponseCommand(); if (response != null && RemotingProtos.ResponseCode.SUCCESS.code() == response.getCode()) { JobAskResponse responseBody = response.getBody(); List<String> deadJobIds = responseBody.getJobIds(); if (CollectionUtils.isNotEmpty(deadJobIds)) { try { Thread.sleep(1000L); // 睡了1秒再修复, 防止任务刚好执行完正在传输中. 1s可以让完成的正常完成 } catch (InterruptedException e) { e.printStackTrace(); } for (JobPo jobPo : jobPoList) { if (deadJobIds.contains(jobPo.getJobId())) { fixedDeadJob(jobPo); } } } } } }); } } } } }