Пример #1
0
  /**
   * Repairs the tasks that were executing on a node whose channel is no longer usable.
   *
   * <p>The node's channel is looked up through the channel manager. When there is no wrapper, no
   * underlying channel, or the wrapper reports itself closed, every executing task loaded for the
   * node's identity is handed to {@code fixedDeadJob}. Any exception raised along the way is
   * logged and not rethrown.
   *
   * @param node the node to check
   */
  public void fixedDeadNodeJob(Node node) {
    try {
      // Step 1: find out whether a live channel still exists for this node.
      ChannelWrapper wrapper =
          application
              .getChannelManager()
              .getChannel(node.getGroup(), node.getNodeType(), node.getIdentity());
      boolean channelAlive =
          wrapper != null && wrapper.getChannel() != null && !wrapper.isClosed();
      if (channelAlive) {
        return;
      }

      // Step 2: the channel is gone, so repair everything recorded as executing on the node.
      List<TaskExecutingBean> orphanedTasks =
          InjectorHolder.getInstance(DTaskProvider.class)
              .loadTaskExecutingTasksBytaskTrackerIdentity(node.getIdentity());
      if (!CollectionUtils.isNotEmpty(orphanedTasks)) {
        return;
      }
      for (TaskExecutingBean orphanedTask : orphanedTasks) {
        fixedDeadJob(orphanedTask);
      }
    } catch (Exception e) {
      LOGGER.error(e.getMessage(), e);
    }
  }
Пример #2
0
  /**
   * Finds executing tasks that look dead and repairs them.
   *
   * <ol>
   *   <li>Loads the dead executing tasks from {@code DTaskProvider} (cut-off derived from {@code
   *       MAX_DEAD_CHECK_TIME}) and converts each one to a {@code JobPo}.
   *   <li>A job whose task tracker identity is not among the currently subscribed {@code
   *       TASK_TRACKER} nodes is repaired immediately.
   *   <li>A job whose task tracker is still subscribed, and whose last modification is more than
   *       {@code MAX_TIME_OUT} ago, is grouped by task tracker. Each task tracker with an open
   *       channel is then asked asynchronously ({@code JOB_ASK}) about its jobs; the job ids
   *       returned in a successful response are treated as dead and repaired after a one second
   *       grace period.
   * </ol>
   *
   * @throws RemotingSendException presumably raised by {@code invokeAsync} when the request
   *     cannot be sent -- confirm against the remoting API
   */
  private void fix() throws RemotingSendException {
    // Load all dead tasks. They could be repaired directly in the database; they are loaded
    // mainly so they can be logged. The result is normally small, so it is not paginated.
    List<TaskExecutingBean> taskExecutingList =
        InjectorHolder.getInstance(DTaskProvider.class)
            .loadDeadTaskExecutingTasks(before(MAX_DEAD_CHECK_TIME));
    if (!CollectionUtils.isNotEmpty(taskExecutingList)) {
      return;
    }

    List<JobPo> jobPos = new ArrayList<JobPo>(taskExecutingList.size());
    for (TaskExecutingBean taskExecuting : taskExecutingList) {
      jobPos.add(JobDomainConverter.convertTaskExecutingToJobpo(taskExecuting));
    }

    // Identities of the task trackers that are currently subscribed.
    List<Node> nodes = application.getSubscribedNodeManager().getNodeList(NodeType.TASK_TRACKER);
    HashSet<String /*identity*/> identities = new HashSet<String>();
    if (CollectionUtils.isNotEmpty(nodes)) {
      for (Node node : nodes) {
        identities.add(node.getIdentity());
      }
    }

    // Task tracker -> jobs on it that have timed out and need to be asked about.
    Map<TaskTrackerNode, List<JobPo>> timeoutMap = new HashMap<TaskTrackerNode, List<JobPo>>();
    for (JobPo jobPo : jobPos) {
      if (!identities.contains(jobPo.getTaskTrackerIdentity())) {
        // The task tracker is no longer subscribed: repair right away.
        fixedDeadJob(jobPo);
        continue;
      }
      // The node still exists; if the job has timed out, ask the task tracker whether it is
      // really still executing.
      if (SystemClock.now() - jobPo.getGmtModified() > MAX_TIME_OUT) {
        TaskTrackerNode taskTrackerNode =
            new TaskTrackerNode(jobPo.getTaskTrackerIdentity(), jobPo.getTaskTrackerNodeGroup());
        List<JobPo> jobPosList = timeoutMap.get(taskTrackerNode);
        if (jobPosList == null) {
          jobPosList = new ArrayList<JobPo>();
          timeoutMap.put(taskTrackerNode, jobPosList);
        }
        jobPosList.add(jobPo);
      }
    }

    if (timeoutMap.isEmpty()) {
      return;
    }

    RemotingServerDelegate remotingServer = application.getRemotingServer();
    for (Map.Entry<TaskTrackerNode, List<JobPo>> entry : timeoutMap.entrySet()) {
      TaskTrackerNode taskTrackerNode = entry.getKey();
      ChannelWrapper channelWrapper =
          application
              .getChannelManager()
              .getChannel(
                  taskTrackerNode.getNodeGroup(),
                  NodeType.TASK_TRACKER,
                  taskTrackerNode.getIdentity());
      if (channelWrapper == null
          || channelWrapper.getChannel() == null
          || !channelWrapper.isOpen()) {
        // No usable channel to this task tracker; nothing can be asked in this pass.
        continue;
      }

      final List<JobPo> jobPoList = entry.getValue();
      List<String> jobIds = new ArrayList<String>(jobPoList.size());
      for (JobPo jobPo : jobPoList) {
        jobIds.add(jobPo.getJobId());
      }

      JobAskRequest requestBody = application.getCommandBodyWrapper().wrapper(new JobAskRequest());
      requestBody.setJobIds(jobIds);
      RemotingCommand request =
          RemotingCommand.createRequestCommand(JobProtos.RequestCode.JOB_ASK.code(), requestBody);

      remotingServer.invokeAsync(
          channelWrapper.getChannel(),
          request,
          new InvokeCallback() {
            @Override
            public void operationComplete(ResponseFuture responseFuture) {
              RemotingCommand response = responseFuture.getResponseCommand();
              if (response == null
                  || RemotingProtos.ResponseCode.SUCCESS.code() != response.getCode()) {
                return;
              }
              JobAskResponse responseBody = response.getBody();
              if (responseBody == null) {
                // A successful response without a body carries no job ids to act on.
                return;
              }
              List<String> deadJobIds = responseBody.getJobIds();
              if (!CollectionUtils.isNotEmpty(deadJobIds)) {
                return;
              }

              // Wait one second before repairing, in case a job has just finished and its
              // result is still in transit; the delay lets such a job complete normally.
              // NOTE(review): this blocks whichever thread runs the callback -- confirm that
              // is acceptable for the remoting layer.
              try {
                Thread.sleep(1000L);
              } catch (InterruptedException e) {
                // Keep the interrupt visible to the owning thread and skip the repair: the
                // grace period was not honoured, so repairing now could race with a job that
                // is just completing.
                Thread.currentThread().interrupt();
                return;
              }

              // Set lookup instead of List.contains inside the loop.
              HashSet<String> deadJobIdSet = new HashSet<String>(deadJobIds);
              for (JobPo jobPo : jobPoList) {
                if (deadJobIdSet.contains(jobPo.getJobId())) {
                  fixedDeadJob(jobPo);
                }
              }
            }
          });
    }
  }