예제 #1
0
파일: Task.java 프로젝트: qichenftw/opennms
 /**
  * Tells the task monitor that this task has completed.
  *
  * <p>Anything thrown by the monitor callback is handed to the monitor's {@code monitorException}
  * instead of being propagated to the caller.
  */
 private void notifyCompleted() {
   try {
     m_monitor.completed(this);
   } catch (final Throwable monitorFailure) {
     m_monitor.monitorException(monitorFailure);
   }
 }
예제 #2
0
파일: Task.java 프로젝트: qichenftw/opennms
 /**
  * Tells the task monitor that this task has been submitted.
  *
  * <p>Anything thrown by the monitor callback is handed to the monitor's {@code monitorException}
  * instead of being propagated to the caller.
  */
 private void notifySubmitted() {
   try {
     m_monitor.submitted(this);
   } catch (final Throwable monitorFailure) {
     m_monitor.monitorException(monitorFailure);
   }
 }
예제 #3
0
파일: Task.java 프로젝트: qichenftw/opennms
 /**
  * Tells the task monitor that one of this task's prerequisites has completed.
  *
  * <p>Anything thrown by the monitor callback is handed to the monitor's {@code monitorException}
  * instead of being propagated to the caller.
  *
  * @param prereq the prerequisite task passed on to the monitor
  */
 private void notifyPrerequisteCompleted(final Task prereq) {
   try {
     m_monitor.prerequisiteCompleted(this, prereq);
   } catch (final Throwable monitorFailure) {
     m_monitor.monitorException(monitorFailure);
   }
 }
예제 #4
0
 /**
  * Replaces the monitor's private {@code tasks} map with a fresh, empty map before each test and
  * keeps a reference to that map so assertions can inspect the monitor's internal state.
  *
  * @throws Exception if the {@code tasks} field cannot be found or written reflectively
  */
 @Before
 public void setUp() throws Exception {
   // Named differently from this test's own "tasks" field so the local does not shadow it.
   Field tasksField = taskMonitor.getClass().getDeclaredField("tasks");
   tasksField.setAccessible(true);

   // Keep the typed reference rather than reading the field back, which required an unchecked
   // cast from Object.
   ConcurrentHashMap<Integer, TaskMonitor.TaskCommunication> freshTasks =
       new ConcurrentHashMap<Integer, TaskMonitor.TaskCommunication>();
   tasksField.set(taskMonitor, freshTasks);
   this.tasks = freshTasks;
 }
예제 #5
0
  /**
   * Walks a TaskMonitor through register / report / expire / remove and checks the bookkeeping
   * exposed through {@code getTaskCommunication} and the reflectively obtained {@code tasks} map.
   *
   * <p>NOTE(review): the assertEquals calls pass the observed value first and the expected value
   * second; if {@code Assert} is JUnit's, failure messages will read inverted - confirm.
   *
   * <p>NOTE(review): the test sleeps for about three seconds in total and overwrites the static
   * {@code EXPIRED_TIME} field without restoring it - confirm this does not affect other tests.
   */
  @Test
  public void testNormal() throws Exception {

    // register task
    long ttl = System.currentTimeMillis();

    Communication communication1 = new Communication();

    taskMonitor.registerTask(1, communication1);

    TaskMonitor.TaskCommunication taskCommunication1 = taskMonitor.getTaskCommunication(1);

    Assert.assertEquals(taskCommunication1.getLastAllReadRecords(), 0L);
    Assert.assertEquals(this.tasks.size(), 1);

    Assert.assertTrue(taskCommunication1.getLastUpdateComunicationTS() >= ttl);
    Assert.assertTrue(taskCommunication1.getTtl() >= ttl);

    // report a communication that has not changed at all:
    // the last-update timestamp is expected to stay the same while the ttl moves forward

    long oldTS = taskCommunication1.getLastUpdateComunicationTS();
    long oldTTL = taskCommunication1.getTtl();
    Thread.sleep(1000);

    taskMonitor.report(1, communication1);

    TaskMonitor.TaskCommunication taskCommunication1_1 = taskMonitor.getTaskCommunication(1);

    Assert.assertEquals(taskCommunication1_1.getLastAllReadRecords(), 0L);
    Assert.assertEquals(taskCommunication1_1.getLastUpdateComunicationTS(), oldTS);
    Assert.assertTrue(taskCommunication1_1.getTtl() > oldTTL);

    // a communication that is already finished (KILLED): registering it is expected to leave
    // the number of tracked tasks unchanged
    Communication communication2 = new Communication();
    communication2.setState(State.KILLED);

    taskMonitor.registerTask(2, communication2);
    Assert.assertEquals(this.tasks.size(), 1);

    // register another communication
    Communication communication3 = new Communication();
    taskMonitor.registerTask(3, communication3);

    Assert.assertEquals(this.tasks.size(), 2);
    System.out.println(this.tasks);

    // report communications whose counters have changed

    ttl = System.currentTimeMillis();

    communication1.setLongCounter(CommunicationTool.READ_SUCCEED_RECORDS, 100);
    communication3.setLongCounter(CommunicationTool.READ_FAILED_RECORDS, 10);

    taskMonitor.report(1, communication1);
    taskMonitor.report(3, communication3);

    taskCommunication1 = taskMonitor.getTaskCommunication(1);

    Assert.assertEquals(taskCommunication1.getLastAllReadRecords(), 100L);
    Assert.assertEquals(this.tasks.size(), 2);

    Assert.assertTrue(taskCommunication1.getLastUpdateComunicationTS() >= ttl);
    Assert.assertTrue(taskCommunication1.getTtl() >= ttl);

    TaskMonitor.TaskCommunication taskCommunication3 = taskMonitor.getTaskCommunication(3);

    Assert.assertEquals(taskCommunication3.getLastAllReadRecords(), 10L);
    Assert.assertEquals(this.tasks.size(), 2);

    Assert.assertTrue(taskCommunication3.getLastUpdateComunicationTS() >= ttl);
    Assert.assertTrue(taskCommunication3.getTtl() >= ttl);

    // keep reporting
    ttl = System.currentTimeMillis();

    communication1.setLongCounter(CommunicationTool.READ_SUCCEED_RECORDS, 1001);
    communication3.setLongCounter(CommunicationTool.READ_FAILED_RECORDS, 101);

    taskMonitor.report(1, communication1);
    taskMonitor.report(3, communication3);

    taskCommunication1 = taskMonitor.getTaskCommunication(1);

    Assert.assertEquals(taskCommunication1.getLastAllReadRecords(), 1001L);
    Assert.assertEquals(this.tasks.size(), 2);

    Assert.assertTrue(taskCommunication1.getLastUpdateComunicationTS() >= ttl);
    Assert.assertTrue(taskCommunication1.getTtl() >= ttl);

    taskCommunication3 = taskMonitor.getTaskCommunication(3);

    Assert.assertEquals(taskCommunication3.getLastAllReadRecords(), 101L);
    Assert.assertEquals(this.tasks.size(), 2);

    Assert.assertTrue(taskCommunication3.getLastUpdateComunicationTS() >= ttl);
    Assert.assertTrue(taskCommunication3.getTtl() >= ttl);

    // set EXPIRED_TIME (a static field, written reflectively with a null receiver)
    Field EXPIRED_TIME = taskMonitor.getClass().getDeclaredField("EXPIRED_TIME");
    EXPIRED_TIME.setAccessible(true);
    EXPIRED_TIME.set(null, 1000);

    Thread.sleep(2000);

    // report with no change after the expiry time has elapsed
    taskMonitor.report(1, communication1);

    System.out.println(communication1.getCounter());
    System.out.println(communication1.getThrowable());
    System.out.println(communication1.getThrowableMessage());
    System.out.println(communication1.getState());

    Assert.assertTrue(communication1.getThrowableMessage().contains("任务hung住,Expired"));
    Assert.assertEquals(communication1.getState(), State.FAILED);

    // communication1 has already failed; communication3 changes after the timeout and is
    // expected to be updated normally
    ttl = System.currentTimeMillis();

    communication1.setLongCounter(CommunicationTool.READ_SUCCEED_RECORDS, 2001);
    communication3.setLongCounter(CommunicationTool.READ_FAILED_RECORDS, 201);

    taskMonitor.report(1, communication1);
    taskMonitor.report(3, communication3);

    taskCommunication1 = taskMonitor.getTaskCommunication(1);

    Assert.assertEquals(taskCommunication1.getLastAllReadRecords(), 1001L);
    Assert.assertEquals(this.tasks.size(), 2);

    Assert.assertTrue(communication1.getThrowableMessage().contains("任务hung住,Expired"));
    Assert.assertEquals(communication1.getState(), State.FAILED);

    taskCommunication3 = taskMonitor.getTaskCommunication(3);

    Assert.assertEquals(taskCommunication3.getLastAllReadRecords(), 201L);
    Assert.assertEquals(this.tasks.size(), 2);

    Assert.assertTrue(taskCommunication3.getLastUpdateComunicationTS() >= ttl);
    Assert.assertTrue(taskCommunication3.getTtl() >= ttl);

    // remove 1
    taskMonitor.removeTask(1);
    Assert.assertEquals(this.tasks.size(), 1);

    // remove 3
    taskMonitor.removeTask(3);
    Assert.assertEquals(this.tasks.size(), 0);

    // report communication3 directly, without registering it first
    ttl = System.currentTimeMillis();

    communication3.setLongCounter(CommunicationTool.READ_FAILED_RECORDS, 301);

    taskMonitor.report(3, communication3);

    taskCommunication3 = taskMonitor.getTaskCommunication(3);

    Assert.assertEquals(taskCommunication3.getLastAllReadRecords(), 301L);
    Assert.assertEquals(this.tasks.size(), 1);

    Assert.assertTrue(taskCommunication3.getLastUpdateComunicationTS() >= ttl);
    Assert.assertTrue(taskCommunication3.getTtl() >= ttl);
  }
 /**
  * The constructor contains the actual work to do (which is our solution calculation).
  *
  * <p>It calls {@code monitor.doTask()} and, once that call returns, sets
  * {@code monitor.taskDone} to {@code true}.
  *
  * @param monitor the task monitor whose {@code doTask()} is run and whose {@code taskDone} flag
  *     is set afterwards.
  */
 public SolverTask(TaskMonitor monitor) {
   monitor.doTask();
   monitor.taskDone = true;
 }
예제 #7
0
  /**
   * Runs every task of this task group until all of them have succeeded or one of them fails.
   *
   * <p>Reads the scheduling settings from the configuration, registers the task communications
   * and then loops. Each iteration:
   *
   * <ol>
   *   <li>handles finished tasks: a failed task is re-queued when its executor supports failover
   *       and attempts remain, otherwise the group is flagged as failed or killed;
   *   <li>reports and throws if the group was flagged;
   *   <li>starts queued tasks while fewer tasks are running than there are channels;
   *   <li>leaves the loop once the queue is empty, all executors are done and the collected state
   *       is {@code SUCCEEDED};
   *   <li>reports the group state and lets the task monitor check running tasks whenever the
   *       report interval has elapsed;
   *   <li>sleeps for the check interval.
   * </ol>
   *
   * <p>Any {@code Throwable} marks the collected communication {@code FAILED}, reports it and is
   * rethrown through {@code DataXException.asDataXException}. If the failure was an
   * {@code InterruptedException}, the thread's interrupt status is restored before rethrowing.
   * CPU/GC and perf summaries are logged in the {@code finally} block when this is not a job-level
   * trace.
   */
  @Override
  public void start() {
    try {
      // Interval between status checks; short, so tasks are dispatched to channels promptly.
      int sleepIntervalInMillSec =
          this.configuration.getInt(CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_SLEEPINTERVAL, 100);
      // Interval between status reports; somewhat longer, to avoid a flood of reports.
      long reportIntervalInMillSec =
          this.configuration.getLong(
              CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_REPORTINTERVAL, 5000);

      // Number of channels.
      int channelNumber =
          this.configuration.getInt(CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_CHANNEL);

      int taskMaxRetryTimes =
          this.configuration.getInt(
              CoreConstant.DATAX_CORE_CONTAINER_TASK_FAILOVER_MAXRETRYTIMES, 1);

      long taskRetryIntervalInMsec =
          this.configuration.getLong(
              CoreConstant.DATAX_CORE_CONTAINER_TASK_FAILOVER_RETRYINTERVALINMSEC, 10000);

      long taskMaxWaitInMsec =
          this.configuration.getLong(
              CoreConstant.DATAX_CORE_CONTAINER_TASK_FAILOVER_MAXWAITINMSEC, 60000);

      List<Configuration> taskConfigs =
          this.configuration.getListConfiguration(CoreConstant.DATAX_JOB_CONTENT);

      if (LOG.isDebugEnabled()) {
        LOG.debug(
            "taskGroup[{}]'s task configs[{}]", this.taskGroupId, JSON.toJSONString(taskConfigs));
      }

      int taskCountInThisTaskGroup = taskConfigs.size();
      LOG.info(
          String.format(
              "taskGroupId=[%d] start [%d] channels for [%d] tasks.",
              this.taskGroupId, channelNumber, taskCountInThisTaskGroup));

      this.containerCommunicator.registerCommunication(taskConfigs);

      // taskId -> task configuration
      Map<Integer, Configuration> taskConfigMap = buildTaskConfigMap(taskConfigs);
      // tasks waiting to run
      List<Configuration> taskQueue = buildRemainTasks(taskConfigs);
      // taskId -> executor of the last failed attempt
      Map<Integer, TaskExecutor> taskFailedExecutorMap = new HashMap<Integer, TaskExecutor>();
      // tasks currently running
      List<TaskExecutor> runTasks = new ArrayList<TaskExecutor>(channelNumber);
      // taskId -> start time
      Map<Integer, Long> taskStartTimeMap = new HashMap<Integer, Long>();

      long lastReportTimeStamp = 0;
      Communication lastTaskGroupContainerCommunication = new Communication();

      while (true) {
        // 1. Inspect the state of every task.
        boolean failedOrKilled = false;
        Map<Integer, Communication> communicationMap = containerCommunicator.getCommunicationMap();
        for (Map.Entry<Integer, Communication> entry : communicationMap.entrySet()) {
          Integer taskId = entry.getKey();
          Communication taskCommunication = entry.getValue();
          if (!taskCommunication.isFinished()) {
            continue;
          }
          TaskExecutor taskExecutor = removeTask(runTasks, taskId);

          // Removed from runTasks above, so remove it from the monitor as well.
          taskMonitor.removeTask(taskId);

          // Failed: retry only if the task supports failover and the attempt count is still
          // below the maximum.
          if (taskCommunication.getState() == State.FAILED) {
            taskFailedExecutorMap.put(taskId, taskExecutor);
            if (taskExecutor.supportFailOver()
                && taskExecutor.getAttemptCount() < taskMaxRetryTimes) {
              taskExecutor.shutdown(); // shut down the old executor
              containerCommunicator.resetCommunication(taskId); // reset the task's state
              Configuration taskConfig = taskConfigMap.get(taskId);
              taskQueue.add(taskConfig); // put it back on the queue
            } else {
              failedOrKilled = true;
              break;
            }
          } else if (taskCommunication.getState() == State.KILLED) {
            failedOrKilled = true;
            break;
          } else if (taskCommunication.getState() == State.SUCCEEDED) {
            Long taskStartTime = taskStartTimeMap.get(taskId);
            if (taskStartTime != null) {
              // Primitive long: no need to box the elapsed time.
              long usedTime = System.currentTimeMillis() - taskStartTime;
              LOG.info(
                  "taskGroup[{}] taskId[{}] is successed, used[{}]ms",
                  this.taskGroupId,
                  taskId,
                  usedTime);
              // usedTime * 1000 * 1000 converts ms into the ns that PerfRecord records. This is
              // only a simple registration used to print the longest-running tasks, hence the
              // dedicated static method.
              PerfRecord.addPerfRecord(
                  taskGroupId,
                  taskId,
                  PerfRecord.PHASE.TASK_TOTAL,
                  taskStartTime,
                  usedTime * 1000L * 1000L);
              taskStartTimeMap.remove(taskId);
              taskConfigMap.remove(taskId);
            }
          }
        }

        // 2. If a task executor of this task group failed or was killed, report and abort.
        if (failedOrKilled) {
          lastTaskGroupContainerCommunication =
              reportTaskGroupCommunication(
                  lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);

          throw DataXException.asDataXException(
              FrameworkErrorCode.PLUGIN_RUNTIME_ERROR,
              lastTaskGroupContainerCommunication.getThrowable());
        }

        // 3. Start queued tasks while fewer tasks are running than there are channels.
        Iterator<Configuration> iterator = taskQueue.iterator();
        while (iterator.hasNext() && runTasks.size() < channelNumber) {
          Configuration taskConfig = iterator.next();
          Integer taskId = taskConfig.getInt(CoreConstant.TASK_ID);
          int attemptCount = 1;
          TaskExecutor lastExecutor = taskFailedExecutorMap.get(taskId);
          if (lastExecutor != null) {
            attemptCount = lastExecutor.getAttemptCount() + 1;
            long now = System.currentTimeMillis();
            long failedTime = lastExecutor.getTimeStamp();
            if (now - failedTime < taskRetryIntervalInMsec) {
              // Retry interval not reached yet; leave the task in the queue.
              continue;
            }
            if (!lastExecutor.isShutdown()) { // the previously failed attempt is still running
              if (now - failedTime > taskMaxWaitInMsec) {
                markCommunicationFailed(taskId);
                reportTaskGroupCommunication(
                    lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);
                throw DataXException.asDataXException(
                    CommonErrorCode.WAIT_TIME_EXCEED, "task failover等待超时");
              } else {
                lastExecutor.shutdown(); // try shutting it down again
                continue;
              }
            } else {
              LOG.info(
                  "taskGroup[{}] taskId[{}] attemptCount[{}] has already shutdown",
                  this.taskGroupId,
                  taskId,
                  lastExecutor.getAttemptCount());
            }
          }
          Configuration taskConfigForRun = taskMaxRetryTimes > 1 ? taskConfig.clone() : taskConfig;
          TaskExecutor taskExecutor = new TaskExecutor(taskConfigForRun, attemptCount);
          taskStartTimeMap.put(taskId, System.currentTimeMillis());
          taskExecutor.doStart();

          iterator.remove();
          runTasks.add(taskExecutor);

          // Added to runTasks above, so register it with the monitor.
          taskMonitor.registerTask(taskId, this.containerCommunicator.getCommunication(taskId));

          taskFailedExecutorMap.remove(taskId);
          LOG.info(
              "taskGroup[{}] taskId[{}] attemptCount[{}] is started",
              this.taskGroupId,
              taskId,
              attemptCount);
        }

        // 4. Queue empty, all executors finished and collected state is SUCCEEDED -> success.
        if (taskQueue.isEmpty()
            && isAllTaskDone(runTasks)
            && containerCommunicator.collectState() == State.SUCCEEDED) {
          // Report once on success too; otherwise the collected information would be inaccurate
          // when tasks finish very quickly.
          lastTaskGroupContainerCommunication =
              reportTaskGroupCommunication(
                  lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);

          LOG.info("taskGroup[{}] completed it's tasks.", this.taskGroupId);
          break;
        }

        // 5. Report immediately once the report interval has elapsed.
        long now = System.currentTimeMillis();
        if (now - lastReportTimeStamp > reportIntervalInMillSec) {
          lastTaskGroupContainerCommunication =
              reportTaskGroupCommunication(
                  lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);

          lastReportTimeStamp = now;

          // The task monitor checks running tasks once per reportIntervalInMillSec.
          for (TaskExecutor taskExecutor : runTasks) {
            taskMonitor.report(
                taskExecutor.getTaskId(),
                this.containerCommunicator.getCommunication(taskExecutor.getTaskId()));
          }
        }

        Thread.sleep(sleepIntervalInMillSec);
      }

      // 6. Report one final time.
      reportTaskGroupCommunication(lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);

    } catch (Throwable e) {
      Communication nowTaskGroupContainerCommunication = this.containerCommunicator.collect();

      if (nowTaskGroupContainerCommunication.getThrowable() == null) {
        nowTaskGroupContainerCommunication.setThrowable(e);
      }
      nowTaskGroupContainerCommunication.setState(State.FAILED);
      this.containerCommunicator.report(nowTaskGroupContainerCommunication);

      if (e instanceof InterruptedException) {
        // Thread.sleep cleared the interrupt flag when it threw. Restore it (after the failure
        // has been reported) so code further up the stack can still observe the interruption.
        Thread.currentThread().interrupt();
      }

      throw DataXException.asDataXException(FrameworkErrorCode.RUNTIME_ERROR, e);
    } finally {
      if (!PerfTrace.getInstance().isJob()) {
        // Finally print the average CPU usage and the GC statistics.
        VMInfo vmInfo = VMInfo.getVmInfo();
        if (vmInfo != null) {
          vmInfo.getDelta(false);
          LOG.info(vmInfo.totalString());
        }

        LOG.info(PerfTrace.getInstance().summarizeNoException());
      }
    }
  }
예제 #8
0
public class TaskGroupContainer extends AbstractContainer {
  private static final Logger LOG = LoggerFactory.getLogger(TaskGroupContainer.class);

  /** 当前taskGroup所属jobId */
  private long jobId;

  /** 当前taskGroupId */
  private int taskGroupId;

  /** 使用的channel类 */
  private String channelClazz;

  /** task收集器使用的类 */
  private String taskCollectorClass;

  private TaskMonitor taskMonitor = TaskMonitor.getInstance();

  /**
   * Creates a task group container from the given configuration.
   *
   * <p>After initialising the communicator it reads the job id, the task group id, the channel
   * class name and the task collector class name from the configuration.
   *
   * @param configuration configuration handed to the superclass and to the communicator
   */
  public TaskGroupContainer(Configuration configuration) {
    super(configuration);
    initCommunicator(configuration);

    jobId = this.configuration.getLong(CoreConstant.DATAX_CORE_CONTAINER_JOB_ID);
    taskGroupId = this.configuration.getInt(CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_ID);
    channelClazz = this.configuration.getString(CoreConstant.DATAX_CORE_TRANSPORT_CHANNEL_CLASS);
    taskCollectorClass =
        this.configuration.getString(CoreConstant.DATAX_CORE_STATISTICS_COLLECTOR_PLUGIN_TASKCLASS);
  }

  /**
   * Installs a {@code StandaloneTGContainerCommunicator} built from the given configuration as
   * this container's communicator.
   *
   * @param configuration configuration passed to the communicator's constructor
   */
  private void initCommunicator(Configuration configuration) {
    StandaloneTGContainerCommunicator communicator =
        new StandaloneTGContainerCommunicator(configuration);
    super.setContainerCommunicator(communicator);
  }

  /**
   * Returns the id of the job this task group belongs to.
   *
   * @return the job id read from the configuration at construction time
   */
  public long getJobId() {
    return this.jobId;
  }

  /**
   * Returns the id of this task group.
   *
   * @return the task group id read from the configuration at construction time
   */
  public int getTaskGroupId() {
    return this.taskGroupId;
  }

  /**
   * Runs every task of this task group until all of them have succeeded or one of them fails.
   *
   * <p>Reads the scheduling settings from the configuration, registers the task communications
   * and then loops. Each iteration:
   *
   * <ol>
   *   <li>handles finished tasks: a failed task is re-queued when its executor supports failover
   *       and attempts remain, otherwise the group is flagged as failed or killed;
   *   <li>reports and throws if the group was flagged;
   *   <li>starts queued tasks while fewer tasks are running than there are channels;
   *   <li>leaves the loop once the queue is empty, all executors are done and the collected state
   *       is {@code SUCCEEDED};
   *   <li>reports the group state and lets the task monitor check running tasks whenever the
   *       report interval has elapsed;
   *   <li>sleeps for the check interval.
   * </ol>
   *
   * <p>Any {@code Throwable} marks the collected communication {@code FAILED}, reports it and is
   * rethrown through {@code DataXException.asDataXException}. If the failure was an
   * {@code InterruptedException}, the thread's interrupt status is restored before rethrowing.
   * CPU/GC and perf summaries are logged in the {@code finally} block when this is not a job-level
   * trace.
   */
  @Override
  public void start() {
    try {
      // Interval between status checks; short, so tasks are dispatched to channels promptly.
      int sleepIntervalInMillSec =
          this.configuration.getInt(CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_SLEEPINTERVAL, 100);
      // Interval between status reports; somewhat longer, to avoid a flood of reports.
      long reportIntervalInMillSec =
          this.configuration.getLong(
              CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_REPORTINTERVAL, 5000);

      // Number of channels.
      int channelNumber =
          this.configuration.getInt(CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_CHANNEL);

      int taskMaxRetryTimes =
          this.configuration.getInt(
              CoreConstant.DATAX_CORE_CONTAINER_TASK_FAILOVER_MAXRETRYTIMES, 1);

      long taskRetryIntervalInMsec =
          this.configuration.getLong(
              CoreConstant.DATAX_CORE_CONTAINER_TASK_FAILOVER_RETRYINTERVALINMSEC, 10000);

      long taskMaxWaitInMsec =
          this.configuration.getLong(
              CoreConstant.DATAX_CORE_CONTAINER_TASK_FAILOVER_MAXWAITINMSEC, 60000);

      List<Configuration> taskConfigs =
          this.configuration.getListConfiguration(CoreConstant.DATAX_JOB_CONTENT);

      if (LOG.isDebugEnabled()) {
        LOG.debug(
            "taskGroup[{}]'s task configs[{}]", this.taskGroupId, JSON.toJSONString(taskConfigs));
      }

      int taskCountInThisTaskGroup = taskConfigs.size();
      LOG.info(
          String.format(
              "taskGroupId=[%d] start [%d] channels for [%d] tasks.",
              this.taskGroupId, channelNumber, taskCountInThisTaskGroup));

      this.containerCommunicator.registerCommunication(taskConfigs);

      // taskId -> task configuration
      Map<Integer, Configuration> taskConfigMap = buildTaskConfigMap(taskConfigs);
      // tasks waiting to run
      List<Configuration> taskQueue = buildRemainTasks(taskConfigs);
      // taskId -> executor of the last failed attempt
      Map<Integer, TaskExecutor> taskFailedExecutorMap = new HashMap<Integer, TaskExecutor>();
      // tasks currently running
      List<TaskExecutor> runTasks = new ArrayList<TaskExecutor>(channelNumber);
      // taskId -> start time
      Map<Integer, Long> taskStartTimeMap = new HashMap<Integer, Long>();

      long lastReportTimeStamp = 0;
      Communication lastTaskGroupContainerCommunication = new Communication();

      while (true) {
        // 1. Inspect the state of every task.
        boolean failedOrKilled = false;
        Map<Integer, Communication> communicationMap = containerCommunicator.getCommunicationMap();
        for (Map.Entry<Integer, Communication> entry : communicationMap.entrySet()) {
          Integer taskId = entry.getKey();
          Communication taskCommunication = entry.getValue();
          if (!taskCommunication.isFinished()) {
            continue;
          }
          TaskExecutor taskExecutor = removeTask(runTasks, taskId);

          // Removed from runTasks above, so remove it from the monitor as well.
          taskMonitor.removeTask(taskId);

          // Failed: retry only if the task supports failover and the attempt count is still
          // below the maximum.
          if (taskCommunication.getState() == State.FAILED) {
            taskFailedExecutorMap.put(taskId, taskExecutor);
            if (taskExecutor.supportFailOver()
                && taskExecutor.getAttemptCount() < taskMaxRetryTimes) {
              taskExecutor.shutdown(); // shut down the old executor
              containerCommunicator.resetCommunication(taskId); // reset the task's state
              Configuration taskConfig = taskConfigMap.get(taskId);
              taskQueue.add(taskConfig); // put it back on the queue
            } else {
              failedOrKilled = true;
              break;
            }
          } else if (taskCommunication.getState() == State.KILLED) {
            failedOrKilled = true;
            break;
          } else if (taskCommunication.getState() == State.SUCCEEDED) {
            Long taskStartTime = taskStartTimeMap.get(taskId);
            if (taskStartTime != null) {
              // Primitive long: no need to box the elapsed time.
              long usedTime = System.currentTimeMillis() - taskStartTime;
              LOG.info(
                  "taskGroup[{}] taskId[{}] is successed, used[{}]ms",
                  this.taskGroupId,
                  taskId,
                  usedTime);
              // usedTime * 1000 * 1000 converts ms into the ns that PerfRecord records. This is
              // only a simple registration used to print the longest-running tasks, hence the
              // dedicated static method.
              PerfRecord.addPerfRecord(
                  taskGroupId,
                  taskId,
                  PerfRecord.PHASE.TASK_TOTAL,
                  taskStartTime,
                  usedTime * 1000L * 1000L);
              taskStartTimeMap.remove(taskId);
              taskConfigMap.remove(taskId);
            }
          }
        }

        // 2. If a task executor of this task group failed or was killed, report and abort.
        if (failedOrKilled) {
          lastTaskGroupContainerCommunication =
              reportTaskGroupCommunication(
                  lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);

          throw DataXException.asDataXException(
              FrameworkErrorCode.PLUGIN_RUNTIME_ERROR,
              lastTaskGroupContainerCommunication.getThrowable());
        }

        // 3. Start queued tasks while fewer tasks are running than there are channels.
        Iterator<Configuration> iterator = taskQueue.iterator();
        while (iterator.hasNext() && runTasks.size() < channelNumber) {
          Configuration taskConfig = iterator.next();
          Integer taskId = taskConfig.getInt(CoreConstant.TASK_ID);
          int attemptCount = 1;
          TaskExecutor lastExecutor = taskFailedExecutorMap.get(taskId);
          if (lastExecutor != null) {
            attemptCount = lastExecutor.getAttemptCount() + 1;
            long now = System.currentTimeMillis();
            long failedTime = lastExecutor.getTimeStamp();
            if (now - failedTime < taskRetryIntervalInMsec) {
              // Retry interval not reached yet; leave the task in the queue.
              continue;
            }
            if (!lastExecutor.isShutdown()) { // the previously failed attempt is still running
              if (now - failedTime > taskMaxWaitInMsec) {
                markCommunicationFailed(taskId);
                reportTaskGroupCommunication(
                    lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);
                throw DataXException.asDataXException(
                    CommonErrorCode.WAIT_TIME_EXCEED, "task failover等待超时");
              } else {
                lastExecutor.shutdown(); // try shutting it down again
                continue;
              }
            } else {
              LOG.info(
                  "taskGroup[{}] taskId[{}] attemptCount[{}] has already shutdown",
                  this.taskGroupId,
                  taskId,
                  lastExecutor.getAttemptCount());
            }
          }
          Configuration taskConfigForRun = taskMaxRetryTimes > 1 ? taskConfig.clone() : taskConfig;
          TaskExecutor taskExecutor = new TaskExecutor(taskConfigForRun, attemptCount);
          taskStartTimeMap.put(taskId, System.currentTimeMillis());
          taskExecutor.doStart();

          iterator.remove();
          runTasks.add(taskExecutor);

          // Added to runTasks above, so register it with the monitor.
          taskMonitor.registerTask(taskId, this.containerCommunicator.getCommunication(taskId));

          taskFailedExecutorMap.remove(taskId);
          LOG.info(
              "taskGroup[{}] taskId[{}] attemptCount[{}] is started",
              this.taskGroupId,
              taskId,
              attemptCount);
        }

        // 4. Queue empty, all executors finished and collected state is SUCCEEDED -> success.
        if (taskQueue.isEmpty()
            && isAllTaskDone(runTasks)
            && containerCommunicator.collectState() == State.SUCCEEDED) {
          // Report once on success too; otherwise the collected information would be inaccurate
          // when tasks finish very quickly.
          lastTaskGroupContainerCommunication =
              reportTaskGroupCommunication(
                  lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);

          LOG.info("taskGroup[{}] completed it's tasks.", this.taskGroupId);
          break;
        }

        // 5. Report immediately once the report interval has elapsed.
        long now = System.currentTimeMillis();
        if (now - lastReportTimeStamp > reportIntervalInMillSec) {
          lastTaskGroupContainerCommunication =
              reportTaskGroupCommunication(
                  lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);

          lastReportTimeStamp = now;

          // The task monitor checks running tasks once per reportIntervalInMillSec.
          for (TaskExecutor taskExecutor : runTasks) {
            taskMonitor.report(
                taskExecutor.getTaskId(),
                this.containerCommunicator.getCommunication(taskExecutor.getTaskId()));
          }
        }

        Thread.sleep(sleepIntervalInMillSec);
      }

      // 6. Report one final time.
      reportTaskGroupCommunication(lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);

    } catch (Throwable e) {
      Communication nowTaskGroupContainerCommunication = this.containerCommunicator.collect();

      if (nowTaskGroupContainerCommunication.getThrowable() == null) {
        nowTaskGroupContainerCommunication.setThrowable(e);
      }
      nowTaskGroupContainerCommunication.setState(State.FAILED);
      this.containerCommunicator.report(nowTaskGroupContainerCommunication);

      if (e instanceof InterruptedException) {
        // Thread.sleep cleared the interrupt flag when it threw. Restore it (after the failure
        // has been reported) so code further up the stack can still observe the interruption.
        Thread.currentThread().interrupt();
      }

      throw DataXException.asDataXException(FrameworkErrorCode.RUNTIME_ERROR, e);
    } finally {
      if (!PerfTrace.getInstance().isJob()) {
        // Finally print the average CPU usage and the GC statistics.
        VMInfo vmInfo = VMInfo.getVmInfo();
        if (vmInfo != null) {
          vmInfo.getDelta(false);
          LOG.info(vmInfo.totalString());
        }

        LOG.info(PerfTrace.getInstance().summarizeNoException());
      }
    }
  }

  /**
   * Indexes the given task configurations by their task id.
   *
   * <p>When two configurations carry the same id, the later one in the list wins.
   *
   * @param configurations task configurations, each expected to carry {@code CoreConstant.TASK_ID}
   * @return a new map from task id to task configuration
   */
  private Map<Integer, Configuration> buildTaskConfigMap(List<Configuration> configurations) {
    Map<Integer, Configuration> configById = new HashMap<Integer, Configuration>();
    Iterator<Configuration> cursor = configurations.iterator();
    while (cursor.hasNext()) {
      Configuration current = cursor.next();
      int id = current.getInt(CoreConstant.TASK_ID);
      configById.put(id, current);
    }
    return configById;
  }

  /**
   * Copies the given task configurations, in order, into a new linked list that serves as the
   * queue of tasks still waiting to run.
   *
   * @param configurations task configurations to enqueue
   * @return a new {@code LinkedList} holding the same elements in the same order
   */
  private List<Configuration> buildRemainTasks(List<Configuration> configurations) {
    return new LinkedList<Configuration>(configurations);
  }

  /**
   * Removes the first executor with the given task id from the list.
   *
   * @param taskList executors to search; the matching element is removed in place
   * @param taskId id of the task whose executor should be removed
   * @return the removed executor, or {@code null} when no executor has that id
   */
  private TaskExecutor removeTask(List<TaskExecutor> taskList, int taskId) {
    for (Iterator<TaskExecutor> cursor = taskList.iterator(); cursor.hasNext(); ) {
      TaskExecutor candidate = cursor.next();
      if (candidate.getTaskId() != taskId) {
        continue;
      }
      cursor.remove();
      return candidate;
    }
    return null;
  }

  /**
   * Tells whether every executor in the list reports its task as finished.
   *
   * @param taskList executors to check
   * @return {@code true} when the list is empty or all executors are finished; {@code false} as
   *     soon as one unfinished executor is found
   */
  private boolean isAllTaskDone(List<TaskExecutor> taskList) {
    boolean allFinished = true;
    Iterator<TaskExecutor> cursor = taskList.iterator();
    // Stops at the first unfinished executor.
    while (allFinished && cursor.hasNext()) {
      allFinished = cursor.next().isTaskFinished();
    }
    return allFinished;
  }

  /**
   * Collects the current task group communication, stamps it with the current time, derives the
   * communication to report from it and the previous one, and reports the result.
   *
   * @param lastTaskGroupContainerCommunication the communication returned by the previous report
   * @param taskCount number of tasks in this task group
   * @return the communication that was reported
   */
  private Communication reportTaskGroupCommunication(
      Communication lastTaskGroupContainerCommunication, int taskCount) {
    Communication current = this.containerCommunicator.collect();
    current.setTimestamp(System.currentTimeMillis());

    Communication toReport =
        CommunicationTool.getReportCommunication(
            current, lastTaskGroupContainerCommunication, taskCount);
    this.containerCommunicator.report(toReport);
    return toReport;
  }

  /**
   * Sets the state of the given task's communication to {@code FAILED}.
   *
   * @param taskId id of the task whose communication is marked as failed
   */
  private void markCommunicationFailed(Integer taskId) {
    containerCommunicator.getCommunication(taskId).setState(State.FAILED);
  }

  /**
   * TaskExecutor is the executor of one complete task, consisting of a 1:1 reader and writer.
   *
   * <p>It is a non-static inner class and reads the enclosing container's {@code jobId},
   * {@code taskGroupId}, {@code channelClazz}, {@code taskCollectorClass}, {@code configuration}
   * and {@code containerCommunicator}.
   */
  class TaskExecutor {
    private Configuration taskConfig;

    private int taskId;

    private int attemptCount;

    private Channel channel;

    private Thread readerThread;

    private Thread writerThread;

    private ReaderRunner readerRunner;

    private WriterRunner writerRunner;

    /**
     * The taskCommunication here is used in several places: 1. the channel 2. readerRunner and
     * writerRunner 3. the taskPluginCollector of the reader and the writer
     */
    private Communication taskCommunication;

    /**
     * Builds the channel, the writer and reader runners and their threads for one task attempt.
     * No thread is started here; see {@code doStart()}.
     *
     * @param taskConf configuration of the task; must contain both a reader and a writer section
     * @param attemptCount number of this attempt, as supplied by the caller
     */
    public TaskExecutor(Configuration taskConf, int attemptCount) {
      // Configuration of this task executor.
      this.taskConfig = taskConf;
      Validate.isTrue(
          null != this.taskConfig.getConfiguration(CoreConstant.JOB_READER)
              && null != this.taskConfig.getConfiguration(CoreConstant.JOB_WRITER),
          "[reader|writer]的插件参数不能为空!");

      // Obtain the taskId.
      this.taskId = this.taskConfig.getInt(CoreConstant.TASK_ID);
      this.attemptCount = attemptCount;

      /*
       * Look up this executor's Communication by taskId. It is passed to readerRunner and
       * writerRunner, and also to the channel for statistics.
       */
      this.taskCommunication = containerCommunicator.getCommunication(taskId);
      Validate.notNull(
          this.taskCommunication, String.format("taskId[%d]的Communication没有注册过", taskId));
      this.channel = ClassUtil.instantiate(channelClazz, Channel.class, configuration);
      this.channel.setCommunication(this.taskCommunication);

      /* Create the writer thread. */
      writerRunner = (WriterRunner) generateRunner(PluginType.WRITER);
      this.writerThread =
          new Thread(
              writerRunner, String.format("%d-%d-%d-writer", jobId, taskGroupId, this.taskId));
      // Setting the thread's contextClassLoader gives the plugin a class loader different from
      // the main program's.
      this.writerThread.setContextClassLoader(
          LoadUtil.getJarLoader(
              PluginType.WRITER, this.taskConfig.getString(CoreConstant.JOB_WRITER_NAME)));

      /* Create the reader thread. */
      readerRunner = (ReaderRunner) generateRunner(PluginType.READER);
      this.readerThread =
          new Thread(
              readerRunner, String.format("%d-%d-%d-reader", jobId, taskGroupId, this.taskId));
      /*
       * Setting the thread's contextClassLoader gives the plugin a class loader different from
       * the main program's.
       */
      this.readerThread.setContextClassLoader(
          LoadUtil.getJarLoader(
              PluginType.READER, this.taskConfig.getString(CoreConstant.JOB_READER_NAME)));
    }

    /**
     * Starts the writer thread and then the reader thread.
     *
     * <p>Throws (via {@code DataXException.asDataXException}) if the writer thread is not alive
     * or the task communication is already {@code FAILED} right after starting the writer, or if
     * the reader thread has died with the communication in state {@code FAILED} right after
     * starting the reader.
     */
    public void doStart() {
      this.writerThread.start();

      // The reader has not been started yet, so the writer cannot have finished.
      if (!this.writerThread.isAlive() || this.taskCommunication.getState() == State.FAILED) {
        throw DataXException.asDataXException(
            FrameworkErrorCode.RUNTIME_ERROR, this.taskCommunication.getThrowable());
      }

      this.readerThread.start();

      // The reader may finish very quickly here.
      if (!this.readerThread.isAlive() && this.taskCommunication.getState() == State.FAILED) {
        // The reader may die immediately on start; in that case the exception must be thrown
        // right away.
        throw DataXException.asDataXException(
            FrameworkErrorCode.RUNTIME_ERROR, this.taskCommunication.getThrowable());
      }
    }

    /**
     * Loads and wires the runner for the given plugin type.
     *
     * <p>The runner receives the matching parameter section of the task configuration, a newly
     * instantiated task plugin collector, a {@code BufferedRecordExchanger} over this executor's
     * channel (as record sender for a reader, record receiver for a writer), the task group id,
     * the task id and the task communication.
     *
     * @param pluginType {@code READER} or {@code WRITER}; any other value is rejected
     * @return the configured runner
     */
    private AbstractRunner generateRunner(PluginType pluginType) {
      AbstractRunner newRunner = null;
      TaskPluginCollector pluginCollector;

      switch (pluginType) {
        case READER:
          newRunner =
              LoadUtil.loadPluginRunner(
                  pluginType, this.taskConfig.getString(CoreConstant.JOB_READER_NAME));
          newRunner.setJobConf(this.taskConfig.getConfiguration(CoreConstant.JOB_READER_PARAMETER));

          pluginCollector =
              ClassUtil.instantiate(
                  taskCollectorClass,
                  AbstractTaskPluginCollector.class,
                  configuration,
                  this.taskCommunication,
                  PluginType.READER);

          ((ReaderRunner) newRunner)
              .setRecordSender(new BufferedRecordExchanger(this.channel, pluginCollector));
          /* Set the task plugin's collector, used for dirty data and job/task communication. */
          newRunner.setTaskPluginCollector(pluginCollector);
          break;
        case WRITER:
          newRunner =
              LoadUtil.loadPluginRunner(
                  pluginType, this.taskConfig.getString(CoreConstant.JOB_WRITER_NAME));
          newRunner.setJobConf(this.taskConfig.getConfiguration(CoreConstant.JOB_WRITER_PARAMETER));

          pluginCollector =
              ClassUtil.instantiate(
                  taskCollectorClass,
                  AbstractTaskPluginCollector.class,
                  configuration,
                  this.taskCommunication,
                  PluginType.WRITER);
          ((WriterRunner) newRunner)
              .setRecordReceiver(new BufferedRecordExchanger(this.channel, pluginCollector));
          /* Set the task plugin's collector, used for dirty data and job/task communication. */
          newRunner.setTaskPluginCollector(pluginCollector);
          break;
        default:
          throw DataXException.asDataXException(
              FrameworkErrorCode.ARGUMENT_ERROR, "Cant generateRunner for:" + pluginType);
      }

      newRunner.setTaskGroupId(taskGroupId);
      newRunner.setTaskId(this.taskId);
      newRunner.setRunnerCommunication(this.taskCommunication);

      return newRunner;
    }

    // Checks whether the task has finished: both threads must have stopped and the task
    // communication must report that it is finished.
    private boolean isTaskFinished() {
      // If the reader or the writer is still working, the task is not finished.
      if (readerThread.isAlive() || writerThread.isAlive()) {
        return false;
      }

      if (taskCommunication == null || !taskCommunication.isFinished()) {
        return false;
      }

      return true;
    }

    /** Returns the id of the task this executor runs. */
    private int getTaskId() {
      return taskId;
    }

    /** Returns the timestamp currently stored in this task's communication. */
    private long getTimeStamp() {
      return taskCommunication.getTimestamp();
    }

    /** Returns the attempt number this executor was created with. */
    private int getAttemptCount() {
      return attemptCount;
    }

    /** Returns whether the writer runner reports support for failover. */
    private boolean supportFailOver() {
      return writerRunner.supportFailOver();
    }

    /**
     * Asks the writer and reader runners to shut down, then interrupts whichever of the two
     * threads is still alive.
     */
    private void shutdown() {
      writerRunner.shutdown();
      readerRunner.shutdown();
      if (writerThread.isAlive()) {
        writerThread.interrupt();
      }
      if (readerThread.isAlive()) {
        readerThread.interrupt();
      }
    }

    /** Returns {@code true} once neither the reader thread nor the writer thread is alive. */
    private boolean isShutdown() {
      return !readerThread.isAlive() && !writerThread.isAlive();
    }
  }
}