/**
 * Runs the scheduling loop of this task group.
 *
 * <p>Reads the polling/reporting intervals, channel count and failover settings from the
 * configuration, then loops: inspects the communication of every task, re-queues failed tasks
 * that support failover and still have retries left, starts queued tasks while fewer than
 * {@code channelNumber} tasks are running, and reports the group communication whenever the
 * report interval has elapsed. The loop ends once the queue is empty, all running tasks are done
 * and the collected state is {@code State.SUCCEEDED}.
 *
 * <p>Any {@code Throwable} raised inside is recorded on the collected communication (unless one
 * is already present), the state is set to {@code State.FAILED} and reported, and the error is
 * rethrown as a {@code DataXException} with {@code FrameworkErrorCode.RUNTIME_ERROR}.
 */
@Override
  public void start() {
    try {
      /** State-check interval; kept short so tasks are dispatched to their channels promptly. */
      int sleepIntervalInMillSec =
          this.configuration.getInt(CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_SLEEPINTERVAL, 100);
      /** State-report interval; somewhat longer to avoid excessive reporting. */
      long reportIntervalInMillSec =
          this.configuration.getLong(
              CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_REPORTINTERVAL, 5000);

      // Number of channels
      int channelNumber =
          this.configuration.getInt(CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_CHANNEL);

      int taskMaxRetryTimes =
          this.configuration.getInt(
              CoreConstant.DATAX_CORE_CONTAINER_TASK_FAILOVER_MAXRETRYTIMES, 1);

      long taskRetryIntervalInMsec =
          this.configuration.getLong(
              CoreConstant.DATAX_CORE_CONTAINER_TASK_FAILOVER_RETRYINTERVALINMSEC, 10000);

      long taskMaxWaitInMsec =
          this.configuration.getLong(
              CoreConstant.DATAX_CORE_CONTAINER_TASK_FAILOVER_MAXWAITINMSEC, 60000);

      List<Configuration> taskConfigs =
          this.configuration.getListConfiguration(CoreConstant.DATAX_JOB_CONTENT);

      if (LOG.isDebugEnabled()) {
        LOG.debug(
            "taskGroup[{}]'s task configs[{}]", this.taskGroupId, JSON.toJSONString(taskConfigs));
      }

      int taskCountInThisTaskGroup = taskConfigs.size();
      LOG.info(
          String.format(
              "taskGroupId=[%d] start [%d] channels for [%d] tasks.",
              this.taskGroupId, channelNumber, taskCountInThisTaskGroup));

      this.containerCommunicator.registerCommunication(taskConfigs);

      Map<Integer, Configuration> taskConfigMap = buildTaskConfigMap(taskConfigs); // taskId -> task configuration
      List<Configuration> taskQueue = buildRemainTasks(taskConfigs); // tasks waiting to run
      Map<Integer, TaskExecutor> taskFailedExecutorMap =
          new HashMap<Integer, TaskExecutor>(); // taskId -> executor of the last failed attempt
      List<TaskExecutor> runTasks = new ArrayList<TaskExecutor>(channelNumber); // tasks currently running
      Map<Integer, Long> taskStartTimeMap = new HashMap<Integer, Long>(); // taskId -> task start time

      // Starts at 0 so the interval check in step 5 passes on the first loop iteration.
      long lastReportTimeStamp = 0;
      Communication lastTaskGroupContainerCommunication = new Communication();

      while (true) {
        // 1. Check the state of each task
        boolean failedOrKilled = false;
        Map<Integer, Communication> communicationMap = containerCommunicator.getCommunicationMap();
        for (Map.Entry<Integer, Communication> entry : communicationMap.entrySet()) {
          Integer taskId = entry.getKey();
          Communication taskCommunication = entry.getValue();
          if (!taskCommunication.isFinished()) {
            // Still running (or not started yet): nothing to do for this task in this pass.
            continue;
          }
          TaskExecutor taskExecutor = removeTask(runTasks, taskId);

          // Removed from runTasks above, so remove it from the monitor as well
          taskMonitor.removeTask(taskId);

          // Failed: check whether the task supports failover and has not exceeded the max retry count
          if (taskCommunication.getState() == State.FAILED) {
            taskFailedExecutorMap.put(taskId, taskExecutor);
            if (taskExecutor.supportFailOver()
                && taskExecutor.getAttemptCount() < taskMaxRetryTimes) {
              taskExecutor.shutdown(); // shut down the old executor
              containerCommunicator.resetCommunication(taskId); // reset the task's state
              Configuration taskConfig = taskConfigMap.get(taskId);
              taskQueue.add(taskConfig); // put it back on the task queue
            } else {
              failedOrKilled = true;
              break;
            }
          } else if (taskCommunication.getState() == State.KILLED) {
            failedOrKilled = true;
            break;
          } else if (taskCommunication.getState() == State.SUCCEEDED) {
            Long taskStartTime = taskStartTimeMap.get(taskId);
            // The start time is removed below, so this branch body runs once per successful task.
            if (taskStartTime != null) {
              Long usedTime = System.currentTimeMillis() - taskStartTime;
              LOG.info(
                  "taskGroup[{}] taskId[{}] is successed, used[{}]ms",
                  this.taskGroupId,
                  taskId,
                  usedTime);
              // usedTime*1000*1000 converts ms to the ns that PerfRecord records; this is only a
              // simple registration used for printing the longest task, hence the dedicated
              // static method
              PerfRecord.addPerfRecord(
                  taskGroupId,
                  taskId,
                  PerfRecord.PHASE.TASK_TOTAL,
                  taskStartTime,
                  usedTime * 1000L * 1000L);
              taskStartTimeMap.remove(taskId);
              taskConfigMap.remove(taskId);
            }
          }
        }

        // 2. If a taskExecutor in this taskGroup failed (without retry) or was killed, report the error
        if (failedOrKilled) {
          lastTaskGroupContainerCommunication =
              reportTaskGroupCommunication(
                  lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);

          throw DataXException.asDataXException(
              FrameworkErrorCode.PLUGIN_RUNTIME_ERROR,
              lastTaskGroupContainerCommunication.getThrowable());
        }

        // 3. There are tasks not yet run and the number of running tasks is below the channel limit
        Iterator<Configuration> iterator = taskQueue.iterator();
        while (iterator.hasNext() && runTasks.size() < channelNumber) {
          Configuration taskConfig = iterator.next();
          Integer taskId = taskConfig.getInt(CoreConstant.TASK_ID);
          int attemptCount = 1;
          TaskExecutor lastExecutor = taskFailedExecutorMap.get(taskId);
          if (lastExecutor != null) {
            // This task is a retry: continue counting attempts from the failed executor.
            attemptCount = lastExecutor.getAttemptCount() + 1;
            long now = System.currentTimeMillis();
            long failedTime = lastExecutor.getTimeStamp();
            if (now - failedTime < taskRetryIntervalInMsec) { // retry interval not reached yet; stay in the queue
              continue;
            }
            if (!lastExecutor.isShutdown()) { // the previously failed attempt has not finished yet
              if (now - failedTime > taskMaxWaitInMsec) {
                markCommunicationFailed(taskId);
                reportTaskGroupCommunication(
                    lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);
                throw DataXException.asDataXException(
                    CommonErrorCode.WAIT_TIME_EXCEED, "task failover等待超时");
              } else {
                lastExecutor.shutdown(); // try shutting down again
                continue;
              }
            } else {
              LOG.info(
                  "taskGroup[{}] taskId[{}] attemptCount[{}] has already shutdown",
                  this.taskGroupId,
                  taskId,
                  lastExecutor.getAttemptCount());
            }
          }
          // A clone of the config is used when taskMaxRetryTimes > 1; otherwise the queued
          // config object is passed to the executor directly.
          Configuration taskConfigForRun = taskMaxRetryTimes > 1 ? taskConfig.clone() : taskConfig;
          TaskExecutor taskExecutor = new TaskExecutor(taskConfigForRun, attemptCount);
          taskStartTimeMap.put(taskId, System.currentTimeMillis());
          taskExecutor.doStart();

          iterator.remove();
          runTasks.add(taskExecutor);

          // The task was added to runTasks above, so register it with the monitor.
          taskMonitor.registerTask(taskId, this.containerCommunicator.getCommunication(taskId));

          taskFailedExecutorMap.remove(taskId);
          LOG.info(
              "taskGroup[{}] taskId[{}] attemptCount[{}] is started",
              this.taskGroupId,
              taskId,
              attemptCount);
        }

        // 4. Task queue empty, executors done, and collected state is SUCCEEDED ---> success
        if (taskQueue.isEmpty()
            && isAllTaskDone(runTasks)
            && containerCommunicator.collectState() == State.SUCCEEDED) {
          // Report once on success as well; otherwise, if the tasks finish very quickly, the
          // collected information would be inaccurate
          lastTaskGroupContainerCommunication =
              reportTaskGroupCommunication(
                  lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);

          LOG.info("taskGroup[{}] completed it's tasks.", this.taskGroupId);
          break;
        }

        // 5. If the report interval has elapsed since the last report, report immediately
        long now = System.currentTimeMillis();
        if (now - lastReportTimeStamp > reportIntervalInMillSec) {
          lastTaskGroupContainerCommunication =
              reportTaskGroupCommunication(
                  lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);

          lastReportTimeStamp = now;

          // taskMonitor checks the running tasks every reportIntervalInMillSec
          for (TaskExecutor taskExecutor : runTasks) {
            taskMonitor.report(
                taskExecutor.getTaskId(),
                this.containerCommunicator.getCommunication(taskExecutor.getTaskId()));
          }
        }

        Thread.sleep(sleepIntervalInMillSec);
      }

      // 6. Report one final time
      reportTaskGroupCommunication(lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);

    } catch (Throwable e) {
      Communication nowTaskGroupContainerCommunication = this.containerCommunicator.collect();

      // Keep a throwable that is already recorded; only fall back to the one caught here.
      if (nowTaskGroupContainerCommunication.getThrowable() == null) {
        nowTaskGroupContainerCommunication.setThrowable(e);
      }
      nowTaskGroupContainerCommunication.setState(State.FAILED);
      this.containerCommunicator.report(nowTaskGroupContainerCommunication);

      throw DataXException.asDataXException(FrameworkErrorCode.RUNTIME_ERROR, e);
    } finally {
      if (!PerfTrace.getInstance().isJob()) {
        // Finally print the average CPU usage and the GC statistics
        VMInfo vmInfo = VMInfo.getVmInfo();
        if (vmInfo != null) {
          vmInfo.getDelta(false);
          LOG.info(vmInfo.totalString());
        }

        LOG.info(PerfTrace.getInstance().summarizeNoException());
      }
    }
  }
Exemple #2
0
  /**
   * Walks the task monitor through register, report, expiry and remove, asserting the tracked
   * per-task state after every step.
   *
   * <p>The test lowers the static {@code EXPIRED_TIME} field through reflection. The previous
   * value is captured first and written back in a {@code finally} block so the shortened expiry
   * does not leak into other tests running in the same JVM.
   *
   * @throws Exception if reflection on {@code EXPIRED_TIME} fails or a sleep is interrupted
   */
  @Test
  public void testNormal() throws Exception {

    // register task
    long ttl = System.currentTimeMillis();

    Communication communication1 = new Communication();

    taskMonitor.registerTask(1, communication1);

    TaskMonitor.TaskCommunication taskCommunication1 = taskMonitor.getTaskCommunication(1);

    Assert.assertEquals(taskCommunication1.getLastAllReadRecords(), 0L);
    Assert.assertEquals(this.tasks.size(), 1);

    Assert.assertTrue(taskCommunication1.getLastUpdateComunicationTS() >= ttl);
    Assert.assertTrue(taskCommunication1.getTtl() >= ttl);

    // report a communication that has not changed at all

    long oldTS = taskCommunication1.getLastUpdateComunicationTS();
    long oldTTL = taskCommunication1.getTtl();
    Thread.sleep(1000);

    taskMonitor.report(1, communication1);

    TaskMonitor.TaskCommunication taskCommunication1_1 = taskMonitor.getTaskCommunication(1);

    Assert.assertEquals(taskCommunication1_1.getLastAllReadRecords(), 0L);
    Assert.assertEquals(taskCommunication1_1.getLastUpdateComunicationTS(), oldTS);
    Assert.assertTrue(taskCommunication1_1.getTtl() > oldTTL);

    // register a communication that is already finished (KILLED): the task count stays at 1
    Communication communication2 = new Communication();
    communication2.setState(State.KILLED);

    taskMonitor.registerTask(2, communication2);
    Assert.assertEquals(this.tasks.size(), 1);

    // register another communication
    Communication communication3 = new Communication();
    taskMonitor.registerTask(3, communication3);

    Assert.assertEquals(this.tasks.size(), 2);
    System.out.println(this.tasks);

    // report communication

    ttl = System.currentTimeMillis();

    communication1.setLongCounter(CommunicationTool.READ_SUCCEED_RECORDS, 100);
    communication3.setLongCounter(CommunicationTool.READ_FAILED_RECORDS, 10);

    taskMonitor.report(1, communication1);
    taskMonitor.report(3, communication3);

    taskCommunication1 = taskMonitor.getTaskCommunication(1);

    Assert.assertEquals(taskCommunication1.getLastAllReadRecords(), 100L);
    Assert.assertEquals(this.tasks.size(), 2);

    Assert.assertTrue(taskCommunication1.getLastUpdateComunicationTS() >= ttl);
    Assert.assertTrue(taskCommunication1.getTtl() >= ttl);

    TaskMonitor.TaskCommunication taskCommunication3 = taskMonitor.getTaskCommunication(3);

    Assert.assertEquals(taskCommunication3.getLastAllReadRecords(), 10L);
    Assert.assertEquals(this.tasks.size(), 2);

    Assert.assertTrue(taskCommunication3.getLastUpdateComunicationTS() >= ttl);
    Assert.assertTrue(taskCommunication3.getTtl() >= ttl);

    // report again
    ttl = System.currentTimeMillis();

    communication1.setLongCounter(CommunicationTool.READ_SUCCEED_RECORDS, 1001);
    communication3.setLongCounter(CommunicationTool.READ_FAILED_RECORDS, 101);

    taskMonitor.report(1, communication1);
    taskMonitor.report(3, communication3);

    taskCommunication1 = taskMonitor.getTaskCommunication(1);

    Assert.assertEquals(taskCommunication1.getLastAllReadRecords(), 1001L);
    Assert.assertEquals(this.tasks.size(), 2);

    Assert.assertTrue(taskCommunication1.getLastUpdateComunicationTS() >= ttl);
    Assert.assertTrue(taskCommunication1.getTtl() >= ttl);

    taskCommunication3 = taskMonitor.getTaskCommunication(3);

    Assert.assertEquals(taskCommunication3.getLastAllReadRecords(), 101L);
    Assert.assertEquals(this.tasks.size(), 2);

    Assert.assertTrue(taskCommunication3.getLastUpdateComunicationTS() >= ttl);
    Assert.assertTrue(taskCommunication3.getTtl() >= ttl);

    // lower EXPIRED_TIME, remembering the previous value so it can be restored afterwards
    Field expiredTimeField = taskMonitor.getClass().getDeclaredField("EXPIRED_TIME");
    expiredTimeField.setAccessible(true);
    Object originalExpiredTime = expiredTimeField.get(null);
    expiredTimeField.set(null, 1000);

    try {
      Thread.sleep(2000);

      // no change was reported within the expiry window
      taskMonitor.report(1, communication1);

      System.out.println(communication1.getCounter());
      System.out.println(communication1.getThrowable());
      System.out.println(communication1.getThrowableMessage());
      System.out.println(communication1.getState());

      Assert.assertTrue(communication1.getThrowableMessage().contains("任务hung住,Expired"));
      Assert.assertEquals(communication1.getState(), State.FAILED);

      // communication1 has already failed; communication3 changes after the expiry window and
      // is expected to update normally
      ttl = System.currentTimeMillis();

      communication1.setLongCounter(CommunicationTool.READ_SUCCEED_RECORDS, 2001);
      communication3.setLongCounter(CommunicationTool.READ_FAILED_RECORDS, 201);

      taskMonitor.report(1, communication1);
      taskMonitor.report(3, communication3);

      taskCommunication1 = taskMonitor.getTaskCommunication(1);

      Assert.assertEquals(taskCommunication1.getLastAllReadRecords(), 1001L);
      Assert.assertEquals(this.tasks.size(), 2);

      Assert.assertTrue(communication1.getThrowableMessage().contains("任务hung住,Expired"));
      Assert.assertEquals(communication1.getState(), State.FAILED);

      taskCommunication3 = taskMonitor.getTaskCommunication(3);

      Assert.assertEquals(taskCommunication3.getLastAllReadRecords(), 201L);
      Assert.assertEquals(this.tasks.size(), 2);

      Assert.assertTrue(taskCommunication3.getLastUpdateComunicationTS() >= ttl);
      Assert.assertTrue(taskCommunication3.getTtl() >= ttl);

      // remove 1
      taskMonitor.removeTask(1);
      Assert.assertEquals(this.tasks.size(), 1);

      // remove 3
      taskMonitor.removeTask(3);
      Assert.assertEquals(this.tasks.size(), 0);

      // report communication3 directly without registering it first
      ttl = System.currentTimeMillis();

      communication3.setLongCounter(CommunicationTool.READ_FAILED_RECORDS, 301);

      taskMonitor.report(3, communication3);

      taskCommunication3 = taskMonitor.getTaskCommunication(3);

      Assert.assertEquals(taskCommunication3.getLastAllReadRecords(), 301L);
      Assert.assertEquals(this.tasks.size(), 1);

      Assert.assertTrue(taskCommunication3.getLastUpdateComunicationTS() >= ttl);
      Assert.assertTrue(taskCommunication3.getTtl() >= ttl);
    } finally {
      // Put the static expiry back so the 1000 ms value is not seen by later tests.
      expiredTimeField.set(null, originalExpiredTime);
    }
  }
 /**
  * Flags the communication held by the container communicator for the given task as failed.
  *
  * @param taskId id of the task whose communication state is set to {@code State.FAILED}
  */
 private void markCommunicationFailed(Integer taskId) {
   containerCommunicator.getCommunication(taskId).setState(State.FAILED);
 }