@Override public void start() { try { /** 状态check时间间隔,较短,可以把任务及时分发到对应channel中 */ int sleepIntervalInMillSec = this.configuration.getInt(CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_SLEEPINTERVAL, 100); /** 状态汇报时间间隔,稍长,避免大量汇报 */ long reportIntervalInMillSec = this.configuration.getLong( CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_REPORTINTERVAL, 5000); // 获取channel数目 int channelNumber = this.configuration.getInt(CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_CHANNEL); int taskMaxRetryTimes = this.configuration.getInt( CoreConstant.DATAX_CORE_CONTAINER_TASK_FAILOVER_MAXRETRYTIMES, 1); long taskRetryIntervalInMsec = this.configuration.getLong( CoreConstant.DATAX_CORE_CONTAINER_TASK_FAILOVER_RETRYINTERVALINMSEC, 10000); long taskMaxWaitInMsec = this.configuration.getLong( CoreConstant.DATAX_CORE_CONTAINER_TASK_FAILOVER_MAXWAITINMSEC, 60000); List<Configuration> taskConfigs = this.configuration.getListConfiguration(CoreConstant.DATAX_JOB_CONTENT); if (LOG.isDebugEnabled()) { LOG.debug( "taskGroup[{}]'s task configs[{}]", this.taskGroupId, JSON.toJSONString(taskConfigs)); } int taskCountInThisTaskGroup = taskConfigs.size(); LOG.info( String.format( "taskGroupId=[%d] start [%d] channels for [%d] tasks.", this.taskGroupId, channelNumber, taskCountInThisTaskGroup)); this.containerCommunicator.registerCommunication(taskConfigs); Map<Integer, Configuration> taskConfigMap = buildTaskConfigMap(taskConfigs); // taskId与task配置 List<Configuration> taskQueue = buildRemainTasks(taskConfigs); // 待运行task列表 Map<Integer, TaskExecutor> taskFailedExecutorMap = new HashMap<Integer, TaskExecutor>(); // taskId与上次失败实例 List<TaskExecutor> runTasks = new ArrayList<TaskExecutor>(channelNumber); // 正在运行task Map<Integer, Long> taskStartTimeMap = new HashMap<Integer, Long>(); // 任务开始时间 long lastReportTimeStamp = 0; Communication lastTaskGroupContainerCommunication = new Communication(); while (true) { // 1.判断task状态 boolean failedOrKilled = false; Map<Integer, Communication> communicationMap = 
containerCommunicator.getCommunicationMap(); for (Map.Entry<Integer, Communication> entry : communicationMap.entrySet()) { Integer taskId = entry.getKey(); Communication taskCommunication = entry.getValue(); if (!taskCommunication.isFinished()) { continue; } TaskExecutor taskExecutor = removeTask(runTasks, taskId); // 上面从runTasks里移除了,因此对应在monitor里移除 taskMonitor.removeTask(taskId); // 失败,看task是否支持failover,重试次数未超过最大限制 if (taskCommunication.getState() == State.FAILED) { taskFailedExecutorMap.put(taskId, taskExecutor); if (taskExecutor.supportFailOver() && taskExecutor.getAttemptCount() < taskMaxRetryTimes) { taskExecutor.shutdown(); // 关闭老的executor containerCommunicator.resetCommunication(taskId); // 将task的状态重置 Configuration taskConfig = taskConfigMap.get(taskId); taskQueue.add(taskConfig); // 重新加入任务列表 } else { failedOrKilled = true; break; } } else if (taskCommunication.getState() == State.KILLED) { failedOrKilled = true; break; } else if (taskCommunication.getState() == State.SUCCEEDED) { Long taskStartTime = taskStartTimeMap.get(taskId); if (taskStartTime != null) { Long usedTime = System.currentTimeMillis() - taskStartTime; LOG.info( "taskGroup[{}] taskId[{}] is successed, used[{}]ms", this.taskGroupId, taskId, usedTime); // usedTime*1000*1000 转换成PerfRecord记录的ns,这里主要是简单登记,进行最长任务的打印。因此增加特定静态方法 PerfRecord.addPerfRecord( taskGroupId, taskId, PerfRecord.PHASE.TASK_TOTAL, taskStartTime, usedTime * 1000L * 1000L); taskStartTimeMap.remove(taskId); taskConfigMap.remove(taskId); } } } // 2.发现该taskGroup下taskExecutor的总状态失败则汇报错误 if (failedOrKilled) { lastTaskGroupContainerCommunication = reportTaskGroupCommunication( lastTaskGroupContainerCommunication, taskCountInThisTaskGroup); throw DataXException.asDataXException( FrameworkErrorCode.PLUGIN_RUNTIME_ERROR, lastTaskGroupContainerCommunication.getThrowable()); } // 3.有任务未执行,且正在运行的任务数小于最大通道限制 Iterator<Configuration> iterator = taskQueue.iterator(); while (iterator.hasNext() && runTasks.size() < channelNumber) { Configuration 
taskConfig = iterator.next(); Integer taskId = taskConfig.getInt(CoreConstant.TASK_ID); int attemptCount = 1; TaskExecutor lastExecutor = taskFailedExecutorMap.get(taskId); if (lastExecutor != null) { attemptCount = lastExecutor.getAttemptCount() + 1; long now = System.currentTimeMillis(); long failedTime = lastExecutor.getTimeStamp(); if (now - failedTime < taskRetryIntervalInMsec) { // 未到等待时间,继续留在队列 continue; } if (!lastExecutor.isShutdown()) { // 上次失败的task仍未结束 if (now - failedTime > taskMaxWaitInMsec) { markCommunicationFailed(taskId); reportTaskGroupCommunication( lastTaskGroupContainerCommunication, taskCountInThisTaskGroup); throw DataXException.asDataXException( CommonErrorCode.WAIT_TIME_EXCEED, "task failover等待超时"); } else { lastExecutor.shutdown(); // 再次尝试关闭 continue; } } else { LOG.info( "taskGroup[{}] taskId[{}] attemptCount[{}] has already shutdown", this.taskGroupId, taskId, lastExecutor.getAttemptCount()); } } Configuration taskConfigForRun = taskMaxRetryTimes > 1 ? taskConfig.clone() : taskConfig; TaskExecutor taskExecutor = new TaskExecutor(taskConfigForRun, attemptCount); taskStartTimeMap.put(taskId, System.currentTimeMillis()); taskExecutor.doStart(); iterator.remove(); runTasks.add(taskExecutor); // 上面,增加task到runTasks列表,因此在monitor里注册。 taskMonitor.registerTask(taskId, this.containerCommunicator.getCommunication(taskId)); taskFailedExecutorMap.remove(taskId); LOG.info( "taskGroup[{}] taskId[{}] attemptCount[{}] is started", this.taskGroupId, taskId, attemptCount); } // 4.任务列表为空,executor已结束, 搜集状态为success--->成功 if (taskQueue.isEmpty() && isAllTaskDone(runTasks) && containerCommunicator.collectState() == State.SUCCEEDED) { // 成功的情况下,也需要汇报一次。否则在任务结束非常快的情况下,采集的信息将会不准确 lastTaskGroupContainerCommunication = reportTaskGroupCommunication( lastTaskGroupContainerCommunication, taskCountInThisTaskGroup); LOG.info("taskGroup[{}] completed it's tasks.", this.taskGroupId); break; } // 5.如果当前时间已经超出汇报时间的interval,那么我们需要马上汇报 long now = System.currentTimeMillis(); if 
(now - lastReportTimeStamp > reportIntervalInMillSec) { lastTaskGroupContainerCommunication = reportTaskGroupCommunication( lastTaskGroupContainerCommunication, taskCountInThisTaskGroup); lastReportTimeStamp = now; // taskMonitor对于正在运行的task,每reportIntervalInMillSec进行检查 for (TaskExecutor taskExecutor : runTasks) { taskMonitor.report( taskExecutor.getTaskId(), this.containerCommunicator.getCommunication(taskExecutor.getTaskId())); } } Thread.sleep(sleepIntervalInMillSec); } // 6.最后还要汇报一次 reportTaskGroupCommunication(lastTaskGroupContainerCommunication, taskCountInThisTaskGroup); } catch (Throwable e) { Communication nowTaskGroupContainerCommunication = this.containerCommunicator.collect(); if (nowTaskGroupContainerCommunication.getThrowable() == null) { nowTaskGroupContainerCommunication.setThrowable(e); } nowTaskGroupContainerCommunication.setState(State.FAILED); this.containerCommunicator.report(nowTaskGroupContainerCommunication); throw DataXException.asDataXException(FrameworkErrorCode.RUNTIME_ERROR, e); } finally { if (!PerfTrace.getInstance().isJob()) { // 最后打印cpu的平均消耗,GC的统计 VMInfo vmInfo = VMInfo.getVmInfo(); if (vmInfo != null) { vmInfo.getDelta(false); LOG.info(vmInfo.totalString()); } LOG.info(PerfTrace.getInstance().summarizeNoException()); } } }
@Test public void testNormal() throws Exception { // register task long ttl = System.currentTimeMillis(); Communication communication1 = new Communication(); taskMonitor.registerTask(1, communication1); TaskMonitor.TaskCommunication taskCommunication1 = taskMonitor.getTaskCommunication(1); Assert.assertEquals(taskCommunication1.getLastAllReadRecords(), 0L); Assert.assertEquals(this.tasks.size(), 1); Assert.assertTrue(taskCommunication1.getLastUpdateComunicationTS() >= ttl); Assert.assertTrue(taskCommunication1.getTtl() >= ttl); // report 没有任何变化的communication long oldTS = taskCommunication1.getLastUpdateComunicationTS(); long oldTTL = taskCommunication1.getTtl(); Thread.sleep(1000); taskMonitor.report(1, communication1); TaskMonitor.TaskCommunication taskCommunication1_1 = taskMonitor.getTaskCommunication(1); Assert.assertEquals(taskCommunication1_1.getLastAllReadRecords(), 0L); Assert.assertEquals(taskCommunication1_1.getLastUpdateComunicationTS(), oldTS); Assert.assertTrue(taskCommunication1_1.getTtl() > oldTTL); // report 已经finish的communication Communication communication2 = new Communication(); communication2.setState(State.KILLED); taskMonitor.registerTask(2, communication2); Assert.assertEquals(this.tasks.size(), 1); // report 另一个communication Communication communication3 = new Communication(); taskMonitor.registerTask(3, communication3); Assert.assertEquals(this.tasks.size(), 2); System.out.println(this.tasks); // report communication ttl = System.currentTimeMillis(); communication1.setLongCounter(CommunicationTool.READ_SUCCEED_RECORDS, 100); communication3.setLongCounter(CommunicationTool.READ_FAILED_RECORDS, 10); taskMonitor.report(1, communication1); taskMonitor.report(3, communication3); taskCommunication1 = taskMonitor.getTaskCommunication(1); Assert.assertEquals(taskCommunication1.getLastAllReadRecords(), 100L); Assert.assertEquals(this.tasks.size(), 2); Assert.assertTrue(taskCommunication1.getLastUpdateComunicationTS() >= ttl); 
Assert.assertTrue(taskCommunication1.getTtl() >= ttl); TaskMonitor.TaskCommunication taskCommunication3 = taskMonitor.getTaskCommunication(3); Assert.assertEquals(taskCommunication3.getLastAllReadRecords(), 10L); Assert.assertEquals(this.tasks.size(), 2); Assert.assertTrue(taskCommunication3.getLastUpdateComunicationTS() >= ttl); Assert.assertTrue(taskCommunication3.getTtl() >= ttl); // 继续report ttl = System.currentTimeMillis(); communication1.setLongCounter(CommunicationTool.READ_SUCCEED_RECORDS, 1001); communication3.setLongCounter(CommunicationTool.READ_FAILED_RECORDS, 101); taskMonitor.report(1, communication1); taskMonitor.report(3, communication3); taskCommunication1 = taskMonitor.getTaskCommunication(1); Assert.assertEquals(taskCommunication1.getLastAllReadRecords(), 1001L); Assert.assertEquals(this.tasks.size(), 2); Assert.assertTrue(taskCommunication1.getLastUpdateComunicationTS() >= ttl); Assert.assertTrue(taskCommunication1.getTtl() >= ttl); taskCommunication3 = taskMonitor.getTaskCommunication(3); Assert.assertEquals(taskCommunication3.getLastAllReadRecords(), 101L); Assert.assertEquals(this.tasks.size(), 2); Assert.assertTrue(taskCommunication3.getLastUpdateComunicationTS() >= ttl); Assert.assertTrue(taskCommunication3.getTtl() >= ttl); // 设置EXPIRED_TIME Field EXPIRED_TIME = taskMonitor.getClass().getDeclaredField("EXPIRED_TIME"); EXPIRED_TIME.setAccessible(true); EXPIRED_TIME.set(null, 1000); Thread.sleep(2000); // 超时没有变更 taskMonitor.report(1, communication1); System.out.println(communication1.getCounter()); System.out.println(communication1.getThrowable()); System.out.println(communication1.getThrowableMessage()); System.out.println(communication1.getState()); Assert.assertTrue(communication1.getThrowableMessage().contains("任务hung住,Expired")); Assert.assertEquals(communication1.getState(), State.FAILED); // communicatio1 已经fail, communication3 在超时后进行变更,update正常 ttl = System.currentTimeMillis(); 
communication1.setLongCounter(CommunicationTool.READ_SUCCEED_RECORDS, 2001); communication3.setLongCounter(CommunicationTool.READ_FAILED_RECORDS, 201); taskMonitor.report(1, communication1); taskMonitor.report(3, communication3); taskCommunication1 = taskMonitor.getTaskCommunication(1); Assert.assertEquals(taskCommunication1.getLastAllReadRecords(), 1001L); Assert.assertEquals(this.tasks.size(), 2); Assert.assertTrue(communication1.getThrowableMessage().contains("任务hung住,Expired")); Assert.assertEquals(communication1.getState(), State.FAILED); taskCommunication3 = taskMonitor.getTaskCommunication(3); Assert.assertEquals(taskCommunication3.getLastAllReadRecords(), 201L); Assert.assertEquals(this.tasks.size(), 2); Assert.assertTrue(taskCommunication3.getLastUpdateComunicationTS() >= ttl); Assert.assertTrue(taskCommunication3.getTtl() >= ttl); // remove 1 taskMonitor.removeTask(1); Assert.assertEquals(this.tasks.size(), 1); // remove 3 taskMonitor.removeTask(3); Assert.assertEquals(this.tasks.size(), 0); // 没有register communication3 直接report ttl = System.currentTimeMillis(); communication3.setLongCounter(CommunicationTool.READ_FAILED_RECORDS, 301); taskMonitor.report(3, communication3); taskCommunication3 = taskMonitor.getTaskCommunication(3); Assert.assertEquals(taskCommunication3.getLastAllReadRecords(), 301L); Assert.assertEquals(this.tasks.size(), 1); Assert.assertTrue(taskCommunication3.getLastUpdateComunicationTS() >= ttl); Assert.assertTrue(taskCommunication3.getTtl() >= ttl); }
/**
 * Sets the state of the given task's communication to {@code State.FAILED}.
 *
 * @param taskId id used to look the communication up through {@code containerCommunicator}
 */
private void markCommunicationFailed(Integer taskId) {
    containerCommunicator.getCommunication(taskId).setState(State.FAILED);
}