// Check whether each job needs merging, exporting, or resetting.
// Polling every job in a single pass like this wastes resources on the Master.
// Possible improvements:
// 1. register listeners on job properties (Listener pattern)
// 2. use the Observer pattern
protected void mergeAndExportJobs() {
    Iterator<Map.Entry<String, Job>> iter = jobs.entrySet().iterator();

    while (iter.hasNext()) {
        Job job = iter.next().getValue();

        if (job.getRebuildTag() == 2) {
            job.rebuild(0, null, this);
            continue;
        }

        if (!job.getJobTimeOut().get()) {
            // The job's completed tasks need to be merged.
            if (!job.isMerging().get() && job.needMerge()) {
                logger.warn("job " + job.getJobName()
                        + " complete tasks:" + job.getCompletedTaskCount().get()
                        + ", merged tasks :" + job.getMergedTaskCount().get());

                final Job j = job;
                final BlockingQueue<JobMergedResult> branchResultQueue =
                        branchResultQueuePool.get(j.getJobName());
                final BlockingQueue<JobTaskResult> jobTaskResultsQueue =
                        jobTaskResultsQueuePool.get(j.getJobName());

                if (j.isMerging().compareAndSet(false, true))
                    eventProcessThreadPool.execute(new Runnable() {
                        public void run() {
                            try {
                                jobResultMerger.merge(j, branchResultQueue, jobTaskResultsQueue, true);
                            } catch (Throwable e) {
                                logger.error(e);
                            } finally {
                                j.isMerging().set(false);
                            }
                        }
                    });
            }
        } else {
            // Check whether any thread is still merging into the trunk;
            // if not, the merged flag can be set.
            boolean gotIt = job.getTrunkLock().writeLock().tryLock();
            if (gotIt) {
                try {
                    if (!job.isMerged().get()) {
                        List<Map<String, Map<String, Object>>> mergeResults =
                                new ArrayList<Map<String, Map<String, Object>>>();
                        new MergeJobOperation(job, 0, mergeResults, config,
                                branchResultQueuePool.get(job.getJobName())).run();
                        job.isMerged().set(true);
                        logger.warn("job timed out, final trunk merge succeeded!");
                    }
                } finally {
                    job.getTrunkLock().writeLock().unlock();
                }
            }
        }

        // The job's data needs to be exported.
        if (!job.isExporting().get() && job.needExport()) {
            final Job j = job;

            if (j.isExporting().compareAndSet(false, true))
                eventProcessThreadPool.execute(new Runnable() {
                    public void run() {
                        try {
                            // Runs in a worker thread, but the export itself is still blocking.
                            jobExporter.exportReport(j, false);
                            j.isExported().set(true);
                        } catch (Throwable e) {
                            logger.error(e);
                        } finally {
                            j.isExporting().set(false);
                        }

                        // Check whether intermediate results should be exported; kept outside the
                        // try/finally so it does not hold up the next round of processing.
                        exportOrCleanTrunk(j);
                    }
                });

            if (job.getRebuildTag() == -1) {
                job.rebuild(0, null, this);
                iter.remove();
            }
            if (job.getRebuildTag() == 1) {
                job.rebuild(0, null, this);
            }
        }

        // Check the job's processing time; if it exceeds the limit, set the job's timeout flag.
        job.checkJobTimeOut();

        // Check whether the job needs to be reset.
        if (job.needReset()) {
            if (logger.isWarnEnabled())
                logger.warn("job " + job.getJobName() + " will be reset now.");

            StringBuilder sb = new StringBuilder(ReportUtil.MASTER_LOG)
                    .append(",").append(System.currentTimeMillis()).append(",");
            sb.append(job.getEpoch()).append(",")
                    .append(job.getJobName()).append(",")
                    .append(System.currentTimeMillis() - job.getStartTime()).append(",")
                    .append(job.getJobMergeTime().get()).append(",")
                    .append(job.getJobExportTime()).append(",")
                    .append(job.getTaskCount()).append(",")
                    .append(job.getCompletedTaskCount().get()).append(",")
                    .append(job.getMergedTaskCount().get()).append(",")
                    .append(job.getJobMergeBranchCount().get());
            ReportUtil.clusterLog(sb.toString());

            job.reset(this);

            if (logger.isInfoEnabled()) {
                sb = new StringBuilder("jobManager:{jobs:").append(jobs.size())
                        .append(",jobTaskPool:").append(jobTaskPool.size());
                sb.append(",statusPool:").append(statusPool.size())
                        .append(",undoTasks:").append(undoTaskQueue.size())
                        .append("}");
                logger.info(sb.toString());
            }

            List<JobTask> tasks = job.getJobTasks();
            for (JobTask task : tasks) {
                statusPool.put(task.getTaskId(), task.getStatus());
            }
        }
    }
}
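// A minimal sketch (an assumption, not part of the original code) of the Listener-style
// improvement suggested in the comment above mergeAndExportJobs(): instead of the Master
// polling every job on each pass, a Job would notify a listener when its state changes,
// so merging/exporting is only triggered for jobs that actually made progress.
// The interface name and callback methods below are hypothetical.
protected interface JobStateListener {

    // fired after a slave result is accepted and the completed-task counter advances
    void onTaskCompleted(Job job);

    // fired by the timeout checker once the job exceeds its processing window
    void onJobTimeout(Job job);
}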
// Task assignment and result submission are handled by a single thread, so the status
// pool itself needs no concurrency control; the more expensive send operations are
// handed off to the multi-threaded ServerConnector.
@Override
public void addTaskResultToQueue(SendResultsRequestEvent jobResponseEvent) {

    JobTaskResult jobTaskResult = jobResponseEvent.getJobTaskResult();

    if (jobTaskResult.getTaskIds() != null && jobTaskResult.getTaskIds().size() > 0) {
        // Check whether this is stale data from old tasks, judged by the create time of the
        // task and the task result. To be extended later: if the current epoch < the result's
        // epoch, this node is probably a subordinate master doing the reduce work but falling behind.
        if (jobTaskPool.get(jobTaskResult.getTaskIds().get(0)) == null) {
            // Unknown task: acknowledge and return before the epoch check below dereferences it.
            logger.error("jobTask is null " + jobTaskResult.getTaskIds().get(0));
            masterNode.echoSendJobTaskResults(jobResponseEvent.getSequence(), "success",
                    jobResponseEvent.getChannel());
            return;
        }

        if (jobTaskResult.getJobEpoch()
                != jobTaskPool.get(jobTaskResult.getTaskIds().get(0)).getJobEpoch()) {

            if (jobTaskResult.getJobEpoch()
                    < jobTaskPool.get(jobTaskResult.getTaskIds().get(0)).getJobEpoch()) {
                logger.error("old task result will be discarded! job:"
                        + jobTaskPool.get(jobTaskResult.getTaskIds().get(0)).getJobName()
                        + ",epoch:" + jobTaskResult.getJobEpoch()
                        + ",slave:" + jobResponseEvent.getChannel());
                masterNode.echoSendJobTaskResults(jobResponseEvent.getSequence(), "success",
                        jobResponseEvent.getChannel());
                return;
            } else {
                // Allow some tolerance time for the job to be reset (15000 ms here).
                jobs.get(jobTaskPool.get(jobTaskResult.getTaskIds().get(0)).getJobName())
                        .blockToResetJob(15000);

                if (jobTaskResult.getJobEpoch()
                        > jobTaskPool.get(jobTaskResult.getTaskIds().get(0)).getJobEpoch()) {
                    logger.error("other master can't merge in time! job:"
                            + jobTaskPool.get(jobTaskResult.getTaskIds().get(0)).getJobName());
                    masterNode.echoSendJobTaskResults(jobResponseEvent.getSequence(), "success",
                            jobResponseEvent.getChannel());
                    return;
                }
            }
        }

        if (logger.isWarnEnabled()) {
            StringBuilder ts = new StringBuilder("Received slave analysis result, jobTaskIds : ")
                    .append(jobTaskResult.toString()).append(", ")
                    .append(jobTaskResult.getTaskIds().size());
            logger.warn(ts.toString());
        }

        // Put the result into the queue first, to guard against a low-probability
        // multi-threading race.
        jobTaskResultsQueuePool
                .get(jobTaskPool.get(jobTaskResult.getTaskIds().get(0)).getJobName())
                .offer(jobTaskResult);

        for (int i = 0; i < jobTaskResult.getTaskIds().size(); i++) {
            String taskId = jobTaskResult.getTaskIds().get(i);
            JobTask jobTask = jobTaskPool.get(taskId);

            if (jobTask == null) {
                logger.error(new StringBuilder("taskId :").append(taskId)
                        .append(" not exist!").toString());
                continue;
            }

            Job job = jobs.get(jobTask.getJobName());
            if (job == null) {
                logger.error(new StringBuilder("job :").append(jobTask.getJobName())
                        .append(" not exist!").toString());
                continue;
            }

            if (statusPool.replace(taskId, JobTaskStatus.DOING, JobTaskStatus.DONE)
                    || statusPool.replace(taskId, JobTaskStatus.UNDO, JobTaskStatus.DONE)) {
                logger.info("task " + jobTask.getTaskId() + " of job " + job.getJobName() + " done");
                jobTask.setStatus(JobTaskStatus.DONE);
                jobTask.setEndTime(System.currentTimeMillis());
                jobTask.setLastMergedEpoch(job.getEpoch().get());
                job.getCompletedTaskCount().incrementAndGet();
            }

            // Record execution metrics for this jobTask.
            StringBuilder log = new StringBuilder(ReportUtil.SLAVE_LOG).append(",")
                    .append(System.currentTimeMillis()).append(",")
                    .append(job.getEpoch()).append(",");
            log.append(jobTask.getJobName()).append(",")
                    .append(jobTask.getTaskId()).append(",")
                    .append(jobTask.getRecycleCounter().get()).append(",")
                    .append(jobTaskResult.getSlaveIp()).append(",")
                    .append(jobTaskResult.getEfficiency()).append(",");

            JobTaskExecuteInfo executeInfo =
                    jobTaskResult.getTaskExecuteInfos().get(jobTask.getTaskId());

            if (executeInfo != null)
                log.append(executeInfo.getAnalysisConsume()).append(",")
                        .append(executeInfo.getJobDataSize()).append(",")
                        .append(executeInfo.getTotalLine()).append(",")
                        .append(executeInfo.getErrorLine()).append(",")
                        .append(executeInfo.getEmptyLine());
            else
                logger.error(new StringBuilder().append("taskId : ").append(jobTask.getTaskId())
                        .append(" executeInfo is null!").toString());

            ReportUtil.clusterLog(log.toString());
        }
    }

    // Send the response asynchronously if configured to, reducing the latency of
    // jobManager event handling.
    if (config.isUseAsynModeToSendResponse()) {
        final String sequence = jobResponseEvent.getSequence();
        final Object channel = jobResponseEvent.getChannel();

        eventProcessThreadPool.execute(new Runnable() {
            public void run() {
                try {
                    masterNode.echoSendJobTaskResults(sequence, "success", channel);
                } catch (Throwable e) {
                    logger.error(e);
                }
            }
        });
    } else
        masterNode.echoSendJobTaskResults(jobResponseEvent.getSequence(), "success",
                jobResponseEvent.getChannel());
}
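// Hypothetical helper (a sketch, not in the original class) that makes explicit the
// compare-and-set style status transition used above: ConcurrentMap.replace(key, expected, update)
// only succeeds when the current value equals the expected one, so a duplicate result for a task
// already marked DONE does not increment the completed-task counter again. The method name is
// illustrative only.
private boolean markTaskDone(String taskId, JobTask jobTask, Job job) {
    boolean transitioned = statusPool.replace(taskId, JobTaskStatus.DOING, JobTaskStatus.DONE)
            || statusPool.replace(taskId, JobTaskStatus.UNDO, JobTaskStatus.DONE);
    if (transitioned) {
        jobTask.setStatus(JobTaskStatus.DONE);
        jobTask.setEndTime(System.currentTimeMillis());
        jobTask.setLastMergedEpoch(job.getEpoch().get());
        job.getCompletedTaskCount().incrementAndGet();
    }
    return transitioned;
}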