@Override public synchronized void checkJobStatus() throws AnalysisException { // 通过外部事件激发重新载入配置 if (jobBuilder.isNeedRebuild()) { if (logger.isInfoEnabled()) { logger.info("check job status need to rebuild"); } jobs = jobBuilder.rebuild(jobs); if (jobs == null || (jobs != null && jobs.size() == 0)) throw new AnalysisException("jobs should not be empty!"); } checkTaskStatus(); mergeAndExportJobs(); // 任务全部完成并且没有新加任务的情况下,休息1s for (Job job : jobs.values()) { if (!job.isExported().get() || job.getRebuildTag() == 2) { return; } else { try { Thread.sleep(1000); } catch (InterruptedException e) { logger.error(e); } } } }
/**
 * Clears the accumulated in-memory result of the named job, if present.
 * Unknown job names are silently ignored.
 *
 * @param jobName name of the job whose result data should be dropped
 */
@Override
public void clearJobData(String jobName) {
  final Job target = jobs.get(jobName);
  if (target == null) {
    return;
  }
  target.getJobResult().clear();
  if (logger.isWarnEnabled()) {
    logger.warn("clear job :" + target.getJobName() + " data.");
  }
}
@Override public void init() throws AnalysisException { // 获得任务数量 jobBuilder.setConfig(config); jobExporter.setConfig(config); jobResultMerger.setConfig(config); jobBuilder.init(); jobExporter.init(); jobResultMerger.init(); jobs = jobBuilder.build(); for (Job job : jobs.values()) { job.reset(null); } if (jobs == null || (jobs != null && jobs.size() == 0)) throw new AnalysisException("jobs should not be empty!"); jobTaskPool = new ConcurrentHashMap<String, JobTask>(); undoTaskQueue = new LinkedBlockingDeque<JobTask>(); statusPool = new ConcurrentHashMap<String, JobTaskStatus>(); jobTaskResultsQueuePool = new HashMap<String, BlockingQueue<JobTaskResult>>(); branchResultQueuePool = new HashMap<String, BlockingQueue<JobMergedResult>>(); for (String jobName : jobs.keySet()) { jobTaskResultsQueuePool.put(jobName, new LinkedBlockingQueue<JobTaskResult>()); branchResultQueuePool.put(jobName, new LinkedBlockingQueue<JobMergedResult>()); } eventProcessThreadPool = new ThreadPoolExecutor( this.config.getMaxJobEventWorker(), this.config.getMaxJobEventWorker(), 0, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>(), new NamedThreadFactory("jobManagerEventProcess_worker")); masterDataRecoverWorker = new MasterDataRecoverWorker( config.getMasterName(), config.getTempStoreDataDir(), jobs, this.config); masterDataRecoverWorker.start(); addJobsToPool(); if (logger.isInfoEnabled()) logger.info("jobManager init end, MaxJobEventWorker size : " + config.getMaxJobEventWorker()); }
// 重新增加任务到任务池中 protected void addJobsToPool() { for (Job job : jobs.values()) { List<JobTask> tasks = job.getJobTasks(); for (JobTask task : tasks) { jobTaskPool.put(task.getTaskId(), task); statusPool.put(task.getTaskId(), task.getStatus()); undoTaskQueue.offer(task); } if (jobTaskResultsQueuePool.get(job.getJobName()) == null) jobTaskResultsQueuePool.put(job.getJobName(), new LinkedBlockingQueue<JobTaskResult>()); if (branchResultQueuePool.get(job.getJobName()) == null) branchResultQueuePool.put(job.getJobName(), new LinkedBlockingQueue<JobMergedResult>()); } }
@Override public void releaseResource() { stopped = true; try { // 导出所有结果,暂时不导出中间data,后面看是否需要 // 添加中间结果导出,不导出中间结果,会有部分数据丢失 if (jobs != null) for (Job j : jobs.values()) { // 结果导出不重要,可以考虑去掉 while (!j.getTrunkExported().get()) Thread.sleep(3000); if (!j.isExported().get()) { jobExporter.exportReport(j, false); logger.info("releaseResouce now, export job : " + j.getJobName()); } } if (eventProcessThreadPool != null) eventProcessThreadPool.shutdown(); if (masterDataRecoverWorker != null) masterDataRecoverWorker.stopWorker(); } catch (Throwable e) { logger.error("error when stop the node", e); } finally { if (jobs != null) jobs.clear(); if (jobTaskPool != null) jobTaskPool.clear(); if (undoTaskQueue != null) undoTaskQueue.clear(); if (statusPool != null) statusPool.clear(); if (jobTaskResultsQueuePool != null) jobTaskResultsQueuePool.clear(); if (branchResultQueuePool != null) branchResultQueuePool.clear(); if (jobBuilder != null) jobBuilder.releaseResource(); if (jobExporter != null) jobExporter.releaseResource(); if (jobResultMerger != null) jobResultMerger.releaseResource(); logger.info("jobManager releaseResource end"); } }
/** * 在导出数据以后,判断是否需要清空主干,是否需要导出主干 * * @param job */ protected void exportOrCleanTrunk(Job job) { boolean needToSetJobResultNull = false; // 判断是否到了报表的有效时间段,支持小时,日,月三种方式 if (job.getJobConfig().getReportPeriodDefine().equals(AnalysisConstants.REPORT_PERIOD_DAY)) { Calendar calendar = Calendar.getInstance(); int now = calendar.get(Calendar.DAY_OF_MONTH); if (job.getReportPeriodFlag() != -1 && now != job.getReportPeriodFlag()) needToSetJobResultNull = true; job.setReportPeriodFlag(now); } else { if (job.getJobConfig().getReportPeriodDefine().equals(AnalysisConstants.REPORT_PERIOD_HOUR)) { Calendar calendar = Calendar.getInstance(); int now = calendar.get(Calendar.HOUR_OF_DAY); if (job.getReportPeriodFlag() != -1 && now != job.getReportPeriodFlag()) needToSetJobResultNull = true; job.setReportPeriodFlag(now); } else { if (job.getJobConfig() .getReportPeriodDefine() .equals(AnalysisConstants.REPORT_PERIOD_MONTH)) { Calendar calendar = Calendar.getInstance(); int now = calendar.get(Calendar.MONTH); if (job.getReportPeriodFlag() != -1 && now != job.getReportPeriodFlag()) needToSetJobResultNull = true; job.setReportPeriodFlag(now); } } } if (needToSetJobResultNull) { job.setJobResult(null); job.getEpoch().set(0); // 删除临时文件,防止重复载入使得清空不生效 if (config.getSaveTmpResultToFile()) { JobDataOperation jobDataOperation = new JobDataOperation(job, AnalysisConstants.JOBMANAGER_EVENT_DEL_DATAFILE, this.config); jobDataOperation.run(); } if (logger.isWarnEnabled()) logger.warn("job " + job.getJobName() + " report data be reset.it's a new start. 
"); } // 清除主干数据,到时候自然会载入 if (config.getSaveTmpResultToFile() && (job.getJobConfig().getSaveTmpResultToFile() == null || job.getJobConfig().getSaveTmpResultToFile())) { logger.warn("@disk2Mem mode: start " + job.getJobName() + " store trunk to disk now ."); JobDataOperation jobDataOperation = new JobDataOperation( job, AnalysisConstants.JOBMANAGER_EVENT_SETNULL_EXPORTDATA, this.config); jobDataOperation.run(); } else { if (job.getLastExportTime() == 0 || System.currentTimeMillis() - job.getLastExportTime() >= config.getExportInterval() || stopped) { logger.warn("export job: " + job.getJobName() + " trunk to disk."); JobDataOperation jobDataOperation = new JobDataOperation(job, AnalysisConstants.JOBMANAGER_EVENT_EXPORTDATA, this.config); jobDataOperation.run(); } } }
// Performs the merge, export and reset checks for every job.
// All jobs are polled together, which wastes some resources on the Master.
// Possible improvements:
// 1. attach listeners to job properties (Listener pattern)
// 2. use the Observer pattern
protected void mergeAndExportJobs() {
  Iterator<Map.Entry<String, Job>> iter = jobs.entrySet().iterator();
  while (iter.hasNext()) {
    Job job = iter.next().getValue();
    // rebuildTag == 2 appears to mean "full rebuild pending" — the job is rebuilt
    // and skipped for this round. TODO confirm tag semantics against Job.
    if (job.getRebuildTag() == 2) {
      job.rebuild(0, null, this);
      continue;
    }
    if (!job.getJobTimeOut().get()) {
      // This job's completed tasks need to be merged.
      if (!job.isMerging().get() && job.needMerge()) {
        logger.warn(
            "job "
                + job.getJobName()
                + " complete tasks:"
                + job.getCompletedTaskCount().get()
                + ", merged tasks :"
                + job.getMergedTaskCount().get());
        final Job j = job;
        final BlockingQueue<JobMergedResult> branchResultQueue =
            branchResultQueuePool.get(j.getJobName());
        final BlockingQueue<JobTaskResult> jobTaskResultsQueue =
            jobTaskResultsQueuePool.get(j.getJobName());
        // CAS guards against two pollers scheduling a merge for the same job.
        if (j.isMerging().compareAndSet(false, true))
          eventProcessThreadPool.execute(
              new Runnable() {
                public void run() {
                  try {
                    jobResultMerger.merge(j, branchResultQueue, jobTaskResultsQueue, true);
                  } catch (Throwable e) {
                    logger.error(e);
                  } finally {
                    // Always release the merging flag, even on failure.
                    j.isMerging().set(false);
                  }
                }
              });
      }
    } else {
      // Job timed out: if no thread is still merging into the trunk (write lock
      // acquired), do one last merge and set the completion flag.
      boolean gotIt = job.getTrunkLock().writeLock().tryLock();
      if (gotIt) {
        try {
          if (!job.isMerged().get()) {
            List<Map<String, Map<String, Object>>> mergeResults =
                new ArrayList<Map<String, Map<String, Object>>>();
            new MergeJobOperation(
                    job, 0, mergeResults, config, branchResultQueuePool.get(job.getJobName()))
                .run();
            job.isMerged().set(true);
            logger.warn("job is timeout, last merge trunk success!");
          }
        } finally {
          job.getTrunkLock().writeLock().unlock();
        }
      }
    }
    // This job's data needs to be exported.
    if (!job.isExporting().get() && job.needExport()) {
      final Job j = job;
      // CAS guards against scheduling a duplicate export.
      if (j.isExporting().compareAndSet(false, true))
        eventProcessThreadPool.execute(
            new Runnable() {
              public void run() {
                try {
                  // Runs on the pool, but the export itself is done blocking.
                  jobExporter.exportReport(j, false);
                  j.isExported().set(true);
                } catch (Throwable e) {
                  logger.error(e);
                } finally {
                  j.isExporting().set(false);
                }
                // Decide whether intermediate results should be exported/cleared;
                // kept outside the try so it cannot delay the next round.
                exportOrCleanTrunk(j);
              }
            });
      // rebuildTag == -1 appears to mean "remove after rebuild", == 1 "rebuild in
      // place" — TODO confirm against the Job rebuild contract.
      if (job.getRebuildTag() == -1) {
        job.rebuild(0, null, this);
        iter.remove();
      }
      if (job.getRebuildTag() == 1) {
        job.rebuild(0, null, this);
      }
    }
    // Check processing time once; sets the job's timeout flag when exceeded.
    job.checkJobTimeOut();
    // Does the job need to be reset (new epoch)?
    if (job.needReset()) {
      if (logger.isWarnEnabled()) logger.warn("job " + job.getJobName() + " be reset now.");
      // Emit a cluster-log record with the epoch's timing and counter statistics.
      StringBuilder sb =
          new StringBuilder(ReportUtil.MASTER_LOG)
              .append(",")
              .append(System.currentTimeMillis())
              .append(",");
      sb.append(job.getEpoch())
          .append(",")
          .append(job.getJobName())
          .append(",")
          .append(System.currentTimeMillis() - job.getStartTime())
          .append(",")
          .append(job.getJobMergeTime().get())
          .append(",")
          .append(job.getJobExportTime())
          .append(",")
          .append(job.getTaskCount())
          .append(",")
          .append(job.getCompletedTaskCount().get())
          .append(",")
          .append(job.getMergedTaskCount().get())
          .append(",")
          .append(job.getJobMergeBranchCount().get());
      ReportUtil.clusterLog(sb.toString());
      job.reset(this);
      if (logger.isInfoEnabled()) {
        sb =
            new StringBuilder("jobManager:{jobs:")
                .append(jobs.size())
                .append(",jobTaskPool:" + jobTaskPool.size());
        sb.append(",statusPool:")
            .append(statusPool.size())
            .append(",undoTasks:")
            .append(undoTaskQueue.size())
            .append("}");
        logger.info(sb.toString());
      }
      // Re-publish the (reset) task statuses into the shared status pool.
      List<JobTask> tasks = job.getJobTasks();
      for (JobTask task : tasks) {
        statusPool.put(task.getTaskId(), task.getStatus());
      }
    }
  }
}
// 分配任务和结果提交处理由于是单线程处理, // 因此本身不用做状态池并发控制,将消耗较多的发送操作交给ServerConnector多线程操作 @Override public void addTaskResultToQueue(SendResultsRequestEvent jobResponseEvent) { JobTaskResult jobTaskResult = jobResponseEvent.getJobTaskResult(); if (jobTaskResult.getTaskIds() != null && jobTaskResult.getTaskIds().size() > 0) { // 判断是否是过期的一些老任务数据,根据task和taskresult的createtime来判断 // 以后要扩展成为如果发现当前的epoch < 结果的epoch,表明这台可能是从属的master,负责reduce,但是速度跟不上了 if (jobTaskPool.get(jobTaskResult.getTaskIds().get(0)) == null) { logger.error("jobTask is null " + jobTaskResult.getTaskIds().get(0)); } if (jobTaskResult.getJobEpoch() != jobTaskPool.get(jobTaskResult.getTaskIds().get(0)).getJobEpoch()) { if (jobTaskResult.getJobEpoch() < jobTaskPool.get(jobTaskResult.getTaskIds().get(0)).getJobEpoch()) { logger.error( "old task result will be discard! job:" + jobTaskPool.get(jobTaskResult.getTaskIds().get(0)).getJobName() + ",epoch:" + jobTaskResult.getJobEpoch() + ",slave:" + jobResponseEvent.getChannel()); masterNode.echoSendJobTaskResults( jobResponseEvent.getSequence(), "success", jobResponseEvent.getChannel()); return; } else { // 给一定的容忍时间,暂时定为5秒 jobs.get(jobTaskPool.get(jobTaskResult.getTaskIds().get(0)).getJobName()) .blockToResetJob(15000); if (jobTaskResult.getJobEpoch() > jobTaskPool.get(jobTaskResult.getTaskIds().get(0)).getJobEpoch()) { logger.error( "otherMaster can't merge in time!job:" + jobTaskPool.get(jobTaskResult.getTaskIds().get(0)).getJobName()); masterNode.echoSendJobTaskResults( jobResponseEvent.getSequence(), "success", jobResponseEvent.getChannel()); return; } } } if (logger.isWarnEnabled()) { StringBuilder ts = new StringBuilder("Receive slave analysis result, jobTaskIds : ") .append(jobTaskResult.toString()) .append(", ") .append(jobTaskResult.getTaskIds().size()); logger.warn(ts.toString()); } // 先放入队列,防止小概率多线程并发问题 jobTaskResultsQueuePool .get(jobTaskPool.get(jobTaskResult.getTaskIds().get(0)).getJobName()) .offer(jobTaskResult); for (int i = 0; i < 
jobTaskResult.getTaskIds().size(); i++) { String taskId = jobTaskResult.getTaskIds().get(i); JobTask jobTask = jobTaskPool.get(taskId); if (jobTask == null) { logger.error( new StringBuilder("taskId :").append(taskId).append("not exist!").toString()); continue; } Job job = jobs.get(jobTask.getJobName()); if (job == null) { logger.error( new StringBuilder("job :") .append(jobTask.getJobName()) .append("not exist!") .toString()); continue; } if (statusPool.replace(taskId, JobTaskStatus.DOING, JobTaskStatus.DONE) || statusPool.replace(taskId, JobTaskStatus.UNDO, JobTaskStatus.DONE)) { logger.info("task " + jobTask.getJobName() + " of job " + job.getJobName() + " done"); jobTask.setStatus(JobTaskStatus.DONE); jobTask.setEndTime(System.currentTimeMillis()); jobTask.setLastMergedEpoch(job.getEpoch().get()); job.getCompletedTaskCount().incrementAndGet(); } // 对jobTask的执行结果打点 StringBuilder log = new StringBuilder(ReportUtil.SLAVE_LOG) .append(",") .append(System.currentTimeMillis()) .append(",") .append(job.getEpoch()) .append(","); log.append(jobTask.getJobName()) .append(",") .append(jobTask.getTaskId()) .append(",") .append(jobTask.getRecycleCounter().get()) .append(",") .append(jobTaskResult.getSlaveIp()) .append(",") .append(jobTaskResult.getEfficiency()) .append(","); JobTaskExecuteInfo executeInfo = jobTaskResult.getTaskExecuteInfos().get(jobTask.getTaskId()); if (executeInfo != null) log.append(executeInfo.getAnalysisConsume()) .append(",") .append(executeInfo.getJobDataSize()) .append(",") .append(executeInfo.getTotalLine()) .append(",") .append(executeInfo.getErrorLine()) .append(",") .append(executeInfo.getEmptyLine()); else logger.error( new StringBuilder() .append("taskId : ") .append(jobTask.getTaskId()) .append(" executeInfo is null!") .toString()); ReportUtil.clusterLog(log.toString()); } } // 是否需要用异步方式发送,减少对jobManager事件处理延时 if (config.isUseAsynModeToSendResponse()) { final String sequence = jobResponseEvent.getSequence(); final Object channel = 
jobResponseEvent.getChannel(); eventProcessThreadPool.execute( new Runnable() { public void run() { try { masterNode.echoSendJobTaskResults(sequence, "success", channel); } catch (Throwable e) { logger.error(e); } } }); } else masterNode.echoSendJobTaskResults( jobResponseEvent.getSequence(), "success", jobResponseEvent.getChannel()); }
// 分配任务和结果提交处理由于是单线程处理, // 因此本身不用做状态池并发控制,将消耗较多的发送操作交给ServerConnector多线程操作 @Override public void getUnDoJobTasks(GetTaskRequestEvent requestEvent) { String jobName = requestEvent.getJobName(); int jobCount = requestEvent.getRequestJobCount(); final List<JobTask> jobTasks = new ArrayList<JobTask>(); // 如果关闭,则直接返回一个空的JobTask的list给slave if (this.stopped) { masterNode.echoGetJobTasks(requestEvent.getSequence(), jobTasks, requestEvent.getChannel()); return; } // 指定job if (jobName != null && jobs.containsKey(jobName)) { Job job = jobs.get(jobName); List<JobTask> tasks = job.getJobTasks(); for (JobTask jobTask : tasks) { if (jobTask.getStatus().equals(JobTaskStatus.UNDO)) { if (statusPool.replace(jobTask.getTaskId(), JobTaskStatus.UNDO, JobTaskStatus.DOING)) { this.allocateTask(jobTask); jobTasks.add(jobTask); if (jobTasks.size() == jobCount) break; } } } } else { Iterator<JobTask> taskIter = undoTaskQueue.iterator(); while (taskIter.hasNext()) { // String taskId = taskIds.next(); // JobTask jobTask = jobTaskPool.get(taskId); JobTask jobTask = taskIter.next(); if (!jobTaskPool.keySet().contains(jobTask.getTaskId()) || jobs.get(jobTask.getJobName()).getEpoch().get() > jobTask.getJobEpoch() || jobs.get(jobTask.getJobName()).getJobTimeOut().get()) { taskIter.remove(); continue; } if (statusPool.get(jobTask.getTaskId()).equals(JobTaskStatus.UNDO)) { if (statusPool.replace(jobTask.getTaskId(), JobTaskStatus.UNDO, JobTaskStatus.DOING)) { this.allocateTask(jobTask); jobTasks.add(jobTask); taskIter.remove(); if (jobTasks.size() >= jobCount) break; } } else taskIter.remove(); } } // 是否需要用异步方式发送,减少对jobManager事件处理延时 if (config.isUseAsynModeToSendResponse()) { final String sequence = requestEvent.getSequence(); final Object channel = requestEvent.getChannel(); // 由于该操作比较慢,开线程执行,保证速度 eventProcessThreadPool.execute( new Runnable() { public void run() { try { masterNode.echoGetJobTasks(sequence, jobTasks, channel); } catch (Throwable e) { logger.error(e); } } }); } else 
masterNode.echoGetJobTasks(requestEvent.getSequence(), jobTasks, requestEvent.getChannel()); }