Пример #1
0
  /**
   * Runs one pass of the job-status check: rebuilds the job map when the builder requests it,
   * checks task status, then merges and exports jobs.
   *
   * <p>If every job reports itself exported and none carries rebuild tag 2, the method pauses once
   * for one second before returning. Previously the pause sat inside the per-job loop, so the
   * method slept one second per exported job while holding the monitor.
   *
   * @throws AnalysisException if a rebuild leaves the job map null or empty
   */
  @Override
  public synchronized void checkJobStatus() throws AnalysisException {

    // Reload the configuration when an external event has requested it.
    if (jobBuilder.isNeedRebuild()) {
      if (logger.isInfoEnabled()) {
        logger.info("check job status need to rebuild");
      }
      jobs = jobBuilder.rebuild(jobs);

      if (jobs == null || jobs.isEmpty()) {
        throw new AnalysisException("jobs should not be empty!");
      }
    }

    checkTaskStatus();

    mergeAndExportJobs();

    // With no jobs at all the original never slept; keep that behavior.
    if (jobs.isEmpty()) {
      return;
    }

    // Any job that is not yet exported, or is tagged 2 for rebuild, means there is still work:
    // return immediately so the caller can run the next pass without delay.
    for (Job job : jobs.values()) {
      if (!job.isExported().get() || job.getRebuildTag() == 2) {
        return;
      }
    }

    // Everything is finished and nothing new is pending: rest for one second, once.
    try {
      Thread.sleep(1000);
    } catch (InterruptedException e) {
      // Restore the flag so the thread that drives this method can observe the interruption.
      Thread.currentThread().interrupt();
      logger.error(e);
    }
  }
Пример #2
0
  /**
   * Clears the result held by the named job. Does nothing when no job is registered under that
   * name.
   *
   * @param jobName name of the job whose result should be cleared
   */
  @Override
  public void clearJobData(String jobName) {
    Job target = jobs.get(jobName);
    if (target == null) {
      return;
    }

    target.getJobResult().clear();

    if (logger.isWarnEnabled()) {
      logger.warn("clear job :" + target.getJobName() + " data.");
    }
  }
Пример #3
0
  /**
   * Initializes the job manager: configures and initializes the builder, exporter and merger,
   * builds and resets the jobs, creates the task/status/result pools and the event thread pool,
   * starts the master data recover worker, and finally adds all job tasks to the pools.
   *
   * @throws AnalysisException if the builder returns no jobs (null or empty map)
   */
  @Override
  public void init() throws AnalysisException {
    jobBuilder.setConfig(config);
    jobExporter.setConfig(config);
    jobResultMerger.setConfig(config);

    jobBuilder.init();
    jobExporter.init();
    jobResultMerger.init();

    jobs = jobBuilder.build();

    // Validate before iterating: checking after the loop turned a null build result into a
    // NullPointerException instead of the intended AnalysisException.
    if (jobs == null || jobs.isEmpty()) {
      throw new AnalysisException("jobs should not be empty!");
    }

    for (Job job : jobs.values()) {
      job.reset(null);
    }

    jobTaskPool = new ConcurrentHashMap<String, JobTask>();
    undoTaskQueue = new LinkedBlockingDeque<JobTask>();
    statusPool = new ConcurrentHashMap<String, JobTaskStatus>();
    jobTaskResultsQueuePool = new HashMap<String, BlockingQueue<JobTaskResult>>();
    branchResultQueuePool = new HashMap<String, BlockingQueue<JobMergedResult>>();

    // One task-result queue and one branch-result queue per job.
    for (String jobName : jobs.keySet()) {
      jobTaskResultsQueuePool.put(jobName, new LinkedBlockingQueue<JobTaskResult>());
      branchResultQueuePool.put(jobName, new LinkedBlockingQueue<JobMergedResult>());
    }

    // Fixed-size pool (core == max) backed by an unbounded queue.
    eventProcessThreadPool =
        new ThreadPoolExecutor(
            this.config.getMaxJobEventWorker(),
            this.config.getMaxJobEventWorker(),
            0,
            TimeUnit.SECONDS,
            new LinkedBlockingQueue<Runnable>(),
            new NamedThreadFactory("jobManagerEventProcess_worker"));

    masterDataRecoverWorker =
        new MasterDataRecoverWorker(
            config.getMasterName(), config.getTempStoreDataDir(), jobs, this.config);
    masterDataRecoverWorker.start();

    addJobsToPool();

    if (logger.isInfoEnabled()) {
      logger.info("jobManager init end, MaxJobEventWorker size : " + config.getMaxJobEventWorker());
    }
  }
Пример #4
0
  /**
   * Adds the tasks of every job to the task pool, the status pool and the undo queue again, and
   * makes sure each job has a task-result queue and a branch-result queue.
   */
  protected void addJobsToPool() {
    for (Job currentJob : jobs.values()) {
      for (JobTask jobTask : currentJob.getJobTasks()) {
        String taskId = jobTask.getTaskId();
        jobTaskPool.put(taskId, jobTask);
        statusPool.put(taskId, jobTask.getStatus());
        undoTaskQueue.offer(jobTask);
      }

      // Create the per-job queues only when they are missing.
      String jobName = currentJob.getJobName();
      if (jobTaskResultsQueuePool.get(jobName) == null) {
        jobTaskResultsQueuePool.put(jobName, new LinkedBlockingQueue<JobTaskResult>());
      }
      if (branchResultQueuePool.get(jobName) == null) {
        branchResultQueuePool.put(jobName, new LinkedBlockingQueue<JobMergedResult>());
      }
    }
  }
Пример #5
0
  /**
   * Stops the job manager and releases everything it holds.
   *
   * <p>For each job the method waits (polling every three seconds) until the trunk is reported
   * exported, then exports the report if that has not happened yet. Afterwards the event thread
   * pool is shut down, the recover worker is stopped, all pools are cleared and the builder,
   * exporter and merger are released.
   *
   * <p>The shutdown and stop calls run in a {@code finally} block so that a failure or an
   * interrupt during the export phase no longer leaves the executor and the recover worker
   * running. An interrupt received while waiting is re-asserted once cleanup has finished.
   */
  @Override
  public void releaseResource() {
    stopped = true;

    boolean interrupted = false;
    try {
      // Export final results; without exporting intermediate results some data would be lost.
      if (jobs != null) {
        for (Job j : jobs.values()) {
          while (!j.getTrunkExported().get()) {
            Thread.sleep(3000);
          }
          if (!j.isExported().get()) {
            jobExporter.exportReport(j, false);
            logger.info("releaseResouce now, export job : " + j.getJobName());
          }
        }
      }
    } catch (InterruptedException e) {
      // Remember the interrupt; it is restored after cleanup so cleanup itself is not cut short.
      interrupted = true;
      logger.error("error when stop the node", e);
    } catch (Throwable e) {
      logger.error("error when stop the node", e);
    } finally {
      try {
        if (eventProcessThreadPool != null) {
          eventProcessThreadPool.shutdown();
        }
        if (masterDataRecoverWorker != null) {
          masterDataRecoverWorker.stopWorker();
        }
      } catch (Throwable e) {
        logger.error("error when stop the node", e);
      } finally {
        if (jobs != null) {
          jobs.clear();
        }
        if (jobTaskPool != null) {
          jobTaskPool.clear();
        }
        if (undoTaskQueue != null) {
          undoTaskQueue.clear();
        }
        if (statusPool != null) {
          statusPool.clear();
        }
        if (jobTaskResultsQueuePool != null) {
          jobTaskResultsQueuePool.clear();
        }
        if (branchResultQueuePool != null) {
          branchResultQueuePool.clear();
        }
        if (jobBuilder != null) {
          jobBuilder.releaseResource();
        }
        if (jobExporter != null) {
          jobExporter.releaseResource();
        }
        if (jobResultMerger != null) {
          jobResultMerger.releaseResource();
        }

        logger.info("jobManager releaseResource end");

        if (interrupted) {
          Thread.currentThread().interrupt();
        }
      }
    }
  }
Пример #6
0
  /**
   * After a job's data has been exported, decides whether the job's trunk result must be reset
   * and whether the trunk should be written out.
   *
   * <p>The job result is reset when the configured report period (day, hour or month) has a
   * different calendar value than the one recorded on the job. Then, depending on the
   * save-temp-result settings, the trunk is either handed to a set-null/export-data operation or
   * exported when the export interval has elapsed (or the manager is stopping).
   *
   * @param job the job whose trunk is examined
   */
  protected void exportOrCleanTrunk(Job job) {
    // Map the configured report period onto the calendar field that identifies it; -1 means the
    // period is none of day / hour / month and no roll-over check is made.
    final int calendarField;
    if (job.getJobConfig().getReportPeriodDefine().equals(AnalysisConstants.REPORT_PERIOD_DAY)) {
      calendarField = Calendar.DAY_OF_MONTH;
    } else if (job.getJobConfig()
        .getReportPeriodDefine()
        .equals(AnalysisConstants.REPORT_PERIOD_HOUR)) {
      calendarField = Calendar.HOUR_OF_DAY;
    } else if (job.getJobConfig()
        .getReportPeriodDefine()
        .equals(AnalysisConstants.REPORT_PERIOD_MONTH)) {
      calendarField = Calendar.MONTH;
    } else {
      calendarField = -1;
    }

    boolean periodChanged = false;
    if (calendarField != -1) {
      int current = Calendar.getInstance().get(calendarField);

      // A flag of -1 is skipped; otherwise a differing value marks the start of a new period.
      periodChanged = job.getReportPeriodFlag() != -1 && current != job.getReportPeriodFlag();

      job.setReportPeriodFlag(current);
    }

    if (periodChanged) {
      job.setJobResult(null);
      job.getEpoch().set(0);

      // Delete the temporary file so that a later reload cannot undo the reset.
      if (config.getSaveTmpResultToFile()) {
        new JobDataOperation(job, AnalysisConstants.JOBMANAGER_EVENT_DEL_DATAFILE, this.config)
            .run();
      }

      if (logger.isWarnEnabled()) {
        logger.warn("job " + job.getJobName() + " report data be reset.it's a new start. ");
      }
    }

    // Disk-to-memory mode: the global switch is on and the job does not opt out.
    boolean storeTrunkOnDisk =
        config.getSaveTmpResultToFile()
            && (job.getJobConfig().getSaveTmpResultToFile() == null
                || job.getJobConfig().getSaveTmpResultToFile());

    if (storeTrunkOnDisk) {
      // Clear the trunk data; it will be loaded again when it is needed.
      logger.warn("@disk2Mem mode: start " + job.getJobName() + " store trunk to disk now .");

      new JobDataOperation(job, AnalysisConstants.JOBMANAGER_EVENT_SETNULL_EXPORTDATA, this.config)
          .run();
      return;
    }

    // Otherwise export when never exported, when the interval has elapsed, or when stopping.
    boolean exportDue =
        job.getLastExportTime() == 0
            || System.currentTimeMillis() - job.getLastExportTime() >= config.getExportInterval()
            || stopped;

    if (exportDue) {
      logger.warn("export job: " + job.getJobName() + " trunk to disk.");

      new JobDataOperation(job, AnalysisConstants.JOBMANAGER_EVENT_EXPORTDATA, this.config).run();
    }
  }
Пример #7
0
  // Performs the merge, export and reset checks for every job in one pass.
  // All jobs are polled together, which is somewhat wasteful for the Master.
  // Possible improvements noted by the original author:
  // 1. Attach listeners to the job's properties (Listener pattern).
  // 2. Use the Observer pattern.
  //
  // Per job, in order: (a) a job with rebuild tag 2 is rebuilt and skipped; (b) a job that has
  // not timed out gets an asynchronous merge when needed, while a timed-out job gets one final
  // trunk merge under the trunk write lock; (c) an asynchronous export is started when needed,
  // followed by rebuild handling for tags -1 and 1; (d) the timeout check runs; (e) the job is
  // reset when it asks for it and its task statuses are written back into the status pool.
  protected void mergeAndExportJobs() {
    // An explicit iterator is used because a job may be removed from the map below.
    Iterator<Map.Entry<String, Job>> iter = jobs.entrySet().iterator();
    while (iter.hasNext()) {
      Job job = iter.next().getValue();
      if (job.getRebuildTag() == 2) {
        job.rebuild(0, null, this);
        continue;
      }
      if (!job.getJobTimeOut().get()) {
        // This job's tasks need to be merged.
        if (!job.isMerging().get() && job.needMerge()) {
          logger.warn(
              "job "
                  + job.getJobName()
                  + " complete tasks:"
                  + job.getCompletedTaskCount().get()
                  + ", merged tasks :"
                  + job.getMergedTaskCount().get());
          final Job j = job;
          final BlockingQueue<JobMergedResult> branchResultQueue =
              branchResultQueuePool.get(j.getJobName());
          final BlockingQueue<JobTaskResult> jobTaskResultsQueue =
              jobTaskResultsQueuePool.get(j.getJobName());

          // The compare-and-set ensures only one merge is submitted per job at a time; the flag
          // is cleared again in the worker's finally block.
          if (j.isMerging().compareAndSet(false, true))
            eventProcessThreadPool.execute(
                new Runnable() {
                  public void run() {
                    try {
                      jobResultMerger.merge(j, branchResultQueue, jobTaskResultsQueue, true);
                    } catch (Throwable e) {
                      logger.error(e);
                    } finally {
                      j.isMerging().set(false);
                    }
                  }
                });
        }
      } else {
        // Check whether any thread is still merging into the trunk; if none is, the completion
        // flag can be set. tryLock does not block: when the lock is busy this pass skips the job.
        boolean gotIt = job.getTrunkLock().writeLock().tryLock();

        if (gotIt) {
          try {
            if (!job.isMerged().get()) {
              List<Map<String, Map<String, Object>>> mergeResults =
                  new ArrayList<Map<String, Map<String, Object>>>();
              // Run the last merge synchronously on this thread while holding the write lock.
              new MergeJobOperation(
                      job, 0, mergeResults, config, branchResultQueuePool.get(job.getJobName()))
                  .run();

              job.isMerged().set(true);
              logger.warn("job is timeout, last merge trunk success!");
            }
          } finally {
            job.getTrunkLock().writeLock().unlock();
          }
        }
      }

      // This job's data needs to be exported.
      if (!job.isExporting().get() && job.needExport()) {
        final Job j = job;

        // Same single-submission guard as for merging, using the exporting flag.
        if (j.isExporting().compareAndSet(false, true))
          eventProcessThreadPool.execute(
              new Runnable() {
                public void run() {
                  try {
                    // Runs on a worker thread, but the export itself is still a blocking call.
                    jobExporter.exportReport(j, false);
                    j.isExported().set(true);
                  } catch (Throwable e) {
                    logger.error(e);
                  } finally {
                    j.isExporting().set(false);
                  }

                  // Decide whether the intermediate (trunk) result should be exported; kept
                  // outside the try block so it does not hinder the next round of processing.
                  exportOrCleanTrunk(j);
                }
              });
        // Rebuild tag -1: rebuild the job and drop it from the job map.
        if (job.getRebuildTag() == -1) {
          job.rebuild(0, null, this);
          iter.remove();
        }
        // Rebuild tag 1: rebuild the job but keep it in the map.
        if (job.getRebuildTag() == 1) {
          job.rebuild(0, null, this);
        }
      }

      // Check the job's processing time; on timeout the job's timeout flag is set.
      job.checkJobTimeOut();

      // Does the job need to be reset?
      if (job.needReset()) {
        if (logger.isWarnEnabled()) logger.warn("job " + job.getJobName() + " be reset now.");

        // Emit one comma-separated statistics line for the job to the cluster log before the
        // reset.
        StringBuilder sb =
            new StringBuilder(ReportUtil.MASTER_LOG)
                .append(",")
                .append(System.currentTimeMillis())
                .append(",");
        sb.append(job.getEpoch())
            .append(",")
            .append(job.getJobName())
            .append(",")
            .append(System.currentTimeMillis() - job.getStartTime())
            .append(",")
            .append(job.getJobMergeTime().get())
            .append(",")
            .append(job.getJobExportTime())
            .append(",")
            .append(job.getTaskCount())
            .append(",")
            .append(job.getCompletedTaskCount().get())
            .append(",")
            .append(job.getMergedTaskCount().get())
            .append(",")
            .append(job.getJobMergeBranchCount().get());
        ReportUtil.clusterLog(sb.toString());

        job.reset(this);

        // Log the current sizes of the manager's pools.
        if (logger.isInfoEnabled()) {
          sb =
              new StringBuilder("jobManager:{jobs:")
                  .append(jobs.size())
                  .append(",jobTaskPool:" + jobTaskPool.size());
          sb.append(",statusPool:")
              .append(statusPool.size())
              .append(",undoTasks:")
              .append(undoTaskQueue.size())
              .append("}");
          logger.info(sb.toString());
        }

        List<JobTask> tasks = job.getJobTasks();

        // Write each task's status after the reset back into the status pool.
        for (JobTask task : tasks) {
          statusPool.put(task.getTaskId(), task.getStatus());
        }
      }
    }
  }
Пример #8
0
  /**
   * Accepts a task result sent by a slave, queues it for merging, marks the covered tasks as
   * done, writes one statistics line per task to the cluster log, and acknowledges the slave
   * with "success".
   *
   * <p>Original author's note: task allocation and result submission are handled by a single
   * thread, so the status pool needs no extra concurrency control here; the more expensive send
   * operations are handed to the ServerConnector's threads.
   *
   * <p>The epoch of the result is compared with the epoch of the first referenced task:
   *
   * <ul>
   *   <li>first task unknown: the result is discarded and acknowledged (previously this case was
   *       logged and then failed with a {@code NullPointerException});
   *   <li>result epoch older: the result is discarded and acknowledged;
   *   <li>result epoch newer: the job is given up to 15 seconds via {@code blockToResetJob}; if
   *       the result is still ahead afterwards it is discarded and acknowledged.
   * </ul>
   *
   * @param jobResponseEvent event carrying the task result plus the sequence and channel used
   *     for the acknowledgement
   */
  @Override
  public void addTaskResultToQueue(SendResultsRequestEvent jobResponseEvent) {

    JobTaskResult jobTaskResult = jobResponseEvent.getJobTaskResult();

    if (jobTaskResult.getTaskIds() != null && jobTaskResult.getTaskIds().size() > 0) {
      // Decide whether this is stale data by comparing the epoch of the result with the epoch of
      // the task it refers to.
      String firstTaskId = jobTaskResult.getTaskIds().get(0);
      JobTask firstTask = jobTaskPool.get(firstTaskId);

      if (firstTask == null) {
        // Nothing to compare against or to queue under: drop the result, but still answer the
        // slave the same way the other discard paths do.
        logger.error("jobTask is null " + firstTaskId);
        masterNode.echoSendJobTaskResults(
            jobResponseEvent.getSequence(), "success", jobResponseEvent.getChannel());
        return;
      }

      String firstJobName = firstTask.getJobName();

      if (jobTaskResult.getJobEpoch() != firstTask.getJobEpoch()) {

        if (jobTaskResult.getJobEpoch() < firstTask.getJobEpoch()) {
          logger.error(
              "old task result will be discard! job:"
                  + firstJobName
                  + ",epoch:"
                  + jobTaskResult.getJobEpoch()
                  + ",slave:"
                  + jobResponseEvent.getChannel());
          masterNode.echoSendJobTaskResults(
              jobResponseEvent.getSequence(), "success", jobResponseEvent.getChannel());
          return;
        } else {
          // The result is ahead of this master: allow the job up to 15 seconds to be reset.
          Job aheadJob = jobs.get(firstJobName);
          if (aheadJob != null) {
            aheadJob.blockToResetJob(15000);
          }

          // Re-read the task after waiting, since the pool entry may have changed meanwhile.
          firstTask = jobTaskPool.get(firstTaskId);
          if (firstTask == null || jobTaskResult.getJobEpoch() > firstTask.getJobEpoch()) {
            logger.error("otherMaster can't merge in time!job:" + firstJobName);
            masterNode.echoSendJobTaskResults(
                jobResponseEvent.getSequence(), "success", jobResponseEvent.getChannel());
            return;
          }
        }
      }

      if (logger.isWarnEnabled()) {
        StringBuilder ts =
            new StringBuilder("Receive slave analysis result, jobTaskIds : ")
                .append(jobTaskResult.toString())
                .append(", ")
                .append(jobTaskResult.getTaskIds().size());
        logger.warn(ts.toString());
      }

      // Queue the result first, to guard against a low-probability concurrency problem.
      jobTaskResultsQueuePool.get(firstJobName).offer(jobTaskResult);

      for (int i = 0; i < jobTaskResult.getTaskIds().size(); i++) {
        String taskId = jobTaskResult.getTaskIds().get(i);
        JobTask jobTask = jobTaskPool.get(taskId);

        if (jobTask == null) {
          logger.error(
              new StringBuilder("taskId :").append(taskId).append("not exist!").toString());
          continue;
        }

        Job job = jobs.get(jobTask.getJobName());
        if (job == null) {
          logger.error(
              new StringBuilder("job :")
                  .append(jobTask.getJobName())
                  .append("not exist!")
                  .toString());
          continue;
        }

        // Only the first transition to DONE (from DOING or UNDO) updates the task and counter.
        if (statusPool.replace(taskId, JobTaskStatus.DOING, JobTaskStatus.DONE)
            || statusPool.replace(taskId, JobTaskStatus.UNDO, JobTaskStatus.DONE)) {
          // Log the task id; the message previously printed the job name in both positions.
          logger.info("task " + jobTask.getTaskId() + " of job " + job.getJobName() + " done");
          jobTask.setStatus(JobTaskStatus.DONE);
          jobTask.setEndTime(System.currentTimeMillis());
          jobTask.setLastMergedEpoch(job.getEpoch().get());
          job.getCompletedTaskCount().incrementAndGet();
        }

        // Record the execution result of this task in the cluster log.
        StringBuilder log =
            new StringBuilder(ReportUtil.SLAVE_LOG)
                .append(",")
                .append(System.currentTimeMillis())
                .append(",")
                .append(job.getEpoch())
                .append(",");
        log.append(jobTask.getJobName())
            .append(",")
            .append(jobTask.getTaskId())
            .append(",")
            .append(jobTask.getRecycleCounter().get())
            .append(",")
            .append(jobTaskResult.getSlaveIp())
            .append(",")
            .append(jobTaskResult.getEfficiency())
            .append(",");

        JobTaskExecuteInfo executeInfo =
            jobTaskResult.getTaskExecuteInfos().get(jobTask.getTaskId());

        if (executeInfo != null) {
          log.append(executeInfo.getAnalysisConsume())
              .append(",")
              .append(executeInfo.getJobDataSize())
              .append(",")
              .append(executeInfo.getTotalLine())
              .append(",")
              .append(executeInfo.getErrorLine())
              .append(",")
              .append(executeInfo.getEmptyLine());
        } else {
          logger.error(
              new StringBuilder()
                  .append("taskId : ")
                  .append(jobTask.getTaskId())
                  .append(" executeInfo is null!")
                  .toString());
        }

        ReportUtil.clusterLog(log.toString());
      }
    }

    // Optionally send the acknowledgement asynchronously to reduce event-handling latency.
    if (config.isUseAsynModeToSendResponse()) {
      final String sequence = jobResponseEvent.getSequence();
      final Object channel = jobResponseEvent.getChannel();

      eventProcessThreadPool.execute(
          new Runnable() {
            public void run() {
              try {
                masterNode.echoSendJobTaskResults(sequence, "success", channel);
              } catch (Throwable e) {
                logger.error(e);
              }
            }
          });
    } else {
      masterNode.echoSendJobTaskResults(
          jobResponseEvent.getSequence(), "success", jobResponseEvent.getChannel());
    }
  }
Пример #9
0
  /**
   * Hands out not-yet-started tasks to a requesting slave and sends them back on the request's
   * channel.
   *
   * <p>Original author's note: task allocation and result submission are handled by a single
   * thread, so the status pool needs no extra concurrency control here; the more expensive send
   * operations are handed to the ServerConnector's threads.
   *
   * <p>When the request names a known job, tasks are taken from that job only. Otherwise the
   * undo queue is scanned; entries that are no longer in the task pool, whose job is gone, whose
   * job epoch has moved past the task's epoch, whose job has timed out, or whose status is not
   * UNDO are removed from the queue. A task is handed out only if its status-pool entry can be
   * switched from UNDO to DOING. When the manager is stopped an empty list is returned.
   *
   * @param requestEvent request carrying the optional job name, the number of tasks wanted, and
   *     the sequence and channel used for the reply
   */
  @Override
  public void getUnDoJobTasks(GetTaskRequestEvent requestEvent) {

    String jobName = requestEvent.getJobName();
    int jobCount = requestEvent.getRequestJobCount();
    final List<JobTask> jobTasks = new ArrayList<JobTask>();

    // When stopped, reply to the slave with an empty task list right away.
    if (this.stopped) {
      masterNode.echoGetJobTasks(requestEvent.getSequence(), jobTasks, requestEvent.getChannel());
      return;
    }

    // Single lookup instead of containsKey + get, so a concurrent removal cannot yield null here.
    Job requestedJob = jobName != null ? jobs.get(jobName) : null;

    if (requestedJob != null) {
      // A specific job was requested.
      for (JobTask jobTask : requestedJob.getJobTasks()) {
        if (jobTask.getStatus().equals(JobTaskStatus.UNDO)
            && statusPool.replace(jobTask.getTaskId(), JobTaskStatus.UNDO, JobTaskStatus.DOING)) {
          this.allocateTask(jobTask);
          jobTasks.add(jobTask);

          if (jobTasks.size() == jobCount) {
            break;
          }
        }
      }
    } else {
      Iterator<JobTask> taskIter = undoTaskQueue.iterator();

      while (taskIter.hasNext()) {
        JobTask jobTask = taskIter.next();

        if (!jobTaskPool.containsKey(jobTask.getTaskId())) {
          taskIter.remove();
          continue;
        }

        // The owning job may have been removed from the job map while its tasks are still
        // queued; treat such tasks as stale instead of failing with a NullPointerException.
        Job owner = jobs.get(jobTask.getJobName());
        if (owner == null
            || owner.getEpoch().get() > jobTask.getJobEpoch()
            || owner.getJobTimeOut().get()) {
          taskIter.remove();
          continue;
        }

        // Constant-first equals: a missing status entry counts as "not UNDO" and the task is
        // dropped from the queue.
        if (JobTaskStatus.UNDO.equals(statusPool.get(jobTask.getTaskId()))) {
          if (statusPool.replace(jobTask.getTaskId(), JobTaskStatus.UNDO, JobTaskStatus.DOING)) {
            this.allocateTask(jobTask);
            jobTasks.add(jobTask);
            taskIter.remove();

            if (jobTasks.size() >= jobCount) {
              break;
            }
          }
        } else {
          taskIter.remove();
        }
      }
    }

    // Optionally send the reply asynchronously to reduce event-handling latency.
    if (config.isUseAsynModeToSendResponse()) {
      final String sequence = requestEvent.getSequence();
      final Object channel = requestEvent.getChannel();

      // Sending is comparatively slow, so it runs on a worker thread.
      eventProcessThreadPool.execute(
          new Runnable() {
            public void run() {
              try {
                masterNode.echoGetJobTasks(sequence, jobTasks, channel);
              } catch (Throwable e) {
                logger.error(e);
              }
            }
          });
    } else {
      masterNode.echoGetJobTasks(requestEvent.getSequence(), jobTasks, requestEvent.getChannel());
    }
  }