/**
 * Rebuilds {@link #lastStatus} from the current contents of the file index
 * and the job index so that it can be viewed from outside, and pushes the
 * high/low priority corrupt-file counts to {@code RaidNodeMetrics}.
 */
private void updateStatus() {
  List<String> highPriorityFileNames = new ArrayList<String>();
  int lowPriorityFiles = 0;

  // A file is counted as high priority when its highest priority is above
  // zero; every other file is counted as low priority.
  for (Map.Entry<String, CorruptFileInfo> entry : fileIndex.entrySet()) {
    if (entry.getValue().getHighestPriority() > 0) {
      highPriorityFileNames.add(entry.getKey());
    } else {
      lowPriorityFiles++;
    }
  }
  // One name is recorded per high-priority file, so the list size is the count.
  int highPriorityFiles = highPriorityFileNames.size();

  // Snapshot id, name and tracking URL of every job in the job index.
  List<JobStatus> jobs = new ArrayList<JobStatus>();
  for (Job trackedJob : jobIndex.keySet()) {
    String trackingUrl = trackedJob.getTrackingURL();
    String jobName = trackedJob.getJobName();
    JobID id = trackedJob.getID();
    jobs.add(new BlockFixer.JobStatus(id, jobName, trackingUrl));
  }

  lastStatus =
      new BlockFixer.Status(highPriorityFiles, lowPriorityFiles, jobs, highPriorityFileNames);

  RaidNodeMetrics.getInstance().corruptFilesHighPri.set(highPriorityFiles);
  RaidNodeMetrics.getInstance().corruptFilesLowPri.set(lowPriorityFiles);
  LOG.info("Update status done." + lastStatus.toString());
}
/** Handle a successful job. */ private void succeedJob(Job job, long filesSucceeded, long filesFailed) throws IOException { String jobName = job.getJobName(); LOG.info("Job " + job.getID() + "(" + jobName + ") finished (succeeded)"); if (filesFailed == 0) { // no files have failed for (CorruptFileInfo fileInfo : jobIndex.get(job)) { boolean failed = false; fileInfo.finishJob(jobName, failed); } } else { // we have to look at the output to check which files have failed Set<String> failedFiles = getFailedFiles(job); for (CorruptFileInfo fileInfo : jobIndex.get(job)) { if (failedFiles.contains(fileInfo.getFile().toString())) { boolean failed = true; fileInfo.finishJob(jobName, failed); } else { // call succeed for files that have succeeded or for which no action // was taken boolean failed = false; fileInfo.finishJob(jobName, failed); } } } // report succeeded files to metrics incrFilesFixed(filesSucceeded); incrFileFixFailures(filesFailed); numJobsRunning--; }
/** Handle a failed job. */ private void failJob(Job job) throws IOException { // assume no files have been fixed LOG.error("Job " + job.getID() + "(" + job.getJobName() + ") finished (failed)"); // We do not change metrics here since we do not know for sure if file // fixing failed. for (CorruptFileInfo fileInfo : jobIndex.get(job)) { boolean failed = true; fileInfo.finishJob(job.getJobName(), failed); } numJobsRunning--; }
/** * checks if jobs have completed and updates job and file index returns a list of failed files for * restarting */ void checkJobs() throws IOException { Iterator<Job> jobIter = jobIndex.keySet().iterator(); while (jobIter.hasNext()) { Job job = jobIter.next(); try { if (job.isComplete()) { long slotSeconds = job.getCounters().findCounter(JobInProgress.Counter.SLOTS_MILLIS_MAPS).getValue() / 1000; RaidNodeMetrics.getInstance().blockFixSlotSeconds.inc(slotSeconds); long filesSucceeded = job.getCounters().findCounter(Counter.FILES_SUCCEEDED) != null ? job.getCounters().findCounter(Counter.FILES_SUCCEEDED).getValue() : 0; long filesFailed = job.getCounters().findCounter(Counter.FILES_FAILED) != null ? job.getCounters().findCounter(Counter.FILES_FAILED).getValue() : 0; long filesNoAction = job.getCounters().findCounter(Counter.FILES_NOACTION) != null ? job.getCounters().findCounter(Counter.FILES_NOACTION).getValue() : 0; int files = jobIndex.get(job).size(); if (job.isSuccessful() && (filesSucceeded + filesFailed + filesNoAction == ((long) files))) { // job has processed all files succeedJob(job, filesSucceeded, filesFailed); } else { failJob(job); } jobIter.remove(); } else { LOG.info("Job " + job.getID() + "(" + job.getJobName() + " still running"); } } catch (Exception e) { LOG.error(StringUtils.stringifyException(e)); failJob(job); try { job.killJob(); } catch (Exception ee) { LOG.error(StringUtils.stringifyException(ee)); } jobIter.remove(); } } purgeFileIndex(); }
// Can be overridded by tests. void submitJob(Job job, List<String> filesInJob, int priority) throws IOException, InterruptedException, ClassNotFoundException { job.submit(); LOG.info("Job " + job.getID() + "(" + job.getJobName() + ") started"); jobIndex.put(job, null); }