/** Update {@link lastStatus} so that it can be viewed from outside. */
private void updateStatus() {
  int highPriorityFiles = 0;
  int lowPriorityFiles = 0;
  List<JobStatus> jobs = new ArrayList<JobStatus>();
  List<String> highPriorityFileNames = new ArrayList<String>();
  for (Map.Entry<String, CorruptFileInfo> e : fileIndex.entrySet()) {
    String fileName = e.getKey();
    CorruptFileInfo fileInfo = e.getValue();
    if (fileInfo.getHighestPriority() > 0) {
      highPriorityFileNames.add(fileName);
      highPriorityFiles += 1;
    } else {
      lowPriorityFiles += 1;
    }
  }
  for (Job job : jobIndex.keySet()) {
    String url = job.getTrackingURL();
    String name = job.getJobName();
    JobID jobId = job.getID();
    jobs.add(new BlockFixer.JobStatus(jobId, name, url));
  }
  lastStatus = new BlockFixer.Status(highPriorityFiles, lowPriorityFiles, jobs,
      highPriorityFileNames);
  RaidNodeMetrics.getInstance().corruptFilesHighPri.set(highPriorityFiles);
  RaidNodeMetrics.getInstance().corruptFilesLowPri.set(lowPriorityFiles);
  LOG.info("Update status done. " + lastStatus.toString());
}
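// A hypothetical accessor (assumed, not part of the original source) showing
// how lastStatus, once set by updateStatus(), could be "viewed from outside"
// as the Javadoc above puts it; the method name getStatus is illustrative only.
public BlockFixer.Status getStatus() {
  return lastStatus;
}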
/** Checks for corrupt blocks and fixes them (if any). */
void checkAndFixBlocks()
    throws IOException, InterruptedException, ClassNotFoundException {
  checkJobs();

  if (jobIndex.size() >= maxPendingJobs) {
    LOG.info("Waiting for " + jobIndex.size() + " pending jobs");
    return;
  }

  Map<String, Integer> corruptFiles = getCorruptFiles();
  FileSystem fs = new Path("/").getFileSystem(getConf());
  Map<String, Integer> corruptFilePriority = computePriorities(fs, corruptFiles);

  int totalCorruptFiles = corruptFilePriority.size() + fileIndex.size();
  RaidNodeMetrics.getInstance().numFilesToFix.set(totalCorruptFiles);

  String startTimeStr = dateFormat.format(new Date());
  LOG.info("Found " + corruptFilePriority.size() + " corrupt files");

  if (corruptFilePriority.size() > 0) {
    // Submit high priority (1) jobs before low priority (0) jobs.
    for (int pri = 1; pri >= 0; pri--) {
      String jobName = "blockfixer." + jobCounter + ".pri" + pri + "." + startTimeStr;
      jobCounter++;
      startJob(jobName, corruptFilePriority, pri);
    }
  }
}
/**
 * Checks if the map-reduce job has completed.
 *
 * @return true if the job completed, false otherwise.
 * @throws IOException
 */
public boolean checkComplete() throws IOException {
  JobID jobID = runningJob.getID();
  if (runningJob.isComplete()) {
    // delete job directory
    final String jobdir = jobconf.get(JOB_DIR_LABEL);
    if (jobdir != null) {
      final Path jobpath = new Path(jobdir);
      jobpath.getFileSystem(jobconf).delete(jobpath, true);
    }
    if (runningJob.isSuccessful()) {
      LOG.info("Job Complete(Succeeded): " + jobID);
    } else {
      LOG.info("Job Complete(Failed): " + jobID);
    }
    raidPolicyPathPairList.clear();
    Counters ctrs = runningJob.getCounters();
    if (ctrs != null) {
      RaidNodeMetrics metrics =
          RaidNodeMetrics.getInstance(RaidNodeMetrics.DEFAULT_NAMESPACE_ID);
      if (ctrs.findCounter(Counter.FILES_FAILED) != null) {
        long filesFailed = ctrs.findCounter(Counter.FILES_FAILED).getValue();
        metrics.raidFailures.inc(filesFailed);
      }
      long slotSeconds =
          ctrs.findCounter(JobInProgress.Counter.SLOTS_MILLIS_MAPS).getValue() / 1000;
      metrics.raidSlotSeconds.inc(slotSeconds);
    }
    return true;
  } else {
    String report =
        (" job " + jobID
            + " map " + StringUtils.formatPercent(runningJob.mapProgress(), 0)
            + " reduce " + StringUtils.formatPercent(runningJob.reduceProgress(), 0));
    if (!report.equals(lastReport)) {
      LOG.info(report);
      lastReport = report;
    }
    TaskCompletionEvent[] events =
        runningJob.getTaskCompletionEvents(jobEventCounter);
    jobEventCounter += events.length;
    for (TaskCompletionEvent event : events) {
      if (event.getTaskStatus() == TaskCompletionEvent.Status.FAILED) {
        LOG.info(" Job " + jobID + " " + event.toString());
      }
    }
    return false;
  }
}
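// A minimal usage sketch (assumed, not part of the original class): a caller
// could poll checkComplete() until the job finishes. The wrapper name
// waitForCompletion and the poll interval parameter are illustrative only.
boolean waitForCompletion(long pollIntervalMs)
    throws IOException, InterruptedException {
  while (!checkComplete()) {
    Thread.sleep(pollIntervalMs);
  }
  // By now the job directory has been deleted and counters recorded.
  return runningJob.isSuccessful();
}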
/**
 * Checks if jobs have completed and updates the job and file index so that
 * failed files can be restarted.
 */
void checkJobs() throws IOException {
  Iterator<Job> jobIter = jobIndex.keySet().iterator();
  while (jobIter.hasNext()) {
    Job job = jobIter.next();

    try {
      if (job.isComplete()) {
        long slotSeconds =
            job.getCounters().findCounter(JobInProgress.Counter.SLOTS_MILLIS_MAPS)
                .getValue() / 1000;
        RaidNodeMetrics.getInstance().blockFixSlotSeconds.inc(slotSeconds);
        long filesSucceeded =
            job.getCounters().findCounter(Counter.FILES_SUCCEEDED) != null
                ? job.getCounters().findCounter(Counter.FILES_SUCCEEDED).getValue()
                : 0;
        long filesFailed =
            job.getCounters().findCounter(Counter.FILES_FAILED) != null
                ? job.getCounters().findCounter(Counter.FILES_FAILED).getValue()
                : 0;
        long filesNoAction =
            job.getCounters().findCounter(Counter.FILES_NOACTION) != null
                ? job.getCounters().findCounter(Counter.FILES_NOACTION).getValue()
                : 0;
        int files = jobIndex.get(job).size();
        if (job.isSuccessful()
            && (filesSucceeded + filesFailed + filesNoAction == ((long) files))) {
          // job has processed all files
          succeedJob(job, filesSucceeded, filesFailed);
        } else {
          failJob(job);
        }
        jobIter.remove();
      } else {
        LOG.info("Job " + job.getID() + " (" + job.getJobName() + ") still running");
      }
    } catch (Exception e) {
      LOG.error(StringUtils.stringifyException(e));
      failJob(job);
      try {
        job.killJob();
      } catch (Exception ee) {
        LOG.error(StringUtils.stringifyException(ee));
      }
      jobIter.remove();
    }
  }
  purgeFileIndex();
}
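// A hypothetical helper (not in the original code) that would collapse the
// repeated null-checked counter reads in checkJobs() above into a single call:
private static long counterValue(Counters ctrs, Enum<?> key) {
  org.apache.hadoop.mapreduce.Counter c = ctrs.findCounter(key);
  return (c == null) ? 0 : c.getValue();
}
// e.g. long filesFailed = counterValue(job.getCounters(), Counter.FILES_FAILED);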
void doFix() throws InterruptedException, IOException {
  while (running) {
    // Sleep before proceeding to fix files.
    Thread.sleep(blockCheckInterval);

    List<String> corruptFiles = getCorruptFiles();
    FileSystem parityFs = new Path("/").getFileSystem(getConf());
    filterUnreconstructableSourceFiles(parityFs, corruptFiles.iterator());
    RaidNodeMetrics.getInstance(RaidNodeMetrics.DEFAULT_NAMESPACE_ID)
        .numFilesToFix
        .set(corruptFiles.size());

    if (corruptFiles.isEmpty()) {
      // If there are no corrupt files, retry after some time.
      continue;
    }
    LOG.info("Found " + corruptFiles.size() + " corrupt files.");

    helper.sortLostFiles(corruptFiles);

    for (String srcPath : corruptFiles) {
      if (!running) break;
      try {
        boolean fixed =
            helper.reconstructFile(new Path(srcPath), RaidUtils.NULL_PROGRESSABLE);
        if (fixed) {
          incrFilesFixed();
        }
      } catch (IOException ie) {
        incrFileFixFailures();
        LOG.error("Hit error while processing " + srcPath + ": "
            + StringUtils.stringifyException(ie));
        // Do nothing, move on to the next file.
      }
    }
  }
}
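// A minimal sketch (assumed, not from the original source) of how doFix()
// might be driven: a worker thread's entry point that runs the fix loop until
// `running` is cleared, logging any unexpected exception on the way out.
public void run() {
  try {
    doFix();
  } catch (Exception e) {
    LOG.error(StringUtils.stringifyException(e));
  } finally {
    running = false;
  }
}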