/** Updates {@link #lastStatus} so that it can be read from outside. */
 private void updateStatus() {
   int highPriorityFiles = 0;
   int lowPriorityFiles = 0;
   List<JobStatus> jobs = new ArrayList<JobStatus>();
   List<String> highPriorityFileNames = new ArrayList<String>();
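   // Tally corrupt files by priority, keeping the names of the high-priority ones.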
   for (Map.Entry<String, CorruptFileInfo> e : fileIndex.entrySet()) {
     String fileName = e.getKey();
     CorruptFileInfo fileInfo = e.getValue();
     if (fileInfo.getHighestPriority() > 0) {
       highPriorityFileNames.add(fileName);
       highPriorityFiles += 1;
     } else {
       lowPriorityFiles += 1;
     }
   }
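   // Record tracking information for every currently running job.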
   for (Job job : jobIndex.keySet()) {
     String url = job.getTrackingURL();
     String name = job.getJobName();
     JobID jobId = job.getID();
     jobs.add(new BlockFixer.JobStatus(jobId, name, url));
   }
   lastStatus =
       new BlockFixer.Status(highPriorityFiles, lowPriorityFiles, jobs, highPriorityFileNames);
   RaidNodeMetrics.getInstance().corruptFilesHighPri.set(highPriorityFiles);
   RaidNodeMetrics.getInstance().corruptFilesLowPri.set(lowPriorityFiles);
   LOG.info("Update status done." + lastStatus.toString());
 }
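 // The snapshot above is stored in BlockFixer.Status. A minimal sketch of what
 // such an immutable holder could look like follows; the field names and the
 // toString format are assumptions for illustration, not the actual class.
 public static class Status {
   private final int highPriorityFiles;
   private final int lowPriorityFiles;
   private final List<JobStatus> jobs;
   private final List<String> highPriorityFileNames;

   public Status(int highPriorityFiles, int lowPriorityFiles,
       List<JobStatus> jobs, List<String> highPriorityFileNames) {
     this.highPriorityFiles = highPriorityFiles;
     this.lowPriorityFiles = lowPriorityFiles;
     this.jobs = jobs;
     this.highPriorityFileNames = highPriorityFileNames;
   }

   @Override
   public String toString() {
     return "high priority corrupt files: " + highPriorityFiles
         + ", low priority corrupt files: " + lowPriorityFiles
         + ", running jobs: " + jobs.size();
   }
 }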
  /** Checks for corrupt blocks and fixes them (if any). */
  void checkAndFixBlocks() throws IOException, InterruptedException, ClassNotFoundException {
    checkJobs();

    if (jobIndex.size() >= maxPendingJobs) {
      LOG.info("Waiting for " + jobIndex.size() + " pending jobs");
      return;
    }

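    // Fetch the current corrupt-file list and assign each file a fix priority.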
    Map<String, Integer> corruptFiles = getCorruptFiles();
    FileSystem fs = new Path("/").getFileSystem(getConf());
    Map<String, Integer> corruptFilePriority = computePriorities(fs, corruptFiles);

    int totalCorruptFiles = corruptFilePriority.size() + fileIndex.size();
    RaidNodeMetrics.getInstance().numFilesToFix.set(totalCorruptFiles);

    String startTimeStr = dateFormat.format(new Date());

    LOG.info("Found " + corruptFilePriority.size() + " corrupt files");

    if (corruptFilePriority.size() > 0) {
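      // pri 1 is high priority; submit those jobs before the pri 0 (low) batch.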
      for (int pri = 1; pri >= 0; pri--) {
        String jobName = "blockfixer." + jobCounter + ".pri" + pri + "." + startTimeStr;
        jobCounter++;
        startJob(jobName, corruptFilePriority, pri);
      }
    }
  }
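  // A hedged sketch of how a monitor thread might drive checkAndFixBlocks().
  // The run() wrapper and the blockCheckInterval field are assumptions about
  // the surrounding service, not code from the original class.
  public void run() {
    while (running) {
      try {
        checkAndFixBlocks();
      } catch (Exception e) {
        LOG.error(StringUtils.stringifyException(e));
      }
      try {
        Thread.sleep(blockCheckInterval);
      } catch (InterruptedException e) {
        // Treat interruption as a shutdown signal.
        return;
      }
    }
  }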
 /**
  * Checks if the map-reduce job has completed.
  *
  * @return true if the job completed, false otherwise.
  * @throws IOException if querying the job or cleaning up its directory fails.
  */
 public boolean checkComplete() throws IOException {
   JobID jobID = runningJob.getID();
   if (runningJob.isComplete()) {
     // delete job directory
     final String jobdir = jobconf.get(JOB_DIR_LABEL);
     if (jobdir != null) {
       final Path jobpath = new Path(jobdir);
       jobpath.getFileSystem(jobconf).delete(jobpath, true);
     }
     if (runningJob.isSuccessful()) {
       LOG.info("Job Complete(Succeeded): " + jobID);
     } else {
       LOG.info("Job Complete(Failed): " + jobID);
     }
     raidPolicyPathPairList.clear();
     Counters ctrs = runningJob.getCounters();
     if (ctrs != null) {
       RaidNodeMetrics metrics = RaidNodeMetrics.getInstance(RaidNodeMetrics.DEFAULT_NAMESPACE_ID);
       if (ctrs.findCounter(Counter.FILES_FAILED) != null) {
         long filesFailed = ctrs.findCounter(Counter.FILES_FAILED).getValue();
         metrics.raidFailures.inc(filesFailed);
       }
       long slotSeconds =
           ctrs.findCounter(JobInProgress.Counter.SLOTS_MILLIS_MAPS).getValue() / 1000;
       metrics.raidSlotSeconds.inc(slotSeconds);
     }
     return true;
   } else {
     String report =
         (" job "
             + jobID
             + " map "
             + StringUtils.formatPercent(runningJob.mapProgress(), 0)
             + " reduce "
             + StringUtils.formatPercent(runningJob.reduceProgress(), 0));
     if (!report.equals(lastReport)) {
       LOG.info(report);
       lastReport = report;
     }
     TaskCompletionEvent[] events = runningJob.getTaskCompletionEvents(jobEventCounter);
     jobEventCounter += events.length;
     for (TaskCompletionEvent event : events) {
       if (event.getTaskStatus() == TaskCompletionEvent.Status.FAILED) {
         LOG.info(" Job " + jobID + " " + event.toString());
       }
     }
     return false;
   }
 }
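 // Hypothetical caller: poll checkComplete() until the submitted job finishes,
 // then report whether it succeeded. This method and its pollIntervalMillis
 // parameter are illustrative assumptions, not part of the original class.
 boolean waitForCompletion(long pollIntervalMillis)
     throws IOException, InterruptedException {
   while (!checkComplete()) {
     Thread.sleep(pollIntervalMillis);
   }
   return runningJob.isSuccessful();
 }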
  /**
   * Checks if jobs have completed and updates the job and file indices.
   * Files from failed jobs are noted so that they can be restarted.
   */
  void checkJobs() throws IOException {
    Iterator<Job> jobIter = jobIndex.keySet().iterator();
    while (jobIter.hasNext()) {
      Job job = jobIter.next();

      try {
        if (job.isComplete()) {
          // Fetch the counters once instead of re-querying the job for each read.
          Counters ctrs = job.getCounters();
          long slotSeconds =
              ctrs.findCounter(JobInProgress.Counter.SLOTS_MILLIS_MAPS).getValue() / 1000;
          RaidNodeMetrics.getInstance().blockFixSlotSeconds.inc(slotSeconds);
          long filesSucceeded =
              ctrs.findCounter(Counter.FILES_SUCCEEDED) != null
                  ? ctrs.findCounter(Counter.FILES_SUCCEEDED).getValue()
                  : 0;
          long filesFailed =
              ctrs.findCounter(Counter.FILES_FAILED) != null
                  ? ctrs.findCounter(Counter.FILES_FAILED).getValue()
                  : 0;
          long filesNoAction =
              ctrs.findCounter(Counter.FILES_NOACTION) != null
                  ? ctrs.findCounter(Counter.FILES_NOACTION).getValue()
                  : 0;
          int files = jobIndex.get(job).size();
          if (job.isSuccessful()
              && (filesSucceeded + filesFailed + filesNoAction == ((long) files))) {
            // job has processed all files
            succeedJob(job, filesSucceeded, filesFailed);
          } else {
            failJob(job);
          }
          jobIter.remove();
        } else {
          LOG.info("Job " + job.getID() + "(" + job.getJobName() + " still running");
        }
      } catch (Exception e) {
        LOG.error(StringUtils.stringifyException(e));
        failJob(job);
        try {
          job.killJob();
        } catch (Exception ee) {
          LOG.error(StringUtils.stringifyException(ee));
        }
        jobIter.remove();
      }
    }
    purgeFileIndex();
  }
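  /**
   * Main loop of the local fixer: sleeps, scans for corrupt files, and
   * reconstructs them one at a time through the helper.
   */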
  void doFix() throws InterruptedException, IOException {
    while (running) {
      // Sleep before proceeding to fix files.
      Thread.sleep(blockCheckInterval);

      List<String> corruptFiles = getCorruptFiles();
      FileSystem parityFs = new Path("/").getFileSystem(getConf());
      filterUnreconstructableSourceFiles(parityFs, corruptFiles.iterator());
      RaidNodeMetrics.getInstance(RaidNodeMetrics.DEFAULT_NAMESPACE_ID)
          .numFilesToFix
          .set(corruptFiles.size());

      if (corruptFiles.isEmpty()) {
        // If there are no corrupt files, retry after some time.
        continue;
      }
      LOG.info("Found " + corruptFiles.size() + " corrupt files.");

      helper.sortLostFiles(corruptFiles);

      for (String srcPath : corruptFiles) {
        if (!running) break;
        try {
          boolean fixed = helper.reconstructFile(new Path(srcPath), RaidUtils.NULL_PROGRESSABLE);
          if (fixed) {
            incrFilesFixed();
          }
        } catch (IOException ie) {
          incrFileFixFailures();
          LOG.error(
              "Hit error while processing " + srcPath + ": " + StringUtils.stringifyException(ie));
          // Do nothing, move on to the next file.
        }
      }
    }
  }
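  // Hypothetical wiring: running doFix() on a daemon thread. The original
  // service starts its worker differently; this only illustrates that doFix()
  // blocks until the running flag is cleared.
  void startFixerThread() {
    Thread fixer = new Thread(new Runnable() {
      public void run() {
        try {
          doFix();
        } catch (Exception e) {
          LOG.error(StringUtils.stringifyException(e));
        }
      }
    }, "BlockFixerThread");
    fixer.setDaemon(true);
    fixer.start();
  }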