Пример #1
0
  private String showJobFailDebugInfo() throws IOException {
    console.printError("Error during job, obtaining debugging information...");
    if (!conf.get("mapred.job.tracker", "local").equals("local")) {
      // Show Tracking URL for remotely running jobs.
      console.printError("Job Tracking URL: " + rj.getTrackingURL());
    }
    // Loop to get all task completion events because getTaskCompletionEvents
    // only returns a subset per call
    TaskInfoGrabber tlg = new TaskInfoGrabber();
    Thread t = new Thread(tlg);
    try {
      t.start();
      t.join(HiveConf.getIntVar(conf, HiveConf.ConfVars.TASKLOG_DEBUG_TIMEOUT));
    } catch (InterruptedException e) {
      console.printError(
          "Timed out trying to finish grabbing task log URLs, " + "some task info may be missing");
    }

    // Remove failures for tasks that succeeded
    for (String task : successes) {
      failures.remove(task);
    }

    if (failures.keySet().size() == 0) {
      return null;
    }
    // Find the highest failure count
    computeMaxFailures();

    // Display Error Message for tasks with the highest failure count
    String jtUrl = null;
    try {
      jtUrl = JobTrackerURLResolver.getURL(conf);
    } catch (Exception e) {
      console.printError("Unable to retrieve URL for Hadoop Task logs. " + e.getMessage());
    }

    String msg = null;
    for (String task : failures.keySet()) {
      if (failures.get(task).intValue() == maxFailures) {
        TaskInfo ti = taskIdToInfo.get(task);
        String jobId = ti.getJobId();
        String taskUrl =
            (jtUrl == null)
                ? null
                : jtUrl + "/taskdetails.jsp?jobid=" + jobId + "&tipid=" + task.toString();

        TaskLogProcessor tlp = new TaskLogProcessor(conf);
        for (String logUrl : ti.getLogUrls()) {
          tlp.addTaskAttemptLogUrl(logUrl);
        }

        if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.JOB_DEBUG_CAPTURE_STACKTRACES)
            && stackTraces != null) {
          if (!stackTraces.containsKey(jobId)) {
            stackTraces.put(jobId, new ArrayList<List<String>>());
          }
          stackTraces.get(jobId).addAll(tlp.getStackTraces());
        }

        if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.SHOW_JOB_FAIL_DEBUG_INFO)) {
          List<ErrorAndSolution> errors = tlp.getErrors();

          StringBuilder sb = new StringBuilder();
          // We use a StringBuilder and then call printError only once as
          // printError will write to both stderr and the error log file. In
          // situations where both the stderr and the log file output is
          // simultaneously output to a single stream, this will look cleaner.
          sb.append("\n");
          sb.append("Task with the most failures(" + maxFailures + "): \n");
          sb.append("-----\n");
          sb.append("Task ID:\n  " + task + "\n\n");
          if (taskUrl != null) {
            sb.append("URL:\n  " + taskUrl + "\n");
          }

          for (ErrorAndSolution e : errors) {
            sb.append("\n");
            sb.append("Possible error:\n  " + e.getError() + "\n\n");
            sb.append("Solution:\n  " + e.getSolution() + "\n");
          }
          sb.append("-----\n");

          sb.append("Diagnostic Messages for this Task:\n");
          String[] diagMesgs = ti.getDiagnosticMesgs();
          for (String mesg : diagMesgs) {
            sb.append(mesg + "\n");
          }
          msg = sb.toString();
          console.printError(msg);
        }

        // Only print out one task because that's good enough for debugging.
        break;
      }
    }
    return msg;
  }
Пример #2
0
    private void getTaskInfos() throws IOException, MalformedURLException {
      int startIndex = 0;
      while (true) {
        TaskCompletionEvent[] taskCompletions = rj.getTaskCompletionEvents(startIndex);

        if (taskCompletions == null || taskCompletions.length == 0) {
          break;
        }

        boolean more = true;
        boolean firstError = true;
        for (TaskCompletionEvent t : taskCompletions) {
          // For each task completion event, get the associated task id, job id
          // and the logs
          String taskId = t.getTaskAttemptId().getTaskID().toString();
          String jobId = t.getTaskAttemptId().getJobID().toString();
          if (firstError) {
            console.printError("Examining task ID: " + taskId + " (and more) from job " + jobId);
            firstError = false;
          }

          TaskInfo ti = taskIdToInfo.get(taskId);
          if (ti == null) {
            ti = new TaskInfo(jobId);
            taskIdToInfo.put(taskId, ti);
          }
          // These tasks should have come from the same job.
          assert (ti.getJobId() != null && ti.getJobId().equals(jobId));
          String taskAttemptLogUrl =
              ShimLoader.getHadoopShims()
                  .getTaskAttemptLogUrl(conf, t.getTaskTrackerHttp(), t.getTaskId());
          if (taskAttemptLogUrl != null) {
            ti.getLogUrls().add(taskAttemptLogUrl);
          }

          // If a task failed, fetch its error code (if available).
          // Also keep track of the total number of failures for that
          // task (typically, a task gets re-run up to 4 times if it fails.
          if (t.getTaskStatus() != TaskCompletionEvent.Status.SUCCEEDED) {
            String[] diags = rj.getTaskDiagnostics(t.getTaskAttemptId());
            ti.setDiagnosticMesgs(diags);
            if (ti.getErrorCode() == 0) {
              ti.setErrorCode(extractErrorCode(diags));
            }

            Integer failAttempts = failures.get(taskId);
            if (failAttempts == null) {
              failAttempts = Integer.valueOf(0);
            }
            failAttempts = Integer.valueOf(failAttempts.intValue() + 1);
            failures.put(taskId, failAttempts);
          } else {
            successes.add(taskId);
          }
        }
        if (!more) {
          break;
        }
        startIndex += taskCompletions.length;
      }
    }