private String showJobFailDebugInfo() throws IOException { console.printError("Error during job, obtaining debugging information..."); if (!conf.get("mapred.job.tracker", "local").equals("local")) { // Show Tracking URL for remotely running jobs. console.printError("Job Tracking URL: " + rj.getTrackingURL()); } // Loop to get all task completion events because getTaskCompletionEvents // only returns a subset per call TaskInfoGrabber tlg = new TaskInfoGrabber(); Thread t = new Thread(tlg); try { t.start(); t.join(HiveConf.getIntVar(conf, HiveConf.ConfVars.TASKLOG_DEBUG_TIMEOUT)); } catch (InterruptedException e) { console.printError( "Timed out trying to finish grabbing task log URLs, " + "some task info may be missing"); } // Remove failures for tasks that succeeded for (String task : successes) { failures.remove(task); } if (failures.keySet().size() == 0) { return null; } // Find the highest failure count computeMaxFailures(); // Display Error Message for tasks with the highest failure count String jtUrl = null; try { jtUrl = JobTrackerURLResolver.getURL(conf); } catch (Exception e) { console.printError("Unable to retrieve URL for Hadoop Task logs. " + e.getMessage()); } String msg = null; for (String task : failures.keySet()) { if (failures.get(task).intValue() == maxFailures) { TaskInfo ti = taskIdToInfo.get(task); String jobId = ti.getJobId(); String taskUrl = (jtUrl == null) ? null : jtUrl + "/taskdetails.jsp?jobid=" + jobId + "&tipid=" + task.toString(); TaskLogProcessor tlp = new TaskLogProcessor(conf); for (String logUrl : ti.getLogUrls()) { tlp.addTaskAttemptLogUrl(logUrl); } if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.JOB_DEBUG_CAPTURE_STACKTRACES) && stackTraces != null) { if (!stackTraces.containsKey(jobId)) { stackTraces.put(jobId, new ArrayList<List<String>>()); } stackTraces.get(jobId).addAll(tlp.getStackTraces()); } if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.SHOW_JOB_FAIL_DEBUG_INFO)) { List<ErrorAndSolution> errors = tlp.getErrors(); StringBuilder sb = new StringBuilder(); // We use a StringBuilder and then call printError only once as // printError will write to both stderr and the error log file. In // situations where both the stderr and the log file output is // simultaneously output to a single stream, this will look cleaner. sb.append("\n"); sb.append("Task with the most failures(" + maxFailures + "): \n"); sb.append("-----\n"); sb.append("Task ID:\n " + task + "\n\n"); if (taskUrl != null) { sb.append("URL:\n " + taskUrl + "\n"); } for (ErrorAndSolution e : errors) { sb.append("\n"); sb.append("Possible error:\n " + e.getError() + "\n\n"); sb.append("Solution:\n " + e.getSolution() + "\n"); } sb.append("-----\n"); sb.append("Diagnostic Messages for this Task:\n"); String[] diagMesgs = ti.getDiagnosticMesgs(); for (String mesg : diagMesgs) { sb.append(mesg + "\n"); } msg = sb.toString(); console.printError(msg); } // Only print out one task because that's good enough for debugging. break; } } return msg; }
private void getTaskInfos() throws IOException, MalformedURLException { int startIndex = 0; while (true) { TaskCompletionEvent[] taskCompletions = rj.getTaskCompletionEvents(startIndex); if (taskCompletions == null || taskCompletions.length == 0) { break; } boolean more = true; boolean firstError = true; for (TaskCompletionEvent t : taskCompletions) { // For each task completion event, get the associated task id, job id // and the logs String taskId = t.getTaskAttemptId().getTaskID().toString(); String jobId = t.getTaskAttemptId().getJobID().toString(); if (firstError) { console.printError("Examining task ID: " + taskId + " (and more) from job " + jobId); firstError = false; } TaskInfo ti = taskIdToInfo.get(taskId); if (ti == null) { ti = new TaskInfo(jobId); taskIdToInfo.put(taskId, ti); } // These tasks should have come from the same job. assert (ti.getJobId() != null && ti.getJobId().equals(jobId)); String taskAttemptLogUrl = ShimLoader.getHadoopShims() .getTaskAttemptLogUrl(conf, t.getTaskTrackerHttp(), t.getTaskId()); if (taskAttemptLogUrl != null) { ti.getLogUrls().add(taskAttemptLogUrl); } // If a task failed, fetch its error code (if available). // Also keep track of the total number of failures for that // task (typically, a task gets re-run up to 4 times if it fails. if (t.getTaskStatus() != TaskCompletionEvent.Status.SUCCEEDED) { String[] diags = rj.getTaskDiagnostics(t.getTaskAttemptId()); ti.setDiagnosticMesgs(diags); if (ti.getErrorCode() == 0) { ti.setErrorCode(extractErrorCode(diags)); } Integer failAttempts = failures.get(taskId); if (failAttempts == null) { failAttempts = Integer.valueOf(0); } failAttempts = Integer.valueOf(failAttempts.intValue() + 1); failures.put(taskId, failAttempts); } else { successes.add(taskId); } } if (!more) { break; } startIndex += taskCompletions.length; } }