Example #1
  /** Return a Task that can be sent to a TaskTracker for execution. */
  public Task getTaskToRun(String taskTracker) {

    // Create the 'taskid'; do not count the 'killed' tasks against the job!
    TaskAttemptID taskid = null;
    if (nextTaskId < (MAX_TASK_EXECS + maxTaskAttempts + numKilledTasks)) {
      // Make sure that the attempts are unique across restarts
      int attemptId = job.getNumRestarts() * NUM_ATTEMPTS_PER_RESTART + nextTaskId;
      taskid = new TaskAttemptID(id, attemptId);
      ++nextTaskId;
    } else {
      LOG.warn(
          "Exceeded limit of "
              + (MAX_TASK_EXECS + maxTaskAttempts)
              + " (plus "
              + numKilledTasks
              + " killed)"
              + " attempts for the tip '"
              + getTIPId()
              + "'");
      return null;
    }
    // keep track of the last time we started an attempt at this TIP
    // used to calculate the progress rate of this TIP
    setDispatchTime(taskid, JobTracker.getClock().getTime());
    if (0 == execStartTime) {
      // assume task starts running now
      execStartTime = JobTracker.getClock().getTime();
    }
    return addRunningTask(taskid, taskTracker);
  }
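The attempt-id arithmetic above is the interesting part: by offsetting the per-TIP counter with job.getNumRestarts() * NUM_ATTEMPTS_PER_RESTART, attempts created before and after a JobTracker restart land in disjoint id ranges and can never collide. A minimal standalone sketch of that scheme (the stride value below is an assumption, not the real Hadoop constant):

public class AttemptIdSketch {
  // Assumed stride; the real NUM_ATTEMPTS_PER_RESTART may differ.
  static final int NUM_ATTEMPTS_PER_RESTART = 1000;

  static int attemptId(int numRestarts, int nextTaskId) {
    return numRestarts * NUM_ATTEMPTS_PER_RESTART + nextTaskId;
  }

  public static void main(String[] args) {
    System.out.println(attemptId(0, 2)); // before any restart: 2
    System.out.println(attemptId(1, 0)); // after one restart the counter may reset, yet ids start at 1000
  }
}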
Example #2
  public void testJobRetire() throws Exception {
    MiniMRCluster mr = null;
    try {
      JobConf conf = new JobConf();
      mr = startCluster(conf, 1);

      JobConf jobConf = mr.createJobConf();
      JobTracker jobtracker = mr.getJobTrackerRunner().getJobTracker();

      Path inDir = new Path(testDir, "input1");
      Path outDir = new Path(testDir, "output1");

      JobID id1 = validateJobRetire(jobConf, inDir, outDir, jobtracker);

      outDir = new Path(testDir, "output2");
      JobID id2 = validateJobRetire(jobConf, inDir, outDir, jobtracker);

      assertNull("Job not removed from cache", jobtracker.getJobStatus(id1));

      assertEquals("Total job in cache not correct", 1, jobtracker.getAllJobs().length);
    } finally {
      if (mr != null) {
        mr.shutdown();
      }
      FileUtil.fullyDelete(new File(testDir.toString()));
    }
  }
Example #3
 // wait till the job retires
 private void waitTillRetire(JobID id, JobTracker jobtracker) {
   // wait for job to get retired
   JobInProgress job = jobtracker.getJob(id);
   for (int i = 0; i < 10 && job != null; i++) {
     UtilsForTests.waitFor(1000);
     job = jobtracker.getJob(id);
   }
   assertNull("Job did not retire", job);
 }
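waitTillRetire() above is a bounded polling loop: check a condition, sleep, re-check, and give up after a fixed number of attempts. A generic, hedged version of that pattern (an illustrative utility, not part of the Hadoop test code):

import java.util.function.Supplier;

public class PollUntil {
  // Poll a condition at a fixed interval; return true as soon as it holds,
  // or false if it still does not hold after the given number of attempts.
  public static boolean poll(Supplier<Boolean> condition, int attempts, long sleepMs)
      throws InterruptedException {
    for (int i = 0; i < attempts; i++) {
      if (condition.get()) {
        return true;
      }
      Thread.sleep(sleepMs);
    }
    return condition.get();
  }

  public static void main(String[] args) throws InterruptedException {
    long deadline = System.currentTimeMillis() + 300;
    boolean met = poll(() -> System.currentTimeMillis() > deadline, 10, 100);
    System.out.println("condition met: " + met);
  }
}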
Example #4
  private void printFailures(
      JspWriter out, JobTracker tracker, JobID jobId, String kind, String cause)
      throws IOException {
    JobInProgress job = (JobInProgress) tracker.getJob(jobId);
    if (job == null) {
      out.print("<b>Job " + jobId + " not found.</b><br>\n");
      return;
    }

    boolean includeMap = false;
    boolean includeReduce = false;
    if (kind == null) {
      includeMap = true;
      includeReduce = true;
    } else if ("map".equals(kind)) {
      includeMap = true;
    } else if ("reduce".equals(kind)) {
      includeReduce = true;
    } else if ("all".equals(kind)) {
      includeMap = true;
      includeReduce = true;
    } else {
      out.print("<b>Kind " + kind + " not supported.</b><br>\n");
      return;
    }

    TaskStatus.State state = null;
    try {
      if (cause != null) {
        state = TaskStatus.State.valueOf(cause.toUpperCase());
        if (state != TaskStatus.State.FAILED && state != TaskStatus.State.KILLED) {
          out.print("<b>Cause '" + cause + "' is not an 'unsuccessful' state.</b><br>\n");
          return;
        }
      }
    } catch (IllegalArgumentException e) {
      out.print("<b>Cause '" + cause + "' not supported.</b><br>\n");
      return;
    }

    out.print("<table border=2 cellpadding=\"5\" cellspacing=\"2\">");
    out.print(
        "<tr><th>Attempt</th><th>Task</th><th>Machine</th><th>State</th>"
            + "<th>Error</th><th>Logs</th></tr>\n");
    if (includeMap) {
      TaskInProgress[] tips = job.getTasks(TaskType.MAP);
      for (int i = 0; i < tips.length; ++i) {
        printFailedAttempts(out, tracker, jobId, tips[i], state);
      }
    }
    if (includeReduce) {
      TaskInProgress[] tips = job.getTasks(TaskType.REDUCE);
      for (int i = 0; i < tips.length; ++i) {
        printFailedAttempts(out, tracker, jobId, tips[i], state);
      }
    }
    out.print("</table>\n");
  }
Example #5
 /** The TIP's been ordered kill()ed. */
 public void kill() {
   if (isComplete() || failed) {
     return;
   }
   this.failed = true;
   killed = true;
   this.execFinishTime = JobTracker.getClock().getTime();
   recomputeProgress();
 }
Example #6
 /** Shutdown the job tracker and wait for it to finish. */
 public void shutdown() {
   try {
     if (tracker != null) {
       tracker.stopTracker();
     }
   } catch (Throwable e) {
     LOG.error("Problem shutting down job tracker", e);
   }
   isActive = false;
 }
Example #7
  // create a new job and add it to the jobtracker
  private JobInProgress createAndAddJob(JobTracker jobtracker, JobConf conf) {
    // submit a job in a fake manner
    // get the new job-id
    JobID id = new JobID(jobtracker.getTrackerIdentifier(), jobtracker.jobs.size() + 1);
    // create a JobInProgress for this fake job
    JobInProgress jip = new JobInProgress(id, conf, jobtracker);

    // insert this fake completed job in the jobtracker
    jobtracker.jobs.put(id, jip);

    return jip;
  }
Example #8
  private JobID validateJobRetire(JobConf jobConf, Path inDir, Path outDir, JobTracker jobtracker)
      throws IOException {

    RunningJob rj = UtilsForTests.runJob(jobConf, inDir, outDir, 0, 0);
    rj.waitForCompletion();
    assertTrue(rj.isSuccessful());
    JobID id = rj.getID();

    // wait for job to get retired
    waitTillRetire(id, jobtracker);
    RetireJobInfo retired = jobtracker.retireJobs.get(id);
    assertTrue(
        "History url not set",
        retired.getHistoryFile() != null && retired.getHistoryFile().length() > 0);
    assertNotNull("Job is not in cache", jobtracker.getJobStatus(id));

    // get the job conf filename
    String name = jobtracker.getLocalJobFilePath(id);
    File file = new File(name);

    assertFalse("JobConf file not deleted", file.exists());
    // test redirection
    URL jobUrl = new URL(rj.getTrackingURL());
    HttpURLConnection conn = (HttpURLConnection) jobUrl.openConnection();
    conn.setInstanceFollowRedirects(false);
    conn.connect();
    assertEquals(HttpURLConnection.HTTP_MOVED_TEMP, conn.getResponseCode());
    conn.disconnect();

    URL redirectedUrl = new URL(conn.getHeaderField("Location"));
    conn = (HttpURLConnection) redirectedUrl.openConnection();
    conn.connect();
    assertEquals(HttpURLConnection.HTTP_OK, conn.getResponseCode());
    conn.disconnect();

    return id;
  }
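The redirection check at the end of validateJobRetire() needs nothing beyond plain java.net: disable automatic redirect following, expect a 302, then fetch the Location target and expect a 200. A self-contained sketch of that sequence, with a placeholder URL standing in for the retired job's tracking URL:

import java.net.HttpURLConnection;
import java.net.URL;

public class RedirectCheck {
  public static void main(String[] args) throws Exception {
    // Hypothetical URL; the test uses rj.getTrackingURL() of the retired job.
    URL jobUrl = new URL("http://example.com/jobdetails.jsp?jobid=job_0001");
    HttpURLConnection conn = (HttpURLConnection) jobUrl.openConnection();
    conn.setInstanceFollowRedirects(false); // observe the 302 instead of the final page
    conn.connect();
    int code = conn.getResponseCode();
    System.out.println("first response: " + code);
    if (code == HttpURLConnection.HTTP_MOVED_TEMP) {
      URL redirectedUrl = new URL(conn.getHeaderField("Location"));
      HttpURLConnection second = (HttpURLConnection) redirectedUrl.openConnection();
      second.connect();
      System.out.println("redirect target: " + second.getResponseCode()); // expect 200
      second.disconnect();
    }
    conn.disconnect();
  }
}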
Example #9
 public static void startTracker(Configuration conf) throws IOException {
   if (tracker != null) throw new IOException("JobTracker already running.");
   while (true) {
     try {
       tracker = new JobTracker(conf);
       break;
     } catch (IOException e) {
       LOG.log(Level.WARNING, "Starting tracker", e);
     }
     try {
       Thread.sleep(1000);
     } catch (InterruptedException e) {
       // ignore the interruption; the loop simply retries starting the tracker
     }
   }
   tracker.offerService();
 }
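startTracker() above is a retry-until-success loop with a fixed one-second pause between attempts. A generic sketch of the same pattern under those assumptions (retry on any exception, fixed back-off); the names are illustrative, not Hadoop API:

import java.io.IOException;
import java.util.concurrent.Callable;

public class RetryUntilSuccess {
  // Keep calling the action until it returns without throwing.
  public static <T> T retry(Callable<T> action, long sleepMs) throws InterruptedException {
    while (true) {
      try {
        return action.call();      // stop at the first successful attempt
      } catch (Exception e) {
        System.err.println("attempt failed, retrying: " + e);
      }
      Thread.sleep(sleepMs);       // fixed back-off between attempts
    }
  }

  public static void main(String[] args) throws InterruptedException {
    final int[] tries = {0};
    Integer result = retry(() -> {
      if (++tries[0] < 3) throw new IOException("not ready yet");
      return tries[0];
    }, 100);
    System.out.println("succeeded on attempt " + result);
  }
}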
Example #10
  /** Indicate that one of the taskids in this TaskInProgress has successfully completed! */
  public void completed(TaskAttemptID taskid) {
    //
    // Record that this taskid is complete
    //
    completedTask(taskid, TaskStatus.State.SUCCEEDED);

    // Note the successful taskid
    setSuccessfulTaskid(taskid);

    //
    // Now that the TIP is complete, the other speculative
    // subtasks will be closed when the owning tasktracker
    // reports in and calls shouldClose() on this object.
    //

    this.completes++;
    this.execFinishTime = JobTracker.getClock().getTime();
    recomputeProgress();
  }
Example #11
  // Mock a job run such that the jobtracker is in a state similar to that
  // resulting from an actual job run.
  // Steps :
  //   - generate a new job-id
  //   - create and add a JobInProgress object using the fake job-id
  //   - create and add a fake tip of the passed type 't' under the fake job
  //     Note that t can be a MAP or a REDUCE or a JOB_SETUP or a JOB_CLEANUP.
  //   - create and add a fake attempt under the fake tip
  //   - remove the job from the jobtracker
  //   - check if the fake attempt is removed from the jobtracker
  private void testRemoveJobTasks(JobTracker jobtracker, JobConf conf, TaskType type) {
    // create and submit a job
    JobInProgress jip = createAndAddJob(jobtracker, conf);
    // create and add a tip
    TaskInProgress tip = createAndAddTIP(jobtracker, jip, type);
    // create and add an attempt
    TaskAttemptID taskid = createAndAddAttempt(tip, 0);

    // this fake attempt should not have any status
    assertNull(tip.getTaskStatus(taskid));

    // remove the job tasks for this fake job from the jobtracker
    jobtracker.removeJobTasks(jip);

    // check the taskidToTIPMap
    for (TaskAttemptID tid : jobtracker.taskidToTIPMap.keySet()) {
      LOG.info("TaskidToTIP : " + tid);
    }

    // check if the fake attempt is removed from the jobtracker
    assertEquals("'taskid' to TIP mapping still exists", 0, jobtracker.taskidToTIPMap.size());
  }
Example #12
  /** Initialization common to Map and Reduce */
  void init(JobID jobId) {
    this.startTime = JobTracker.getClock().getTime();
    this.id = new TaskID(jobId, isMapTask(), partition);
    this.skipping = startSkipping();
    long speculativeDuration;
    if (isMapTask()) {
      this.speculativeLag = conf.getMapSpeculativeLag();
      speculativeDuration = conf.getMapSpeculativeDuration();
    } else {
      this.speculativeLag = conf.getReduceSpeculativeLag();
      speculativeDuration = conf.getReduceSpeculativeDuration();
    }

    // speculate only if 1 / (1000 * progress_rate) > speculativeDuration,
    // i.e. speculate only if progress_rate < 1 / (1000 * speculativeDuration)

    if (speculativeDuration > 0) {
      this.maxProgressRateForSpeculation = 1.0 / (1000.0 * speculativeDuration);
    } else {
      // disable this check for durations <= 0
      this.maxProgressRateForSpeculation = -1.0;
    }
  }
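The threshold in init() becomes clearer with numbers. Assuming the configured speculative duration is expressed in seconds (the 1000 factor converts a per-millisecond progress rate), a duration of 600 yields a cut-off rate of 1 / 600000 progress units per millisecond; any task progressing more slowly is a speculation candidate. The value 600 below is purely illustrative:

public class SpeculationThreshold {
  public static void main(String[] args) {
    long speculativeDuration = 600;                    // assumed config value
    double maxProgressRateForSpeculation = speculativeDuration > 0
        ? 1.0 / (1000.0 * speculativeDuration)         // progress per millisecond
        : -1.0;                                        // a duration <= 0 disables the check
    System.out.println("speculate if progress rate < " + maxProgressRateForSpeculation);
  }
}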
Example #13
 /** Create the job tracker and run it. */
 public void run() {
   try {
     jc = (jc == null) ? createJobConf() : createJobConf(jc);
     File f = new File("build/test/mapred/local").getAbsoluteFile();
     jc.set("mapred.local.dir", f.getAbsolutePath());
     jc.setClass(
         "topology.node.switch.mapping.impl", StaticMapping.class, DNSToSwitchMapping.class);
     final String id = new SimpleDateFormat("yyyyMMddHHmmssSSS").format(new Date());
     if (ugi == null) {
       ugi = UserGroupInformation.getCurrentUser();
     }
     tracker =
         ugi.doAs(
             new PrivilegedExceptionAction<JobTracker>() {
               public JobTracker run() throws InterruptedException, IOException {
                 return JobTracker.startTracker(jc, id);
               }
             });
     tracker.offerService();
   } catch (Throwable e) {
     LOG.error("Job tracker crashed", e);
     isActive = false;
   }
 }
Example #14
  /** Check refreshNodes for decommissioning blacklisted nodes. */
  public void testBlacklistedNodeDecommissioning() throws Exception {
    LOG.info("Testing blacklisted node decommissioning");
    MiniMRCluster mr = null;
    JobTracker jt = null;

    try {
      // start mini mr
      JobConf jtConf = new JobConf();
      jtConf.set("mapred.max.tracker.blacklists", "1");
      mr = new MiniMRCluster(0, 0, 2, "file:///", 1, null, null, null, jtConf);
      jt = mr.getJobTrackerRunner().getJobTracker();

      assertEquals("Trackers not up", 2, jt.taskTrackers().size());
      // validate the total tracker count
      assertEquals(
          "Active tracker count mismatch", 2, jt.getClusterStatus(false).getTaskTrackers());
      // validate blacklisted count
      assertEquals(
          "Blacklisted tracker count mismatch",
          0,
          jt.getClusterStatus(false).getBlacklistedTrackers());

      // run a failing job to blacklist the tracker
      JobConf jConf = mr.createJobConf();
      jConf.set("mapred.max.tracker.failures", "1");
      jConf.setJobName("test-job-fail-once");
      jConf.setMapperClass(FailOnceMapper.class);
      jConf.setReducerClass(IdentityReducer.class);
      jConf.setNumMapTasks(1);
      jConf.setNumReduceTasks(0);

      RunningJob job =
          UtilsForTests.runJob(jConf, new Path(TEST_DIR, "in"), new Path(TEST_DIR, "out"));
      job.waitForCompletion();

      // validate the total tracker count
      assertEquals(
          "Active tracker count mismatch", 1, jt.getClusterStatus(false).getTaskTrackers());
      // validate blacklisted count
      assertEquals(
          "Blacklisted tracker count mismatch",
          1,
          jt.getClusterStatus(false).getBlacklistedTrackers());

      // find the blacklisted tracker
      String trackerName = null;
      for (TaskTrackerStatus status : jt.taskTrackers()) {
        if (jt.isBlacklisted(status.getTrackerName())) {
          trackerName = status.getTrackerName();
          break;
        }
      }
      // get the hostname
      String hostToDecommission = JobInProgress.convertTrackerNameToHostName(trackerName);
      LOG.info("Decommissioning tracker " + hostToDecommission);

      // decommission the node
      HashSet<String> decom = new HashSet<String>(1);
      decom.add(hostToDecommission);
      jt.decommissionNodes(decom);

      // validate
      // check the cluster status and tracker size
      assertEquals(
          "Tracker is not lost upon host decommissioning",
          1,
          jt.getClusterStatus(false).getTaskTrackers());
      assertEquals(
          "Blacklisted tracker count incorrect in cluster status " + "after decommissioning",
          0,
          jt.getClusterStatus(false).getBlacklistedTrackers());
      assertEquals("Tracker is not lost upon host decommissioning", 1, jt.taskTrackers().size());
    } finally {
      if (mr != null) {
        mr.shutdown();
        mr = null;
        jt = null;
        FileUtil.fullyDelete(new File(TEST_DIR.toString()));
      }
    }
  }
Example #15
  public void _jspService(HttpServletRequest request, HttpServletResponse response)
      throws java.io.IOException, ServletException {

    PageContext pageContext = null;
    HttpSession session = null;
    ServletContext application = null;
    ServletConfig config = null;
    JspWriter out = null;
    Object page = this;
    JspWriter _jspx_out = null;
    PageContext _jspx_page_context = null;

    try {
      response.setContentType("text/html; charset=UTF-8");
      pageContext = _jspxFactory.getPageContext(this, request, response, null, true, 8192, true);
      _jspx_page_context = pageContext;
      application = pageContext.getServletContext();
      config = pageContext.getServletConfig();
      session = pageContext.getSession();
      out = pageContext.getOut();
      _jspx_out = out;
      _jspx_resourceInjector =
          (org.apache.jasper.runtime.ResourceInjector)
              application.getAttribute("com.sun.appserv.jsp.resource.injector");

      out.write('\n');
      out.write('\n');

      JobTracker tracker = (JobTracker) application.getAttribute("job.tracker");
      String trackerName = StringUtils.simpleHostname(tracker.getJobTrackerMachine());

      out.write('\n');
      out.write('\n');
      out.write('\n');

      String jobId = request.getParameter("jobid");
      if (jobId == null) {
        out.println("<h2>Missing 'jobid'!</h2>");
        return;
      }
      JobID jobIdObj = JobID.forName(jobId);
      String kind = request.getParameter("kind");
      String cause = request.getParameter("cause");

      out.write("\n\n<html>\n<title>Hadoop ");
      out.print(jobId);
      out.write(" failures on ");
      out.print(trackerName);
      out.write("</title>\n<body>\n<h1>Hadoop <a href=\"jobdetails.jsp?jobid=");
      out.print(jobId);
      out.write('"');
      out.write('>');
      out.print(jobId);
      out.write("</a>\nfailures on <a href=\"jobtracker.jsp\">");
      out.print(trackerName);
      out.write("</a></h1>\n\n");

      printFailures(out, tracker, jobIdObj, kind, cause);

      out.write("\n\n<hr>\n<a href=\"jobtracker.jsp\">Go back to JobTracker</a><br>\n");

      out.println(ServletUtil.htmlFooter());

      out.write('\n');
    } catch (Throwable t) {
      if (!(t instanceof SkipPageException)) {
        out = _jspx_out;
        if (out != null && out.getBufferSize() != 0) out.clearBuffer();
        if (_jspx_page_context != null) _jspx_page_context.handlePageException(t);
      }
    } finally {
      _jspxFactory.releasePageContext(_jspx_page_context);
    }
  }
Example #16
  /** Indicate that one of the taskids in this TaskInProgress has failed. */
  public void incompleteSubTask(TaskAttemptID taskid, JobStatus jobStatus) {
    //
    // Note the failure and its location
    //
    TaskStatus status = taskStatuses.get(taskid);
    String trackerName;
    String trackerHostName = null;
    TaskStatus.State taskState = TaskStatus.State.FAILED;
    if (status != null) {
      trackerName = status.getTaskTracker();
      trackerHostName = JobInProgressTraits.convertTrackerNameToHostName(trackerName);
      // Check if the user manually KILLED/FAILED this task-attempt...
      Boolean shouldFail = tasksToKill.remove(taskid);
      if (shouldFail != null) {
        if (status.getRunState() == TaskStatus.State.FAILED
            || status.getRunState() == TaskStatus.State.KILLED) {
          taskState = (shouldFail) ? TaskStatus.State.FAILED : TaskStatus.State.KILLED;
        } else {
          taskState =
              (shouldFail) ? TaskStatus.State.FAILED_UNCLEAN : TaskStatus.State.KILLED_UNCLEAN;
        }
        status.setRunState(taskState);
        addDiagnosticInfo(taskid, "Task has been " + taskState + " by the user");
      }

      taskState = status.getRunState();
      if (taskState != TaskStatus.State.FAILED
          && taskState != TaskStatus.State.KILLED
          && taskState != TaskStatus.State.FAILED_UNCLEAN
          && taskState != TaskStatus.State.KILLED_UNCLEAN) {
        LOG.info(
            "Task '"
                + taskid
                + "' running on '"
                + trackerName
                + "' in state: '"
                + taskState
                + "' being failed!");
        status.setRunState(TaskStatus.State.FAILED);
        taskState = TaskStatus.State.FAILED;
      }

      // tasktracker went down and failed time was not reported.
      if (0 == status.getFinishTime()) {
        status.setFinishTime(JobTracker.getClock().getTime());
      }
    }

    this.activeTasks.remove(taskid);

    // Since we do not fail completed reduces (whose outputs go to hdfs), we
    // should note this failure only for completed maps, and only if this
    // taskid completed the map. However, if the job is done, there is no need
    // to manipulate completed maps.
    if (this.isMapTask()
        && !jobSetup
        && !jobCleanup
        && isComplete(taskid)
        && jobStatus.getRunState() != JobStatus.SUCCEEDED) {
      this.completes--;

      // Reset the successfulTaskId since we don't have a SUCCESSFUL task now
      resetSuccessfulTaskid();
    }

    // Note that there can be failures of tasks that are hosted on a machine
    // that has not yet registered with the restarted jobtracker.
    // Recalculate the counts only if it's a genuine failure.
    if (tasks.contains(taskid)) {
      if (taskState == TaskStatus.State.FAILED) {
        numTaskFailures++;
        machinesWhereFailed.add(trackerHostName);
        if (maxSkipRecords > 0) {
          // skipping feature enabled
          LOG.debug("TaskInProgress adding " + status.getNextRecordRange());
          failedRanges.add(status.getNextRecordRange());
          skipping = startSkipping();
        }

      } else if (taskState == TaskStatus.State.KILLED) {
        numKilledTasks++;
      }
    }

    if (numTaskFailures >= maxTaskAttempts) {
      LOG.info("TaskInProgress " + getTIPId() + " has failed " + numTaskFailures + " times.");
      kill();
    }
  }
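Stripped of the Hadoop types, the bookkeeping at the end of incompleteSubTask() amounts to: genuinely FAILED attempts count toward a limit, KILLED attempts are recorded separately, and the TIP gives up once the failure count reaches maxTaskAttempts. An illustrative stand-in (the enum and field names are assumptions, not the real classes):

public class FailureBookkeeping {
  enum State { FAILED, KILLED }

  private final int maxTaskAttempts;
  private int numTaskFailures;
  private int numKilledTasks;
  private boolean dead;

  FailureBookkeeping(int maxTaskAttempts) {
    this.maxTaskAttempts = maxTaskAttempts;
  }

  void attemptEnded(State state) {
    if (state == State.FAILED) {
      numTaskFailures++;           // only genuine failures count toward the limit
    } else {
      numKilledTasks++;            // killed attempts are tracked but not fatal
    }
    if (numTaskFailures >= maxTaskAttempts) {
      dead = true;                 // corresponds to kill() in the code above
    }
  }

  public static void main(String[] args) {
    FailureBookkeeping tip = new FailureBookkeeping(4);
    tip.attemptEnded(State.KILLED);
    tip.attemptEnded(State.FAILED);
    System.out.println("failures=" + tip.numTaskFailures
        + " killed=" + tip.numKilledTasks + " dead=" + tip.dead);
  }
}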
Example #17
  /**
   * A status message from a client has arrived. It updates the status of a single
   * component-thread-task, which might result in an overall TaskInProgress status update.
   *
   * @return has the task changed its state noticeably?
   */
  synchronized boolean updateStatus(TaskStatus status) {
    TaskAttemptID taskid = status.getTaskID();
    String taskTracker = status.getTaskTracker();
    String diagInfo = status.getDiagnosticInfo();
    TaskStatus oldStatus = taskStatuses.get(taskid);
    boolean changed = true;
    if (diagInfo != null && diagInfo.length() > 0) {
      long runTime = status.getRunTime();
      LOG.info(
          "Error from "
              + taskid
              + " on "
              + taskTracker
              + " runTime(msec) "
              + runTime
              + ": "
              + diagInfo);
      addDiagnosticInfo(taskid, diagInfo);
    }

    if (skipping) {
      failedRanges.updateState(status);
    }

    if (oldStatus != null) {
      TaskStatus.State oldState = oldStatus.getRunState();
      TaskStatus.State newState = status.getRunState();

      // We should never receive a duplicate success/failure/killed
      // status update for the same taskid! This is a safety check,
      // and is addressed better at the TaskTracker to ensure this.
      // @see {@link TaskTracker.transmitHeartbeat()}
      if ((newState != TaskStatus.State.RUNNING
              && newState != TaskStatus.State.COMMIT_PENDING
              && newState != TaskStatus.State.FAILED_UNCLEAN
              && newState != TaskStatus.State.KILLED_UNCLEAN
              && newState != TaskStatus.State.UNASSIGNED)
          && (oldState == newState)) {
        LOG.warn(
            "Received duplicate status update of '"
                + newState
                + "' for '"
                + taskid
                + "' of TIP '"
                + getTIPId()
                + "'"
                + " oldTT="
                + oldStatus.getTaskTracker()
                + " while newTT="
                + status.getTaskTracker());
        return false;
      }

      // The task is not allowed to move from completed back to running.
      // We have seen out-of-order status messages moving tasks from complete
      // to running. This is a spot fix, but it should be addressed more
      // globally.
      if ((newState == TaskStatus.State.RUNNING || newState == TaskStatus.State.UNASSIGNED)
          && (oldState == TaskStatus.State.FAILED
              || oldState == TaskStatus.State.KILLED
              || oldState == TaskStatus.State.FAILED_UNCLEAN
              || oldState == TaskStatus.State.KILLED_UNCLEAN
              || oldState == TaskStatus.State.SUCCEEDED
              || oldState == TaskStatus.State.COMMIT_PENDING)) {
        return false;
      }

      // Do not accept any status once the task is marked FAILED/KILLED
      // This is to handle the case of the JobTracker timing out a task
      // due to launch delay, but the TT comes back with any state or
      // TT got expired
      if (oldState == TaskStatus.State.FAILED || oldState == TaskStatus.State.KILLED) {
        tasksToKill.put(taskid, true);
        return false;
      }

      changed = oldState != newState;
    }
    // if task is a cleanup attempt, do not replace the complete status,
    // update only specific fields.
    // For example, startTime should not be updated,
    // but finishTime has to be updated.
    if (!isCleanupAttempt(taskid)) {
      taskStatuses.put(taskid, status);
      // we don't want to include setup tasks in the task execution stats
      if (!isJobSetupTask()
          && !isJobCleanupTask()
          && ((isMapTask() && job.hasSpeculativeMaps())
              || (!isMapTask() && job.hasSpeculativeReduces()))) {
        updateProgressRate(JobTracker.getClock().getTime());
      }
    } else {
      taskStatuses
          .get(taskid)
          .statusUpdate(
              status.getRunState(),
              status.getProgress(),
              status.getStateString(),
              status.getPhase(),
              status.getFinishTime());
    }

    // Recompute progress
    recomputeProgress();
    return changed;
  }
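The first two guards inside updateStatus() condense to a small state check: a repeated non-running state is treated as a duplicate report, and a terminal or committed state never moves back to RUNNING or UNASSIGNED. The helper below illustrates that logic only; it is not the Hadoop implementation:

import java.util.EnumSet;

public class TransitionGuard {
  enum State { UNASSIGNED, RUNNING, COMMIT_PENDING, SUCCEEDED,
               FAILED, KILLED, FAILED_UNCLEAN, KILLED_UNCLEAN }

  // States a task must never return to RUNNING/UNASSIGNED from.
  static final EnumSet<State> NON_RESTARTABLE = EnumSet.of(
      State.FAILED, State.KILLED, State.FAILED_UNCLEAN,
      State.KILLED_UNCLEAN, State.SUCCEEDED, State.COMMIT_PENDING);

  // States that may legitimately be reported more than once.
  static final EnumSet<State> IN_FLIGHT = EnumSet.of(
      State.RUNNING, State.COMMIT_PENDING, State.FAILED_UNCLEAN,
      State.KILLED_UNCLEAN, State.UNASSIGNED);

  static boolean accept(State oldState, State newState) {
    if (!IN_FLIGHT.contains(newState) && oldState == newState) {
      return false; // duplicate success/failure/killed update
    }
    if ((newState == State.RUNNING || newState == State.UNASSIGNED)
        && NON_RESTARTABLE.contains(oldState)) {
      return false; // completed work must not move back to running
    }
    return true;
  }

  public static void main(String[] args) {
    System.out.println(accept(State.SUCCEEDED, State.RUNNING)); // false
    System.out.println(accept(State.FAILED, State.FAILED));     // false
    System.out.println(accept(State.RUNNING, State.SUCCEEDED)); // true
  }
}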
Example #18
  /**
   * Test job retire with tasks that report their *first* status only after the job retires.
   *
   * Steps:
   *   - Start a mini-mr cluster with 1 task-tracker having only map slots. Note that this
   *     task-tracker will take care of setup/cleanup and the map tasks.
   *   - Submit a job with 1 map task and 1 reduce task.
   *   - Wait for the job to finish the map task.
   *   - Start a 2nd tracker that waits for a long time after contacting the JT.
   *   - Wait for the 2nd tracker to get stuck.
   *   - Kill the job.
   *   - Wait for the job to retire.
   *   - Check if the tip mappings are cleaned up.
   */
  public void testJobRetireWithUnreportedTasks() throws Exception {
    MiniMRCluster mr = null;
    try {
      JobConf conf = new JobConf();
      // set the num-map-slots to 1 so that only setup/cleanup and the map
      // tasks, but no reduce tasks, can run on it
      conf.setInt("mapred.tasktracker.map.tasks.maximum", 1);
      conf.setInt("mapred.tasktracker.reduce.tasks.maximum", 0);

      mr = startCluster(conf, 1);
      JobTracker jobtracker = mr.getJobTrackerRunner().getJobTracker();

      RunningJob job =
          UtilsForTests.runJob(
              mr.createJobConf(), new Path(testDir, "in-1"), new Path(testDir, "out-1"), 1, 1);
      JobID id = JobID.downgrade(job.getID());
      JobInProgress jip = jobtracker.getJob(id);

      // wait 100 secs for the job to complete its map task
      for (int i = 0; i < 1000 && jip.finishedMaps() < 1; i++) {
        UtilsForTests.waitFor(100);
      }
      assertEquals(jip.finishedMaps(), 1);

      // start a tracker that will wait
      LOG.info("Adding a waiting tracker");
      TaskTrackerRunner testTrackerRunner =
          mr.new TaskTrackerRunner(1, 1, null, mr.createJobConf()) {
            @Override
            TaskTracker createTaskTracker(JobConf conf) throws InterruptedException, IOException {
              return new WaitingTaskTracker(conf);
            }
          };
      mr.addTaskTracker(testTrackerRunner);
      LOG.info("Waiting tracker added");

      WaitingTaskTracker testTT = (WaitingTaskTracker) testTrackerRunner.getTaskTracker();

      // wait 100 secs for the newly started task-tracker to join
      for (int i = 0; i < 1000 && jobtracker.taskTrackers().size() < 2; i++) {
        UtilsForTests.waitFor(100);
      }
      assertEquals(jobtracker.taskTrackers().size(), 2);
      LOG.info("Cluster is now up with 2 trackers");
      // stop the test-tt as it's no longer required
      mr.stopTaskTracker(mr.getTaskTrackerID(testTT.getName()));

      // 1 reduce task should be scheduled
      assertEquals("TestTT contacted but no reduce task scheduled on it", 1, jip.runningReduces());

      // kill the job
      LOG.info("Killing job " + id);
      job.killJob();

      // check if the reduce task attempt status is missing
      TaskInProgress tip = jip.getTasks(TaskType.REDUCE)[0];
      assertNull(tip.getTaskStatus(tip.getAllTaskAttemptIDs()[0]));

      // wait for the job to retire
      waitTillRetire(id, jobtracker);

      // check the taskidToTIPMap
      for (TaskAttemptID tid : jobtracker.taskidToTIPMap.keySet()) {
        LOG.info("TaskidToTIP : " + tid);
      }
      assertEquals("'taskid' to TIP mapping still exists", 0, jobtracker.taskidToTIPMap.size());
    } finally {
      if (mr != null) {
        mr.shutdown();
      }
      // cleanup
      FileUtil.fullyDelete(new File(testDir.toString()));
    }
  }
Example #19
 public int getJobTrackerInfoPort() {
   return tracker.getInfoPort();
 }
Example #20
  private void printFailedAttempts(
      JspWriter out,
      JobTracker tracker,
      JobID jobId,
      TaskInProgress tip,
      TaskStatus.State failState)
      throws IOException {
    TaskStatus[] statuses = tip.getTaskStatuses();
    TaskID tipId = tip.getTIPId();
    for (int i = 0; i < statuses.length; ++i) {
      TaskStatus.State taskState = statuses[i].getRunState();
      if ((failState == null
              && (taskState == TaskStatus.State.FAILED || taskState == TaskStatus.State.KILLED))
          || taskState == failState) {
        String taskTrackerName = statuses[i].getTaskTracker();
        TaskTrackerStatus taskTracker = tracker.getTaskTrackerStatus(taskTrackerName);
        out.print(
            "<tr><td>"
                + statuses[i].getTaskID()
                + "</td><td><a href=\"taskdetails.jsp?jobid="
                + jobId
                + "&tipid="
                + tipId
                + "\">"
                + tipId
                + "</a></td>");
        if (taskTracker == null) {
          out.print("<td>" + taskTrackerName + "</td>");
        } else {
          out.print(
              "<td><a href=\"http://"
                  + taskTracker.getHost()
                  + ":"
                  + taskTracker.getHttpPort()
                  + "\">"
                  + taskTracker.getHost()
                  + "</a></td>");
        }
        out.print("<td>" + taskState + "</td>");
        out.print("<td><pre>");
        String[] failures = tracker.getTaskDiagnostics(statuses[i].getTaskID());
        if (failures == null) {
          out.print("&nbsp;");
        } else {
          for (int j = 0; j < failures.length; j++) {
            out.print(failures[j]);
            if (j < (failures.length - 1)) {
              out.print("\n-------\n");
            }
          }
        }
        out.print("</pre></td>");

        out.print("<td>");
        String taskLogUrl = null;
        if (taskTracker != null) {
          taskLogUrl =
              TaskLogServlet.getTaskLogUrl(
                  taskTracker.getHost(),
                  String.valueOf(taskTracker.getHttpPort()),
                  statuses[i].getTaskID().toString());
        }
        if (taskLogUrl != null) {
          String tailFourKBUrl = taskLogUrl + "&start=-4097";
          String tailEightKBUrl = taskLogUrl + "&start=-8193";
          String entireLogUrl = taskLogUrl;
          out.print("<a href=\"" + tailFourKBUrl + "\">Last 4KB</a><br/>");
          out.print("<a href=\"" + tailEightKBUrl + "\">Last 8KB</a><br/>");
          out.print("<a href=\"" + entireLogUrl + "\">All</a><br/>");
        } else {
          out.print("n/a"); // task tracker was lost
        }
        out.print("</td>");

        out.print("</tr>\n");
      }
    }
  }
Example #21
  // for initTasks, update information from JobStory object
  @Override
  public synchronized void initTasks() throws IOException {
    boolean loggingEnabled = LOG.isDebugEnabled();
    if (loggingEnabled) {
      LOG.debug("(initTasks@SJIP) Starting Initialization for " + jobId);
    }
    numMapTasks = jobStory.getNumberMaps();
    numReduceTasks = jobStory.getNumberReduces();

    JobHistory.JobInfo.logSubmitted(
        getJobID(), conf, jobFile.toString(), this.startTime, hasRestarted());
    if (loggingEnabled) {
      LOG.debug("(initTasks@SJIP) Logged to job history for " + jobId);
    }

    //    checkTaskLimits();

    if (loggingEnabled) {
      LOG.debug("(initTasks@SJIP) Checked task limits for " + jobId);
    }

    final String jobFile = "default";
    splits = getRawSplits(jobStory.getInputSplits());
    if (loggingEnabled) {
      LOG.debug(
          "(initTasks@SJIP) Created splits for job = "
              + jobId
              + " number of splits = "
              + splits.length);
    }

    //    createMapTasks(jobFile, splits);

    numMapTasks = splits.length;
    maps = new TaskInProgress[numMapTasks];
    for (int i = 0; i < numMapTasks; ++i) {
      inputLength += splits[i].getDataLength();
      maps[i] = new TaskInProgress(jobId, jobFile, splits[i], conf, this, i, numSlotsPerMap);
    }
    if (numMapTasks > 0) {
      nonRunningMapCache = createCache(splits, maxLevel);
      if (loggingEnabled) {
        LOG.debug(
            "initTasks:numMaps="
                + numMapTasks
                + " Size of nonRunningMapCache="
                + nonRunningMapCache.size()
                + " for "
                + jobId);
      }
    }

    // set the launch time
    this.launchTime = JobTracker.getClock().getTime();

    //    createReduceTasks(jobFile);

    //
    // Create reduce tasks
    //
    this.reduces = new TaskInProgress[numReduceTasks];
    for (int i = 0; i < numReduceTasks; i++) {
      reduces[i] =
          new TaskInProgress(jobId, jobFile, numMapTasks, i, conf, this, numSlotsPerReduce);
      nonRunningReduces.add(reduces[i]);
    }

    // Calculate the minimum number of maps to be complete before
    // we should start scheduling reduces
    completedMapsForReduceSlowstart =
        (int)
            Math.ceil(
                (conf.getFloat(
                        "mapred.reduce.slowstart." + "completed.maps",
                        DEFAULT_COMPLETED_MAPS_PERCENT_FOR_REDUCE_SLOWSTART)
                    * numMapTasks));

    tasksInited.set(true);
    if (loggingEnabled) {
      LOG.debug(
          "Initializing job, now status = " + JobStatus.getJobRunState(getStatus().getRunState()));
    }
    setupComplete();

    if (loggingEnabled) {
      LOG.debug(
          "Initializing job, inited-status = "
              + JobStatus.getJobRunState(getStatus().getRunState()));
    }
  }
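The reduce slow-start threshold computed near the end of initTasks() is just the ceiling of a configured fraction of the map count. A standalone illustration; the 0.05 fraction is an assumption about DEFAULT_COMPLETED_MAPS_PERCENT_FOR_REDUCE_SLOWSTART rather than a quoted value:

public class ReduceSlowstart {
  public static void main(String[] args) {
    int numMapTasks = 37;
    float completedMapsFraction = 0.05f;   // assumed default for mapred.reduce.slowstart.completed.maps
    int completedMapsForReduceSlowstart =
        (int) Math.ceil(completedMapsFraction * numMapTasks);
    // Reduces are scheduled only once this many maps have finished.
    System.out.println(completedMapsForReduceSlowstart); // 2
  }
}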
Example #22
  public void _jspService(HttpServletRequest request, HttpServletResponse response)
      throws java.io.IOException, ServletException {

    PageContext pageContext = null;
    HttpSession session = null;
    ServletContext application = null;
    ServletConfig config = null;
    JspWriter out = null;
    Object page = this;
    JspWriter _jspx_out = null;
    PageContext _jspx_page_context = null;

    try {
      response.setContentType("text/html; charset=UTF-8");
      pageContext = _jspxFactory.getPageContext(this, request, response, null, true, 8192, true);
      _jspx_page_context = pageContext;
      application = pageContext.getServletContext();
      config = pageContext.getServletConfig();
      session = pageContext.getSession();
      out = pageContext.getOut();
      _jspx_out = out;
      _jspx_resourceInjector =
          (org.apache.jasper.runtime.ResourceInjector)
              application.getAttribute("com.sun.appserv.jsp.resource.injector");

      out.write('\n');

      JobTracker tracker = (JobTracker) application.getAttribute("job.tracker");
      ClusterStatus status = tracker.getClusterStatus();
      String trackerName = StringUtils.simpleHostname(tracker.getJobTrackerMachine());

      out.write("\n<html>\n<head>\n<title>");
      out.print(trackerName);
      out.write(
          " Hadoop Locality Statistics</title>\n<link rel=\"stylesheet\" type=\"text/css\" href=\"/static/hadoop.css\">\n</head>\n<body>\n<h1>");
      out.print(trackerName);
      out.write(" Hadoop Locality Statistics</h1>\n\n<b>State:</b> ");
      out.print(status.getJobTrackerState());
      out.write("<br>\n<b>Started:</b> ");
      out.print(new Date(tracker.getStartTime()));
      out.write("<br>\n<b>Version:</b> ");
      out.print(VersionInfo.getVersion());
      out.write(",\n                r");
      out.print(VersionInfo.getRevision());
      out.write("<br>\n<b>Compiled:</b> ");
      out.print(VersionInfo.getDate());
      out.write(" by\n                 ");
      out.print(VersionInfo.getUser());
      out.write("<br>\n<b>Identifier:</b> ");
      out.print(tracker.getTrackerIdentifier());
      out.write("<br>\n\n<hr>\n\n");

      Collection<JobInProgress> jobs = new ArrayList<JobInProgress>();
      jobs.addAll(tracker.completedJobs());
      jobs.addAll(tracker.runningJobs());
      jobs.addAll(tracker.failedJobs());
      int dataLocalMaps = 0;
      int rackLocalMaps = 0;
      int totalMaps = 0;
      int totalReduces = 0;
      for (JobInProgress job : jobs) {
        Counters counters = job.getCounters();
        dataLocalMaps += counters.getCounter(JobInProgress.Counter.DATA_LOCAL_MAPS);
        rackLocalMaps += counters.getCounter(JobInProgress.Counter.RACK_LOCAL_MAPS);
        totalMaps += counters.getCounter(JobInProgress.Counter.TOTAL_LAUNCHED_MAPS);
        totalReduces += counters.getCounter(JobInProgress.Counter.TOTAL_LAUNCHED_REDUCES);
      }
      int dataLocalMapPct = totalMaps == 0 ? 0 : (100 * dataLocalMaps) / totalMaps;
      int rackLocalMapPct = totalMaps == 0 ? 0 : (100 * rackLocalMaps) / totalMaps;
      int dataRackLocalMapPct =
          totalMaps == 0 ? 0 : (100 * (dataLocalMaps + rackLocalMaps)) / totalMaps;

      out.write("\n<p>\n<b>Data Local Maps:</b> ");
      out.print(dataLocalMaps);
      out.write(' ');
      out.write('(');
      out.print(dataLocalMapPct);
      out.write("%) <br>\n<b>Rack Local Maps:</b> ");
      out.print(rackLocalMaps);
      out.write(' ');
      out.write('(');
      out.print(rackLocalMapPct);
      out.write("%) <br>\n<b>Data or Rack Local:</b> ");
      out.print(dataLocalMaps + rackLocalMaps);
      out.write(' ');
      out.write('(');
      out.print(dataRackLocalMapPct);
      out.write("%) <br>\n<b>Total Maps:</b> ");
      out.print(totalMaps);
      out.write(" <br>\n<b>Total Reduces:</b> ");
      out.print(totalReduces);
      out.write(" <br>\n</p>\n\n");

      out.println(ServletUtil.htmlFooter());

      out.write('\n');
    } catch (Throwable t) {
      if (!(t instanceof SkipPageException)) {
        out = _jspx_out;
        if (out != null && out.getBufferSize() != 0) out.clearBuffer();
        if (_jspx_page_context != null) _jspx_page_context.handlePageException(t);
      }
    } finally {
      _jspxFactory.releasePageContext(_jspx_page_context);
    }
  }
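The locality percentages printed by this page reduce to integer arithmetic with a guard against dividing by zero when no maps have launched. A stripped-down version of that computation:

public class LocalityStats {
  static int pct(long part, long total) {
    return total == 0 ? 0 : (int) ((100 * part) / total);
  }

  public static void main(String[] args) {
    long dataLocalMaps = 120, rackLocalMaps = 30, totalMaps = 200;
    System.out.println("data-local maps:   " + pct(dataLocalMaps, totalMaps) + "%");
    System.out.println("rack-local maps:   " + pct(rackLocalMaps, totalMaps) + "%");
    System.out.println("data or rack:      " + pct(dataLocalMaps + rackLocalMaps, totalMaps) + "%");
    System.out.println("no maps launched:  " + pct(0, 0) + "%");
  }
}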
Example #23
  @SuppressWarnings("deprecation")
  public SimulatorJobInProgress(
      JobID jobid, JobTracker jobtracker, JobConf default_conf, JobStory jobStory) {
    super(jobid, jobStory.getJobConf(), jobtracker);
    // jobSetupCleanupNeeded is set to false in the parent constructor, though
    // the default is true

    restartCount = 0;
    jobSetupCleanupNeeded = false;

    this.memoryPerMap = conf.getMemoryForMapTask();
    this.memoryPerReduce = conf.getMemoryForReduceTask();
    this.maxTaskFailuresPerTracker = conf.getMaxTaskFailuresPerTracker();

    this.jobId = jobid;
    String url =
        "http://"
            + jobtracker.getJobTrackerMachine()
            + ":"
            + jobtracker.getInfoPort()
            + "/jobdetails.jsp?jobid="
            + jobid;
    this.jobtracker = jobtracker;
    this.conf = jobStory.getJobConf();
    this.priority = conf.getJobPriority();
    Path jobDir = jobtracker.getSystemDirectoryForJob(jobid);
    this.jobFile = new Path(jobDir, "job.xml");
    this.status =
        new JobStatus(jobid, 0.0f, 0.0f, 0.0f, 0.0f, JobStatus.PREP, priority, conf.getUser());
    this.profile =
        new JobProfile(
            jobStory.getUser(),
            jobid,
            this.jobFile.toString(),
            url,
            jobStory.getName(),
            conf.getQueueName());
    this.startTime = JobTracker.getClock().getTime();
    status.setStartTime(startTime);
    this.resourceEstimator = new ResourceEstimator(this);

    this.numMapTasks = jobStory.getNumberMaps();
    this.numReduceTasks = jobStory.getNumberReduces();
    this.taskCompletionEvents =
        new ArrayList<TaskCompletionEvent>(numMapTasks + numReduceTasks + 10);

    this.mapFailuresPercent = conf.getMaxMapTaskFailuresPercent();
    this.reduceFailuresPercent = conf.getMaxReduceTaskFailuresPercent();
    MetricsContext metricsContext = MetricsUtil.getContext("mapred");
    this.jobMetrics = MetricsUtil.createRecord(metricsContext, "job");
    this.jobMetrics.setTag("user", conf.getUser());
    this.jobMetrics.setTag("sessionId", conf.getSessionId());
    this.jobMetrics.setTag("jobName", conf.getJobName());
    this.jobMetrics.setTag("jobId", jobid.toString());

    this.maxLevel = jobtracker.getNumTaskCacheLevels();
    this.anyCacheLevel = this.maxLevel + 1;
    this.nonLocalMaps = new LinkedList<TaskInProgress>();
    this.nonLocalRunningMaps = new LinkedHashSet<TaskInProgress>();
    this.runningMapCache = new IdentityHashMap<Node, Set<TaskInProgress>>();
    this.nonRunningReduces = new LinkedList<TaskInProgress>();
    this.runningReduces = new LinkedHashSet<TaskInProgress>();
    this.slowTaskThreshold =
        Math.max(0.0f, conf.getFloat("mapred.speculative.execution.slowTaskThreshold", 1.0f));
    this.speculativeCap = conf.getFloat("mapred.speculative.execution.speculativeCap", 0.1f);
    this.slowNodeThreshold = conf.getFloat("mapred.speculative.execution.slowNodeThreshold", 1.0f);

    this.jobStory = jobStory;
    //    this.jobHistory = this.jobtracker.getJobHistory();
  }
Example #24
 public int getJobTrackerPort() {
   return tracker.getTrackerPort();
 }