/** Check refreshNodes for decommissioning blacklisted nodes. */
public void testBlacklistedNodeDecommissioning() throws Exception {
  LOG.info("Testing blacklisted node decommissioning");
  MiniMRCluster mr = null;
  JobTracker jt = null;

  try {
    // start mini mr
    JobConf jtConf = new JobConf();
    jtConf.set("mapred.max.tracker.blacklists", "1");
    mr = new MiniMRCluster(0, 0, 2, "file:///", 1, null, null, null, jtConf);
    jt = mr.getJobTrackerRunner().getJobTracker();

    assertEquals("Trackers not up", 2, jt.taskTrackers().size());
    // validate the total tracker count
    assertEquals("Active tracker count mismatch",
                 2, jt.getClusterStatus(false).getTaskTrackers());
    // validate blacklisted count
    assertEquals("Blacklisted tracker count mismatch",
                 0, jt.getClusterStatus(false).getBlacklistedTrackers());

    // run a failing job to blacklist the tracker
    JobConf jConf = mr.createJobConf();
    jConf.set("mapred.max.tracker.failures", "1");
    jConf.setJobName("test-job-fail-once");
    jConf.setMapperClass(FailOnceMapper.class);
    jConf.setReducerClass(IdentityReducer.class);
    jConf.setNumMapTasks(1);
    jConf.setNumReduceTasks(0);

    RunningJob job = UtilsForTests.runJob(jConf, new Path(TEST_DIR, "in"),
                                          new Path(TEST_DIR, "out"));
    job.waitForCompletion();

    // validate the total tracker count
    assertEquals("Active tracker count mismatch",
                 1, jt.getClusterStatus(false).getTaskTrackers());
    // validate blacklisted count
    assertEquals("Blacklisted tracker count mismatch",
                 1, jt.getClusterStatus(false).getBlacklistedTrackers());

    // find the blacklisted tracker
    String trackerName = null;
    for (TaskTrackerStatus status : jt.taskTrackers()) {
      if (jt.isBlacklisted(status.getTrackerName())) {
        trackerName = status.getTrackerName();
        break;
      }
    }
    // get the hostname
    String hostToDecommission =
      JobInProgress.convertTrackerNameToHostName(trackerName);
    LOG.info("Decommissioning tracker " + hostToDecommission);

    // decommission the node
    HashSet<String> decom = new HashSet<String>(1);
    decom.add(hostToDecommission);
    jt.decommissionNodes(decom);

    // validate the cluster status and tracker size
    assertEquals("Tracker is not lost upon host decommissioning",
                 1, jt.getClusterStatus(false).getTaskTrackers());
    assertEquals("Blacklisted tracker count incorrect in cluster status "
                 + "after decommissioning",
                 0, jt.getClusterStatus(false).getBlacklistedTrackers());
    assertEquals("Tracker is not lost upon host decommissioning",
                 1, jt.taskTrackers().size());
  } finally {
    if (mr != null) {
      mr.shutdown();
      mr = null;
      jt = null;
      FileUtil.fullyDelete(new File(TEST_DIR.toString()));
    }
  }
}
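
// The test above relies on a FailOnceMapper that is not defined in this excerpt.
// The sketch below is an assumption about its shape, not the actual class: using
// the old mapred API, the first task attempt throws (blacklisting its tracker for
// the job via mapred.max.tracker.failures=1 and then globally via
// mapred.max.tracker.blacklists=1), while the retried attempt succeeds so that
// job.waitForCompletion() returns.
static class FailOnceMapper extends MapReduceBase
    implements Mapper<WritableComparable, Writable, WritableComparable, Writable> {

  private boolean shouldFail = false;

  @Override
  public void configure(JobConf conf) {
    // fail only the very first attempt of the task; retries succeed
    TaskAttemptID attempt = TaskAttemptID.forName(conf.get("mapred.task.id"));
    shouldFail = (attempt.getId() == 0);
  }

  public void map(WritableComparable key, Writable value,
                  OutputCollector<WritableComparable, Writable> output,
                  Reporter reporter) throws IOException {
    if (shouldFail) {
      throw new RuntimeException("Failing the first attempt on purpose");
    }
  }
}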
/**
 * Test job retire with tasks that report their *first* status only after
 * the job retires.
 * Steps :
 *  - Start a mini-mr cluster with 1 task-tracker having only map slots.
 *    Note that this task-tracker will take care of setup/cleanup and the
 *    map tasks.
 *  - Submit a job with 1 map task and 1 reduce task
 *  - Wait for the job to finish the map task
 *  - Start a 2nd tracker that waits for a long time after contacting the JT.
 *  - Wait for the 2nd tracker to get stuck
 *  - Kill the job
 *  - Wait for the job to retire
 *  - Check if the tip mappings are cleaned up.
 */
public void testJobRetireWithUnreportedTasks() throws Exception {
  MiniMRCluster mr = null;
  try {
    JobConf conf = new JobConf();
    // set the num-map-slots to 1 so that no reduce tasks but setup/cleanup
    // can run on it
    conf.setInt("mapred.tasktracker.map.tasks.maximum", 1);
    conf.setInt("mapred.tasktracker.reduce.tasks.maximum", 0);
    mr = startCluster(conf, 1);
    JobTracker jobtracker = mr.getJobTrackerRunner().getJobTracker();

    RunningJob job = UtilsForTests.runJob(mr.createJobConf(),
                                          new Path(testDir, "in-1"),
                                          new Path(testDir, "out-1"), 1, 1);
    JobID id = JobID.downgrade(job.getID());
    JobInProgress jip = jobtracker.getJob(id);

    // wait 100 secs for the job to complete its map task
    for (int i = 0; i < 1000 && jip.finishedMaps() < 1; i++) {
      UtilsForTests.waitFor(100);
    }
    assertEquals(1, jip.finishedMaps());

    // start a tracker that will wait
    LOG.info("Adding a waiting tracker");
    TaskTrackerRunner testTrackerRunner =
      mr.new TaskTrackerRunner(1, 1, null, mr.createJobConf()) {
        @Override
        TaskTracker createTaskTracker(JobConf conf)
            throws InterruptedException, IOException {
          return new WaitingTaskTracker(conf);
        }
      };
    mr.addTaskTracker(testTrackerRunner);
    LOG.info("Waiting tracker added");

    WaitingTaskTracker testTT =
      (WaitingTaskTracker) testTrackerRunner.getTaskTracker();

    // wait 100 secs for the newly started task-tracker to join
    for (int i = 0; i < 1000 && jobtracker.taskTrackers().size() < 2; i++) {
      UtilsForTests.waitFor(100);
    }
    assertEquals(2, jobtracker.taskTrackers().size());
    LOG.info("Cluster is now up with 2 trackers");

    // stop the test-tt as it's no longer required
    mr.stopTaskTracker(mr.getTaskTrackerID(testTT.getName()));

    // 1 reduce task should be scheduled
    assertEquals("TestTT contacted but no reduce task scheduled on it",
                 1, jip.runningReduces());

    // kill the job
    LOG.info("Killing job " + id);
    job.killJob();

    // check if the reduce task attempt status is missing
    TaskInProgress tip = jip.getTasks(TaskType.REDUCE)[0];
    assertNull(tip.getTaskStatus(tip.getAllTaskAttemptIDs()[0]));

    // wait for the job to retire
    waitTillRetire(id, jobtracker);

    // check the taskidToTIPMap
    for (TaskAttemptID tid : jobtracker.taskidToTIPMap.keySet()) {
      LOG.info("TaskidToTIP : " + tid);
    }
    assertEquals("'taskid' to TIP mapping still exists",
                 0, jobtracker.taskidToTIPMap.size());
  } finally {
    if (mr != null) {
      mr.shutdown();
    }
    // cleanup
    FileUtil.fullyDelete(new File(testDir.toString()));
  }
}
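
// WaitingTaskTracker and waitTillRetire are used above but not defined in this
// excerpt. The sketches below show one plausible shape for them (assumptions,
// not the actual helpers): a TaskTracker whose heartbeat loop hangs after the
// first contact with the JobTracker, and a bounded poll that waits for the job
// to disappear from the JobTracker's job list once it retires.
private static class WaitingTaskTracker extends TaskTracker {

  WaitingTaskTracker(JobConf conf) throws IOException, InterruptedException {
    super(conf);
  }

  @Override
  HeartbeatResponse transmitHeartBeat(long now) throws IOException {
    // contact the JT once so a task gets scheduled on this tracker, then hang
    // forever so the scheduled attempt never reports any status back
    super.transmitHeartBeat(now);
    while (true) {
      UtilsForTests.waitFor(1000);
    }
  }
}

// poll until the JobTracker no longer tracks the job, i.e. it has retired
private void waitTillRetire(JobID id, JobTracker jobtracker) {
  for (int i = 0; i < 1000 && jobtracker.getJob(id) != null; i++) {
    UtilsForTests.waitFor(100);
  }
  assertNull("Job did not retire", jobtracker.getJob(id));
}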