/** * Verify if the NodeManager could identify disk failures. * * @param localORLogDirs <em>true</em> represent nm-local-dirs and <em>false </em> means * nm-log-dirs * @param expectedDirs expected nm-local-dirs/nm-log-dirs as a string * @param isHealthy <em>true</em> if the overall node should be healthy */ private void verifyDisksHealth(boolean localORLogDirs, String expectedDirs, boolean isHealthy) { // Wait for the NodeManager to identify disk failures. waitForDiskHealthCheck(); List<String> list = localORLogDirs ? dirsHandler.getLocalDirs() : dirsHandler.getLogDirs(); String seenDirs = StringUtils.join(",", list); LOG.info("ExpectedDirs=" + expectedDirs); LOG.info("SeenDirs=" + seenDirs); Assert.assertTrue( "NodeManager could not identify disk failure.", expectedDirs.equals(seenDirs)); Assert.assertEquals( "Node's health in terms of disks is wrong", isHealthy, dirsHandler.areDisksHealthy()); for (int i = 0; i < 10; i++) { Iterator<RMNode> iter = yarnCluster.getResourceManager().getRMContext().getRMNodes().values().iterator(); if (iter.next().getNodeHealthStatus().getIsNodeHealthy() == isHealthy) { break; } // wait for the node health info to go to RM try { Thread.sleep(1000); } catch (InterruptedException e) { LOG.error("Interrupted while waiting for NM->RM heartbeat."); } } Iterator<RMNode> iter = yarnCluster.getResourceManager().getRMContext().getRMNodes().values().iterator(); Assert.assertEquals( "RM is not updated with the health status of a node", isHealthy, iter.next().getNodeHealthStatus().getIsNodeHealthy()); }
private void testDirsFailures(boolean localORLogDirs) throws IOException { String dirType = localORLogDirs ? "local" : "log"; String dirsProperty = localORLogDirs ? YarnConfiguration.NM_LOCAL_DIRS : YarnConfiguration.NM_LOG_DIRS; Configuration conf = new Configuration(); // set disk health check interval to a small value (say 1 sec). conf.setLong(YarnConfiguration.NM_DISK_HEALTH_CHECK_INTERVAL_MS, DISK_HEALTH_CHECK_INTERVAL); // If 2 out of the total 4 local-dirs fail OR if 2 Out of the total 4 // log-dirs fail, then the node's health status should become unhealthy. conf.setFloat(YarnConfiguration.NM_MIN_HEALTHY_DISKS_FRACTION, 0.60F); if (yarnCluster != null) { yarnCluster.stop(); FileUtil.fullyDelete(localFSDirBase); localFSDirBase.mkdirs(); } LOG.info("Starting up YARN cluster"); yarnCluster = new MiniYARNCluster(TestDiskFailures.class.getName(), 1, numLocalDirs, numLogDirs); yarnCluster.init(conf); yarnCluster.start(); NodeManager nm = yarnCluster.getNodeManager(0); LOG.info("Configured nm-" + dirType + "-dirs=" + nm.getConfig().get(dirsProperty)); dirsHandler = nm.getNodeHealthChecker().getDiskHandler(); List<String> list = localORLogDirs ? dirsHandler.getLocalDirs() : dirsHandler.getLogDirs(); String[] dirs = list.toArray(new String[list.size()]); Assert.assertEquals("Number of nm-" + dirType + "-dirs is wrong.", numLocalDirs, dirs.length); String expectedDirs = StringUtils.join(",", list); // validate the health of disks initially verifyDisksHealth(localORLogDirs, expectedDirs, true); // Make 1 nm-local-dir fail and verify if "the nodemanager can identify // the disk failure(s) and can update the list of good nm-local-dirs. 
prepareDirToFail(dirs[2]); expectedDirs = dirs[0] + "," + dirs[1] + "," + dirs[3]; verifyDisksHealth(localORLogDirs, expectedDirs, true); // Now, make 1 more nm-local-dir/nm-log-dir fail and verify if "the // nodemanager can identify the disk failures and can update the list of // good nm-local-dirs/nm-log-dirs and can update the overall health status // of the node to unhealthy". prepareDirToFail(dirs[0]); expectedDirs = dirs[1] + "," + dirs[3]; verifyDisksHealth(localORLogDirs, expectedDirs, false); // Fail the remaining 2 local-dirs/log-dirs and verify if NM remains with // empty list of local-dirs/log-dirs and the overall health status is // unhealthy. prepareDirToFail(dirs[1]); prepareDirToFail(dirs[3]); expectedDirs = ""; verifyDisksHealth(localORLogDirs, expectedDirs, false); }
/**
 * Configures two local and two log directories, prepares one of each to
 * fail before the dirs handler service is initialized, and checks that
 * only the remaining directory of each kind is reported afterwards.
 *
 * @throws IOException declared by the signature; presumably raised while
 *           preparing a directory to fail (confirm against
 *           {@code prepareDirToFail})
 */
@Test
public void testDirFailuresOnStartup() throws IOException {
  // Candidate directories under the shared test directory.
  String badLocalDir = new File(testDir, "localDir1").getPath();
  String goodLocalDir = new File(testDir, "localDir2").getPath();
  String goodLogDir = new File(testDir, "logDir1").getPath();
  String badLogDir = new File(testDir, "logDir2").getPath();

  String configuredLocalDirs = badLocalDir + "," + goodLocalDir;
  String configuredLogDirs = goodLogDir + "," + badLogDir;

  Configuration conf = new YarnConfiguration();
  conf.set(YarnConfiguration.NM_LOCAL_DIRS, configuredLocalDirs);
  conf.set(YarnConfiguration.NM_LOG_DIRS, configuredLogDirs);

  // The failures must be in place before the service is initialized.
  prepareDirToFail(badLocalDir);
  prepareDirToFail(badLogDir);

  LocalDirsHandlerService handler = new LocalDirsHandlerService();
  handler.init(conf);

  // Only the untouched local dir should be left.
  List<String> remainingLocalDirs = handler.getLocalDirs();
  Assert.assertEquals(1, remainingLocalDirs.size());
  Assert.assertEquals(goodLocalDir, remainingLocalDirs.get(0));

  // Only the untouched log dir should be left.
  List<String> remainingLogDirs = handler.getLogDirs();
  Assert.assertEquals(1, remainingLogDirs.size());
  Assert.assertEquals(goodLogDir, remainingLogDirs.get(0));
}