/**
 * Verify that the NodeManager can identify disk failures.
 *
 * @param localORLogDirs <em>true</em> means nm-local-dirs and <em>false</em> means
 *          nm-log-dirs
 * @param expectedDirs expected nm-local-dirs/nm-log-dirs as a string
 * @param isHealthy <em>true</em> if the overall node should be healthy
 */
private void verifyDisksHealth(boolean localORLogDirs, String expectedDirs,
    boolean isHealthy) {
  // Wait for the NodeManager to identify disk failures.
  waitForDiskHealthCheck();

  List<String> list = localORLogDirs ? dirsHandler.getLocalDirs()
      : dirsHandler.getLogDirs();
  String seenDirs = StringUtils.join(",", list);
  LOG.info("ExpectedDirs=" + expectedDirs);
  LOG.info("SeenDirs=" + seenDirs);
  Assert.assertTrue("NodeManager could not identify disk failure.",
      expectedDirs.equals(seenDirs));

  Assert.assertEquals("Node's health in terms of disks is wrong",
      isHealthy, dirsHandler.areDisksHealthy());
  for (int i = 0; i < 10; i++) {
    Iterator<RMNode> iter = yarnCluster.getResourceManager().getRMContext()
        .getRMNodes().values().iterator();
    if (iter.next().getNodeHealthStatus().getIsNodeHealthy() == isHealthy) {
      break;
    }
    // Wait for the node health info to reach the RM.
    try {
      Thread.sleep(1000);
    } catch (InterruptedException e) {
      LOG.error("Interrupted while waiting for NM->RM heartbeat.");
    }
  }
  Iterator<RMNode> iter = yarnCluster.getResourceManager().getRMContext()
      .getRMNodes().values().iterator();
  Assert.assertEquals("RM is not updated with the health status of a node",
      isHealthy, iter.next().getNodeHealthStatus().getIsNodeHealthy());
}
private void testDirsFailures(boolean localORLogDirs) throws IOException {
  String dirType = localORLogDirs ? "local" : "log";
  String dirsProperty = localORLogDirs ? YarnConfiguration.NM_LOCAL_DIRS
      : YarnConfiguration.NM_LOG_DIRS;

  Configuration conf = new Configuration();
  // Set the disk health check interval to a small value (say 1 sec).
  conf.setLong(YarnConfiguration.NM_DISK_HEALTH_CHECK_INTERVAL_MS,
      DISK_HEALTH_CHECK_INTERVAL);
  // If 2 out of the total 4 nm-local-dirs fail OR 2 out of the total 4
  // nm-log-dirs fail, then the node's health status should become unhealthy.
  conf.setFloat(YarnConfiguration.NM_MIN_HEALTHY_DISKS_FRACTION, 0.60F);

  if (yarnCluster != null) {
    yarnCluster.stop();
    FileUtil.fullyDelete(localFSDirBase);
    localFSDirBase.mkdirs();
  }
  LOG.info("Starting up YARN cluster");
  yarnCluster = new MiniYARNCluster(TestDiskFailures.class.getName(), 1,
      numLocalDirs, numLogDirs);
  yarnCluster.init(conf);
  yarnCluster.start();

  NodeManager nm = yarnCluster.getNodeManager(0);
  LOG.info("Configured nm-" + dirType + "-dirs="
      + nm.getConfig().get(dirsProperty));
  dirsHandler = nm.getNodeHealthChecker().getDiskHandler();
  List<String> list = localORLogDirs ? dirsHandler.getLocalDirs()
      : dirsHandler.getLogDirs();
  String[] dirs = list.toArray(new String[list.size()]);
  Assert.assertEquals("Number of nm-" + dirType + "-dirs is wrong.",
      numLocalDirs, dirs.length);
  String expectedDirs = StringUtils.join(",", list);
  // Validate the health of the disks initially.
  verifyDisksHealth(localORLogDirs, expectedDirs, true);

  // Make 1 nm-local-dir/nm-log-dir fail and verify that the NodeManager can
  // identify the disk failure and update the list of good
  // nm-local-dirs/nm-log-dirs.
  prepareDirToFail(dirs[2]);
  expectedDirs = dirs[0] + "," + dirs[1] + "," + dirs[3];
  verifyDisksHealth(localORLogDirs, expectedDirs, true);

  // Now make 1 more nm-local-dir/nm-log-dir fail and verify that the
  // NodeManager can identify the disk failures, update the list of good
  // nm-local-dirs/nm-log-dirs, and mark the overall health status of the
  // node as unhealthy.
  prepareDirToFail(dirs[0]);
  expectedDirs = dirs[1] + "," + dirs[3];
  verifyDisksHealth(localORLogDirs, expectedDirs, false);

  // Fail the remaining 2 local-dirs/log-dirs and verify that the NodeManager
  // is left with an empty list of local-dirs/log-dirs and the overall health
  // status stays unhealthy.
  prepareDirToFail(dirs[1]);
  prepareDirToFail(dirs[3]);
  expectedDirs = "";
  verifyDisksHealth(localORLogDirs, expectedDirs, false);
}
/** Wait for the NodeManager to run its disk-health-check at least once. */
private void waitForDiskHealthCheck() {
  long lastDisksCheckTime = dirsHandler.getLastDisksCheckTime();
  long time = lastDisksCheckTime;
  for (int i = 0; i < 10 && (time <= lastDisksCheckTime); i++) {
    try {
      Thread.sleep(1000);
    } catch (InterruptedException e) {
      LOG.error(
          "Interrupted while waiting for NodeManager's disk health check.");
    }
    time = dirsHandler.getLastDisksCheckTime();
  }
}
private void uploadLogsForContainer(ContainerId containerId) {
  if (this.logAggregationDisabled) {
    return;
  }

  // Lazy creation of the writer.
  if (this.writer == null) {
    LOG.info("Starting aggregate log-file for app " + this.applicationId
        + " at " + this.remoteNodeTmpLogFileForApp);
    try {
      this.writer = new LogWriter(this.conf, this.remoteNodeTmpLogFileForApp,
          this.userUgi);
      // Write ACLs once when and if the writer is created.
      this.writer.writeApplicationACLs(appAcls);
      this.writer.writeApplicationOwner(this.userUgi.getShortUserName());
    } catch (IOException e) {
      LOG.error("Cannot create writer for app " + this.applicationId
          + ". Disabling log-aggregation for this app.", e);
      this.logAggregationDisabled = true;
      return;
    }
  }

  LOG.info("Uploading logs for container " + containerId
      + ". Current good log dirs are "
      + StringUtils.join(",", dirsHandler.getLogDirs()));
  LogKey logKey = new LogKey(containerId);
  LogValue logValue = new LogValue(dirsHandler.getLogDirs(), containerId,
      userUgi.getShortUserName());
  try {
    this.writer.append(logKey, logValue);
  } catch (IOException e) {
    LOG.error("Couldn't upload logs for " + containerId
        + ". Skipping this container.");
  }
}
/**
 * Make a local and a log directory inaccessible during initialization and verify that
 * the bad directories are recognized and removed from the lists of available local and
 * log directories.
 *
 * @throws IOException
 */
@Test
public void testDirFailuresOnStartup() throws IOException {
  Configuration conf = new YarnConfiguration();
  String localDir1 = new File(testDir, "localDir1").getPath();
  String localDir2 = new File(testDir, "localDir2").getPath();
  String logDir1 = new File(testDir, "logDir1").getPath();
  String logDir2 = new File(testDir, "logDir2").getPath();
  conf.set(YarnConfiguration.NM_LOCAL_DIRS, localDir1 + "," + localDir2);
  conf.set(YarnConfiguration.NM_LOG_DIRS, logDir1 + "," + logDir2);

  prepareDirToFail(localDir1);
  prepareDirToFail(logDir2);

  LocalDirsHandlerService dirSvc = new LocalDirsHandlerService();
  dirSvc.init(conf);
  List<String> localDirs = dirSvc.getLocalDirs();
  Assert.assertEquals(1, localDirs.size());
  Assert.assertEquals(localDir2, localDirs.get(0));
  List<String> logDirs = dirSvc.getLogDirs();
  Assert.assertEquals(1, logDirs.size());
  Assert.assertEquals(logDir1, logDirs.get(0));
}
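The tests above call a prepareDirToFail helper that is not shown in this section. Below is a minimal sketch of such a helper, under the assumption that replacing the directory path with a regular file is enough to make the directory checks fail; the actual helper in TestDiskFailures may differ.

/**
 * Sketch (assumption): make the given path unusable as a directory by
 * deleting whatever is there and creating a regular file in its place,
 * so subsequent directory checks on this path fail.
 */
private void prepareDirToFail(String dir) throws IOException {
  File file = new File(dir);
  FileUtil.fullyDelete(file);   // remove the directory tree if it exists
  file.createNewFile();         // a plain file now occupies the dir path
  LOG.info("Prepared " + dir + " to fail.");
}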
@SuppressWarnings("unchecked") private void doAppLogAggregation() { ContainerId containerId; while (!this.appFinishing.get()) { synchronized (this) { try { wait(THREAD_SLEEP_TIME); } catch (InterruptedException e) { LOG.warn("PendingContainers queue is interrupted"); this.appFinishing.set(true); } } } // Application is finished. Finish pending-containers while ((containerId = this.pendingContainers.poll()) != null) { uploadLogsForContainer(containerId); } // Remove the local app-log-dirs List<String> rootLogDirs = dirsHandler.getLogDirs(); Path[] localAppLogDirs = new Path[rootLogDirs.size()]; int index = 0; for (String rootLogDir : rootLogDirs) { localAppLogDirs[index] = new Path(rootLogDir, this.applicationId); index++; } this.delService.delete(this.userUgi.getShortUserName(), null, localAppLogDirs); if (this.writer != null) { this.writer.closeWriter(); LOG.info("Finished aggregate log-file for app " + this.applicationId); } try { userUgi.doAs( new PrivilegedExceptionAction<Object>() { @Override public Object run() throws Exception { FileSystem remoteFS = FileSystem.get(conf); remoteFS.rename(remoteNodeTmpLogFileForApp, remoteNodeLogFileForApp); return null; } }); } catch (Exception e) { LOG.error( "Failed to move temporary log file to final location: [" + remoteNodeTmpLogFileForApp + "] to [" + remoteNodeLogFileForApp + "]", e); } this.dispatcher .getEventHandler() .handle( new ApplicationEvent( this.appId, ApplicationEventType.APPLICATION_LOG_HANDLING_FINISHED)); this.appAggregationFinished.set(true); }