Example #1
  /**
   * Verify that the NodeManager can identify disk failures.
   *
   * @param localORLogDirs <em>true</em> means nm-local-dirs and <em>false</em> means
   *     nm-log-dirs
   * @param expectedDirs expected nm-local-dirs/nm-log-dirs as a string
   * @param isHealthy <em>true</em> if the overall node should be healthy
   */
  private void verifyDisksHealth(boolean localORLogDirs, String expectedDirs, boolean isHealthy) {
    // Wait for the NodeManager to identify disk failures.
    waitForDiskHealthCheck();

    List<String> list = localORLogDirs ? dirsHandler.getLocalDirs() : dirsHandler.getLogDirs();
    String seenDirs = StringUtils.join(",", list);
    LOG.info("ExpectedDirs=" + expectedDirs);
    LOG.info("SeenDirs=" + seenDirs);
    Assert.assertEquals(
        "NodeManager could not identify disk failure.", expectedDirs, seenDirs);

    Assert.assertEquals(
        "Node's health in terms of disks is wrong", isHealthy, dirsHandler.areDisksHealthy());
    for (int i = 0; i < 10; i++) {
      Iterator<RMNode> iter =
          yarnCluster.getResourceManager().getRMContext().getRMNodes().values().iterator();
      if (iter.next().getNodeHealthStatus().getIsNodeHealthy() == isHealthy) {
        break;
      }
      // Wait for the node health info to reach the RM via the next heartbeat.
      try {
        Thread.sleep(1000);
      } catch (InterruptedException e) {
        LOG.error("Interrupted while waiting for NM->RM heartbeat.");
      }
    }
    Iterator<RMNode> iter =
        yarnCluster.getResourceManager().getRMContext().getRMNodes().values().iterator();
    Assert.assertEquals(
        "RM is not updated with the health status of a node",
        isHealthy,
        iter.next().getNodeHealthStatus().getIsNodeHealthy());
  }
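The last part of verifyDisksHealth polls the RM up to ten times for the node's health status. As a sketch only, the same wait could be expressed with Hadoop's GenericTestUtils.waitFor test utility from the hadoop-common test jar; the helper below and its name are assumptions, not code from the original test.

  // Sketch of an alternative wait, assuming GenericTestUtils.waitFor is
  // available; the helper name waitForNodeHealthReportedToRM is hypothetical.
  private void waitForNodeHealthReportedToRM(final boolean isHealthy) throws Exception {
    GenericTestUtils.waitFor(
        () -> {
          RMNode rmNode =
              yarnCluster
                  .getResourceManager()
                  .getRMContext()
                  .getRMNodes()
                  .values()
                  .iterator()
                  .next();
          return rmNode.getNodeHealthStatus().getIsNodeHealthy() == isHealthy;
        },
        1000 /* re-check every second */,
        10000 /* give up after 10 seconds */);
  }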
Example #2
  private void testDirsFailures(boolean localORLogDirs) throws IOException {
    String dirType = localORLogDirs ? "local" : "log";
    String dirsProperty =
        localORLogDirs ? YarnConfiguration.NM_LOCAL_DIRS : YarnConfiguration.NM_LOG_DIRS;

    Configuration conf = new Configuration();
    // set disk health check interval to a small value (say 1 sec).
    conf.setLong(YarnConfiguration.NM_DISK_HEALTH_CHECK_INTERVAL_MS, DISK_HEALTH_CHECK_INTERVAL);

    // With a minimum healthy-disks fraction of 0.60, the node becomes unhealthy
    // once 2 out of the 4 local-dirs (or 2 out of the 4 log-dirs) fail, because
    // only 50% of the disks are then healthy.
    conf.setFloat(YarnConfiguration.NM_MIN_HEALTHY_DISKS_FRACTION, 0.60F);

    if (yarnCluster != null) {
      yarnCluster.stop();
      FileUtil.fullyDelete(localFSDirBase);
      localFSDirBase.mkdirs();
    }
    LOG.info("Starting up YARN cluster");
    yarnCluster =
        new MiniYARNCluster(TestDiskFailures.class.getName(), 1, numLocalDirs, numLogDirs);
    yarnCluster.init(conf);
    yarnCluster.start();

    NodeManager nm = yarnCluster.getNodeManager(0);
    LOG.info("Configured nm-" + dirType + "-dirs=" + nm.getConfig().get(dirsProperty));
    dirsHandler = nm.getNodeHealthChecker().getDiskHandler();
    List<String> list = localORLogDirs ? dirsHandler.getLocalDirs() : dirsHandler.getLogDirs();
    String[] dirs = list.toArray(new String[list.size()]);
    Assert.assertEquals("Number of nm-" + dirType + "-dirs is wrong.", numLocalDirs, dirs.length);
    String expectedDirs = StringUtils.join(",", list);
    // validate the health of disks initially
    verifyDisksHealth(localORLogDirs, expectedDirs, true);

    // Make one nm-local-dir/nm-log-dir fail and verify that the NodeManager
    // identifies the disk failure and updates the list of good dirs.
    prepareDirToFail(dirs[2]);
    expectedDirs = dirs[0] + "," + dirs[1] + "," + dirs[3];
    verifyDisksHealth(localORLogDirs, expectedDirs, true);

    // Now make one more nm-local-dir/nm-log-dir fail and verify that the
    // NodeManager identifies the disk failures, updates the list of good
    // nm-local-dirs/nm-log-dirs, and marks the overall health status of the
    // node as unhealthy.
    prepareDirToFail(dirs[0]);
    expectedDirs = dirs[1] + "," + dirs[3];
    verifyDisksHealth(localORLogDirs, expectedDirs, false);

    // Fail the remaining two local-dirs/log-dirs and verify that the NM is left
    // with an empty list of local-dirs/log-dirs and the overall health status
    // stays unhealthy.
    prepareDirToFail(dirs[1]);
    prepareDirToFail(dirs[3]);
    expectedDirs = "";
    verifyDisksHealth(localORLogDirs, expectedDirs, false);
  }
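testDirsFailures is parameterized by localORLogDirs, so the surrounding test class presumably drives it once per directory type. The two JUnit entry points below are a sketch under that assumption; the test method names are not taken from the excerpts.

  // Hypothetical driver tests: exercise the same scenario for nm-local-dirs
  // and for nm-log-dirs. Names and structure are assumptions.
  @Test
  public void testLocalDirsFailures() throws IOException {
    testDirsFailures(true);
  }

  @Test
  public void testLogDirsFailures() throws IOException {
    testDirsFailures(false);
  }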
Example #3
  /** Wait for the NodeManager to run its disk health check at least once. */
  private void waitForDiskHealthCheck() {
    long lastDisksCheckTime = dirsHandler.getLastDisksCheckTime();
    long time = lastDisksCheckTime;
    for (int i = 0; i < 10 && (time <= lastDisksCheckTime); i++) {
      try {
        Thread.sleep(1000);
      } catch (InterruptedException e) {
        LOG.error("Interrupted while waiting for NodeManager's disk health check.");
      }
      time = dirsHandler.getLastDisksCheckTime();
    }
  }
  private void uploadLogsForContainer(ContainerId containerId) {

    if (this.logAggregationDisabled) {
      return;
    }

    // Lazy creation of the writer
    if (this.writer == null) {
      LOG.info(
          "Starting aggregate log-file for app "
              + this.applicationId
              + " at "
              + this.remoteNodeTmpLogFileForApp);
      try {
        this.writer = new LogWriter(this.conf, this.remoteNodeTmpLogFileForApp, this.userUgi);
        // Write the ACLs and the application owner once, when the writer is first created.
        this.writer.writeApplicationACLs(appAcls);
        this.writer.writeApplicationOwner(this.userUgi.getShortUserName());
      } catch (IOException e) {
        LOG.error(
            "Cannot create writer for app "
                + this.applicationId
                + ". Disabling log-aggregation for this app.",
            e);
        this.logAggregationDisabled = true;
        return;
      }
    }

    LOG.info(
        "Uploading logs for container "
            + containerId
            + ". Current good log dirs are "
            + StringUtils.join(",", dirsHandler.getLogDirs()));
    LogKey logKey = new LogKey(containerId);
    LogValue logValue =
        new LogValue(dirsHandler.getLogDirs(), containerId, userUgi.getShortUserName());
    try {
      this.writer.append(logKey, logValue);
    } catch (IOException e) {
      LOG.error("Couldn't upload logs for " + containerId + ". Skipping this container.");
    }
  }
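uploadLogsForContainer consumes ContainerIds from the pendingContainers queue that doAppLogAggregation drains below. The sketch that follows shows a plausible producer side, assuming pendingContainers is a BlockingQueue<ContainerId>; the callback name containerFinished is hypothetical and not taken from the original class.

  // Hypothetical producer for the pendingContainers queue, assuming a field such as:
  //   private final BlockingQueue<ContainerId> pendingContainers =
  //       new LinkedBlockingQueue<ContainerId>();
  public void containerFinished(ContainerId containerId) {
    this.pendingContainers.add(containerId);
    synchronized (this) {
      notifyAll(); // wake doAppLogAggregation() so it can re-check appFinishing
    }
  }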
Example #5
  /**
   * Make one local directory and one log directory inaccessible before service initialization and
   * verify that the bad directories are recognized and removed from the lists of available local
   * and log directories.
   *
   * @throws IOException if a directory cannot be prepared to fail
   */
  @Test
  public void testDirFailuresOnStartup() throws IOException {
    Configuration conf = new YarnConfiguration();
    String localDir1 = new File(testDir, "localDir1").getPath();
    String localDir2 = new File(testDir, "localDir2").getPath();
    String logDir1 = new File(testDir, "logDir1").getPath();
    String logDir2 = new File(testDir, "logDir2").getPath();
    conf.set(YarnConfiguration.NM_LOCAL_DIRS, localDir1 + "," + localDir2);
    conf.set(YarnConfiguration.NM_LOG_DIRS, logDir1 + "," + logDir2);

    prepareDirToFail(localDir1);
    prepareDirToFail(logDir2);

    LocalDirsHandlerService dirSvc = new LocalDirsHandlerService();
    dirSvc.init(conf);
    List<String> localDirs = dirSvc.getLocalDirs();
    Assert.assertEquals(1, localDirs.size());
    Assert.assertEquals(localDir2, localDirs.get(0));
    List<String> logDirs = dirSvc.getLogDirs();
    Assert.assertEquals(1, logDirs.size());
    Assert.assertEquals(logDir1, logDirs.get(0));
  }
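Both testDirsFailures and testDirFailuresOnStartup call a prepareDirToFail helper that is not included in these excerpts. The sketch below shows one way such a helper could make a path unusable, by replacing the directory with a plain file; this is an assumption about its behavior, not the original implementation.

  // Assumed helper (not shown above): make a directory path unusable so the
  // disk checker flags it as a bad dir.
  private void prepareDirToFail(String dir) throws IOException {
    File file = new File(dir);
    FileUtil.fullyDelete(file); // remove the directory if it exists
    file.createNewFile();       // a regular file now occupies the path
    LOG.info("Prepared " + dir + " to fail.");
  }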
  @SuppressWarnings("unchecked")
  private void doAppLogAggregation() {
    ContainerId containerId;

    while (!this.appFinishing.get()) {
      synchronized (this) {
        try {
          wait(THREAD_SLEEP_TIME);
        } catch (InterruptedException e) {
          LOG.warn("PendingContainers queue is interrupted");
          this.appFinishing.set(true);
        }
      }
    }

    // Application is finished. Finish pending-containers
    while ((containerId = this.pendingContainers.poll()) != null) {
      uploadLogsForContainer(containerId);
    }

    // Remove the local app-log-dirs
    List<String> rootLogDirs = dirsHandler.getLogDirs();
    Path[] localAppLogDirs = new Path[rootLogDirs.size()];
    int index = 0;
    for (String rootLogDir : rootLogDirs) {
      localAppLogDirs[index] = new Path(rootLogDir, this.applicationId);
      index++;
    }
    this.delService.delete(this.userUgi.getShortUserName(), null, localAppLogDirs);

    if (this.writer != null) {
      this.writer.closeWriter();
      LOG.info("Finished aggregate log-file for app " + this.applicationId);
    }

    try {
      userUgi.doAs(
          new PrivilegedExceptionAction<Object>() {
            @Override
            public Object run() throws Exception {
              FileSystem remoteFS = FileSystem.get(conf);
              remoteFS.rename(remoteNodeTmpLogFileForApp, remoteNodeLogFileForApp);
              return null;
            }
          });
    } catch (Exception e) {
      LOG.error(
          "Failed to move temporary log file to final location: ["
              + remoteNodeTmpLogFileForApp
              + "] to ["
              + remoteNodeLogFileForApp
              + "]",
          e);
    }

    this.dispatcher
        .getEventHandler()
        .handle(
            new ApplicationEvent(
                this.appId, ApplicationEventType.APPLICATION_LOG_HANDLING_FINISHED));
    this.appAggregationFinished.set(true);
  }
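doAppLogAggregation blocks until appFinishing is set, so another thread has to signal application completion. The sketch below shows a plausible thread entry point and shutdown trigger, assuming the class implements Runnable and appFinishing is an AtomicBoolean; the method name finishLogAggregation is an assumption unless it matches the real class.

  // Sketch of how the aggregation loop could be driven; structure is assumed.
  @Override
  public void run() {
    doAppLogAggregation();
  }

  // Called when the application completes; wakes the wait() loop above.
  public synchronized void finishLogAggregation() {
    LOG.info("Application finished, finishing log aggregation for " + this.applicationId);
    this.appFinishing.set(true);
    this.notifyAll();
  }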