/**
 * Lists the children of the servers-running znode (registering a {@code RunningWatcher} on the
 * listing), puts a watch on every child that is not yet in {@code runningServers} and whose
 * start timestamp is not older than {@code startupTimestamp}, and publishes the resulting count
 * through {@code metrics}.
 *
 * <p>Each child name is read as four colon-separated fields: host name, instance, info port and
 * server start timestamp.
 *
 * @throws Exception propagated from the ZooKeeper calls, or raised while parsing a child name
 *     that does not have the expected four fields
 */
private synchronized void getZkRunning() throws Exception {
    final String runningPath = parentZnode + Constants.DEFAULT_ZOOKEEPER_ZNODE_SERVERS_RUNNING;
    LOG.debug("Reading " + runningPath);
    List<String> znodes = getChildren(runningPath, new RunningWatcher());

    if (znodes.isEmpty()) {
      metrics.setTotalRunning(0);
      return;
    }

    for (String znode : znodes) {
      // Stale znodes can outlive their server: ZooKeeper may have been stopped abruptly
      // (stop-wms.sh with WMS_MANAGES_ZK) or may have failed on its own. Those znodes are
      // still listed at the next startup and only expire later, producing delete events
      // for servers that are already gone. Hence only znodes stamped at or after this
      // WmsMaster's startup are tracked.
      Scanner fields = new Scanner(znode);
      fields.useDelimiter(":");
      fields.next(); // host name (read only to advance to the next field)
      fields.next(); // instance
      Integer.parseInt(fields.next()); // info port; parsed so a malformed name still fails
      long startedAt = Long.parseLong(fields.next());
      fields.close();

      boolean startedSinceMaster = startedAt >= startupTimestamp;
      if (startedSinceMaster && !runningServers.contains(znode)) {
        LOG.debug("Watching running [" + znode + "]");
        zkc.exists(runningPath + "/" + znode, new RunningWatcher());
        runningServers.add(znode);
      }
    }
    metrics.setTotalRunning(runningServers.size());
  }
  /**
   * Processes the running-znode path of a WmsServer that is reported as failed: logs the failure,
   * drops the server from {@code runningServers} (refreshing the running metric) if it was
   * tracked, and queues a {@code RestartHandler} for it.
   *
   * @param znodePath full path of the server's running znode; the servers-running prefix is
   *     stripped to obtain the colon-separated server key
   * @throws Exception if the server key does not contain the four expected fields (host name,
   *     instance, info port, start timestamp)
   */
  private synchronized void restartServer(String znodePath) throws Exception {
    String runningPrefix = parentZnode + Constants.DEFAULT_ZOOKEEPER_ZNODE_SERVERS_RUNNING + "/";
    String serverKey = znodePath.replace(runningPrefix, "");

    Scanner fields = new Scanner(serverKey);
    fields.useDelimiter(":");
    String host = fields.next();
    String instanceId = fields.next();
    Integer.parseInt(fields.next()); // info port; parsed only to validate the key
    Long.parseLong(fields.next()); // start timestamp; parsed only to validate the key
    fields.close();

    LOG.error("WmsServer [" + host + ":" + instanceId + "] failed.");

    if (runningServers.contains(serverKey)) {
      LOG.debug("Found [" + serverKey + "], deleting from running servers list");
      runningServers.remove(serverKey);
      metrics.setTotalRunning(runningServers.size());
    }

    // The handler receives the bare server key, not the full znode path.
    restartQueue.add(new RestartHandler(serverKey));
  }
    /**
     * Tries to restart the WmsServer described by {@code znodePath}, a colon-separated key of
     * host name, instance, info port and server start timestamp.
     *
     * <p>A restart is attempted only when the server's start timestamp is later than
     * {@code startupTimestamp}. The start command is run through {@code ScriptManager}, locally
     * when the host matches {@code ia.getCanonicalHostName()} and via {@code pdsh} otherwise, and
     * is repeated until it exits with code 0 or the retry counter is exhausted.
     *
     * <p>Any exception raised along the way is logged (with its stack trace) and not rethrown; if
     * it was an {@link InterruptedException} the thread's interrupt status is restored.
     *
     * @return this handler's {@code scriptContext}, holding the outcome of the last script run if
     *     one was made
     * @throws Exception declared by the signature; the body itself catches every exception
     */
    @Override
    public ScriptContext call() throws Exception {
      try {
        Scanner scn = new Scanner(znodePath);
        scn.useDelimiter(":");
        String hostName = scn.next(); // host name
        String instance = scn.next(); // instance
        Integer.parseInt(scn.next()); // UI info port; unused here, parsed to validate the key
        long serverStartTimestamp = Long.parseLong(scn.next());
        scn.close();

        // The --config directory is taken from the first classpath entry; this relies on the
        // launcher always putting it first.
        String cp = System.getProperty("java.class.path");
        scn = new Scanner(cp);
        scn.useDelimiter(":");
        String confDir = scn.next();
        scn.close();
        LOG.debug("conf dir [" + confDir + "]");

        // Get -Dwms.home.dir
        String wmsHome = System.getProperty("wms.home.dir");

        // Running znodes can survive an abrupt ZooKeeper stop (stop-wms.sh with
        // WMS_MANAGES_ZK) or a ZooKeeper failure, and are then seen again at the next
        // startup. When they finally time out, delete events arrive for servers that no
        // longer exist, so only servers started after this WmsMaster are restarted.
        // NOTE(review): getZkRunning() also accepts a timestamp equal to startupTimestamp,
        // while this check requires strictly greater — confirm which is intended.
        if (serverStartTimestamp > startupTimestamp) {
          scriptContext.setHostName(hostName);
          scriptContext.setScriptName("sys_shell.py");
          // NOTE(review): hostName, instance, confDir and wmsHome are interpolated into a
          // shell command unquoted — confirm znode names are always trusted input.
          if (hostName.equalsIgnoreCase(ia.getCanonicalHostName())) {
            scriptContext.setCommand(
                "bin/wms-daemon.sh --config " + confDir + " start server " + instance);
          } else {
            scriptContext.setCommand(
                "pdsh -w "
                    + hostName
                    + " \"cd "
                    + wmsHome
                    + ";bin/wms-daemon.sh --config "
                    + confDir
                    + " start server "
                    + instance
                    + "\"");
          }

          RetryCounter retryCounter = retryCounterFactory.create();
          while (true) {
            // Clear output captured by a previous attempt so each attempt is reported alone.
            if (scriptContext.getStdOut().length() > 0) {
              scriptContext.getStdOut().delete(0, scriptContext.getStdOut().length());
            }
            if (scriptContext.getStdErr().length() > 0) {
              scriptContext.getStdErr().delete(0, scriptContext.getStdErr().length());
            }
            LOG.info(
                "Restarting WmsServer ["
                    + hostName
                    + ":"
                    + instance
                    + "], script [ "
                    + scriptContext.toString()
                    + " ]");
            ScriptManager.getInstance().runScript(scriptContext);

            if (scriptContext.getExitCode() == 0) {
              LOG.info("WmsServer [" + hostName + ":" + instance + "] restarted");
              break;
            }

            StringBuilder sb = new StringBuilder();
            sb.append("exit code [" + scriptContext.getExitCode() + "]");
            if (!scriptContext.getStdOut().toString().isEmpty()) {
              sb.append(", stdout [" + scriptContext.getStdOut().toString() + "]");
            }
            if (!scriptContext.getStdErr().toString().isEmpty()) {
              sb.append(", stderr [" + scriptContext.getStdErr().toString() + "]");
            }
            LOG.error(sb.toString());

            if (!retryCounter.shouldRetry()) {
              LOG.error(
                  "WmsServer ["
                      + hostName
                      + ":"
                      + instance
                      + "] restart failed after "
                      + retryCounter.getMaxRetries()
                      + " retries");
              break;
            }
            retryCounter.sleepUntilNextRetry();
            retryCounter.useRetry();
          }
        } else {
          LOG.debug(
              "No restart for "
                  + znodePath
                  + "\nbecause WmsServer start time ["
                  + DateFormat.getDateTimeInstance().format(new Date(serverStartTimestamp))
                  + "] was before WmsMaster start time ["
                  + DateFormat.getDateTimeInstance().format(new Date(startupTimestamp))
                  + "]");
        }
      } catch (Exception e) {
        // The blanket catch would otherwise hide an interrupt from whoever runs this task.
        if (e instanceof InterruptedException) {
          Thread.currentThread().interrupt();
        }
        // Log through the logger with the throwable attached so the stack trace lands in the
        // log file instead of on stderr.
        LOG.error("Restart handling for [" + znodePath + "] failed", e);
      }

      return scriptContext;
    }