private synchronized void getZkRunning() throws Exception { LOG.debug("Reading " + parentZnode + Constants.DEFAULT_ZOOKEEPER_ZNODE_SERVERS_RUNNING); List<String> children = getChildren( parentZnode + Constants.DEFAULT_ZOOKEEPER_ZNODE_SERVERS_RUNNING, new RunningWatcher()); if (!children.isEmpty()) { for (String child : children) { // If stop-wms.sh is executed and WMS_MANAGES_ZK then zookeeper // is stopped abruptly. // Second scenario is when ZooKeeper fails for some reason // regardless of whether WMS // manages it. When either happens the WmsServer running znodes // still exist in ZooKeeper // and we see them at next startup. When they eventually timeout // we get node deleted events for a server that no longer // exists. So, only recognize // WmsServer running znodes that have timestamps after last // WmsMaster startup. Scanner scn = new Scanner(child); scn.useDelimiter(":"); String hostName = scn.next(); String instance = scn.next(); int infoPort = Integer.parseInt(scn.next()); long serverStartTimestamp = Long.parseLong(scn.next()); scn.close(); if (serverStartTimestamp < startupTimestamp) continue; if (!runningServers.contains(child)) { LOG.debug("Watching running [" + child + "]"); zkc.exists( parentZnode + Constants.DEFAULT_ZOOKEEPER_ZNODE_SERVERS_RUNNING + "/" + child, new RunningWatcher()); runningServers.add(child); } } metrics.setTotalRunning(runningServers.size()); } else { metrics.setTotalRunning(0); } }
private synchronized void restartServer(String znodePath) throws Exception { String child = znodePath.replace( parentZnode + Constants.DEFAULT_ZOOKEEPER_ZNODE_SERVERS_RUNNING + "/", ""); Scanner scn = new Scanner(child); scn.useDelimiter(":"); String hostName = scn.next(); String instance = scn.next(); int infoPort = Integer.parseInt(scn.next()); long serverStartTimestamp = Long.parseLong(scn.next()); scn.close(); LOG.error("WmsServer [" + hostName + ":" + instance + "] failed."); if (runningServers.contains(child)) { LOG.debug("Found [" + child + "], deleting from running servers list"); runningServers.remove(child); metrics.setTotalRunning(runningServers.size()); } RestartHandler handler = new RestartHandler(child); restartQueue.add(handler); }
@Override public ScriptContext call() throws Exception { try { Scanner scn = new Scanner(znodePath); scn.useDelimiter(":"); String hostName = scn.next(); // host name String instance = scn.next(); // instance int infoPort = Integer.parseInt(scn.next()); // UI info port long serverStartTimestamp = Long.parseLong(scn.next()); scn.close(); // Get the --config property from classpath...it's always first // in the classpath String cp = System.getProperty("java.class.path"); scn = new Scanner(cp); scn.useDelimiter(":"); String confDir = scn.next(); scn.close(); LOG.debug("conf dir [" + confDir + "]"); // Get -Dwms.home.dir String wmsHome = System.getProperty("wms.home.dir"); // If stop-wms.sh is executed and WMS_MANAGES_ZK then zookeeper // is stopped abruptly. // Second scenario is when ZooKeeper fails for some reason // regardless of whether WMS // manages it. When either happens the WmsServer running znodes // still exist in ZooKeeper // and we see them at next startup. When they eventually timeout // we get node deleted events for a server that no longer // exists. So, only recognize // WmsServer running znodes that have timestamps after last // WmsMaster startup. if (serverStartTimestamp > startupTimestamp) { scriptContext.setHostName(hostName); scriptContext.setScriptName("sys_shell.py"); if (hostName.equalsIgnoreCase(ia.getCanonicalHostName())) scriptContext.setCommand( "bin/wms-daemon.sh --config " + confDir + " start server " + instance); else scriptContext.setCommand( "pdsh -w " + hostName + " \"cd " + wmsHome + ";bin/wms-daemon.sh --config " + confDir + " start server " + instance + "\""); RetryCounter retryCounter = retryCounterFactory.create(); while (true) { if (scriptContext.getStdOut().length() > 0) scriptContext.getStdOut().delete(0, scriptContext.getStdOut().length()); if (scriptContext.getStdErr().length() > 0) scriptContext.getStdErr().delete(0, scriptContext.getStdErr().length()); LOG.info( "Restarting WmsServer [" + hostName + ":" + instance + "], script [ " + scriptContext.toString() + " ]"); ScriptManager.getInstance().runScript(scriptContext); if (scriptContext.getExitCode() == 0) { LOG.info("WmsServer [" + hostName + ":" + instance + "] restarted"); break; } else { StringBuilder sb = new StringBuilder(); sb.append("exit code [" + scriptContext.getExitCode() + "]"); if (!scriptContext.getStdOut().toString().isEmpty()) sb.append(", stdout [" + scriptContext.getStdOut().toString() + "]"); if (!scriptContext.getStdErr().toString().isEmpty()) sb.append(", stderr [" + scriptContext.getStdErr().toString() + "]"); LOG.error(sb.toString()); if (!retryCounter.shouldRetry()) { LOG.error( "WmsServer [" + hostName + ":" + instance + "] restart failed after " + retryCounter.getMaxRetries() + " retries"); break; } else { retryCounter.sleepUntilNextRetry(); retryCounter.useRetry(); } } } } else { LOG.debug( "No restart for " + znodePath + "\nbecause WmsServer start time [" + DateFormat.getDateTimeInstance().format(new Date(serverStartTimestamp)) + "] was before WmsMaster start time [" + DateFormat.getDateTimeInstance().format(new Date(startupTimestamp)) + "]"); } } catch (Exception e) { e.printStackTrace(); LOG.error(e); } return scriptContext; }