public void process(WatchedEvent event) { if (event.getType() == Event.EventType.NodeChildrenChanged) { LOG.debug("Running children changed [" + event.getPath() + "]"); try { getZkRunning(); } catch (Exception e) { e.printStackTrace(); LOG.error(e); } } else if (event.getType() == Event.EventType.NodeDeleted) { String znodePath = event.getPath(); LOG.debug("Running znode deleted [" + znodePath + "]"); try { restartServer(znodePath); } catch (Exception e) { e.printStackTrace(); LOG.error(e); } } }
@Override public ScriptContext call() throws Exception { try { Scanner scn = new Scanner(znodePath); scn.useDelimiter(":"); String hostName = scn.next(); // host name String instance = scn.next(); // instance int infoPort = Integer.parseInt(scn.next()); // UI info port long serverStartTimestamp = Long.parseLong(scn.next()); scn.close(); // Get the --config property from classpath...it's always first // in the classpath String cp = System.getProperty("java.class.path"); scn = new Scanner(cp); scn.useDelimiter(":"); String confDir = scn.next(); scn.close(); LOG.debug("conf dir [" + confDir + "]"); // Get -Dwms.home.dir String wmsHome = System.getProperty("wms.home.dir"); // If stop-wms.sh is executed and WMS_MANAGES_ZK then zookeeper // is stopped abruptly. // Second scenario is when ZooKeeper fails for some reason // regardless of whether WMS // manages it. When either happens the WmsServer running znodes // still exist in ZooKeeper // and we see them at next startup. When they eventually timeout // we get node deleted events for a server that no longer // exists. So, only recognize // WmsServer running znodes that have timestamps after last // WmsMaster startup. if (serverStartTimestamp > startupTimestamp) { scriptContext.setHostName(hostName); scriptContext.setScriptName("sys_shell.py"); if (hostName.equalsIgnoreCase(ia.getCanonicalHostName())) scriptContext.setCommand( "bin/wms-daemon.sh --config " + confDir + " start server " + instance); else scriptContext.setCommand( "pdsh -w " + hostName + " \"cd " + wmsHome + ";bin/wms-daemon.sh --config " + confDir + " start server " + instance + "\""); RetryCounter retryCounter = retryCounterFactory.create(); while (true) { if (scriptContext.getStdOut().length() > 0) scriptContext.getStdOut().delete(0, scriptContext.getStdOut().length()); if (scriptContext.getStdErr().length() > 0) scriptContext.getStdErr().delete(0, scriptContext.getStdErr().length()); LOG.info( "Restarting WmsServer [" + hostName + ":" + instance + "], script [ " + scriptContext.toString() + " ]"); ScriptManager.getInstance().runScript(scriptContext); if (scriptContext.getExitCode() == 0) { LOG.info("WmsServer [" + hostName + ":" + instance + "] restarted"); break; } else { StringBuilder sb = new StringBuilder(); sb.append("exit code [" + scriptContext.getExitCode() + "]"); if (!scriptContext.getStdOut().toString().isEmpty()) sb.append(", stdout [" + scriptContext.getStdOut().toString() + "]"); if (!scriptContext.getStdErr().toString().isEmpty()) sb.append(", stderr [" + scriptContext.getStdErr().toString() + "]"); LOG.error(sb.toString()); if (!retryCounter.shouldRetry()) { LOG.error( "WmsServer [" + hostName + ":" + instance + "] restart failed after " + retryCounter.getMaxRetries() + " retries"); break; } else { retryCounter.sleepUntilNextRetry(); retryCounter.useRetry(); } } } } else { LOG.debug( "No restart for " + znodePath + "\nbecause WmsServer start time [" + DateFormat.getDateTimeInstance().format(new Date(serverStartTimestamp)) + "] was before WmsMaster start time [" + DateFormat.getDateTimeInstance().format(new Date(startupTimestamp)) + "]"); } } catch (Exception e) { e.printStackTrace(); LOG.error(e); } return scriptContext; }