public ServerManager(WmsMaster wmsMaster) throws Exception { try { this.wmsMaster = wmsMaster; this.conf = wmsMaster.getConfiguration(); this.zkc = wmsMaster.getZkClient(); this.ia = wmsMaster.getInetAddress(); this.startupTimestamp = wmsMaster.getStartTime(); this.metrics = wmsMaster.getMetrics(); maxRestartAttempts = conf.getInt( Constants.WMS_MASTER_SERVER_RESTART_HANDLER_ATTEMPTS, Constants.DEFAULT_WMS_MASTER_SERVER_RESTART_HANDLER_ATTEMPTS); retryIntervalMillis = conf.getInt( Constants.WMS_MASTER_SERVER_RESTART_HANDLER_RETRY_INTERVAL_MILLIS, Constants.DEFAULT_WMS_MASTER_SERVER_RESTART_HANDLER_RETRY_INTERVAL_MILLIS); retryCounterFactory = new RetryCounterFactory(maxRestartAttempts, retryIntervalMillis); parentZnode = conf.get(Constants.ZOOKEEPER_ZNODE_PARENT, Constants.DEFAULT_ZOOKEEPER_ZNODE_PARENT); pool = Executors.newSingleThreadExecutor(); } catch (Exception e) { e.printStackTrace(); LOG.error(e); throw e; } }
@Override public Boolean call() throws Exception { long timeoutMillis = 5000; try { getServersFile(); getZkRunning(); while (true) { while (!restartQueue.isEmpty()) { LOG.debug("Restart queue size [" + restartQueue.size() + "]"); RestartHandler handler = restartQueue.poll(); Future<ScriptContext> runner = pool.submit(handler); ScriptContext scriptContext = runner.get(); // blocking call if (scriptContext.getExitCode() != 0) restartQueue.add(handler); } try { Thread.sleep(timeoutMillis); } catch (InterruptedException e) { } } } catch (Exception e) { e.printStackTrace(); LOG.error(e); pool.shutdown(); throw e; } }
/**
 * Registers {@code zkWatcher} on the {@code ZK_WORKER} znode's children and on every child
 * znode currently listed under it.
 *
 * <p>Any exception is printed to standard error and otherwise ignored.
 */
public void setWatchWorkers() {
  try {
    // Listing the children with a watcher also registers the watcher on the parent.
    for (String worker : zooKeeper.getChildren(ZK_WORKER, zkWatcher)) {
      String workerPath = ZK_WORKER + "/" + worker;
      zooKeeper.exists(workerPath, zkWatcher);
    }
  } catch (Exception e) {
    e.printStackTrace();
  }
}
/**
 * Handles a ZooKeeper watch event.
 *
 * <p>A children-changed event triggers {@code getZkRunning()}; a node-deleted event passes the
 * deleted znode's path to {@code restartServer(String)}. Every other event type is ignored.
 * Exceptions raised by either call are printed and logged, never propagated.
 *
 * @param event the event delivered by ZooKeeper
 */
public void process(WatchedEvent event) {
  Event.EventType type = event.getType();

  if (type == Event.EventType.NodeChildrenChanged) {
    LOG.debug("Running children changed [" + event.getPath() + "]");
    try {
      getZkRunning();
    } catch (Exception e) {
      e.printStackTrace();
      LOG.error(e);
    }
    return;
  }

  if (type != Event.EventType.NodeDeleted) {
    return;
  }

  String znodePath = event.getPath();
  LOG.debug("Running znode deleted [" + znodePath + "]");
  try {
    restartServer(znodePath);
  } catch (Exception e) {
    e.printStackTrace();
    LOG.error(e);
  }
}
private static String lock(String lock) { String realPath = ""; String parent = "/lock"; String lockName = parent + "/" + lock; logger.debug("Getting lock " + lockName); try { if (zkInstance.exists(parent, false) == null) zkInstance.create(parent, new byte[0], ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.fromFlag(0)); } catch (Exception E) { logger.error("Error creating lock node: " + E.toString()); return null; } List<String> children = new LinkedList<String>(); try { // List <ACL> ACLList = zkInstance.getACL(lockName, zkInstance.exists(lock, false)); realPath = zkInstance.create( lockName, new byte[0], ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL_SEQUENTIAL); // children = zkInstance.getChildren(realPath, false); checkLock: while (true) { children = zkInstance.getChildren(parent, false); for (String curChild : children) { String child = parent + "/" + curChild; // System.out.println(child + " " + realPath + " " + // Integer.toString(child.compareTo(realPath))); if (child.compareTo(realPath) < 0 && child.length() == realPath.length() && curChild.startsWith(lock)) { // System.out.println(child + " cmp to " + realPath); Thread.sleep(300); continue checkLock; } } logger.info("Got lock " + lockName); return realPath; } } catch (Exception E) { logger.error("Exception while trying to get lock " + lockName + " :" + E.toString()); E.printStackTrace(); return null; } }
/*
 * Disabled code, kept for reference:
 *
 * public synchronized ArrayList<Request> getWorkloadsList() {
 *   ArrayList<Request> workloads = new ArrayList<Request>();
 *
 *   LOG.debug("Reading " + parentZnode + Constants.DEFAULT_ZOOKEEPER_ZNODE_WORKLOADS);
 *
 *   try {
 *     List<String> children =
 *         getChildren(parentZnode + Constants.DEFAULT_ZOOKEEPER_ZNODE_WORKLOADS, null);
 *
 *     if (!children.isEmpty()) {
 *       for (String child : children) {
 *         Request request = new Request();
 *         String workloadZnode =
 *             parentZnode + Constants.DEFAULT_ZOOKEEPER_ZNODE_WORKLOADS + "/" + child;
 *         Stat stat = zkc.exists(workloadZnode, false);
 *         if (stat != null) {
 *           byte[] bytes = zkc.getData(workloadZnode, false, stat);
 *           try {
 *             deserializer.deserialize(request, bytes);
 *             workloads.add(request);
 *           } catch (TException e) {
 *             e.printStackTrace();
 *           }
 *         }
 *       }
 *     }
 *   } catch (Exception e) {
 *     e.printStackTrace();
 *   }
 *
 *   return workloads;
 * }
 */

/**
 * Lists the names of the child znodes under the clients znode.
 *
 * <p>If the children cannot be read the exception is printed and an empty list is returned.
 *
 * @return a new list holding the child names; never {@code null}
 */
public synchronized ArrayList<String> getClientsList() {
  final String clientsZnode = parentZnode + Constants.DEFAULT_ZOOKEEPER_ZNODE_CLIENTS;
  LOG.debug("Reading " + clientsZnode);

  ArrayList<String> clients = new ArrayList<String>();
  try {
    clients.addAll(getChildren(clientsZnode, null));
  } catch (Exception e) {
    e.printStackTrace();
  }
  return clients;
}
@Subscribe public void handleJob(JobPacket jobPacket) throws Exception { JobPacket packetToClient = new JobPacket(); if (jobPacket.type == JobPacket.JOB_REQ) { jobQueue.add(jobPacket.hash); packetToClient.type = JobPacket.JOB_ACCEPTED; packetToClient.result = "none"; } if (jobPacket.type == JobPacket.JOB_STATUS) { // check under /result/<hash> try { if ((zooKeeper.exists(Joiner.on("/").join(ZK_JOBS, jobPacket.hash), false) != null) && (zooKeeper.exists(Joiner.on("/").join(ZK_RESULT, jobPacket.hash), false) == null)) { System.out.println("Job in progress, please wait!"); packetToClient.type = JobPacket.JOB_PROGRESS; packetToClient.result = "none"; } if ((zooKeeper.exists(Joiner.on("/").join(ZK_JOBS, jobPacket.hash), false) == null) && (zooKeeper.exists(Joiner.on("/").join(ZK_RESULT, jobPacket.hash), false) == null)) { System.out.println("No such Job, please enter your job again!"); packetToClient.type = JobPacket.JOB_NOTFOUND; packetToClient.result = "none"; } if (zooKeeper.exists(Joiner.on("/").join(ZK_RESULT, jobPacket.hash), false) != null) { byte[] data = zooKeeper.getData(Joiner.on("/").join(ZK_RESULT, jobPacket.hash), false, null); packetToClient.type = JobPacket.JOB_RESULT; if (data == null) { System.out.println("Result not found!"); packetToClient.result = null; } else { String result = new String(data); packetToClient.result = result; System.out.println("Result found!"); } } } catch (Exception e) { e.printStackTrace(); } } socket.send(SerializationUtils.serialize(packetToClient), 0); }
public static void main(String[] args) { String myID = null; if (args.length == 4) { try { zooHost = args[0]; zooPort = Integer.parseInt(args[1]); myPort = Integer.parseInt(args[2]); myID = args[3]; } catch (Exception e) { e.printStackTrace(); } } else { System.err.println("Usage tracker [zooHost] [zooPort] [myPort] [myID]"); System.exit(-1); } // initialize ZMQ context = ZMQ.context(1); socket = context.socket(ZMQ.REP); socket.bind("tcp://*:" + myPort); eventBus = new EventBus("Tracker"); JobTracker t = new JobTracker(myID); eventBus.register(t); System.out.println("Starting thread"); new Thread(t.manageWorker()).start(); while (true) { // wait for client req then respond JobPacket packetFromServer = (JobPacket) SerializationUtils.deserialize(socket.recv(0)); System.out.println("From client" + packetFromServer.type); eventBus.post(packetFromServer); } }
/**
 * Tries to restart the server described by {@code znodePath}.
 *
 * <p>The znode path is split on {@code ':'} into host name, instance, UI info port and server
 * start timestamp. The configuration directory is taken to be the first {@code ':'}-separated
 * entry of {@code java.class.path}, and the installation directory is read from the
 * {@code wms.home.dir} system property.
 *
 * <p>A restart is only attempted when the server's start timestamp is later than
 * {@code startupTimestamp}. In that case a {@code wms-daemon.sh ... start server} command is
 * run through {@code ScriptManager} (directly when the host equals this machine's canonical
 * host name, otherwise via {@code pdsh}) and repeated until it exits with 0 or the retry
 * counter is exhausted.
 *
 * <p>Exceptions are printed and logged, not propagated.
 *
 * @return the handler's {@code scriptContext}, whether or not a restart was attempted
 */
@Override
public ScriptContext call() throws Exception {
  try {
    Scanner znodeFields = new Scanner(znodePath);
    znodeFields.useDelimiter(":");
    final String hostName = znodeFields.next(); // host name
    final String instance = znodeFields.next(); // instance
    Integer.parseInt(znodeFields.next()); // UI info port: not used, but must still be numeric
    final long serverStartTimestamp = Long.parseLong(znodeFields.next());
    znodeFields.close();

    // The --config directory is always the first entry in the classpath.
    Scanner classpathEntries = new Scanner(System.getProperty("java.class.path"));
    classpathEntries.useDelimiter(":");
    final String confDir = classpathEntries.next();
    classpathEntries.close();
    LOG.debug("conf dir [" + confDir + "]");

    // Get -Dwms.home.dir
    final String wmsHome = System.getProperty("wms.home.dir");

    // If stop-wms.sh is executed and WMS_MANAGES_ZK, ZooKeeper is stopped abruptly; ZooKeeper
    // may also fail on its own. In both cases the WmsServer running znodes survive and are
    // seen at the next startup. When they eventually time out, node-deleted events arrive for
    // servers that no longer exist. So only znodes with a timestamp after the last WmsMaster
    // startup are acted upon.
    if (serverStartTimestamp <= startupTimestamp) {
      DateFormat stamp = DateFormat.getDateTimeInstance();
      LOG.debug(
          "No restart for "
              + znodePath
              + "\nbecause WmsServer start time ["
              + stamp.format(new Date(serverStartTimestamp))
              + "] was before WmsMaster start time ["
              + stamp.format(new Date(startupTimestamp))
              + "]");
      return scriptContext;
    }

    scriptContext.setHostName(hostName);
    scriptContext.setScriptName("sys_shell.py");

    boolean onThisHost = hostName.equalsIgnoreCase(ia.getCanonicalHostName());
    String command;
    if (onThisHost) {
      command = "bin/wms-daemon.sh --config " + confDir + " start server " + instance;
    } else {
      command =
          "pdsh -w "
              + hostName
              + " \"cd "
              + wmsHome
              + ";bin/wms-daemon.sh --config "
              + confDir
              + " start server "
              + instance
              + "\"";
    }
    scriptContext.setCommand(command);

    RetryCounter retryCounter = retryCounterFactory.create();
    boolean finished = false;
    while (!finished) {
      // Discard output captured by the previous attempt.
      if (scriptContext.getStdOut().length() > 0) {
        scriptContext.getStdOut().delete(0, scriptContext.getStdOut().length());
      }
      if (scriptContext.getStdErr().length() > 0) {
        scriptContext.getStdErr().delete(0, scriptContext.getStdErr().length());
      }

      LOG.info(
          "Restarting WmsServer ["
              + hostName
              + ":"
              + instance
              + "], script [ "
              + scriptContext.toString()
              + " ]");
      ScriptManager.getInstance().runScript(scriptContext);

      if (scriptContext.getExitCode() == 0) {
        LOG.info("WmsServer [" + hostName + ":" + instance + "] restarted");
        finished = true;
        continue;
      }

      // Failed attempt: report exit code and any captured output.
      StringBuilder failure = new StringBuilder();
      failure.append("exit code [" + scriptContext.getExitCode() + "]");
      String capturedOut = scriptContext.getStdOut().toString();
      if (!capturedOut.isEmpty()) {
        failure.append(", stdout [" + capturedOut + "]");
      }
      String capturedErr = scriptContext.getStdErr().toString();
      if (!capturedErr.isEmpty()) {
        failure.append(", stderr [" + capturedErr + "]");
      }
      LOG.error(failure.toString());

      if (retryCounter.shouldRetry()) {
        retryCounter.sleepUntilNextRetry();
        retryCounter.useRetry();
      } else {
        LOG.error(
            "WmsServer ["
                + hostName
                + ":"
                + instance
                + "] restart failed after "
                + retryCounter.getMaxRetries()
                + " retries");
        finished = true;
      }
    }
  } catch (Exception e) {
    e.printStackTrace();
    LOG.error(e);
  }
  return scriptContext;
}
public JobTracker(String myID) { // connect zooKeeper client zk server try { zkWatcher = new ZkWatcher(); zkConnected = new CountDownLatch(1); zooKeeper = new ZooKeeper( zooHost + ":" + zooPort, ZK_TIMEOUT, new Watcher() { @Override public void process(WatchedEvent event) { /* Release Lock if ZooKeeper is connected */ if (event.getState() == Event.KeeperState.SyncConnected) { zkConnected.countDown(); } else { System.err.println("Could not connect to ZooKeeper!"); System.exit(0); } } }); zkConnected.await(); // Create /jobs if it doesn't exists if (zooKeeper.exists(ZK_JOBS, false) == null) { zooKeeper.create(ZK_JOBS, null, ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT); } // Create /result if it doesn't exists, to stores already processed jobs with results if (zooKeeper.exists(ZK_RESULT, false) == null) { zooKeeper.create(ZK_RESULT, null, ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT); } // if /tracker does not exists, create one if (zooKeeper.exists(ZK_TRACKER, false) == null) { zooKeeper.create( ZK_TRACKER, Joiner.on(":").join(InetAddress.getLocalHost().getHostAddress(), myPort).getBytes(), ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT); // create myself as leader zooKeeper.create( Joiner.on("/").join(ZK_TRACKER, myID), "primary".getBytes(), ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL_SEQUENTIAL); } else { // if no children then create myself as leader System.out.println("tracker has no children"); if (zooKeeper.getChildren(ZK_TRACKER, zkWatcher, null).isEmpty()) { // create myself as leader and update data of /tracker zooKeeper.setData( ZK_TRACKER, Joiner.on(":").join(InetAddress.getLocalHost().getHostAddress(), myPort).getBytes(), -1); zooKeeper.create( Joiner.on("/").join(ZK_TRACKER, myID), "primary".getBytes(), ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL_SEQUENTIAL); } /* else if there is a child then become the backup */ else { // set watch on "primary" zooKeeper.exists( Joiner.on("/").join(ZK_TRACKER, 
zooKeeper.getChildren(ZK_TRACKER, zkWatcher).get(0)), zkWatcher); zooKeeper.create( Joiner.on("/").join(ZK_TRACKER, myID), "backup".getBytes(), ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL_SEQUENTIAL); } } // set watch at /worker zooKeeper.getChildren(ZK_WORKER, zkWatcher); zooKeeper.getData(ZK_WORKER, zkWatcher, null); } catch (Exception e) { e.printStackTrace(); } }
@Override public void process(WatchedEvent event) { Event.EventType type = event.getType(); String path = event.getPath(); System.out.println("Path: " + path + ", Event type:" + type); // set watch on workers setWatchWorkers(); switch (type) { case NodeDeleted: try { // Check if node deleted is from /tracker if (path.contains(ZK_TRACKER)) { String node = zooKeeper.getChildren(ZK_TRACKER, false).get(0); // If primary is dead then I become primary if (!zooKeeper .getData(Joiner.on("/").join(ZK_TRACKER, node), false, null) .equals("primary")) { zooKeeper.setData(Joiner.on("/").join(ZK_TRACKER, node), "primary".getBytes(), -1); zooKeeper.setData( ZK_TRACKER, Joiner.on(":") .join(InetAddress.getLocalHost().getHostAddress(), myPort) .getBytes(), -1); } } if (path.contains(ZK_WORKER)) { List<String> currJobs = zooKeeper.getChildren(ZK_JOBS, false); // /worker/<id> String workerId = path.split("/")[2]; System.out.println("Dead worker id " + workerId); for (String job : currJobs) { while (true) { String currData = new String(zooKeeper.getData(Joiner.on("/").join(ZK_JOBS, job), false, null)); int currVersion = zooKeeper.exists(Joiner.on("/").join(ZK_JOBS, job), false).getVersion(); System.out.println("Version curr" + currVersion); // de-serialize WorkerInfo workerInfo = gson.fromJson(currData, WorkerInfo.class); HashMap<String, List<Integer>> newMap = workerInfo.getWorkerInfo(); List<Integer> deadWorkerList = newMap.get(workerId); System.out.println("dead worker List" + deadWorkerList); if (deadWorkerList == null) { break; } // remove it from current info as its dead newMap.remove(workerId); List<String> currWorker = zooKeeper.getChildren(ZK_WORKER, false); // Pick a worker ramdomly and assign the task to it by adding to its current task // list int rand = randGen.nextInt(currWorker.size()); List<Integer> newWorkerList = newMap.get(currWorker.get(rand)); if (newWorkerList == null) newWorkerList = deadWorkerList; else newWorkerList.addAll(deadWorkerList); 
newMap.put(currWorker.get(rand), newWorkerList); WorkerInfo newWorkerInfo = new WorkerInfo(newMap, job); // serialize String newData = gson.toJson(newWorkerInfo); // setdata on the znode /jobs/<hash> try { if (zooKeeper.setData( Joiner.on("/").join(ZK_JOBS, job), newData.getBytes(), currVersion) != null) { System.out.println( "Update data for " + workerId + " on to " + currWorker.get(rand)); break; } } catch (KeeperException e) { // Ignore } } } } } catch (Exception e) { e.printStackTrace(); } break; } }