/**
   * Main loop of the server manager task.
   *
   * <p>Calls {@code getServersFile()} and {@code getZkRunning()} once, then loops forever: every
   * queued {@link RestartHandler} is submitted to the single-thread pool and awaited; handlers
   * whose script context reports a non-zero exit code are put back on the queue. Between passes
   * the thread sleeps for five seconds.
   *
   * <p>The method never returns normally. Any exception (including interruption, whether it
   * happens while waiting on a handler or while sleeping) is logged, the pool is shut down and
   * the exception is rethrown.
   *
   * @return never returns normally
   * @throws Exception the failure that terminated the loop
   */
  @Override
  public Boolean call() throws Exception {

    final long timeoutMillis = 5000;

    try {
      getServersFile();
      getZkRunning();

      while (true) {
        while (!restartQueue.isEmpty()) {
          LOG.debug("Restart queue size [" + restartQueue.size() + "]");
          RestartHandler handler = restartQueue.poll();
          Future<ScriptContext> runner = pool.submit(handler);
          ScriptContext scriptContext = runner.get(); // blocking call
          if (scriptContext.getExitCode() != 0) {
            // Restart did not succeed; try again on a later pass.
            restartQueue.add(handler);
          }
        }

        try {
          Thread.sleep(timeoutMillis);
        } catch (InterruptedException e) {
          // Do not swallow the interrupt: an interrupt during runner.get() already ends the
          // loop through the outer catch, so an interrupt while idle must do the same.
          Thread.currentThread().interrupt();
          throw e;
        }
      }

    } catch (Exception e) {
      e.printStackTrace();
      LOG.error(e);
      pool.shutdown();
      throw e;
    }
  }
 public ServerManager(WmsMaster wmsMaster) throws Exception {
   try {
     this.wmsMaster = wmsMaster;
     this.conf = wmsMaster.getConfiguration();
     this.zkc = wmsMaster.getZkClient();
     this.ia = wmsMaster.getInetAddress();
     this.startupTimestamp = wmsMaster.getStartTime();
     this.metrics = wmsMaster.getMetrics();
     maxRestartAttempts =
         conf.getInt(
             Constants.WMS_MASTER_SERVER_RESTART_HANDLER_ATTEMPTS,
             Constants.DEFAULT_WMS_MASTER_SERVER_RESTART_HANDLER_ATTEMPTS);
     retryIntervalMillis =
         conf.getInt(
             Constants.WMS_MASTER_SERVER_RESTART_HANDLER_RETRY_INTERVAL_MILLIS,
             Constants.DEFAULT_WMS_MASTER_SERVER_RESTART_HANDLER_RETRY_INTERVAL_MILLIS);
     retryCounterFactory = new RetryCounterFactory(maxRestartAttempts, retryIntervalMillis);
     parentZnode =
         conf.get(Constants.ZOOKEEPER_ZNODE_PARENT, Constants.DEFAULT_ZOOKEEPER_ZNODE_PARENT);
     pool = Executors.newSingleThreadExecutor();
   } catch (Exception e) {
     e.printStackTrace();
     LOG.error(e);
     throw e;
   }
 }
Пример #3
0
 /**
  * Registers {@code zkWatcher} on the children of {@code ZK_WORKER} and on each child znode.
  *
  * <p>Best effort: any exception is printed to stderr and otherwise ignored.
  */
 public void setWatchWorkers() {
   try {
     // Watch the children list itself, then every child currently present.
     final List<String> workerNames = zooKeeper.getChildren(ZK_WORKER, zkWatcher);
     for (int i = 0; i < workerNames.size(); i++) {
       final String workerPath = ZK_WORKER + "/" + workerNames.get(i);
       zooKeeper.exists(workerPath, zkWatcher);
     }
   } catch (Exception watchFailure) {
     watchFailure.printStackTrace();
   }
 }
Пример #4
0
 /**
  * Releases a lock by deleting its znode, regardless of the znode's version.
  *
  * @param lock full path of the lock znode to delete
  * @return {@code true} if the delete call completed, {@code false} if it threw
  */
 private static boolean unlock(String lock) {
   logger.info("Releasing lock " + lock);
   try {
     zkInstance.delete(lock, -1);
     return true;
   } catch (Exception e) {
     if (e instanceof InterruptedException) {
       // Keep the interrupt visible to the caller.
       Thread.currentThread().interrupt();
     }
     // A failed release must not be reported as success, and must be visible in the log.
     logger.error("Error releasing lock: " + e.toString());
     return false;
   }
 }
Пример #5
0
  /**
   * Acquires a lock named {@code lock} under the {@code /lock} znode.
   *
   * <p>Creates {@code /lock} if it does not exist, then creates an ephemeral sequential znode
   * {@code /lock/<lock><sequence>} and polls the children of {@code /lock} every 300 ms until no
   * sibling with the same name prefix and the same path length sorts before the created node.
   *
   * <p>If anything fails after the sequential node has been created, that node is deleted again
   * so that an abandoned attempt does not keep blocking later lockers.
   *
   * @param lock name of the lock (a single path component)
   * @return the full path of the created lock znode, or {@code null} if the lock could not be
   *     acquired
   */
  private static String lock(String lock) {
    final String parent = "/lock";
    final String lockName = parent + "/" + lock;

    logger.debug("Getting lock " + lockName);

    try {
      if (zkInstance.exists(parent, false) == null) {
        zkInstance.create(parent, new byte[0], ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
      }
    } catch (Exception e) {
      if (e instanceof InterruptedException) {
        Thread.currentThread().interrupt();
      }
      logger.error("Error creating lock node: " + e.toString());
      return null;
    }

    // Stays null until our own sequential node exists, so the failure path knows whether there
    // is anything to clean up.
    String realPath = null;
    try {
      realPath =
          zkInstance.create(
              lockName, new byte[0], ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL_SEQUENTIAL);
      checkLock:
      while (true) {
        List<String> children = zkInstance.getChildren(parent, false);
        for (String curChild : children) {
          String child = parent + "/" + curChild;
          // A sibling for the same lock name (same prefix, same path length) that sorts before
          // our node is ahead of us in the queue: wait and re-check from scratch.
          if (child.compareTo(realPath) < 0
              && child.length() == realPath.length()
              && curChild.startsWith(lock)) {
            Thread.sleep(300);
            continue checkLock;
          }
        }
        logger.info("Got lock " + lockName);
        return realPath;
      }
    } catch (Exception e) {
      logger.error("Exception while trying to get lock " + lockName + " :" + e.toString());
      e.printStackTrace();
      if (realPath != null) {
        // Remove our queue entry; otherwise it would block everyone behind it until the
        // ZooKeeper session ends. Done before re-asserting the interrupt flag so the delete
        // call itself is not aborted by it.
        try {
          zkInstance.delete(realPath, -1);
        } catch (Exception cleanupError) {
          logger.error(
              "Error removing abandoned lock node " + realPath + ": " + cleanupError.toString());
        }
      }
      if (e instanceof InterruptedException) {
        Thread.currentThread().interrupt();
      }
      return null;
    }
  }
  /*
   * public synchronized ArrayList<Request> getWorkloadsList() {
   * ArrayList<Request> workloads = new ArrayList<Request>();
   *
   * LOG.debug("Reading " + parentZnode +
   * Constants.DEFAULT_ZOOKEEPER_ZNODE_WORKLOADS);
   *
   * try { List<String> children = getChildren(parentZnode +
   * Constants.DEFAULT_ZOOKEEPER_ZNODE_WORKLOADS, null);
   *
   * if (!children.isEmpty()) { for (String child : children) { Request
   * request = new Request(); String workloadZnode = parentZnode +
   * Constants.DEFAULT_ZOOKEEPER_ZNODE_WORKLOADS + "/" + child; Stat stat =
   * zkc.exists(workloadZnode, false); if (stat != null) { byte[] bytes =
   * zkc.getData(workloadZnode, false, stat); try {
   * deserializer.deserialize(request, bytes); workloads.add(request); } catch
   * (TException e) { e.printStackTrace(); } } } } } catch (Exception e) {
   * e.printStackTrace(); }
   *
   * return workloads; }
   */
  /**
   * Lists the names of the child znodes under the clients znode
   * ({@code parentZnode + Constants.DEFAULT_ZOOKEEPER_ZNODE_CLIENTS}).
   *
   * @return a new list with the child names; empty if there are none or if the lookup failed
   *     (the failure is printed to stderr)
   */
  public synchronized ArrayList<String> getClientsList() {
    final ArrayList<String> result = new ArrayList<String>();
    final String clientsZnode = parentZnode + Constants.DEFAULT_ZOOKEEPER_ZNODE_CLIENTS;

    LOG.debug("Reading " + clientsZnode);

    try {
      final List<String> names = getChildren(clientsZnode, null);
      if (!names.isEmpty()) {
        result.addAll(names);
      }
    } catch (Exception lookupFailure) {
      lookupFailure.printStackTrace();
    }

    return result;
  }
Пример #7
0
  /**
   * Handles a job packet posted on the event bus and always sends one reply on {@code socket}.
   *
   * <ul>
   *   <li>{@code JOB_REQ}: the hash is added to {@code jobQueue} and {@code JOB_ACCEPTED} is
   *       returned.
   *   <li>{@code JOB_STATUS}: the reply is {@code JOB_RESULT} if a result znode exists for the
   *       hash, otherwise {@code JOB_PROGRESS} if a job znode exists, otherwise
   *       {@code JOB_NOTFOUND}.
   * </ul>
   *
   * <p>Each znode is probed exactly once and the three outcomes are mutually exclusive, so a
   * state change in ZooKeeper while the request is handled cannot leave the reply type unset or
   * set twice.
   *
   * @param jobPacket request received from a client
   * @throws Exception if serializing or sending the reply fails
   */
  @Subscribe
  public void handleJob(JobPacket jobPacket) throws Exception {
    JobPacket packetToClient = new JobPacket();
    if (jobPacket.type == JobPacket.JOB_REQ) {
      jobQueue.add(jobPacket.hash);
      packetToClient.type = JobPacket.JOB_ACCEPTED;
      packetToClient.result = "none";
    }
    if (jobPacket.type == JobPacket.JOB_STATUS) {
      try {
        final String jobPath = Joiner.on("/").join(ZK_JOBS, jobPacket.hash);
        final String resultPath = Joiner.on("/").join(ZK_RESULT, jobPacket.hash);

        // Probe the job first and the result second: a job that completes between the two
        // probes is then reported as a result rather than as "not found".
        final boolean jobExists = zooKeeper.exists(jobPath, false) != null;
        final boolean resultExists = zooKeeper.exists(resultPath, false) != null;

        if (resultExists) {
          byte[] data = zooKeeper.getData(resultPath, false, null);
          packetToClient.type = JobPacket.JOB_RESULT;
          if (data == null) {
            System.out.println("Result not found!");
            packetToClient.result = null;
          } else {
            packetToClient.result = new String(data);
            System.out.println("Result found!");
          }
        } else if (jobExists) {
          System.out.println("Job in progress, please wait!");
          packetToClient.type = JobPacket.JOB_PROGRESS;
          packetToClient.result = "none";
        } else {
          System.out.println("No such Job, please enter your job again!");
          packetToClient.type = JobPacket.JOB_NOTFOUND;
          packetToClient.result = "none";
        }
      } catch (Exception e) {
        // NOTE(review): on a ZooKeeper failure the reply is still sent below, with whatever
        // fields were set before the failure.
        e.printStackTrace();
      }
    }
    socket.send(SerializationUtils.serialize(packetToClient), 0);
  }
 /**
  * Reacts to ZooKeeper watch events.
  *
  * <p>A children-changed event triggers {@code getZkRunning()}; a node-deleted event triggers
  * {@code restartServer} for the deleted path. All other event types are ignored. Exceptions
  * from either action are printed and logged, never propagated.
  *
  * @param event the watch event delivered by ZooKeeper
  */
 public void process(WatchedEvent event) {
   final Event.EventType eventType = event.getType();

   if (eventType == Event.EventType.NodeChildrenChanged) {
     LOG.debug("Running children changed [" + event.getPath() + "]");
     try {
       getZkRunning();
     } catch (Exception refreshFailure) {
       refreshFailure.printStackTrace();
       LOG.error(refreshFailure);
     }
     return;
   }

   if (eventType != Event.EventType.NodeDeleted) {
     return;
   }

   final String deletedPath = event.getPath();
   LOG.debug("Running znode deleted [" + deletedPath + "]");
   try {
     restartServer(deletedPath);
   } catch (Exception restartFailure) {
     restartFailure.printStackTrace();
     LOG.error(restartFailure);
   }
 }
Пример #9
0
  /**
   * Entry point of the job tracker.
   *
   * <p>Expects exactly four arguments: {@code zooHost zooPort myPort myID}. Binds a ZMQ REP
   * socket on {@code myPort}, creates the {@code JobTracker}, registers it on the event bus,
   * starts the worker-management thread and then loops forever, posting every received
   * {@code JobPacket} on the event bus.
   *
   * <p>Exits with status -1 when the argument count is wrong or a port is not a valid integer,
   * instead of continuing with a half-initialized configuration.
   *
   * @param args command-line arguments as described above
   */
  public static void main(String[] args) {
    if (args.length != 4) {
      System.err.println("Usage tracker [zooHost] [zooPort] [myPort] [myID]");
      System.exit(-1);
    }

    try {
      zooHost = args[0];
      zooPort = Integer.parseInt(args[1]);
      myPort = Integer.parseInt(args[2]);
    } catch (NumberFormatException e) {
      // Starting with an unparsed port would bind/connect to the wrong place; stop here.
      System.err.println("Invalid port number: " + e.getMessage());
      System.err.println("Usage tracker [zooHost] [zooPort] [myPort] [myID]");
      System.exit(-1);
    }
    final String myID = args[3];

    // initialize ZMQ
    context = ZMQ.context(1);
    socket = context.socket(ZMQ.REP);
    socket.bind("tcp://*:" + myPort);

    eventBus = new EventBus("Tracker");
    JobTracker t = new JobTracker(myID);
    eventBus.register(t);
    System.out.println("Starting thread");
    new Thread(t.manageWorker()).start();

    while (true) {
      // wait for client req then respond
      JobPacket packetFromClient = (JobPacket) SerializationUtils.deserialize(socket.recv(0));
      System.out.println("From client" + packetFromClient.type);
      eventBus.post(packetFromClient);
    }
  }
    /**
     * Attempts to restart the server described by {@code znodePath}.
     *
     * <p>The znode path is parsed as four colon-separated fields: host name, instance, info port
     * and server start timestamp. A restart command is only built and run when the server start
     * timestamp is later than {@code startupTimestamp}; otherwise the request is logged at debug
     * level and skipped. The command is run through {@code ScriptManager} and repeated, with the
     * delays supplied by the retry counter, until the exit code is 0 or the retry counter
     * reports that no retries are left.
     *
     * <p>Exceptions are printed and logged here and are not propagated to the caller.
     *
     * @return the handler's {@code scriptContext}, in whatever state the last script run (if any)
     *     left it
     * @throws Exception declared by the interface; the body catches every {@code Exception}
     */
    @Override
    public ScriptContext call() throws Exception {
      try {
        // Split the znode path into its colon-separated fields.
        Scanner scn = new Scanner(znodePath);
        scn.useDelimiter(":");
        String hostName = scn.next(); // host name
        String instance = scn.next(); // instance
        // Parsed to consume the field (and validate it is numeric); not used below.
        int infoPort = Integer.parseInt(scn.next()); // UI info port
        long serverStartTimestamp = Long.parseLong(scn.next());
        scn.close();

        // Get the --config property from classpath...it's always first
        // in the classpath
        String cp = System.getProperty("java.class.path");
        scn = new Scanner(cp);
        scn.useDelimiter(":");
        String confDir = scn.next();
        scn.close();
        LOG.debug("conf dir [" + confDir + "]");

        // Get -Dwms.home.dir
        String wmsHome = System.getProperty("wms.home.dir");

        // If stop-wms.sh is executed and WMS_MANAGES_ZK then zookeeper
        // is stopped abruptly.
        // Second scenario is when ZooKeeper fails for some reason
        // regardless of whether WMS
        // manages it. When either happens the WmsServer running znodes
        // still exist in ZooKeeper
        // and we see them at next startup. When they eventually timeout
        // we get node deleted events for a server that no longer
        // exists. So, only recognize
        // WmsServer running znodes that have timestamps after last
        // WmsMaster startup.
        if (serverStartTimestamp > startupTimestamp) {
          scriptContext.setHostName(hostName);
          scriptContext.setScriptName("sys_shell.py");
          // Server on this host: run the daemon script directly; otherwise run it on the remote
          // host through pdsh after changing into the WMS home directory.
          if (hostName.equalsIgnoreCase(ia.getCanonicalHostName()))
            scriptContext.setCommand(
                "bin/wms-daemon.sh --config " + confDir + " start server " + instance);
          else
            scriptContext.setCommand(
                "pdsh -w "
                    + hostName
                    + " \"cd "
                    + wmsHome
                    + ";bin/wms-daemon.sh --config "
                    + confDir
                    + " start server "
                    + instance
                    + "\"");

          RetryCounter retryCounter = retryCounterFactory.create();
          while (true) {
            // Clear output captured by a previous attempt before running the script again.
            if (scriptContext.getStdOut().length() > 0)
              scriptContext.getStdOut().delete(0, scriptContext.getStdOut().length());
            if (scriptContext.getStdErr().length() > 0)
              scriptContext.getStdErr().delete(0, scriptContext.getStdErr().length());
            LOG.info(
                "Restarting WmsServer ["
                    + hostName
                    + ":"
                    + instance
                    + "], script [ "
                    + scriptContext.toString()
                    + " ]");
            ScriptManager.getInstance().runScript(scriptContext);

            if (scriptContext.getExitCode() == 0) {
              LOG.info("WmsServer [" + hostName + ":" + instance + "] restarted");
              break;
            } else {
              // Log the exit code together with any captured stdout/stderr.
              StringBuilder sb = new StringBuilder();
              sb.append("exit code [" + scriptContext.getExitCode() + "]");
              if (!scriptContext.getStdOut().toString().isEmpty())
                sb.append(", stdout [" + scriptContext.getStdOut().toString() + "]");
              if (!scriptContext.getStdErr().toString().isEmpty())
                sb.append(", stderr [" + scriptContext.getStdErr().toString() + "]");
              LOG.error(sb.toString());

              if (!retryCounter.shouldRetry()) {
                // Out of retries: give up; the non-zero exit code stays in scriptContext.
                LOG.error(
                    "WmsServer ["
                        + hostName
                        + ":"
                        + instance
                        + "] restart failed after "
                        + retryCounter.getMaxRetries()
                        + " retries");
                break;
              } else {
                // Wait for the retry interval, then consume one retry and loop again.
                retryCounter.sleepUntilNextRetry();
                retryCounter.useRetry();
              }
            }
          }
        } else {
          LOG.debug(
              "No restart for "
                  + znodePath
                  + "\nbecause WmsServer start time ["
                  + DateFormat.getDateTimeInstance().format(new Date(serverStartTimestamp))
                  + "] was before WmsMaster start time ["
                  + DateFormat.getDateTimeInstance().format(new Date(startupTimestamp))
                  + "]");
        }
      } catch (Exception e) {
        // NOTE(review): failures (including a malformed znode path) are only logged; the caller
        // still receives scriptContext and sees whatever exit code it currently holds.
        e.printStackTrace();
        LOG.error(e);
      }

      return scriptContext;
    }
Пример #11
0
  /**
   * Connects to ZooKeeper and registers this tracker as primary or backup.
   *
   * <p>Blocks until the ZooKeeper session reports {@code SyncConnected}. Then makes sure the
   * {@code ZK_JOBS} and {@code ZK_RESULT} znodes exist and registers under {@code ZK_TRACKER}:
   *
   * <ul>
   *   <li>if {@code ZK_TRACKER} does not exist, it is created holding this tracker's
   *       {@code address:port}, and an ephemeral sequential child with data "primary" is added;
   *   <li>if it exists but has no children, its data is replaced with this tracker's
   *       {@code address:port} and a "primary" child is added;
   *   <li>otherwise a watch is set on the first listed child and a "backup" child is added.
   * </ul>
   *
   * <p>Finally watches are set on the children and data of {@code ZK_WORKER}. Any exception is
   * printed to stderr and the constructor returns normally.
   *
   * @param myID name prefix for this tracker's znode under {@code ZK_TRACKER}
   */
  public JobTracker(String myID) {

    // connect zooKeeper client zk server
    try {
      zkWatcher = new ZkWatcher();
      zkConnected = new CountDownLatch(1);
      zooKeeper =
          new ZooKeeper(
              zooHost + ":" + zooPort,
              ZK_TIMEOUT,
              new Watcher() {
                @Override
                public void process(WatchedEvent event) {
                  /* Release Lock if ZooKeeper is connected */
                  if (event.getState() == Event.KeeperState.SyncConnected) {
                    zkConnected.countDown();
                  } else {
                    System.err.println("Could not connect to ZooKeeper!");
                    // This is a failure, so do not report success to the parent process.
                    System.exit(-1);
                  }
                }
              });
      zkConnected.await();

      // Create /jobs if it doesn't exists
      if (zooKeeper.exists(ZK_JOBS, false) == null) {
        zooKeeper.create(ZK_JOBS, null, ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
      }

      // Create /result if it doesn't exists, to stores already processed jobs with results
      if (zooKeeper.exists(ZK_RESULT, false) == null) {
        zooKeeper.create(ZK_RESULT, null, ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
      }

      // if /tracker does not exists, create one
      if (zooKeeper.exists(ZK_TRACKER, false) == null) {
        zooKeeper.create(
            ZK_TRACKER,
            Joiner.on(":").join(InetAddress.getLocalHost().getHostAddress(), myPort).getBytes(),
            ZooDefs.Ids.OPEN_ACL_UNSAFE,
            CreateMode.PERSISTENT);
        // create myself as leader
        zooKeeper.create(
            Joiner.on("/").join(ZK_TRACKER, myID),
            "primary".getBytes(),
            ZooDefs.Ids.OPEN_ACL_UNSAFE,
            CreateMode.EPHEMERAL_SEQUENTIAL);
      } else {
        // if no children then create myself as leader
        if (zooKeeper.getChildren(ZK_TRACKER, zkWatcher, null).isEmpty()) {
          // Only report "no children" when that is actually the case.
          System.out.println("tracker has no children");
          // create myself as leader and update data of /tracker
          zooKeeper.setData(
              ZK_TRACKER,
              Joiner.on(":").join(InetAddress.getLocalHost().getHostAddress(), myPort).getBytes(),
              -1);
          zooKeeper.create(
              Joiner.on("/").join(ZK_TRACKER, myID),
              "primary".getBytes(),
              ZooDefs.Ids.OPEN_ACL_UNSAFE,
              CreateMode.EPHEMERAL_SEQUENTIAL);
        }
        /* else if there is a child then
        become the backup */
        else {
          // set watch on "primary"
          // NOTE(review): this watches the first child in the returned list and assumes it is
          // the primary.
          zooKeeper.exists(
              Joiner.on("/").join(ZK_TRACKER, zooKeeper.getChildren(ZK_TRACKER, zkWatcher).get(0)),
              zkWatcher);
          zooKeeper.create(
              Joiner.on("/").join(ZK_TRACKER, myID),
              "backup".getBytes(),
              ZooDefs.Ids.OPEN_ACL_UNSAFE,
              CreateMode.EPHEMERAL_SEQUENTIAL);
        }
      }
      // set watch at /worker
      zooKeeper.getChildren(ZK_WORKER, zkWatcher);
      zooKeeper.getData(ZK_WORKER, zkWatcher, null);

    } catch (Exception e) {
      e.printStackTrace();
    }
  }
Пример #12
0
    /**
     * Handles ZooKeeper watch events for the tracker.
     *
     * <p>Every event first re-registers the worker watches. For {@code NodeDeleted} events:
     *
     * <ul>
     *   <li>a deletion under {@code ZK_TRACKER} promotes the first remaining tracker child to
     *       "primary" (and publishes this tracker's {@code address:port} on {@code ZK_TRACKER})
     *       unless that child is already marked "primary";
     *   <li>a deletion under {@code ZK_WORKER} moves the dead worker's task list, in every job
     *       under {@code ZK_JOBS} that references it, to a randomly chosen live worker, using a
     *       version-checked write that is retried on conflict.
     * </ul>
     *
     * <p>Exceptions are printed to stderr and not propagated.
     *
     * @param event the watch event delivered by ZooKeeper
     */
    @Override
    public void process(WatchedEvent event) {
      Event.EventType type = event.getType();
      String path = event.getPath();
      System.out.println("Path: " + path + ", Event type:" + type);
      // set watch on workers
      setWatchWorkers();

      switch (type) {
        case NodeDeleted:
          try {
            // Check if node deleted is from /tracker
            if (path.contains(ZK_TRACKER)) {
              String node = zooKeeper.getChildren(ZK_TRACKER, false).get(0);
              String nodePath = Joiner.on("/").join(ZK_TRACKER, node);
              // If primary is dead then I become primary.
              // The znode data is a byte[]; it has to be decoded before comparing, since
              // byte[].equals(String) is never true.
              byte[] role = zooKeeper.getData(nodePath, false, null);
              if (role == null || !"primary".equals(new String(role))) {
                zooKeeper.setData(nodePath, "primary".getBytes(), -1);
                zooKeeper.setData(
                    ZK_TRACKER,
                    Joiner.on(":")
                        .join(InetAddress.getLocalHost().getHostAddress(), myPort)
                        .getBytes(),
                    -1);
              }
            }
            if (path.contains(ZK_WORKER)) {
              List<String> currJobs = zooKeeper.getChildren(ZK_JOBS, false);
              // /worker/<id>
              String workerId = path.split("/")[2];
              System.out.println("Dead worker id " + workerId);
              for (String job : currJobs) {
                String jobPath = Joiner.on("/").join(ZK_JOBS, job);

                while (true) {
                  // Read the version BEFORE the data: if the znode changes in between, the
                  // version-checked setData below fails and the loop retries, instead of
                  // overwriting a newer value with data derived from a stale read.
                  int currVersion = zooKeeper.exists(jobPath, false).getVersion();
                  String currData = new String(zooKeeper.getData(jobPath, false, null));
                  System.out.println("Version curr" + currVersion);

                  // de-serialize
                  WorkerInfo workerInfo = gson.fromJson(currData, WorkerInfo.class);
                  HashMap<String, List<Integer>> newMap = workerInfo.getWorkerInfo();
                  List<Integer> deadWorkerList = newMap.get(workerId);
                  System.out.println("dead worker List" + deadWorkerList);
                  if (deadWorkerList == null) {
                    // This job has nothing assigned to the dead worker.
                    break;
                  }

                  List<String> currWorker = zooKeeper.getChildren(ZK_WORKER, false);
                  if (currWorker.isEmpty()) {
                    // Nobody can take over; leave the job untouched rather than failing on
                    // nextInt(0) and aborting the handling of the remaining jobs.
                    System.out.println("No live worker to take over tasks of " + workerId);
                    break;
                  }

                  // remove it from current info as its dead
                  newMap.remove(workerId);

                  // Pick a worker randomly and assign the task to it by adding to its current
                  // task list
                  String target = currWorker.get(randGen.nextInt(currWorker.size()));
                  List<Integer> newWorkerList = newMap.get(target);
                  if (newWorkerList == null) {
                    newWorkerList = deadWorkerList;
                  } else {
                    newWorkerList.addAll(deadWorkerList);
                  }
                  newMap.put(target, newWorkerList);

                  WorkerInfo newWorkerInfo = new WorkerInfo(newMap, job);

                  // serialize
                  String newData = gson.toJson(newWorkerInfo);

                  // setdata on the znode /jobs/<hash>
                  try {
                    if (zooKeeper.setData(jobPath, newData.getBytes(), currVersion) != null) {
                      System.out.println("Update data for " + workerId + " on to " + target);
                      break;
                    }
                  } catch (KeeperException e) {
                    // Write rejected (e.g. version conflict): re-read and try again.
                  }
                }
              }
            }

          } catch (Exception e) {
            e.printStackTrace();
          }
          break;
        default:
          // Other event types need no action beyond re-registering the watches above.
          break;
      }
    }