@Test
  public void testDeadWorker() throws Exception {
    LOG.info("testDeadWorker");

    conf.setLong("hbase.splitlog.max.resubmit", 0);
    slm = new SplitLogManager(zkw, conf, stopper, master, DUMMY_MASTER, null);
    slm.finishInitialization();
    TaskBatch batch = new TaskBatch();

    String tasknode = submitTaskAndWait(batch, "foo/1");
    int version = ZKUtil.checkExists(zkw, tasknode);
    final ServerName worker1 = new ServerName("worker1,1,1");
    SplitLogTask slt = new SplitLogTask.Owned(worker1);
    ZKUtil.setData(zkw, tasknode, slt.toByteArray());
    if (tot_mgr_heartbeat.get() == 0) waitForCounter(tot_mgr_heartbeat, 0, 1, to / 2);
    slm.handleDeadWorker(worker1);
    if (tot_mgr_resubmit.get() == 0) waitForCounter(tot_mgr_resubmit, 0, 1, to + to / 2);
    if (tot_mgr_resubmit_dead_server_task.get() == 0) {
      waitForCounter(tot_mgr_resubmit_dead_server_task, 0, 1, to + to / 2);
    }

    int version1 = ZKUtil.checkExists(zkw, tasknode);
    assertTrue(version1 > version);
    byte[] taskstate = ZKUtil.getData(zkw, tasknode);
    slt = SplitLogTask.parseFrom(taskstate);
    assertTrue(slt.isUnassigned(DUMMY_MASTER));
    return;
  }
  @Test
  public void testTaskResigned() throws Exception {
    LOG.info("TestTaskResigned - resubmit task node once in RESIGNED state");
    assertEquals(tot_mgr_resubmit.get(), 0);
    slm = new SplitLogManager(zkw, conf, stopper, master, DUMMY_MASTER, null);
    slm.finishInitialization();
    assertEquals(tot_mgr_resubmit.get(), 0);
    TaskBatch batch = new TaskBatch();
    String tasknode = submitTaskAndWait(batch, "foo/1");
    assertEquals(tot_mgr_resubmit.get(), 0);
    final ServerName worker1 = new ServerName("worker1,1,1");
    assertEquals(tot_mgr_resubmit.get(), 0);
    SplitLogTask slt = new SplitLogTask.Resigned(worker1);
    assertEquals(tot_mgr_resubmit.get(), 0);
    ZKUtil.setData(zkw, tasknode, slt.toByteArray());
    int version = ZKUtil.checkExists(zkw, tasknode);
    // Could be small race here.
    if (tot_mgr_resubmit.get() == 0) waitForCounter(tot_mgr_resubmit, 0, 1, to / 2);
    assertEquals(tot_mgr_resubmit.get(), 1);
    int version1 = ZKUtil.checkExists(zkw, tasknode);
    assertTrue("version1=" + version1 + ", version=" + version, version1 > version);

    byte[] taskstate = ZKUtil.getData(zkw, tasknode);
    slt = SplitLogTask.parseFrom(taskstate);
    assertTrue(slt.isUnassigned(DUMMY_MASTER));
  }
  @Test
  public void testRescanCleanup() throws Exception {
    LOG.info("TestRescanCleanup - ensure RESCAN nodes are cleaned up");

    slm = new SplitLogManager(zkw, conf, stopper, master, DUMMY_MASTER, null);
    slm.finishInitialization();
    TaskBatch batch = new TaskBatch();

    String tasknode = submitTaskAndWait(batch, "foo/1");
    int version = ZKUtil.checkExists(zkw, tasknode);
    final ServerName worker1 = new ServerName("worker1,1,1");
    SplitLogTask slt = new SplitLogTask.Owned(worker1);
    ZKUtil.setData(zkw, tasknode, slt.toByteArray());
    waitForCounter(tot_mgr_heartbeat, 0, 1, to / 2);
    waitForCounter(
        new Expr() {
          @Override
          public long eval() {
            return (tot_mgr_resubmit.get() + tot_mgr_resubmit_failed.get());
          }
        },
        0,
        1,
        5 * 60000); // wait long enough
    Assert.assertEquals(
        "Could not run test. Lost ZK connection?", 0, tot_mgr_resubmit_failed.get());
    int version1 = ZKUtil.checkExists(zkw, tasknode);
    assertTrue(version1 > version);
    byte[] taskstate = ZKUtil.getData(zkw, tasknode);
    slt = SplitLogTask.parseFrom(taskstate);
    assertTrue(slt.isUnassigned(DUMMY_MASTER));

    waitForCounter(tot_mgr_rescan_deleted, 0, 1, to / 2);
  }
  /**
   * Test whether the splitlog correctly creates a task in zookeeper
   *
   * @throws Exception
   */
  @Test
  public void testTaskCreation() throws Exception {

    LOG.info("TestTaskCreation - test the creation of a task in zk");
    slm = new SplitLogManager(zkw, conf, stopper, master, DUMMY_MASTER, null);
    slm.finishInitialization();
    TaskBatch batch = new TaskBatch();

    String tasknode = submitTaskAndWait(batch, "foo/1");

    byte[] data = ZKUtil.getData(zkw, tasknode);
    SplitLogTask slt = SplitLogTask.parseFrom(data);
    LOG.info("Task node created " + slt.toString());
    assertTrue(slt.isUnassigned(DUMMY_MASTER));
  }
 void getDataSetWatchSuccess(String path, byte[] data) {
   SplitLogTask slt;
   try {
     slt = SplitLogTask.parseFrom(data);
   } catch (DeserializationException e) {
     LOG.warn("Failed parse", e);
     return;
   }
   synchronized (grabTaskLock) {
     if (workerInGrabTask) {
       // currentTask can change but that's ok
       String taskpath = currentTask;
       if (taskpath != null && taskpath.equals(path)) {
         ServerName serverName = manager.getServer().getServerName();
         // have to compare data. cannot compare version because then there
         // will be race with attemptToOwnTask()
         // cannot just check whether the node has been transitioned to
         // UNASSIGNED because by the time this worker sets the data watch
         // the node might have made two transitions - from owned by this
         // worker to unassigned to owned by another worker
         if (!slt.isOwned(serverName)
             && !slt.isDone(serverName)
             && !slt.isErr(serverName)
             && !slt.isResigned(serverName)) {
           LOG.info(
               "task "
                   + taskpath
                   + " preempted from "
                   + serverName
                   + ", current task state and owner="
                   + slt.toString());
           worker.stopTask();
         }
       }
     }
   }
 }
  /**
   * try to grab a 'lock' on the task zk node to own and execute the task.
   *
   * <p>
   *
   * @param path zk node for the task
   */
  private void grabTask(String path) {
    Stat stat = new Stat();
    byte[] data;
    synchronized (grabTaskLock) {
      currentTask = path;
      workerInGrabTask = true;
      if (Thread.interrupted()) {
        return;
      }
    }
    try {
      try {
        if ((data = ZKUtil.getDataNoWatch(watcher, path, stat)) == null) {
          SplitLogCounters.tot_wkr_failed_to_grab_task_no_data.incrementAndGet();
          return;
        }
      } catch (KeeperException e) {
        LOG.warn("Failed to get data for znode " + path, e);
        SplitLogCounters.tot_wkr_failed_to_grab_task_exception.incrementAndGet();
        return;
      }
      SplitLogTask slt;
      try {
        slt = SplitLogTask.parseFrom(data);
      } catch (DeserializationException e) {
        LOG.warn("Failed parse data for znode " + path, e);
        SplitLogCounters.tot_wkr_failed_to_grab_task_exception.incrementAndGet();
        return;
      }
      if (!slt.isUnassigned()) {
        SplitLogCounters.tot_wkr_failed_to_grab_task_owned.incrementAndGet();
        return;
      }

      currentVersion =
          attemptToOwnTask(
              true, watcher, server.getServerName(), path, slt.getMode(), stat.getVersion());
      if (currentVersion < 0) {
        SplitLogCounters.tot_wkr_failed_to_grab_task_lost_race.incrementAndGet();
        return;
      }

      if (ZKSplitLog.isRescanNode(watcher, currentTask)) {
        ZkSplitLogWorkerCoordination.ZkSplitTaskDetails splitTaskDetails =
            new ZkSplitLogWorkerCoordination.ZkSplitTaskDetails();
        splitTaskDetails.setTaskNode(currentTask);
        splitTaskDetails.setCurTaskZKVersion(new MutableInt(currentVersion));

        endTask(
            new SplitLogTask.Done(server.getServerName(), slt.getMode()),
            SplitLogCounters.tot_wkr_task_acquired_rescan,
            splitTaskDetails);
        return;
      }

      LOG.info("worker " + server.getServerName() + " acquired task " + path);
      SplitLogCounters.tot_wkr_task_acquired.incrementAndGet();
      getDataSetWatchAsync();

      submitTask(path, slt.getMode(), currentVersion, reportPeriod);

      // after a successful submit, sleep a little bit to allow other RSs to grab the rest tasks
      try {
        int sleepTime = RandomUtils.nextInt(500) + 500;
        Thread.sleep(sleepTime);
      } catch (InterruptedException e) {
        LOG.warn("Interrupted while yielding for other region servers", e);
        Thread.currentThread().interrupt();
      }
    } finally {
      synchronized (grabTaskLock) {
        workerInGrabTask = false;
        // clear the interrupt from stopTask() otherwise the next task will
        // suffer
        Thread.interrupted();
      }
    }
  }