@Test
public void testDeadWorker() throws Exception {
  LOG.info("testDeadWorker");

  conf.setLong("hbase.splitlog.max.resubmit", 0);
  slm = new SplitLogManager(zkw, conf, stopper, master, DUMMY_MASTER, null);
  slm.finishInitialization();
  TaskBatch batch = new TaskBatch();

  String tasknode = submitTaskAndWait(batch, "foo/1");
  int version = ZKUtil.checkExists(zkw, tasknode);
  final ServerName worker1 = new ServerName("worker1,1,1");
  SplitLogTask slt = new SplitLogTask.Owned(worker1);
  ZKUtil.setData(zkw, tasknode, slt.toByteArray());
  if (tot_mgr_heartbeat.get() == 0) {
    waitForCounter(tot_mgr_heartbeat, 0, 1, to / 2);
  }
  slm.handleDeadWorker(worker1);
  if (tot_mgr_resubmit.get() == 0) {
    waitForCounter(tot_mgr_resubmit, 0, 1, to + to / 2);
  }
  if (tot_mgr_resubmit_dead_server_task.get() == 0) {
    waitForCounter(tot_mgr_resubmit_dead_server_task, 0, 1, to + to / 2);
  }

  int version1 = ZKUtil.checkExists(zkw, tasknode);
  assertTrue(version1 > version);
  byte[] taskstate = ZKUtil.getData(zkw, tasknode);
  slt = SplitLogTask.parseFrom(taskstate);
  assertTrue(slt.isUnassigned(DUMMY_MASTER));
  return;
}
@Test
public void testTaskResigned() throws Exception {
  LOG.info("TestTaskResigned - resubmit task node once in RESIGNED state");
  assertEquals(0, tot_mgr_resubmit.get());
  slm = new SplitLogManager(zkw, conf, stopper, master, DUMMY_MASTER, null);
  slm.finishInitialization();
  assertEquals(0, tot_mgr_resubmit.get());
  TaskBatch batch = new TaskBatch();
  String tasknode = submitTaskAndWait(batch, "foo/1");
  assertEquals(0, tot_mgr_resubmit.get());
  final ServerName worker1 = new ServerName("worker1,1,1");
  assertEquals(0, tot_mgr_resubmit.get());
  SplitLogTask slt = new SplitLogTask.Resigned(worker1);
  assertEquals(0, tot_mgr_resubmit.get());
  ZKUtil.setData(zkw, tasknode, slt.toByteArray());
  int version = ZKUtil.checkExists(zkw, tasknode);
  // Could be small race here.
  if (tot_mgr_resubmit.get() == 0) {
    waitForCounter(tot_mgr_resubmit, 0, 1, to / 2);
  }
  assertEquals(1, tot_mgr_resubmit.get());

  int version1 = ZKUtil.checkExists(zkw, tasknode);
  assertTrue("version1=" + version1 + ", version=" + version, version1 > version);
  byte[] taskstate = ZKUtil.getData(zkw, tasknode);
  slt = SplitLogTask.parseFrom(taskstate);
  assertTrue(slt.isUnassigned(DUMMY_MASTER));
}
@Test
public void testRescanCleanup() throws Exception {
  LOG.info("TestRescanCleanup - ensure RESCAN nodes are cleaned up");

  slm = new SplitLogManager(zkw, conf, stopper, master, DUMMY_MASTER, null);
  slm.finishInitialization();
  TaskBatch batch = new TaskBatch();
  String tasknode = submitTaskAndWait(batch, "foo/1");
  int version = ZKUtil.checkExists(zkw, tasknode);
  final ServerName worker1 = new ServerName("worker1,1,1");
  SplitLogTask slt = new SplitLogTask.Owned(worker1);
  ZKUtil.setData(zkw, tasknode, slt.toByteArray());
  waitForCounter(tot_mgr_heartbeat, 0, 1, to / 2);
  waitForCounter(new Expr() {
    @Override
    public long eval() {
      return (tot_mgr_resubmit.get() + tot_mgr_resubmit_failed.get());
    }
  }, 0, 1, 5 * 60000); // wait long enough
  Assert.assertEquals("Could not run test. Lost ZK connection?",
    0, tot_mgr_resubmit_failed.get());
  int version1 = ZKUtil.checkExists(zkw, tasknode);
  assertTrue(version1 > version);
  byte[] taskstate = ZKUtil.getData(zkw, tasknode);
  slt = SplitLogTask.parseFrom(taskstate);
  assertTrue(slt.isUnassigned(DUMMY_MASTER));
  waitForCounter(tot_mgr_rescan_deleted, 0, 1, to / 2);
}
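// The tests above wait on SplitLogCounters values via waitForCounter(counter, oldval, newval,
// timems) and the Expr callback form. The helper below is only a minimal sketch of such a
// polling utility: the signatures are taken from the call sites above, but the bodies are
// assumptions, not this test class's actual implementation.
interface Expr {
  long eval();
}

static void waitForCounter(final java.util.concurrent.atomic.AtomicLong counter, long oldval,
    long newval, long timems) throws Exception {
  // Adapt an AtomicLong counter to the Expr form.
  waitForCounter(new Expr() {
    @Override
    public long eval() {
      return counter.get();
    }
  }, oldval, newval, timems);
}

static void waitForCounter(Expr e, long oldval, long newval, long timems) throws Exception {
  long deadline = System.currentTimeMillis() + timems;
  while (System.currentTimeMillis() < deadline) {
    long v = e.eval();
    if (v == newval) {
      return; // counter reached the expected value
    }
    // Any value other than the old or new one means the test went off the rails.
    org.junit.Assert.assertEquals(oldval, v);
    Thread.sleep(10);
  }
  org.junit.Assert.fail("Timed out waiting for counter to go from " + oldval + " to " + newval);
}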
/**
 * Test whether the splitlog correctly creates a task in zookeeper
 * @throws Exception
 */
@Test
public void testTaskCreation() throws Exception {
  LOG.info("TestTaskCreation - test the creation of a task in zk");
  slm = new SplitLogManager(zkw, conf, stopper, master, DUMMY_MASTER, null);
  slm.finishInitialization();
  TaskBatch batch = new TaskBatch();

  String tasknode = submitTaskAndWait(batch, "foo/1");

  byte[] data = ZKUtil.getData(zkw, tasknode);
  SplitLogTask slt = SplitLogTask.parseFrom(data);
  LOG.info("Task node created " + slt.toString());
  assertTrue(slt.isUnassigned(DUMMY_MASTER));
}
void getDataSetWatchSuccess(String path, byte[] data) {
  SplitLogTask slt;
  try {
    slt = SplitLogTask.parseFrom(data);
  } catch (DeserializationException e) {
    LOG.warn("Failed parse", e);
    return;
  }
  synchronized (grabTaskLock) {
    if (workerInGrabTask) {
      // currentTask can change but that's ok
      String taskpath = currentTask;
      if (taskpath != null && taskpath.equals(path)) {
        ServerName serverName = manager.getServer().getServerName();
        // have to compare data. cannot compare version because then there
        // will be race with attemptToOwnTask()
        // cannot just check whether the node has been transitioned to
        // UNASSIGNED because by the time this worker sets the data watch
        // the node might have made two transitions - from owned by this
        // worker to unassigned to owned by another worker
        if (!slt.isOwned(serverName) && !slt.isDone(serverName) && !slt.isErr(serverName)
            && !slt.isResigned(serverName)) {
          LOG.info("task " + taskpath + " preempted from " + serverName
              + ", current task state and owner=" + slt.toString());
          worker.stopTask();
        }
      }
    }
  }
}
/**
 * try to grab a 'lock' on the task zk node to own and execute the task.
 * <p>
 * @param path zk node for the task
 */
private void grabTask(String path) {
  Stat stat = new Stat();
  byte[] data;
  synchronized (grabTaskLock) {
    currentTask = path;
    workerInGrabTask = true;
    if (Thread.interrupted()) {
      return;
    }
  }
  try {
    try {
      if ((data = ZKUtil.getDataNoWatch(watcher, path, stat)) == null) {
        SplitLogCounters.tot_wkr_failed_to_grab_task_no_data.incrementAndGet();
        return;
      }
    } catch (KeeperException e) {
      LOG.warn("Failed to get data for znode " + path, e);
      SplitLogCounters.tot_wkr_failed_to_grab_task_exception.incrementAndGet();
      return;
    }
    SplitLogTask slt;
    try {
      slt = SplitLogTask.parseFrom(data);
    } catch (DeserializationException e) {
      LOG.warn("Failed parse data for znode " + path, e);
      SplitLogCounters.tot_wkr_failed_to_grab_task_exception.incrementAndGet();
      return;
    }
    if (!slt.isUnassigned()) {
      SplitLogCounters.tot_wkr_failed_to_grab_task_owned.incrementAndGet();
      return;
    }

    currentVersion =
        attemptToOwnTask(true, watcher, server.getServerName(), path, slt.getMode(),
          stat.getVersion());
    if (currentVersion < 0) {
      SplitLogCounters.tot_wkr_failed_to_grab_task_lost_race.incrementAndGet();
      return;
    }

    if (ZKSplitLog.isRescanNode(watcher, currentTask)) {
      ZkSplitLogWorkerCoordination.ZkSplitTaskDetails splitTaskDetails =
          new ZkSplitLogWorkerCoordination.ZkSplitTaskDetails();
      splitTaskDetails.setTaskNode(currentTask);
      splitTaskDetails.setCurTaskZKVersion(new MutableInt(currentVersion));

      endTask(new SplitLogTask.Done(server.getServerName(), slt.getMode()),
        SplitLogCounters.tot_wkr_task_acquired_rescan, splitTaskDetails);
      return;
    }

    LOG.info("worker " + server.getServerName() + " acquired task " + path);
    SplitLogCounters.tot_wkr_task_acquired.incrementAndGet();
    getDataSetWatchAsync();

    submitTask(path, slt.getMode(), currentVersion, reportPeriod);

    // after a successful submit, sleep a little bit to allow other RSs to grab the rest tasks
    try {
      int sleepTime = RandomUtils.nextInt(500) + 500;
      Thread.sleep(sleepTime);
    } catch (InterruptedException e) {
      LOG.warn("Interrupted while yielding for other region servers", e);
      Thread.currentThread().interrupt();
    }
  } finally {
    synchronized (grabTaskLock) {
      workerInGrabTask = false;
      // clear the interrupt from stopTask() otherwise the next task will
      // suffer
      Thread.interrupted();
    }
  }
}
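/*
 * grabTask() above relies on attemptToOwnTask() to atomically transition the task znode to an
 * owned state using the version read by getDataNoWatch(). The fragment below is a hedged,
 * standalone illustration of that versioned-setData ownership pattern against a raw ZooKeeper
 * client; it is not the HBase implementation, and tryOwn/ownedPayload are hypothetical names
 * used only for illustration.
 */
static int tryOwn(org.apache.zookeeper.ZooKeeper zk, String path, byte[] ownedPayload,
    int lastSeenVersion) throws org.apache.zookeeper.KeeperException, InterruptedException {
  try {
    // setData with an explicit expected version: only the first writer since the read succeeds,
    // which is what makes the ownership attempt safe against concurrent workers.
    return zk.setData(path, ownedPayload, lastSeenVersion).getVersion();
  } catch (org.apache.zookeeper.KeeperException.BadVersionException e) {
    // Another worker changed the node first; the caller treats this like losing the race.
    return -1;
  }
}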