@Test public void testMultipleResubmits() throws Exception { LOG.info("TestMultipleResbmits - no indefinite resubmissions"); conf.setInt("hbase.splitlog.max.resubmit", 2); slm = new SplitLogManager(zkw, conf, stopper, master, DUMMY_MASTER, null); slm.finishInitialization(); TaskBatch batch = new TaskBatch(); String tasknode = submitTaskAndWait(batch, "foo/1"); int version = ZKUtil.checkExists(zkw, tasknode); final ServerName worker1 = new ServerName("worker1,1,1"); final ServerName worker2 = new ServerName("worker2,1,1"); final ServerName worker3 = new ServerName("worker3,1,1"); SplitLogTask slt = new SplitLogTask.Owned(worker1); ZKUtil.setData(zkw, tasknode, slt.toByteArray()); waitForCounter(tot_mgr_heartbeat, 0, 1, to / 2); waitForCounter(tot_mgr_resubmit, 0, 1, to + to / 2); int version1 = ZKUtil.checkExists(zkw, tasknode); assertTrue(version1 > version); slt = new SplitLogTask.Owned(worker2); ZKUtil.setData(zkw, tasknode, slt.toByteArray()); waitForCounter(tot_mgr_heartbeat, 1, 2, to / 2); waitForCounter(tot_mgr_resubmit, 1, 2, to + to / 2); int version2 = ZKUtil.checkExists(zkw, tasknode); assertTrue(version2 > version1); slt = new SplitLogTask.Owned(worker3); ZKUtil.setData(zkw, tasknode, slt.toByteArray()); waitForCounter(tot_mgr_heartbeat, 1, 2, to / 2); waitForCounter(tot_mgr_resubmit_threshold_reached, 0, 1, to + to / 2); Thread.sleep(to + to / 2); assertEquals(2L, tot_mgr_resubmit.get()); }
@Test public void testMultipleResubmits() throws Exception { LOG.info("TestMultipleResbmits - no indefinite resubmissions"); int to = 1000; conf.setInt("hbase.splitlog.manager.timeout", to); conf.setInt("hbase.splitlog.manager.timeoutmonitor.period", 100); to = to + 2 * 100; conf.setInt("hbase.splitlog.max.resubmit", 2); slm = new SplitLogManager(zkw, conf, stopper, "dummy-master", null); slm.finishInitialization(); TaskBatch batch = new TaskBatch(); String tasknode = submitTaskAndWait(batch, "foo/1"); int version = ZKUtil.checkExists(zkw, tasknode); ZKUtil.setData(zkw, tasknode, TaskState.TASK_OWNED.get("worker1")); waitForCounter(tot_mgr_heartbeat, 0, 1, 1000); waitForCounter(tot_mgr_resubmit, 0, 1, to + 100); int version1 = ZKUtil.checkExists(zkw, tasknode); assertTrue(version1 > version); ZKUtil.setData(zkw, tasknode, TaskState.TASK_OWNED.get("worker2")); waitForCounter(tot_mgr_heartbeat, 1, 2, 1000); waitForCounter(tot_mgr_resubmit, 1, 2, to + 100); int version2 = ZKUtil.checkExists(zkw, tasknode); assertTrue(version2 > version1); ZKUtil.setData(zkw, tasknode, TaskState.TASK_OWNED.get("worker3")); waitForCounter(tot_mgr_heartbeat, 1, 2, 1000); waitForCounter(tot_mgr_resubmit_threshold_reached, 0, 1, to + 100); Thread.sleep(to + 100); assertEquals(2L, tot_mgr_resubmit.get()); }
public void updateRecord(MetaRecord rec) { try { long ts = System.currentTimeMillis(); putRecord(rec, ts); String node = ZKUtil.joinZNode(H2MetaTableTracker.NODE_NAME, Integer.toString(rec.getId())); // setData会异步触发所有机器(包括本机)上的H2MetaTableTracker.nodeDataChanged // 然后触发下列调用: // =>org.h2.engine.Database.updateDatabaseObject(int) // =>org.h2.engine.Database.update(Session, DbObject) // =>org.h2.engine.Database.addMeta0(Session, DbObject, boolean) // =>又到此方法 // 所以会造成循环 synchronized (this) { // 避免setData后立刻触发nodeDataChanged,此时IdVersion还未更新 ZKUtil.setData(watcher, node, Bytes.toBytes(ts)); // setData后watch不见了,所以要继续watch,监听其他人对此node的修改 // ZKUtil.watchAndCheckExists(watcher, node); Stat stat = new Stat(); ZKUtil.getDataAndWatch(watcher, node, stat); // 这里记录下id的最新版本,触发nodeDataChanged时再检查一下是否版本一样, // 如果不大于这里的版本那么就不再执行updateDatabaseObject操作 tracker.updateIdVersion(rec.getId(), stat.getVersion()); } } catch (Exception e) { throw new RuntimeException(e); } }
@Test public void testRescanCleanup() throws Exception { LOG.info("TestRescanCleanup - ensure RESCAN nodes are cleaned up"); slm = new SplitLogManager(zkw, conf, stopper, master, DUMMY_MASTER, null); slm.finishInitialization(); TaskBatch batch = new TaskBatch(); String tasknode = submitTaskAndWait(batch, "foo/1"); int version = ZKUtil.checkExists(zkw, tasknode); final ServerName worker1 = new ServerName("worker1,1,1"); SplitLogTask slt = new SplitLogTask.Owned(worker1); ZKUtil.setData(zkw, tasknode, slt.toByteArray()); waitForCounter(tot_mgr_heartbeat, 0, 1, to / 2); waitForCounter( new Expr() { @Override public long eval() { return (tot_mgr_resubmit.get() + tot_mgr_resubmit_failed.get()); } }, 0, 1, 5 * 60000); // wait long enough Assert.assertEquals( "Could not run test. Lost ZK connection?", 0, tot_mgr_resubmit_failed.get()); int version1 = ZKUtil.checkExists(zkw, tasknode); assertTrue(version1 > version); byte[] taskstate = ZKUtil.getData(zkw, tasknode); slt = SplitLogTask.parseFrom(taskstate); assertTrue(slt.isUnassigned(DUMMY_MASTER)); waitForCounter(tot_mgr_rescan_deleted, 0, 1, to / 2); }
@Test public void testDeadWorker() throws Exception { LOG.info("testDeadWorker"); conf.setLong("hbase.splitlog.max.resubmit", 0); slm = new SplitLogManager(zkw, conf, stopper, master, DUMMY_MASTER, null); slm.finishInitialization(); TaskBatch batch = new TaskBatch(); String tasknode = submitTaskAndWait(batch, "foo/1"); int version = ZKUtil.checkExists(zkw, tasknode); final ServerName worker1 = new ServerName("worker1,1,1"); SplitLogTask slt = new SplitLogTask.Owned(worker1); ZKUtil.setData(zkw, tasknode, slt.toByteArray()); if (tot_mgr_heartbeat.get() == 0) waitForCounter(tot_mgr_heartbeat, 0, 1, to / 2); slm.handleDeadWorker(worker1); if (tot_mgr_resubmit.get() == 0) waitForCounter(tot_mgr_resubmit, 0, 1, to + to / 2); if (tot_mgr_resubmit_dead_server_task.get() == 0) { waitForCounter(tot_mgr_resubmit_dead_server_task, 0, 1, to + to / 2); } int version1 = ZKUtil.checkExists(zkw, tasknode); assertTrue(version1 > version); byte[] taskstate = ZKUtil.getData(zkw, tasknode); slt = SplitLogTask.parseFrom(taskstate); assertTrue(slt.isUnassigned(DUMMY_MASTER)); return; }
/** * endTask() can fail and the only way to recover out of it is for the {@link SplitLogManager} to * timeout the task node. * * @param slt * @param ctr */ public static void endTask( ZooKeeperWatcher zkw, SplitLogTask slt, AtomicLong ctr, String task, int taskZKVersion) { try { if (ZKUtil.setData(zkw, task, slt.toByteArray(), taskZKVersion)) { LOG.info("successfully transitioned task " + task + " to final state " + slt); ctr.incrementAndGet(); return; } LOG.warn( "failed to transistion task " + task + " to end state " + slt + " because of version mismatch "); } catch (KeeperException.BadVersionException bve) { LOG.warn( "transisition task " + task + " to " + slt + " failed because of version mismatch", bve); } catch (KeeperException.NoNodeException e) { LOG.fatal( "logic error - end task " + task + " " + slt + " failed because task doesn't exist", e); } catch (KeeperException e) { LOG.warn("failed to end task, " + task + " " + slt, e); } SplitLogCounters.tot_wkr_final_transition_failed.incrementAndGet(); }
/** * endTask() can fail and the only way to recover out of it is for the {@link * org.apache.hadoop.hbase.master.SplitLogManager} to timeout the task node. * * @param slt * @param ctr */ @Override public void endTask(SplitLogTask slt, AtomicLong ctr, SplitTaskDetails details) { ZkSplitTaskDetails zkDetails = (ZkSplitTaskDetails) details; String task = zkDetails.getTaskNode(); int taskZKVersion = zkDetails.getCurTaskZKVersion().intValue(); try { if (ZKUtil.setData(watcher, task, slt.toByteArray(), taskZKVersion)) { LOG.info("successfully transitioned task " + task + " to final state " + slt); ctr.incrementAndGet(); return; } LOG.warn( "failed to transistion task " + task + " to end state " + slt + " because of version mismatch "); } catch (KeeperException.BadVersionException bve) { LOG.warn( "transisition task " + task + " to " + slt + " failed because of version mismatch", bve); } catch (KeeperException.NoNodeException e) { LOG.fatal( "logic error - end task " + task + " " + slt + " failed because task doesn't exist", e); } catch (KeeperException e) { LOG.warn("failed to end task, " + task + " " + slt, e); } SplitLogCounters.tot_wkr_final_transition_failed.incrementAndGet(); }
@Test public void testUnassignedTimeout() throws Exception { LOG.info("TestUnassignedTimeout - iff all tasks are unassigned then" + " resubmit"); // create an orphan task in OWNED state String tasknode1 = ZKSplitLog.getEncodedNodeName(zkw, "orphan/1"); final ServerName worker1 = new ServerName("worker1,1,1"); SplitLogTask slt = new SplitLogTask.Owned(worker1); zkw.getRecoverableZooKeeper() .create(tasknode1, slt.toByteArray(), Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT); slm = new SplitLogManager(zkw, conf, stopper, master, DUMMY_MASTER, null); slm.finishInitialization(); waitForCounter(tot_mgr_orphan_task_acquired, 0, 1, to / 2); // submit another task which will stay in unassigned mode TaskBatch batch = new TaskBatch(); submitTaskAndWait(batch, "foo/1"); // keep updating the orphan owned node every to/2 seconds for (int i = 0; i < (3 * to) / 100; i++) { Thread.sleep(100); final ServerName worker2 = new ServerName("worker1,1,1"); slt = new SplitLogTask.Owned(worker2); ZKUtil.setData(zkw, tasknode1, slt.toByteArray()); } // since we have stopped heartbeating the owned node therefore it should // get resubmitted LOG.info("waiting for manager to resubmit the orphan task"); waitForCounter(tot_mgr_resubmit, 0, 1, to + to / 2); // now all the nodes are unassigned. manager should post another rescan waitForCounter(tot_mgr_resubmit_unassigned, 0, 1, 2 * to + to / 2); }
@Test public void testTaskResigned() throws Exception { LOG.info("TestTaskResigned - resubmit task node once in RESIGNED state"); assertEquals(tot_mgr_resubmit.get(), 0); slm = new SplitLogManager(zkw, conf, stopper, master, DUMMY_MASTER, null); slm.finishInitialization(); assertEquals(tot_mgr_resubmit.get(), 0); TaskBatch batch = new TaskBatch(); String tasknode = submitTaskAndWait(batch, "foo/1"); assertEquals(tot_mgr_resubmit.get(), 0); final ServerName worker1 = new ServerName("worker1,1,1"); assertEquals(tot_mgr_resubmit.get(), 0); SplitLogTask slt = new SplitLogTask.Resigned(worker1); assertEquals(tot_mgr_resubmit.get(), 0); ZKUtil.setData(zkw, tasknode, slt.toByteArray()); int version = ZKUtil.checkExists(zkw, tasknode); // Could be small race here. if (tot_mgr_resubmit.get() == 0) waitForCounter(tot_mgr_resubmit, 0, 1, to / 2); assertEquals(tot_mgr_resubmit.get(), 1); int version1 = ZKUtil.checkExists(zkw, tasknode); assertTrue("version1=" + version1 + ", version=" + version, version1 > version); byte[] taskstate = ZKUtil.getData(zkw, tasknode); slt = SplitLogTask.parseFrom(taskstate); assertTrue(slt.isUnassigned(DUMMY_MASTER)); }
@Test public void testTaskDone() throws Exception { LOG.info("TestTaskDone - cleanup task node once in DONE state"); slm = new SplitLogManager(zkw, conf, stopper, "dummy-master", null); slm.finishInitialization(); TaskBatch batch = new TaskBatch(); String tasknode = submitTaskAndWait(batch, "foo/1"); ZKUtil.setData(zkw, tasknode, TaskState.TASK_DONE.get("worker")); synchronized (batch) { while (batch.installed != batch.done) { batch.wait(); } } waitForCounter(tot_mgr_task_deleted, 0, 1, 1000); assertTrue(ZKUtil.checkExists(zkw, tasknode) == -1); }
@Test public void testTaskResigned() throws Exception { LOG.info("TestTaskResigned - resubmit task node once in RESIGNED state"); slm = new SplitLogManager(zkw, conf, stopper, "dummy-master", null); slm.finishInitialization(); TaskBatch batch = new TaskBatch(); String tasknode = submitTaskAndWait(batch, "foo/1"); ZKUtil.setData(zkw, tasknode, TaskState.TASK_RESIGNED.get("worker")); int version = ZKUtil.checkExists(zkw, tasknode); waitForCounter(tot_mgr_resubmit, 0, 1, 1000); int version1 = ZKUtil.checkExists(zkw, tasknode); assertTrue(version1 > version); byte[] taskstate = ZKUtil.getData(zkw, tasknode); assertTrue(Arrays.equals(taskstate, TaskState.TASK_UNASSIGNED.get("dummy-master"))); }
@Override public void setPeerTableCFsConfig(String id, String tableCFsStr) throws ReplicationException { try { if (!peerExists(id)) { throw new IllegalArgumentException( "Cannot set peer tableCFs because id=" + id + " does not exist."); } String tableCFsZKNode = getTableCFsNode(id); byte[] tableCFs = Bytes.toBytes(tableCFsStr); if (ZKUtil.checkExists(this.zookeeper, tableCFsZKNode) != -1) { ZKUtil.setData(this.zookeeper, tableCFsZKNode, tableCFs); } else { ZKUtil.createAndWatch(this.zookeeper, tableCFsZKNode, tableCFs); } LOG.info("Peer tableCFs with id= " + id + " is now " + tableCFsStr); } catch (KeeperException e) { throw new ReplicationException("Unable to change tableCFs of the peer with id=" + id, e); } }
@Test public void testTaskDone() throws Exception { LOG.info("TestTaskDone - cleanup task node once in DONE state"); slm = new SplitLogManager(zkw, conf, stopper, master, DUMMY_MASTER, null); slm.finishInitialization(); TaskBatch batch = new TaskBatch(); String tasknode = submitTaskAndWait(batch, "foo/1"); final ServerName worker1 = new ServerName("worker1,1,1"); SplitLogTask slt = new SplitLogTask.Done(worker1); ZKUtil.setData(zkw, tasknode, slt.toByteArray()); synchronized (batch) { while (batch.installed != batch.done) { batch.wait(); } } waitForCounter(tot_mgr_task_deleted, 0, 1, to / 2); assertTrue(ZKUtil.checkExists(zkw, tasknode) == -1); }
@Test public void testTaskErr() throws Exception { LOG.info("TestTaskErr - cleanup task node once in ERR state"); conf.setInt("hbase.splitlog.max.resubmit", 0); slm = new SplitLogManager(zkw, conf, stopper, "dummy-master", null); slm.finishInitialization(); TaskBatch batch = new TaskBatch(); String tasknode = submitTaskAndWait(batch, "foo/1"); ZKUtil.setData(zkw, tasknode, TaskState.TASK_ERR.get("worker")); synchronized (batch) { while (batch.installed != batch.error) { batch.wait(); } } waitForCounter(tot_mgr_task_deleted, 0, 1, 1000); assertTrue(ZKUtil.checkExists(zkw, tasknode) == -1); conf.setInt("hbase.splitlog.max.resubmit", ZKSplitLog.DEFAULT_MAX_RESUBMIT); }
@Test public void testUnassignedTimeout() throws Exception { LOG.info("TestUnassignedTimeout - iff all tasks are unassigned then" + " resubmit"); // create an orphan task in OWNED state String tasknode1 = ZKSplitLog.getEncodedNodeName(zkw, "orphan/1"); zkw.getRecoverableZooKeeper() .create( tasknode1, TaskState.TASK_OWNED.get("dummy-worker"), Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT); int to = 1000; conf.setInt("hbase.splitlog.manager.timeout", to); conf.setInt("hbase.splitlog.manager.unassigned.timeout", 2 * to); conf.setInt("hbase.splitlog.manager.timeoutmonitor.period", 100); slm = new SplitLogManager(zkw, conf, stopper, "dummy-master", null); slm.finishInitialization(); waitForCounter(tot_mgr_orphan_task_acquired, 0, 1, 100); // submit another task which will stay in unassigned mode TaskBatch batch = new TaskBatch(); submitTaskAndWait(batch, "foo/1"); // keep updating the orphan owned node every to/2 seconds for (int i = 0; i < (3 * to) / 100; i++) { Thread.sleep(100); ZKUtil.setData(zkw, tasknode1, TaskState.TASK_OWNED.get("dummy-worker")); } // since we have stopped heartbeating the owned node therefore it should // get resubmitted LOG.info("waiting for manager to resubmit the orphan task"); waitForCounter(tot_mgr_resubmit, 0, 1, to + 500); // now all the nodes are unassigned. manager should post another rescan waitForCounter(tot_mgr_resubmit_unassigned, 0, 1, 2 * to + 500); }
/** * Update the state znode of a peer cluster. * * @param id * @param state */ private void changePeerState(String id, ZooKeeperProtos.ReplicationState.State state) throws ReplicationException { try { if (!peerExists(id)) { throw new IllegalArgumentException( "Cannot enable/disable peer because id=" + id + " does not exist."); } String peerStateZNode = getPeerStateNode(id); byte[] stateBytes = (state == ZooKeeperProtos.ReplicationState.State.ENABLED) ? ENABLED_ZNODE_BYTES : DISABLED_ZNODE_BYTES; if (ZKUtil.checkExists(this.zookeeper, peerStateZNode) != -1) { ZKUtil.setData(this.zookeeper, peerStateZNode, stateBytes); } else { ZKUtil.createAndWatch(this.zookeeper, peerStateZNode, stateBytes); } LOG.info("Peer with id= " + id + " is now " + state.name()); } catch (KeeperException e) { throw new ReplicationException("Unable to change state of the peer with id=" + id, e); } }
@Test public void testRescanCleanup() throws Exception { LOG.info("TestRescanCleanup - ensure RESCAN nodes are cleaned up"); conf.setInt("hbase.splitlog.manager.timeout", 1000); conf.setInt("hbase.splitlog.manager.timeoutmonitor.period", 100); slm = new SplitLogManager(zkw, conf, stopper, "dummy-master", null); slm.finishInitialization(); TaskBatch batch = new TaskBatch(); String tasknode = submitTaskAndWait(batch, "foo/1"); int version = ZKUtil.checkExists(zkw, tasknode); ZKUtil.setData(zkw, tasknode, TaskState.TASK_OWNED.get("worker1")); waitForCounter(tot_mgr_heartbeat, 0, 1, 1000); waitForCounter( new Expr() { @Override public long eval() { return (tot_mgr_resubmit.get() + tot_mgr_resubmit_failed.get()); } }, 0, 1, 5 * 60000); // wait long enough if (tot_mgr_resubmit_failed.get() == 0) { int version1 = ZKUtil.checkExists(zkw, tasknode); assertTrue(version1 > version); byte[] taskstate = ZKUtil.getData(zkw, tasknode); assertTrue(Arrays.equals(TaskState.TASK_UNASSIGNED.get("dummy-master"), taskstate)); waitForCounter(tot_mgr_rescan_deleted, 0, 1, 1000); } else { LOG.warn("Could not run test. Lost ZK connection?"); } return; }
public boolean shouldAddCheckerMaster() { ZooKeeperWatcher zk = super.getZooKeeper(); String numberN = ZKUtil.joinZNode(zk.baseZNode, CCIndexConstants.CheckNumNode); try { if (ZKUtil.checkExists(zk, numberN) != -1) { ZKUtil.createSetData(zk, numberN, Bytes.toBytes(1)); } else { int num = Bytes.toInt(ZKUtil.getData(zk, numberN)); if (num < this.checkMasterN) { ZKUtil.setData(zk, numberN, Bytes.toBytes(num + 1)); return true; } else { return false; } } } catch (KeeperException e) { // TODO Auto-generated catch block e.printStackTrace(); } return false; }
@Test public void testTaskErr() throws Exception { LOG.info("TestTaskErr - cleanup task node once in ERR state"); conf.setInt("hbase.splitlog.max.resubmit", 0); slm = new SplitLogManager(zkw, conf, stopper, master, DUMMY_MASTER, null); slm.finishInitialization(); TaskBatch batch = new TaskBatch(); String tasknode = submitTaskAndWait(batch, "foo/1"); final ServerName worker1 = new ServerName("worker1,1,1"); SplitLogTask slt = new SplitLogTask.Err(worker1); ZKUtil.setData(zkw, tasknode, slt.toByteArray()); synchronized (batch) { while (batch.installed != batch.error) { batch.wait(); } } waitForCounter(tot_mgr_task_deleted, 0, 1, to / 2); assertTrue(ZKUtil.checkExists(zkw, tasknode) == -1); conf.setInt("hbase.splitlog.max.resubmit", SplitLogManager.DEFAULT_MAX_RESUBMIT); }
@Test public void testDeadWorker() throws Exception { LOG.info("testDeadWorker"); conf.setLong("hbase.splitlog.max.resubmit", 0); slm = new SplitLogManager(zkw, conf, stopper, "dummy-master", null); slm.finishInitialization(); TaskBatch batch = new TaskBatch(); String tasknode = submitTaskAndWait(batch, "foo/1"); int version = ZKUtil.checkExists(zkw, tasknode); ZKUtil.setData(zkw, tasknode, TaskState.TASK_OWNED.get("worker1")); waitForCounter(tot_mgr_heartbeat, 0, 1, 1000); slm.handleDeadWorker("worker1"); waitForCounter(tot_mgr_resubmit, 0, 1, 1000); waitForCounter(tot_mgr_resubmit_dead_server_task, 0, 1, 1000); int version1 = ZKUtil.checkExists(zkw, tasknode); assertTrue(version1 > version); byte[] taskstate = ZKUtil.getData(zkw, tasknode); assertTrue(Arrays.equals(TaskState.TASK_UNASSIGNED.get("dummy-master"), taskstate)); return; }
@Test public void testWorkerCrash() throws Exception { slm = new SplitLogManager(zkw, conf, stopper, master, DUMMY_MASTER, null); slm.finishInitialization(); TaskBatch batch = new TaskBatch(); String tasknode = submitTaskAndWait(batch, "foo/1"); final ServerName worker1 = new ServerName("worker1,1,1"); SplitLogTask slt = new SplitLogTask.Owned(worker1); ZKUtil.setData(zkw, tasknode, slt.toByteArray()); if (tot_mgr_heartbeat.get() == 0) waitForCounter(tot_mgr_heartbeat, 0, 1, to / 2); // Not yet resubmitted. Assert.assertEquals(0, tot_mgr_resubmit.get()); // This server becomes dead Mockito.when(sm.isServerOnline(worker1)).thenReturn(false); Thread.sleep(1300); // The timeout checker is done every 1000 ms (hardcoded). // It has been resubmitted Assert.assertEquals(1, tot_mgr_resubmit.get()); }