private void waitForNewProcedures() {
  // watch for new procedures that we need to start subprocedures for
  LOG.debug("Looking for new procedures under znode:'" + zkController.getAcquiredBarrier() + "'");
  List<String> runningProcedures = null;
  try {
    runningProcedures = ZKUtil.listChildrenAndWatchForNewChildren(
        zkController.getWatcher(), zkController.getAcquiredBarrier());
    if (runningProcedures == null) {
      LOG.debug("No running procedures.");
      return;
    }
  } catch (KeeperException e) {
    member.controllerConnectionFailure(
        "General failure when watching for new procedures", e, null);
  }
  if (runningProcedures == null) {
    LOG.debug("No running procedures.");
    return;
  }
  for (String procName : runningProcedures) {
    // then read in the procedure information
    String path = ZKUtil.joinZNode(zkController.getAcquiredBarrier(), procName);
    startNewSubprocedure(path);
  }
}
public void updateRecord(MetaRecord rec) {
  try {
    long ts = System.currentTimeMillis();
    putRecord(rec, ts);
    String node = ZKUtil.joinZNode(H2MetaTableTracker.NODE_NAME, Integer.toString(rec.getId()));
    // setData asynchronously triggers H2MetaTableTracker.nodeDataChanged on every node
    // (including this one), which in turn triggers the following call chain:
    // =>org.h2.engine.Database.updateDatabaseObject(int)
    // =>org.h2.engine.Database.update(Session, DbObject)
    // =>org.h2.engine.Database.addMeta0(Session, DbObject, boolean)
    // =>back into this method again
    // so without protection this would loop forever
    synchronized (this) {
      // avoid nodeDataChanged firing right after setData, before the IdVersion has been updated
      ZKUtil.setData(watcher, node, Bytes.toBytes(ts));
      // the watch is gone after setData, so watch again to observe other nodes' changes to this znode
      // ZKUtil.watchAndCheckExists(watcher, node);
      Stat stat = new Stat();
      ZKUtil.getDataAndWatch(watcher, node, stat);
      // record the latest version for this id; when nodeDataChanged fires, check the version again
      // and skip updateDatabaseObject if the incoming version is not greater than the one recorded here
      tracker.updateIdVersion(rec.getId(), stat.getVersion());
    }
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
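// NOTE (editor sketch): the comments in updateRecord describe a version guard that breaks the
// nodeDataChanged update loop. The following is a hypothetical sketch of the tracker-side check,
// assuming an idVersions map and a database reference inside H2MetaTableTracker; the real field
// and handler names may differ. Only ZKUtil calls already used in this listing are relied on.
public void nodeDataChanged(String path) {
  int id = Integer.parseInt(ZKUtil.getNodeName(path));
  try {
    Stat stat = new Stat();
    // re-arm the watch (it was consumed by the notification) and read the current version
    ZKUtil.getDataAndWatch(watcher, path, stat);
    Integer known = idVersions.get(id); // hypothetical map: record id -> last seen znode version
    if (known != null && stat.getVersion() <= known) {
      // this change was produced locally (or already processed); skip to avoid the loop
      return;
    }
    idVersions.put(id, stat.getVersion());
    database.updateDatabaseObject(id); // re-read the meta record, as the comments above describe
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}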
@Override
public Configuration getPeerConf(String peerId) throws ReplicationException {
  String znode = ZKUtil.joinZNode(this.peersZNode, peerId);
  byte[] data = null;
  try {
    data = ZKUtil.getData(this.zookeeper, znode);
  } catch (KeeperException e) {
    throw new ReplicationException("Error getting configuration for peer with id=" + peerId, e);
  }
  if (data == null) {
    LOG.error("Could not get configuration for peer because it doesn't exist. peerId=" + peerId);
    return null;
  }
  String otherClusterKey = "";
  try {
    otherClusterKey = parsePeerFrom(data);
  } catch (DeserializationException e) {
    LOG.warn(
        "Failed to parse cluster key from peerId=" + peerId
            + ", specifically the content from the following znode: " + znode);
    return null;
  }
  Configuration otherConf = new Configuration(this.conf);
  try {
    ZKUtil.applyClusterKeyToConf(otherConf, otherClusterKey);
  } catch (IOException e) {
    LOG.error("Can't get peer configuration for peerId=" + peerId + " because:", e);
    return null;
  }
  return otherConf;
}
@Test
public void testDeadWorker() throws Exception {
  LOG.info("testDeadWorker");
  conf.setLong("hbase.splitlog.max.resubmit", 0);
  slm = new SplitLogManager(zkw, conf, stopper, master, DUMMY_MASTER, null);
  slm.finishInitialization();
  TaskBatch batch = new TaskBatch();
  String tasknode = submitTaskAndWait(batch, "foo/1");
  int version = ZKUtil.checkExists(zkw, tasknode);
  final ServerName worker1 = new ServerName("worker1,1,1");
  SplitLogTask slt = new SplitLogTask.Owned(worker1);
  ZKUtil.setData(zkw, tasknode, slt.toByteArray());
  if (tot_mgr_heartbeat.get() == 0) waitForCounter(tot_mgr_heartbeat, 0, 1, to / 2);
  slm.handleDeadWorker(worker1);
  if (tot_mgr_resubmit.get() == 0) waitForCounter(tot_mgr_resubmit, 0, 1, to + to / 2);
  if (tot_mgr_resubmit_dead_server_task.get() == 0) {
    waitForCounter(tot_mgr_resubmit_dead_server_task, 0, 1, to + to / 2);
  }
  int version1 = ZKUtil.checkExists(zkw, tasknode);
  assertTrue(version1 > version);
  byte[] taskstate = ZKUtil.getData(zkw, tasknode);
  slt = SplitLogTask.parseFrom(taskstate);
  assertTrue(slt.isUnassigned(DUMMY_MASTER));
  return;
}
/**
 * Helper method to connect to a peer
 *
 * @param peerId peer's identifier
 * @return object representing the peer
 * @throws ReplicationException
 */
private ReplicationPeer getPeer(String peerId) throws ReplicationException {
  Configuration peerConf = getPeerConf(peerId);
  if (peerConf == null) {
    return null;
  }
  if (this.ourClusterKey.equals(ZKUtil.getZooKeeperClusterKey(peerConf))) {
    LOG.debug("Not connecting to " + peerId + " because it's us");
    return null;
  }
  ReplicationPeer peer =
      new ReplicationPeer(peerConf, peerId, ZKUtil.getZooKeeperClusterKey(peerConf));
  try {
    peer.startStateTracker(this.zookeeper, this.getPeerStateNode(peerId));
  } catch (KeeperException e) {
    throw new ReplicationException(
        "Error starting the peer state tracker for peerId=" + peerId, e);
  }
  try {
    peer.startTableCFsTracker(this.zookeeper, this.getTableCFsNode(peerId));
  } catch (KeeperException e) {
    throw new ReplicationException(
        "Error starting the peer tableCFs tracker for peerId=" + peerId, e);
  }
  peer.getZkw().registerListener(new PeerRegionServerListener(peer));
  return peer;
}
@Override
public void addPeer(String id, String clusterKey, String tableCFs) throws ReplicationException {
  try {
    if (peerExists(id)) {
      throw new IllegalArgumentException(
          "Cannot add a peer with id=" + id + " because that id already exists.");
    }
    ZKUtil.createWithParents(this.zookeeper, this.peersZNode);
    ZKUtil.createAndWatch(
        this.zookeeper, ZKUtil.joinZNode(this.peersZNode, id), toByteArray(clusterKey));
    // There is a race b/w PeerWatcher and ReplicationZookeeper#add method to create the
    // peer-state znode. This happens while adding a peer.
    // The peer state data is set as "ENABLED" by default.
    ZKUtil.createNodeIfNotExistsAndWatch(
        this.zookeeper, getPeerStateNode(id), ENABLED_ZNODE_BYTES); // A peer is enabled by default
    String tableCFsStr = (tableCFs == null) ? "" : tableCFs;
    ZKUtil.createNodeIfNotExistsAndWatch(
        this.zookeeper, getTableCFsNode(id), Bytes.toBytes(tableCFsStr));
  } catch (KeeperException e) {
    throw new ReplicationException(
        "Could not add peer with id=" + id + ", clusterKey=" + clusterKey, e);
  }
}
@Test
public void testRescanCleanup() throws Exception {
  LOG.info("TestRescanCleanup - ensure RESCAN nodes are cleaned up");
  slm = new SplitLogManager(zkw, conf, stopper, master, DUMMY_MASTER, null);
  slm.finishInitialization();
  TaskBatch batch = new TaskBatch();
  String tasknode = submitTaskAndWait(batch, "foo/1");
  int version = ZKUtil.checkExists(zkw, tasknode);
  final ServerName worker1 = new ServerName("worker1,1,1");
  SplitLogTask slt = new SplitLogTask.Owned(worker1);
  ZKUtil.setData(zkw, tasknode, slt.toByteArray());
  waitForCounter(tot_mgr_heartbeat, 0, 1, to / 2);
  waitForCounter(
      new Expr() {
        @Override
        public long eval() {
          return (tot_mgr_resubmit.get() + tot_mgr_resubmit_failed.get());
        }
      },
      0, 1, 5 * 60000); // wait long enough
  Assert.assertEquals(
      "Could not run test. Lost ZK connection?", 0, tot_mgr_resubmit_failed.get());
  int version1 = ZKUtil.checkExists(zkw, tasknode);
  assertTrue(version1 > version);
  byte[] taskstate = ZKUtil.getData(zkw, tasknode);
  slt = SplitLogTask.parseFrom(taskstate);
  assertTrue(slt.isUnassigned(DUMMY_MASTER));
  waitForCounter(tot_mgr_rescan_deleted, 0, 1, to / 2);
}
@Test
public void testUnassignedOrphan() throws Exception {
  LOG.info("TestUnassignedOrphan - an unassigned task is resubmitted at" + " startup");
  String tasknode = ZKSplitLog.getEncodedNodeName(zkw, "orphan/test/slash");
  // create an unassigned orphan task
  zkw.getRecoverableZooKeeper()
      .create(
          tasknode,
          TaskState.TASK_UNASSIGNED.get("dummy-worker"),
          Ids.OPEN_ACL_UNSAFE,
          CreateMode.PERSISTENT);
  int version = ZKUtil.checkExists(zkw, tasknode);
  slm = new SplitLogManager(zkw, conf, stopper, "dummy-master", null);
  slm.finishInitialization();
  waitForCounter(tot_mgr_orphan_task_acquired, 0, 1, 100);
  Task task = slm.findOrCreateOrphanTask(tasknode);
  assertTrue(task.isOrphan());
  assertTrue(task.isUnassigned());
  // wait for RESCAN node to be created
  waitForCounter(tot_mgr_rescan, 0, 1, 500);
  Task task2 = slm.findOrCreateOrphanTask(tasknode);
  assertTrue(task == task2);
  LOG.debug("task = " + task);
  assertEquals(1L, tot_mgr_resubmit.get());
  assertEquals(1, task.incarnation);
  assertEquals(0, task.unforcedResubmits);
  assertTrue(task.isOrphan());
  assertTrue(task.isUnassigned());
  assertTrue(ZKUtil.checkExists(zkw, tasknode) > version);
}
/**
 * This attempts to create an acquired state znode for the procedure (snapshot name).
 *
 * <p>It then looks for the reached znode to trigger in-barrier execution. If not present we have
 * a watcher, if present then trigger the in-barrier action.
 */
@Override
public void sendMemberAcquired(Subprocedure sub) throws IOException {
  String procName = sub.getName();
  try {
    LOG.debug(
        "Member: '" + memberName + "' joining acquired barrier for procedure (" + procName
            + ") in zk");
    String acquiredZNode = ZKUtil.joinZNode(
        ZKProcedureUtil.getAcquireBarrierNode(zkController, procName), memberName);
    ZKUtil.createAndFailSilent(zkController.getWatcher(), acquiredZNode);
    // watch for the complete node for this snapshot
    String reachedBarrier = zkController.getReachedBarrierNode(procName);
    LOG.debug("Watch for global barrier reached:" + reachedBarrier);
    if (ZKUtil.watchAndCheckExists(zkController.getWatcher(), reachedBarrier)) {
      receivedReachedGlobalBarrier(reachedBarrier);
    }
  } catch (KeeperException e) {
    member.controllerConnectionFailure(
        "Failed to acquire barrier for procedure: " + procName + " and member: " + memberName,
        e, procName);
  }
}
@Test
public void testTaskResigned() throws Exception {
  LOG.info("TestTaskResigned - resubmit task node once in RESIGNED state");
  assertEquals(tot_mgr_resubmit.get(), 0);
  slm = new SplitLogManager(zkw, conf, stopper, master, DUMMY_MASTER, null);
  slm.finishInitialization();
  assertEquals(tot_mgr_resubmit.get(), 0);
  TaskBatch batch = new TaskBatch();
  String tasknode = submitTaskAndWait(batch, "foo/1");
  assertEquals(tot_mgr_resubmit.get(), 0);
  final ServerName worker1 = new ServerName("worker1,1,1");
  assertEquals(tot_mgr_resubmit.get(), 0);
  SplitLogTask slt = new SplitLogTask.Resigned(worker1);
  assertEquals(tot_mgr_resubmit.get(), 0);
  ZKUtil.setData(zkw, tasknode, slt.toByteArray());
  int version = ZKUtil.checkExists(zkw, tasknode);
  // Could be small race here.
  if (tot_mgr_resubmit.get() == 0) waitForCounter(tot_mgr_resubmit, 0, 1, to / 2);
  assertEquals(tot_mgr_resubmit.get(), 1);
  int version1 = ZKUtil.checkExists(zkw, tasknode);
  assertTrue("version1=" + version1 + ", version=" + version, version1 > version);
  byte[] taskstate = ZKUtil.getData(zkw, tasknode);
  slt = SplitLogTask.parseFrom(taskstate);
  assertTrue(slt.isUnassigned(DUMMY_MASTER));
}
@Before
public void setup() throws Exception {
  TEST_UTIL = new HBaseTestingUtility();
  TEST_UTIL.startMiniZKCluster();
  conf = TEST_UTIL.getConfiguration();
  // Use a different ZK wrapper instance for each test.
  zkw = new ZooKeeperWatcher(conf, "split-log-manager-tests" + UUID.randomUUID().toString(), null);
  ZKUtil.deleteChildrenRecursively(zkw, zkw.baseZNode);
  ZKUtil.createAndFailSilent(zkw, zkw.baseZNode);
  assertTrue(ZKUtil.checkExists(zkw, zkw.baseZNode) != -1);
  LOG.debug(zkw.baseZNode + " created");
  ZKUtil.createAndFailSilent(zkw, zkw.splitLogZNode);
  assertTrue(ZKUtil.checkExists(zkw, zkw.splitLogZNode) != -1);
  LOG.debug(zkw.splitLogZNode + " created");
  stopped = false;
  resetCounters();
  // By default, we let the test manage the error as before, so the server
  // does not appear as dead from the master point of view, only from the split log pov.
  Mockito.when(sm.isServerOnline(Mockito.any(ServerName.class))).thenReturn(true);
  Mockito.when(master.getServerManager()).thenReturn(sm);
  to = 4000;
  conf.setInt("hbase.splitlog.manager.timeout", to);
  conf.setInt("hbase.splitlog.manager.unassigned.timeout", 2 * to);
  conf.setInt("hbase.splitlog.manager.timeoutmonitor.period", 100);
  to = to + 4 * 100;
}
public void addRecord(MetaRecord rec) {
  try {
    putRecord(rec, System.currentTimeMillis());
    String node = ZKUtil.joinZNode(H2MetaTableTracker.NODE_NAME, Integer.toString(rec.getId()));
    ZKUtil.createAndWatch(watcher, node, EMPTY_BYTE_ARRAY);
    tracker.updateIdVersion(rec.getId(), 0);
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
@Override
public void init() throws ReplicationException {
  try {
    if (ZKUtil.checkExists(this.zookeeper, this.peersZNode) < 0) {
      ZKUtil.createWithParents(this.zookeeper, this.peersZNode);
    }
  } catch (KeeperException e) {
    throw new ReplicationException("Could not initialize replication peers", e);
  }
  connectExistingPeers();
}
/**
 * Test waiting on meta w/ no timeout specified.
 *
 * @throws Exception
 */
@Ignore // Can't make it work reliably on all platforms; mockito gets confused
// Throwing: org.mockito.exceptions.misusing.WrongTypeOfReturnValue:
// Result cannot be returned by locateRegion()
// If you plug locateRegion, it then throws for incCounter, and if you plug
// that ... and so on.
@Test
public void testNoTimeoutWaitForMeta() throws Exception {
  // Mock an HConnection and a HRegionInterface implementation. Have the
  // HConnection return the HRI. Have the HRI return a few mocked up responses
  // to make our test work.
  // Mock an HRegionInterface.
  final HRegionInterface implementation = Mockito.mock(HRegionInterface.class);
  HConnection connection = mockConnection(implementation);
  // Now the ct is up... set into the mocks some answers that make it look
  // like things have been getting assigned. Make it so we'll return a
  // location (no matter what the Get is). Same for getHRegionInfo -- always
  // just return the meta region.
  final Result result = getMetaTableRowResult();
  // TODO: Refactor. This method has been moved out of HConnection.
  // It works for now but has been deprecated.
  Mockito.when(connection.getRegionServerWithRetries((ServerCallable<Result>) Mockito.any()))
      .thenReturn(result);
  Mockito.when(implementation.getRegionInfo((byte[]) Mockito.any()))
      .thenReturn(HRegionInfo.FIRST_META_REGIONINFO);
  final CatalogTracker ct = constructAndStartCatalogTracker(connection);
  ServerName hsa = ct.getMetaLocation();
  Assert.assertNull(hsa);
  // Now test waiting on meta location getting set.
  Thread t = new WaitOnMetaThread(ct) {
    @Override
    void doWaiting() throws InterruptedException {
      this.ct.waitForMeta();
    }
  };
  startWaitAliveThenWaitItLives(t, 1000);
  // This should trigger wake up of meta wait (Its the removal of the meta
  // region unassigned node that triggers catalogtrackers that a meta has
  // been assigned).
  String node = ct.getMetaNodeTracker().getNode();
  ZKUtil.createAndFailSilent(this.watcher, node);
  MetaEditor.updateMetaLocation(ct, HRegionInfo.FIRST_META_REGIONINFO, SN);
  ZKUtil.deleteNode(this.watcher, node);
  // Go get the new meta location. waitForMeta gets and verifies meta.
  Assert.assertTrue(ct.waitForMeta(10000).equals(SN));
  // Join the thread... should exit shortly.
  t.join();
  // Now meta is available.
  Assert.assertTrue(ct.waitForMeta(10000).equals(SN));
}
@Override
public void removePeer(String id) throws ReplicationException {
  try {
    if (!peerExists(id)) {
      throw new IllegalArgumentException(
          "Cannot remove peer with id=" + id + " because that id does not exist.");
    }
    ZKUtil.deleteNodeRecursively(this.zookeeper, ZKUtil.joinZNode(this.peersZNode, id));
  } catch (KeeperException e) {
    throw new ReplicationException("Could not remove peer with id=" + id, e);
  }
}
public void removeRecord(int id) {
  try {
    // System.out.println("removeRecord id: " + id);
    // new Error().printStackTrace();
    Delete delete = new Delete(Bytes.toBytes(id));
    table.delete(delete);
    ZKUtil.deleteNodeFailSilent(
        watcher, ZKUtil.joinZNode(H2MetaTableTracker.NODE_NAME, Integer.toString(id)));
    tracker.removeId(id);
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
private static HRegionServer setDrainingServer(final HRegionServer hrs) throws KeeperException {
  LOG.info(
      "Making " + hrs.getServerName() + " the draining server; "
          + "it has " + hrs.getNumberOfOnlineRegions() + " online regions");
  ZooKeeperWatcher zkw = hrs.getZooKeeper();
  String hrsDrainingZnode = ZKUtil.joinZNode(zkw.drainingZNode, hrs.getServerName().toString());
  ZKUtil.createWithParents(zkw, hrsDrainingZnode);
  return hrs;
}
public int getRedoPos(boolean watch) {
  try {
    byte[] data = null;
    if (watch) data = ZKUtil.getDataAndWatch(watcher, ZooKeeperAdmin.METATABLE_NODE);
    else data = ZKUtil.getData(watcher, ZooKeeperAdmin.METATABLE_NODE);
    if (data != null && data.length > 0) {
      return Bytes.toInt(data);
    }
    return 1;
  } catch (Exception e) {
    throw new MetaTableTrackerException(e);
  }
}
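// NOTE (editor sketch): getRedoPos reads the redo position as a 4-byte int from METATABLE_NODE
// (defaulting to 1 when the znode is empty). The writer side is not shown in this listing; a
// minimal, hypothetical counterpart could look like the following, reusing only calls seen above.
public void updateRedoPos(int redoPos) {
  try {
    // overwrite the znode payload; readers that passed watch=true re-arm via getDataAndWatch
    ZKUtil.setData(watcher, ZooKeeperAdmin.METATABLE_NODE, Bytes.toBytes(redoPos));
  } catch (Exception e) {
    throw new MetaTableTrackerException(e);
  }
}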
/**
 * Kick off a new sub-procedure on the listener with the data stored in the passed znode.
 *
 * <p>Will attempt to create the same procedure multiple times if a procedure znode with the same
 * name is created. It is left up to the coordinator to ensure this doesn't occur.
 *
 * @param path full path to the znode for the procedure to start
 */
private synchronized void startNewSubprocedure(String path) {
  LOG.debug("Found procedure znode: " + path);
  String opName = ZKUtil.getNodeName(path);
  // start watching for an abort notification for the procedure
  String abortZNode = zkController.getAbortZNode(opName);
  try {
    if (ZKUtil.watchAndCheckExists(zkController.getWatcher(), abortZNode)) {
      LOG.debug("Not starting:" + opName + " because we already have an abort notification.");
      return;
    }
  } catch (KeeperException e) {
    member.controllerConnectionFailure(
        "Failed to get the abort znode (" + abortZNode + ") for procedure :" + opName, e, opName);
    return;
  }
  // get the data for the procedure
  Subprocedure subproc = null;
  try {
    byte[] data = ZKUtil.getData(zkController.getWatcher(), path);
    if (!ProtobufUtil.isPBMagicPrefix(data)) {
      String msg = "Data in for starting procedure " + opName
          + " is illegally formatted (no pb magic). "
          + "Killing the procedure: " + Bytes.toString(data);
      LOG.error(msg);
      throw new IllegalArgumentException(msg);
    }
    LOG.debug("start proc data length is " + data.length);
    data = Arrays.copyOfRange(data, ProtobufUtil.lengthOfPBMagic(), data.length);
    LOG.debug("Found data for znode:" + path);
    subproc = member.createSubprocedure(opName, data);
    member.submitSubprocedure(subproc);
  } catch (IllegalArgumentException iae) {
    LOG.error("Illegal argument exception", iae);
    sendMemberAborted(subproc, new ForeignException(getMemberName(), iae));
  } catch (IllegalStateException ise) {
    LOG.error("Illegal state exception ", ise);
    sendMemberAborted(subproc, new ForeignException(getMemberName(), ise));
  } catch (KeeperException e) {
    member.controllerConnectionFailure(
        "Failed to get data for new procedure:" + opName, e, opName);
  } catch (InterruptedException e) {
    member.controllerConnectionFailure(
        "Failed to get data for new procedure:" + opName, e, opName);
    Thread.currentThread().interrupt();
  }
}
public void loadMetaRecords(List<MetaRecord> records) throws Exception {
  MetaRecord rec;
  for (Result r : table.getScanner(new Scan())) {
    if (r.isEmpty()) continue;
    rec = getMetaRecord(r);
    records.add(rec);
    if (!tracker.contains(rec.getId()))
      ZKUtil.createNodeIfNotExistsAndWatch(
          watcher,
          ZKUtil.joinZNode(H2MetaTableTracker.NODE_NAME, Integer.toString(rec.getId())),
          EMPTY_BYTE_ARRAY);
  }
}
private void watchForAbortedProcedures() {
  LOG.debug("Checking for aborted procedures on node: '" + zkController.getAbortZnode() + "'");
  try {
    // this is the list of the currently aborted procedures
    for (String node : ZKUtil.listChildrenAndWatchForNewChildren(
        zkController.getWatcher(), zkController.getAbortZnode())) {
      String abortNode = ZKUtil.joinZNode(zkController.getAbortZnode(), node);
      abort(abortNode);
    }
  } catch (KeeperException e) {
    member.controllerConnectionFailure(
        "Failed to list children for abort node:" + zkController.getAbortZnode(), e, null);
  }
}
@Before
public void setup() throws Exception {
  TEST_UTIL.startMiniZKCluster();
  conf = TEST_UTIL.getConfiguration();
  zkw = new ZooKeeperWatcher(conf, "split-log-manager-tests", null);
  ZKUtil.deleteChildrenRecursively(zkw, zkw.baseZNode);
  ZKUtil.createAndFailSilent(zkw, zkw.baseZNode);
  assertTrue(ZKUtil.checkExists(zkw, zkw.baseZNode) != -1);
  LOG.debug(zkw.baseZNode + " created");
  ZKUtil.createAndFailSilent(zkw, zkw.splitLogZNode);
  assertTrue(ZKUtil.checkExists(zkw, zkw.splitLogZNode) != -1);
  LOG.debug(zkw.splitLogZNode + " created");
  stopped = false;
  resetCounters();
}
/**
 * endTask() can fail and the only way to recover out of it is for the
 * {@link org.apache.hadoop.hbase.master.SplitLogManager} to timeout the task node.
 *
 * @param slt
 * @param ctr
 */
@Override
public void endTask(SplitLogTask slt, AtomicLong ctr, SplitTaskDetails details) {
  ZkSplitTaskDetails zkDetails = (ZkSplitTaskDetails) details;
  String task = zkDetails.getTaskNode();
  int taskZKVersion = zkDetails.getCurTaskZKVersion().intValue();
  try {
    if (ZKUtil.setData(watcher, task, slt.toByteArray(), taskZKVersion)) {
      LOG.info("successfully transitioned task " + task + " to final state " + slt);
      ctr.incrementAndGet();
      return;
    }
    LOG.warn(
        "failed to transition task " + task + " to end state " + slt
            + " because of version mismatch ");
  } catch (KeeperException.BadVersionException bve) {
    LOG.warn(
        "transition of task " + task + " to " + slt + " failed because of version mismatch", bve);
  } catch (KeeperException.NoNodeException e) {
    LOG.fatal(
        "logic error - end task " + task + " " + slt + " failed because task doesn't exist", e);
  } catch (KeeperException e) {
    LOG.warn("failed to end task, " + task + " " + slt, e);
  }
  SplitLogCounters.tot_wkr_final_transition_failed.incrementAndGet();
}
@Test
public void testVanishingTaskZNode() throws Exception {
  LOG.info("testVanishingTaskZNode");
  conf.setInt("hbase.splitlog.manager.unassigned.timeout", 0);
  slm = new SplitLogManager(zkw, conf, stopper, "dummy-master", null);
  slm.finishInitialization();
  FileSystem fs = TEST_UTIL.getTestFileSystem();
  final Path logDir = new Path(fs.getWorkingDirectory(), UUID.randomUUID().toString());
  fs.mkdirs(logDir);
  Path logFile = new Path(logDir, UUID.randomUUID().toString());
  fs.createNewFile(logFile);
  new Thread() {
    public void run() {
      try {
        // this call will block because there are no SplitLogWorkers
        slm.splitLogDistributed(logDir);
      } catch (Exception e) {
        LOG.warn("splitLogDistributed failed", e);
        fail();
      }
    }
  }.start();
  waitForCounter(tot_mgr_node_create_result, 0, 1, 10000);
  String znode = ZKSplitLog.getEncodedNodeName(zkw, logFile.toString());
  // remove the task znode
  ZKUtil.deleteNode(zkw, znode);
  waitForCounter(tot_mgr_get_data_nonode, 0, 1, 30000);
  waitForCounter(tot_mgr_log_split_batch_success, 0, 1, 1000);
  assertTrue(fs.exists(logFile));
  fs.delete(logDir, true);
}
/**
 * endTask() can fail and the only way to recover out of it is for the {@link SplitLogManager} to
 * timeout the task node.
 *
 * @param slt
 * @param ctr
 */
public static void endTask(
    ZooKeeperWatcher zkw, SplitLogTask slt, AtomicLong ctr, String task, int taskZKVersion) {
  try {
    if (ZKUtil.setData(zkw, task, slt.toByteArray(), taskZKVersion)) {
      LOG.info("successfully transitioned task " + task + " to final state " + slt);
      ctr.incrementAndGet();
      return;
    }
    LOG.warn(
        "failed to transition task " + task + " to end state " + slt
            + " because of version mismatch ");
  } catch (KeeperException.BadVersionException bve) {
    LOG.warn(
        "transition of task " + task + " to " + slt + " failed because of version mismatch", bve);
  } catch (KeeperException.NoNodeException e) {
    LOG.fatal(
        "logic error - end task " + task + " " + slt + " failed because task doesn't exist", e);
  } catch (KeeperException e) {
    LOG.warn("failed to end task, " + task + " " + slt, e);
  }
  SplitLogCounters.tot_wkr_final_transition_failed.incrementAndGet();
}
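// NOTE (editor sketch): the static endTask helper finalizes a task znode with a versioned setData,
// so a stale writer cannot clobber a resubmitted task. A rough illustration of a worker reporting
// success through it; SplitLogTask.Done is assumed to exist alongside the Owned/Resigned states
// used in the tests above, and the counter is left to the caller.
void reportTaskDone(ZooKeeperWatcher zkw, ServerName me, String taskZnode, int lastSeenVersion,
    AtomicLong successCounter) {
  SplitLogTask done = new SplitLogTask.Done(me); // assumed terminal state
  endTask(zkw, done, successCounter, taskZnode, lastSeenVersion);
}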
@Test
public void testUnassignedTimeout() throws Exception {
  LOG.info("TestUnassignedTimeout - iff all tasks are unassigned then" + " resubmit");
  // create an orphan task in OWNED state
  String tasknode1 = ZKSplitLog.getEncodedNodeName(zkw, "orphan/1");
  final ServerName worker1 = new ServerName("worker1,1,1");
  SplitLogTask slt = new SplitLogTask.Owned(worker1);
  zkw.getRecoverableZooKeeper()
      .create(tasknode1, slt.toByteArray(), Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
  slm = new SplitLogManager(zkw, conf, stopper, master, DUMMY_MASTER, null);
  slm.finishInitialization();
  waitForCounter(tot_mgr_orphan_task_acquired, 0, 1, to / 2);
  // submit another task which will stay in unassigned mode
  TaskBatch batch = new TaskBatch();
  submitTaskAndWait(batch, "foo/1");
  // keep updating the orphan owned node every 100 ms for roughly 3 * to ms
  for (int i = 0; i < (3 * to) / 100; i++) {
    Thread.sleep(100);
    final ServerName worker2 = new ServerName("worker1,1,1");
    slt = new SplitLogTask.Owned(worker2);
    ZKUtil.setData(zkw, tasknode1, slt.toByteArray());
  }
  // once we stop heartbeating on the owned node, it should get resubmitted
  LOG.info("waiting for manager to resubmit the orphan task");
  waitForCounter(tot_mgr_resubmit, 0, 1, to + to / 2);
  // now all the nodes are unassigned. manager should post another rescan
  waitForCounter(tot_mgr_resubmit_unassigned, 0, 1, 2 * to + to / 2);
}
@Test
public void TestMap() throws Exception {
  String prefix = "0000";
  final String fileName = "19691231f2cd014ea28f42788214560a21a44cef";
  final String mobFilePath = prefix + fileName;
  ImmutableBytesWritable r = new ImmutableBytesWritable(Bytes.toBytes("r"));
  final KeyValue[] kvList = new KeyValue[1];
  kvList[0] = new KeyValue(
      Bytes.toBytes("row"), Bytes.toBytes("family"), Bytes.toBytes("column"),
      Bytes.toBytes(mobFilePath));
  Result columns = mock(Result.class);
  when(columns.rawCells()).thenReturn(kvList);
  Configuration configuration = new Configuration(TEST_UTIL.getConfiguration());
  ZooKeeperWatcher zkw = new ZooKeeperWatcher(configuration, "1", new DummyMobAbortable());
  TableName tn = TableName.valueOf("testSweepMapper");
  TableName lockName = MobUtils.getTableLockName(tn);
  String znode = ZKUtil.joinZNode(zkw.tableLockZNode, lockName.getNameAsString());
  configuration.set(SweepJob.SWEEP_JOB_ID, "1");
  configuration.set(SweepJob.SWEEP_JOB_TABLE_NODE, znode);
  ServerName serverName = SweepJob.getCurrentServerName(configuration);
  configuration.set(SweepJob.SWEEP_JOB_SERVERNAME, serverName.toString());
  TableLockManager tableLockManager =
      TableLockManager.createTableLockManager(configuration, zkw, serverName);
  TableLock lock = tableLockManager.writeLock(lockName, "Run sweep tool");
  lock.acquire();
  try {
    Mapper<ImmutableBytesWritable, Result, Text, KeyValue>.Context ctx = mock(Mapper.Context.class);
    when(ctx.getConfiguration()).thenReturn(configuration);
    SweepMapper map = new SweepMapper();
    doAnswer(
            new Answer<Void>() {
              @Override
              public Void answer(InvocationOnMock invocation) throws Throwable {
                Text text = (Text) invocation.getArguments()[0];
                KeyValue kv = (KeyValue) invocation.getArguments()[1];
                assertEquals(Bytes.toString(text.getBytes(), 0, text.getLength()), fileName);
                assertEquals(0, Bytes.compareTo(kv.getKey(), kvList[0].getKey()));
                return null;
              }
            })
        .when(ctx)
        .write(any(Text.class), any(KeyValue.class));
    map.map(r, columns, ctx);
  } finally {
    lock.release();
  }
}
public static void deletePgPortEphemeralNode(ServerName sn, int port, boolean isMaster) {
  try {
    ZKUtil.deleteNode(
        ZooKeeperAdmin.getZooKeeperWatcher(), getPgPortEphemeralNodePath(sn, port, isMaster));
  } catch (KeeperException e) {
    throw DbException.convert(e);
  }
}
@Test
public void testTaskDone() throws Exception {
  LOG.info("TestTaskDone - cleanup task node once in DONE state");
  slm = new SplitLogManager(zkw, conf, stopper, "dummy-master", null);
  slm.finishInitialization();
  TaskBatch batch = new TaskBatch();
  String tasknode = submitTaskAndWait(batch, "foo/1");
  ZKUtil.setData(zkw, tasknode, TaskState.TASK_DONE.get("worker"));
  synchronized (batch) {
    while (batch.installed != batch.done) {
      batch.wait();
    }
  }
  waitForCounter(tot_mgr_task_deleted, 0, 1, 1000);
  assertTrue(ZKUtil.checkExists(zkw, tasknode) == -1);
}
private static String getPgPortEphemeralNodePath(ServerName sn, int port, boolean isMaster) {
  String znode = (isMaster ? "M" : "S") + ":" + sn.getHostAndPort()
      + Addressing.HOSTNAME_PORT_SEPARATOR + port;
  return ZKUtil.joinZNode(ZooKeeperAdmin.PG_SERVER_NODE, znode);
}
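// NOTE (editor sketch): only the delete side of the pg-port registration appears above. A
// hypothetical create-side counterpart might look like the following; it assumes ZKUtil's
// createEphemeralNodeAndWatch is available in this HBase version and is not taken from the
// project source.
public static void createPgPortEphemeralNode(ServerName sn, int port, boolean isMaster) {
  try {
    ZooKeeperWatcher zkw = ZooKeeperAdmin.getZooKeeperWatcher();
    // ensure the parent exists before registering the ephemeral child
    ZKUtil.createWithParents(zkw, ZooKeeperAdmin.PG_SERVER_NODE);
    ZKUtil.createEphemeralNodeAndWatch(
        zkw, getPgPortEphemeralNodePath(sn, port, isMaster), new byte[0]);
  } catch (KeeperException e) {
    throw DbException.convert(e);
  }
}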