/** Decommissioning using a post-configured exclude hosts file */ @Test public void testAddNewExcludePathToConfiguration() throws Exception { Configuration conf = new Configuration(); rm = new MockRM(conf); rm.start(); MockNM nm1 = rm.registerNode("host1:1234", 5120); MockNM nm2 = rm.registerNode("host2:5678", 10240); ClusterMetrics metrics = ClusterMetrics.getMetrics(); assert (metrics != null); int initialMetricCount = metrics.getNumDecommisionedNMs(); NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true); Assert.assertEquals(NodeAction.NORMAL, nodeHeartbeat.getNodeAction()); nodeHeartbeat = nm2.nodeHeartbeat(true); Assert.assertEquals(NodeAction.NORMAL, nodeHeartbeat.getNodeAction()); writeToHostsFile("host2"); conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, hostFile.getAbsolutePath()); rm.getNodesListManager().refreshNodes(conf); checkDecommissionedNMCount(rm, ++initialMetricCount); nodeHeartbeat = nm1.nodeHeartbeat(true); Assert.assertEquals( "Node should not have been decomissioned.", NodeAction.NORMAL, nodeHeartbeat.getNodeAction()); nodeHeartbeat = nm2.nodeHeartbeat(true); Assert.assertEquals( "Node should have been decomissioned but is in state" + nodeHeartbeat.getNodeAction(), NodeAction.SHUTDOWN, nodeHeartbeat.getNodeAction()); }
/** @throws Exception If failed. */ public void testClusterNodeMetrics() throws Exception { final Ignite ignite0 = grid(); final Ignite ignite1 = startGrid(1); GridTestUtils.waitForCondition( new GridAbsPredicate() { @Override public boolean apply() { return ignite0.cluster().nodes().size() == 2 && ignite1.cluster().nodes().size() == 2; } }, 3000L); ClusterMetrics metrics0 = ignite0.cluster().localNode().metrics(); ClusterMetrics nodesMetrics = ignite0 .cluster() .forNode(ignite0.cluster().localNode(), ignite1.cluster().localNode()) .metrics(); assertEquals(metrics0.getTotalCpus(), nodesMetrics.getTotalCpus()); assertEquals(1, metrics0.getTotalNodes()); assertEquals(2, nodesMetrics.getTotalNodes()); assert metrics0.getHeapMemoryUsed() > 0; assert metrics0.getHeapMemoryTotal() > 0; assert metrics0.getNonHeapMemoryMaximum() > 0; }
/** @throws Exception If failed. */ public void testIoMetrics() throws Exception { Ignite ignite0 = grid(); Ignite ignite1 = startGrid(1); Object msg = new TestMessage(); int size = ignite0.configuration().getMarshaller().marshal(msg).length; assert size > MSG_SIZE; final CountDownLatch latch = new CountDownLatch(MSG_CNT); ignite0 .message() .localListen( null, new MessagingListenActor<TestMessage>() { @Override protected void receive(UUID nodeId, TestMessage rcvMsg) throws Throwable { latch.countDown(); } }); ignite1 .message() .localListen( null, new MessagingListenActor<TestMessage>() { @Override protected void receive(UUID nodeId, TestMessage rcvMsg) throws Throwable { respond(rcvMsg); } }); for (int i = 0; i < MSG_CNT; i++) message(ignite0.cluster().forRemotes()).send(null, msg); latch.await(); ClusterMetrics metrics = ignite0.cluster().localNode().metrics(); info("Node 0 metrics: " + metrics); // Time sync messages are being sent. assert metrics.getSentMessagesCount() >= MSG_CNT; assert metrics.getSentBytesCount() > size * MSG_CNT; assert metrics.getReceivedMessagesCount() >= MSG_CNT; assert metrics.getReceivedBytesCount() > size * MSG_CNT; metrics = ignite1.cluster().localNode().metrics(); info("Node 1 metrics: " + metrics); // Time sync messages are being sent. assert metrics.getSentMessagesCount() >= MSG_CNT; assert metrics.getSentBytesCount() > size * MSG_CNT; assert metrics.getReceivedMessagesCount() >= MSG_CNT; assert metrics.getReceivedBytesCount() > size * MSG_CNT; }
private void checkShutdownNMCount(MockRM rm, int count) throws InterruptedException { int waitCount = 0; while (ClusterMetrics.getMetrics().getNumShutdownNMs() != count && waitCount++ < 20) { synchronized (this) { wait(100); } } Assert.assertEquals( "The shutdown metrics are not updated", count, ClusterMetrics.getMetrics().getNumShutdownNMs()); }
@Test public void testInvalidNMUnregistration() throws Exception { Configuration conf = new Configuration(); rm = new MockRM(conf); rm.start(); ResourceTrackerService resourceTrackerService = rm.getResourceTrackerService(); int shutdownNMsCount = ClusterMetrics.getMetrics().getNumShutdownNMs(); int decommisionedNMsCount = ClusterMetrics.getMetrics().getNumDecommisionedNMs(); // Node not found for unregister UnRegisterNodeManagerRequest request = Records.newRecord(UnRegisterNodeManagerRequest.class); request.setNodeId(BuilderUtils.newNodeId("host", 1234)); resourceTrackerService.unRegisterNodeManager(request); checkShutdownNMCount(rm, 0); checkDecommissionedNMCount(rm, 0); // 1. Register the Node Manager // 2. Exclude the same Node Manager host // 3. Give NM heartbeat to RM // 4. Unregister the Node Manager MockNM nm1 = new MockNM("host1:1234", 5120, resourceTrackerService); RegisterNodeManagerResponse response = nm1.registerNode(); Assert.assertEquals(NodeAction.NORMAL, response.getNodeAction()); writeToHostsFile("host2"); conf.set(YarnConfiguration.RM_NODES_INCLUDE_FILE_PATH, hostFile.getAbsolutePath()); rm.getNodesListManager().refreshNodes(conf); NodeHeartbeatResponse heartbeatResponse = nm1.nodeHeartbeat(true); Assert.assertEquals(NodeAction.SHUTDOWN, heartbeatResponse.getNodeAction()); checkShutdownNMCount(rm, shutdownNMsCount); checkDecommissionedNMCount(rm, ++decommisionedNMsCount); request.setNodeId(nm1.getNodeId()); resourceTrackerService.unRegisterNodeManager(request); checkShutdownNMCount(rm, shutdownNMsCount); checkDecommissionedNMCount(rm, decommisionedNMsCount); // 1. Register the Node Manager // 2. Exclude the same Node Manager host // 3. Unregister the Node Manager MockNM nm2 = new MockNM("host2:1234", 5120, resourceTrackerService); RegisterNodeManagerResponse response2 = nm2.registerNode(); Assert.assertEquals(NodeAction.NORMAL, response2.getNodeAction()); writeToHostsFile("host1"); conf.set(YarnConfiguration.RM_NODES_INCLUDE_FILE_PATH, hostFile.getAbsolutePath()); rm.getNodesListManager().refreshNodes(conf); request.setNodeId(nm2.getNodeId()); resourceTrackerService.unRegisterNodeManager(request); checkShutdownNMCount(rm, shutdownNMsCount); checkDecommissionedNMCount(rm, ++decommisionedNMsCount); }
/** decommissioning using a exclude hosts file */ @Test public void testDecommissionWithExcludeHosts() throws Exception { Configuration conf = new Configuration(); conf.set("yarn.resourcemanager.nodes.exclude-path", hostFile.getAbsolutePath()); writeToHostsFile(""); rm = new MockRM(conf); rm.start(); MockNM nm1 = rm.registerNode("host1:1234", 5120); MockNM nm2 = rm.registerNode("host2:5678", 10240); int initialMetricCount = ClusterMetrics.getMetrics().getNumDecommisionedNMs(); HeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); nodeHeartbeat = nm2.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); writeToHostsFile("host2"); rm.getNodesListManager().refreshNodes(); nodeHeartbeat = nm1.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); nodeHeartbeat = nm2.nodeHeartbeat(true); Assert.assertTrue( "The decommisioned metrics are not updated", NodeAction.SHUTDOWN.equals(nodeHeartbeat.getNodeAction())); checkDecommissionedNMCount(rm, ++initialMetricCount); }
/** Decommissioning using a pre-configured include hosts file */ @Test public void testDecommissionWithIncludeHosts() throws Exception { writeToHostsFile("localhost", "host1", "host2"); Configuration conf = new Configuration(); conf.set(YarnConfiguration.RM_NODES_INCLUDE_FILE_PATH, hostFile.getAbsolutePath()); rm = new MockRM(conf); rm.start(); MockNM nm1 = rm.registerNode("host1:1234", 5120); MockNM nm2 = rm.registerNode("host2:5678", 10240); MockNM nm3 = rm.registerNode("localhost:4433", 1024); ClusterMetrics metrics = ClusterMetrics.getMetrics(); assert (metrics != null); int metricCount = metrics.getNumDecommisionedNMs(); NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); nodeHeartbeat = nm2.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); nodeHeartbeat = nm3.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); // To test that IPs also work String ip = NetUtils.normalizeHostName("localhost"); writeToHostsFile("host1", ip); rm.getNodesListManager().refreshNodes(conf); checkDecommissionedNMCount(rm, ++metricCount); nodeHeartbeat = nm1.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); Assert.assertEquals(1, ClusterMetrics.getMetrics().getNumDecommisionedNMs()); nodeHeartbeat = nm2.nodeHeartbeat(true); Assert.assertTrue( "Node is not decommisioned.", NodeAction.SHUTDOWN.equals(nodeHeartbeat.getNodeAction())); nodeHeartbeat = nm3.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); Assert.assertEquals(metricCount, ClusterMetrics.getMetrics().getNumDecommisionedNMs()); }
@Test public void testUnhealthyNMUnregistration() throws Exception { Configuration conf = new Configuration(); rm = new MockRM(conf); rm.start(); ResourceTrackerService resourceTrackerService = rm.getResourceTrackerService(); MockNM nm1 = rm.registerNode("host1:1234", 5120); Assert.assertEquals(0, ClusterMetrics.getMetrics().getUnhealthyNMs()); // node healthy nm1.nodeHeartbeat(true); int shutdownNMsCount = ClusterMetrics.getMetrics().getNumShutdownNMs(); // node unhealthy nm1.nodeHeartbeat(false); checkUnealthyNMCount(rm, nm1, true, 1); UnRegisterNodeManagerRequest request = Records.newRecord(UnRegisterNodeManagerRequest.class); request.setNodeId(nm1.getNodeId()); resourceTrackerService.unRegisterNodeManager(request); checkShutdownNMCount(rm, ++shutdownNMsCount); }
@After public void tearDown() { if (hostFile != null && hostFile.exists()) { hostFile.delete(); } ClusterMetrics.destroy(); if (rm != null) { rm.stop(); } MetricsSystem ms = DefaultMetricsSystem.instance(); if (ms.getSource("ClusterMetrics") != null) { DefaultMetricsSystem.shutdown(); } }
private void checkUnealthyNMCount(MockRM rm, MockNM nm1, boolean health, int count) throws Exception { int waitCount = 0; while ((rm.getRMContext().getRMNodes().get(nm1.getNodeId()).getState() != NodeState.UNHEALTHY) == health && waitCount++ < 20) { synchronized (this) { wait(100); } } Assert.assertFalse( (rm.getRMContext().getRMNodes().get(nm1.getNodeId()).getState() != NodeState.UNHEALTHY) == health); Assert.assertEquals( "Unhealthy metrics not incremented", count, ClusterMetrics.getMetrics().getUnhealthyNMs()); }
@Test public void testUnhealthyNodeStatus() throws Exception { Configuration conf = new Configuration(); conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, hostFile.getAbsolutePath()); rm = new MockRM(conf); rm.start(); MockNM nm1 = rm.registerNode("host1:1234", 5120); Assert.assertEquals(0, ClusterMetrics.getMetrics().getUnhealthyNMs()); // node healthy nm1.nodeHeartbeat(true); // node unhealthy nm1.nodeHeartbeat(false); checkUnealthyNMCount(rm, nm1, true, 1); // node healthy again nm1.nodeHeartbeat(true); checkUnealthyNMCount(rm, nm1, false, 0); }
@Test public void testReboot() throws Exception { Configuration conf = new Configuration(); rm = new MockRM(conf); rm.start(); MockNM nm1 = rm.registerNode("host1:1234", 5120); MockNM nm2 = rm.registerNode("host2:1234", 2048); int initialMetricCount = ClusterMetrics.getMetrics().getNumRebootedNMs(); NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); nodeHeartbeat = nm2.nodeHeartbeat(new HashMap<ApplicationId, List<ContainerStatus>>(), true, -100); Assert.assertTrue(NodeAction.RESYNC.equals(nodeHeartbeat.getNodeAction())); Assert.assertEquals( "Too far behind rm response id:0 nm response id:-100", nodeHeartbeat.getDiagnosticsMessage()); checkRebootedNMCount(rm, ++initialMetricCount); }
@Test public void testUnhealthyNodeStatus() throws Exception { Configuration conf = new Configuration(); conf.set("yarn.resourcemanager.nodes.exclude-path", hostFile.getAbsolutePath()); MockRM rm = new MockRM(conf); rm.start(); MockNM nm1 = rm.registerNode("host1:1234", 5120); Assert.assertEquals(0, ClusterMetrics.getMetrics().getUnhealthyNMs()); // node healthy nm1.nodeHeartbeat(true); // node unhealthy nm1.nodeHeartbeat(false); checkUnealthyNMCount(rm, nm1, true, 1); // node healthy again nm1.nodeHeartbeat(true); checkUnealthyNMCount(rm, nm1, false, 0); }
@Test public void testReboot() throws Exception { Configuration conf = new Configuration(); rm = new MockRM(conf); rm.start(); MockNM nm1 = rm.registerNode("host1:1234", 5120); MockNM nm2 = new MockNM("host2:1234", 2048, rm.getResourceTrackerService()); int initialMetricCount = ClusterMetrics.getMetrics().getNumRebootedNMs(); HeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); nodeHeartbeat = nm2.nodeHeartbeat( new HashMap<ApplicationId, List<ContainerStatus>>(), true, recordFactory.newRecordInstance(NodeId.class)); Assert.assertTrue(NodeAction.REBOOT.equals(nodeHeartbeat.getNodeAction())); checkRebootedNMCount(rm, ++initialMetricCount); }
@Test public void testNMUnregistration() throws Exception { Configuration conf = new Configuration(); rm = new MockRM(conf); rm.start(); ResourceTrackerService resourceTrackerService = rm.getResourceTrackerService(); MockNM nm1 = rm.registerNode("host1:1234", 5120); int shutdownNMsCount = ClusterMetrics.getMetrics().getNumShutdownNMs(); NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); UnRegisterNodeManagerRequest request = Records.newRecord(UnRegisterNodeManagerRequest.class); request.setNodeId(nm1.getNodeId()); resourceTrackerService.unRegisterNodeManager(request); checkShutdownNMCount(rm, ++shutdownNMsCount); // The RM should remove the node after unregistration, hence send a reboot // command. nodeHeartbeat = nm1.nodeHeartbeat(true); Assert.assertTrue(NodeAction.RESYNC.equals(nodeHeartbeat.getNodeAction())); }
/** @throws Exception If failed. */ public void testSingleTaskMetrics() throws Exception { Ignite ignite = grid(); ignite.compute().execute(new GridTestTask(), "testArg"); // Let metrics update twice. final CountDownLatch latch = new CountDownLatch(2); ignite .events() .localListen( new IgnitePredicate<Event>() { @Override public boolean apply(Event evt) { assert evt.type() == EVT_NODE_METRICS_UPDATED; latch.countDown(); return true; } }, EVT_NODE_METRICS_UPDATED); // Wait for metrics update. latch.await(); ClusterMetrics metrics = ignite.cluster().localNode().metrics(); info("Node metrics: " + metrics); assert metrics.getAverageActiveJobs() > 0; assert metrics.getAverageCancelledJobs() == 0; assert metrics.getAverageJobExecuteTime() >= 0; assert metrics.getAverageJobWaitTime() >= 0; assert metrics.getAverageRejectedJobs() == 0; assert metrics.getAverageWaitingJobs() == 0; assert metrics.getCurrentActiveJobs() == 0; assert metrics.getCurrentCancelledJobs() == 0; assert metrics.getCurrentJobExecuteTime() == 0; assert metrics.getCurrentJobWaitTime() == 0; assert metrics.getCurrentWaitingJobs() == 0; assert metrics.getMaximumActiveJobs() == 1; assert metrics.getMaximumCancelledJobs() == 0; assert metrics.getMaximumJobExecuteTime() >= 0; assert metrics.getMaximumJobWaitTime() >= 0; assert metrics.getMaximumRejectedJobs() == 0; assert metrics.getMaximumWaitingJobs() == 0; assert metrics.getTotalCancelledJobs() == 0; assert metrics.getTotalExecutedJobs() == 1; assert metrics.getTotalRejectedJobs() == 0; assert metrics.getTotalExecutedTasks() == 1; assertTrue( "MaximumJobExecuteTime=" + metrics.getMaximumJobExecuteTime() + " is less than AverageJobExecuteTime=" + metrics.getAverageJobExecuteTime(), metrics.getMaximumJobExecuteTime() >= metrics.getAverageJobExecuteTime()); }
/** Decommissioning using a pre-configured exclude hosts file */ @Test public void testDecommissionWithExcludeHosts() throws Exception { Configuration conf = new Configuration(); conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, hostFile.getAbsolutePath()); writeToHostsFile(""); final DrainDispatcher dispatcher = new DrainDispatcher(); rm = new MockRM(conf) { @Override protected Dispatcher createDispatcher() { return dispatcher; } }; rm.start(); MockNM nm1 = rm.registerNode("host1:1234", 5120); MockNM nm2 = rm.registerNode("host2:5678", 10240); MockNM nm3 = rm.registerNode("localhost:4433", 1024); dispatcher.await(); int metricCount = ClusterMetrics.getMetrics().getNumDecommisionedNMs(); NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); nodeHeartbeat = nm2.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); dispatcher.await(); // To test that IPs also work String ip = NetUtils.normalizeHostName("localhost"); writeToHostsFile("host2", ip); rm.getNodesListManager().refreshNodes(conf); checkDecommissionedNMCount(rm, metricCount + 2); nodeHeartbeat = nm1.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); nodeHeartbeat = nm2.nodeHeartbeat(true); Assert.assertTrue( "The decommisioned metrics are not updated", NodeAction.SHUTDOWN.equals(nodeHeartbeat.getNodeAction())); nodeHeartbeat = nm3.nodeHeartbeat(true); Assert.assertTrue( "The decommisioned metrics are not updated", NodeAction.SHUTDOWN.equals(nodeHeartbeat.getNodeAction())); dispatcher.await(); writeToHostsFile(""); rm.getNodesListManager().refreshNodes(conf); nm3 = rm.registerNode("localhost:4433", 1024); dispatcher.await(); nodeHeartbeat = nm3.nodeHeartbeat(true); dispatcher.await(); Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); // decommissined node is 1 since 1 node is rejoined after updating exclude // file checkDecommissionedNMCount(rm, metricCount + 1); }
@Test(timeout = 60000) public void testReconnect() throws IOException { RMNodeImpl node = getRunningNode(); ClusterMetrics cm = ClusterMetrics.getMetrics(); int initialActive = cm.getNumActiveNMs(); int initialLost = cm.getNumLostNMs(); int initialUnhealthy = cm.getUnhealthyNMs(); int initialDecommissioned = cm.getNumDecommisionedNMs(); int initialRebooted = cm.getNumRebootedNMs(); node.handle( new RMNodeReconnectEvent( node.getNodeID(), node, new TransactionStateImpl(TransactionType.RM))); Assert.assertEquals("Active Nodes", initialActive, cm.getNumActiveNMs()); Assert.assertEquals("Lost Nodes", initialLost, cm.getNumLostNMs()); Assert.assertEquals("Unhealthy Nodes", initialUnhealthy, cm.getUnhealthyNMs()); Assert.assertEquals("Decommissioned Nodes", initialDecommissioned, cm.getNumDecommisionedNMs()); Assert.assertEquals("Rebooted Nodes", initialRebooted, cm.getNumRebootedNMs()); Assert.assertEquals(NodeState.RUNNING, node.getState()); Assert.assertNotNull(nodesListManagerEvent); Assert.assertEquals(NodesListManagerEventType.NODE_USABLE, nodesListManagerEvent.getType()); }
@Test(timeout = 60000) public void testAdd() { RMNodeImpl node = getNewNode(); ClusterMetrics cm = ClusterMetrics.getMetrics(); int initialActive = cm.getNumActiveNMs(); int initialLost = cm.getNumLostNMs(); int initialUnhealthy = cm.getUnhealthyNMs(); int initialDecommissioned = cm.getNumDecommisionedNMs(); int initialRebooted = cm.getNumRebootedNMs(); node.handle(new RMNodeEvent(node.getNodeID(), RMNodeEventType.STARTED, null)); Assert.assertEquals("Active Nodes", initialActive + 1, cm.getNumActiveNMs()); Assert.assertEquals("Lost Nodes", initialLost, cm.getNumLostNMs()); Assert.assertEquals("Unhealthy Nodes", initialUnhealthy, cm.getUnhealthyNMs()); Assert.assertEquals("Decommissioned Nodes", initialDecommissioned, cm.getNumDecommisionedNMs()); Assert.assertEquals("Rebooted Nodes", initialRebooted, cm.getNumRebootedNMs()); Assert.assertEquals(NodeState.RUNNING, node.getState()); Assert.assertNotNull(nodesListManagerEvent); Assert.assertEquals(NodesListManagerEventType.NODE_USABLE, nodesListManagerEvent.getType()); }
@Test(timeout = 60000) public void testUnhealthyRebooting() throws IOException { RMNodeImpl node = getUnhealthyNode(); ClusterMetrics cm = ClusterMetrics.getMetrics(); int initialActive = cm.getNumActiveNMs(); int initialLost = cm.getNumLostNMs(); int initialUnhealthy = cm.getUnhealthyNMs(); int initialDecommissioned = cm.getNumDecommisionedNMs(); int initialRebooted = cm.getNumRebootedNMs(); node.handle( new RMNodeEvent( node.getNodeID(), RMNodeEventType.REBOOTING, new TransactionStateImpl(TransactionType.RM))); Assert.assertEquals("Active Nodes", initialActive, cm.getNumActiveNMs()); Assert.assertEquals("Lost Nodes", initialLost, cm.getNumLostNMs()); Assert.assertEquals("Unhealthy Nodes", initialUnhealthy - 1, cm.getUnhealthyNMs()); Assert.assertEquals("Decommissioned Nodes", initialDecommissioned, cm.getNumDecommisionedNMs()); Assert.assertEquals("Rebooted Nodes", initialRebooted + 1, cm.getNumRebootedNMs()); Assert.assertEquals(NodeState.REBOOTED, node.getState()); }
@Test public void testReconnectNode() throws Exception { final DrainDispatcher dispatcher = new DrainDispatcher(); rm = new MockRM() { @Override protected EventHandler<SchedulerEvent> createSchedulerEventDispatcher() { return new SchedulerEventDispatcher(this.scheduler) { @Override public void handle(SchedulerEvent event) { scheduler.handle(event); } }; } @Override protected Dispatcher createDispatcher() { return dispatcher; } }; rm.start(); MockNM nm1 = rm.registerNode("host1:1234", 5120); MockNM nm2 = rm.registerNode("host2:5678", 5120); nm1.nodeHeartbeat(true); nm2.nodeHeartbeat(false); dispatcher.await(); checkUnealthyNMCount(rm, nm2, true, 1); final int expectedNMs = ClusterMetrics.getMetrics().getNumActiveNMs(); QueueMetrics metrics = rm.getResourceScheduler().getRootQueueMetrics(); // TODO Metrics incorrect in case of the FifoScheduler Assert.assertEquals(5120, metrics.getAvailableMB()); // reconnect of healthy node nm1 = rm.registerNode("host1:1234", 5120); NodeHeartbeatResponse response = nm1.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(response.getNodeAction())); dispatcher.await(); Assert.assertEquals(expectedNMs, ClusterMetrics.getMetrics().getNumActiveNMs()); checkUnealthyNMCount(rm, nm2, true, 1); // reconnect of unhealthy node nm2 = rm.registerNode("host2:5678", 5120); response = nm2.nodeHeartbeat(false); Assert.assertTrue(NodeAction.NORMAL.equals(response.getNodeAction())); dispatcher.await(); Assert.assertEquals(expectedNMs, ClusterMetrics.getMetrics().getNumActiveNMs()); checkUnealthyNMCount(rm, nm2, true, 1); // unhealthy node changed back to healthy nm2 = rm.registerNode("host2:5678", 5120); dispatcher.await(); response = nm2.nodeHeartbeat(true); response = nm2.nodeHeartbeat(true); dispatcher.await(); Assert.assertEquals(5120 + 5120, metrics.getAvailableMB()); // reconnect of node with changed capability nm1 = rm.registerNode("host2:5678", 10240); dispatcher.await(); response = nm1.nodeHeartbeat(true); dispatcher.await(); Assert.assertTrue(NodeAction.NORMAL.equals(response.getNodeAction())); Assert.assertEquals(5120 + 10240, metrics.getAvailableMB()); // reconnect of node with changed capability and running applications List<ApplicationId> runningApps = new ArrayList<ApplicationId>(); runningApps.add(ApplicationId.newInstance(1, 0)); nm1 = rm.registerNode("host2:5678", 15360, 2, runningApps); dispatcher.await(); response = nm1.nodeHeartbeat(true); dispatcher.await(); Assert.assertTrue(NodeAction.NORMAL.equals(response.getNodeAction())); Assert.assertEquals(5120 + 15360, metrics.getAvailableMB()); // reconnect healthy node changing http port nm1 = new MockNM("host1:1234", 5120, rm.getResourceTrackerService()); nm1.setHttpPort(3); nm1.registerNode(); dispatcher.await(); response = nm1.nodeHeartbeat(true); response = nm1.nodeHeartbeat(true); dispatcher.await(); RMNode rmNode = rm.getRMContext().getRMNodes().get(nm1.getNodeId()); Assert.assertEquals(3, rmNode.getHttpPort()); Assert.assertEquals(5120, rmNode.getTotalCapability().getMemory()); Assert.assertEquals(5120 + 15360, metrics.getAvailableMB()); }