public SchedulerNode(RMNode node, boolean usePortForNodeName) { this.rmNode = node; this.availableResource = Resources.clone(node.getTotalCapability()); this.totalResourceCapability = Resources.clone(node.getTotalCapability()); if (usePortForNodeName) { nodeName = rmNode.getHostName() + ":" + node.getNodeID().getPort(); } else { nodeName = rmNode.getHostName(); } }
/** * Release an allocated container on this node. * * @param container container to be released */ public synchronized void releaseContainer(Container container) { if (!isValidContainer(container)) { LOG.error("Invalid container released " + container); return; } /* remove the containers from the nodemanger */ if (null != launchedContainers.remove(container.getId())) { updateResource(container); } LOG.info( "Released container " + container.getId() + " of capacity " + container.getResource() + " on host " + rmNode.getNodeAddress() + ", which currently has " + numContainers + " containers, " + getUsedResource() + " used and " + getAvailableResource() + " available" + ", release resources=" + true); }
private synchronized void deductAvailableResource(Resource resource) { if (resource == null) { LOG.error("Invalid deduction of null resource for " + rmNode.getNodeAddress()); return; } Resources.subtractFrom(availableResource, resource); Resources.addTo(usedResource, resource); }
@Override public UpdateNodeResourceResponse updateNodeResource(UpdateNodeResourceRequest request) throws YarnException, IOException { Map<NodeId, ResourceOption> nodeResourceMap = request.getNodeResourceMap(); Set<NodeId> nodeIds = nodeResourceMap.keySet(); // verify nodes are all valid first. // if any invalid nodes, throw exception instead of partially updating // valid nodes. for (NodeId nodeId : nodeIds) { RMNode node = this.rmContext.getActiveRMNodes().get(nodeId); if (node == null) { LOG.error( "Resource update get failed on all nodes due to change " + "resource on an unrecognized node: " + nodeId); throw RPCUtil.getRemoteException( "Resource update get failed on all nodes due to change resource " + "on an unrecognized node: " + nodeId); } } // do resource update on each node. // Notice: it is still possible to have invalid NodeIDs as nodes decommission // may happen just at the same time. This time, only log and skip absent // nodes without throwing any exceptions. for (Map.Entry<NodeId, ResourceOption> entry : nodeResourceMap.entrySet()) { ResourceOption newResourceOption = entry.getValue(); NodeId nodeId = entry.getKey(); RMNode node = this.rmContext.getActiveRMNodes().get(nodeId); if (node == null) { LOG.warn("Resource update get failed on an unrecognized node: " + nodeId); } else { node.setResourceOption(newResourceOption); LOG.info( "Update resource successfully on node(" + node.getNodeID() + ") with resource(" + newResourceOption.toString() + ")"); } } UpdateNodeResourceResponse response = recordFactory.newRecordInstance(UpdateNodeResourceResponse.class); return response; }
@Override public String toString() { return "host: " + rmNode.getNodeAddress() + " #containers=" + getNumContainers() + " available=" + getAvailableResource().getMemory() + " used=" + getUsedResource().getMemory(); }
public FiCaSchedulerNode(RMNode node, boolean usePortForNodeName) { this.rmNode = node; this.availableResource.setMemory(node.getTotalCapability().getMemory()); this.availableResource.setVirtualCores(node.getTotalCapability().getVirtualCores()); totalResourceCapability = Resource.newInstance( node.getTotalCapability().getMemory(), node.getTotalCapability().getVirtualCores()); if (usePortForNodeName) { nodeName = rmNode.getHostName() + ":" + node.getNodeID().getPort(); } else { nodeName = rmNode.getHostName(); } }
/** * The Scheduler has allocated containers on this node to the given application. * * @param applicationId application * @param rmContainer allocated container */ public synchronized void allocateContainer(ApplicationId applicationId, RMContainer rmContainer) { Container container = rmContainer.getContainer(); deductAvailableResource(container.getResource()); ++numContainers; launchedContainers.put(container.getId(), rmContainer); LOG.info( "Assigned container " + container.getId() + " of capacity " + container.getResource() + " on host " + rmNode.getNodeAddress() + ", which currently has " + numContainers + " containers, " + getUsedResource() + " used and " + getAvailableResource() + " available"); }
@Test public void testReconnectNode() throws Exception { final DrainDispatcher dispatcher = new DrainDispatcher(); rm = new MockRM() { @Override protected EventHandler<SchedulerEvent> createSchedulerEventDispatcher() { return new SchedulerEventDispatcher(this.scheduler) { @Override public void handle(SchedulerEvent event) { scheduler.handle(event); } }; } @Override protected Dispatcher createDispatcher() { return dispatcher; } }; rm.start(); MockNM nm1 = rm.registerNode("host1:1234", 5120); MockNM nm2 = rm.registerNode("host2:5678", 5120); nm1.nodeHeartbeat(true); nm2.nodeHeartbeat(false); dispatcher.await(); checkUnealthyNMCount(rm, nm2, true, 1); final int expectedNMs = ClusterMetrics.getMetrics().getNumActiveNMs(); QueueMetrics metrics = rm.getResourceScheduler().getRootQueueMetrics(); // TODO Metrics incorrect in case of the FifoScheduler Assert.assertEquals(5120, metrics.getAvailableMB()); // reconnect of healthy node nm1 = rm.registerNode("host1:1234", 5120); NodeHeartbeatResponse response = nm1.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(response.getNodeAction())); dispatcher.await(); Assert.assertEquals(expectedNMs, ClusterMetrics.getMetrics().getNumActiveNMs()); checkUnealthyNMCount(rm, nm2, true, 1); // reconnect of unhealthy node nm2 = rm.registerNode("host2:5678", 5120); response = nm2.nodeHeartbeat(false); Assert.assertTrue(NodeAction.NORMAL.equals(response.getNodeAction())); dispatcher.await(); Assert.assertEquals(expectedNMs, ClusterMetrics.getMetrics().getNumActiveNMs()); checkUnealthyNMCount(rm, nm2, true, 1); // unhealthy node changed back to healthy nm2 = rm.registerNode("host2:5678", 5120); dispatcher.await(); response = nm2.nodeHeartbeat(true); response = nm2.nodeHeartbeat(true); dispatcher.await(); Assert.assertEquals(5120 + 5120, metrics.getAvailableMB()); // reconnect of node with changed capability nm1 = rm.registerNode("host2:5678", 10240); dispatcher.await(); response = nm1.nodeHeartbeat(true); dispatcher.await(); Assert.assertTrue(NodeAction.NORMAL.equals(response.getNodeAction())); Assert.assertEquals(5120 + 10240, metrics.getAvailableMB()); // reconnect of node with changed capability and running applications List<ApplicationId> runningApps = new ArrayList<ApplicationId>(); runningApps.add(ApplicationId.newInstance(1, 0)); nm1 = rm.registerNode("host2:5678", 15360, 2, runningApps); dispatcher.await(); response = nm1.nodeHeartbeat(true); dispatcher.await(); Assert.assertTrue(NodeAction.NORMAL.equals(response.getNodeAction())); Assert.assertEquals(5120 + 15360, metrics.getAvailableMB()); // reconnect healthy node changing http port nm1 = new MockNM("host1:1234", 5120, rm.getResourceTrackerService()); nm1.setHttpPort(3); nm1.registerNode(); dispatcher.await(); response = nm1.nodeHeartbeat(true); response = nm1.nodeHeartbeat(true); dispatcher.await(); RMNode rmNode = rm.getRMContext().getRMNodes().get(nm1.getNodeId()); Assert.assertEquals(3, rmNode.getHttpPort()); Assert.assertEquals(5120, rmNode.getTotalCapability().getMemory()); Assert.assertEquals(5120 + 15360, metrics.getAvailableMB()); }