/** * Whether this app has containers requests that could be satisfied on the given node, if the node * had full space. */ public boolean hasContainerForNode(Priority prio, FSSchedulerNode node) { ResourceRequest anyRequest = getResourceRequest(prio, ResourceRequest.ANY); ResourceRequest rackRequest = getResourceRequest(prio, node.getRackName()); ResourceRequest nodeRequest = getResourceRequest(prio, node.getNodeName()); return // There must be outstanding requests at the given priority: anyRequest != null && anyRequest.getNumContainers() > 0 && // If locality relaxation is turned off at *-level, there must be a // non-zero request for the node's rack: (anyRequest.getRelaxLocality() || (rackRequest != null && rackRequest.getNumContainers() > 0)) && // If locality relaxation is turned off at rack-level, there must be a // non-zero request at the node: (rackRequest == null || rackRequest.getRelaxLocality() || (nodeRequest != null && nodeRequest.getNumContainers() > 0)) && // The requested container must be able to fit on the node: Resources.lessThanOrEqual( RESOURCE_CALCULATOR, null, anyRequest.getCapability(), node.getRMNode().getTotalCapability()); }
@Override public AllocateResponse allocate(float progressIndicator) throws YarnException, IOException { Preconditions.checkArgument( progressIndicator >= 0, "Progress indicator should not be negative"); AllocateResponse allocateResponse = null; List<ResourceRequest> askList = null; List<ContainerId> releaseList = null; AllocateRequest allocateRequest = null; List<String> blacklistToAdd = new ArrayList<String>(); List<String> blacklistToRemove = new ArrayList<String>(); try { synchronized (this) { askList = new ArrayList<ResourceRequest>(ask.size()); for (ResourceRequest r : ask) { // create a copy of ResourceRequest as we might change it while the // RPC layer is using it to send info across askList.add( ResourceRequest.newInstance( r.getPriority(), r.getResourceName(), r.getCapability(), r.getNumContainers(), r.getRelaxLocality(), r.getNodeLabelExpression())); } releaseList = new ArrayList<ContainerId>(release); // optimistically clear this collection assuming no RPC failure ask.clear(); release.clear(); blacklistToAdd.addAll(blacklistAdditions); blacklistToRemove.addAll(blacklistRemovals); ResourceBlacklistRequest blacklistRequest = (blacklistToAdd != null) || (blacklistToRemove != null) ? ResourceBlacklistRequest.newInstance(blacklistToAdd, blacklistToRemove) : null; allocateRequest = AllocateRequest.newInstance( lastResponseId, progressIndicator, askList, releaseList, blacklistRequest); // clear blacklistAdditions and blacklistRemovals before // unsynchronized part blacklistAdditions.clear(); blacklistRemovals.clear(); } try { allocateResponse = rmClient.allocate(allocateRequest); } catch (ApplicationMasterNotRegisteredException e) { LOG.warn("ApplicationMaster is out of sync with ResourceManager," + " hence resyncing."); synchronized (this) { release.addAll(this.pendingRelease); blacklistAdditions.addAll(this.blacklistedNodes); for (Map<String, TreeMap<Resource, ResourceRequestInfo>> rr : remoteRequestsTable.values()) { for (Map<Resource, ResourceRequestInfo> capabalities : rr.values()) { for (ResourceRequestInfo request : capabalities.values()) { addResourceRequestToAsk(request.remoteRequest); } } } } // re register with RM registerApplicationMaster(); allocateResponse = allocate(progressIndicator); return allocateResponse; } synchronized (this) { // update these on successful RPC clusterNodeCount = allocateResponse.getNumClusterNodes(); lastResponseId = allocateResponse.getResponseId(); clusterAvailableResources = allocateResponse.getAvailableResources(); if (!allocateResponse.getNMTokens().isEmpty()) { populateNMTokens(allocateResponse.getNMTokens()); } if (allocateResponse.getAMRMToken() != null) { updateAMRMToken(allocateResponse.getAMRMToken()); } if (!pendingRelease.isEmpty() && !allocateResponse.getCompletedContainersStatuses().isEmpty()) { removePendingReleaseRequests(allocateResponse.getCompletedContainersStatuses()); } } } finally { // TODO how to differentiate remote yarn exception vs error in rpc if (allocateResponse == null) { // we hit an exception in allocate() // preserve ask and release for next call to allocate() synchronized (this) { release.addAll(releaseList); // requests could have been added or deleted during call to allocate // If requests were added/removed then there is nothing to do since // the ResourceRequest object in ask would have the actual new value. // If ask does not have this ResourceRequest then it was unchanged and // so we can add the value back safely. // This assumes that there will no concurrent calls to allocate() and // so we dont have to worry about ask being changed in the // synchronized block at the beginning of this method. for (ResourceRequest oldAsk : askList) { if (!ask.contains(oldAsk)) { ask.add(oldAsk); } } blacklistAdditions.addAll(blacklistToAdd); blacklistRemovals.addAll(blacklistToRemove); } } } return allocateResponse; }
protected void containerFailedOnHost(String hostName) { if (!nodeBlacklistingEnabled) { return; } if (blacklistedNodes.contains(hostName)) { if (LOG.isDebugEnabled()) { LOG.debug("Host " + hostName + " is already blacklisted."); } return; // already blacklisted } Integer failures = nodeFailures.remove(hostName); failures = failures == null ? Integer.valueOf(0) : failures; failures++; LOG.info(failures + " failures on node " + hostName); if (failures >= maxTaskFailuresPerNode) { blacklistedNodes.add(hostName); // Even if blacklisting is ignored, continue to remove the host from // the request table. The RM may have additional nodes it can allocate on. LOG.info("Blacklisted host " + hostName); // remove all the requests corresponding to this hostname for (Map<String, Map<Resource, ResourceRequest>> remoteRequests : remoteRequestsTable.values()) { // remove from host if no pending allocations boolean foundAll = true; Map<Resource, ResourceRequest> reqMap = remoteRequests.get(hostName); if (reqMap != null) { for (ResourceRequest req : reqMap.values()) { if (!ask.remove(req)) { foundAll = false; // if ask already sent to RM, we can try and overwrite it if possible. // send a new ask to RM with numContainers // specified for the blacklisted host to be 0. ResourceRequest zeroedRequest = ResourceRequest.newInstance( req.getPriority(), req.getResourceName(), req.getCapability(), req.getNumContainers(), req.getRelaxLocality()); zeroedRequest.setNumContainers(0); // to be sent to RM on next heartbeat addResourceRequestToAsk(zeroedRequest); } } // if all requests were still in ask queue // we can remove this request if (foundAll) { remoteRequests.remove(hostName); } } // TODO handling of rack blacklisting // Removing from rack should be dependent on no. of failures within the rack // Blacklisting a rack on the basis of a single node's blacklisting // may be overly aggressive. // Node failures could be co-related with other failures on the same rack // but we probably need a better approach at trying to decide how and when // to blacklist a rack } } else { nodeFailures.put(hostName, failures); } }
private Resource assignContainer(FSSchedulerNode node, boolean reserved) { if (LOG.isDebugEnabled()) { LOG.debug("Node offered to app: " + getName() + " reserved: " + reserved); } Collection<Priority> prioritiesToTry = (reserved) ? Arrays.asList(node.getReservedContainer().getReservedPriority()) : getPriorities(); // For each priority, see if we can schedule a node local, rack local // or off-switch request. Rack of off-switch requests may be delayed // (not scheduled) in order to promote better locality. synchronized (this) { for (Priority priority : prioritiesToTry) { if (getTotalRequiredResources(priority) <= 0 || !hasContainerForNode(priority, node)) { continue; } addSchedulingOpportunity(priority); // Check the AM resource usage for the leaf queue if (getLiveContainers().size() == 0 && !getUnmanagedAM()) { if (!getQueue().canRunAppAM(getAMResource())) { return Resources.none(); } } ResourceRequest rackLocalRequest = getResourceRequest(priority, node.getRackName()); ResourceRequest localRequest = getResourceRequest(priority, node.getNodeName()); if (localRequest != null && !localRequest.getRelaxLocality()) { LOG.warn("Relax locality off is not supported on local request: " + localRequest); } NodeType allowedLocality; if (scheduler.isContinuousSchedulingEnabled()) { allowedLocality = getAllowedLocalityLevelByTime( priority, scheduler.getNodeLocalityDelayMs(), scheduler.getRackLocalityDelayMs(), scheduler.getClock().getTime()); } else { allowedLocality = getAllowedLocalityLevel( priority, scheduler.getNumClusterNodes(), scheduler.getNodeLocalityThreshold(), scheduler.getRackLocalityThreshold()); } if (rackLocalRequest != null && rackLocalRequest.getNumContainers() != 0 && localRequest != null && localRequest.getNumContainers() != 0) { return assignContainer(node, localRequest, NodeType.NODE_LOCAL, reserved); } if (rackLocalRequest != null && !rackLocalRequest.getRelaxLocality()) { continue; } if (rackLocalRequest != null && rackLocalRequest.getNumContainers() != 0 && (allowedLocality.equals(NodeType.RACK_LOCAL) || allowedLocality.equals(NodeType.OFF_SWITCH))) { return assignContainer(node, rackLocalRequest, NodeType.RACK_LOCAL, reserved); } ResourceRequest offSwitchRequest = getResourceRequest(priority, ResourceRequest.ANY); if (offSwitchRequest != null && !offSwitchRequest.getRelaxLocality()) { continue; } if (offSwitchRequest != null && offSwitchRequest.getNumContainers() != 0 && allowedLocality.equals(NodeType.OFF_SWITCH)) { return assignContainer(node, offSwitchRequest, NodeType.OFF_SWITCH, reserved); } } } return Resources.none(); }
private Resource assignContainer( FSSchedulerNode node, boolean reserved, TransactionState transactionState) { if (LOG.isDebugEnabled()) { LOG.debug("Node offered to app: " + getName() + " reserved: " + reserved); } if (reserved) { RMContainer rmContainer = node.getReservedContainer(); Priority priority = rmContainer.getReservedPriority(); // Make sure the application still needs requests at this priority if (app.getTotalRequiredResources(priority) == 0) { unreserve(priority, node); return Resources.none(); } } Collection<Priority> prioritiesToTry = (reserved) ? Arrays.asList(node.getReservedContainer().getReservedPriority()) : app.getPriorities(); // For each priority, see if we can schedule a node local, rack local // or off-switch request. Rack of off-switch requests may be delayed // (not scheduled) in order to promote better locality. synchronized (app) { for (Priority priority : prioritiesToTry) { if (app.getTotalRequiredResources(priority) <= 0 || !hasContainerForNode(priority, node)) { continue; } app.addSchedulingOpportunity(priority); ResourceRequest rackLocalRequest = app.getResourceRequest(priority, node.getRackName()); ResourceRequest localRequest = app.getResourceRequest(priority, node.getNodeName()); if (localRequest != null && !localRequest.getRelaxLocality()) { LOG.warn("Relax locality off is not supported on local request: " + localRequest); } NodeType allowedLocality; if (scheduler.isContinuousSchedulingEnabled()) { allowedLocality = app.getAllowedLocalityLevelByTime( priority, scheduler.getNodeLocalityDelayMs(), scheduler.getRackLocalityDelayMs(), scheduler.getClock().getTime()); } else { allowedLocality = app.getAllowedLocalityLevel( priority, scheduler.getNumClusterNodes(), scheduler.getNodeLocalityThreshold(), scheduler.getRackLocalityThreshold()); } if (rackLocalRequest != null && rackLocalRequest.getNumContainers() != 0 && localRequest != null && localRequest.getNumContainers() != 0) { return assignContainer( node, priority, localRequest, NodeType.NODE_LOCAL, reserved, transactionState); } if (rackLocalRequest != null && !rackLocalRequest.getRelaxLocality()) { continue; } if (rackLocalRequest != null && rackLocalRequest.getNumContainers() != 0 && (allowedLocality.equals(NodeType.RACK_LOCAL) || allowedLocality.equals(NodeType.OFF_SWITCH))) { return assignContainer( node, priority, rackLocalRequest, NodeType.RACK_LOCAL, reserved, transactionState); } ResourceRequest offSwitchRequest = app.getResourceRequest(priority, ResourceRequest.ANY); if (offSwitchRequest != null && !offSwitchRequest.getRelaxLocality()) { continue; } if (offSwitchRequest != null && offSwitchRequest.getNumContainers() != 0 && allowedLocality.equals(NodeType.OFF_SWITCH)) { return assignContainer( node, priority, offSwitchRequest, NodeType.OFF_SWITCH, reserved, transactionState); } } } return Resources.none(); }