/**
 * One worker can have multiple running task runners. <code>HostVolumeMapping</code> describes
 * the scheduling state of one worker, including:
 *
 * <ul>
 *   <li>host name
 *   <li>rack name
 *   <li>unassigned tasks for each disk volume
 *   <li>last assigned volume id - it can be used for assigning tasks in a round-robin manner
 *   <li>the number of running tasks for each volume
 * </ul>
 *
 * <p>Here, we identify a task runner by {@link ContainerId}, and we use volume ids to identify
 * all disks in this node. A volume id is only used to distinguish disks; we do not know which
 * physical disk a given volume id refers to. See the section below for details.
 *
 * <h3>Volume id</h3>
 *
 * A volume id is an integer, and each volume id identifies one disk volume.
 *
 * <p>The volume id can be obtained from
 * {@link org.apache.hadoop.fs.BlockStorageLocation#getVolumeIds()}. HDFS may not report a volume
 * id, for example when the config 'dfs.client.file-block-locations.enabled' is disabled. In that
 * case, the volume id will be -1 or another negative integer.
 *
 * <h3>See Also</h3>
 *
 * <ul>
 *   <li>HDFS-3672 (https://issues.apache.org/jira/browse/HDFS-3672)
 * </ul>
 */
public class HostVolumeMapping {
  private final String host;
  private final String rack;

  /** A key is a disk volume id, and a value is the set of tasks to be scheduled on that volume. */
  private Map<Integer, LinkedHashSet<TaskAttempt>> unassignedTaskForEachVolume =
      Collections.synchronizedMap(new HashMap<>());

  /** A value is the last assigned volume id for each task runner. */
  private HashMap<TaskAttemptId, Integer> lastAssignedVolumeId = Maps.newHashMap();

  /**
   * A key is a disk volume id, and a value is the load of this volume, measured by counting how
   * many tasks are running on it.
   *
   * <p>The volumes are kept in ascending order of volume id. In other words, the head volume id
   * is likely to be -1, meaning that no volume id was reported.
   */
  private SortedMap<Integer, Integer> diskVolumeLoads = new TreeMap<>();

  /** The total number of remaining tasks on this host. */
  private AtomicInteger remainTasksNum = new AtomicInteger(0);

  public static final int REMOTE = -2;

  public HostVolumeMapping(String host, String rack) {
    this.host = host;
    this.rack = rack;
  }

  public synchronized void addTaskAttempt(int volumeId, TaskAttempt attemptId) {
    synchronized (unassignedTaskForEachVolume) {
      LinkedHashSet<TaskAttempt> list = unassignedTaskForEachVolume.get(volumeId);
      if (list == null) {
        list = new LinkedHashSet<>();
        unassignedTaskForEachVolume.put(volumeId, list);
      }
      list.add(attemptId);
    }

    remainTasksNum.incrementAndGet();

    if (!diskVolumeLoads.containsKey(volumeId)) {
      diskVolumeLoads.put(volumeId, 0);
    }
  }

  /**
   * Assignment priorities:
   *
   * <ol>
   *   <li>a task whose blocks reside on a disk volume of this host
   *   <li>a task with an unknown block, or a non-splittable task, on this host
   *   <li>remote tasks
   * </ol>
   *
   * unassignedTaskForEachVolume contains only local tasks, so for a remote request the result
   * will be null.
   */
  public synchronized TaskAttemptId getLocalTask() {
    int volumeId = getLowestVolumeId();
    TaskAttemptId taskAttemptId = null;

    if (unassignedTaskForEachVolume.size() > 0) {
      int retry = unassignedTaskForEachVolume.size();
      do {
        // clean up and get a remaining local task
        taskAttemptId = getAndRemove(volumeId);

        if (taskAttemptId == null) { // move on to the next volume
          volumeId = getLowestVolumeId();
          retry--;
        } else {
          lastAssignedVolumeId.put(taskAttemptId, volumeId);
          break;
        }
      } while (retry > 0);
    } else {
      this.remainTasksNum.set(0);
    }

    return taskAttemptId;
  }

  public synchronized TaskAttemptId getTaskAttemptIdByRack(String rack) {
    TaskAttemptId taskAttemptId = null;

    if (unassignedTaskForEachVolume.size() > 0 && this.rack.equals(rack)) {
      int retry = unassignedTaskForEachVolume.size();
      do {
        // clean up and get a remaining task
        int volumeId = getLowestVolumeId();
        taskAttemptId = getAndRemove(volumeId);
        if (taskAttemptId == null) {
          retry--;
        } else {
          break;
        }
      } while (retry > 0);
    }
    return taskAttemptId;
  }

  private synchronized TaskAttemptId getAndRemove(int volumeId) {
    TaskAttemptId taskAttemptId = null;
    if (!unassignedTaskForEachVolume.containsKey(volumeId)) {
      if (volumeId > REMOTE) {
        diskVolumeLoads.remove(volumeId);
      }
      return taskAttemptId;
    }

    LinkedHashSet<TaskAttempt> list = unassignedTaskForEachVolume.get(volumeId);
    if (list != null && !list.isEmpty()) {
      TaskAttempt taskAttempt;
      synchronized (unassignedTaskForEachVolume) {
        Iterator<TaskAttempt> iterator = list.iterator();
        taskAttempt = iterator.next();
        iterator.remove();
      }

      taskAttemptId = taskAttempt.getId();
      for (DataLocation location : taskAttempt.getTask().getDataLocations()) {
        HostVolumeMapping volumeMapping =
            scheduledRequests.leafTaskHostMapping.get(location.getHost());
        if (volumeMapping != null) {
          volumeMapping.removeTaskAttempt(location.getVolumeId(), taskAttempt);
        }
      }

      increaseConcurrency(volumeId);
    }

    return taskAttemptId;
  }

  private synchronized void removeTaskAttempt(int volumeId, TaskAttempt taskAttempt) {
    if (!unassignedTaskForEachVolume.containsKey(volumeId)) return;

    LinkedHashSet<TaskAttempt> tasks = unassignedTaskForEachVolume.get(volumeId);
    if (tasks.remove(taskAttempt)) {
      remainTasksNum.getAndDecrement();
    }

    if (tasks.isEmpty()) {
      unassignedTaskForEachVolume.remove(volumeId);
      if (volumeId > REMOTE) {
        diskVolumeLoads.remove(volumeId);
      }
    }
  }

  /**
   * Increase the count of running tasks and the disk load for a certain task runner.
   *
   * @param volumeId Volume identifier
   * @return the volume load (i.e., how many running tasks use this volume)
   */
  private synchronized int increaseConcurrency(int volumeId) {
    int concurrency = 1;
    if (diskVolumeLoads.containsKey(volumeId)) {
      concurrency = diskVolumeLoads.get(volumeId) + 1;
    }

    if (volumeId > -1) {
      LOG.info(
          "Assigned host : " + host + ", Volume : " + volumeId + ", Concurrency : " + concurrency);
    } else if (volumeId == -1) {
      // the volume id is unknown: block metadata is disabled on the namenode, or the input is a
      // compressed text file or Amazon S3
      LOG.info(
          "Assigned host : "
              + host
              + ", Unknown Volume : "
              + volumeId
              + ", Concurrency : "
              + concurrency);
    } else if (volumeId == REMOTE) {
      // all blocks on this host have been processed, so this task will be assigned remotely
      LOG.info(
          "Assigned host : "
              + host
              + ", Remaining local tasks : "
              + getRemainingLocalTaskSize()
              + ", Remote Concurrency : "
              + concurrency);
    }
    diskVolumeLoads.put(volumeId, concurrency);
    return concurrency;
  }

  /** Decrease the count of running tasks of a certain task runner. */
  private synchronized void decreaseConcurrency(int volumeId) {
    if (diskVolumeLoads.containsKey(volumeId)) {
      Integer concurrency = diskVolumeLoads.get(volumeId);
      if (concurrency > 0) {
        diskVolumeLoads.put(volumeId, concurrency - 1);
      } else {
        if (volumeId > REMOTE && !unassignedTaskForEachVolume.containsKey(volumeId)) {
          diskVolumeLoads.remove(volumeId);
        }
      }
    }
  }

  /**
   * Volume ids: a disk volume of a host is 0 ~ n; a compressed task, Amazon S3, or an unknown
   * volume is -1; a remote task is -2.
   */
  public int getLowestVolumeId() {
    Map.Entry<Integer, Integer> volumeEntry = null;

    for (Map.Entry<Integer, Integer> entry : diskVolumeLoads.entrySet()) {
      if (volumeEntry == null) volumeEntry = entry;

      if (volumeEntry.getValue() >= entry.getValue()) {
        volumeEntry = entry;
      }
    }

    if (volumeEntry != null) {
      return volumeEntry.getKey();
    } else {
      return REMOTE;
    }
  }

  public int getRemoteConcurrency() {
    return getVolumeConcurrency(REMOTE);
  }

  public int getVolumeConcurrency(int volumeId) {
    Integer size = diskVolumeLoads.get(volumeId);
    if (size == null) return 0;
    else return size;
  }

  public int getRemainingLocalTaskSize() {
    return remainTasksNum.get();
  }

  public String getHost() {
    return host;
  }

  public String getRack() {
    return rack;
  }
}
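// The selection policy above (getLowestVolumeId() together with increaseConcurrency()) is what
// spreads consecutive local assignments across a worker's disks. The following is a minimal,
// self-contained sketch of that policy only (an editor's illustration, not part of the
// scheduler); the volume ids and loads below are hypothetical.
class VolumeSelectionSketch {
  static final int REMOTE = -2; // same sentinel as HostVolumeMapping.REMOTE

  // Same comparison as getLowestVolumeId(): the least-loaded volume wins, and ties go to the
  // entry seen last in ascending key order (i.e., the highest volume id among the ties).
  static int lowestLoadedVolume(java.util.SortedMap<Integer, Integer> loads) {
    java.util.Map.Entry<Integer, Integer> best = null;
    for (java.util.Map.Entry<Integer, Integer> entry : loads.entrySet()) {
      if (best == null || best.getValue() >= entry.getValue()) {
        best = entry;
      }
    }
    return best == null ? REMOTE : best.getKey();
  }

  public static void main(String[] args) {
    // A hypothetical worker with two reported volumes (0 and 1) and one input without a
    // volume id (-1), all currently idle.
    java.util.TreeMap<Integer, Integer> loads = new java.util.TreeMap<>();
    loads.put(-1, 0);
    loads.put(0, 0);
    loads.put(1, 0);

    for (int i = 0; i < 4; i++) {
      int volumeId = lowestLoadedVolume(loads);
      loads.merge(volumeId, 1, Integer::sum); // what increaseConcurrency() does to the load map
      System.out.println("pick " + i + " -> volume " + volumeId + ", loads = " + loads);
    }
    // Picks rotate across volumes (1, 0, -1, 1) instead of draining a single disk, because each
    // successful pick raises that volume's load before the next selection.
  }
}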
private class ScheduledRequests {
  // The two sets leafTasks and nonLeafTasks keep all tasks to be scheduled. Even though a task
  // is included in leafTaskHostMapping or leafTasksRackMapping, it will not be sent to a task
  // runner unless it is also contained in leafTasks or nonLeafTasks.
  private final Set<TaskAttemptId> leafTasks = Collections.synchronizedSet(new HashSet<>());
  private final Set<TaskAttemptId> nonLeafTasks = Collections.synchronizedSet(new HashSet<>());
  private Map<String, HostVolumeMapping> leafTaskHostMapping = Maps.newConcurrentMap();
  private final Map<String, HashSet<TaskAttemptId>> leafTasksRackMapping =
      Maps.newConcurrentMap();

  protected void clear() {
    leafTasks.clear();
    nonLeafTasks.clear();
    leafTaskHostMapping.clear();
    leafTasksRackMapping.clear();
  }

  private void addLeafTask(TaskAttemptToSchedulerEvent event) {
    TaskAttempt taskAttempt = event.getTaskAttempt();
    List<DataLocation> locations = taskAttempt.getTask().getDataLocations();

    for (DataLocation location : locations) {
      String host = location.getHost();
      leafTaskHosts.add(host);

      HostVolumeMapping hostVolumeMapping = leafTaskHostMapping.get(host);
      if (hostVolumeMapping == null) {
        String rack = RackResolver.resolve(host).getNetworkLocation();
        hostVolumeMapping = new HostVolumeMapping(host, rack);
        leafTaskHostMapping.put(host, hostVolumeMapping);
      }
      hostVolumeMapping.addTaskAttempt(location.getVolumeId(), taskAttempt);

      if (LOG.isDebugEnabled()) {
        LOG.debug("Added attempt req to host " + host);
      }

      HashSet<TaskAttemptId> list = leafTasksRackMapping.get(hostVolumeMapping.getRack());
      if (list == null) {
        list = new HashSet<>();
        leafTasksRackMapping.put(hostVolumeMapping.getRack(), list);
      }
      list.add(taskAttempt.getId());

      if (LOG.isDebugEnabled()) {
        LOG.debug("Added attempt req to rack " + hostVolumeMapping.getRack());
      }
    }

    leafTasks.add(taskAttempt.getId());
  }

  private void addNonLeafTask(TaskAttemptToSchedulerEvent event) {
    nonLeafTasks.add(event.getTaskAttempt().getId());
  }

  public int leafTaskNum() {
    return leafTasks.size();
  }

  public int nonLeafTaskNum() {
    return nonLeafTasks.size();
  }

  private TaskAttemptId allocateLocalTask(String host) {
    HostVolumeMapping hostVolumeMapping = leafTaskHostMapping.get(host);

    if (hostVolumeMapping != null) { // the tajo worker is co-located with a hadoop datanode
      for (int i = 0; i < hostVolumeMapping.getRemainingLocalTaskSize(); i++) {
        TaskAttemptId attemptId = hostVolumeMapping.getLocalTask();

        if (attemptId == null) break;
        // a remaining local task was found
        if (leafTasks.contains(attemptId)) {
          leafTasks.remove(attemptId);
          return attemptId;
        }
      }
    }
    return null;
  }

  private TaskAttemptId allocateRackTask(String host) {
    List<HostVolumeMapping> remainingTasks = Lists.newArrayList(leafTaskHostMapping.values());
    String rack = RackResolver.resolve(host).getNetworkLocation();
    TaskAttemptId attemptId = null;

    if (remainingTasks.size() > 0) {
      synchronized (scheduledRequests) {
        // prefer the host in the rack with the most remaining tasks
        Collections.sort(
            remainingTasks,
            new Comparator<HostVolumeMapping>() {
              @Override
              public int compare(HostVolumeMapping v1, HostVolumeMapping v2) {
                // sort by remaining tasks in descending order
                if (v2.remainTasksNum.get() > v1.remainTasksNum.get()) {
                  return 1;
                } else if (v2.remainTasksNum.get() == v1.remainTasksNum.get()) {
                  return 0;
                } else {
                  return -1;
                }
              }
            });
      }

      for (HostVolumeMapping tasks : remainingTasks) {
        for (int i = 0; i < tasks.getRemainingLocalTaskSize(); i++) {
          TaskAttemptId tId = tasks.getTaskAttemptIdByRack(rack);

          if (tId == null) break;

          if (leafTasks.contains(tId)) {
            leafTasks.remove(tId);
            attemptId = tId;
            break;
          }
        }
        if (attemptId != null) break;
      }
    }

    // otherwise, find a task via the rack mapping
    if (attemptId == null) {
      HashSet<TaskAttemptId> list = leafTasksRackMapping.get(rack);
      if (list != null) {
        synchronized (list) {
          Iterator<TaskAttemptId> iterator = list.iterator();
          while (iterator.hasNext()) {
            TaskAttemptId tId = iterator.next();
            iterator.remove();
            if (leafTasks.contains(tId)) {
              leafTasks.remove(tId);
              attemptId = tId;
              break;
            }
          }
        }
      }
    }

    return attemptId;
  }

  /**
   * Assigns leaf (scan) tasks to requesting workers. Allocation is tried in order of locality:
   * disk/host-local first, then rack-local, and finally any remaining task on a random node.
   */
  public void assignToLeafTasks(LinkedList<TaskRequestEvent> taskRequests) {
    Collections.shuffle(taskRequests);
    LinkedList<TaskRequestEvent> remoteTaskRequests = new LinkedList<>();
    String queryMasterHostAndPort =
        context
            .getMasterContext()
            .getQueryMasterContext()
            .getWorkerContext()
            .getConnectionInfo()
            .getHostAndQMPort();

    TaskRequestEvent taskRequest;
    while (leafTasks.size() > 0 && (!taskRequests.isEmpty() || !remoteTaskRequests.isEmpty())) {
      int localAssign = 0;
      int rackAssign = 0;

      taskRequest = taskRequests.pollFirst();
      if (taskRequest == null) { // if there are only remote task requests
        taskRequest = remoteTaskRequests.pollFirst();
      }

      // Check whether this container is still alive.
      // If not, ignore the task request and stop the task runner.
      WorkerConnectionInfo connectionInfo =
          context.getMasterContext().getWorkerMap().get(taskRequest.getWorkerId());
      if (connectionInfo == null) continue;

      // get the hostname of the requesting node
      String host = connectionInfo.getHost();

      // if no registered worker matches the hostname of this task request
      if (!leafTaskHostMapping.containsKey(host) && !taskRequests.isEmpty()) {
        String normalizedHost = NetUtils.normalizeHost(host);

        if (!leafTaskHostMapping.containsKey(normalizedHost)) {
          // This means one of two cases:
          //   * no blocks reside on this node, or
          //   * all blocks residing on this node have been consumed, and this task runner
          //     requests a remote task.
          // In this case, we move the task request to the remote task request list and skip
          // the rest of this iteration.
          remoteTaskRequests.add(taskRequest);
          continue;
        } else {
          host = normalizedHost;
        }
      }

      if (LOG.isDebugEnabled()) {
        LOG.debug(
            "assignToLeafTasks: "
                + taskRequest.getExecutionBlockId()
                + ","
                + "worker="
                + connectionInfo.getHostAndPeerRpcPort());
      }

      //////////////////////////////////////////////////////////////////////
      // disk or host-local allocation
      //////////////////////////////////////////////////////////////////////
      TaskAttemptId attemptId = allocateLocalTask(host);

      if (attemptId == null) { // if a local task cannot be found
        HostVolumeMapping hostVolumeMapping = leafTaskHostMapping.get(host);

        if (!taskRequests.isEmpty()) {
          // if other requests remain, move this one to the remote list for better locality
          remoteTaskRequests.add(taskRequest);
          candidateWorkers.remove(connectionInfo.getId());
          continue;

        } else {
          if (hostVolumeMapping != null) {
            int nodes = context.getMasterContext().getWorkerMap().size();
            // this part balances tail and remote task assignments per node
            int tailLimit = 1;
            if (remainingScheduledObjectNum() > 0 && nodes > 0) {
              tailLimit = Math.max(remainingScheduledObjectNum() / nodes, 1);
            }

            if (hostVolumeMapping.getRemoteConcurrency() >= tailLimit) {
              // remote task throttling per node
              continue;
            } else {
              // assign to a remote volume
              hostVolumeMapping.increaseConcurrency(HostVolumeMapping.REMOTE);
            }
          }
        }

        //////////////////////////////////////////////////////////////////////
        // rack-local allocation
        //////////////////////////////////////////////////////////////////////
        attemptId = allocateRackTask(host);

        //////////////////////////////////////////////////////////////////////
        // random node allocation
        //////////////////////////////////////////////////////////////////////
        if (attemptId == null && leafTaskNum() > 0) {
          synchronized (leafTasks) {
            attemptId = leafTasks.iterator().next();
            leafTasks.remove(attemptId);
          }
        }

        if (attemptId != null && hostVolumeMapping != null) {
          hostVolumeMapping.lastAssignedVolumeId.put(attemptId, HostVolumeMapping.REMOTE);
        }
        rackAssign++;
      } else {
        localAssign++;
      }

      if (attemptId != null) {
        Task task = stage.getTask(attemptId.getTaskId());
        TaskRequest taskAssign =
            new TaskRequestImpl(
                attemptId,
                new ArrayList<>(task.getAllFragments()),
                "",
                false,
                LogicalNodeSerializer.serialize(task.getLogicalPlan()),
                context.getMasterContext().getQueryContext(),
                stage.getDataChannel(),
                stage.getBlock().getEnforcer(),
                queryMasterHostAndPort);

        if (checkIfInterQuery(stage.getMasterPlan(), stage.getBlock())) {
          taskAssign.setInterQuery();
        }

        // TODO send batch request
        BatchAllocationRequest.Builder requestProto = BatchAllocationRequest.newBuilder();
        requestProto.addTaskRequest(
            TaskAllocationProto.newBuilder()
                .setResource(taskRequest.getResponseProto().getResource())
                .setTaskRequest(taskAssign.getProto())
                .build());

        requestProto.setExecutionBlockId(attemptId.getTaskId().getExecutionBlockId().getProto());
        context
            .getMasterContext()
            .getEventHandler()
            .handle(new TaskAttemptAssignedEvent(attemptId, connectionInfo));

        InetSocketAddress addr = stage.getAssignedWorkerMap().get(connectionInfo.getId());
        if (addr == null) {
          addr = new InetSocketAddress(connectionInfo.getHost(), connectionInfo.getPeerRpcPort());
        }

        AsyncRpcClient tajoWorkerRpc = null;
        CallFuture<BatchAllocationResponse> callFuture = new CallFuture<>();
        totalAttempts++;
        try {
          tajoWorkerRpc =
              RpcClientManager.getInstance()
                  .getClient(addr, TajoWorkerProtocol.class, true, rpcParams);

          TajoWorkerProtocol.TajoWorkerProtocolService tajoWorkerRpcClient =
              tajoWorkerRpc.getStub();
          tajoWorkerRpcClient.allocateTasks(
              callFuture.getController(), requestProto.build(), callFuture);

          BatchAllocationResponse responseProto =
              callFuture.get(RpcConstants.FUTURE_TIMEOUT_SECONDS_DEFAULT, TimeUnit.SECONDS);

          if (responseProto.getCancellationTaskCount() > 0) {
            for (TaskAllocationProto proto : responseProto.getCancellationTaskList()) {
              cancel(task.getAttempt(new TaskAttemptId(proto.getTaskRequest().getId())));
              cancellation++;
            }

            if (LOG.isDebugEnabled()) {
              LOG.debug(
                  "Canceled requests: "
                      + responseProto.getCancellationTaskCount()
                      + " from "
                      + addr);
            }
            continue;
          }
        } catch (Exception e) {
          LOG.error(e);
        }

        scheduledObjectNum--;
        totalAssigned++;
        hostLocalAssigned += localAssign;
        rackLocalAssigned += rackAssign;

        if (rackAssign > 0) {
          LOG.info(
              String.format(
                  "Assigned Local/Rack/Total: (%d/%d/%d), "
                      + "Attempted Cancel/Assign/Total: (%d/%d/%d), "
                      + "Locality: %.2f%%, Rack host: %s",
                  hostLocalAssigned,
                  rackLocalAssigned,
                  totalAssigned,
                  cancellation,
                  totalAssigned,
                  totalAttempts,
                  ((double) hostLocalAssigned / (double) totalAssigned) * 100,
                  host));
        }

      } else {
        throw new RuntimeException("Illegal State!!!!!!!!!!!!!!!!!!!!!");
      }
    }
  }

  private boolean checkIfInterQuery(MasterPlan masterPlan, ExecutionBlock block) {
    if (masterPlan.isRoot(block)) {
      return false;
    }

    ExecutionBlock parent = masterPlan.getParent(block);
    if (masterPlan.isRoot(parent) && parent.isUnionOnly()) {
      return false;
    }

    return true;
  }

  /** Non-leaf tasks have no data locality, so they are assigned to requesting workers randomly. */
  public void assignToNonLeafTasks(LinkedList<TaskRequestEvent> taskRequests) {
    Collections.shuffle(taskRequests);
    String queryMasterHostAndPort =
        context
            .getMasterContext()
            .getQueryMasterContext()
            .getWorkerContext()
            .getConnectionInfo()
            .getHostAndQMPort();

    TaskRequestEvent taskRequest;
    while (!taskRequests.isEmpty()) {
      taskRequest = taskRequests.pollFirst();
      LOG.debug("assignToNonLeafTasks: " + taskRequest.getExecutionBlockId());

      TaskAttemptId attemptId;
      // random allocation
      if (nonLeafTasks.size() > 0) {
        synchronized (nonLeafTasks) {
          attemptId = nonLeafTasks.iterator().next();
          nonLeafTasks.remove(attemptId);
        }
        LOG.debug("Assigned based on * match");

        Task task;
        task = stage.getTask(attemptId.getTaskId());

        TaskRequest taskAssign =
            new TaskRequestImpl(
                attemptId,
                Lists.newArrayList(task.getAllFragments()),
                "",
                false,
                LogicalNodeSerializer.serialize(task.getLogicalPlan()),
                context.getMasterContext().getQueryContext(),
                stage.getDataChannel(),
                stage.getBlock().getEnforcer(),
                queryMasterHostAndPort);

        if (checkIfInterQuery(stage.getMasterPlan(), stage.getBlock())) {
          taskAssign.setInterQuery();
        }

        for (Map.Entry<String, Set<FetchImpl>> entry : task.getFetchMap().entrySet()) {
          Collection<FetchImpl> fetches = entry.getValue();
          if (fetches != null) {
            for (FetchImpl fetch : fetches) {
              taskAssign.addFetch(entry.getKey(), fetch);
            }
          }
        }

        WorkerConnectionInfo connectionInfo =
            context.getMasterContext().getWorkerMap().get(taskRequest.getWorkerId());

        // TODO send batch request
        BatchAllocationRequest.Builder requestProto = BatchAllocationRequest.newBuilder();
        requestProto.addTaskRequest(
            TaskAllocationProto.newBuilder()
                .setResource(taskRequest.getResponseProto().getResource())
                .setTaskRequest(taskAssign.getProto())
                .build());

        requestProto.setExecutionBlockId(attemptId.getTaskId().getExecutionBlockId().getProto());
        context
            .getMasterContext()
            .getEventHandler()
            .handle(new TaskAttemptAssignedEvent(attemptId, connectionInfo));

        CallFuture<BatchAllocationResponse> callFuture = new CallFuture<>();

        InetSocketAddress addr = stage.getAssignedWorkerMap().get(connectionInfo.getId());
        if (addr == null) {
          addr = new InetSocketAddress(connectionInfo.getHost(), connectionInfo.getPeerRpcPort());
        }

        AsyncRpcClient tajoWorkerRpc;
        try {
          tajoWorkerRpc =
              RpcClientManager.getInstance()
                  .getClient(addr, TajoWorkerProtocol.class, true, rpcParams);

          TajoWorkerProtocol.TajoWorkerProtocolService tajoWorkerRpcClient =
              tajoWorkerRpc.getStub();
          tajoWorkerRpcClient.allocateTasks(
              callFuture.getController(), requestProto.build(), callFuture);

          BatchAllocationResponse responseProto =
              callFuture.get(RpcConstants.FUTURE_TIMEOUT_SECONDS_DEFAULT, TimeUnit.SECONDS);

          if (responseProto.getCancellationTaskCount() > 0) {
            for (TaskAllocationProto proto : responseProto.getCancellationTaskList()) {
              cancel(task.getAttempt(new TaskAttemptId(proto.getTaskRequest().getId())));
              cancellation++;
            }

            if (LOG.isDebugEnabled()) {
              LOG.debug(
                  "Canceled requests: "
                      + responseProto.getCancellationTaskCount()
                      + " from "
                      + addr);
            }
            continue;
          }

          totalAssigned++;
          scheduledObjectNum--;
        } catch (Exception e) {
          LOG.error(e);
        }
      }
    }
  }
}
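// A minimal sketch (editor's illustration, not part of the scheduler) of the per-node remote
// throttle applied in assignToLeafTasks() above: with R objects still to be scheduled across N
// workers, a worker whose remote concurrency has already reached max(R / N, 1) is skipped until
// its remote load drops. The figures below are hypothetical.
class RemoteThrottleSketch {
  // Mirrors the tailLimit computation and comparison in assignToLeafTasks().
  static boolean throttleRemote(int remainingScheduledObjects, int workers, int remoteConcurrency) {
    int tailLimit = 1;
    if (remainingScheduledObjects > 0 && workers > 0) {
      tailLimit = Math.max(remainingScheduledObjects / workers, 1);
    }
    return remoteConcurrency >= tailLimit;
  }

  public static void main(String[] args) {
    // 12 remaining objects over 4 workers: each node may carry up to 3 remote tasks.
    System.out.println(throttleRemote(12, 4, 2)); // false - below the limit, assign remotely
    System.out.println(throttleRemote(12, 4, 3)); // true  - at the limit, skip this request
    // In the tail of a stage (fewer remaining objects than workers) the limit degrades to 1.
    System.out.println(throttleRemote(2, 4, 1)); // true
  }
}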