Example #1
0
    public void assignToLeafTasks(LinkedList<TaskRequestEvent> taskRequests) {
      Collections.shuffle(taskRequests);
      LinkedList<TaskRequestEvent> remoteTaskRequests = new LinkedList<>();
      String queryMasterHostAndPort =
          context
              .getMasterContext()
              .getQueryMasterContext()
              .getWorkerContext()
              .getConnectionInfo()
              .getHostAndQMPort();

      TaskRequestEvent taskRequest;
      while (leafTasks.size() > 0 && (!taskRequests.isEmpty() || !remoteTaskRequests.isEmpty())) {
        int localAssign = 0;
        int rackAssign = 0;

        taskRequest = taskRequests.pollFirst();
        if (taskRequest == null) { // if there are only remote task requests
          taskRequest = remoteTaskRequests.pollFirst();
        }

        // checking if this container is still alive.
        // If not, ignore the task request and stop the task runner
        WorkerConnectionInfo connectionInfo =
            context.getMasterContext().getWorkerMap().get(taskRequest.getWorkerId());
        if (connectionInfo == null) continue;

        // getting the hostname of requested node
        String host = connectionInfo.getHost();

        // if there are no worker matched to the hostname a task request
        if (!leafTaskHostMapping.containsKey(host) && !taskRequests.isEmpty()) {
          String normalizedHost = NetUtils.normalizeHost(host);

          if (!leafTaskHostMapping.containsKey(normalizedHost)) {
            // this case means one of either cases:
            // * there are no blocks which reside in this node.
            // * all blocks which reside in this node are consumed, and this task runner requests a
            // remote task.
            // In this case, we transfer the task request to the remote task request list, and skip
            // the followings.
            remoteTaskRequests.add(taskRequest);
            continue;
          } else {
            host = normalizedHost;
          }
        }

        if (LOG.isDebugEnabled()) {
          LOG.debug(
              "assignToLeafTasks: "
                  + taskRequest.getExecutionBlockId()
                  + ","
                  + "worker="
                  + connectionInfo.getHostAndPeerRpcPort());
        }

        //////////////////////////////////////////////////////////////////////
        // disk or host-local allocation
        //////////////////////////////////////////////////////////////////////
        TaskAttemptId attemptId = allocateLocalTask(host);

        if (attemptId == null) { // if a local task cannot be found
          HostVolumeMapping hostVolumeMapping = leafTaskHostMapping.get(host);

          if (!taskRequests
              .isEmpty()) { // if other requests remains, move to remote list for better locality
            remoteTaskRequests.add(taskRequest);
            candidateWorkers.remove(connectionInfo.getId());
            continue;

          } else {
            if (hostVolumeMapping != null) {
              int nodes = context.getMasterContext().getWorkerMap().size();
              // this part is to control the assignment of tail and remote task balancing per node
              int tailLimit = 1;
              if (remainingScheduledObjectNum() > 0 && nodes > 0) {
                tailLimit = Math.max(remainingScheduledObjectNum() / nodes, 1);
              }

              if (hostVolumeMapping.getRemoteConcurrency()
                  >= tailLimit) { // remote task throttling per node
                continue;
              } else {
                // assign to remote volume
                hostVolumeMapping.increaseConcurrency(HostVolumeMapping.REMOTE);
              }
            }
          }

          //////////////////////////////////////////////////////////////////////
          // rack-local allocation
          //////////////////////////////////////////////////////////////////////
          attemptId = allocateRackTask(host);

          //////////////////////////////////////////////////////////////////////
          // random node allocation
          //////////////////////////////////////////////////////////////////////
          if (attemptId == null && leafTaskNum() > 0) {
            synchronized (leafTasks) {
              attemptId = leafTasks.iterator().next();
              leafTasks.remove(attemptId);
            }
          }

          if (attemptId != null && hostVolumeMapping != null) {
            hostVolumeMapping.lastAssignedVolumeId.put(attemptId, HostVolumeMapping.REMOTE);
          }
          rackAssign++;
        } else {
          localAssign++;
        }

        if (attemptId != null) {
          Task task = stage.getTask(attemptId.getTaskId());
          TaskRequest taskAssign =
              new TaskRequestImpl(
                  attemptId,
                  new ArrayList<>(task.getAllFragments()),
                  "",
                  false,
                  LogicalNodeSerializer.serialize(task.getLogicalPlan()),
                  context.getMasterContext().getQueryContext(),
                  stage.getDataChannel(),
                  stage.getBlock().getEnforcer(),
                  queryMasterHostAndPort);

          if (checkIfInterQuery(stage.getMasterPlan(), stage.getBlock())) {
            taskAssign.setInterQuery();
          }

          // TODO send batch request
          BatchAllocationRequest.Builder requestProto = BatchAllocationRequest.newBuilder();
          requestProto.addTaskRequest(
              TaskAllocationProto.newBuilder()
                  .setResource(taskRequest.getResponseProto().getResource())
                  .setTaskRequest(taskAssign.getProto())
                  .build());

          requestProto.setExecutionBlockId(attemptId.getTaskId().getExecutionBlockId().getProto());
          context
              .getMasterContext()
              .getEventHandler()
              .handle(new TaskAttemptAssignedEvent(attemptId, connectionInfo));

          InetSocketAddress addr = stage.getAssignedWorkerMap().get(connectionInfo.getId());
          if (addr == null)
            addr = new InetSocketAddress(connectionInfo.getHost(), connectionInfo.getPeerRpcPort());

          AsyncRpcClient tajoWorkerRpc = null;
          CallFuture<BatchAllocationResponse> callFuture = new CallFuture<>();
          totalAttempts++;
          try {
            tajoWorkerRpc =
                RpcClientManager.getInstance()
                    .getClient(addr, TajoWorkerProtocol.class, true, rpcParams);

            TajoWorkerProtocol.TajoWorkerProtocolService tajoWorkerRpcClient =
                tajoWorkerRpc.getStub();
            tajoWorkerRpcClient.allocateTasks(
                callFuture.getController(), requestProto.build(), callFuture);

            BatchAllocationResponse responseProto =
                callFuture.get(RpcConstants.FUTURE_TIMEOUT_SECONDS_DEFAULT, TimeUnit.SECONDS);

            if (responseProto.getCancellationTaskCount() > 0) {
              for (TaskAllocationProto proto : responseProto.getCancellationTaskList()) {
                cancel(task.getAttempt(new TaskAttemptId(proto.getTaskRequest().getId())));
                cancellation++;
              }

              if (LOG.isDebugEnabled()) {
                LOG.debug(
                    "Canceled requests: "
                        + responseProto.getCancellationTaskCount()
                        + " from "
                        + addr);
              }
              continue;
            }
          } catch (Exception e) {
            LOG.error(e);
          }
          scheduledObjectNum--;
          totalAssigned++;
          hostLocalAssigned += localAssign;
          rackLocalAssigned += rackAssign;

          if (rackAssign > 0) {
            LOG.info(
                String.format(
                    "Assigned Local/Rack/Total: (%d/%d/%d), "
                        + "Attempted Cancel/Assign/Total: (%d/%d/%d), "
                        + "Locality: %.2f%%, Rack host: %s",
                    hostLocalAssigned,
                    rackLocalAssigned,
                    totalAssigned,
                    cancellation,
                    totalAssigned,
                    totalAttempts,
                    ((double) hostLocalAssigned / (double) totalAssigned) * 100,
                    host));
          }

        } else {
          throw new RuntimeException("Illegal State!!!!!!!!!!!!!!!!!!!!!");
        }
      }
    }
Example #2
0
    private TaskAttemptId allocateRackTask(String host) {

      List<HostVolumeMapping> remainingTasks = Lists.newArrayList(leafTaskHostMapping.values());
      String rack = RackResolver.resolve(host).getNetworkLocation();
      TaskAttemptId attemptId = null;

      if (remainingTasks.size() > 0) {
        synchronized (scheduledRequests) {
          // find largest remaining task of other host in rack
          Collections.sort(
              remainingTasks,
              new Comparator<HostVolumeMapping>() {
                @Override
                public int compare(HostVolumeMapping v1, HostVolumeMapping v2) {
                  // descending remaining tasks
                  if (v2.remainTasksNum.get() > v1.remainTasksNum.get()) {
                    return 1;
                  } else if (v2.remainTasksNum.get() == v1.remainTasksNum.get()) {
                    return 0;
                  } else {
                    return -1;
                  }
                }
              });
        }

        for (HostVolumeMapping tasks : remainingTasks) {
          for (int i = 0; i < tasks.getRemainingLocalTaskSize(); i++) {
            TaskAttemptId tId = tasks.getTaskAttemptIdByRack(rack);

            if (tId == null) break;

            if (leafTasks.contains(tId)) {
              leafTasks.remove(tId);
              attemptId = tId;
              break;
            }
          }
          if (attemptId != null) break;
        }
      }

      // find task in rack
      if (attemptId == null) {
        HashSet<TaskAttemptId> list = leafTasksRackMapping.get(rack);
        if (list != null) {
          synchronized (list) {
            Iterator<TaskAttemptId> iterator = list.iterator();
            while (iterator.hasNext()) {
              TaskAttemptId tId = iterator.next();
              iterator.remove();
              if (leafTasks.contains(tId)) {
                leafTasks.remove(tId);
                attemptId = tId;
                break;
              }
            }
          }
        }
      }

      return attemptId;
    }
Example #3
0
  /**
   * One worker can have multiple running task runners. <code>HostVolumeMapping</code> describes
   * various information for one worker, including :
   *
   * <ul>
   *   <li>host name
   *   <li>rack name
   *   <li>unassigned tasks for each disk volume
   *   <li>last assigned volume id - it can be used for assigning task in a round-robin manner
   *   <li>the number of running tasks for each volume
   * </ul>
   *
   * , each task runner and the concurrency number of running tasks for volumes.
   *
   * <p>Here, we identifier a task runner by {@link ContainerId}, and we use volume ids to identify
   * all disks in this node. Actually, each volume is only used to distinguish disks, and we don't
   * know a certain volume id indicates a certain disk. If you want to know volume id, please read
   * the below section.
   *
   * <h3>Volume id</h3>
   *
   * Volume id is an integer. Each volume id identifies each disk volume.
   *
   * <p>This volume id can be obtained from
   * org.apache.hadoop.fs.BlockStorageLocation#getVolumeIds()}. * HDFS cannot give any volume id due
   * to unknown reason and disabled config 'dfs.client.file-block-locations.enabled'. In this case,
   * the volume id will be -1 or other native integer.
   *
   * <h3>See Also</h3>
   *
   * <ul>
   *   <li>HDFS-3672 (https://issues.apache.org/jira/browse/HDFS-3672).
   * </ul>
   */
  public class HostVolumeMapping {
    private final String host;
    private final String rack;
    /** A key is disk volume, and a value is a list of tasks to be scheduled. */
    private Map<Integer, LinkedHashSet<TaskAttempt>> unassignedTaskForEachVolume =
        Collections.synchronizedMap(new HashMap<>());
    /** A value is last assigned volume id for each task runner */
    private HashMap<TaskAttemptId, Integer> lastAssignedVolumeId = Maps.newHashMap();
    /**
     * A key is disk volume id, and a value is the load of this volume. This load is measured by
     * counting how many number of tasks are running.
     *
     * <p>These disk volumes are kept in an order of ascending order of the volume id. In other
     * words, the head volume ids are likely to -1, meaning no given volume id.
     */
    private SortedMap<Integer, Integer> diskVolumeLoads = new TreeMap<>();
    /** The total number of remain tasks in this host */
    private AtomicInteger remainTasksNum = new AtomicInteger(0);

    public static final int REMOTE = -2;

    public HostVolumeMapping(String host, String rack) {
      this.host = host;
      this.rack = rack;
    }

    public synchronized void addTaskAttempt(int volumeId, TaskAttempt attemptId) {
      synchronized (unassignedTaskForEachVolume) {
        LinkedHashSet<TaskAttempt> list = unassignedTaskForEachVolume.get(volumeId);
        if (list == null) {
          list = new LinkedHashSet<>();
          unassignedTaskForEachVolume.put(volumeId, list);
        }
        list.add(attemptId);
      }

      remainTasksNum.incrementAndGet();

      if (!diskVolumeLoads.containsKey(volumeId)) diskVolumeLoads.put(volumeId, 0);
    }

    /**
     * Priorities 1. a task list in a volume of host 2. unknown block or Non-splittable task in host
     * 3. remote tasks. unassignedTaskForEachVolume is only contained local task. so it will be null
     */
    public synchronized TaskAttemptId getLocalTask() {
      int volumeId = getLowestVolumeId();
      TaskAttemptId taskAttemptId = null;

      if (unassignedTaskForEachVolume.size() > 0) {
        int retry = unassignedTaskForEachVolume.size();
        do {
          // clean and get a remaining local task
          taskAttemptId = getAndRemove(volumeId);

          if (taskAttemptId == null) {
            // reassign next volume
            volumeId = getLowestVolumeId();
            retry--;
          } else {
            lastAssignedVolumeId.put(taskAttemptId, volumeId);
            break;
          }
        } while (retry > 0);
      } else {
        this.remainTasksNum.set(0);
      }

      return taskAttemptId;
    }

    public synchronized TaskAttemptId getTaskAttemptIdByRack(String rack) {
      TaskAttemptId taskAttemptId = null;

      if (unassignedTaskForEachVolume.size() > 0 && this.rack.equals(rack)) {
        int retry = unassignedTaskForEachVolume.size();
        do {
          // clean and get a remaining task
          int volumeId = getLowestVolumeId();
          taskAttemptId = getAndRemove(volumeId);
          if (taskAttemptId == null) {
            retry--;
          } else {
            break;
          }
        } while (retry > 0);
      }
      return taskAttemptId;
    }

    private synchronized TaskAttemptId getAndRemove(int volumeId) {
      TaskAttemptId taskAttemptId = null;
      if (!unassignedTaskForEachVolume.containsKey(volumeId)) {
        if (volumeId > REMOTE) {
          diskVolumeLoads.remove(volumeId);
        }
        return taskAttemptId;
      }

      LinkedHashSet<TaskAttempt> list = unassignedTaskForEachVolume.get(volumeId);
      if (list != null && !list.isEmpty()) {
        TaskAttempt taskAttempt;
        synchronized (unassignedTaskForEachVolume) {
          Iterator<TaskAttempt> iterator = list.iterator();
          taskAttempt = iterator.next();
          iterator.remove();
        }

        taskAttemptId = taskAttempt.getId();
        for (DataLocation location : taskAttempt.getTask().getDataLocations()) {
          HostVolumeMapping volumeMapping =
              scheduledRequests.leafTaskHostMapping.get(location.getHost());
          if (volumeMapping != null) {
            volumeMapping.removeTaskAttempt(location.getVolumeId(), taskAttempt);
          }
        }

        increaseConcurrency(volumeId);
      }

      return taskAttemptId;
    }

    private synchronized void removeTaskAttempt(int volumeId, TaskAttempt taskAttempt) {
      if (!unassignedTaskForEachVolume.containsKey(volumeId)) return;

      LinkedHashSet<TaskAttempt> tasks = unassignedTaskForEachVolume.get(volumeId);
      if (tasks.remove(taskAttempt)) {
        remainTasksNum.getAndDecrement();
      }

      if (tasks.isEmpty()) {
        unassignedTaskForEachVolume.remove(volumeId);
        if (volumeId > REMOTE) {
          diskVolumeLoads.remove(volumeId);
        }
      }
    }

    /**
     * Increase the count of running tasks and disk loads for a certain task runner.
     *
     * @param volumeId Volume identifier
     * @return the volume load (i.e., how many running tasks use this volume)
     */
    private synchronized int increaseConcurrency(int volumeId) {

      int concurrency = 1;
      if (diskVolumeLoads.containsKey(volumeId)) {
        concurrency = diskVolumeLoads.get(volumeId) + 1;
      }

      if (volumeId > -1) {
        LOG.info(
            "Assigned host : "
                + host
                + ", Volume : "
                + volumeId
                + ", Concurrency : "
                + concurrency);
      } else if (volumeId == -1) {
        // this case is disabled namenode block meta or compressed text file or amazon s3
        LOG.info(
            "Assigned host : "
                + host
                + ", Unknown Volume : "
                + volumeId
                + ", Concurrency : "
                + concurrency);
      } else if (volumeId == REMOTE) {
        // this case has processed all block on host and it will be assigned to remote
        LOG.info(
            "Assigned host : "
                + host
                + ", Remaining local tasks : "
                + getRemainingLocalTaskSize()
                + ", Remote Concurrency : "
                + concurrency);
      }
      diskVolumeLoads.put(volumeId, concurrency);
      return concurrency;
    }

    /** Decrease the count of running tasks of a certain task runner */
    private synchronized void decreaseConcurrency(int volumeId) {
      if (diskVolumeLoads.containsKey(volumeId)) {
        Integer concurrency = diskVolumeLoads.get(volumeId);
        if (concurrency > 0) {
          diskVolumeLoads.put(volumeId, concurrency - 1);
        } else {
          if (volumeId > REMOTE && !unassignedTaskForEachVolume.containsKey(volumeId)) {
            diskVolumeLoads.remove(volumeId);
          }
        }
      }
    }

    /** volume of a host : 0 ~ n compressed task, amazon s3, unKnown volume : -1 remote task : -2 */
    public int getLowestVolumeId() {
      Map.Entry<Integer, Integer> volumeEntry = null;

      for (Map.Entry<Integer, Integer> entry : diskVolumeLoads.entrySet()) {
        if (volumeEntry == null) volumeEntry = entry;

        if (volumeEntry.getValue() >= entry.getValue()) {
          volumeEntry = entry;
        }
      }

      if (volumeEntry != null) {
        return volumeEntry.getKey();
      } else {
        return REMOTE;
      }
    }

    public int getRemoteConcurrency() {
      return getVolumeConcurrency(REMOTE);
    }

    public int getVolumeConcurrency(int volumeId) {
      Integer size = diskVolumeLoads.get(volumeId);
      if (size == null) return 0;
      else return size;
    }

    public int getRemainingLocalTaskSize() {
      return remainTasksNum.get();
    }

    public String getHost() {
      return host;
    }

    public String getRack() {
      return rack;
    }
  }
Example #4
0
  private class ScheduledRequests {
    // two list leafTasks and nonLeafTasks keep all tasks to be scheduled. Even though some task is
    // included in
    // leafTaskHostMapping or leafTasksRackMapping, some task T will not be sent to a task runner
    // if the task is not included in leafTasks and nonLeafTasks.
    private final Set<TaskAttemptId> leafTasks = Collections.synchronizedSet(new HashSet<>());
    private final Set<TaskAttemptId> nonLeafTasks = Collections.synchronizedSet(new HashSet<>());
    private Map<String, HostVolumeMapping> leafTaskHostMapping = Maps.newConcurrentMap();
    private final Map<String, HashSet<TaskAttemptId>> leafTasksRackMapping =
        Maps.newConcurrentMap();

    protected void clear() {
      leafTasks.clear();
      nonLeafTasks.clear();
      leafTaskHostMapping.clear();
      leafTasksRackMapping.clear();
    }

    private void addLeafTask(TaskAttemptToSchedulerEvent event) {
      TaskAttempt taskAttempt = event.getTaskAttempt();
      List<DataLocation> locations = taskAttempt.getTask().getDataLocations();

      for (DataLocation location : locations) {
        String host = location.getHost();
        leafTaskHosts.add(host);

        HostVolumeMapping hostVolumeMapping = leafTaskHostMapping.get(host);
        if (hostVolumeMapping == null) {
          String rack = RackResolver.resolve(host).getNetworkLocation();
          hostVolumeMapping = new HostVolumeMapping(host, rack);
          leafTaskHostMapping.put(host, hostVolumeMapping);
        }
        hostVolumeMapping.addTaskAttempt(location.getVolumeId(), taskAttempt);

        if (LOG.isDebugEnabled()) {
          LOG.debug("Added attempt req to host " + host);
        }

        HashSet<TaskAttemptId> list = leafTasksRackMapping.get(hostVolumeMapping.getRack());
        if (list == null) {
          list = new HashSet<>();
          leafTasksRackMapping.put(hostVolumeMapping.getRack(), list);
        }

        list.add(taskAttempt.getId());

        if (LOG.isDebugEnabled()) {
          LOG.debug("Added attempt req to rack " + hostVolumeMapping.getRack());
        }
      }

      leafTasks.add(taskAttempt.getId());
    }

    private void addNonLeafTask(TaskAttemptToSchedulerEvent event) {
      nonLeafTasks.add(event.getTaskAttempt().getId());
    }

    public int leafTaskNum() {
      return leafTasks.size();
    }

    public int nonLeafTaskNum() {
      return nonLeafTasks.size();
    }

    private TaskAttemptId allocateLocalTask(String host) {
      HostVolumeMapping hostVolumeMapping = leafTaskHostMapping.get(host);

      if (hostVolumeMapping != null) { // tajo host is located in hadoop datanode
        for (int i = 0; i < hostVolumeMapping.getRemainingLocalTaskSize(); i++) {
          TaskAttemptId attemptId = hostVolumeMapping.getLocalTask();

          if (attemptId == null) break;
          // find remaining local task
          if (leafTasks.contains(attemptId)) {
            leafTasks.remove(attemptId);
            return attemptId;
          }
        }
      }
      return null;
    }

    private TaskAttemptId allocateRackTask(String host) {

      List<HostVolumeMapping> remainingTasks = Lists.newArrayList(leafTaskHostMapping.values());
      String rack = RackResolver.resolve(host).getNetworkLocation();
      TaskAttemptId attemptId = null;

      if (remainingTasks.size() > 0) {
        synchronized (scheduledRequests) {
          // find largest remaining task of other host in rack
          Collections.sort(
              remainingTasks,
              new Comparator<HostVolumeMapping>() {
                @Override
                public int compare(HostVolumeMapping v1, HostVolumeMapping v2) {
                  // descending remaining tasks
                  if (v2.remainTasksNum.get() > v1.remainTasksNum.get()) {
                    return 1;
                  } else if (v2.remainTasksNum.get() == v1.remainTasksNum.get()) {
                    return 0;
                  } else {
                    return -1;
                  }
                }
              });
        }

        for (HostVolumeMapping tasks : remainingTasks) {
          for (int i = 0; i < tasks.getRemainingLocalTaskSize(); i++) {
            TaskAttemptId tId = tasks.getTaskAttemptIdByRack(rack);

            if (tId == null) break;

            if (leafTasks.contains(tId)) {
              leafTasks.remove(tId);
              attemptId = tId;
              break;
            }
          }
          if (attemptId != null) break;
        }
      }

      // find task in rack
      if (attemptId == null) {
        HashSet<TaskAttemptId> list = leafTasksRackMapping.get(rack);
        if (list != null) {
          synchronized (list) {
            Iterator<TaskAttemptId> iterator = list.iterator();
            while (iterator.hasNext()) {
              TaskAttemptId tId = iterator.next();
              iterator.remove();
              if (leafTasks.contains(tId)) {
                leafTasks.remove(tId);
                attemptId = tId;
                break;
              }
            }
          }
        }
      }

      return attemptId;
    }

    public void assignToLeafTasks(LinkedList<TaskRequestEvent> taskRequests) {
      Collections.shuffle(taskRequests);
      LinkedList<TaskRequestEvent> remoteTaskRequests = new LinkedList<>();
      String queryMasterHostAndPort =
          context
              .getMasterContext()
              .getQueryMasterContext()
              .getWorkerContext()
              .getConnectionInfo()
              .getHostAndQMPort();

      TaskRequestEvent taskRequest;
      while (leafTasks.size() > 0 && (!taskRequests.isEmpty() || !remoteTaskRequests.isEmpty())) {
        int localAssign = 0;
        int rackAssign = 0;

        taskRequest = taskRequests.pollFirst();
        if (taskRequest == null) { // if there are only remote task requests
          taskRequest = remoteTaskRequests.pollFirst();
        }

        // checking if this container is still alive.
        // If not, ignore the task request and stop the task runner
        WorkerConnectionInfo connectionInfo =
            context.getMasterContext().getWorkerMap().get(taskRequest.getWorkerId());
        if (connectionInfo == null) continue;

        // getting the hostname of requested node
        String host = connectionInfo.getHost();

        // if there are no worker matched to the hostname a task request
        if (!leafTaskHostMapping.containsKey(host) && !taskRequests.isEmpty()) {
          String normalizedHost = NetUtils.normalizeHost(host);

          if (!leafTaskHostMapping.containsKey(normalizedHost)) {
            // this case means one of either cases:
            // * there are no blocks which reside in this node.
            // * all blocks which reside in this node are consumed, and this task runner requests a
            // remote task.
            // In this case, we transfer the task request to the remote task request list, and skip
            // the followings.
            remoteTaskRequests.add(taskRequest);
            continue;
          } else {
            host = normalizedHost;
          }
        }

        if (LOG.isDebugEnabled()) {
          LOG.debug(
              "assignToLeafTasks: "
                  + taskRequest.getExecutionBlockId()
                  + ","
                  + "worker="
                  + connectionInfo.getHostAndPeerRpcPort());
        }

        //////////////////////////////////////////////////////////////////////
        // disk or host-local allocation
        //////////////////////////////////////////////////////////////////////
        TaskAttemptId attemptId = allocateLocalTask(host);

        if (attemptId == null) { // if a local task cannot be found
          HostVolumeMapping hostVolumeMapping = leafTaskHostMapping.get(host);

          if (!taskRequests
              .isEmpty()) { // if other requests remains, move to remote list for better locality
            remoteTaskRequests.add(taskRequest);
            candidateWorkers.remove(connectionInfo.getId());
            continue;

          } else {
            if (hostVolumeMapping != null) {
              int nodes = context.getMasterContext().getWorkerMap().size();
              // this part is to control the assignment of tail and remote task balancing per node
              int tailLimit = 1;
              if (remainingScheduledObjectNum() > 0 && nodes > 0) {
                tailLimit = Math.max(remainingScheduledObjectNum() / nodes, 1);
              }

              if (hostVolumeMapping.getRemoteConcurrency()
                  >= tailLimit) { // remote task throttling per node
                continue;
              } else {
                // assign to remote volume
                hostVolumeMapping.increaseConcurrency(HostVolumeMapping.REMOTE);
              }
            }
          }

          //////////////////////////////////////////////////////////////////////
          // rack-local allocation
          //////////////////////////////////////////////////////////////////////
          attemptId = allocateRackTask(host);

          //////////////////////////////////////////////////////////////////////
          // random node allocation
          //////////////////////////////////////////////////////////////////////
          if (attemptId == null && leafTaskNum() > 0) {
            synchronized (leafTasks) {
              attemptId = leafTasks.iterator().next();
              leafTasks.remove(attemptId);
            }
          }

          if (attemptId != null && hostVolumeMapping != null) {
            hostVolumeMapping.lastAssignedVolumeId.put(attemptId, HostVolumeMapping.REMOTE);
          }
          rackAssign++;
        } else {
          localAssign++;
        }

        if (attemptId != null) {
          Task task = stage.getTask(attemptId.getTaskId());
          TaskRequest taskAssign =
              new TaskRequestImpl(
                  attemptId,
                  new ArrayList<>(task.getAllFragments()),
                  "",
                  false,
                  LogicalNodeSerializer.serialize(task.getLogicalPlan()),
                  context.getMasterContext().getQueryContext(),
                  stage.getDataChannel(),
                  stage.getBlock().getEnforcer(),
                  queryMasterHostAndPort);

          if (checkIfInterQuery(stage.getMasterPlan(), stage.getBlock())) {
            taskAssign.setInterQuery();
          }

          // TODO send batch request
          BatchAllocationRequest.Builder requestProto = BatchAllocationRequest.newBuilder();
          requestProto.addTaskRequest(
              TaskAllocationProto.newBuilder()
                  .setResource(taskRequest.getResponseProto().getResource())
                  .setTaskRequest(taskAssign.getProto())
                  .build());

          requestProto.setExecutionBlockId(attemptId.getTaskId().getExecutionBlockId().getProto());
          context
              .getMasterContext()
              .getEventHandler()
              .handle(new TaskAttemptAssignedEvent(attemptId, connectionInfo));

          InetSocketAddress addr = stage.getAssignedWorkerMap().get(connectionInfo.getId());
          if (addr == null)
            addr = new InetSocketAddress(connectionInfo.getHost(), connectionInfo.getPeerRpcPort());

          AsyncRpcClient tajoWorkerRpc = null;
          CallFuture<BatchAllocationResponse> callFuture = new CallFuture<>();
          totalAttempts++;
          try {
            tajoWorkerRpc =
                RpcClientManager.getInstance()
                    .getClient(addr, TajoWorkerProtocol.class, true, rpcParams);

            TajoWorkerProtocol.TajoWorkerProtocolService tajoWorkerRpcClient =
                tajoWorkerRpc.getStub();
            tajoWorkerRpcClient.allocateTasks(
                callFuture.getController(), requestProto.build(), callFuture);

            BatchAllocationResponse responseProto =
                callFuture.get(RpcConstants.FUTURE_TIMEOUT_SECONDS_DEFAULT, TimeUnit.SECONDS);

            if (responseProto.getCancellationTaskCount() > 0) {
              for (TaskAllocationProto proto : responseProto.getCancellationTaskList()) {
                cancel(task.getAttempt(new TaskAttemptId(proto.getTaskRequest().getId())));
                cancellation++;
              }

              if (LOG.isDebugEnabled()) {
                LOG.debug(
                    "Canceled requests: "
                        + responseProto.getCancellationTaskCount()
                        + " from "
                        + addr);
              }
              continue;
            }
          } catch (Exception e) {
            LOG.error(e);
          }
          scheduledObjectNum--;
          totalAssigned++;
          hostLocalAssigned += localAssign;
          rackLocalAssigned += rackAssign;

          if (rackAssign > 0) {
            LOG.info(
                String.format(
                    "Assigned Local/Rack/Total: (%d/%d/%d), "
                        + "Attempted Cancel/Assign/Total: (%d/%d/%d), "
                        + "Locality: %.2f%%, Rack host: %s",
                    hostLocalAssigned,
                    rackLocalAssigned,
                    totalAssigned,
                    cancellation,
                    totalAssigned,
                    totalAttempts,
                    ((double) hostLocalAssigned / (double) totalAssigned) * 100,
                    host));
          }

        } else {
          throw new RuntimeException("Illegal State!!!!!!!!!!!!!!!!!!!!!");
        }
      }
    }

    private boolean checkIfInterQuery(MasterPlan masterPlan, ExecutionBlock block) {
      if (masterPlan.isRoot(block)) {
        return false;
      }

      ExecutionBlock parent = masterPlan.getParent(block);
      if (masterPlan.isRoot(parent) && parent.isUnionOnly()) {
        return false;
      }

      return true;
    }

    public void assignToNonLeafTasks(LinkedList<TaskRequestEvent> taskRequests) {
      Collections.shuffle(taskRequests);
      String queryMasterHostAndPort =
          context
              .getMasterContext()
              .getQueryMasterContext()
              .getWorkerContext()
              .getConnectionInfo()
              .getHostAndQMPort();

      TaskRequestEvent taskRequest;
      while (!taskRequests.isEmpty()) {
        taskRequest = taskRequests.pollFirst();
        LOG.debug("assignToNonLeafTasks: " + taskRequest.getExecutionBlockId());

        TaskAttemptId attemptId;
        // random allocation
        if (nonLeafTasks.size() > 0) {
          synchronized (nonLeafTasks) {
            attemptId = nonLeafTasks.iterator().next();
            nonLeafTasks.remove(attemptId);
          }
          LOG.debug("Assigned based on * match");

          Task task;
          task = stage.getTask(attemptId.getTaskId());

          TaskRequest taskAssign =
              new TaskRequestImpl(
                  attemptId,
                  Lists.newArrayList(task.getAllFragments()),
                  "",
                  false,
                  LogicalNodeSerializer.serialize(task.getLogicalPlan()),
                  context.getMasterContext().getQueryContext(),
                  stage.getDataChannel(),
                  stage.getBlock().getEnforcer(),
                  queryMasterHostAndPort);

          if (checkIfInterQuery(stage.getMasterPlan(), stage.getBlock())) {
            taskAssign.setInterQuery();
          }
          for (Map.Entry<String, Set<FetchImpl>> entry : task.getFetchMap().entrySet()) {
            Collection<FetchImpl> fetches = entry.getValue();
            if (fetches != null) {
              for (FetchImpl fetch : fetches) {
                taskAssign.addFetch(entry.getKey(), fetch);
              }
            }
          }

          WorkerConnectionInfo connectionInfo =
              context.getMasterContext().getWorkerMap().get(taskRequest.getWorkerId());

          // TODO send batch request
          BatchAllocationRequest.Builder requestProto = BatchAllocationRequest.newBuilder();
          requestProto.addTaskRequest(
              TaskAllocationProto.newBuilder()
                  .setResource(taskRequest.getResponseProto().getResource())
                  .setTaskRequest(taskAssign.getProto())
                  .build());

          requestProto.setExecutionBlockId(attemptId.getTaskId().getExecutionBlockId().getProto());
          context
              .getMasterContext()
              .getEventHandler()
              .handle(new TaskAttemptAssignedEvent(attemptId, connectionInfo));

          CallFuture<BatchAllocationResponse> callFuture = new CallFuture<>();

          InetSocketAddress addr = stage.getAssignedWorkerMap().get(connectionInfo.getId());
          if (addr == null)
            addr = new InetSocketAddress(connectionInfo.getHost(), connectionInfo.getPeerRpcPort());

          AsyncRpcClient tajoWorkerRpc;
          try {
            tajoWorkerRpc =
                RpcClientManager.getInstance()
                    .getClient(addr, TajoWorkerProtocol.class, true, rpcParams);
            TajoWorkerProtocol.TajoWorkerProtocolService tajoWorkerRpcClient =
                tajoWorkerRpc.getStub();
            tajoWorkerRpcClient.allocateTasks(
                callFuture.getController(), requestProto.build(), callFuture);

            BatchAllocationResponse responseProto =
                callFuture.get(RpcConstants.FUTURE_TIMEOUT_SECONDS_DEFAULT, TimeUnit.SECONDS);

            if (responseProto.getCancellationTaskCount() > 0) {
              for (TaskAllocationProto proto : responseProto.getCancellationTaskList()) {
                cancel(task.getAttempt(new TaskAttemptId(proto.getTaskRequest().getId())));
                cancellation++;
              }

              if (LOG.isDebugEnabled()) {
                LOG.debug(
                    "Canceled requests: "
                        + responseProto.getCancellationTaskCount()
                        + " from "
                        + addr);
              }
              continue;
            }

            totalAssigned++;
            scheduledObjectNum--;
          } catch (Exception e) {
            LOG.error(e);
          }
        }
      }
    }
  }
Example #5
0
    public void assignToNonLeafTasks(LinkedList<TaskRequestEvent> taskRequests) {
      Collections.shuffle(taskRequests);
      String queryMasterHostAndPort =
          context
              .getMasterContext()
              .getQueryMasterContext()
              .getWorkerContext()
              .getConnectionInfo()
              .getHostAndQMPort();

      TaskRequestEvent taskRequest;
      while (!taskRequests.isEmpty()) {
        taskRequest = taskRequests.pollFirst();
        LOG.debug("assignToNonLeafTasks: " + taskRequest.getExecutionBlockId());

        TaskAttemptId attemptId;
        // random allocation
        if (nonLeafTasks.size() > 0) {
          synchronized (nonLeafTasks) {
            attemptId = nonLeafTasks.iterator().next();
            nonLeafTasks.remove(attemptId);
          }
          LOG.debug("Assigned based on * match");

          Task task;
          task = stage.getTask(attemptId.getTaskId());

          TaskRequest taskAssign =
              new TaskRequestImpl(
                  attemptId,
                  Lists.newArrayList(task.getAllFragments()),
                  "",
                  false,
                  LogicalNodeSerializer.serialize(task.getLogicalPlan()),
                  context.getMasterContext().getQueryContext(),
                  stage.getDataChannel(),
                  stage.getBlock().getEnforcer(),
                  queryMasterHostAndPort);

          if (checkIfInterQuery(stage.getMasterPlan(), stage.getBlock())) {
            taskAssign.setInterQuery();
          }
          for (Map.Entry<String, Set<FetchImpl>> entry : task.getFetchMap().entrySet()) {
            Collection<FetchImpl> fetches = entry.getValue();
            if (fetches != null) {
              for (FetchImpl fetch : fetches) {
                taskAssign.addFetch(entry.getKey(), fetch);
              }
            }
          }

          WorkerConnectionInfo connectionInfo =
              context.getMasterContext().getWorkerMap().get(taskRequest.getWorkerId());

          // TODO send batch request
          BatchAllocationRequest.Builder requestProto = BatchAllocationRequest.newBuilder();
          requestProto.addTaskRequest(
              TaskAllocationProto.newBuilder()
                  .setResource(taskRequest.getResponseProto().getResource())
                  .setTaskRequest(taskAssign.getProto())
                  .build());

          requestProto.setExecutionBlockId(attemptId.getTaskId().getExecutionBlockId().getProto());
          context
              .getMasterContext()
              .getEventHandler()
              .handle(new TaskAttemptAssignedEvent(attemptId, connectionInfo));

          CallFuture<BatchAllocationResponse> callFuture = new CallFuture<>();

          InetSocketAddress addr = stage.getAssignedWorkerMap().get(connectionInfo.getId());
          if (addr == null)
            addr = new InetSocketAddress(connectionInfo.getHost(), connectionInfo.getPeerRpcPort());

          AsyncRpcClient tajoWorkerRpc;
          try {
            tajoWorkerRpc =
                RpcClientManager.getInstance()
                    .getClient(addr, TajoWorkerProtocol.class, true, rpcParams);
            TajoWorkerProtocol.TajoWorkerProtocolService tajoWorkerRpcClient =
                tajoWorkerRpc.getStub();
            tajoWorkerRpcClient.allocateTasks(
                callFuture.getController(), requestProto.build(), callFuture);

            BatchAllocationResponse responseProto =
                callFuture.get(RpcConstants.FUTURE_TIMEOUT_SECONDS_DEFAULT, TimeUnit.SECONDS);

            if (responseProto.getCancellationTaskCount() > 0) {
              for (TaskAllocationProto proto : responseProto.getCancellationTaskList()) {
                cancel(task.getAttempt(new TaskAttemptId(proto.getTaskRequest().getId())));
                cancellation++;
              }

              if (LOG.isDebugEnabled()) {
                LOG.debug(
                    "Canceled requests: "
                        + responseProto.getCancellationTaskCount()
                        + " from "
                        + addr);
              }
              continue;
            }

            totalAssigned++;
            scheduledObjectNum--;
          } catch (Exception e) {
            LOG.error(e);
          }
        }
      }
    }