/**
   * Submit the work for actual execution.
   *
   * <p>The vertex spec is taken from the request either directly or by parsing its serialized
   * form. A pending-event entry keyed by the fragment's task attempt id is registered (if one is
   * not already present), a heartbeat check task is scheduled, and the request is then handed to
   * the communicator. Rejections and submission errors are forwarded to {@code responder} when
   * one is set.
   *
   * @param request the work request; exactly one of vertex / vertexBinary is expected to be set
   * @param llapHost host of the daemon the request is sent to
   * @param llapPort port of the daemon the request is sent to
   * @throws RuntimeException wrapping an {@code InvalidProtocolBufferException} if the serialized
   *     vertex spec cannot be parsed
   */
  public void submitWork(SubmitWorkRequestProto request, String llapHost, int llapPort) {
    // Register the pending events to be sent for this spec.
    VertexOrBinary vob = request.getWorkSpec();
    assert vob.hasVertexBinary() != vob.hasVertex();
    final SignableVertexSpec vertex;
    try {
      vertex =
          vob.hasVertex() ? vob.getVertex() : SignableVertexSpec.parseFrom(vob.getVertexBinary());
    } catch (InvalidProtocolBufferException e) {
      throw new RuntimeException(e);
    }
    QueryIdentifierProto queryIdentifierProto = vertex.getQueryIdentifier();
    TezTaskAttemptID attemptId =
        Converters.createTaskAttemptId(
            queryIdentifierProto,
            vertex.getVertexIndex(),
            request.getFragmentNumber(),
            request.getAttemptNumber());
    final String fragmentId = attemptId.toString();

    pendingEvents.putIfAbsent(
        fragmentId,
        new PendingEventData(
            new TaskHeartbeatInfo(fragmentId, llapHost, llapPort), Lists.<TezEvent>newArrayList()));

    // Setup timer task to check for heartbeat timeouts.
    // NOTE(review): a new periodic HeartbeatCheckTask is scheduled on every call - confirm that
    // submitWork is invoked only once per client instance, otherwise the checks accumulate.
    timer.scheduleAtFixedRate(
        new HeartbeatCheckTask(), connectionTimeout, connectionTimeout, TimeUnit.MILLISECONDS);

    // Send out the actual SubmitWorkRequest
    communicator.sendSubmitWork(
        request,
        llapHost,
        llapPort,
        new LlapProtocolClientProxy.ExecuteRequestCallback<SubmitWorkResponseProto>() {

          @Override
          public void setResponse(SubmitWorkResponseProto response) {
            // Only a REJECTED submission state needs handling here.
            if (response.hasSubmissionState()
                && response.getSubmissionState().equals(SubmissionStateProto.REJECTED)) {
              String msg = "Fragment: " + fragmentId + " rejected. Server Busy.";
              LOG.info(msg);
              if (responder != null) {
                Throwable err = new RuntimeException(msg);
                responder.submissionFailed(fragmentId, err);
              }
            }
          }

          @Override
          public void indicateError(Throwable t) {
            String msg = "Failed to submit: " + fragmentId;
            LOG.error(msg, t);
            // The responder is optional (see setResponse); guard it the same way here so that a
            // missing responder does not turn a submission error into an NPE in the callback.
            if (responder != null) {
              Throwable err = new RuntimeException(msg, t);
              responder.submissionFailed(fragmentId, err);
            }
          }
        });
  }
Example #2
0
 /**
  * Tells the AM that this task attempt has been killed by forwarding the AM endpoint, the user,
  * the job token, the DAG name and the task attempt id to the killed-task handler.
  */
 public void reportTaskKilled() {
   final String amHost = request.getAmHost();
   final int amPort = request.getAmPort();
   final String user = request.getUser();
   killedTaskHandler.taskKilled(
       amHost, amPort, user, jobToken, null, taskSpec.getDAGName(), taskSpec.getTaskAttemptID());
 }
Example #3
0
 /**
  * Requests that this task be killed.
  *
  * <p>A task that is already complete, or for which a kill was already processed, is left alone
  * (the request is only logged). Otherwise, if a task runner exists the kill is forwarded to it
  * and the task is flagged not to run; if no runner exists yet the kill is reported straight
  * away. The AM is informed whenever the kill takes effect.
  */
 public void killTask() {
   if (isCompleted.get()) {
     LOG.info(
         "Ignoring kill request for task {} since it's already complete",
         taskSpec.getTaskAttemptID());
     return;
   }

   if (killInvoked.getAndSet(true)) {
     // This should not happen.
     LOG.warn(
         "Ignoring kill request for task {} since a previous kill request was processed",
         taskSpec.getTaskAttemptID());
     return;
   }

   synchronized (this) {
     LOG.info(
         "Kill task requested for id={}, taskRunnerSetup={}",
         taskSpec.getTaskAttemptID(),
         (taskRunner != null));

     if (taskRunner == null) {
       // The task never got as far as creating a runner: tell the AM it has been killed.
       LOG.debug("Reporting taskKilled for non-started fragment {}", getRequestId());
       reportTaskKilled();
     } else {
       killtimerWatch.start();
       LOG.info("Issuing kill to task {}", taskSpec.getTaskAttemptID());
       final boolean killAccepted = taskRunner.killTask();
       if (!killAccepted) {
         LOG.info(
             "Kill request for task {} did not complete because the task is already complete",
             taskSpec.getTaskAttemptID());
       } else {
         // The kill message goes to the AM immediately; there is no need to wait for the task
         // to finish.
         LOG.info(
             "Kill request for task {} completed. Informing AM", taskSpec.getTaskAttemptID());
         reportTaskKilled();
       }
       shouldRunTask = false;
     }

     if (!isStarted.get()) {
       // The callable may never run, so signal fragment completion right away.
       fragmentCompletionHanler.fragmentComplete(fragmentInfo);
       this.amReporter.unregisterTask(request.getAmHost(), request.getAmPort());
     }
   }
 }
Example #4
0
    /**
     * Invoked when the task runner returns a result.
     *
     * <p>Errors are handled on the way over. FAIL/SUCCESS is informed via regular heartbeats.
     * Killed via a kill message when a task kill is requested by the daemon.
     *
     * <p>Marks the task complete, updates metrics according to the end reason (when a metrics
     * object is present), signals fragment completion, shuts the callable down and writes the
     * fragment-end history record.
     *
     * @param result the result produced by the task runner
     */
    @Override
    public void onSuccess(TaskRunner2Result result) {
      isCompleted.set(true);

      // Metrics are optional (onFailure already guards against a missing instance), so every
      // update below is conditional on this flag.
      final boolean hasMetrics = metrics != null;

      switch (result.getEndReason()) {
          // Only the KILLED case requires a message to be sent out to the AM.
        case SUCCESS:
          LOG.debug("Successfully finished {}", requestId);
          if (hasMetrics) {
            metrics.incrExecutorTotalSuccess();
          }
          break;
        case CONTAINER_STOP_REQUESTED:
          LOG.info("Received container stop request (AM preemption) for {}", requestId);
          break;
        case KILL_REQUESTED:
          LOG.info("Killed task {}", requestId);
          if (killtimerWatch.isRunning()) {
            killtimerWatch.stop();
            long elapsed = killtimerWatch.elapsedMillis();
            // Log both the task and the elapsed time; previously the elapsed millis were
            // printed in the position of the task id.
            LOG.info("Time to die for task {}: {} ms", requestId, elapsed);
          }
          if (hasMetrics) {
            metrics.incrPreemptionTimeLost(runtimeWatch.elapsedMillis());
            metrics.incrExecutorTotalKilled();
          }
          break;
        case COMMUNICATION_FAILURE:
          LOG.info("Failed to run {} due to communication failure", requestId);
          if (hasMetrics) {
            metrics.incrExecutorTotalExecutionFailed();
          }
          break;
        case TASK_ERROR:
          LOG.info("Failed to run {} due to task error", requestId);
          if (hasMetrics) {
            metrics.incrExecutorTotalExecutionFailed();
          }
          break;
      }
      fragmentCompletionHanler.fragmentComplete(fragmentInfo);

      taskRunnerCallable.shutdown();
      HistoryLogger.logFragmentEnd(
          request.getApplicationIdString(),
          request.getContainerIdString(),
          executionContext.getHostName(),
          request.getFragmentSpec().getDagName(),
          request.getFragmentSpec().getVertexName(),
          request.getFragmentSpec().getFragmentNumber(),
          request.getFragmentSpec().getAttemptNumber(),
          taskRunnerCallable.threadName,
          taskRunnerCallable.startTime,
          true);
      if (hasMetrics) {
        metrics.decrExecutorNumQueuedRequests();
      }
    }
Example #5
0
 /**
  * Renders the request id followed by a brace-delimited summary: whether the fragment can
  * finish, its parallelism figures, and its timing and priority information.
  */
 @Override
 public String toString() {
   final StringBuilder summary = new StringBuilder();
   summary.append(requestId);
   summary.append(" {canFinish: ").append(canFinish());
   summary
       .append(", vertexParallelism: ")
       .append(request.getFragmentSpec().getVertexParallelism());
   summary
       .append(", selfAndUpstreamParallelism: ")
       .append(request.getFragmentRuntimeInfo().getNumSelfAndUpstreamTasks());
   summary
       .append(", selfAndUpstreamComplete: ")
       .append(request.getFragmentRuntimeInfo().getNumSelfAndUpstreamCompletedTasks());
   summary
       .append(", firstAttemptStartTime: ")
       .append(getFragmentRuntimeInfo().getFirstAttemptStartTime());
   summary.append(", dagStartTime:").append(getFragmentRuntimeInfo().getDagStartTime());
   summary
       .append(", withinDagPriority: ")
       .append(getFragmentRuntimeInfo().getWithinDagPriority());
   summary.append("}");
   return summary.toString();
 }
Example #6
0
 /**
  * Invoked when the task runner fails with an exception: logs the failure, marks the task
  * complete, signals fragment completion, shuts the callable down, records the fragment end as
  * unsuccessful and, when metrics are present, decrements the queued-request count.
  *
  * @param t the failure raised by the task runner
  */
 @Override
 public void onFailure(Throwable t) {
   final String taskIdentifier = getTaskIdentifierString(request);
   LOG.error("TezTaskRunner execution failed for : " + taskIdentifier, t);

   isCompleted.set(true);
   fragmentCompletionHanler.fragmentComplete(fragmentInfo);

   // TODO HIVE-10236 Report a fatal error over the umbilical
   taskRunnerCallable.shutdown();

   HistoryLogger.logFragmentEnd(
       request.getApplicationIdString(),
       request.getContainerIdString(),
       executionContext.getHostName(),
       request.getFragmentSpec().getDagName(),
       request.getFragmentSpec().getVertexName(),
       request.getFragmentSpec().getFragmentNumber(),
       request.getFragmentSpec().getAttemptNumber(),
       taskRunnerCallable.threadName,
       taskRunnerCallable.startTime,
       false);

   if (metrics == null) {
     return;
   }
   metrics.decrExecutorNumQueuedRequests();
 }
Example #7
0
 /**
  * Builds the callable for one submitted fragment.
  *
  * <p>Besides storing its collaborators, the constructor derives the job token from the
  * credentials, converts the fragment spec into a task spec and, when a job token is available,
  * registers the task with the {@code AMReporter}.
  */
 @VisibleForTesting
 public TaskRunnerCallable(
     SubmitWorkRequestProto request,
     QueryFragmentInfo fragmentInfo,
     Configuration conf,
     ExecutionContext executionContext,
     Map<String, String> envMap,
     Credentials credentials,
     long memoryAvailable,
     AMReporter amReporter,
     ConfParams confParams,
     LlapDaemonExecutorMetrics metrics,
     KilledTaskHandler killedTaskHandler,
     FragmentCompletionHandler fragmentCompleteHandler) {
   // Values handed in by the caller, stored as-is.
   this.request = request;
   this.fragmentInfo = fragmentInfo;
   this.conf = conf;
   this.executionContext = executionContext;
   this.envMap = envMap;
   this.credentials = credentials;
   this.memoryAvailable = memoryAvailable;
   this.confParams = confParams;
   this.amReporter = amReporter;
   this.metrics = metrics;
   this.killedTaskHandler = killedTaskHandler;
   this.fragmentCompletionHanler = fragmentCompleteHandler;

   // State derived from the arguments.
   this.objectRegistry = new ObjectRegistryImpl();
   this.jobToken = TokenCache.getSessionToken(credentials);
   this.taskSpec = Converters.getTaskSpecfromProto(request.getFragmentSpec());

   // Register with the AMReporter when the callable is setup. Unregister once it starts running.
   if (jobToken != null) {
     this.amReporter.registerTask(
         request.getAmHost(),
         request.getAmPort(),
         request.getUser(),
         jobToken,
         null,
         request.getFragmentSpec().getDagName());
   }

   this.requestId = request.getFragmentSpec().getFragmentIdentifierString();
   // TODO Change this to the queryId/Name when that's available.
   this.queryId = request.getFragmentSpec().getDagName();
 }
Example #8
0
 /**
  * Builds a one-line description of the fragment a request refers to, for use in log messages.
  *
  * @param request the submitted work request
  * @return the application id, container id, DAG name, vertex name, fragment number and attempt
  *     number as a comma-separated list of {@code key=value} pairs
  */
 public static String getTaskIdentifierString(SubmitWorkRequestProto request) {
   return "AppId="
       + request.getApplicationIdString()
       + ", containerId="
       + request.getContainerIdString()
       + ", Dag="
       + request.getFragmentSpec().getDagName()
       + ", Vertex="
       + request.getFragmentSpec().getVertexName()
       + ", FragmentNum="
       + request.getFragmentSpec().getFragmentNumber()
       + ", Attempt="
       + request.getFragmentSpec().getAttemptNumber();
 }
Example #9
0
 /**
  * Exposes the fragment spec carried by the underlying request.
  *
  * @return the value of {@code request.getFragmentSpec()}
  */
 public FragmentSpecProto getFragmentSpec() {
   final FragmentSpecProto fragmentSpec = request.getFragmentSpec();
   return fragmentSpec;
 }
Example #10
0
 /**
  * Exposes the fragment runtime info carried by the underlying request.
  *
  * @return the value of {@code request.getFragmentRuntimeInfo()}
  */
 public FragmentRuntimeInfo getFragmentRuntimeInfo() {
   final FragmentRuntimeInfo runtimeInfo = request.getFragmentRuntimeInfo();
   return runtimeInfo;
 }
Example #11
0
  /**
   * Runs the fragment described by the request.
   *
   * <p>Marks the callable as started, unregisters it from the AMReporter, prepares the UGIs, the
   * umbilical proxy and the task reporter, then creates a {@code TezTaskRunner2} and runs it. If
   * {@code shouldRunTask} has been cleared (killTask does this once a runner exists), a
   * {@code KILL_REQUESTED} result is returned without running anything.
   *
   * @return the result reported by the task runner, or a {@code KILL_REQUESTED} result when the
   *     task is not run
   * @throws Exception if setup or execution fails
   */
  @Override
  protected TaskRunner2Result callInternal() throws Exception {
    // killTask() consults this flag to decide whether it has to signal fragment completion itself.
    isStarted.set(true);

    // Recorded for the fragment-end history record written by the completion callbacks.
    this.startTime = System.currentTimeMillis();
    this.threadName = Thread.currentThread().getName();
    if (LOG.isDebugEnabled()) {
      LOG.debug("canFinish: " + taskSpec.getTaskAttemptID() + ": " + canFinish());
    }

    // Unregister from the AMReporter, since the task is now running.
    this.amReporter.unregisterTask(request.getAmHost(), request.getAmPort());

    // First kill check, taken under the same monitor killTask() uses.
    synchronized (this) {
      if (!shouldRunTask) {
        LOG.info("Not starting task {} since it was killed earlier", taskSpec.getTaskAttemptID());
        return new TaskRunner2Result(EndReason.KILL_REQUESTED, null, false);
      }
    }

    // TODO This executor seems unnecessary. Here and TezChild
    // Single daemon thread, wrapped so that it can be handed to the task runner.
    ExecutorService executorReal =
        Executors.newFixedThreadPool(
            1, new ThreadFactoryBuilder().setDaemon(true).setNameFormat("TezTaskRunner").build());
    executor = MoreExecutors.listeningDecorator(executorReal);

    // TODO Consolidate this code with TezChild.
    runtimeWatch.start();
    // UGI for the requesting user, carrying the credentials supplied with the request.
    UserGroupInformation taskUgi = UserGroupInformation.createRemoteUser(request.getUser());
    taskUgi.addCredentials(credentials);

    // The job token, serialized, is exposed to the task under the shuffle handler service id.
    Map<String, ByteBuffer> serviceConsumerMetadata = new HashMap<>();
    serviceConsumerMetadata.put(
        TezConstants.TEZ_SHUFFLE_HANDLER_SERVICE_ID,
        TezCommonUtils.convertJobTokenToBytes(jobToken));
    Multimap<String, String> startedInputsMap = createStartedInputMap(request.getFragmentSpec());

    // Separate UGI, named after the token identifier and holding the job token, used only to
    // create the umbilical proxy to the AM address.
    UserGroupInformation taskOwner =
        UserGroupInformation.createRemoteUser(request.getTokenIdentifier());
    final InetSocketAddress address =
        NetUtils.createSocketAddrForHost(request.getAmHost(), request.getAmPort());
    SecurityUtil.setTokenService(jobToken, address);
    taskOwner.addToken(jobToken);
    umbilical =
        taskOwner.doAs(
            new PrivilegedExceptionAction<LlapTaskUmbilicalProtocol>() {
              @Override
              public LlapTaskUmbilicalProtocol run() throws Exception {
                return RPC.getProxy(
                    LlapTaskUmbilicalProtocol.class,
                    LlapTaskUmbilicalProtocol.versionID,
                    address,
                    conf);
              }
            });

    // Reporter built on the umbilical, configured from confParams.
    taskReporter =
        new LlapTaskReporter(
            umbilical,
            confParams.amHeartbeatIntervalMsMax,
            confParams.amCounterHeartbeatInterval,
            confParams.amMaxEventsPerHeartbeat,
            new AtomicLong(0),
            request.getContainerIdString());

    // Associate the current thread with this attempt; cleared in the outer finally below.
    String attemptId = fragmentInfo.getFragmentIdentifierString();
    IOContextMap.setThreadAttemptId(attemptId);
    try {
      // Second kill check: the runner is only created while holding the monitor, so killTask()
      // either sees no runner or a fully constructed one.
      synchronized (this) {
        if (shouldRunTask) {
          taskRunner =
              new TezTaskRunner2(
                  conf,
                  taskUgi,
                  fragmentInfo.getLocalDirs(),
                  taskSpec,
                  request.getAppAttemptNumber(),
                  serviceConsumerMetadata,
                  envMap,
                  startedInputsMap,
                  taskReporter,
                  executor,
                  objectRegistry,
                  pid,
                  executionContext,
                  memoryAvailable,
                  false);
        }
      }
      // NOTE(review): this early return happens after runtimeWatch.start() and after the executor
      // was created, but before the inner finally that stops the watch - confirm that cleanup for
      // this path is handled elsewhere.
      if (taskRunner == null) {
        LOG.info("Not starting task {} since it was killed earlier", taskSpec.getTaskAttemptID());
        return new TaskRunner2Result(EndReason.KILL_REQUESTED, null, false);
      }

      try {
        TaskRunner2Result result = taskRunner.run();
        if (result.isContainerShutdownRequested()) {
          LOG.warn("Unexpected container shutdown requested while running task. Ignoring");
        }
        isCompleted.set(true);
        return result;
      } finally {
        // Runs whether the task returned or threw: close the task UGI's file systems and log the
        // elapsed run time (this also stops runtimeWatch).
        FileSystem.closeAllForUGI(taskUgi);
        LOG.info(
            "ExecutionTime for Container: "
                + request.getContainerIdString()
                + "="
                + runtimeWatch.stop().elapsedMillis());
        if (LOG.isDebugEnabled()) {
          LOG.debug(
              "canFinish post completion: " + taskSpec.getTaskAttemptID() + ": " + canFinish());
        }
      }
    } finally {
      IOContextMap.clearThreadAttempt(attemptId);
    }
  }