/** * Submit the work for actual execution. * * @throws InvalidProtocolBufferException */ public void submitWork(SubmitWorkRequestProto request, String llapHost, int llapPort) { // Register the pending events to be sent for this spec. VertexOrBinary vob = request.getWorkSpec(); assert vob.hasVertexBinary() != vob.hasVertex(); SignableVertexSpec vertex = null; try { vertex = vob.hasVertex() ? vob.getVertex() : SignableVertexSpec.parseFrom(vob.getVertexBinary()); } catch (InvalidProtocolBufferException e) { throw new RuntimeException(e); } QueryIdentifierProto queryIdentifierProto = vertex.getQueryIdentifier(); TezTaskAttemptID attemptId = Converters.createTaskAttemptId( queryIdentifierProto, vertex.getVertexIndex(), request.getFragmentNumber(), request.getAttemptNumber()); final String fragmentId = attemptId.toString(); pendingEvents.putIfAbsent( fragmentId, new PendingEventData( new TaskHeartbeatInfo(fragmentId, llapHost, llapPort), Lists.<TezEvent>newArrayList())); // Setup timer task to check for hearbeat timeouts timer.scheduleAtFixedRate( new HeartbeatCheckTask(), connectionTimeout, connectionTimeout, TimeUnit.MILLISECONDS); // Send out the actual SubmitWorkRequest communicator.sendSubmitWork( request, llapHost, llapPort, new LlapProtocolClientProxy.ExecuteRequestCallback<SubmitWorkResponseProto>() { @Override public void setResponse(SubmitWorkResponseProto response) { if (response.hasSubmissionState()) { if (response.getSubmissionState().equals(SubmissionStateProto.REJECTED)) { String msg = "Fragment: " + fragmentId + " rejected. Server Busy."; LOG.info(msg); if (responder != null) { Throwable err = new RuntimeException(msg); responder.submissionFailed(fragmentId, err); } return; } } } @Override public void indicateError(Throwable t) { String msg = "Failed to submit: " + fragmentId; LOG.error(msg, t); Throwable err = new RuntimeException(msg, t); responder.submissionFailed(fragmentId, err); } }); }
/**
 * Reports this task attempt as killed through the {@code killedTaskHandler}, addressed to the AM
 * host, port and user carried by the request.
 */
public void reportTaskKilled() {
  final String amHost = request.getAmHost();
  final int amPort = request.getAmPort();
  final String user = request.getUser();
  killedTaskHandler.taskKilled(
      amHost, amPort, user, jobToken, null, taskSpec.getDAGName(), taskSpec.getTaskAttemptID());
}
/** * Attempt to kill a running task. If the task has not started running, it will not start. If it's * already running, a kill request will be sent to it. * * <p>The AM will be informed about the task kill. */ public void killTask() { if (!isCompleted.get()) { if (!killInvoked.getAndSet(true)) { synchronized (this) { LOG.info( "Kill task requested for id={}, taskRunnerSetup={}", taskSpec.getTaskAttemptID(), (taskRunner != null)); if (taskRunner != null) { killtimerWatch.start(); LOG.info("Issuing kill to task {}", taskSpec.getTaskAttemptID()); boolean killed = taskRunner.killTask(); if (killed) { // Sending a kill message to the AM right here. Don't need to wait for the task to // complete. LOG.info( "Kill request for task {} completed. Informing AM", taskSpec.getTaskAttemptID()); reportTaskKilled(); } else { LOG.info( "Kill request for task {} did not complete because the task is already complete", taskSpec.getTaskAttemptID()); } shouldRunTask = false; } else { // If the task hasn't started, and it is killed - report back to the AM that the task // has been killed. LOG.debug("Reporting taskKilled for non-started fragment {}", getRequestId()); reportTaskKilled(); } if (!isStarted.get()) { // If the task hasn't started - inform about fragment completion immediately. It's // possible for // the callable to never run. fragmentCompletionHanler.fragmentComplete(fragmentInfo); this.amReporter.unregisterTask(request.getAmHost(), request.getAmPort()); } } } else { // This should not happen. LOG.warn( "Ignoring kill request for task {} since a previous kill request was processed", taskSpec.getTaskAttemptID()); } } else { LOG.info( "Ignoring kill request for task {} since it's already complete", taskSpec.getTaskAttemptID()); } }
// Errors are handled on the way over. FAIL/SUCCESS is informed via regular heartbeats. Killed // via a kill message when a task kill is requested by the daemon. @Override public void onSuccess(TaskRunner2Result result) { isCompleted.set(true); switch (result.getEndReason()) { // Only the KILLED case requires a message to be sent out to the AM. case SUCCESS: LOG.debug("Successfully finished {}", requestId); metrics.incrExecutorTotalSuccess(); break; case CONTAINER_STOP_REQUESTED: LOG.info("Received container stop request (AM preemption) for {}", requestId); break; case KILL_REQUESTED: LOG.info("Killed task {}", requestId); if (killtimerWatch.isRunning()) { killtimerWatch.stop(); long elapsed = killtimerWatch.elapsedMillis(); LOG.info("Time to die for task {}", elapsed); } metrics.incrPreemptionTimeLost(runtimeWatch.elapsedMillis()); metrics.incrExecutorTotalKilled(); break; case COMMUNICATION_FAILURE: LOG.info("Failed to run {} due to communication failure", requestId); metrics.incrExecutorTotalExecutionFailed(); break; case TASK_ERROR: LOG.info("Failed to run {} due to task error", requestId); metrics.incrExecutorTotalExecutionFailed(); break; } fragmentCompletionHanler.fragmentComplete(fragmentInfo); taskRunnerCallable.shutdown(); HistoryLogger.logFragmentEnd( request.getApplicationIdString(), request.getContainerIdString(), executionContext.getHostName(), request.getFragmentSpec().getDagName(), request.getFragmentSpec().getVertexName(), request.getFragmentSpec().getFragmentNumber(), request.getFragmentSpec().getAttemptNumber(), taskRunnerCallable.threadName, taskRunnerCallable.startTime, true); metrics.decrExecutorNumQueuedRequests(); }
/**
 * Renders the request id followed by the fragment's scheduling-related values: whether it can
 * finish, vertex parallelism, self-and-upstream task counts, first attempt start time, DAG start
 * time and within-DAG priority.
 */
@Override
public String toString() {
  StringBuilder text = new StringBuilder();
  text.append(requestId);
  text.append(" {canFinish: ").append(canFinish());
  text.append(", vertexParallelism: ").append(request.getFragmentSpec().getVertexParallelism());
  text.append(", selfAndUpstreamParallelism: ")
      .append(request.getFragmentRuntimeInfo().getNumSelfAndUpstreamTasks());
  text.append(", selfAndUpstreamComplete: ")
      .append(request.getFragmentRuntimeInfo().getNumSelfAndUpstreamCompletedTasks());
  text.append(", firstAttemptStartTime: ")
      .append(getFragmentRuntimeInfo().getFirstAttemptStartTime());
  text.append(", dagStartTime:").append(getFragmentRuntimeInfo().getDagStartTime());
  text.append(", withinDagPriority: ").append(getFragmentRuntimeInfo().getWithinDagPriority());
  text.append("}");
  return text.toString();
}
@Override public void onFailure(Throwable t) { LOG.error("TezTaskRunner execution failed for : " + getTaskIdentifierString(request), t); isCompleted.set(true); fragmentCompletionHanler.fragmentComplete(fragmentInfo); // TODO HIVE-10236 Report a fatal error over the umbilical taskRunnerCallable.shutdown(); HistoryLogger.logFragmentEnd( request.getApplicationIdString(), request.getContainerIdString(), executionContext.getHostName(), request.getFragmentSpec().getDagName(), request.getFragmentSpec().getVertexName(), request.getFragmentSpec().getFragmentNumber(), request.getFragmentSpec().getAttemptNumber(), taskRunnerCallable.threadName, taskRunnerCallable.startTime, false); if (metrics != null) { metrics.decrExecutorNumQueuedRequests(); } }
@VisibleForTesting public TaskRunnerCallable( SubmitWorkRequestProto request, QueryFragmentInfo fragmentInfo, Configuration conf, ExecutionContext executionContext, Map<String, String> envMap, Credentials credentials, long memoryAvailable, AMReporter amReporter, ConfParams confParams, LlapDaemonExecutorMetrics metrics, KilledTaskHandler killedTaskHandler, FragmentCompletionHandler fragmentCompleteHandler) { this.request = request; this.fragmentInfo = fragmentInfo; this.conf = conf; this.executionContext = executionContext; this.envMap = envMap; this.objectRegistry = new ObjectRegistryImpl(); this.credentials = credentials; this.memoryAvailable = memoryAvailable; this.confParams = confParams; this.jobToken = TokenCache.getSessionToken(credentials); this.taskSpec = Converters.getTaskSpecfromProto(request.getFragmentSpec()); this.amReporter = amReporter; // Register with the AMReporter when the callable is setup. Unregister once it starts running. if (jobToken != null) { this.amReporter.registerTask( request.getAmHost(), request.getAmPort(), request.getUser(), jobToken, null, request.getFragmentSpec().getDagName()); } this.metrics = metrics; this.requestId = request.getFragmentSpec().getFragmentIdentifierString(); // TODO Change this to the queryId/Name when that's available. this.queryId = request.getFragmentSpec().getDagName(); this.killedTaskHandler = killedTaskHandler; this.fragmentCompletionHanler = fragmentCompleteHandler; }
/**
 * Builds a one-line identifier for a submitted fragment from its application id, container id,
 * DAG name, vertex name, fragment number and attempt number.
 *
 * @param request the submit request to describe
 * @return the identifier in the form {@code AppId=..., containerId=..., Dag=..., Vertex=...,
 *     FragmentNum=..., Attempt=...}
 */
public static String getTaskIdentifierString(SubmitWorkRequestProto request) {
  String containerPart =
      "AppId="
          + request.getApplicationIdString()
          + ", containerId="
          + request.getContainerIdString();
  FragmentSpecProto spec = request.getFragmentSpec();
  return containerPart
      + ", Dag="
      + spec.getDagName()
      + ", Vertex="
      + spec.getVertexName()
      + ", FragmentNum="
      + spec.getFragmentNumber()
      + ", Attempt="
      + spec.getAttemptNumber();
}
/**
 * Returns the fragment spec carried by the underlying submit request.
 *
 * @return the request's fragment spec
 */
public FragmentSpecProto getFragmentSpec() {
  final FragmentSpecProto spec = request.getFragmentSpec();
  return spec;
}
/**
 * Returns the fragment runtime info carried by the underlying submit request.
 *
 * @return the request's fragment runtime info
 */
public FragmentRuntimeInfo getFragmentRuntimeInfo() {
  final FragmentRuntimeInfo runtimeInfo = request.getFragmentRuntimeInfo();
  return runtimeInfo;
}
/**
 * Runs the fragment on the calling thread.
 *
 * <p>Sequence: mark the callable as started and record the start time and thread name;
 * unregister from the {@code amReporter}; bail out with a {@code KILL_REQUESTED} result if
 * {@code shouldRunTask} is already false; otherwise create the executor, the task and task-owner
 * UGIs, the umbilical proxy and the task reporter, then construct a {@code TezTaskRunner2} and
 * run it.
 *
 * <p>{@code shouldRunTask} is read, and {@code taskRunner} is assigned, only while holding this
 * object's monitor.
 *
 * @return the result produced by the task runner, or a {@code KILL_REQUESTED} result when the
 *     task is not started
 * @throws Exception whatever the setup steps or the task runner throw
 */
@Override
protected TaskRunner2Result callInternal() throws Exception {
  isStarted.set(true);

  this.startTime = System.currentTimeMillis();
  this.threadName = Thread.currentThread().getName();
  if (LOG.isDebugEnabled()) {
    LOG.debug("canFinish: " + taskSpec.getTaskAttemptID() + ": " + canFinish());
  }

  // Unregister from the AMReporter, since the task is now running.
  this.amReporter.unregisterTask(request.getAmHost(), request.getAmPort());

  // First check of the run flag, before any of the setup work below is done.
  synchronized (this) {
    if (!shouldRunTask) {
      LOG.info("Not starting task {} since it was killed earlier", taskSpec.getTaskAttemptID());
      return new TaskRunner2Result(EndReason.KILL_REQUESTED, null, false);
    }
  }

  // TODO This executor seems unnecessary. Here and TezChild
  ExecutorService executorReal =
      Executors.newFixedThreadPool(
          1, new ThreadFactoryBuilder().setDaemon(true).setNameFormat("TezTaskRunner").build());
  executor = MoreExecutors.listeningDecorator(executorReal);

  // TODO Consolidate this code with TezChild.
  runtimeWatch.start();
  // UGI for the requesting user, carrying the credentials supplied with the fragment.
  UserGroupInformation taskUgi = UserGroupInformation.createRemoteUser(request.getUser());
  taskUgi.addCredentials(credentials);

  Map<String, ByteBuffer> serviceConsumerMetadata = new HashMap<>();
  serviceConsumerMetadata.put(
      TezConstants.TEZ_SHUFFLE_HANDLER_SERVICE_ID,
      TezCommonUtils.convertJobTokenToBytes(jobToken));
  Multimap<String, String> startedInputsMap = createStartedInputMap(request.getFragmentSpec());

  // Separate UGI, named by the request's token identifier and holding the job token; the
  // umbilical proxy to the AM address is created under it.
  UserGroupInformation taskOwner =
      UserGroupInformation.createRemoteUser(request.getTokenIdentifier());
  final InetSocketAddress address =
      NetUtils.createSocketAddrForHost(request.getAmHost(), request.getAmPort());
  SecurityUtil.setTokenService(jobToken, address);
  taskOwner.addToken(jobToken);
  umbilical =
      taskOwner.doAs(
          new PrivilegedExceptionAction<LlapTaskUmbilicalProtocol>() {
            @Override
            public LlapTaskUmbilicalProtocol run() throws Exception {
              return RPC.getProxy(
                  LlapTaskUmbilicalProtocol.class,
                  LlapTaskUmbilicalProtocol.versionID,
                  address,
                  conf);
            }
          });

  taskReporter =
      new LlapTaskReporter(
          umbilical,
          confParams.amHeartbeatIntervalMsMax,
          confParams.amCounterHeartbeatInterval,
          confParams.amMaxEventsPerHeartbeat,
          new AtomicLong(0),
          request.getContainerIdString());

  // Set the attempt id for this thread; it is cleared again in the outer finally below.
  String attemptId = fragmentInfo.getFragmentIdentifierString();
  IOContextMap.setThreadAttemptId(attemptId);
  try {
    // Second check of the run flag: the runner is only created if the flag is still set.
    synchronized (this) {
      if (shouldRunTask) {
        taskRunner =
            new TezTaskRunner2(
                conf,
                taskUgi,
                fragmentInfo.getLocalDirs(),
                taskSpec,
                request.getAppAttemptNumber(),
                serviceConsumerMetadata,
                envMap,
                startedInputsMap,
                taskReporter,
                executor,
                objectRegistry,
                pid,
                executionContext,
                memoryAvailable,
                false);
      }
    }
    if (taskRunner == null) {
      LOG.info("Not starting task {} since it was killed earlier", taskSpec.getTaskAttemptID());
      return new TaskRunner2Result(EndReason.KILL_REQUESTED, null, false);
    }

    try {
      TaskRunner2Result result = taskRunner.run();
      if (result.isContainerShutdownRequested()) {
        LOG.warn("Unexpected container shutdown requested while running task. Ignoring");
      }
      // Only reached when run() returns normally.
      isCompleted.set(true);
      return result;
    } finally {
      // Runs whether run() returned or threw: close the task UGI's file systems and log the
      // elapsed run time.
      FileSystem.closeAllForUGI(taskUgi);
      LOG.info(
          "ExecutionTime for Container: "
              + request.getContainerIdString()
              + "="
              + runtimeWatch.stop().elapsedMillis());
      if (LOG.isDebugEnabled()) {
        LOG.debug(
            "canFinish post completion: " + taskSpec.getTaskAttemptID() + ": " + canFinish());
      }
    }
  } finally {
    IOContextMap.clearThreadAttempt(attemptId);
  }
}