private synchronized void scheduleNextRequest() { // stopped or done? TaskInfo taskInfo = HttpRemoteTask.this.taskInfo.get(); if (!running || taskInfo.getState().isDone()) { return; } // outstanding request? if (future != null && !future.isDone()) { // this should never happen log.error("Can not reschedule update because an update is already running"); return; } // if throttled due to error, asynchronously wait for timeout and try again ListenableFuture<?> errorRateLimit = getErrorTracker.acquireRequestPermit(); if (!errorRateLimit.isDone()) { errorRateLimit.addListener(this::scheduleNextRequest, executor); return; } Request request = prepareGet() .setUri(uriBuilderFrom(taskInfo.getSelf()).addParameter("summarize").build()) .setHeader(HttpHeaders.CONTENT_TYPE, MediaType.JSON_UTF_8.toString()) .setHeader(PrestoHeaders.PRESTO_CURRENT_STATE, taskInfo.getState().toString()) .setHeader(PrestoHeaders.PRESTO_MAX_WAIT, refreshMaxWait.toString()) .build(); getErrorTracker.startRequest(); future = httpClient.executeAsync(request, createFullJsonResponseHandler(taskInfoCodec)); Futures.addCallback( future, new SimpleHttpResponseHandler<>(this, request.getUri()), executor); }
@Override public RecordCursor cursor() { Builder table = InMemoryRecordSet.builder(TASK_TABLE); for (TaskInfo taskInfo : taskManager.getAllTaskInfo()) { TaskStats stats = taskInfo.getStats(); table.addRow( nodeId, taskInfo.getTaskId().toString(), taskInfo.getTaskId().getStageId().toString(), taskInfo.getTaskId().getQueryId().toString(), taskInfo.getState().toString(), (long) stats.getTotalDrivers(), (long) stats.getQueuedDrivers(), (long) stats.getRunningDrivers(), (long) stats.getCompletedDrivers(), toMillis(stats.getTotalScheduledTime()), toMillis(stats.getTotalCpuTime()), toMillis(stats.getTotalUserTime()), toMillis(stats.getTotalBlockedTime()), toBytes(stats.getRawInputDataSize()), stats.getRawInputPositions(), toBytes(stats.getProcessedInputDataSize()), stats.getProcessedInputPositions(), toBytes(stats.getOutputDataSize()), stats.getOutputPositions(), toTimeStamp(stats.getCreateTime()), toTimeStamp(stats.getFirstStartTime()), toTimeStamp(taskInfo.getLastHeartbeat()), toTimeStamp(stats.getEndTime())); } return table.build().cursor(); }
@Override public void failed(Throwable cause) { try (SetThreadName ignored = new SetThreadName("ContinuousTaskInfoFetcher-%s", taskId)) { synchronized (this) { future = null; } try { // if task not already done, record error TaskInfo taskInfo = getTaskInfo(); if (!taskInfo.getState().isDone()) { getErrorTracker.requestFailed(cause); } } catch (Error e) { failTask(e); abort(); throw e; } catch (RuntimeException e) { failTask(e); abort(); } finally { // there is no back off here so we can get a lot of error messages when a server spins // down, but it typically goes away quickly because the queries get canceled scheduleNextRequest(); } } }
@Override public void failed(Throwable cause) { try (SetThreadName ignored = new SetThreadName("UpdateResponseHandler-%s", taskId)) { try { synchronized (HttpRemoteTask.this) { currentRequest = null; } // on failure assume we need to update again needsUpdate.set(true); // if task not already done, record error TaskInfo taskInfo = getTaskInfo(); if (!taskInfo.getState().isDone()) { updateErrorTracker.requestFailed(cause); } } catch (Error e) { failTask(e); abort(); throw e; } catch (RuntimeException e) { failTask(e); abort(); } finally { scheduleUpdate(); } } }
/** Move the task directly to the failed state */ private void failTask(Throwable cause) { TaskInfo taskInfo = getTaskInfo(); if (!taskInfo.getState().isDone()) { log.debug(cause, "Remote task failed: %s", taskInfo.getSelf()); } updateTaskInfo( new TaskInfo( taskInfo.getTaskId(), taskInfo.getNodeInstanceId(), TaskInfo.MAX_VERSION, TaskState.FAILED, taskInfo.getSelf(), taskInfo.getLastHeartbeat(), taskInfo.getOutputBuffers(), taskInfo.getNoMoreSplits(), taskInfo.getStats(), ImmutableList.of(toFailure(cause)))); }
private synchronized void updateTaskInfo(TaskInfo newValue, List<TaskSource> sources) { if (newValue.getState().isDone()) { // splits can be huge so clear the list pendingSplits.clear(); fireSplitCountChanged(-pendingSourceSplitCount); pendingSourceSplitCount = 0; } int oldPartitionedSplitCount = getPartitionedSplitCount(); // change to new value if old value is not changed and new value has a newer version AtomicBoolean workerRestarted = new AtomicBoolean(); boolean updated = taskInfo.setIf( newValue, oldValue -> { // did the worker restart if (oldValue.getNodeInstanceId().isPresent() && !oldValue.getNodeInstanceId().equals(newValue.getNodeInstanceId())) { workerRestarted.set(true); return false; } if (oldValue.getState().isDone()) { // never update if the task has reached a terminal state return false; } if (newValue.getVersion() < oldValue.getVersion()) { // don't update to an older version (same version is ok) return false; } return true; }); if (workerRestarted.get()) { PrestoException exception = new PrestoException( WORKER_RESTARTED, format("%s (%s)", WORKER_RESTARTED_ERROR, newValue.getSelf())); failTask(exception); abort(); } // remove acknowledged splits, which frees memory for (TaskSource source : sources) { PlanNodeId planNodeId = source.getPlanNodeId(); int removed = 0; for (ScheduledSplit split : source.getSplits()) { if (pendingSplits.remove(planNodeId, split)) { removed++; } } if (planNodeId.equals(planFragment.getPartitionedSource())) { pendingSourceSplitCount -= removed; } } if (updated) { if (getTaskInfo().getState().isDone()) { fireSplitCountChanged(-oldPartitionedSplitCount); } else { fireSplitCountChanged(getPartitionedSplitCount() - oldPartitionedSplitCount); } } }