@Override public void failed(Throwable cause) { try (SetThreadName ignored = new SetThreadName("ContinuousTaskInfoFetcher-%s", taskId)) { synchronized (this) { future = null; } try { // if task not already done, record error TaskInfo taskInfo = getTaskInfo(); if (!taskInfo.getState().isDone()) { getErrorTracker.requestFailed(cause); } } catch (Error e) { failTask(e); abort(); throw e; } catch (RuntimeException e) { failTask(e); abort(); } finally { // there is no back off here so we can get a lot of error messages when a server spins // down, but it typically goes away quickly because the queries get canceled scheduleNextRequest(); } } }
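// The handler above delegates failure bookkeeping to an error tracker and then reschedules; the
// tracker is what eventually escalates persistent failures into the RuntimeException caught above.
// A minimal sketch of that idea, assuming a hypothetical tracker that only tracks how long
// consecutive failures have persisted (Presto's real RequestErrorTracker is richer than this):
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;

class SimpleErrorTracker
{
    private final long maxErrorNanos;                               // how long consecutive failures are tolerated
    private final AtomicLong firstFailureNanos = new AtomicLong(-1); // -1 means "no open failure window"

    SimpleErrorTracker(long maxErrorDuration, TimeUnit unit)
    {
        this.maxErrorNanos = unit.toNanos(maxErrorDuration);
    }

    // record a failed request; throw once failures have persisted past the budget
    void requestFailed(Throwable cause)
    {
        long now = System.nanoTime();
        firstFailureNanos.compareAndSet(-1, now);
        if (now - firstFailureNanos.get() > maxErrorNanos) {
            throw new RuntimeException("too many consecutive failures", cause);
        }
    }

    // record a successful request, closing the failure window
    void requestSucceeded()
    {
        firstFailureNanos.set(-1);
    }
}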
@Test(enabled = false)
public void testQuery()
        throws Exception
{
    URI location = client.execute(
            preparePost()
                    .setUri(uriFor("/v1/query"))
                    .setBodyGenerator(createStaticBodyGenerator("query", UTF_8))
                    .build(),
            new CreatedResponseHandler());
    assertQueryStatus(location, QueryState.RUNNING);

    QueryInfo queryInfo = client.execute(
            prepareGet().setUri(location).build(),
            createJsonResponseHandler(jsonCodec(QueryInfo.class)));
    TaskInfo taskInfo = queryInfo.getOutputStage().getTasks().get(0);
    URI outputLocation = uriFor("/v1/task/" + taskInfo.getTaskId() + "/results/out");

    long sequenceId = 0;
    PagesResponse response = client.execute(
            prepareGet()
                    .setUri(uriBuilderFrom(outputLocation).appendPath(String.valueOf(sequenceId)).build())
                    .build(),
            new PageResponseHandler());
    List<Page> pages = response.getPages();
    assertEquals(countPositions(pages), 220);
    assertQueryStatus(location, QueryState.RUNNING);

    sequenceId += pages.size();
    response = client.execute(
            prepareGet()
                    .setUri(uriBuilderFrom(outputLocation).appendPath(String.valueOf(sequenceId)).build())
                    .build(),
            new PageResponseHandler());
    pages = response.getPages();
    assertEquals(countPositions(pages), 44 + 48);

    sequenceId += pages.size();
    response = client.execute(
            prepareGet()
                    .setUri(uriBuilderFrom(outputLocation).appendPath(String.valueOf(sequenceId)).build())
                    .build(),
            new PageResponseHandler());
    pages = response.getPages();
    assertEquals(countPositions(pages), 0);
    assertQueryStatus(location, QueryState.FINISHED);

    // cancel the query
    StatusResponse cancelResponse = client.execute(
            prepareDelete().setUri(location).build(),
            createStatusResponseHandler());
    assertQueryStatus(location, QueryState.FINISHED);
    assertEquals(cancelResponse.getStatusCode(), HttpStatus.NO_CONTENT.code());
}
private synchronized void scheduleNextRequest()
{
    // stopped or done?
    TaskInfo taskInfo = HttpRemoteTask.this.taskInfo.get();
    if (!running || taskInfo.getState().isDone()) {
        return;
    }

    // outstanding request?
    if (future != null && !future.isDone()) {
        // this should never happen
        log.error("Can not reschedule update because an update is already running");
        return;
    }

    // if throttled due to error, asynchronously wait for timeout and try again
    ListenableFuture<?> errorRateLimit = getErrorTracker.acquireRequestPermit();
    if (!errorRateLimit.isDone()) {
        errorRateLimit.addListener(this::scheduleNextRequest, executor);
        return;
    }

    Request request = prepareGet()
            .setUri(uriBuilderFrom(taskInfo.getSelf()).addParameter("summarize").build())
            .setHeader(HttpHeaders.CONTENT_TYPE, MediaType.JSON_UTF_8.toString())
            .setHeader(PrestoHeaders.PRESTO_CURRENT_STATE, taskInfo.getState().toString())
            .setHeader(PrestoHeaders.PRESTO_MAX_WAIT, refreshMaxWait.toString())
            .build();

    getErrorTracker.startRequest();

    future = httpClient.executeAsync(request, createFullJsonResponseHandler(taskInfoCodec));
    Futures.addCallback(future, new SimpleHttpResponseHandler<>(this, request.getUri()), executor);
}
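// scheduleNextRequest() only issues a new poll once the error tracker hands out a permit; when the
// permit future is not yet done, the method registers itself as a listener and returns, so polling
// resumes automatically after the throttle window. A rough sketch of that shape, assuming a
// hypothetical delay-based permit built on CompletableFuture and a ScheduledExecutorService in
// place of Presto's tracker (illustration only, not thread-safe):
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

class PollThrottleSketch
{
    private final ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor();
    private volatile long backoffMillis;   // 0 means "no recent errors, poll immediately"; set on request failure

    // returns a future that completes when the caller is allowed to poll again
    CompletableFuture<Void> acquireRequestPermit()
    {
        long delay = backoffMillis;
        if (delay == 0) {
            return CompletableFuture.completedFuture(null);
        }
        backoffMillis = 0;  // one-shot: the next acquire after this delay proceeds immediately
        CompletableFuture<Void> permit = new CompletableFuture<>();
        scheduler.schedule(() -> permit.complete(null), delay, TimeUnit.MILLISECONDS);
        return permit;
    }

    void pollLoop()
    {
        CompletableFuture<Void> permit = acquireRequestPermit();
        if (!permit.isDone()) {
            // same trick as scheduleNextRequest(): re-enter once the permit fires
            permit.thenRun(this::pollLoop);
            return;
        }
        // ... issue the HTTP request here and raise backoffMillis if it fails ...
    }
}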
@Override public void failed(Throwable cause) { try (SetThreadName ignored = new SetThreadName("UpdateResponseHandler-%s", taskId)) { try { synchronized (HttpRemoteTask.this) { currentRequest = null; } // on failure assume we need to update again needsUpdate.set(true); // if task not already done, record error TaskInfo taskInfo = getTaskInfo(); if (!taskInfo.getState().isDone()) { updateErrorTracker.requestFailed(cause); } } catch (Error e) { failTask(e); abort(); throw e; } catch (RuntimeException e) { failTask(e); abort(); } finally { scheduleUpdate(); } } }
/**
 * Move the task directly to the failed state
 */
private void failTask(Throwable cause)
{
    TaskInfo taskInfo = getTaskInfo();
    if (!taskInfo.getState().isDone()) {
        log.debug(cause, "Remote task failed: %s", taskInfo.getSelf());
    }
    updateTaskInfo(new TaskInfo(
            taskInfo.getTaskId(),
            taskInfo.getNodeInstanceId(),
            TaskInfo.MAX_VERSION,
            TaskState.FAILED,
            taskInfo.getSelf(),
            taskInfo.getLastHeartbeat(),
            taskInfo.getOutputBuffers(),
            taskInfo.getNoMoreSplits(),
            taskInfo.getStats(),
            ImmutableList.of(toFailure(cause))));
}
@Override
public RecordCursor cursor()
{
    Builder table = InMemoryRecordSet.builder(TASK_TABLE);
    for (TaskInfo taskInfo : taskManager.getAllTaskInfo()) {
        TaskStats stats = taskInfo.getStats();
        table.addRow(
                nodeId,
                taskInfo.getTaskId().toString(),
                taskInfo.getTaskId().getStageId().toString(),
                taskInfo.getTaskId().getQueryId().toString(),
                taskInfo.getState().toString(),

                (long) stats.getTotalDrivers(),
                (long) stats.getQueuedDrivers(),
                (long) stats.getRunningDrivers(),
                (long) stats.getCompletedDrivers(),

                toMillis(stats.getTotalScheduledTime()),
                toMillis(stats.getTotalCpuTime()),
                toMillis(stats.getTotalUserTime()),
                toMillis(stats.getTotalBlockedTime()),

                toBytes(stats.getRawInputDataSize()),
                stats.getRawInputPositions(),

                toBytes(stats.getProcessedInputDataSize()),
                stats.getProcessedInputPositions(),

                toBytes(stats.getOutputDataSize()),
                stats.getOutputPositions(),

                toTimeStamp(stats.getCreateTime()),
                toTimeStamp(stats.getFirstStartTime()),
                toTimeStamp(taskInfo.getLastHeartbeat()),
                toTimeStamp(stats.getEndTime()));
    }
    return table.build().cursor();
}
@Override
public synchronized void abort()
{
    try (SetThreadName ignored = new SetThreadName("HttpRemoteTask-%s", taskId)) {
        // clear pending splits to free memory
        fireSplitCountChanged(-pendingSourceSplitCount);
        pendingSplits.clear();
        pendingSourceSplitCount = 0;

        // cancel pending request
        if (currentRequest != null) {
            currentRequest.cancel(true);
            currentRequest = null;
            currentRequestStartNanos = 0;
        }

        // mark task as canceled (if not already done)
        TaskInfo taskInfo = getTaskInfo();
        URI uri = taskInfo.getSelf();
        updateTaskInfo(new TaskInfo(
                taskInfo.getTaskId(),
                taskInfo.getNodeInstanceId(),
                TaskInfo.MAX_VERSION,
                TaskState.ABORTED,
                uri,
                taskInfo.getLastHeartbeat(),
                taskInfo.getOutputBuffers(),
                taskInfo.getNoMoreSplits(),
                taskInfo.getStats(),
                ImmutableList.<ExecutionFailureInfo>of()));

        // send abort to task and ignore response
        Request request = prepareDelete()
                .setUri(uriBuilderFrom(uri).addParameter("summarize").build())
                .build();
        scheduleAsyncCleanupRequest(new Backoff(MAX_CLEANUP_RETRY_TIME), request, "abort");
    }
}
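// abort() hands the DELETE to scheduleAsyncCleanupRequest together with a Backoff capped at
// MAX_CLEANUP_RETRY_TIME, i.e. "keep retrying the cleanup with growing delays until it succeeds
// or the budget runs out". A sketch of that retry loop under those assumptions, with a
// hypothetical attemptDelete runnable standing in for the real HTTP call:
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

class CleanupRetrySketch
{
    private final ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor();

    void cleanupWithRetry(Runnable attemptDelete, long maxRetryMillis)
    {
        long deadline = System.currentTimeMillis() + maxRetryMillis;
        retry(attemptDelete, deadline, 100);
    }

    private void retry(Runnable attemptDelete, long deadline, long delayMillis)
    {
        try {
            attemptDelete.run();            // succeeded: nothing more to do
        }
        catch (RuntimeException e) {
            if (System.currentTimeMillis() >= deadline) {
                return;                     // retry budget exhausted: give up
            }
            // retry later with an exponentially growing, capped delay
            long nextDelay = Math.min(delayMillis * 2, 10_000);
            scheduler.schedule(() -> retry(attemptDelete, deadline, nextDelay), delayMillis, TimeUnit.MILLISECONDS);
        }
    }
}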
private void logQueryTimeline(QueryInfo queryInfo)
{
    try {
        QueryStats queryStats = queryInfo.getQueryStats();
        DateTime queryStartTime = queryStats.getCreateTime();
        DateTime queryEndTime = queryStats.getEndTime();

        // query didn't finish cleanly
        if (queryStartTime == null || queryEndTime == null) {
            return;
        }

        // planning duration -- start to end of planning
        Duration planning = queryStats.getTotalPlanningTime();
        if (planning == null) {
            planning = new Duration(0, MILLISECONDS);
        }

        List<StageInfo> stages = StageInfo.getAllStages(queryInfo.getOutputStage());
        // long lastSchedulingCompletion = 0;
        long firstTaskStartTime = queryEndTime.getMillis();
        long lastTaskStartTime = queryStartTime.getMillis() + planning.toMillis();
        long lastTaskEndTime = queryStartTime.getMillis() + planning.toMillis();
        for (StageInfo stage : stages) {
            // only consider leaf stages
            if (!stage.getSubStages().isEmpty()) {
                continue;
            }

            for (TaskInfo taskInfo : stage.getTasks()) {
                TaskStats taskStats = taskInfo.getStats();

                DateTime firstStartTime = taskStats.getFirstStartTime();
                if (firstStartTime != null) {
                    firstTaskStartTime = Math.min(firstStartTime.getMillis(), firstTaskStartTime);
                }

                DateTime lastStartTime = taskStats.getLastStartTime();
                if (lastStartTime != null) {
                    lastTaskStartTime = Math.max(lastStartTime.getMillis(), lastTaskStartTime);
                }

                DateTime endTime = taskStats.getEndTime();
                if (endTime != null) {
                    lastTaskEndTime = Math.max(endTime.getMillis(), lastTaskEndTime);
                }
            }
        }

        Duration elapsed = millis(queryEndTime.getMillis() - queryStartTime.getMillis());
        Duration scheduling = millis(firstTaskStartTime - queryStartTime.getMillis() - planning.toMillis());
        Duration running = millis(lastTaskEndTime - firstTaskStartTime);
        Duration finishing = millis(queryEndTime.getMillis() - lastTaskEndTime);

        log.info("TIMELINE: Query %s :: elapsed %s :: planning %s :: scheduling %s :: running %s :: finishing %s :: begin %s :: end %s",
                queryInfo.getQueryId(),
                elapsed,
                planning,
                scheduling,
                running,
                finishing,
                queryStartTime,
                queryEndTime);
    }
    catch (Exception e) {
        log.error(e, "Error logging query timeline");
    }
}
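// The four reported durations partition the query's wall-clock time:
//   elapsed    = end - start
//   scheduling = firstTaskStart - start - planning
//   running    = lastTaskEnd - firstTaskStart
//   finishing  = end - lastTaskEnd
// A tiny worked example with made-up millisecond timestamps, just to show how the pieces add up:
class TimelineArithmetic
{
    public static void main(String[] args)
    {
        long queryStart = 0;
        long planningMillis = 200;
        long firstTaskStart = 500;     // first leaf task started 500 ms in
        long lastTaskEnd = 4_500;      // last leaf task finished at 4.5 s
        long queryEnd = 5_000;         // coordinator declared the query done at 5 s

        long elapsed = queryEnd - queryStart;                            // 5000
        long scheduling = firstTaskStart - queryStart - planningMillis;  // 300
        long running = lastTaskEnd - firstTaskStart;                     // 4000
        long finishing = queryEnd - lastTaskEnd;                         // 500

        // planning + scheduling + running + finishing == elapsed (200 + 300 + 4000 + 500 = 5000)
        System.out.printf("elapsed=%d planning=%d scheduling=%d running=%d finishing=%d%n",
                elapsed, planningMillis, scheduling, running, finishing);
    }
}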
private synchronized void updateTaskInfo(TaskInfo newValue, List<TaskSource> sources) { if (newValue.getState().isDone()) { // splits can be huge so clear the list pendingSplits.clear(); fireSplitCountChanged(-pendingSourceSplitCount); pendingSourceSplitCount = 0; } int oldPartitionedSplitCount = getPartitionedSplitCount(); // change to new value if old value is not changed and new value has a newer version AtomicBoolean workerRestarted = new AtomicBoolean(); boolean updated = taskInfo.setIf( newValue, oldValue -> { // did the worker restart if (oldValue.getNodeInstanceId().isPresent() && !oldValue.getNodeInstanceId().equals(newValue.getNodeInstanceId())) { workerRestarted.set(true); return false; } if (oldValue.getState().isDone()) { // never update if the task has reached a terminal state return false; } if (newValue.getVersion() < oldValue.getVersion()) { // don't update to an older version (same version is ok) return false; } return true; }); if (workerRestarted.get()) { PrestoException exception = new PrestoException( WORKER_RESTARTED, format("%s (%s)", WORKER_RESTARTED_ERROR, newValue.getSelf())); failTask(exception); abort(); } // remove acknowledged splits, which frees memory for (TaskSource source : sources) { PlanNodeId planNodeId = source.getPlanNodeId(); int removed = 0; for (ScheduledSplit split : source.getSplits()) { if (pendingSplits.remove(planNodeId, split)) { removed++; } } if (planNodeId.equals(planFragment.getPartitionedSource())) { pendingSourceSplitCount -= removed; } } if (updated) { if (getTaskInfo().getState().isDone()) { fireSplitCountChanged(-oldPartitionedSplitCount); } else { fireSplitCountChanged(getPartitionedSplitCount() - oldPartitionedSplitCount); } } }