void markFinished( Map<AccumulatorRegistry.Metric, Accumulator<?, ?>> flinkAccumulators, Map<String, Accumulator<?, ?>> userAccumulators) { // this call usually comes during RUNNING, but may also come while still in deploying (very fast // tasks!) while (true) { ExecutionState current = this.state; if (current == RUNNING || current == DEPLOYING) { if (transitionState(current, FINISHED)) { try { for (IntermediateResultPartition finishedPartition : getVertex().finishAllBlockingPartitions()) { IntermediateResultPartition[] allPartitions = finishedPartition.getIntermediateResult().getPartitions(); for (IntermediateResultPartition partition : allPartitions) { scheduleOrUpdateConsumers(partition.getConsumers()); } } synchronized (accumulatorLock) { this.flinkAccumulators = flinkAccumulators; this.userAccumulators = userAccumulators; } assignedResource.releaseSlot(); vertex.getExecutionGraph().deregisterExecution(this); } finally { vertex.executionFinished(); } return; } } else if (current == CANCELING) { // we sent a cancel call, and the task manager finished before it arrived. We // will never get a CANCELED call back from the job manager cancelingComplete(); return; } else if (current == CANCELED || current == FAILED) { if (LOG.isDebugEnabled()) { LOG.debug("Task FINISHED, but concurrently went to state " + state); } return; } else { // this should not happen, we need to fail this markFailed(new Exception("Vertex received FINISHED message while being in state " + state)); return; } } }
void cancelingComplete() { // the taskmanagers can themselves cancel tasks without an external trigger, if they find that // the // network stack is canceled (for example by a failing / canceling receiver or sender // this is an artifact of the old network runtime, but for now we need to support task // transitions // from running directly to canceled while (true) { ExecutionState current = this.state; if (current == CANCELED) { return; } else if (current == CANCELING || current == RUNNING || current == DEPLOYING) { if (transitionState(current, CANCELED)) { try { assignedResource.releaseSlot(); vertex.getExecutionGraph().deregisterExecution(this); } finally { vertex.executionCanceled(); } return; } // else fall through the loop } else { // failing in the meantime may happen and is no problem. // anything else is a serious problem !!! if (current != FAILED) { String message = String.format( "Asynchronous race: Found state %s after successful cancel call.", state); LOG.error(message); vertex.getExecutionGraph().fail(new Exception(message)); } return; } } }
public void cancel() { // depending on the previous state, we go directly to cancelled (no cancel call necessary) // -- or to canceling (cancel call needs to be sent to the task manager) // because of several possibly previous states, we need to again loop until we make a // successful atomic state transition while (true) { ExecutionState current = this.state; if (current == CANCELING || current == CANCELED) { // already taken care of, no need to cancel again return; } // these two are the common cases where we need to send a cancel call else if (current == RUNNING || current == DEPLOYING) { // try to transition to canceling, if successful, send the cancel call if (transitionState(current, CANCELING)) { sendCancelRpcCall(); return; } // else: fall through the loop } else if (current == FINISHED || current == FAILED) { // nothing to do any more. finished failed before it could be cancelled. // in any case, the task is removed from the TaskManager already sendFailIntermediateResultPartitionsRpcCall(); return; } else if (current == CREATED || current == SCHEDULED) { // from here, we can directly switch to cancelled, because the no task has been deployed if (transitionState(current, CANCELED)) { // we skip the canceling state. set the timestamp, for a consistent appearance markTimestamp(CANCELING, getStateTimestamp(CANCELED)); try { vertex.getExecutionGraph().deregisterExecution(this); if (assignedResource != null) { assignedResource.releaseSlot(); } } finally { vertex.executionCanceled(); } return; } // else: fall through the loop } else { throw new IllegalStateException(current.name()); } } }
private boolean processFail(Throwable t, boolean isCallback) { // damn, we failed. This means only that we keep our books and notify our parent // JobExecutionVertex // the actual computation on the task manager is cleaned up by the TaskManager that noticed the // failure // we may need to loop multiple times (in the presence of concurrent calls) in order to // atomically switch to failed while (true) { ExecutionState current = this.state; if (current == FAILED) { // already failed. It is enough to remember once that we failed (its sad enough) return false; } if (current == CANCELED) { // we are already aborting or are already aborted if (LOG.isDebugEnabled()) { LOG.debug( String.format( "Ignoring transition of vertex %s to %s while being %s", getVertexWithAttempt(), FAILED, CANCELED)); } return false; } if (transitionState(current, FAILED, t)) { // success (in a manner of speaking) this.failureCause = t; try { if (assignedResource != null) { assignedResource.releaseSlot(); } vertex.getExecutionGraph().deregisterExecution(this); } finally { vertex.executionFailed(t); } if (!isCallback && (current == RUNNING || current == DEPLOYING)) { if (LOG.isDebugEnabled()) { LOG.debug("Sending out cancel request, to remove task execution from TaskManager."); } try { if (assignedResource != null) { sendCancelRpcCall(); } } catch (Throwable tt) { // no reason this should ever happen, but log it to be safe LOG.error("Error triggering cancel call while marking task as failed.", tt); } } // leave the loop return true; } } }
public void deployToSlot(final SimpleSlot slot) throws JobException { // sanity checks if (slot == null) { throw new NullPointerException(); } if (!slot.isAlive()) { throw new JobException("Target slot for deployment is not alive."); } // make sure exactly one deployment call happens from the correct state // note: the transition from CREATED to DEPLOYING is for testing purposes only ExecutionState previous = this.state; if (previous == SCHEDULED || previous == CREATED) { if (!transitionState(previous, DEPLOYING)) { // race condition, someone else beat us to the deploying call. // this should actually not happen and indicates a race somewhere else throw new IllegalStateException("Cannot deploy task: Concurrent deployment call race."); } } else { // vertex may have been cancelled, or it was already scheduled throw new IllegalStateException( "The vertex must be in CREATED or SCHEDULED state to be deployed. Found state " + previous); } try { // good, we are allowed to deploy if (!slot.setExecutedVertex(this)) { throw new JobException("Could not assign the ExecutionVertex to the slot " + slot); } this.assignedResource = slot; this.assignedResourceLocation = slot.getInstance().getInstanceConnectionInfo(); // race double check, did we fail/cancel and do we need to release the slot? if (this.state != DEPLOYING) { slot.releaseSlot(); return; } if (LOG.isInfoEnabled()) { LOG.info( String.format( "Deploying %s (attempt #%d) to %s", vertex.getSimpleName(), attemptNumber, slot.getInstance().getInstanceConnectionInfo().getHostname())); } final TaskDeploymentDescriptor deployment = vertex.createDeploymentDescriptor( attemptId, slot, operatorState, recoveryTimestamp, attemptNumber); // register this execution at the execution graph, to receive call backs vertex.getExecutionGraph().registerExecution(this); final Instance instance = slot.getInstance(); final ActorGateway gateway = instance.getActorGateway(); final Future<Object> deployAction = gateway.ask(new SubmitTask(deployment), timeout); deployAction.onComplete( new OnComplete<Object>() { @Override public void onComplete(Throwable failure, Object success) throws Throwable { if (failure != null) { if (failure instanceof TimeoutException) { String taskname = deployment.getTaskInfo().getTaskNameWithSubtasks() + " (" + attemptId + ')'; markFailed( new Exception( "Cannot deploy task " + taskname + " - TaskManager (" + instance + ") not responding after a timeout of " + timeout, failure)); } else { markFailed(failure); } } else { if (!(success.equals(Messages.getAcknowledge()))) { markFailed( new Exception( "Failed to deploy the task to slot " + slot + ": Response was not of type Acknowledge")); } } } }, executionContext); } catch (Throwable t) { markFailed(t); ExceptionUtils.rethrow(t); } }