예제 #1
0
  /**
   * Handles the notification that the task of this execution finished successfully.
   *
   * <p>The method loops until it either performs the atomic transition to {@code FINISHED} or
   * observes a state in which no such transition is possible:
   *
   * <ul>
   *   <li>{@code RUNNING} / {@code DEPLOYING}: switch to {@code FINISHED}, pass the consumers of
   *       the affected intermediate results to {@code scheduleOrUpdateConsumers}, store the
   *       accumulators, release the slot and deregister from the execution graph.
   *   <li>{@code CANCELING}: delegate to {@link #cancelingComplete()}.
   *   <li>{@code CANCELED} / {@code FAILED}: nothing to do besides a debug log.
   *   <li>any other state: the execution is failed via {@code markFailed}.
   * </ul>
   *
   * <p>All diagnostics report the state snapshot the decision was based on, not a fresh read of
   * the {@code state} field, so the message always matches the branch that was taken.
   *
   * @param flinkAccumulators accumulators keyed by {@code AccumulatorRegistry.Metric}; stored on
   *     this execution under {@code accumulatorLock}
   * @param userAccumulators accumulators keyed by name; stored on this execution under {@code
   *     accumulatorLock}
   */
  void markFinished(
      Map<AccumulatorRegistry.Metric, Accumulator<?, ?>> flinkAccumulators,
      Map<String, Accumulator<?, ?>> userAccumulators) {

    // this call usually comes during RUNNING, but may also come while still in DEPLOYING
    // (very fast tasks!)
    while (true) {
      ExecutionState current = this.state;

      if (current == RUNNING || current == DEPLOYING) {

        if (transitionState(current, FINISHED)) {
          try {
            for (IntermediateResultPartition finishedPartition :
                getVertex().finishAllBlockingPartitions()) {

              IntermediateResultPartition[] allPartitions =
                  finishedPartition.getIntermediateResult().getPartitions();

              // hand the consumers of every partition of that result to the scheduler hook
              for (IntermediateResultPartition partition : allPartitions) {
                scheduleOrUpdateConsumers(partition.getConsumers());
              }
            }

            synchronized (accumulatorLock) {
              this.flinkAccumulators = flinkAccumulators;
              this.userAccumulators = userAccumulators;
            }

            assignedResource.releaseSlot();
            vertex.getExecutionGraph().deregisterExecution(this);
          } finally {
            // the vertex is notified even if the bookkeeping above threw
            vertex.executionFinished();
          }
          return;
        }
        // else: the state changed concurrently, re-read it and try again
      } else if (current == CANCELING) {
        // we sent a cancel call, and the task manager finished before it arrived. We
        // will never get a CANCELED call back, so complete the cancellation here
        cancelingComplete();
        return;
      } else if (current == CANCELED || current == FAILED) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Task FINISHED, but concurrently went to state " + current);
        }
        return;
      } else {
        // this should not happen, we need to fail this
        markFailed(
            new Exception("Vertex received FINISHED message while being in state " + current));
        return;
      }
    }
  }
예제 #2
0
  /**
   * Completes the cancellation of this execution by switching it to {@code CANCELED}.
   *
   * <p>The method loops until the atomic transition succeeds or a state is observed that makes
   * the transition unnecessary or impossible:
   *
   * <ul>
   *   <li>{@code CANCELED}: already done, return.
   *   <li>{@code CANCELING} / {@code RUNNING} / {@code DEPLOYING}: switch to {@code CANCELED},
   *       release the slot, deregister from the execution graph and notify the vertex.
   *   <li>{@code FAILED}: tolerated silently.
   *   <li>any other state: logged as an error and the execution graph is failed.
   * </ul>
   *
   * <p>The error message reports the state snapshot that was evaluated, not a fresh read of the
   * {@code state} field, so it cannot name a state (e.g. {@code FAILED}) that this branch would
   * never have been entered for.
   */
  void cancelingComplete() {

    // The task managers can themselves cancel tasks without an external trigger, if they find
    // that the network stack is canceled (for example by a failing / canceling receiver or
    // sender). This is an artifact of the old network runtime, but for now we need to support
    // task transitions from RUNNING directly to CANCELED.

    while (true) {
      ExecutionState current = this.state;

      if (current == CANCELED) {
        return;
      } else if (current == CANCELING || current == RUNNING || current == DEPLOYING) {
        if (transitionState(current, CANCELED)) {
          try {
            assignedResource.releaseSlot();
            vertex.getExecutionGraph().deregisterExecution(this);
          } finally {
            // the vertex is notified even if the bookkeeping above threw
            vertex.executionCanceled();
          }
          return;
        }

        // else: the state changed concurrently, fall through the loop and re-read it
      } else {
        // failing in the meantime may happen and is no problem.
        // anything else is a serious problem !!!
        if (current != FAILED) {
          String message =
              String.format(
                  "Asynchronous race: Found state %s after successful cancel call.", current);
          LOG.error(message);
          vertex.getExecutionGraph().fail(new Exception(message));
        }
        return;
      }
    }
  }
예제 #3
0
  /**
   * Requests cancellation of this execution.
   *
   * <p>What happens depends on the state observed at the time of the call:
   *
   * <ul>
   *   <li>{@code CANCELING} / {@code CANCELED}: nothing, cancellation is already under way.
   *   <li>{@code RUNNING} / {@code DEPLOYING}: switch to {@code CANCELING} and send the cancel
   *       call.
   *   <li>{@code FINISHED} / {@code FAILED}: no state change; only {@code
   *       sendFailIntermediateResultPartitionsRpcCall()} is invoked.
   *   <li>{@code CREATED} / {@code SCHEDULED}: switch straight to {@code CANCELED}, deregister
   *       from the execution graph, release the slot if one is assigned and notify the vertex.
   * </ul>
   *
   * <p>Because other threads may change the state concurrently, the method retries until one
   * atomic transition succeeds or a state that needs no transition is seen.
   *
   * @throws IllegalStateException if the observed state is none of the states listed above
   */
  public void cancel() {
    for (;;) {
      final ExecutionState observed = this.state;

      switch (observed) {
        case CANCELING:
        case CANCELED:
          // somebody else already took care of the cancellation
          return;

        case RUNNING:
        case DEPLOYING:
          // the common cases: a cancel call has to go out once we own the transition
          if (transitionState(observed, CANCELING)) {
            sendCancelRpcCall();
            return;
          }
          // lost the race for the transition: re-read the state
          break;

        case FINISHED:
        case FAILED:
          // the execution reached a terminal state before it could be cancelled, so no
          // state change is made here
          sendFailIntermediateResultPartitionsRpcCall();
          return;

        case CREATED:
        case SCHEDULED:
          // no task has been deployed yet, so CANCELING can be skipped entirely
          if (transitionState(observed, CANCELED)) {

            // back-fill the CANCELING timestamp so the recorded history looks consistent
            markTimestamp(CANCELING, getStateTimestamp(CANCELED));

            try {
              vertex.getExecutionGraph().deregisterExecution(this);
              if (assignedResource != null) {
                assignedResource.releaseSlot();
              }
            } finally {
              vertex.executionCanceled();
            }
            return;
          }
          // lost the race for the transition: re-read the state
          break;

        default:
          throw new IllegalStateException(observed.name());
      }
    }
  }
예제 #4
0
  /**
   * Tries to switch this execution to {@code FAILED} and performs the associated bookkeeping.
   *
   * <p>Only the local books are updated and the vertex is notified here. When the failure did not
   * originate from a callback and the execution was {@code RUNNING} or {@code DEPLOYING}, a
   * cancel call is additionally sent, provided a slot is assigned.
   *
   * @param t the cause of the failure; recorded as the failure cause and passed on to the vertex
   * @param isCallback if {@code true}, no cancel call is sent after the transition
   * @return {@code true} if this call performed the transition to {@code FAILED}; {@code false}
   *     if the execution was already {@code FAILED} or {@code CANCELED}
   */
  private boolean processFail(Throwable t, boolean isCallback) {

    // Concurrent callers may change the state between our read and our transition attempt, so
    // keep retrying until the switch to FAILED succeeds or turns out to be unnecessary.
    for (;;) {
      final ExecutionState observed = this.state;

      if (observed == FAILED) {
        // the failure has been recorded before; once is enough
        return false;
      }

      if (observed == CANCELED) {
        // a canceled execution is not turned into a failed one
        if (LOG.isDebugEnabled()) {
          LOG.debug(
              String.format(
                  "Ignoring transition of vertex %s to %s while being %s",
                  getVertexWithAttempt(), FAILED, CANCELED));
        }
        return false;
      }

      if (!transitionState(observed, FAILED, t)) {
        // somebody changed the state in between: start over with a fresh read
        continue;
      }

      // from here on this call owns the transition to FAILED
      this.failureCause = t;

      try {
        if (assignedResource != null) {
          assignedResource.releaseSlot();
        }
        vertex.getExecutionGraph().deregisterExecution(this);
      } finally {
        vertex.executionFailed(t);
      }

      final boolean wasDeployedOrRunning = observed == RUNNING || observed == DEPLOYING;
      if (wasDeployedOrRunning && !isCallback) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Sending out cancel request, to remove task execution from TaskManager.");
        }

        try {
          if (assignedResource != null) {
            sendCancelRpcCall();
          }
        } catch (Throwable tt) {
          // not expected to happen; logged so that it never masks the original failure
          LOG.error("Error triggering cancel call while marking task as failed.", tt);
        }
      }

      return true;
    }
  }
예제 #5
0
  /**
   * Deploys this execution to the given slot by submitting a {@code SubmitTask} message to the
   * actor gateway of the slot's instance.
   *
   * <p>The execution is first switched to {@code DEPLOYING}, then bound to the slot, registered
   * at the execution graph, and finally the deployment descriptor is sent asynchronously. The
   * outcome of the submission is handled in a completion callback, which marks the execution as
   * failed when the request fails, times out, or is answered with anything other than the
   * acknowledge message.
   *
   * <p>Any throwable raised after the switch to {@code DEPLOYING} is passed to {@code markFailed}
   * and then handed to {@code ExceptionUtils.rethrow} (presumably re-thrown to the caller).
   *
   * @param slot the slot to deploy to; must be non-null and alive
   * @throws NullPointerException if {@code slot} is {@code null}
   * @throws JobException if the slot is not alive, or if the slot refuses to take this execution
   * @throws IllegalStateException if the execution is not in {@code CREATED} or {@code SCHEDULED}
   *     state, or if the transition to {@code DEPLOYING} is lost to a concurrent call
   */
  public void deployToSlot(final SimpleSlot slot) throws JobException {
    // sanity checks
    if (slot == null) {
      throw new NullPointerException();
    }
    if (!slot.isAlive()) {
      throw new JobException("Target slot for deployment is not alive.");
    }

    // make sure exactly one deployment call happens from the correct state
    // note: the transition from CREATED to DEPLOYING is for testing purposes only
    ExecutionState previous = this.state;
    if (previous == SCHEDULED || previous == CREATED) {
      if (!transitionState(previous, DEPLOYING)) {
        // race condition, someone else beat us to the deploying call.
        // this should actually not happen and indicates a race somewhere else
        throw new IllegalStateException("Cannot deploy task: Concurrent deployment call race.");
      }
    } else {
      // vertex may have been cancelled, or it was already scheduled
      throw new IllegalStateException(
          "The vertex must be in CREATED or SCHEDULED state to be deployed. Found state "
              + previous);
    }

    // from here on the execution is in DEPLOYING; every error below is routed through
    // markFailed(...) by the catch block at the end
    try {
      // good, we are allowed to deploy
      if (!slot.setExecutedVertex(this)) {
        throw new JobException("Could not assign the ExecutionVertex to the slot " + slot);
      }
      this.assignedResource = slot;
      this.assignedResourceLocation = slot.getInstance().getInstanceConnectionInfo();

      // race double check, did we fail/cancel and do we need to release the slot?
      // (the state may have changed between the transition above and the slot assignment)
      if (this.state != DEPLOYING) {
        slot.releaseSlot();
        return;
      }

      if (LOG.isInfoEnabled()) {
        LOG.info(
            String.format(
                "Deploying %s (attempt #%d) to %s",
                vertex.getSimpleName(),
                attemptNumber,
                slot.getInstance().getInstanceConnectionInfo().getHostname()));
      }

      final TaskDeploymentDescriptor deployment =
          vertex.createDeploymentDescriptor(
              attemptId, slot, operatorState, recoveryTimestamp, attemptNumber);

      // register this execution at the execution graph, to receive call backs
      // (done before the submit message is sent)
      vertex.getExecutionGraph().registerExecution(this);

      final Instance instance = slot.getInstance();
      final ActorGateway gateway = instance.getActorGateway();

      // asynchronous request; the result is only inspected in the callback below
      final Future<Object> deployAction = gateway.ask(new SubmitTask(deployment), timeout);

      deployAction.onComplete(
          new OnComplete<Object>() {

            @Override
            public void onComplete(Throwable failure, Object success) throws Throwable {
              if (failure != null) {
                if (failure instanceof TimeoutException) {
                  // wrap the timeout in a message that names the task and the instance
                  String taskname =
                      deployment.getTaskInfo().getTaskNameWithSubtasks() + " (" + attemptId + ')';

                  markFailed(
                      new Exception(
                          "Cannot deploy task "
                              + taskname
                              + " - TaskManager ("
                              + instance
                              + ") not responding after a timeout of "
                              + timeout,
                          failure));
                } else {
                  // any other failure is reported as-is
                  markFailed(failure);
                }
              } else {
                // a reply arrived: anything but the acknowledge message counts as a failure
                if (!(success.equals(Messages.getAcknowledge()))) {
                  markFailed(
                      new Exception(
                          "Failed to deploy the task to slot "
                              + slot
                              + ": Response was not of type Acknowledge"));
                }
              }
            }
          },
          executionContext);
    } catch (Throwable t) {
      // record the failure on this execution first, then hand the throwable to rethrow
      markFailed(t);
      ExceptionUtils.rethrow(t);
    }
  }