Example #1
0
  /**
   * Shuts down the checkpoint coordinator.
   *
   * <p>After this method has been called, the coordinator does not accept and further messages and
   * cannot trigger any further checkpoints.
   */
  public void shutdown() throws Exception {
    synchronized (lock) {
      try {
        if (!shutdown) {
          shutdown = true;
          LOG.info("Stopping checkpoint coordinator for job " + job);

          periodicScheduling = false;
          triggerRequestQueued = false;

          // shut down the thread that handles the timeouts and pending triggers
          timer.cancel();

          // make sure that the actor does not linger
          if (jobStatusListener != null) {
            jobStatusListener.tell(PoisonPill.getInstance());
            jobStatusListener = null;
          }

          checkpointIdCounter.stop();

          // clear and discard all pending checkpoints
          for (PendingCheckpoint pending : pendingCheckpoints.values()) {
            pending.discard(userClassLoader);
          }
          pendingCheckpoints.clear();

          // clean and discard all successful checkpoints
          completedCheckpointStore.discardAllCheckpoints();

          onShutdown();
        }
      } finally {
        // Remove shutdown hook to prevent resource leaks, unless this is invoked by the
        // shutdown hook itself.
        if (shutdownHook != null && shutdownHook != Thread.currentThread()) {
          try {
            Runtime.getRuntime().removeShutdownHook(shutdownHook);
          } catch (IllegalStateException ignored) {
            // race, JVM is in shutdown already, we can safely ignore this
          } catch (Throwable t) {
            LOG.warn("Error unregistering checkpoint coordinator shutdown hook.", t);
          }
        }
      }
    }
  }
Example #2
0
 public List<CompletedCheckpoint> getSuccessfulCheckpoints() throws Exception {
   synchronized (lock) {
     return completedCheckpointStore.getAllCheckpoints();
   }
 }
Example #3
0
 public int getNumberOfRetainedSuccessfulCheckpoints() {
   synchronized (lock) {
     return completedCheckpointStore.getNumberOfRetainedCheckpoints();
   }
 }
Example #4
0
  public boolean restoreLatestCheckpointedState(
      Map<JobVertexID, ExecutionJobVertex> tasks,
      boolean errorIfNoCheckpoint,
      boolean allOrNothingState)
      throws Exception {

    synchronized (lock) {
      if (shutdown) {
        throw new IllegalStateException("CheckpointCoordinator is shut down");
      }

      // Recover the checkpoints
      completedCheckpointStore.recover();

      // restore from the latest checkpoint
      CompletedCheckpoint latest = completedCheckpointStore.getLatestCheckpoint();

      if (latest == null) {
        if (errorIfNoCheckpoint) {
          throw new IllegalStateException("No completed checkpoint available");
        } else {
          return false;
        }
      }

      long recoveryTimestamp = System.currentTimeMillis();

      for (Map.Entry<JobVertexID, TaskState> taskGroupStateEntry :
          latest.getTaskStates().entrySet()) {
        TaskState taskState = taskGroupStateEntry.getValue();
        ExecutionJobVertex executionJobVertex = tasks.get(taskGroupStateEntry.getKey());

        if (executionJobVertex != null) {
          // check that we only restore the state if the parallelism has not been changed
          if (taskState.getParallelism() != executionJobVertex.getParallelism()) {
            throw new RuntimeException(
                "Cannot restore the latest checkpoint because "
                    + "the parallelism changed. The operator"
                    + executionJobVertex.getJobVertexId()
                    + " has parallelism "
                    + executionJobVertex.getParallelism()
                    + " whereas the corresponding"
                    + "state object has a parallelism of "
                    + taskState.getParallelism());
          }

          int counter = 0;

          List<Set<Integer>> keyGroupPartitions =
              createKeyGroupPartitions(numberKeyGroups, executionJobVertex.getParallelism());

          for (int i = 0; i < executionJobVertex.getParallelism(); i++) {
            SubtaskState subtaskState = taskState.getState(i);
            SerializedValue<StateHandle<?>> state = null;

            if (subtaskState != null) {
              // count the number of executions for which we set a state
              counter++;
              state = subtaskState.getState();
            }

            Map<Integer, SerializedValue<StateHandle<?>>> kvStateForTaskMap =
                taskState.getUnwrappedKvStates(keyGroupPartitions.get(i));

            Execution currentExecutionAttempt =
                executionJobVertex.getTaskVertices()[i].getCurrentExecutionAttempt();
            currentExecutionAttempt.setInitialState(state, kvStateForTaskMap, recoveryTimestamp);
          }

          if (allOrNothingState && counter > 0 && counter < executionJobVertex.getParallelism()) {
            throw new IllegalStateException(
                "The checkpoint contained state only for "
                    + "a subset of tasks for vertex "
                    + executionJobVertex);
          }
        } else {
          throw new IllegalStateException(
              "There is no execution job vertex for the job"
                  + " vertex ID "
                  + taskGroupStateEntry.getKey());
        }
      }

      return true;
    }
  }
Example #5
0
  /**
   * Receives an AcknowledgeCheckpoint message and returns whether the message was associated with a
   * pending checkpoint.
   *
   * @param message Checkpoint ack from the task manager
   * @return Flag indicating whether the ack'd checkpoint was associated with a pending checkpoint.
   * @throws Exception If the checkpoint cannot be added to the completed checkpoint store.
   */
  public boolean receiveAcknowledgeMessage(AcknowledgeCheckpoint message) throws Exception {
    if (shutdown || message == null) {
      return false;
    }
    if (!job.equals(message.getJob())) {
      LOG.error("Received AcknowledgeCheckpoint message for wrong job: {}", message);
      return false;
    }

    final long checkpointId = message.getCheckpointId();

    CompletedCheckpoint completed = null;
    PendingCheckpoint checkpoint;

    // Flag indicating whether the ack message was for a known pending
    // checkpoint.
    boolean isPendingCheckpoint;

    synchronized (lock) {
      // we need to check inside the lock for being shutdown as well, otherwise we
      // get races and invalid error log messages
      if (shutdown) {
        return false;
      }

      checkpoint = pendingCheckpoints.get(checkpointId);

      if (checkpoint != null && !checkpoint.isDiscarded()) {
        isPendingCheckpoint = true;

        if (checkpoint.acknowledgeTask(
            message.getTaskExecutionId(),
            message.getState(),
            message.getStateSize(),
            null)) { // TODO: Give KV-state to the acknowledgeTask method
          if (checkpoint.isFullyAcknowledged()) {
            completed = checkpoint.toCompletedCheckpoint();

            completedCheckpointStore.addCheckpoint(completed);

            LOG.info(
                "Completed checkpoint "
                    + checkpointId
                    + " (in "
                    + completed.getDuration()
                    + " ms)");

            if (LOG.isDebugEnabled()) {
              StringBuilder builder = new StringBuilder();
              for (Map.Entry<JobVertexID, TaskState> entry : completed.getTaskStates().entrySet()) {
                builder
                    .append("JobVertexID: ")
                    .append(entry.getKey())
                    .append(" {")
                    .append(entry.getValue())
                    .append("}");
              }

              LOG.debug(builder.toString());
            }

            pendingCheckpoints.remove(checkpointId);
            rememberRecentCheckpointId(checkpointId);

            dropSubsumedCheckpoints(completed.getTimestamp());

            onFullyAcknowledgedCheckpoint(completed);

            triggerQueuedRequests();
          }
        } else {
          // checkpoint did not accept message
          LOG.error(
              "Received duplicate or invalid acknowledge message for checkpoint "
                  + checkpointId
                  + " , task "
                  + message.getTaskExecutionId());
        }
      } else if (checkpoint != null) {
        // this should not happen
        throw new IllegalStateException(
            "Received message for discarded but non-removed checkpoint " + checkpointId);
      } else {
        // message is for an unknown checkpoint, or comes too late (checkpoint disposed)
        if (recentPendingCheckpoints.contains(checkpointId)) {
          isPendingCheckpoint = true;
          LOG.warn("Received late message for now expired checkpoint attempt " + checkpointId);
        } else {
          isPendingCheckpoint = false;
        }
      }
    }

    // send the confirmation messages to the necessary targets. we do this here
    // to be outside the lock scope
    if (completed != null) {
      final long timestamp = completed.getTimestamp();

      for (ExecutionVertex ev : tasksToCommitTo) {
        Execution ee = ev.getCurrentExecutionAttempt();
        if (ee != null) {
          ExecutionAttemptID attemptId = ee.getAttemptId();
          NotifyCheckpointComplete notifyMessage =
              new NotifyCheckpointComplete(job, attemptId, checkpointId, timestamp);
          ev.sendMessageToCurrentExecution(notifyMessage, ee.getAttemptId());
        }
      }

      statsTracker.onCompletedCheckpoint(completed);
    }

    return isPendingCheckpoint;
  }
Example #6
0
  public void restoreLatestCheckpointedState(
      Map<JobVertexID, ExecutionJobVertex> tasks,
      boolean errorIfNoCheckpoint,
      boolean allOrNothingState)
      throws Exception {

    synchronized (lock) {
      if (shutdown) {
        throw new IllegalStateException("CheckpointCoordinator is shut down");
      }

      // Recover the checkpoints
      completedCheckpointStore.recover();

      // restore from the latest checkpoint
      CompletedCheckpoint latest = completedCheckpointStore.getLatestCheckpoint();

      if (latest == null) {
        if (errorIfNoCheckpoint) {
          throw new IllegalStateException("No completed checkpoint available");
        } else {
          return;
        }
      }

      long recoveryTimestamp = System.currentTimeMillis();

      if (allOrNothingState) {
        Map<ExecutionJobVertex, Integer> stateCounts = new HashMap<ExecutionJobVertex, Integer>();

        for (StateForTask state : latest.getStates()) {
          ExecutionJobVertex vertex = tasks.get(state.getOperatorId());
          Execution exec =
              vertex.getTaskVertices()[state.getSubtask()].getCurrentExecutionAttempt();
          exec.setInitialState(state.getState(), recoveryTimestamp);

          Integer count = stateCounts.get(vertex);
          if (count != null) {
            stateCounts.put(vertex, count + 1);
          } else {
            stateCounts.put(vertex, 1);
          }
        }

        // validate that either all task vertices have state, or none
        for (Map.Entry<ExecutionJobVertex, Integer> entry : stateCounts.entrySet()) {
          ExecutionJobVertex vertex = entry.getKey();
          if (entry.getValue() != vertex.getParallelism()) {
            throw new IllegalStateException(
                "The checkpoint contained state only for a subset of tasks for vertex " + vertex);
          }
        }
      } else {
        for (StateForTask state : latest.getStates()) {
          ExecutionJobVertex vertex = tasks.get(state.getOperatorId());
          Execution exec =
              vertex.getTaskVertices()[state.getSubtask()].getCurrentExecutionAttempt();
          exec.setInitialState(state.getState(), recoveryTimestamp);
        }
      }
    }
  }