Example #1
0
  public boolean restoreLatestCheckpointedState(
      Map<JobVertexID, ExecutionJobVertex> tasks,
      boolean errorIfNoCheckpoint,
      boolean allOrNothingState)
      throws Exception {

    synchronized (lock) {
      if (shutdown) {
        throw new IllegalStateException("CheckpointCoordinator is shut down");
      }

      // Recover the checkpoints
      completedCheckpointStore.recover();

      // restore from the latest checkpoint
      CompletedCheckpoint latest = completedCheckpointStore.getLatestCheckpoint();

      if (latest == null) {
        if (errorIfNoCheckpoint) {
          throw new IllegalStateException("No completed checkpoint available");
        } else {
          return false;
        }
      }

      long recoveryTimestamp = System.currentTimeMillis();

      for (Map.Entry<JobVertexID, TaskState> taskGroupStateEntry :
          latest.getTaskStates().entrySet()) {
        TaskState taskState = taskGroupStateEntry.getValue();
        ExecutionJobVertex executionJobVertex = tasks.get(taskGroupStateEntry.getKey());

        if (executionJobVertex != null) {
          // check that we only restore the state if the parallelism has not been changed
          if (taskState.getParallelism() != executionJobVertex.getParallelism()) {
            throw new RuntimeException(
                "Cannot restore the latest checkpoint because "
                    + "the parallelism changed. The operator"
                    + executionJobVertex.getJobVertexId()
                    + " has parallelism "
                    + executionJobVertex.getParallelism()
                    + " whereas the corresponding"
                    + "state object has a parallelism of "
                    + taskState.getParallelism());
          }

          int counter = 0;

          List<Set<Integer>> keyGroupPartitions =
              createKeyGroupPartitions(numberKeyGroups, executionJobVertex.getParallelism());

          for (int i = 0; i < executionJobVertex.getParallelism(); i++) {
            SubtaskState subtaskState = taskState.getState(i);
            SerializedValue<StateHandle<?>> state = null;

            if (subtaskState != null) {
              // count the number of executions for which we set a state
              counter++;
              state = subtaskState.getState();
            }

            Map<Integer, SerializedValue<StateHandle<?>>> kvStateForTaskMap =
                taskState.getUnwrappedKvStates(keyGroupPartitions.get(i));

            Execution currentExecutionAttempt =
                executionJobVertex.getTaskVertices()[i].getCurrentExecutionAttempt();
            currentExecutionAttempt.setInitialState(state, kvStateForTaskMap, recoveryTimestamp);
          }

          if (allOrNothingState && counter > 0 && counter < executionJobVertex.getParallelism()) {
            throw new IllegalStateException(
                "The checkpoint contained state only for "
                    + "a subset of tasks for vertex "
                    + executionJobVertex);
          }
        } else {
          throw new IllegalStateException(
              "There is no execution job vertex for the job"
                  + " vertex ID "
                  + taskGroupStateEntry.getKey());
        }
      }

      return true;
    }
  }
Example #2
0
  public void restoreLatestCheckpointedState(
      Map<JobVertexID, ExecutionJobVertex> tasks,
      boolean errorIfNoCheckpoint,
      boolean allOrNothingState)
      throws Exception {

    synchronized (lock) {
      if (shutdown) {
        throw new IllegalStateException("CheckpointCoordinator is shut down");
      }

      // Recover the checkpoints
      completedCheckpointStore.recover();

      // restore from the latest checkpoint
      CompletedCheckpoint latest = completedCheckpointStore.getLatestCheckpoint();

      if (latest == null) {
        if (errorIfNoCheckpoint) {
          throw new IllegalStateException("No completed checkpoint available");
        } else {
          return;
        }
      }

      long recoveryTimestamp = System.currentTimeMillis();

      if (allOrNothingState) {
        Map<ExecutionJobVertex, Integer> stateCounts = new HashMap<ExecutionJobVertex, Integer>();

        for (StateForTask state : latest.getStates()) {
          ExecutionJobVertex vertex = tasks.get(state.getOperatorId());
          Execution exec =
              vertex.getTaskVertices()[state.getSubtask()].getCurrentExecutionAttempt();
          exec.setInitialState(state.getState(), recoveryTimestamp);

          Integer count = stateCounts.get(vertex);
          if (count != null) {
            stateCounts.put(vertex, count + 1);
          } else {
            stateCounts.put(vertex, 1);
          }
        }

        // validate that either all task vertices have state, or none
        for (Map.Entry<ExecutionJobVertex, Integer> entry : stateCounts.entrySet()) {
          ExecutionJobVertex vertex = entry.getKey();
          if (entry.getValue() != vertex.getParallelism()) {
            throw new IllegalStateException(
                "The checkpoint contained state only for a subset of tasks for vertex " + vertex);
          }
        }
      } else {
        for (StateForTask state : latest.getStates()) {
          ExecutionJobVertex vertex = tasks.get(state.getOperatorId());
          Execution exec =
              vertex.getTaskVertices()[state.getSubtask()].getCurrentExecutionAttempt();
          exec.setInitialState(state.getState(), recoveryTimestamp);
        }
      }
    }
  }