/** * Shuts down the checkpoint coordinator. * * <p>After this method has been called, the coordinator does not accept and further messages and * cannot trigger any further checkpoints. */ public void shutdown() throws Exception { synchronized (lock) { try { if (!shutdown) { shutdown = true; LOG.info("Stopping checkpoint coordinator for job " + job); periodicScheduling = false; triggerRequestQueued = false; // shut down the thread that handles the timeouts and pending triggers timer.cancel(); // make sure that the actor does not linger if (jobStatusListener != null) { jobStatusListener.tell(PoisonPill.getInstance()); jobStatusListener = null; } checkpointIdCounter.stop(); // clear and discard all pending checkpoints for (PendingCheckpoint pending : pendingCheckpoints.values()) { pending.discard(userClassLoader); } pendingCheckpoints.clear(); // clean and discard all successful checkpoints completedCheckpointStore.discardAllCheckpoints(); onShutdown(); } } finally { // Remove shutdown hook to prevent resource leaks, unless this is invoked by the // shutdown hook itself. if (shutdownHook != null && shutdownHook != Thread.currentThread()) { try { Runtime.getRuntime().removeShutdownHook(shutdownHook); } catch (IllegalStateException ignored) { // race, JVM is in shutdown already, we can safely ignore this } catch (Throwable t) { LOG.warn("Error unregistering checkpoint coordinator shutdown hook.", t); } } } } }
/**
 * Returns the checkpoints reported by the completed checkpoint store.
 *
 * <p>The store is queried while holding the coordinator lock.
 *
 * @return the result of the store's {@code getAllCheckpoints()}
 * @throws Exception propagated from the completed checkpoint store
 */
public List<CompletedCheckpoint> getSuccessfulCheckpoints() throws Exception {
    synchronized (lock) {
        final List<CompletedCheckpoint> successfulCheckpoints =
                completedCheckpointStore.getAllCheckpoints();
        return successfulCheckpoints;
    }
}
/**
 * Returns the number of retained checkpoints reported by the completed checkpoint store.
 *
 * <p>The store is queried while holding the coordinator lock.
 *
 * @return the result of the store's {@code getNumberOfRetainedCheckpoints()}
 */
public int getNumberOfRetainedSuccessfulCheckpoints() {
    synchronized (lock) {
        final int retainedCheckpoints = completedCheckpointStore.getNumberOfRetainedCheckpoints();
        return retainedCheckpoints;
    }
}
public boolean restoreLatestCheckpointedState( Map<JobVertexID, ExecutionJobVertex> tasks, boolean errorIfNoCheckpoint, boolean allOrNothingState) throws Exception { synchronized (lock) { if (shutdown) { throw new IllegalStateException("CheckpointCoordinator is shut down"); } // Recover the checkpoints completedCheckpointStore.recover(); // restore from the latest checkpoint CompletedCheckpoint latest = completedCheckpointStore.getLatestCheckpoint(); if (latest == null) { if (errorIfNoCheckpoint) { throw new IllegalStateException("No completed checkpoint available"); } else { return false; } } long recoveryTimestamp = System.currentTimeMillis(); for (Map.Entry<JobVertexID, TaskState> taskGroupStateEntry : latest.getTaskStates().entrySet()) { TaskState taskState = taskGroupStateEntry.getValue(); ExecutionJobVertex executionJobVertex = tasks.get(taskGroupStateEntry.getKey()); if (executionJobVertex != null) { // check that we only restore the state if the parallelism has not been changed if (taskState.getParallelism() != executionJobVertex.getParallelism()) { throw new RuntimeException( "Cannot restore the latest checkpoint because " + "the parallelism changed. 
The operator" + executionJobVertex.getJobVertexId() + " has parallelism " + executionJobVertex.getParallelism() + " whereas the corresponding" + "state object has a parallelism of " + taskState.getParallelism()); } int counter = 0; List<Set<Integer>> keyGroupPartitions = createKeyGroupPartitions(numberKeyGroups, executionJobVertex.getParallelism()); for (int i = 0; i < executionJobVertex.getParallelism(); i++) { SubtaskState subtaskState = taskState.getState(i); SerializedValue<StateHandle<?>> state = null; if (subtaskState != null) { // count the number of executions for which we set a state counter++; state = subtaskState.getState(); } Map<Integer, SerializedValue<StateHandle<?>>> kvStateForTaskMap = taskState.getUnwrappedKvStates(keyGroupPartitions.get(i)); Execution currentExecutionAttempt = executionJobVertex.getTaskVertices()[i].getCurrentExecutionAttempt(); currentExecutionAttempt.setInitialState(state, kvStateForTaskMap, recoveryTimestamp); } if (allOrNothingState && counter > 0 && counter < executionJobVertex.getParallelism()) { throw new IllegalStateException( "The checkpoint contained state only for " + "a subset of tasks for vertex " + executionJobVertex); } } else { throw new IllegalStateException( "There is no execution job vertex for the job" + " vertex ID " + taskGroupStateEntry.getKey()); } } return true; } }
/** * Receives an AcknowledgeCheckpoint message and returns whether the message was associated with a * pending checkpoint. * * @param message Checkpoint ack from the task manager * @return Flag indicating whether the ack'd checkpoint was associated with a pending checkpoint. * @throws Exception If the checkpoint cannot be added to the completed checkpoint store. */ public boolean receiveAcknowledgeMessage(AcknowledgeCheckpoint message) throws Exception { if (shutdown || message == null) { return false; } if (!job.equals(message.getJob())) { LOG.error("Received AcknowledgeCheckpoint message for wrong job: {}", message); return false; } final long checkpointId = message.getCheckpointId(); CompletedCheckpoint completed = null; PendingCheckpoint checkpoint; // Flag indicating whether the ack message was for a known pending // checkpoint. boolean isPendingCheckpoint; synchronized (lock) { // we need to check inside the lock for being shutdown as well, otherwise we // get races and invalid error log messages if (shutdown) { return false; } checkpoint = pendingCheckpoints.get(checkpointId); if (checkpoint != null && !checkpoint.isDiscarded()) { isPendingCheckpoint = true; if (checkpoint.acknowledgeTask( message.getTaskExecutionId(), message.getState(), message.getStateSize(), null)) { // TODO: Give KV-state to the acknowledgeTask method if (checkpoint.isFullyAcknowledged()) { completed = checkpoint.toCompletedCheckpoint(); completedCheckpointStore.addCheckpoint(completed); LOG.info( "Completed checkpoint " + checkpointId + " (in " + completed.getDuration() + " ms)"); if (LOG.isDebugEnabled()) { StringBuilder builder = new StringBuilder(); for (Map.Entry<JobVertexID, TaskState> entry : completed.getTaskStates().entrySet()) { builder .append("JobVertexID: ") .append(entry.getKey()) .append(" {") .append(entry.getValue()) .append("}"); } LOG.debug(builder.toString()); } pendingCheckpoints.remove(checkpointId); rememberRecentCheckpointId(checkpointId); 
dropSubsumedCheckpoints(completed.getTimestamp()); onFullyAcknowledgedCheckpoint(completed); triggerQueuedRequests(); } } else { // checkpoint did not accept message LOG.error( "Received duplicate or invalid acknowledge message for checkpoint " + checkpointId + " , task " + message.getTaskExecutionId()); } } else if (checkpoint != null) { // this should not happen throw new IllegalStateException( "Received message for discarded but non-removed checkpoint " + checkpointId); } else { // message is for an unknown checkpoint, or comes too late (checkpoint disposed) if (recentPendingCheckpoints.contains(checkpointId)) { isPendingCheckpoint = true; LOG.warn("Received late message for now expired checkpoint attempt " + checkpointId); } else { isPendingCheckpoint = false; } } } // send the confirmation messages to the necessary targets. we do this here // to be outside the lock scope if (completed != null) { final long timestamp = completed.getTimestamp(); for (ExecutionVertex ev : tasksToCommitTo) { Execution ee = ev.getCurrentExecutionAttempt(); if (ee != null) { ExecutionAttemptID attemptId = ee.getAttemptId(); NotifyCheckpointComplete notifyMessage = new NotifyCheckpointComplete(job, attemptId, checkpointId, timestamp); ev.sendMessageToCurrentExecution(notifyMessage, ee.getAttemptId()); } } statsTracker.onCompletedCheckpoint(completed); } return isPendingCheckpoint; }
public void restoreLatestCheckpointedState( Map<JobVertexID, ExecutionJobVertex> tasks, boolean errorIfNoCheckpoint, boolean allOrNothingState) throws Exception { synchronized (lock) { if (shutdown) { throw new IllegalStateException("CheckpointCoordinator is shut down"); } // Recover the checkpoints completedCheckpointStore.recover(); // restore from the latest checkpoint CompletedCheckpoint latest = completedCheckpointStore.getLatestCheckpoint(); if (latest == null) { if (errorIfNoCheckpoint) { throw new IllegalStateException("No completed checkpoint available"); } else { return; } } long recoveryTimestamp = System.currentTimeMillis(); if (allOrNothingState) { Map<ExecutionJobVertex, Integer> stateCounts = new HashMap<ExecutionJobVertex, Integer>(); for (StateForTask state : latest.getStates()) { ExecutionJobVertex vertex = tasks.get(state.getOperatorId()); Execution exec = vertex.getTaskVertices()[state.getSubtask()].getCurrentExecutionAttempt(); exec.setInitialState(state.getState(), recoveryTimestamp); Integer count = stateCounts.get(vertex); if (count != null) { stateCounts.put(vertex, count + 1); } else { stateCounts.put(vertex, 1); } } // validate that either all task vertices have state, or none for (Map.Entry<ExecutionJobVertex, Integer> entry : stateCounts.entrySet()) { ExecutionJobVertex vertex = entry.getKey(); if (entry.getValue() != vertex.getParallelism()) { throw new IllegalStateException( "The checkpoint contained state only for a subset of tasks for vertex " + vertex); } } } else { for (StateForTask state : latest.getStates()) { ExecutionJobVertex vertex = tasks.get(state.getOperatorId()); Execution exec = vertex.getTaskVertices()[state.getSubtask()].getCurrentExecutionAttempt(); exec.setInitialState(state.getState(), recoveryTimestamp); } } } }