/** * Receives an AcknowledgeCheckpoint message and returns whether the message was associated with a * pending checkpoint. * * @param message Checkpoint ack from the task manager * @return Flag indicating whether the ack'd checkpoint was associated with a pending checkpoint. * @throws Exception If the checkpoint cannot be added to the completed checkpoint store. */ public boolean receiveAcknowledgeMessage(AcknowledgeCheckpoint message) throws Exception { if (shutdown || message == null) { return false; } if (!job.equals(message.getJob())) { LOG.error("Received AcknowledgeCheckpoint message for wrong job: {}", message); return false; } final long checkpointId = message.getCheckpointId(); CompletedCheckpoint completed = null; PendingCheckpoint checkpoint; // Flag indicating whether the ack message was for a known pending // checkpoint. boolean isPendingCheckpoint; synchronized (lock) { // we need to check inside the lock for being shutdown as well, otherwise we // get races and invalid error log messages if (shutdown) { return false; } checkpoint = pendingCheckpoints.get(checkpointId); if (checkpoint != null && !checkpoint.isDiscarded()) { isPendingCheckpoint = true; if (checkpoint.acknowledgeTask( message.getTaskExecutionId(), message.getState(), message.getStateSize(), null)) { // TODO: Give KV-state to the acknowledgeTask method if (checkpoint.isFullyAcknowledged()) { completed = checkpoint.toCompletedCheckpoint(); completedCheckpointStore.addCheckpoint(completed); LOG.info( "Completed checkpoint " + checkpointId + " (in " + completed.getDuration() + " ms)"); if (LOG.isDebugEnabled()) { StringBuilder builder = new StringBuilder(); for (Map.Entry<JobVertexID, TaskState> entry : completed.getTaskStates().entrySet()) { builder .append("JobVertexID: ") .append(entry.getKey()) .append(" {") .append(entry.getValue()) .append("}"); } LOG.debug(builder.toString()); } pendingCheckpoints.remove(checkpointId); rememberRecentCheckpointId(checkpointId); dropSubsumedCheckpoints(completed.getTimestamp()); onFullyAcknowledgedCheckpoint(completed); triggerQueuedRequests(); } } else { // checkpoint did not accept message LOG.error( "Received duplicate or invalid acknowledge message for checkpoint " + checkpointId + " , task " + message.getTaskExecutionId()); } } else if (checkpoint != null) { // this should not happen throw new IllegalStateException( "Received message for discarded but non-removed checkpoint " + checkpointId); } else { // message is for an unknown checkpoint, or comes too late (checkpoint disposed) if (recentPendingCheckpoints.contains(checkpointId)) { isPendingCheckpoint = true; LOG.warn("Received late message for now expired checkpoint attempt " + checkpointId); } else { isPendingCheckpoint = false; } } } // send the confirmation messages to the necessary targets. we do this here // to be outside the lock scope if (completed != null) { final long timestamp = completed.getTimestamp(); for (ExecutionVertex ev : tasksToCommitTo) { Execution ee = ev.getCurrentExecutionAttempt(); if (ee != null) { ExecutionAttemptID attemptId = ee.getAttemptId(); NotifyCheckpointComplete notifyMessage = new NotifyCheckpointComplete(job, attemptId, checkpointId, timestamp); ev.sendMessageToCurrentExecution(notifyMessage, ee.getAttemptId()); } } statsTracker.onCompletedCheckpoint(completed); } return isPendingCheckpoint; }
/** * Receives a {@link DeclineCheckpoint} message and returns whether the message was associated * with a pending checkpoint. * * @param message Checkpoint decline from the task manager * @return Flag indicating whether the declined checkpoint was associated with a pending * checkpoint. */ public boolean receiveDeclineMessage(DeclineCheckpoint message) throws Exception { if (shutdown || message == null) { return false; } if (!job.equals(message.getJob())) { LOG.error("Received DeclineCheckpoint message for wrong job: {}", message); return false; } final long checkpointId = message.getCheckpointId(); PendingCheckpoint checkpoint; // Flag indicating whether the ack message was for a known pending // checkpoint. boolean isPendingCheckpoint; synchronized (lock) { // we need to check inside the lock for being shutdown as well, otherwise we // get races and invalid error log messages if (shutdown) { return false; } checkpoint = pendingCheckpoints.get(checkpointId); if (checkpoint != null && !checkpoint.isDiscarded()) { isPendingCheckpoint = true; LOG.info( "Discarding checkpoint " + checkpointId + " because of checkpoint decline from task " + message.getTaskExecutionId()); pendingCheckpoints.remove(checkpointId); checkpoint.discard(userClassLoader); rememberRecentCheckpointId(checkpointId); boolean haveMoreRecentPending = false; Iterator<Map.Entry<Long, PendingCheckpoint>> entries = pendingCheckpoints.entrySet().iterator(); while (entries.hasNext()) { PendingCheckpoint p = entries.next().getValue(); if (!p.isDiscarded() && p.getCheckpointTimestamp() >= checkpoint.getCheckpointTimestamp()) { haveMoreRecentPending = true; break; } } if (!haveMoreRecentPending && !triggerRequestQueued) { LOG.info("Triggering new checkpoint because of discarded checkpoint " + checkpointId); triggerCheckpoint(System.currentTimeMillis()); } else if (!haveMoreRecentPending) { LOG.info( "Promoting queued checkpoint request because of discarded checkpoint " + checkpointId); triggerQueuedRequests(); } } else if (checkpoint != null) { // this should not happen throw new IllegalStateException( "Received message for discarded but non-removed checkpoint " + checkpointId); } else { // message is for an unknown checkpoint, or comes too late (checkpoint disposed) if (recentPendingCheckpoints.contains(checkpointId)) { isPendingCheckpoint = true; LOG.info( "Received another decline checkpoint message for now expired checkpoint attempt " + checkpointId); } else { isPendingCheckpoint = false; } } } return isPendingCheckpoint; }