/**
   * Renders the user-defined accumulators of every subtask of the given job vertex as JSON.
   *
   * <p>The document is an object with the fields {@code id}, {@code parallelism} and
   * {@code subtasks}. Each subtask entry carries {@code subtask}, {@code attempt}, {@code host}
   * and a {@code user-accumulators} array of name/type/value objects.
   *
   * @param jobVertex the job vertex whose subtasks are reported
   * @param params request parameters; not read by this handler
   * @return the generated JSON document
   * @throws Exception if generating the JSON fails
   */
  @Override
  public String handleRequest(ExecutionJobVertex jobVertex, Map<String, String> params)
      throws Exception {
    final StringWriter out = new StringWriter();
    final JsonGenerator json = JsonFactory.jacksonFactory.createGenerator(out);

    // top-level object: vertex identity followed by the per-subtask array
    json.writeStartObject();
    json.writeStringField("id", jobVertex.getJobVertexId().toString());
    json.writeNumberField("parallelism", jobVertex.getParallelism());

    json.writeArrayFieldStart("subtasks");

    // subtasks are numbered by their position in the iteration order
    int subtaskIndex = 0;
    for (ExecutionVertex subtask : jobVertex.getTaskVertices()) {

      final InstanceConnectionInfo assigned = subtask.getCurrentAssignedResourceLocation();
      final String hostName;
      if (assigned != null) {
        hostName = assigned.getHostname();
      } else {
        // no resource has been assigned to this subtask
        hostName = "(unassigned)";
      }

      json.writeStartObject();

      json.writeNumberField("subtask", subtaskIndex);
      subtaskIndex++;
      json.writeNumberField("attempt", subtask.getCurrentExecutionAttempt().getAttemptNumber());
      json.writeStringField("host", hostName);

      final StringifiedAccumulatorResult[] userAccumulators =
          subtask.getCurrentExecutionAttempt().getUserAccumulatorsStringified();

      json.writeArrayFieldStart("user-accumulators");
      for (int i = 0; i < userAccumulators.length; i++) {
        final StringifiedAccumulatorResult accumulator = userAccumulators[i];

        json.writeStartObject();
        json.writeStringField("name", accumulator.getName());
        json.writeStringField("type", accumulator.getType());
        json.writeStringField("value", accumulator.getValue());
        json.writeEndObject();
      }
      json.writeEndArray();

      json.writeEndObject();
    }
    json.writeEndArray();

    json.writeEndObject();

    // closing the generator flushes everything into the string writer
    json.close();
    return out.toString();
  }
  /**
   * Summarizes the subtasks of the given job vertex per TaskManager and renders the result as JSON.
   *
   * <p>Subtasks are grouped by {@code hostname:dataPort} of their assigned location, or by
   * {@code (unassigned)} when they have none. For each group the response reports an aggregated
   * status, start time, end time, duration, summed I/O metrics and the number of subtasks per
   * execution state. Unknown times are reported as {@code -1}.
   *
   * @param jobVertex the job vertex whose subtasks are summarized
   * @param params request parameters; not read by this handler
   * @return the generated JSON document
   * @throws Exception if generating the JSON fails
   */
  @Override
  public String handleRequest(ExecutionJobVertex jobVertex, Map<String, String> params)
      throws Exception {
    // Step 1: group the subtasks by the TaskManager they are assigned to
    final Map<String, List<ExecutionVertex>> verticesByTaskManager = new HashMap<>();

    for (ExecutionVertex subtask : jobVertex.getTaskVertices()) {
      final TaskManagerLocation assigned = subtask.getCurrentAssignedResourceLocation();
      final String groupKey;
      if (assigned == null) {
        groupKey = "(unassigned)";
      } else {
        groupKey = assigned.getHostname() + ":" + assigned.dataPort();
      }

      List<ExecutionVertex> group = verticesByTaskManager.get(groupKey);
      if (group == null) {
        group = new ArrayList<ExecutionVertex>();
        verticesByTaskManager.put(groupKey, group);
      }
      group.add(subtask);
    }

    // Step 2: render the JSON response
    final long now = System.currentTimeMillis();

    final StringWriter out = new StringWriter();
    final JsonGenerator json = JsonFactory.jacksonFactory.createGenerator(out);

    json.writeStartObject();

    json.writeStringField("id", jobVertex.getJobVertexId().toString());
    json.writeStringField("name", jobVertex.getJobVertex().getName());
    json.writeNumberField("now", now);

    json.writeArrayFieldStart("taskmanagers");
    for (Entry<String, List<ExecutionVertex>> group : verticesByTaskManager.entrySet()) {
      final String taskManagerKey = group.getKey();
      final List<ExecutionVertex> members = group.getValue();

      // one counter slot per execution state, indexed by the state's ordinal
      final int[] stateCounts = new int[ExecutionState.values().length];

      long earliestStart = Long.MAX_VALUE;
      long latestTimestamp = 0;
      boolean allTerminal = true;

      final LongCounter bytesIn = new LongCounter();
      final LongCounter bytesOut = new LongCounter();
      final LongCounter recordsIn = new LongCounter();
      final LongCounter recordsOut = new LongCounter();

      for (ExecutionVertex member : members) {
        final ExecutionState current = member.getExecutionState();
        stateCounts[current.ordinal()]++;

        // the group starts with its earliest deployment; non-positive timestamps are skipped
        final long deployedAt = member.getStateTimestamp(ExecutionState.DEPLOYING);
        if (deployedAt > 0 && deployedAt < earliestStart) {
          earliestStart = deployedAt;
        }

        if (!current.isTerminal()) {
          allTerminal = false;
        }

        // track the latest timestamp at which any member entered its current state
        final long enteredCurrentAt = member.getStateTimestamp(current);
        if (enteredCurrentAt > latestTimestamp) {
          latestTimestamp = enteredCurrentAt;
        }

        final Map<AccumulatorRegistry.Metric, Accumulator<?, ?>> flinkMetrics =
            member.getCurrentExecutionAttempt().getFlinkAccumulators();

        if (flinkMetrics != null) {
          bytesIn.merge((LongCounter) flinkMetrics.get(AccumulatorRegistry.Metric.NUM_BYTES_IN));
          bytesOut.merge((LongCounter) flinkMetrics.get(AccumulatorRegistry.Metric.NUM_BYTES_OUT));
          recordsIn.merge(
              (LongCounter) flinkMetrics.get(AccumulatorRegistry.Metric.NUM_RECORDS_IN));
          recordsOut.merge(
              (LongCounter) flinkMetrics.get(AccumulatorRegistry.Metric.NUM_RECORDS_OUT));
        }
      }

      final long reportedStart;
      final long reportedEnd;
      final long duration;
      if (earliestStart == Long.MAX_VALUE) {
        // no member reported a positive deployment timestamp
        reportedStart = -1L;
        reportedEnd = -1L;
        duration = -1L;
      } else if (allTerminal) {
        reportedStart = earliestStart;
        reportedEnd = latestTimestamp;
        duration = latestTimestamp - earliestStart;
      } else {
        // at least one member is not in a terminal state: no end time yet
        reportedStart = earliestStart;
        reportedEnd = -1L;
        duration = now - earliestStart;
      }

      final ExecutionState aggregatedState =
          ExecutionJobVertex.getAggregateJobVertexState(stateCounts, members.size());

      json.writeStartObject();

      json.writeStringField("host", taskManagerKey);
      json.writeStringField("status", aggregatedState.name());

      json.writeNumberField("start-time", reportedStart);
      json.writeNumberField("end-time", reportedEnd);
      json.writeNumberField("duration", duration);

      json.writeObjectFieldStart("metrics");
      json.writeNumberField("read-bytes", bytesIn.getLocalValuePrimitive());
      json.writeNumberField("write-bytes", bytesOut.getLocalValuePrimitive());
      json.writeNumberField("read-records", recordsIn.getLocalValuePrimitive());
      json.writeNumberField("write-records", recordsOut.getLocalValuePrimitive());
      json.writeEndObject();

      json.writeObjectFieldStart("status-counts");
      for (ExecutionState executionState : ExecutionState.values()) {
        json.writeNumberField(executionState.name(), stateCounts[executionState.ordinal()]);
      }
      json.writeEndObject();

      json.writeEndObject();
    }
    json.writeEndArray();

    json.writeEndObject();

    // closing the generator flushes everything into the string writer
    json.close();
    return out.toString();
  }
Пример #3
0
  /**
   * Receives an AcknowledgeCheckpoint message and returns whether the message was associated with a
   * pending checkpoint.
   *
   * <p>The message is ignored (returning {@code false}) when the coordinator is shut down, when the
   * message is {@code null}, or when it belongs to a different job. When the acknowledgement
   * completes a pending checkpoint, the checkpoint is added to the completed checkpoint store
   * while the lock is held; the tasks in {@code tasksToCommitTo} are notified only after the lock
   * has been released.
   *
   * @param message Checkpoint ack from the task manager
   * @return Flag indicating whether the ack'd checkpoint was associated with a pending checkpoint.
   * @throws Exception If the checkpoint cannot be added to the completed checkpoint store.
   * @throws IllegalStateException If the checkpoint is still registered as pending although it has
   *     already been discarded.
   */
  public boolean receiveAcknowledgeMessage(AcknowledgeCheckpoint message) throws Exception {
    // cheap pre-checks without taking the lock
    if (shutdown || message == null) {
      return false;
    }
    if (!job.equals(message.getJob())) {
      LOG.error("Received AcknowledgeCheckpoint message for wrong job: {}", message);
      return false;
    }

    final long checkpointId = message.getCheckpointId();

    // stays null unless this message completes the checkpoint; drives the
    // notifications sent after the lock is released
    CompletedCheckpoint completed = null;
    PendingCheckpoint checkpoint;

    // Flag indicating whether the ack message was for a known pending
    // checkpoint.
    boolean isPendingCheckpoint;

    synchronized (lock) {
      // we need to check inside the lock for being shutdown as well, otherwise we
      // get races and invalid error log messages
      if (shutdown) {
        return false;
      }

      checkpoint = pendingCheckpoints.get(checkpointId);

      if (checkpoint != null && !checkpoint.isDiscarded()) {
        // the checkpoint is known and still live
        isPendingCheckpoint = true;

        if (checkpoint.acknowledgeTask(
            message.getTaskExecutionId(),
            message.getState(),
            message.getStateSize(),
            null)) { // TODO: Give KV-state to the acknowledgeTask method
          if (checkpoint.isFullyAcknowledged()) {
            // this was the last outstanding acknowledgement: finalize the checkpoint
            completed = checkpoint.toCompletedCheckpoint();

            // NOTE(review): if addCheckpoint throws, the statements below are skipped and the
            // checkpoint stays registered in pendingCheckpoints - confirm this is intended
            completedCheckpointStore.addCheckpoint(completed);

            LOG.info(
                "Completed checkpoint "
                    + checkpointId
                    + " (in "
                    + completed.getDuration()
                    + " ms)");

            // only build the per-vertex state dump when it will actually be logged
            if (LOG.isDebugEnabled()) {
              StringBuilder builder = new StringBuilder();
              for (Map.Entry<JobVertexID, TaskState> entry : completed.getTaskStates().entrySet()) {
                builder
                    .append("JobVertexID: ")
                    .append(entry.getKey())
                    .append(" {")
                    .append(entry.getValue())
                    .append("}");
              }

              LOG.debug(builder.toString());
            }

            // the checkpoint is no longer pending; remember its ID so that late
            // acknowledgements can still be recognized (see the last branch below)
            pendingCheckpoints.remove(checkpointId);
            rememberRecentCheckpointId(checkpointId);

            dropSubsumedCheckpoints(completed.getTimestamp());

            onFullyAcknowledgedCheckpoint(completed);

            triggerQueuedRequests();
          }
        } else {
          // checkpoint did not accept message
          LOG.error(
              "Received duplicate or invalid acknowledge message for checkpoint "
                  + checkpointId
                  + " , task "
                  + message.getTaskExecutionId());
        }
      } else if (checkpoint != null) {
        // this should not happen
        throw new IllegalStateException(
            "Received message for discarded but non-removed checkpoint " + checkpointId);
      } else {
        // message is for an unknown checkpoint, or comes too late (checkpoint disposed)
        if (recentPendingCheckpoints.contains(checkpointId)) {
          // the ID was pending recently, so the message still counts as belonging to a
          // pending checkpoint
          isPendingCheckpoint = true;
          LOG.warn("Received late message for now expired checkpoint attempt " + checkpointId);
        } else {
          isPendingCheckpoint = false;
        }
      }
    }

    // send the confirmation messages to the necessary targets. we do this here
    // to be outside the lock scope
    if (completed != null) {
      final long timestamp = completed.getTimestamp();

      for (ExecutionVertex ev : tasksToCommitTo) {
        Execution ee = ev.getCurrentExecutionAttempt();
        // vertices without a current execution attempt are skipped
        if (ee != null) {
          ExecutionAttemptID attemptId = ee.getAttemptId();
          NotifyCheckpointComplete notifyMessage =
              new NotifyCheckpointComplete(job, attemptId, checkpointId, timestamp);
          ev.sendMessageToCurrentExecution(notifyMessage, ee.getAttemptId());
        }
      }

      statsTracker.onCompletedCheckpoint(completed);
    }

    return isPendingCheckpoint;
  }
Пример #4
0
  /**
   * Triggers a new checkpoint and uses the given timestamp as the checkpoint timestamp.
   *
   * <p>The method first performs eager checks under the lock (shutdown, queued request, number of
   * concurrent checkpoints, minimum pause), then verifies outside the lock that all involved
   * tasks have a current execution, obtains the checkpoint ID, and finally re-checks the
   * conditions under the lock before registering the pending checkpoint and sending the trigger
   * messages.
   *
   * @param timestamp The timestamp for the checkpoint.
   * @param nextCheckpointId The checkpoint ID to use for this checkpoint or <code>-1</code> if the
   *     checkpoint ID counter should be queried.
   * @return {@code true} if the checkpoint was registered and the trigger messages were sent,
   *     {@code false} if the checkpoint was not triggered.
   * @throws Exception Declared for callers; failures while obtaining the checkpoint ID or while
   *     registering and sending the trigger messages are caught, logged, and reported by
   *     returning {@code false}.
   */
  public boolean triggerCheckpoint(long timestamp, long nextCheckpointId) throws Exception {
    // make some eager pre-checks
    synchronized (lock) {
      // abort if the coordinator has been shutdown in the meantime
      if (shutdown) {
        return false;
      }

      // sanity check: there should never be more than one trigger request queued
      if (triggerRequestQueued) {
        LOG.warn("Trying to trigger another checkpoint while one was queued already");
        return false;
      }

      // if too many checkpoints are currently in progress, we need to mark that a request is queued
      if (pendingCheckpoints.size() >= maxConcurrentCheckpointAttempts) {
        triggerRequestQueued = true;
        if (currentPeriodicTrigger != null) {
          currentPeriodicTrigger.cancel();
          currentPeriodicTrigger = null;
        }
        return false;
      }

      // make sure the minimum interval between checkpoints has passed
      if (lastTriggeredCheckpoint + minPauseBetweenCheckpoints > timestamp) {
        if (currentPeriodicTrigger != null) {
          currentPeriodicTrigger.cancel();
          currentPeriodicTrigger = null;
        }
        ScheduledTrigger trigger = new ScheduledTrigger();
        // remember the replacement trigger; otherwise it could never be cancelled again and
        // repeated passes through this branch would stack up periodic triggers on the timer
        currentPeriodicTrigger = trigger;
        timer.scheduleAtFixedRate(trigger, minPauseBetweenCheckpoints, baseInterval);
        return false;
      }
    }

    // first check if all tasks that we need to trigger are running.
    // if not, abort the checkpoint
    ExecutionAttemptID[] triggerIDs = new ExecutionAttemptID[tasksToTrigger.length];
    for (int i = 0; i < tasksToTrigger.length; i++) {
      Execution ee = tasksToTrigger[i].getCurrentExecutionAttempt();
      if (ee != null && ee.getState() == ExecutionState.RUNNING) {
        triggerIDs[i] = ee.getAttemptId();
      } else {
        LOG.info(
            "Checkpoint triggering task {} is not being executed at the moment. Aborting checkpoint.",
            tasksToTrigger[i].getSimpleName());
        return false;
      }
    }

    // next, check if all tasks that need to acknowledge the checkpoint have a current
    // execution attempt. if not, abort the checkpoint
    Map<ExecutionAttemptID, ExecutionVertex> ackTasks = new HashMap<>(tasksToWaitFor.length);

    for (ExecutionVertex ev : tasksToWaitFor) {
      Execution ee = ev.getCurrentExecutionAttempt();
      if (ee != null) {
        ackTasks.put(ee.getAttemptId(), ev);
      } else {
        LOG.info(
            "Checkpoint acknowledging task {} is not being executed at the moment. Aborting checkpoint.",
            ev.getSimpleName());
        return false;
      }
    }

    // we will actually trigger this checkpoint!

    // NOTE(review): this field is written here without holding the lock, while the minimum-pause
    // check above reads it under the lock - confirm visibility is guaranteed elsewhere
    lastTriggeredCheckpoint = timestamp;
    final long checkpointID;
    if (nextCheckpointId < 0) {
      try {
        // this must happen outside the locked scope, because it communicates
        // with external services (in HA mode) and may block for a while.
        checkpointID = checkpointIdCounter.getAndIncrement();
      } catch (Throwable t) {
        int numUnsuccessful = ++numUnsuccessfulCheckpointsTriggers;
        LOG.warn(
            "Failed to trigger checkpoint ("
                + numUnsuccessful
                + " consecutive failed attempts so far)",
            t);
        return false;
      }
    } else {
      checkpointID = nextCheckpointId;
    }

    LOG.info("Triggering checkpoint {} @ {}", checkpointID, timestamp);

    final PendingCheckpoint checkpoint =
        new PendingCheckpoint(job, checkpointID, timestamp, ackTasks);

    // schedule the timer that will clean up the expired checkpoints
    TimerTask canceller =
        new TimerTask() {
          @Override
          public void run() {
            try {
              synchronized (lock) {
                // only do the work if the checkpoint is not discarded anyways
                // note that checkpoint completion discards the pending checkpoint object
                if (!checkpoint.isDiscarded()) {
                  LOG.info("Checkpoint {} expired before completing.", checkpointID);

                  checkpoint.discard(userClassLoader);
                  pendingCheckpoints.remove(checkpointID);
                  rememberRecentCheckpointId(checkpointID);

                  onCancelCheckpoint(checkpointID);

                  triggerQueuedRequests();
                }
              }
            } catch (Throwable t) {
              LOG.error("Exception while handling checkpoint timeout", t);
            }
          }
        };

    try {
      // re-acquire the lock
      synchronized (lock) {
        // since we released the lock in the meantime, we need to re-check
        // that the conditions still hold. this is clumsy, but it allows us to
        // release the lock in the meantime while calls to external services are
        // blocking progress, and still gives us early checks that skip work
        // if no checkpoint can happen anyways
        if (shutdown) {
          return false;
        } else if (triggerRequestQueued) {
          LOG.warn("Trying to trigger another checkpoint while one was queued already");
          return false;
        } else if (pendingCheckpoints.size() >= maxConcurrentCheckpointAttempts) {
          triggerRequestQueued = true;
          if (currentPeriodicTrigger != null) {
            currentPeriodicTrigger.cancel();
            currentPeriodicTrigger = null;
          }
          return false;
        }

        pendingCheckpoints.put(checkpointID, checkpoint);
        timer.schedule(canceller, checkpointTimeout);
      }
      // end of lock scope

      // send the messages to the tasks that trigger their checkpoint
      for (int i = 0; i < tasksToTrigger.length; i++) {
        ExecutionAttemptID id = triggerIDs[i];
        TriggerCheckpoint message = new TriggerCheckpoint(job, id, checkpointID, timestamp);
        tasksToTrigger[i].sendMessageToCurrentExecution(message, id);
      }

      // a successful trigger resets the consecutive-failure counter
      numUnsuccessfulCheckpointsTriggers = 0;
      return true;
    } catch (Throwable t) {
      // guard the map against concurrent modifications
      synchronized (lock) {
        pendingCheckpoints.remove(checkpointID);
      }

      int numUnsuccessful = ++numUnsuccessfulCheckpointsTriggers;
      LOG.warn(
          "Failed to trigger checkpoint ("
              + numUnsuccessful
              + " consecutive failed attempts so far)",
          t);
      // release whatever the half-registered checkpoint holds, unless that already happened
      if (!checkpoint.isDiscarded()) {
        checkpoint.discard(userClassLoader);
      }
      return false;
    }
  }