Пример #1
0
  /**
   * Builds a lookup map that contains every entry of {@code tasks} plus one extra entry per
   * alternative id reported by each vertex's {@link JobVertex#getIdAlternatives()}.
   *
   * <p>Null execution job vertices, and vertices whose {@code getJobVertex()} is null, contribute
   * no alternative ids. The input map itself is not modified.
   *
   * @param tasks mapping from job vertex id to execution job vertex
   * @return a new map holding the original entries and the alternative-id entries
   */
  public static Map<JobVertexID, ExecutionJobVertex> includeLegacyJobVertexIDs(
      Map<JobVertexID, ExecutionJobVertex> tasks) {

    // sized for roughly one alternative id per vertex
    final Map<JobVertexID, ExecutionJobVertex> result = new HashMap<>(2 * tasks.size());

    // the current ids go in first so that a clashing alternative id is detected below
    result.putAll(tasks);

    for (ExecutionJobVertex candidate : tasks.values()) {
      if (candidate == null) {
        continue;
      }

      final JobVertex source = candidate.getJobVertex();
      if (source == null) {
        continue;
      }

      for (JobVertexID legacyId : source.getIdAlternatives()) {
        final ExecutionJobVertex previous = result.put(legacyId, candidate);

        // an id may only ever point to one vertex; re-registering the same vertex is fine
        final boolean unambiguous = previous == null || previous.equals(candidate);
        Preconditions.checkState(
            unambiguous, "Ambiguous jobvertex id detected during expansion to legacy ids.");
      }
    }

    return result;
  }
  /**
   * Renders the user accumulators of every subtask of the given job vertex as a JSON string.
   *
   * <p>The result is an object with the vertex {@code id}, its {@code parallelism} and a {@code
   * subtasks} array. Each array element carries the running subtask index (starting at 0, in the
   * order of {@code getTaskVertices()}), the attempt number of the current execution attempt, the
   * host name of the assigned resource (or {@code "(unassigned)"} when there is none) and the
   * stringified user accumulators (name, type, value).
   *
   * @param jobVertex the job vertex whose subtasks are reported
   * @param params request parameters; not read by this handler
   * @return the JSON document as a string
   * @throws Exception if writing the JSON fails
   */
  @Override
  public String handleRequest(ExecutionJobVertex jobVertex, Map<String, String> params)
      throws Exception {
    StringWriter writer = new StringWriter();

    // try-with-resources closes the generator on the failure path as well
    try (JsonGenerator gen = JsonFactory.jacksonFactory.createGenerator(writer)) {
      gen.writeStartObject();
      gen.writeStringField("id", jobVertex.getJobVertexId().toString());
      gen.writeNumberField("parallelism", jobVertex.getParallelism());

      gen.writeArrayFieldStart("subtasks");

      int num = 0;
      for (ExecutionVertex vertex : jobVertex.getTaskVertices()) {

        InstanceConnectionInfo location = vertex.getCurrentAssignedResourceLocation();
        String locationString = location == null ? "(unassigned)" : location.getHostname();

        gen.writeStartObject();

        gen.writeNumberField("subtask", num++);
        gen.writeNumberField("attempt", vertex.getCurrentExecutionAttempt().getAttemptNumber());
        gen.writeStringField("host", locationString);

        StringifiedAccumulatorResult[] accs =
            vertex.getCurrentExecutionAttempt().getUserAccumulatorsStringified();
        gen.writeArrayFieldStart("user-accumulators");
        for (StringifiedAccumulatorResult acc : accs) {
          gen.writeStartObject();
          gen.writeStringField("name", acc.getName());
          gen.writeStringField("type", acc.getType());
          gen.writeStringField("value", acc.getValue());
          gen.writeEndObject();
        }
        gen.writeEndArray();

        gen.writeEndObject();
      }
      gen.writeEndArray();

      gen.writeEndObject();
    }

    // the generator has been closed (and thereby flushed) at this point
    return writer.toString();
  }
  /**
   * Renders a per-TaskManager summary of the given job vertex as a JSON string.
   *
   * <p>Subtasks are grouped by {@code hostname:dataPort} of their currently assigned resource
   * location; subtasks without a location are grouped under {@code "(unassigned)"}. For each group
   * the output contains:
   *
   * <ul>
   *   <li>{@code status}: the aggregate state computed by {@code
   *       ExecutionJobVertex.getAggregateJobVertexState} from the per-state counts,
   *   <li>{@code start-time}: the earliest positive {@code DEPLOYING} timestamp, or -1 if no
   *       subtask in the group has one,
   *   <li>{@code end-time}: the latest timestamp of the subtasks' current states when all of them
   *       are in a terminal state, otherwise -1,
   *   <li>{@code duration}: end minus start when all are terminal, "now" minus start while still
   *       running, -1 when nothing has started,
   *   <li>{@code metrics}: summed bytes/records in and out,
   *   <li>{@code status-counts}: the number of subtasks per {@link ExecutionState}.
   * </ul>
   *
   * @param jobVertex the job vertex whose subtasks are summarized
   * @param params request parameters; not read by this handler
   * @return the JSON document as a string
   * @throws Exception if writing the JSON fails
   */
  @Override
  public String handleRequest(ExecutionJobVertex jobVertex, Map<String, String> params)
      throws Exception {
    // Build a map that groups tasks by TaskManager
    Map<String, List<ExecutionVertex>> taskManagerVertices = new HashMap<>();

    for (ExecutionVertex vertex : jobVertex.getTaskVertices()) {
      TaskManagerLocation location = vertex.getCurrentAssignedResourceLocation();
      String taskManager =
          location == null ? "(unassigned)" : location.getHostname() + ":" + location.dataPort();

      List<ExecutionVertex> vertices = taskManagerVertices.get(taskManager);

      if (vertices == null) {
        vertices = new ArrayList<>();
        taskManagerVertices.put(taskManager, vertices);
      }

      vertices.add(vertex);
    }

    // Build JSON response
    final long now = System.currentTimeMillis();

    StringWriter writer = new StringWriter();

    // try-with-resources closes the generator on the failure path as well
    try (JsonGenerator gen = JsonFactory.jacksonFactory.createGenerator(writer)) {
      gen.writeStartObject();

      gen.writeStringField("id", jobVertex.getJobVertexId().toString());
      gen.writeStringField("name", jobVertex.getJobVertex().getName());
      gen.writeNumberField("now", now);

      gen.writeArrayFieldStart("taskmanagers");
      for (Entry<String, List<ExecutionVertex>> entry : taskManagerVertices.entrySet()) {
        String host = entry.getKey();
        List<ExecutionVertex> taskVertices = entry.getValue();

        // indexed by ExecutionState.ordinal(), as expected by getAggregateJobVertexState below
        int[] tasksPerState = new int[ExecutionState.values().length];

        // Long.MAX_VALUE doubles as the "no subtask has started yet" marker
        long startTime = Long.MAX_VALUE;
        long endTime = 0;
        boolean allFinished = true;

        LongCounter tmReadBytes = new LongCounter();
        LongCounter tmWriteBytes = new LongCounter();
        LongCounter tmReadRecords = new LongCounter();
        LongCounter tmWriteRecords = new LongCounter();

        for (ExecutionVertex vertex : taskVertices) {
          final ExecutionState state = vertex.getExecutionState();
          tasksPerState[state.ordinal()]++;

          // take the earliest start time
          long started = vertex.getStateTimestamp(ExecutionState.DEPLOYING);
          if (started > 0) {
            startTime = Math.min(startTime, started);
          }

          allFinished &= state.isTerminal();
          endTime = Math.max(endTime, vertex.getStateTimestamp(state));

          Map<AccumulatorRegistry.Metric, Accumulator<?, ?>> metrics =
              vertex.getCurrentExecutionAttempt().getFlinkAccumulators();

          if (metrics != null) {
            LongCounter readBytes =
                (LongCounter) metrics.get(AccumulatorRegistry.Metric.NUM_BYTES_IN);
            tmReadBytes.merge(readBytes);

            LongCounter writeBytes =
                (LongCounter) metrics.get(AccumulatorRegistry.Metric.NUM_BYTES_OUT);
            tmWriteBytes.merge(writeBytes);

            LongCounter readRecords =
                (LongCounter) metrics.get(AccumulatorRegistry.Metric.NUM_RECORDS_IN);
            tmReadRecords.merge(readRecords);

            LongCounter writeRecords =
                (LongCounter) metrics.get(AccumulatorRegistry.Metric.NUM_RECORDS_OUT);
            tmWriteRecords.merge(writeRecords);
          }
        }

        long duration;
        if (startTime < Long.MAX_VALUE) {
          if (allFinished) {
            duration = endTime - startTime;
          } else {
            // still running: no end time yet, duration is measured up to "now"
            endTime = -1L;
            duration = now - startTime;
          }
        } else {
          // nothing has started: report -1 for all time fields
          startTime = -1L;
          endTime = -1L;
          duration = -1L;
        }

        ExecutionState jobVertexState =
            ExecutionJobVertex.getAggregateJobVertexState(tasksPerState, taskVertices.size());

        gen.writeStartObject();

        gen.writeStringField("host", host);
        gen.writeStringField("status", jobVertexState.name());

        gen.writeNumberField("start-time", startTime);
        gen.writeNumberField("end-time", endTime);
        gen.writeNumberField("duration", duration);

        gen.writeObjectFieldStart("metrics");
        gen.writeNumberField("read-bytes", tmReadBytes.getLocalValuePrimitive());
        gen.writeNumberField("write-bytes", tmWriteBytes.getLocalValuePrimitive());
        gen.writeNumberField("read-records", tmReadRecords.getLocalValuePrimitive());
        gen.writeNumberField("write-records", tmWriteRecords.getLocalValuePrimitive());
        gen.writeEndObject();

        gen.writeObjectFieldStart("status-counts");
        for (ExecutionState state : ExecutionState.values()) {
          gen.writeNumberField(state.name(), tasksPerState[state.ordinal()]);
        }
        gen.writeEndObject();

        gen.writeEndObject();
      }
      gen.writeEndArray();

      gen.writeEndObject();
    }

    // the generator has been closed (and thereby flushed) at this point
    return writer.toString();
  }
  /**
   * Deploys one subtask of a small four-vertex graph and checks the task deployment descriptor
   * that is handed to the instance gateway.
   *
   * <p>The graph is {@code v1 -> v2 -> (v3, v4)}, all connected all-to-all with parallelism 10.
   * Subtask index 3 of {@code v2} is deployed, so the descriptor is expected to name {@code v2},
   * carry two produced partitions (one per consumer) and one input gate, each with 10
   * sub-partitions / input channels.
   */
  @Test
  public void testBuildDeploymentDescriptor() {
    try {
      final JobID jobId = new JobID();

      final JobVertexID jid1 = new JobVertexID();
      final JobVertexID jid2 = new JobVertexID();
      final JobVertexID jid3 = new JobVertexID();
      final JobVertexID jid4 = new JobVertexID();

      JobVertex v1 = new JobVertex("v1", jid1);
      JobVertex v2 = new JobVertex("v2", jid2);
      JobVertex v3 = new JobVertex("v3", jid3);
      JobVertex v4 = new JobVertex("v4", jid4);

      v1.setParallelism(10);
      v2.setParallelism(10);
      v3.setParallelism(10);
      v4.setParallelism(10);

      v1.setInvokableClass(BatchTask.class);
      v2.setInvokableClass(BatchTask.class);
      v3.setInvokableClass(BatchTask.class);
      v4.setInvokableClass(BatchTask.class);

      // v2 consumes v1 and is itself consumed by both v3 and v4
      v2.connectNewDataSetAsInput(v1, DistributionPattern.ALL_TO_ALL);
      v3.connectNewDataSetAsInput(v2, DistributionPattern.ALL_TO_ALL);
      v4.connectNewDataSetAsInput(v2, DistributionPattern.ALL_TO_ALL);

      ExecutionGraph eg =
          new ExecutionGraph(
              TestingUtils.defaultExecutionContext(),
              jobId,
              "some job",
              new Configuration(),
              new SerializedValue<>(new ExecutionConfig()),
              AkkaUtils.getDefaultTimeout(),
              new NoRestartStrategy());

      List<JobVertex> ordered = Arrays.asList(v1, v2, v3, v4);

      eg.attachJobGraph(ordered);

      ExecutionJobVertex ejv = eg.getAllVertices().get(jid2);
      ExecutionVertex vertex = ejv.getTaskVertices()[3];

      ExecutionGraphTestUtils.SimpleActorGateway instanceGateway =
          new ExecutionGraphTestUtils.SimpleActorGateway(TestingUtils.directExecutionContext());

      final Instance instance = getInstance(instanceGateway);

      final SimpleSlot slot = instance.allocateSimpleSlot(jobId);

      assertEquals(ExecutionState.CREATED, vertex.getExecutionState());

      vertex.deployToSlot(slot);

      assertEquals(ExecutionState.DEPLOYING, vertex.getExecutionState());

      // the gateway keeps the last descriptor it was handed in lastTDD
      TaskDeploymentDescriptor descr = instanceGateway.lastTDD;
      assertNotNull(descr);

      assertEquals(jobId, descr.getJobID());
      assertEquals(jid2, descr.getVertexID());
      assertEquals(3, descr.getIndexInSubtaskGroup());
      assertEquals(10, descr.getNumberOfSubtasks());
      assertEquals(BatchTask.class.getName(), descr.getInvokableClassName());
      assertEquals("v2", descr.getTaskName());

      List<ResultPartitionDeploymentDescriptor> producedPartitions = descr.getProducedPartitions();
      List<InputGateDeploymentDescriptor> consumedPartitions = descr.getInputGates();

      assertEquals(2, producedPartitions.size());
      assertEquals(1, consumedPartitions.size());

      assertEquals(10, producedPartitions.get(0).getNumberOfSubpartitions());
      assertEquals(10, producedPartitions.get(1).getNumberOfSubpartitions());
      assertEquals(10, consumedPartitions.get(0).getInputChannelDeploymentDescriptors().length);
    } catch (Exception e) {
      // keep the original exception as the cause so its stack trace shows up in the test report
      throw new AssertionError("Unexpected exception: " + e, e);
    }
  }
Пример #5
0
  /**
   * Recovers the completed checkpoint store and assigns the state of the latest completed
   * checkpoint to the current execution attempts of the given tasks.
   *
   * <p>For every task state in the checkpoint, the matching execution job vertex is looked up in
   * {@code tasks}. Each of its subtasks receives the subtask state (or {@code null} if the
   * checkpoint has none for that index) together with the key/value state of the key groups
   * assigned to that subtask index. Note that state may already have been assigned to some
   * executions by the time a later validation fails.
   *
   * @param tasks mapping from job vertex id to execution job vertex to restore into
   * @param errorIfNoCheckpoint whether a missing checkpoint is an error
   * @param allOrNothingState whether a vertex must have state for all of its subtasks or for none
   * @return {@code true} if state was restored; {@code false} if there is no completed checkpoint
   *     and {@code errorIfNoCheckpoint} is {@code false}
   * @throws IllegalStateException if the coordinator is shut down, if no checkpoint exists and
   *     {@code errorIfNoCheckpoint} is set, if the checkpoint references a vertex id missing from
   *     {@code tasks}, or if {@code allOrNothingState} is set and only some subtasks of a vertex
   *     have state
   * @throws RuntimeException if the parallelism of a vertex differs from the checkpointed one
   * @throws Exception any failure propagated from the calls made during recovery
   */
  public boolean restoreLatestCheckpointedState(
      Map<JobVertexID, ExecutionJobVertex> tasks,
      boolean errorIfNoCheckpoint,
      boolean allOrNothingState)
      throws Exception {

    synchronized (lock) {
      if (shutdown) {
        throw new IllegalStateException("CheckpointCoordinator is shut down");
      }

      // Recover the checkpoints
      completedCheckpointStore.recover();

      // restore from the latest checkpoint
      CompletedCheckpoint latest = completedCheckpointStore.getLatestCheckpoint();

      if (latest == null) {
        if (errorIfNoCheckpoint) {
          throw new IllegalStateException("No completed checkpoint available");
        } else {
          return false;
        }
      }

      long recoveryTimestamp = System.currentTimeMillis();

      for (Map.Entry<JobVertexID, TaskState> taskGroupStateEntry :
          latest.getTaskStates().entrySet()) {
        TaskState taskState = taskGroupStateEntry.getValue();
        ExecutionJobVertex executionJobVertex = tasks.get(taskGroupStateEntry.getKey());

        if (executionJobVertex == null) {
          throw new IllegalStateException(
              "There is no execution job vertex for the job"
                  + " vertex ID "
                  + taskGroupStateEntry.getKey());
        }

        final int parallelism = executionJobVertex.getParallelism();

        // check that we only restore the state if the parallelism has not been changed
        if (taskState.getParallelism() != parallelism) {
          throw new RuntimeException(
              "Cannot restore the latest checkpoint because "
                  + "the parallelism changed. The operator "
                  + executionJobVertex.getJobVertexId()
                  + " has parallelism "
                  + parallelism
                  + " whereas the corresponding "
                  + "state object has a parallelism of "
                  + taskState.getParallelism());
        }

        // number of subtasks for which the checkpoint actually carried state
        int counter = 0;

        List<Set<Integer>> keyGroupPartitions =
            createKeyGroupPartitions(numberKeyGroups, parallelism);

        final ExecutionVertex[] taskVertices = executionJobVertex.getTaskVertices();

        for (int i = 0; i < parallelism; i++) {
          SubtaskState subtaskState = taskState.getState(i);
          SerializedValue<StateHandle<?>> state = null;

          if (subtaskState != null) {
            // count the number of executions for which we set a state
            counter++;
            state = subtaskState.getState();
          }

          Map<Integer, SerializedValue<StateHandle<?>>> kvStateForTaskMap =
              taskState.getUnwrappedKvStates(keyGroupPartitions.get(i));

          Execution currentExecutionAttempt = taskVertices[i].getCurrentExecutionAttempt();
          currentExecutionAttempt.setInitialState(state, kvStateForTaskMap, recoveryTimestamp);
        }

        // "all or nothing": having state for some but not all subtasks is an error
        if (allOrNothingState && counter > 0 && counter < parallelism) {
          throw new IllegalStateException(
              "The checkpoint contained state only for "
                  + "a subset of tasks for vertex "
                  + executionJobVertex);
        }
      }

      return true;
    }
  }
Пример #6
0
  /**
   * Recovers the completed checkpoint store and assigns each task state of the latest completed
   * checkpoint to the current execution attempt of the matching subtask.
   *
   * <p>Returns without doing anything further when there is no completed checkpoint and {@code
   * errorIfNoCheckpoint} is {@code false}. Note that state may already have been assigned to some
   * executions by the time a later validation fails.
   *
   * @param tasks mapping from job vertex id to execution job vertex to restore into
   * @param errorIfNoCheckpoint whether a missing checkpoint is an error
   * @param allOrNothingState whether every vertex that received state must have received it for
   *     all of its subtasks
   * @throws IllegalStateException if the coordinator is shut down, if no checkpoint exists and
   *     {@code errorIfNoCheckpoint} is set, if the checkpoint references an operator id missing
   *     from {@code tasks}, or if {@code allOrNothingState} is set and a vertex received state for
   *     only part of its subtasks
   * @throws Exception any failure propagated from the calls made during recovery
   */
  public void restoreLatestCheckpointedState(
      Map<JobVertexID, ExecutionJobVertex> tasks,
      boolean errorIfNoCheckpoint,
      boolean allOrNothingState)
      throws Exception {

    synchronized (lock) {
      if (shutdown) {
        throw new IllegalStateException("CheckpointCoordinator is shut down");
      }

      // Recover the checkpoints
      completedCheckpointStore.recover();

      // restore from the latest checkpoint
      CompletedCheckpoint latest = completedCheckpointStore.getLatestCheckpoint();

      if (latest == null) {
        if (errorIfNoCheckpoint) {
          throw new IllegalStateException("No completed checkpoint available");
        } else {
          return;
        }
      }

      long recoveryTimestamp = System.currentTimeMillis();

      // number of restored subtask states per vertex; only filled in all-or-nothing mode
      Map<ExecutionJobVertex, Integer> stateCounts = new HashMap<ExecutionJobVertex, Integer>();

      for (StateForTask state : latest.getStates()) {
        ExecutionJobVertex vertex = tasks.get(state.getOperatorId());
        if (vertex == null) {
          // fail with a descriptive message instead of a bare NullPointerException
          throw new IllegalStateException(
              "There is no execution job vertex for the job vertex ID " + state.getOperatorId());
        }

        Execution exec =
            vertex.getTaskVertices()[state.getSubtask()].getCurrentExecutionAttempt();
        exec.setInitialState(state.getState(), recoveryTimestamp);

        if (allOrNothingState) {
          Integer count = stateCounts.get(vertex);
          stateCounts.put(vertex, count == null ? 1 : count + 1);
        }
      }

      if (allOrNothingState) {
        // validate that either all task vertices have state, or none
        // (vertices without any state never show up in stateCounts)
        for (Map.Entry<ExecutionJobVertex, Integer> entry : stateCounts.entrySet()) {
          ExecutionJobVertex vertex = entry.getKey();
          if (entry.getValue().intValue() != vertex.getParallelism()) {
            throw new IllegalStateException(
                "The checkpoint contained state only for a subset of tasks for vertex " + vertex);
          }
        }
      }
    }
  }