/**
 * Serializes the user accumulators of every subtask of the given job vertex to JSON.
 *
 * <p>The document contains the vertex {@code id}, its {@code parallelism} and a
 * {@code subtasks} array. Each subtask entry carries its running index, the attempt
 * number of its current execution attempt, the host it is assigned to (or
 * {@code "(unassigned)"} when no resource location is set) and the stringified user
 * accumulators (name, type, value) of that attempt.
 *
 * @param jobVertex the job vertex whose subtasks are reported
 * @param params request parameters; not read by this handler
 * @return the JSON document as a string
 * @throws Exception if writing the JSON fails
 */
@Override
public String handleRequest(ExecutionJobVertex jobVertex, Map<String, String> params) throws Exception {
    final StringWriter out = new StringWriter();
    final JsonGenerator json = JsonFactory.jacksonFactory.createGenerator(out);

    json.writeStartObject();
    json.writeStringField("id", jobVertex.getJobVertexId().toString());
    json.writeNumberField("parallelism", jobVertex.getParallelism());

    json.writeArrayFieldStart("subtasks");
    int subtaskIndex = 0;
    for (ExecutionVertex subtask : jobVertex.getTaskVertices()) {
        // Resolve the host label first; subtasks without a location get a placeholder.
        final InstanceConnectionInfo assigned = subtask.getCurrentAssignedResourceLocation();
        final String hostName;
        if (assigned != null) {
            hostName = assigned.getHostname();
        } else {
            hostName = "(unassigned)";
        }

        json.writeStartObject();
        json.writeNumberField("subtask", subtaskIndex);
        subtaskIndex++;
        json.writeNumberField("attempt", subtask.getCurrentExecutionAttempt().getAttemptNumber());
        json.writeStringField("host", hostName);

        final StringifiedAccumulatorResult[] accumulators =
                subtask.getCurrentExecutionAttempt().getUserAccumulatorsStringified();

        json.writeArrayFieldStart("user-accumulators");
        for (int i = 0; i < accumulators.length; i++) {
            final StringifiedAccumulatorResult accumulator = accumulators[i];
            json.writeStartObject();
            json.writeStringField("name", accumulator.getName());
            json.writeStringField("type", accumulator.getType());
            json.writeStringField("value", accumulator.getValue());
            json.writeEndObject();
        }
        json.writeEndArray();

        json.writeEndObject();
    }
    json.writeEndArray();

    json.writeEndObject();
    json.close();

    return out.toString();
}
@Override public String handleRequest(ExecutionJobVertex jobVertex, Map<String, String> params) throws Exception { // Build a map that groups tasks by TaskManager Map<String, List<ExecutionVertex>> taskManagerVertices = new HashMap<>(); for (ExecutionVertex vertex : jobVertex.getTaskVertices()) { TaskManagerLocation location = vertex.getCurrentAssignedResourceLocation(); String taskManager = location == null ? "(unassigned)" : location.getHostname() + ":" + location.dataPort(); List<ExecutionVertex> vertices = taskManagerVertices.get(taskManager); if (vertices == null) { vertices = new ArrayList<ExecutionVertex>(); taskManagerVertices.put(taskManager, vertices); } vertices.add(vertex); } // Build JSON response final long now = System.currentTimeMillis(); StringWriter writer = new StringWriter(); JsonGenerator gen = JsonFactory.jacksonFactory.createGenerator(writer); gen.writeStartObject(); gen.writeStringField("id", jobVertex.getJobVertexId().toString()); gen.writeStringField("name", jobVertex.getJobVertex().getName()); gen.writeNumberField("now", now); gen.writeArrayFieldStart("taskmanagers"); for (Entry<String, List<ExecutionVertex>> entry : taskManagerVertices.entrySet()) { String host = entry.getKey(); List<ExecutionVertex> taskVertices = entry.getValue(); int[] tasksPerState = new int[ExecutionState.values().length]; long startTime = Long.MAX_VALUE; long endTime = 0; boolean allFinished = true; LongCounter tmReadBytes = new LongCounter(); LongCounter tmWriteBytes = new LongCounter(); LongCounter tmReadRecords = new LongCounter(); LongCounter tmWriteRecords = new LongCounter(); for (ExecutionVertex vertex : taskVertices) { final ExecutionState state = vertex.getExecutionState(); tasksPerState[state.ordinal()]++; // take the earliest start time long started = vertex.getStateTimestamp(ExecutionState.DEPLOYING); if (started > 0) { startTime = Math.min(startTime, started); } allFinished &= state.isTerminal(); endTime = Math.max(endTime, 
vertex.getStateTimestamp(state)); Map<AccumulatorRegistry.Metric, Accumulator<?, ?>> metrics = vertex.getCurrentExecutionAttempt().getFlinkAccumulators(); if (metrics != null) { LongCounter readBytes = (LongCounter) metrics.get(AccumulatorRegistry.Metric.NUM_BYTES_IN); tmReadBytes.merge(readBytes); LongCounter writeBytes = (LongCounter) metrics.get(AccumulatorRegistry.Metric.NUM_BYTES_OUT); tmWriteBytes.merge(writeBytes); LongCounter readRecords = (LongCounter) metrics.get(AccumulatorRegistry.Metric.NUM_RECORDS_IN); tmReadRecords.merge(readRecords); LongCounter writeRecords = (LongCounter) metrics.get(AccumulatorRegistry.Metric.NUM_RECORDS_OUT); tmWriteRecords.merge(writeRecords); } } long duration; if (startTime < Long.MAX_VALUE) { if (allFinished) { duration = endTime - startTime; } else { endTime = -1L; duration = now - startTime; } } else { startTime = -1L; endTime = -1L; duration = -1L; } ExecutionState jobVertexState = ExecutionJobVertex.getAggregateJobVertexState(tasksPerState, taskVertices.size()); gen.writeStartObject(); gen.writeStringField("host", host); gen.writeStringField("status", jobVertexState.name()); gen.writeNumberField("start-time", startTime); gen.writeNumberField("end-time", endTime); gen.writeNumberField("duration", duration); gen.writeObjectFieldStart("metrics"); gen.writeNumberField("read-bytes", tmReadBytes.getLocalValuePrimitive()); gen.writeNumberField("write-bytes", tmWriteBytes.getLocalValuePrimitive()); gen.writeNumberField("read-records", tmReadRecords.getLocalValuePrimitive()); gen.writeNumberField("write-records", tmWriteRecords.getLocalValuePrimitive()); gen.writeEndObject(); gen.writeObjectFieldStart("status-counts"); for (ExecutionState state : ExecutionState.values()) { gen.writeNumberField(state.name(), tasksPerState[state.ordinal()]); } gen.writeEndObject(); gen.writeEndObject(); } gen.writeEndArray(); gen.writeEndObject(); gen.close(); return writer.toString(); }
/**
 * Deploys one subtask of a four-vertex job graph to a slot and checks the
 * {@link TaskDeploymentDescriptor} that was handed to the instance gateway.
 *
 * <p>The graph is v1 -> v2 -> {v3, v4}, all connected ALL_TO_ALL with parallelism 10.
 * Subtask 3 of v2 is deployed, so the descriptor is expected to describe two produced
 * partitions (for v3 and v4) with 10 subpartitions each and one input gate with 10
 * input channels (from v1).
 */
@Test
public void testBuildDeploymentDescriptor() {
    try {
        final JobID jobId = new JobID();

        final JobVertexID jid1 = new JobVertexID();
        final JobVertexID jid2 = new JobVertexID();
        final JobVertexID jid3 = new JobVertexID();
        final JobVertexID jid4 = new JobVertexID();

        JobVertex v1 = new JobVertex("v1", jid1);
        JobVertex v2 = new JobVertex("v2", jid2);
        JobVertex v3 = new JobVertex("v3", jid3);
        JobVertex v4 = new JobVertex("v4", jid4);

        v1.setParallelism(10);
        v2.setParallelism(10);
        v3.setParallelism(10);
        v4.setParallelism(10);

        v1.setInvokableClass(BatchTask.class);
        v2.setInvokableClass(BatchTask.class);
        v3.setInvokableClass(BatchTask.class);
        v4.setInvokableClass(BatchTask.class);

        // v2 consumes v1; v3 and v4 both consume v2
        v2.connectNewDataSetAsInput(v1, DistributionPattern.ALL_TO_ALL);
        v3.connectNewDataSetAsInput(v2, DistributionPattern.ALL_TO_ALL);
        v4.connectNewDataSetAsInput(v2, DistributionPattern.ALL_TO_ALL);

        ExecutionGraph eg = new ExecutionGraph(
                TestingUtils.defaultExecutionContext(),
                jobId,
                "some job",
                new Configuration(),
                new SerializedValue<>(new ExecutionConfig()),
                AkkaUtils.getDefaultTimeout(),
                new NoRestartStrategy());

        List<JobVertex> ordered = Arrays.asList(v1, v2, v3, v4);

        eg.attachJobGraph(ordered);

        ExecutionJobVertex ejv = eg.getAllVertices().get(jid2);
        ExecutionVertex vertex = ejv.getTaskVertices()[3];

        // the gateway records the last descriptor it receives in 'lastTDD'
        ExecutionGraphTestUtils.SimpleActorGateway instanceGateway =
                new ExecutionGraphTestUtils.SimpleActorGateway(TestingUtils.directExecutionContext());

        final Instance instance = getInstance(instanceGateway);

        final SimpleSlot slot = instance.allocateSimpleSlot(jobId);

        assertEquals(ExecutionState.CREATED, vertex.getExecutionState());

        vertex.deployToSlot(slot);

        assertEquals(ExecutionState.DEPLOYING, vertex.getExecutionState());

        TaskDeploymentDescriptor descr = instanceGateway.lastTDD;
        assertNotNull(descr);

        assertEquals(jobId, descr.getJobID());
        assertEquals(jid2, descr.getVertexID());
        assertEquals(3, descr.getIndexInSubtaskGroup());
        assertEquals(10, descr.getNumberOfSubtasks());
        assertEquals(BatchTask.class.getName(), descr.getInvokableClassName());
        assertEquals("v2", descr.getTaskName());

        List<ResultPartitionDeploymentDescriptor> producedPartitions = descr.getProducedPartitions();
        List<InputGateDeploymentDescriptor> consumedPartitions = descr.getInputGates();

        assertEquals(2, producedPartitions.size());
        assertEquals(1, consumedPartitions.size());

        assertEquals(10, producedPartitions.get(0).getNumberOfSubpartitions());
        assertEquals(10, producedPartitions.get(1).getNumberOfSubpartitions());
        assertEquals(10, consumedPartitions.get(0).getInputChannelDeploymentDescriptors().length);
    }
    catch (Exception e) {
        // Fail with the original exception attached as the cause, so the test report
        // shows where it was thrown and still has a message when getMessage() is null.
        throw new AssertionError("Unexpected exception: " + e, e);
    }
}
public boolean restoreLatestCheckpointedState( Map<JobVertexID, ExecutionJobVertex> tasks, boolean errorIfNoCheckpoint, boolean allOrNothingState) throws Exception { synchronized (lock) { if (shutdown) { throw new IllegalStateException("CheckpointCoordinator is shut down"); } // Recover the checkpoints completedCheckpointStore.recover(); // restore from the latest checkpoint CompletedCheckpoint latest = completedCheckpointStore.getLatestCheckpoint(); if (latest == null) { if (errorIfNoCheckpoint) { throw new IllegalStateException("No completed checkpoint available"); } else { return false; } } long recoveryTimestamp = System.currentTimeMillis(); for (Map.Entry<JobVertexID, TaskState> taskGroupStateEntry : latest.getTaskStates().entrySet()) { TaskState taskState = taskGroupStateEntry.getValue(); ExecutionJobVertex executionJobVertex = tasks.get(taskGroupStateEntry.getKey()); if (executionJobVertex != null) { // check that we only restore the state if the parallelism has not been changed if (taskState.getParallelism() != executionJobVertex.getParallelism()) { throw new RuntimeException( "Cannot restore the latest checkpoint because " + "the parallelism changed. 
The operator" + executionJobVertex.getJobVertexId() + " has parallelism " + executionJobVertex.getParallelism() + " whereas the corresponding" + "state object has a parallelism of " + taskState.getParallelism()); } int counter = 0; List<Set<Integer>> keyGroupPartitions = createKeyGroupPartitions(numberKeyGroups, executionJobVertex.getParallelism()); for (int i = 0; i < executionJobVertex.getParallelism(); i++) { SubtaskState subtaskState = taskState.getState(i); SerializedValue<StateHandle<?>> state = null; if (subtaskState != null) { // count the number of executions for which we set a state counter++; state = subtaskState.getState(); } Map<Integer, SerializedValue<StateHandle<?>>> kvStateForTaskMap = taskState.getUnwrappedKvStates(keyGroupPartitions.get(i)); Execution currentExecutionAttempt = executionJobVertex.getTaskVertices()[i].getCurrentExecutionAttempt(); currentExecutionAttempt.setInitialState(state, kvStateForTaskMap, recoveryTimestamp); } if (allOrNothingState && counter > 0 && counter < executionJobVertex.getParallelism()) { throw new IllegalStateException( "The checkpoint contained state only for " + "a subset of tasks for vertex " + executionJobVertex); } } else { throw new IllegalStateException( "There is no execution job vertex for the job" + " vertex ID " + taskGroupStateEntry.getKey()); } } return true; } }
public void restoreLatestCheckpointedState( Map<JobVertexID, ExecutionJobVertex> tasks, boolean errorIfNoCheckpoint, boolean allOrNothingState) throws Exception { synchronized (lock) { if (shutdown) { throw new IllegalStateException("CheckpointCoordinator is shut down"); } // Recover the checkpoints completedCheckpointStore.recover(); // restore from the latest checkpoint CompletedCheckpoint latest = completedCheckpointStore.getLatestCheckpoint(); if (latest == null) { if (errorIfNoCheckpoint) { throw new IllegalStateException("No completed checkpoint available"); } else { return; } } long recoveryTimestamp = System.currentTimeMillis(); if (allOrNothingState) { Map<ExecutionJobVertex, Integer> stateCounts = new HashMap<ExecutionJobVertex, Integer>(); for (StateForTask state : latest.getStates()) { ExecutionJobVertex vertex = tasks.get(state.getOperatorId()); Execution exec = vertex.getTaskVertices()[state.getSubtask()].getCurrentExecutionAttempt(); exec.setInitialState(state.getState(), recoveryTimestamp); Integer count = stateCounts.get(vertex); if (count != null) { stateCounts.put(vertex, count + 1); } else { stateCounts.put(vertex, 1); } } // validate that either all task vertices have state, or none for (Map.Entry<ExecutionJobVertex, Integer> entry : stateCounts.entrySet()) { ExecutionJobVertex vertex = entry.getKey(); if (entry.getValue() != vertex.getParallelism()) { throw new IllegalStateException( "The checkpoint contained state only for a subset of tasks for vertex " + vertex); } } } else { for (StateForTask state : latest.getStates()) { ExecutionJobVertex vertex = tasks.get(state.getOperatorId()); Execution exec = vertex.getTaskVertices()[state.getSubtask()].getCurrentExecutionAttempt(); exec.setInitialState(state.getState(), recoveryTimestamp); } } } }