@Test
public void testDisposeSavepointSuccess() throws Exception {
    replaceStdOutAndStdErr();

    try {
        String savepointPath = "expectedSavepointPath";

        ActorGateway jobManager = mock(ActorGateway.class);

        Promise<Object> triggerResponse = new scala.concurrent.impl.Promise.DefaultPromise<>();

        when(jobManager.ask(
                Mockito.eq(new JobManagerMessages.DisposeSavepoint(savepointPath)),
                Mockito.any(FiniteDuration.class)))
            .thenReturn(triggerResponse.future());

        triggerResponse.success(JobManagerMessages.getDisposeSavepointSuccess());

        CliFrontend frontend = new MockCliFrontend(CliFrontendTestUtils.getConfigDir(), jobManager);

        String[] parameters = {"-d", savepointPath};

        int returnCode = frontend.savepoint(parameters);
        assertEquals(0, returnCode);

        verify(jobManager, times(1)).ask(
            Mockito.eq(new JobManagerMessages.DisposeSavepoint(savepointPath)),
            Mockito.any(FiniteDuration.class));

        String outMsg = buffer.toString();
        assertTrue(outMsg.contains(savepointPath));
        assertTrue(outMsg.contains("disposed"));
    } finally {
        restoreStdOutAndStdErr();
    }
}
/**
 * Tests that a job is properly canceled in the case of a leader change. However, this time only
 * the JMs are notified about the leader change and the TMs still believe the old leader to have
 * leadership.
 */
@Test
public void testStateCleanupAfterNewLeaderElection() throws Exception {
    UUID leaderSessionID = UUID.randomUUID();
    UUID newLeaderSessionID = UUID.randomUUID();

    cluster.grantLeadership(0, leaderSessionID);
    cluster.notifyRetrievalListeners(0, leaderSessionID);

    cluster.waitForTaskManagersToBeRegistered();

    // submit blocking job so that we can test job clean up
    cluster.submitJobDetached(job);

    ActorGateway jm = cluster.getLeaderGateway(timeout);

    Future<Object> wait =
        jm.ask(new WaitForAllVerticesToBeRunningOrFinished(job.getJobID()), timeout);

    Await.ready(wait, timeout);

    Future<Object> jobRemoval = jm.ask(new NotifyWhenJobRemoved(job.getJobID()), timeout);

    // only notify the JMs about the new leader JM(1)
    cluster.grantLeadership(1, newLeaderSessionID);

    // job should be removed anyway
    Await.ready(jobRemoval, timeout);
}
@Test
public void testTriggerSavepointFailure() throws Exception {
    replaceStdOutAndStdErr();

    try {
        JobID jobId = new JobID();

        ActorGateway jobManager = mock(ActorGateway.class);

        Promise<Object> triggerResponse = new scala.concurrent.impl.Promise.DefaultPromise<>();

        when(jobManager.ask(
                Mockito.eq(new JobManagerMessages.TriggerSavepoint(jobId)),
                Mockito.any(FiniteDuration.class)))
            .thenReturn(triggerResponse.future());

        Exception testException = new Exception("expectedTestException");

        triggerResponse.success(new JobManagerMessages.TriggerSavepointFailure(jobId, testException));

        CliFrontend frontend = new MockCliFrontend(CliFrontendTestUtils.getConfigDir(), jobManager);

        String[] parameters = {jobId.toString()};

        int returnCode = frontend.savepoint(parameters);
        assertTrue(returnCode != 0);

        verify(jobManager, times(1)).ask(
            Mockito.eq(new JobManagerMessages.TriggerSavepoint(jobId)),
            Mockito.any(FiniteDuration.class));

        assertTrue(buffer.toString().contains("expectedTestException"));
    } finally {
        restoreStdOutAndStdErr();
    }
}
/**
 * Tests that a job is properly canceled in the event of a leader change. However, this time only
 * the TMs are notified about the changing leader. This should be enough to cancel the currently
 * running job, though.
 */
@Test
public void testStateCleanupAfterListenerNotification() throws Exception {
    UUID leaderSessionID = UUID.randomUUID();
    UUID newLeaderSessionID = UUID.randomUUID();

    cluster.grantLeadership(0, leaderSessionID);
    cluster.notifyRetrievalListeners(0, leaderSessionID);

    cluster.waitForTaskManagersToBeRegistered();

    // submit blocking job
    cluster.submitJobDetached(job);

    ActorGateway jm = cluster.getLeaderGateway(timeout);

    Future<Object> wait =
        jm.ask(new WaitForAllVerticesToBeRunningOrFinished(job.getJobID()), timeout);

    Await.ready(wait, timeout);

    Future<Object> jobRemoval = jm.ask(new NotifyWhenJobRemoved(job.getJobID()), timeout);

    // notify listeners (TMs) about the leader change
    cluster.notifyRetrievalListeners(1, newLeaderSessionID);

    Await.ready(jobRemoval, timeout);
}
/**
 * This method sends a CancelTask message to the instance of the assigned slot.
 *
 * <p>The sending is tried up to NUM_CANCEL_CALL_TRIES times.
 */
private void sendCancelRpcCall() {
    final SimpleSlot slot = this.assignedResource;

    if (slot != null) {
        final ActorGateway gateway = slot.getInstance().getActorGateway();

        Future<Object> cancelResult =
            gateway.retry(new CancelTask(attemptId), NUM_CANCEL_CALL_TRIES, timeout, executionContext);

        cancelResult.onComplete(
            new OnComplete<Object>() {
                @Override
                public void onComplete(Throwable failure, Object success) throws Throwable {
                    if (failure != null) {
                        fail(new Exception("Task could not be canceled.", failure));
                    } else {
                        TaskOperationResult result = (TaskOperationResult) success;
                        if (!result.success()) {
                            LOG.debug(
                                "Cancel task call did not find task. Probably akka message call race.");
                        }
                    }
                }
            },
            executionContext);
    }
}
@Override
public void run() {
    try {
        int returnValue;
        switch (type) {
            case YARN_SESSION:
                yCli = new FlinkYarnSessionCli("", "", false);
                returnValue = yCli.run(args);
                break;
            case CLI_FRONTEND:
                TestingCLI cli;
                try {
                    cli = new TestingCLI();
                    returnValue = cli.parseParameters(args);
                } catch (Exception e) {
                    throw new RuntimeException(
                        "Failed to execute the following args with CliFrontend: "
                            + Arrays.toString(args),
                        e);
                }

                final ClusterClient client = cli.getClusterClient();
                try {
                    // check if the JobManager is still alive after running the job
                    final FiniteDuration finiteDuration = new FiniteDuration(10, TimeUnit.SECONDS);
                    ActorGateway jobManagerGateway = client.getJobManagerGateway();
                    Await.ready(
                        jobManagerGateway.ask(new Identify(true), finiteDuration), finiteDuration);
                } catch (Exception e) {
                    throw new RuntimeException(
                        "It seems like the JobManager died although it should still be alive", e);
                }

                // verify we would have shut down anyway, and then shut down
                Mockito.verify(cli.getSpiedClusterClient()).shutdown();
                client.shutdown();
                break;
            default:
                throw new RuntimeException("Unknown type " + type);
        }

        if (returnValue != this.expectedReturnValue) {
            Assert.fail(
                "The YARN session returned with unexpected value=" + returnValue
                    + " expected=" + expectedReturnValue);
        }
    } catch (Throwable t) {
        LOG.info("Runner stopped with exception", t);
        // save error
        this.runnerError = t;
    }
}
private void sendFailIntermediateResultPartitionsRpcCall() {
    final SimpleSlot slot = this.assignedResource;

    if (slot != null) {
        final Instance instance = slot.getInstance();

        if (instance.isAlive()) {
            final ActorGateway gateway = instance.getActorGateway();

            // TODO For some tests this could be a problem when querying too early if all
            // resources were released
            gateway.tell(new FailIntermediateResultPartitions(attemptId));
        }
    }
}
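Unlike the other RPC helpers here, which use ask, this method uses tell: a fire-and-forget send with no response future, so no completion callback can be attached. Below is a minimal sketch of the contrast, assuming for illustration that the target actor would reply to an ask (the real FailIntermediateResultPartitions handler may not send a response, which is presumably why tell is used):

// tell: fire-and-forget; delivery and handling failures are not observable here
gateway.tell(new FailIntermediateResultPartitions(attemptId));

// ask: returns a Future<Object> that completes with the reply or with a failure
Future<Object> reply = gateway.ask(new FailIntermediateResultPartitions(attemptId), timeout);
reply.onFailure(
    new OnFailure() {
        @Override
        public void onFailure(Throwable failure) throws Throwable {
            LOG.warn("FailIntermediateResultPartitions was not acknowledged.", failure);
        }
    },
    executionContext);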
/**
 * Verifies a correct error message when vertices with master initialization (input formats /
 * output formats) fail.
 */
@Test
public void testFailureWhenInitializeOnMasterFails() {
    try {
        // create a simple job graph
        JobVertex jobVertex =
            new JobVertex("Vertex that fails in initializeOnMaster") {
                @Override
                public void initializeOnMaster(ClassLoader loader) throws Exception {
                    throw new RuntimeException("test exception");
                }
            };

        jobVertex.setInvokableClass(Tasks.NoOpInvokable.class);
        JobGraph jg = new JobGraph("test job", jobVertex);

        // submit the job
        Future<Object> submitFuture =
            jmGateway.ask(
                new JobManagerMessages.SubmitJob(jg, ListeningBehaviour.EXECUTION_RESULT),
                timeout);
        try {
            Await.result(submitFuture, timeout);
        } catch (JobExecutionException e) {
            // that is what we expect
            // test that the exception nesting is not too deep
            assertTrue(e.getCause() instanceof RuntimeException);
        } catch (Exception e) {
            fail("Wrong exception type");
        }
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
/**
 * Tests that a job is properly canceled in the case of a leader change. In such an event all
 * TaskManagers have to disconnect from the previous leader and connect to the newly elected
 * leader.
 */
@Test
public void testStateCleanupAfterNewLeaderElectionAndListenerNotification() throws Exception {
    UUID leaderSessionID1 = UUID.randomUUID();
    UUID leaderSessionID2 = UUID.randomUUID();

    // first make JM(0) the leader
    cluster.grantLeadership(0, leaderSessionID1);
    // notify all listeners
    cluster.notifyRetrievalListeners(0, leaderSessionID1);

    cluster.waitForTaskManagersToBeRegistered();

    // submit blocking job so that it is not finished when we cancel it
    cluster.submitJobDetached(job);

    ActorGateway jm = cluster.getLeaderGateway(timeout);

    Future<Object> wait =
        jm.ask(new WaitForAllVerticesToBeRunningOrFinished(job.getJobID()), timeout);

    Await.ready(wait, timeout);

    Future<Object> jobRemoval = jm.ask(new NotifyWhenJobRemoved(job.getJobID()), timeout);

    // make JM(1) the new leader
    cluster.grantLeadership(1, leaderSessionID2);
    // notify all listeners about the event
    cluster.notifyRetrievalListeners(1, leaderSessionID2);

    Await.ready(jobRemoval, timeout);

    cluster.waitForTaskManagersToBeRegistered();

    ActorGateway jm2 = cluster.getLeaderGateway(timeout);

    Future<Object> futureNumberSlots =
        jm2.ask(JobManagerMessages.getRequestTotalNumberOfSlots(), timeout);

    // check that all TMs have registered at the new leader
    int numberSlots = (Integer) Await.result(futureNumberSlots, timeout);

    assertEquals(parallelism, numberSlots);

    // resubmit the job, now non-blocking; it should complete successfully
    Tasks.BlockingOnceReceiver$.MODULE$.blocking_$eq(false);
    cluster.submitJobAndWait(job, false, timeout);
}
/**
 * Tests that the same JobManager can be reelected as the leader. Even though the same JM is
 * elected as the next leader, all currently running jobs should be canceled properly and all TMs
 * should disconnect from the leader and then reconnect to it.
 */
@Test
public void testReelectionOfSameJobManager() throws Exception {
    UUID leaderSessionID = UUID.randomUUID();
    UUID newLeaderSessionID = UUID.randomUUID();

    FiniteDuration shortTimeout = new FiniteDuration(20, TimeUnit.SECONDS);

    cluster.grantLeadership(0, leaderSessionID);
    cluster.notifyRetrievalListeners(0, leaderSessionID);

    cluster.waitForTaskManagersToBeRegistered();

    // submit blocking job
    cluster.submitJobDetached(job);

    ActorGateway jm = cluster.getLeaderGateway(timeout);

    Future<Object> wait =
        jm.ask(new WaitForAllVerticesToBeRunningOrFinished(job.getJobID()), timeout);

    Await.ready(wait, timeout);

    Future<Object> jobRemoval = jm.ask(new NotifyWhenJobRemoved(job.getJobID()), timeout);

    // make JM(0) the leader again --> this first implies a leadership revocation
    cluster.grantLeadership(0, newLeaderSessionID);

    Await.ready(jobRemoval, timeout);

    // The TMs should not be able to reconnect since they don't know the current leader
    // session ID
    try {
        cluster.waitForTaskManagersToBeRegistered(shortTimeout);
        fail("TaskManager should not be able to register at JobManager.");
    } catch (TimeoutException e) {
        // expected exception since the TMs still have the old leader session ID
    }

    // notify the TMs about the new (old) leader
    cluster.notifyRetrievalListeners(0, newLeaderSessionID);

    cluster.waitForTaskManagersToBeRegistered();

    // resubmit the job, now non-blocking; it should complete successfully
    Tasks.BlockingOnceReceiver$.MODULE$.blocking_$eq(false);
    cluster.submitJobAndWait(job, false, timeout);
}
@Test
public void testFailureWhenJarBlobsMissing() {
    try {
        // create a simple job graph
        JobVertex jobVertex = new JobVertex("Test Vertex");
        jobVertex.setInvokableClass(Tasks.NoOpInvokable.class);
        JobGraph jg = new JobGraph("test job", jobVertex);

        // request the blob port from the job manager
        Future<Object> future =
            jmGateway.ask(JobManagerMessages.getRequestBlobManagerPort(), timeout);
        int blobPort = (Integer) Await.result(future, timeout);

        // upload two dummy blobs and add their keys to the job graph as dependencies
        BlobKey key1, key2;
        BlobClient bc = new BlobClient(new InetSocketAddress("localhost", blobPort));
        try {
            key1 = bc.put(new byte[10]);
            key2 = bc.put(new byte[10]);

            // delete one of the blobs to make sure that the startup fails
            bc.delete(key2);
        } finally {
            bc.close();
        }

        jg.addBlob(key1);
        jg.addBlob(key2);

        // submit the job
        Future<Object> submitFuture =
            jmGateway.ask(
                new JobManagerMessages.SubmitJob(jg, ListeningBehaviour.EXECUTION_RESULT),
                timeout);
        try {
            Await.result(submitFuture, timeout);
        } catch (JobExecutionException e) {
            // that is what we expect
            assertTrue(e.getCause() instanceof IOException);
        } catch (Exception e) {
            fail("Wrong exception type");
        }
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
/**
 * Sends an UpdatePartitionInfo message to the instance of the consumerSlot.
 *
 * @param consumerSlot Slot to whose instance the message will be sent
 * @param updatePartitionInfo UpdatePartitionInfo message
 */
private void sendUpdatePartitionInfoRpcCall(
        final SimpleSlot consumerSlot, final UpdatePartitionInfo updatePartitionInfo) {
    if (consumerSlot != null) {
        final Instance instance = consumerSlot.getInstance();
        final ActorGateway gateway = instance.getActorGateway();

        Future<Object> futureUpdate = gateway.ask(updatePartitionInfo, timeout);

        futureUpdate.onFailure(
            new OnFailure() {
                @Override
                public void onFailure(Throwable failure) throws Throwable {
                    fail(
                        new IllegalStateException(
                            "Update task on instance " + instance + " failed due to:", failure));
                }
            },
            executionContext);
    }
}
/**
 * Shuts down the checkpoint coordinator.
 *
 * <p>After this method has been called, the coordinator does not accept any further messages and
 * cannot trigger any further checkpoints.
 */
public void shutdown() throws Exception {
    synchronized (lock) {
        try {
            if (!shutdown) {
                shutdown = true;
                LOG.info("Stopping checkpoint coordinator for job " + job);

                periodicScheduling = false;
                triggerRequestQueued = false;

                // shut down the thread that handles the timeouts and pending triggers
                timer.cancel();

                // make sure that the actor does not linger
                if (jobStatusListener != null) {
                    jobStatusListener.tell(PoisonPill.getInstance());
                    jobStatusListener = null;
                }

                checkpointIdCounter.stop();

                // clear and discard all pending checkpoints
                for (PendingCheckpoint pending : pendingCheckpoints.values()) {
                    pending.discard(userClassLoader);
                }
                pendingCheckpoints.clear();

                // clean and discard all successful checkpoints
                completedCheckpointStore.discardAllCheckpoints();

                onShutdown();
            }
        } finally {
            // Remove the shutdown hook to prevent resource leaks, unless this is invoked by the
            // shutdown hook itself.
            if (shutdownHook != null && shutdownHook != Thread.currentThread()) {
                try {
                    Runtime.getRuntime().removeShutdownHook(shutdownHook);
                } catch (IllegalStateException ignored) {
                    // race, the JVM is already shutting down, we can safely ignore this
                } catch (Throwable t) {
                    LOG.warn("Error unregistering checkpoint coordinator shutdown hook.", t);
                }
            }
        }
    }
}
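The finally block above unregisters a JVM shutdown hook and guards against being called from that very hook. A minimal sketch of how such a hook could be registered, assuming it happens in the coordinator's constructor (the registration code is not part of this excerpt, so the placement and thread name are assumptions):

// Hypothetical registration: run shutdown() on JVM exit. Keeping a reference to the
// hook thread is what lets shutdown() detect whether it is being invoked *by* the hook
// (shutdownHook != Thread.currentThread()) and skip the removeShutdownHook call then.
this.shutdownHook =
    new Thread(
        new Runnable() {
            @Override
            public void run() {
                try {
                    shutdown();
                } catch (Throwable t) {
                    LOG.error("Error shutting down checkpoint coordinator via shutdown hook.", t);
                }
            }
        },
        "Checkpoint coordinator shutdown hook");
Runtime.getRuntime().addShutdownHook(shutdownHook);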
public ExecutionGraph getExecutionGraph(JobID jid) {
    ExecutionGraph cached = cache.get(jid);
    if (cached != null) {
        return cached;
    }

    try {
        Future<Object> future = source.ask(new JobManagerMessages.RequestJob(jid), timeout);
        Object result = Await.result(future, timeout);

        if (result instanceof JobManagerMessages.JobNotFound) {
            return null;
        } else if (result instanceof JobManagerMessages.JobFound) {
            ExecutionGraph eg = ((JobManagerMessages.JobFound) result).executionGraph();
            cache.put(jid, eg);
            return eg;
        } else {
            throw new RuntimeException("Unknown response from JobManager / Archive: " + result);
        }
    } catch (Exception e) {
        throw new RuntimeException("Error requesting execution graph", e);
    }
}
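A brief usage sketch of this cache, where graphCache is a hypothetical holder of the method above; callers must handle both the null result (job unknown to JobManager and archive) and the RuntimeException (request failed):

// 'graphCache' and 'jobId' are illustrative names, not part of the original excerpt.
ExecutionGraph graph = graphCache.getExecutionGraph(jobId);
if (graph == null) {
    // neither the JobManager nor the archive knows this job
    LOG.warn("No execution graph found for job {}.", jobId);
} else {
    // subsequent lookups for the same JobID are served from the cache
    LOG.info("Job {} is in state {}.", jobId, graph.getState());
}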
/**
 * Tests that a job cannot be restarted from a savepoint with a different parallelism if the
 * rescaled operator has non-partitioned state.
 */
@Test
public void testSavepointRescalingNonPartitionedStateCausesException() throws Exception {
    final int parallelism = numSlots / 2;
    final int parallelism2 = numSlots;
    final int maxParallelism = 13;

    FiniteDuration timeout = new FiniteDuration(3, TimeUnit.MINUTES);
    Deadline deadline = timeout.fromNow();

    JobID jobID = null;
    ActorGateway jobManager = null;

    try {
        jobManager = cluster.getLeaderGateway(deadline.timeLeft());

        JobGraph jobGraph = createJobGraphWithOperatorState(parallelism, maxParallelism, false);

        jobID = jobGraph.getJobID();

        cluster.submitJobDetached(jobGraph);

        Object savepointResponse = null;

        // wait until the operator is started
        StateSourceBase.workStartedLatch.await();

        while (deadline.hasTimeLeft()) {
            Future<Object> savepointPathFuture =
                jobManager.ask(new JobManagerMessages.TriggerSavepoint(jobID), deadline.timeLeft());
            FiniteDuration waitingTime = new FiniteDuration(10, TimeUnit.SECONDS);
            savepointResponse = Await.result(savepointPathFuture, waitingTime);

            if (savepointResponse instanceof JobManagerMessages.TriggerSavepointSuccess) {
                break;
            }
        }

        assertTrue(savepointResponse instanceof JobManagerMessages.TriggerSavepointSuccess);

        final String savepointPath =
            ((JobManagerMessages.TriggerSavepointSuccess) savepointResponse).savepointPath();

        Future<Object> jobRemovedFuture =
            jobManager.ask(
                new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), deadline.timeLeft());

        Future<Object> cancellationResponseFuture =
            jobManager.ask(new JobManagerMessages.CancelJob(jobID), deadline.timeLeft());

        Object cancellationResponse = Await.result(cancellationResponseFuture, deadline.timeLeft());

        assertTrue(cancellationResponse instanceof JobManagerMessages.CancellationSuccess);

        Await.ready(jobRemovedFuture, deadline.timeLeft());

        // job successfully removed
        jobID = null;

        JobGraph scaledJobGraph = createJobGraphWithOperatorState(parallelism2, maxParallelism, false);

        scaledJobGraph.setSavepointPath(savepointPath);

        jobID = scaledJobGraph.getJobID();

        cluster.submitJobAndWait(scaledJobGraph, false);

        jobID = null;
    } catch (JobExecutionException exception) {
        if (exception.getCause() instanceof SuppressRestartsException) {
            SuppressRestartsException suppressRestartsException =
                (SuppressRestartsException) exception.getCause();

            if (suppressRestartsException.getCause() instanceof IllegalStateException) {
                // we expect an IllegalStateException wrapped in a SuppressRestartsException
                // wrapped in a JobExecutionException, because the job containing
                // non-partitioned state is being rescaled
            } else {
                throw exception;
            }
        } else {
            throw exception;
        }
    } finally {
        // clear any leftovers from a possibly failed job
        if (jobID != null && jobManager != null) {
            Future<Object> jobRemovedFuture =
                jobManager.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), timeout);

            try {
                Await.ready(jobRemovedFuture, timeout);
            } catch (TimeoutException | InterruptedException ie) {
                fail("Failed while cleaning up the cluster.");
            }
        }
    }
}
/**
 * Tests rescaling of partitioned operator state. More specifically, we test the mechanism with
 * {@link ListCheckpointed} as it subsumes {@link
 * org.apache.flink.streaming.api.checkpoint.CheckpointedFunction}.
 */
public void testSavepointRescalingPartitionedOperatorState(boolean scaleOut) throws Exception {
    final int parallelism = scaleOut ? numSlots : numSlots / 2;
    final int parallelism2 = scaleOut ? numSlots / 2 : numSlots;
    final int maxParallelism = 13;

    FiniteDuration timeout = new FiniteDuration(3, TimeUnit.MINUTES);
    Deadline deadline = timeout.fromNow();

    JobID jobID = null;
    ActorGateway jobManager = null;

    int counterSize = Math.max(parallelism, parallelism2);

    PartitionedStateSource.CHECK_CORRECT_SNAPSHOT = new int[counterSize];
    PartitionedStateSource.CHECK_CORRECT_RESTORE = new int[counterSize];

    try {
        jobManager = cluster.getLeaderGateway(deadline.timeLeft());

        JobGraph jobGraph = createJobGraphWithOperatorState(parallelism, maxParallelism, true);

        jobID = jobGraph.getJobID();

        cluster.submitJobDetached(jobGraph);

        Object savepointResponse = null;

        // wait until the operator is started
        StateSourceBase.workStartedLatch.await();

        while (deadline.hasTimeLeft()) {
            Future<Object> savepointPathFuture =
                jobManager.ask(new JobManagerMessages.TriggerSavepoint(jobID), deadline.timeLeft());
            FiniteDuration waitingTime = new FiniteDuration(10, TimeUnit.SECONDS);
            savepointResponse = Await.result(savepointPathFuture, waitingTime);

            if (savepointResponse instanceof JobManagerMessages.TriggerSavepointSuccess) {
                break;
            }
        }

        assertTrue(savepointResponse instanceof JobManagerMessages.TriggerSavepointSuccess);

        final String savepointPath =
            ((JobManagerMessages.TriggerSavepointSuccess) savepointResponse).savepointPath();

        Future<Object> jobRemovedFuture =
            jobManager.ask(
                new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), deadline.timeLeft());

        Future<Object> cancellationResponseFuture =
            jobManager.ask(new JobManagerMessages.CancelJob(jobID), deadline.timeLeft());

        Object cancellationResponse = Await.result(cancellationResponseFuture, deadline.timeLeft());

        assertTrue(cancellationResponse instanceof JobManagerMessages.CancellationSuccess);

        Await.ready(jobRemovedFuture, deadline.timeLeft());

        // job successfully removed
        jobID = null;

        JobGraph scaledJobGraph = createJobGraphWithOperatorState(parallelism2, maxParallelism, true);

        scaledJobGraph.setSavepointPath(savepointPath);

        jobID = scaledJobGraph.getJobID();

        cluster.submitJobAndWait(scaledJobGraph, false);

        int sumExp = 0;
        int sumAct = 0;

        for (int c : PartitionedStateSource.CHECK_CORRECT_SNAPSHOT) {
            sumExp += c;
        }

        for (int c : PartitionedStateSource.CHECK_CORRECT_RESTORE) {
            sumAct += c;
        }

        assertEquals(sumExp, sumAct);

        jobID = null;
    } finally {
        // clear any leftovers from a possibly failed job
        if (jobID != null && jobManager != null) {
            Future<Object> jobRemovedFuture =
                jobManager.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), timeout);

            try {
                Await.ready(jobRemovedFuture, timeout);
            } catch (TimeoutException | InterruptedException ie) {
                fail("Failed while cleaning up the cluster.");
            }
        }
    }
}
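Both this method and testSavepointRescalingNonPartitionedStateCausesException above repeat the same trigger-and-poll loop for the savepoint. A sketch of how that loop could be factored into a shared helper; the helper does not exist in this excerpt, so its name and placement are hypothetical:

// Hypothetical helper: repeatedly ask the JobManager to trigger a savepoint until it
// succeeds or the deadline expires; returns the savepoint path on success.
private static String triggerSavepointWithRetries(
        ActorGateway jobManager, JobID jobID, Deadline deadline) throws Exception {
    while (deadline.hasTimeLeft()) {
        Future<Object> future =
            jobManager.ask(new JobManagerMessages.TriggerSavepoint(jobID), deadline.timeLeft());
        Object response = Await.result(future, new FiniteDuration(10, TimeUnit.SECONDS));
        if (response instanceof JobManagerMessages.TriggerSavepointSuccess) {
            return ((JobManagerMessages.TriggerSavepointSuccess) response).savepointPath();
        }
    }
    throw new IllegalStateException("Could not trigger a savepoint before the deadline expired.");
}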
@Override
public String handleJsonRequest(
        Map<String, String> pathParams, Map<String, String> queryParams, ActorGateway jobManager)
        throws Exception {
    try {
        if (jobManager != null) {
            // Whether the metrics of a single task manager or of all task managers are
            // requested, we return them in an array. This avoids unnecessary code complexity.
            // If only one task manager is requested, we fetch only that instance.
            final List<Instance> instances = new ArrayList<>();
            if (pathParams.containsKey(TASK_MANAGER_ID_KEY)) {
                try {
                    InstanceID instanceID =
                        new InstanceID(
                            StringUtils.hexStringToByte(pathParams.get(TASK_MANAGER_ID_KEY)));
                    Future<Object> future =
                        jobManager.ask(
                            new JobManagerMessages.RequestTaskManagerInstance(instanceID), timeout);
                    TaskManagerInstance instance =
                        (TaskManagerInstance) Await.result(future, timeout);
                    if (instance.instance().nonEmpty()) {
                        instances.add(instance.instance().get());
                    }
                } catch (IllegalArgumentException e) {
                    // this means the id string was invalid. Keep the list empty.
                }
            } else {
                Future<Object> future =
                    jobManager.ask(JobManagerMessages.getRequestRegisteredTaskManagers(), timeout);
                RegisteredTaskManagers taskManagers =
                    (RegisteredTaskManagers) Await.result(future, timeout);
                instances.addAll(taskManagers.asJavaCollection());
            }

            StringWriter writer = new StringWriter();
            JsonGenerator gen = JsonFactory.jacksonFactory.createGenerator(writer);

            gen.writeStartObject();
            gen.writeArrayFieldStart("taskmanagers");

            for (Instance instance : instances) {
                gen.writeStartObject();
                gen.writeStringField("id", instance.getId().toString());
                gen.writeStringField("path", instance.getActorGateway().path());
                gen.writeNumberField("dataPort", instance.getTaskManagerLocation().dataPort());
                gen.writeNumberField("timeSinceLastHeartbeat", instance.getLastHeartBeat());
                gen.writeNumberField("slotsNumber", instance.getTotalNumberOfSlots());
                gen.writeNumberField("freeSlots", instance.getNumberOfAvailableSlots());
                gen.writeNumberField("cpuCores", instance.getResources().getNumberOfCPUCores());
                gen.writeNumberField(
                    "physicalMemory", instance.getResources().getSizeOfPhysicalMemory());
                gen.writeNumberField("freeMemory", instance.getResources().getSizeOfJvmHeap());
                gen.writeNumberField(
                    "managedMemory", instance.getResources().getSizeOfManagedMemory());

                // only send metrics when a single task manager is requested
                if (pathParams.containsKey(TASK_MANAGER_ID_KEY)) {
                    byte[] report = instance.getLastMetricsReport();
                    if (report != null) {
                        gen.writeFieldName("metrics");
                        gen.writeRawValue(new String(report, "utf-8"));
                    }
                }

                gen.writeEndObject();
            }

            gen.writeEndArray();
            gen.writeEndObject();

            gen.close();
            return writer.toString();
        } else {
            throw new Exception("No connection to the leading JobManager.");
        }
    } catch (Exception e) {
        throw new RuntimeException(
            "Failed to fetch list of all task managers: " + e.getMessage(), e);
    }
}
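For reference, the generator calls above produce a response of roughly the following shape. The field names follow directly from the code; the values are invented for illustration, and the "metrics" field appears only when a single task manager is addressed:

{
  "taskmanagers": [
    {
      "id": "<instance id>",
      "path": "<actor path>",
      "dataPort": 6121,
      "timeSinceLastHeartbeat": 1469024675,
      "slotsNumber": 4,
      "freeSlots": 2,
      "cpuCores": 8,
      "physicalMemory": 16000000000,
      "freeMemory": 4000000000,
      "managedMemory": 2000000000,
      "metrics": { ... }
    }
  ]
}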
/**
 * Tests that a job with non-partitioned state can be restarted from a savepoint with a different
 * parallelism if the operators with non-partitioned state are not rescaled.
 */
@Test
public void testSavepointRescalingWithKeyedAndNonPartitionedState() throws Exception {
    int numberKeys = 42;
    int numberElements = 1000;
    int numberElements2 = 500;
    int parallelism = numSlots / 2;
    int parallelism2 = numSlots;
    int maxParallelism = 13;

    FiniteDuration timeout = new FiniteDuration(3, TimeUnit.MINUTES);
    Deadline deadline = timeout.fromNow();

    ActorGateway jobManager = null;
    JobID jobID = null;

    try {
        jobManager = cluster.getLeaderGateway(deadline.timeLeft());

        JobGraph jobGraph =
            createJobGraphWithKeyedAndNonPartitionedOperatorState(
                parallelism, maxParallelism, parallelism, numberKeys, numberElements, false, 100);

        jobID = jobGraph.getJobID();

        cluster.submitJobDetached(jobGraph);

        // wait until the sources have emitted numberElements for each key and completed a
        // checkpoint
        SubtaskIndexFlatMapper.workCompletedLatch.await(
            deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

        // verify the current state
        Set<Tuple2<Integer, Integer>> actualResult = CollectionSink.getElementsSet();

        Set<Tuple2<Integer, Integer>> expectedResult = new HashSet<>();

        for (int key = 0; key < numberKeys; key++) {
            int keyGroupIndex = KeyGroupRangeAssignment.assignToKeyGroup(key, maxParallelism);
            expectedResult.add(
                Tuple2.of(
                    KeyGroupRangeAssignment.computeOperatorIndexForKeyGroup(
                        maxParallelism, parallelism, keyGroupIndex),
                    numberElements * key));
        }

        assertEquals(expectedResult, actualResult);

        // clear the CollectionSink set for the restarted job
        CollectionSink.clearElementsSet();

        Future<Object> savepointPathFuture =
            jobManager.ask(new JobManagerMessages.TriggerSavepoint(jobID), deadline.timeLeft());

        final String savepointPath =
            ((JobManagerMessages.TriggerSavepointSuccess)
                    Await.result(savepointPathFuture, deadline.timeLeft()))
                .savepointPath();

        Future<Object> jobRemovedFuture =
            jobManager.ask(
                new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), deadline.timeLeft());

        Future<Object> cancellationResponseFuture =
            jobManager.ask(new JobManagerMessages.CancelJob(jobID), deadline.timeLeft());

        Object cancellationResponse = Await.result(cancellationResponseFuture, deadline.timeLeft());

        assertTrue(cancellationResponse instanceof JobManagerMessages.CancellationSuccess);

        Await.ready(jobRemovedFuture, deadline.timeLeft());

        jobID = null;

        JobGraph scaledJobGraph =
            createJobGraphWithKeyedAndNonPartitionedOperatorState(
                parallelism2,
                maxParallelism,
                parallelism,
                numberKeys,
                numberElements + numberElements2,
                true,
                100);

        scaledJobGraph.setSavepointPath(savepointPath);

        jobID = scaledJobGraph.getJobID();

        cluster.submitJobAndWait(scaledJobGraph, false);

        jobID = null;

        Set<Tuple2<Integer, Integer>> actualResult2 = CollectionSink.getElementsSet();

        Set<Tuple2<Integer, Integer>> expectedResult2 = new HashSet<>();

        for (int key = 0; key < numberKeys; key++) {
            int keyGroupIndex = KeyGroupRangeAssignment.assignToKeyGroup(key, maxParallelism);
            expectedResult2.add(
                Tuple2.of(
                    KeyGroupRangeAssignment.computeOperatorIndexForKeyGroup(
                        maxParallelism, parallelism2, keyGroupIndex),
                    key * (numberElements + numberElements2)));
        }

        assertEquals(expectedResult2, actualResult2);
    } finally {
        // clear the CollectionSink set for the restarted job
        CollectionSink.clearElementsSet();

        // clear any leftovers from a possibly failed job
        if (jobID != null && jobManager != null) {
            Future<Object> jobRemovedFuture =
                jobManager.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), timeout);

            try {
                Await.ready(jobRemovedFuture, timeout);
            } catch (TimeoutException | InterruptedException ie) {
                fail("Failed while cleaning up the cluster.");
            }
        }
    }
}
public void deployToSlot(final SimpleSlot slot) throws JobException {
    // sanity checks
    if (slot == null) {
        throw new NullPointerException();
    }
    if (!slot.isAlive()) {
        throw new JobException("Target slot for deployment is not alive.");
    }

    // make sure exactly one deployment call happens from the correct state
    // note: the transition from CREATED to DEPLOYING is for testing purposes only
    ExecutionState previous = this.state;
    if (previous == SCHEDULED || previous == CREATED) {
        if (!transitionState(previous, DEPLOYING)) {
            // race condition, someone else beat us to the deploying call.
            // this should actually not happen and indicates a race somewhere else
            throw new IllegalStateException("Cannot deploy task: Concurrent deployment call race.");
        }
    } else {
        // vertex may have been cancelled, or it was already scheduled
        throw new IllegalStateException(
            "The vertex must be in CREATED or SCHEDULED state to be deployed. Found state "
                + previous);
    }

    try {
        // good, we are allowed to deploy
        if (!slot.setExecutedVertex(this)) {
            throw new JobException("Could not assign the ExecutionVertex to the slot " + slot);
        }
        this.assignedResource = slot;
        this.assignedResourceLocation = slot.getInstance().getInstanceConnectionInfo();

        // race double check, did we fail/cancel and do we need to release the slot?
        if (this.state != DEPLOYING) {
            slot.releaseSlot();
            return;
        }

        if (LOG.isInfoEnabled()) {
            LOG.info(
                String.format(
                    "Deploying %s (attempt #%d) to %s",
                    vertex.getSimpleName(),
                    attemptNumber,
                    slot.getInstance().getInstanceConnectionInfo().getHostname()));
        }

        final TaskDeploymentDescriptor deployment =
            vertex.createDeploymentDescriptor(
                attemptId, slot, operatorState, recoveryTimestamp, attemptNumber);

        // register this execution at the execution graph, to receive callbacks
        vertex.getExecutionGraph().registerExecution(this);

        final Instance instance = slot.getInstance();
        final ActorGateway gateway = instance.getActorGateway();

        final Future<Object> deployAction = gateway.ask(new SubmitTask(deployment), timeout);

        deployAction.onComplete(
            new OnComplete<Object>() {
                @Override
                public void onComplete(Throwable failure, Object success) throws Throwable {
                    if (failure != null) {
                        if (failure instanceof TimeoutException) {
                            String taskname =
                                deployment.getTaskInfo().getTaskNameWithSubtasks()
                                    + " (" + attemptId + ')';

                            markFailed(
                                new Exception(
                                    "Cannot deploy task " + taskname + " - TaskManager ("
                                        + instance + ") not responding after a timeout of "
                                        + timeout,
                                    failure));
                        } else {
                            markFailed(failure);
                        }
                    } else {
                        if (!(success.equals(Messages.getAcknowledge()))) {
                            markFailed(
                                new Exception(
                                    "Failed to deploy the task to slot " + slot
                                        + ": Response was not of type Acknowledge"));
                        }
                    }
                }
            },
            executionContext);
    } catch (Throwable t) {
        markFailed(t);
        ExceptionUtils.rethrow(t);
    }
}