/**
 * Tests rescaling of partitioned operator state. More specifically, we test the mechanism with
 * {@link ListCheckpointed} as it subsumes
 * {@link org.apache.flink.streaming.api.checkpoint.CheckpointedFunction}.
 */
public void testSavepointRescalingPartitionedOperatorState(boolean scaleOut) throws Exception {
    final int parallelism = scaleOut ? numSlots : numSlots / 2;
    final int parallelism2 = scaleOut ? numSlots / 2 : numSlots;
    final int maxParallelism = 13;

    FiniteDuration timeout = new FiniteDuration(3, TimeUnit.MINUTES);
    Deadline deadline = timeout.fromNow();

    JobID jobID = null;
    ActorGateway jobManager = null;

    int counterSize = Math.max(parallelism, parallelism2);

    PartitionedStateSource.CHECK_CORRECT_SNAPSHOT = new int[counterSize];
    PartitionedStateSource.CHECK_CORRECT_RESTORE = new int[counterSize];

    try {
        jobManager = cluster.getLeaderGateway(deadline.timeLeft());

        JobGraph jobGraph = createJobGraphWithOperatorState(parallelism, maxParallelism, true);

        jobID = jobGraph.getJobID();

        cluster.submitJobDetached(jobGraph);

        Object savepointResponse = null;

        // wait until the operator is started
        StateSourceBase.workStartedLatch.await();

        while (deadline.hasTimeLeft()) {
            Future<Object> savepointPathFuture =
                    jobManager.ask(new JobManagerMessages.TriggerSavepoint(jobID), deadline.timeLeft());
            FiniteDuration waitingTime = new FiniteDuration(10, TimeUnit.SECONDS);
            savepointResponse = Await.result(savepointPathFuture, waitingTime);

            if (savepointResponse instanceof JobManagerMessages.TriggerSavepointSuccess) {
                break;
            }
        }

        assertTrue(savepointResponse instanceof JobManagerMessages.TriggerSavepointSuccess);

        final String savepointPath =
                ((JobManagerMessages.TriggerSavepointSuccess) savepointResponse).savepointPath();

        Future<Object> jobRemovedFuture =
                jobManager.ask(
                        new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), deadline.timeLeft());

        Future<Object> cancellationResponseFuture =
                jobManager.ask(new JobManagerMessages.CancelJob(jobID), deadline.timeLeft());

        Object cancellationResponse = Await.result(cancellationResponseFuture, deadline.timeLeft());

        assertTrue(cancellationResponse instanceof JobManagerMessages.CancellationSuccess);

        Await.ready(jobRemovedFuture, deadline.timeLeft());

        // job successfully removed
        jobID = null;

        // resubmit the job with the new parallelism, restoring from the savepoint
        JobGraph scaledJobGraph = createJobGraphWithOperatorState(parallelism2, maxParallelism, true);

        scaledJobGraph.setSavepointPath(savepointPath);

        jobID = scaledJobGraph.getJobID();

        cluster.submitJobAndWait(scaledJobGraph, false);

        // verify that the snapshotted and restored element counts add up to the same total
        int sumExp = 0;
        int sumAct = 0;

        for (int c : PartitionedStateSource.CHECK_CORRECT_SNAPSHOT) {
            sumExp += c;
        }

        for (int c : PartitionedStateSource.CHECK_CORRECT_RESTORE) {
            sumAct += c;
        }

        assertEquals(sumExp, sumAct);

        jobID = null;
    } finally {
        // clear any leftovers from a possibly failed job
        if (jobID != null && jobManager != null) {
            Future<Object> jobRemovedFuture =
                    jobManager.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), timeout);

            try {
                Await.ready(jobRemovedFuture, timeout);
            } catch (TimeoutException | InterruptedException ie) {
                fail("Failed while cleaning up the cluster.");
            }
        }
    }
}
/**
 * Tests that a job cannot be restarted from a savepoint with a different parallelism if the
 * rescaled operator has non-partitioned state.
 */
@Test
public void testSavepointRescalingNonPartitionedStateCausesException() throws Exception {
    final int parallelism = numSlots / 2;
    final int parallelism2 = numSlots;
    final int maxParallelism = 13;

    FiniteDuration timeout = new FiniteDuration(3, TimeUnit.MINUTES);
    Deadline deadline = timeout.fromNow();

    JobID jobID = null;
    ActorGateway jobManager = null;

    try {
        jobManager = cluster.getLeaderGateway(deadline.timeLeft());

        JobGraph jobGraph = createJobGraphWithOperatorState(parallelism, maxParallelism, false);

        jobID = jobGraph.getJobID();

        cluster.submitJobDetached(jobGraph);

        Object savepointResponse = null;

        // wait until the operator is started
        StateSourceBase.workStartedLatch.await();

        while (deadline.hasTimeLeft()) {
            Future<Object> savepointPathFuture =
                    jobManager.ask(new JobManagerMessages.TriggerSavepoint(jobID), deadline.timeLeft());
            FiniteDuration waitingTime = new FiniteDuration(10, TimeUnit.SECONDS);
            savepointResponse = Await.result(savepointPathFuture, waitingTime);

            if (savepointResponse instanceof JobManagerMessages.TriggerSavepointSuccess) {
                break;
            }
        }

        assertTrue(savepointResponse instanceof JobManagerMessages.TriggerSavepointSuccess);

        final String savepointPath =
                ((JobManagerMessages.TriggerSavepointSuccess) savepointResponse).savepointPath();

        Future<Object> jobRemovedFuture =
                jobManager.ask(
                        new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), deadline.timeLeft());

        Future<Object> cancellationResponseFuture =
                jobManager.ask(new JobManagerMessages.CancelJob(jobID), deadline.timeLeft());

        Object cancellationResponse = Await.result(cancellationResponseFuture, deadline.timeLeft());

        assertTrue(cancellationResponse instanceof JobManagerMessages.CancellationSuccess);

        Await.ready(jobRemovedFuture, deadline.timeLeft());

        // job successfully removed
        jobID = null;

        // resubmit the job with a different parallelism; this is expected to fail because the
        // job contains non-partitioned operator state
        JobGraph scaledJobGraph = createJobGraphWithOperatorState(parallelism2, maxParallelism, false);

        scaledJobGraph.setSavepointPath(savepointPath);

        jobID = scaledJobGraph.getJobID();

        cluster.submitJobAndWait(scaledJobGraph, false);

        jobID = null;
    } catch (JobExecutionException exception) {
        if (exception.getCause() instanceof SuppressRestartsException) {
            SuppressRestartsException suppressRestartsException =
                    (SuppressRestartsException) exception.getCause();

            if (suppressRestartsException.getCause() instanceof IllegalStateException) {
                // we expect an IllegalStateException wrapped in a SuppressRestartsException,
                // wrapped in a JobExecutionException, because the job containing
                // non-partitioned state is being rescaled
            } else {
                throw exception;
            }
        } else {
            throw exception;
        }
    } finally {
        // clear any leftovers from a possibly failed job
        if (jobID != null && jobManager != null) {
            Future<Object> jobRemovedFuture =
                    jobManager.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), timeout);

            try {
                Await.ready(jobRemovedFuture, timeout);
            } catch (TimeoutException | InterruptedException ie) {
                fail("Failed while cleaning up the cluster.");
            }
        }
    }
}