/**
 * Tests that a job with non-partitioned state can be restarted from a savepoint with a different
 * parallelism if the operators with non-partitioned state are not rescaled.
 *
 * @throws Exception if the test fails
 */
@Test
public void testSavepointRescalingWithKeyedAndNonPartitionedState() throws Exception {
    int numberKeys = 42;
    int numberElements = 1000;
    int numberElements2 = 500;
    int parallelism = numSlots / 2;
    int parallelism2 = numSlots;
    int maxParallelism = 13;

    FiniteDuration timeout = new FiniteDuration(3, TimeUnit.MINUTES);
    Deadline deadline = timeout.fromNow();

    ActorGateway jobManager = null;
    JobID jobID = null;

    try {
        jobManager = cluster.getLeaderGateway(deadline.timeLeft());

        JobGraph jobGraph =
                createJobGraphWithKeyedAndNonPartitionedOperatorState(
                        parallelism,
                        maxParallelism,
                        parallelism,
                        numberKeys,
                        numberElements,
                        false,
                        100);

        jobID = jobGraph.getJobID();

        cluster.submitJobDetached(jobGraph);

        // wait until the sources have emitted numberElements for each key and completed a
        // checkpoint
        SubtaskIndexFlatMapper.workCompletedLatch.await(
                deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

        // verify the current state
        Set<Tuple2<Integer, Integer>> actualResult = CollectionSink.getElementsSet();

        Set<Tuple2<Integer, Integer>> expectedResult = new HashSet<>();

        for (int key = 0; key < numberKeys; key++) {
            int keyGroupIndex = KeyGroupRangeAssignment.assignToKeyGroup(key, maxParallelism);

            expectedResult.add(
                    Tuple2.of(
                            KeyGroupRangeAssignment.computeOperatorIndexForKeyGroup(
                                    maxParallelism, parallelism, keyGroupIndex),
                            numberElements * key));
        }

        assertEquals(expectedResult, actualResult);

        // clear the CollectionSink set for the restarted job
        CollectionSink.clearElementsSet();

        Future<Object> savepointPathFuture =
                jobManager.ask(new JobManagerMessages.TriggerSavepoint(jobID), deadline.timeLeft());

        final String savepointPath =
                ((JobManagerMessages.TriggerSavepointSuccess)
                                Await.result(savepointPathFuture, deadline.timeLeft()))
                        .savepointPath();

        Future<Object> jobRemovedFuture =
                jobManager.ask(
                        new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID),
                        deadline.timeLeft());

        Future<Object> cancellationResponseFuture =
                jobManager.ask(new JobManagerMessages.CancelJob(jobID), deadline.timeLeft());

        Object cancellationResponse = Await.result(cancellationResponseFuture, deadline.timeLeft());

        assertTrue(cancellationResponse instanceof JobManagerMessages.CancellationSuccess);

        Await.ready(jobRemovedFuture, deadline.timeLeft());

        jobID = null;

        JobGraph scaledJobGraph =
                createJobGraphWithKeyedAndNonPartitionedOperatorState(
                        parallelism2,
                        maxParallelism,
                        parallelism,
                        numberKeys,
                        numberElements + numberElements2,
                        true,
                        100);

        scaledJobGraph.setSavepointPath(savepointPath);

        jobID = scaledJobGraph.getJobID();

        cluster.submitJobAndWait(scaledJobGraph, false);

        jobID = null;

        Set<Tuple2<Integer, Integer>> actualResult2 = CollectionSink.getElementsSet();

        Set<Tuple2<Integer, Integer>> expectedResult2 = new HashSet<>();

        for (int key = 0; key < numberKeys; key++) {
            int keyGroupIndex = KeyGroupRangeAssignment.assignToKeyGroup(key, maxParallelism);
            expectedResult2.add(
                    Tuple2.of(
                            KeyGroupRangeAssignment.computeOperatorIndexForKeyGroup(
                                    maxParallelism, parallelism2, keyGroupIndex),
                            key * (numberElements + numberElements2)));
        }

        assertEquals(expectedResult2, actualResult2);

    } finally {
        // clear the CollectionSink set for the restarted job
        CollectionSink.clearElementsSet();

        // clear any leftovers from a possibly failed job
        if (jobID != null && jobManager != null) {
            Future<Object> jobRemovedFuture =
                    jobManager.ask(
                            new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), timeout);

            try {
                Await.ready(jobRemovedFuture, timeout);
            } catch (TimeoutException | InterruptedException ie) {
                fail("Failed while cleaning up the cluster.");
            }
        }
    }
}
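/*
 * Illustrative sketch (not part of the original test): the expected sink results above follow
 * from Flink's key-group mechanics. A key is first assigned to one of maxParallelism key
 * groups, and each key group is then mapped to a subtask index for the current parallelism.
 * The helper name below is hypothetical; the two KeyGroupRangeAssignment calls are the same
 * ones used in the assertions above.
 */
private static int expectedSubtaskIndexForKey(int key, int maxParallelism, int parallelism) {
    // hash the key into one of maxParallelism key groups
    int keyGroupIndex = KeyGroupRangeAssignment.assignToKeyGroup(key, maxParallelism);

    // map the key group onto one of 'parallelism' operator subtasks; since maxParallelism
    // stays fixed across the rescaling, a key's key group (and thus its keyed state) can be
    // relocated to a different subtask without being lost
    return KeyGroupRangeAssignment.computeOperatorIndexForKeyGroup(
            maxParallelism, parallelism, keyGroupIndex);
}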
/**
 * Tests rescaling of partitioned operator state. More specifically, we test the mechanism with
 * {@link ListCheckpointed} as it subsumes {@link
 * org.apache.flink.streaming.api.checkpoint.CheckpointedFunction}.
 */
public void testSavepointRescalingPartitionedOperatorState(boolean scaleOut) throws Exception {
    final int parallelism = scaleOut ? numSlots : numSlots / 2;
    final int parallelism2 = scaleOut ? numSlots / 2 : numSlots;
    final int maxParallelism = 13;

    FiniteDuration timeout = new FiniteDuration(3, TimeUnit.MINUTES);
    Deadline deadline = timeout.fromNow();

    JobID jobID = null;
    ActorGateway jobManager = null;

    int counterSize = Math.max(parallelism, parallelism2);

    PartitionedStateSource.CHECK_CORRECT_SNAPSHOT = new int[counterSize];
    PartitionedStateSource.CHECK_CORRECT_RESTORE = new int[counterSize];

    try {
        jobManager = cluster.getLeaderGateway(deadline.timeLeft());

        JobGraph jobGraph = createJobGraphWithOperatorState(parallelism, maxParallelism, true);

        jobID = jobGraph.getJobID();

        cluster.submitJobDetached(jobGraph);

        Object savepointResponse = null;

        // wait until the operator is started
        StateSourceBase.workStartedLatch.await();

        while (deadline.hasTimeLeft()) {
            Future<Object> savepointPathFuture =
                    jobManager.ask(
                            new JobManagerMessages.TriggerSavepoint(jobID), deadline.timeLeft());
            FiniteDuration waitingTime = new FiniteDuration(10, TimeUnit.SECONDS);
            savepointResponse = Await.result(savepointPathFuture, waitingTime);

            if (savepointResponse instanceof JobManagerMessages.TriggerSavepointSuccess) {
                break;
            }
        }

        assertTrue(savepointResponse instanceof JobManagerMessages.TriggerSavepointSuccess);

        final String savepointPath =
                ((JobManagerMessages.TriggerSavepointSuccess) savepointResponse).savepointPath();

        Future<Object> jobRemovedFuture =
                jobManager.ask(
                        new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID),
                        deadline.timeLeft());

        Future<Object> cancellationResponseFuture =
                jobManager.ask(new JobManagerMessages.CancelJob(jobID), deadline.timeLeft());

        Object cancellationResponse = Await.result(cancellationResponseFuture, deadline.timeLeft());

        assertTrue(cancellationResponse instanceof JobManagerMessages.CancellationSuccess);

        Await.ready(jobRemovedFuture, deadline.timeLeft());

        // job successfully removed
        jobID = null;

        JobGraph scaledJobGraph =
                createJobGraphWithOperatorState(parallelism2, maxParallelism, true);

        scaledJobGraph.setSavepointPath(savepointPath);

        jobID = scaledJobGraph.getJobID();

        cluster.submitJobAndWait(scaledJobGraph, false);

        int sumExp = 0;
        int sumAct = 0;

        for (int c : PartitionedStateSource.CHECK_CORRECT_SNAPSHOT) {
            sumExp += c;
        }

        for (int c : PartitionedStateSource.CHECK_CORRECT_RESTORE) {
            sumAct += c;
        }

        assertEquals(sumExp, sumAct);

        jobID = null;
    } finally {
        // clear any leftovers from a possibly failed job
        if (jobID != null && jobManager != null) {
            Future<Object> jobRemovedFuture =
                    jobManager.ask(
                            new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), timeout);

            try {
                Await.ready(jobRemovedFuture, timeout);
            } catch (TimeoutException | InterruptedException ie) {
                fail("Failed while cleaning up the cluster.");
            }
        }
    }
}
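/*
 * Illustrative sketch (not part of the original test): a minimal source of the kind
 * PartitionedStateSource models, using the ListCheckpointed interface that this test
 * exercises. State snapshotted as a list of entries can be redistributed across a different
 * number of subtasks on restore, which is what the CHECK_CORRECT_SNAPSHOT /
 * CHECK_CORRECT_RESTORE sums verify. The class name is hypothetical and imports are assumed
 * to match the surrounding file.
 */
private static class RescalableCounterSource extends RichParallelSourceFunction<Integer>
        implements ListCheckpointed<Integer> {

    private volatile boolean running = true;
    private int counter;

    @Override
    public void run(SourceContext<Integer> ctx) throws Exception {
        while (running) {
            synchronized (ctx.getCheckpointLock()) {
                ctx.collect(counter++);
            }
        }
    }

    @Override
    public void cancel() {
        running = false;
    }

    @Override
    public List<Integer> snapshotState(long checkpointId, long timestamp) throws Exception {
        // each subtask contributes its partial state as list entries; splitting the state
        // into several entries would allow even finer-grained redistribution on rescaling
        return Collections.singletonList(counter);
    }

    @Override
    public void restoreState(List<Integer> state) throws Exception {
        // after rescaling, a subtask may receive zero, one, or several of the entries
        // snapshotted across all previous subtasks
        for (Integer partialCount : state) {
            counter += partialCount;
        }
    }
}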
/**
 * Tests that a job cannot be restarted from a savepoint with a different parallelism if the
 * rescaled operator has non-partitioned state.
 *
 * @throws Exception if the test fails
 */
@Test
public void testSavepointRescalingNonPartitionedStateCausesException() throws Exception {
    final int parallelism = numSlots / 2;
    final int parallelism2 = numSlots;
    final int maxParallelism = 13;

    FiniteDuration timeout = new FiniteDuration(3, TimeUnit.MINUTES);
    Deadline deadline = timeout.fromNow();

    JobID jobID = null;
    ActorGateway jobManager = null;

    try {
        jobManager = cluster.getLeaderGateway(deadline.timeLeft());

        JobGraph jobGraph = createJobGraphWithOperatorState(parallelism, maxParallelism, false);

        jobID = jobGraph.getJobID();

        cluster.submitJobDetached(jobGraph);

        Object savepointResponse = null;

        // wait until the operator is started
        StateSourceBase.workStartedLatch.await();

        while (deadline.hasTimeLeft()) {
            Future<Object> savepointPathFuture =
                    jobManager.ask(
                            new JobManagerMessages.TriggerSavepoint(jobID), deadline.timeLeft());
            FiniteDuration waitingTime = new FiniteDuration(10, TimeUnit.SECONDS);
            savepointResponse = Await.result(savepointPathFuture, waitingTime);

            if (savepointResponse instanceof JobManagerMessages.TriggerSavepointSuccess) {
                break;
            }
        }

        assertTrue(savepointResponse instanceof JobManagerMessages.TriggerSavepointSuccess);

        final String savepointPath =
                ((JobManagerMessages.TriggerSavepointSuccess) savepointResponse).savepointPath();

        Future<Object> jobRemovedFuture =
                jobManager.ask(
                        new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID),
                        deadline.timeLeft());

        Future<Object> cancellationResponseFuture =
                jobManager.ask(new JobManagerMessages.CancelJob(jobID), deadline.timeLeft());

        Object cancellationResponse = Await.result(cancellationResponseFuture, deadline.timeLeft());

        assertTrue(cancellationResponse instanceof JobManagerMessages.CancellationSuccess);

        Await.ready(jobRemovedFuture, deadline.timeLeft());

        // job successfully removed
        jobID = null;

        JobGraph scaledJobGraph =
                createJobGraphWithOperatorState(parallelism2, maxParallelism, false);

        scaledJobGraph.setSavepointPath(savepointPath);

        jobID = scaledJobGraph.getJobID();

        cluster.submitJobAndWait(scaledJobGraph, false);

        jobID = null;
    } catch (JobExecutionException exception) {
        if (exception.getCause() instanceof SuppressRestartsException) {
            SuppressRestartsException suppressRestartsException =
                    (SuppressRestartsException) exception.getCause();

            if (suppressRestartsException.getCause() instanceof IllegalStateException) {
                // we expect an IllegalStateException wrapped in a SuppressRestartsException
                // wrapped in a JobExecutionException, because the job containing
                // non-partitioned state is being rescaled
            } else {
                throw exception;
            }
        } else {
            throw exception;
        }
    } finally {
        // clear any leftovers from a possibly failed job
        if (jobID != null && jobManager != null) {
            Future<Object> jobRemovedFuture =
                    jobManager.ask(
                            new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), timeout);

            try {
                Await.ready(jobRemovedFuture, timeout);
            } catch (TimeoutException | InterruptedException ie) {
                fail("Failed while cleaning up the cluster.");
            }
        }
    }
}
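/*
 * Illustrative sketch (not part of the original test): the catch block above only swallows the
 * exact failure nesting produced when non-partitioned state is rescaled. A helper like the
 * following (name hypothetical) makes the expected exception chain explicit.
 */
private static boolean isExpectedRescalingFailure(JobExecutionException exception) {
    Throwable cause = exception.getCause();

    // restarts are suppressed so the rescaling failure surfaces immediately instead of
    // putting the job into a restart loop
    if (!(cause instanceof SuppressRestartsException)) {
        return false;
    }

    // the root failure for rescaled non-partitioned state is an IllegalStateException
    return cause.getCause() instanceof IllegalStateException;
}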