Exemplo n.º 1
0
  /**
   * Tests rescaling of partitioned operator state. More specifically, we test the mechanism with
   * {@link ListCheckpointed} as it subsumes {@link
   * org.apache.flink.streaming.api.checkpoint.CheckpointedFunction}.
   *
   * <p>The test submits a job, triggers a savepoint, cancels the job, resubmits a job graph with a
   * different parallelism from that savepoint and finally asserts that the element sums of {@code
   * PartitionedStateSource.CHECK_CORRECT_SNAPSHOT} and {@code
   * PartitionedStateSource.CHECK_CORRECT_RESTORE} are equal.
   *
   * @param scaleOut {@code true} to start with {@code numSlots / 2} parallel instances and restore
   *     with {@code numSlots} (scale out); {@code false} to start with {@code numSlots} and restore
   *     with {@code numSlots / 2} (scale in)
   * @throws Exception if any step of the submit / savepoint / cancel / restore cycle fails or
   *     exceeds the three minute deadline
   */
  public void testSavepointRescalingPartitionedOperatorState(boolean scaleOut) throws Exception {
    // scaling out grows the parallelism (half the slots -> all slots), scaling in shrinks it
    final int parallelism = scaleOut ? numSlots / 2 : numSlots;
    final int parallelism2 = scaleOut ? numSlots : numSlots / 2;
    final int maxParallelism = 13;

    FiniteDuration timeout = new FiniteDuration(3, TimeUnit.MINUTES);
    Deadline deadline = timeout.fromNow();

    // non-null only while a submitted job may still need to be cleaned up in the finally block
    JobID jobID = null;
    ActorGateway jobManager = null;

    // the counter arrays must have room for the larger of the two parallelisms
    int counterSize = Math.max(parallelism, parallelism2);

    PartitionedStateSource.CHECK_CORRECT_SNAPSHOT = new int[counterSize];
    PartitionedStateSource.CHECK_CORRECT_RESTORE = new int[counterSize];

    try {
      jobManager = cluster.getLeaderGateway(deadline.timeLeft());

      JobGraph jobGraph = createJobGraphWithOperatorState(parallelism, maxParallelism, true);

      jobID = jobGraph.getJobID();

      cluster.submitJobDetached(jobGraph);

      Object savepointResponse = null;

      // wait until the operator is started
      StateSourceBase.workStartedLatch.await();

      // keep triggering savepoints until one succeeds or the deadline expires
      while (deadline.hasTimeLeft()) {

        Future<Object> savepointPathFuture =
            jobManager.ask(new JobManagerMessages.TriggerSavepoint(jobID), deadline.timeLeft());
        FiniteDuration waitingTime = new FiniteDuration(10, TimeUnit.SECONDS);
        savepointResponse = Await.result(savepointPathFuture, waitingTime);

        if (savepointResponse instanceof JobManagerMessages.TriggerSavepointSuccess) {
          break;
        }
      }

      assertTrue(savepointResponse instanceof JobManagerMessages.TriggerSavepointSuccess);

      final String savepointPath =
          ((JobManagerMessages.TriggerSavepointSuccess) savepointResponse).savepointPath();

      // register for the removal notification before cancelling
      Future<Object> jobRemovedFuture =
          jobManager.ask(
              new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), deadline.timeLeft());

      Future<Object> cancellationResponseFuture =
          jobManager.ask(new JobManagerMessages.CancelJob(jobID), deadline.timeLeft());

      Object cancellationResponse = Await.result(cancellationResponseFuture, deadline.timeLeft());

      assertTrue(cancellationResponse instanceof JobManagerMessages.CancellationSuccess);

      Await.ready(jobRemovedFuture, deadline.timeLeft());

      // job successfully removed
      jobID = null;

      // second run: same job topology, different parallelism, restored from the savepoint
      JobGraph scaledJobGraph = createJobGraphWithOperatorState(parallelism2, maxParallelism, true);

      scaledJobGraph.setSavepointPath(savepointPath);

      jobID = scaledJobGraph.getJobID();

      cluster.submitJobAndWait(scaledJobGraph, false);

      int sumExp = 0;
      int sumAct = 0;

      for (int c : PartitionedStateSource.CHECK_CORRECT_SNAPSHOT) {
        sumExp += c;
      }

      for (int c : PartitionedStateSource.CHECK_CORRECT_RESTORE) {
        sumAct += c;
      }

      // only the totals are compared, not the per-index values
      assertEquals(sumExp, sumAct);
      jobID = null;

    } finally {
      // clear any left overs from a possibly failed job
      if (jobID != null && jobManager != null) {
        Future<Object> jobRemovedFuture =
            jobManager.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), timeout);

        try {
          Await.ready(jobRemovedFuture, timeout);
        } catch (TimeoutException | InterruptedException ie) {
          fail("Failed while cleaning up the cluster.");
        }
      }
    }
  }
Exemplo n.º 2
0
  /**
   * Tests that a job cannot be restarted from a savepoint with a different parallelism if the
   * rescaled operator has non-partitioned state.
   *
   * <p>The test submits a job with {@code numSlots / 2} parallel instances, triggers a savepoint,
   * cancels the job and resubmits it from the savepoint with {@code numSlots} parallel instances.
   * The resubmission must fail with a {@code JobExecutionException} caused by a {@code
   * SuppressRestartsException} that is in turn caused by an {@link IllegalStateException}; the
   * test fails if the rescaled job finishes without that exception.
   *
   * @throws Exception if any step before the rescaled submission fails, if the rescaled job fails
   *     with an unexpected exception chain, or if the three minute deadline is exceeded
   */
  @Test
  public void testSavepointRescalingNonPartitionedStateCausesException() throws Exception {
    final int parallelism = numSlots / 2;
    final int parallelism2 = numSlots;
    final int maxParallelism = 13;

    FiniteDuration timeout = new FiniteDuration(3, TimeUnit.MINUTES);
    Deadline deadline = timeout.fromNow();

    // non-null only while a submitted job may still need to be cleaned up in the finally block
    JobID jobID = null;
    ActorGateway jobManager = null;

    try {
      jobManager = cluster.getLeaderGateway(deadline.timeLeft());

      JobGraph jobGraph = createJobGraphWithOperatorState(parallelism, maxParallelism, false);

      jobID = jobGraph.getJobID();

      cluster.submitJobDetached(jobGraph);

      Object savepointResponse = null;

      // wait until the operator is started
      StateSourceBase.workStartedLatch.await();

      // keep triggering savepoints until one succeeds or the deadline expires
      while (deadline.hasTimeLeft()) {

        Future<Object> savepointPathFuture =
            jobManager.ask(new JobManagerMessages.TriggerSavepoint(jobID), deadline.timeLeft());
        FiniteDuration waitingTime = new FiniteDuration(10, TimeUnit.SECONDS);
        savepointResponse = Await.result(savepointPathFuture, waitingTime);

        if (savepointResponse instanceof JobManagerMessages.TriggerSavepointSuccess) {
          break;
        }
      }

      assertTrue(savepointResponse instanceof JobManagerMessages.TriggerSavepointSuccess);

      final String savepointPath =
          ((JobManagerMessages.TriggerSavepointSuccess) savepointResponse).savepointPath();

      // register for the removal notification before cancelling
      Future<Object> jobRemovedFuture =
          jobManager.ask(
              new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), deadline.timeLeft());

      Future<Object> cancellationResponseFuture =
          jobManager.ask(new JobManagerMessages.CancelJob(jobID), deadline.timeLeft());

      Object cancellationResponse = Await.result(cancellationResponseFuture, deadline.timeLeft());

      assertTrue(cancellationResponse instanceof JobManagerMessages.CancellationSuccess);

      Await.ready(jobRemovedFuture, deadline.timeLeft());

      // job successfully removed
      jobID = null;

      JobGraph scaledJobGraph =
          createJobGraphWithOperatorState(parallelism2, maxParallelism, false);

      scaledJobGraph.setSavepointPath(savepointPath);

      jobID = scaledJobGraph.getJobID();

      cluster.submitJobAndWait(scaledJobGraph, false);

      // the call returned without throwing, so no clean-up wait is needed in the finally block
      jobID = null;

      // reaching this point means the rescaling was accepted, which is exactly what this test
      // is supposed to rule out; without this the test would pass vacuously
      fail(
          "Expected the job with non-partitioned state to fail when restored with a different "
              + "parallelism, but it finished without an exception.");

    } catch (JobExecutionException exception) {
      // we expect an IllegalStateException wrapped in a SuppressRestartsException wrapped
      // in a JobExecutionException, because the job containing non-partitioned state
      // is being rescaled; any other failure is propagated to the test runner
      Throwable cause = exception.getCause();
      boolean expectedFailure =
          cause instanceof SuppressRestartsException
              && cause.getCause() instanceof IllegalStateException;

      if (!expectedFailure) {
        throw exception;
      }
    } finally {
      // clear any left overs from a possibly failed job
      if (jobID != null && jobManager != null) {
        Future<Object> jobRemovedFuture =
            jobManager.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), timeout);

        try {
          Await.ready(jobRemovedFuture, timeout);
        } catch (TimeoutException | InterruptedException ie) {
          fail("Failed while cleaning up the cluster.");
        }
      }
    }
  }