/**
   * Tests that a job is properly canceled in the event of a leader change. However, this time only
   * the TMs are notified about the changing leader. This should be enough to cancel the currently
   * running job, though.
   */
  @Test
  public void testStateCleanupAfterListenerNotification() throws Exception {
    UUID leaderSessionID = UUID.randomUUID();
    UUID newLeaderSessionID = UUID.randomUUID();

    cluster.grantLeadership(0, leaderSessionID);
    cluster.notifyRetrievalListeners(0, leaderSessionID);

    cluster.waitForTaskManagersToBeRegistered();

    // submit blocking job
    cluster.submitJobDetached(job);

    ActorGateway jm = cluster.getLeaderGateway(timeout);

    Future<Object> wait =
        jm.ask(new WaitForAllVerticesToBeRunningOrFinished(job.getJobID()), timeout);

    Await.ready(wait, timeout);

    Future<Object> jobRemoval = jm.ask(new NotifyWhenJobRemoved(job.getJobID()), timeout);

    // notify listeners (TMs) about the leader change
    cluster.notifyRetrievalListeners(1, newLeaderSessionID);

    Await.ready(jobRemoval, timeout);
  }
  private static String[] runTests(
      int numHosts, int slotsPerHost, int parallelism, TestLocatableInputSplit[] splits)
      throws Exception {
    AbstractJobVertex vertex = new AbstractJobVertex("test vertex");
    vertex.setParallelism(parallelism);
    vertex.setInvokableClass(DummyInvokable.class);
    vertex.setInputSplitSource(new TestInputSplitSource(splits));

    JobGraph jobGraph = new JobGraph("test job", vertex);

    ExecutionGraph eg =
        new ExecutionGraph(
            jobGraph.getJobID(), jobGraph.getName(), jobGraph.getJobConfiguration(), TIMEOUT);
    eg.setQueuedSchedulingAllowed(false);

    eg.attachJobGraph(jobGraph.getVerticesSortedTopologicallyFromSources());

    Scheduler scheduler = getScheduler(numHosts, slotsPerHost);
    eg.scheduleForExecution(scheduler);

    ExecutionVertex[] tasks = eg.getVerticesTopologically().iterator().next().getTaskVertices();
    assertEquals(parallelism, tasks.length);

    String[] hostsForTasks = new String[parallelism];
    for (int i = 0; i < parallelism; i++) {
      hostsForTasks[i] = tasks[i].getCurrentAssignedResourceLocation().getHostname();
    }

    return hostsForTasks;
  }
  /**
   * Tests that a job is properly canceled in the case of a leader change. However, this time only
   * the JMs are notified about the leader change and the TMs still believe the old leader to have
   * leadership.
   */
  @Test
  public void testStateCleanupAfterNewLeaderElection() throws Exception {
    UUID leaderSessionID = UUID.randomUUID();
    UUID newLeaderSessionID = UUID.randomUUID();

    cluster.grantLeadership(0, leaderSessionID);
    cluster.notifyRetrievalListeners(0, leaderSessionID);

    cluster.waitForTaskManagersToBeRegistered();

    // submit blocking job so that we can test job clean up
    cluster.submitJobDetached(job);

    ActorGateway jm = cluster.getLeaderGateway(timeout);

    Future<Object> wait =
        jm.ask(new WaitForAllVerticesToBeRunningOrFinished(job.getJobID()), timeout);

    Await.ready(wait, timeout);

    Future<Object> jobRemoval = jm.ask(new NotifyWhenJobRemoved(job.getJobID()), timeout);

    // only notify the JMs about the new leader JM(1)
    cluster.grantLeadership(1, newLeaderSessionID);

    // job should be removed anyway
    Await.ready(jobRemoval, timeout);
  }
  private JobGraph createTestJobGraph(
      String jobName, int senderParallelism, int receiverParallelism) {

    // The sender and receiver invokable logic ensure that each subtask gets the expected data
    final JobVertex sender = new JobVertex("Sender");
    sender.setInvokableClass(RoundRobinSubtaskIndexSender.class);
    sender
        .getConfiguration()
        .setInteger(RoundRobinSubtaskIndexSender.CONFIG_KEY, receiverParallelism);
    sender.setParallelism(senderParallelism);

    final JobVertex receiver = new JobVertex("Receiver");
    receiver.setInvokableClass(SubtaskIndexReceiver.class);
    receiver.getConfiguration().setInteger(SubtaskIndexReceiver.CONFIG_KEY, senderParallelism);
    receiver.setParallelism(receiverParallelism);

    receiver.connectNewDataSetAsInput(
        sender, DistributionPattern.ALL_TO_ALL, ResultPartitionType.BLOCKING);

    final JobGraph jobGraph = new JobGraph(jobName, sender, receiver);

    // We need to allow queued scheduling, because there are not enough slots available
    // to run all tasks at once. We queue tasks and then let them finish/consume the blocking
    // result one after the other.
    jobGraph.setAllowQueuedScheduling(true);

    return jobGraph;
  }
示例#5
0
  /**
   * Parameter {@code uploadedJarLocation} is actually used to point to the local jar, because Flink
   * does not support uploading a jar file before hand. Jar files are always uploaded directly when
   * a program is submitted.
   */
  public void submitTopologyWithOpts(
      final String name, final String uploadedJarLocation, final FlinkTopology topology)
      throws AlreadyAliveException, InvalidTopologyException {

    if (this.getTopologyJobId(name) != null) {
      throw new AlreadyAliveException();
    }

    final URI uploadedJarUri;
    final URL uploadedJarUrl;
    try {
      uploadedJarUri = new File(uploadedJarLocation).getAbsoluteFile().toURI();
      uploadedJarUrl = uploadedJarUri.toURL();
      JobWithJars.checkJarFile(uploadedJarUrl);
    } catch (final IOException e) {
      throw new RuntimeException("Problem with jar file " + uploadedJarLocation, e);
    }

    try {
      FlinkClient.addStormConfigToTopology(topology, conf);
    } catch (ClassNotFoundException e) {
      LOG.error("Could not register class for Kryo serialization.", e);
      throw new InvalidTopologyException("Could not register class for Kryo serialization.");
    }

    final StreamGraph streamGraph = topology.getExecutionEnvironment().getStreamGraph();
    streamGraph.setJobName(name);

    final JobGraph jobGraph = streamGraph.getJobGraph();
    jobGraph.addJar(new Path(uploadedJarUri));

    final Configuration configuration = jobGraph.getJobConfiguration();
    configuration.setString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, jobManagerHost);
    configuration.setInteger(ConfigConstants.JOB_MANAGER_IPC_PORT_KEY, jobManagerPort);

    final Client client;
    try {
      client = new Client(configuration);
    } catch (IOException e) {
      throw new RuntimeException("Could not establish a connection to the job manager", e);
    }

    try {
      ClassLoader classLoader =
          JobWithJars.buildUserCodeClassLoader(
              Lists.newArrayList(uploadedJarUrl),
              Collections.<URL>emptyList(),
              this.getClass().getClassLoader());
      client.runDetached(jobGraph, classLoader);
    } catch (final ProgramInvocationException e) {
      throw new RuntimeException("Cannot execute job due to ProgramInvocationException", e);
    }
  }
  /**
   * Tests that a job is properly canceled in the case of a leader change. In such an event all
   * TaskManagers have to disconnect from the previous leader and connect to the newly elected
   * leader.
   */
  @Test
  public void testStateCleanupAfterNewLeaderElectionAndListenerNotification() throws Exception {
    UUID leaderSessionID1 = UUID.randomUUID();
    UUID leaderSessionID2 = UUID.randomUUID();

    // first make JM(0) the leader
    cluster.grantLeadership(0, leaderSessionID1);
    // notify all listeners
    cluster.notifyRetrievalListeners(0, leaderSessionID1);

    cluster.waitForTaskManagersToBeRegistered();

    // submit blocking job so that it is not finished when we cancel it
    cluster.submitJobDetached(job);

    ActorGateway jm = cluster.getLeaderGateway(timeout);

    Future<Object> wait =
        jm.ask(new WaitForAllVerticesToBeRunningOrFinished(job.getJobID()), timeout);

    Await.ready(wait, timeout);

    Future<Object> jobRemoval = jm.ask(new NotifyWhenJobRemoved(job.getJobID()), timeout);

    // make the JM(1) the new leader
    cluster.grantLeadership(1, leaderSessionID2);
    // notify all listeners about the event
    cluster.notifyRetrievalListeners(1, leaderSessionID2);

    Await.ready(jobRemoval, timeout);

    cluster.waitForTaskManagersToBeRegistered();

    ActorGateway jm2 = cluster.getLeaderGateway(timeout);

    Future<Object> futureNumberSlots =
        jm2.ask(JobManagerMessages.getRequestTotalNumberOfSlots(), timeout);

    // check that all TMs have registered at the new leader
    int numberSlots = (Integer) Await.result(futureNumberSlots, timeout);

    assertEquals(parallelism, numberSlots);

    // try to resubmit now the non-blocking job, it should complete successfully
    Tasks.BlockingOnceReceiver$.MODULE$.blocking_$eq(false);
    cluster.submitJobAndWait(job, false, timeout);
  }
  /**
   * Tests that the same JobManager can be reelected as the leader. Even though, the same JM is
   * elected as the next leader, all currently running jobs should be canceled properly and all TMs
   * should disconnect from the leader and then reconnect to it.
   */
  @Test
  public void testReelectionOfSameJobManager() throws Exception {
    UUID leaderSessionID = UUID.randomUUID();
    UUID newLeaderSessionID = UUID.randomUUID();

    FiniteDuration shortTimeout = new FiniteDuration(20, TimeUnit.SECONDS);

    cluster.grantLeadership(0, leaderSessionID);
    cluster.notifyRetrievalListeners(0, leaderSessionID);

    cluster.waitForTaskManagersToBeRegistered();

    // submit blocking job
    cluster.submitJobDetached(job);

    ActorGateway jm = cluster.getLeaderGateway(timeout);

    Future<Object> wait =
        jm.ask(new WaitForAllVerticesToBeRunningOrFinished(job.getJobID()), timeout);

    Await.ready(wait, timeout);

    Future<Object> jobRemoval = jm.ask(new NotifyWhenJobRemoved(job.getJobID()), timeout);

    // make JM(0) again the leader --> this implies first a leadership revokal
    cluster.grantLeadership(0, newLeaderSessionID);

    Await.ready(jobRemoval, timeout);

    // The TMs should not be able to reconnect since they don't know the current leader
    // session ID
    try {
      cluster.waitForTaskManagersToBeRegistered(shortTimeout);
      fail("TaskManager should not be able to register at JobManager.");
    } catch (TimeoutException e) {
      // expected exception since the TMs have still the old leader session ID
    }

    // notify the TMs about the new (old) leader
    cluster.notifyRetrievalListeners(0, newLeaderSessionID);

    cluster.waitForTaskManagersToBeRegistered();

    // try to resubmit now the non-blocking job, it should complete successfully
    Tasks.BlockingOnceReceiver$.MODULE$.blocking_$eq(false);
    cluster.submitJobAndWait(job, false, timeout);
  }
  @Test
  public void testFailureWhenJarBlobsMissing() {
    try {
      // create a simple job graph
      JobVertex jobVertex = new JobVertex("Test Vertex");
      jobVertex.setInvokableClass(Tasks.NoOpInvokable.class);
      JobGraph jg = new JobGraph("test job", jobVertex);

      // request the blob port from the job manager
      Future<Object> future =
          jmGateway.ask(JobManagerMessages.getRequestBlobManagerPort(), timeout);
      int blobPort = (Integer) Await.result(future, timeout);

      // upload two dummy bytes and add their keys to the job graph as dependencies
      BlobKey key1, key2;
      BlobClient bc = new BlobClient(new InetSocketAddress("localhost", blobPort));
      try {
        key1 = bc.put(new byte[10]);
        key2 = bc.put(new byte[10]);

        // delete one of the blobs to make sure that the startup failed
        bc.delete(key2);
      } finally {
        bc.close();
      }

      jg.addBlob(key1);
      jg.addBlob(key2);

      // submit the job
      Future<Object> submitFuture =
          jmGateway.ask(
              new JobManagerMessages.SubmitJob(jg, ListeningBehaviour.EXECUTION_RESULT), timeout);
      try {
        Await.result(submitFuture, timeout);
      } catch (JobExecutionException e) {
        // that is what we expect
        assertTrue(e.getCause() instanceof IOException);
      } catch (Exception e) {
        fail("Wrong exception type");
      }
    } catch (Exception e) {
      e.printStackTrace();
      fail(e.getMessage());
    }
  }
示例#9
0
  public JobManagerJobMetricGroup addJob(JobGraph job) {
    JobID jobId = job.getJobID();
    String jobName = job.getName();
    // get or create a jobs metric group
    JobManagerJobMetricGroup currentJobGroup;
    synchronized (this) {
      if (!isClosed()) {
        currentJobGroup = jobs.get(jobId);

        if (currentJobGroup == null || currentJobGroup.isClosed()) {
          currentJobGroup = new JobManagerJobMetricGroup(registry, this, jobId, jobName);
          jobs.put(jobId, currentJobGroup);
        }
        return currentJobGroup;
      } else {
        return null;
      }
    }
  }
  @Test
  public void testMultipleInstancesPerHost() {

    TestLocatableInputSplit[] splits =
        new TestLocatableInputSplit[] {
          new TestLocatableInputSplit(1, "host1"),
          new TestLocatableInputSplit(2, "host1"),
          new TestLocatableInputSplit(3, "host2"),
          new TestLocatableInputSplit(4, "host2"),
          new TestLocatableInputSplit(5, "host3"),
          new TestLocatableInputSplit(6, "host3")
        };

    try {
      AbstractJobVertex vertex = new AbstractJobVertex("test vertex");
      vertex.setParallelism(6);
      vertex.setInvokableClass(DummyInvokable.class);
      vertex.setInputSplitSource(new TestInputSplitSource(splits));

      JobGraph jobGraph = new JobGraph("test job", vertex);

      ExecutionGraph eg =
          new ExecutionGraph(
              jobGraph.getJobID(), jobGraph.getName(), jobGraph.getJobConfiguration(), TIMEOUT);

      eg.attachJobGraph(jobGraph.getVerticesSortedTopologicallyFromSources());
      eg.setQueuedSchedulingAllowed(false);

      // create a scheduler with 6 instances where always two are on the same host
      Scheduler scheduler = new Scheduler();
      Instance i1 = getInstance(new byte[] {10, 0, 1, 1}, 12345, "host1", 1);
      Instance i2 = getInstance(new byte[] {10, 0, 1, 1}, 12346, "host1", 1);
      Instance i3 = getInstance(new byte[] {10, 0, 1, 2}, 12345, "host2", 1);
      Instance i4 = getInstance(new byte[] {10, 0, 1, 2}, 12346, "host2", 1);
      Instance i5 = getInstance(new byte[] {10, 0, 1, 3}, 12345, "host3", 1);
      Instance i6 = getInstance(new byte[] {10, 0, 1, 3}, 12346, "host4", 1);
      scheduler.newInstanceAvailable(i1);
      scheduler.newInstanceAvailable(i2);
      scheduler.newInstanceAvailable(i3);
      scheduler.newInstanceAvailable(i4);
      scheduler.newInstanceAvailable(i5);
      scheduler.newInstanceAvailable(i6);

      eg.scheduleForExecution(scheduler);

      ExecutionVertex[] tasks = eg.getVerticesTopologically().iterator().next().getTaskVertices();
      assertEquals(6, tasks.length);

      Instance taskInstance1 = tasks[0].getCurrentAssignedResource().getInstance();
      Instance taskInstance2 = tasks[1].getCurrentAssignedResource().getInstance();
      Instance taskInstance3 = tasks[2].getCurrentAssignedResource().getInstance();
      Instance taskInstance4 = tasks[3].getCurrentAssignedResource().getInstance();
      Instance taskInstance5 = tasks[4].getCurrentAssignedResource().getInstance();
      Instance taskInstance6 = tasks[5].getCurrentAssignedResource().getInstance();

      assertTrue(taskInstance1 == i1 || taskInstance1 == i2);
      assertTrue(taskInstance2 == i1 || taskInstance2 == i2);
      assertTrue(taskInstance3 == i3 || taskInstance3 == i4);
      assertTrue(taskInstance4 == i3 || taskInstance4 == i4);
      assertTrue(taskInstance5 == i5 || taskInstance5 == i6);
      assertTrue(taskInstance6 == i5 || taskInstance6 == i6);
    } catch (Exception e) {
      e.printStackTrace();
      fail(e.getMessage());
    }
  }
  protected void internalNonBlockingStart() throws IOException {

    Plan plan = env.createProgramPlan();

    Optimizer optimizer =
        new Optimizer(new DataStatistics(), new org.apache.flink.configuration.Configuration());
    OptimizedPlan optimizedPlan = optimizer.compile(plan);

    final JobGraph jobGraph = new JobGraphGenerator().compileJobGraph(optimizedPlan);
    for (String jarPath : classPath) {
      jobGraph.addJar(new Path(jarPath));
    }

    jobID = jobGraph.getJobID();
    accumulatorCache.setJobID(jobID);

    if (isLocalExecution()) {

      flowStep.logInfo("Executing in local mode.");

      startLocalCluster();

      org.apache.flink.configuration.Configuration config =
          new org.apache.flink.configuration.Configuration();
      config.setString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, localCluster.hostname());

      client = new Client(config);
      client.setPrintStatusDuringExecution(env.getConfig().isSysoutLoggingEnabled());

    } else {

      flowStep.logInfo("Executing in cluster mode.");

      try {
        String path =
            this.getClass().getProtectionDomain().getCodeSource().getLocation().toURI().getPath();
        jobGraph.addJar(new Path(path));
        classPath.add(path);
      } catch (URISyntaxException e) {
        throw new IOException("Could not add the submission JAR as a dependency.");
      }

      client = ((ContextEnvironment) env).getClient();
    }

    List<URL> fileList = new ArrayList<URL>(classPath.size());
    for (String path : classPath) {
      URL url;
      try {
        url = new URL(path);
      } catch (MalformedURLException e) {
        url = new URL("file://" + path);
      }
      fileList.add(url);
    }

    final ClassLoader loader =
        JobWithJars.buildUserCodeClassLoader(
            fileList, Collections.<URL>emptyList(), getClass().getClassLoader());

    accumulatorCache.setClient(client);

    final Callable<JobSubmissionResult> callable =
        new Callable<JobSubmissionResult>() {
          @Override
          public JobSubmissionResult call() throws Exception {
            return client.runBlocking(jobGraph, loader);
          }
        };

    jobSubmission = executorService.submit(callable);

    flowStep.logInfo("submitted Flink job: " + jobID);
  }
示例#12
0
  /**
   * Tests rescaling of partitioned operator state. More specific, we test the mechanism with {@link
   * ListCheckpointed} as it subsumes {@link
   * org.apache.flink.streaming.api.checkpoint.CheckpointedFunction}.
   */
  public void testSavepointRescalingPartitionedOperatorState(boolean scaleOut) throws Exception {
    final int parallelism = scaleOut ? numSlots : numSlots / 2;
    final int parallelism2 = scaleOut ? numSlots / 2 : numSlots;
    final int maxParallelism = 13;

    FiniteDuration timeout = new FiniteDuration(3, TimeUnit.MINUTES);
    Deadline deadline = timeout.fromNow();

    JobID jobID = null;
    ActorGateway jobManager = null;

    int counterSize = Math.max(parallelism, parallelism2);

    PartitionedStateSource.CHECK_CORRECT_SNAPSHOT = new int[counterSize];
    PartitionedStateSource.CHECK_CORRECT_RESTORE = new int[counterSize];

    try {
      jobManager = cluster.getLeaderGateway(deadline.timeLeft());

      JobGraph jobGraph = createJobGraphWithOperatorState(parallelism, maxParallelism, true);

      jobID = jobGraph.getJobID();

      cluster.submitJobDetached(jobGraph);

      Object savepointResponse = null;

      // wait until the operator is started
      StateSourceBase.workStartedLatch.await();

      while (deadline.hasTimeLeft()) {

        Future<Object> savepointPathFuture =
            jobManager.ask(new JobManagerMessages.TriggerSavepoint(jobID), deadline.timeLeft());
        FiniteDuration waitingTime = new FiniteDuration(10, TimeUnit.SECONDS);
        savepointResponse = Await.result(savepointPathFuture, waitingTime);

        if (savepointResponse instanceof JobManagerMessages.TriggerSavepointSuccess) {
          break;
        }
      }

      assertTrue(savepointResponse instanceof JobManagerMessages.TriggerSavepointSuccess);

      final String savepointPath =
          ((JobManagerMessages.TriggerSavepointSuccess) savepointResponse).savepointPath();

      Future<Object> jobRemovedFuture =
          jobManager.ask(
              new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), deadline.timeLeft());

      Future<Object> cancellationResponseFuture =
          jobManager.ask(new JobManagerMessages.CancelJob(jobID), deadline.timeLeft());

      Object cancellationResponse = Await.result(cancellationResponseFuture, deadline.timeLeft());

      assertTrue(cancellationResponse instanceof JobManagerMessages.CancellationSuccess);

      Await.ready(jobRemovedFuture, deadline.timeLeft());

      // job successfully removed
      jobID = null;

      JobGraph scaledJobGraph = createJobGraphWithOperatorState(parallelism2, maxParallelism, true);

      scaledJobGraph.setSavepointPath(savepointPath);

      jobID = scaledJobGraph.getJobID();

      cluster.submitJobAndWait(scaledJobGraph, false);

      int sumExp = 0;
      int sumAct = 0;

      for (int c : PartitionedStateSource.CHECK_CORRECT_SNAPSHOT) {
        sumExp += c;
      }

      for (int c : PartitionedStateSource.CHECK_CORRECT_RESTORE) {
        sumAct += c;
      }

      assertEquals(sumExp, sumAct);
      jobID = null;

    } finally {
      // clear any left overs from a possibly failed job
      if (jobID != null && jobManager != null) {
        Future<Object> jobRemovedFuture =
            jobManager.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), timeout);

        try {
          Await.ready(jobRemovedFuture, timeout);
        } catch (TimeoutException | InterruptedException ie) {
          fail("Failed while cleaning up the cluster.");
        }
      }
    }
  }
示例#13
0
  /**
   * Tests that a job with non partitioned state can be restarted from a savepoint with a different
   * parallelism if the operator with non-partitioned state are not rescaled.
   *
   * @throws Exception
   */
  @Test
  public void testSavepointRescalingWithKeyedAndNonPartitionedState() throws Exception {
    int numberKeys = 42;
    int numberElements = 1000;
    int numberElements2 = 500;
    int parallelism = numSlots / 2;
    int parallelism2 = numSlots;
    int maxParallelism = 13;

    FiniteDuration timeout = new FiniteDuration(3, TimeUnit.MINUTES);
    Deadline deadline = timeout.fromNow();

    ActorGateway jobManager = null;
    JobID jobID = null;

    try {
      jobManager = cluster.getLeaderGateway(deadline.timeLeft());

      JobGraph jobGraph =
          createJobGraphWithKeyedAndNonPartitionedOperatorState(
              parallelism, maxParallelism, parallelism, numberKeys, numberElements, false, 100);

      jobID = jobGraph.getJobID();

      cluster.submitJobDetached(jobGraph);

      // wait til the sources have emitted numberElements for each key and completed a checkpoint
      SubtaskIndexFlatMapper.workCompletedLatch.await(
          deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

      // verify the current state

      Set<Tuple2<Integer, Integer>> actualResult = CollectionSink.getElementsSet();

      Set<Tuple2<Integer, Integer>> expectedResult = new HashSet<>();

      for (int key = 0; key < numberKeys; key++) {
        int keyGroupIndex = KeyGroupRangeAssignment.assignToKeyGroup(key, maxParallelism);

        expectedResult.add(
            Tuple2.of(
                KeyGroupRangeAssignment.computeOperatorIndexForKeyGroup(
                    maxParallelism, parallelism, keyGroupIndex),
                numberElements * key));
      }

      assertEquals(expectedResult, actualResult);

      // clear the CollectionSink set for the restarted job
      CollectionSink.clearElementsSet();

      Future<Object> savepointPathFuture =
          jobManager.ask(new JobManagerMessages.TriggerSavepoint(jobID), deadline.timeLeft());

      final String savepointPath =
          ((JobManagerMessages.TriggerSavepointSuccess)
                  Await.result(savepointPathFuture, deadline.timeLeft()))
              .savepointPath();

      Future<Object> jobRemovedFuture =
          jobManager.ask(
              new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), deadline.timeLeft());

      Future<Object> cancellationResponseFuture =
          jobManager.ask(new JobManagerMessages.CancelJob(jobID), deadline.timeLeft());

      Object cancellationResponse = Await.result(cancellationResponseFuture, deadline.timeLeft());

      assertTrue(cancellationResponse instanceof JobManagerMessages.CancellationSuccess);

      Await.ready(jobRemovedFuture, deadline.timeLeft());

      jobID = null;

      JobGraph scaledJobGraph =
          createJobGraphWithKeyedAndNonPartitionedOperatorState(
              parallelism2,
              maxParallelism,
              parallelism,
              numberKeys,
              numberElements + numberElements2,
              true,
              100);

      scaledJobGraph.setSavepointPath(savepointPath);

      jobID = scaledJobGraph.getJobID();

      cluster.submitJobAndWait(scaledJobGraph, false);

      jobID = null;

      Set<Tuple2<Integer, Integer>> actualResult2 = CollectionSink.getElementsSet();

      Set<Tuple2<Integer, Integer>> expectedResult2 = new HashSet<>();

      for (int key = 0; key < numberKeys; key++) {
        int keyGroupIndex = KeyGroupRangeAssignment.assignToKeyGroup(key, maxParallelism);
        expectedResult2.add(
            Tuple2.of(
                KeyGroupRangeAssignment.computeOperatorIndexForKeyGroup(
                    maxParallelism, parallelism2, keyGroupIndex),
                key * (numberElements + numberElements2)));
      }

      assertEquals(expectedResult2, actualResult2);

    } finally {
      // clear the CollectionSink set for the restarted job
      CollectionSink.clearElementsSet();

      // clear any left overs from a possibly failed job
      if (jobID != null && jobManager != null) {
        Future<Object> jobRemovedFuture =
            jobManager.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), timeout);

        try {
          Await.ready(jobRemovedFuture, timeout);
        } catch (TimeoutException | InterruptedException ie) {
          fail("Failed while cleaning up the cluster.");
        }
      }
    }
  }
示例#14
0
  /**
   * Tests that a job cannot be restarted from a savepoint with a different parallelism if the
   * rescaled operator has non-partitioned state.
   *
   * @throws Exception
   */
  @Test
  public void testSavepointRescalingNonPartitionedStateCausesException() throws Exception {
    final int parallelism = numSlots / 2;
    final int parallelism2 = numSlots;
    final int maxParallelism = 13;

    FiniteDuration timeout = new FiniteDuration(3, TimeUnit.MINUTES);
    Deadline deadline = timeout.fromNow();

    JobID jobID = null;
    ActorGateway jobManager = null;

    try {
      jobManager = cluster.getLeaderGateway(deadline.timeLeft());

      JobGraph jobGraph = createJobGraphWithOperatorState(parallelism, maxParallelism, false);

      jobID = jobGraph.getJobID();

      cluster.submitJobDetached(jobGraph);

      Object savepointResponse = null;

      // wait until the operator is started
      StateSourceBase.workStartedLatch.await();

      while (deadline.hasTimeLeft()) {

        Future<Object> savepointPathFuture =
            jobManager.ask(new JobManagerMessages.TriggerSavepoint(jobID), deadline.timeLeft());
        FiniteDuration waitingTime = new FiniteDuration(10, TimeUnit.SECONDS);
        savepointResponse = Await.result(savepointPathFuture, waitingTime);

        if (savepointResponse instanceof JobManagerMessages.TriggerSavepointSuccess) {
          break;
        }
      }

      assertTrue(savepointResponse instanceof JobManagerMessages.TriggerSavepointSuccess);

      final String savepointPath =
          ((JobManagerMessages.TriggerSavepointSuccess) savepointResponse).savepointPath();

      Future<Object> jobRemovedFuture =
          jobManager.ask(
              new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), deadline.timeLeft());

      Future<Object> cancellationResponseFuture =
          jobManager.ask(new JobManagerMessages.CancelJob(jobID), deadline.timeLeft());

      Object cancellationResponse = Await.result(cancellationResponseFuture, deadline.timeLeft());

      assertTrue(cancellationResponse instanceof JobManagerMessages.CancellationSuccess);

      Await.ready(jobRemovedFuture, deadline.timeLeft());

      // job successfully removed
      jobID = null;

      JobGraph scaledJobGraph =
          createJobGraphWithOperatorState(parallelism2, maxParallelism, false);

      scaledJobGraph.setSavepointPath(savepointPath);

      jobID = scaledJobGraph.getJobID();

      cluster.submitJobAndWait(scaledJobGraph, false);

      jobID = null;

    } catch (JobExecutionException exception) {
      if (exception.getCause() instanceof SuppressRestartsException) {
        SuppressRestartsException suppressRestartsException =
            (SuppressRestartsException) exception.getCause();

        if (suppressRestartsException.getCause() instanceof IllegalStateException) {
          // we expect a IllegalStateException wrapped in a SuppressRestartsException wrapped
          // in a JobExecutionException, because the job containing non-partitioned state
          // is being rescaled
        } else {
          throw exception;
        }
      } else {
        throw exception;
      }
    } finally {
      // clear any left overs from a possibly failed job
      if (jobID != null && jobManager != null) {
        Future<Object> jobRemovedFuture =
            jobManager.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), timeout);

        try {
          Await.ready(jobRemovedFuture, timeout);
        } catch (TimeoutException | InterruptedException ie) {
          fail("Failed while cleaning up the cluster.");
        }
      }
    }
  }