@Test
  public void testDisposeSavepointSuccess() throws Exception {
    replaceStdOutAndStdErr();

    try {
      String savepointPath = "expectedSavepointPath";
      ActorGateway jobManager = mock(ActorGateway.class);

      Promise<Object> triggerResponse = new scala.concurrent.impl.Promise.DefaultPromise<>();

      when(jobManager.ask(
              Mockito.eq(new JobManagerMessages.DisposeSavepoint(savepointPath)),
              Mockito.any(FiniteDuration.class)))
          .thenReturn(triggerResponse.future());

      triggerResponse.success(JobManagerMessages.getDisposeSavepointSuccess());

      CliFrontend frontend = new MockCliFrontend(CliFrontendTestUtils.getConfigDir(), jobManager);

      String[] parameters = {"-d", savepointPath};
      int returnCode = frontend.savepoint(parameters);

      assertEquals(0, returnCode);
      verify(jobManager, times(1))
          .ask(
              Mockito.eq(new JobManagerMessages.DisposeSavepoint(savepointPath)),
              Mockito.any(FiniteDuration.class));

      String outMsg = buffer.toString();
      assertTrue(outMsg.contains(savepointPath));
      assertTrue(outMsg.contains("disposed"));
    } finally {
      restoreStdOutAndStdErr();
    }
  }
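For context, the production path that the mock above stands in for is a plain ask/await exchange with the JobManager. A minimal sketch, assuming an ActorGateway named jobManager and a FiniteDuration timeout are already in scope:

  // Sketch only: send DisposeSavepoint and block until the JobManager answers or the ask times out.
  Future<Object> response =
      jobManager.ask(new JobManagerMessages.DisposeSavepoint(savepointPath), timeout);

  Object result = Await.result(response, timeout);

  if (result.equals(JobManagerMessages.getDisposeSavepointSuccess())) {
    System.out.println("Savepoint " + savepointPath + " disposed.");
  } else {
    throw new IllegalStateException("Disposing the savepoint failed: " + result);
  }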
  /**
   * Tests that a job is properly canceled in the case of a leader change. However, this time only
   * the JMs are notified about the leader change and the TMs still believe that the old leader
   * holds the leadership.
   */
  @Test
  public void testStateCleanupAfterNewLeaderElection() throws Exception {
    UUID leaderSessionID = UUID.randomUUID();
    UUID newLeaderSessionID = UUID.randomUUID();

    cluster.grantLeadership(0, leaderSessionID);
    cluster.notifyRetrievalListeners(0, leaderSessionID);

    cluster.waitForTaskManagersToBeRegistered();

    // submit blocking job so that we can test job clean up
    cluster.submitJobDetached(job);

    ActorGateway jm = cluster.getLeaderGateway(timeout);

    Future<Object> wait =
        jm.ask(new WaitForAllVerticesToBeRunningOrFinished(job.getJobID()), timeout);

    Await.ready(wait, timeout);

    Future<Object> jobRemoval = jm.ask(new NotifyWhenJobRemoved(job.getJobID()), timeout);

    // only notify the JMs about the new leader JM(1)
    cluster.grantLeadership(1, newLeaderSessionID);

    // job should be removed anyway
    Await.ready(jobRemoval, timeout);
  }
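The wait-until-running / notify-when-removed sequence above recurs in every leader-change test in this class. A small helper along these lines could factor it out (a sketch reusing only the messages shown above and the tests' timeout field; the helper name is hypothetical):

  // Hypothetical helper: wait until all vertices of the job are running (or finished), then
  // return a future that completes once the JobManager has removed the job.
  private Future<Object> expectJobRemovalAfterRunning(ActorGateway jm, JobID jobId)
      throws Exception {
    Future<Object> running =
        jm.ask(new WaitForAllVerticesToBeRunningOrFinished(jobId), timeout);
    Await.ready(running, timeout);

    return jm.ask(new NotifyWhenJobRemoved(jobId), timeout);
  }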
  @Test
  public void testTriggerSavepointFailure() throws Exception {
    replaceStdOutAndStdErr();

    try {
      JobID jobId = new JobID();
      ActorGateway jobManager = mock(ActorGateway.class);

      Promise<Object> triggerResponse = new scala.concurrent.impl.Promise.DefaultPromise<>();

      when(jobManager.ask(
              Mockito.eq(new JobManagerMessages.TriggerSavepoint(jobId)),
              Mockito.any(FiniteDuration.class)))
          .thenReturn(triggerResponse.future());

      Exception testException = new Exception("expectedTestException");

      triggerResponse.success(new JobManagerMessages.TriggerSavepointFailure(jobId, testException));

      CliFrontend frontend = new MockCliFrontend(CliFrontendTestUtils.getConfigDir(), jobManager);

      String[] parameters = {jobId.toString()};
      int returnCode = frontend.savepoint(parameters);

      assertTrue(returnCode != 0);
      verify(jobManager, times(1))
          .ask(
              Mockito.eq(new JobManagerMessages.TriggerSavepoint(jobId)),
              Mockito.any(FiniteDuration.class));

      assertTrue(buffer.toString().contains("expectedTestException"));
    } finally {
      restoreStdOutAndStdErr();
    }
  }
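The protocol being mocked here is a simple success/failure pair. A hedged sketch of how a caller could discriminate the two answers, assuming a real ActorGateway (jobManager) and a FiniteDuration timeout are in scope and using only the message types that appear in these tests:

  // Sketch only: inspect the JobManager's answer to a TriggerSavepoint request.
  Object response =
      Await.result(jobManager.ask(new JobManagerMessages.TriggerSavepoint(jobId), timeout), timeout);

  if (response instanceof JobManagerMessages.TriggerSavepointSuccess) {
    String path = ((JobManagerMessages.TriggerSavepointSuccess) response).savepointPath();
    System.out.println("Savepoint stored at " + path);
  } else if (response instanceof JobManagerMessages.TriggerSavepointFailure) {
    // the failure message carries the triggering exception (see the mock above)
    throw new IllegalStateException("Triggering the savepoint failed: " + response);
  } else {
    throw new IllegalStateException("Unexpected response: " + response);
  }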
  /**
   * Tests that a job is properly canceled in the event of a leader change. However, this time only
   * the TMs are notified about the leader change. This alone should be enough to cancel the
   * currently running job.
   */
  @Test
  public void testStateCleanupAfterListenerNotification() throws Exception {
    UUID leaderSessionID = UUID.randomUUID();
    UUID newLeaderSessionID = UUID.randomUUID();

    cluster.grantLeadership(0, leaderSessionID);
    cluster.notifyRetrievalListeners(0, leaderSessionID);

    cluster.waitForTaskManagersToBeRegistered();

    // submit blocking job
    cluster.submitJobDetached(job);

    ActorGateway jm = cluster.getLeaderGateway(timeout);

    Future<Object> wait =
        jm.ask(new WaitForAllVerticesToBeRunningOrFinished(job.getJobID()), timeout);

    Await.ready(wait, timeout);

    Future<Object> jobRemoval = jm.ask(new NotifyWhenJobRemoved(job.getJobID()), timeout);

    // notify listeners (TMs) about the leader change
    cluster.notifyRetrievalListeners(1, newLeaderSessionID);

    Await.ready(jobRemoval, timeout);
  }
Example 5
  /**
   * This method sends a CancelTask message to the instance of the assigned slot.
   *
   * <p>The sending is tried up to NUM_CANCEL_CALL_TRIES times.
   */
  private void sendCancelRpcCall() {
    final SimpleSlot slot = this.assignedResource;

    if (slot != null) {

      final ActorGateway gateway = slot.getInstance().getActorGateway();

      Future<Object> cancelResult =
          gateway.retry(
              new CancelTask(attemptId), NUM_CANCEL_CALL_TRIES, timeout, executionContext);

      cancelResult.onComplete(
          new OnComplete<Object>() {

            @Override
            public void onComplete(Throwable failure, Object success) throws Throwable {
              if (failure != null) {
                fail(new Exception("Task could not be canceled.", failure));
              } else {
                TaskOperationResult result = (TaskOperationResult) success;
                if (!result.success()) {
                  LOG.debug(
                      "Cancel task call did not find task. Probably akka message call" + " race.");
                }
              }
            }
          },
          executionContext);
    }
  }
Example 6
    @Override
    public void run() {
      try {
        int returnValue;
        switch (type) {
          case YARN_SESSION:
            yCli = new FlinkYarnSessionCli("", "", false);
            returnValue = yCli.run(args);
            break;
          case CLI_FRONTEND:
            TestingCLI cli;
            try {
              cli = new TestingCLI();
              returnValue = cli.parseParameters(args);
            } catch (Exception e) {
              throw new RuntimeException(
                  "Failed to execute the following args with CliFrontend: " + Arrays.toString(args),
                  e);
            }

            final ClusterClient client = cli.getClusterClient();
            try {
              // check if the JobManager is still alive after running the job
              final FiniteDuration finiteDuration = new FiniteDuration(10, TimeUnit.SECONDS);
              ActorGateway jobManagerGateway = client.getJobManagerGateway();
              Await.ready(
                  jobManagerGateway.ask(new Identify(true), finiteDuration), finiteDuration);
            } catch (Exception e) {
              throw new RuntimeException(
                  "It seems like the JobManager died although it should still be alive");
            }
            // verify we would have shut down anyway, and then shut down
            Mockito.verify(cli.getSpiedClusterClient()).shutdown();
            client.shutdown();

            break;
          default:
            throw new RuntimeException("Unknown type " + type);
        }

        if (returnValue != this.expectedReturnValue) {
          Assert.fail(
              "The YARN session returned with unexpected value="
                  + returnValue
                  + " expected="
                  + expectedReturnValue);
        }
      } catch (Throwable t) {
        LOG.info("Runner stopped with exception", t);
        // save error.
        this.runnerError = t;
      }
    }
Example 7
  private void sendFailIntermediateResultPartitionsRpcCall() {
    final SimpleSlot slot = this.assignedResource;

    if (slot != null) {
      final Instance instance = slot.getInstance();

      if (instance.isAlive()) {
        final ActorGateway gateway = instance.getActorGateway();

        // TODO For some tests this could be a problem when querying too early if all resources were
        // released
        gateway.tell(new FailIntermediateResultPartitions(attemptId));
      }
    }
  }
  /**
   * Verifies that a correct error message is produced when vertices with master-side
   * initialization (input formats / output formats) fail.
   */
  @Test
  public void testFailureWhenInitializeOnMasterFails() {
    try {
      // create a simple job graph

      JobVertex jobVertex =
          new JobVertex("Vertex that fails in initializeOnMaster") {

            @Override
            public void initializeOnMaster(ClassLoader loader) throws Exception {
              throw new RuntimeException("test exception");
            }
          };

      jobVertex.setInvokableClass(Tasks.NoOpInvokable.class);
      JobGraph jg = new JobGraph("test job", jobVertex);

      // submit the job
      Future<Object> submitFuture =
          jmGateway.ask(
              new JobManagerMessages.SubmitJob(jg, ListeningBehaviour.EXECUTION_RESULT), timeout);
      try {
        Await.result(submitFuture, timeout);
      } catch (JobExecutionException e) {
        // that is what we expect
        // test that the exception nesting is not too deep
        assertTrue(e.getCause() instanceof RuntimeException);
      } catch (Exception e) {
        fail("Wrong exception type");
      }
    } catch (Exception e) {
      e.printStackTrace();
      fail(e.getMessage());
    }
  }
  /**
   * Tests that a job is properly canceled in the case of a leader change. In such an event all
   * TaskManagers have to disconnect from the previous leader and connect to the newly elected
   * leader.
   */
  @Test
  public void testStateCleanupAfterNewLeaderElectionAndListenerNotification() throws Exception {
    UUID leaderSessionID1 = UUID.randomUUID();
    UUID leaderSessionID2 = UUID.randomUUID();

    // first make JM(0) the leader
    cluster.grantLeadership(0, leaderSessionID1);
    // notify all listeners
    cluster.notifyRetrievalListeners(0, leaderSessionID1);

    cluster.waitForTaskManagersToBeRegistered();

    // submit blocking job so that it is not finished when we cancel it
    cluster.submitJobDetached(job);

    ActorGateway jm = cluster.getLeaderGateway(timeout);

    Future<Object> wait =
        jm.ask(new WaitForAllVerticesToBeRunningOrFinished(job.getJobID()), timeout);

    Await.ready(wait, timeout);

    Future<Object> jobRemoval = jm.ask(new NotifyWhenJobRemoved(job.getJobID()), timeout);

    // make the JM(1) the new leader
    cluster.grantLeadership(1, leaderSessionID2);
    // notify all listeners about the event
    cluster.notifyRetrievalListeners(1, leaderSessionID2);

    Await.ready(jobRemoval, timeout);

    cluster.waitForTaskManagersToBeRegistered();

    ActorGateway jm2 = cluster.getLeaderGateway(timeout);

    Future<Object> futureNumberSlots =
        jm2.ask(JobManagerMessages.getRequestTotalNumberOfSlots(), timeout);

    // check that all TMs have registered at the new leader
    int numberSlots = (Integer) Await.result(futureNumberSlots, timeout);

    assertEquals(parallelism, numberSlots);

    // now resubmit the job, which no longer blocks; it should complete successfully
    Tasks.BlockingOnceReceiver$.MODULE$.blocking_$eq(false);
    cluster.submitJobAndWait(job, false, timeout);
  }
  /**
   * Tests that the same JobManager can be re-elected as the leader. Even though the same JM is
   * elected as the next leader, all currently running jobs should be canceled properly and all TMs
   * should disconnect from the leader and then reconnect to it.
   */
  @Test
  public void testReelectionOfSameJobManager() throws Exception {
    UUID leaderSessionID = UUID.randomUUID();
    UUID newLeaderSessionID = UUID.randomUUID();

    FiniteDuration shortTimeout = new FiniteDuration(20, TimeUnit.SECONDS);

    cluster.grantLeadership(0, leaderSessionID);
    cluster.notifyRetrievalListeners(0, leaderSessionID);

    cluster.waitForTaskManagersToBeRegistered();

    // submit blocking job
    cluster.submitJobDetached(job);

    ActorGateway jm = cluster.getLeaderGateway(timeout);

    Future<Object> wait =
        jm.ask(new WaitForAllVerticesToBeRunningOrFinished(job.getJobID()), timeout);

    Await.ready(wait, timeout);

    Future<Object> jobRemoval = jm.ask(new NotifyWhenJobRemoved(job.getJobID()), timeout);

    // make JM(0) the leader again --> this implies a leadership revocation first
    cluster.grantLeadership(0, newLeaderSessionID);

    Await.ready(jobRemoval, timeout);

    // The TMs should not be able to reconnect since they don't know the current leader
    // session ID
    try {
      cluster.waitForTaskManagersToBeRegistered(shortTimeout);
      fail("TaskManager should not be able to register at JobManager.");
    } catch (TimeoutException e) {
      // expected exception since the TMs still have the old leader session ID
    }

    // notify the TMs about the new (old) leader
    cluster.notifyRetrievalListeners(0, newLeaderSessionID);

    cluster.waitForTaskManagersToBeRegistered();

    // now resubmit the job, which no longer blocks; it should complete successfully
    Tasks.BlockingOnceReceiver$.MODULE$.blocking_$eq(false);
    cluster.submitJobAndWait(job, false, timeout);
  }
Example 11
  @Test
  public void testFailureWhenJarBlobsMissing() {
    try {
      // create a simple job graph
      JobVertex jobVertex = new JobVertex("Test Vertex");
      jobVertex.setInvokableClass(Tasks.NoOpInvokable.class);
      JobGraph jg = new JobGraph("test job", jobVertex);

      // request the blob port from the job manager
      Future<Object> future =
          jmGateway.ask(JobManagerMessages.getRequestBlobManagerPort(), timeout);
      int blobPort = (Integer) Await.result(future, timeout);

      // upload two dummy bytes and add their keys to the job graph as dependencies
      BlobKey key1, key2;
      BlobClient bc = new BlobClient(new InetSocketAddress("localhost", blobPort));
      try {
        key1 = bc.put(new byte[10]);
        key2 = bc.put(new byte[10]);

        // delete one of the blobs to make sure that the startup failed
        bc.delete(key2);
      } finally {
        bc.close();
      }

      jg.addBlob(key1);
      jg.addBlob(key2);

      // submit the job
      Future<Object> submitFuture =
          jmGateway.ask(
              new JobManagerMessages.SubmitJob(jg, ListeningBehaviour.EXECUTION_RESULT), timeout);
      try {
        Await.result(submitFuture, timeout);
      } catch (JobExecutionException e) {
        // that is what we expect
        assertTrue(e.getCause() instanceof IOException);
      } catch (Exception e) {
        fail("Wrong exception type");
      }
    } catch (Exception e) {
      e.printStackTrace();
      fail(e.getMessage());
    }
  }
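The BlobClient handling in this test follows the usual open/put/close discipline. A minimal helper sketch under the same assumptions (a blob server reachable on localhost at the port returned by the JobManager; the helper name is hypothetical):

  // Hypothetical helper: upload a payload to the blob server and always close the client.
  private static BlobKey uploadBlob(int blobPort, byte[] payload) throws IOException {
    BlobClient bc = new BlobClient(new InetSocketAddress("localhost", blobPort));
    try {
      return bc.put(payload);
    } finally {
      bc.close();
    }
  }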
Example 12
  /**
   * Sends an UpdatePartitionInfo message to the instance of the consumerSlot.
   *
   * @param consumerSlot Slot to whose instance the message will be sent
   * @param updatePartitionInfo UpdatePartitionInfo message
   */
  private void sendUpdatePartitionInfoRpcCall(
      final SimpleSlot consumerSlot, final UpdatePartitionInfo updatePartitionInfo) {

    if (consumerSlot != null) {
      final Instance instance = consumerSlot.getInstance();
      final ActorGateway gateway = instance.getActorGateway();

      Future<Object> futureUpdate = gateway.ask(updatePartitionInfo, timeout);

      futureUpdate.onFailure(
          new OnFailure() {
            @Override
            public void onFailure(Throwable failure) throws Throwable {
              fail(
                  new IllegalStateException(
                      "Update task on instance " + instance + " failed due to:", failure));
            }
          },
          executionContext);
    }
  }
Example 13
  /**
   * Shuts down the checkpoint coordinator.
   *
   * <p>After this method has been called, the coordinator does not accept any further messages and
   * cannot trigger any further checkpoints.
   */
  public void shutdown() throws Exception {
    synchronized (lock) {
      try {
        if (!shutdown) {
          shutdown = true;
          LOG.info("Stopping checkpoint coordinator for job " + job);

          periodicScheduling = false;
          triggerRequestQueued = false;

          // shut down the thread that handles the timeouts and pending triggers
          timer.cancel();

          // make sure that the actor does not linger
          if (jobStatusListener != null) {
            jobStatusListener.tell(PoisonPill.getInstance());
            jobStatusListener = null;
          }

          checkpointIdCounter.stop();

          // clear and discard all pending checkpoints
          for (PendingCheckpoint pending : pendingCheckpoints.values()) {
            pending.discard(userClassLoader);
          }
          pendingCheckpoints.clear();

          // clear and discard all successful checkpoints
          completedCheckpointStore.discardAllCheckpoints();

          onShutdown();
        }
      } finally {
        // Remove shutdown hook to prevent resource leaks, unless this is invoked by the
        // shutdown hook itself.
        if (shutdownHook != null && shutdownHook != Thread.currentThread()) {
          try {
            Runtime.getRuntime().removeShutdownHook(shutdownHook);
          } catch (IllegalStateException ignored) {
            // race, JVM is in shutdown already, we can safely ignore this
          } catch (Throwable t) {
            LOG.warn("Error unregistering checkpoint coordinator shutdown hook.", t);
          }
        }
      }
    }
  }
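The shutdown flag and the careful removal of the shutdown hook form a reusable pattern that is independent of checkpointing. An illustrative, self-contained sketch of just that pattern (not the coordinator's actual fields):

  // Illustrative guarded shutdown: the flag makes shutdown() idempotent, and the hook is only
  // removed when shutdown() is not itself running inside the shutdown hook.
  static class GuardedShutdown {
    private final Object lock = new Object();
    private boolean shutdown;
    private Thread shutdownHook;

    void shutdown() {
      synchronized (lock) {
        try {
          if (!shutdown) {
            shutdown = true;
            // ... release resources here ...
          }
        } finally {
          if (shutdownHook != null && shutdownHook != Thread.currentThread()) {
            try {
              Runtime.getRuntime().removeShutdownHook(shutdownHook);
            } catch (IllegalStateException ignored) {
              // the JVM is already shutting down
            }
          }
        }
      }
    }
  }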
Example 14
  public ExecutionGraph getExecutionGraph(JobID jid) {
    ExecutionGraph cached = cache.get(jid);
    if (cached != null) {
      return cached;
    }

    try {
      Future<Object> future = source.ask(new JobManagerMessages.RequestJob(jid), timeout);
      Object result = Await.result(future, timeout);
      if (result instanceof JobManagerMessages.JobNotFound) {
        return null;
      } else if (result instanceof JobManagerMessages.JobFound) {
        ExecutionGraph eg = ((JobManagerMessages.JobFound) result).executionGraph();
        cache.put(jid, eg);
        return eg;
      } else {
        throw new RuntimeException("Unknown response from JobManager / Archive: " + result);
      }
    } catch (Exception e) {
      throw new RuntimeException("Error requesting execution graph", e);
    }
  }
Example 15
  /**
   * Tests that a job cannot be restarted from a savepoint with a different parallelism if the
   * rescaled operator has non-partitioned state.
   *
   * @throws Exception
   */
  @Test
  public void testSavepointRescalingNonPartitionedStateCausesException() throws Exception {
    final int parallelism = numSlots / 2;
    final int parallelism2 = numSlots;
    final int maxParallelism = 13;

    FiniteDuration timeout = new FiniteDuration(3, TimeUnit.MINUTES);
    Deadline deadline = timeout.fromNow();

    JobID jobID = null;
    ActorGateway jobManager = null;

    try {
      jobManager = cluster.getLeaderGateway(deadline.timeLeft());

      JobGraph jobGraph = createJobGraphWithOperatorState(parallelism, maxParallelism, false);

      jobID = jobGraph.getJobID();

      cluster.submitJobDetached(jobGraph);

      Object savepointResponse = null;

      // wait until the operator is started
      StateSourceBase.workStartedLatch.await();

      while (deadline.hasTimeLeft()) {

        Future<Object> savepointPathFuture =
            jobManager.ask(new JobManagerMessages.TriggerSavepoint(jobID), deadline.timeLeft());
        FiniteDuration waitingTime = new FiniteDuration(10, TimeUnit.SECONDS);
        savepointResponse = Await.result(savepointPathFuture, waitingTime);

        if (savepointResponse instanceof JobManagerMessages.TriggerSavepointSuccess) {
          break;
        }
      }

      assertTrue(savepointResponse instanceof JobManagerMessages.TriggerSavepointSuccess);

      final String savepointPath =
          ((JobManagerMessages.TriggerSavepointSuccess) savepointResponse).savepointPath();

      Future<Object> jobRemovedFuture =
          jobManager.ask(
              new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), deadline.timeLeft());

      Future<Object> cancellationResponseFuture =
          jobManager.ask(new JobManagerMessages.CancelJob(jobID), deadline.timeLeft());

      Object cancellationResponse = Await.result(cancellationResponseFuture, deadline.timeLeft());

      assertTrue(cancellationResponse instanceof JobManagerMessages.CancellationSuccess);

      Await.ready(jobRemovedFuture, deadline.timeLeft());

      // job successfully removed
      jobID = null;

      JobGraph scaledJobGraph =
          createJobGraphWithOperatorState(parallelism2, maxParallelism, false);

      scaledJobGraph.setSavepointPath(savepointPath);

      jobID = scaledJobGraph.getJobID();

      cluster.submitJobAndWait(scaledJobGraph, false);

      jobID = null;

    } catch (JobExecutionException exception) {
      if (exception.getCause() instanceof SuppressRestartsException) {
        SuppressRestartsException suppressRestartsException =
            (SuppressRestartsException) exception.getCause();

        if (suppressRestartsException.getCause() instanceof IllegalStateException) {
          // we expect an IllegalStateException wrapped in a SuppressRestartsException wrapped
          // in a JobExecutionException, because the job containing non-partitioned state
          // is being rescaled
        } else {
          throw exception;
        }
      } else {
        throw exception;
      }
    } finally {
      // clear any leftovers from a possibly failed job
      if (jobID != null && jobManager != null) {
        Future<Object> jobRemovedFuture =
            jobManager.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), timeout);

        try {
          Await.ready(jobRemovedFuture, timeout);
        } catch (TimeoutException | InterruptedException ie) {
          fail("Failed while cleaning up the cluster.");
        }
      }
    }
  }
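Both rescaling tests poll the JobManager until a savepoint is reported as successful or the deadline expires. A helper of roughly this shape could encapsulate the loop (a sketch using only the messages and Deadline calls shown above; the helper name is hypothetical):

  // Hypothetical helper: repeatedly trigger a savepoint until the JobManager answers with
  // TriggerSavepointSuccess, or fail once the deadline has expired.
  private static String triggerSavepointWithin(
      ActorGateway jobManager, JobID jobID, Deadline deadline) throws Exception {

    FiniteDuration waitingTime = new FiniteDuration(10, TimeUnit.SECONDS);

    while (deadline.hasTimeLeft()) {
      Future<Object> savepointPathFuture =
          jobManager.ask(new JobManagerMessages.TriggerSavepoint(jobID), deadline.timeLeft());
      Object response = Await.result(savepointPathFuture, waitingTime);

      if (response instanceof JobManagerMessages.TriggerSavepointSuccess) {
        return ((JobManagerMessages.TriggerSavepointSuccess) response).savepointPath();
      }
    }

    throw new AssertionError("No savepoint was taken before the deadline expired.");
  }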
Example 16
  /**
   * Tests rescaling of partitioned operator state. More specifically, we test the mechanism with {@link
   * ListCheckpointed} as it subsumes {@link
   * org.apache.flink.streaming.api.checkpoint.CheckpointedFunction}.
   */
  public void testSavepointRescalingPartitionedOperatorState(boolean scaleOut) throws Exception {
    final int parallelism = scaleOut ? numSlots : numSlots / 2;
    final int parallelism2 = scaleOut ? numSlots / 2 : numSlots;
    final int maxParallelism = 13;

    FiniteDuration timeout = new FiniteDuration(3, TimeUnit.MINUTES);
    Deadline deadline = timeout.fromNow();

    JobID jobID = null;
    ActorGateway jobManager = null;

    int counterSize = Math.max(parallelism, parallelism2);

    PartitionedStateSource.CHECK_CORRECT_SNAPSHOT = new int[counterSize];
    PartitionedStateSource.CHECK_CORRECT_RESTORE = new int[counterSize];

    try {
      jobManager = cluster.getLeaderGateway(deadline.timeLeft());

      JobGraph jobGraph = createJobGraphWithOperatorState(parallelism, maxParallelism, true);

      jobID = jobGraph.getJobID();

      cluster.submitJobDetached(jobGraph);

      Object savepointResponse = null;

      // wait until the operator is started
      StateSourceBase.workStartedLatch.await();

      while (deadline.hasTimeLeft()) {

        Future<Object> savepointPathFuture =
            jobManager.ask(new JobManagerMessages.TriggerSavepoint(jobID), deadline.timeLeft());
        FiniteDuration waitingTime = new FiniteDuration(10, TimeUnit.SECONDS);
        savepointResponse = Await.result(savepointPathFuture, waitingTime);

        if (savepointResponse instanceof JobManagerMessages.TriggerSavepointSuccess) {
          break;
        }
      }

      assertTrue(savepointResponse instanceof JobManagerMessages.TriggerSavepointSuccess);

      final String savepointPath =
          ((JobManagerMessages.TriggerSavepointSuccess) savepointResponse).savepointPath();

      Future<Object> jobRemovedFuture =
          jobManager.ask(
              new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), deadline.timeLeft());

      Future<Object> cancellationResponseFuture =
          jobManager.ask(new JobManagerMessages.CancelJob(jobID), deadline.timeLeft());

      Object cancellationResponse = Await.result(cancellationResponseFuture, deadline.timeLeft());

      assertTrue(cancellationResponse instanceof JobManagerMessages.CancellationSuccess);

      Await.ready(jobRemovedFuture, deadline.timeLeft());

      // job successfully removed
      jobID = null;

      JobGraph scaledJobGraph = createJobGraphWithOperatorState(parallelism2, maxParallelism, true);

      scaledJobGraph.setSavepointPath(savepointPath);

      jobID = scaledJobGraph.getJobID();

      cluster.submitJobAndWait(scaledJobGraph, false);

      int sumExp = 0;
      int sumAct = 0;

      for (int c : PartitionedStateSource.CHECK_CORRECT_SNAPSHOT) {
        sumExp += c;
      }

      for (int c : PartitionedStateSource.CHECK_CORRECT_RESTORE) {
        sumAct += c;
      }

      assertEquals(sumExp, sumAct);
      jobID = null;

    } finally {
      // clear any leftovers from a possibly failed job
      if (jobID != null && jobManager != null) {
        Future<Object> jobRemovedFuture =
            jobManager.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), timeout);

        try {
          Await.ready(jobRemovedFuture, timeout);
        } catch (TimeoutException | InterruptedException ie) {
          fail("Failed while cleaning up the cluster.");
        }
      }
    }
  }
Example 17
  @Override
  public String handleJsonRequest(
      Map<String, String> pathParams, Map<String, String> queryParams, ActorGateway jobManager)
      throws Exception {
    try {
      if (jobManager != null) {
        // Whether the metrics of one task manager or of all task managers are requested,
        // we return them in an array. This avoids unnecessary code complexity.
        // If only one task manager is requested, we fetch only that task manager's metrics.
        final List<Instance> instances = new ArrayList<>();
        if (pathParams.containsKey(TASK_MANAGER_ID_KEY)) {
          try {
            InstanceID instanceID =
                new InstanceID(StringUtils.hexStringToByte(pathParams.get(TASK_MANAGER_ID_KEY)));
            Future<Object> future =
                jobManager.ask(
                    new JobManagerMessages.RequestTaskManagerInstance(instanceID), timeout);
            TaskManagerInstance instance = (TaskManagerInstance) Await.result(future, timeout);
            if (instance.instance().nonEmpty()) {
              instances.add(instance.instance().get());
            }
          }
          // this means the id string was invalid. Keep the list empty.
          catch (IllegalArgumentException e) {
            // do nothing.
          }
        } else {
          Future<Object> future =
              jobManager.ask(JobManagerMessages.getRequestRegisteredTaskManagers(), timeout);
          RegisteredTaskManagers taskManagers =
              (RegisteredTaskManagers) Await.result(future, timeout);
          instances.addAll(taskManagers.asJavaCollection());
        }

        StringWriter writer = new StringWriter();
        JsonGenerator gen = JsonFactory.jacksonFactory.createGenerator(writer);

        gen.writeStartObject();
        gen.writeArrayFieldStart("taskmanagers");

        for (Instance instance : instances) {
          gen.writeStartObject();
          gen.writeStringField("id", instance.getId().toString());
          gen.writeStringField("path", instance.getActorGateway().path());
          gen.writeNumberField("dataPort", instance.getTaskManagerLocation().dataPort());
          gen.writeNumberField("timeSinceLastHeartbeat", instance.getLastHeartBeat());
          gen.writeNumberField("slotsNumber", instance.getTotalNumberOfSlots());
          gen.writeNumberField("freeSlots", instance.getNumberOfAvailableSlots());
          gen.writeNumberField("cpuCores", instance.getResources().getNumberOfCPUCores());
          gen.writeNumberField("physicalMemory", instance.getResources().getSizeOfPhysicalMemory());
          gen.writeNumberField("freeMemory", instance.getResources().getSizeOfJvmHeap());
          gen.writeNumberField("managedMemory", instance.getResources().getSizeOfManagedMemory());

          // only send metrics when a single task manager is requested.
          if (pathParams.containsKey(TASK_MANAGER_ID_KEY)) {
            byte[] report = instance.getLastMetricsReport();
            if (report != null) {
              gen.writeFieldName("metrics");
              gen.writeRawValue(new String(report, "utf-8"));
            }
          }

          gen.writeEndObject();
        }

        gen.writeEndArray();
        gen.writeEndObject();

        gen.close();
        return writer.toString();
      } else {
        throw new Exception("No connection to the leading JobManager.");
      }
    } catch (Exception e) {
      throw new RuntimeException("Failed to fetch list of all task managers: " + e.getMessage(), e);
    }
  }
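A caller of this handler receives plain JSON and can read back the fields written above, for example with a Jackson ObjectMapper (assumed to be available on the classpath; the handler variable below is hypothetical):

  // Sketch only: parse the handler's response and print one field per task manager.
  String json = handler.handleJsonRequest(pathParams, queryParams, jobManager);
  JsonNode root = new ObjectMapper().readTree(json);
  for (JsonNode tm : root.get("taskmanagers")) {
    System.out.println(tm.get("id").asText() + " has " + tm.get("freeSlots").asInt() + " free slots");
  }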
Example 18
  /**
   * Tests that a job with non-partitioned state can be restarted from a savepoint with a different
   * parallelism if the operators with non-partitioned state are not rescaled.
   *
   * @throws Exception
   */
  @Test
  public void testSavepointRescalingWithKeyedAndNonPartitionedState() throws Exception {
    int numberKeys = 42;
    int numberElements = 1000;
    int numberElements2 = 500;
    int parallelism = numSlots / 2;
    int parallelism2 = numSlots;
    int maxParallelism = 13;

    FiniteDuration timeout = new FiniteDuration(3, TimeUnit.MINUTES);
    Deadline deadline = timeout.fromNow();

    ActorGateway jobManager = null;
    JobID jobID = null;

    try {
      jobManager = cluster.getLeaderGateway(deadline.timeLeft());

      JobGraph jobGraph =
          createJobGraphWithKeyedAndNonPartitionedOperatorState(
              parallelism, maxParallelism, parallelism, numberKeys, numberElements, false, 100);

      jobID = jobGraph.getJobID();

      cluster.submitJobDetached(jobGraph);

      // wait until the sources have emitted numberElements for each key and completed a checkpoint
      SubtaskIndexFlatMapper.workCompletedLatch.await(
          deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

      // verify the current state

      Set<Tuple2<Integer, Integer>> actualResult = CollectionSink.getElementsSet();

      Set<Tuple2<Integer, Integer>> expectedResult = new HashSet<>();

      for (int key = 0; key < numberKeys; key++) {
        int keyGroupIndex = KeyGroupRangeAssignment.assignToKeyGroup(key, maxParallelism);

        expectedResult.add(
            Tuple2.of(
                KeyGroupRangeAssignment.computeOperatorIndexForKeyGroup(
                    maxParallelism, parallelism, keyGroupIndex),
                numberElements * key));
      }

      assertEquals(expectedResult, actualResult);

      // clear the CollectionSink set for the restarted job
      CollectionSink.clearElementsSet();

      Future<Object> savepointPathFuture =
          jobManager.ask(new JobManagerMessages.TriggerSavepoint(jobID), deadline.timeLeft());

      final String savepointPath =
          ((JobManagerMessages.TriggerSavepointSuccess)
                  Await.result(savepointPathFuture, deadline.timeLeft()))
              .savepointPath();

      Future<Object> jobRemovedFuture =
          jobManager.ask(
              new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), deadline.timeLeft());

      Future<Object> cancellationResponseFuture =
          jobManager.ask(new JobManagerMessages.CancelJob(jobID), deadline.timeLeft());

      Object cancellationResponse = Await.result(cancellationResponseFuture, deadline.timeLeft());

      assertTrue(cancellationResponse instanceof JobManagerMessages.CancellationSuccess);

      Await.ready(jobRemovedFuture, deadline.timeLeft());

      jobID = null;

      JobGraph scaledJobGraph =
          createJobGraphWithKeyedAndNonPartitionedOperatorState(
              parallelism2,
              maxParallelism,
              parallelism,
              numberKeys,
              numberElements + numberElements2,
              true,
              100);

      scaledJobGraph.setSavepointPath(savepointPath);

      jobID = scaledJobGraph.getJobID();

      cluster.submitJobAndWait(scaledJobGraph, false);

      jobID = null;

      Set<Tuple2<Integer, Integer>> actualResult2 = CollectionSink.getElementsSet();

      Set<Tuple2<Integer, Integer>> expectedResult2 = new HashSet<>();

      for (int key = 0; key < numberKeys; key++) {
        int keyGroupIndex = KeyGroupRangeAssignment.assignToKeyGroup(key, maxParallelism);
        expectedResult2.add(
            Tuple2.of(
                KeyGroupRangeAssignment.computeOperatorIndexForKeyGroup(
                    maxParallelism, parallelism2, keyGroupIndex),
                key * (numberElements + numberElements2)));
      }

      assertEquals(expectedResult2, actualResult2);

    } finally {
      // clear the CollectionSink set for the restarted job
      CollectionSink.clearElementsSet();

      // clear any leftovers from a possibly failed job
      if (jobID != null && jobManager != null) {
        Future<Object> jobRemovedFuture =
            jobManager.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), timeout);

        try {
          Await.ready(jobRemovedFuture, timeout);
        } catch (TimeoutException | InterruptedException ie) {
          fail("Failed while cleaning up the cluster.");
        }
      }
    }
  }
Example 19
  public void deployToSlot(final SimpleSlot slot) throws JobException {
    // sanity checks
    if (slot == null) {
      throw new NullPointerException();
    }
    if (!slot.isAlive()) {
      throw new JobException("Target slot for deployment is not alive.");
    }

    // make sure exactly one deployment call happens from the correct state
    // note: the transition from CREATED to DEPLOYING is for testing purposes only
    ExecutionState previous = this.state;
    if (previous == SCHEDULED || previous == CREATED) {
      if (!transitionState(previous, DEPLOYING)) {
        // race condition, someone else beat us to the deploying call.
        // this should actually not happen and indicates a race somewhere else
        throw new IllegalStateException("Cannot deploy task: Concurrent deployment call race.");
      }
    } else {
      // vertex may have been cancelled, or it was already scheduled
      throw new IllegalStateException(
          "The vertex must be in CREATED or SCHEDULED state to be deployed. Found state "
              + previous);
    }

    try {
      // good, we are allowed to deploy
      if (!slot.setExecutedVertex(this)) {
        throw new JobException("Could not assign the ExecutionVertex to the slot " + slot);
      }
      this.assignedResource = slot;
      this.assignedResourceLocation = slot.getInstance().getInstanceConnectionInfo();

      // race double check, did we fail/cancel and do we need to release the slot?
      if (this.state != DEPLOYING) {
        slot.releaseSlot();
        return;
      }

      if (LOG.isInfoEnabled()) {
        LOG.info(
            String.format(
                "Deploying %s (attempt #%d) to %s",
                vertex.getSimpleName(),
                attemptNumber,
                slot.getInstance().getInstanceConnectionInfo().getHostname()));
      }

      final TaskDeploymentDescriptor deployment =
          vertex.createDeploymentDescriptor(
              attemptId, slot, operatorState, recoveryTimestamp, attemptNumber);

      // register this execution at the execution graph, to receive callbacks
      vertex.getExecutionGraph().registerExecution(this);

      final Instance instance = slot.getInstance();
      final ActorGateway gateway = instance.getActorGateway();

      final Future<Object> deployAction = gateway.ask(new SubmitTask(deployment), timeout);

      deployAction.onComplete(
          new OnComplete<Object>() {

            @Override
            public void onComplete(Throwable failure, Object success) throws Throwable {
              if (failure != null) {
                if (failure instanceof TimeoutException) {
                  String taskname =
                      deployment.getTaskInfo().getTaskNameWithSubtasks() + " (" + attemptId + ')';

                  markFailed(
                      new Exception(
                          "Cannot deploy task "
                              + taskname
                              + " - TaskManager ("
                              + instance
                              + ") not responding after a timeout of "
                              + timeout,
                          failure));
                } else {
                  markFailed(failure);
                }
              } else {
                if (!(success.equals(Messages.getAcknowledge()))) {
                  markFailed(
                      new Exception(
                          "Failed to deploy the task to slot "
                              + slot
                              + ": Response was not of type Acknowledge"));
                }
              }
            }
          },
          executionContext);
    } catch (Throwable t) {
      markFailed(t);
      ExceptionUtils.rethrow(t);
    }
  }
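The guard from CREATED or SCHEDULED to DEPLOYING hinges on transitionState being atomic. In isolation, such a transition is typically a compare-and-set; an illustrative sketch (not the actual Execution implementation, which may store its state differently):

  // Illustrative compare-and-set transition: returns false if another thread already moved the
  // execution out of the expected state, which is exactly the race checked in deployToSlot.
  private final AtomicReference<ExecutionState> state =
      new AtomicReference<>(ExecutionState.CREATED);

  private boolean transitionState(ExecutionState expected, ExecutionState target) {
    return state.compareAndSet(expected, target);
  }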