예제 #1
0
  @BeforeClass
  public static void setupJobManager() {
    Configuration config = new Configuration();

    int port = NetUtils.getAvailablePort();

    config.setString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, "localhost");
    config.setInteger(ConfigConstants.JOB_MANAGER_IPC_PORT_KEY, port);

    scala.Option<Tuple2<String, Object>> listeningAddress =
        scala.Option.apply(new Tuple2<String, Object>("localhost", port));
    jobManagerSystem = AkkaUtils.createActorSystem(config, listeningAddress);
    ActorRef jobManagerActorRef =
        JobManager.startJobManagerActors(
                config,
                jobManagerSystem,
                StreamingMode.BATCH_ONLY,
                JobManager.class,
                MemoryArchivist.class)
            ._1();

    try {
      LeaderRetrievalService lrs = LeaderRetrievalUtils.createLeaderRetrievalService(config);

      jmGateway = LeaderRetrievalUtils.retrieveLeaderGateway(lrs, jobManagerSystem, timeout);
    } catch (Exception e) {
      fail("Could not retrieve the JobManager gateway. " + e.getMessage());
    }
  }
예제 #2
0
  private FiniteDuration getTimeout() {
    final Configuration configuration = GlobalConfiguration.getConfiguration();
    if (this.timeout != null) {
      configuration.setString(ConfigConstants.AKKA_ASK_TIMEOUT, this.timeout);
    }

    return AkkaUtils.getClientTimeout(configuration);
  }
예제 #3
0
  private ActorRef getJobManager() throws IOException {
    final Configuration configuration = GlobalConfiguration.getConfiguration();

    ActorSystem actorSystem;
    try {
      final scala.Tuple2<String, Object> systemEndpoint = new scala.Tuple2<String, Object>("", 0);
      actorSystem =
          AkkaUtils.createActorSystem(
              configuration, new Some<scala.Tuple2<String, Object>>(systemEndpoint));
    } catch (final Exception e) {
      throw new RuntimeException("Could not start actor system to communicate with JobManager", e);
    }

    return JobManager.getJobManagerActorRef(
        new InetSocketAddress(this.jobManagerHost, this.jobManagerPort),
        actorSystem,
        AkkaUtils.getLookupTimeout(configuration));
  }
  @BeforeClass
  public static void startActorSystem() {
    config = new Configuration();
    config.setString(ConfigConstants.AKKA_ASK_TIMEOUT, "5 s");
    config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_INTERVAL, "200 ms");
    config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_PAUSE, "2 s");
    config.setDouble(ConfigConstants.AKKA_WATCH_THRESHOLD, 2.0);

    actorSystem = AkkaUtils.createLocalActorSystem(config);
  }
  private Map<ExecutionAttemptID, Execution> setupExecution(
      JobVertex v1, int dop1, JobVertex v2, int dop2) throws Exception {
    final JobID jobId = new JobID();

    v1.setParallelism(dop1);
    v2.setParallelism(dop2);

    v1.setInvokableClass(BatchTask.class);
    v2.setInvokableClass(BatchTask.class);

    // execution graph that executes actions synchronously
    ExecutionGraph eg =
        new ExecutionGraph(
            TestingUtils.directExecutionContext(),
            jobId,
            "some job",
            new Configuration(),
            new SerializedValue<>(new ExecutionConfig()),
            AkkaUtils.getDefaultTimeout(),
            new NoRestartStrategy());

    eg.setQueuedSchedulingAllowed(false);

    List<JobVertex> ordered = Arrays.asList(v1, v2);
    eg.attachJobGraph(ordered);

    Scheduler scheduler = new Scheduler(TestingUtils.defaultExecutionContext());
    for (int i = 0; i < dop1 + dop2; i++) {
      scheduler.newInstanceAvailable(
          ExecutionGraphTestUtils.getInstance(
              new ExecutionGraphTestUtils.SimpleActorGateway(
                  TestingUtils.directExecutionContext())));
    }
    assertEquals(dop1 + dop2, scheduler.getNumberOfAvailableSlots());

    // schedule, this triggers mock deployment
    eg.scheduleForExecution(scheduler);

    Map<ExecutionAttemptID, Execution> executions = eg.getRegisteredExecutions();
    assertEquals(dop1 + dop2, executions.size());

    return executions;
  }
예제 #6
0
  /**
   * Create a new Flink on YARN cluster.
   *
   * @param yarnClient
   * @param appId the YARN application ID
   * @param hadoopConfig
   * @param flinkConfig
   * @param sessionFilesDir
   * @param detached Set to true if no actor system or RPC communication with the cluster should be
   *     established
   * @throws IOException
   * @throws YarnException
   */
  public FlinkYarnCluster(
      final YarnClient yarnClient,
      final ApplicationId appId,
      Configuration hadoopConfig,
      org.apache.flink.configuration.Configuration flinkConfig,
      Path sessionFilesDir,
      boolean detached)
      throws IOException, YarnException {
    this.akkaDuration = AkkaUtils.getTimeout(flinkConfig);
    this.akkaTimeout = Timeout.durationToTimeout(akkaDuration);
    this.yarnClient = yarnClient;
    this.hadoopConfig = hadoopConfig;
    this.sessionFilesDir = sessionFilesDir;
    this.applicationId = appId;
    this.detached = detached;
    this.flinkConfig = flinkConfig;
    this.appId = appId;

    // get one application report manually
    intialAppReport = yarnClient.getApplicationReport(appId);
    String jobManagerHost = intialAppReport.getHost();
    int jobManagerPort = intialAppReport.getRpcPort();
    this.jobManagerAddress = new InetSocketAddress(jobManagerHost, jobManagerPort);
  }
예제 #7
0
  public WebRuntimeMonitor(
      Configuration config, LeaderRetrievalService leaderRetrievalService, ActorSystem actorSystem)
      throws IOException, InterruptedException {

    this.leaderRetrievalService = checkNotNull(leaderRetrievalService);
    this.timeout = AkkaUtils.getTimeout(config);
    this.retriever =
        new JobManagerRetriever(this, actorSystem, AkkaUtils.getTimeout(config), timeout);

    final WebMonitorConfig cfg = new WebMonitorConfig(config);

    final int configuredPort = cfg.getWebFrontendPort();
    if (configuredPort < 0) {
      throw new IllegalArgumentException("Web frontend port is invalid: " + configuredPort);
    }

    final WebMonitorUtils.LogFileLocation logFiles = WebMonitorUtils.LogFileLocation.find(config);

    // create an empty directory in temp for the web server
    String rootDirFileName = "flink-web-" + UUID.randomUUID();
    webRootDir = new File(getBaseDir(config), rootDirFileName);
    LOG.info("Using directory {} for the web interface files", webRootDir);

    final boolean webSubmitAllow = cfg.isProgramSubmitEnabled();
    if (webSubmitAllow) {
      // create storage for uploads
      String uploadDirName = "flink-web-upload-" + UUID.randomUUID();
      this.uploadDir = new File(getBaseDir(config), uploadDirName);
      if (!uploadDir.mkdir() || !uploadDir.canWrite()) {
        throw new IOException("Unable to create temporary directory to support jar uploads.");
      }
      LOG.info("Using directory {} for web frontend JAR file uploads", uploadDir);
    } else {
      this.uploadDir = null;
    }

    ExecutionGraphHolder currentGraphs = new ExecutionGraphHolder();

    // - Back pressure stats ----------------------------------------------

    stackTraceSamples = new StackTraceSampleCoordinator(actorSystem, 60000);

    // Back pressure stats tracker config
    int cleanUpInterval =
        config.getInteger(
            ConfigConstants.JOB_MANAGER_WEB_BACK_PRESSURE_CLEAN_UP_INTERVAL,
            ConfigConstants.DEFAULT_JOB_MANAGER_WEB_BACK_PRESSURE_CLEAN_UP_INTERVAL);

    int refreshInterval =
        config.getInteger(
            ConfigConstants.JOB_MANAGER_WEB_BACK_PRESSURE_REFRESH_INTERVAL,
            ConfigConstants.DEFAULT_JOB_MANAGER_WEB_BACK_PRESSURE_REFRESH_INTERVAL);

    int numSamples =
        config.getInteger(
            ConfigConstants.JOB_MANAGER_WEB_BACK_PRESSURE_NUM_SAMPLES,
            ConfigConstants.DEFAULT_JOB_MANAGER_WEB_BACK_PRESSURE_NUM_SAMPLES);

    int delay =
        config.getInteger(
            ConfigConstants.JOB_MANAGER_WEB_BACK_PRESSURE_DELAY,
            ConfigConstants.DEFAULT_JOB_MANAGER_WEB_BACK_PRESSURE_DELAY);

    FiniteDuration delayBetweenSamples = new FiniteDuration(delay, TimeUnit.MILLISECONDS);

    backPressureStatsTracker =
        new BackPressureStatsTracker(
            stackTraceSamples, cleanUpInterval, numSamples, delayBetweenSamples);

    // --------------------------------------------------------------------

    executorService = new ForkJoinPool();

    ExecutionContextExecutor context = ExecutionContext$.MODULE$.fromExecutor(executorService);

    router =
        new Router()
            // config how to interact with this web server
            .GET("/config", handler(new DashboardConfigHandler(cfg.getRefreshInterval())))

            // the overview - how many task managers, slots, free slots, ...
            .GET("/overview", handler(new ClusterOverviewHandler(DEFAULT_REQUEST_TIMEOUT)))

            // job manager configuration
            .GET("/jobmanager/config", handler(new JobManagerConfigHandler(config)))

            // overview over jobs
            .GET(
                "/joboverview",
                handler(new CurrentJobsOverviewHandler(DEFAULT_REQUEST_TIMEOUT, true, true)))
            .GET(
                "/joboverview/running",
                handler(new CurrentJobsOverviewHandler(DEFAULT_REQUEST_TIMEOUT, true, false)))
            .GET(
                "/joboverview/completed",
                handler(new CurrentJobsOverviewHandler(DEFAULT_REQUEST_TIMEOUT, false, true)))
            .GET("/jobs", handler(new CurrentJobIdsHandler(DEFAULT_REQUEST_TIMEOUT)))
            .GET("/jobs/:jobid", handler(new JobDetailsHandler(currentGraphs)))
            .GET("/jobs/:jobid/vertices", handler(new JobDetailsHandler(currentGraphs)))
            .GET(
                "/jobs/:jobid/vertices/:vertexid",
                handler(new JobVertexDetailsHandler(currentGraphs)))
            .GET(
                "/jobs/:jobid/vertices/:vertexid/subtasktimes",
                handler(new SubtasksTimesHandler(currentGraphs)))
            .GET(
                "/jobs/:jobid/vertices/:vertexid/taskmanagers",
                handler(new JobVertexTaskManagersHandler(currentGraphs)))
            .GET(
                "/jobs/:jobid/vertices/:vertexid/accumulators",
                handler(new JobVertexAccumulatorsHandler(currentGraphs)))
            .GET(
                "/jobs/:jobid/vertices/:vertexid/checkpoints",
                handler(new JobVertexCheckpointsHandler(currentGraphs)))
            .GET(
                "/jobs/:jobid/vertices/:vertexid/backpressure",
                handler(
                    new JobVertexBackPressureHandler(
                        currentGraphs, backPressureStatsTracker, refreshInterval)))
            .GET(
                "/jobs/:jobid/vertices/:vertexid/subtasks/accumulators",
                handler(new SubtasksAllAccumulatorsHandler(currentGraphs)))
            .GET(
                "/jobs/:jobid/vertices/:vertexid/subtasks/:subtasknum",
                handler(new SubtaskCurrentAttemptDetailsHandler(currentGraphs)))
            .GET(
                "/jobs/:jobid/vertices/:vertexid/subtasks/:subtasknum/attempts/:attempt",
                handler(new SubtaskExecutionAttemptDetailsHandler(currentGraphs)))
            .GET(
                "/jobs/:jobid/vertices/:vertexid/subtasks/:subtasknum/attempts/:attempt/accumulators",
                handler(new SubtaskExecutionAttemptAccumulatorsHandler(currentGraphs)))
            .GET("/jobs/:jobid/plan", handler(new JobPlanHandler(currentGraphs)))
            .GET("/jobs/:jobid/config", handler(new JobConfigHandler(currentGraphs)))
            .GET("/jobs/:jobid/exceptions", handler(new JobExceptionsHandler(currentGraphs)))
            .GET("/jobs/:jobid/accumulators", handler(new JobAccumulatorsHandler(currentGraphs)))
            .GET("/jobs/:jobid/checkpoints", handler(new JobCheckpointsHandler(currentGraphs)))
            .GET("/taskmanagers", handler(new TaskManagersHandler(DEFAULT_REQUEST_TIMEOUT)))
            .GET(
                "/taskmanagers/:" + TaskManagersHandler.TASK_MANAGER_ID_KEY + "/metrics",
                handler(new TaskManagersHandler(DEFAULT_REQUEST_TIMEOUT)))
            .GET(
                "/taskmanagers/:" + TaskManagersHandler.TASK_MANAGER_ID_KEY + "/log",
                new TaskManagerLogHandler(
                    retriever,
                    context,
                    jobManagerAddressPromise.future(),
                    timeout,
                    TaskManagerLogHandler.FileMode.LOG,
                    config))
            .GET(
                "/taskmanagers/:" + TaskManagersHandler.TASK_MANAGER_ID_KEY + "/stdout",
                new TaskManagerLogHandler(
                    retriever,
                    context,
                    jobManagerAddressPromise.future(),
                    timeout,
                    TaskManagerLogHandler.FileMode.STDOUT,
                    config))

            // log and stdout
            .GET(
                "/jobmanager/log",
                logFiles.logFile == null
                    ? new ConstantTextHandler("(log file unavailable)")
                    : new StaticFileServerHandler(
                        retriever, jobManagerAddressPromise.future(), timeout, logFiles.logFile))
            .GET(
                "/jobmanager/stdout",
                logFiles.stdOutFile == null
                    ? new ConstantTextHandler("(stdout file unavailable)")
                    : new StaticFileServerHandler(
                        retriever, jobManagerAddressPromise.future(), timeout, logFiles.stdOutFile))

            // Cancel a job via GET (for proper integration with YARN this has to be performed via
            // GET)
            .GET("/jobs/:jobid/yarn-cancel", handler(new JobCancellationHandler()))

            // DELETE is the preferred way of canceling a job (Rest-conform)
            .DELETE("/jobs/:jobid/cancel", handler(new JobCancellationHandler()))

            // stop a job via GET (for proper integration with YARN this has to be performed via
            // GET)
            .GET("/jobs/:jobid/yarn-stop", handler(new JobStoppingHandler()))

            // DELETE is the preferred way of stopping a job (Rest-conform)
            .DELETE("/jobs/:jobid/stop", handler(new JobStoppingHandler()));

    if (webSubmitAllow) {
      router
          // fetch the list of uploaded jars.
          .GET("/jars", handler(new JarListHandler(uploadDir)))

          // get plan for an uploaded jar
          .GET("/jars/:jarid/plan", handler(new JarPlanHandler(uploadDir)))

          // run a jar
          .POST("/jars/:jarid/run", handler(new JarRunHandler(uploadDir, timeout)))

          // upload a jar
          .POST("/jars/upload", handler(new JarUploadHandler(uploadDir)))

          // delete an uploaded jar from submission interface
          .DELETE("/jars/:jarid", handler(new JarDeleteHandler(uploadDir)));
    } else {
      router
          // send an Access Denied message (sort of)
          // Every other GET request will go to the File Server, which will not provide
          // access to the jar directory anyway, because it doesn't exist in webRootDir.
          .GET("/jars", handler(new JarAccessDeniedHandler()));
    }

    // this handler serves all the static contents
    router.GET(
        "/:*",
        new StaticFileServerHandler(
            retriever, jobManagerAddressPromise.future(), timeout, webRootDir));

    // add shutdown hook for deleting the directories and remaining temp files on shutdown
    try {
      Runtime.getRuntime()
          .addShutdownHook(
              new Thread() {
                @Override
                public void run() {
                  cleanup();
                }
              });
    } catch (IllegalStateException e) {
      // race, JVM is in shutdown already, we can safely ignore this
      LOG.debug("Unable to add shutdown hook, shutdown already in progress", e);
    } catch (Throwable t) {
      // these errors usually happen when the shutdown is already in progress
      LOG.warn("Error while adding shutdown hook", t);
    }

    ChannelInitializer<SocketChannel> initializer =
        new ChannelInitializer<SocketChannel>() {

          @Override
          protected void initChannel(SocketChannel ch) {
            Handler handler = new Handler(router);

            ch.pipeline()
                .addLast(new HttpServerCodec())
                .addLast(new HttpRequestHandler(uploadDir))
                .addLast(handler.name(), handler)
                .addLast(new PipelineErrorHandler(LOG));
          }
        };

    NioEventLoopGroup bossGroup = new NioEventLoopGroup(1);
    NioEventLoopGroup workerGroup = new NioEventLoopGroup();

    this.bootstrap = new ServerBootstrap();
    this.bootstrap
        .group(bossGroup, workerGroup)
        .channel(NioServerSocketChannel.class)
        .childHandler(initializer);

    Channel ch = this.bootstrap.bind(configuredPort).sync().channel();
    this.serverChannel = ch;

    InetSocketAddress bindAddress = (InetSocketAddress) ch.localAddress();
    String address = bindAddress.getAddress().getHostAddress();
    int port = bindAddress.getPort();

    LOG.info("Web frontend listening at " + address + ':' + port);
  }
예제 #8
0
  public WebRuntimeMonitor(
      Configuration config, LeaderRetrievalService leaderRetrievalService, ActorSystem actorSystem)
      throws IOException, InterruptedException {
    this.leaderRetrievalService = checkNotNull(leaderRetrievalService);

    final WebMonitorConfig cfg = new WebMonitorConfig(config);

    // create an empty directory in temp for the web server
    String fileName = String.format("flink-web-%s", UUID.randomUUID().toString());
    webRootDir = new File(System.getProperty("java.io.tmpdir"), fileName);
    LOG.info("Using directory {} for the web interface files", webRootDir);

    // figure out where our logs are
    final String flinkRoot = config.getString(ConfigConstants.FLINK_BASE_DIR_PATH_KEY, null);
    final String defaultLogDirectory = flinkRoot + "/log";
    final String logDirectories =
        config.getString(ConfigConstants.JOB_MANAGER_WEB_LOG_PATH_KEY, defaultLogDirectory);

    // find out which directory holds the path for log and stdout
    final ArrayList<String> logPaths = new ArrayList<>();
    final ArrayList<String> outPaths = new ArrayList<>();

    // yarn allows for multiple log directories. Search in all.
    for (String paths : logDirectories.split(",")) {
      File dir = new File(paths);
      if (dir.exists() && dir.isDirectory() && dir.canRead()) {
        if (dir.listFiles(LOG_FILE_PATTERN).length == 1) {
          logPaths.add(paths);
        }
        if (dir.listFiles(STDOUT_FILE_PATTERN).length == 1) {
          outPaths.add(paths);
        }
      }
    }

    // we don't want any ambiguities. There must be only one log and out file.
    if (logPaths.size() != 1 || outPaths.size() != 1) {
      throw new IllegalConfigurationException(
          "The path to the log and out files (" + logDirectories + ") is not valid.");
    }

    final File logDir = new File(logPaths.get(0));
    final File outDir = new File(outPaths.get(0));
    LOG.info("Serving job manager logs from {}", logDir.getAbsolutePath());
    LOG.info("Serving job manager stdout from {}", outDir.getAbsolutePath());

    // port configuration
    this.configuredPort = cfg.getWebFrontendPort();
    if (this.configuredPort < 0) {
      throw new IllegalArgumentException("Web frontend port is invalid: " + this.configuredPort);
    }

    timeout = AkkaUtils.getTimeout(config);
    FiniteDuration lookupTimeout = AkkaUtils.getTimeout(config);

    retriever = new JobManagerRetriever(this, actorSystem, lookupTimeout, timeout);

    ExecutionGraphHolder currentGraphs = new ExecutionGraphHolder();

    router =
        new Router()
            // config how to interact with this web server
            .GET("/config", handler(new DashboardConfigHandler(cfg.getRefreshInterval())))

            // the overview - how many task managers, slots, free slots, ...
            .GET("/overview", handler(new ClusterOverviewHandler(DEFAULT_REQUEST_TIMEOUT)))

            // job manager configuration, log and stdout
            .GET("/jobmanager/config", handler(new JobManagerConfigHandler(config)))

            // overview over jobs
            .GET(
                "/joboverview",
                handler(new CurrentJobsOverviewHandler(DEFAULT_REQUEST_TIMEOUT, true, true)))
            .GET(
                "/joboverview/running",
                handler(new CurrentJobsOverviewHandler(DEFAULT_REQUEST_TIMEOUT, true, false)))
            .GET(
                "/joboverview/completed",
                handler(new CurrentJobsOverviewHandler(DEFAULT_REQUEST_TIMEOUT, false, true)))
            .GET("/jobs", handler(new CurrentJobIdsHandler(retriever, DEFAULT_REQUEST_TIMEOUT)))
            .GET("/jobs/:jobid", handler(new JobDetailsHandler(currentGraphs)))
            .GET("/jobs/:jobid/vertices", handler(new JobDetailsHandler(currentGraphs)))
            .GET(
                "/jobs/:jobid/vertices/:vertexid",
                handler(new JobVertexDetailsHandler(currentGraphs)))
            .GET(
                "/jobs/:jobid/vertices/:vertexid/subtasktimes",
                handler(new SubtasksTimesHandler(currentGraphs)))
            .GET(
                "/jobs/:jobid/vertices/:vertexid/accumulators",
                handler(new JobVertexAccumulatorsHandler(currentGraphs)))
            .GET(
                "/jobs/:jobid/vertices/:vertexid/subtasks/accumulators",
                handler(new SubtasksAllAccumulatorsHandler(currentGraphs)))
            .GET(
                "/jobs/:jobid/vertices/:vertexid/subtasks/:subtasknum",
                handler(new SubtaskCurrentAttemptDetailsHandler(currentGraphs)))
            .GET(
                "/jobs/:jobid/vertices/:vertexid/subtasks/:subtasknum/attempts/:attempt",
                handler(new SubtaskExecutionAttemptDetailsHandler(currentGraphs)))
            .GET(
                "/jobs/:jobid/vertices/:vertexid/subtasks/:subtasknum/attempts/:attempt/accumulators",
                handler(new SubtaskExecutionAttemptAccumulatorsHandler(currentGraphs)))
            .GET("/jobs/:jobid/plan", handler(new JobPlanHandler(currentGraphs)))
            .GET("/jobs/:jobid/config", handler(new JobConfigHandler(currentGraphs)))
            .GET("/jobs/:jobid/exceptions", handler(new JobExceptionsHandler(currentGraphs)))
            .GET("/jobs/:jobid/accumulators", handler(new JobAccumulatorsHandler(currentGraphs)))
            .GET("/taskmanagers", handler(new TaskManagersHandler(DEFAULT_REQUEST_TIMEOUT)))
            .GET(
                "/taskmanagers/:" + TaskManagersHandler.TASK_MANAGER_ID_KEY,
                handler(new TaskManagersHandler(DEFAULT_REQUEST_TIMEOUT)))
            .GET(
                "/jobmanager/log",
                new StaticFileServerHandler(
                    retriever, jobManagerAddressPromise.future(), timeout, logDir))
            .GET(
                "/jobmanager/stdout",
                new StaticFileServerHandler(
                    retriever, jobManagerAddressPromise.future(), timeout, outDir))
            // this handler serves all the static contents
            .GET(
                "/:*",
                new StaticFileServerHandler(
                    retriever, jobManagerAddressPromise.future(), timeout, webRootDir));

    synchronized (startupShutdownLock) {

      // add shutdown hook for deleting the directory
      try {
        Runtime.getRuntime()
            .addShutdownHook(
                new Thread() {
                  @Override
                  public void run() {
                    shutdown();
                  }
                });
      } catch (IllegalStateException e) {
        // race, JVM is in shutdown already, we can safely ignore this
        LOG.debug("Unable to add shutdown hook, shutdown already in progress", e);
      } catch (Throwable t) {
        // these errors usually happen when the shutdown is already in progress
        LOG.warn("Error while adding shutdown hook", t);
      }

      ChannelInitializer<SocketChannel> initializer =
          new ChannelInitializer<SocketChannel>() {

            @Override
            protected void initChannel(SocketChannel ch) {
              Handler handler = new Handler(router);

              ch.pipeline()
                  .addLast(new HttpServerCodec())
                  .addLast(new HttpObjectAggregator(65536))
                  .addLast(new ChunkedWriteHandler())
                  .addLast(handler.name(), handler);
            }
          };

      NioEventLoopGroup bossGroup = new NioEventLoopGroup(1);
      NioEventLoopGroup workerGroup = new NioEventLoopGroup();

      this.bootstrap = new ServerBootstrap();
      this.bootstrap
          .group(bossGroup, workerGroup)
          .channel(NioServerSocketChannel.class)
          .childHandler(initializer);

      Channel ch = this.bootstrap.bind(configuredPort).sync().channel();
      this.serverChannel = ch;

      InetSocketAddress bindAddress = (InetSocketAddress) ch.localAddress();
      String address = bindAddress.getAddress().getHostAddress();
      int port = bindAddress.getPort();

      LOG.info("Web frontend listening at " + address + ':' + port);
    }
  }
예제 #9
0
  public WebRuntimeMonitor(
      Configuration config, LeaderRetrievalService leaderRetrievalService, ActorSystem actorSystem)
      throws IOException {
    this.leaderRetrievalService = checkNotNull(leaderRetrievalService);

    final WebMonitorConfig cfg = new WebMonitorConfig(config);

    // create an empty directory in temp for the web server
    String fileName = String.format("flink-web-%s", UUID.randomUUID().toString());
    webRootDir = new File(System.getProperty("java.io.tmpdir"), fileName);
    LOG.info("Using directory {} for the web interface files", webRootDir);

    // port configuration
    this.configuredPort = cfg.getWebFrontendPort();
    if (this.configuredPort < 0) {
      throw new IllegalArgumentException("Web frontend port is invalid: " + this.configuredPort);
    }

    FiniteDuration timeout = AkkaUtils.getTimeout(config);
    FiniteDuration lookupTimeout = AkkaUtils.getTimeout(config);

    retriever = new JobManagerArchiveRetriever(this, actorSystem, lookupTimeout, timeout);

    ExecutionGraphHolder currentGraphs = new ExecutionGraphHolder(retriever);

    router =
        new Router()
            // config how to interact with this web server
            .GET("/config", handler(new DashboardConfigHandler(cfg.getRefreshInterval())))

            // the overview - how many task managers, slots, free slots, ...
            .GET(
                "/overview",
                handler(new ClusterOverviewHandler(retriever, DEFAULT_REQUEST_TIMEOUT)))

            // job manager configuration
            .GET("/jobmanager/config", handler(new JobManagerConfigHandler(config)))

            // overview over jobs
            .GET(
                "/joboverview",
                handler(
                    new CurrentJobsOverviewHandler(retriever, DEFAULT_REQUEST_TIMEOUT, true, true)))
            .GET(
                "/joboverview/running",
                handler(
                    new CurrentJobsOverviewHandler(
                        retriever, DEFAULT_REQUEST_TIMEOUT, true, false)))
            .GET(
                "/joboverview/completed",
                handler(
                    new CurrentJobsOverviewHandler(
                        retriever, DEFAULT_REQUEST_TIMEOUT, false, true)))
            .GET("/jobs", handler(new CurrentJobIdsHandler(retriever, DEFAULT_REQUEST_TIMEOUT)))
            .GET("/jobs/:jobid", handler(new JobDetailsHandler(currentGraphs)))
            .GET("/jobs/:jobid/vertices", handler(new JobDetailsHandler(currentGraphs)))
            .GET(
                "/jobs/:jobid/vertices/:vertexid",
                handler(new JobVertexDetailsHandler(currentGraphs)))
            .GET(
                "/jobs/:jobid/vertices/:vertexid/subtasktimes",
                handler(new SubtasksTimesHandler(currentGraphs)))
            .GET(
                "/jobs/:jobid/vertices/:vertexid/accumulators",
                handler(new JobVertexAccumulatorsHandler(currentGraphs)))
            .GET(
                "/jobs/:jobid/vertices/:vertexid/subtasks/accumulators",
                handler(new SubtasksAllAccumulatorsHandler(currentGraphs)))
            .GET(
                "/jobs/:jobid/vertices/:vertexid/subtasks/:subtasknum",
                handler(new SubtaskCurrentAttemptDetailsHandler(currentGraphs)))
            .GET(
                "/jobs/:jobid/vertices/:vertexid/subtasks/:subtasknum/attempts/:attempt",
                handler(new SubtaskExecutionAttemptDetailsHandler(currentGraphs)))
            .GET(
                "/jobs/:jobid/vertices/:vertexid/subtasks/:subtasknum/attempts/:attempt/accumulators",
                handler(new SubtaskExecutionAttemptAccumulatorsHandler(currentGraphs)))
            .GET("/jobs/:jobid/plan", handler(new JobPlanHandler(currentGraphs)))
            .GET("/jobs/:jobid/config", handler(new JobConfigHandler(currentGraphs)))
            .GET("/jobs/:jobid/exceptions", handler(new JobExceptionsHandler(currentGraphs)))
            .GET("/jobs/:jobid/accumulators", handler(new JobAccumulatorsHandler(currentGraphs)))
            .GET(
                "/taskmanagers",
                handler(new TaskManagersHandler(retriever, DEFAULT_REQUEST_TIMEOUT)))
            .GET(
                "/taskmanagers/:" + TaskManagersHandler.TASK_MANAGER_ID_KEY,
                handler(new TaskManagersHandler(retriever, DEFAULT_REQUEST_TIMEOUT)))

            // this handler serves all the static contents
            .GET("/:*", new StaticFileServerHandler(webRootDir));
  }
  @Test
  public void testBuildDeploymentDescriptor() {
    try {
      final JobID jobId = new JobID();

      final JobVertexID jid1 = new JobVertexID();
      final JobVertexID jid2 = new JobVertexID();
      final JobVertexID jid3 = new JobVertexID();
      final JobVertexID jid4 = new JobVertexID();

      JobVertex v1 = new JobVertex("v1", jid1);
      JobVertex v2 = new JobVertex("v2", jid2);
      JobVertex v3 = new JobVertex("v3", jid3);
      JobVertex v4 = new JobVertex("v4", jid4);

      v1.setParallelism(10);
      v2.setParallelism(10);
      v3.setParallelism(10);
      v4.setParallelism(10);

      v1.setInvokableClass(BatchTask.class);
      v2.setInvokableClass(BatchTask.class);
      v3.setInvokableClass(BatchTask.class);
      v4.setInvokableClass(BatchTask.class);

      v2.connectNewDataSetAsInput(v1, DistributionPattern.ALL_TO_ALL);
      v3.connectNewDataSetAsInput(v2, DistributionPattern.ALL_TO_ALL);
      v4.connectNewDataSetAsInput(v2, DistributionPattern.ALL_TO_ALL);

      ExecutionGraph eg =
          new ExecutionGraph(
              TestingUtils.defaultExecutionContext(),
              jobId,
              "some job",
              new Configuration(),
              new SerializedValue<>(new ExecutionConfig()),
              AkkaUtils.getDefaultTimeout(),
              new NoRestartStrategy());

      List<JobVertex> ordered = Arrays.asList(v1, v2, v3, v4);

      eg.attachJobGraph(ordered);

      ExecutionJobVertex ejv = eg.getAllVertices().get(jid2);
      ExecutionVertex vertex = ejv.getTaskVertices()[3];

      ExecutionGraphTestUtils.SimpleActorGateway instanceGateway =
          new ExecutionGraphTestUtils.SimpleActorGateway(TestingUtils.directExecutionContext());

      final Instance instance = getInstance(instanceGateway);

      final SimpleSlot slot = instance.allocateSimpleSlot(jobId);

      assertEquals(ExecutionState.CREATED, vertex.getExecutionState());

      vertex.deployToSlot(slot);

      assertEquals(ExecutionState.DEPLOYING, vertex.getExecutionState());

      TaskDeploymentDescriptor descr = instanceGateway.lastTDD;
      assertNotNull(descr);

      assertEquals(jobId, descr.getJobID());
      assertEquals(jid2, descr.getVertexID());
      assertEquals(3, descr.getIndexInSubtaskGroup());
      assertEquals(10, descr.getNumberOfSubtasks());
      assertEquals(BatchTask.class.getName(), descr.getInvokableClassName());
      assertEquals("v2", descr.getTaskName());

      List<ResultPartitionDeploymentDescriptor> producedPartitions = descr.getProducedPartitions();
      List<InputGateDeploymentDescriptor> consumedPartitions = descr.getInputGates();

      assertEquals(2, producedPartitions.size());
      assertEquals(1, consumedPartitions.size());

      assertEquals(10, producedPartitions.get(0).getNumberOfSubpartitions());
      assertEquals(10, producedPartitions.get(1).getNumberOfSubpartitions());
      assertEquals(10, consumedPartitions.get(0).getInputChannelDeploymentDescriptors().length);
    } catch (Exception e) {
      e.printStackTrace();
      fail(e.getMessage());
    }
  }
  @Test
  /**
   * Tests that a blocking batch job fails if there are not enough resources left to schedule the
   * succeeding tasks. This test case is related to [FLINK-4296] where finished producing tasks
   * swallow the fail exception when scheduling a consumer task.
   */
  public void testNoResourceAvailableFailure() throws Exception {
    final JobID jobId = new JobID();
    JobVertex v1 = new JobVertex("source");
    JobVertex v2 = new JobVertex("sink");

    int dop1 = 1;
    int dop2 = 1;

    v1.setParallelism(dop1);
    v2.setParallelism(dop2);

    v1.setInvokableClass(BatchTask.class);
    v2.setInvokableClass(BatchTask.class);

    v2.connectNewDataSetAsInput(
        v1, DistributionPattern.POINTWISE, ResultPartitionType.BLOCKING, false);

    // execution graph that executes actions synchronously
    ExecutionGraph eg =
        new ExecutionGraph(
            TestingUtils.directExecutionContext(),
            jobId,
            "failing test job",
            new Configuration(),
            new SerializedValue<>(new ExecutionConfig()),
            AkkaUtils.getDefaultTimeout(),
            new NoRestartStrategy());

    eg.setQueuedSchedulingAllowed(false);

    List<JobVertex> ordered = Arrays.asList(v1, v2);
    eg.attachJobGraph(ordered);

    Scheduler scheduler = new Scheduler(TestingUtils.directExecutionContext());
    for (int i = 0; i < dop1; i++) {
      scheduler.newInstanceAvailable(
          ExecutionGraphTestUtils.getInstance(
              new ExecutionGraphTestUtils.SimpleActorGateway(
                  TestingUtils.directExecutionContext())));
    }
    assertEquals(dop1, scheduler.getNumberOfAvailableSlots());

    // schedule, this triggers mock deployment
    eg.scheduleForExecution(scheduler);

    ExecutionAttemptID attemptID =
        eg.getJobVertex(v1.getID())
            .getTaskVertices()[0]
            .getCurrentExecutionAttempt()
            .getAttemptId();
    eg.updateState(new TaskExecutionState(jobId, attemptID, ExecutionState.RUNNING));
    eg.updateState(
        new TaskExecutionState(
            jobId,
            attemptID,
            ExecutionState.FINISHED,
            null,
            new AccumulatorSnapshot(
                jobId,
                attemptID,
                new HashMap<AccumulatorRegistry.Metric, Accumulator<?, ?>>(),
                new HashMap<String, Accumulator<?, ?>>())));

    assertEquals(JobStatus.FAILED, eg.getState());
  }
예제 #12
0
  /**
   * Connect the FlinkYarnCluster to the ApplicationMaster.
   *
   * <p>Detached YARN sessions don't need to connect to the ApplicationMaster. Detached per job YARN
   * sessions need to connect until the required number of TaskManagers have been started.
   *
   * @throws IOException
   */
  public void connectToCluster() throws IOException {
    if (isConnected) {
      throw new IllegalStateException("Can not connect to the cluster again");
    }

    // start actor system
    LOG.info("Start actor system.");
    InetAddress ownHostname =
        NetUtils.resolveAddress(
            jobManagerAddress); // find name of own public interface, able to connect to the JM
    actorSystem =
        AkkaUtils.createActorSystem(
            flinkConfig,
            new Some(new Tuple2<String, Integer>(ownHostname.getCanonicalHostName(), 0)));

    // start application client
    LOG.info("Start application client.");

    applicationClient =
        actorSystem.actorOf(
            Props.create(ApplicationClient.class, flinkConfig), "applicationClient");

    // instruct ApplicationClient to start a periodical status polling
    applicationClient.tell(
        new Messages.LocalRegisterClient(this.jobManagerAddress), applicationClient);

    actorRunner =
        new Thread(
            new Runnable() {
              @Override
              public void run() {
                // blocks until ApplicationMaster has been stopped
                actorSystem.awaitTermination();

                // get final application report
                try {
                  ApplicationReport appReport = yarnClient.getApplicationReport(appId);

                  LOG.info(
                      "Application "
                          + appId
                          + " finished with state "
                          + appReport.getYarnApplicationState()
                          + " and final state "
                          + appReport.getFinalApplicationStatus()
                          + " at "
                          + appReport.getFinishTime());

                  if (appReport.getYarnApplicationState() == YarnApplicationState.FAILED
                      || appReport.getYarnApplicationState() == YarnApplicationState.KILLED) {
                    LOG.warn("Application failed. Diagnostics " + appReport.getDiagnostics());
                    LOG.warn(
                        "If log aggregation is activated in the Hadoop cluster, we recommend to retrieve "
                            + "the full application log using this command:\n"
                            + "\tyarn logs -applicationId "
                            + appReport.getApplicationId()
                            + "\n"
                            + "(It sometimes takes a few seconds until the logs are aggregated)");
                  }
                } catch (Exception e) {
                  LOG.warn("Error while getting final application report", e);
                }
              }
            });
    actorRunner.setDaemon(true);
    actorRunner.start();

    pollingRunner = new PollingThread(yarnClient, appId);
    pollingRunner.setDaemon(true);
    pollingRunner.start();

    Runtime.getRuntime().addShutdownHook(clientShutdownHook);

    isConnected = true;
  }
  @Test
  public void testJobManagerProcessFailure() throws Exception {
    // Config
    final int numberOfJobManagers = 2;
    final int numberOfTaskManagers = 2;
    final int numberOfSlotsPerTaskManager = 2;

    assertEquals(PARALLELISM, numberOfTaskManagers * numberOfSlotsPerTaskManager);

    // Setup
    // Test actor system
    ActorSystem testActorSystem;

    // Job managers
    final JobManagerProcess[] jmProcess = new JobManagerProcess[numberOfJobManagers];

    // Task managers
    final ActorSystem[] tmActorSystem = new ActorSystem[numberOfTaskManagers];

    // Leader election service
    LeaderRetrievalService leaderRetrievalService = null;

    // Coordination between the processes goes through a directory
    File coordinateTempDir = null;

    try {
      final Deadline deadline = TestTimeOut.fromNow();

      // Coordination directory
      coordinateTempDir = createTempDirectory();

      // Job Managers
      Configuration config =
          ZooKeeperTestUtils.createZooKeeperRecoveryModeConfig(
              ZooKeeper.getConnectString(), FileStateBackendBasePath.getPath());

      // Start first process
      jmProcess[0] = new JobManagerProcess(0, config);
      jmProcess[0].createAndStart();

      // Task manager configuration
      config.setInteger(ConfigConstants.TASK_MANAGER_MEMORY_SIZE_KEY, 4);
      config.setInteger(ConfigConstants.TASK_MANAGER_NETWORK_NUM_BUFFERS_KEY, 100);
      config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, 2);

      // Start the task manager process
      for (int i = 0; i < numberOfTaskManagers; i++) {
        tmActorSystem[i] = AkkaUtils.createActorSystem(AkkaUtils.getDefaultAkkaConfig());
        TaskManager.startTaskManagerComponentsAndActor(
            config,
            tmActorSystem[i],
            "localhost",
            Option.<String>empty(),
            Option.<LeaderRetrievalService>empty(),
            false,
            TaskManager.class);
      }

      // Test actor system
      testActorSystem = AkkaUtils.createActorSystem(AkkaUtils.getDefaultAkkaConfig());

      jmProcess[0].getActorRef(testActorSystem, deadline.timeLeft());

      // Leader listener
      TestingListener leaderListener = new TestingListener();
      leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
      leaderRetrievalService.start(leaderListener);

      // Initial submission
      leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());

      String leaderAddress = leaderListener.getAddress();
      UUID leaderId = leaderListener.getLeaderSessionID();

      // Get the leader ref
      ActorRef leaderRef =
          AkkaUtils.getActorRef(leaderAddress, testActorSystem, deadline.timeLeft());
      ActorGateway leaderGateway = new AkkaActorGateway(leaderRef, leaderId);

      // Wait for all task managers to connect to the leading job manager
      JobManagerActorTestUtils.waitForTaskManagers(
          numberOfTaskManagers, leaderGateway, deadline.timeLeft());

      final File coordinateDirClosure = coordinateTempDir;
      final Throwable[] errorRef = new Throwable[1];

      // we trigger program execution in a separate thread
      Thread programTrigger =
          new Thread("Program Trigger") {
            @Override
            public void run() {
              try {
                testJobManagerFailure(ZooKeeper.getConnectString(), coordinateDirClosure);
              } catch (Throwable t) {
                t.printStackTrace();
                errorRef[0] = t;
              }
            }
          };

      // start the test program
      programTrigger.start();

      // wait until all marker files are in place, indicating that all tasks have started
      AbstractTaskManagerProcessFailureRecoveryTest.waitForMarkerFiles(
          coordinateTempDir, READY_MARKER_FILE_PREFIX, PARALLELISM, deadline.timeLeft().toMillis());

      // Kill one of the job managers and trigger recovery
      jmProcess[0].destroy();

      jmProcess[1] = new JobManagerProcess(1, config);
      jmProcess[1].createAndStart();

      jmProcess[1].getActorRef(testActorSystem, deadline.timeLeft());

      // we create the marker file which signals the program functions tasks that they can complete
      AbstractTaskManagerProcessFailureRecoveryTest.touchFile(
          new File(coordinateTempDir, PROCEED_MARKER_FILE));

      programTrigger.join(deadline.timeLeft().toMillis());

      // We wait for the finish marker file. We don't wait for the program trigger, because
      // we submit in detached mode.
      AbstractTaskManagerProcessFailureRecoveryTest.waitForMarkerFiles(
          coordinateTempDir, FINISH_MARKER_FILE_PREFIX, 1, deadline.timeLeft().toMillis());

      // check that the program really finished
      assertFalse("The program did not finish in time", programTrigger.isAlive());

      // check whether the program encountered an error
      if (errorRef[0] != null) {
        Throwable error = errorRef[0];
        error.printStackTrace();
        fail(
            "The program encountered a "
                + error.getClass().getSimpleName()
                + " : "
                + error.getMessage());
      }
    } catch (Throwable t) {
      // Print early (in some situations the process logs get too big
      // for Travis and the root problem is not shown)
      t.printStackTrace();

      for (JobManagerProcess p : jmProcess) {
        if (p != null) {
          p.printProcessLog();
        }
      }

      throw t;
    } finally {
      for (int i = 0; i < numberOfTaskManagers; i++) {
        if (tmActorSystem[i] != null) {
          tmActorSystem[i].shutdown();
        }
      }

      if (leaderRetrievalService != null) {
        leaderRetrievalService.stop();
      }

      for (JobManagerProcess jmProces : jmProcess) {
        if (jmProces != null) {
          jmProces.destroy();
        }
      }

      // Delete coordination directory
      if (coordinateTempDir != null) {
        try {
          FileUtils.deleteDirectory(coordinateTempDir);
        } catch (Throwable ignored) {
        }
      }
    }
  }
 @BeforeClass
 public static void setup() {
   testActorSystem = AkkaUtils.createLocalActorSystem(new Configuration());
 }
예제 #15
0
  /*
   * Test setup:
   * - v1 is isolated, no slot sharing
   * - v2 and v3 (not connected) share slots
   * - v4 and v5 (connected) share slots
   */
  @Test
  public void testAssignSlotSharingGroup() {
    try {
      JobVertex v1 = new JobVertex("v1");
      JobVertex v2 = new JobVertex("v2");
      JobVertex v3 = new JobVertex("v3");
      JobVertex v4 = new JobVertex("v4");
      JobVertex v5 = new JobVertex("v5");

      v1.setParallelism(4);
      v2.setParallelism(5);
      v3.setParallelism(7);
      v4.setParallelism(1);
      v5.setParallelism(11);

      v2.connectNewDataSetAsInput(v1, DistributionPattern.POINTWISE);
      v5.connectNewDataSetAsInput(v4, DistributionPattern.POINTWISE);

      SlotSharingGroup jg1 = new SlotSharingGroup();
      v2.setSlotSharingGroup(jg1);
      v3.setSlotSharingGroup(jg1);

      SlotSharingGroup jg2 = new SlotSharingGroup();
      v4.setSlotSharingGroup(jg2);
      v5.setSlotSharingGroup(jg2);

      List<JobVertex> vertices = new ArrayList<JobVertex>(Arrays.asList(v1, v2, v3, v4, v5));

      ExecutionGraph eg =
          new ExecutionGraph(
              TestingUtils.defaultExecutionContext(),
              new JobID(),
              "test job",
              new Configuration(),
              ExecutionConfigTest.getSerializedConfig(),
              AkkaUtils.getDefaultTimeout(),
              new NoRestartStrategy());
      eg.attachJobGraph(vertices);

      // verify that the vertices are all in the same slot sharing group
      SlotSharingGroup group1 = null;
      SlotSharingGroup group2 = null;

      // verify that v1 tasks have no slot sharing group
      assertNull(eg.getJobVertex(v1.getID()).getSlotSharingGroup());

      // v2 and v3 are shared
      group1 = eg.getJobVertex(v2.getID()).getSlotSharingGroup();
      assertNotNull(group1);
      assertEquals(group1, eg.getJobVertex(v3.getID()).getSlotSharingGroup());

      assertEquals(2, group1.getJobVertexIds().size());
      assertTrue(group1.getJobVertexIds().contains(v2.getID()));
      assertTrue(group1.getJobVertexIds().contains(v3.getID()));

      // v4 and v5 are shared
      group2 = eg.getJobVertex(v4.getID()).getSlotSharingGroup();
      assertNotNull(group2);
      assertEquals(group2, eg.getJobVertex(v5.getID()).getSlotSharingGroup());

      assertEquals(2, group1.getJobVertexIds().size());
      assertTrue(group2.getJobVertexIds().contains(v4.getID()));
      assertTrue(group2.getJobVertexIds().contains(v5.getID()));
    } catch (Exception e) {
      e.printStackTrace();
      fail(e.getMessage());
    }
  }