/**
 * Starts a local JobManager actor system once before the test class runs and resolves
 * its leader gateway into the static {@code jmGateway} field.
 */
@BeforeClass
public static void setupJobManager() {
  Configuration config = new Configuration();

  // NOTE(review): getAvailablePort() then binding later is racy — another process may
  // grab the port in between. Presumably acceptable for tests; confirm.
  int jmPort = NetUtils.getAvailablePort();
  config.setString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, "localhost");
  config.setInteger(ConfigConstants.JOB_MANAGER_IPC_PORT_KEY, jmPort);

  scala.Option<Tuple2<String, Object>> listeningAddress =
      scala.Option.apply(new Tuple2<String, Object>("localhost", jmPort));
  jobManagerSystem = AkkaUtils.createActorSystem(config, listeningAddress);

  // starting the actors is the relevant side effect; the returned ref is not used further
  ActorRef jobManagerActorRef =
      JobManager.startJobManagerActors(
              config,
              jobManagerSystem,
              StreamingMode.BATCH_ONLY,
              JobManager.class,
              MemoryArchivist.class)
          ._1();

  try {
    LeaderRetrievalService leaderRetrievalService =
        LeaderRetrievalUtils.createLeaderRetrievalService(config);
    jmGateway =
        LeaderRetrievalUtils.retrieveLeaderGateway(
            leaderRetrievalService, jobManagerSystem, timeout);
  } catch (Exception e) {
    fail("Could not retrieve the JobManager gateway. " + e.getMessage());
  }
}
/**
 * Resolves the client ask-timeout from the global configuration.
 *
 * <p>If a timeout string was set on this instance, it overrides the globally
 * configured value before the duration is derived.
 *
 * @return the Akka client timeout as a finite duration
 */
private FiniteDuration getTimeout() {
  final Configuration globalConfig = GlobalConfiguration.getConfiguration();
  final String userTimeout = this.timeout;
  if (userTimeout != null) {
    globalConfig.setString(ConfigConstants.AKKA_ASK_TIMEOUT, userTimeout);
  }
  return AkkaUtils.getClientTimeout(globalConfig);
}
/**
 * Creates a throwaway local actor system and looks up the remote JobManager actor at the
 * host/port configured on this instance.
 *
 * @return the {@link ActorRef} of the JobManager
 * @throws IOException if the JobManager lookup fails
 * @throws RuntimeException if the local actor system cannot be started
 */
private ActorRef getJobManager() throws IOException {
  final Configuration configuration = GlobalConfiguration.getConfiguration();

  final ActorSystem system;
  try {
    // empty hostname + port 0: bind to any local interface on an ephemeral port
    final scala.Tuple2<String, Object> localEndpoint = new scala.Tuple2<String, Object>("", 0);
    system =
        AkkaUtils.createActorSystem(
            configuration, new Some<scala.Tuple2<String, Object>>(localEndpoint));
  } catch (final Exception e) {
    throw new RuntimeException("Could not start actor system to communicate with JobManager", e);
  }

  return JobManager.getJobManagerActorRef(
      new InetSocketAddress(this.jobManagerHost, this.jobManagerPort),
      system,
      AkkaUtils.getLookupTimeout(configuration));
}
/**
 * Creates a local actor system (shared by all tests in this class) with short ask and
 * death-watch timeouts so failure-detection tests complete quickly.
 */
@BeforeClass
public static void startActorSystem() {
  config = new Configuration();

  // short ask timeout
  config.setString(ConfigConstants.AKKA_ASK_TIMEOUT, "5 s");

  // aggressive death-watch settings
  config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_INTERVAL, "200 ms");
  config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_PAUSE, "2 s");
  config.setDouble(ConfigConstants.AKKA_WATCH_THRESHOLD, 2.0);

  actorSystem = AkkaUtils.createLocalActorSystem(config);
}
private Map<ExecutionAttemptID, Execution> setupExecution( JobVertex v1, int dop1, JobVertex v2, int dop2) throws Exception { final JobID jobId = new JobID(); v1.setParallelism(dop1); v2.setParallelism(dop2); v1.setInvokableClass(BatchTask.class); v2.setInvokableClass(BatchTask.class); // execution graph that executes actions synchronously ExecutionGraph eg = new ExecutionGraph( TestingUtils.directExecutionContext(), jobId, "some job", new Configuration(), new SerializedValue<>(new ExecutionConfig()), AkkaUtils.getDefaultTimeout(), new NoRestartStrategy()); eg.setQueuedSchedulingAllowed(false); List<JobVertex> ordered = Arrays.asList(v1, v2); eg.attachJobGraph(ordered); Scheduler scheduler = new Scheduler(TestingUtils.defaultExecutionContext()); for (int i = 0; i < dop1 + dop2; i++) { scheduler.newInstanceAvailable( ExecutionGraphTestUtils.getInstance( new ExecutionGraphTestUtils.SimpleActorGateway( TestingUtils.directExecutionContext()))); } assertEquals(dop1 + dop2, scheduler.getNumberOfAvailableSlots()); // schedule, this triggers mock deployment eg.scheduleForExecution(scheduler); Map<ExecutionAttemptID, Execution> executions = eg.getRegisteredExecutions(); assertEquals(dop1 + dop2, executions.size()); return executions; }
/** * Create a new Flink on YARN cluster. * * @param yarnClient * @param appId the YARN application ID * @param hadoopConfig * @param flinkConfig * @param sessionFilesDir * @param detached Set to true if no actor system or RPC communication with the cluster should be * established * @throws IOException * @throws YarnException */ public FlinkYarnCluster( final YarnClient yarnClient, final ApplicationId appId, Configuration hadoopConfig, org.apache.flink.configuration.Configuration flinkConfig, Path sessionFilesDir, boolean detached) throws IOException, YarnException { this.akkaDuration = AkkaUtils.getTimeout(flinkConfig); this.akkaTimeout = Timeout.durationToTimeout(akkaDuration); this.yarnClient = yarnClient; this.hadoopConfig = hadoopConfig; this.sessionFilesDir = sessionFilesDir; this.applicationId = appId; this.detached = detached; this.flinkConfig = flinkConfig; this.appId = appId; // get one application report manually intialAppReport = yarnClient.getApplicationReport(appId); String jobManagerHost = intialAppReport.getHost(); int jobManagerPort = intialAppReport.getRpcPort(); this.jobManagerAddress = new InetSocketAddress(jobManagerHost, jobManagerPort); }
/**
 * Creates the web runtime monitor: sets up the web root and (optionally) the jar-upload
 * directory, the back-pressure sampling machinery, all REST routes, and starts the embedded
 * Netty HTTP server on the configured port.
 *
 * @param config Flink configuration (port, refresh intervals, back-pressure settings, log paths)
 * @param leaderRetrievalService service used to track the leading JobManager; must not be null
 * @param actorSystem actor system used to communicate with the JobManager
 * @throws IOException if the upload directory cannot be created
 * @throws InterruptedException if the server bind is interrupted
 */
public WebRuntimeMonitor(
    Configuration config, LeaderRetrievalService leaderRetrievalService, ActorSystem actorSystem)
    throws IOException, InterruptedException {
  this.leaderRetrievalService = checkNotNull(leaderRetrievalService);
  this.timeout = AkkaUtils.getTimeout(config);
  // same config-derived duration is used as both lookup timeout and ask timeout
  this.retriever =
      new JobManagerRetriever(this, actorSystem, AkkaUtils.getTimeout(config), timeout);

  final WebMonitorConfig cfg = new WebMonitorConfig(config);

  final int configuredPort = cfg.getWebFrontendPort();
  if (configuredPort < 0) {
    throw new IllegalArgumentException("Web frontend port is invalid: " + configuredPort);
  }

  final WebMonitorUtils.LogFileLocation logFiles = WebMonitorUtils.LogFileLocation.find(config);

  // create an empty directory in temp for the web server
  String rootDirFileName = "flink-web-" + UUID.randomUUID();
  webRootDir = new File(getBaseDir(config), rootDirFileName);
  LOG.info("Using directory {} for the web interface files", webRootDir);

  final boolean webSubmitAllow = cfg.isProgramSubmitEnabled();
  if (webSubmitAllow) {
    // create storage for uploads
    String uploadDirName = "flink-web-upload-" + UUID.randomUUID();
    this.uploadDir = new File(getBaseDir(config), uploadDirName);
    if (!uploadDir.mkdir() || !uploadDir.canWrite()) {
      throw new IOException("Unable to create temporary directory to support jar uploads.");
    }
    LOG.info("Using directory {} for web frontend JAR file uploads", uploadDir);
  } else {
    this.uploadDir = null;
  }

  ExecutionGraphHolder currentGraphs = new ExecutionGraphHolder();

  // - Back pressure stats ----------------------------------------------

  // coordinator that triggers stack trace samples on the task managers
  stackTraceSamples = new StackTraceSampleCoordinator(actorSystem, 60000);

  // Back pressure stats tracker config
  int cleanUpInterval =
      config.getInteger(
          ConfigConstants.JOB_MANAGER_WEB_BACK_PRESSURE_CLEAN_UP_INTERVAL,
          ConfigConstants.DEFAULT_JOB_MANAGER_WEB_BACK_PRESSURE_CLEAN_UP_INTERVAL);

  int refreshInterval =
      config.getInteger(
          ConfigConstants.JOB_MANAGER_WEB_BACK_PRESSURE_REFRESH_INTERVAL,
          ConfigConstants.DEFAULT_JOB_MANAGER_WEB_BACK_PRESSURE_REFRESH_INTERVAL);

  int numSamples =
      config.getInteger(
          ConfigConstants.JOB_MANAGER_WEB_BACK_PRESSURE_NUM_SAMPLES,
          ConfigConstants.DEFAULT_JOB_MANAGER_WEB_BACK_PRESSURE_NUM_SAMPLES);

  int delay =
      config.getInteger(
          ConfigConstants.JOB_MANAGER_WEB_BACK_PRESSURE_DELAY,
          ConfigConstants.DEFAULT_JOB_MANAGER_WEB_BACK_PRESSURE_DELAY);

  FiniteDuration delayBetweenSamples = new FiniteDuration(delay, TimeUnit.MILLISECONDS);

  backPressureStatsTracker =
      new BackPressureStatsTracker(
          stackTraceSamples, cleanUpInterval, numSamples, delayBetweenSamples);

  // --------------------------------------------------------------------

  executorService = new ForkJoinPool();

  ExecutionContextExecutor context = ExecutionContext$.MODULE$.fromExecutor(executorService);

  router =
      new Router()
          // config how to interact with this web server
          .GET("/config", handler(new DashboardConfigHandler(cfg.getRefreshInterval())))

          // the overview - how many task managers, slots, free slots, ...
          .GET("/overview", handler(new ClusterOverviewHandler(DEFAULT_REQUEST_TIMEOUT)))

          // job manager configuration
          .GET("/jobmanager/config", handler(new JobManagerConfigHandler(config)))

          // overview over jobs
          .GET(
              "/joboverview",
              handler(new CurrentJobsOverviewHandler(DEFAULT_REQUEST_TIMEOUT, true, true)))
          .GET(
              "/joboverview/running",
              handler(new CurrentJobsOverviewHandler(DEFAULT_REQUEST_TIMEOUT, true, false)))
          .GET(
              "/joboverview/completed",
              handler(new CurrentJobsOverviewHandler(DEFAULT_REQUEST_TIMEOUT, false, true)))
          .GET("/jobs", handler(new CurrentJobIdsHandler(DEFAULT_REQUEST_TIMEOUT)))
          .GET("/jobs/:jobid", handler(new JobDetailsHandler(currentGraphs)))
          .GET("/jobs/:jobid/vertices", handler(new JobDetailsHandler(currentGraphs)))
          .GET(
              "/jobs/:jobid/vertices/:vertexid",
              handler(new JobVertexDetailsHandler(currentGraphs)))
          .GET(
              "/jobs/:jobid/vertices/:vertexid/subtasktimes",
              handler(new SubtasksTimesHandler(currentGraphs)))
          .GET(
              "/jobs/:jobid/vertices/:vertexid/taskmanagers",
              handler(new JobVertexTaskManagersHandler(currentGraphs)))
          .GET(
              "/jobs/:jobid/vertices/:vertexid/accumulators",
              handler(new JobVertexAccumulatorsHandler(currentGraphs)))
          .GET(
              "/jobs/:jobid/vertices/:vertexid/checkpoints",
              handler(new JobVertexCheckpointsHandler(currentGraphs)))
          .GET(
              "/jobs/:jobid/vertices/:vertexid/backpressure",
              handler(
                  new JobVertexBackPressureHandler(
                      currentGraphs, backPressureStatsTracker, refreshInterval)))
          .GET(
              "/jobs/:jobid/vertices/:vertexid/subtasks/accumulators",
              handler(new SubtasksAllAccumulatorsHandler(currentGraphs)))
          .GET(
              "/jobs/:jobid/vertices/:vertexid/subtasks/:subtasknum",
              handler(new SubtaskCurrentAttemptDetailsHandler(currentGraphs)))
          .GET(
              "/jobs/:jobid/vertices/:vertexid/subtasks/:subtasknum/attempts/:attempt",
              handler(new SubtaskExecutionAttemptDetailsHandler(currentGraphs)))
          .GET(
              "/jobs/:jobid/vertices/:vertexid/subtasks/:subtasknum/attempts/:attempt/accumulators",
              handler(new SubtaskExecutionAttemptAccumulatorsHandler(currentGraphs)))
          .GET("/jobs/:jobid/plan", handler(new JobPlanHandler(currentGraphs)))
          .GET("/jobs/:jobid/config", handler(new JobConfigHandler(currentGraphs)))
          .GET("/jobs/:jobid/exceptions", handler(new JobExceptionsHandler(currentGraphs)))
          .GET("/jobs/:jobid/accumulators", handler(new JobAccumulatorsHandler(currentGraphs)))
          .GET("/jobs/:jobid/checkpoints", handler(new JobCheckpointsHandler(currentGraphs)))
          .GET("/taskmanagers", handler(new TaskManagersHandler(DEFAULT_REQUEST_TIMEOUT)))
          .GET(
              "/taskmanagers/:" + TaskManagersHandler.TASK_MANAGER_ID_KEY + "/metrics",
              handler(new TaskManagersHandler(DEFAULT_REQUEST_TIMEOUT)))
          .GET(
              "/taskmanagers/:" + TaskManagersHandler.TASK_MANAGER_ID_KEY + "/log",
              new TaskManagerLogHandler(
                  retriever,
                  context,
                  jobManagerAddressPromise.future(),
                  timeout,
                  TaskManagerLogHandler.FileMode.LOG,
                  config))
          .GET(
              "/taskmanagers/:" + TaskManagersHandler.TASK_MANAGER_ID_KEY + "/stdout",
              new TaskManagerLogHandler(
                  retriever,
                  context,
                  jobManagerAddressPromise.future(),
                  timeout,
                  TaskManagerLogHandler.FileMode.STDOUT,
                  config))

          // log and stdout
          .GET(
              "/jobmanager/log",
              logFiles.logFile == null
                  ? new ConstantTextHandler("(log file unavailable)")
                  : new StaticFileServerHandler(
                      retriever, jobManagerAddressPromise.future(), timeout, logFiles.logFile))
          .GET(
              "/jobmanager/stdout",
              logFiles.stdOutFile == null
                  ? new ConstantTextHandler("(stdout file unavailable)")
                  : new StaticFileServerHandler(
                      retriever,
                      jobManagerAddressPromise.future(),
                      timeout,
                      logFiles.stdOutFile))

          // Cancel a job via GET (for proper integration with YARN this has to be performed via
          // GET)
          .GET("/jobs/:jobid/yarn-cancel", handler(new JobCancellationHandler()))

          // DELETE is the preferred way of canceling a job (Rest-conform)
          .DELETE("/jobs/:jobid/cancel", handler(new JobCancellationHandler()))

          // stop a job via GET (for proper integration with YARN this has to be performed via
          // GET)
          .GET("/jobs/:jobid/yarn-stop", handler(new JobStoppingHandler()))

          // DELETE is the preferred way of stopping a job (Rest-conform)
          .DELETE("/jobs/:jobid/stop", handler(new JobStoppingHandler()));

  if (webSubmitAllow) {
    router
        // fetch the list of uploaded jars.
        .GET("/jars", handler(new JarListHandler(uploadDir)))

        // get plan for an uploaded jar
        .GET("/jars/:jarid/plan", handler(new JarPlanHandler(uploadDir)))

        // run a jar
        .POST("/jars/:jarid/run", handler(new JarRunHandler(uploadDir, timeout)))

        // upload a jar
        .POST("/jars/upload", handler(new JarUploadHandler(uploadDir)))

        // delete an uploaded jar from submission interface
        .DELETE("/jars/:jarid", handler(new JarDeleteHandler(uploadDir)));
  } else {
    router
        // send an Access Denied message (sort of)
        // Every other GET request will go to the File Server, which will not provide
        // access to the jar directory anyway, because it doesn't exist in webRootDir.
        .GET("/jars", handler(new JarAccessDeniedHandler()));
  }

  // this handler serves all the static contents
  router.GET(
      "/:*",
      new StaticFileServerHandler(
          retriever, jobManagerAddressPromise.future(), timeout, webRootDir));

  // add shutdown hook for deleting the directories and remaining temp files on shutdown
  try {
    Runtime.getRuntime()
        .addShutdownHook(
            new Thread() {
              @Override
              public void run() {
                cleanup();
              }
            });
  } catch (IllegalStateException e) {
    // race, JVM is in shutdown already, we can safely ignore this
    LOG.debug("Unable to add shutdown hook, shutdown already in progress", e);
  } catch (Throwable t) {
    // these errors usually happen when the shutdown is already in progress
    LOG.warn("Error while adding shutdown hook", t);
  }

  // Netty pipeline: HTTP codec -> upload-aware request handler -> router -> error handler
  ChannelInitializer<SocketChannel> initializer =
      new ChannelInitializer<SocketChannel>() {
        @Override
        protected void initChannel(SocketChannel ch) {
          Handler handler = new Handler(router);
          ch.pipeline()
              .addLast(new HttpServerCodec())
              .addLast(new HttpRequestHandler(uploadDir))
              .addLast(handler.name(), handler)
              .addLast(new PipelineErrorHandler(LOG));
        }
      };

  NioEventLoopGroup bossGroup = new NioEventLoopGroup(1);
  NioEventLoopGroup workerGroup = new NioEventLoopGroup();

  this.bootstrap = new ServerBootstrap();
  this.bootstrap
      .group(bossGroup, workerGroup)
      .channel(NioServerSocketChannel.class)
      .childHandler(initializer);

  // blocks until the bind completes
  Channel ch = this.bootstrap.bind(configuredPort).sync().channel();
  this.serverChannel = ch;

  InetSocketAddress bindAddress = (InetSocketAddress) ch.localAddress();
  String address = bindAddress.getAddress().getHostAddress();
  int port = bindAddress.getPort();

  LOG.info("Web frontend listening at " + address + ':' + port);
}
/**
 * Creates the web runtime monitor: locates the JobManager log and stdout directories on disk,
 * configures all REST routes, and starts the embedded Netty HTTP server on the configured port.
 *
 * @param config Flink configuration (web port, refresh interval, log path)
 * @param leaderRetrievalService service used to track the leading JobManager; must not be null
 * @param actorSystem actor system used to communicate with the JobManager
 * @throws IOException declared for callers; thrown paths come from server setup
 * @throws InterruptedException if the server bind is interrupted
 */
public WebRuntimeMonitor(
    Configuration config, LeaderRetrievalService leaderRetrievalService, ActorSystem actorSystem)
    throws IOException, InterruptedException {
  this.leaderRetrievalService = checkNotNull(leaderRetrievalService);

  final WebMonitorConfig cfg = new WebMonitorConfig(config);

  // create an empty directory in temp for the web server
  String fileName = String.format("flink-web-%s", UUID.randomUUID().toString());
  webRootDir = new File(System.getProperty("java.io.tmpdir"), fileName);
  LOG.info("Using directory {} for the web interface files", webRootDir);

  // figure out where our logs are
  final String flinkRoot = config.getString(ConfigConstants.FLINK_BASE_DIR_PATH_KEY, null);
  final String defaultLogDirectory = flinkRoot + "/log";
  final String logDirectories =
      config.getString(ConfigConstants.JOB_MANAGER_WEB_LOG_PATH_KEY, defaultLogDirectory);

  // find out which directory holds the path for log and stdout
  final ArrayList<String> logPaths = new ArrayList<>();
  final ArrayList<String> outPaths = new ArrayList<>();

  // yarn allows for multiple log directories. Search in all.
  for (String paths : logDirectories.split(",")) {
    File dir = new File(paths);
    if (dir.exists() && dir.isDirectory() && dir.canRead()) {
      // NOTE(review): listFiles() can return null on I/O error even after the checks
      // above, which would NPE here — confirm whether that case matters in practice.
      if (dir.listFiles(LOG_FILE_PATTERN).length == 1) {
        logPaths.add(paths);
      }
      if (dir.listFiles(STDOUT_FILE_PATTERN).length == 1) {
        outPaths.add(paths);
      }
    }
  }

  // we don't want any ambiguities. There must be only one log and out file.
  if (logPaths.size() != 1 || outPaths.size() != 1) {
    throw new IllegalConfigurationException(
        "The path to the log and out files (" + logDirectories + ") is not valid.");
  }

  final File logDir = new File(logPaths.get(0));
  final File outDir = new File(outPaths.get(0));
  LOG.info("Serving job manager logs from {}", logDir.getAbsolutePath());
  LOG.info("Serving job manager stdout from {}", outDir.getAbsolutePath());

  // port configuration
  this.configuredPort = cfg.getWebFrontendPort();
  if (this.configuredPort < 0) {
    throw new IllegalArgumentException("Web frontend port is invalid: " + this.configuredPort);
  }

  // same config-derived duration serves as both lookup and ask timeout
  timeout = AkkaUtils.getTimeout(config);
  FiniteDuration lookupTimeout = AkkaUtils.getTimeout(config);

  retriever = new JobManagerRetriever(this, actorSystem, lookupTimeout, timeout);

  ExecutionGraphHolder currentGraphs = new ExecutionGraphHolder();

  router =
      new Router()
          // config how to interact with this web server
          .GET("/config", handler(new DashboardConfigHandler(cfg.getRefreshInterval())))

          // the overview - how many task managers, slots, free slots, ...
          .GET("/overview", handler(new ClusterOverviewHandler(DEFAULT_REQUEST_TIMEOUT)))

          // job manager configuration, log and stdout
          .GET("/jobmanager/config", handler(new JobManagerConfigHandler(config)))

          // overview over jobs
          .GET(
              "/joboverview",
              handler(new CurrentJobsOverviewHandler(DEFAULT_REQUEST_TIMEOUT, true, true)))
          .GET(
              "/joboverview/running",
              handler(new CurrentJobsOverviewHandler(DEFAULT_REQUEST_TIMEOUT, true, false)))
          .GET(
              "/joboverview/completed",
              handler(new CurrentJobsOverviewHandler(DEFAULT_REQUEST_TIMEOUT, false, true)))
          .GET("/jobs", handler(new CurrentJobIdsHandler(retriever, DEFAULT_REQUEST_TIMEOUT)))
          .GET("/jobs/:jobid", handler(new JobDetailsHandler(currentGraphs)))
          .GET("/jobs/:jobid/vertices", handler(new JobDetailsHandler(currentGraphs)))
          .GET(
              "/jobs/:jobid/vertices/:vertexid",
              handler(new JobVertexDetailsHandler(currentGraphs)))
          .GET(
              "/jobs/:jobid/vertices/:vertexid/subtasktimes",
              handler(new SubtasksTimesHandler(currentGraphs)))
          .GET(
              "/jobs/:jobid/vertices/:vertexid/accumulators",
              handler(new JobVertexAccumulatorsHandler(currentGraphs)))
          .GET(
              "/jobs/:jobid/vertices/:vertexid/subtasks/accumulators",
              handler(new SubtasksAllAccumulatorsHandler(currentGraphs)))
          .GET(
              "/jobs/:jobid/vertices/:vertexid/subtasks/:subtasknum",
              handler(new SubtaskCurrentAttemptDetailsHandler(currentGraphs)))
          .GET(
              "/jobs/:jobid/vertices/:vertexid/subtasks/:subtasknum/attempts/:attempt",
              handler(new SubtaskExecutionAttemptDetailsHandler(currentGraphs)))
          .GET(
              "/jobs/:jobid/vertices/:vertexid/subtasks/:subtasknum/attempts/:attempt/accumulators",
              handler(new SubtaskExecutionAttemptAccumulatorsHandler(currentGraphs)))
          .GET("/jobs/:jobid/plan", handler(new JobPlanHandler(currentGraphs)))
          .GET("/jobs/:jobid/config", handler(new JobConfigHandler(currentGraphs)))
          .GET("/jobs/:jobid/exceptions", handler(new JobExceptionsHandler(currentGraphs)))
          .GET("/jobs/:jobid/accumulators", handler(new JobAccumulatorsHandler(currentGraphs)))
          .GET("/taskmanagers", handler(new TaskManagersHandler(DEFAULT_REQUEST_TIMEOUT)))
          .GET(
              "/taskmanagers/:" + TaskManagersHandler.TASK_MANAGER_ID_KEY,
              handler(new TaskManagersHandler(DEFAULT_REQUEST_TIMEOUT)))
          .GET(
              "/jobmanager/log",
              new StaticFileServerHandler(
                  retriever, jobManagerAddressPromise.future(), timeout, logDir))
          .GET(
              "/jobmanager/stdout",
              new StaticFileServerHandler(
                  retriever, jobManagerAddressPromise.future(), timeout, outDir))

          // this handler serves all the static contents
          .GET(
              "/:*",
              new StaticFileServerHandler(
                  retriever, jobManagerAddressPromise.future(), timeout, webRootDir));

  synchronized (startupShutdownLock) {
    // add shutdown hook for deleting the directory
    try {
      Runtime.getRuntime()
          .addShutdownHook(
              new Thread() {
                @Override
                public void run() {
                  shutdown();
                }
              });
    } catch (IllegalStateException e) {
      // race, JVM is in shutdown already, we can safely ignore this
      LOG.debug("Unable to add shutdown hook, shutdown already in progress", e);
    } catch (Throwable t) {
      // these errors usually happen when the shutdown is already in progress
      LOG.warn("Error while adding shutdown hook", t);
    }

    // Netty pipeline: HTTP codec -> aggregator -> chunked writer -> router handler
    ChannelInitializer<SocketChannel> initializer =
        new ChannelInitializer<SocketChannel>() {
          @Override
          protected void initChannel(SocketChannel ch) {
            Handler handler = new Handler(router);
            ch.pipeline()
                .addLast(new HttpServerCodec())
                .addLast(new HttpObjectAggregator(65536))
                .addLast(new ChunkedWriteHandler())
                .addLast(handler.name(), handler);
          }
        };

    NioEventLoopGroup bossGroup = new NioEventLoopGroup(1);
    NioEventLoopGroup workerGroup = new NioEventLoopGroup();

    this.bootstrap = new ServerBootstrap();
    this.bootstrap
        .group(bossGroup, workerGroup)
        .channel(NioServerSocketChannel.class)
        .childHandler(initializer);

    // blocks until the bind completes
    Channel ch = this.bootstrap.bind(configuredPort).sync().channel();
    this.serverChannel = ch;

    InetSocketAddress bindAddress = (InetSocketAddress) ch.localAddress();
    String address = bindAddress.getAddress().getHostAddress();
    int port = bindAddress.getPort();

    LOG.info("Web frontend listening at " + address + ':' + port);
  }
}
/**
 * Creates the web runtime monitor: sets up the temporary web root directory, the archive
 * retriever, and all REST routes. Note that — unlike the other variants — this constructor only
 * builds the router; it does not start an HTTP server.
 *
 * @param config Flink configuration (web port, refresh interval)
 * @param leaderRetrievalService service used to track the leading JobManager; must not be null
 * @param actorSystem actor system used to communicate with the JobManager
 * @throws IOException declared for callers
 */
public WebRuntimeMonitor(
    Configuration config, LeaderRetrievalService leaderRetrievalService, ActorSystem actorSystem)
    throws IOException {
  this.leaderRetrievalService = checkNotNull(leaderRetrievalService);

  final WebMonitorConfig cfg = new WebMonitorConfig(config);

  // create an empty directory in temp for the web server
  String fileName = String.format("flink-web-%s", UUID.randomUUID().toString());
  webRootDir = new File(System.getProperty("java.io.tmpdir"), fileName);
  LOG.info("Using directory {} for the web interface files", webRootDir);

  // port configuration
  this.configuredPort = cfg.getWebFrontendPort();
  if (this.configuredPort < 0) {
    throw new IllegalArgumentException("Web frontend port is invalid: " + this.configuredPort);
  }

  // same config-derived duration serves as both lookup and ask timeout
  FiniteDuration timeout = AkkaUtils.getTimeout(config);
  FiniteDuration lookupTimeout = AkkaUtils.getTimeout(config);

  retriever = new JobManagerArchiveRetriever(this, actorSystem, lookupTimeout, timeout);

  ExecutionGraphHolder currentGraphs = new ExecutionGraphHolder(retriever);

  router =
      new Router()
          // config how to interact with this web server
          .GET("/config", handler(new DashboardConfigHandler(cfg.getRefreshInterval())))

          // the overview - how many task managers, slots, free slots, ...
          .GET(
              "/overview",
              handler(new ClusterOverviewHandler(retriever, DEFAULT_REQUEST_TIMEOUT)))

          // job manager configuration
          .GET("/jobmanager/config", handler(new JobManagerConfigHandler(config)))

          // overview over jobs
          .GET(
              "/joboverview",
              handler(
                  new CurrentJobsOverviewHandler(retriever, DEFAULT_REQUEST_TIMEOUT, true, true)))
          .GET(
              "/joboverview/running",
              handler(
                  new CurrentJobsOverviewHandler(
                      retriever, DEFAULT_REQUEST_TIMEOUT, true, false)))
          .GET(
              "/joboverview/completed",
              handler(
                  new CurrentJobsOverviewHandler(
                      retriever, DEFAULT_REQUEST_TIMEOUT, false, true)))
          .GET("/jobs", handler(new CurrentJobIdsHandler(retriever, DEFAULT_REQUEST_TIMEOUT)))
          .GET("/jobs/:jobid", handler(new JobDetailsHandler(currentGraphs)))
          .GET("/jobs/:jobid/vertices", handler(new JobDetailsHandler(currentGraphs)))
          .GET(
              "/jobs/:jobid/vertices/:vertexid",
              handler(new JobVertexDetailsHandler(currentGraphs)))
          .GET(
              "/jobs/:jobid/vertices/:vertexid/subtasktimes",
              handler(new SubtasksTimesHandler(currentGraphs)))
          .GET(
              "/jobs/:jobid/vertices/:vertexid/accumulators",
              handler(new JobVertexAccumulatorsHandler(currentGraphs)))
          .GET(
              "/jobs/:jobid/vertices/:vertexid/subtasks/accumulators",
              handler(new SubtasksAllAccumulatorsHandler(currentGraphs)))
          .GET(
              "/jobs/:jobid/vertices/:vertexid/subtasks/:subtasknum",
              handler(new SubtaskCurrentAttemptDetailsHandler(currentGraphs)))
          .GET(
              "/jobs/:jobid/vertices/:vertexid/subtasks/:subtasknum/attempts/:attempt",
              handler(new SubtaskExecutionAttemptDetailsHandler(currentGraphs)))
          .GET(
              "/jobs/:jobid/vertices/:vertexid/subtasks/:subtasknum/attempts/:attempt/accumulators",
              handler(new SubtaskExecutionAttemptAccumulatorsHandler(currentGraphs)))
          .GET("/jobs/:jobid/plan", handler(new JobPlanHandler(currentGraphs)))
          .GET("/jobs/:jobid/config", handler(new JobConfigHandler(currentGraphs)))
          .GET("/jobs/:jobid/exceptions", handler(new JobExceptionsHandler(currentGraphs)))
          .GET("/jobs/:jobid/accumulators", handler(new JobAccumulatorsHandler(currentGraphs)))
          .GET(
              "/taskmanagers",
              handler(new TaskManagersHandler(retriever, DEFAULT_REQUEST_TIMEOUT)))
          .GET(
              "/taskmanagers/:" + TaskManagersHandler.TASK_MANAGER_ID_KEY,
              handler(new TaskManagersHandler(retriever, DEFAULT_REQUEST_TIMEOUT)))

          // this handler serves all the static contents
          .GET("/:*", new StaticFileServerHandler(webRootDir));
}
/**
 * Verifies that deploying a vertex to a slot builds a correct {@code TaskDeploymentDescriptor}:
 * right job/vertex IDs, subtask index and count, invokable class and task name, and the expected
 * number of produced/consumed partitions with the right subpartition counts.
 *
 * <p>Fix: the original wrapped the body in {@code catch (Exception e) { e.printStackTrace();
 * fail(e.getMessage()); }}, which discards the stack trace from the test report (and fails with a
 * useless message when {@code getMessage()} is null). Declaring {@code throws Exception} lets
 * JUnit report the full failure.
 */
@Test
public void testBuildDeploymentDescriptor() throws Exception {
  final JobID jobId = new JobID();

  final JobVertexID jid1 = new JobVertexID();
  final JobVertexID jid2 = new JobVertexID();
  final JobVertexID jid3 = new JobVertexID();
  final JobVertexID jid4 = new JobVertexID();

  JobVertex v1 = new JobVertex("v1", jid1);
  JobVertex v2 = new JobVertex("v2", jid2);
  JobVertex v3 = new JobVertex("v3", jid3);
  JobVertex v4 = new JobVertex("v4", jid4);

  v1.setParallelism(10);
  v2.setParallelism(10);
  v3.setParallelism(10);
  v4.setParallelism(10);

  v1.setInvokableClass(BatchTask.class);
  v2.setInvokableClass(BatchTask.class);
  v3.setInvokableClass(BatchTask.class);
  v4.setInvokableClass(BatchTask.class);

  // v1 -> v2 -> {v3, v4}: v2 consumes one input and produces two result partitions
  v2.connectNewDataSetAsInput(v1, DistributionPattern.ALL_TO_ALL);
  v3.connectNewDataSetAsInput(v2, DistributionPattern.ALL_TO_ALL);
  v4.connectNewDataSetAsInput(v2, DistributionPattern.ALL_TO_ALL);

  ExecutionGraph eg =
      new ExecutionGraph(
          TestingUtils.defaultExecutionContext(),
          jobId,
          "some job",
          new Configuration(),
          new SerializedValue<>(new ExecutionConfig()),
          AkkaUtils.getDefaultTimeout(),
          new NoRestartStrategy());

  List<JobVertex> ordered = Arrays.asList(v1, v2, v3, v4);
  eg.attachJobGraph(ordered);

  // deploy subtask 3 of v2 into a freshly allocated slot
  ExecutionJobVertex ejv = eg.getAllVertices().get(jid2);
  ExecutionVertex vertex = ejv.getTaskVertices()[3];

  ExecutionGraphTestUtils.SimpleActorGateway instanceGateway =
      new ExecutionGraphTestUtils.SimpleActorGateway(TestingUtils.directExecutionContext());

  final Instance instance = getInstance(instanceGateway);
  final SimpleSlot slot = instance.allocateSimpleSlot(jobId);

  assertEquals(ExecutionState.CREATED, vertex.getExecutionState());
  vertex.deployToSlot(slot);
  assertEquals(ExecutionState.DEPLOYING, vertex.getExecutionState());

  // the gateway records the descriptor that was "sent" to the task manager
  TaskDeploymentDescriptor descr = instanceGateway.lastTDD;
  assertNotNull(descr);

  assertEquals(jobId, descr.getJobID());
  assertEquals(jid2, descr.getVertexID());
  assertEquals(3, descr.getIndexInSubtaskGroup());
  assertEquals(10, descr.getNumberOfSubtasks());
  assertEquals(BatchTask.class.getName(), descr.getInvokableClassName());
  assertEquals("v2", descr.getTaskName());

  List<ResultPartitionDeploymentDescriptor> producedPartitions = descr.getProducedPartitions();
  List<InputGateDeploymentDescriptor> consumedPartitions = descr.getInputGates();

  assertEquals(2, producedPartitions.size());
  assertEquals(1, consumedPartitions.size());

  assertEquals(10, producedPartitions.get(0).getNumberOfSubpartitions());
  assertEquals(10, producedPartitions.get(1).getNumberOfSubpartitions());
  assertEquals(10, consumedPartitions.get(0).getInputChannelDeploymentDescriptors().length);
}
/**
 * Tests that a blocking batch job fails if there are not enough resources left to schedule the
 * succeeding tasks. This test case is related to [FLINK-4296] where finished producing tasks
 * swallow the fail exception when scheduling a consumer task.
 */
@Test
public void testNoResourceAvailableFailure() throws Exception {
  final JobID jobId = new JobID();
  JobVertex v1 = new JobVertex("source");
  JobVertex v2 = new JobVertex("sink");

  int dop1 = 1;
  int dop2 = 1;

  v1.setParallelism(dop1);
  v2.setParallelism(dop2);

  v1.setInvokableClass(BatchTask.class);
  v2.setInvokableClass(BatchTask.class);

  // BLOCKING result: the consumer is only scheduled once the producer finished
  v2.connectNewDataSetAsInput(
      v1, DistributionPattern.POINTWISE, ResultPartitionType.BLOCKING, false);

  // execution graph that executes actions synchronously
  ExecutionGraph eg =
      new ExecutionGraph(
          TestingUtils.directExecutionContext(),
          jobId,
          "failing test job",
          new Configuration(),
          new SerializedValue<>(new ExecutionConfig()),
          AkkaUtils.getDefaultTimeout(),
          new NoRestartStrategy());
  eg.setQueuedSchedulingAllowed(false);

  List<JobVertex> ordered = Arrays.asList(v1, v2);
  eg.attachJobGraph(ordered);

  // deliberately provide only enough slots for the producer (dop1), none for the sink
  Scheduler scheduler = new Scheduler(TestingUtils.directExecutionContext());
  for (int i = 0; i < dop1; i++) {
    scheduler.newInstanceAvailable(
        ExecutionGraphTestUtils.getInstance(
            new ExecutionGraphTestUtils.SimpleActorGateway(
                TestingUtils.directExecutionContext())));
  }
  assertEquals(dop1, scheduler.getNumberOfAvailableSlots());

  // schedule, this triggers mock deployment
  eg.scheduleForExecution(scheduler);

  ExecutionAttemptID attemptID =
      eg.getJobVertex(v1.getID())
          .getTaskVertices()[0]
          .getCurrentExecutionAttempt()
          .getAttemptId();

  // drive the producer to FINISHED; finishing triggers scheduling of the consumer,
  // which must fail (no slots left) and fail the whole job rather than being swallowed
  eg.updateState(new TaskExecutionState(jobId, attemptID, ExecutionState.RUNNING));
  eg.updateState(
      new TaskExecutionState(
          jobId,
          attemptID,
          ExecutionState.FINISHED,
          null,
          new AccumulatorSnapshot(
              jobId,
              attemptID,
              new HashMap<AccumulatorRegistry.Metric, Accumulator<?, ?>>(),
              new HashMap<String, Accumulator<?, ?>>())));

  assertEquals(JobStatus.FAILED, eg.getState());
}
/**
 * Connect the FlinkYarnCluster to the ApplicationMaster.
 *
 * <p>Detached YARN sessions don't need to connect to the ApplicationMaster. Detached per job YARN
 * sessions need to connect until the required number of TaskManagers have been started.
 *
 * @throws IOException if the local actor system cannot be created
 * @throws IllegalStateException if already connected
 */
public void connectToCluster() throws IOException {
  if (isConnected) {
    throw new IllegalStateException("Can not connect to the cluster again");
  }

  // start actor system
  LOG.info("Start actor system.");
  InetAddress ownHostname =
      NetUtils.resolveAddress(
          jobManagerAddress); // find name of own public interface, able to connect to the JM
  // NOTE(review): raw Some and Tuple2<String, Integer> here vs. Tuple2<String, Object>
  // elsewhere in this file — works through erasure, but confirm against the AkkaUtils
  // signature before cleaning up.
  actorSystem =
      AkkaUtils.createActorSystem(
          flinkConfig,
          new Some(new Tuple2<String, Integer>(ownHostname.getCanonicalHostName(), 0)));

  // start application client
  LOG.info("Start application client.");

  applicationClient =
      actorSystem.actorOf(
          Props.create(ApplicationClient.class, flinkConfig), "applicationClient");

  // instruct ApplicationClient to start a periodical status polling
  applicationClient.tell(
      new Messages.LocalRegisterClient(this.jobManagerAddress), applicationClient);

  // background thread that waits for the actor system to terminate, then logs the final
  // application report
  actorRunner =
      new Thread(
          new Runnable() {
            @Override
            public void run() {
              // blocks until ApplicationMaster has been stopped
              actorSystem.awaitTermination();

              // get final application report
              try {
                ApplicationReport appReport = yarnClient.getApplicationReport(appId);

                LOG.info(
                    "Application "
                        + appId
                        + " finished with state "
                        + appReport.getYarnApplicationState()
                        + " and final state "
                        + appReport.getFinalApplicationStatus()
                        + " at "
                        + appReport.getFinishTime());

                if (appReport.getYarnApplicationState() == YarnApplicationState.FAILED
                    || appReport.getYarnApplicationState() == YarnApplicationState.KILLED) {
                  LOG.warn("Application failed. Diagnostics " + appReport.getDiagnostics());
                  LOG.warn(
                      "If log aggregation is activated in the Hadoop cluster, we recommend to retrieve "
                          + "the full application log using this command:\n"
                          + "\tyarn logs -applicationId "
                          + appReport.getApplicationId()
                          + "\n"
                          + "(It sometimes takes a few seconds until the logs are aggregated)");
                }
              } catch (Exception e) {
                LOG.warn("Error while getting final application report", e);
              }
            }
          });
  actorRunner.setDaemon(true);
  actorRunner.start();

  // poll the application status periodically in a daemon thread
  pollingRunner = new PollingThread(yarnClient, appId);
  pollingRunner.setDaemon(true);
  pollingRunner.start();

  Runtime.getRuntime().addShutdownHook(clientShutdownHook);

  isConnected = true;
}
/**
 * Verifies that a running job survives the death of the leading JobManager process:
 * a standby JobManager takes over via ZooKeeper leader election and the tasks
 * (coordinated through marker files in a shared temp directory) run to completion.
 */
@Test
public void testJobManagerProcessFailure() throws Exception {
    // Config
    final int numberOfJobManagers = 2;
    final int numberOfTaskManagers = 2;
    final int numberOfSlotsPerTaskManager = 2;

    // sanity check: the test program's parallelism must exactly fill the cluster
    assertEquals(PARALLELISM, numberOfTaskManagers * numberOfSlotsPerTaskManager);

    // Setup
    // Test actor system
    ActorSystem testActorSystem;

    // Job managers — run as separate OS processes so we can kill one hard
    final JobManagerProcess[] jmProcess = new JobManagerProcess[numberOfJobManagers];

    // Task managers — run in-JVM as actor systems
    final ActorSystem[] tmActorSystem = new ActorSystem[numberOfTaskManagers];

    // Leader election service
    LeaderRetrievalService leaderRetrievalService = null;

    // Coordination between the processes goes through a directory
    File coordinateTempDir = null;

    try {
        final Deadline deadline = TestTimeOut.fromNow();

        // Coordination directory
        coordinateTempDir = createTempDirectory();

        // Job Managers — recovery mode backed by the test ZooKeeper quorum
        Configuration config = ZooKeeperTestUtils.createZooKeeperRecoveryModeConfig(
            ZooKeeper.getConnectString(), FileStateBackendBasePath.getPath());

        // Start first process
        jmProcess[0] = new JobManagerProcess(0, config);
        jmProcess[0].createAndStart();

        // Task manager configuration (small memory/buffers — enough for the test job)
        config.setInteger(ConfigConstants.TASK_MANAGER_MEMORY_SIZE_KEY, 4);
        config.setInteger(ConfigConstants.TASK_MANAGER_NETWORK_NUM_BUFFERS_KEY, 100);
        config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, 2);

        // Start the task manager process
        for (int i = 0; i < numberOfTaskManagers; i++) {
            tmActorSystem[i] = AkkaUtils.createActorSystem(AkkaUtils.getDefaultAkkaConfig());
            TaskManager.startTaskManagerComponentsAndActor(
                config,
                tmActorSystem[i],
                "localhost",
                Option.<String>empty(),
                Option.<LeaderRetrievalService>empty(),
                false,
                TaskManager.class);
        }

        // Test actor system — used by this test to talk to the JobManagers
        testActorSystem = AkkaUtils.createActorSystem(AkkaUtils.getDefaultAkkaConfig());

        // wait until the first JobManager's actor is reachable
        jmProcess[0].getActorRef(testActorSystem, deadline.timeLeft());

        // Leader listener
        TestingListener leaderListener = new TestingListener();
        leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
        leaderRetrievalService.start(leaderListener);

        // Initial submission
        leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());

        String leaderAddress = leaderListener.getAddress();
        UUID leaderId = leaderListener.getLeaderSessionID();

        // Get the leader ref
        ActorRef leaderRef =
            AkkaUtils.getActorRef(leaderAddress, testActorSystem, deadline.timeLeft());
        ActorGateway leaderGateway = new AkkaActorGateway(leaderRef, leaderId);

        // Wait for all task managers to connect to the leading job manager
        JobManagerActorTestUtils.waitForTaskManagers(
            numberOfTaskManagers, leaderGateway, deadline.timeLeft());

        final File coordinateDirClosure = coordinateTempDir;
        final Throwable[] errorRef = new Throwable[1];

        // we trigger program execution in a separate thread
        Thread programTrigger =
            new Thread("Program Trigger") {
                @Override
                public void run() {
                    try {
                        testJobManagerFailure(ZooKeeper.getConnectString(), coordinateDirClosure);
                    } catch (Throwable t) {
                        // surface the failure to the main test thread via errorRef
                        t.printStackTrace();
                        errorRef[0] = t;
                    }
                }
            };

        // start the test program
        programTrigger.start();

        // wait until all marker files are in place, indicating that all tasks have started
        AbstractTaskManagerProcessFailureRecoveryTest.waitForMarkerFiles(
            coordinateTempDir,
            READY_MARKER_FILE_PREFIX,
            PARALLELISM,
            deadline.timeLeft().toMillis());

        // Kill one of the job managers and trigger recovery
        // (hard destroy so no graceful shutdown masks the failover path)
        jmProcess[0].destroy();

        jmProcess[1] = new JobManagerProcess(1, config);
        jmProcess[1].createAndStart();

        jmProcess[1].getActorRef(testActorSystem, deadline.timeLeft());

        // we create the marker file which signals the program functions tasks that they can complete
        AbstractTaskManagerProcessFailureRecoveryTest.touchFile(
            new File(coordinateTempDir, PROCEED_MARKER_FILE));

        programTrigger.join(deadline.timeLeft().toMillis());

        // We wait for the finish marker file. We don't wait for the program trigger, because
        // we submit in detached mode.
        AbstractTaskManagerProcessFailureRecoveryTest.waitForMarkerFiles(
            coordinateTempDir, FINISH_MARKER_FILE_PREFIX, 1, deadline.timeLeft().toMillis());

        // check that the program really finished
        assertFalse("The program did not finish in time", programTrigger.isAlive());

        // check whether the program encountered an error
        if (errorRef[0] != null) {
            Throwable error = errorRef[0];
            error.printStackTrace();
            fail(
                "The program encountered a "
                    + error.getClass().getSimpleName()
                    + " : "
                    + error.getMessage());
        }
    } catch (Throwable t) {
        // Print early (in some situations the process logs get too big
        // for Travis and the root problem is not shown)
        t.printStackTrace();

        for (JobManagerProcess p : jmProcess) {
            if (p != null) {
                p.printProcessLog();
            }
        }

        throw t;
    } finally {
        // Teardown in reverse dependency order: TMs, leader service, JM processes, temp dir.
        for (int i = 0; i < numberOfTaskManagers; i++) {
            if (tmActorSystem[i] != null) {
                tmActorSystem[i].shutdown();
            }
        }

        if (leaderRetrievalService != null) {
            leaderRetrievalService.stop();
        }

        for (JobManagerProcess jmProces : jmProcess) {
            if (jmProces != null) {
                jmProces.destroy();
            }
        }

        // Delete coordination directory
        if (coordinateTempDir != null) {
            try {
                FileUtils.deleteDirectory(coordinateTempDir);
            } catch (Throwable ignored) {
                // best-effort cleanup; a leftover temp dir must not fail the test
            }
        }
    }
}
/** Creates the shared local actor system (default configuration) used by all tests in this class. */
@BeforeClass
public static void setup() {
    final Configuration emptyConfiguration = new Configuration();
    testActorSystem = AkkaUtils.createLocalActorSystem(emptyConfiguration);
}
/*
 * Test setup:
 * - v1 is isolated, no slot sharing
 * - v2 and v3 (not connected) share slots
 * - v4 and v5 (connected) share slots
 */
@Test
public void testAssignSlotSharingGroup() {
    try {
        JobVertex v1 = new JobVertex("v1");
        JobVertex v2 = new JobVertex("v2");
        JobVertex v3 = new JobVertex("v3");
        JobVertex v4 = new JobVertex("v4");
        JobVertex v5 = new JobVertex("v5");

        v1.setParallelism(4);
        v2.setParallelism(5);
        v3.setParallelism(7);
        v4.setParallelism(1);
        v5.setParallelism(11);

        v2.connectNewDataSetAsInput(v1, DistributionPattern.POINTWISE);
        v5.connectNewDataSetAsInput(v4, DistributionPattern.POINTWISE);

        SlotSharingGroup jg1 = new SlotSharingGroup();
        v2.setSlotSharingGroup(jg1);
        v3.setSlotSharingGroup(jg1);

        SlotSharingGroup jg2 = new SlotSharingGroup();
        v4.setSlotSharingGroup(jg2);
        v5.setSlotSharingGroup(jg2);

        List<JobVertex> vertices = new ArrayList<JobVertex>(Arrays.asList(v1, v2, v3, v4, v5));

        ExecutionGraph eg =
            new ExecutionGraph(
                TestingUtils.defaultExecutionContext(),
                new JobID(),
                "test job",
                new Configuration(),
                ExecutionConfigTest.getSerializedConfig(),
                AkkaUtils.getDefaultTimeout(),
                new NoRestartStrategy());
        eg.attachJobGraph(vertices);

        // verify that the vertices are all in the same slot sharing group
        SlotSharingGroup group1 = null;
        SlotSharingGroup group2 = null;

        // verify that v1 tasks have no slot sharing group
        assertNull(eg.getJobVertex(v1.getID()).getSlotSharingGroup());

        // v2 and v3 are shared
        group1 = eg.getJobVertex(v2.getID()).getSlotSharingGroup();
        assertNotNull(group1);
        assertEquals(group1, eg.getJobVertex(v3.getID()).getSlotSharingGroup());

        assertEquals(2, group1.getJobVertexIds().size());
        assertTrue(group1.getJobVertexIds().contains(v2.getID()));
        assertTrue(group1.getJobVertexIds().contains(v3.getID()));

        // v4 and v5 are shared
        group2 = eg.getJobVertex(v4.getID()).getSlotSharingGroup();
        assertNotNull(group2);
        assertEquals(group2, eg.getJobVertex(v5.getID()).getSlotSharingGroup());

        // FIX: this previously re-asserted group1's size (copy-paste error),
        // leaving group2's membership count unchecked.
        assertEquals(2, group2.getJobVertexIds().size());
        assertTrue(group2.getJobVertexIds().contains(v4.getID()));
        assertTrue(group2.getJobVertexIds().contains(v5.getID()));
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}