/** Test regular operation, including command line parameter parsing. */ @Test(timeout = 60000) // timeout after a minute. public void testDetachedMode() { LOG.info("Starting testDetachedMode()"); addTestAppender(FlinkYarnSessionCli.class, Level.INFO); Runner runner = startWithArgs( new String[] { "-j", flinkUberjar.getAbsolutePath(), "-t", flinkLibFolder.getAbsolutePath(), "-n", "1", "-jm", "768", "-tm", "1024", "--name", "MyCustomName", // test setting a custom name "--detached" }, "Flink JobManager is now running on", RunTypes.YARN_SESSION); checkForLogString("The Flink YARN client has been started in detached mode"); Assert.assertFalse("The runner should detach.", runner.isAlive()); LOG.info("Waiting until two containers are running"); // wait until two containers are running while (getRunningContainers() < 2) { sleep(500); } LOG.info("Two containers are running. Killing the application"); // kill application "externally". try { YarnClient yc = YarnClient.createYarnClient(); yc.init(yarnConfiguration); yc.start(); List<ApplicationReport> apps = yc.getApplications(EnumSet.of(YarnApplicationState.RUNNING)); Assert.assertEquals(1, apps.size()); // Only one running ApplicationReport app = apps.get(0); Assert.assertEquals("MyCustomName", app.getName()); ApplicationId id = app.getApplicationId(); yc.killApplication(id); while (yc.getApplications(EnumSet.of(YarnApplicationState.KILLED)).size() == 0) { sleep(500); } } catch (Throwable t) { LOG.warn("Killing failed", t); Assert.fail(); } LOG.info("Finished testDetachedMode()"); }
@Test(timeout = 10000) public void testGetApplications() throws YarnException, IOException { Configuration conf = new Configuration(); final YarnClient client = new MockYarnClient(); client.init(conf); client.start(); List<ApplicationReport> expectedReports = ((MockYarnClient) client).getReports(); List<ApplicationReport> reports = client.getApplications(); Assert.assertEquals(reports, expectedReports); Set<String> appTypes = new HashSet<String>(); appTypes.add("YARN"); appTypes.add("NON-YARN"); reports = client.getApplications(appTypes, null); Assert.assertEquals(reports.size(), 2); Assert.assertTrue( (reports.get(0).getApplicationType().equals("YARN") && reports.get(1).getApplicationType().equals("NON-YARN")) || (reports.get(1).getApplicationType().equals("YARN") && reports.get(0).getApplicationType().equals("NON-YARN"))); for (ApplicationReport report : reports) { Assert.assertTrue(expectedReports.contains(report)); } EnumSet<YarnApplicationState> appStates = EnumSet.noneOf(YarnApplicationState.class); appStates.add(YarnApplicationState.FINISHED); appStates.add(YarnApplicationState.FAILED); reports = client.getApplications(null, appStates); Assert.assertEquals(reports.size(), 2); Assert.assertTrue( (reports.get(0).getApplicationType().equals("NON-YARN") && reports.get(1).getApplicationType().equals("NON-MAPREDUCE")) || (reports.get(1).getApplicationType().equals("NON-YARN") && reports.get(0).getApplicationType().equals("NON-MAPREDUCE"))); for (ApplicationReport report : reports) { Assert.assertTrue(expectedReports.contains(report)); } reports = client.getApplications(appTypes, appStates); Assert.assertEquals(reports.size(), 1); Assert.assertTrue((reports.get(0).getApplicationType().equals("NON-YARN"))); for (ApplicationReport report : reports) { Assert.assertTrue(expectedReports.contains(report)); } client.stop(); }
@Before public void checkClusterEmpty() throws IOException, YarnException { if (yarnClient == null) { yarnClient = YarnClient.createYarnClient(); yarnClient.init(yarnConfiguration); yarnClient.start(); } List<ApplicationReport> apps = yarnClient.getApplications(); for (ApplicationReport app : apps) { if (app.getYarnApplicationState() != YarnApplicationState.FINISHED && app.getYarnApplicationState() != YarnApplicationState.KILLED && app.getYarnApplicationState() != YarnApplicationState.FAILED) { Assert.fail( "There is at least one application on the cluster is not finished." + "App " + app.getApplicationId() + " is in state " + app.getYarnApplicationState()); } } }
private void testDetachedPerJobYarnClusterInternal(String job) { YarnClient yc = YarnClient.createYarnClient(); yc.init(yarnConfiguration); yc.start(); // get temporary folder for writing output of wordcount example File tmpOutFolder = null; try { tmpOutFolder = tmp.newFolder(); } catch (IOException e) { throw new RuntimeException(e); } // get temporary file for reading input data for wordcount example File tmpInFile; try { tmpInFile = tmp.newFile(); FileUtils.writeStringToFile(tmpInFile, WordCountData.TEXT); } catch (IOException e) { throw new RuntimeException(e); } Runner runner = startWithArgs( new String[] { "run", "-m", "yarn-cluster", "-yj", flinkUberjar.getAbsolutePath(), "-yt", flinkLibFolder.getAbsolutePath(), "-yn", "1", "-yjm", "768", "-yD", "yarn.heap-cutoff-ratio=0.5", // test if the cutoff is passed correctly "-ytm", "1024", "-ys", "2", // test requesting slots from YARN. "--yarndetached", job, tmpInFile.getAbsoluteFile().toString(), tmpOutFolder.getAbsoluteFile().toString() }, "Job has been submitted with JobID", RunTypes.CLI_FRONTEND); // it should usually be 2, but on slow machines, the number varies Assert.assertTrue("There should be at most 2 containers running", getRunningContainers() <= 2); // give the runner some time to detach for (int attempt = 0; runner.isAlive() && attempt < 5; attempt++) { try { Thread.sleep(500); } catch (InterruptedException e) { } } Assert.assertFalse("The runner should detach.", runner.isAlive()); LOG.info("CLI Frontend has returned, so the job is running"); // find out the application id and wait until it has finished. try { List<ApplicationReport> apps = yc.getApplications(EnumSet.of(YarnApplicationState.RUNNING)); ApplicationId tmpAppId; if (apps.size() == 1) { // Better method to find the right appId. But sometimes the app is shutting down very fast // Only one running tmpAppId = apps.get(0).getApplicationId(); LOG.info("waiting for the job with appId {} to finish", tmpAppId); // wait until the app has finished while (yc.getApplications(EnumSet.of(YarnApplicationState.RUNNING)).size() > 0) { sleep(500); } } else { // get appId by finding the latest finished appid apps = yc.getApplications(); Collections.sort( apps, new Comparator<ApplicationReport>() { @Override public int compare(ApplicationReport o1, ApplicationReport o2) { return o1.getApplicationId().compareTo(o2.getApplicationId()) * -1; } }); tmpAppId = apps.get(0).getApplicationId(); LOG.info( "Selected {} as the last appId from {}", tmpAppId, Arrays.toString(apps.toArray())); } final ApplicationId id = tmpAppId; // now it has finished. // check the output files. File[] listOfOutputFiles = tmpOutFolder.listFiles(); Assert.assertNotNull("Taskmanager output not found", listOfOutputFiles); LOG.info("The job has finished. TaskManager output files found in {}", tmpOutFolder); // read all output files in output folder to one output string String content = ""; for (File f : listOfOutputFiles) { if (f.isFile()) { content += FileUtils.readFileToString(f) + "\n"; } } // String content = FileUtils.readFileToString(taskmanagerOut); // check for some of the wordcount outputs. Assert.assertTrue( "Expected string 'da 5' or '(all,2)' not found in string '" + content + "'", content.contains("da 5") || content.contains("(da,5)") || content.contains("(all,2)")); Assert.assertTrue( "Expected string 'der 29' or '(mind,1)' not found in string'" + content + "'", content.contains("der 29") || content.contains("(der,29)") || content.contains("(mind,1)")); // check if the heap size for the TaskManager was set correctly File jobmanagerLog = YarnTestBase.findFile( "..", new FilenameFilter() { @Override public boolean accept(File dir, String name) { return name.contains("jobmanager.log") && dir.getAbsolutePath().contains(id.toString()); } }); Assert.assertNotNull("Unable to locate JobManager log", jobmanagerLog); content = FileUtils.readFileToString(jobmanagerLog); // TM was started with 1024 but we cut off 50% (NOT THE DEFAULT VALUE) String expected = "Starting TM with command=$JAVA_HOME/bin/java -Xms424m -Xmx424m"; Assert.assertTrue( "Expected string '" + expected + "' not found in JobManager log: '" + jobmanagerLog + "'", content.contains(expected)); expected = " (2/2) (attempt #0) to "; Assert.assertTrue( "Expected string '" + expected + "' not found in JobManager log." + "This string checks that the job has been started with a parallelism of 2. Log contents: '" + jobmanagerLog + "'", content.contains(expected)); // make sure the detached app is really finished. LOG.info("Checking again that app has finished"); ApplicationReport rep; do { sleep(500); rep = yc.getApplicationReport(id); LOG.info("Got report {}", rep); } while (rep.getYarnApplicationState() == YarnApplicationState.RUNNING); } catch (Throwable t) { LOG.warn("Error while detached yarn session was running", t); Assert.fail(t.getMessage()); } }
/** Test TaskManager failure */ @Test(timeout = 100000) // timeout after 100 seconds public void testTaskManagerFailure() { LOG.info("Starting testTaskManagerFailure()"); Runner runner = startWithArgs( new String[] { "-j", flinkUberjar.getAbsolutePath(), "-t", flinkLibFolder.getAbsolutePath(), "-n", "1", "-jm", "768", "-tm", "1024", "-nm", "customName", "-Dfancy-configuration-value=veryFancy", "-Dyarn.maximum-failed-containers=3" }, "Number of connected TaskManagers changed to 1. Slots available: 1", RunTypes.YARN_SESSION); Assert.assertEquals(2, getRunningContainers()); // ------------------------ Test if JobManager web interface is accessible ------- try { YarnClient yc = YarnClient.createYarnClient(); yc.init(yarnConfiguration); yc.start(); List<ApplicationReport> apps = yc.getApplications(EnumSet.of(YarnApplicationState.RUNNING)); Assert.assertEquals(1, apps.size()); // Only one running ApplicationReport app = apps.get(0); Assert.assertEquals("customName", app.getName()); String url = app.getTrackingUrl(); if (!url.endsWith("/")) { url += "/"; } if (!url.startsWith("http://")) { url = "http://" + url; } LOG.info("Got application URL from YARN {}", url); String response = TestBaseUtils.getFromHTTP(url + "taskmanagers/"); JSONObject parsedTMs = new JSONObject(response); JSONArray taskManagers = parsedTMs.getJSONArray("taskmanagers"); Assert.assertNotNull(taskManagers); Assert.assertEquals(1, taskManagers.length()); Assert.assertEquals(1, taskManagers.getJSONObject(0).getInt("slotsNumber")); // get the configuration from webinterface & check if the dynamic properties from YARN show up // there. String jsonConfig = TestBaseUtils.getFromHTTP(url + "jobmanager/config"); JSONArray parsed = new JSONArray(jsonConfig); Map<String, String> parsedConfig = WebMonitorUtils.fromKeyValueJsonArray(parsed); Assert.assertEquals("veryFancy", parsedConfig.get("fancy-configuration-value")); Assert.assertEquals("3", parsedConfig.get("yarn.maximum-failed-containers")); // -------------- FLINK-1902: check if jobmanager hostname/port are shown in web interface // first, get the hostname/port String oC = outContent.toString(); Pattern p = Pattern.compile("Flink JobManager is now running on ([a-zA-Z0-9.-]+):([0-9]+)"); Matcher matches = p.matcher(oC); String hostname = null; String port = null; while (matches.find()) { hostname = matches.group(1).toLowerCase(); port = matches.group(2); } LOG.info("Extracted hostname:port: {} {}", hostname, port); Assert.assertEquals( "unable to find hostname in " + parsed, hostname, parsedConfig.get(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY)); Assert.assertEquals( "unable to find port in " + parsed, port, parsedConfig.get(ConfigConstants.JOB_MANAGER_IPC_PORT_KEY)); // test logfile access String logs = TestBaseUtils.getFromHTTP(url + "jobmanager/log"); Assert.assertTrue(logs.contains("Starting YARN ApplicationMaster/JobManager (Version")); } catch (Throwable e) { LOG.warn("Error while running test", e); Assert.fail(e.getMessage()); } // ------------------------ Kill container with TaskManager ------- // find container id of taskManager: ContainerId taskManagerContainer = null; NodeManager nodeManager = null; UserGroupInformation remoteUgi = null; NMTokenIdentifier nmIdent = null; try { remoteUgi = UserGroupInformation.getCurrentUser(); } catch (IOException e) { LOG.warn("Unable to get curr user", e); Assert.fail(); } for (int nmId = 0; nmId < NUM_NODEMANAGERS; nmId++) { NodeManager nm = yarnCluster.getNodeManager(nmId); ConcurrentMap<ContainerId, Container> containers = nm.getNMContext().getContainers(); for (Map.Entry<ContainerId, Container> entry : containers.entrySet()) { String command = Joiner.on(" ").join(entry.getValue().getLaunchContext().getCommands()); if (command.contains(YarnTaskManagerRunner.class.getSimpleName())) { taskManagerContainer = entry.getKey(); nodeManager = nm; nmIdent = new NMTokenIdentifier(taskManagerContainer.getApplicationAttemptId(), null, "", 0); // allow myself to do stuff with the container // remoteUgi.addCredentials(entry.getValue().getCredentials()); remoteUgi.addTokenIdentifier(nmIdent); } } sleep(500); } Assert.assertNotNull("Unable to find container with TaskManager", taskManagerContainer); Assert.assertNotNull("Illegal state", nodeManager); List<ContainerId> toStop = new LinkedList<ContainerId>(); toStop.add(taskManagerContainer); StopContainersRequest scr = StopContainersRequest.newInstance(toStop); try { nodeManager.getNMContext().getContainerManager().stopContainers(scr); } catch (Throwable e) { LOG.warn("Error stopping container", e); Assert.fail("Error stopping container: " + e.getMessage()); } // stateful termination check: // wait until we saw a container being killed and AFTERWARDS a new one launched boolean ok = false; do { LOG.debug("Waiting for correct order of events. Output: {}", errContent.toString()); String o = errContent.toString(); int killedOff = o.indexOf("Container killed by the ApplicationMaster"); if (killedOff != -1) { o = o.substring(killedOff); ok = o.indexOf("Launching container") > 0; } sleep(1000); } while (!ok); // send "stop" command to command line interface runner.sendStop(); // wait for the thread to stop try { runner.join(1000); } catch (InterruptedException e) { LOG.warn("Interrupted while stopping runner", e); } LOG.warn("stopped"); // ----------- Send output to logger System.setOut(originalStdout); System.setErr(originalStderr); String oC = outContent.toString(); String eC = errContent.toString(); LOG.info("Sending stdout content through logger: \n\n{}\n\n", oC); LOG.info("Sending stderr content through logger: \n\n{}\n\n", eC); // ------ Check if everything happened correctly Assert.assertTrue( "Expect to see failed container", eC.contains("New messages from the YARN cluster")); Assert.assertTrue( "Expect to see failed container", eC.contains("Container killed by the ApplicationMaster")); Assert.assertTrue( "Expect to see new container started", eC.contains("Launching container") && eC.contains("on host")); // cleanup auth for the subsequent tests. remoteUgi.getTokenIdentifiers().remove(nmIdent); LOG.info("Finished testTaskManagerFailure()"); }